Allen, 1 year ago
Parent commit 66bf3835e4

+ 0 - 8
research/long-context-llama/H2O/cache_utils.py

@@ -464,7 +464,6 @@ class HHCache(Cache):
 
         if layer_idx == 0:
             self._seen_tokens += key_states.shape[-2]
-            import pdb; pdb.set_trace()
 
         # Update the cache
         if len(self.key_cache) <= layer_idx:
@@ -499,10 +498,6 @@ class HHCache(Cache):
             A tuple containing the updated key and value states.
         """
 
-        if layer_idx == 0:
-            import pdb; pdb.set_trace()
-
-
         # Update score metrics (Accumulated attention scores)
         if len(self.accumulated_attention_scores) <= layer_idx:
             self.accumulated_attention_scores.append(attention_scores.sum(2)[:,::num_kv_groups, :]) # [bs, num_heads, key_len]
@@ -529,9 +524,6 @@ class HHCache(Cache):
             self.key_cache[layer_idx] = self.key_cache[layer_idx][mask].view(bsz, num_heads, -1, head_dim)
             self.value_cache[layer_idx] = self.value_cache[layer_idx][mask].view(bsz, num_heads, -1, head_dim)
             self.accumulated_attention_scores[layer_idx] = self.accumulated_attention_scores[layer_idx][mask].view(bsz, num_heads, -1)
-        
-        if layer_idx == 0:
-            import pdb; pdb.set_trace()
 
 
     def reorder_cache(self, beam_idx: torch.LongTensor):
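For context, the hunk above keeps a mask-selected subset of the per-layer key/value cache and its accumulated attention scores. The mask construction itself is not shown in this diff; the snippet below is a minimal, self-contained sketch of an H2O-style selection that could produce such a mask, with hypothetical names and sizes (`num_hh`, `num_recent`, `hh_idx`, and all tensor shapes are assumptions, not taken from the commit).

```python
import torch

# Hypothetical sizes, for illustration only (not taken from the commit).
bsz, num_heads, key_len, head_dim = 1, 4, 16, 8
num_hh, num_recent = 4, 4  # assumed heavy-hitter / recent-token budgets

key_cache = torch.randn(bsz, num_heads, key_len, head_dim)
value_cache = torch.randn(bsz, num_heads, key_len, head_dim)
scores = torch.rand(bsz, num_heads, key_len)  # accumulated attention scores, [bs, num_heads, key_len]

# Pick the highest-scoring ("heavy hitter") tokens among the non-recent positions.
_, hh_idx = scores[:, :, :-num_recent].topk(num_hh, dim=-1)
mask = torch.zeros_like(scores, dtype=torch.bool)
mask.scatter_(-1, hh_idx, torch.ones_like(hh_idx, dtype=torch.bool))
# Always keep the most recent tokens as well.
mask[:, :, -num_recent:] = True

# Boolean indexing flattens the result, hence the .view(...) calls mirrored from the diff.
key_cache = key_cache[mask].view(bsz, num_heads, -1, head_dim)
value_cache = value_cache[mask].view(bsz, num_heads, -1, head_dim)
scores = scores[mask].view(bsz, num_heads, -1)
```

After this step every layer retains `num_hh + num_recent` tokens per head, which matches the shape bookkeeping the diff performs with its `.view(bsz, num_heads, -1, head_dim)` calls.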

+ 0 - 5
research/long-context-llama/H2O/utils_llama.py

@@ -227,9 +227,6 @@ class H2OLlamaAttention(nn.Module):
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         bsz, q_len, _ = hidden_states.size()
 
-        if self.layer_idx == 0:
-            import pdb;pdb.set_trace()
-
         if self.config.pretraining_tp > 1:
             key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
             query_slices = self.q_proj.weight.split(
@@ -304,8 +301,6 @@ class H2OLlamaAttention(nn.Module):
         if not output_attentions:
             attn_weights = None
 
-        print(past_key_value.key_cache[self.layer_idx].shape, past_key_value.accumulated_attention_scores[self.layer_idx].shape)
-
         return attn_output, attn_weights, past_key_value
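The second file only drops a leftover shape print from `H2OLlamaAttention.forward`. If similar diagnostics are wanted later, one hedged alternative is a DEBUG-level log that stays silent in normal runs; the snippet below is a self-contained sketch with dummy tensors standing in for the cache entries (the shapes are assumed from the `.view(...)` calls in the cache_utils.py hunks, not stated in this commit).

```python
import logging
import torch

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("h2o.debug")

# Stand-ins for past_key_value.key_cache[self.layer_idx] and
# past_key_value.accumulated_attention_scores[self.layer_idx]; shapes assumed.
key_entry = torch.randn(1, 4, 8, 64)   # [bs, num_heads, seq_len, head_dim]
score_entry = torch.rand(1, 4, 8)      # [bs, num_heads, seq_len]

# Unlike the removed print(...), this only emits when DEBUG logging is enabled,
# so it can remain in place without printing on every forward pass.
logger.debug("key_cache %s, accumulated_attention_scores %s",
             tuple(key_entry.shape), tuple(score_entry.shape))
```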