пре 2 година · bc1047e64b
--- a/research/long-context-llama/H2O/utils_llama.py
+++ b/research/long-context-llama/H2O/utils_llama.py
@@ -186,9 +186,6 @@ class H2OLlamaAttention(nn.Module):
 
				         self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
			
 
				         self._init_rope()
			
 
				 
			
 
				-        # self.past_key_value = HHCache()
			
 
				-        # pdb.set_trace()
			
 
				-
			
 
				     def _init_rope(self):
			
 
				         if self.config.rope_scaling is None:
			
 
				             self.rotary_emb = LlamaRotaryEmbedding(
			
@@ -254,9 +251,6 @@ class H2OLlamaAttention(nn.Module):
 
				         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
			
 
				         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
			
 
				         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
			
 
				-        
			
 
				-        import pdb; pdb.set_trace()
			
 
				-
			
 
				 
			
 
				         past_key_value = getattr(self, "past_key_value", past_key_value)
			
 
				         cos, sin = self.rotary_emb(value_states, position_ids)
			
@@ -304,9 +298,175 @@ class H2OLlamaAttention(nn.Module):
 
				         return attn_output, attn_weights, past_key_value
			
 
				 
			
 
				 
			
 
				+def enable_h2ocache_forward(
			
 
				+    self,
			
 
				+    input_ids: torch.LongTensor = None,
			
 
				+    attention_mask: Optional[torch.Tensor] = None,
			
 
				+    position_ids: Optional[torch.LongTensor] = None,
			
 
				+    past_key_values: Optional[List[torch.FloatTensor]] = None,
			
 
				+    inputs_embeds: Optional[torch.FloatTensor] = None,
			
 
				+    use_cache: Optional[bool] = None,
			
 
				+    output_attentions: Optional[bool] = None,
			
 
				+    output_hidden_states: Optional[bool] = None,
			
 
				+    return_dict: Optional[bool] = None,
			
 
				+    cache_position: Optional[torch.LongTensor] = None,
			
 
				+) -> Union[Tuple, BaseModelOutputWithPast]:
			
 
				+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
			
 
				+    output_hidden_states = (
			
 
				+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
			
 
				+    )
			
 
				+    use_cache = use_cache if use_cache is not None else self.config.use_cache
			
 
				+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
			
 
				+
			
 
				+    if (input_ids is None) ^ (inputs_embeds is not None):
			
 
				+        raise ValueError(
			
 
				+            "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
			
 
				+        )
			
 
				+
			
 
				+    if self.gradient_checkpointing and self.training and use_cache:
			
 
				+        logger.warning_once(
			
 
				+            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
			
 
				+        )
			
 
				+        use_cache = False
			
 
				+
			
 
				+    if inputs_embeds is None:
			
 
				+        inputs_embeds = self.embed_tokens(input_ids)
			
 
				+
			
 
				+    past_seen_tokens = 0
			
 
				+    if use_cache:  # kept for BC (cache positions)
			
 
				+        if not isinstance(past_key_values, StaticCache):
			
 
				+            past_key_values = HHCache.from_legacy_cache(past_key_values)
			
 
				+            past_seen_tokens = past_key_values.get_seq_length()
			
 
				+
			
 
				+    if cache_position is None:
			
 
				+        if isinstance(past_key_values, StaticCache):
			
 
				+            raise ValueError("cache_position is a required argument when using StaticCache.")
			
 
				+        cache_position = torch.arange(
			
 
				+            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
			
 
				+        )
			
 
				+
			
 
				+    if position_ids is None:
			
 
				+        position_ids = cache_position.unsqueeze(0)
			
 
				+
			
 
				+    causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
			
 
				+
			
 
				+    # embed positions
			
 
				+    hidden_states = inputs_embeds
			
 
				+
			
 
				+    # decoder layers
			
 
				+    all_hidden_states = () if output_hidden_states else None
			
 
				+    all_self_attns = () if output_attentions else None
			
 
				+    next_decoder_cache = None
			
 
				+
			
 
				+    for decoder_layer in self.layers:
			
 
				+        if output_hidden_states:
			
 
				+            all_hidden_states += (hidden_states,)
			
 
				+
			
 
				+        if self.gradient_checkpointing and self.training:
			
 
				+            layer_outputs = self._gradient_checkpointing_func(
			
 
				+                decoder_layer.__call__,
			
 
				+                hidden_states,
			
 
				+                causal_mask,
			
 
				+                position_ids,
			
 
				+                past_key_values,
			
 
				+                output_attentions,
			
 
				+                use_cache,
			
 
				+                cache_position,
			
 
				+            )
			
 
				+        else:
			
 
				+            layer_outputs = decoder_layer(
			
 
				+                hidden_states,
			
 
				+                attention_mask=causal_mask,
			
 
				+                position_ids=position_ids,
			
 
				+                past_key_value=past_key_values,
			
 
				+                output_attentions=output_attentions,
			
 
				+                use_cache=use_cache,
			
 
				+                cache_position=cache_position,
			
 
				+            )
			
 
				 
			
 
				+        hidden_states = layer_outputs[0]
			
 
				+
			
 
				+        if use_cache:
			
 
				+            next_decoder_cache = layer_outputs[2 if output_attentions else 1]
			
 
				+
			
 
				+        if output_attentions:
			
 
				+            all_self_attns += (layer_outputs[1],)
			
 
				+
			
 
				+    hidden_states = self.norm(hidden_states)
			
 
				+
			
 
				+    # add hidden states from the last decoder layer
			
 
				+    if output_hidden_states:
			
 
				+        all_hidden_states += (hidden_states,)
			
 
				+
			
 
				+    next_cache = None
			
 
				+    if use_cache:
			
 
				+        next_cache = (
			
 
				+            next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache
			
 
				+        )
			
 
				+    if not return_dict:
			
 
				+        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
			
 
				+    return BaseModelOutputWithPast(
			
 
				+        last_hidden_state=hidden_states,
			
 
				+        past_key_values=next_cache,
			
 
				+        hidden_states=all_hidden_states,
			
 
				+        attentions=all_self_attns,
			
 
				+    )
			
 
				+
			
 
				+# TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
			
 
				+# KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
			
 
				+# (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
			
 
				+# `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
			
 
				+def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
			
 
				+    if self.config._attn_implementation == "flash_attention_2":
			
 
				+        if attention_mask is not None and 0.0 in attention_mask:
			
 
				+            return attention_mask
			
 
				+        return None
			
 
				+
			
 
				+    dtype, device = input_tensor.dtype, input_tensor.device
			
 
				+    min_dtype = torch.finfo(dtype).min
			
 
				+    sequence_length = input_tensor.shape[1]
			
 
				+    if hasattr(self.layers[0].self_attn, "past_key_value"):  # static cache
			
 
				+        target_length = self.config.max_position_embeddings
			
 
				+    else:  # dynamic cache
			
 
				+        target_length = (
			
 
				+            attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1
			
 
				+        )
			
 
				+
			
 
				+    causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
			
 
				+    if sequence_length != 1:
			
 
				+        causal_mask = torch.triu(causal_mask, diagonal=1)
			
 
				+    causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
			
 
				+    causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
			
 
				+    if attention_mask is not None:
			
 
				+        causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
			
 
				+        if attention_mask.dim() == 2:
			
 
				+            mask_length = attention_mask.shape[-1]
			
 
				+            padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
			
 
				+            causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
			
 
				+        elif attention_mask.dim() == 4:
			
 
				+            # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with
			
 
				+            # cache. In that case, the 4D attention mask attends to the newest tokens only.
			
 
				+            if attention_mask.shape[-2] < cache_position[0] + sequence_length:
			
 
				+                offset = cache_position[0]
			
 
				+            else:
			
 
				+                offset = 0
			
 
				+            mask_shape = attention_mask.shape
			
 
				+            mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
			
 
				+            causal_mask[
			
 
				+                : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]
			
 
				+            ] = mask_slice
			
 
				+
			
 
				+    if (
			
 
				+        self.config._attn_implementation == "sdpa"
			
 
				+        and attention_mask is not None
			
 
				+        and attention_mask.device.type == "cuda"
			
 
				+    ):
			
 
				+        # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
			
 
				+        # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
			
 
				+        # Details: https://github.com/pytorch/pytorch/issues/110213
			
 
				+        causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
			
 
				 
			
 
				-
			
 
				+    return causal_mask
			
 
				 
			
 
				 
			
 
				 
			
@@ -545,3 +705,5 @@ class H2OLlamaForCausalLM(LlamaForCausalLM):
 
				         num_layers = len(self.model.layers)
			
 
				         for layer_idx in range(num_layers):
			
 
				             self.model.layers[layer_idx].self_attn = H2OLlamaAttention(config, layer_idx)
			
 
				+
			
 
				+        self.model.forward = types.MethodType(enable_h2ocache_forward, self.model)