o
    wi.                     @   s   d dl mZmZ d dlZd dlZd dlmZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZmZ ddlmZ eeZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deeZ$G dd deZ%g dZ&dS )    )OptionalUnionN)nn   )CacheDynamicCache)create_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)
LossKwargslogging   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel   )GraniteConfigc                       s0   e Zd ZdZddedee f fddZ  ZS )GraniteAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    s   t  || |j| _d S N)super__init__attention_multiplierscalingselfr   r   	__class__ h/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/granite/modular_granite.pyr   ,   s   zGraniteAttention.__init__r   )	__name__
__module____qualname____doc__r   r   intr   __classcell__r#   r#   r!   r$   r   )   s    $r   c                       s   e Zd Zdedef fddZ							ddejdeej d	eej	 d
ee
 dee dee deej	 deeejejf  deejeeejejf  f fddZ  ZS )GraniteDecoderLayerr   r   c                    s(   t  || |j| _t||d| _d S )N)r   r   )r   r   residual_multiplierr   	self_attnr   r!   r#   r$   r   2   s   zGraniteDecoderLayer.__init__NFhidden_statesattention_maskposition_idspast_key_valueoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                 K   s   |}
|  |}| jd||||||||d|	\}}|
|| j  }|}
| |}| |}|
|| j  }|f}|r>||f7 }|S )a.  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r.   r/   r0   r1   r2   r3   r4   r5   Nr#   )input_layernormr-   r,   post_attention_layernormmlp)r    r.   r/   r0   r1   r2   r3   r4   r5   kwargsresidualself_attn_weightsoutputsr#   r#   r$   forward7   s.   "
	



zGraniteDecoderLayer.forward)NNNFFNN)r%   r&   r'   r   r)   r   torchTensorr   
LongTensorr   booltupleFloatTensorr>   r*   r#   r#   r!   r$   r+   1   s8    	r+   c                   @      e Zd ZdS )GranitePreTrainedModelNr%   r&   r'   r#   r#   r#   r$   rF   y   s    rF   c                       s   e Zd Zdef fddZ									ddeej deej deej dee	 d	eej
 d
ee dee dee deej dee defddZ  ZS )GraniteModelr   c                    s8   t     j| _t fddt jD | _d S )Nc                    s   g | ]}t  |qS r#   )r+   ).0r   r   r#   r$   
<listcomp>   s    z)GraniteModel.__init__.<locals>.<listcomp>)r   r   embedding_multiplierr   
ModuleListrangenum_hidden_layerslayers)r    r   r!   rJ   r$   r   ~   s
   
zGraniteModel.__init__N	input_idsr/   r0   past_key_valuesinputs_embedsr3   r2   output_hidden_statesr4   flash_attn_kwargsr6   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|| j
 }|rP|d u rPt }|	d u rl|d ur\| nd}tj|||jd  |jd}	|d u ru|	d}t| j |||	||d}|}| ||}|rdnd }|rdnd }| jd | j j D ]&}|r||f7 }||f||||||	|d	|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||d
S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )device)r   input_embedsr/   r4   rR   r0   r#   )r/   r0   r1   r2   r3   r4   r5   )last_hidden_staterR   r.   
attentions)r   r2   rT   r3   
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensrL   r   get_seq_lengthr?   arangeshaperV   	unsqueezer   
rotary_embrP   rO   normr
   )r    rQ   r/   r0   rR   rS   r3   r2   rT   r4   rU   past_seen_tokenscausal_maskr.   r5   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr#   r#   r$   r>      s   


	
	


zGraniteModel.forward)	NNNNNNNNN)r%   r&   r'   r   r   r   r?   rA   r@   r   rD   rB   r   r	   r
   r>   r*   r#   r#   r!   r$   rH   }   sD    		
rH   c                   @   rE   )KwargsForCausalLMNrG   r#   r#   r#   r$   rl      s    rl   c                   @   s   e Zd Z											ddeej deej deej deeee	ej
 f  deej
 deej d	ee d
ee dee deej deeejf dee defddZdS )GraniteForCausalLMNr   rQ   r/   r0   rR   rS   labelsr3   r2   rT   r4   logits_to_keepr:   r6   c                 K   s   |d ur|n| j j}|	d ur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| d n|}| |d d |d d f }|| j j	 }d }|d ur^| j
d||| j jd|}t|||j|j|jdS )N)	rQ   r/   r0   rR   rS   r3   r2   rT   r4   )logitsrn   
vocab_size)lossrp   rR   r.   rY   r#   )r   r2   rT   modelrX   
isinstancer)   slicelm_headlogits_scalingloss_functionrq   r   rR   r.   rY   )r    rQ   r/   r0   rR   rS   rn   r3   r2   rT   r4   ro   r:   r=   r.   slice_indicesrp   rr   r#   r#   r$   r>      s<   
zGraniteForCausalLM.forward)NNNNNNNNNNr   )r%   r&   r'   r   r?   rA   r@   r   r   listrD   rB   r)   r   rl   r   r>   r#   r#   r#   r$   rm      sN    	
rm   )rm   rH   rF   )'typingr   r   r?   torch.utils.checkpointr   cache_utilsr   r   masking_utilsr   modeling_flash_attention_utilsr	   modeling_outputsr
   r   processing_utilsr   utilsr   r   llama.modeling_llamar   r   r   r   r   configuration_graniter   
get_loggerr%   r]   r   r+   rF   rH   rl   rm   __all__r#   r#   r#   r$   <module>   s(   
Hj6