import torch
from torch import nn

from ...cache_utils import Cache, DynamicCache
from ...masking_utils import create_causal_mask
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaModel,
    LlamaPreTrainedModel,
)
from .configuration_granite import GraniteConfig


logger = logging.get_logger(__name__)


class GraniteAttention(LlamaAttention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: GraniteConfig, layer_idx: int | None = None):
        super().__init__(config, layer_idx)
        self.scaling = config.attention_multiplier


class GraniteDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: GraniteConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.residual_multiplier = config.residual_multiplier
        self.self_attn = GraniteAttention(config=config, layer_idx=layer_idx)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        output_attentions: bool | None = False,
        use_cache: bool | None = False,
        cache_position: torch.LongTensor | None = None,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states * self.residual_multiplier

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states * self.residual_multiplier

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class GranitePreTrainedModel(LlamaPreTrainedModel):
    pass


class GraniteModel(LlamaModel):
    def __init__(self, config: GraniteConfig):
        super().__init__(config)
        self.embedding_multiplier = config.embedding_multiplier
        self.layers = nn.ModuleList(
            [GraniteDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        inputs_embeds = inputs_embeds * self.embedding_multiplier

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class GraniteForCausalLM(LlamaForCausalLM):
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        logits = logits / self.config.logits_scaling

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["GraniteForCausalLM", "GraniteModel", "GranitePreTrainedModel"]
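# ---------------------------------------------------------------------------
# Illustrative usage sketch: a tiny, randomly initialized Granite model built
# through the publicly exported classes, showing how the pieces above fit
# together. The config values are arbitrary and only chosen to keep the
# example small; the snippet is kept in comments so importing this module has
# no side effects.
#
#     import torch
#     from transformers import GraniteConfig, GraniteForCausalLM
#
#     config = GraniteConfig(
#         vocab_size=128,
#         hidden_size=64,
#         intermediate_size=128,
#         num_hidden_layers=2,
#         num_attention_heads=4,
#         num_key_value_heads=4,
#     )
#     model = GraniteForCausalLM(config)
#     input_ids = torch.randint(0, config.vocab_size, (1, 8))
#     with torch.no_grad():
#         outputs = model(input_ids=input_ids)
#     # Logits have shape (batch_size, seq_len, vocab_size) and are divided by
#     # `config.logits_scaling` inside forward().
#     print(outputs.logits.shape)  # torch.Size([1, 8, 128])
# ---------------------------------------------------------------------------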