o
    	۷il.                     @   s  d dl mZmZ d dlZd dlmZ ddlmZmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ eeZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#g dZ$dS )    )OptionalUnionN)nn   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)TransformersKwargslogging)deprecate_kwarg   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel   )GraniteConfigc                       s0   e Zd ZdZddedee f fddZ  ZS )GraniteAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    s   t  || |j| _d S N)super__init__attention_multiplierscalingselfr   r   	__class__ a/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/granite/modular_granite.pyr   +   s   zGraniteAttention.__init__r   )	__name__
__module____qualname____doc__r   r   intr   __classcell__r#   r#   r!   r$   r   (   s    $r   c                       s   e Zd Zdedef fddZedddd							
	
				ddejde	ej de	ej
 de	e de	e de	e de	ej
 de	eejejf  deeje	eejejf  f fddZ  ZS )GraniteDecoderLayerr   r   c                    s(   t  || |j| _t||d| _d S )N)r   r   )r   r   residual_multiplierr   	self_attnr   r!   r#   r$   r   1   s   zGraniteDecoderLayer.__init__past_key_valuepast_key_valuesz4.58)new_nameversionNFhidden_statesattention_maskposition_idsoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                 K   s   |}
|  |}| jd||||||||d|	\}}|
|| j  }|}
| |}| |}|
|| j  }|f}|r>||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r2   r3   r4   r/   r5   r6   r7   r8   Nr#   )input_layernormr-   r,   post_attention_layernormmlp)r    r2   r3   r4   r/   r5   r6   r7   r8   kwargsresidualself_attn_weightsoutputsr#   r#   r$   forward6   s.   #
	



zGraniteDecoderLayer.forward)NNNFFNN)r%   r&   r'   r   r)   r   r   torchTensorr   
LongTensorr   booltupleFloatTensorrA   r*   r#   r#   r!   r$   r+   0   s:    	r+   c                   @   s   e Zd ZdS )GranitePreTrainedModelN)r%   r&   r'   r#   r#   r#   r$   rH   y   s    rH   c                       s   e Zd Zdef fddZ									ddeej deej deej dee	 d	eej
 d
ee dee dee deej dee defddZ  ZS )GraniteModelr   c                    s8   t     j| _t fddt jD | _d S )Nc                    s   g | ]}t  |qS r#   )r+   ).0r   r   r#   r$   
<listcomp>   s    z)GraniteModel.__init__.<locals>.<listcomp>)r   r   embedding_multiplierr   
ModuleListrangenum_hidden_layerslayers)r    r   r!   rK   r$   r   ~   s
   
zGraniteModel.__init__N	input_idsr3   r4   r/   inputs_embedsr6   r5   output_hidden_statesr7   r=   r9   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|| j
 }|rS|d u rSt| j d}|	d u ro|d ur_| nd}tj|||jd  |jd}	|d u rx|	d}t| j |||	||d}|}| ||}|rd	nd }|rd	nd }| jd | j j D ]&}|r||f7 }||f||||||	|d
|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FrK   r   r   )device)r   input_embedsr3   r7   r/   r4   r#   )r3   r4   r/   r5   r6   r7   r8   )last_hidden_stater/   r2   
attentions)r   r5   rT   r6   
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensrM   r   get_seq_lengthrB   arangeshaperU   	unsqueezer   
rotary_embrQ   rP   normr	   )r    rR   r3   r4   r/   rS   r6   r5   rT   r7   r=   past_seen_tokenscausal_maskr2   r8   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr#   r#   r$   rA      s   


	
	


zGraniteModel.forward)	NNNNNNNNN)r%   r&   r'   r   r   r   rB   rD   rC   r   rG   rE   r   r   r	   rA   r*   r#   r#   r!   r$   rI   }   sD    		
rI   c                   @   s   e Zd Z											ddeej deej deej deeee	ej
 f  deej
 deej d	ee d
ee dee deej deeejf dee defddZdS )GraniteForCausalLMNr   rR   r3   r4   r/   rS   labelsr6   r5   rT   r7   logits_to_keepr=   r9   c                 K   s   |d ur|n| j j}|	d ur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| d n|}| |d d |d d f }|| j j	 }d }|d ur^| j
d||| j jd|}t|||j|j|jdS )N)	rR   r3   r4   r/   rS   r6   r5   rT   r7   )logitsrl   
vocab_size)lossrn   r/   r2   rX   r#   )r   r5   rT   modelrW   
isinstancer)   slicelm_headlogits_scalingloss_functionro   r
   r/   r2   rX   )r    rR   r3   r4   r/   rS   rl   r6   r5   rT   r7   rm   r=   r@   r2   slice_indicesrn   rp   r#   r#   r$   rA      s<   
zGraniteForCausalLM.forward)NNNNNNNNNNr   )r%   r&   r'   r   rB   rD   rC   r   r   listrG   rE   r)   r   r   r
   rA   r#   r#   r#   r$   rk      sN    	
rk   )rk   rI   rH   )%typingr   r   rB   r   cache_utilsr   r   masking_utilsr   modeling_outputsr	   r
   processing_utilsr   utilsr   r   utils.deprecationr   llama.modeling_llamar   r   r   r   r   configuration_graniter   
get_loggerr%   r\   r   r+   rH   rI   rk   __all__r#   r#   r#   r$   <module>   s$   
Ij6