o
    wio1                     @   sF  d Z ddlmZmZ ddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ eeZdZ dZ!G dd dej"Z#d#ddZ$G dd dej"Z%G dd deZ&G dd deZ'G dd dee'Z(G dd deZ)G d d! d!eZ*g d"Z+dS )$zPyTorch Phi-3 model.    )CallableOptionalN)nn   )ACT2FN)Cache)FlashAttentionKwargs)ALL_ATTENTION_FUNCTIONS)Unpack)logging   )MistralDecoderLayerMistralForCausalLM MistralForSequenceClassificationMistralForTokenClassificationMistralPreTrainedModeleager_attention_forwardrotate_half   )
Phi3Configz microsoft/Phi-3-mini-4k-instructr   c                       s2   e Zd Z fddZdejdejfddZ  ZS )Phi3MLPc                    sP   t    || _tj|jd|j dd| _tj|j|jdd| _t	|j
 | _d S )Nr   Fbias)super__init__configr   Linearhidden_sizeintermediate_sizegate_up_proj	down_projr   
hidden_actactivation_fn)selfr   	__class__ b/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/phi3/modular_phi3.pyr   1   s
   
zPhi3MLP.__init__hidden_statesreturnc                 C   s4   |  |}|jddd\}}|| | }| |S )Nr   dim)r   chunkr"   r    )r#   r(   	up_statesgater&   r&   r'   forward9   s   

zPhi3MLP.forward)__name__
__module____qualname__r   torchFloatTensorr0   __classcell__r&   r&   r$   r'   r   0   s    r   c                 C   s   | |}| |}|jd }| dd|f | d|df }}|dd|f |d|df }	}
tj|| t||  |gdd}tj|	| t|	|  |
gdd}||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    r*   .Nr+   )	unsqueezeshaper4   catr   )qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embedr&   r&   r'   apply_rotary_pos_embB   s   


""""rG   c                       s   e Zd ZdZddedee f fddZ		ddej	de
ej	ej	f d	eej	 d
ee deej dee de
ej	eej	 ee
ej	  f fddZ  ZS )Phi3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNr   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	|j| _| jd | _
|j| _d| _|j| j d|j| j   }tj|j| j |jdd| _tj|j|dd| _d S )Nhead_dimg      Tr   Fr   )r   r   r   rI   getattrr   num_attention_headsrJ   num_key_value_headsnum_key_value_groupsscalingattention_dropout	is_causalr   r   o_projqkv_proj)r#   r   rI   op_sizer$   r&   r'   r   e   s   
zPhi3Attention.__init__r(   position_embeddingsattention_maskpast_key_valuecache_positionkwargsr)   c                 K   s~  |j d d }g |d| jR }| |}	| jj| j }
|	dd |
f }|	d|
|
| j| j  f }|	d|
| j| j  d f }||dd}||dd}||dd}|\}}t||||\}}|d ur~|||d}|	||| j
|\}}t}| jjdkrt| jj }|| ||||f| jsdn| j| jt| jdd d	|\}}|jg |dR   }| |}||fS )
Nr*   .r   r   )r=   r<   rX   eagerg        sliding_window)dropoutrO   r[   )r8   rJ   rS   r   rL   rM   view	transposerG   updaterI   r   _attn_implementationr	   trainingrP   rO   rK   reshape
contiguousrR   )r#   r(   rU   rV   rW   rX   rY   input_shapehidden_shapeqkv	query_posquery_states
key_statesvalue_statesr<   r=   cache_kwargsattention_interfaceattn_outputattn_weightsr&   r&   r'   r0   t   sD   	
	

zPhi3Attention.forward)N)NN)r1   r2   r3   __doc__r   r   intr   r4   Tensortupler   
LongTensorr
   r   r0   r6   r&   r&   r$   r'   rH   b   s(    rH   c                       s   e Zd Zdedef fddZ							ddejdeej d	eej	 d
ee
 dee dee deej	 deeejejf  dee deejeeejejf  f fddZ  ZS )Phi3DecoderLayerr   rI   c                    sL   t  || || _t||d| _t|| _t|j	| _
t|j	| _d S )N)r   rI   )r   r   r   rH   	self_attnr   mlpr   Dropoutresid_pdropresid_attn_dropoutresid_mlp_dropout)r#   r   rI   r$   r&   r'   r      s   
zPhi3DecoderLayer.__init__NFr(   rV   r>   rW   output_attentions	use_cacherX   rU   rY   r)   c	                 K   s   |}
|  |}| jd||||||||d|	\}}|
| | }|}
| |}| |}|
| | }|f}|r>||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
            past_key_value (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r(   rV   r>   rW   r{   r|   rX   rU   Nr&   )input_layernormru   ry   post_attention_layernormrv   rz   )r#   r(   rV   r>   rW   r{   r|   rX   rU   rY   residualself_attn_weightsoutputsr&   r&   r'   r0      s.   "
	



zPhi3DecoderLayer.forward)NNNFFNN)r1   r2   r3   r   rp   r   r4   rq   r   rs   r   boolrr   r
   r   r5   r0   r6   r&   r&   r$   r'   rt      s<    	
rt   c                   @   s   e Zd ZdZdS )Phi3PreTrainedModelz0.0.5N)r1   r2   r3   _versionr&   r&   r&   r'   r      s    r   c                   @   s$   e Zd Z							dddZdS )Phi3ForCausalLMNTc	                 K   sb   |r| j jr|jd | j jd kr|d }
|
| j jkrd }t jd||||||||d|	}|S )Nr   r   )	input_idspast_key_valuesrV   inputs_embedsrX   r>   r|   logits_to_keepr&   )r   rope_scalingr8    original_max_position_embeddingsr   prepare_inputs_for_generation)r#   r   r   rV   r   rX   r>   r|   r   rY   past_lengthmodel_inputsr&   r&   r'   r      s*   	z-Phi3ForCausalLM.prepare_inputs_for_generation)NNNNNTN)r1   r2   r3   r   r&   r&   r&   r'   r      s    r   c                   @      e Zd ZdS )Phi3ForSequenceClassificationNr1   r2   r3   r&   r&   r&   r'   r         r   c                   @   r   )Phi3ForTokenClassificationNr   r&   r&   r&   r'   r   !  r   r   )r   	Phi3Modelr   r   r   )Nr   ),ro   typingr   r   r4   torch.utils.checkpointr   activationsr   cache_utilsr   modeling_flash_attention_utilsr   modeling_utilsr	   processing_utilsr
   utilsr   mistral.modeling_mistralr   r   r   r   r   r   r   configuration_phi3r   
get_loggerr1   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCModuler   rG   rH   rt   r   r   r   r   __all__r&   r&   r&   r'   <module>   s2   $	

 EI)