o
    ei9*                     @   s`  d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ ddlmZ ee Z!dZ"dZ#G dd dej$Z%G dd deZ&d'ddZ'G dd dej$Z(G dd deZ)G dd deZ*G d d! d!eZ+G d"d# d#eZ,G d$d% d%eZ-g d&Z.dS )(zPyTorch Phi-3 model.    )CallableN)nn   )ACT2FN)Cache)GenerationMixin)FlashAttentionKwargs)ALL_ATTENTION_FUNCTIONS)Unpack)logging   )MistralDecoderLayerMistralForCausalLM MistralForSequenceClassificationMistralForTokenClassificationMistralPreTrainedModeleager_attention_forwardrotate_half)PhiRotaryEmbedding   )
Phi3Configz microsoft/Phi-3-mini-4k-instructr   c                       s2   e Zd Z fddZdejdejfddZ  ZS )Phi3MLPc                    sP   t    || _tj|jd|j dd| _tj|j|jdd| _t	|j
 | _d S )Nr   Fbias)super__init__configr   Linearhidden_sizeintermediate_sizegate_up_proj	down_projr   
hidden_actactivation_fn)selfr   	__class__ c/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/phi3/modular_phi3.pyr   1   s
   
zPhi3MLP.__init__hidden_statesreturnc                 C   s4   |  |}|jddd\}}|| | }| |S )Nr   dim)r    chunkr#   r!   )r$   r)   	up_statesgater'   r'   r(   forward9   s   

zPhi3MLP.forward)__name__
__module____qualname__r   torchFloatTensorr1   __classcell__r'   r'   r%   r(   r   0   s    r   c                   @      e Zd ZdS )Phi3RotaryEmbeddingNr2   r3   r4   r'   r'   r'   r(   r9   B       r9   c                 C   s   | |}| |}|jd }| dd|f | d|df }}|dd|f |d|df }}	tj|| t||  |gdd}
tj|| t||  |	gdd}|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    r+   .Nr,   )	unsqueezeshaper5   catr   )qkcossinunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embedr'   r'   r(   apply_rotary_pos_embF   s   


""""rK   c                       s   e Zd ZdZddededB f fddZ		ddejde	ejejf d	ejdB d
e
dB dejdB dee de	ejejdB e	ej dB f fddZ  ZS )Phi3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNr   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	|j| _| jd | _
|j| _d| _|j| j d|j| j   }tj|j| j |jdd| _tj|j|dd| _d S )Nhead_dimg      Tr   Fr   )r   r   r   rM   getattrr   num_attention_headsrN   num_key_value_headsnum_key_value_groupsscalingattention_dropout	is_causalr   r   o_projqkv_proj)r$   r   rM   op_sizer%   r'   r(   r   g   s   
zPhi3Attention.__init__r)   position_embeddingsattention_maskpast_key_valuescache_positionkwargsr*   c                 K   sr  |j d d }g |d| jR }| |}	| jj| j }
|	dd |
f }|	d|
|
| j| j  f }|	d|
| j| j  d f }||dd}||dd}||dd}|\}}t||||\}}|d ur~|||d}|	||| j
|\}}t| jjt}|| ||||f| jsdn| j| jt| jdd d|\}}|jg |dR   }| |}||fS )	Nr+   .r   r   )rB   rA   r\   g        sliding_window)dropoutrS   r^   )r=   rN   rW   r   rP   rQ   view	transposerK   updaterM   r	   get_interface_attn_implementationr   trainingrT   rS   rO   reshape
contiguousrV   )r$   r)   rY   rZ   r[   r\   r]   input_shapehidden_shapeqkv	query_posquery_states
key_statesvalue_statesrA   rB   cache_kwargsattention_interfaceattn_outputattn_weightsr'   r'   r(   r1   v   sD   	
	

zPhi3Attention.forward)N)NN)r2   r3   r4   __doc__r   intr   r5   Tensortupler   
LongTensorr
   r   r1   r7   r'   r'   r%   r(   rL   d   s(    rL   c                       s   e Zd Zdedef fddZ						ddejdejdB d	ejdB d
e	dB de
dB dejdB deejejf dB dee deejeejejf dB f fddZ  ZS )Phi3DecoderLayerr   rM   c                    sL   t  || || _t||d| _t|| _t|j	| _
t|j	| _d S )N)r   rM   )r   r   r   rL   	self_attnr   mlpr   Dropoutresid_pdropresid_attn_dropoutresid_mlp_dropout)r$   r   rM   r%   r'   r(   r      s   
zPhi3DecoderLayer.__init__NFr)   rZ   position_idsr[   	use_cacher\   rY   r]   r*   c              
   K   sj   |}	|  |}| jd|||||||d|\}}
|	| | }|}	| |}| |}|	| | }|S )N)r)   rZ   r   r[   r   r\   rY   r'   )input_layernormry   r}   post_attention_layernormrz   r~   )r$   r)   rZ   r   r[   r   r\   rY   r]   residualself_attn_weightsr'   r'   r(   r1      s&   




zPhi3DecoderLayer.forward)NNNFNN)r2   r3   r4   r   rt   r   r5   ru   rw   r   boolrv   r
   r   r6   r1   r7   r'   r'   r%   r(   rx      s6    	
rx   c                   @   s   e Zd ZdZdS )Phi3PreTrainedModelz0.0.5N)r2   r3   r4   _versionr'   r'   r'   r(   r      s    r   c                   @   s$   e Zd Z							dddZdS )Phi3ForCausalLMNTc	                 K   sf   |rt | jdr|jd | jjd kr|d }
|
| jjkrd }tj| f||||||||d|	}|S )N original_max_position_embeddingsr   r   )	input_idsr[   rZ   inputs_embedsr\   r   r   logits_to_keep)hasattrr   r=   r   r   prepare_inputs_for_generation)r$   r   r[   rZ   r   r\   r   r   r   r]   past_lengthmodel_inputsr'   r'   r(   r      s.   

z-Phi3ForCausalLM.prepare_inputs_for_generation)NNNNNTN)r2   r3   r4   r   r'   r'   r'   r(   r      s    r   c                   @   r8   )Phi3ForSequenceClassificationNr:   r'   r'   r'   r(   r     r;   r   c                   @   r8   )Phi3ForTokenClassificationNr:   r'   r'   r'   r(   r     r;   r   )r   	Phi3Modelr   r   r   )r   )/rs   collections.abcr   r5   r   activationsr   cache_utilsr   
generationr   modeling_flash_attention_utilsr   modeling_utilsr	   processing_utilsr
   utilsr   mistral.modeling_mistralr   r   r   r   r   r   r   phi.modeling_phir   configuration_phi3r   
get_loggerr2   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCModuler   r9   rK   rL   rx   r   r   r   r   __all__r'   r'   r'   r(   <module>   s6   $	

E**