o
    ei1                     @   sf  d dl mZ d dlmZ d dlZd dlmZ ddlmZm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZmZmZmZm Z m!Z! ddl"m#Z# e$e%Z&dZ'dZ(G dd deZ)G dd deZ*G dd deZ+G dd deZ,G dd deZ-G dd deZ.G dd  d eZ/G d!d" d"eZ0g d#Z1dS )$    )Callable)OptionalN   )CacheDynamicCache)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging   )CLIPMLP)LlamaAttentionLlamaForCausalLMLlamaForSequenceClassificationLlamaForTokenClassification
LlamaModelLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )	PhiConfigzmicrosoft/phi-1r   c                   @   sF   e Zd Ze			d
dedB ded dedB dedef fdd	Z	dS )PhiRotaryEmbeddingNconfigdeviceztorch.deviceseq_lenreturnztorch.Tensorc           	      C   st   | j d }| j dd}t| ddp| j| j }t|| }d}d|tjd|dtjdj	|tj
d	|   }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr   r   )dtype)r   r"   )rope_parametersgetgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r   r   r   baser    r!   dimattention_factorinv_freq r2   a/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/phi/modular_phi.pycompute_default_rope_parameters%   s   
&z2PhiRotaryEmbedding.compute_default_rope_parameters)NNN)
__name__
__module____qualname__staticmethodr   r   r(   tupler-   r4   r2   r2   r2   r3   r   $   s    
r   c                       s|   e Zd Zdedef fddZ		ddejdeejejf dejdB d	e	dB d
ej
dB deejejdB f fddZ  ZS )PhiAttentionr   	layer_idxc                    s   t  || tj|j|j| j dd| _tj|j|j| j dd| _	tj|j|j| j dd| _
tj|j| j |jdd| _| `t| j|jd  | _|j| _| jrqtj|j|j |jdd| _tj|j|j |jdd| _d S d S )NTbiasr    )epselementwise_affine)super__init__nnLinearr&   r'   r!   q_projnum_key_value_headsk_projv_projdenseo_projr(   r#   rotary_ndimsqk_layernorm	LayerNormlayer_norm_epsq_layernormk_layernormselfr   r;   	__class__r2   r3   rA   G   s    zPhiAttention.__init__Nhidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionr   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| jrB| |	}	| 	|
}
|\}}|	dd | j
f |	d| j
d f }}|
dd | j
f |
d| j
d f }}t||||\}}tj||fdd}	tj||fdd}
|d ur|||d}||
|| j|\}
}t| jjt}|| |	|
||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )	Nr   r   .)r/   )sincosrX   g        )dropoutscaling)shaper!   rD   view	transposerF   rG   rK   rN   rO   rJ   r   r)   catupdater;   r
   get_interfacer   _attn_implementationr   trainingattention_dropoutr]   reshape
contiguousrH   )rQ   rT   rU   rV   rW   rX   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesr[   rZ   	query_rot
query_passkey_rotkey_passcache_kwargsattention_interfaceattn_outputattn_weightsr2   r2   r3   forwardX   sN   	



zPhiAttention.forward)NN)r5   r6   r7   r   r(   rA   r)   Tensorr9   r   
LongTensorrw   __classcell__r2   r2   rR   r3   r:   F   s"    r:   c                   @      e Zd ZdS )PhiMLPNr5   r6   r7   r2   r2   r2   r3   r|          r|   c                       s   e Zd Zdedef fddZ							ddejdejdB d	ejdB d
e	dB de
dB de
dB dejdB deejejf dB deejeejejf dB f fddZ  ZS )PhiDecoderLayerr   r;   c                    sH   t    t||d| _t|| _tj|j|j	d| _
t|j| _d S )N)r;   r>   )r@   rA   r:   	self_attnr|   mlprB   rL   r&   rM   input_layernormDropoutresid_pdropresid_dropoutrP   rR   r2   r3   rA      s
   

zPhiDecoderLayer.__init__NFrT   rV   position_idsrW   output_attentions	use_cacherX   rU   r   c	                 K   sr   |}
|  |}| jd||||||||d|	\}}| |}| | |}|| |
 }|f}|r7||f7 }|S )N)rT   rV   r   rW   r   r   rX   rU   r2   )r   r   r   r   )rQ   rT   rV   r   rW   r   r   rX   rU   ri   residualattn_outputsself_attn_weightsfeed_forward_hidden_statesoutputsr2   r2   r3   rw      s*   
	


zPhiDecoderLayer.forward)NNNFFNN)r5   r6   r7   r   r(   rA   r)   rx   ry   r   boolr9   FloatTensorrw   rz   r2   r2   rR   r3   r      s8    
	r   c                       s   e Zd Zdef fddZ									ddejdB dejdB dejdB dedB d	ej	dB d
e
dB de
dB de
dB dejdB dee defddZ  ZS )PhiModelr   c                    sV   t    t fddt jD | _t j| _	tj
 j jd| _| `d S )Nc                    s   g | ]}t  |qS r2   )r   ).0r;   r   r2   r3   
<listcomp>   s    z%PhiModel.__init__.<locals>.<listcomp>r   )r@   rA   rB   
ModuleListrangenum_hidden_layerslayersr   
embd_pdropembed_dropoutrL   r&   rM   final_layernormnormrQ   r   rR   r   r3   rA      s   zPhiModel.__init__N	input_idsrV   r   rW   inputs_embedsr   r   output_hidden_statesrX   ri   r   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|rN|d u rNt
| j d}|	d u rj|d urZ| nd}tj|||jd  |jd}	|d u rs|	d}t| j |||	||d}| |}|}| j||d	}|rd
nd }|rd
nd }| jd | j j D ]&}|r||f7 }||f||||||	|d|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   r   )r   )r   r   rV   rX   rW   r   )r   r2   )rV   r   rW   r   r   rX   rU   )last_hidden_staterW   rT   
attentions)r   r   r   r   
ValueErrorgradient_checkpointingre   loggerwarning_onceembed_tokensr   get_seq_lengthr)   r*   r^   r   	unsqueezer   r   
rotary_embr   r   r   r	   )rQ   r   rV   r   rW   r   r   r   r   rX   ri   past_seen_tokenscausal_maskrT   rU   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr2   r2   r3   rw      s   


	
	


zPhiModel.forward)	NNNNNNNNN)r5   r6   r7   r   rA   r)   ry   rx   r   r   r   r   r   r	   rw   rz   r2   r2   rR   r3   r      sD    	
r   c                       s   e Zd Z fddZ  ZS )PhiForCausalLMc                    s&   t  | tj|j|jdd| _d S )NTr<   )r@   rA   rB   rC   r&   
vocab_sizelm_headr   rR   r2   r3   rA   4  s   zPhiForCausalLM.__init__)r5   r6   r7   rA   rz   r2   r2   rR   r3   r   3  s    r   c                   @   r{   )PhiForSequenceClassificationNr}   r2   r2   r2   r3   r   9  r~   r   c                   @   r{   )PhiForTokenClassificationNr}   r2   r2   r2   r3   r   =  r~   r   )PhiPreTrainedModelr   r   r   r   )2collections.abcr   typingr   r)   torch.nnrB   cache_utilsr   r   masking_utilsr   modeling_layersr   modeling_outputsr	   modeling_utilsr
   processing_utilsr   utilsr   r   clip.modeling_clipr   llama.modeling_llamar   r   r   r   r   r   r   r   configuration_phir   
get_loggerr5   r   _CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   r:   r|   r   r   r   r   r   __all__r2   r2   r2   r3   <module>   s4    (

"P0i