o
    ib-                     @   sj  d dl mZmZ d dlZd dlmZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$ e%e&Z'dZ(dZ)G dd deZ*G dd deZ+G dd deZ,G dd de Z-G dd deZ.G dd deZ/G dd  d eZ0G d!d" d"eZ1g d#Z2dS )$    )CallableOptionalN   )CacheDynamicCache)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)deprecate_kwarg   )CLIPMLP)LlamaAttentionLlamaForCausalLMLlamaForSequenceClassificationLlamaForTokenClassification
LlamaModelLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )	PhiConfigzmicrosoft/phi-1r   c                       s   e Zd Zdedef fddZedddd				dd
ejde	ejejf de
ej de
e de
ej de	eje
ej f fddZ  ZS )PhiAttentionconfig	layer_idxc                    s   t  || tj|j|j| j dd| _tj|j|j| j dd| _	tj|j|j| j dd| _
tj|j| j |jdd| _| `t| j|j | _|j| _| jrotj|j|j |jdd| _tj|j|j |jdd| _d S d S )NTbias)epselementwise_affine)super__init__nnLinearhidden_sizenum_attention_headshead_dimq_projnum_key_value_headsk_projv_projdenseo_projintpartial_rotary_factorrotary_ndimsqk_layernorm	LayerNormlayer_norm_epsq_layernormk_layernormselfr   r   	__class__ `/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/phi/modular_phi.pyr#   %   s    zPhiAttention.__init__past_key_valuepast_key_values4.58new_nameversionNhidden_statesposition_embeddingsattention_maskcache_positionreturnc                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| jrB| |	}	| 	|
}
|\}}|	dd | j
f |	d| j
d f }}|
dd | j
f |
d| j
d f }}t||||\}}tj||fdd}	tj||fdd}
|d ur|||d}||
|| j|\}
}t}| jjdkrt| jj }|| |	|
||f| jsdn| j| jd	|\}}|jg |dR   }| |}||fS )
Nr   r   .)dim)sincosrF   eagerg        )dropoutscaling)shaper(   r)   view	transposer+   r,   r2   r5   r6   r1   r   torchcatupdater   r   r   _attn_implementationr
   trainingattention_dropoutrN   reshape
contiguousr-   )r8   rC   rD   rE   r>   rF   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesrK   rJ   	query_rot
query_passkey_rotkey_passcache_kwargsattention_interfaceattn_outputattn_weightsr;   r;   r<   forward6   sN   




zPhiAttention.forward)NN)__name__
__module____qualname__r   r/   r#   r   rR   Tensortupler   r   
LongTensorrh   __classcell__r;   r;   r9   r<   r   $   s$    r   c                   @      e Zd ZdS )PhiMLPNri   rj   rk   r;   r;   r;   r<   rq   u       rq   c                       s   e Zd Zdedef fddZedddd							
	
				ddejde	ej de	ej
 de	e de	e de	e de	ej
 de	eejejf  deeje	eejejf  f fddZ  ZS )PhiDecoderLayerr   r   c                    sH   t    t||d| _t|| _tj|j|j	d| _
t|j| _d S )N)r   r    )r"   r#   r   	self_attnrq   mlpr$   r3   r&   r4   input_layernormDropoutresid_pdropresid_dropoutr7   r9   r;   r<   r#   z   s
   

zPhiDecoderLayer.__init__r=   r>   r?   r@   NFrC   rE   position_idsoutput_attentions	use_cacherF   rD   rG   c	                 K   sr   |}
|  |}| jd||||||||d|	\}}| |}| | |}|| |
 }|f}|r7||f7 }|S )N)rC   rE   r|   r>   r}   r~   rF   rD   r;   )rx   rv   r{   rw   )r8   rC   rE   r|   r>   r}   r~   rF   rD   rZ   residualattn_outputsself_attn_weightsfeed_forward_hidden_statesoutputsr;   r;   r<   rh      s*   
	


zPhiDecoderLayer.forward)NNNFFNN)ri   rj   rk   r   r/   r#   r   rR   rl   r   rn   r   boolrm   FloatTensorrh   ro   r;   r;   r9   r<   rt   y   s:    	rt   c                   @   rp   )PhiRotaryEmbeddingNrr   r;   r;   r;   r<   r      rs   r   c                       s   e Zd Zdef fddZ									ddeej deej deej dee	 d	eej
 d
ee dee dee deej dee defddZ  ZS )PhiModelr   c                    sV   t    t fddt jD | _t j| _	tj
 j jd| _| `d S )Nc                    s   g | ]}t  |qS r;   )rt   ).0r   r   r;   r<   
<listcomp>   s    z%PhiModel.__init__.<locals>.<listcomp>ru   )r"   r#   r$   
ModuleListrangenum_hidden_layerslayersry   
embd_pdropembed_dropoutr3   r&   r4   final_layernormnormr8   r   r9   r   r<   r#      s   zPhiModel.__init__N	input_idsrE   r|   r>   inputs_embedsr~   r}   output_hidden_statesrF   rZ   rG   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|rN|d u rNt
| j d}|	d u rj|d urZ| nd}tj|||jd  |jd}	|d u rs|	d}t| j |||	||d}| |}|}| ||}|rd	nd }|rd	nd }| jd | j j D ]&}|r||f7 }||f||||||	|d
|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   r   )device)r   input_embedsrE   rF   r>   r|   r;   )rE   r|   r>   r}   r~   rF   rD   )last_hidden_stater>   rC   
attentions)r   r}   r   r~   
ValueErrorgradient_checkpointingrV   loggerwarning_onceembed_tokensr   get_seq_lengthrR   arangerO   r   	unsqueezer   r   
rotary_embr   r   r   r	   )r8   r   rE   r|   r>   r   r~   r}   r   rF   rZ   past_seen_tokenscausal_maskrC   rD   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr;   r;   r<   rh      s   


	
	


zPhiModel.forward)	NNNNNNNNN)ri   rj   rk   r   r#   r   rR   rn   rl   r   r   r   r   r   r	   rh   ro   r;   r;   r9   r<   r      sD    	
r   c                       s   e Zd Z fddZ  ZS )PhiForCausalLMc                    s&   t  | tj|j|jdd| _d S )NTr   )r"   r#   r$   r%   r&   
vocab_sizelm_headr   r9   r;   r<   r#     s   zPhiForCausalLM.__init__)ri   rj   rk   r#   ro   r;   r;   r9   r<   r     s    r   c                   @   rp   )PhiForSequenceClassificationNrr   r;   r;   r;   r<   r     rs   r   c                   @   rp   )PhiForTokenClassificationNrr   r;   r;   r;   r<   r   #  rs   r   )PhiPreTrainedModelr   r   r   r   )3typingr   r   rR   torch.nnr$   cache_utilsr   r   masking_utilsr   modeling_layersr   modeling_outputsr	   modeling_utilsr
   processing_utilsr   utilsr   r   utils.deprecationr   clip.modeling_clipr   llama.modeling_llamar   r   r   r   r   r   r   r   configuration_phir   
get_loggerri   r   _CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   rq   rt   r   r   r   r   r   __all__r;   r;   r;   r<   <module>   s4    (

Q1k