o
    wi/                     @   sz  d dl mZmZ d dlZd dlmZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$ e%e&Z'dZ(dZ)G dd deZ*G dd deZ+G dd deZ,G dd de Z-G dd deZ.G dd deZ/G dd  d eZ0G d!d" d"eZ1G d#d$ d$eZ2g d%Z3dS )&    )CallableOptionalN   )CacheDynamicCache)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)logging   )CLIPMLP)	LlamaAttentionLlamaForCausalLMLlamaForSequenceClassificationLlamaForTokenClassification
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )	PhiConfigzmicrosoft/phi-1r   c                       s   e Zd Zdedef fddZ		ddejdeejejf de	ej d	e	e
 d
e	ej deeje	ej e	eej  f fddZ  ZS )PhiAttentionconfig	layer_idxc                    s   t  || tj|j|j| j dd| _tj|j|j| j dd| _	tj|j|j| j dd| _
tj|j| j |jdd| _| `t| j|j | _|j| _| jrotj|j|j |jdd| _tj|j|j |jdd| _d S d S )NTbias)epselementwise_affine)super__init__nnLinearhidden_sizenum_attention_headshead_dimq_projnum_key_value_headsk_projv_projdenseo_projintpartial_rotary_factorrotary_ndimsqk_layernorm	LayerNormlayer_norm_epsq_layernormk_layernormselfr   r   	__class__ `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/phi/modular_phi.pyr#   &   s    zPhiAttention.__init__Nhidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionreturnc                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| jrB| |	}	| 	|
}
|\}}|	dd | j
f |	d| j
d f }}|
dd | j
f |
d| j
d f }}t||||\}}tj||fdd}	tj||fdd}
|d ur|||d}||
|| j|\}
}t}| jjdkrt| jj }|| |	|
||f| jsdn| j| jd	|\}}|jg |dR   }| |}||fS )
Nr   r   .)dim)sincosrA   eager        )dropoutscaling)shaper(   r)   view	transposer+   r,   r2   r5   r6   r1   r   torchcatupdater   r   r   _attn_implementationr   trainingattention_dropoutrJ   reshape
contiguousr-   )r8   r=   r>   r?   r@   rA   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesrF   rE   	query_rot
query_passkey_rotkey_passcache_kwargsattention_interfaceattn_outputattn_weightsr;   r;   r<   forward7   sN   	



zPhiAttention.forward)NN)__name__
__module____qualname__r   r/   r#   rN   Tensortupler   r   
LongTensorrd   __classcell__r;   r;   r9   r<   r   %   s"    r   c                   @      e Zd ZdS )PhiMLPNre   rf   rg   r;   r;   r;   r<   rm   u       rm   c                       s   e Zd Zdedef fddZ							ddejdeej d	eej	 d
ee
ej  dee dee deej	 dee
ejejf  de
ejee
ejejf  f fddZ  ZS )PhiDecoderLayerr   r   c                    sH   t    t||d| _t|| _tj|j|j	d| _
t|j| _d S )N)r   r    )r"   r#   r   	self_attnrm   mlpr$   r3   r&   r4   input_layernormDropoutresid_pdropresid_dropoutr7   r9   r;   r<   r#   z   s
   

zPhiDecoderLayer.__init__NFr=   r?   position_idsr@   output_attentions	use_cacherA   r>   rB   c	                 K   sr   |}
|  |}| jd||||||||d|	\}}| |}| | |}|| |
 }|f}|r7||f7 }|S )N)r=   r?   rx   r@   ry   rz   rA   r>   r;   )rt   rr   rw   rs   )r8   r=   r?   rx   r@   ry   rz   rA   r>   rV   residualattn_outputsself_attn_weightsfeed_forward_hidden_statesoutputsr;   r;   r<   rd      s*   
	


zPhiDecoderLayer.forward)NNNFFNN)re   rf   rg   r   r/   r#   rN   rh   r   rj   ri   boolFloatTensorrd   rk   r;   r;   r9   r<   rp   y   s8    
	rp   c                   @   rl   )PhiRotaryEmbeddingNrn   r;   r;   r;   r<   r      ro   r   c                   @   s   e Zd Zdd ZdS )PhiPreTrainedModelc                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|tjrX|jjd |jj	  d S d S )NrH   )meanstdg      ?)r   initializer_range
isinstancer$   r%   weightdatanormal_r   zero_	Embeddingpadding_idxr3   fill_)r8   moduler   r;   r;   r<   _init_weights   s   

z PhiPreTrainedModel._init_weightsN)re   rf   rg   r   r;   r;   r;   r<   r      s    r   c                       s   e Zd Zdef fddZ									ddeej deej deej dee	 d	eej
 d
ee dee dee deej dee defddZ  ZS )PhiModelr   c                    sV   t    t fddt jD | _t j| _	tj
 j jd| _| `d S )Nc                    s   g | ]}t  |qS r;   )rp   ).0r   r   r;   r<   
<listcomp>   s    z%PhiModel.__init__.<locals>.<listcomp>rq   )r"   r#   r$   
ModuleListrangenum_hidden_layerslayersru   
embd_pdropembed_dropoutr3   r&   r4   final_layernormnormr8   r   r9   r   r<   r#      s   zPhiModel.__init__N	input_idsr?   rx   past_key_valuesinputs_embedsrz   ry   output_hidden_statesrA   flash_attn_kwargsrB   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|rK|d u rKt
 }|	d u rg|d urW| nd}tj|||jd  |jd}	|d u rp|	d}t| j |||	||d}| |}|}| ||}|rdnd }|rdnd }| jd | j j D ]&}|r||f7 }||f||||||	|d	|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||d
S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )device)r   input_embedsr?   rA   r   rx   r;   )r?   rx   r@   ry   rz   rA   r>   )last_hidden_stater   r=   
attentions)r   ry   r   rz   
ValueErrorgradient_checkpointingrR   loggerwarning_onceembed_tokensr   get_seq_lengthrN   arangerK   r   	unsqueezer   r   
rotary_embr   r   r   r
   )r8   r   r?   rx   r   r   rz   ry   r   rA   r   past_seen_tokenscausal_maskr=   r>   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr;   r;   r<   rd      s   


	
	


zPhiModel.forward)	NNNNNNNNN)re   rf   rg   r   r#   r   rN   rj   rh   r   r   r   r   r   r
   rd   rk   r;   r;   r9   r<   r      sD    	
r   c                       s   e Zd Z fddZ  ZS )PhiForCausalLMc                    s&   t  | tj|j|jdd| _d S )NTr   )r"   r#   r$   r%   r&   
vocab_sizelm_headr   r9   r;   r<   r#   )  s   zPhiForCausalLM.__init__)re   rf   rg   r#   rk   r;   r;   r9   r<   r   (  s    r   c                   @   rl   )PhiForSequenceClassificationNrn   r;   r;   r;   r<   r   .  ro   r   c                   @   rl   )PhiForTokenClassificationNrn   r;   r;   r;   r<   r   2  ro   r   )r   r   r   r   r   )4typingr   r   rN   torch.nnr$   cache_utilsr   r   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr	   modeling_outputsr
   modeling_utilsr   processing_utilsr   utilsr   clip.modeling_clipr   llama.modeling_llamar   r   r   r   r   r   r   r   r   configuration_phir   
get_loggerre   r   _CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   rm   rp   r   r   r   r   r   r   __all__r;   r;   r;   r<   <module>   s6    ,
P0k