o
    isV                     @   s  d dl mZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZmZ dd
lmZmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) e#*e+Z,dd Z-d9ddZ.dej/de0dej/fddZ1	d:dej2dej/dej/dej/d eej/ d!e3d"e3d#ee  fd$d%Z4G d&d' d'ej2Z5G d(d) d)ej2Z6G d*d+ d+eZ7G d,d- d-ej2Z8e!G d.d/ d/eZ9e!G d0d1 d1e9Z:e!G d2d3 d3e9eZ;G d4d5 d5ee9Z<G d6d7 d7ee9Z=g d8Z>dS );    )CallableOptionalUnionN   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask) GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)check_model_inputs   )	PhiConfigc                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1x2 r'   X/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/phi/modeling_phi.pyrotate_half"   s   r)   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer)   )qkcossinposition_idsunsqueeze_dimq_embedk_embedr'   r'   r(   apply_rotary_pos_emb)   s
   

r3   hidden_statesn_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r!   expandreshape)r4   r5   batchnum_key_value_headsslenhead_dimr'   r'   r(   	repeat_kvD   s
   0r=           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr   r   r   )r    dtype)ptrainingr   )r=   num_key_value_groupsr"   matmul	transposer!   nn
functionalsoftmaxfloat32torH   rE   rJ   
contiguous)r?   r@   rA   rB   rC   rD   rE   rF   
key_statesvalue_statesattn_weightscausal_maskattn_outputr'   r'   r(   eager_attention_forwardP   s   
&rY   c                       s   e Zd ZdZdedef fddZedddd		
	
ddej	de
ej	ej	f deej	 dee deej de
ej	eej	 f fddZ  ZS )PhiAttentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    s$  t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j| j |jdd| _t| j|j | _|j| _| jrtj|j|j |jdd| _tj|j|j |jdd| _d S d S )Nr<   g      Tbias)epselementwise_affine)super__init__r[   r\   getattrhidden_sizenum_attention_headsr<   r:   rK   rD   attention_dropout	is_causalrN   Linearq_projk_projv_projdenseintpartial_rotary_factorrotary_ndimsqk_layernorm	LayerNormlayer_norm_epsq_layernormk_layernormselfr[   r\   	__class__r'   r(   rb   m   s,   
zPhiAttention.__init__past_key_valuepast_key_values4.58new_nameversionNr4   position_embeddingsrC   cache_positionr6   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| jrB| |	}	| 	|
}
|\}}|	dd | j
f |	d| j
d f }}|
dd | j
f |
d| j
d f }}t||||\}}tj||fdd}	tj||fdd}
|d ur|||d}||
|| j|\}
}t}| jjdkrt| jj }|| |	|
||f| jsdn| j| jd	|\}}|jg |dR   }| |}||fS )
Nr   r   r   .r   )r.   r-   r   eagerr>   )rE   rD   )r!   r<   ri   viewrM   rj   rk   rp   rs   rt   ro   r3   r"   r#   updater\   rY   r[   _attn_implementationr   rJ   rf   rD   r8   rS   rl   )rv   r4   r   rC   rz   r   rF   input_shapehidden_shapequery_statesrT   rU   r-   r.   	query_rot
query_passkey_rotkey_passcache_kwargsattention_interfacerX   rV   r'   r'   r(   forward   sN   




zPhiAttention.forward)NN)__name__
__module____qualname____doc__r   rm   rb   r   r"   Tensortupler   r   
LongTensorr   __classcell__r'   r'   rw   r(   rZ   j   s&    rZ   c                       s2   e Zd Z fddZdejdejfddZ  ZS )PhiMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)ra   rb   r[   r   
hidden_actactivation_fnrN   rh   rd   intermediate_sizefc1fc2rv   r[   rw   r'   r(   rb      s
   
zPhiMLP.__init__r4   r6   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )rv   r4   r'   r'   r(   r      s   


zPhiMLP.forward)r   r   r   rb   r"   r   r   r   r'   r'   rw   r(   r      s    r   c                       s   e Zd Zdedef fddZedddd							
	
				ddejde	ej de	ej
 de	e de	e de	e de	ej
 de	eejejf  deeje	eejejf  f fddZ  ZS )PhiDecoderLayerr[   r\   c                    sH   t    t||d| _t|| _tj|j|j	d| _
t|j| _d S )N)r\   r_   )ra   rb   rZ   	self_attnr   mlprN   rq   rd   rr   input_layernormDropoutresid_pdropresid_dropoutru   rw   r'   r(   rb      s
   

zPhiDecoderLayer.__init__ry   rz   r{   r|   NFr4   rC   r/   output_attentions	use_cacher   r   r6   c	                 K   sr   |}
|  |}| jd||||||||d|	\}}| |}| | |}|| |
 }|f}|r7||f7 }|S )N)r4   rC   r/   rz   r   r   r   r   r'   )r   r   r   r   )rv   r4   rC   r/   rz   r   r   r   r   rF   residualattn_outputsself_attn_weightsfeed_forward_hidden_statesoutputsr'   r'   r(   r      s*   
	


zPhiDecoderLayer.forward)NNNFFNN)r   r   r   r   rm   rb   r   r"   r   r   r   r   boolr   FloatTensorr   r   r'   r'   rw   r(   r      s:    	r   c                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	PhiRotaryEmbeddinginv_freqNr[   c                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)ra   rb   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr[   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)rv   r[   devicer   rw   r'   r(   rb     s   
zPhiRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r   r   mpscpuF)device_typeenabledr   r   )rH   )r   floatr7   r!   rR   r   r   r   strr"   autocastrM   r#   r-   r   r.   rH   )
rv   r$   r/   inv_freq_expandedposition_ids_expandedr   freqsembr-   r.   r'   r'   r(   r     s   0&zPhiRotaryEmbedding.forwardr   )r   r   r   r"   r   __annotations__r   rb   no_gradr   r   r   r'   r'   rw   r(   r     s   
 
r   c                   @   sH   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZdS )PhiPreTrainedModelr[   modelTr   rz   )r4   
attentionsN)r   r   r   r   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   rZ   _can_record_outputsr'   r'   r'   r(   r   '  s   
 
r   c                       s   e Zd Zdef fddZee									ddeej	 deej
 deej	 dee d	eej d
ee dee dee deej	 dee defddZ  ZS )PhiModelr[   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t d| _d| _t j| _tj j jd| _|   d S )Nc                    s   g | ]}t  |qS r'   )r   ).0r\   r[   r'   r(   
<listcomp>C  s    z%PhiModel.__init__.<locals>.<listcomp>r   Fr   )ra   rb   pad_token_idpadding_idx
vocab_sizerN   	Embeddingrd   embed_tokens
ModuleListrangenum_hidden_layerslayersr   
rotary_embgradient_checkpointingr   
embd_pdropembed_dropoutrq   rr   final_layernorm	post_initr   rw   r   r(   rb   <  s   zPhiModel.__init__N	input_idsrC   r/   rz   inputs_embedsr   r   output_hidden_statesr   rF   r6   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|rN|d u rNt
| j d}|	d u rj|d urZ| nd}tj|||jd  |jd}	|d u rs|	d}t| j |||	||d}| |}|}| ||}|rd	nd }|rd	nd }| jd | j j D ]&}|r||f7 }||f||||||	|d
|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   r   )r   )r[   input_embedsrC   r   rz   r/   r'   )rC   r/   rz   r   r   r   r   )last_hidden_staterz   r4   r   )r[   r   r   r   
ValueErrorr   rJ   loggerwarning_oncer   r   get_seq_lengthr"   aranger!   r   r*   r
   r   r   r   r   r   r   )rv   r   rC   r/   rz   r   r   r   r   r   rF   past_seen_tokensrW   r4   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr'   r'   r(   r   M  s   


	
	


zPhiModel.forward)	NNNNNNNNN)r   r   r   r   rb   r   r   r   r"   r   r   r   r   r   r   r   r   r   r   r'   r'   rw   r(   r   :  sH    	
r   c                       s   e Zd ZdgZddiZddgdgfiZ fddZee										dd
e	e
j de	e
j de	e
j de	e de	e
j de	e
j de	e de	e
j deee
jf dee defddZ  ZS )PhiForCausalLMzlm_head.weightlm_headcolwise_repr4   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S )NTr]   )
ra   rb   r   r   r   rN   rh   rd   r   r   r   rw   r'   r(   rb     s
   
zPhiForCausalLM.__init__Nr   r   rC   r/   rz   r   labelsr   r   logits_to_keeprF   r6   c
              
   K   s   | j d|||||||d|
}|j}t|	trt|	 dn|	}| |dd|ddf }d}|durB| jd||| jjd|
}t	|||j
|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, PhiForCausalLM

        >>> model = PhiForCausalLM.from_pretrained("meta-phi/Phi-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-phi/Phi-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r   rC   r/   rz   r   r   r   N)r  r  r   )lossr  rz   r4   r   r'   )r   r   r   rm   slicer   loss_functionr[   r   r   rz   r4   r   )rv   r   rC   r/   rz   r   r  r   r   r  rF   r   r4   slice_indicesr  r  r'   r'   r(   r     s0    zPhiForCausalLM.forward)	NNNNNNNNr   )r   r   r   _tied_weights_keys_tp_plan_pp_planrb   r   r   r   r"   r   r   r   r   r   r   rm   r   r   r   r   r   r'   r'   rw   r(   r     sN    		
r   c                   @      e Zd ZdS )PhiForSequenceClassificationNr   r   r   r'   r'   r'   r(   r        r  c                   @   r  )PhiForTokenClassificationNr  r'   r'   r'   r(   r     r  r  )r   r   r   r  r  )Nr   )r>   )?typingr   r   r   r"   torch.nnrN   activationsr   cache_utilsr   r   
generationr	   masking_utilsr
   modeling_layersr   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   configuration_phir   
get_loggerr   r   r)   r3   r   rm   r=   Moduler   rY   rZ   r   r   r   r   r   r   r  r  __all__r'   r'   r'   r(   <module>   sf   


Y1$uK