o
    wi                     @   sD  d dl mZmZmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) e'*e+Z,G dd dej-Z.dd Z/dej0de1dej0fddZ2	d@dej-dej0dej0d ej0d!eej0 d"e3d#e3fd$d%Z4dAd&d'Z5G d(d) d)ej-Z6ed*G d+d, d,ej-Z7G d-d. d.eZ8e%G d/d0 d0e Z9G d1d2 d2ej-Z:e%G d3d4 d4e9Z;G d5d6 d6ee$Z<e%G d7d8 d8e9eZ=e%d9d:G d;d< d<e9Z>e%G d=d> d>e9Z?g d?Z@dS )B    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPastTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsauto_docstringcan_return_tuplelogging   )
Phi3Configc                       s2   e Zd Z fddZdejdejfddZ  ZS )Phi3MLPc                    sP   t    || _tj|jd|j dd| _tj|j|jdd| _t	|j
 | _d S )N   Fbias)super__init__configr   Linearhidden_sizeintermediate_sizegate_up_proj	down_projr   
hidden_actactivation_fnselfr%   	__class__ c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/phi3/modeling_phi3.pyr$   4   s
   
zPhi3MLP.__init__hidden_statesreturnc                 C   s4   |  |}|jddd\}}|| | }| |S )Nr    dim)r)   chunkr,   r*   )r.   r3   	up_statesgater1   r1   r2   forward<   s   

zPhi3MLP.forward)__name__
__module____qualname__r$   torchFloatTensorr;   __classcell__r1   r1   r/   r2   r   3   s    r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr5   r    r6   )shaper?   cat)xx1x2r1   r1   r2   rotate_halfE   s   rG   r3   n_repr4   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rB   expandreshape)r3   rH   batchnum_key_value_headsslenhead_dimr1   r1   r2   	repeat_kvL   s
   0rO           modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr    r   r5   )r7   dtype)ptrainingr   )rO   num_key_value_groupsr?   matmul	transposerB   r   
functionalsoftmaxfloat32torY   rW   r[   
contiguous)rQ   rR   rS   rT   rU   rV   rW   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr1   r1   r2   eager_attention_forwardX   s   
&rj   c                 C   s   | |}| |}|jd }| dd|f | d|df }}|dd|f |d|df }	}
tj|| t||  |gdd}tj|	| t|	|  |
gdd}||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    r5   .Nr6   )	unsqueezerB   r?   rC   rG   )qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embedr1   r1   r2   apply_rotary_pos_embr   s   


""""ry   c                       s   e Zd ZdZddedee f fddZ		ddej	de
ej	ej	f d	eej	 d
ee deej dee de
ej	eej	 ee
ej	  f fddZ  ZS )Phi3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNr%   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	|j| _| jd | _
|j| _d| _|j| j d|j| j   }tj|j| j |jdd| _tj|j|dd| _d S )NrN   g      Tr    Fr!   )r#   r$   r%   r{   getattrr'   num_attention_headsrN   rL   r\   rV   attention_dropout	is_causalr   r&   o_projqkv_proj)r.   r%   r{   op_sizer/   r1   r2   r$      s   
zPhi3Attention.__init__r3   position_embeddingsrU   past_key_valuecache_positionrd   r4   c                 K   s~  |j d d }g |d| jR }| |}	| jj| j }
|	dd |
f }|	d|
|
| j| j  f }|	d|
| j| j  d f }||dd}||dd}||dd}|\}}t||||\}}|d ur~|||d}|	||| j
|\}}t}| jjdkrt| jj }|| ||||f| jsdn| j| jt| jdd d	|\}}|jg |dR   }| |}||fS )
Nr5   .r   r    )ro   rn   r   eagerrP   sliding_window)rW   rV   r   )rB   rN   r   r%   r}   rL   viewr^   ry   updater{   rj   _attn_implementationr   r[   r~   rV   r|   rJ   rc   r   )r.   r3   r   rU   r   r   rd   input_shapehidden_shapeqkv	query_posquery_statesre   rf   rn   ro   cache_kwargsattention_interfaceri   rg   r1   r1   r2   r;      sD   	
	

zPhi3Attention.forwardN)NN)r<   r=   r>   __doc__r   r   intr$   r?   Tensortupler   
LongTensorr   r   r;   rA   r1   r1   r/   r2   rz      s(    rz   RMSNormc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	Phi3RMSNormư>c                    s&   t    tt|| _|| _dS )z:
        Phi3RMSNorm is equivalent to T5LayerNorm
        N)r#   r$   r   	Parameterr?   onesweightvariance_epsilon)r.   r'   epsr/   r1   r2   r$      s   

zPhi3RMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr    r5   T)keepdim)	rY   rb   r?   ra   powmeanrsqrtr   r   )r.   r3   input_dtypevariancer1   r1   r2   r;      s
   zPhi3RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r   r   rB   r   r.   r1   r1   r2   
extra_repr   s   zPhi3RMSNorm.extra_repr)r   )r<   r=   r>   r$   r;   r   rA   r1   r1   r/   r2   r      s    r   c                       s   e Zd Zdedef fddZ							ddejdeej d	eej	 d
ee
 dee dee deej	 deeejejf  dee deejeeejejf  f fddZ  ZS )Phi3DecoderLayerr%   r{   c                    st   t    |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
|| _t|j| _t|j| _d S )N)r%   r{   r   )r#   r$   r'   rz   	self_attnr   mlpr   rms_norm_epsinput_layernormpost_attention_layernormr%   r   Dropoutresid_pdropresid_attn_dropoutresid_mlp_dropout)r.   r%   r{   r/   r1   r2   r$      s   

zPhi3DecoderLayer.__init__NFr3   rU   rp   r   output_attentions	use_cacher   r   rd   r4   c	                 K   s   |}
|  |}| jd||||||||d|	\}}|
| | }|}
| |}| |}|
| | }|f}|r>||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
            past_key_value (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r3   rU   rp   r   r   r   r   r   Nr1   )r   r   r   r   r   r   )r.   r3   rU   rp   r   r   r   r   r   rd   residualself_attn_weightsoutputsr1   r1   r2   r;      s.   "
	



zPhi3DecoderLayer.forward)NNNFFNN)r<   r=   r>   r   r   r$   r?   r   r   r   r   boolr   r   r   r@   r;   rA   r1   r1   r/   r2   r      s<    	
r   c                   @   sP   e Zd ZeZdZdZdgZdgZdZ	dZ
dZdZdZdZdZdZdZdd ZdS )	Phi3PreTrainedModelmodelTr   past_key_valuesz0.0.5c                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|trQ|jjd d S d S )NrP   )r   stdg      ?)r%   initializer_range
isinstancer   r&   r   datanormal_r"   zero_	Embeddingpadding_idxr   fill_)r.   rQ   r   r1   r1   r2   _init_weightsI  s   


z!Phi3PreTrainedModel._init_weightsN)r<   r=   r>   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_3_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_attention_backend_versionr   r1   r1   r1   r2   r   8  s     r   c                       s8   e Zd Zddef fddZe edd Z  Z	S )Phi3RotaryEmbeddingNr%   c                    s   t    t|dr|jd ur|jd|jd| _nd| _|j| _|j| _|| _	t
| j | _| | j	|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r#   r$   hasattrr   getr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr%   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r.   r%   devicer   r/   r1   r2   r$   X  s   
zPhi3RotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r5   r   mpscpuF)device_typeenabledr    r6   )rY   )r   floatrI   rB   rb   r   r   r   strr?   autocastr^   rC   rn   r   ro   rY   )
r.   rD   rp   inv_freq_expandedposition_ids_expandedr   freqsembrn   ro   r1   r1   r2   r;   i  s   0&zPhi3RotaryEmbedding.forwardr   )
r<   r=   r>   r   r$   r?   no_gradr   r;   rA   r1   r1   r/   r2   r   W  s
    r   c                       s   e Zd Zdef fddZdd Zdd Zee									dd	e	e
j d
e	e
j de	e
j de	e de	e
j de	e de	e de	e de	e
j dee defddZ  ZS )	Phi3Modelr%   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r1   )r   ).0r{   r%   r1   r2   
<listcomp>  s    z&Phi3Model.__init__.<locals>.<listcomp>r   r   F)r#   r$   pad_token_idr   
vocab_sizer   r   r'   embed_tokens
ModuleListrangenum_hidden_layerslayersr   r   normr   
rotary_embgradient_checkpointing	post_initr-   r/   r   r2   r$   {  s   zPhi3Model.__init__c                 C      | j S r   r   r   r1   r1   r2   get_input_embeddings     zPhi3Model.get_input_embeddingsc                 C   
   || _ d S r   r  r.   rT   r1   r1   r2   set_input_embeddings     
zPhi3Model.set_input_embeddingsN	input_idsrU   rp   r   inputs_embedsr   r   output_hidden_statesr   flash_attn_kwargsr4   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}t	|t
d tfsFtd|d u rO| |}|rX|d u rXt }|	d u rt|d urd| nd}tj|||jd  |jd}	|d u r}|	d}| j jd u rtnt}|| j |||	||d}|}| ||}|rd	nd }|rd	nd }| jd | j j D ]&}|r||f7 }||f||||||	|d
|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBThe `past_key_values` should be either a `Cache` object or `None`.r   r   r   )r%   input_embedsrU   r   r   rp   r1   )rU   rp   r   r   r   r   r   )last_hidden_stater   r3   
attentions)r%   r   r
  r   
ValueErrorr   r[   loggerwarning_oncer   r   r   r   r	   get_seq_lengthr?   arangerB   r   rk   r   r   r   r   r   r   r   r   )r.   r  rU   rp   r   r	  r   r   r
  r   r  past_seen_tokensmask_functionrh   r3   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr1   r1   r2   r;     s   

	
	


zPhi3Model.forward	NNNNNNNNN)r<   r=   r>   r   r$   r  r  r   r   r   r?   r   r   r   r@   r   r   r   r   r;   rA   r1   r1   r/   r2   r   y  sL    	
r   c                   @   s   e Zd ZdS )KwargsForCausalLMN)r<   r=   r>   r1   r1   r1   r2   r    s    r  c                       s&  e Zd ZdgZddiZddgdgfiZ fddZdd	 Zd
d Zdd Z	dd Z
dd Zdd Zee											d(deej deej deej dee deej deej dee dee dee deej d eeejf d!ee d"efd#d$Z						%	d) fd&d'	Z  ZS )*Phi3ForCausalLMzlm_head.weightlm_headcolwise_repr3   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S NFr!   )
r#   r$   r   r   r   r   r&   r'   r  r   r-   r/   r1   r2   r$     s
   
zPhi3ForCausalLM.__init__c                 C      | j jS r   r   r   r   r1   r1   r2   r  
     z$Phi3ForCausalLM.get_input_embeddingsc                 C      || j _d S r   r#  r  r1   r1   r2   r       z$Phi3ForCausalLM.set_input_embeddingsc                 C   r   r   r  r   r1   r1   r2   get_output_embeddings  r  z%Phi3ForCausalLM.get_output_embeddingsc                 C   r  r   r'  )r.   new_embeddingsr1   r1   r2   set_output_embeddings  r  z%Phi3ForCausalLM.set_output_embeddingsc                 C   r  r   r   )r.   decoderr1   r1   r2   set_decoder  r  zPhi3ForCausalLM.set_decoderc                 C   r   r   r+  r   r1   r1   r2   get_decoder  r  zPhi3ForCausalLM.get_decoderNr   r  rU   rp   r   r	  labelsr   r   r
  r   logits_to_keeprd   r4   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }d}|durX| j	d||| j j
d|}t|||j|j|jdS )an  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Phi3ForCausalLM

        >>> model = Phi3ForCausalLM.from_pretrained("meta-phi3/Phi3-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-phi3/Phi3-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r  rU   rp   r   r	  r   r   r
  r   )r   r/  r   lossr   r   r3   r  r1   )r%   r   r
  r   r  r   r   slicer  loss_functionr   r   r   r3   r  )r.   r  rU   rp   r   r	  r/  r   r   r
  r   r0  rd   r   r3   slice_indicesr   r2  r1   r1   r2   r;     s:   '
zPhi3ForCausalLM.forwardTc	                    sb   |r| j jr|jd | j jd kr|d }
|
| j jkrd }t jd||||||||d|	}|S )Nr   r   )r  r   rU   r	  r   rp   r   r0  r1   )r%   r   rB    original_max_position_embeddingsr#   prepare_inputs_for_generation)r.   r  r   rU   r	  r   rp   r   r0  rd   past_lengthmodel_inputsr/   r1   r2   r7  g  s*   	z-Phi3ForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNr   )NNNNNTN)r<   r=   r>   _tied_weights_keys_tp_plan_pp_planr$   r  r  r(  r*  r-  r.  r   r   r   r?   r   r   r   r@   r   r   r   r   r  r   r;   r7  rA   r1   r1   r/   r2   r    sv    		
Lr  a  
    The Phi3 Model transformer with a sequence classification head on top (linear layer).

    [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                          e Zd Z fddZdd Zdd Zee									ddee	j
 d	ee	j d
ee	j
 dee dee	j dee	j
 dee dee dee defddZ  ZS )Phi3ForSequenceClassificationc                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S r!  )
r#   r$   
num_labelsr   r   r   r&   r'   scorer   r-   r/   r1   r2   r$     s
   
z&Phi3ForSequenceClassification.__init__c                 C   r"  r   r#  r   r1   r1   r2   r    r$  z2Phi3ForSequenceClassification.get_input_embeddingsc                 C   r%  r   r#  r  r1   r1   r2   r    r&  z2Phi3ForSequenceClassification.set_input_embeddingsNr  rU   rp   r   r	  r/  r   r   r
  r4   c
              
   C   s(  | j ||||||||	d}
|
j}| |}|dur|jd }n|jd }| jjdu r2|dkr2td| jjdu r;d}n1|dur`|| jjk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d |t	j||jd	|f }d}|dur| j|||| jd
}t|||
j|
j|
jdS )  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        rU   rp   r   r	  r   r   r
  Nr   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r5   )r   rY   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )r   r/  pooled_logitsr%   r1  )r   r  rA  rB   r%   r   r  rb   r   r?   int32r  argmaxr  r  r0   r<   r4  r   r   r3   r  )r.   r  rU   rp   r   r	  r/  r   r   r
  transformer_outputsr3   r   
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesrD  r2  r1   r1   r2   r;     sL   


z%Phi3ForSequenceClassification.forwardr  )r<   r=   r>   r$   r  r  r   r   r   r?   r   r   r   r@   r   r   r;   rA   r1   r1   r/   r2   r?    sH    		
r?  c                       r>  )Phi3ForTokenClassificationc                    s|   t  | |j| _t|| _t|dd d ur|j}nt|dd d ur'|j}nd}t	|| _
t|j|j| _|   d S )Nclassifier_dropouthidden_dropoutg?)r#   r$   r@  r   r   r|   rM  rN  r   r   rW   r&   r'   rA  r   )r.   r%   rM  r/   r1   r2   r$     s   
z#Phi3ForTokenClassification.__init__c                 C   r"  r   r#  r   r1   r1   r2   r    r$  z/Phi3ForTokenClassification.get_input_embeddingsc                 C   r%  r   r#  r  r1   r1   r2   r    r&  z/Phi3ForTokenClassification.set_input_embeddingsNr  rU   rp   r   r	  r/  r   r   r
  r4   c
              
   C   sd   | j ||||||||	d}
|
j}| |}| |}d}|dur(| ||| j}t|||
j|
jdS )rB  rC  N)r2  r   r3   r  )	r   r  rW   rA  r4  r%   r   r3   r  )r.   r  rU   rp   r   r	  r/  r   r   r
  r   sequence_outputr   r2  r1   r1   r2   r;     s,   


z"Phi3ForTokenClassification.forwardr  )r<   r=   r>   r$   r  r  r   r   r   r?   r   r   r   r@   r   r   r;   rA   r1   r1   r/   r2   rL    sH    	
rL  )r   r   r  r?  rL  )rP   )Nr   )Atypingr   r   r   r?   r   activationsr   cache_utilsr   r	   
generationr
   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   configuration_phi3r   
get_loggerr<   r  Moduler   rG   r   r   rO   r   rj   ry   rz   r   r   r   r   r   r  r  r?  rL  __all__r1   r1   r1   r2   <module>   sr   


 EL"~ VF