o
    i&                     @   s  d Z ddlmZmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) e*e+Z,G dd dej-Z.G dd deZ/G dd de Z0G dd de%Z1G dd de$Z2G dd de!Z3G d d! d!e"Z4G d"d# d#e#Z5g d$Z6dS )%zPyTorch Starcoder2 model.    )CallableOptionalUnionN)nn)check_model_inputs   )ACT2FN)CacheDynamicCache)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)deprecate_kwarg   )	MistralAttentionMistralDecoderLayerMistralForCausalLM MistralForSequenceClassificationMistralForTokenClassificationMistralModelMistralRotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )Starcoder2Configc                       s@   e Zd Zdef fddZdeeej  dejfddZ	  Z
S )Starcoder2MLPconfigc                    sT   t    |j}tj||j|jd| _tj|j||jd| _t	|j
 | _|j| _d S )Nbias)super__init__hidden_sizer   Linearintermediate_sizeuse_biasc_fcc_projr   
hidden_actactresidual_dropout)selfr!   	embed_dim	__class__ n/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/starcoder2/modular_starcoder2.pyr%   8   s   
zStarcoder2MLP.__init__hidden_statesreturnc                 C   s8   |  |}| |}| |}tjj|| j| jd}|S )Nptraining)r*   r-   r+   r   
functionaldropoutr.   r9   )r/   r5   r3   r3   r4   forward@   s
   


zStarcoder2MLP.forward)__name__
__module____qualname__r   r%   r   tupletorchFloatTensorr<   __classcell__r3   r3   r1   r4   r    7   s    &r    c                       s   e Zd Zddedee f fddZedddd			dd
ej	de
ej	ej	f deej	 dee deej dee de
ej	eej	 ee
ej	  f fddZ  ZS )Starcoder2AttentionNr!   	layer_idxc                    s   t  j||d |j| _tj|j|j| j |jd| _	tj|j|j
| j |jd| _tj|j|j
| j |jd| _tj|j| j |j|jd| _d S )Nr!   rE   r"   )r$   r%   r.   r   r'   r&   num_attention_headshead_dimr)   q_projnum_key_value_headsk_projv_projo_projr/   r!   rE   r1   r3   r4   r%   I   s   "zStarcoder2Attention.__init__past_key_valuepast_key_valuesz4.58)new_nameversionr5   position_embeddingsattention_maskcache_positionkwargsr6   c                 K   sF  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkret| jj }|| |	|
||f| jsqdn| j| jt| jdd d|\}}|jg |dR   }| |}tjj|| j| jd	}||fS )
Nr   r   )sincosrU   eagerg        sliding_window)r;   scalingr[   r7   )shaperH   rI   view	transposerK   rL   r   updaterE   r   r!   _attn_implementationr   r9   attention_dropoutr\   getattrreshape
contiguousrM   r   r:   r;   r.   )r/   r5   rS   rT   rP   rU   rV   input_shapehidden_shapequery_states
key_statesvalue_statesrY   rX   cache_kwargsattention_interfaceattn_outputattn_weightsr3   r3   r4   r<   Q   s@   
	


zStarcoder2Attention.forward)N)NN)r=   r>   r?   r   r   intr%   r   rA   Tensorr@   r	   
LongTensorr   r   r<   rC   r3   r3   r1   r4   rD   H   s(    rD   c                       s&   e Zd Zdedef fddZ  ZS )Starcoder2DecoderLayerr!   rE   c                    sR   t  || t||d| _t|| _tj|j|j	d| _
tj|j|j	d| _d S )NrF   eps)r$   r%   rD   	self_attnr    mlpr   	LayerNormr&   norm_epsiloninput_layernormpost_attention_layernormrN   r1   r3   r4   r%      s
   
zStarcoder2DecoderLayer.__init__)r=   r>   r?   r   ro   r%   rC   r3   r3   r1   r4   rr      s    rr   c                   @      e Zd ZdS )Starcoder2RotaryEmbeddingNr=   r>   r?   r3   r3   r3   r4   r|          r|   c                       s   e Zd Zdef fddZe							ddeej deej	 deej dee
eeej f  d	eej d
ee deej dee defddZ  ZS )Starcoder2Modelr!   c                    sL   t    t fddt jD | _tj j j	d| _
 j| _d S )Nc                    s   g | ]}t  |qS r3   )rr   ).0rE   r!   r3   r4   
<listcomp>   s    z,Starcoder2Model.__init__.<locals>.<listcomp>rs   )r$   r%   r   
ModuleListrangenum_hidden_layerslayersrw   r&   rx   normembedding_dropout)r/   r!   r1   r   r4   r%      s   zStarcoder2Model.__init__N	input_idsrT   position_idsrP   inputs_embeds	use_cacherU   rV   r6   c              
   K   s4  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}| jj
d u rNtnt}
|
| j|||||d}|}tjj|| j| jd}| ||}| jd | jj D ]}||f||||||d|}qw| |}t||r|d	S d d	S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )device)r!   input_embedsrT   rU   rP   r   r7   )rT   r   rP   r   rU   rS   )last_hidden_staterP   )
ValueErrorembed_tokensr
   r!   get_seq_lengthrA   aranger]   r   	unsqueezer[   r   r   r   r:   r;   r   r9   
rotary_embr   r   r   r   )r/   r   rT   r   rP   r   r   rU   rV   past_seen_tokensmask_functioncausal_maskr5   rS   decoder_layerr3   r3   r4   r<      s^   

	

zStarcoder2Model.forward)NNNNNNN)r=   r>   r?   r   r%   r   r   rA   rq   rp   r   r	   listrB   boolr   r   r   r<   rC   r3   r3   r1   r4   r      s:    	
r   c                   @   r{   )Starcoder2ForCausalLMNr}   r3   r3   r3   r4   r      r~   r   c                   @   r{   )#Starcoder2ForSequenceClassificationNr}   r3   r3   r3   r4   r      r~   r   c                   @   r{   ) Starcoder2ForTokenClassificationNr}   r3   r3   r3   r4   r      r~   r   )r   r   Starcoder2PreTrainedModelr   r   )7__doc__typingr   r   r   rA   r   transformers.utils.genericr   activationsr   cache_utilsr	   r
   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   utils.deprecationr   mistral.modeling_mistralr   r   r   r   r   r   r   r   r   configuration_starcoder2r   
get_loggerr=   loggerModuler    rD   rr   r|   r   r   r   r   __all__r3   r3   r3   r4   <module>   s4   ,
;	L