o
    wi-                     @   s  d Z ddlmZmZmZ ddlZddlZddlmZ ddlm	Z	 ddl
mZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& e'e(Z)G dd dej*Z+G dd deZ,G dd deZ-G dd de"Z.G dd de!Z/G dd de Z0G dd deZ1G d d! d!eZ2G d"d# d#eZ3g d$Z4dS )%zPyTorch Starcoder2 model.    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)logging   )
MistralAttentionMistralDecoderLayerMistralForCausalLM MistralForSequenceClassificationMistralForTokenClassificationMistralModelMistralPreTrainedModelMistralRotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )Starcoder2Configc                       s@   e Zd Zdef fddZdeeej  dejfddZ	  Z
S )Starcoder2MLPconfigc                    sT   t    |j}tj||j|jd| _tj|j||jd| _t	|j
 | _|j| _d S N)bias)super__init__hidden_sizer   Linearintermediate_sizeuse_biasc_fcc_projr   
hidden_actactresidual_dropout)selfr   	embed_dim	__class__ n/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/starcoder2/modular_starcoder2.pyr#   7   s   
zStarcoder2MLP.__init__hidden_statesreturnc                 C   s8   |  |}| |}| |}tjj|| j| jd}|S )Nptraining)r(   r+   r)   r   
functionaldropoutr,   r7   )r-   r3   r1   r1   r2   forward?   s
   


zStarcoder2MLP.forward)__name__
__module____qualname__r   r#   r   tupletorchFloatTensorr:   __classcell__r1   r1   r/   r2   r   6   s    &r   c                       s   e Zd Zddedee f fddZ		ddejde	ejejf deej d	ee
 d
eej dee de	ejeej ee	ej  f fddZ  ZS )Starcoder2AttentionNr   	layer_idxc                    s   t    |j| _tj|j|j| j |jd| _	tj|j|j
| j |jd| _tj|j|j
| j |jd| _tj|j| j |j|jd| _d S r    )r"   r#   r,   r   r%   r$   num_attention_headshead_dimr'   q_projnum_key_value_headsk_projv_projo_projr-   r   rC   r/   r1   r2   r#   H   s   
"zStarcoder2Attention.__init__r3   position_embeddingsattention_maskpast_key_valuecache_positionkwargsr4   c                 K   sF  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkret| jj }|| |	|
||f| jsqdn| j| jt| jdd d|\}}|jg |dR   }| |}tjj|| j| jd	}||fS )
Nr   r   )sincosrO   eager        sliding_window)r9   scalingrV   r5   )shaperE   rF   view	transposerH   rI   r   updaterC   r   r   _attn_implementationr   r7   attention_dropoutrW   getattrreshape
contiguousrJ   r   r8   r9   r,   )r-   r3   rL   rM   rN   rO   rP   input_shapehidden_shapequery_states
key_statesvalue_statesrS   rR   cache_kwargsattention_interfaceattn_outputattn_weightsr1   r1   r2   r:   P   s@   		


zStarcoder2Attention.forward)N)NN)r;   r<   r=   r   r   intr#   r?   Tensorr>   r   
LongTensorr   r   r:   rA   r1   r1   r/   r2   rB   G   s&    rB   c                       s&   e Zd Zdedef fddZ  ZS )Starcoder2DecoderLayerr   rC   c                    sP   t  |  t||d| _t|| _tj|j|j	d| _
tj|j|j	d| _d S )N)r   rC   eps)r"   r#   rB   	self_attnr   mlpr   	LayerNormr$   norm_epsiloninput_layernormpost_attention_layernormrK   r/   r1   r2   r#      s
   
zStarcoder2DecoderLayer.__init__)r;   r<   r=   r   rj   r#   rA   r1   r1   r/   r2   rm      s    rm   c                   @      e Zd ZdS )Starcoder2RotaryEmbeddingNr;   r<   r=   r1   r1   r1   r2   rw          rw   c                   @   s   e Zd Zdd ZdS )Starcoder2PreTrainedModelc                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|tjrX|jjd |jj	  d S d S )NrU   )meanstdg      ?)r   initializer_range
isinstancer   r%   weightdatanormal_r!   zero_	Embeddingpadding_idxrr   fill_)r-   moduler|   r1   r1   r2   _init_weights   s   

z'Starcoder2PreTrainedModel._init_weightsN)r;   r<   r=   r   r1   r1   r1   r2   rz      s    rz   c                       s   e Zd Zdef fddZ									ddeej deej deej dee	e
eej f  d	eej d
ee dee dee deej dee defddZ  ZS )Starcoder2Modelr   c                    sL   t    t fddt jD | _tj j j	d| _
 j| _d S )Nc                    s   g | ]}t  |qS r1   )rm   ).0rC   r   r1   r2   
<listcomp>   s    z,Starcoder2Model.__init__.<locals>.<listcomp>rn   )r"   r#   r   
ModuleListrangenum_hidden_layerslayersrr   r$   rs   normembedding_dropout)r-   r   r/   r   r2   r#      s   zStarcoder2Model.__init__N	input_idsrM   position_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsoutput_hidden_statesrO   flash_attn_kwargsr4   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|rK|d u rKt
 }|	d u rg|d urW| nd}tj|||jd  |jd}	|d u rp|	d}| j jd u rxtnt}|| j |||	||d}|}tjj|| j| jd}| ||}|rd	nd }|rd	nd }| jd | j j D ]&}|r||f7 }||f||||||	|d
|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )device)r   input_embedsrM   rO   r   r   r5   r1   )rM   r   rN   r   r   rO   rL   )last_hidden_stater   r3   
attentions)r   r   r   r   
ValueErrorgradient_checkpointingr7   loggerwarning_onceembed_tokensr	   get_seq_lengthr?   arangerX   r   	unsqueezerV   r
   r   r   r8   r9   r   
rotary_embr   r   r   r   )r-   r   rM   r   r   r   r   r   r   rO   r   past_seen_tokensmask_functioncausal_maskr3   rL   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr1   r1   r2   r:      s   

	

	


zStarcoder2Model.forward)	NNNNNNNNN)r;   r<   r=   r   r#   r   r?   rl   rk   r   r   listr@   boolr   r   r   r:   rA   r1   r1   r/   r2   r      sD    
	
r   c                   @   rv   )Starcoder2ForCausalLMNrx   r1   r1   r1   r2   r     ry   r   c                   @   rv   )#Starcoder2ForSequenceClassificationNrx   r1   r1   r1   r2   r     ry   r   c                   @   rv   ) Starcoder2ForTokenClassificationNrx   r1   r1   r1   r2   r     ry   r   )r   r   rz   r   r   )5__doc__typingr   r   r   r?   torch.utils.checkpointr   activationsr   cache_utilsr   r	   masking_utilsr
   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   mistral.modeling_mistralr   r   r   r   r   r   r   r   r   r   configuration_starcoder2r   
get_loggerr;   r   Moduler   rB   rm   rw   rz   r   r   r   r   __all__r1   r1   r1   r2   <module>   s4   0
:	m