o
    wi{                     @   s  d Z ddlmZmZ ddlZddlZddlmZ ddlmZ ddl	m
Z
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ e reddlmZ ddlm Z  e!e"Z#de$de$dej%fddZ&dej%dej%fddZ'dej%dej%dej%dej%fddZ(G dd  d ej)Z*G d!d" d"ej)Z+G d#d$ d$eZ,eG d%d& d&eZ-eG d'd( d(e-Z.ed)d*G d+d, d,e-eZ/g d-Z0dS ).zPyTorch CodeGen model.    )OptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)auto_docstringis_torch_flex_attn_availablelogging   )CodeGenConfig)	BlockMask)make_flex_block_causal_masknum_posdimreturnc                 C   s`   ddt jd|dt jd|   }t dt j| t jd | }t jt |t |fddS )	N      ?i'  r      dtypezi , j -> i jr   r   )torcharangeint64einsumfloatcatsincos)r   r   inv_freqsinusoid_inp r(   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/codegen/modeling_codegen.pycreate_sinusoidal_positions0   s    "r*   xc                 C   sb   | d d d d d d d d df }| d d d d d d dd df }t j| |fdd} | dS )Nr   r   r   )r   stackflatten)r+   x1x2r(   r(   r)   rotate_every_two7   s   ""
r2   tensorr$   r%   c                 C   s`   t |d d d d d d d f dd}t |d d d d d d d f dd}| | t| |  S )Nr   r   )r   repeat_interleaver2   )r3   r$   r%   r(   r(   r)   apply_rotary_pos_emb?   s   &&r5   c                       s   e Zd Zd fdd	Zdd Zdd Z		ddd	Z					
	
	ddeej	 dee
 deej	 deej deej	 dee dee deej deeejeej f eeejeej eejdf f  f fddZ  ZS )CodeGenAttentionNc                    s  t    |j}t|j| _t|j| _|| _	|d u r(t
d| jj d |j| _|j| _| j| j | _| j| j | jkrMtd| j d| j dttj| jtjdt | _tj| j| jd dd	| _tj| j| jdd	| _|j| _| jp| j}t||| _d S )
NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.zEembed_dim must be divisible by num_attention_heads (got `embed_dim`: z and `num_attention_heads`: z).r   r   F)bias) super__init__max_position_embeddingsr   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropout	layer_idxloggerwarning_once	__class____name__hidden_size	embed_dimnum_attention_headshead_dim
ValueErrorr   sqrtr3   float32toget_default_dtype
scale_attnLinearqkv_projout_proj
rotary_dimr*   embed_positions)selfconfigr@   max_positionspos_embd_dimrC   r(   r)   r9   F   s0   

$zCodeGenAttention.__init__c                 C   sJ   | |jd d || |f }| |jd d d |jdd   }|S )Nr,   r-   )r,   )reshapeshape)rT   r+   n_headdim_headmp_numreshapedr(   r(   r)   _split_headsd   s    &zCodeGenAttention._split_headsc                 C   s   t |jdkr|ddddd }nt |jdkr%|dddd }n
tdt |j | dd	 || f }||S )
zM
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
           r   r   r   r      z3Input tensor rank should be one of [4, 5], but is: Nr-   )lenrZ   permute
contiguousrI   sizeview)rT   r3   rG   attn_head_size	new_shaper(   r(   r)   _merge_headsi   s   
zCodeGenAttention._merge_headsc           	      C   s   | tj}| tj}t||dd}|d ur1|d d d d d d d |jd f }||7 }|| j }tjdd|}| |j	}| 
|}|d urQ|| }t||}||fS )Nr,   r-   r   )rL   r   rK   matmul	transposerZ   rN   r   Softmaxr   r=   )	rT   querykeyvalueattention_mask	head_maskattn_weightscausal_maskattn_outputr(   r(   r)   _attnv   s   	&

zCodeGenAttention._attnFhidden_states
layer_pastrp   position_idsrq   	use_cacheoutput_attentionscache_positionr   .c	                 C   s  |  |}	d}
|	|	jd d |
df }| j| j |
 }tj||dd\}}}| j|| j| j|
d}| j|| j| j|
d}| j|| j| j|
d}|dddd}| j	}|j
|j
krc||j
}|| _	|| }tj||jd d dd\}}| jd ur|d d d d d d d | jf }|d d d d d d | jd f }|d d d d d d d | jf }|d d d d d d | jd f }t|||}t|||}tj||gdd}tj||gdd}nt|||}t|||}|dddd}|dddd}|d ur||| j|d	}|||j|| j|\}}| |||||\}}| || j| j}| |}| |}||f}|r@||f7 }|S )
Nra   r,   r   )r]   r   r   r   r   )r$   r%   partial_rotation_sizer{   )rP   rY   rZ   rH   rG   r   splitr_   rc   rS   devicerL   rR   r5   r#   updater   r@   ru   ri   rQ   r?   )rT   rv   rw   rp   rx   rq   ry   rz   r{   qkvr]   	qkv_split	local_dimrm   ro   rn   rS   sincosr$   r%   k_rotk_passq_rotq_passcache_kwargsrt   rr   outputsr(   r(   r)   forward   sV   

""""



zCodeGenAttention.forwardN)NNNNNNFFN)rD   
__module____qualname__r9   r_   ri   ru   r   r   FloatTensorr   
LongTensorboolr   tupleTensorr   __classcell__r(   r(   rX   r)   r6   E   sJ    
"	
"r6   c                       s6   e Zd Z fddZdeej dejfddZ  ZS )
CodeGenMLPc                    sJ   t    |j}t||| _t||| _t|j | _	t
|j| _d S r   )r8   r9   n_embdr   rO   fc_infc_outr   activation_functionactr;   r>   dropout)rT   intermediate_sizerU   rF   rX   r(   r)   r9      s   
zCodeGenMLP.__init__rv   r   c                 C   s,   |  |}| |}| |}| |}|S r   )r   r   r   r   )rT   rv   r(   r(   r)   r      s
   



zCodeGenMLP.forward)	rD   r   r   r9   r   r   r   r   r   r(   r(   rX   r)   r      s    "
r   c                       s   e Zd Zd fdd	Z							ddeej dee deej deej d	eej d
ee	 dee	 deej de
eej eeejeejdf f  f fddZ  ZS )CodeGenBlockNc                    sT   t    |jd ur|jnd|j }tj|j|jd| _t||| _	t
||| _d S )Nra   eps)r8   r9   n_innerr   r   	LayerNormlayer_norm_epsilonln_1r6   attnr   mlp)rT   rU   r@   	inner_dimrX   r(   r)   r9      s
   
zCodeGenBlock.__init__Frv   rw   rp   rx   rq   ry   rz   r{   r   .c	              
   C   sz   |}	|  |}| j||||||||d}
|
d }|
dd  }| |}|| |	 }|r2|f| }|S |f|dd   }|S )N)rv   rw   rp   rx   rq   ry   rz   r{   r   r   )r   r   r   )rT   rv   rw   rp   rx   rq   ry   rz   r{   residualattn_outputsrt   r   feed_forward_hidden_statesr(   r(   r)   r     s*   



zCodeGenBlock.forwardr   r   )rD   r   r   r9   r   r   r   r   r   r   r   r   r   r   r   r(   r(   rX   r)   r      s8    
	(
r   c                       sF   e Zd ZeZdZdZdgZdZdZ	dZ
dZ fddZdd Z  ZS )	CodeGenPreTrainedModeltransformerTr   past_key_valuesc                    s   t  j|i | d S r   )r8   r9   )rT   inputskwargsrX   r(   r)   r9   2  s   zCodeGenPreTrainedModel.__init__c                 C   s   t |tjfr!|jjjd| jjd |jdur|jj	  dS dS t |tj
rD|jjjd| jjd |jdurB|jj|j 	  dS dS t |tjrY|jj	  |jjd dS dS )zInitialize the weights.        )meanstdNr   )
isinstancer   rO   weightdatanormal_rU   initializer_ranger7   zero_	Embeddingpadding_idxr   fill_)rT   moduler(   r(   r)   _init_weights5  s   

z$CodeGenPreTrainedModel._init_weights)rD   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_quantized_cache_supports_static_cacher9   r   r   r(   r(   rX   r)   r   '  s    r   c                       s<  e Zd Z fddZdd Zdd Ze												d"deej	 d	ee
eeeej  f  d
eej deej	 deej	 deej deej dee dee dee dee deej	 de
eef fddZ	d#d
e
ejdf dejdejd	edef
ddZed
ejdededejdejdefd d!Z  ZS )$CodeGenModelc                    s   t     j| _ j| _t j| j| _t j	| _
t fddt jD | _tj| j jd| _t j j j | _d| _|   d S )Nc                    s   g | ]}t  |d qS ))r@   )r   ).0irU   r(   r)   
<listcomp>O  s    z)CodeGenModel.__init__.<locals>.<listcomp>r   F)r8   r9   r   rF   
vocab_sizer   r   wter;   
embd_pdropdrop
ModuleListrangen_layerhr   r   ln_fminrR   n_ctxrG   gradient_checkpointing	post_initrT   rU   rX   r   r)   r9   H  s    zCodeGenModel.__init__c                 C      | j S r   r   rT   r(   r(   r)   get_input_embeddingsX     z!CodeGenModel.get_input_embeddingsc                 C   
   || _ d S r   r   rT   new_embeddingsr(   r(   r)   set_input_embeddings[     
z!CodeGenModel.set_input_embeddingsN	input_idsr   rp   token_type_idsrx   rq   inputs_embedsry   rz   output_hidden_statesreturn_dictr{   r   c                 K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|du |duA r4td| jrC| jrC|rCt	d d}|du rL| 
|}d}|rit|tsid}|du r_t }n
t|}t	d |jd }|du r|durz| nd}tj||| |jd	}|du r|d}| |||||	}| || j j}|}|dur|d
|}| 
|}|| }| |}d
||d
f}d}|	rdnd}|
rdnd}t| jD ]4\}}|
r||f }||||||| ||	|d}|d }|du r|d }|	r
|||rdnd f }q| |}||}|
r||f }|r"|nd}|r+| }|s;tdd ||||fD S t ||||dS )a  
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r   r   r~   r,   r(   )rw   rp   rx   rq   ry   rz   r{   r   c                 s   s    | ]	}|d ur|V  qd S r   r(   )r   vr(   r(   r)   	<genexpr>  s    z'CodeGenModel.forward.<locals>.<genexpr>)last_hidden_stater   rv   
attentions)!rU   rz   r   ry   use_return_dictrI   r   trainingrA   rB   r   r   r   r   from_legacy_cacherZ   get_seq_lengthr   r   r~   	unsqueeze_update_causal_maskget_head_maskr   rf   r   re   	enumerater   r   to_legacy_cacher   r   )rT   r   r   rp   r   rx   rq   r   ry   rz   r   r   r{   r   return_legacy_cache
seq_lengthpast_seen_tokensrs   rv   token_type_embedsoutput_shapenext_decoder_cacheall_self_attentionsall_hidden_statesr   blockr   
next_cacher(   r(   r)   r   ^  s   











zCodeGenModel.forwardFr   input_tensorc                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2r   flex_attentionr   Fsdpa)r   past_key_values_lengthis_trainingr   r,   )sequence_lengthtarget_lengthr   r{   
batch_size)cudaxpunpu)rU   _attn_implementationanyr   r   r   r   r   is_compileabler
   _ignore_causal_mask_sdpar   r   rZ   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr~   typefinfor   _unmask_unattended)rT   rp   r   r{   r   rz   r   using_compilable_cacher   r  r  rs   	min_dtyper(   r(   r)   r     sT   




z CodeGenModel._update_causal_maskr  r  r   r  c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nra   )
fill_valuer   r~   r   )diagonalr   r,   r   )r   r   r  r   fullr~   triur   rY   expandclonerZ   rL   masked_fill)rp   r  r  r   r{   r  r   rs   r  mask_lengthpadding_maskr(   r(   r)   r  '  s,    $
6  zBCodeGenModel._prepare_4d_causal_attention_mask_with_cache_position)NNNNNNNNNNNN)F)rD   r   r   r9   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   staticmethodintr   r  r   r(   r(   rX   r)   r   F  s    	

 
Dr   zM
    The CodeGen Model transformer with a language modeling head on top.
    )custom_introc                        s"  e Zd ZdgZ fddZdd Zdd Ze													dd	ee	j
 d
eeeeee	j  f  dee	j dee	j
 dee	j
 dee	j dee	j dee	j
 dee dee dee dee dee	j
 deeef fddZed
eee	j  de	jdeee	j  fddZ  ZS )CodeGenForCausalLMzlm_head.weightc                    s4   t  | t|| _t|j|j| _| 	  d S r   )
r8   r9   r   r   r   rO   r   r   lm_headr   r   rX   r(   r)   r9   h  s   
zCodeGenForCausalLM.__init__c                 C   r   r   r#  r   r(   r(   r)   get_output_embeddingsp  r   z(CodeGenForCausalLM.get_output_embeddingsc                 C   r   r   r$  r   r(   r(   r)   set_output_embeddingss  r   z(CodeGenForCausalLM.set_output_embeddingsNr   r   rp   r   rx   rq   r   labelsry   rz   r   r   r{   r   c                 K   s   |dur|n| j j}| j||||||||	|
|||d}|d }| |tj}d}|durH||j}| j||fd| j j	i|}||j
}|s^|f|dd  }|dur\|f| S |S t|||j|j|jdS )aG  
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)r   rp   r   rx   rq   r   ry   rz   r   r   r{   r   r   r   )losslogitsr   rv   r   )rU   r   r   r#  rL   r   rK   r~   loss_functionr   r   r   r   rv   r   )rT   r   r   rp   r   rx   rq   r   r'  ry   rz   r   r   r{   r   transformer_outputsrv   	lm_logitsr(  outputr(   r(   r)   r   v  sN   zCodeGenForCausalLM.forwardbeam_idxc                    s   t  fdd| D S )a  
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        c                 3   s&    | ]}t  fd d|D V  qdS )c                 3   s$    | ]}| d  |jV  qdS )r   N)index_selectrL   r~   )r   
past_stater.  r(   r)   r     s   " z>CodeGenForCausalLM._reorder_cache.<locals>.<genexpr>.<genexpr>Nr   )r   rw   r1  r(   r)   r     s
    
z4CodeGenForCausalLM._reorder_cache.<locals>.<genexpr>r2  )r   r.  r(   r1  r)   _reorder_cache  s   	z!CodeGenForCausalLM._reorder_cache)NNNNNNNNNNNNN)rD   r   r   _tied_weights_keysr9   r%  r&  r   r   r   r   r   r   r   r   r   r   r   r   r  r3  r   r(   r(   rX   r)   r"  `  sp    	

Lr"  )r"  r   r   )1__doc__typingr   r   r   torch.utils.checkpointr   activationsr   cache_utilsr   r   
generationr	   modeling_attn_mask_utilsr
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   configuration_codegenr   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr   
get_loggerrD   rA   r   r   r*   r2   r5   Moduler6   r   r   r   r   r"  __all__r(   r(   r(   r)   <module>   sH   
" !.  m