"""PyTorch CodeGen model."""

from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_torch_flex_attn_available, logging
from .configuration_codegen import CodeGenConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
    # Build the (num_pos, dim) table of rotary angles: the first dim // 2 columns
    # hold sin values, the last dim // 2 columns hold cos values.
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
    sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float()
    return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)


def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    # Swap each channel pair (x1, x2) -> (-x2, x1), the 2D-rotation building block.
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)  # in einsum notation: rearrange(x, '... d j -> ... (d j)')
def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    # Duplicate each sin/cos entry across its channel pair, then apply the rotation.
    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
    return (tensor * cos) + (rotate_every_two(tensor) * sin)


class CodeGenAttention(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        max_positions = config.max_position_embeddings
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.embed_dim = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_attention_heads
        if self.head_dim * self.num_attention_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
                f" `num_attention_heads`: {self.num_attention_heads})."
            )
        self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
        self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.rotary_dim = config.rotary_dim
        pos_embd_dim = self.rotary_dim or self.embed_dim
        self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim)

    def _split_heads(self, x, n_head, dim_head, mp_num):
        reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head))
        reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:])
        return reshaped

    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
        """
        if len(tensor.shape) == 5:
            tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
        elif len(tensor.shape) == 4:
            tensor = tensor.permute(0, 2, 1, 3).contiguous()
        else:
            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
        new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
        return tensor.view(new_shape)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # Keep the attention weights computation in fp32 to avoid overflow issues.
        query = query.to(torch.float32)
        key = key.to(torch.float32)

        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key.shape[-2]]
            attn_weights += causal_mask

        attn_weights = attn_weights / self.scale_attn
        attn_weights = nn.Softmax(dim=-1)(attn_weights)
        attn_weights = attn_weights.to(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to.
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)
        return attn_output, attn_weights

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[
        tuple[torch.Tensor, tuple[torch.Tensor]],
        Optional[tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]]],
    ]:
        qkv = self.qkv_proj(hidden_states)
        # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic.
        # Q, V and K come out of qkv_proj interleaved across mp_num partitions, so the
        # projection is reshaped per partition before splitting (note the q, v, k order).
        mp_num = 4
        qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))

        local_dim = self.head_dim * self.num_attention_heads // mp_num
        query, value, key = torch.split(qkv_split, local_dim, dim=-1)
        query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num)

        value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        value = value.permute(0, 2, 1, 3)

        embed_positions = self.embed_positions
        if embed_positions.device != position_ids.device:
            embed_positions = embed_positions.to(position_ids.device)
            self.embed_positions = embed_positions

        sincos = embed_positions[position_ids]
        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)

        if self.rotary_dim is not None:
            # Rotary embeddings are applied only to the first rotary_dim channels.
            k_rot = key[:, :, :, : self.rotary_dim]
            k_pass = key[:, :, :, self.rotary_dim :]

            q_rot = query[:, :, :, : self.rotary_dim]
            q_pass = query[:, :, :, self.rotary_dim :]

            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)

            key = torch.cat([k_rot, k_pass], dim=-1)
            query = torch.cat([q_rot, q_pass], dim=-1)
        else:
            key = apply_rotary_pos_emb(key, sin, cos)
            query = apply_rotary_pos_emb(query, sin, cos)

        key = key.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        if layer_past is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "partial_rotation_size": self.rotary_dim,
                "cache_position": cache_position,
            }
            key, value = layer_past.update(key.to(hidden_states.dtype), value, self.layer_idx, cache_kwargs)

        # Compute self-attention: V x Softmax(QK^T)
        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        return attn_output, attn_weights
class CodeGenMLP(nn.Module):
    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size = 4 * embed_dim
        super().__init__()
        embed_dim = config.n_embd

        self.fc_in = nn.Linear(embed_dim, intermediate_size)
        self.fc_out = nn.Linear(intermediate_size, embed_dim)

        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc_out(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class CodeGenBlock(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = CodeGenAttention(config, layer_idx)
        self.mlp = CodeGenMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs, attn_weights = self.attn(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )

        feed_forward_hidden_states = self.mlp(hidden_states)
        hidden_states = attn_outputs + feed_forward_hidden_states + residual
        return hidden_states, attn_weights
@auto_docstring
class CodeGenPreTrainedModel(PreTrainedModel):
    config: CodeGenConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["CodeGenBlock"]
    _skip_keys_device_placement = "past_key_values"
    _can_compile_fullgraph = False

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring
class CodeGenModel(CodeGenPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.n_embd
        self.vocab_size = config.vocab_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([CodeGenBlock(config, layer_idx=i) for i in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple, BaseModelOutputWithPast]:
        r"""
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        seq_length = inputs_embeds.shape[1]
        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head.
        # head_mask has shape n_layer x batch x num_attention_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        hidden_states = inputs_embeds

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, seq_length)
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)
        output_shape = (-1, seq_length, hidden_states.size(-1))

        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, block in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=past_key_values,
                attention_mask=causal_mask,
                position_ids=position_ids,
                head_mask=head_mask[i],
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            hidden_states = outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)

        hidden_states = self.ln_f(hidden_states)
        hidden_states = hidden_states.view(output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we rely on its `is_causal` argument instead of its `attn_mask` argument,
        # in order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA
        # will fail to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output_attentions is True, the sdpa implementation's forward falls back to the eager one.
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows of the causal_mask, for example the relevant first rows
            # when using left padding. This is required by F.scaled_dot_product_attention's memory-efficient path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@auto_docstring(
    custom_intro="""
    The CodeGen Model transformer with a language modeling head on top.
    """
)
class CodeGenForCausalLM(CodeGenPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = CodeGenModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        # Make sure sampling in fp16 works correctly and compute the loss in fp32
        # to match the mesh-tf version.
        lm_logits = self.lm_head(hidden_states).to(torch.float32)

        loss = None
        if labels is not None:
            # Move labels to the logits device to enable model parallelism.
            labels = labels.to(lm_logits.device)
            loss = self.loss_function(lm_logits, labels, vocab_size=self.config.vocab_size, **kwargs)
            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = ["CodeGenForCausalLM", "CodeGenModel", "CodeGenPreTrainedModel"]