"""PyTorch CodeGen model."""

import math

import torch
from torch import nn

from ... import initialization as init
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_codegen import CodeGenConfig


logger = logging.get_logger(__name__)


def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
    sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float()
    return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)


def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)  # in einsum notation: rearrange(x, '... d j -> ... (d j)')
def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
    return (tensor * cos) + (rotate_every_two(tensor) * sin)


class CodeGenAttention(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.max_positions = config.max_position_embeddings

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.embed_dim = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_attention_heads
        if self.head_dim * self.num_attention_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
                f" `num_attention_heads`: {self.num_attention_heads})."
            )
        self.scale_attn = math.sqrt(self.head_dim)
        self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)

        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.rotary_dim = config.rotary_dim
        self.pos_embd_dim = self.rotary_dim or self.embed_dim
        self.register_buffer(
            "embed_positions", create_sinusoidal_positions(self.max_positions, self.pos_embd_dim), persistent=False
        )

    def _split_heads(self, x, n_head, dim_head, mp_num):
        reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head))
        reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:])
        return reshaped

    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
        """
        if len(tensor.shape) == 5:
            tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
        elif len(tensor.shape) == 4:
            tensor = tensor.permute(0, 2, 1, 3).contiguous()
        else:
            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
        new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
        return tensor.view(new_shape)

    def _attn(self, query, key, value, attention_mask=None):
        # Keep the attention weights computation in fp32 to avoid overflow issues
        query = query.to(torch.float32)
        key = key.to(torch.float32)

        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = attn_weights / self.scale_attn
        attn_weights = nn.Softmax(dim=-1)(attn_weights)
        attn_weights = attn_weights.to(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def forward(
        self,
        hidden_states: torch.FloatTensor | None,
        layer_past: Cache | None = None,
        attention_mask: torch.FloatTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        use_cache: bool | None = False,
        output_attentions: bool | None = False,
        cache_position: torch.LongTensor | None = None,
    ) -> (
        tuple[torch.Tensor, tuple[torch.Tensor]]
        | tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]]
        | None
    ):
        qkv = self.qkv_proj(hidden_states)
        # The QKV projection is laid out as mp_num contiguous shards, a layout
        # inherited from the original TPU-v4 model-parallel checkpoint format.
        mp_num = 4
        qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))

        local_dim = self.head_dim * self.num_attention_heads // mp_num
        query, value, key = torch.split(qkv_split, local_dim, dim=-1)
        query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num)

        value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        value = value.permute(0, 2, 1, 3)

        embed_positions = self.embed_positions
        if embed_positions.device != position_ids.device:
            embed_positions = embed_positions.to(position_ids.device)
            self.embed_positions = embed_positions

        sincos = embed_positions[position_ids]
        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)

        if self.rotary_dim is not None:
            k_rot = key[:, :, :, : self.rotary_dim]
            k_pass = key[:, :, :, self.rotary_dim :]

            q_rot = query[:, :, :, : self.rotary_dim]
            q_pass = query[:, :, :, self.rotary_dim :]

            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)

            key = torch.cat([k_rot, k_pass], dim=-1)
            query = torch.cat([q_rot, q_pass], dim=-1)
        else:
            key = apply_rotary_pos_emb(key, sin, cos)
            query = apply_rotary_pos_emb(query, sin, cos)

        key = key.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        if layer_past is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "partial_rotation_size": self.rotary_dim,
                "cache_position": cache_position,
            }
            key, value = layer_past.update(key.to(hidden_states.dtype), value, self.layer_idx, cache_kwargs)

        # compute self-attention: V x Softmax(QK^T)
        attn_output, attn_weights = self._attn(query, key, value, attention_mask)

        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        return attn_output, attn_weights
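# Worked example of the sharded QKV layout above (illustrative numbers only,
# assuming n_embd=1024 and 16 heads, i.e. head_dim=64):
#   qkv_proj output: (batch, seq, 3072) -> reshape to (batch, seq, mp_num=4, 768)
#   -> torch.split into query/value/key of (batch, seq, 4, 256) each, where
#      local_dim = 256 = (16 heads // 4 shards) * 64 head_dim;
#   _split_heads then folds the shard axis back into the full 16 heads of 64 dims.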
class CodeGenMLP(nn.Module):
    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size = 4 * embed_dim
        super().__init__()
        embed_dim = config.n_embd

        self.fc_in = nn.Linear(embed_dim, intermediate_size)
        self.fc_out = nn.Linear(intermediate_size, embed_dim)

        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: torch.FloatTensor | None) -> torch.FloatTensor:
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc_out(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
class CodeGenBlock(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = CodeGenAttention(config, layer_idx)
        self.mlp = CodeGenMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: torch.FloatTensor | None,
        layer_past: Cache | None = None,
        attention_mask: torch.FloatTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        use_cache: bool | None = False,
        output_attentions: bool | None = False,
        cache_position: torch.LongTensor | None = None,
    ) -> tuple[torch.Tensor] | tuple[torch.Tensor, tuple[torch.FloatTensor, ...]] | None:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs, attn_weights = self.attn(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )

        feed_forward_hidden_states = self.mlp(hidden_states)
        # GPT-J-style parallel block: attention and MLP both read the same
        # LayerNorm output and are summed with the residual.
        hidden_states = attn_outputs + feed_forward_hidden_states + residual
        return hidden_states, attn_weights
$	r   c                       s<   e Zd ZU eed< dZdZdgZdZdZ	 fddZ
  ZS )CodeGenPreTrainedModelrQ   transformerTr   past_key_valuesc                    s6   t  | t|trt|jt|j|j	 d S d S r   )
r4   _init_weights
isinstancer0   initcopy_r2   r$   r7   rN   )rP   modulerR   r"   r#   r     s   
z$CodeGenPreTrainedModel._init_weights)rA   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraphr   r   r"   r"   rR   r#   r     s   
@auto_docstring
class CodeGenModel(CodeGenPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.n_embd
        self.vocab_size = config.vocab_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([CodeGenBlock(config, layer_idx=i) for i in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        **kwargs,
    ) -> tuple | BaseModelOutputWithPast:
        r"""
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        seq_length = inputs_embeds.shape[1]
        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, seq_length)
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)
        output_shape = (-1, seq_length, hidden_states.size(-1))

        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, block in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=past_key_values,
                attention_mask=causal_mask,
                position_ids=position_ids,
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            hidden_states = outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)

        hidden_states = self.ln_f(hidden_states)
        hidden_states = hidden_states.view(output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
@auto_docstring(
    custom_intro="""
    The CodeGen Model transformer with a language modeling head on top.
    """
)
class CodeGenForCausalLM(CodeGenPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"lm_head.weight": "transformer.wte.weight"}

    def __init__(self, config):
        super().__init__(config)
        self.transformer = CodeGenModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs,
    ) -> tuple | CausalLMOutputWithPast:
        r"""
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        # Only compute logits for the requested trailing positions (all by default).
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        lm_logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=lm_logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
__all__ = ["CodeGenForCausalLM", "CodeGenModel", "CodeGenPreTrainedModel"]