o
    eis                     @   sL  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ eeZ G dd dej!Z"G dd dej#Z$G dd dej#Z%G dd deZ&eG dd deZ'eG dd de'Z(eddG dd de'eZ)g d Z*dS )!zPyTorch XGLM model.    N)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)auto_docstringlogging   )
XGLMConfigc                
       sL   e Zd ZdZddededededB f fdd	Zd
ejf fddZ	  Z
S )XGLMScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?num_embeddingsembedding_dimpadding_idxembed_scaleNc                    s   t  ||| || _d S N)super__init__r   )selfr   r   r   r   	__class__ d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/xglm/modeling_xglm.pyr   *   s   
z XGLMScaledWordEmbedding.__init__	input_idsc                    s   t  || j S r   )r   forwardr   )r   r"   r   r    r!   r#   .   s   zXGLMScaledWordEmbedding.forward)r   )__name__
__module____qualname____doc__intfloatr   torchTensorr#   __classcell__r    r    r   r!   r   %   s    $r   c                	       s   e Zd ZdZddedededB f fddZddedededB fd	d
ZeddedededB fddZe	
 dde	jdB defddZ  ZS )!XGLMSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsr   r   c                    s:   t    d| _|| _|| _|| _| || j || d S )N   )r   r   offsetr.   r   r   make_weights)r   r.   r   r   r   r    r!   r   5   s   
z*XGLMSinusoidalPositionalEmbedding.__init__r   c                 C   sB   |  |||}t| dr|j| jj| jjd}| jd|dd d S )NweightsdtypedeviceF)
persistent)get_embeddinghasattrtor2   r4   r5   register_buffer)r   r   r   r   emb_weightsr    r    r!   r1   =   s   
z.XGLMSinusoidalPositionalEmbedding.make_weightsc                 C   s   |d }t d|d  }ttj|tjd |  }tj| tjd d|d }tjt	|t
|gdd| d}|d dkrUtj|t| dgdd}|durad||ddf< |t S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r/   i'  r   )r4   r   dimN)mathlogr*   exparangeint64r)   	unsqueezecatsincosviewzerosr9   get_default_dtype)r   r   r   half_dimembr    r    r!   r7   E   s    $&z/XGLMSinusoidalPositionalEmbedding.get_embeddingr   position_idspast_key_values_lengthc                 C   sn   |  \}}|| j7 }d| | }|| j dkr"| || j| j | jd|d||| jjd 	 S )Nr/   r   r>   )
sizer0   r2   r1   r   r   index_selectrH   shapedetach)r   rM   rN   bszseq_lenmax_posr    r    r!   r#   Z   s   
*z)XGLMSinusoidalPositionalEmbedding.forwardr   )Nr   )r$   r%   r&   r'   r(   r   r1   staticmethodr7   r*   no_gradr+   r#   r,   r    r    r   r!   r-   2   s     $r-   c                       s   e Zd ZdZ				ddedededB d	edB d
edB dedB f fddZ					ddej	dej	dB de
dB dej	dB dedej	dB deej	ej	dB eej	 dB f fddZ  ZS )XGLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        FTN	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                    s   t    || _|| _|| _|| | _| j| | jkr'td| j d| d| jd | _|| _|| _	t
j|||d| _t
j|||d| _t
j|||d| _t
j|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩r^   )r   r   rZ   r[   r\   head_dim
ValueErrorscalingr]   r_   r   Lineark_projv_projq_projout_proj)r   rZ   r[   r\   r]   r^   r_   r   r    r!   r   i   s$   
	

zXGLMAttention.__init__hidden_stateskey_value_statespast_key_valuesattention_maskoutput_attentionscache_positionreturnc                 C   s  |du}|  \}}	}
|r|jd n|	}| || j }d}|dur:t|tr8|j| j}|r4|j	}n|j
}n|}|r>|n|}|rW|durW|rW|j| j j}|j| j j}nH| |}| |}|||d| jdd}|||d| jdd}|dur|s|nd}|||| jd|i\}}|rt|trd|j| j< || j d| jf}|||	| j| jdd}|j| }|j| }|j| }| d}t||dd}|  || j |	|fkrtd|| j |	|f d	|   |dur8|  |d|	|fkrtd
|d|	|f d	|   ||| j|	|| }t|tjt|jj|jd}||| j |	|}|jtjkrNt j!j"|dtj#d$tj}nt j!j"|dd}|rm||| j|	|}||| j |	|}nd}t j!j%|| j%| j&d}t||}|  || j |	| jfkrtd|| j|	| jf d	|   ||| j|	| j}|dd}|||	| j'}| (|}||fS )z#Input shape: Batch x Time x ChannelNr   Fr>   r/   rn   Tz$Attention weights should be of size z	, but is z!Attention mask should be of size r5   )r=   r4   r<   ptrainingz `attn_output` should be of size ))rO   rQ   rg   rc   
isinstancer   
is_updatedgetr_   cross_attention_cacheself_attention_cachelayerskeysvaluesre   rf   rH   ra   	transposeupdater[   reshaper*   bmmrb   maxtensorfinfor4   minr5   float16r   
functionalsoftmaxfloat32r9   r\   rs   rZ   rh   )r   ri   rj   rk   rl   rm   rn   is_cross_attentionrS   tgt_len_src_lenquery_statesru   curr_past_key_valuescurrent_states
key_statesvalue_states
proj_shapeattn_weightsattn_weights_reshaped
attn_probsattn_outputr    r    r!   r#      s   








zXGLMAttention.forward)rY   FTN)NNNFN)r$   r%   r&   r'   r(   r)   boolr   r*   r+   r   tupler#   r,   r    r    r   r!   rX   f   sN     rX   c                       s   e Zd Zddef fddZ							ddejdejdB d	ejdB d
ejdB dedB dedB dedB dejdB dejfddZ	  Z
S )XGLMDecoderLayerNconfigc                    s   t    |j| _t| j|j|jd|d| _|j| _t	|j
 | _|j| _|jr;t| j|j|jd|d| _t| j| _t| j| _t| j|j| _t|j| j| _t| j| _d S )NT)rZ   r[   r\   r]   r_   )r   r   d_modelrZ   rX   attention_headsattention_dropout	self_attnr\   r   activation_functionactivation_fnactivation_dropoutadd_cross_attentionencoder_attnr   	LayerNormencoder_attn_layer_normself_attn_layer_normrd   ffn_dimfc1fc2final_layer_norm)r   r   r_   r   r    r!   r      s2   
zXGLMDecoderLayer.__init__FTri   rl   encoder_hidden_statesencoder_attention_maskrk   rm   	use_cachern   ro   c	                 C   s  |}	|  |}| j|||||d\}}
tjj|| j| jd}|	| }d}|durK|}	| |}| j||||||d\}}tjj|| j| jd}|	| }|}	| |}| 	| 
|}tjj|| j| jd}| |}tjj|| j| jd}|	| }|f}|r||
|f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )ri   rk   rl   rm   rn   rq   N)ri   rj   rl   rk   rm   rn   )r   r   r   r   r\   rs   r   r   r   r   r   r   r   )r   ri   rl   r   r   rk   rm   r   rn   residualself_attn_weightscross_attn_weightsoutputsr    r    r!   r#     sH   





zXGLMDecoderLayer.forwardr   )NNNNFTN)r$   r%   r&   r   r   r*   r+   r   r   r#   r,   r    r    r   r!   r      s8    "	
r   c                       s4   e Zd ZU eed< dZdZdgZ fddZ  Z	S )XGLMPreTrainedModelr   modelTr   c                    sF   t  | t|tr!||j|j |j|j}t	
|j| d S d S r   )r   _init_weightsrt   r-   r7   r.   r0   r   r   initcopy_r2   )r   moduler;   r   r    r!   r   l  s   
z!XGLMPreTrainedModel._init_weights)
r$   r%   r&   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r,   r    r    r   r!   r   e  s   
 r   c                       s   e Zd Zdef fddZe												ddejdB dejdB dejdB dejdB d	ejdB d
edB dejdB de	dB de	dB de	dB de	dB dejdB de
ej eB fddZ  ZS )	XGLMModelr   c                    s   t     j| _ j| _ j| _ j| _ jrt	
 jnd}t j j| j|d| _t j j j| _t fddt jD | _t j| _d| _|   d S )Nr   )r   c                    s   g | ]}t  |d qS ))r_   )r   ).0ir   r    r!   
<listcomp>  s    z&XGLMModel.__init__.<locals>.<listcomp>F)r   r   r\   	layerdroppad_token_idr   max_position_embeddingsmax_target_positionsscale_embeddingr?   sqrtr   r   
vocab_sizeembed_tokensr-   embed_positionsr   
ModuleListrange
num_layersry   r   
layer_normgradient_checkpointing	post_init)r   r   r   r   r   r!   r   w  s$    zXGLMModel.__init__Nr"   rl   rM   r   r   rk   inputs_embedsr   rm   output_hidden_statesreturn_dictrn   ro   c                 K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|dur4|dur4td|durK| || | }|d|d }n|durX| dd }ntd|du re| 	|}| j
rt| jrt|rttd d}|r|du r|dus| j jrtt| j dt| j dnt| j d}|dur| nd}|du rtj|||jd	  |jd
}t| j ||||d}|du rtj||d | tj|dur|jn|jd}|d}|dur|durt| j |||d}|| |||j }tjj|t| j| jd}|
r	dnd}|	rdnd}|	r|durdnd}t | j!D ]D\}}|
r/||f7 }| jr@t"g }|| j#k r@q#|||||||	||d}|d }|	rf||d	 f7 }|durf||d f7 }q#| $|}|
ru||f7 }|st%dd |||||fD S t&|||||dS )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer>   z5You have to specify either input_ids or inputs_embedsz_`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`...Fr   r   r   rp   )r   r   rl   rn   rk   r3   )r   r   rl   r   rq   r    )r   rk   rm   r   rn   r/   c                 s   s    | ]	}|d ur|V  qd S r   r    )r   vr    r    r!   	<genexpr>  s    z$XGLMModel.forward.<locals>.<genexpr>)last_hidden_staterk   ri   
attentionscross_attentions)'r   rm   r   r   use_return_dictrb   %warn_if_padding_and_no_attention_maskrO   rH   r   r   rs   loggerwarning_onceis_encoder_decoderr   r   get_seq_lengthr*   rB   rQ   r5   r   longrD   r
   r   r9   r   r   r\   r)   	enumeratery   randr   r   r   r   )r   r"   rl   rM   r   r   rk   r   r   rm   r   r   rn   kwargsinput_shaperN   ri   all_hidden_statesall_self_attnsall_cross_attentionsidxdecoder_layerdropout_probabilitylayer_outputsr    r    r!   r#     s   









zXGLMModel.forward)NNNNNNNNNNNN)r$   r%   r&   r   r   r   r*   r+   r   r   r   r   r#   r,   r    r    r   r!   r   u  sT    	
r   z
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                !       s   e Zd ZdZddiZ fddZe														ddejdB d	ejdB d
ejdB dejdB dejdB de	dB dejdB dejdB de
dB de
dB de
dB de
dB dejdB deejB deej eB fddZ  ZS )XGLMForCausalLMr   zlm_head.weightzmodel.embed_tokens.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr`   )
r   r   r   r   r   rd   hidden_sizer   lm_headr   )r   r   r   r    r!   r   6  s   
zXGLMForCausalLM.__init__Nr   r"   rl   rM   r   r   rk   r   labelsr   rm   r   r   rn   logits_to_keepro   c                 K   s  |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}| j||||||||	|
|||d}|d }t|tr>t| dn|}| |dd|ddf }d}|dure| j	||f| j j
| j jd|}|s{|f|dd  }|dury|f| S |S t|||j|j|j|jdS )ai  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r"   rl   rM   r   r   rk   r   r   rm   r   r   rn   r   )r   r   r   )losslogitsrk   ri   r   r   )r   rm   r   r   r   rt   r(   slicer   loss_functionr   r   r   rk   ri   r   r   )r   r"   rl   rM   r   r   rk   r   r   r   rm   r   r   rn   r   r   r   ri   slice_indicesr   r   outputr    r    r!   r#   >  sV   %zXGLMForCausalLM.forward)NNNNNNNNNNNNNr   )r$   r%   r&   r   _tied_weights_keysr   r   r*   r+   r   r   r(   r   r   r#   r,   r    r    r   r!   r   ,  sd    	
r   )r   r   r   )+r'   r?   r*   r    r   r   activationsr   cache_utilsr   r   r   
generationr	   masking_utilsr
   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   configuration_xglmr   
get_loggerr$   r   	Embeddingr   Moduler-   rX   r   r   r   r   __all__r    r    r    r!   <module>   s<   
4 i 7d