"""PyTorch XGLM model."""

import math
from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from ...utils.deprecation import deprecate_kwarg
from .configuration_xglm import XGLMConfig


logger = logging.get_logger(__name__)

class XGLMScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale


class XGLMSinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # in forward, cast the new buffer to the dtype and device of the existing weights
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad the last dimension for odd embedding sizes
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, position_ids: torch.Tensor = None, past_key_values_length: int = 0):
        bsz, seq_len = position_ids.size()
        position_ids += self.offset

        # Expand the embedding table if the requested positions exceed the current size
        max_pos = 2 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()

class XGLMAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()
        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling

        is_updated = False
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from the cache
                    curr_past_key_value = past_key_values.cross_attention_cache
                else:
                    curr_past_key_value = past_key_values.self_attention_cache
            else:
                curr_past_key_value = past_key_values

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse k, v from the cross-attention cache
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

            if past_key_values is not None:
                # save all key/value_states to the cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # flag the cross-attention states as cached so they are not re-projected in subsequent calls
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = query_states.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)
        query_states = query_states.reshape(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = torch.max(
                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        # upcast to fp32 before the softmax to avoid overflow in half precision
        if attn_weights.dtype == torch.float16:
            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(torch.float16)
        else:
            attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # reshape twice so that the returned attention weights keep their gradient
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class XGLMDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: XGLMConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = XGLMAttention(
            embed_dim=self.embed_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            layer_idx=layer_idx,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        if config.add_cross_attention:
            self.encoder_attn = XGLMAttention(
                embed_dim=self.embed_dim,
                num_heads=config.attention_heads,
                dropout=config.attention_dropout,
                is_decoder=True,
                layer_idx=layer_idx,
            )
            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim)
        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)
            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


@auto_docstring
class XGLMPreTrainedModel(PreTrainedModel):
    config: XGLMConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["XGLMDecoderLayer"]

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


@auto_docstring
class XGLMModel(XGLMPreTrainedModel):
    def __init__(self, config: XGLMConfig, embed_tokens: Optional[nn.Embedding] = None):
        r"""
        embed_tokens (`nn.Embedding`, *optional*):
            output embeddings
        """
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = XGLMScaledWordEmbedding(
                config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
            )

        self.embed_positions = XGLMSinusoidalPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            config.pad_token_id,
        )
        self.layers = nn.ModuleList([XGLMDecoderLayer(config, layer_idx=i) for i in range(config.num_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`..."
                )
                use_cache = False

        # initialize the cache on the first forward call
        if use_cache and past_key_values is None:
            past_key_values = (
                EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
                if encoder_hidden_states is not None
                else DynamicCache(config=self.config)
            )
        if use_cache and isinstance(past_key_values, tuple):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

        if position_ids is None:
            position_ids = torch.arange(
                past_key_values_length,
                input_shape[-1] + past_key_values_length,
                dtype=torch.long,
                device=input_ids.device if input_ids is not None else inputs_embeds.device,
            )
            position_ids = position_ids.unsqueeze(0)

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(
                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            )

        hidden_states = inputs_embeds + self.embed_positions(position_ids, past_key_values_length).to(
            inputs_embeds.device
        )
        hidden_states = nn.functional.dropout(hidden_states, p=float(self.dropout), training=self.training)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if output_attentions and encoder_hidden_states is not None else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )
        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.layer_norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class XGLMForCausalLM(XGLMPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = XGLMModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits,
                labels,
                vocab_size=self.config.vocab_size,
                pad_token_id=self.config.pad_token_id,
                **kwargs,
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


__all__ = ["XGLMForCausalLM", "XGLMModel", "XGLMPreTrainedModel"]