o
    wi                     @   sD  d Z ddlZddlmZmZ ddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZ eeZG dd dejZG dd dej Z!G dd dej Z"G dd deZ#eG dd deZ$eG dd de$Z%eddG dd de$eZ&g dZ'dS ) zPyTorch XGLM model.    N)OptionalUnion)nn   )ACT2FN)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)auto_docstringlogging   )
XGLMConfigc                
       sL   e Zd ZdZddedededee f fddZd	ej	f fd
dZ
  ZS )XGLMScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?num_embeddingsembedding_dimpadding_idxembed_scalec                    s   t  ||| || _d S N)super__init__r   )selfr   r   r   r   	__class__ c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/xglm/modeling_xglm.pyr   +   s   
z XGLMScaledWordEmbedding.__init__	input_idsc                    s   t  || j S r   )r   forwardr   )r   r    r   r   r   r!   /   s   zXGLMScaledWordEmbedding.forward)r   )__name__
__module____qualname____doc__intr   floatr   torchTensorr!   __classcell__r   r   r   r   r   &   s    $r   c                	       s   e Zd ZdZddededee f fddZddededee fd	d
Zeddededee fddZ	e
 ddee
j defddZ  ZS )!XGLMSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsr   r   c                    s4   t    d| _|| _|| _| || j || d S )N   )r   r   offsetr   r   make_weights)r   r,   r   r   r   r   r   r   6   s
   
z*XGLMSinusoidalPositionalEmbedding.__init__r   c                 C   sB   |  |||}t| dr|j| jj| jjd}| jd|dd d S )NweightsdtypedeviceF)
persistent)get_embeddinghasattrtor0   r2   r3   register_buffer)r   r   r   r   emb_weightsr   r   r   r/   =   s   
z.XGLMSinusoidalPositionalEmbedding.make_weightsc                 C   s   |d }t d|d  }ttj|tjd |  }tj| tjd d|d }tjt	|t
|gdd| d}|d dkrUtj|t| dgdd}|durad||ddf< |t S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r-   i'  r   )r2   r   dimN)mathlogr(   exparangeint64r'   	unsqueezecatsincosviewzerosr7   get_default_dtype)r   r   r   half_dimembr   r   r   r5   E   s    $&z/XGLMSinusoidalPositionalEmbedding.get_embeddingr   position_idspast_key_values_lengthc                 C   sn   |  \}}|| j7 }d| | }|| j dkr"| || j| j | jd|d||| jjd 	 S )Nr-   r   r<   )
sizer.   r0   r/   r   r   index_selectrF   shapedetach)r   rK   rL   bszseq_lenmax_posr   r   r   r!   Z   s   
*z)XGLMSinusoidalPositionalEmbedding.forwardr   )Nr   )r"   r#   r$   r%   r&   r   r   r/   staticmethodr5   r(   no_gradr)   r!   r*   r   r   r   r   r+   3   s     $r+   c                       s   e Zd ZdZ			ddedededed	ef
 fd
dZdej	dedefddZ
					ddej	deej	 deeej	  deej	 deej	 dedeej	eej	 eeej	  f fddZ  ZS )XGLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        FT	embed_dim	num_headsdropout
is_decoderbiasc                    s   t    || _|| _|| _|| | _| j| | jkr'td| j d| d| jd | _|| _t	j
|||d| _t	j
|||d| _t	j
|||d| _t	j
|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩r\   )r   r   rX   rY   rZ   head_dim
ValueErrorscalingr[   r   Lineark_projv_projq_projout_proj)r   rX   rY   rZ   r[   r\   r   r   r   r   j   s"   


zXGLMAttention.__init__tensorrR   rQ   c                 C   s    | ||| j| jdd S )Nr   r-   )rF   rY   r^   	transpose
contiguous)r   rf   rR   rQ   r   r   r   _shape   s    zXGLMAttention._shapeNhidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskoutput_attentionsreturnc                 C   s  |du}|  \}}	}
| || j }|r"|dur"|d }|d }nZ|r9| | |d|}| | |d|}nC|durh| | |d|}| | |d|}tj|d |gdd}tj|d |gdd}n| | |d|}| | |d|}| jr||f}|| j	 d| j
f}| ||	|j| }|j| }|j| }| d}t||dd}|  || j	 |	|fkrtd|| j	 |	|f d|   |dur|  |d|	|fkrtd	|d|	|f d|   ||| j	|	|| }t|tjt|jj|jd
}||| j	 |	|}|jtjkr(tjj|dtjdtj}ntjj|dd}|durg|  | j	fkrLtd| j	f d|   |dddd||| j	|	| }||| j	 |	|}|r~||| j	|	|}||| j	 |	|}nd}tjj|| j| jd}t||}|  || j	 |	| j
fkrtd|| j	|	| j
f d|   ||| j	|	| j
}|dd}|||	| j}| |}|||fS )z#Input shape: Batch x Time x ChannelNr   r   r<   r-   r:   z$Attention weights should be of size z	, but is z!Attention mask should be of size )r3   )r;   r2   z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size ) rM   rd   r`   ri   rb   rc   r(   rC   r[   rY   r^   rF   bmmrg   r_   maxrf   finfor2   minr3   float16r   
functionalsoftmaxfloat32r7   rZ   rs   reshaperX   re   )r   rj   rk   rl   rm   rn   ro   is_cross_attentionrQ   tgt_len_query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputr   r   r   r!      s   





"

zXGLMAttention.forward)rW   FT)NNNNF)r"   r#   r$   r%   r&   r'   boolr   r(   r)   ri   r   tupler!   r*   r   r   r   r   rV   g   sJ    rV   c                       s   e Zd Zdef fddZ								ddejdeej d	eej d
eej deej deej deeej  dee	 dee	 dejfddZ
  ZS )XGLMDecoderLayerconfigc                    s   t    |j| _t| j|j|jdd| _|j| _t	|j
 | _|j| _|jr9t| j|j|jdd| _t| j| _t| j| _t| j|j| _t|j| j| _t| j| _d S )NT)rX   rY   rZ   r[   )r   r   d_modelrX   rV   attention_headsattention_dropout	self_attnrZ   r   activation_functionactivation_fnactivation_dropoutadd_cross_attentionencoder_attnr   	LayerNormencoder_attn_layer_normself_attn_layer_normra   ffn_dimfc1fc2final_layer_normr   r   r   r   r   r     s.   
zXGLMDecoderLayer.__init__NFTrj   rm   encoder_hidden_statesencoder_attention_maskrn   cross_attn_layer_head_maskrl   ro   	use_cacherp   c
                 C   sZ  |}
|  |}|dur|dd nd}| j|||||d\}}}tjj|| j| jd}|
| }d}d}|durk|}
| |}|durH|dd nd}| j||||||d\}}}tjj|| j| jd}|
| }|| }|}
| |}| 	| 
|}tjj|| j| jd}| |}tjj|| j| jd}|
| }|f}|r|||f7 }|	r||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        Nr-   )rj   rl   rm   rn   ro   rq   )rj   rk   rm   rn   rl   ro   )r   r   r   ry   rZ   rs   r   r   r   r   r   r   r   )r   rj   rm   r   r   rn   r   rl   ro   r   residualself_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_valueoutputsr   r   r   r!     sT   




zXGLMDecoderLayer.forward)NNNNNNFT)r"   r#   r$   r   r   r(   r)   r   r   r   r!   r*   r   r   r   r   r     s>     	
r   c                   @   s&   e Zd ZeZdZdZdgZdd ZdS )XGLMPreTrainedModelmodelTr   c                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rA|jjjd|d |jd urC|jj|j 	  d S d S d S )NrW   )meanstd)r   init_std
isinstancer   ra   weightdatanormal_r\   zero_	Embeddingr   )r   moduler   r   r   r   _init_weights  s   

z!XGLMPreTrainedModel._init_weightsN)	r"   r#   r$   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r   r   r   r   r   y  s    r   c                        s   e Zd Zddedeej f fddZdd Zdd	 Z	e
													dd
eej deej deej deej deej deej deej deeej  deej dee dee dee dee deeej ef fddZ  ZS )	XGLMModelNr   embed_tokensc                    s   t     j| _ j| _ j| _ j| _ jrt	
 jnd}|dur)|| _nt j j| j|d| _t j j j| _t fddt jD | _t j| _d| _|   dS )zZ
        embed_tokens (`nn.Embedding`, *optional*):
            output embeddings
        r   N)r   c                    s   g | ]}t  qS r   )r   ).0r   r   r   r   
<listcomp>  s    z&XGLMModel.__init__.<locals>.<listcomp>F)r   r   rZ   	layerdroppad_token_idr   max_position_embeddingsmax_target_positionsscale_embeddingr=   sqrtr   r   r   
vocab_sizer+   embed_positionsr   
ModuleListrange
num_layerslayersr   
layer_normgradient_checkpointing	post_init)r   r   r   r   r   r   r   r     s(    zXGLMModel.__init__c                 C      | j S r   r   r   r   r   r   get_input_embeddings     zXGLMModel.get_input_embeddingsc                 C   
   || _ d S r   r   r   valuer   r   r   set_input_embeddings     
zXGLMModel.set_input_embeddingsr    rm   rK   r   r   	head_maskcross_attn_head_maskpast_key_valuesinputs_embedsr   ro   output_hidden_statesreturn_dictrp   c                 C   s  |dur|n| j j}|dur|n| j j}|
dur|
n| j j}
|dur$|n| j j}|dur4|	dur4td|durK| || | }|d|d }n|	durX|	 dd }ntd|duri|d d j	d nd}|du rt
j||d | t
j|dur|jn|	jd}|d}|	du r| |}	t|||	|}|dur|durt||	j|d d}|	| |||	j }tjj|t| j| jd	}| jr| jr|
rtd
 d}
|rdnd}|rdnd}|r|durdnd}|
rdnd}t||gddgD ]*\}}|dur#| d t| jkr#td| dt| j d| d  dqt | jD ]q\}}|r5||f7 }| jrFt
!g }|| j"k rFq)|durO|| nd}||||||dur_|| nd|duri|| nd|||
d	}|d }|
r|||r~dnd f7 }|r||d f7 }|dur||d f7 }q)| #|}|r||f7 }|
r|nd}|st$dd |||||fD S t%|||||dS )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer<   z5You have to specify either input_ids or inputs_embedsr   r-   r1   )r~   rq   z_`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`...Fr   r   r   zThe `z` should be specified for z layers, but it is for .)r   rn   r   rl   ro   r   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r   )r   vr   r   r   	<genexpr>E  s    z$XGLMModel.forward.<locals>.<genexpr>)last_hidden_stater   rj   
attentionscross_attentions)&r   ro   r   r   use_return_dictr_   %warn_if_padding_and_no_attention_maskrM   rF   rO   r(   r@   longr3   rB   r   r	   r   r2   r   r7   r   ry   rZ   r'   rs   r   loggerwarning_onceziplenr   	enumeraterandr   r   r   r   )r   r    rm   rK   r   r   r   r   r   r   r   ro   r   r   input_shaperL   rj   all_hidden_statesall_self_attnsall_cross_attentionsnext_decoder_cache	attn_mask	mask_nameidxdecoder_layerdropout_probabilityrl   layer_outputs
next_cacher   r   r   r!     s   #









zXGLMModel.forwardr   )NNNNNNNNNNNNN)r"   r#   r$   r   r   r   r   r   r   r   r   r(   r)   listFloatTensorr   r   r   r   r!   r*   r   r   r   r   r     s^    	
r   z
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                "       s  e Zd ZdZdgZ fddZdd Zdd Zd	d
 Zdd Z	e
														d!deej deej deej deej deej deej deej deeej  deej deej dee dee dee dee deeej ef fddZedd  Z  ZS )"XGLMForCausalLMr   zlm_head.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr]   )
r   r   r   r   r   ra   hidden_sizer   lm_headr   r   r   r   r   r   ]  s   
zXGLMForCausalLM.__init__c                 C   s   | j jS r   r   r   r   r   r   r   r   e  s   z$XGLMForCausalLM.get_input_embeddingsc                 C   s   || j _d S r   r  r   r   r   r   r   h  s   z$XGLMForCausalLM.set_input_embeddingsc                 C   r   r   r   r   r   r   r   get_output_embeddingsk  r   z%XGLMForCausalLM.get_output_embeddingsc                 C   r   r   r  )r   new_embeddingsr   r   r   set_output_embeddingsn  r   z%XGLMForCausalLM.set_output_embeddingsNr    rm   rK   r   r   r   r   r   r   labelsr   ro   r   r   rp   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| j|||||||||	||||d}| |d }d}|
durN| j||
f| j j| j jd|}|sd|f|dd  }|durb|f| S |S t	|||j
|j|j|jdS )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r    rm   rK   r   r   r   r   r   r   r   ro   r   r   r   )r   r   r   )losslogitsr   rj   r   r   )r   ro   r   r   r   r   loss_functionr   r   r   r   rj   r   r   )r   r    rm   rK   r   r   r   r   r   r   r  r   ro   r   r   kwargsr   r  r  outputr   r   r   r!   q  sT   *zXGLMForCausalLM.forwardc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr   c                 3   s$    | ]}| d  |jV  qdS )r   N)rN   r7   r3   )r   
past_statebeam_idxr   r   r     s   " z1XGLMForCausalLM._reorder_cache.<locals>.<genexpr>)r   )r   r  reordered_past
layer_pastr   r  r   _reorder_cache  s   zXGLMForCausalLM._reorder_cache)NNNNNNNNNNNNNN)r"   r#   r$   r   _tied_weights_keysr   r   r   r  r  r   r   r(   r)   r   r   r   r   r   r   r!   rT   r  r*   r   r   r   r   r   S  sp    	
Yr   )r   r   r   )(r%   r=   typingr   r   r(   torch.utils.checkpointr   activationsr   
generationr   modeling_attn_mask_utilsr   r	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   utilsr   r   configuration_xglmr   
get_loggerr"   r   r   r   Moduler+   rV   r   r   r   r   __all__r   r   r   r   <module>   s<   
4 x G|