o
    ei                     @   s   d dl mZ d dlZd dlmZ ddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZ dd	lmZ eeZd
ZG dd deZG dd deZdd ZdddZG dd deZG dd de	ZG dd de
ZG dd deZg dZdS )    )OptionalN   )logging   )LlamaAttentionLlamaForCausalLMLlamaForSequenceClassificationLlamaForTokenClassificationLlamaRotaryEmbedding)Phi3MLP   )	GlmConfigzTHUDM/glm-4-9bc                   @      e Zd ZdS )GlmMLPN__name__
__module____qualname__ r   r   a/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/glm/modular_glm.pyr   %       r   c                   @   sF   e Zd Ze			d
dedB ded dedB dedef fdd	Z	dS )GlmRotaryEmbeddingNconfigdeviceztorch.deviceseq_lenreturnztorch.Tensorc           	      C   st   | j d }| j dd}t| ddp| j| j }t|| }d}d|tjd|dtjdj	|tj
d	|   }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr   r   )dtype)r   r   )rope_parametersgetgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r   r   r   baser   r   dimattention_factorinv_freqr   r   r   compute_default_rope_parameters*   s   
&z2GlmRotaryEmbedding.compute_default_rope_parameters)NNN)
r   r   r   staticmethodr   r   r%   tupler*   r/   r   r   r   r   r   )   s    
r   c                 C   s>   | ddddf }| ddddf }t j| |fdddS )	z*Rotates half the hidden dims of the input..r   Nr   r   r,   )r&   stackflatten)xx1x2r   r   r   rotate_halfK   s   r:   c                 C   s   | |}| |}|dd|jd d f jddd}|dd|jd d f jddd}|jd }| dd|f | d|df }}|dd|f |d|df }}	|| t||  }
|| t||  }tj|
|gdd}
tj||	gdd}|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Nr2   r   r3   )	unsqueezeshaperepeat_interleaver:   r&   cat)qkcossinunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embedr   r   r   apply_rotary_pos_embR   s   

$$
""rK   c                       s,   e Zd ZddededB f fddZ  ZS )GlmAttentionNr   	layer_idxc                    s.   t  || tj|j| j |jdd| _d S )NF)bias)super__init__nnLinearr$   r   r#   o_proj)selfr   rM   	__class__r   r   rP   {   s    zGlmAttention.__init__)N)r   r   r   r   r%   rP   __classcell__r   r   rU   r   rL   z   s    $rL   c                   @   r   )GlmForCausalLMNr   r   r   r   r   rX      r   rX   c                   @   r   )GlmForSequenceClassificationNr   r   r   r   r   rY      r   rY   c                   @   r   )GlmForTokenClassificationNr   r   r   r   r   rZ      r   rZ   )GlmPreTrainedModelGlmModelrX   rY   rZ   )r   )typingr   r&   torch.nnrQ   utilsr   llama.modeling_llamar   r   r   r	   r
   phi3.modeling_phi3r   configuration_glmr   
get_loggerr   logger_CHECKPOINT_FOR_DOCr   r   r:   rK   rL   rX   rY   rZ   __all__r   r   r   r   <module>   s$   
"
(