o
    eiY                     @   s  d dl mZ d dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( e rd dl)Z)d dl)m*Z* G dd de*j+Z,dd Z-de)j.de/de)j.fddZ0	d=d e*j+d!e)j.d"e)j.d#e)j.d$e)j.dB d%e1d&e1d'ee fd(d)Z2d>d*d+Z3ee3G d,d- d-e*j+Z4G d.d/ d/e*j+Z5G d0d1 d1eZ6eG d2d3 d3eZ7G d4d5 d5e7Z8G d6d7 d7e*j+Z9ed8d9G d:d; d;e7e	Z:g d<Z;dS )?    )Callable)Optional   )ACT2FN)Cache)GenerationMixin)use_kernelized_func)GradientCheckpointingLayer)BaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_torch_available)can_return_tuplemaybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModelAutoModelForCausalLM   )GlmAsrConfigGlmAsrEncoderConfigN)nnc                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )GlmAsrRotaryEmbeddinginv_freqNconfigc                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr    F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr!   rope_parametersr"   compute_default_rope_parametersr   attention_scalingregister_bufferclone)selfr!   devicerope_init_fnr    	__class__ h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/glmasr/modeling_glmasr.pyr'   0   s   


zGlmAsrRotaryEmbedding.__init__r1   ztorch.deviceseq_lenreturnztorch.Tensorc           	      C   st   | j d }| j dd}t| ddp| j| j }t|| }d}d|tjd|dtjdj	|tj
d	|   }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr   r   dtype)r1   r=   )r+   getgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r!   r1   r7   baser:   r;   dimattention_factorr    r5   r5   r6   r,   @   s   
&z5GlmAsrRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r   mpscpuF)device_typeenabledr   rI   r<   )r    rG   expandshaperF   r1   
isinstancetypestrr   	transposerC   catcosr-   sinr=   )
r0   xposition_idsinv_freq_expandedposition_ids_expandedrN   freqsembrX   rY   r5   r5   r6   forward`   s   0&zGlmAsrRotaryEmbedding.forwardN)NNN)__name__
__module____qualname__rC   Tensor__annotations__r   r'   staticmethodr   rB   tuplerG   r,   no_gradr   r`   __classcell__r5   r5   r3   r6   r   -   s&   
 

r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrK   r   rP   )rR   rC   rW   )rZ   x1x2r5   r5   r6   rotate_halfp   s   rm   hidden_statesn_repr8   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rR   rQ   reshape)rn   ro   batchnum_key_value_headsslenr;   r5   r5   r6   	repeat_kvw   s
   0rt           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr   r   rK   )rI   r=   )ptrainingr   )rt   num_key_value_groupsrC   matmulrV   r   
functionalsoftmaxfloat32rF   r=   r|   r   
contiguous)rv   rw   rx   ry   rz   r{   r|   r}   
key_statesvalue_statesattn_weightsattn_outputr5   r5   r6   eager_attention_forward   s   
r   c                 C   s   | |}| |}|jd }| dd |f | d|d f }}|dd |f |d|d f }	}
|| t||  }|	| t|	|  }tj||gdd}tj||
gdd}||fS )NrK   .rP   )	unsqueezerR   rm   rC   rW   )qkrX   rY   r[   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embedr5   r5   r6   apply_rotary_pos_emb   s   


""r   c                       sj   e Zd ZdZdedef fddZ	ddejde	ejejf dB d	e
e d
e	ejejf fddZ  ZS )GlmAsrAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr!   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j| j |jdd| _d S )Nr;   g      FT)bias)r&   r'   r!   r   r?   r@   rA   r;   rr   r   r{   attention_dropout	is_causalr   Linearq_projk_projv_projo_projr0   r!   r   r3   r5   r6   r'      s   
 zGlmAsrAttention.__init__Nrn   position_embeddingsr}   r8   c                 K   s   |j d d }g |d| jR }| ||dd}| ||dd}| ||dd}|\}	}
t|||	|
\}}t	| j
jt}|| |||fd | jsVdn| j| jd|\}}|jg |dR   }| |}||fS )NrK   r   r   ru   )rz   r|   r{   )rR   r;   r   viewrV   r   r   r   r   get_interfacer!   _attn_implementationr   r   r   r{   rp   r   r   )r0   rn   r   r}   input_shapehidden_shapequery_statesr   r   rX   rY   attention_interfacer   r   r5   r5   r6   r`      s2   

zGlmAsrAttention.forwardra   )rb   rc   rd   __doc__r   rB   r'   rC   re   rh   r   r   r`   rj   r5   r5   r3   r6   r      s    r   c                       s,   e Zd Z fddZdejfddZ  ZS )	GlmAsrMLPc                    s>   t    t|j|j| _t|j|j| _t|j	 | _
d S ra   )r&   r'   r   r   r@   intermediate_sizefc1fc2r   
hidden_actact_fnr0   r!   r3   r5   r6   r'      s   
zGlmAsrMLP.__init__rn   c                 C   s"   |  |}| |}| |}|S ra   )r   r   r   )r0   rn   r5   r5   r6   r`         


zGlmAsrMLP.forward)rb   rc   rd   r'   rC   re   r`   rj   r5   r5   r3   r6   r      s    r   c                	       s\   e Zd Zdedef fddZ	ddejdeejejf dB de	e
 d	ejfd
dZ  ZS )GlmAsrEncoderLayerr!   r   c                    sJ   t    |j| _t||d| _t|| _t|j| _	t|j| _
d S )N)r!   r   )r&   r'   r@   r   	self_attnr   mlpr   	LayerNorminput_layernormpost_attention_layernormr   r3   r5   r6   r'      s   

zGlmAsrEncoderLayer.__init__Nrn   r   r}   r8   c                 K   sT   |}|  |}| jd||d|\}}|| }|}| |}| |}|| }|S )N)rn   r   r5   )r   r   r   r   )r0   rn   r   r}   residual_r5   r5   r6   r`      s   



zGlmAsrEncoderLayer.forwardra   )rb   rc   rd   r   rB   r'   rC   re   rh   r   r   r`   rj   r5   r5   r3   r6   r      s    r   c                   @   s4   e Zd ZU eed< dZdZdZdgZdZ	dZ
dZdS )GlmAsrPreTrainedModelr!   model)audiotextTr   past_key_valuesN)rb   rc   rd   r   rf   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpar5   r5   r5   r6   r     s   
 r   c                       sb   e Zd ZU eed< dZdZdgZee	dZ
def fddZeeedee fd	d
Z  ZS )GlmAsrEncoderr!   input_featuresr   r   )rn   
attentionsc                    s   t    tj j jddd| _tj j jdddd| _t fddt	 j
D | _t j| _t d| _d	| _|   d S )
Nr   r   )kernel_sizepaddingr   )r   strider   c                    s   g | ]}t  |qS r5   )r   ).0r   r!   r5   r6   
<listcomp>2  s    z*GlmAsrEncoder.__init__.<locals>.<listcomp>r   F)r&   r'   r   Conv1dnum_mel_binsr@   conv1conv2
ModuleListrangenum_hidden_layerslayersr   normr   
rotary_embgradient_checkpointing	post_initr   r3   r   r6   r'   ,  s   zGlmAsrEncoder.__init__r}   c                 K   s   t j| |}t j| |}|dd}|}| j|tj|j	d |j
dd d d f d}| jD ]}||fd|i|}q3| |}t|dS )Nr   r   r1   )r[   r   )last_hidden_state)r   r   gelur   r   rV   r   rC   rD   rR   r1   r   r   r
   )r0   r   r}   inputs_embedsrn   r   encoder_layerr5   r5   r6   r`   9  s   "


zGlmAsrEncoder.forward)rb   rc   rd   r   rf   main_input_namer   r   r   r   _can_record_outputsr'   r   r   r   r   r   r`   rj   r5   r5   r3   r6   r   "  s   
  r   c                       s.   e Zd ZdZdef fddZdd Z  ZS )GlmAsrMultiModalProjectorz
    Audio adaptor (small MLP) that projects GlmAsrEncoder features
    to the LLM embedding space so they can replace `<sound>` tokens.
    r!   c                    sN   t    t|jj|jjd | _t	|j
 | _t|jjd |jj| _d S )Nr   )r&   r'   r   r   audio_configr   text_configr@   linear_1r   projector_hidden_actactlinear_2r   r3   r5   r6   r'   S  s   
z"GlmAsrMultiModalProjector.__init__c                 C   s"   |  |}| |}| |}|S ra   )r   r   r   )r0   audio_featuresrn   r5   r5   r6   r`   Y  r   z!GlmAsrMultiModalProjector.forward)rb   rc   rd   r   r   r'   r`   rj   r5   r5   r3   r6   r   M  s    r   z~
    The GlmAsr model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Llama language model.
    custom_introc                       s8  e Zd ZdZdZdZ fddZdd Zdd Zdd	 Z	d
d Z
dd Zdd Zeedddejdejdee deeB fddZee											d&dejdB dejdB dejdB dejdB dejdB dedB dejdB dejdB dedB d ejdB d!eejB dee defd"d#Z fd$d%Z  ZS )'GlmAsrForConditionalGenerationNc                    sH   t  | |jj| _t|j| _t|j| _	t
|| _|   d S ra   )r&   r'   r   
vocab_sizer   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   r   r3   r5   r6   r'   j  s   

z'GlmAsrForConditionalGeneration.__init__c                 C   
   | j  S ra   )r   get_input_embeddingsr0   r5   r5   r6   r   t     
z3GlmAsrForConditionalGeneration.get_input_embeddingsc                 C      | j | d S ra   )r   set_input_embeddings)r0   ry   r5   r5   r6   r   w     z3GlmAsrForConditionalGeneration.set_input_embeddingsc                 C   r   ra   )r   get_output_embeddingsr   r5   r5   r6   r   z  r   z4GlmAsrForConditionalGeneration.get_output_embeddingsc                 C   r   ra   )r   set_output_embeddings)r0   new_embeddingsr5   r5   r6   r   }  r   z4GlmAsrForConditionalGeneration.set_output_embeddingsc                 C   r   ra   )r   set_decoder)r0   decoderr5   r5   r6   r     r   z*GlmAsrForConditionalGeneration.set_decoderc                 C   r   ra   )r   get_decoderr   r5   r5   r6   r     r   z*GlmAsrForConditionalGeneration.get_decoderzgCompute audio embeddings from log-mel input features using the audio encoder and multi-modal projector.r   r   input_features_maskr}   r8   c                 K   s   | j |fddi|}|j}||jd d| jjj}| |}|d}dD ]\}}	}
|d|  |	d  d |
 d }q'd}|| | d }t	j
|jd |jd	d
d
d
f |d
d
d
f k }|||j |_|S )a
  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padded feature indices.
        return_dictTr   rK   ))r   r   r   )r   r   r   r   r      r   N)r   r   rp   rR   r!   r   r   r   sumrC   rD   r1   rF   pooler_output)r0   r   r   r}   audio_outputsaudio_hidden_statesaudio_embedsaudio_lengthsr   r   r   merge_factorpost_lengths
valid_maskr5   r5   r6   get_audio_features  s   

"2z1GlmAsrForConditionalGeneration.get_audio_featuresr   	input_idsrz   r[   r   r   labels	use_cachecache_positionlogits_to_keepc                 K   s   |du r
|   |}|dur2|dur2| j||ddj}|| jjkd}|||j||j}| j	d||||||	|
|d|}|S )a  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import GlmAsrForConditionalGeneration, AutoProcessor

        >>> model_id = "zai-org/GLM-ASR-Nano-2512"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = GlmAsrForConditionalGeneration.from_pretrained(model_id, dtype="auto", device_map="auto")
        >>> inputs = processor.apply_transcription_request("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")

        >>> inputs = inputs.to(model.device, dtype=model.dtype)

        >>> outputs = model.generate(**inputs, do_sample=False, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
        >>> print(decoded_outputs)
        ```NT)r  rK   )r   rz   r[   r   r  r  r  r  r5   )
r   r  r  r!   audio_token_idr   masked_scatterrF   r1   r   )r0   r  r   r   rz   r[   r   r   r  r  r  r  r}   r  audio_token_maskoutputsr5   r5   r6   r`     s*   .	z&GlmAsrForConditionalGeneration.forwardc                    sl   | dd }| dd }|d}t j|i |}|d ur4|d dkr4|d ur,||d< |d ur4||d< |S )Nr   r   r  r   )popr>   r&   prepare_inputs_for_generation)r0   argsr}   r   r   r  model_inputsr3   r5   r6   r    s   
z<GlmAsrForConditionalGeneration.prepare_inputs_for_generation)NNNNNNNNNNr   )rb   rc   rd   _keep_in_fp32_modules_strict_tp_plan_pp_planr'   r   r   r   r   r   r   r   r   rC   FloatTensorre   r   r   rh   r
   r  
LongTensorr   boolrB   r   r`   r  rj   r5   r5   r3   r6   r   `  s    
"	
Er   )r   r   r   )ru   )Nr   )<collections.abcr   typingr   activationsr   cache_utilsr   
generationr   integrationsr   modeling_layersr	   modeling_outputsr
   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   r   utils.output_capturingr   autor   r   configuration_glmasrr   r   rC   r   Moduler   rm   re   rB   rt   rG   r   r   r   r   r   r   r   r   r   __all__r5   r5   r5   r6   <module>   sn   C

5#+ "