o
    eixy                     @   s:  d dl mZ d dlmZ d dlmZ d dlZd dlmZ ddlm	Z	 ddl
mZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( G dd dej)Z*G dd dej)Z+dd Z,eddAddZ-dej.de/dej.fd d!Z0	"dBd#ej)d$ej.d%ej.d&ej.d'ej.dB d(e1d)e1d*ee fd+d,Z2ee-G d-d. d.ej)Z3G d/d0 d0ej)Z4G d1d2 d2ej)Z5G d3d4 d4eZ6eG d5d6 d6eZ7ed7d8G d9d: d:e7Z8eG d;d< d<eZ9ed=d8G d>d? d?e7Z:g d@Z;dS )C    )Callable)	dataclass)OptionalN)nn   )ACT2FN)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputCausalLMOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )LasrCTCConfigLasrEncoderConfigc                       s8   e Zd Zdef fddZdejdejfddZ  ZS )LasrEncoderSubsamplingconfigc                    st   t    t|j|j| _tj|j|j|j|j	d| _
tj|j|j|j|j	d| _t|j|j| _t | _d S )N)kernel_sizestride)super__init__r   Linearnum_mel_binshidden_sizedense_0Conv1dsubsampling_conv_kernel_sizesubsampling_conv_strideconv_0subsampling_conv_channelsconv_1dense_1ReLUact_fnselfr   	__class__ d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/lasr/modeling_lasr.pyr"   +   s    
zLasrEncoderSubsampling.__init__input_featuresreturnc                 C   sR   |  | |}|dd}|  | |}|  | |}|dd}| |S )Nr      )r/   r&   	transposer*   r,   r-   )r1   r6   hidden_statesr4   r4   r5   forward=   s   
zLasrEncoderSubsampling.forward)	__name__
__module____qualname__r   r"   torchTensorr;   __classcell__r4   r4   r2   r5   r   *   s    r   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )LasrEncoderRotaryEmbeddinginv_freqNr   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultrC   F)
persistentoriginal_inv_freq)r!   r"   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr   rope_parametersrD   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r1   r   devicerope_init_fnrC   r2   r4   r5   r"   I   s   


z#LasrEncoderRotaryEmbedding.__init__rP   ztorch.deviceseq_lenr7   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r8   dtype)rP   rV   )	rK   getattrr%   num_attention_headsr?   arangeint64tofloat)r   rP   rR   basedimattention_factorrC   r4   r4   r5   rL   Y   s   
&z:LasrEncoderRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r   mpscpuF)device_typeenabledr8   r^   rU   )rC   r\   expandshaper[   rP   
isinstancetypestrr   r9   r?   catcosrM   sinrV   )
r1   xposition_idsinv_freq_expandedposition_ids_expandedrc   freqsembrl   rm   r4   r4   r5   r;   w   s   0&z"LasrEncoderRotaryEmbedding.forwardN)NNN)r<   r=   r>   r?   r@   __annotations__r   r"   staticmethodr   inttupler\   rL   no_gradr   r;   rA   r4   r4   r2   r5   rB   F   s&   
 

rB   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr`   r8   re   )rg   r?   rk   )rn   x1x2r4   r4   r5   rotate_half   s   r|   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer|   )qkrl   rm   unsqueeze_dimq_embedk_embedr4   r4   r5   apply_rotary_pos_emb   s
   

r   r:   n_repr7   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rg   rf   reshape)r:   r   batchnum_key_value_headsslenrT   r4   r4   r5   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr8   r   r`   r^   rV   ptrainingr   )r   num_key_value_groupsr?   matmulr9   r   
functionalsoftmaxfloat32r[   rV   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputr4   r4   r5   eager_attention_forward   s   
r   c                       sv   e Zd ZdZdedef fddZ		ddejde	ejejf dB d	ejdB d
e
e de	ejejf f
ddZ  ZS )LasrEncoderAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )NrT   g      Fbias)r!   r"   r   r   rW   r%   rX   rT   r   r   r   attention_dropout	is_causalr   r#   attention_biasq_projk_projv_projo_projr1   r   r   r2   r4   r5   r"      s(   
zLasrEncoderAttention.__init__Nr:   position_embeddingsr   r   r7   c                 K   s   |j d d }g |d| jR }| ||dd}| ||dd}| ||dd}	|\}
}t|||
|\}}t	| j
jt}|| |||	|f| jsVdn| j| jd|\}}|jg |dR   }| |}||fS )Nr`   r   r8   r   )r   r   )rg   rT   r   viewr9   r   r   r   r   get_interfacer   _attn_implementationr   r   r   r   r   r   r   )r1   r:   r   r   r   input_shapehidden_shapequery_statesr   r   rl   rm   attention_interfacer   r   r4   r4   r5   r;      s2   

zLasrEncoderAttention.forwardNN)r<   r=   r>   __doc__r   rw   r"   r?   r@   rx   r   r   r;   rA   r4   r4   r2   r5   r      s     r   c                       s.   e Zd Zddef fddZdddZ  ZS )LasrEncoderConvolutionModuleNr   c              	      s   t    |j}|du r|j}tt|dd | _n|d }t|dd | _d| _t	j
|d| ddd	|jd
| _t	j
|||d| j||jd| _t	j|j|jd| _t	j
||ddd	|jd
| _dS )z
        Args:
            config (LasrEncoderConfig): Configuration for the model.
            module_config (dict): Configuration for the module (e.g., encoder or decoder).
        N
hidden_actsilur   
activationsamer8   r   r   )r   r    paddingr   )r    r   groupsr   )momentum)r!   r"   r%   conv_kernel_sizer   rW   r   getr   r   r'   convolution_biaspointwise_conv1depthwise_convBatchNorm1dbatch_norm_momentumnormpointwise_conv2)r1   r   module_configchannelsr   r2   r4   r5   r"     s0   
	z%LasrEncoderConvolutionModule.__init__c                 C   s   | dd}| |}tjj|dd}|dur6|jtjkr&tj| dd}n
tj|dk dd}|	|d}| 
|}| |}| |}| |}| ddS )aY  
        Compute convolution module.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
            attention_mask (`torch.Tensor` of shape `(batch, 1, time, time)`): Attention mask.

        Returns:
            `torch.Tensor`: Output tensor of shape `(batch, time, channels)`.

        r   r8   re   Nr   )r9   r   r   r   glurV   r?   boolallmasked_fillr   r   r   r   )r1   r:   r   all_masked_rowsr4   r4   r5   r;   0  s   




z$LasrEncoderConvolutionModule.forwardrt   r<   r=   r>   r   r"   r;   rA   r4   r4   r2   r5   r     s    "r   c                       s*   e Zd Zdef fddZdd Z  ZS )LasrEncoderFeedForwardr   c                    sR   t    tj|j|j|jd| _t|j	 | _
tj|j|j|jd| _|j| _d S )Nr   )r!   r"   r   r#   r%   intermediate_sizer   linear1r   r   r   linear2activation_dropoutr0   r2   r4   r5   r"   V  s
   
zLasrEncoderFeedForward.__init__c                 C   s4   |  | |}tjj|| j| jd}| |}|S )Nr   )r   r   r   r   r   r   r   r   )r1   r:   r4   r4   r5   r;   ]  s   
zLasrEncoderFeedForward.forwardr   r4   r4   r2   r5   r   U  s    r   c                       s^   e Zd Zdedef fddZ		ddejdejdB dejdB d	ee	 d
ejf
ddZ
  ZS )LasrEncoderBlockr   r   c                    s   t    d| _t|| _t||| _t|| _t|| _	t
j|j|jdd| _t
j|j|jdd| _t
j|j|jdd| _t
j|j|jdd| _t
j|j|jdd| _|j| _|j| _d S )NFr   )r!   r"   gradient_checkpointingr   feed_forward1r   	self_attnr   convfeed_forward2r   	LayerNormr%   layer_norm_epsnorm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outfeed_forward_residual_weightsconv_residual_weightsr   r2   r4   r5   r"   e  s   



zLasrEncoderBlock.__init__Nr:   r   r   r   r7   c           
      K   s   |}|  | |}| jd | | jd |  }| |}| jd|||d|\}}|| }| j| ||d}	| jd | | jd |	  }|}| | 	|}| jd | | jd |  }| 
|}|S )Nr   r   )r:   r   r   )r   r4   )r   r   r   r   r   r   r   r   r   r   r   )
r1   r:   r   r   r   residualnormalized_hidden_statesr   _conv_outputr4   r4   r5   r;   w  s*   


zLasrEncoderBlock.forwardr   )r<   r=   r>   r   rw   r"   r?   r@   r   r   r;   rA   r4   r4   r2   r5   r   d  s    r   c                       s   e Zd ZU eed< dZdZdZdZdgZ	dZ
dZdZdZdZdZeedZe  fd	d
ZdejfddZddejdedB fddZ  ZS )LasrPreTrainedModelr   modelr6   audioTr   F)r:   
attentionsc                    s   t  | d S rt   )r!   _init_weights)r1   r   r2   r4   r5   r     s   z!LasrPreTrainedModel._init_weightsinput_lengthsc                 C   sL   t | jtr
| jjn| j}|j}|j}d}t|D ]
}|| | d }q|S )Nr8   r   )rh   r   r   encoder_configr(   r)   range)r1   r   r   r   r    
num_layersr   r4   r4   r5   _get_subsampling_output_length  s   z2LasrPreTrainedModel._get_subsampling_output_lengthNr   target_lengthc                 C   sH   |  |d}|dur|n| }tj||jd|dddf k }|S )z
        Convert the input attention mask to its subsampled form. `target_length` sets the desired output length, useful
        when the attention mask length differs from `sum(-1).max()` (i.e., when the longest sequence in the batch is padded)
        r`   NrP   )r   summaxr?   rY   rP   )r1   r   r   output_lengths
max_lengthr4   r4   r5   _get_output_attention_mask  s    z.LasrPreTrainedModel._get_output_attention_maskrt   )r<   r=   r>   r   ru   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flat_attention_mask_supports_sdpa_supports_flex_attn_supports_flash_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr?   ry   r   r@   r   rw   r   rA   r4   r4   r2   r5   r     s(   
 "r   zh
    The LasrEncoder model, based on the Conformer architecture](https://arxiv.org/abs/2005.08100).
    )custom_introc                       sj   e Zd ZU eed< dZdef fddZeee	e
	ddejdejdB dee d	efd
dZ  ZS )LasrEncoderr   encoderc                    s   t    d| _ j| _ j| _ j| _t | _t | _	t
 fddt jD | _t
j j jdd| _|   d S )NFc                    s   g | ]}t  |qS r4   )r   ).0r   r   r4   r5   
<listcomp>  s    z(LasrEncoder.__init__.<locals>.<listcomp>)epsr   )r!   r"   r   r   dropout_positions	layerdropr   
subsamplerrB   
rotary_embr   
ModuleListr   num_hidden_layerslayersr   r%   r   out_norm	post_initr0   r2   r  r5   r"     s   

zLasrEncoder.__init__Nr6   r   r   r7   c           
      K   s   |  |}| |tj|jd |jdd\}}tjj	|| j	| j
d}tjj	|| j| j
d}tjj	|| j| j
d}|durH| j||jd d}t| j||d}| jD ]"}d}| j
rftg }	|	| jk rfd	}|su||f|||fd
|}qS| |}t|dS )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, LasrEncoder
        >>> from datasets import load_dataset, Audio

        >>> model_id = TODO
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> encoder = ParakeetEncoder.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"])
        >>> encoder_outputs = encoder(**inputs)

        >>> print(encoder_outputs.last_hidden_state.shape)
        ```
        r   r   r   r   Nr   )r   inputs_embedsr   FT)r   r   )last_hidden_state)r  r  r?   rY   rg   rP   r~   r   r   r   r   r  r   r
   r   r  randr  r  r   )
r1   r6   r   r   r:   rl   rm   encoder_layerto_dropdropout_probabilityr4   r4   r5   r;     s@   





zLasrEncoder.forwardrt   )r<   r=   r>   r   ru   r   r"   r   r   r   r   r?   r@   r   r   r   r;   rA   r4   r4   r2   r5   r    s$   
 r  c                   @   sf   e Zd ZU dZejed< dZeej	 dB ed< dZ
eeej	  dB ed< dZeeej	  dB ed< dS )LasrGenerateOutputah  
    Outputs of Lasr models.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
    	sequencesNlogitsr   r:   )r<   r=   r>   r   r?   
LongTensorru   r   rx   FloatTensorr   r:   r4   r4   r4   r5   r  -  s   
 
r  zO
    Lasr Encoder with a Connectionist Temporal Classification (CTC) head.
    c                       s   e Zd ZU eed< def fddZee		ddej	dej	dB dej	dB de
e d	ef
d
dZe 		ddej	dej	dB dede
e d	eejB f
ddZ  ZS )
LasrForCTCr   c                    s<   t  | t|j| _tj|jj|jdd| _	| 
  d S )Nr   )r   )r!   r"   r  r   r	  r   r'   r%   
vocab_sizectc_headr  r0   r2   r4   r5   r"   P  s   zLasrForCTC.__init__Nr6   r   labelsr   r7   c              
   K   s  | j d||d|}|j}| |dddd}d}|dur|dur'|ntj|tjd}| |d}	|| j	j
k}
|
d}||
}tjj|dtjddd}tjjjd	d
 tjj|||	|| j	j
| j	j| j	jd}W d   n1 s{w   Y  t|||j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, LasrForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = "nvidia/lasr-ctc-1.1b"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = LasrForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> outputs = model(**inputs)

        >>> print(outputs.loss)
        ```r6   r   r   r8   NrU   r`   r   r   F)rd   )blank	reductionzero_infinity)lossr   r:   r   r4   )r	  r  r%  r9   r?   	ones_likelongr   r   r   pad_token_idmasked_selectr   r   log_softmaxr   backendscudnnflagsctc_lossctc_loss_reductionctc_zero_infinityr   r:   r   )r1   r6   r   r&  r   encoder_outputsr:   r   r+  r   labels_masktarget_lengthsflattened_targets	log_probsr4   r4   r5   r;   X  sD   

zLasrForCTC.forwardFreturn_dict_in_generatec                 K   st   d|d< | j d
||d|}|jjdd}|dur+| j||jd d}| jj|| < |r8t||j|j|j	d	S |S )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, LasrForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = TODO
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = LasrForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> predicted_ids = model.generate(**inputs)
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        >>> print(transcription)
        ```
        Treturn_dictr'  r`   re   Nr   r  )r  r   r   r:   r4   )
r;   r   argmaxr   rg   r   r.  r  r   r:   )r1   r6   r   r<  r   outputsr  r4   r4   r5   generate  s&   zLasrForCTC.generater   )NF)r<   r=   r>   r   ru   r"   r   r   r?   r@   r   r   r   r;   ry   r   r  r!  r@  rA   r4   r4   r2   r5   r#  H  s@   
 Gr#  )r#  r  r   )r   )r   )<collections.abcr   dataclassesr   typingr   r?   r   activationsr   integrationsr   r	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_lasrr   r   Moduler   rB   r|   r   r@   rw   r   r\   r   r   r   r   r   r   r  r  r#  __all__r4   r4   r4   r5   <module>   sx   A
?H71[ 