o
    iH~                     @   s  d dl Z d dlmZ d dlmZmZmZ d dlZd dlmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlmZm Z  G dd dej!Z"G dd dej!Z#G dd dej!Z$dej%de&dej%fddZ'	d8dej!dej%dej%dej%d eej% d!e(d"e(d#ee fd$d%Z)G d&d' d'ej!Z*G d(d) d)ej!Z+G d*d+ d+eZ,eG d,d- d-eZ-ed.d/G d0d1 d1e-Z.eG d2d3 d3eZ/ed4d/G d5d6 d6e-Z0g d7Z1dS )9    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputCausalLMOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs   )ParakeetCTCConfigParakeetEncoderConfigc                       sL   e Zd ZU dZejed< d
def fddZe	 dejfdd	Z
  ZS )$ParakeetEncoderRelPositionalEncodingz*Relative positional encoding for Parakeet.inv_freqNconfigc                    sZ   t    |j| _d}d|tjd|jdtjdj|tjd|j   }| j	d|dd	 d S )
Ng     @      ?r      dtype)devicer   r   F)
persistent)
super__init__max_position_embeddingstorcharangehidden_sizeint64tofloatregister_buffer)selfr   r   baser   	__class__ b/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/parakeet/modeling_parakeet.pyr"   -   s   
 z-ParakeetEncoderRelPositionalEncoding.__init__hidden_statesc                 C   sF  |j d }|| jkrtd| d| j dtj|d | d|jd}| jd d d d f  |j d dd	|j}|d d d d f  }t
|jjtrW|jjdkrW|jjnd	}tj|d
d4 | |  dd}| }| }	tj||	gdd}
|
jg |
j d d dR  }
W d    n1 sw   Y  |
j	|jdS )Nr   zSequence Length: z= has to be less or equal than config.max_position_embeddings .r   r   mpscpuF)device_typeenabledr   dimr   )shaper#   
ValueErrorr$   r%   r   r   r)   expandr(   
isinstancetypestrautocast	transposesincosstackreshaper   )r+   r1   
seq_lengthposition_idsinv_freq_expandedposition_ids_expandedr7   freqsrD   rE   	pos_embedr/   r/   r0   forward;   s2   

. z,ParakeetEncoderRelPositionalEncoding.forwardN)__name__
__module____qualname____doc__r$   Tensor__annotations__r   r"   no_gradrN   __classcell__r/   r/   r-   r0   r   (   s   
 
r   c                       s*   e Zd Zdef fddZdd Z  ZS )ParakeetEncoderFeedForwardr   c                    sR   t    tj|j|j|jd| _t|j	 | _
tj|j|j|jd| _|j| _d S )Nbias)r!   r"   r   Linearr&   intermediate_sizeattention_biaslinear1r   
hidden_act
activationlinear2activation_dropoutr+   r   r-   r/   r0   r"   [   s
   
z#ParakeetEncoderFeedForward.__init__c                 C   s4   |  | |}tjj|| j| jd}| |}|S )Nptraining)r`   r^   r   
functionaldropoutrb   rf   ra   )r+   r1   r/   r/   r0   rN   b   s   
z"ParakeetEncoderFeedForward.forwardrP   rQ   rR   r   r"   rN   rW   r/   r/   r-   r0   rX   Z   s    rX   c                       s.   e Zd Zddef fddZdddZ  ZS ) ParakeetEncoderConvolutionModuleNr   c              	      s   t    |j}|du r|j}tt|dd | _n|d }t|dd | _|d d | _t	j
|d| dddd	d
| _t	j
|||d| j|d	d| _t	|| _t	j
||dddd	d
| _dS )z
        Args:
            config (ParakeetEncoderConfig): Configuration for the model.
            module_config (dict): Configuration for the module (e.g., encoder or decoder).
        Nr_   silukernel_sizer`   r   r   r   T)rl   stridepaddingrZ   )rm   rn   groupsrZ   )r!   r"   r&   conv_kernel_sizer   getattrr`   getrn   r   Conv1dpointwise_conv1depthwise_convBatchNorm1dnormpointwise_conv2)r+   r   module_configchannelsrl   r-   r/   r0   r"   j   s   
z)ParakeetEncoderConvolutionModule.__init__c                 C   s~   | dd}| |}tjj|dd}|dur%tj| dd}||d}| |}| 	|}| 
|}| |}| ddS )aS  
        Compute convolution module.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
            attention_mask (`torch.Tensor` of shape `(batch, 1, time)`): Attention mask.

        Returns:
            `torch.Tensor`: Output tensor of shape `(batch, time, channels)`.

        r   r   r9   Nr3           )rC   rt   r   rg   glur$   allmasked_fillru   rw   r`   rx   )r+   r1   attention_maskall_masked_rowsr/   r/   r0   rN      s   




z(ParakeetEncoderConvolutionModule.forwardrO   ri   r/   r/   r-   r0   rj   i   s    rj   r1   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r<   r>   rG   )r1   r   batchnum_key_value_headsslenhead_dimr/   r/   r0   	repeat_kv   s
   0r   r{   modulequerykeyvaluer   scalingrh   kwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr   r   r;   r3   r:   r   rd   r   )r   num_key_value_groupsr$   matmulrC   r<   r   rg   softmaxfloat32r(   r   rh   rf   
contiguous)r   r   r   r   r   r   rh   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputr/   r/   r0   eager_attention_forward   s   
&r   c                       s   e Zd ZdZdedef fddZedddd		
ddej	de
ej	 de
ej	 dee deej	ej	f f
ddZdd Z  ZS )ParakeetEncoderAttentionztMulti-head attention with relative positional encoding. See section 3.3 of https://huggingface.co/papers/1901.02860.r   	layer_idxc                    s  t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _tj|j|j| j dd| _tt|j| j| _tt|j| j| _d S )Nr   g      FrY   )r!   r"   r   r   rq   r&   num_attention_headsr   r   r   r   attention_dropout	is_causalr   r[   r]   q_projk_projv_projo_projrelative_k_proj	Parameterr$   zerosbias_ubias_vr+   r   r   r-   r/   r0   r"      s.   
z!ParakeetEncoderAttention.__init__past_key_valuepast_key_valuesz4.58)new_nameversionNr1   position_embeddingsr   r   r   c              	   K   s  |j d d }|\}}||d| jf}| ||dd}	| ||dd}
| ||dd}t}| jj	dkrDt
| jj	 }|	| jd| jjd| j }|	| jd| jjd| j }| |}||d| jj| j}||dddd }| |}|dd |f }|| j }|d ur|| td}|| f||
||| jsd	n| j| jd
|\}}|jg |dR   }| |}||fS )Nr3   r   r   eagerr   r   .z-infr{   )r   r   r   r   rh   r   )r<   r   r   viewrC   r   r   r   r   _attn_implementationr   r   r   r   r   permute
_rel_shiftr   masked_fill_logical_notr)   rf   r   rG   r   r   )r+   r1   r   r   r   input_shape
batch_sizerH   hidden_shapequery_statesr   r   attention_interfacequery_states_with_bias_uquery_states_with_bias_vrelative_key_states	matrix_bdr   r   r/   r/   r0   rN      sL   




z ParakeetEncoderAttention.forwardc                 C   sX   |j \}}}}tjj|dd}|||d|}|ddddddf ||||}|S )ztRelative position shift for Shaw et al. style attention. See appendix B of https://huggingface.co/papers/1901.02860.)r   r   )padr3   Nr   )r<   r   rg   r   r   )r+   attention_scoresr   	num_headsquery_lengthposition_lengthr/   r/   r0   r   $  s
   &z#ParakeetEncoderAttention._rel_shiftrO   )rP   rQ   rR   rS   r   intr"   r   r$   rT   r   r   r   tuplerN   r   rW   r/   r/   r-   r0   r      s"    9r   c                       sP   e Zd Zdef fddZdejdejfddZ	dd	ejd
ejfddZ
  ZS ) ParakeetEncoderSubsamplingConv2Dr   c                    s  t    |j| _|j| _|j| _| jd d | _t	t
|j| _t | _| jtjd| j| j| j| jd | jt  t| jd D ]-}| jtj| j| j| j| j| j| jd | jtj| j| jdd | jt  qH|j| j| j  }tj|j| |jdd| _d S )Nr   r   )rl   rm   rn   )rl   rm   rn   ro   rl   TrY   )r!   r"   subsampling_conv_kernel_sizerl   subsampling_conv_striderm   subsampling_conv_channelsrz   rn   r   mathlog2subsampling_factor
num_layersr   
ModuleListlayersappendConv2dReLUrangenum_mel_binsr[   r&   linear)r+   r   i
out_lengthr-   r/   r0   r"   .  s4   

z)ParakeetEncoderSubsamplingConv2D.__init__input_lengths
conv_layerc                 C   sV   t |dr)|jdkr)|j}|jd }|jd }||d  |d  | | d }|S |S )Nrm   )r   r   r   r   )hasattrrm   rn   rl   )r+   r   r   rn   rl   rm   output_lengthsr/   r/   r0   _get_output_lengthQ  s   

 z3ParakeetEncoderSubsamplingConv2D._get_output_lengthNinput_featuresr   c                 C   s   | d}|d ur|dnd }| jD ]9}||}t|tjrL|d urL| ||}|jd }tj	||j
d|d d d f k }||d d d d d d f 9 }q|dd|jd |jd d}| |}|S )Nr   r3   r   r4   r   )	unsqueezesumr   r?   r   r   r   r<   r$   r%   r   rC   rG   r   )r+   r   r   r1   current_lengthslayercurrent_seq_lengthchannel_maskr/   r/   r0   rN   \  s   


"
z(ParakeetEncoderSubsamplingConv2D.forwardrO   )rP   rQ   rR   r   r"   r$   rT   r   r   r   rN   rW   r/   r/   r-   r0   r   -  s    # r   c                       sd   e Zd Zddedee f fddZ		ddejdeej deej d	e	e
 d
ejf
ddZ  ZS )ParakeetEncoderBlockNr   r   c                    s   t    d| _t|| _t||| _t|| _t|| _	t
|j| _t
|j| _t
|j| _t
|j| _t
|j| _d S NF)r!   r"   gradient_checkpointingrX   feed_forward1r   	self_attnrj   convfeed_forward2r   	LayerNormr&   norm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outr   r-   r/   r0   r"   s  s   



zParakeetEncoderBlock.__init__r1   r   r   r   r   c                 K   s   |}|  | |}|d|  }| |}| jd|||d|\}}|| }| j| ||d}	||	 }| | |}
|d|
  }| |}|S )Ng      ?)r1   r   r   )r   r/   )	r   r   r   r   r   r   r   r   r   )r+   r1   r   r   r   residualnormalized_hidden_statesr   _conv_output
ff2_outputr/   r/   r0   rN     s$   


zParakeetEncoderBlock.forwardrO   NN)rP   rQ   rR   r   r   r   r"   r$   rT   r   r   rN   rW   r/   r/   r-   r0   r   r  s    r   c                       s   e Zd ZU eed< dZdZdZdgZdZ	dZ
dZdZdZdZeedZ fdd	Zd
ejfddZddejdee fddZ  ZS )ParakeetPreTrainedModelr   modelr   Tr   F)r1   
attentionsc                    sj   t  | t| jdr| jj}n	t| j dd}t|tr3|j	j
jd|d |jj
jd|d d S d S )Ninitializer_rangeg{Gz?r{   )meanstd)r!   _init_weightsr   r   r   rq   get_text_configr?   r   r   datanormal_r   )r+   r   r   r-   r/   r0   r     s   

z%ParakeetPreTrainedModel._init_weightsr   c           
      C   s   t | jtr
| jjn| j}|j}|j}tt|j	}|d d d }|| }|}t
|D ]}	t|jtjd| |d }t|}q-|jtjdS )Nr   r   r   r   )r?   r   r   encoder_configr   r   r   r   r   r   r   r$   divr(   r)   floor)
r+   r   r  rl   rm   r   all_paddingsadd_padlengthsr   r/   r/   r0   _get_subsampling_output_length  s   z6ParakeetPreTrainedModel._get_subsampling_output_lengthNr   target_lengthc                 C   sH   |  |d}|dur|n| }tj||jd|dddf k }|S )z
        Convert the input attention mask to its subsampled form. `target_length` sets the desired output length, useful
        when the attention mask length differs from `sum(-1).max()` (i.e., when the longest sequence in the batch is padded)
        r3   Nr4   )r  r   maxr$   r%   r   )r+   r   r  r   
max_lengthr/   r/   r0   _get_output_attention_mask  s    z2ParakeetPreTrainedModel._get_output_attention_maskrO   )rP   rQ   rR   r   rU   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flat_attention_mask_supports_sdpa_supports_flex_attn_supports_flash_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr   r$   rT   r  r   r   r  rW   r/   r/   r-   r0   r     s$   
 "r   z{
    The Parakeet Encoder model, based on the [Fast Conformer architecture](https://huggingface.co/papers/2305.05084).
    )custom_introc                       sf   e Zd ZU eed< dZdef fddZeee		dde
jdee
j dee d	efd
dZ  ZS )ParakeetEncoderr   encoderc                    s   t     | _d| _ j| _ j| _ j| _ jr!t	 j
nd| _t | _t | _t fddt jD | _|   d S )NFr   c                    s   g | ]}t  |qS r/   )r   ).0r   r   r/   r0   
<listcomp>  s    z,ParakeetEncoder.__init__.<locals>.<listcomp>)r!   r"   r   r   rh   dropout_positions	layerdropscale_inputr   sqrtr&   input_scaler   subsamplingr   encode_positionsr   r   r   num_hidden_layersr   	post_initrc   r-   r  r0   r"     s   

zParakeetEncoder.__init__Nr   r   r   r   c           	      K   s   |  ||}|| j }| |}tjj|| j| jd}tjj|| j| jd}|durN| j||j	d d}|
dd|j	d d}||dd@ }|
d}| jD ] }d}| jrdtg }|| jk rdd}|sq||f||d	|}qQt|d
S )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, ParakeetEncoder
        >>> from datasets import load_dataset, Audio

        >>> model_id = "nvidia/parakeet-ctc-1.1b"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> encoder = ParakeetEncoder.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"])
        >>> encoder_outputs = encoder(**inputs)

        >>> print(encoder_outputs.last_hidden_state.shape)
        ```
        rd   Nr   r  r3   r   FT)r   r   )last_hidden_state)r"  r!  r#  r   rg   rh   rf   r  r  r<   r   r>   rC   r   r$   randr  r
   )	r+   r   r   r   r1   r   encoder_layerto_dropdropout_probabilityr/   r/   r0   rN     s:   







zParakeetEncoder.forwardrO   )rP   rQ   rR   r   rU   r  r"   r   r   r   r$   rT   r   r   r   r
   rN   rW   r/   r/   r-   r0   r    s"   
 r  c                   @   sf   e Zd ZU dZejed< dZee	ej
  ed< dZee	e	ej
   ed< dZee	e	ej
   ed< dS )ParakeetGenerateOutputal  
    Outputs of Parakeet models.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
    	sequencesNlogitsr   r1   )rP   rQ   rR   rS   r$   
LongTensorrU   r.  r   r   FloatTensorr   r1   r/   r/   r/   r0   r,  =  s   
 
r,  zS
    Parakeet Encoder with a Connectionist Temporal Classification (CTC) head.
    c                       s   e Zd ZU eed< def fddZee		ddej	de
ej	 de
ej	 dee d	ef
d
dZe 		ddej	de
ej	 dedee d	eeejf f
ddZ  ZS )ParakeetForCTCr   c                    s<   t  | t|j| _tj|jj|jdd| _	| 
  d S )Nr   r   )r!   r"   r  r  r  r   rs   r&   
vocab_sizectc_headr%  rc   r-   r/   r0   r"   `  s   zParakeetForCTC.__init__Nr   r   labelsr   r   c              
   K   s  | j d||d|}|j}| |dddd}d}|dur|dur'|ntj|tjd}| |d}	|| j	j
k}
|
d}||
}tjj|dtjddd}tjjjd	d
 tjj|||	|| j	j
| j	j| j	jd}W d   n1 s{w   Y  t|||j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, ParakeetForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = "nvidia/parakeet-ctc-1.1b"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = ParakeetForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> outputs = model(**inputs)

        >>> print(outputs.loss)
        ```r   r   r   r   Nr   r3   r   r   F)r8   )blank	reductionzero_infinity)lossr.  r1   r   r/   )r  r'  r3  rC   r$   	ones_likelongr  r   r   pad_token_idmasked_selectr   rg   log_softmaxr   backendscudnnflagsctc_lossctc_loss_reductionctc_zero_infinityr   r1   r   )r+   r   r   r4  r   encoder_outputsr1   r.  r9  r   labels_masktarget_lengthsflattened_targets	log_probsr/   r/   r0   rN   h  sD   

zParakeetForCTC.forwardFreturn_dict_in_generatec                 K   st   d|d< | j d
||d|}|jjdd}|dur+| j||jd d}| jj|| < |r8t||j|j|j	d	S |S )a3  
        Example:

        ```python
        >>> from transformers import AutoProcessor, ParakeetForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = "nvidia/parakeet-ctc-1.1b"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = ParakeetForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> predicted_ids = model.generate(**inputs)
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        >>> print(transcription)
        ```
        Treturn_dictr5  r3   r9   Nr   r&  )r-  r.  r   r1   r/   )
rN   r.  argmaxr  r<   r   r<  r,  r   r1   )r+   r   r   rJ  r   outputsr-  r/   r/   r0   generate  s&   zParakeetForCTC.generater   r   )rP   rQ   rR   r   rU   r"   r   r   r$   rT   r   r   r   r   rN   rV   boolr   r,  r/  rN  rW   r/   r/   r-   r0   r1  X  s@   
 Gr1  )r1  r  r   )r{   )2r   dataclassesr   typingr   r   r   r$   r   activationsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   configuration_parakeetr   r   Moduler   rX   rj   rT   r   r   r)   r   r   r   r   r   r  r,  r1  __all__r/   r/   r/   r0   <module>   sj   2;
cE/?W 