o
    eik                     @   s  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( eeddG dd deZ)G dd dej*Z+G dd dej*Z,G dd de"Z-G dd  d e$Z.G d!d" d"ej*Z/G d#d$ d$eZ0eG d%d& d&eZ1ed'dG d(d) d)e1Z2eG d*d+ d+eZ3ed,dG d-d. d.e1Z4g d/Z5dS )0zPyTorch Parakeet model.    N)Callable)	dataclass)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputCausalLMOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )%FastSpeech2ConformerConvolutionModule)LlamaAttentioneager_attention_forward   )ParakeetCTCConfigParakeetEncoderConfigz
    Extends [~modeling_outputs.BaseModelOutput] to include the output attention mask since sequence length is not preserved in the model's forward.
    )custom_introc                   @   s    e Zd ZU dZejdB ed< dS )ParakeetEncoderModelOutputNattention_mask)__name__
__module____qualname__r   torchTensor__annotations__ r%   r%   k/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/parakeet/modular_parakeet.pyr   %   s   
 r   c                       sL   e Zd ZU dZejed< d
def fddZe	 dejfdd	Z
  ZS )$ParakeetEncoderRelPositionalEncodingz*Relative positional encoding for Parakeet.inv_freqNconfigc                    sZ   t    |j| _d}d|tjd|jdtjdj|tjd|j   }| j	d|dd	 d S )
N     @      ?r   r   dtype)devicer-   r(   F)
persistent)
super__init__max_position_embeddingsr"   arangehidden_sizeint64tofloatregister_buffer)selfr)   r.   baser(   	__class__r%   r&   r1   4   s   
 z-ParakeetEncoderRelPositionalEncoding.__init__hidden_statesc                 C   sD  |j d }|| jkrtd| d| j dtj|d | d|jd}| jd d d d f  |j d dd	|j}|d d d d f  }t
|jjtrW|jjdkrW|jjnd	}t|d
d4 | |  dd}| }| }	tj||	gdd}
|
jg |
j d d dR  }
W d    n1 sw   Y  |
j	|jdS )Nr   zSequence Length: z= has to be less or equal than config.max_position_embeddings .r.   r   mpscpuF)device_typeenabledr   dimr,   )shaper2   
ValueErrorr"   r3   r.   r(   r7   expandr6   
isinstancetypestrr   	transposesincosstackreshaper-   )r9   r=   
seq_lengthposition_idsinv_freq_expandedposition_ids_expandedrC   freqsrO   rP   	pos_embedr%   r%   r&   forwardB   s2   

. z,ParakeetEncoderRelPositionalEncoding.forwardN)r   r    r!   __doc__r"   r#   r$   r   r1   no_gradrY   __classcell__r%   r%   r;   r&   r'   /   s   
 
r'   c                       s*   e Zd Zdef fddZdd Z  ZS )ParakeetEncoderFeedForwardr)   c                    sR   t    tj|j|j|jd| _t|j	 | _
tj|j|j|jd| _|j| _d S )Nbias)r0   r1   r   Linearr4   intermediate_sizeattention_biaslinear1r   
hidden_act
activationlinear2activation_dropoutr9   r)   r;   r%   r&   r1   b   s
   
z#ParakeetEncoderFeedForward.__init__c                 C   s4   |  | |}tjj|| j| jd}| |}|S )Nptraining)rf   rd   r   
functionaldropoutrh   rl   rg   )r9   r=   r%   r%   r&   rY   i   s   
z"ParakeetEncoderFeedForward.forward)r   r    r!   r   r1   rY   r]   r%   r%   r;   r&   r^   a   s    r^   c                       s$   e Zd Zddef fddZ  ZS ) ParakeetEncoderConvolutionModuleNr)   c                    s   t  || d S rZ   )r0   r1   )r9   r)   module_configr;   r%   r&   r1   q   s   z)ParakeetEncoderConvolutionModule.__init__rZ   )r   r    r!   r   r1   r]   r%   r%   r;   r&   ro   p   s    ro   c                       sr   e Zd ZdZdedef fddZ	ddejdejdB d	ejdB d
e	e
 deejejf f
ddZdd Z  ZS )ParakeetEncoderAttentionztMulti-head attention with relative positional encoding. See section 3.3 of https://huggingface.co/papers/1901.02860.r)   	layer_idxc                    sf   t  j||d d| _tj|j|j| j dd| _t	t
|j| j| _t	t
|j| j| _d S )N)rr   Fr_   )r0   r1   	is_causalr   ra   r4   num_attention_headshead_dimrelative_k_proj	Parameterr"   zerosbias_ubias_vr9   r)   rr   r;   r%   r&   r1   x   s
   z!ParakeetEncoderAttention.__init__Nr=   position_embeddingsr   kwargsreturnc              	   K   s  |j d d }|\}}||d| jf}| ||dd}	| ||dd}
| ||dd}t| j	j
t}|	| jd| j	jd| j }|	| jd| j	jd| j }| |}||d| j	j| j}||dddd }| |}|dd |f }|| j }|d ur|| td}|| f||
||| jsdn| j| jd	|\}}|jg |dR   }| |}||fS )
Nr?   r   r   r   r   .z-inf        )querykeyvaluer   rn   scaling)rH   ru   q_projviewrN   k_projv_projr   get_interfacer)   _attn_implementationr   ry   rt   rz   rv   permute
_rel_shiftr   masked_fill_logical_notr7   rl   attention_dropoutrR   
contiguouso_proj)r9   r=   r|   r   r}   input_shape
batch_sizerS   hidden_shapequery_states
key_statesvalue_statesattention_interfacequery_states_with_bias_uquery_states_with_bias_vrelative_key_states	matrix_bdattn_outputattn_weightsr%   r%   r&   rY      sL   




z ParakeetEncoderAttention.forwardc                 C   sX   |j \}}}}tjj|dd}|||d|}|ddddddf ||||}|S )ztRelative position shift for Shaw et al. style attention. See appendix B of https://huggingface.co/papers/1901.02860.)r   r   )padr?   Nr   )rH   r   rm   r   r   )r9   attention_scoresr   	num_headsquery_lengthposition_lengthr%   r%   r&   r      s
   &z#ParakeetEncoderAttention._rel_shiftrZ   )r   r    r!   r[   r   intr1   r"   r#   r   r   tuplerY   r   r]   r%   r%   r;   r&   rq   u   s     
9rq   c                       sP   e Zd Zdef fddZdejdejfddZ	dd	ejd
ejfddZ
  ZS ) ParakeetEncoderSubsamplingConv2Dr)   c                    s  t    |j| _|j| _|j| _| jd d | _t	t
|j| _t | _| jtjd| j| j| j| jd | jt  t| jd D ]-}| jtj| j| j| j| j| j| jd | jtj| j| jdd | jt  qH|j| j| j  }tj|j| |jdd| _d S )Nr   r   )kernel_sizestridepadding)r   r   r   groupsr   Tr_   )r0   r1   subsampling_conv_kernel_sizer   subsampling_conv_strider   subsampling_conv_channelschannelsr   r   mathlog2subsampling_factor
num_layersr   
ModuleListlayersappendConv2dReLUrangenum_mel_binsra   r4   linear)r9   r)   i
out_lengthr;   r%   r&   r1      s4   

z)ParakeetEncoderSubsamplingConv2D.__init__input_lengths
conv_layerc                 C   sV   t |dr)|jdkr)|j}|jd }|jd }||d  |d  | | d }|S |S )Nr   )r   r   r   r   )hasattrr   r   r   )r9   r   r   r   r   r   output_lengthsr%   r%   r&   _get_output_length   s   

 z3ParakeetEncoderSubsamplingConv2D._get_output_lengthNinput_featuresr   c                 C   s   | d}|d ur|dnd }| jD ]9}||}t|tjrL|d urL| ||}|jd }tj	||j
d|d d d f k }||d d d d d d f 9 }q|dd|jd |jd d}| |}|S )Nr   r?   r   r@   r   )	unsqueezesumr   rK   r   r   r   rH   r"   r3   r.   rN   rR   r   )r9   r   r   r=   current_lengthslayercurrent_seq_lengthchannel_maskr%   r%   r&   rY      s   


"
z(ParakeetEncoderSubsamplingConv2D.forwardrZ   )r   r    r!   r   r1   r"   r#   r   r   r   rY   r]   r%   r%   r;   r&   r      s    # r   c                       sd   e Zd ZddededB f fddZ		ddejdejdB dejdB d	ee	 d
ejf
ddZ
  ZS )ParakeetEncoderBlockNr)   rr   c                    s   t    d| _t|| _t||| _t|| _t|| _	t
|j| _t
|j| _t
|j| _t
|j| _t
|j| _d S NF)r0   r1   gradient_checkpointingr^   feed_forward1rq   	self_attnro   convfeed_forward2r   	LayerNormr4   norm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outr{   r;   r%   r&   r1   
  s   



zParakeetEncoderBlock.__init__r=   r   r|   r}   r~   c                 K   s   |}|  | |}|d|  }| |}| jd|||d|\}}|| }| j| ||d}	||	 }| | |}
|d|
  }| |}|S )Ng      ?)r=   r   r|   )r   r%   )	r   r   r   r   r   r   r   r   r   )r9   r=   r   r|   r}   residualnormalized_hidden_statesr   _conv_output
ff2_outputr%   r%   r&   rY     s$   


zParakeetEncoderBlock.forwardrZ   NN)r   r    r!   r   r   r1   r"   r#   r   r   rY   r]   r%   r%   r;   r&   r   	  s    r   c                       s   e Zd ZU eed< dZdZdZdZdgZ	dZ
dZdZdZdZdZeedZe  fd	d
ZdejfddZddejdedB fddZ  ZS )ParakeetPreTrainedModelr)   modelr   audioTr   F)r=   
attentionsc                    s   t  | t| jdr| jj}n	t| j dd}t|tr3t	j
|jd|d t	j
|jd|d d S t|trUddtjd| jjdtjd	| jj   }t	|j| d S d S )
Ninitializer_rangeg{Gz?r   )meanstdr+   r*   r   r   r,   )r0   _init_weightsr   r)   r   getattrget_text_configrK   rq   initnormal_ry   rz   r'   r"   r3   r4   r5   copy_r(   )r9   moduler   r(   r;   r%   r&   r   N  s   


"z%ParakeetPreTrainedModel._init_weightsr   c           
      C   s   t | jtr
| jjn| j}|j}|j}tt|j	}|d d d }|| }|}t
|D ]}	t|jtjd| |d }t|}q-|jtjdS )Nr   r   r,   r+   )rK   r)   r   encoder_configr   r   r   r   r   r   r   r"   divr6   r7   floor)
r9   r   r   r   r   r   all_paddingsadd_padlengthsr   r%   r%   r&   _get_subsampling_output_lengthb  s   z6ParakeetPreTrainedModel._get_subsampling_output_lengthNr   target_lengthc                 C   sH   |  |d}|dur|n| }tj||jd|dddf k }|S )z
        Convert the input attention mask to its subsampled form. `target_length` sets the desired output length, useful
        when the attention mask length differs from `sum(-1).max()` (i.e., when the longest sequence in the batch is padded)
        r?   Nr@   )r   r   maxr"   r3   r.   )r9   r   r   r   
max_lengthr%   r%   r&   _get_output_attention_masks  s    z2ParakeetPreTrainedModel._get_output_attention_maskrZ   )r   r    r!   r   r$   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flat_attention_mask_supports_sdpa_supports_flex_attn_supports_flash_attn_can_compile_fullgraph_supports_attention_backendr   rq   _can_record_outputsr"   r\   r   r#   r   r   r   r]   r%   r%   r;   r&   r   8  s(   
 "r   z{
    The Parakeet Encoder model, based on the [Fast Conformer architecture](https://huggingface.co/papers/2305.05084).
    c                       st   e Zd ZU eed< dZdef fddZeee	e
		ddejdejdB dedB d	ee d
ef
ddZ  ZS )ParakeetEncoderr)   encoderc                    s   t     | _d| _ j| _ j| _ j| _ jr!t	 j
nd| _t | _t | _t fddt jD | _|   d S )NFr+   c                    s   g | ]}t  |qS r%   )r   ).0rr   r)   r%   r&   
<listcomp>  s    z,ParakeetEncoder.__init__.<locals>.<listcomp>)r0   r1   r)   r   rn   dropout_positions	layerdropscale_inputr   sqrtr4   input_scaler   subsamplingr'   encode_positionsr   r   r   num_hidden_layersr   	post_initri   r;   r  r&   r1     s   

zParakeetEncoder.__init__Nr   r   output_attention_maskr}   r~   c                 K   s   |  ||}|| j }| |}tjj|| j| jd}tjj|| j| jd}|durN| j||j	d d}|
dd|j	d d}||dd@ }|
d}| jD ] }d}	| jrdtg }
|
| jk rdd}	|	sq||f||d	|}qQt||r|| d
S dd
S )aJ  
        output_attention_mask (`bool`, *optional*):
            Whether to return the output attention mask.

        Example:

        ```python
        >>> from transformers import AutoProcessor, ParakeetEncoder
        >>> from datasets import load_dataset, Audio

        >>> model_id = "nvidia/parakeet-ctc-1.1b"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> encoder = ParakeetEncoder.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"])
        >>> encoder_outputs = encoder(**inputs)

        >>> print(encoder_outputs.last_hidden_state.shape)
        ```
        rj   Nr   r   r?   r   FT)r   r|   )last_hidden_stater   )r  r
  r  r   rm   rn   rl   r  r   rH   r   rJ   rN   r   r"   randr  r   r   )r9   r   r   r  r}   r=   r|   output_maskencoder_layerto_dropdropout_probabilityr%   r%   r&   rY     sB   #






zParakeetEncoder.forwardr   )r   r    r!   r   r$   r   r1   r   r   r   r   r"   r#   boolr   r   r	   rY   r]   r%   r%   r;   r&   r    s*   
 r  c                   @   sf   e Zd ZU dZejed< dZeej	 dB ed< dZ
eeej	  dB ed< dZeeej	  dB ed< dS )ParakeetGenerateOutputal  
    Outputs of Parakeet models.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
    	sequencesNlogitsr   r=   )r   r    r!   r[   r"   
LongTensorr$   r  r   FloatTensorr   r=   r%   r%   r%   r&   r    s   
 
r  zS
    Parakeet Encoder with a Connectionist Temporal Classification (CTC) head.
    c                       s   e Zd ZU eed< def fddZee		ddej	dej	dB dej	dB de
e d	ef
d
dZe 		ddej	dej	dB dede
e d	eejB f
ddZ  ZS )ParakeetForCTCr)   c                    s<   t  | t|j| _tj|jj|jdd| _	| 
  d S )Nr   r   )r0   r1   r  r   r  r   Conv1dr4   
vocab_sizectc_headr  ri   r;   r%   r&   r1     s   zParakeetForCTC.__init__Nr   r   labelsr}   r~   c              
   K   s  | j d||d|}|j}| |dddd}d}|dur|dur'|ntj|tjd}| |d}	|| j	j
k}
|
d}||
}tjj|dtjddd}tjjjd	d
 tjj|||	|| j	j
| j	j| j	jd}W d   n1 s{w   Y  t|||j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, ParakeetForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = "nvidia/parakeet-ctc-1.1b"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = ParakeetForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> outputs = model(**inputs)

        >>> print(outputs.loss)
        ```r   r   r   r   Nr,   r?   )rF   r-   r   F)rD   )blank	reductionzero_infinity)lossr  r=   r   r%   )r  r  r   rN   r"   	ones_likelongr   r   r)   pad_token_idmasked_selectr   rm   log_softmaxfloat32backendscudnnflagsctc_lossctc_loss_reductionctc_zero_infinityr
   r=   r   )r9   r   r   r!  r}   encoder_outputsr=   r  r&  r   labels_masktarget_lengthsflattened_targets	log_probsr%   r%   r&   rY     sD   

zParakeetForCTC.forwardFreturn_dict_in_generatec                 K   st   d|d< | j d
||d|}|jjdd}|dur+| j||jd d}| jj|| < |r8t||j|j|j	d	S |S )a3  
        Example:

        ```python
        >>> from transformers import AutoProcessor, ParakeetForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = "nvidia/parakeet-ctc-1.1b"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = ParakeetForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> predicted_ids = model.generate(**inputs)
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        >>> print(transcription)
        ```
        Treturn_dictr"  r?   rE   Nr   r  )r  r  r   r=   r%   )
rY   r  argmaxr   rH   r)   r)  r  r   r=   )r9   r   r   r8  r}   outputsr  r%   r%   r&   generateV  s&   zParakeetForCTC.generater   r   )r   r    r!   r   r$   r1   r   r   r"   r#   r   r   r
   rY   r\   r  r  r  r<  r]   r%   r%   r;   r&   r    s@   
 Gr  )r  r  r   )6r[   r   collections.abcr   dataclassesr   r"   r    r   r   activationsr   modeling_layersr   modeling_outputsr	   r
   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   4fastspeech2_conformer.modeling_fastspeech2_conformerr   llama.modeling_llamar   r   configuration_parakeetr   r   r   Moduler'   r^   ro   rq   r   r   r   r  r  r  __all__r%   r%   r%   r&   <module>   sV   2OE/F^ 