o
    ei\                     @   s  d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, G dd de,eZ-G dd de*Z.G dd de#Z/G dd de"Z0G dd dej1Z2G dd  d eZ3G d!d" d"eZ4G d#d$ d$e&Z5G d%d& d&e%Z6G d'd( d(e(Z7ed)d*G d+d, d,e7Z8G d-d. d.e'Z9g d/Z:dS )0    N)Callable)	Tokenizer)Unigram)nn   )create_bidirectional_mask)BaseModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TokenizersBackend)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )LlamaAttentionLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)ParakeetCTCConfigParakeetEncoderConfig)ParakeetEncoderBlock ParakeetEncoderConvolutionModuleParakeetForCTCParakeetPreTrainedModel)ParakeetProcessor)T5Tokenizerc                       s^   e Zd Z							d fdd	Z				dd
eee B dededB dedef
ddZ  Z	S )LasrTokenizer</s><unk><pad>d   Nc           	   
      s<   t  jd|||||||d| tt| jddd| _d S )N)	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensvocab
vocab_filer   F)unk_idbyte_fallback )super__init__r   r   _vocab_scores
_tokenizer)	selfr$   r%   r&   r'   r(   r)   r*   kwargs	__class__r-   c/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/lasr/modular_lasr.pyr/   ,   s$   

zLasrTokenizer.__init__FT	token_idsskip_special_tokensclean_up_tokenization_spacesgroup_tokensreturnc                    sT   t |tr|g}|rdd t|D } fdd|D }tj f|||d|S )Nc                 S   s   g | ]}|d  qS )r   r-   ).0token_groupr-   r-   r6   
<listcomp>T   s    z)LasrTokenizer._decode.<locals>.<listcomp>c                    s   g | ]	}| j kr|qS r-   )pad_token_id)r<   tokenr2   r-   r6   r>   W   s    )r7   r8   r9   )
isinstanceint	itertoolsgroupbyr   _decode)r2   r7   r8   r9   r:   r3   r-   rA   r6   rF   I   s   
zLasrTokenizer._decode)r    r!   r"   r#   NNN)FNT)
__name__
__module____qualname__r/   rC   listboolstrrF   __classcell__r-   r-   r4   r6   r   +   s.     
r   c                   @      e Zd ZdS )LasrProcessorNrG   rH   rI   r-   r-   r-   r6   rO   b   s    rO   c                       sZ   e Zd ZdZddddddddd	d
ddddddddddddgddgddf fdd	Z  ZS )LasrEncoderConfiga  
    This is the configuration class to store the configuration of a [`LasrEncoder`]. It is used to instantiate a
    `LasrEncoder` model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
            hidden_size (`int`, *optional*, defaults to 512):
                Dimension of the layers and the hidden states.
            num_hidden_layers (`int`, *optional*, defaults to 17):
                Number of hidden layers in the Transformer encoder.
            num_attention_heads (`int`, *optional*, defaults to 8):
                Number of attention heads for each attention layer in the Transformer encoder.
            intermediate_size (`int`, *optional*, defaults to 2048):
                Dimension of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
            hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
                The non-linear activation function (function or string) in the encoder and pooler.
            attention_bias (`bool`, *optional*, defaults to `False`):
                Whether to use bias in the attention layers.
            convolution_bias (`bool`, *optional*, defaults to `False`):
                Whether to use bias in convolutions of the conformer's convolution module.
            conv_kernel_size (`int`, *optional*, defaults to 32):
                The kernel size of the convolution layers in the Conformer block.
            subsampling_conv_channels (`int`, *optional*, defaults to 256):
                The number of channels in the subsampling convolution layers.
            subsampling_conv_kernel_size (`int`, *optional*, defaults to 5):
                The kernel size of the subsampling convolution layers.
            subsampling_conv_stride (`int`, *optional*, defaults to 2):
                The stride of the subsampling convolution layers.
            num_mel_bins (`int`, *optional*, defaults to 128):
                Number of mel features.
            dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for all fully connected layers in the embeddings, encoder, and pooler.
            dropout_positions (`float`, *optional*, defaults to 0.0):
                The dropout ratio for the positions in the input sequence.
            layerdrop (`float`, *optional*, defaults to 0.1):
                The dropout ratio for the layers in the encoder.
            activation_dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for activations inside the fully connected layer.
            attention_dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for the attention layers.
            max_position_embeddings (`int`, *optional*, defaults to 10000):
                The maximum sequence length that this model might ever be used with.
            initializer_range (`float`, *optional*, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
            layer_norm_eps (`float`, *optional*, defaults to 1e-06):
                The epsilon used by the layer normalization layers.
            feed_forward_residual_weights (`tuple[float, float]`, *optional*, defaults to `[1.5, 0.5]`):
                The residual weights for the feed forward layers.
            conv_residual_weights (`tuple[float, float]`, *optional*, defaults to `[2.0, 1.0]`):
                The residual weights for the convolution layers.
            batch_norm_momentum (`float`, *optional*, defaults to 0.01):
                The momentum for the batch normalization layers.
            rope_parameters (`RopeParameters`, *optional*):
                Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
                a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
                with longer `max_position_embeddings`.

    Example:
        ```python
        >>> from transformers import LasrEncoderModel, LasrEncoderConfig

        >>> # Initializing a `LasrEncoder` configuration
        >>> configuration = LasrEncoderConfig()

        >>> # Initializing a model from the configuration
        >>> model = LasrEncoderModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```

    This configuration class is based on the LasrEncoder architecture from Google Health AI. You can find more details
    and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
             i   siluF          r      g?        i'  g{Gz?gư>g      ?g      ?g       @g      ?g{Gz?Nc                    s   || _ || _|| _|| _|| _t jdi d|d|d|d|d|d|d|d|d	|	d
|d|
d|d|d|d|d|d|d|d|| | `| `d S )Nhidden_sizenum_hidden_layersnum_attention_headsintermediate_size
hidden_actattention_biasconvolution_biasconv_kernel_sizesubsampling_conv_channelsnum_mel_binssubsampling_conv_kernel_sizesubsampling_conv_stridedropoutdropout_positions	layerdropactivation_dropoutattention_dropoutmax_position_embeddingsinitializer_ranger-   )	rope_parameterslayer_norm_epsfeed_forward_residual_weightsconv_residual_weightsbatch_norm_momentumr.   r/   subsampling_factorscale_input)r2   r[   r\   r]   r^   r_   r`   ra   rb   rc   re   rf   rd   rg   rh   ri   rj   rk   rl   rm   ro   rp   rq   rr   rn   r3   r4   r-   r6   r/      s^   	
zLasrEncoderConfig.__init__)rG   rH   rI   __doc__r/   rM   r-   r-   r4   r6   rQ   f   s6    OrQ   c                       sB   e Zd ZdZ					ddeeB f fdd	Zed
d Z  Z	S )LasrCTCConfiga  
    This is the configuration class to store the configuration of a [`LasrForCTC`]. It is used to instantiate a
    Lasr CTC model according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.
    Args:
            vocab_size (`int`, *optional*, defaults to 512):
                Vocabulary size of the model.
            ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
                Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
                instance of [`LasrForCTC`].
            ctc_zero_infinity (`bool`, *optional*, defaults to `True`):
                Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
                occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
                of [`LasrForCTC`].
            encoder_config (`Union[dict, LasrEncoderConfig]`, *optional*):
                The config object or dictionary of the encoder.
            pad_token_id (`int`, *optional*, defaults to 0):
                Padding token id. Also used as blank token id.
    Example:
        ```python
        >>> from transformers import LasrForCTC, LasrCTCConfig
        >>> # Initializing a Lasr configuration
        >>> configuration = LasrCTCConfig()
        >>> # Initializing a model from the configuration
        >>> model = LasrForCTC(configuration)
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
    This configuration class is based on the Lasr CTC architecture from Google Health AI. You can find more details
    and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
    rR   meanTNr   encoder_configc                    s"   t  jd|||||d| d S )N)
vocab_sizectc_loss_reductionctc_zero_infinityrx   r?   r-   )r.   r/   )r2   ry   rz   r{   rx   r?   r3   r4   r-   r6   r/     s   	
zLasrCTCConfig.__init__c                 C   s   | j jd S )Nr   )rx   rf   rA   r-   r-   r6   inputs_to_logits_ratio%  s   z$LasrCTCConfig.inputs_to_logits_ratio)rR   rw   TNr   )
rG   rH   rI   ru   dictrQ   r/   propertyr|   rM   r-   r-   r4   r6   rv      s    #rv   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )LasrEncoderSubsamplingconfigc                    st   t    t|j|j| _tj|j|j|j|j	d| _
tj|j|j|j|j	d| _t|j|j| _t | _d S )N)kernel_sizestride)r.   r/   r   Linearrd   r[   dense_0Conv1dre   rf   conv_0rc   conv_1dense_1ReLUact_fnr2   r   r4   r-   r6   r/   +  s    
zLasrEncoderSubsampling.__init__input_featuresr;   c                 C   sR   |  | |}|dd}|  | |}|  | |}|dd}| |S )N   r   )r   r   	transposer   r   r   )r2   r   hidden_statesr-   r-   r6   forward=  s   
zLasrEncoderSubsampling.forward)	rG   rH   rI   rQ   r/   torchTensorr   rM   r-   r-   r4   r6   r   *  s    r   c                   @   rN   )LasrEncoderRotaryEmbeddingNrP   r-   r-   r-   r6   r   F  s    r   c                       sr   e Zd Zdedef fddZ		ddejdeejejf dB dejdB d	e	e
 d
eejejf f
ddZ  ZS )LasrEncoderAttentionr   	layer_idxc                    s   t  || d| _d S )NF)r.   r/   	is_causalr2   r   r   r4   r-   r6   r/   J  s   
zLasrEncoderAttention.__init__Nr   position_embeddingsattention_maskr3   r;   c                 K   s   |j d d }g |d| jR }| ||dd}| ||dd}| ||dd}	|\}
}t|||
|\}}t	| j
jt}|| |||	|f| jsVdn| j| jd|\}}|jg |dR   }| |}||fS )Nr   r   rZ   )rg   scaling)shapehead_dimq_projviewr   k_projv_projr   r	   get_interfacer   _attn_implementationr   trainingrk   r   reshape
contiguouso_proj)r2   r   r   r   r3   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightsr-   r-   r6   r   N  s2   

zLasrEncoderAttention.forwardNN)rG   rH   rI   rQ   rC   r/   r   r   tupler   r   r   rM   r-   r-   r4   r6   r   I  s    r   c                       s$   e Zd Zddef fddZ  ZS )LasrEncoderConvolutionModuleNr   c                    s,   t  || d| _tj|j|jd| _d S )Nsame)momentum)r.   r/   paddingr   BatchNorm1dr[   rr   norm)r2   r   module_configr4   r-   r6   r/   t  s   z%LasrEncoderConvolutionModule.__init__N)rG   rH   rI   rQ   r/   rM   r-   r-   r4   r6   r   s  s    r   c                       s^   e Zd Zdedef fddZ		ddejdejdB dejdB d	ee	 d
ejf
ddZ
  ZS )LasrEncoderBlockr   r   c                    s   t  || |j| _|j| _tj|j|jdd| _tj|j|jdd| _	tj|j|jdd| _
tj|j|jdd| _tj|j|jdd| _d S )NF)bias)r.   r/   rp   rq   r   	LayerNormr[   ro   norm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outr   r4   r-   r6   r/   {  s   zLasrEncoderBlock.__init__Nr   r   r   r3   r;   c           
      K   s   |}|  | |}| jd | | jd |  }| |}| jd|||d|\}}|| }| j| ||d}	| jd | | jd |	  }|}| | 	|}| jd | | jd |  }| 
|}|S )Nr   r   )r   r   r   )r   r-   )feed_forward1r   rp   r   	self_attnconvr   rq   feed_forward2r   r   )
r2   r   r   r   r3   residualnormalized_hidden_statesr   _conv_outputr-   r-   r6   r     s*   


zLasrEncoderBlock.forwardr   )rG   rH   rI   rQ   rC   r/   r   r   r   r   r   rM   r-   r-   r4   r6   r   z  s    r   c                   @   s(   e Zd ZdZdd ZdejfddZdS )LasrPreTrainedModelFc                 C   s   t | d S r   )r
   _init_weights)r2   moduler-   r-   r6   r     s   z!LasrPreTrainedModel._init_weightsinput_lengthsc                 C   sL   t | jtr
| jjn| j}|j}|j}d}t|D ]
}|| | d }q|S )Nr   r   )rB   r   rv   rx   re   rf   range)r2   r   rx   r   r   
num_layersr   r-   r-   r6   _get_subsampling_output_length  s   z2LasrPreTrainedModel._get_subsampling_output_lengthN)rG   rH   rI   _supports_flex_attnr   r   r   r   r-   r-   r-   r6   r     s    r   zh
    The LasrEncoder model, based on the Conformer architecture](https://arxiv.org/abs/2005.08100).
    )custom_introc                       sj   e Zd ZU eed< dZdef fddZeee	e
	ddejdejdB dee d	efd
dZ  ZS )LasrEncoderr   encoderc                    s   t    d| _ j| _ j| _ j| _t | _t | _	t
 fddt jD | _t
j j jdd| _|   d S )NFc                    s   g | ]}t  |qS r-   )r   )r<   r   r   r-   r6   r>     s    z(LasrEncoder.__init__.<locals>.<listcomp>)epsr   )r.   r/   gradient_checkpointingrg   rh   ri   r   
subsamplerr   
rotary_embr   
ModuleListr   r\   layersr   r[   ro   out_norm	post_initr   r4   r   r6   r/     s   

zLasrEncoder.__init__Nr   r   r3   r;   c           
      K   s   |  |}| |tj|jd |jdd\}}tjj	|| j	| j
d}tjj	|| j| j
d}tjj	|| j| j
d}|durH| j||jd d}t| j||d}| jD ]"}d}| j
rftg }	|	| jk rfd	}|su||f|||fd
|}qS| |}t|dS )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, LasrEncoder
        >>> from datasets import load_dataset, Audio

        >>> model_id = TODO
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> encoder = ParakeetEncoder.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"])
        >>> encoder_outputs = encoder(**inputs)

        >>> print(encoder_outputs.last_hidden_state.shape)
        ```
        r   )devicer   )pr   N)target_length)r   inputs_embedsr   FT)r   r   )last_hidden_state)r   r   r   aranger   r   	unsqueezer   
functionalrg   r   rh   _get_output_attention_maskr   r   r   randri   r   r   )
r2   r   r   r3   r   r   r   encoder_layerto_dropdropout_probabilityr-   r-   r6   r     s@   





zLasrEncoder.forwardr   )rG   rH   rI   rQ   __annotations__base_model_prefixr/   r   r   r   r   r   r   r   r   r   r   rM   r-   r-   r4   r6   r     s$   
 r   c                       s   e Zd Z fddZ  ZS )
LasrForCTCc                     s   t  jdi | S )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, LasrForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = TODO
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = LasrForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> predicted_ids = model.generate(**inputs)
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        >>> print(transcription)
        ```
        Nr-   )r.   generate)super_kwargsr4   r-   r6   r     s   zLasrForCTC.generate)rG   rH   rI   r   rM   r-   r-   r4   r6   r     s    r   )r   r   r   rO   rQ   rv   r   );rD   collections.abcr   r   
tokenizersr   tokenizers.modelsr   r   masking_utilsr   modeling_outputsr   modeling_utilsr	   r
   processing_utilsr   tokenization_utils_tokenizersr   utilsr   r   r   utils.genericr   utils.output_capturingr   llama.modeling_llamar   r   r   r   parakeet.configuration_parakeetr   r   parakeet.modeling_parakeetr   r   r   r   parakeet.processing_parakeetr   t5.tokenization_t5r   r   rO   rQ   rv   Moduler   r   r   r   r   r   r   r   __all__r-   r-   r-   r6   <module>   sH   7 9*1[