o
    ei0                     @   sD  d dl mZ d dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9 e):e;Z<G dd deZ=ee'ddG dd deZ>G dd dej?Z@G dd  d ej?ZAG d!d" d"e1ZBG d#d$ d$e0ZCG d%d& d&e4ZDG d'd( d(eZEe'G d)d* d*e"ZFG d+d, d,eFZGG d-d. d.e5ZHG d/d0 d0e8ZIe'd1dG d2d3 d3eFeZJg d4ZKdS )5    )Callable)	dataclassN   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)PreTrainedConfig)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)RopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )GlmAttentionGlmRotaryEmbeddingapply_rotary_pos_emb)LlamaDecoderLayer
LlamaModeleager_attention_forward)WhisperModelshift_tokens_rightc                2       s8  e Zd ZdZdZdgZdddddZ			
																						d.dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB d edB d!e	dB d"edB d#e
dB d$eeeef B dB d%e
dB d&e
dB d'e	dB d(edB d)edB d*edB d+e
dB f0 fd,d-Z  ZS )/MoonshineConfiga7  
    This is the configuration class to store the configuration of a [`MoonshineModel`]. It is used to instantiate a Moonshine
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Moonshine
    [UsefulSensors/moonshine-tiny](https://huggingface.co/UsefulSensors/moonshine-tiny).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Moonshine model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MoonshineModel`].
        hidden_size (`int`, *optional*, defaults to 288):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        encoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        decoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer decoder.
        encoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        decoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `decoder_num_attention_heads`.
        pad_head_dim_to_multiple_of (`int`, *optional*):
            Pad head dimension in encoder and decoder to the next multiple of this value. Necessary for using certain
            optimized attention implementations.
        encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder.
        decoder_hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        decoder_start_token_id (`int`, *optional*, defaults to 1):
            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
            are provided to the `generate` function. It is used to guide the model`s generation process depending on
            the task.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        bos_token_id (`int`, *optional*, defaults to 1):
            Denotes beginning of sequences token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            Denotes end of sequences token id.
        pad_token_id (`int`, *optional*):
            Padding token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    Example:

    ```python
    >>> from transformers import MoonshineModel, MoonshineConfig

    >>> # Initializing a Moonshine style configuration
    >>> configuration = MoonshineConfig().from_pretrained("UsefulSensors/moonshine-tiny")

    >>> # Initializing a model from the configuration
    >>> model = MoonshineModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```	moonshinepast_key_valuesdecoder_num_key_value_headsdecoder_num_attention_headsdecoder_num_hidden_layersdecoder_hidden_act)num_key_value_headsnum_attention_headsnum_hidden_layers
hidden_act              Ngelusilu   {Gz?   TF        r   
vocab_sizehidden_sizeintermediate_sizeencoder_num_hidden_layersencoder_num_attention_headsencoder_num_key_value_headspad_head_dim_to_multiple_ofencoder_hidden_actmax_position_embeddingsinitializer_rangedecoder_start_token_id	use_cacherope_parametersis_encoder_decoderattention_biasattention_dropoutbos_token_ideos_token_idpad_token_idtie_word_embeddingsc                    s   || _ || _|| _|| _|| _|| _|| _|d u r|}|| _|	d u r$|}	|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|dd t jdd|i| d S )Npartial_rotary_factorg?rK    )r>   r?   r@   rA   r-   rB   r,   rC   r+   rD   rE   r.   rF   rG   rH   rI   rK   rL   rM   rN   rO   rP   rQ   rJ   
setdefaultsuper__init__)selfr>   r?   r@   rA   r-   rB   r,   rC   r+   rD   rE   r.   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   kwargs	__class__rS   m/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/moonshine/modular_moonshine.pyrV      s>   zMoonshineConfig.__init__)r3   r4   r5   r6   r6   r7   r7   NNNr8   r9   r:   r;   r<   TNTFr=   r<   r   NT)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapintstrfloatboolr   dictrV   __classcell__rS   rS   rY   r[   r(   1   s    \		
r(   z
    Extends [~modeling_outputs.BaseModelOutput] to include the output attention mask since sequence length is not preserved in the model's forward.
    )custom_introc                   @   s    e Zd ZU dZejdB ed< dS )MoonshineEncoderModelOutputNattention_mask)r\   r]   r^   rk   torchTensor__annotations__rS   rS   rS   r[   rj      s   
 rj   c                       2   e Zd Z fddZdejdejfddZ  ZS )MoonshineEncoderMLPc                    sB   t    || _t| | _t|j|j| _	t|j|j| _
d S NrU   rV   configr   activation_fnnnLinearr?   r@   fc1fc2rW   rs   r2   rY   rS   r[   rV      s
   

zMoonshineEncoderMLP.__init__hidden_statesreturnc                 C   s"   |  |}| |}| |}|S rq   )rw   rt   rx   )rW   rz   rS   rS   r[   forward   s   


zMoonshineEncoderMLP.forwardr\   r]   r^   rV   rl   rm   r|   rh   rS   rS   rY   r[   rp          rp   c                       ro   )MoonshineDecoderMLPc                    sF   t    || _t| | _t|j|jd | _	t|j|j| _
d S )Nr   rr   ry   rY   rS   r[   rV      s
   

zMoonshineDecoderMLP.__init__rz   r{   c                 C   s8   |  |}|jddd\}}| || }| |}|S )Nr   )dim)rw   chunkrt   rx   )rW   rz   gaterS   rS   r[   r|      s
   

zMoonshineDecoderMLP.forwardr}   rS   rS   rY   r[   r      r~   r   c                   @   s   e Zd ZdS )MoonshineRotaryEmbeddingN)r\   r]   r^   rS   rS   rS   r[   r     s    r   c                       s   e Zd Zdededededef
 fddZ					dd	ejd
e	ejejf dB dejdB de
dB dejdB dejdB dee de	ejejdB e	ej dB f fddZ  ZS )MoonshineAttentionrs   	layer_idx	is_causalr0   r/   c                    s~   | ||d t || || _t|d|j|j | _| jj	d ur:| jj	}|| j| d |  }|| j | _
d S d| _
d S )N)r0   r/   head_dimr<   r   )updaterU   rV   r   getattrr?   r0   r   rs   rD   head_dim_padding)rW   rs   r   r   r0   r/   target_multipletarget_head_dimrY   rS   r[   rV     s   
zMoonshineAttention.__init__Nrz   position_embeddingsrk   r*   cache_positionkey_value_statesrX   r{   c                 K   sJ  |j d d \}}	| |||	| jj| jdd}
|d u}|d ur9|j| j	}|r6d|j| j	< |j
}n|j}|d ur?|n|}|rV|rV|rV|j| j	 j}|j| j	 j}n7| ||d| jj| jdd}| ||d| jj| jdd}|r|d ur|||| j	d|i\}}|s|\}}t|
|||\}
}|d ur|||d}|||| j	|\}}t| jjt}| jo|d u o|	dk}| jdkrtjj|
d| jf}
tjj|d| jf}tjj|d| jf}|| |
|||f| jsdn| j| j|d	|\}}| jdkr|d
d | j f }| ||	d! }| "|}||fS )Nr   r<   r   Tr   )sincosr   r   r=   )dropoutscalingr   .)#shapeq_projviewrs   r/   r   	transpose
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesk_projv_projr   r"   r   get_interface_attn_implementationr%   r   r   rl   ru   
functionalpadtrainingrM   r   reshape
contiguouso_proj)rW   rz   r   rk   r*   r   r   rX   bszq_lenquery_statesis_cross_attentionr   current_states
key_statesvalue_statesr   r   cache_kwargsattention_interfacer   attn_outputattn_weightsrS   rS   r[   r|     sx   
"

	

zMoonshineAttention.forward)NNNNN)r\   r]   r^   r(   rc   rf   rV   rl   rm   tupler   
LongTensorr   r   r|   rh   rS   rS   rY   r[   r     sD    	r   c                       s&   e Zd Zdedef fddZ  ZS )MoonshineEncoderLayerrs   r   c                    s\   t  || t||d|j|jd| _t||j| _t	j
|jdd| _t	j
|jdd| _d S )NFrs   r   r   r0   r/   bias)rU   rV   r   rB   rC   	self_attnrp   rE   mlpru   	LayerNormr?   input_layernormpost_attention_layernormrW   rs   r   rY   rS   r[   rV   u  s   zMoonshineEncoderLayer.__init__)r\   r]   r^   r(   rc   rV   rh   rS   rS   rY   r[   r   t  s    r   c                       s   e Zd ZddededB f fddZ										ddejdejdB d	ejdB d
ejdB dejdB dejdB de	dB de
dB dejdB deejejf dB deejejf dB dee deejeejejf dB f fddZ  ZS )MoonshineDecoderLayerNrs   r   c                    s   t    |j| _t||d|j|jd| _t||d|j|jd| _t||j	| _
tj|jdd| _tj|jdd| _tj|jdd| _d S )NTr   Fr   )rU   rV   r?   r   r0   r/   r   encoder_attnr   r2   r   ru   r   r   r   final_layernormr   rY   rS   r[   rV     s(   
zMoonshineDecoderLayer.__init__Frz   rk   encoder_hidden_statesencoder_attention_maskposition_idsencoder_position_idsr*   rI   r   r   encoder_position_embeddingsrX   r{   c              
   K   s   |}|  |}| jd||||||	|
d|\}}|| }|d ur8|}| |}| j|||||d\}}|| }|}| |}| |}|| }|S )N)rz   rk   r   r*   rI   r   r   )rz   r   rk   r*   rI   rS   )r   r   r   r   r   r   )rW   rz   rk   r   r   r   r   r*   rI   r   r   r   rX   residual_rS   rS   r[   r|     s<   






zMoonshineDecoderLayer.forwardrq   )
NNNNNNFNNN)r\   r]   r^   r(   rc   rV   rl   rm   r   r   rf   r   r   r   FloatTensorr|   rh   rS   rS   rY   r[   r     sN    	
r   c                   @   sJ   e Zd ZU eed< dZdZdZdZddgZ	dZ
dZdZdejfd	d
ZdS )MoonshinePreTrainedModelrs   modelinput_valuesaudioTr   r   input_lengthsc                 C   s@   t |d d d }t |d d d }t |d d d }|S )zH
        Computes the output length of the convolutional layers
           @   r<      r   r   )rc   )rW   r   output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengthrS   rS   r[    _get_feat_extract_output_lengths  s   z9MoonshinePreTrainedModel._get_feat_extract_output_lengthsN)r\   r]   r^   r(   rn   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphrl   r   r   rS   rS   rS   r[   r     s   
 r   c                       s   e Zd ZdZdZeedZdef fddZ	de
jfdd	Zd
e
jfddZee	ddejdejdB dee deeB fddZ  ZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r   )
attentionsrz   rs   c                    s   t     | _ j}tjd|dddd| _tj|d| ddd	| _tjd| |ddd	| _tj	d|d
d| _
t fddt jD | _tj|dd| _t d| _d| _|   d S )Nr<   r   r   F)kernel_sizestrider   r   r   r   )r   r   gh㈵>)
num_groupsnum_channelsepsc                       g | ]}t  |qS rS   )r   .0idxrs   rS   r[   
<listcomp>      z-MoonshineEncoder.__init__.<locals>.<listcomp>r   r   )rU   rV   rs   r?   ru   Conv1dconv1conv2conv3	GroupNorm	groupnorm
ModuleListrangerA   r   r   
layer_normr   
rotary_embgradient_checkpointing	post_init)rW   rs   	embed_dimrY   r   r[   rV     s   zMoonshineEncoder.__init__r{   c                 C      | j S rq   r   rW   rS   rS   r[   get_input_embeddings     z%MoonshineEncoder.get_input_embeddingsvaluec                 C   
   || _ d S rq   r   )rW   r   rS   rS   r[   set_input_embeddings     
z%MoonshineEncoder.set_input_embeddingsNrk   rX   c                 K   s"  | d}tj| |}| |}tj| |}tj| |}|	ddd}|durK| 
|jd }d}|ddd|f dd|f }|}t| j|||d}tjd|jd |jd	 d}| j||d
}	| jD ]}
|
|f|||	d|}qm| |}t||dur| dS ddS )a.  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        r<   r   r   Nr   i  .rs   inputs_embedsrk   r   devicer   )rk   r   r   )last_hidden_staterk   )	unsqueezeru   r   tanhr   r   r8   r   r   permuter   r   r   rs   rl   aranger  r   r   r   rj   rc   )rW   r   rk   rX   rz   mask_lendownsample_strideoutput_attention_maskr   r   encoder_layerrS   rS   r[   r|     sH   



zMoonshineEncoder.forwardrq   )r\   r]   r^   r_   r   r   r   _can_record_outputsr(   rV   ru   Moduler   r  r   r   rl   r   rm   r   r   r   r   r|   rh   rS   rS   rY   r[   r     s*    r   c                       s   e Zd ZdZeedddeeeddddZdef fdd	Z	e
e	
	
	
	
	
	
	
	
	
ddejd
B dejd
B dejd
B ded
B dejd
B ded
B dejd
B dejd
B dejd
B dee deeB fddZ  ZS )MoonshineDecoder	input_idsr<   r   )index
layer_namer   )r   rz   cross_attentionsrs   c                    sB   t    tj jdd| _t fddt jD | _	d S )NFr   c                    r   rS   )r   r   r   rS   r[   r   Y  r   z-MoonshineDecoder.__init__.<locals>.<listcomp>)
rU   rV   ru   r   r?   normr   r   r1   r   rW   rs   rY   r   r[   rV   V  s   $zMoonshineDecoder.__init__Nrk   r   r*   r  rI   r   r   r   rX   r{   c
              
   K   s$  |du |duA rt d|du r| |}|r(|du r(tt| jdt| jd}|du rD|dur4| nd}tj|||jd  |j	d}|du rM|
d}t| j|||||d}t| j||	|d}	|}| j||d	}| jD ]}||||f|	|||||d
|
}qm| |}t||r|dS ddS )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r<   r  )rs   r  rk   r   r*   r   r  r  )r   r   r*   rI   r   r   )r	  r*   )
ValueErrorembed_tokensr   r   rs   get_seq_lengthrl   r  r   r  r
  r   r   r   r   r  r   )rW   r  rk   r   r*   r  rI   r   r   r   rX   past_seen_tokenscausal_maskrz   r   decoder_layerrS   rS   r[   r|   [  sf   




zMoonshineDecoder.forward)	NNNNNNNNN)r\   r]   r^   r   r   r   r   r  r(   rV   r   r   rl   r   rm   r   r   rf   r   r   r   r   r|   rh   rS   rS   rY   r[   r  N  sR    	
r  c                   @   s   e Zd Zdd Zee										ddejdB dejdB dejdB dejdB de	e	ej  dB d	e
dB d
e	ej dB de	ej dB dedB dejdB dee defddZdS )MoonshineModelc                 C   s   t d)NzNot needed for Moonshine)AttributeErrorr   rS   rS   r[   _mask_input_features  s   z#MoonshineModel._mask_input_featuresNr   rk   decoder_input_idsdecoder_attention_maskencoder_outputsr*   decoder_inputs_embedsdecoder_position_idsrI   r   rX   r{   c                 K   sn   |du r| j |fd|i|}| jd|||j|j||||	|
d	|}t|j|j|j|j|j|j|j|jdS )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        Nrk   )	r  rk   r   r   r*   r  r   rI   r   )r	  r*   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentionsrS   )	encoderdecoderr	  rk   r   r*   rz   r   r  )rW   r   rk   r$  r%  r&  r*   r'  r(  rI   r   rX   decoder_outputsrS   rS   r[   r|     s2   .
zMoonshineModel.forward)
NNNNNNNNNN)r\   r]   r^   r#  r   r   rl   r   r   r   r   rf   r   r   r   r|   rS   rS   rS   r[   r!    sN    	
r!  zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    c                       s   e Zd ZddiZdef fddZdd Zdd	 Zd
ej	fddZ
ee											ddejdB dejdB dejdB dejdB deeej  dB dedB deej dB deej dB dedB dejdB dejdB dee d
efddZ  ZS )!MoonshineForConditionalGenerationzproj_out.weightz!model.decoder.embed_tokens.weightrs   c                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr   )
rU   rV   r!  r   ru   rv   r?   r>   proj_outr   r  rY   rS   r[   rV     s   
z*MoonshineForConditionalGeneration.__init__c                 C   r   rq   r1  r   rS   rS   r[   get_output_embeddings  r   z7MoonshineForConditionalGeneration.get_output_embeddingsc                 C   r  rq   r2  )rW   new_embeddingsrS   rS   r[   set_output_embeddings  r  z7MoonshineForConditionalGeneration.set_output_embeddingsr{   c                 C   s
   | j  S rq   )r   r   r   rS   rS   r[   r     r  z6MoonshineForConditionalGeneration.get_input_embeddingsNr   rk   r$  r%  r&  r*   r'  r(  rI   r   labelsrX   c                 K   s   |dur|du r|du rt || jj| jj}| j|f||||||||	|
d	|}| |j}d}|dur?| j||| jjd}t	|||j
|j|j|j|j|j|jd	S )a0  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)	rk   r$  r&  r%  r*   r'  r(  rI   r   )logitsr6  r>   )	lossr7  r*   r)  r*  r  r+  r   r,  )r'   rs   rP   rH   r   r1  r	  loss_functionr>   r   r*   r)  r*  r  r+  r   r,  )rW   r   rk   r$  r%  r&  r*   r'  r(  rI   r   r6  rX   outputsr7  r8  rS   rS   r[   r|     sF   3z)MoonshineForConditionalGeneration.forward)NNNNNNNNNNN)r\   r]   r^   _tied_weights_keysr(   rV   r3  r5  ru   r  r   r   r   rl   r   r   r   r   rf   r   r   r   r|   rh   rS   rS   rY   r[   r0    s\    	
r0  )r(   r!  r   r0  )Lcollections.abcr   dataclassesr   rl   torch.nnru   activationsr   cache_utilsr   r   r   configuration_utilsr	   
generationr
   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   r   glm.modeling_glmr    r!   r"   llama.modeling_llamar#   r$   r%   whisper.modeling_whisperr&   r'   
get_loggerr\   loggerr(   rj   r  rp   r   r   r   r   r   r   r   r  r!  r0  __all__rS   rS   rS   r[   <module>   sZ   
 )nJf`Nm