o
    i                     @   s.  d dl mZmZmZ d dlZd dlmZ d dlmZm	Z	 ddl
mZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z; e.<e=Z>G dd deZ?G dd dej@ZAG dd dej@ZBG dd de2ZCG dd  d e3ZDG d!d" d"e6ZEG d#d$ d$eZFe,G d%d& d&e'ZGG d'd( d(eGZHG d)d* d*e7ZIG d+d, d,e:ZJe,d-d.G d/d0 d0eGeZKg d1ZLdS )2    )CallableOptionalUnionN)OutputRecordercheck_model_inputs   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)PretrainedConfig)GenerationMixin)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)rope_config_validation)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg   )GlmAttentionGlmRotaryEmbeddingapply_rotary_pos_emb)LlamaDecoderLayer
LlamaModeleager_attention_forward)WhisperModelshift_tokens_rightc                       sh   e Zd ZdZdZdgZddddZ				
																					d fdd	Z  ZS )MoonshineConfiga"  
    This is the configuration class to store the configuration of a [`MoonshineModel`]. It is used to instantiate a Moonshine
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Moonshine
    [UsefulSensors/moonshine-tiny](https://huggingface.co/UsefulSensors/moonshine-tiny).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Moonshine model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MoonshineModel`].
        hidden_size (`int`, *optional*, defaults to 288):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        encoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        decoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer decoder.
        encoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        decoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `decoder_num_attention_heads`.
        pad_head_dim_to_multiple_of (`int`, *optional*):
            Pad head dimension in encoder and decoder to the next multiple of this value. Necessary for using certain
            optimized attention implementations.
        encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder.
        decoder_hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        decoder_start_token_id (`int`, *optional*, defaults to 1):
            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
            are provided to the `generate` function. It is used to guide the model`s generation process depending on
            the task.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        partial_rotary_factor (`float`, *optional*, defaults to 0.9):
            Percentage of the query and keys which will have rotary embedding.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        bos_token_id (`int`, *optional*, defaults to 1):
            Denotes beginning of sequences token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            Denotes end of sequences token id.

    Example:

    ```python
    >>> from transformers import MoonshineModel, MoonshineConfig

    >>> # Initializing a Moonshine style configuration
    >>> configuration = MoonshineConfig().from_pretrained("UsefulSensors/moonshine-tiny")

    >>> # Initializing a model from the configuration
    >>> model = MoonshineModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```	moonshinepast_key_valuesencoder_num_key_value_headsencoder_num_attention_headsencoder_num_hidden_layers)num_key_value_headsnum_attention_headsnum_hidden_layers              Ngelusilu   {Gz?   T     @?F        r!   c                    s   || _ || _|| _|| _|| _|| _|| _|d u r|}|| _|	d u r$|}	|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _t|  t jd||||d| d S )N)bos_token_ideos_token_idis_encoder_decoderdecoder_start_token_id )
vocab_sizehidden_sizeintermediate_sizer/   decoder_num_hidden_layersr.   decoder_num_attention_headsr-   decoder_num_key_value_headspad_head_dim_to_multiple_ofencoder_hidden_actdecoder_hidden_actmax_position_embeddingsinitializer_rangerC   	use_cache
rope_thetarope_scalingpartial_rotary_factorrB   attention_biasattention_dropoutr   super__init__)selfrE   rF   rG   r/   rH   r.   rI   r-   rJ   rK   rL   rM   rN   rO   rC   rP   rQ   rR   rS   rB   rT   rU   r@   rA   kwargs	__class__rD   l/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/moonshine/modular_moonshine.pyrW      sF   
zMoonshineConfig.__init__)r3   r4   r5   r6   r6   r7   r7   NNNr8   r9   r:   r;   r<   Tr=   Nr>   TFr?   r<   r!   )	__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_maprW   __classcell__rD   rD   rZ   r\   r*   2   sB    }r*   c                       2   e Zd Z fddZdejdejfddZ  ZS )MoonshineEncoderMLPc                    sB   t    || _t| | _t|j|j| _	t|j|j| _
d S NrV   rW   configr   activation_fnnnLinearrF   rG   fc1fc2rX   ri   
hidden_actrZ   rD   r\   rW      s
   

zMoonshineEncoderMLP.__init__hidden_statesreturnc                 C   s"   |  |}| |}| |}|S rg   )rm   rj   rn   )rX   rq   rD   rD   r\   forward  s   


zMoonshineEncoderMLP.forwardr]   r^   r_   rW   torchTensorrs   rd   rD   rD   rZ   r\   rf          rf   c                       re   )MoonshineDecoderMLPc                    sF   t    || _t| | _t|j|jd | _	t|j|j| _
d S )Nr!   rh   ro   rZ   rD   r\   rW     s
   

zMoonshineDecoderMLP.__init__rq   rr   c                 C   s8   |  |}|jddd\}}| || }| |}|S )Nr!   )dim)rm   chunkrj   rn   )rX   rq   gaterD   rD   r\   rs     s
   

zMoonshineDecoderMLP.forwardrt   rD   rD   rZ   r\   rx     rw   rx   c                       s   e Zd Zdededededef
 fddZedd	d
d					ddej	de
eej	ej	f  de
ej	 d	e
e de
ej de
ej	 dee deej	e
ej	 e
eej	  f fddZ  ZS )MoonshineAttentionri   	layer_idx	is_causalr1   r0   c                    s~   | ||d t || || _t|d|j|j | _| jj	d ur:| jj	}|| j| d |  }|| j | _
d S d| _
d S )N)r1   r0   head_dimr<   r   )updaterV   rW   r   getattrrF   r1   r   ri   rK   head_dim_padding)rX   ri   r~   r   r1   r0   target_multipletarget_head_dimrZ   rD   r\   rW     s   
zMoonshineAttention.__init__past_key_valuer,   4.58new_nameversionNrq   position_embeddingsattention_maskcache_positionkey_value_statesrY   rr   c                 K   sV  |j d d \}}	| |||	| jj| jdd}
|d u}|d ur9|j| j	}|r6d|j| j	< |j
}n|j}|d ur?|n|}|rV|rV|rV|j| j	 j}|j| j	 j}n7| ||d| jj| jdd}| ||d| jj| jdd}|r|d ur|||| j	d|i\}}|s|\}}t|
|||\}
}|d ur|||d}|||| j	|\}}t}| jjdkrt| jj }| jo|d u o|	dk}| jdkrtjj|
d| jf}
tjj|d| jf}tjj|d| jf}|| |
|||f| jsd	n| j| j|d
|\}}| jdkr|dd | j f }|||	d  }| !|}||fS )Nry   r<   r!   Tr   )sincosr   eagerr   r?   )dropoutscalingr   .)"shapeq_projviewri   r0   r   	transpose
is_updatedgetr~   cross_attention_cacheself_attention_cachelayerskeysvaluesk_projv_projr   r$   r'   _attn_implementationr   r   r   ru   rk   
functionalpadtrainingrU   r   reshape
contiguouso_proj)rX   rq   r   r   r,   r   r   rY   bszq_lenquery_statesis_cross_attentionr   current_states
key_statesvalue_statesr   r   cache_kwargsattention_interfacer   attn_outputattn_weightsrD   rD   r\   rs   4  sx   "

	

zMoonshineAttention.forward)NNNNN)r]   r^   r_   r*   intboolrW   r    ru   rv   r   tupler	   
LongTensorr   r   rs   rd   rD   rD   rZ   r\   r}     sF    	r}   c                   @   s   e Zd ZdS )MoonshineRotaryEmbeddingN)r]   r^   r_   rD   rD   rD   r\   r     s    r   c                       s&   e Zd Zdedef fddZ  ZS )MoonshineEncoderLayerri   r~   c                    s\   t  || t||d|j|jd| _t||j| _t	j
|jdd| _t	j
|jdd| _d S )NFri   r~   r   r1   r0   bias)rV   rW   r}   r.   r-   	self_attnrf   rL   mlprk   	LayerNormrF   input_layernormpost_attention_layernormrX   ri   r~   rZ   rD   r\   rW     s   zMoonshineEncoderLayer.__init__)r]   r^   r_   r*   r   rW   rd   rD   rD   rZ   r\   r     s    r   c                !       s   e Zd Zddedee f fddZedddd								
			ddej	deej	 deej	 deej	 deej
 deej
 dee dee deej
 deeej	ej	f  deeej	ej	f  dee deejeeejejf  f fddZ  ZS )MoonshineDecoderLayerNri   r~   c                    s   t    |j| _t||d|j|jd| _t||d|j|jd| _t||j	| _
tj|jdd| _tj|jdd| _tj|jdd| _d S )NTr   Fr   )rV   rW   rF   r}   rI   rJ   r   encoder_attnrx   rM   r   rk   r   r   r   final_layernormr   rZ   rD   r\   rW     s(   
zMoonshineDecoderLayer.__init__r   r,   r   r   Frq   r   encoder_hidden_statesencoder_attention_maskposition_idsencoder_position_idsrP   r   r   encoder_position_embeddingsrY   rr   c              
   K   s   |}|  |}| jd||||||	|
d|\}}|| }|d ur8|}| |}| j|||||d\}}|| }|}| |}| |}|| }|S )N)rq   r   r   r,   rP   r   r   )rq   r   r   r,   rP   rD   )r   r   r   r   r   r   )rX   rq   r   r   r   r   r   r,   rP   r   r   r   rY   residual_rD   rD   r\   rs     s<   






zMoonshineDecoderLayer.forwardrg   )
NNNNNNFNNN)r]   r^   r_   r*   r   r   rW   r    ru   rv   r   r	   r   r   r   r   FloatTensorrs   rd   rD   rD   rZ   r\   r     sP    	
r   c                   @   sF   e Zd ZU eed< dZdZdZddgZdZ	dZ
dZdejfdd	Zd
S )MoonshinePreTrainedModelri   modelinput_valuesTr   r   input_lengthsc                 C   s@   t |d d d }t |d d d }t |d d d }|S )zH
        Computes the output length of the convolutional layers
           @   r<      r   r!   )r   )rX   r   output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengthrD   rD   r\    _get_feat_extract_output_lengths  s   z9MoonshinePreTrainedModel._get_feat_extract_output_lengthsN)r]   r^   r_   r*   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphru   r   r   rD   rD   rD   r\   r     s   
 r   c                
       s   e Zd ZdZdZeedZdef fddZ	de
jfdd	Zd
e
jfddZe	ddejdeej dee defddZ  ZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r   )
attentionsrq   ri   c                    s   t     | _ j}tjd|dddd| _tj|d| ddd	| _tjd| |ddd	| _tj	d|d
d| _
t d| _t fddt jD | _tj|dd| _d| _|   d S )Nr<   r   r   F)kernel_sizestrider   r!   r   r   )r   r   gh㈵>)
num_groupsnum_channelsepsri   c                       g | ]}t  |qS rD   )r   .0idxr   rD   r\   
<listcomp>      z-MoonshineEncoder.__init__.<locals>.<listcomp>r   )rV   rW   ri   rF   rk   Conv1dconv1conv2conv3	GroupNorm	groupnormr   
rotary_emb
ModuleListranger/   r   r   
layer_normgradient_checkpointing	post_init)rX   ri   	embed_dimrZ   r   r\   rW     s   zMoonshineEncoder.__init__rr   c                 C      | j S rg   r   rX   rD   rD   r\   get_input_embeddings%     z%MoonshineEncoder.get_input_embeddingsvaluec                 C   
   || _ d S rg   r   )rX   r   rD   rD   r\   set_input_embeddings(     
z%MoonshineEncoder.set_input_embeddingsNr   rY   c           
      K   s<  | d}tj| |}| |}tj| |}tj| |}|	ddd}|durm| 
|jd }d}|ddd|f dd|f }| jjdkrZ|d	k rW|nd}n| jjd
krgt||j}nt||j}tjd|jd |jd d}| ||}| jD ]}	|	|f|||d|}q| |}t|dS )a.  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        r<   r   r!   Nry     .flash_attention_2r?   sdpadevice)r   r   r   )last_hidden_state)	unsqueezerk   r   tanhr   r   r8   r   r   permuter   r   ri   r   anyr   dtyper   ru   aranger  r   r   r   r   )
rX   r   r   rY   rq   mask_lendownsample_strider   r   encoder_layerrD   rD   r\   rs   +  s>   



zMoonshineEncoder.forwardrg   )r]   r^   r_   r`   r   r}   r   _can_record_outputsr*   rW   rk   Moduler   r  r   ru   r   r   rv   r   r   r   rs   rd   rD   rD   rZ   r\   r     s(    r   c                       s   e Zd ZdZeedddeeeddddZdef fdd	Z	e
	
	
	
	
	
	
	
	
	
ddeej deej deej dee deej dee deej deej deej dee deeef fddZ  ZS )MoonshineDecoder	input_idsr<   r   )index
layer_namer   )r   rq   cross_attentionsri   c                    sB   t    tj jdd| _t fddt jD | _	d S )NFr   c                    r   rD   )r   r   r   rD   r\   r   s  r   z-MoonshineDecoder.__init__.<locals>.<listcomp>)
rV   rW   rk   r   rF   normr   r   rH   r   rX   ri   rZ   r   r\   rW   o  s
   
zMoonshineDecoder.__init__Nr   r   r,   inputs_embedsrP   r   r   r   rY   rr   c
              
   K   s  |du |duA rt d|du r| |}|r(|du r(tt| jdt| jd}|du rD|dur4| nd}tj|||jd  |j	d}|du rM|
d}t| j|||||d}|}| ||}|	dur|jd }d	}|	d
dd|f d
d|f }	| jjdkr|	dk r|	nd}	n| jjdkrt|	|j|jd }	n
t|	|j|jd }	| jD ]}||||f|	|||||d|
}q| |}t||r|dS ddS )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r<   r  )ri   input_embedsr   r   r,   r   r  .r  r?   r  )r   r   r,   rP   r   r   )r	  r,   )
ValueErrorembed_tokensr   r
   ri   get_seq_lengthru   r  r   r  r
  r   r   r   r  r   r  r   r   r  r   )rX   r  r   r   r,   r  rP   r   r   r   rY   past_seen_tokenscausal_maskrq   r   r  r  decoder_layerrD   rD   r\   rs   v  st   

	



zMoonshineDecoder.forward)	NNNNNNNNN)r]   r^   r_   r   r   r}   r   r  r*   rW   r   r   ru   r   rv   r	   r   r   r   r   r   r   r   rs   rd   rD   rD   rZ   r\   r  g  sP    	

r  c                   @   s   e Zd Zee										ddeej deej deej deej dee	e	ej   dee
ee	ej f  dee	ej  d	ee	ej  d
ee deej dee defddZdS )MoonshineModelNr   r   decoder_input_idsdecoder_attention_maskencoder_outputsr,   decoder_inputs_embedsdecoder_position_idsrP   r   rY   rr   c                 K   sl   |du r| j |fd|i|}| jd||||j||||	|
d	|}t|j|j|j|j|j|j|j|jdS )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        Nr   )	r  r   r   r   r,   r  r   rP   r   )r	  r,   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentionsrD   )encoderdecoderr	  r   r,   rq   r   r  )rX   r   r   r&  r'  r(  r,   r)  r*  rP   r   rY   decoder_outputsrD   rD   r\   rs     s2   .
zMoonshineModel.forward)
NNNNNNNNNN)r]   r^   r_   r   r   r   ru   r   r   r   r   r   r   r   r   r   rs   rD   rD   rD   r\   r%    sL    	
r%  zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    )custom_introc                       s  e Zd ZdgZdef fddZdd Zdd Zd	d
 Zdd Z	de
jfddZee											ddeej deej deej deej deeeej   deeeeej f  deeej  deeej  dee deej deej dee defddZ  ZS ) !MoonshineForConditionalGenerationzproj_out.weightri   c                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr   )
rV   rW   r%  r   rk   rl   rF   rE   proj_outr   r  rZ   rD   r\   rW   $  s   
z*MoonshineForConditionalGeneration.__init__c                 C   
   | j  S rg   )r   get_encoderr   rD   rD   r\   r6  ,  r  z-MoonshineForConditionalGeneration.get_encoderc                 C   r5  rg   )r   get_decoderr   rD   rD   r\   r7  /  r  z-MoonshineForConditionalGeneration.get_decoderc                 C   r   rg   r4  r   rD   rD   r\   get_output_embeddings2  r   z7MoonshineForConditionalGeneration.get_output_embeddingsc                 C   r  rg   r8  )rX   new_embeddingsrD   rD   r\   set_output_embeddings5  r  z7MoonshineForConditionalGeneration.set_output_embeddingsrr   c                 C   r5  rg   )r   r   r   rD   rD   r\   r   8  r  z6MoonshineForConditionalGeneration.get_input_embeddingsNr   r   r&  r'  r(  r,   r)  r*  rP   r   labelsrY   c                 K   s   |dur|du r|du rt || jj| jj}| j|f||||||||	|
d	|}| |j}d}|dur?| j||| jjd}t	|||j
|j|j|j|j|j|jd	S )a0  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)	r   r&  r(  r'  r,   r)  r*  rP   r   )logitsr<  rE   )	lossr=  r,   r+  r,  r  r-  r   r.  )r)   ri   pad_token_idrC   r   r4  r	  loss_functionrE   r   r,   r+  r,  r  r-  r   r.  )rX   r   r   r&  r'  r(  r,   r)  r*  rP   r   r<  rY   outputsr=  r>  rD   rD   r\   rs   ;  sF   3z)MoonshineForConditionalGeneration.forward)NNNNNNNNNNN)r]   r^   r_   _tied_weights_keysr*   rW   r6  r7  r9  r;  rk   r  r   r   r   r   ru   r   r   r   r   r   r   r   r   r   rs   rd   rD   rD   rZ   r\   r3    s`    	
r3  )r*   r%  r   r3  )Mtypingr   r   r   ru   torch.nnrk   transformers.utils.genericr   r   activationsr   cache_utilsr	   r
   r   configuration_utilsr   
generationr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr    glm.modeling_glmr"   r#   r$   llama.modeling_llamar%   r&   r'   whisper.modeling_whisperr(   r)   
get_loggerr]   loggerr*   r  rf   rx   r}   r   r   r   r   r   r  r%  r3  __all__rD   rD   rD   r\   <module>   sP   
 NoKbjKs