o
    ei                     @   s  d dl mZ d dlmZ d dlmZ d dlZd dlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3 ee*ddG dd deZ4G dd dej5Z6G dd dej5Z7G dd dej5Z8d ej9d!e:d"ej9fd#d$Z;	%dKd&ej5d'ej9d(ej9d)ej9d*ej9dB d+e<d,e<d-e'e) fd.d/Z=d0d1 Z>dLd2d3Z?ee?G d4d5 d5ej5Z@G d6d7 d7eZAG d8d9 d9eZBe*G d:d; d;e%ZCG d<d= d=eCZDe*G d>d? d?eCZEe*G d@dA dAeCZFdBej9dCe:dDe:fdEdFZGe*dGdG dHdI dIeCeZHg dJZIdS )M    )Callable)	dataclass)OptionalN   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)use_kernelized_func)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )MoonshineConfigz
    Extends [~modeling_outputs.BaseModelOutput] to include the output attention mask since sequence length is not preserved in the model's forward.
    )custom_introc                   @   s    e Zd ZU dZejdB ed< dS )MoonshineEncoderModelOutputNattention_mask)__name__
__module____qualname__r%   torchTensor__annotations__ r,   r,   n/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/moonshine/modeling_moonshine.pyr$   3   s   
 r$   c                       2   e Zd Z fddZdejdejfddZ  ZS )MoonshineEncoderMLPc                    sB   t    || _t| | _t|j|j| _	t|j|j| _
d S Nsuper__init__configr   activation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr4   
hidden_act	__class__r,   r-   r3   >   s
   

zMoonshineEncoderMLP.__init__hidden_statesreturnc                 C   s"   |  |}| |}| |}|S r0   )r:   r5   r;   )r=   rA   r,   r,   r-   forwardE   s   


zMoonshineEncoderMLP.forwardr&   r'   r(   r3   r)   r*   rC   __classcell__r,   r,   r?   r-   r/   =       r/   c                       r.   )MoonshineDecoderMLPc                    sF   t    || _t| | _t|j|jd | _	t|j|j| _
d S )N   r1   r<   r?   r,   r-   r3   M   s
   

zMoonshineDecoderMLP.__init__rA   rB   c                 C   s8   |  |}|jddd\}}| || }| |}|S )NrH   dim)r:   chunkr5   r;   )r=   rA   gater,   r,   r-   rC   T   s
   

zMoonshineDecoderMLP.forwardrD   r,   r,   r?   r-   rG   L   rF   rG   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )MoonshineRotaryEmbeddinginv_freqNr4   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultrO   F)
persistentoriginal_inv_freq)r2   r3   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr4   rope_parametersrP   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r=   r4   devicerope_init_fnrO   r?   r,   r-   r3   _   s   


z!MoonshineRotaryEmbedding.__init__r\   ztorch.deviceseq_lenrB   ztorch.Tensorc           	      C   st   | j d }| j dd}t| ddp| j| j }t|| }d}d|tjd|dtjdj	|tj
d	|   }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr   rH   dtype)r\   rc   )rW   getgetattrr8   num_attention_headsintr)   arangeint64tofloat)	r4   r\   r^   baser`   ra   rK   attention_factorrO   r,   r,   r-   rX   o   s   
&z8MoonshineRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   rI   r!   mpscpuF)device_typeenabledrH   rJ   rb   )rO   rk   expandshaperj   r\   
isinstancetypestrr   	transposer)   catcosrY   sinrc   )
r=   xposition_idsinv_freq_expandedposition_ids_expandedrp   freqsembry   rz   r,   r,   r-   rC      s   0&z MoonshineRotaryEmbedding.forwardr0   )NNN)r&   r'   r(   r)   r*   r+   r"   r3   staticmethodr   rg   tuplerk   rX   no_gradr   rC   rE   r,   r,   r?   r-   rN   \   s&   
 

rN   rA   n_reprB   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)rs   rr   reshape)rA   r   batchnum_key_value_headsslenra   r,   r,   r-   	repeat_kv   s
   0r           modulequerykeyvaluer%   scalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )NrH   r   rI   )rK   rc   )ptrainingr!   )r   num_key_value_groupsr)   matmulrw   r6   
functionalsoftmaxfloat32rj   rc   r   r   
contiguous)r   r   r   r   r%   r   r   r   
key_statesvalue_statesattn_weightsattn_outputr,   r,   r-   eager_attention_forward   s   
r   c                 C   s>   | ddddf }| ddddf }t j| |fdddS )	z*Rotates half the hidden dims of the input..r   NrH   r!   rI   rJ   )r)   stackflatten)r{   x1x2r,   r,   r-   rotate_half   s   r   c                 C   s   | |}| |}|dd|jd d f jddd}|dd|jd d f jddd}|jd }| dd|f | d|df }}|dd|f |d|df }}	|| t||  }
|| t||  }tj|
|gdd}
tj||	gdd}|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .NrI   rH   rJ   )	unsqueezers   repeat_interleaver   r)   rx   )qkry   rz   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embedr,   r,   r-   apply_rotary_pos_emb   s   

$$
""r   c                       s   e Zd ZdZdededededef
 fddZ										dd
ej	de
ej	ej	f d	B dej	d	B ded	B dejd	B dej	d	B dee de
ej	ej	d	B e
ej	 d	B f fddZ  ZS )MoonshineAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr4   	layer_idx	is_causalrf   r   c                    s  t    |||d || _|| _t|d|j|j | _|j|j	 | _
| jd | _|j| _|| _tj|j|j| j |jd| _tj|j|j	| j |jd| _tj|j|j	| j |jd| _tj|j| j |jdd| _| jjd ur| jj}|| j| d |  }|| j | _d S d| _d S )N)rf   r   ra   g      ࿩biasFr!   r   )r2   r3   updater4   r   re   r8   rf   ra   r   r   r   attention_dropoutr   r6   r7   attention_biasq_projk_projv_projo_projpad_head_dim_to_multiple_ofhead_dim_padding)r=   r4   r   r   rf   r   target_multipletarget_head_dimr?   r,   r-   r3      s0   

zMoonshineAttention.__init__NrA   position_embeddingsr%   past_key_valuescache_positionkey_value_statesr   rB   c                 K   sJ  |j d d \}}	| |||	| jj| jdd}
|d u}|d ur9|j| j	}|r6d|j| j	< |j
}n|j}|d ur?|n|}|rV|rV|rV|j| j	 j}|j| j	 j}n7| ||d| jj| jdd}| ||d| jj| jdd}|r|d ur|||| j	d|i\}}|s|\}}t|
|||\}
}|d ur|||d}|||| j	|\}}t| jjt}| jo|d u o|	dk}| jdkrtjj|
d| jf}
tjj|d| jf}tjj|d| jf}|| |
|||f| jsdn| j| j|d	|\}}| jdkr|d
d | j f }| ||	d! }| "|}||fS )NrI   r!   rH   Tr   )rz   ry   r   r   r   )r   r   r   .)#rs   r   viewr4   r   ra   rw   
is_updatedrd   r   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   r   r   r   get_interface_attn_implementationr   r   r   r)   r6   r   padr   r   r   r   r   r   )r=   rA   r   r%   r   r   r   r   bszq_lenquery_statesis_cross_attentionr   current_statesr   r   ry   rz   cache_kwargsattention_interfacer   r   r   r,   r,   r-   rC     sx   
"

	

zMoonshineAttention.forward)NNNNN)r&   r'   r(   __doc__r"   rg   boolr3   r)   r*   r   r   
LongTensorr   r   rC   rE   r,   r,   r?   r-   r      sF    (	r   c                       s   e Zd Zdedef fddZ						ddejdejdB d	ejdB d
e	dB de
dB dejdB deejejf dB dee dejfddZ  ZS )MoonshineEncoderLayerr4   r   c                    s`   t    |j| _t||d|j|jd| _t||j| _	t
j|jdd| _t
j|jdd| _d S )NFr4   r   r   rf   r   r   )r2   r3   r8   r   encoder_num_attention_headsencoder_num_key_value_heads	self_attnr/   encoder_hidden_actmlpr6   	LayerNorminput_layernormpost_attention_layernormr=   r4   r   r?   r,   r-   r3   u  s   
zMoonshineEncoderLayer.__init__NFrA   r%   r|   r   	use_cacher   r   r   rB   c              
   K   s^   |}	|  |}| jd|||||||d|\}}
|	| }|}	| |}| |}|	| }|S )NrA   r%   r|   r   r   r   r   r,   )r   r   r   r   )r=   rA   r%   r|   r   r   r   r   r   residual_r,   r,   r-   rC     s&   




zMoonshineEncoderLayer.forward)NNNFNN)r&   r'   r(   r"   rg   r3   r)   r*   r   r   r   r   r   r   rC   rE   r,   r,   r?   r-   r   t  s6    	
r   c                       s   e Zd ZddededB f fddZ										ddejdejdB d	ejdB d
ejdB dejdB dejdB de	dB de
dB dejdB deejejf dB deejejf dB dee deejeejejf dB f fddZ  ZS )MoonshineDecoderLayerNr4   r   c                    s   t    |j| _t||d|j|jd| _t||d|j|jd| _t||j	| _
tj|jdd| _tj|jdd| _tj|jdd| _d S )NTr   Fr   )r2   r3   r8   r   rf   r   r   encoder_attnrG   r>   r   r6   r   r   r   final_layernormr   r?   r,   r-   r3     s(   
zMoonshineDecoderLayer.__init__FrA   r%   encoder_hidden_statesencoder_attention_maskr|   encoder_position_idsr   r   r   r   encoder_position_embeddingsr   rB   c              
   K   s   |}|  |}| jd||||||	|
d|\}}|| }|d ur8|}| |}| j|||||d\}}|| }|}| |}| |}|| }|S )Nr   )rA   r   r%   r   r   r,   )r   r   r   r   r   r   )r=   rA   r%   r   r   r|   r   r   r   r   r   r   r   r   r   r,   r,   r-   rC     s<   






zMoonshineDecoderLayer.forwardr0   )
NNNNNNFNNN)r&   r'   r(   r"   rg   r3   r)   r*   r   r   r   r   r   r   FloatTensorrC   rE   r,   r,   r?   r-   r     sN    	
r   c                   @   sJ   e Zd ZU eed< dZdZdZdZddgZ	dZ
dZdZdejfd	d
ZdS )MoonshinePreTrainedModelr4   modelinput_valuesaudioTr   r   input_lengthsc                 C   s@   t |d d d }t |d d d }t |d d d }|S )zH
        Computes the output length of the convolutional layers
           @   r!      r   rH   )rg   )r=   r   output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengthr,   r,   r-    _get_feat_extract_output_lengths  s   z9MoonshinePreTrainedModel._get_feat_extract_output_lengthsN)r&   r'   r(   r"   r+   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphr)   r   r   r,   r,   r,   r-   r     s   
 r   c                       s   e Zd ZdZdZeedZdef fddZ	de
jfdd	Zd
e
jfddZee	ddejdejdB dee deeB fddZ  ZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r   )
attentionsrA   r4   c                    s   t     | _ j}tjd|dddd| _tj|d| ddd	| _tjd| |ddd	| _tj	d|d
d| _
t fddt jD | _tj|dd| _t d| _d| _|   d S )Nr!   r   r   F)kernel_sizestrider   rH   r   r   )r  r  gh㈵>)
num_groupsnum_channelsepsc                       g | ]}t  |qS r,   )r   .0idxr4   r,   r-   
<listcomp>#      z-MoonshineEncoder.__init__.<locals>.<listcomp>r   r  )r2   r3   r4   r8   r6   Conv1dconv1conv2conv3	GroupNorm	groupnorm
ModuleListrangeencoder_num_hidden_layersr   r   
layer_normrN   
rotary_embgradient_checkpointing	post_init)r=   r4   	embed_dimr?   r  r-   r3     s   zMoonshineEncoder.__init__rB   c                 C      | j S r0   r  r=   r,   r,   r-   get_input_embeddings*     z%MoonshineEncoder.get_input_embeddingsr   c                 C   
   || _ d S r0   r"  r=   r   r,   r,   r-   set_input_embeddings-     
z%MoonshineEncoder.set_input_embeddingsNr%   r   c                 K   s"  | d}tj| |}| |}tj| |}tj| |}|	ddd}|durK| 
|jd }d}|ddd|f dd|f }|}t| j|||d}tjd|jd |jd	 d}| j||d
}	| jD ]}
|
|f|||	d|}qm| |}t||dur| dS ddS )a.  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        r!   r   rH   NrI   i  .r4   inputs_embedsr%   r   r\   r|   )r%   r|   r   )last_hidden_stater%   )r   r6   r   tanhr  r  gelur  r  permuter   rs   r   r4   r)   rh   r\   r  r   r  r$   rg   )r=   r   r%   r   rA   mask_lendownsample_strideoutput_attention_maskr|   r   encoder_layerr,   r,   r-   rC   0  sH   



zMoonshineEncoder.forwardr0   )r&   r'   r(   r   r   r   r   _can_record_outputsr"   r3   r6   Moduler$  r(  r   r    r)   r   r*   r   r   r   r   rC   rE   r,   r,   r?   r-   r  
  s*    r  c                       s   e Zd ZdZeedddeeeddddZdef fdd	Z	e
e	
	
	
	
	
	
	
	
	
ddejd
B dejd
B dejd
B ded
B dejd
B ded
B dejd
B dejd
B dejd
B dee deeB fddZ  ZS )MoonshineDecoder	input_idsr!   r   )index
layer_namer   )r  rA   cross_attentionsr4   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _tj jdd| _t d| _d| _|   d S )Nc                    r  r,   )r   r  r  r,   r-   r    r  z-MoonshineDecoder.__init__.<locals>.<listcomp>Fr   r  )r2   r3   pad_token_idpadding_idx
vocab_sizer6   	Embeddingr8   embed_tokensr  r  num_hidden_layersr   r   normrN   r  r  r  r=   r4   r?   r  r-   r3   y  s    zMoonshineDecoder.__init__Nr%   r|   r   r+  r   r   r   r   r   rB   c
              
   K   s$  |du |duA rt d|du r| |}|r(|du r(tt| jdt| jd}|du rD|dur4| nd}tj|||jd  |j	d}|du rM|
d}t| j|||||d}t| j||	|d}	|}| j||d	}| jD ]}||||f|	|||||d
|
}qm| |}t||r|dS ddS )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr  r   r!   r,  )r4   r+  r%   r   r   r|   r*  r-  )r   r|   r   r   r   r   )r.  r   )
ValueErrorrA  r	   r   r4   get_seq_lengthr)   rh   rs   r\   r   r   r   r  r   rC  r   )r=   r9  r%   r|   r   r+  r   r   r   r   r   past_seen_tokenscausal_maskrA   r   decoder_layerr,   r,   r-   rC     sf   




zMoonshineDecoder.forward)	NNNNNNNNN)r&   r'   r(   r   r   r   r   r6  r"   r3   r   r    r)   r   r*   r   r   r   r   r   r   r   rC   rE   r,   r,   r?   r-   r8  p  sR    	
r8  c                       s   e Zd Zdef fddZdd Zdd Zdd	 Zd
d Ze	e
										ddejdB dejdB dejdB dejdB deeej  dB dedB deej dB deej dB dedB dejdB dee defddZ  ZS )MoonshineModelr4   c                    s,   t  | t|| _t|| _|   d S r0   )r2   r3   r  encoderr8  decoderr  rD  r?   r,   r-   r3     s   

zMoonshineModel.__init__c                 C   s   | j jS r0   rL  rA  r#  r,   r,   r-   r$    s   z#MoonshineModel.get_input_embeddingsc                 C   s   || j _d S r0   rM  r'  r,   r,   r-   r(    s   z#MoonshineModel.set_input_embeddingsc                 C   s   | j   dS )z
        Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
        not be updated during training.
        N)rK  _freeze_parametersr#  r,   r,   r-   freeze_encoder  s   zMoonshineModel.freeze_encoderc                 C   s   t d)z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        zNot needed for Moonshine)AttributeErrorr#  r,   r,   r-   _mask_input_features  s   z#MoonshineModel._mask_input_featuresNr   r%   decoder_input_idsdecoder_attention_maskencoder_outputsr   decoder_inputs_embedsdecoder_position_idsr   r   r   rB   c                 K   sn   |du r| j |fd|i|}| jd|||j|j||||	|
d	|}t|j|j|j|j|j|j|j|jdS )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        Nr%   )	r9  r%   r   r   r   r+  r|   r   r   )r.  r   decoder_hidden_statesdecoder_attentionsr<  encoder_last_hidden_stater   encoder_attentionsr,   )	rK  rL  r.  r%   r   r   rA   r  r<  )r=   r   r%   rR  rS  rT  r   rU  rV  r   r   r   decoder_outputsr,   r,   r-   rC     s2   .
zMoonshineModel.forward)
NNNNNNNNNN)r&   r'   r(   r"   r3   r$  r(  rO  rQ  r   r   r)   r   r   r   r	   r   r   r   r   rC   rE   r,   r,   r?   r-   rJ    sV    	
rJ  r9  r=  decoder_start_token_idc                 C   sh   |  | j}| ddddf  |ddddf< ||dddf< |du r*td||dk| |S )z1
    Shift input ids one token to the right.
    NrI   r!   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosrs   r[   rE  masked_fill_)r9  r=  r\  shifted_input_idsr,   r,   r-   shift_tokens_rightB  s   (r`  zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    c                       s   e Zd ZddiZdef fddZdd Zdd	 Zd
ej	fddZ
ee											ddejdB dejdB dejdB dejdB deeej  dB dedB deej dB deej dB dedB dejdB dejdB dee d
efddZ  ZS )!MoonshineForConditionalGenerationzproj_out.weightz!model.decoder.embed_tokens.weightr4   c                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr   )
r2   r3   rJ  r   r6   r7   r8   r?  proj_outr  rD  r?   r,   r-   r3   Z  s   
z*MoonshineForConditionalGeneration.__init__c                 C   r!  r0   rb  r#  r,   r,   r-   get_output_embeddingsb  r%  z7MoonshineForConditionalGeneration.get_output_embeddingsc                 C   r&  r0   rc  )r=   new_embeddingsr,   r,   r-   set_output_embeddingse  r)  z7MoonshineForConditionalGeneration.set_output_embeddingsrB   c                 C   s
   | j  S r0   )r   r$  r#  r,   r,   r-   r$  h  r)  z6MoonshineForConditionalGeneration.get_input_embeddingsNr   r%   rR  rS  rT  r   rU  rV  r   r   labelsr   c                 K   s   |dur|du r|du rt || jj| jj}| j|f||||||||	|
d	|}| |j}d}|dur?| j||| jjd}t	|||j
|j|j|j|j|j|jd	S )a0  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)	r%   rR  rT  rS  r   rU  rV  r   r   )logitsrg  r?  )	lossrh  r   rW  rX  r<  rY  r   rZ  )r`  r4   r=  r\  r   rb  r.  loss_functionr?  r   r   rW  rX  r<  rY  r   rZ  )r=   r   r%   rR  rS  rT  r   rU  rV  r   r   rg  r   outputsrh  ri  r,   r,   r-   rC   k  sF   3z)MoonshineForConditionalGeneration.forward)NNNNNNNNNNN)r&   r'   r(   _tied_weights_keysr"   r3   rd  rf  r6   r7  r$  r   r   r)   r   r   r   r	   r   r   r   r   rC   rE   r,   r,   r?   r-   ra  R  s\    	
ra  )rJ  r   ra  )r   )r!   )Jcollections.abcr   dataclassesr   typingr   r)   torch.nnr6   activationsr   cache_utilsr   r   r	   
generationr
   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   r    configuration_moonshiner"   r$   r7  r/   rG   rN   r*   rg   r   rk   r   r   r   r   r   r   r   r  r8  rJ  r`  ra  __all__r,   r,   r,   r-   <module>   s   C

( 3Jfigm