o
    i                     @   s  d dl mZmZmZ d dlZd dlZd dlmZ d dl	m
Z
mZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 G dd dej3Z4G dd dej3Z5dej6de7dej6fddZ8	dLdej3dej6d ej6d!ej6d"eej6 d#e9d$e9d%e*e, fd&d'Z:d(d) Z;dMd*d+Z<G d,d- d-ej3Z=G d.d/ d/ej3Z>G d0d1 d1eZ?G d2d3 d3eZ@e-G d4d5 d5e(ZAG d6d7 d7eAZBe-G d8d9 d9eAZC		 dNd:eDe7e7f d;e9d<e7d"eejE d=e7dejFfd>d?ZGe-G d@dA dAeAZHdBej6dCe7dDe7fdEdFZIe-dGdHG dIdJ dJeAeZJg dKZKdS )O    )CallableOptionalUnionN)OutputRecordercheck_model_inputs   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg   )MoonshineConfigc                       2   e Zd Z fddZdejdejfddZ  ZS )MoonshineEncoderMLPc                    sB   t    || _t| | _t|j|j| _	t|j|j| _
d S Nsuper__init__configr   activation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr(   
hidden_act	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/moonshine/modeling_moonshine.pyr'   4   s
   

zMoonshineEncoderMLP.__init__hidden_statesreturnc                 C   s"   |  |}| |}| |}|S r$   )r.   r)   r/   )r1   r7   r5   r5   r6   forward;   s   


zMoonshineEncoderMLP.forward__name__
__module____qualname__r'   torchTensorr9   __classcell__r5   r5   r3   r6   r#   3       r#   c                       r"   )MoonshineDecoderMLPc                    sF   t    || _t| | _t|j|jd | _	t|j|j| _
d S )N   r%   r0   r3   r5   r6   r'   C   s
   

zMoonshineDecoderMLP.__init__r7   r8   c                 C   s8   |  |}|jddd\}}| || }| |}|S )NrC   dim)r.   chunkr)   r/   )r1   r7   gater5   r5   r6   r9   J   s
   

zMoonshineDecoderMLP.forwardr:   r5   r5   r3   r6   rB   B   rA   rB   r7   n_repr8   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)shapeexpandreshape)r7   rI   batchnum_key_value_headsslenhead_dimr5   r5   r6   	repeat_kvR   s
   0rQ           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )NrC   r   rD   )rF   dtype)ptrainingr    )rQ   num_key_value_groupsr>   matmul	transposerJ   r*   
functionalsoftmaxfloat32tor\   rY   r^   
contiguous)rS   rT   rU   rV   rW   rX   rY   rZ   
key_statesvalue_statesattn_weightscausal_maskattn_outputr5   r5   r6   eager_attention_forward^   s   
&rl   c                 C   s>   | ddddf }| ddddf }t j| |fdddS )	z*Rotates half the hidden dims of the input..r   NrC   r    rD   rE   r[   )r>   stackflatten)xx1x2r5   r5   r6   rotate_halfx   s   rr   c                 C   s   | |}| |}|dd|jd d f jddd}|dd|jd d f jddd}|jd }| dd|f | d|df }}|dd|f |d|df }	}
|| t||  }|	| t|	|  }tj||gdd}tj||
gdd}||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .NrD   rC   rE   )	unsqueezerJ   repeat_interleaverr   r>   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embedr5   r5   r6   apply_rotary_pos_emb   s   

$$
""r   c                       s   e Zd ZdZdededededef
 fddZed	d
dd					dde	j
deee	j
e	j
f  dee	j
 d
ee dee	j dee	j
 dee dee	j
ee	j
 eee	j
  f fddZ  ZS )MoonshineAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr(   	layer_idx	is_causalnum_attention_headsrN   c                    s  t    |||d || _|| _t|d|j|j | _|j|j	 | _
| jd | _|j| _|| _tj|j|j| j |jd| _tj|j|j	| j |jd| _tj|j|j	| j |jd| _tj|j| j |jdd| _| jjd ur| jj}|| j| d |  }|| j | _d S d| _d S )N)r   rN   rP   g      ࿩biasFr    r   )r&   r'   updater(   r   getattrr,   r   rP   rN   r_   rX   attention_dropoutr   r*   r+   attention_biasq_projk_projv_projo_projpad_head_dim_to_multiple_ofhead_dim_padding)r1   r(   r   r   r   rN   target_multipletarget_head_dimr3   r5   r6   r'      s0   

zMoonshineAttention.__init__past_key_valuepast_key_values4.58new_nameversionNr7   position_embeddingsrW   cache_positionkey_value_statesrZ   r8   c                 K   sV  |j d d \}}	| |||	| jj| jdd}
|d u}|d ur9|j| j	}|r6d|j| j	< |j
}n|j}|d ur?|n|}|rV|rV|rV|j| j	 j}|j| j	 j}n7| ||d| jj| jdd}| ||d| jj| jdd}|r|d ur|||| j	d|i\}}|s|\}}t|
|||\}
}|d ur|||d}|||| j	|\}}t}| jjdkrt| jj }| jo|d u o|	dk}| jdkrtjj|
d| jf}
tjj|d| jf}tjj|d| jf}|| |
|||f| jsd	n| j| j|d
|\}}| jdkr|dd | j f }|||	d  }| !|}||fS )NrD   r    rC   Tr   )ry   rx   r   eagerr   rR   )rY   rX   r   .)"rJ   r   viewr(   rN   rP   ra   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   r   r   rl   _attn_implementationr   r   r   r>   r*   rb   padr^   r   rX   rL   rf   r   )r1   r7   r   rW   r   r   r   rZ   bszq_lenquery_statesis_cross_attentionr   current_statesrg   rh   rx   ry   cache_kwargsattention_interfacer   rk   ri   r5   r5   r6   r9      sx   "

	

zMoonshineAttention.forward)NNNNN)r;   r<   r=   __doc__r!   intboolr'   r   r>   r?   r   tupler	   
LongTensorr   r   r9   r@   r5   r5   r3   r6   r      sH    %	r   c                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	MoonshineRotaryEmbeddinginv_freqNr(   c                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)r&   r'   hasattr
isinstancer   dictr   r   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr(   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r1   r(   devicer   r3   r5   r6   r'   -  s   
z!MoonshineRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   rD   r    mpscpuF)device_typeenabledrC   rE   r\   )r   floatrK   rJ   re   r   r   r   strr>   autocastra   ru   rx   r   ry   r\   )
r1   ro   rz   inv_freq_expandedposition_ids_expandedr   freqsembrx   ry   r5   r5   r6   r9   >  s   0&z MoonshineRotaryEmbedding.forwardr$   )r;   r<   r=   r>   r?   __annotations__r!   r'   no_gradr   r9   r@   r5   r5   r3   r6   r   *  s   
 
r   c                       s   e Zd Zdedef fddZedddd							
				ddejde	ej de	ej
 de	e de	e de	ej
 de	eejejf  dee dejfddZ  ZS )MoonshineEncoderLayerr(   r   c                    s`   t    |j| _t||d|j|jd| _t||j| _	t
j|jdd| _t
j|jdd| _d S )NFr(   r   r   r   rN   r   )r&   r'   r,   r   encoder_num_attention_headsencoder_num_key_value_heads	self_attnr#   encoder_hidden_actmlpr*   	LayerNorminput_layernormpost_attention_layernormr1   r(   r   r3   r5   r6   r'   O  s   
zMoonshineEncoderLayer.__init__r   r   r   r   NFr7   rW   rz   	use_cacher   r   rZ   r8   c              
   K   s^   |}	|  |}| jd|||||||d|\}}
|	| }|}	| |}| |}|	| }|S )Nr7   rW   rz   r   r   r   r   r5   )r   r   r   r   )r1   r7   rW   rz   r   r   r   r   rZ   residual_r5   r5   r6   r9   _  s&   




zMoonshineEncoderLayer.forward)NNNFNN)r;   r<   r=   r!   r   r'   r   r>   r?   r   r   r	   r   r   r   r   r9   r@   r5   r5   r3   r6   r   N  s8    	
r   c                !       s   e Zd Zddedee f fddZedddd								
			ddej	deej	 deej	 deej	 deej
 deej
 dee dee deej
 deeej	ej	f  deeej	ej	f  dee deejeeejejf  f fddZ  ZS )MoonshineDecoderLayerNr(   r   c                    s   t    |j| _t||d|j|jd| _t||d|j|jd| _t||j	| _
tj|jdd| _tj|jdd| _tj|jdd| _d S )NTr   Fr   )r&   r'   r,   r   decoder_num_attention_headsdecoder_num_key_value_headsr   encoder_attnrB   decoder_hidden_actr   r*   r   r   r   final_layernormr   r3   r5   r6   r'     s(   
zMoonshineDecoderLayer.__init__r   r   r   r   Fr7   rW   encoder_hidden_statesencoder_attention_maskrz   encoder_position_idsr   r   r   encoder_position_embeddingsrZ   r8   c              
   K   s   |}|  |}| jd||||||	|
d|\}}|| }|d ur8|}| |}| j|||||d\}}|| }|}| |}| |}|| }|S )Nr   )r7   r   rW   r   r   r5   )r   r   r   r   r   r   )r1   r7   rW   r   r   rz   r   r   r   r   r   r   rZ   r   r   r5   r5   r6   r9     s<   






zMoonshineDecoderLayer.forwardr$   )
NNNNNNFNNN)r;   r<   r=   r!   r   r   r'   r   r>   r?   r   r	   r   r   r   r   FloatTensorr9   r@   r5   r5   r3   r6   r     sP    	
r   c                   @   sF   e Zd ZU eed< dZdZdZddgZdZ	dZ
dZdejfdd	Zd
S )MoonshinePreTrainedModelr(   modelinput_valuesTr   r   input_lengthsc                 C   s@   t |d d d }t |d d d }t |d d d }|S )zH
        Computes the output length of the convolutional layers
           @   r       r   rC   )r   )r1   r   output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengthr5   r5   r6    _get_feat_extract_output_lengths  s   z9MoonshinePreTrainedModel._get_feat_extract_output_lengthsN)r;   r<   r=   r!   r   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphr>   r   r   r5   r5   r5   r6   r     s   
 r   c                
       s   e Zd ZdZdZeedZdef fddZ	de
jfdd	Zd
e
jfddZe	ddejdeej dee defddZ  ZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r   )
attentionsr7   r(   c                    s   t     | _ j}tjd|dddd| _tj|d| ddd	| _tjd| |ddd	| _tj	d|d
d| _
t d| _t fddt jD | _tj|dd| _d| _|   d S )Nr    r   r   F)kernel_sizestrider   rC   r   r   )r  r  gh㈵>)
num_groupsnum_channelsepsr(   c                       g | ]}t  |qS r5   )r   .0idxr	  r5   r6   
<listcomp>      z-MoonshineEncoder.__init__.<locals>.<listcomp>r   )r&   r'   r(   r,   r*   Conv1dconv1conv2conv3	GroupNorm	groupnormr   
rotary_emb
ModuleListrangeencoder_num_hidden_layersr   r   
layer_normgradient_checkpointing	post_init)r1   r(   	embed_dimr3   r	  r6   r'     s   zMoonshineEncoder.__init__r8   c                 C      | j S r$   r  r1   r5   r5   r6   get_input_embeddings     z%MoonshineEncoder.get_input_embeddingsrV   c                 C   
   || _ d S r$   r  r1   rV   r5   r5   r6   set_input_embeddings     
z%MoonshineEncoder.set_input_embeddingsNrW   rZ   c           
      K   s<  | d}tj| |}| |}tj| |}tj| |}|	ddd}|durm| 
|jd }d}|ddd|f dd|f }| jjdkrZ|d	k rW|nd}n| jjd
krgt||j}nt||j}tjd|jd |jd d}| ||}| jD ]}	|	|f|||d|}q| |}t|dS )a.  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        r    r   rC   NrD     .flash_attention_2rR   sdpar   )rW   rz   r   )last_hidden_state)rs   r*   rb   tanhr  r  gelur  r  permuter   rJ   r(   r   anyr   r\   r   r>   aranger   r  r   r  r   )
r1   r   rW   rZ   r7   mask_lendownsample_striderz   r   encoder_layerr5   r5   r6   r9     s>   



zMoonshineEncoder.forwardr$   )r;   r<   r=   r   r   r   r   _can_record_outputsr!   r'   r*   Moduler!  r%  r   r>   r   r   r?   r   r   r   r9   r@   r5   r5   r3   r6   r    s(    r  c                       s   e Zd ZdZeedddeeeddddZdef fdd	Z	e
	
	
	
	
	
	
	
	
	
ddeej deej deej dee deej dee deej deej deej dee deeef fddZ  ZS )MoonshineDecoder	input_idsr    r   )index
layer_namer   )r  r7   cross_attentionsr(   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _tj jdd| _t d| _d| _|   d S )Nc                    r
  r5   )r   r  r	  r5   r6   r  W  r  z-MoonshineDecoder.__init__.<locals>.<listcomp>Fr   r	  )r&   r'   pad_token_idpadding_idx
vocab_sizer*   	Embeddingr,   embed_tokensr  r  decoder_num_hidden_layersr   r   normr   r  r  r  r1   r(   r3   r	  r6   r'   P  s   zMoonshineDecoder.__init__NrW   rz   r   inputs_embedsr   r   r   r   rZ   r8   c
              
   K   s  |du |duA rt d|du r| |}|r(|du r(tt| jdt| jd}|du rD|dur4| nd}tj|||jd  |j	d}|du rM|
d}t| j|||||d}|}| ||}|	dur|jd }d	}|	d
dd|f d
d|f }	| jjdkr|	dk r|	nd}	n| jjdkrt|	|j|jd }	n
t|	|j|jd }	| jD ]}||||f|	|||||d|
}q| |}t||r|dS ddS )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr	  r   r    r*  )r(   input_embedsrW   r   r   rz   r[   r'  .r(  rR   r)  )r   rz   r   r   r   r   )r+  r   )
ValueErrorr?  r   r
   r(   get_seq_lengthr>   r0  rJ   r   rs   r   r  r   r/  r   r\   r   r   rA  r   )r1   r7  rW   rz   r   rC  r   r   r   r   rZ   past_seen_tokensrj   r7   r   r1  r2  decoder_layerr5   r5   r6   r9   `  st   

	



zMoonshineDecoder.forward)	NNNNNNNNN)r;   r<   r=   r   r   r   r   r4  r!   r'   r   r   r>   r   r?   r	   r   r   r   r   r   r   r   r9   r@   r5   r5   r3   r6   r6  G  sP    	

r6  rJ   	mask_probmask_length	min_masksc                    s  | \}dk rt dkrt d d dtjd   fdd}|dur:| d	 n
fd
dt|D }tj	|ft
d}g }	|}
|
dkrZ|S |D ];}||}tjjt|d  |dd}t|dkr}d }n|d }t|tj|
| tjd| g}|	| q\t|	}	t|	dddddf ||
f}	|	||
 }	tddddf }t|||
f||
 }|	| }	|	 d krd |	|	d k< t||	dd	 |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r    z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    sX   t |     }t|}| kr }| d  |k r*t| d  d}|S )z;Given input length, compute how many spans should be maskedr    r   )r   max)input_lengthnum_masked_spanepsilonrJ  rI  rK  sequence_lengthr5   r6   compute_num_masked_span  s   
z6_compute_mask_indices.<locals>.compute_num_masked_spanNrD   c                    s   g | ]} qS r5   r5   )r  r   )rR  r5   r6   r    s    z)_compute_mask_indices.<locals>.<listcomp>r   r   F)replace)rE  nprandomranditemdetachsumtolistr  zerosr   choicer0  lenconcatenateonesint32appendarraybroadcast_torL   rM  put_along_axis)rJ   rI  rJ  rW   rK  
batch_sizerS  r   spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrN  rO  spec_aug_mask_idxdummy_mask_idxoffsetsr5   rP  r6   _compute_mask_indices  s\   

rm  c                       s  e Zd Zdef fddZdd Zdd Zdd	 Zd
d Z	dde	j
dee	j fddZee										ddee	j
 dee	j dee	j dee	j deeee	j
   deeeee	j
 f  deee	j
  deee	j  dee dee	j dee defddZ  ZS ) MoonshineModelr(   c                    s,   t  | t|| _t|| _|   d S r$   )r&   r'   r  encoderr6  decoderr  rB  r3   r5   r6   r'   4  s   

zMoonshineModel.__init__c                 C   s   | j jS r$   rp  r?  r   r5   r5   r6   r!  <  s   z#MoonshineModel.get_input_embeddingsc                 C   s   || j _d S r$   rq  r$  r5   r5   r6   r%  ?  s   z#MoonshineModel.set_input_embeddingsc                 C   r  r$   )ro  r   r5   r5   r6   get_encoderB  r"  zMoonshineModel.get_encoderc                 C   s   | j   dS )z
        Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
        not be updated during training.
        N)ro  _freeze_parametersr   r5   r5   r6   freeze_encoderE  s   zMoonshineModel.freeze_encoderNinput_featuresrW   c                 C   s   t | jdds	|S | \}}}| jjdkrE| jrEt||f| jj| jj|| jjd}tj	||j
tjd}|dddf d|d}d||< | jjdkrl| jrlt||f| jj| jj| jjd}tj	||j
tjd}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTr   )rI  rJ  rW   rK  )r   r\   NrD   )rI  rJ  rK  )r   r(   sizemask_time_probr^   rm  mask_time_lengthmask_time_min_masksr>   tensorr   r   rK   mask_feature_probmask_feature_lengthmask_feature_min_masks)r1   ru  rW   rf  r,   rR  mask_time_indicesmask_feature_indicesr5   r5   r6   _mask_input_featuresL  s0   z#MoonshineModel._mask_input_featuresr   decoder_input_idsdecoder_attention_maskencoder_outputsr   decoder_inputs_embedsdecoder_position_idsr   r   rZ   r8   c                 K   sl   |du r| j |fd|i|}| jd||||j||||	|
d	|}t|j|j|j|j|j|j|j|jdS )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        NrW   )	r7  rW   r   r   r   rC  rz   r   r   )r+  r   decoder_hidden_statesdecoder_attentionsr:  encoder_last_hidden_stater   encoder_attentionsr5   )ro  rp  r+  r   r   r7   r  r:  )r1   r   rW   r  r  r  r   r  r  r   r   rZ   decoder_outputsr5   r5   r6   r9   w  s2   .
zMoonshineModel.forwardr$   )
NNNNNNNNNN)r;   r<   r=   r!   r'   r!  r%  rr  rt  r>   r   r   r   r  r   r   r   r   r   r   r   r   r   r9   r@   r5   r5   r3   r6   rn  2  sb    

+	
rn  r7  r;  decoder_start_token_idc                 C   sh   |  | j}| ddddf  |ddddf< ||dddf< |du r*td||dk| |S )z1
    Shift input ids one token to the right.
    NrD   r    r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosrJ   clonerE  masked_fill_)r7  r;  r  shifted_input_idsr5   r5   r6   shift_tokens_right  s   (r  zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    )custom_introc                       s  e Zd ZdgZdef fddZdd Zdd Zd	d
 Zdd Z	de
jfddZee											ddeej deej deej deej deeeej   deeeeej f  deeej  deeej  dee deej deej dee defddZ  ZS ) !MoonshineForConditionalGenerationzproj_out.weightr(   c                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr   )
r&   r'   rn  r   r*   r+   r,   r=  proj_outr  rB  r3   r5   r6   r'     s   
z*MoonshineForConditionalGeneration.__init__c                 C   
   | j  S r$   )r   rr  r   r5   r5   r6   rr    r&  z-MoonshineForConditionalGeneration.get_encoderc                 C   r  r$   )r   get_decoderr   r5   r5   r6   r    r&  z-MoonshineForConditionalGeneration.get_decoderc                 C   r  r$   r  r   r5   r5   r6   get_output_embeddings  r"  z7MoonshineForConditionalGeneration.get_output_embeddingsc                 C   r#  r$   r  )r1   new_embeddingsr5   r5   r6   set_output_embeddings  r&  z7MoonshineForConditionalGeneration.set_output_embeddingsr8   c                 C   r  r$   )r   r!  r   r5   r5   r6   r!    r&  z6MoonshineForConditionalGeneration.get_input_embeddingsNr   rW   r  r  r  r   r  r  r   r   labelsrZ   c                 K   s   |dur|du r|du rt || jj| jj}| j|f||||||||	|
d	|}| |j}d}|dur?| j||| jjd}t	|||j
|j|j|j|j|j|jd	S )a0  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)	rW   r  r  r  r   r  r  r   r   )logitsr  r=  )	lossr  r   r  r  r:  r  r   r  )r  r(   r;  r  r   r  r+  loss_functionr=  r   r   r  r  r:  r  r   r  )r1   r   rW   r  r  r  r   r  r  r   r   r  rZ   outputsr  r  r5   r5   r6   r9     sF   3z)MoonshineForConditionalGeneration.forward)NNNNNNNNNNN)r;   r<   r=   _tied_weights_keysr!   r'   rr  r  r  r  r*   r5  r!  r   r   r   r>   r   r   r   r   r   r   r   r   r   r9   r@   r5   r5   r3   r6   r    s`    	
r  )rn  r   r  )rR   )Nr    )Nr   )Ltypingr   r   r   numpyrU  r>   torch.nnr*   transformers.utils.genericr   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   configuration_moonshiner!   r5  r#   rB   r?   r   rQ   r   rl   rr   r   r   r   r   r   r   r  r6  r   r   ndarrayrm  rn  r  r  __all__r5   r5   r5   r6   <module>   s   

* $4Kbw

w s