o
    ߥi                     @   s  d Z ddlZddlmZ ddlmZmZ ddlZddl	Z	ddl
mZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddl	mZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/ e%0 Z1dZ2dZ3dZ4e5e	j6Z7e5dZ8dZ9dZ:e;dZ<ddgZ=zddl>m?Z@ dZAG dd de@Z?W n eBy   dZAY nw G dd  d e.ZCeG d!d" d"e ZDeG d#d$ d$e ZEd%ZFd&ZGd'ZHG d(d) d)ejIZJG d*d+ d+ejIZKG d,d- d-eCZLe"d.eFG d/d0 d0e-ZMdS )1z PyTorch OFA-MMSpeech model.    N)	dataclass)OptionalTuple)compute_mask_indices)TransformerSentenceEncoderLayer)	LayerNormSamePadTransposeLast)init_bert_params)	index_put)version)nn)
functional)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forward)logging   )MMSpeechConfig)utils)	Embedding
OFADecoderOFAModelOFAPreTrainedModel_expand_maskzmmspeech-baser   OFATokenizerz1.9.1i   g    חAzmmspeech-large)FusedLayerNormTc                       s$   e Zd Zejj fddZ  ZS )r   c                    sP   |j s	t |S tj|j t |W  d    S 1 s!w   Y  d S N)is_cudasuperforwardtorchcudadevice)selfx	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/ofa/modeling_mmspeech.pyr!   ?   s
   
$zFusedLayerNorm.forward)__name__
__module____qualname__r"   jitunusedr!   __classcell__r)   r)   r'   r*   r   =   s    r   Fc                   @   s   e Zd ZdZeZdddZdS )MMSpeechPreTrainedModelz
    Base class OFA
    Fc                 C      t |ttfr||_dS dS z?
        Turn on the switch of gradient checkpointing.
        N
isinstancer   MMSpeechEncodergradient_checkpointingr%   modulevaluer)   r)   r*   _set_gradient_checkpointingR      
z3MMSpeechPreTrainedModel._set_gradient_checkpointingNF)r+   r,   r-   __doc__r   config_classr;   r)   r)   r)   r*   r1   K   s    r1   c                   @   s   e Zd ZU dZdZejed< dZejed< dZ	ejed< dZ
eeej  ed< dZeeej  ed< dZeej ed< dZeej ed	< dS )
MMSpeechEncoderOutputa  
    Base class for OFA's outputs.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(bsz, seq_len, hidden)`):
            Sequence of hidden-states at the output of the last layer of the model.

        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed
            or when `config.output_hidden_states=True`):

            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(bsz, seq_len, hidden)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.

        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed
            or when `config.output_attentions=True`):

            Tuple of `torch.FloatTensor` (one for each layer) of shape `(bsz, num_heads, seq_len, seq_len)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

        position_embedding (`torch.FloatTensor` of shape `(bsz, seq_len, hidden)`):
            postional embeddings of the inputs.
    Nphone_distributionlast_hidden_statepadding_maskhidden_states
attentionsposition_embeddingkl_loss)r+   r,   r-   r>   rA   r"   Tensor__annotations__rB   rC   rD   r   r   FloatTensorrE   rF   rG   r)   r)   r)   r*   r@   Z   s   
 r@   c                   @   s  e Zd ZU dZdZeej ed< dZ	ejed< dZ
eeeej   ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dZeej ed	< dZeeej  ed
< dZeeej  ed< dZeej ed< dZeej ed< dZeej ed< dS )MMSpeechModelOutputa  
    Base class for sequence-to-sequence language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*,
            returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*,
            returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*,
            returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights of the decoder, after the attention softmax,
            used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*,
            returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights of the decoder's cross-attention layer,
            after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`,
            *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*,
            returned when `output_hidden_states=True` is passed
            or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*,
            returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    Nlosslogitspast_key_valuesdecoder_hidden_statesdecoder_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentionsencoder_padding_maskrA   rG   )r+   r,   r-   r>   rL   r   r"   rJ   rI   rM   rN   r   rO   rP   rQ   rR   rS   rT   rU   rH   rA   rG   r)   r)   r)   r*   rK   ~   s   
 6rK   aG  
    This model inherits from [`OFAModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`~MMSpeechConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Image captioning example:

    ```python
    >>> import soundfile as sf
    >>> import torchaudio
    >>> import torchaudio.compliance.kaldi as ta_kaldi
    >>> wav, sr = sf.read(data[self.column_map['wav']])
    >>> wav = torchaudio.sox_effects.apply_effects_tensor(
    >>>         wav, sr,
    >>>         [['speed', '1.0'], ['rate', '16000'], ['gain', '-n'], ['channels', '1']]))
    >>> wav = wav * (2**15)
    >>> wav = torch.from_numpy(wav.numpy())
    >>> fbank = ta_kaldi.fbank(
            waveform, num_mel_bins=n_bins, sample_frequency=sample_rate)
    >>> fbank_mask = torch.tensor([True])
    >>> model = MMSpeechModel.from_pretrained(ckpt_dir)
    >>> tokenizer = OFATokenizerZH.from_pretrained(ckpt_dir)

    >>> gen = model.generate(fbank=fbank, fbank_mask=fbank_mask, num_beams=4)
    >>> print(tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=False))
    ```
aL  
    Args:
        input_ids (`torch.LongTensor` of shape `(bsz, seq_len)`):
            indices of input sequence tokens in the vocabular, and padding will be ignored by default;

            indices can be obtained using [`~OFATokenizer`].

        patch_images (`torch.FloatTensor` of shape `(bsz, 3, height, width)`):
            the resized image, which are transformed by the default operations.
        patch_images_2 (`torch.FloatTensor` of shape `(bsz, 3, height, width)`):
            the second (if it exists) image.
        patch_masks (`torch.BoolTensor`): the patches to be masked.
        token_embeddings (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`): token embeddings.
        sample_patch_num (`int`): the number of patches to sample.
        fbank (`torch.Tensor`): fbank feature of audio.
        fbank_length (`torch.Tensor`): fbank length of audio.
        fbank_masks (`torch.BoolTensor`): whether to have fbank feature.
        phone_items (`torch.Tensor`): phoneme sequence.
        phone_masks (`torch.BoolTensor`): whether to have phoneme feature.
        features_only (`torch.BoolTensor`): whether to return encoder features only.
        mask (`torch.BoolTensor`): whether to mask fbank feature.
        mask_prob (`torch.Tensor`): the prob of mask fbank feature.
        layer (`int`): the number of layer to cache hidden state.
        decoder_input_ids (`torch.LongTensor` of shape `(bsz, seq_len)`): indices of the sequence in the vocabulary.
        code_masks (`torch.Tensor` of shape `(bsz, seq_len)`): masks only for code generation.
        attention_mask (`torch.Tensor` of shape `(bsz, seq_len)`): attention mask for decoding.
        encoder_outputs (`OFAEncoderOutput`):
            encoder outputs with hidden states, positional embeddings, and padding masks.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(bsz, num_heads, tgt_len, head_size)`) and 2 additional tensors of
            shape `(bsz, num_heads, src_len, head_size)`.
        use_cache (`bool`): whether to use cache for faster inference.
        output_attentions (`bool`): whether to output attention weights.
        output_hidden_states (`bool`): whether to output hidden states.
        return_dict (`bool`): unused. Keep it for generation only.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
c                       sX   e Zd ZdZdedef fddZdd Zdejd	ejd
e	ejejf fddZ
  ZS )Conv2dSubsampling4zConvolutional 2D subsampling (to 1/4 length).

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.

    idimodimc              
      s   t    tjtjd|ddtj tj||ddtj | _tjtj||d d d d  || _	d| _
d| _dS )z'Construct an Conv2dSubsampling4 object.r               N)r    __init__r"   r   
SequentialConv2dReLUconvLinearoutsubsampling_rateright_context)r%   rW   rX   r'   r)   r*   r]   !  s   
 
zConv2dSubsampling4.__init__c                 C   s6   |  }tdD ]}| d d d   }q|S )NrZ   r   )clonerangefloatfloorlong)r%   in_seq_lens_tensorrc   _r)   r)   r*   get_out_seq_lens_tensor2  s   z*Conv2dSubsampling4.get_out_seq_lens_tensorr&   x_lengthreturnc                 C   sV   | d}| |}| \}}}}| |dd |||| }|| |fS )a  Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 4.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 4.

        r   rZ   )	unsqueezera   sizerc   	transpose
contiguousviewrm   )r%   r&   rn   bctfr)   r)   r*   r!   8  s
   

$zConv2dSubsampling4.forward)r+   r,   r-   r>   intr]   rm   r"   rH   r   r!   r0   r)   r)   r'   r*   rV     s    	rV   c                       sj   e Zd ZdefddZdef fddZ					dddZ							dd
dZdd Zdd Z	  Z
S )TransformerEncoderargsc              
   C   s,   t | j|j|j| j|j|j|j|jd}|S )N)embedding_dimffn_embedding_dimnum_attention_headsdropoutattention_dropoutactivation_dropoutactivation_fnlayer_norm_first)	r   r|   encoder_ffn_dimencoder_attention_headsr   r   r   activation_functionencoder_normalize_before)r%   r{   layerr)   r)   r*   build_encoder_layerQ  s   
z&TransformerEncoder.build_encoder_layerc                    s   t     j_ j_ j_ j}|dkr< j}td j| }dd }|j| j	|_
|j| j	|_ndd }|j j j	_
|j j j	_t fddt jD _ j_tj_tj_ j_t d S )	Nr   rY   c                    s    t j fddt|D  S )Nc                    sH   g | ] }t t j  d  dtt t ddt t  qS )rZ   kernel_sizepaddinggroupsF)elementwise_affine)r   r^   Conv1dr   r	   r   GELU.0rl   egkr)   r*   
<listcomp>k  s"    
zHTransformerEncoder.__init__.<locals>.make_conv_block.<locals>.<listcomp>)r   r^   rg   )r   r   r   lar)   r   r*   make_conv_blockj  s   z4TransformerEncoder.__init__.<locals>.make_conv_blockc                 S   s   t j| | ||d |d}d}tdd|  ||   }t jj|jd|d t j|jd t j	j
|ddd}t |t|t  }|S )	NrZ   r   r   r[   g      ?)meanstdweight)namedim)r   r   mathsqrtinitnormal_r   	constant_biasr   weight_normr^   r   r   )r   r   r   pos_convr   r   r)   r)   r*   make_conv_pos  s   z2TransformerEncoder.__init__.<locals>.make_conv_posc                    s   g | ]}  qS r)   )r   r   r{   r%   r)   r*   r     s    
z/TransformerEncoder.__init__.<locals>.<listcomp>)r    r]   r   d_modelr|   required_seq_len_multipleencoder_pos_conv_depthmaxencoder_conv_posencoder_conv_pos_groupsr   phone_pos_convr   
ModuleListrg   encoder_layerslayersr   r   r   
layer_normphone_layer_normencoder_layerdrop	layerdropapplyr
   )r%   r{   pos_conv_depth
num_layersr   r   r   r'   r   r*   r]   ^  sH   

zTransformerEncoder.__init__Nc           
      C   sB   | j ||||||d\}}}}	| jr|d u r| |}||||	fS )N)context_layer)extract_featuresr   r   )
r%   r&   rC   phone_xphone_padding_maskr   r   layer_resultsx_convpre_padding_maskr)   r)   r*   r!     s   
zTransformerEncoder.forwardr   c                 C   s  |d ur
t ||d}| |dd}|dd}|| }| js%| |}|d urN|d ur3t ||d}| |dd}	|	dd}	||	 }| jsN| |}| }
tj|| j| j	d}|dd}g }d }t
| jD ]z\}}||k r{|  du r{qk||kr|d ur|	d ur|dd}tj||gdd}tj||gdd}| }
tj||	gdd}|dd}| jdkrtj nd}| j	r|| jkr|||dd\}\}}||kr||||f ||kr|} nqk|d ur|}|dd}||||
fS )Nr   r   rZ   )ptrainingFr   )self_attn_padding_maskneed_weights)r   r   rr   r   r   r   rf   Fr   r   	enumerater   anyr"   catr   nprandomappend)r%   r&   rC   r   r   	tgt_layer	min_layerr   r   phone_x_convr   r   rir   dropout_probabilityzlrr)   r)   r*   r     sd   




z#TransformerEncoder.extract_featuresc                 C   s   | j jS )z/Maximum output length supported by the encoder.)r{   encoder_max_positionsr%   r)   r)   r*   max_positions  s   z TransformerEncoder.max_positionsc                 C   s   |S )z@Upgrade a (possibly old) state dict for new versions of fairseq.r)   )r%   
state_dictr   r)   r)   r*   upgrade_state_dict_named  s   z+TransformerEncoder.upgrade_state_dict_named)NNNNN)NNNNr   N)r+   r,   r-   r   r   r]   r!   r   r   r   r0   r)   r)   r'   r*   rz   O  s$    R

Krz   c                       s   e Zd Z	ddedeej f fddZdd Zdd	 Z				d d
dZ
dejfddZ										d!deej deej deej deej deej deej deej deej fddZdd Zedd Z  ZS )"r6   Ncfgembed_tokensc                    s   t  | || _|j| _td|j| _t|j|j| _	|j
| _
| j
| _t|j| j| j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _ |j!| _"|j#| _$|j%| _&|j'| _(t)|j*| _+t)|j,| _-t.t/0|j1 | _2t3|| _4t| j| j| _5d| _6d S )NP   r   )7r    r]   r   r   embedrV   	subsampler   rb   post_subsample_projpadding_idxphone_padding_idxr   phone_vocab_sizephone_item_embeddingaudio_mask_prob	mask_probaudio_mask_selectionmask_selectionaudio_mask_other
mask_otheraudio_mask_lengthmask_lengthaudio_no_mask_overlapno_mask_overlapaudio_mask_min_spacemask_min_spaceaudio_mask_channel_probmask_channel_probaudio_mask_channel_beforemask_channel_beforeaudio_mask_channel_selectionmask_channel_selectionaudio_mask_channel_othermask_channel_otheraudio_mask_channel_lengthmask_channel_lengthaudio_no_mask_channel_overlapno_mask_channel_overlapaudio_mask_channel_min_spacemask_channel_min_spaceDropoutencoder_dropout_inputdropout_inputencoder_dropout_featuresdropout_features	Parameterr"   rJ   uniform_mask_embrz   encoder
final_projnum_updates)r%   r   r   r'   r)   r*   r]     s:   


zMMSpeechEncoder.__init__c                 C   s   | j S )z+
        Get the embedding weight.
        r   r   r)   r)   r*   get_input_embeddingsG  s   z$MMSpeechEncoder.get_input_embeddingsc                 C   s
   || _ dS )zD
        Set the weight of embedding with the given tensor.
        Nr	  )r%   r:   r)   r)   r*   set_input_embeddingsM  s   
z$MMSpeechEncoder.set_input_embeddingsc           	      C   sf  |j \}}}| jdkr7| jr7t||fd | j| j| j| j| j| jd}t	
||jdd|d}d||< | jdks@|d urv|d u rn|d u rK| j}t||f||| j| j| jd| j| j| jj| jjd}t	
||j}t||| j}nd }| jdkr| js|d u rt||fd | j| j| j| j| j| jd}t	
||jdd|d}t||d}||fS )Nr   )
no_overlap	min_spacer   )	min_masksr  r  require_same_masksmask_dropout)shaper   r   r   r   r   r   r   r   r"   
from_numpytor$   rp   expandr   r   r   r   r   r   r   r  r  r   r  )	r%   r&   rC   mask_indicesmask_channel_indicesr   BTCr)   r)   r*   
apply_maskS  sr   

zMMSpeechEncoder.apply_maskinput_lengthsc                 C   sN   dd }t | jj}tt|D ]}|||| d || d }q|tjS )zH
        Computes the output length of the convolutional layers
        c                 S   s   t | | | d S )Nr   )r"   ri   )input_lengthr   strider)   r)   r*   _conv_out_length  s   zJMMSpeechEncoder._get_feat_extract_output_lengths.<locals>._conv_out_lengthr   rZ   )evalr   conv_feature_layersrg   lenr  r"   rj   )r%   r  r  conv_cfg_listr   r)   r)   r*    _get_feat_extract_output_lengths  s   

z0MMSpeechEncoder._get_feat_extract_output_lengthsTFfbankfbank_lengthfbank_masksphone_itemsphone_masksfeatures_onlymaskr   c           #   	   C   s  |  ||\}}| jd ur| |}t|jd d d|j}t|D ]\}}||jd  }|dk r>d|||d f< q'|	 }| 
|}|rU| j|||d\}}n|}d }d|| < d }d }|d ur| |}|| j}d|| < |d ur||  }tj||gdd}|	 }| j|||||	d	d
\}}}}| jjd| jjd d f }|du r| }t||d }|rt|dd|||dS di i}t C | j  | jj||||dd	d\}}}}|||d}| jjd| jjd d f }t|d |d }|| }| j  W d    n	1 sw   Y  || } dd }!|}|!|   |  }"t  | !||d< | !|   |d< W d    n	1 sQw   Y  | j"dkr|d | jj#k rt$%d|d &  d| jj# d t'd|d &  d| jj# d| j"dkr|d | jj(k rt$%d|d &  d| jj( d t'd|d &  d| jj( dt|dd||||"dS )NrZ   Fr  r   T)r   r   r   r\   )rC   r   r   r   r   rY   )rA   rB   rC   rF   losses)rC   r   r   r   r   )r&   rC   r   r&   c                 S   s(   t jtj| ddtj|dddd}|S )Nr  r   sum)	reduction)r   kl_divr   log_softmaxsoftmax)r   qrL   r)   r)   r*   _kl_loss  s   z)MMSpeechEncoder.forward.<locals>._kl_loss
target_varpred_vari  ztarget var is z < z	, exitingzpred var is )rA   rB   rC   rF   rG   ))r   r   r"   
BoolTensorr  fill_r  r$   r   rf   r   r  r   eqr   	new_zerosrq   boolr   r  r   r   phone_dict_sizedetachr   linearr@   rr   no_gradr   r   trainrh   compute_varr  min_target_varloggererroritem	Exceptionmin_pred_var)#r%   r%  r&  r'  r(  r)  r*  r+  r   r   output_hidden_statesfeaturesfbank_feature_lengthrC   r   ldiffpre_encoder_featuresr&   r  r   r   phone_mask_indicesr   r   	pos_embed
emb_weightrA   resultyy_layer_resultsrl   	y_studentr3  rG   r)   r)   r*   r!     s   









	

zMMSpeechEncoder.forwardc              	   C   s
  d|vrd}n|d  d|}d|vrd}n|d  d|}d|vr%d}n|d  d|}d|vr4d}n|d }d}t|dkrRt|D ]\}}	||	 d|f7 }qDd|vrYd}
n|d }
d}d	|v rg|d	 }t|d
 dkrrd}n|d
  d|}t|||||
||dS )a  
        Reorder encoder output according to *new_order*.

        Args:
            encoder_out: output from the ``forward()`` method
            new_order (LongTensor): desired order

        Returns:
            *encoder_out* rearranged according to *new_order*
        rB   Nr   rC   rF   rD   r)   rE   rG   rA   r   )rA   rB   rC   rD   rE   rF   rG   )index_selectr"  r   r@   )r%   encoder_out	new_ordernew_encoder_outnew_encoder_padding_masknew_position_embeddingsnew_encoer_statesencoder_statesidxstaterE   new_kl_lossnew_phone_distributionr)   r)   r*   reorder_encoder_out2  s`   z#MMSpeechEncoder.reorder_encoder_outc                 C   s   |  d| d} t rMt| d }| jdd}| d jdd}t| t| t| ||d  |d ||d    }t	|d 
 S t	| jddd 
 S )Nr  r   r   rZ   r   gư>)rt   rq   distis_initializedr"   tensorr#   r-  
all_reducer   r   var)rQ  zczszssre  r)   r)   r*   r@  r  s   


 zMMSpeechEncoder.compute_varr   )NNN)
NNNNNTFNNF)r+   r,   r-   r   r   r   r   r]   r
  r  r  r"   
LongTensorr$  rH   r!   r`  staticmethodr@  r0   r)   r)   r'   r*   r6     s^    /	
A

 @r6   zQThe bare OFA Model outputting raw hidden-states without any specific head on top.c                       s   e Zd ZdZeZdef fddZeee	e
eeeddd Z														
								
	
	
	
dddZdddZ  ZS )MMSpeechModelz
    The OFA model built with an encoder and a decoder only, without any classification head.

    Args:
        config (MMSpeechConfig): OFA configuration.
    configc                    sh   t  | t|dd| _|j|j| _}t||j	| j}t
||| _t||| _|j| _|   d S )Ndisable_entangleF)r    r]   getattrrm  pad_token_id
vocab_sizer   r   r   r   r6   r  r   decoder
use_ofasys	post_init)r%   rl  kwargsrp  sharedr'   r)   r*   r]     s   zMMSpeechModel.__init__)processor_class
checkpointoutput_typer?   c                 K   s0   |d }|rt j| ddS t j| ddS )z@Get normalized probabilities (or log probs) from a net's output.rA   r  r   )r   r0  rh   r1  )r%   
net_output	log_probsrt  rM   r)   r)   r*   get_encoder_normalized_probs  s   	z*MMSpeechModel.get_encoder_normalized_probsNTFc                 C   s   |r|n| j j}|r|n| j j}|dur|n| j j}|du r,| j|||	|
|||||d	}|| j j r;|| j}|j	}t
|j|j|jd }|j}| j||||||||||d
}t|j	|j|j|j|j|j	|j|j|j|j|jdS )a   
        Args:
            input_ids (`torch.LongTensor` of shape `(bsz, seq_len)`):
                indices of input sequence tokens in the vocabular, and padding will be ignored by default;

                indices can be obtained using [`~OFATokenizer`].

            patch_images (`torch.FloatTensor` of shape `(bsz, 3, height, width)`):
                the resized image, which are transformed by the default operations.
            patch_images_2 (`torch.FloatTensor` of shape `(bsz, 3, height, width)`):
                the second (if it exists) image.
            patch_masks (`torch.BoolTensor`): the patches to be masked.
            token_embeddings (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`): token embeddings.
            sample_patch_num (`int`): the number of patches to sample.
            decoder_input_ids (`torch.LongTensor` of shape `(bsz, seq_len)`): indices of the sequence in the vocabulary.
            code_masks (`torch.Tensor` of shape `(bsz, seq_len)`): masks only for code generation.
            attention_mask (`torch.Tensor` of shape `(bsz, seq_len)`): attention mask for decoding.
            encoder_outputs (`OFAEncoderOutput`):
                encoder outputs with hidden states, positional embeddings, and padding masks.
            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(bsz, num_heads, tgt_len, head_size)`) and 2 additional tensors of
                shape `(bsz, num_heads, src_len, head_size)`.
            use_cache (`bool`): whether to use cache for faster inference.
            output_attentions (`bool`): whether to output attention weights.
            output_hidden_states (`bool`): whether to output hidden states.
            return_dict (`bool`): unused. Keep it for generation only.

        Returns:
            OFASpeechOutput:
                last_hidden_state (`torch.FloatTensor` of shape `(bsz, seq_len, hidden)`): the last decoder hidden states.
                past_key_values (`tuple(tuple(torch.FloatTensor)): past keys and values for faster inference.
                decoder_hidden_states (`tuple(torch.FloatTensor)`): the decoder hidden states of all layers.
                decoder_attentions (`tuple(torch.FloatTensor)): the decoder self attention weights of all layers.
                cross_attentions (`tuple(torch.FloatTensor)): cross attention weights of all layers.
                encoder_last_hidden_state (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`):
                    the encoder last hidden state.
                encoder_hidden_states (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`):
                    the encoder states of all layers including the embeddings.
                encoder_attentions (`torch.FloatTensor` of shape `(bsz, num_heads, seq_len, seq_len)`):
                    the encoder attention weights of all layers.
        N)	r%  r&  r'  r(  r)  r*  r+  r   r   r  )
	input_idsattention_maskrS   encoder_attention_mask
code_maskssrc_pos_embedrN   	use_cacheoutput_attentionsrG  )rM   rN   rO   rP   rQ   rR   rS   rT   rU   rA   rG   )rl  r  rG  r  r  r8  ro  r   r   rB   r   rC   dtyper  rF   rq  rK   rN   rD   rE   rQ   rA   rG   )r%   r|  patch_imagespatch_images_2patch_maskstoken_embeddingssample_patch_numr%  r&  r'  r(  r)  r*  r+  r   r   decoder_input_idsr  r}  encoder_outputsrN   r  r  rG  return_dictrS   r~  r  decoder_outputsr)   r)   r*   r!     sf   DzMMSpeechModel.forwardc                 C   r2   r3   r4   r8   r)   r)   r*   r;   (  r<   z)MMSpeechModel._set_gradient_checkpointing)NNNNNNNNNNNTFNNNNNNNFFFFr=   )r+   r,   r-   r>   r   r?   r]   r   MMSPEECH_INPUTS_DOCSTRINGr   _TOKENIZER_FOR_DOC_CHECKPOINT_FOR_DOCrK   _CONFIG_FOR_DOCr{  r!   r;   r0   r)   r)   r'   r*   rk    sL    	
yrk  )Nr>   r   dataclassesr   typingr   r   numpyr   r"   torch.distributeddistributedra  fairseq.data.data_utilsr   fairseq.models.wav2vec.wav2vec2r   fairseq.modulesr   r   r	   ,fairseq.modules.transformer_sentence_encoderr
   fairseq.utilsr   	packagingr   r   torch.nnr   r   transformers.file_utilsr   r   r   r   transformers.utilsr   configuration_mmspeechr   generater   modeling_ofar   r   r   r   r   
get_loggerrB  r  r  r  parse__version__TORCH_VERSIONTORCH_MESH_GRID_WARNING_VERSIONDEFAULT_MAX_SOURCE_POSITIONSDEFAULT_MAX_TARGET_POSITIONSry   DEFAULT_MIN_PARAMS_TO_WRAP!OFA_PRETRAINED_MODEL_ARCHIVE_LISTapex.normalizationr   _FusedLayerNormhas_fused_layernormImportErrorr1   r@   rK   MMSPEECH_START_DOCSTRINGMMSPEECH_GENERATION_EXAMPLEr  ModulerV   rz   r6   rk  r)   r)   r)   r*   <module>   sp   

#E+8 H  p