o
    irN                     @   s,  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*Z*e+ddG dd dej,j-Z.dS )    N)autocast)UnionDictListTupleOptional)tables)CTC)postprocess_utils)th_accuracy)	to_device)DatadirWriter)
Hypothesis)mae_loss)force_gatherable)LabelSmoothingLoss)add_sos_eos)make_pad_mask)ts_prediction_lfr6_standard)load_audio_text_image_videoextract_fbank)pad_sequencemodel_classesParaformer_v2_communityc                +       s  e Zd ZdZ																				
	
	
dBdee dee dedee dedee dedee dedee dedededededededede	de	de	f* fd d!Z
d"ejd#ejd$ejd%ejd&eejeeejf ejf f
d'd(Zd"ejd#ejd&eejejf fd)d*Zd+ejd,ejd-ejd.ejfd/d0Zd+ejd,ejd-ejd.ejfd1d2Zd3d4 ZdCd5ejd6ejd&efd7d8Zd9d: Zd;d< Z				dDd=efd>d?Zd@dA Z  ZS )E
Paraformerz
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    N      ?P   r                 Fspecaugspecaug_conf	normalizenormalize_confencoderencoder_confdecoderdecoder_confctcctc_conf
ctc_weight
input_size
vocab_size	ignore_idblank_idsoseos
lsm_weightlength_normalized_lossshare_embeddinguse_1st_decoder_lossc                    s  t    |d urtj|}|di |}|d ur'tj|}|di |}tj|}|dd|i|}| }|d urNtj|}|d||d|}|dkrb|
d u rXi }
t	d||d|
}	|| _
|d urk|n|d | _|d urv|n|d | _|| _|| _|| _|| _|| _|| _|dkrd | _n|| _t||||d| _|dkrd | _n|	| _|| _| jrd | j_|| _|| _d | _d | _d S )	Nr,   )r-   encoder_output_sizer    )odimr6   r   g      ?)sizepadding_idx	smoothingnormalize_length )super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizedecoder_classesr	   r/   r0   r1   r-   r.   r+   r!   r#   r%   r'   r   criterion_attr)   r4   embedr5   r3   beam_searcherror_calculator)selfr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   kwargsspecaug_classnormalize_classencoder_classr6   decoder_class	__class__r<   _/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/paraformer_v2_community/model.pyr>   &   sd   
"
zParaformer.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   sd  t | dkr|dddf }t | dkr |dddf }|jd }| ||\}}d\}	}
d}t }| jdkrU| ||||\}	}
|	durM|	 nd|d< |
|d< | ||||\}}}}| jdkri|}n| j|	 d| j |  }|dur}| nd|d< ||d	< ||d
< ||d< t	
| |d< ||d< | jr| }t|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Nr   NNr    loss_ctccer_ctcloss_attacccerwerloss
batch_size)lenr8   shapeencodedictr+   _calc_ctc_lossdetach_calc_att_losstorchcloner3   sumr   device)rI   rR   rS   rT   rU   rJ   r_   encoder_outencoder_out_lensrX   rY   loss_prestatsrZ   acc_attcer_attwer_attr^   weightr<   r<   rQ   forward   sD   



zParaformer.forwardc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | ||\}}}t|trC|d }||fS )zEncoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr   )r   r!   trainingr#   r%   
isinstancetuple)rI   rR   rS   rJ   rk   rl   _r<   r<   rQ   rb      s   


zParaformer.encoderk   rl   ys_padys_pad_lensc              	   C   s  d }| d}| j|}g }t|D ]V}	||	d ||	 f }
||	d ||	 f }t & |
 }| j| | | j	d}|
|j}| || j	}W d    n1 sVw   Y  | |
|||	 }|| qt|dd
|j}| ||||}|d |d }}|d u r|}| ||}t|d| j|| jd}| js| jd u rd\}}n|jdd	}| | | \}}||||fS )
Nr   )r/   T)batch_firstr   r   )ignore_labelrW   dim)r8   r)   softmaxrangerg   no_gradlogforce_aligncpur/   torj   map_alignment_to_target_indexaverage_repeats_trainingappendr   r'   rE   r   viewr-   r.   rt   rH   argmax)rI   rk   rl   rx   ry   decoder_out_1str_   ctc_probs_allcompressed_ctc_listb
ctc_prob_btext_bctc_log_prob_b
align_pathtarget_idx_pathctc_comppadded_ctc_inputdecoder_outsdecoder_outrw   rZ   ro   rp   rq   ys_hatr<   r<   rQ   rf      s>   	


zParaformer._calc_att_lossc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r)   rt   rH   r   datar   )rI   rk   rl   rx   ry   rX   rY   r   r<   r<   rQ   rd   3  s   zParaformer._calc_ctc_lossc                 C   sP   ||k}t |d}||d< |||k@ }t j| ddd }t ||d}|S )a  
        Robustly map CTC alignment path (Token IDs) to Target Indices.
        
        Logic:
            Detect boundaries where a new token segment begins.
            A segment starts if the current frame is a Token AND it is different from the previous frame 
            (considering CTC topology where repeats are separated by blanks or are distinct tokens).
        
        Example:
            Text: [A, B]
            Align Path: [A, A, _, B, B]
            Output:     [0, 0, -1, 1, 1]
        r   r   r|   r   )rg   rollcumsumlongwhere)rI   r   r/   is_token	prev_pathnew_segment_startsegment_idsr   r<   r<   rQ   r   D  s   z(Paraformer.map_alignment_to_target_index	ctc_probsyc                 C   s6   |d   }|d   }tjj|||d\}}|d S )a  ctc forced alignment.

        Args:
            torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D)
            torch.Tensor y: id sequence tensor 1d tensor (L)
            int blank_id: blank symbol index
        Returns:
            torch.Tensor: alignment result
        N)blankr   )r   
torchaudio
functionalforced_align)rI   r   r   r/   
alignmentsrw   r<   r<   rQ   r   e  s   
zParaformer.force_alignc                 C   s   |}| d}tj||f|j|jd}tj|df|j|jd}|dk}|| }	|| }
|	 dkr3|S |	dd|}|d||
 tj	|	 ddf|jd}|d|	d| ||d  }|S )a  
        Aggregates frames belonging to the same target index using scatter_add.
        
        Args:
            ctc_probs: [T, V]
            target_idx_path: [T], values in [-1, 0, ... U-1]
            target_len: U
        Returns:
            compressed: [U, V]
        r   )rj   dtyper   r   rj   g&.>)
r8   rg   zerosrj   r   numel	unsqueezerepeatscatter_add_ones)rI   r   r   
target_lenUV
compressedcountsmaskvalid_indicesvalid_probsindex_expandedr   r<   r<   rQ   r   t  s   
z#Paraformer.average_repeats_trainingc                 C   s   |  dkrtjd|df|jdS tj|dd\}}tj|dd}ttjdg|jd|dd g}g }t	|D ]$\}}	|	| j
kr_||  }
||  }||
| jdd}|| q;|sotjd|df|jdS t|S )	z
        Returns:
            merged_probs: [U', V]
            timestamps: List[Tuple[int, int]] -> [(start_frame, end_frame), ...]
        r   r   r   T)return_countsr|   Nr   )r   rg   r   r8   rj   unique_consecutiver   cattensor	enumerater/   itemmeanr   stack)rI   r   greedy_pathunique_tokensr   end_indicesstart_indicesmerged_probsitokenstartendavg_probr<   r<   rQ   average_repeats_inference  s    $


z$Paraformer.average_repeats_inferencekeyc           !         s  i }t |tjr7|dddkr7||}}	t|jdk r'|d d d d d f }|	d ur1|	d}	nQ|jd }	nKt }
t	||j
|dd|dd|d	}t }||
 d
|d< t||dd|d\}}	t }|| d
|d< |	  |j |j d |d< |j|d d}|	j|d d}	|ddr| } ||	\}}t |tr|d } j|}|jdd}g }| \}}}t |d ttfr|d }t||k r|| }t|D ]q}||d || f }||d || f } ||}|ddkrg }q|d}tj|dg|jd} |||d  |||d  ||\}}|jddd }| }tt  fdd|}|| |d} |!|  q||fS )N	data_typesoundfbank   r   r   fsi>  )r   audio_fsr   	tokenizerz0.3f	load_data)r   frontendextract_feati  batch_data_timerj   r   fp16Fr   r|   c                    s   |  j ko|  jko|  jkS )N)r1   r0   r/   )xrI   r<   rQ   <lambda>  s    z&Paraformer.inference.<locals>.<lambda>)r   	token_int)"ru   rg   Tensorr@   r`   ra   squeezetimeperf_counterr   r   r   ri   r   frame_shiftlfr_nr   halfrb   rv   r)   r~   r   r8   listr   r   r   r   rj   r'   tolistfilterr   )!rI   data_indata_lengthsr   r   r   rJ   	meta_datarR   rS   time1audio_sample_listtime2time3rk   rl   r   ctc_greedy_pathsresultsr_   ndr   probspathcompressed_probr   compressed_prob_inin_lensr   rw   yseqresult_ir<   r   rQ   	inference  s   







zParaformer.inferencec                 K   s2   ddl m} d|vrd|d< |dd| i|}|S )Nr   )export_rebuild_modelmax_seq_leni   modelr<   )export_metar   )rI   rJ   r   modelsr<   r<   rQ   export  s
   zParaformer.export)NNNNNNNNNNr   r   r   r   r   r   r   r    FFF)r   )NNNN)__name__
__module____qualname____doc__r   strr   floatintboolr>   rg   r   r   rs   rb   rf   rd   r   r   r   r   r   r   r   __classcell__r<   r<   rO   rQ   r      s    	
v
D

6
!%$
br   )/r   copyrg   loggingtorch.cuda.ampr   typingr   r   r   r   r   funasr.registerr   funasr.models.ctc.ctcr	   funasr.utilsr
   funasr.metrics.compute_accr   funasr.train_utils.device_funcsr   funasr.utils.datadir_writerr   funasr.models.paraformer.searchr   &funasr.models.paraformer.cif_predictorr   r   "funasr.losses.label_smoothing_lossr   +funasr.models.transformer.utils.add_sos_eosr   *funasr.models.transformer.utils.nets_utilsr   funasr.utils.timestamp_toolsr   funasr.utils.load_utilsr   r   torch.nn.utils.rnnr   r   registernnModuler   r<   r<   r<   rQ   <module>   s0   
