o
    iwR                     @   s   d dl Z d dlZd dlZd dlm  mZ d dlm	Z	 d dl
mZ G dd dZddd dddddddd	ejfd
ejdejdedededededejdedejdejdejfddZdS )    N)sequence_mask)make_pad_maskc                   @   s(  e Zd ZdZ						d6deded	ed
ededefddZdd ZddefddZd7ddZ	d8ddZ
dd Zdd Zdddejfd d!Zdddejfd"d#Zddd$ejfd%d&Zddd'ejfd(d)Zddddd*ejfd+d,Zddddd-ejfd.d/Zdddd0ejfd1d2Zdddd3ejfd4d5ZdS )9overlap_chunkz
    Author: Speech Lab of DAMO Academy, Alibaba Group
    San-m: Memory equipped self-attention for end-to-end speech recognition
    https://arxiv.org/abs/2006.01713

       
   r      r   
chunk_sizestridepad_leftencoder_att_look_back_factor
shfit_fsmndecoder_att_look_back_factorc                 C   s   |  ||}|  ||}|  ||}|||||f\| _| _| _| _| _|| _d | _d | _d | _	d | _
d | _d | _d | _d | _d\| _| _| _| _| _d S )N)NNNNN)check_chunk_size_argsr   r   r   r   r   r   
x_add_mask	x_rm_maskx_lenmask_shfit_chunkmask_chunk_predictormask_att_chunk_encodermask_shift_att_chunk_decoder
chunk_outschunk_size_cur
stride_curpad_left_cur encoder_att_look_back_factor_curchunk_size_pad_shift_cur)selfr   r   r   r   r   r    r!   T/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/scama/chunk_utilis.py__init__   sF   

zoverlap_chunk.__init__c                    s&   t  t |k r fdd|D   S )Nc                    s   g | ]} d  qS r	   r!   ).0ixr!   r"   
<listcomp>C   s    z7overlap_chunk.check_chunk_size_args.<locals>.<listcomp>)len)r    r   r'   r!   r&   r"   r   A   s   z#overlap_chunk.check_chunk_size_argsindc                 C   s|   | j | | j| | j| | j| | j| f\}}}}}|||||| j |f\| _| _| _| _	| _
| _| j| j| j| j	| j
fS N)r   r   r   r   r   r   r   r   r   r   r    decoder_att_look_back_factor_cur)r    r*   r   r   r   r   r   r!   r!   r"   get_chunk_sizeF   s4   
zoverlap_chunk.get_chunk_sizeTNc                 C   sH   t | j}d}|r|dkrtd|d  }|s"|d ur"t|}|S )Nr   r   r!   )r)   r   torchrandintcpuitemint)r    trainingdecoding_ind	chunk_numr*   r!   r!   r"   random_choicef   s   
zoverlap_chunk.random_choicer   c           >   	   C   s  t   |  }| }| |\}}}}	}
| j}|| | }t|| 	tj
}|d |
 | | d | |d |  }|	|j}| }tt|| }tj
}t||| }tjd|g|d}tj|dg|d}tjd|g|d}tjd|g|d}tjddg|d}tjd||
 g|d}t|D ]}tj||f|d}ttj|tjd}tj||| f|d}tj||f|d}tj|||gdd}|d |d |f }tj||gdd}tj||gdd}tj||f|d}tj||f|d} tj||f|d}!ttj||d}tj|| |f|d}"tj||f|d}#tj|"||#gdd}$|$d |d |f }$tj|| |$|!gdd}%tj||%gdd}tj||g|d}&tj||g|d}'tj|&|'gdd}(tj||(gdd}tj|| |g|d})tj||g|d}*tj|| | |g|d}+tj|*|+gdd},tj|)|,gdd}-tj||-gdd}tj|||
 g|d}.t||	 d}/tj||/|
 g|d}0t||/ d}1tj||g|d}2tj||g|d}3tj|| |g|d}4tj||| g|d}5tj|3|4gdd}*tj|2|*|5gdd}*t|*d|1g}*tj||g|d}6tj||g|d}7tj|6|7gdd}8t|d | d}9tj||9|
 g|d}:tj|0|*|8|:gdd};tj|.|;gdd}<tj||<gdd}t|dg})t|dg}'tj|)|'gdd}=tj||=gdd}q|d |d || f | _|| _|d |d |f | _|| _|d |d d f | _|d |d d f | _|d |d |f | _|d |d d f | _| j| j| j| j| j| j| j| jf| _W d    | jS 1 sw   Y  | jS )Nr   r   dtype)axis)r.   no_gradr0   numpymaxr-   r   npceilastypeint32r8   r2   mathzerosrangediagonesfloat32concatenatetiler   x_len_chunkr   r   r   r   r   r   r   )>r    r   r*   	num_unitsnum_units_predictor	x_len_maxr   r   r   r   chunk_size_pad_shiftr   	pad_rightchunk_num_batchrI   x_len_chunk_maxr5   r8   max_len_for_x_mask_tmpr   r   r   r   r   r   	chunk_idsfsmn_padding
x_mask_curx_mask_pad_leftx_mask_pad_right	x_cur_padx_add_mask_fsmnpadding_mask_leftpadding_mask_rightx_mask_cur_pad_topx_mask_cur_pad_bottomx_rm_mask_curx_rm_mask_cur_fsmnpad_shfit_maskones_1mask_shfit_chunk_curzeros_1ones_2zeros_3
ones_zerosmask_chunk_predictor_curzeros_1_topzeros_2_numzeros_2encoder_att_look_back_numzeros_2_left
ones_2_midzeros_2_bottomzeros_2_rightzeros_3_leftones_3_rightones_3zeros_remain_numzeros_remainones2_bottommask_att_chunk_encoder_cur mask_shift_att_chunk_decoder_curr!   r!   r"   gen_chunk_maskp   s   



   zoverlap_chunk.gen_chunk_maskc                 C   s   |ddd|  ddf }| \}}}t||d |j}||dddddf 9 }| j||j|jd}| j||j|jd}	dd| jdf}
t	
||
dd}| \}}}t|dd}t||dg}t||}t|d||gdd}||	fS )	zk
        :param x: (b, t, d)
        :param x_length: (b)
        :param ind: int
        :return:
        Nmaxlenr7   r   constantg        r   )r<   sizer   todeviceget_x_add_maskr8   get_x_len_chunkr   Fpadr.   	transposereshapemm)r    r'   r   r   btd
x_len_maskr   rI   r   x_chunkr!   r!   r"   split_chunk   s   zoverlap_chunk.split_chunkc                 C   s   |d d d |  d d f }| \}}}t||d |j}||d d d d d f 9 }| j||j|jd}| j||j|jd}	t	|dd}t
||dg}t||}
t
|
d||g	dd}
|
|	fS )Nrx   r7   r   r   r{   )r<   r|   r   r}   r~   get_x_rm_maskr8   	get_x_lenr.   r   r   r   )r    r   rI   r   r   r   r   x_len_chunk_maskr   r   r'   r!   r!   r"   remove_chunk  s   zoverlap_chunk.remove_chunkr0   c                 C   ^   t  ! |d ur|| n| j| }t |||}W d    |S 1 s(w   Y  |S r+   r.   r:   r   
from_numpytyper}   r    r   r~   idxr8   r'   r!   r!   r"   r         

zoverlap_chunk.get_x_add_maskc                 C   r   r+   r   r   r!   r!   r"   r   &  r   zoverlap_chunk.get_x_len_chunk   c                 C   r   r+   r   r   r!   r!   r"   r   ,  r   zoverlap_chunk.get_x_rm_mask   c                 C   r   r+   r   r   r!   r!   r"   r   2  r   zoverlap_chunk.get_x_len   c                 C      t  3 |d ur|| n| j| }t|d d d d d f |d|g}t |||}W d    |S 1 s:w   Y  |S Nr   r.   r:   r   r=   rH   r   r   r}   r    r   r~   
batch_sizerJ   r   r8   r'   r!   r!   r"   get_mask_shfit_chunk8      

z"overlap_chunk.get_mask_shfit_chunk   c                 C   r   r   r   r   r!   r!   r"   get_mask_chunk_predictorH  r   z&overlap_chunk.get_mask_chunk_predictor   c                 C   s   t  3 |d ur|| n| j| }t|d d d d d f |ddg}t |||}W d    |S 1 s:w   Y  |S r   r   r    r   r~   r   r   r8   r'   r!   r!   r"   get_mask_att_chunk_encoderX  r   z(overlap_chunk.get_mask_att_chunk_encoder   c                 C   s   t  2 |d ur|| n| j| }t|d d d d df |ddg}t |||}W d    |S 1 s9w   Y  |S )Nr   r   r   r   r!   r!   r"    get_mask_shift_att_chunk_decoderh  s   
"
z.overlap_chunk.get_mask_shift_att_chunk_decoder)r   r   r	   r
   r   r
   )TN)r   r   r   )__name__
__module____qualname____doc__tupler2   r#   r   r-   r6   rw   r   r   r.   rF   r   r   r   r   r   r   r   r   r!   r!   r!   r"   r   
   sP    	
/
 

 



r   r   r   chunkTpredictor_alignmentsencoder_sequence_lengthr   encoder_chunk_sizeattention_chunk_center_biasattention_chunk_sizeattention_chunk_typepredictor_mask_chunk_hoppingr   r   target_lengthr8   c           !   
      sr  t   | j|  \}}|  }| j}|s$| jdd|j}| }t j	| dd}|d d d d d f 
d|d}t j||g|d}t j	|dd}|d d d d d f 
dd|}t t |||}|dk}t j||ddd }t j|d|d}||9 }||7 }t j|d d|d}|}t | d  }t|  d\}}|d ur|| }t |d}t||d}d| }||	d  }|| }t |d}t||d}d| }t||dd ur2 \ d d d d d df 
dd|d ur|  f	d	d
}|kr0| n|d ur;|9 t||jd}d d d |d d f |d d d d d f  t||jd} d d d d d |f | d d d d d f  |dkrt |
d ur|
 d d d |d |f |W d    S 1 sw   Y  S )Nr{   dimr   r7   r   )minr<   )NN)ry   r8   r~   c                     s   d  d f } t j g| jd}t j| |gdd}  \}}}| }t jjd|gdd}| | }| }|S )Nr7   r   r   r   rz   )	r.   rB   r8   r}   catr|   nn
functionalr   )mask_slicedzero_pad_right_ttpad_right_p predictor_mask_chunk_hopping_padmasked	mask_true	r   r~   r   kmaskmask_mask_flipmax_len_chunkr   r   r!   r"   _fn  s   z9build_scama_mask_for_cross_attention_decoder.<locals>._fnfull)r.   r:   r~   r|   r<   r1   r8   sumr   cumsumrepeatrE   r}   floordividecliprA   r>   r   	ones_like)!r   r   r   r   r   r   r   stepr   r   r   r   is_trainingr8   r   r5   maximum_encoder_lengthint_typemaximum_target_lengthpredictor_alignments_cumsumindex	index_divindex_div_bool_zerosindex_div_bool_zeros_countindex_div_bool_zeros_count_ori	mask_flip
mask_flip2index_div_bool_zeros_count_beg#index_div_bool_zeros_count_beg_maskattention_chunk_size2r   mask_targetmask_lenr!   r   r"   ,build_scama_mask_for_cross_attention_decoderr  s    




0
0

(
kkr   )rA   r.   r;   r=   torch.nn.functionalr   r   r   funasr.models.scama.utilsr   *funasr.models.transformer.utils.nets_utilsr   r   rF   Tensorr2   strr8   r   r!   r!   r!   r"   <module>   s\      m	
