o
    if@                     @   s  d dl Z d dlZd dlZd dlm  mZ d dlm	Z	 d dl
mZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ zd dlZW n   Y eejedkrnd dlm Z  ne	dddZ e!ddG dd dejj"Z#dS )    N)contextmanager)LooseVersion)AnyListTupleOptional)tables)	to_device)force_gatherable)load_audio_text_image_video)make_pad_mask)split_to_mini_sentencesplit_wordsz1.6.0)autocastTc                 c   s    d V  d S )N )enabledr   r   V/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/ct_transformer/model.pyr      s   
r   model_classesCTTransformerc                       s  e Zd ZdZ													d7d
edededededededededededef fddZ	de
jde
jfddZdd Zde
jded e
jd!ee
jef fd"d#Zd$e
jd%ee d&e
jd!ee
jee f fd'd(Z			d8de
jd)e
jde
jd*e
jd+ee d,ee
j d-ee
j d!ee
je
jf fd.d/Z		d9de
jd)e
jde
jd*e
jd,ee
j d-ee
j fd0d1Z				d:d2efd3d4Zd5d6 Z  ZS );r   z
    Author: Speech Lab of DAMO Academy, Alibaba Group
    CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection
    https://arxiv.org/pdf/2003.01309.pdf
    N            ?         encoderencoder_conf
vocab_size	punc_listpunc_weight
embed_unitatt_unitdropout_rate	ignore_idsoseossentence_end_idc                    s   t    t|}|d u rdg| }tj||| _tj	|}|di |}tj
||| _|| _|| _|| _|	| _|
| _|| _|| _d | _|	dd d ur[t|d  t| _d S d S )Nr   jieba_usr_dictr   )super__init__lentorchnn	Embeddingembedr   encoder_classesgetLineardecoderr   r   r    r$   r%   r&   r'   r(   jiebaload_userdict)selfr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   kwargs	punc_sizeencoder_class	__class__r   r   r*   *   s(   


zCTTransformer.__init__texttext_lengthsc                 K   s.   |  |}| ||\}}}| |}|dfS )zCompute loss value from buffer sequences.

        Args:
            input (torch.Tensor): Input ids. (batch, len)
            hidden (torch.Tensor): Target ids. (batch, len)

        N)r/   r   r3   )r6   r<   r=   r7   xh_yr   r   r   punc_forwardQ   s   

zCTTransformer.punc_forwardc                 C   s   dS )NFr   )r6   r   r   r   with_vad_   s   zCTTransformer.with_vadrA   stater>   returnc                 C   s^   | d}| jj| || ||d\}}}| |dddf }|jddd}||fS )a  Score new token.

        Args:
            y (torch.Tensor): 1D torch.int64 prefix tokens.
            state: Scorer state for prefix tokens
            x (torch.Tensor): encoder feature that generates ys.

        Returns:
            tuple[torch.Tensor, Any]: Tuple of
                torch.float32 scores for next token (vocab_size)
                and next state for ys

        r   cacheNr   dim)	unsqueezer   forward_one_stepr/   _target_maskr3   log_softmaxsqueeze)r6   rA   rD   r>   r?   r@   rG   logpr   r   r   scoreb   s   
zCTTransformer.scoreysstatesxsc           	         s   t | t | jjd du rd}n fddtD }| jj| || ||d\}}| |dddf }|jdd}fddt D }||fS )	a  Score new token batch.

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, List[Any]]: Tuple of
                batchfied scores for next token with shape of `(n_batch, vocab_size)`
                and next state list for ys.

        r   Nc                    s*   g | ] t  fd dtD qS )c                       g | ]}|   qS r   r   ).0b)irR   r   r   
<listcomp>       8CTTransformer.batch_score.<locals>.<listcomp>.<listcomp>)r,   stackrangerU   )n_batchrR   )rW   r   rX      s    z-CTTransformer.batch_score.<locals>.<listcomp>rF   r   rH   c                    s$   g | ]  fd dt D qS )c                    rT   r   r   )rU   rW   )rV   rR   r   r   rX      rY   rZ   )r\   r]   )n_layersrR   )rV   r   rX      s   $ )	r+   r   encodersr\   rK   r/   rL   r3   rM   )	r6   rQ   rR   rS   batch_stater?   r@   rO   
state_listr   )r^   r_   rR   r   batch_scorex   s   zCTTransformer.batch_scorepuncpunc_lengths
max_lengthvad_indexesvad_indexes_lengthsc                 C   s  | d}|du r"|ddd| f }|ddd| f }n|ddd|f }|ddd|f }|  rJ|dus@J | |||\}	}
n| ||\}	}
| jdkr|	d|	jd jddd\}
}ddlm	} ||d
   |d
   dd	}t|g| }||fS | j|j| _tj|	d|	jd |d| jd
| jd}|du r|t||jdd n|t||d d|jdd ||d}||fS )zCompute negative log likelihood(nll)

        Normally, this function is called in batchify_nll.
        Args:
            text: (Batch, Length)
            punc: (Batch, Length)
            text_lengths: (Batch,)
            max_lengths: int
        r   NFr   r   rH   )f1_scoremicro)averagenone)	reductionignore_indexg        )maxlen)sizemaxrC   rB   trainingviewshapetopksklearn.metricsri   detachcpunumpyrN   r,   Tensorrepeatsumr    todeviceFcross_entropyr$   masked_fill_r   )r6   r<   rd   r=   re   rf   rg   rh   
batch_sizerA   r@   indicesri   nllr   r   r   r      sH   

  zCTTransformer.nllc                 C   s\   | j |||||d\}}| }	| |	 }
t|
 d}t|
||	f|
j\}
}}|
||fS )N)rg   )loss)r   r|   dictrw   r
   r~   )r6   r<   rd   r=   re   rg   rh   r   	y_lengthsntokensr   statsweightr   r   r   forward   s   	
zCTTransformer.forwardkeyc           ,      K   sp  t |dksJ t||dddd }|dd }|dd}	t|| jd	}
||
}t|
|	}t||	}t |t |ks@J g }tt	j
g d
d}d}g }d}g }i }d }tt |D ]k}|| }|| }|| }t	j||fdd}tt|dtt	j
t |gd
dd}t||d }| jd"i |\}}|d|jd jddd\}}tj|dd}| d t |ksJ |t |d k r8d}d}tt |d ddD ]'} | j||   dks| j||   dkr| } n|dk r| j||   dkr| }q|dk rt ||kr|dkr|}| j||< ||d d  }||d d  }|d|d  }|d|d  }|  }!|dd |!D 7 }g }"tt |D ]} | dksn| j|| d   dksn| j|| d   dkrt ||  d  dkr||   || < | dkrt ||  d  dkrd||   || < | dkrt ||  d  dkrt || d  d  dkrd||   || < |"||   | j||   dkr| j||   }#t ||  d  dkr|#dkrd}#n|#dkrd}#n|#dkrd}#|"|# qO|d|"7 }|}$|}%|t |d kr|d dks+|d dkr>|d d d }$|d d | jg }%n{|d dkrX|d d d }$|d d | jg }%na|d dkr|d dkrt |d  dkr|d }$|d d | jg }%t |rd|d< n0|d dkr|d dkrt |d  dkr|d }$|d d | jg }%t |rd|d< |d u r|}q^tj||gdd}q^| jd ur'|d}t |
}&t| }'t |
d d d D ]7\} }(d|(d   krd kr n qt |(dkr t |(d })|&|  d }*t|)D ]	}|'!|*d qqt"|'}|d |$|d!}+||+ ||fS )#Nr   r7   r<   )	data_typer   rg   
split_size   )r(   int32)dtype    )axis)r<   r=   r~   r   rH   r   u   。u   ？u   ，c                 S   s   g | ]}t |qS r   )int)rU   r>   r   r   r   rX   D  s    z+CTTransformer.inference.<locals>.<listcomp> r@   ,.?u   、u   ฀u   龥)r   r<   
punc_arrayr   )#r+   r   r1   r   r(   encoder   r,   
from_numpynparrayr\   concatenaterJ   r	   rB   rs   rt   ru   rN   rp   r   r'   rx   ry   
capitalizeappendjoincatreshapecopytolist	enumerateinserttensor),r6   data_indata_lengthsr   	tokenizerfrontendr7   r<   rg   r   tokens
tokens_intmini_sentencesmini_sentences_id
cache_sentcache_sent_idnew_mini_sentencenew_mini_sentence_punccache_pop_trigger_limitresults	meta_datar   mini_sentence_imini_sentencemini_sentence_iddatarA   r@   r   punctuationssentenceEndlast_comma_indexrW   punctuations_npwords_with_puncpunc_resnew_mini_sentence_outnew_mini_sentence_punc_out
len_tokensnew_punc_arraytoken
num_append
ind_appendresult_ir   r   r   	inference   s  	


 

















"

zCTTransformer.inferencec                 K   s"   ddl m} |dd| i|}|S )Nr   )export_rebuild_modelmodelr   )export_metar   )r6   r7   r   modelsr   r   r   export  s   zCTTransformer.export)NNr   NNr   r   r   r   r   r   r   )NNN)NN)NNNN)__name__
__module____qualname____doc__strr   r   listfloatr*   r,   rz   rB   rC   r   r   rP   r   rc   r   r   r   r   r   __classcell__r   r   r:   r   r   "   s    	
'(
-	
L

 ))T)$r   r,   ry   r   torch.nn.functionalr-   
functionalr   
contextlibr   distutils.versionr   typingr   r   r   r   funasr.registerr   funasr.train_utils.device_funcsr	   r
   funasr.utils.load_utilsr   *funasr.models.transformer.utils.nets_utilsr   "funasr.models.ct_transformer.utilsr   r   r4   __version__torch.cuda.ampr   registerModuler   r   r   r   r   <module>   s.   
