o
    }oi(                     @   sL   d dl Z d dlmZ d dl mZ d dlmZ d dlmZ G dd deZdS )    N)
DictConfig)nn)transformer_2501)NeuralModulec                       sf   e Zd Zdedededef fddZdd	d
ZdddZdd Zdd Z	dd Z
edd Z  ZS )TransformerARSpeechDecoderspeech_decoder_parmslantent_dimnum_audio_codebooksnum_audio_tokens_per_codebookc                    s  t    d| _|| _|| _|| _|| _| jdd | _| jdd| _	| jdd| _
| jdd| _|| jd krFt|| jd | _nd | _tjd	i | j| _t| jd || | _| j
rg }t| jD ]}|t|| jd  qit|| _d S d S )
NFcfg_unconditional_prob	cfg_scaleg      @cond_on_prev_audio_tokensTdetach_inputd_model )super__init__use_input_cacher   r   r	   r
   popr   r   r   r   r   Linear
input_projr   Transformer
t5_decoder
final_projrangeappend	Embedding
ModuleListaudio_embeddings)selfr   r   r	   r
   r   _	__class__r   h/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/speechlm2/modules/speech_generation.pyr      s2   
z#TransformerARSpeechDecoder.__init__NFc                 C   s|  | dd }| jrn| jd d u r|| jd< ntj| jd |gdd| jd< | jd }| jd d u r9|| jd< ntj| jd |gdd| jd< | jd }| jd d u rZ|| jd< ntj| jd |gdd| jd< | jd }| jru| }| jd ur| |}n|}|d u rtj	|
d|
df|jtjd}| jr| jrtd | jk rt|}n#t|}tj||gdd}tj||gdd}| jrtj||gdd}| jr| jr| }| | dd }|| }| j||d	d
 }| jr|d d dd d d f }| |}	| jr0| js0|	
dd }
|	d |
 }|	|
d  }d| j | | j|  }	|r5|	S | |	}||	fS )Nr      hidden_statesdimspeech_maskinput_audio_tokens)devicedtype   )xx_maskoutput)	transpose
contiguousr   cachetorchcatr   detachr   onessizer*   boolr   trainingranditem
zeros_liker   embed_audio_tokensr   r   r   #all_logits_to_each_codebooks_logits)r   r%   r(   r)   return_raw_logitsspeech_decoder_inputspeech_decoder_input_zerosaudio_tokens_embeddeddecoder_outall_code_logits
batch_sizecond_logitsuncond_logitsall_codebook_logitsr   r   r#   forwardB   sr   







z"TransformerARSpeechDecoder.forwardffffff?P   c                 C   s   g }t | jD ]K}|| j }|| j }|d d ||f }tj||ddd }	||	d d df dk }
| }td||
< tj|| dd}t	|d}|
| qtj|dd }|S )Nr0   r&   r   z-infr$   )r   r	   r
   r4   topk	unsqueezeclonefloatsoftmaxmultinomialr   r5   long)r   all_code_logits_ttemperaturerM   	all_predsidxsieicodebook_logitscodebook_logits_topkindices_to_removecodebook_logits_rescoredcodebook_probscodebook_predsr   r   r#   sample_codes_from_logits   s    

z3TransformerARSpeechDecoder.sample_codes_from_logitsc                 C   s`   g }t | jD ]&}|| j }|| j }|d d d d ||f }|dd }|| q|S )Nr   r$   )r   r	   r
   r1   r2   r   )r   logitsrI   rW   rX   rY   rZ   r   r   r#   r?      s   

z>TransformerARSpeechDecoder.all_logits_to_each_codebooks_logitsc                 C   s\   d }t | jD ]}| j| |d d |d d f }|d u r |}q|| }q||d }|S )Nr$   )r   r	   r   r8   )r   audio_tokensaudio_embeddingc	embeddingr   r   r#   r>      s    
z-TransformerARSpeechDecoder.embed_audio_tokensc                 C   s.   |rt d || _|  | _| jj|d d S )NzEnabling input and KV cache!)	use_cache)printr   _init_cacher3   r   reset_cache)r   rf   r   r   r#   reset_input_and_kv_cache   s
   
z3TransformerARSpeechDecoder.reset_input_and_kv_cachec                   C   s   d d d dS )N)r%   r(   r)   r   r   r   r   r#   rh      s   z&TransformerARSpeechDecoder._init_cache)NF)rK   rL   )__name__
__module____qualname__r   intr   rJ   r`   r?   r>   rj   staticmethodrh   __classcell__r   r   r!   r#   r      s"    
+
^r   )	r4   	omegaconfr   r   nemo.collections.tts.modulesr   nemo.core.classes.moduler   r   r   r   r   r#   <module>   s   