o
    }oib                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZmZmZmZmZmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z,m-Z-m.Z. d dl/m0Z0 G dd dee%Z1	d dejdejde2de2de3ej deej de4ejeej ejf fddZ5dS )!    N)defaultdict)repeat)AnyOptional)collate_vectors)LightningModule)
DictConfig)	PeftModel)Tensor)fully_shard)	ReplicateShard)ColwiseParallelPrepareModuleInputRowwiseParallelSequenceParallelloss_parallelparallelize_module)GenerationConfig)PromptFormatter)AutoTokenizer)
HFHubMixin)maybe_install_lora)configure_optimizers	is_frozen)load_pretrained_hfmove_embeddingsetup_speech_encoder)loggingc                       s`  e Zd Zd3 fddZedd ZedefddZedefd	d
ZedefddZ	edefddZ
		d4dededeeef fddZdefddZdedefddZd3ddZd3ddZdedefddZd3dd Zd3d!d"Zd#ed$efd%d&Z fd'd(Ze 			d5d)eeee   d*ejd+ejd,edejf
d-d.Zd/d0 Zd3d1d2Z  Z S )6SALMreturnNc                    s   t |tsJ dt|dt   |   t|| _| jj| _t	| jj
dd| _| jd| jgi t| jj
| jjd| _| jjj| _| jj`t|  t|  d| _d| _d S )NzYou must pass the config to SALM as a Python dict to support hyperparameter serialization in PTL checkpoints (we got: 'type(cfg)=z').T)use_fastadditional_special_tokens)pretrained_weightsF)
isinstancedicttypesuper__init__save_hyperparametersr   cfgaudio_locator_tagr   pretrained_llm	tokenizeradd_special_tokensr   r#   llmmodelembed_tokensr   r   	_use_fsdp_use_tp)selfr*   	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/speechlm2/models/salm.pyr(   /   s$   



zSALM.__init__c                 C      | j jS )z&Return the size of the text tokenizer.)r1   num_embeddingsr4   r7   r7   r8   text_vocab_sizeH   s   zSALM.text_vocab_sizec                 C   r9   N)r-   bos_idr;   r7   r7   r8   text_bos_idM      zSALM.text_bos_idc                 C   r9   r=   )r-   eos_idr;   r7   r7   r8   text_eos_idQ   r@   zSALM.text_eos_idc                 C   s2   | j j}|d u r| j j}|d u rtd d}|S )Nzqthe text tokenizer has no <pad> or <unk> tokens available, using id 0 for padding (this may lead to silent bugs).r   )r-   padunk_idwarningswarn)r4   pad_idr7   r7   r8   text_pad_idU   s   zSALM.text_pad_idc                 C   s   | j | jS r=   )r-   token_to_idr+   r;   r7   r7   r8   audio_locator_tag_ida   s   zSALM.audio_locator_tag_idinput_embedsattention_maskc                 C   s<   | j ||||dudd}d|d i}|dur|d |d< |S )z
        Implements a fully offline forward pass through the entire model.
        The flow is the following:

        |speech and text embeddings| -> |llm| -> |lm_head| -> |token ids|

        NT)inputs_embedsrL   past_key_values	use_cachereturn_dictlogitsrN   cache)r/   )r4   rK   rL   rR   outansr7   r7   r8   forwarde   s   zSALM.forwardbatchc              
   C   sB  | j |d |d d\}}dd t||D }t|d | jkd|d }| |}t|d || j| j||d |d d	d
\}}}|ddddf }|ddddf }|ddddf }| jr| j	d 
 }	|jd d |	  }
dkr|ddd|
 f }|ddd|
 f }|ddd|
 f }|||||dS )a  
        Performs additional processing on the mini-batch collected from dataloader.
        Notably:
        * Convert source audio to speech representations.
        * Convert target audio to target audio tokens.
        * Convert target text to embeddings.
        * Combine the input audio and target text embeddings.
        * Take care of any necessary slicing to align the shapes of source audio,
            target audio, and target token ids.
        audios
audio_lens)input_signalinput_signal_lengthc                 S   s   g | ]
\}}|d | qS r=   r7   ).0embemblenr7   r7   r8   
<listcomp>       z'SALM.prepare_inputs.<locals>.<listcomp>	input_idsr   	loss_maskr`   embeds
padding_idplaceholder_idreplacements
target_idsN   tensor_parallel)audio_embedstext_embedsrK   rL   rh   )
perceptionziptorchwhererJ   r1   &replace_placeholders_and_build_targetsrH   r3   device_meshsizeshape)r4   rV   
audio_embsaudio_emb_lensinput_ids_to_embed	text_embs
input_embsrh   rL   tp_world_size	remainderr7   r7   r8   prepare_inputs   s:   

zSALM.prepare_inputs	batch_idxc              	   C   sD  | j j| j j| jfD ]
}t|r|  q
| |}| |d |d d}|d dk  }t	 ! t
jjj|d dd|d ddd	dd
| }W d    n1 sUw   Y  |d jd d \}}	|t
| jd urx| jjd jd d nd||	|t
j|||	  |d | jk  |d   d}
| j|
dd |
S )NrK   rL   rL   rh   rb   rQ   r   rj   sum	reductionignore_index   lrr`   )losslearning_rate
batch_sizesequence_length
num_framestarget_to_input_ratiopadding_ratioT)on_step)rn   preprocessorencoderr/   r   evalr}   longr   r   rp   nn
functionalcross_entropyflattenru   	as_tensor_trainertrainer
optimizersparam_groupstofloat32rH   numellog_dict)r4   rV   r~   minputsforward_outputsr   r   BTrT   r7   r7   r8   training_step   s:   
(

 zSALM.training_stepc                 C   s   t t| _t t| _d S r=   )r   list_partial_val_losses_partial_accuraciesr;   r7   r7   r8   on_validation_epoch_start   s   
zSALM.on_validation_epoch_startc                 C   s   g }| j  D ]\}}t| }| jd| |ddd || q| jdt| ddd g }| j D ]\}}t| }| jd| |ddd || q9| jdt| ddd | j   | j  d S )N	val_loss_T)on_epoch	sync_distval_lossval_acc_val_acc)	r   itemsrp   stackmeanlogappendr   clear)r4   
val_lossesnamevalsr   
accuraciesaccsr   r7   r7   r8   on_validation_epoch_end   s   
zSALM.on_validation_epoch_endc              	   C   s  |  D ]\}}|d u rq| |}| |d |d d}|d dk  }t ! tjjj|d 	dd|d 	ddd	dd
| }W d    n1 sMw   Y  |d j
ddd}	|d d}
|	|
dk }	|
|
dk }
|	|
  }| j| | | j| | qd S )NrK   rL   r   rh   rb   rQ   r   rj   r   r   ri   dim)r   r}   r   r   r   rp   r   r   r   r   argmaxvieweqfloatr   r   r   r   )r4   rV   r~   r   dataset_batchr   r   r   r   predsrefsaccuracyr7   r7   r8   validation_step   s2   
zSALM.validation_stepc                 C      |   S r=   )r   r;   r7   r7   r8   on_test_epoch_start     zSALM.on_test_epoch_startc                 C   r   r=   )r   r;   r7   r7   r8   on_test_epoch_end
  r   zSALM.on_test_epoch_endargskwargsc                 O   s   | j |i |S r=   )r   r4   r   r   r7   r7   r8   	test_step  s   zSALM.test_stepc                    s>   t   t j|i | W d    d S 1 sw   Y  d S r=   )r   r'   backwardr   r5   r7   r8   r     s   "zSALM.backwardpromptsrW   rX   generation_configc                    s   t jjj t fdd|D jdj}|durO|	|j
kd}|}fddt||D }t||jj
|dd\}	}
}|	|d}n
|jk}||d	}t jjdi |d
|i}W d   |S 1 svw   Y  |S )a  
        Generate LLM answers given text or mixed text+audio prompts.

        Inputs:
            prompts: batch of prompts as list[dict] each in the following format
                [
                  # batch example id 0
                  [{"role": "user"}, "slots": {"message": "Repeat after me, translating to Polish.<audio_locator_tag>"}]
                  # batch example id 1
                  [{"role": "user"}, "slots": {"message": "Repeat after me.<audio_locator_tag>"}]
                ]
                "role" is LLM-specific, you can pass multiple turns as well.
            audios: Optional. Time-domain audio signal zero-padded batch of shape (B, T).
                The number of audios must correspond to the number of occurrences of <audio_locator_tag> in prompts.
                Each prompt can have multiple audios.
            audio_lens: Optional. Length of each audio example.
            generation_config: Optional HuggingFace GenerationConfig object.
        c                    s   g | ]
} j |d d qS ))turnsr`   )encode_dialog)r[   prompt)	formatterr7   r8   r^   1  r_   z!SALM.generate.<locals>.<listcomp>)padding_valueNr   c                    s0   g | ]\}}  |d |d d  d  qS )r   )rn   	unsqueeze)r[   aalr;   r7   r8   r^   <  s    $rc   )rM   rL   )r`   rL   r   r7   )r   resolver*   prompt_formatr-   r   rH   r   devicerq   rJ   r1   ro   rr   r   r/   generate)r4   r   rW   rX   r   tokenstokens_to_embedtoken_embedsrl   rK   _rL   generation_inputsanswer_tokensr7   )r   r4   r8   r     sB   







zSALM.generatec                 C   s   t | S r=   )r   r;   r7   r7   r8   r   V  r   zSALM.configure_optimizersc                 C   s  | j }|d u r	d S | j}t|tr|jj}|d  } dkrd| _tt	 ft
dfddt d}t||| |jjD ]_}t t t t tt
ddt tt
dft	 fdt t tt
ddd
}|j}d	D ]*}t||}||  d
krtd| d| d| d t||||   qjt||| q<t|j|tt
dt
ddd |d  }	 dkr|	jdksJ d| _d|	i}
t|jjD ]\}}t|fi |
|jj|< qt| jfi |
| _t|jfi |
|_t| jfi |
| _t| jfi |
| _d S d S )Nrk   rj   T)input_layoutsdesired_input_layoutsuse_local_output)zlayers.0norm)output_layouts)r   r   )
input_layernormzself_attn.q_projzself_attn.k_projzself_attn.v_projzself_attn.o_projpost_attention_layernormmlpzmlp.gate_projzmlp.up_projzmlp.down_proj)	num_headsnum_key_value_headshidden_sizer   zattn_layer.=z$ is not divisible by tp_mesh.size()=z:: set a different tensor parallelism size to avoid errors.ri   F)r   r   r   data_parallelmesh)rs   r/   r$   r	   
base_modelr0   rt   r3   r   r   r   r   r   layersr   r   	self_attngetattrr   warningsetattrlm_headndimr2   	enumerater   r1   rn   )r4   rs   r/   tp_meshplantransformer_block
attn_layerattrvaldp_meshfsdp_configidxlayerr7   r7   r8   configure_modelY  sx   

zSALM.configure_model)r    N)NN)NNN)!__name__
__module____qualname__r(   propertyr<   intr?   rB   rH   rJ   r
   r%   strrU   r}   r   r   r   r   r   r   r   r   r   rp   no_gradr   r   r   r   r  __classcell__r7   r7   r5   r8   r   .   s^    


5
"


Ar   r`   rd   re   rf   rg   rh   r    c               
   C   s  |   \}}|dur|  |   ksJ d| d}|j|j}	}
d}g }g }g }d}t|D ]
}| | |kjddd }t|dkrm|||  |durc||  }||| | |k< || || | |k q0g }g }g }d}|D ]i}||kr|||||f  |dur||||f  }||||k< || || |||f |k || }|| |tj	| df|tj
|	d |tj| dftj|	d |d	7 }|d	 }qw||k r|||||f  |dur||||f  }||||k< || || |||f |k |tj|dd
 |tj|dd
 |dur;|tj|dd
 q0|t|krOtdt| d| tdd |D }tj||||	|
d}|durttj	||f|tj
|	d}nd}tj||ftj|	d}|du rtd}tt|||D ]*\}\}}}| d}|||d|f< |dur|||d|f< |||d|f< q|||fS )aE  Replaces each occurrence of the placeholder_id in input_ids with the corresponding tensor
    from the replacements list in the embeds tensor, and creates corresponding adjusted target_ids.

    Args:
      input_ids (Tensor): shape (batch, sequence_length); input token ids.
      embeds (Tensor): shape (batch, sequence_length, hidden_dim); embeddings for each token.
      padding_id (int): these IDs will be marked as ignore_index in target_ids.
      placeholder_id (int): an id to be replaced.
      replacements (list of Tensor): each Tensor has shape (L_i, hidden_dim), with L_i arbitrary.
      target_ids (Tensor): shape (batch, sequence_length); target token ids.

    Returns:
      Tuple[Tensor, Tensor, Tensor]:
        - Tensor of shape (batch, max_new_sequence_length, hidden_dim) corresponding to
          ``embeds`` after replacements.
        - Tensor of shape (batch, max_new_sequence_length) with adjusted target IDs where:
          * Original target values are preserved where input was not a placeholder or padding
          * Positions that were placeholders, padding, or added by replacements are set to -100
          Will be None if target_ids input was None.
        - Tensor of shape (batch, max_new_sequence_length) with attention padding masks
          updated to account for shape changes due to replacements.
    Nz0target_ids must have the same shape as input_idsr   rb   r   T)as_tuple)dtyper   rj   r   z	Expected z replacements but used c                 s   s    | ]}| d V  qdS )r   N)rt   )r[   seqr7   r7   r8   	<genexpr>(  s    z9replace_placeholders_and_build_targets.<locals>.<genexpr>)r   r  )rt   r   r  rangenonzerolenr   clonerp   fullr   onesboolcat
ValueErrormaxzerosr   r   ro   ) r`   rd   re   rf   rg   rh   r   seq_len
hidden_dimr   r  r   output_sequencesoutput_target_idsoutput_att_masksreplacement_idxiplaceholder_positionsnew_target_idssegmentstarget_segments	att_masksprev_pospossegment_target_idsrepmax_seq_lengthoutputattention_masksr  tgtattr7   r7   r8   rr     s   



" 









rr   r=   )6rE   collectionsr   	itertoolsr   typingr   r   rp   lhotse.dataset.collationr   	lightningr   	omegaconfr   peftr	   r
   torch.distributed.fsdpr   torch.distributed.tensorr   r   !torch.distributed.tensor.parallelr   r   r   r   r   r   transformersr   nemo.collections.common.promptsr   "nemo.collections.common.tokenizersr   'nemo.collections.speechlm2.parts.hf_hubr   %nemo.collections.speechlm2.parts.lorar   ,nemo.collections.speechlm2.parts.optim_setupr   r   +nemo.collections.speechlm2.parts.pretrainedr   r   r   
nemo.utilsr   r   r	  r   tuplerr   r7   r7   r7   r8   <module>   sR       