o
    wi|                     @   s(  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	 d dl
Z
d dlmZ d dlmZ d dlmZ d d	lmZ d d
l
mZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2 d dl3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: G dd dee)Z;	d*de
jde
jde<de<de=e
j de	e
j d e>e
je	e
j e
jf fd!d"Z?de
jde
jde	e
j de<d e>e
je
je	e
j f f
d#d$Z@d%e=e=eA  d&e<d'eBe
jCB d e>e
je
jf dB fd(d)ZDdS )+    N)defaultdict)repeat)Path)AnyOptional)CutSet)LightningModule)
DictConfig)	PeftModel)Tensor)fully_shard)	ReplicateShard)ColwiseParallelPrepareModuleInputRowwiseParallelSequenceParallelloss_parallelparallelize_module)GenerationConfig)PromptFormatter)AutoTokenizer)left_collate_vectors)
HFHubMixin)maybe_install_lora)configure_optimizers	is_frozen)load_pretrained_hfmove_embeddingsetup_speech_encoder)AudioSignal
LabelsTypeLengthsTypeMaskType
NeuralType)loggingc                       s  e Zd Zd9 fddZedd ZedefddZedefd	d
ZedefddZ	edefddZ
edefddZedefddZ		d:dededeeef fddZdefddZdedefddZd9ddZd9dd Zdedefd!d"Zd9d#d$Zd9d%d&Zd'ed(efd)d*Z fd+d,Ze 			d;d-eeee   ejB d.ejd/ejd0edejf
d1d2Z d3d4 Z!d9d5d6Z"edefd7d8Z#  Z$S )<SALMreturnNc                    s   t |tsJ dt|dt   |   t|| _| jj| _t	| jj
dd| _| jd| jgi t| jj
| jjd| _| jjj| _| jj`t|  t| | jjd d| _d| _d S )NzYou must pass the config to SALM as a Python dict to support hyperparameter serialization in PTL checkpoints (we got: 'type(cfg)=z').T)use_fastadditional_special_tokens)pretrained_weightsF)
isinstancedicttypesuper__init__save_hyperparametersr	   cfgaudio_locator_tagr   pretrained_llm	tokenizeradd_special_tokensr   r*   llmmodelembed_tokensr   r   	_use_fsdp_use_tp)selfr1   	__class__ c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/speechlm2/models/salm.pyr/   2   s$   



zSALM.__init__c                 C      | j jS )z&Return the size of the text tokenizer.)r8   num_embeddingsr;   r>   r>   r?   text_vocab_sizeK   s   zSALM.text_vocab_sizec                 C   r@   N)r4   bos_idrB   r>   r>   r?   text_bos_idP      zSALM.text_bos_idc                 C   r@   rD   )r4   eos_idrB   r>   r>   r?   text_eos_idT   rG   zSALM.text_eos_idc                 C   s2   | j j}|d u r| j j}|d u rtd d}|S )Nzqthe text tokenizer has no <pad> or <unk> tokens available, using id 0 for padding (this may lead to silent bugs).r   )r4   padunk_idwarningswarn)r;   pad_idr>   r>   r?   text_pad_idX   s   zSALM.text_pad_idc                 C   s   | j | jS rD   )r4   token_to_idr2   rB   r>   r>   r?   audio_locator_tag_idd   s   zSALM.audio_locator_tag_idc                 C   r@   )zx
        Returns the audio duration corresponding to a single frame/token at the output of ``self.perception``.
        )
perceptiontoken_equivalent_durationrB   r>   r>   r?   rS   h   s   zSALM.token_equivalent_durationc                 C   s   | j jjjS rD   )rR   preprocessor
featurizersample_raterB   r>   r>   r?   sampling_rateo   s   zSALM.sampling_rateinput_embedsattention_maskc                 C   s<   | j ||||dudd}d|d i}|dur|d |d< |S )z
        Implements a fully offline forward pass through the entire model.
        The flow is the following:

        |speech and text embeddings| -> |llm| -> |lm_head| -> |token ids|

        NT)inputs_embedsrY   past_key_values	use_cachereturn_dictlogitsr[   cache)r6   )r;   rX   rY   r_   outansr>   r>   r?   forwards   s   zSALM.forwardbatchc              
   C   s>  | j |d |d d\}}dd t||D }t|d | jkd|d }| |}t|d || j| j||d |d d	d
\}}}|ddddf }|ddddf }|ddddf }| jr| j	d 
 }	|jd d |	  }
dkr|ddd|
 f }|ddd|
 f }|ddd|
 f }|||dS )a  
        Performs additional processing on the mini-batch collected from dataloader.
        Notably:
        * Convert source audio to speech representations.
        * Convert target audio to target audio tokens.
        * Convert target text to embeddings.
        * Combine the input audio and target text embeddings.
        * Take care of any necessary slicing to align the shapes of source audio,
            target audio, and target token ids.
        audios
audio_lens)input_signalinput_signal_lengthc                 S   s   g | ]
\}}|d | qS rD   r>   ).0embemblenr>   r>   r?   
<listcomp>       z'SALM.prepare_inputs.<locals>.<listcomp>	input_idsr   	loss_maskrm   embeds
padding_idplaceholder_idreplacements
target_idsN   tensor_parallel)rX   rY   ru   )rR   ziptorchwhererQ   r8   &replace_placeholders_and_build_targetsrO   r:   device_meshsizeshape)r;   rc   
audio_embsaudio_emb_lensinput_ids_to_embed	text_embs
input_embsru   rY   tp_world_size	remainderr>   r>   r?   prepare_inputs   s6   

zSALM.prepare_inputs	batch_idxc              	   C   sD  | j j| j j| jfD ]
}t|r|  q
| |}| |d |d d}|d dk  }t	 ! t
jjj|d dd|d ddd	dd
| }W d    n1 sUw   Y  |d jd d \}}	|t
| jd urx| jjd jd d nd||	|t
j|||	  |d | jk  |d   d}
| j|
dd |
S )NrX   rY   rY   ru   ro   r^   r   rw   sum	reductionignore_index   lrrm   )losslearning_rate
batch_sizesequence_length
num_framestarget_to_input_ratiopadding_ratioT)on_step)rR   rT   encoderr6   r   evalr   longr   r   rz   nn
functionalcross_entropyflattenr   	as_tensor_trainertrainer
optimizersparam_groupstofloat32rO   numellog_dict)r;   rc   r   minputsforward_outputsr   r   BTra   r>   r>   r?   training_step   s:   
(

 zSALM.training_stepc                 C   s   t t| _t t| _d S rD   )r   list_partial_val_losses_partial_accuraciesrB   r>   r>   r?   on_validation_epoch_start   s   
zSALM.on_validation_epoch_startc                 C   s   g }| j  D ]\}}t| }| jd| |ddd || q| jdt| ddd g }| j D ]\}}t| }| jd| |ddd || q9| jdt| ddd | j   | j  d S )N	val_loss_T)on_epoch	sync_distval_lossval_acc_val_acc)	r   itemsrz   stackmeanlogappendr   clear)r;   
val_lossesnamevalsr   
accuraciesaccsr   r>   r>   r?   on_validation_epoch_end   s   
zSALM.on_validation_epoch_endc              	   C   s  |  D ]\}}|d u rq| |}| |d |d d}|d dk  }t ! tjjj|d 	dd|d 	ddd	dd
| }W d    n1 sMw   Y  |d j
ddd}	|d d}
|	|
dk }	|
|
dk }
|	|
  }| j| | | j| | qd S )NrX   rY   r   ru   ro   r^   r   rw   r   r   rv   dim)r   r   r   r   r   rz   r   r   r   r   argmaxviewreshapeeqfloatr   r   r   r   )r;   rc   r   r   dataset_batchr   r   r   r   predsrefsaccuracyr>   r>   r?   validation_step   s2   
zSALM.validation_stepc                 C      |   S rD   )r   rB   r>   r>   r?   on_test_epoch_start     zSALM.on_test_epoch_startc                 C   r   rD   )r   rB   r>   r>   r?   on_test_epoch_end  r   zSALM.on_test_epoch_endargskwargsc                 O   s   | j |i |S rD   )r   r;   r   r   r>   r>   r?   	test_step  s   zSALM.test_stepc                    s>   t   t j|i | W d    d S 1 sw   Y  d S rD   )r   r.   backwardr   r<   r>   r?   r     s   "zSALM.backwardpromptsrd   re   generation_configc                    st  t |tjr	|}n9t|| j| jd }dur&|du r|du s"J d|\}}t| jj	| j
tfdd|D | jd| j}|dur|||| jkd}| |}	| ||\ }
 fddt|
D  t||	| j| j dd	\}}}||d
}n
|| jk}||d}|du rt| j| j| jd}t|  | jjdi ||d|i}W d   |S 1 sw   Y  |S )a  
        Generate LLM answers given text or mixed text+audio prompts.

        Example 1. High-level API using ``prompts`` to provide both text and audio::

            >>> answer_ids = model.generate(
            ...    prompts=[
            ...        [
            ...             {
            ...                 "role": "user",
            ...                 "content": f"Transcribe the following: {model.audio_locator_tag}",
            ...                 "audio": ["path/to/audio.wav"],
            ...             }
            ...         ]
            ...    ],
            ...    max_new_tokens=128,
            ... )

        You may also include a ``transformers.GenerationConfig`` object to customize decoding strategy::

            >>> answer_ids = model.generate(..., generation_config=GenerationConfig(do_sample=True, num_beams=5))

        Example 2. Lower-level API, using ``prompts`` for the text part,
        and pre-loaded ``audio`` and ``audio_lens`` tensors::

            >>> answer_ids = model.generate(
            ...    prompts=[
            ...        [{"role": "user", "content": f"Transcribe the following: {model.audio_locator_tag}"}],
            ...        [{"role": "user", "content": f"Transcribe the following in Polish: {model.audio_locator_tag}"}],
            ...    ],
            ...    audios=audios,  # torch.Tensor, float32, of shape (batch, time)
            ...    audio_lens=audio_lens,  # torch.Tensor, int64, of shape (batch,)
            ...    max_new_tokens=128,
            ... )

        Example 3. Lower-level API, using pre-tokenized and pre-formatted ``prompts`` for the text part,
        and pre-loaded ``audio`` and ``audio_lens`` tensors::

            >>> answer_ids = model.generate(
            ...    prompts=prompts,  # torch.Tensor, int64, of shape (batch, num_tokens)
            ...    audios=audios,  # torch.Tensor, float32, of shape (batch, time)
            ...    audio_lens=audio_lens,  # torch.Tensor, int64, of shape (batch,)
            ...    max_new_tokens=128,
            ... )

        Inputs:
            prompts: batch of prompts Tensor or as list[dict] each in the following format
                [
                  # batch example id 0
                  [{"role": "user"}, "slots": {"message": f"Transcribe the following: {model.audio_locator_tag}"}]
                  # batch example id 1
                  [{"role": "user"}, "slots": {"message": f"Transcribe the following in Polish: {model.audio_locator_tag}"}]
                ]
                "role" is LLM-specific, you can pass multiple turns as well.
                If ``prompts`` is a Tensor, we assume it was already formatted in the relevant chat template
                and tokenized with the model's tokenizer.
            audios: Optional. Time-domain audio signal zero-padded batch of shape (B, T).
                The number of audios must correspond to the number of occurrences of <audio_locator_tag> in prompts.
                Each prompt can have multiple audios.
            audio_lens: Optional. Length of each audio example.
            generation_config: Optional HuggingFace GenerationConfig object.
            generation_kwargs: Keyword arguments passed directly to the underlying LLM's ``generate`` method.
        )rW   deviceNzaAudios cannot be provided via ``prompts`` and ``audios``/``audio_lens`` arguments simultaneously.c                    s   g | ]
} j |d d qS ))turnsrm   )encode_dialog)rh   prompt)	formatterr>   r?   rk   u  rl   z!SALM.generate.<locals>.<listcomp>)padding_valuer   c                    s    g | ]\}} |d |f qS rD   r>   )rh   ielen)audio_embedsr>   r?   rk     s     rp   )rZ   rY   )rm   rY   )bos_token_ideos_token_idpad_token_idr   r>   )r+   rz   r   _resolve_audios_in_promptrW   r   r   resolver1   prompt_formatr4   r   rO   r   r{   rQ   r8   rR   	enumerater|   r   rF   rI   r   r6   generate)r;   r   rd   re   r   generation_kwargstokensmaybe_audiotokens_to_embedtoken_embedsaudio_embed_lensrX   _rY   generation_inputsanswer_tokensr>   )r   r   r?   r      sd   I






zSALM.generatec                 C   s   t | S rD   )r   rB   r>   r>   r?   r     r   zSALM.configure_optimizersc                 C   s  | j }|d u r	d S | j}t|tr|jj}|d  } dkrd| _tt	 ft
dfddt d}t||| |jjD ]_}t t t t tt
ddt tt
dft	 fdt t tt
ddd
}|j}d	D ]*}t||}||  d
krtd| d| d| d t||||   qjt||| q<t|j|tt
dt
ddd |d  }	 dkr|	jdksJ d| _d|	i}
t|jjD ]\}}t|fi |
|jj|< qt| jfi |
| _t|jfi |
|_t| jfi |
| _t| jfi |
| _d S d S )Nrx   rw   T)input_layoutsdesired_input_layoutsuse_local_output)zlayers.0norm)output_layouts)r   r   )
input_layernormzself_attn.q_projzself_attn.k_projzself_attn.v_projzself_attn.o_projpost_attention_layernormmlpzmlp.gate_projzmlp.up_projzmlp.down_proj)	num_headsnum_key_value_headshidden_sizer   zattn_layer.=z$ is not divisible by tp_mesh.size()=z:: set a different tensor parallelism size to avoid errors.rv   F)r   r   r   data_parallelmesh)r}   r6   r+   r
   
base_modelr7   r~   r:   r   r   r   r   r   layersr   r   	self_attngetattrr%   warningsetattrlm_headndimr9   r   r   r8   rR   )r;   r}   r6   tp_meshplantransformer_block
attn_layerattrvaldp_meshfsdp_configidxlayerr>   r>   r?   configure_model  sx   

zSALM.configure_modelc                 C   sV   t dtdt dddtdt dddtdt d| jd	d
tdt ddgdS )z
        Return a typing schema for optimal batch size calibration for various
        sequence lengths using OOMptimizer.
        rd   )r   r   input)r   r-   
seq_lengthre   )r   rm   output)r   r-   r  
vocab_sizern   )clsr   )r,   r$   r    r"   r!   rC   r#   rB   r>   r>   r?   oomptimizer_schema  s   
zSALM.oomptimizer_schema)r'   N)NN)NNN)%__name__
__module____qualname__r/   propertyrC   intrF   rI   rO   rQ   r   rS   rW   r   r,   strrb   r   r   r   r   r   r   r   r   r   r   rz   no_gradr   r   r   r   r  r  __classcell__r>   r>   r<   r?   r&   1   sj    


3
"


~
`r&   rm   rq   rr   rs   rt   ru   r'   c               
   C   s  |   \}}|dur|  |   ksJ d| d}|j|j}	}
d}t| |||\} }}g }g }g }d}t|D ]
}| | |kjddd }t|dkrw|||  |durm||  }||| | |k< || || | |k q:g }g }g }d}|D ]i}||kr||| ||  |dur|| ||  }||||k< || || | || |k || }|| |t	j
| df|t	j|	d |t	j| dft	j|	d |d	7 }|d	 }q||k r"||| ||  |dur|| ||  }||||k< || || | || |k |t	j|dd
 |t	j|dd
 |durE|t	j|dd
 q:|t|krYtdt| d| tdd |D }t	j||||	|
d}|dur~t	j
||f|t	j|	d}nd}t	j||ft	j|	d}|du rtd}tt|||D ]-\}\}}}| d}|||| df< |dur|||| df< |||| df< q|||fS )a  Replaces each occurrence of the placeholder_id in input_ids with the corresponding tensor
    from the replacements list in the embeds tensor, and creates corresponding adjusted target_ids.

    Note: when padding is necessary, we apply left-padding to the examples not to introduce
        anomalies at generation time.

    Args:
      input_ids (Tensor): shape (batch, sequence_length); input token ids.
      embeds (Tensor): shape (batch, sequence_length, hidden_dim); embeddings for each token.
      padding_id (int): these IDs will be marked as ignore_index in target_ids.
      placeholder_id (int): an id to be replaced.
      replacements (list of Tensor): each Tensor has shape (L_i, hidden_dim), with L_i arbitrary.
      target_ids (Tensor): shape (batch, sequence_length); target token ids.

    Returns:
      Tuple[Tensor, Tensor, Tensor]:
        - Tensor of shape (batch, max_new_sequence_length, hidden_dim) corresponding to
          ``embeds`` after replacements.
        - Tensor of shape (batch, max_new_sequence_length) with adjusted target IDs where:
          * Original target values are preserved where input was not a placeholder or padding
          * Positions that were placeholders, padding, or added by replacements are set to -100
          Will be None if target_ids input was None.
        - Tensor of shape (batch, max_new_sequence_length) with attention padding masks
          updated to account for shape changes due to replacements.
    Nz0target_ids must have the same shape as input_idsr   ro   r   Tas_tuple)dtyper   rw   r   z	Expected z replacements but used c                 s   s    | ]}| d V  qdS )r   N)r~   )rh   seqr>   r>   r?   	<genexpr>  s    z9replace_placeholders_and_build_targets.<locals>.<genexpr>)r   r&  )r~   r   r&  _unpad_inputsrangenonzerolenr   clonerz   fullr   onesboolcat
ValueErrormaxzerosr   r   ry   ) rm   rq   rr   rs   rt   ru   r   seq_len
hidden_dimr   r&  r   output_sequencesoutput_target_idsoutput_att_masksreplacement_idxr   placeholder_positionsnew_target_idssegmentstarget_segments	att_masksprev_pospossegment_target_idsrepmax_seq_lengthr  attention_masksr'  tgtattr>   r>   r?   r|     s   !



" 









r|   c           
      C   s   dd }g g }}|d urg nd }t | jd D ].}|| | |}	|| ||	d f  ||||	d f  |d urF||||	d f  q|||fS )Nc                 S   s2   | |k}t j|dd}| dkr|d  S dS )NFr$  r   rv   )rz   r+  r   item)tensorvaluemaskindicesr>   r>   r?   first_index_not_value  s
   z,_unpad_inputs.<locals>.first_index_not_valuer   )r*  r   r   )
rm   rq   ru   rr   rM  input_ids_unpadembeds_unpadtarget_ids_unpadr   r  r>   r>   r?   r)    s   

r)  r   rW   r   c                    s   ddl m  g }| D ]1}|D ],}d|v r:|d }t|ttfr"|g}|D ]}t|ttfs4J d| || q$qq
|s@d S t fdd|D }td |	|j
dd	\}	}
W d    n1 sfw   Y  t|	j|dd
t|
j|dd
fS )Nr   	Recordingaudioz(Invalid value under prompt key 'audio': c                    s   g | ]	}  | qS r>   )	from_fileto_cut)rh   prQ  r>   r?   rk     s    z-_resolve_audios_in_prompt.<locals>.<listcomp>cpuT)collate)non_blocking)lhotserR  r+   r!  r   r   r   rz   r   resample
load_audior   r   )r   rW   r   pathsconversationturn
turn_audiorV  cutsrS  re   r>   rQ  r?   r     s,   r   rD   )ErL   collectionsr   	itertoolsr   pathlibr   typingr   r   rz   rZ  r   	lightningr   	omegaconfr	   peftr
   r   torch.distributed.fsdpr   torch.distributed.tensorr   r   !torch.distributed.tensor.parallelr   r   r   r   r   r   transformersr   nemo.collections.common.promptsr   "nemo.collections.common.tokenizersr   ,nemo.collections.speechlm2.data.salm_datasetr   'nemo.collections.speechlm2.parts.hf_hubr   %nemo.collections.speechlm2.parts.lorar   ,nemo.collections.speechlm2.parts.optim_setupr   r   +nemo.collections.speechlm2.parts.pretrainedr   r   r   nemo.core.neural_typesr    r!   r"   r#   r$   
nemo.utilsr%   r&   r   r   tupler|   r)  r,   r!  r   r   r>   r>   r>   r?   <module>   s       p
 	

