o
    }oi                     @   s   d dl mZ d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ G dd dejjjZee
ede
defddZee
ede
defddZdS )    )groupbyN)CutSet)collate_audiocollate_vectors)NeMoMultimodalConversation)TextTurn)registered_prompt_format_fn)Llama2PromptFormatterLlama3PromptFormatter)AutoTokenizer)
get_pad_idc                   @   s4   e Zd ZdZdeddfddZdedefdd	ZdS )
SALMDataseta  
    A dataset for Speech-Augmented Language Models (SALM) that processes multimodal conversations
    containing both text and audio turns.

    This dataset handles NeMoMultimodalConversation objects which combine text messages
    and audio segments in a conversational format. It uses audio_locator_tag in the text,
    where each such placeholder corresponds to an entire audio segment.

    Args:
        tokenizer (AutoTokenizer):
            Tokenizer for converting text to token IDs and vice versa. Must have a special
            audio_locator_tag token that will be replaced with audio embeddings during model's
            training step.

    Returns:
        A dictionary with the following keys:
            - audios: Tensor of audio waveform samples [B_audio, T_samples]
            - audio_lens: Tensor of audio lengths [B_audio]
            - input_ids: Tensor of text token IDs [B, T_tokens], including audio_locator_tag tokens
            - loss_mask: Boolean tensor [B, T_tokens] indicating which tokens are part of the
                assistant's responses (True) and should be used for computing loss

    Notes:
        - Each audio_locator_tag token in input_ids corresponds to an audio segment in audios
        - The SALM model later replaces these audio_locator_tag tokens with encoded audio embeddings
        - The loss_mask identifies which tokens are part of the target sequences (assistant responses)
          and which are part of the source sequences (user prompts)
        - The input_ids and loss_mask will be expanded during model forward pass to account for
          the variable-length audio segments that replace each audio_locator_tag token
    	tokenizerreturnNc                 C   s   || _ t|| _d S )N)r   r   pad_id)selfr    r   `/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/speechlm2/data/salm_dataset.py__init__=   s   zSALMDataset.__init__conversationsc           	      C   s   g }g }d}|D ]%}t |tsJ |g  | D ]}|| |d | |d7 }qqtt|\}}||tdd |D | jdtdd |D ddt	j
dS )	Nr      c                 S      g | ]}|j qS r   )	input_ids.0cr   r   r   
<listcomp>P       z+SALMDataset.__getitem__.<locals>.<listcomp>)padding_valuec                 S   r   r   )maskr   r   r   r   r   Q   r   )audios
audio_lensr   	loss_mask)
isinstancer   append	list_cutsr   r   r   r   totorchbool)	r   r   all_cutsexample_idx_to_audio_idxscntrconversationcutr!   r"   r   r   r   __getitem__A   s"   


zSALMDataset.__getitem__)	__name__
__module____qualname____doc__r   r   r   dictr/   r   r   r   r   r      s    r   examplepromptc                 C   sT   t dd | jD dd d}dd |D }t| dr%dd	| jid
g| }||S )Nc                 S   .   g | ]}|j d t|tr|jn|jidqS messageroleslotsr;   r$   r   valueaudio_locator_tagr   turnr   r   r   r   [       Ddefault_multimodal_conversation_prompt_format_fn.<locals>.<listcomp>c                 S      | d S Nr;   r   rA   r   r   r   <lambda>b       Bdefault_multimodal_conversation_prompt_format_fn.<locals>.<lambda>keyc                 S   .   g | ]\}}|d d dd |D idqS )r9    c                 s       | ]	}|d  d V  qdS r<   r9   Nr   r   tr   r   r   	<genexpr>e       Ndefault_multimodal_conversation_prompt_format_fn.<locals>.<listcomp>.<genexpr>r:   joinr   r;   turn_grpr   r   r   r   d       system_promptsystemr9   r:   r   turnshasattrrZ   encode_dialogr5   r6   r]   r   r   r   0default_multimodal_conversation_prompt_format_fnU   s   


ra   c                 C   s\   t dd | jD dd d}dd |D }t| dr)d|d	 d
< | j|d	 d d< ||S )Nc                 S   r7   r8   r=   r@   r   r   r   r   s   rB   rC   c                 S   rD   rE   r   rF   r   r   r   rG   z   rH   rI   rJ   c                 S   rL   )r9   rM   c                 s   rN   rO   r   rP   r   r   r   rR   }   rS   rT   r:   rU   rW   r   r   r   r   |   rY   rZ   system_and_userr   r;   r<   r[   r\   r`   r   r   r   ra   m   s   


)	itertoolsr   r(   torch.utils.datalhotser   lhotse.dataset.collationr   r   #nemo.collections.common.data.lhotser   1nemo.collections.common.data.lhotse.text_adaptersr   &nemo.collections.common.data.prompt_fnr   nemo.collections.common.promptsr	   r
   "nemo.collections.common.tokenizersr   %nemo.collections.speechlm2.data.utilsr   utilsdataDatasetr   ra   r   r   r   r   <module>   s0   8