o
    wio                     @   s,  d dl mZ d dlmZmZ d dlZd dlZd dlZd dl	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ G dd dejjj Z!e j"fdeeej#ej$f  dee%e&f dej#fddZ'de
de
fddZ(eeededefddZ)dS )    )groupby)IterableUnionN)CutSetfastcopy)CrossEntropyLoss)pad_sequence)NeMoMultimodalConversation)	AudioTurnTextTurn)collate_conversation_audio_fault_tolerant)registered_prompt_format_fn)Llama2PromptFormatter)AutoTokenizer)
get_pad_idc                   @   s8   e Zd ZdZdeddfddZdededB fdd	ZdS )
SALMDataseta  
    A dataset for Speech-Augmented Language Models (SALM) that processes multimodal conversations
    containing both text and audio turns.

    This dataset handles NeMoMultimodalConversation objects which combine text messages
    and audio segments in a conversational format. It uses audio_locator_tag in the text,
    where each such placeholder corresponds to an entire audio segment.

    Args:
        tokenizer (AutoTokenizer):
            Tokenizer for converting text to token IDs and vice versa. Must have a special
            audio_locator_tag token that will be replaced with audio embeddings during model's
            training step.

    Returns:
        A dictionary with the following keys:
            - audios: Tensor of audio waveform samples [B_audio, T_samples]
            - audio_lens: Tensor of audio lengths [B_audio]
            - input_ids: Tensor of text token IDs [B, T_tokens], including audio_locator_tag tokens
            - loss_mask: Boolean tensor [B, T_tokens] indicating which tokens are part of the
                assistant's responses (True) and should be used for computing loss

    Notes:
        - Each audio_locator_tag token in input_ids corresponds to an audio segment in audios
        - The SALM model later replaces these audio_locator_tag tokens with encoded audio embeddings
        - The loss_mask identifies which tokens are part of the target sequences (assistant responses)
          and which are part of the source sequences (user prompts)
        - The input_ids and loss_mask will be expanded during model forward pass to account for
          the variable-length audio segments that replace each audio_locator_tag token
    	tokenizerreturnNc                 C   s   || _ t|| _d S )N)r   r   pad_id)selfr    r   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/speechlm2/data/salm_dataset.py__init__D   s   zSALMDataset.__init__conversationsc                 C   sX   t |\}}}|sd S ||tdd |D | jdtdd |D ddtjt|dS )Nc                 S   s   g | ]}|j qS r   )	input_ids.0cr   r   r   
<listcomp>R   s    z+SALMDataset.__getitem__.<locals>.<listcomp>)padding_valuec                 S   s   g | ]}t |d tdqS )maskr   )getattrtorchemptyr   r   r   r   r   T   s    r   )audios
audio_lensr   	loss_maskr   )r   left_collate_vectorsr   tor"   booldrop_in_memory_data)r   r   r$   r%   r   r   r   __getitem__H   s   zSALMDataset.__getitem__)	__name__
__module____qualname____doc__r   r   r   dictr+   r   r   r   r   r   $   s    r   tensorsr   r   c                 C   s8   dd | D } t dd | D sJ dt| d|ddS )	Nc                 S   s   g | ]}t |qS r   )r"   	as_tensorr   tr   r   r   r   ^   s    z(left_collate_vectors.<locals>.<listcomp>c                 s   s    | ]
}t |jd kV  qdS )   N)lenshaper3   r   r   r   	<genexpr>_   s    z'left_collate_vectors.<locals>.<genexpr>z Expected only 1-D input tensors.Tleft)batch_firstr   padding_side)allr   )r1   r   r   r   r   r'   Z   s   r'   r   c                 C   s    dt dt fdd}| j|d dS )Nconversationr   c                 S   sB   g }| j D ]}t|trt||j d}|| qt| |dS )N)cut)turns)r?   
isinstancer
   r   r>   r*   append)r=   r?   r4   r   r   r   _dropd   s   

z"drop_in_memory_data.<locals>._drop)apply_fn)r	   map)r   rB   r   r   r   r*   c   s   r*   examplepromptc                 C   s\   t dd | jD dd d}dd |D }t| dr)d|d	 d
< | j|d	 d d< ||S )Nc                 S   s.   g | ]}|j d t|tr|jn|jidqS )messageroleslots)rI   r@   r   valueaudio_locator_tag)r   turnr   r   r   r   u   s    zDdefault_multimodal_conversation_prompt_format_fn.<locals>.<listcomp>c                 S   s   | d S )NrI   r   )rM   r   r   r   <lambda>|   s    zBdefault_multimodal_conversation_prompt_format_fn.<locals>.<lambda>)keyc                 S   s.   g | ]\}}|d d dd |D idqS )rG    c                 s   s    | ]	}|d  d V  qdS )rJ   rG   Nr   r3   r   r   r   r8      s    zNdefault_multimodal_conversation_prompt_format_fn.<locals>.<listcomp>.<genexpr>rH   )join)r   rI   turn_grpr   r   r   r   ~   s    system_promptsystem_and_userr   rI   rJ   system)r   r?   hasattrrS   encode_dialog)rE   rF   r?   r   r   r   0default_multimodal_conversation_prompt_format_fno   s   


rX   )*	itertoolsr   typingr   r   numpynpr"   torch.utils.datalhotser   r   torch.nnr   torch.nn.utils.rnnr   #nemo.collections.common.data.lhotser	   1nemo.collections.common.data.lhotse.text_adaptersr
   r   r   &nemo.collections.common.data.prompt_fnr   nemo.collections.common.promptsr   "nemo.collections.common.tokenizersr   %nemo.collections.speechlm2.data.utilsr   utilsdataDatasetr   ignore_indexTensorndarrayintfloatr'   r*   rX   r   r   r   r   <module>   s<   8

	