o
    wi'$                     @   s  d dl Z d dlZd dlZd dlmZmZmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZ G d	d
 d
ejjjZdedededee deejejf f
ddZ	ddedededee dedejfddZe de dfdedefddZ dS )    N)CutSetSecondscompute_num_frames)Cut)collate_audiocollate_vectors)ifnone)TokenizerSpec)
get_pad_id)loggingc                   @   sR   e Zd ZdZ		ddededededee dee fd	d
Z	de
defddZdS )DuplexS2SDataseta5  
    A dataset for duplex speech-to-speech models that handles bidirectional conversations.

    This dataset processes Lhotse CutSet objects containing recordings with supervision segments
    from different speakers (roles). It creates aligned representations of audio and text for
    both source (input) and target (output) channels, preserving temporal alignment between
    audio frames and text tokens.

    Args:
        tokenizer (TokenizerSpec):
            Tokenizer for converting text to token IDs and vice versa. Must support BOS and EOS tokens.
            It's expected to support PAD token as well, otherwise we will use 0 as the pad token
            and emit a warning.

        frame_length (Seconds):
            Duration of a single frame in seconds. Used to calculate frame positions for token alignment.

        source_sample_rate (int):
            Sample rate for source audio (e.g., 16000 Hz).

        target_sample_rate (int):
            Sample rate for target audio (e.g., 22050 Hz).

        input_roles (list[str], optional):
            List of speaker roles (cut.supervisions[:].speaker) to consider as inputs. Defaults to ["user"].

        output_roles (list[str], optional):
            List of speaker roles (cut.supervisions[:].speaker) to consider as outputs. Defaults to ["agent"].

    Returns:
        A dictionary with the following keys:
            - source_audio: Tensor of source waveform samples [B, T]
            - source_audio_lens: Tensor of source audio lengths [B]
            - target_audio: Tensor of target waveform samples [B, T]
            - target_audio_lens: Tensor of target audio lengths [B]
            - target_tokens: Tensor of target text tokens [B, T], with special tokens (BOS/EOS/PAD)
                at positions aligned with audio frames
            - target_token_lens: Tensor of target token sequence lengths [B]
            - source_tokens: Tensor of source text tokens [B, T], with special tokens (BOS/EOS/PAD)
                at positions aligned with audio frames
            - source_token_lens: Tensor of source token sequence lengths [B]
            - target_texts: List of full target texts joined from output_roles supervisions [B]

    Notes:
        - The dataset ensures frame-level alignment between audio and text by inserting tokens at
          specific frame positions based on the timing of supervision segments.
        - PAD tokens (typically 0) are used to fill gaps where there's no text.
        - BOS tokens mark the beginning of each speech segment.
        - EOS tokens mark the end of each speech segment.
        - Text tokens from each speaker are placed at frame positions corresponding to their
          timestamp in the original recording, preserving the temporal relationship.
          This is a segment-level alignment only, not word-level alignment.
    N	tokenizerframe_lengthsource_sample_ratetarget_sample_rateinput_rolesoutput_rolesc                 C   sd   || _ || _|| _|| _tt|dg| _tt|dg| _|jd us'J d|j	d us0J dd S )Nuseragentz8BOS support in the tokenizer is required for S2S models.z8EOS support in the tokenizer is required for S2S models.)
r   r   r   r   setr   r   r   boseos)selfr   r   r   r   r   r    r   h/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/speechlm2/data/s2s_dataset.py__init__S   s   	zDuplexS2SDataset.__init__cutsreturnc           
         s   | t}t| j\}}t| jdd\}}t| j j j	d\}}t| j j j
d\}}	||||||||	 fdd|D d	S )Ntarget_audio)recording_field)rolesc                    s&   g | ]}d   fdd|jD qS ) c                 3   s"    | ]}|j  jv r|jV  qd S )N)speakerr   text).0sr   r   r   	<genexpr>|   s     z:DuplexS2SDataset.__getitem__.<locals>.<listcomp>.<genexpr>)joinsupervisions)r$   cutr&   r   r   
<listcomp>{   s    z0DuplexS2SDataset.__getitem__.<locals>.<listcomp>)	source_audiosource_audio_lensr   target_audio_lenstarget_tokenstarget_token_lenssource_tokenssource_token_lenstarget_texts)transform_text_strip_timestampsr   resampler   r   collate_token_channelr   r   r   r   )
r   r   r,   r-   r   r.   r/   r0   r1   r2   r   r&   r   __getitem__f   s.   




zDuplexS2SDataset.__getitem__)NN)__name__
__module____qualname____doc__r	   r   intliststrr   r   dictr8   r   r   r   r   r      s$    <
r   r   r   r   r    r   c                    sH   t  fdd| D }tdd |D }t|d}||fS )Nc              	      s   g | ]}t | d qS ))r   r   r    pad_id)build_token_channel)r$   cr   rA   r    r   r   r   r+      s    z)collate_token_channel.<locals>.<listcomp>c                 S   s   g | ]}t |qS r   )len)r$   ttr   r   r   r+      s    )padding_value)r
   torchtensorr   )r   r   r   r    tokens
token_lensr   rD   r   r7      s   r7   r*   rA   c                 C   s  d| j }t| dd d ur| d| j}t| j|| j}tj|tjd| }| j	D ]}|j
|v rt|jg||j }	t|j|| j}
|
t|kr^td|
 dt| d|  q+|
t|	 }|t|krt||
 }tdt|	 d	| d
|dt|d| 
 |	d | }	z|	||
|< W n" ty } ztd|jd|
d|d|	jd| 
|d }~ww t|j|| j}|t|k r|j||< q+|S )NzExtra info: cut.id=shard_originz cut.shard_origin=)dtypez?Ill-constructed example: the beginning offset of a supervision z% is larger than the example's length z. z1Truncating training example's text_ids of length z by z because endpos=z > len(tokens)=ztokens.shape=z pos=z endpos=z text_ids.shape=r!   )idgetattrrM   r   durationsampling_raterH   oneslongr)   r"   	as_tensorr   text_to_idsr#   startrE   r   warning	ExceptionRuntimeErrorshapeendr   )r*   r   r   r    rA   
diagnostictotalrJ   supervisiontext_idsposendpos	trunc_leneeosposr   r   r   rB      s@   

(,
rB   z	<\|\d+\|>z\s+r#   c                 C   s   | d| } | d|  S )z
    Strips timestamp tokens from text, e.g. turns:
      '<|0|> Hey <|3|> <|3|> how <|5|> <|7|> are <|8|> <|8|> <|10|> you? <|12|>'
      into:
      'Hey how are you?'
     r!   )substrip)r#   _TIMESTAMP_PATTERN_SPACE_PATTERNr   r   r   r5      s   
r5   )rL   )!rerH   torch.utils.datalhotser   r   r   
lhotse.cutr   lhotse.dataset.collationr   r   lhotse.utilsr   "nemo.collections.common.tokenizersr	   %nemo.collections.speechlm2.data.utilsr
   
nemo.utilsr   utilsdataDatasetr   r   r?   tupleTensorr7   r=   rB   compiler5   r   r   r   r   <module>   sT   e

1