o
    }oiS                     @   s   d dl mZ d dlmZ d dlZd dlmZmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d	d
gZeG dd
 d
ZG dd	 d	ZdS )    )	dataclass)OptionalN)	AudioTurnNeMoMultimodalConversation)get_prompt_format_fn)PromptFormatter)AutoTokenizer)logging#MultimodalConversationTextProcessorTextProcessorOutputc                   @   sX   e Zd ZU dZejed< ejed< ejed< ejed< ejed< ejed< ejed< d	S )
r   z@
    A dataclass to store the output of the text processor.
    	input_idsanswer_start_idxcontext_idscontext_length
answer_idscontext_start_idx
num_audiosN)__name__
__module____qualname____doc__torchTensor__annotations__ r   r   b/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/speechlm/data/text_processing.pyr      s   
 





c                       s~   e Zd ZdZ					ddddee d	ee d
ee dee dee f fddZde	de
fddZde	de
fddZ  ZS )r
   zM
    Text processor for multi-modal conversation with lhotse dataloader.
    NF<BOA><EOA>	tokenizerz0nemo.collections.common.tokenizers.TokenizerSpecprompt_formatmax_seq_lengthadd_boa_eoa
boa_string
eoa_stringc                    s   t    t||| _tt| j| _|| _|| _	|du s"|dkr)t
d| t|dr=|jdkr=|jdkr=|j| _n| jjdurM| jjdkrM| jjnd| _|| _|| _|| _dS )ad  
        Args:
            tokenizer: The tokenizer to use.
            prompt_format: The prompt format string.
            max_seq_length: The maximum sequence length.
            add_boa_eoa: Whether to add BOA and EOA strings before and after audio.
            boa_string: The BOA string to use.
            eoa_string: The EOA string to use.
        Nr   z/max_seq_length must be a positive integer, got pad_id)super__init__r   resolvepromptr   r   prompt_format_fnr   r    
ValueErrorhasattrr$   eos_idr!   r"   r#   )selfr   r   r    r!   r"   r#   	__class__r   r   r&   1   s   

"
z,MultimodalConversationTextProcessor.__init__lhotse_inputreturnc                 C   s
   |  |S )
        process a single input sample.
        Args:
            lhotse_input: a NeMoMultimodalConversation sample from lhotse dataset.
        )process_sample)r-   r0   r   r   r   __call__V   s   
z,MultimodalConversationTextProcessor.__call__c              
   C   s&  t |tstdt| dd |jD }t|}|dkr#|d jnd}| || j}|dkrRdg}|d 	 
  }|d 	 
  }|d 	 
  }	ntt | jtrc| jj|d d	d
}
n| j|d }
g }g }|
|}t|D ]?\}}|t| | jr|dkr|d | j }n|t|d kr| jd | }n| jd | d | j }|| j| qx|d 	 
  }	||	 }t|| jkrtdt| d| j d |d| j }tt| tt| t| tt| t|	 t| t|dS )r2   z6Input must be of type NeMoMultimodalConversation, got c                 S   s   g | ]	}t |tr|qS r   )
isinstancer   ).0turnr   r   r   
<listcomp>g   s    zFMultimodalConversationTextProcessor.process_sample.<locals>.<listcomp>r   Nr   r   r   F)remove_special_tokens    zInput ids length z exceed max sequence length z, truncating.)r   r   r   r   r   r   r   )r5   r   r*   typeturnslenaudio_locator_tagr)   r(   cpunumpytolistr   r   ids_to_textsplit	enumerateappendr!   r"   r#   extendtext_to_idsr    r	   warningr   r   tensorlong)r-   r0   audio_turnsr   audio_locator_strprocessed_sampler   r   r   r   contextsegmentsicontext_segr   r   r   r3   ^   sT   

z2MultimodalConversationTextProcessor.process_sample)NNFr   r   )r   r   r   r   r   strintboolr&   r   r   r4   r3   __classcell__r   r   r.   r   r
   ,   s,    %)dataclassesr   typingr   r   1nemo.collections.common.data.lhotse.text_adaptersr   r   &nemo.collections.common.data.prompt_fnr   nemo.collections.common.promptsr   .nemo.collections.common.tokenizers.huggingfacer   
nemo.utilsr	   __all__r   r
   r   r   r   r   <module>   s   