o
    	۷i@                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlZddl	m
Z
mZ e r'd dlZe
 r.d dlZddlmZmZ ddlmZ ddlmZmZmZmZ dd	lmZmZ G d
d deddZG dd deddZG dd deZdgZdS )    N)Path)AnyOptionalUnion   )is_soundfile_availableis_torch_available)
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   @   s"   e Zd ZU eeeef  ed< dS )CsmAudioKwargsencoded_length_kwargsN)__name__
__module____qualname__r   dictstrr   __annotations__ r   r   \/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/csm/processing_csm.pyr   %   s   
 r   F)totalc                   @   sJ   e Zd ZU eed< ddddg dg dg ddd	d
dddidZdS )CsmProcessorKwargsaudio_kwargsTleftF)paddingpadding_sideadd_special_tokens)   r         r   r$   
   r   r$      r   r$      r      )r$   r$   r$   r)   r$   r$      r$   r$      r$   r$   r%   r$      )r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   )kernel_sizesstrides	dilationsuse_causal_convi]  )r   sampling_ratereturn_tensorspt)text_kwargsr   common_kwargsN)r   r   r   r   r   	_defaultsr   r   r   r   r   )   s   
 	
r   c                       s   e Zd ZdZddgZdZdZ	d fdd	Zedd	d
Z	de
deeeeeeef  f dee fddZ			ddeeeeee ee f  dee
 dee dee dee f
ddZedd Z  ZS )CsmProcessora  
    Constructs a Csm processor which wraps [`EncodecFeatureExtractor`] and
    [`PretrainedTokenizerFast`] into a single processor that inherits both the audio feature extraction and
    tokenizer functionalities. See the [`~CsmProcessor.__call__`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
        ```python
        from transformers import CsmProcessor
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        audio = ds[0]["audio"]["array"]

        processor = CsmProcessor.from_pretrained("sesame/csm-1b")

        processor(
            text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"],
            audio=audio,
            text_kwargs = {"padding": False},
            audio_kwargs = {"sampling_rate": 16000},
            common_kwargs = {"return_tensors": "pt"},
        )
        # this should error out because EncodecFeatureExtractor expects a 24kHz audio :)
        ```

    Args:
        feature_extractor ([`EncodecFeatureExtractor`]):
            The feature extractor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.

    feature_extractor	tokenizerEncodecFeatureExtractorPreTrainedTokenizerFastNc                    sv   t |dsd| _|| j| _n|j| _|j| _t |ds(d| _|| j| _n|j| _|j| _t j|||d d S )Naudio_tokenz	<|AUDIO|>audio_eos_tokenz<|audio_eos|>)chat_template)hasattrr<   convert_tokens_to_idsaudio_token_idr=   audio_eos_token_idsuper__init__)selfr8   r9   r>   	__class__r   r   rD   f   s   

zCsmProcessor.__init__c                 C   s   | }|du s|du s|du s|du r|S t |||D ]U\}}}|d | d }	|| }
|
d }|
| }||	 |
 | d }t|d }|| | |
 }|| }|rW|
}|}n|| }|| | }|||d   d | d }q|S )a|  
        Compute the length of the encoded audio sequence.

        Args:
            audio_length (int): The length of the audio sequence.
            kernel_sizes (list[int]): The kernel sizes for the convolutional layers.
            strides (list[int]): The strides for the convolutional layers.
            use_causal_conv (bool): Whether to use causal convolutions.
        Nr$   r,   )zipmathceil)audio_lengthr-   r.   r/   r0   
cur_lengthkernel_sizestridedilationeffective_kernel_sizepadding_totalpadding_rightpadding_leftn_framesideal_lengthextra_paddingr   r   r   _get_encoded_length|   s&    z CsmProcessor._get_encoded_lengthaudiosaving_pathkwargsc           	      K   s   t  stdt|}t|ttfr|g}nt|ttfr&tdd |D s*t	dt
|t
|kr6t	d| jtfi |}|d }|d }t||D ]\}}t|tjr^|   }t||| qLd S )Nz/Please install `soundfile` to save audio files.c                 s   s    | ]
}t |ttfV  qd S N)
isinstancer   r   ).0pr   r   r   	<genexpr>   s    z*CsmProcessor.save_audio.<locals>.<genexpr>zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer   r1   )r   ImportErrorr
   r\   r   r   listtupleall
ValueErrorlen_merge_kwargsr   rH   torchTensorcpufloatnumpysfwrite)	rE   rX   rY   rZ   output_kwargsr   r1   audio_valuer^   r   r   r   
save_audio   s*    zCsmProcessor.save_audioF      ?textoutput_labelsdepth_decoder_labels_ratioc              
      s  j tfdjji|}|d }|d }|d }	|	dd}
|
dkr,tjj dt|t	r5|g}nt|t
tfrEtd	d
 |D sItdfdd|D }d}|dur`t|}t|}t|dkr|t|kr|du rttdtd| d| d|dur|di   fdd|D }| }g }|D ]<}g }j|v r|d}j| }|| |jdd}j|v sd|v r|d|dd}d|v s|| q|}j|fi |}i }|| |durw|dd g g }}d}|D ]I}|dkr|td |tdg q|tjdd ||||  D dd |tdd ||||  D jdd ||7 }qj|fi |}|dd || tdd
 |D fdd|D }tj|dd|d < |r|d! jk }|j d }|d"krt!|dt"|d|   }|| }n|}t#|d! jk|d! j$kB |d! d#}d$||dddf |dddf f< ||d%< t%||
d&S )'a  
        Main method to prepare text(s) and audio to be fed as input to the model. This method forwards the `text`
        arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode
        the text. To prepare the audio, this method forwards the `audio` arguments to
        EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`]. Please refer
        to the docstring of the above two methods for more information.

        Args:
            audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
                tensor.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            output_labels (bool, *optional*, default=False):
                Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`.
                - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
                - `-100` will be ignored in the loss computation
                - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)
            depth_decoder_labels_ratio (float, *optional*, default=1.0):
                The ratio of audio frames to keep for the depth decoder labels.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **labels** -- List of labels for the audio frames. Returned when `output_labels=True`.
        tokenizer_init_kwargsr4   r   r5   r2   Nr3   z% only supports `return_tensors='pt'`.c                 s   s    | ]}t |tV  qd S r[   )r\   r   r]   tr   r   r   r_         z(CsmProcessor.__call__.<locals>.<genexpr>zAInvalid input text. Please provide a string, or a list of stringsc                    s   g | ]}|  jqS r   )countr<   rv   )rE   r   r   
<listcomp>      z)CsmProcessor.__call__.<locals>.<listcomp>r   z@No audio were provided, but there are audio tokens in the promptz)The number of audio tokens in each text (z7) should be the same as the number of provided audios (z).r   c                    s$   g | ]}j |jd  fi  qS )rW   shape)r]   audio_array)r   rE   r   r   rz     s    z<placeholder>r$   return_attention_maskr}   c                 S   s(   g | ]}t |tjr|  n|qS r   )r\   rg   rh   ri   rk   r]   elr   r   r   rz   =  s    )axisc                 S   s   g | ]}|j d  qS r|   r~   r   r   r   r   rz   E  s    )dimpadding_maskc                 s   s    | ]}|j d  V  qdS )r}   Nr   r]   cut_idxsr   r   r   r_   N  rx   c                    s.   g | ]}t jjj|d  |jd  fddqS )r   r}   )value)rg   nn
functionalpadr~   r   )max_lenr   r   rz   O  s     input_values_cutoffs	input_idsrq   iilabels)datatensor_type)&rf   r   r9   init_kwargspoprd   rG   r   r\   r   ra   rb   rc   r
   re   sumcopyr<   appendreplaceupdatenpzerosrg   tensorconcatenatecumsumr8   maxstackrA   nonzeror~   randpermintwhererB   r   )rE   rr   rX   rs   rt   rZ   rn   r4   r   r5   r2   n_audio_in_textn_audionum_audio_tokens_listnum_audio_tokens_list_copyexpanded_textsamplereplace_strnum_audio_tokensexpanded_audio_tokenencodingr   concatenated_audior   offsetaudio_inputsaudio_frame_idxsn_audio_frames	rand_idxsskip_frames_idxsr   r   )r   r   rE   r   __call__   s   /
 








	&





$zCsmProcessor.__call__c                 C   s0   | j j}| jj}dd |D }t|| dg S )Nc                 S   s   g | ]}|d kr|qS )r   r   )r]   namer   r   r   rz   q  r{   z2CsmProcessor.model_input_names.<locals>.<listcomp>r   )r9   model_input_namesr8   ra   )rE   tokenizer_input_namesfeature_extractor_input_namesr   r   r   r   j  s   zCsmProcessor.model_input_namesr[   )NNNN)NFrq   )r   r   r   __doc__
attributesfeature_extractor_classtokenizer_classrD   staticmethodrW   r	   r   r   r   ra   r   r   rp   r   r   r   boolrj   r   propertyr   __classcell__r   r   rF   r   r7   >   sB    #%
%
 'r7   ) rI   pathlibr   typingr   r   r   rk   r   utilsr   r   rg   	soundfilerl   audio_utilsr	   r
   feature_extraction_utilsr   processing_utilsr   r   r   r   tokenization_utils_baser   r   r   r   r7   __all__r   r   r   r   <module>   s&     
9