o
    ei5                     @   s   d dl Z d dlmZ d dlmZ d dlZddlmZm	Z	m
Z
 e
 r%d dlZe	 r,d dlZddlmZmZ ddlmZ ddlmZmZmZmZ dd	lmZmZ G d
d deddZG dd deddZeG dd deZdgZdS )    N)Path)Any   )auto_docstringis_soundfile_availableis_torch_available)
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   @   s&   e Zd ZU dZeeef dB ed< dS )CsmAudioKwargsa  
    encoded_length_kwargs (`dict[str, Any]`, *optional*):
        Dictionary of keyword arguments used to compute the encoded audio sequence length. This includes parameters
        such as `kernel_sizes`, `strides`, `dilations`, and `use_causal_conv` that define the convolutional layers
        used in audio encoding. The encoded length is used to determine how many audio tokens to generate for each
        audio input in the text sequence.
    Nencoded_length_kwargs)__name__
__module____qualname____doc__dictstrr   __annotations__ r   r   d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/csm/processing_csm.pyr   $   s   
 r   F)totalc                   @   sJ   e Zd ZU eed< ddddg dg dg ddd	d
dddidZdS )CsmProcessorKwargsaudio_kwargsTleftF)paddingpadding_sideadd_special_tokens)   r         r   r$   
   r   r$      r   r$      r      )r$   r$   r$   r)   r$   r$      r$   r$      r$   r$   r%   r$      )r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   r$   )kernel_sizesstrides	dilationsuse_causal_convi]  )r   sampling_ratereturn_tensorspt)text_kwargsr   common_kwargsN)r   r   r   r   r   	_defaultsr   r   r   r   r   0   s   
 	
r   c                       s   e Zd Z	d fdd	ZedddZdedeeB e	eeB  B de
e fd	d
Ze			ddeeB e	e B e	e B dB dedB dedB dedB de
e f
ddZedd Z  ZS )CsmProcessorNc                    sv   t |dsd| _|| j| _n|j| _|j| _t |ds(d| _|| j| _n|j| _|j| _t j|||d d S )Naudio_tokenz	<|AUDIO|>audio_eos_tokenz<|audio_eos|>)chat_template)hasattrr8   convert_tokens_to_idsaudio_token_idr9   audio_eos_token_idsuper__init__)selffeature_extractor	tokenizerr:   	__class__r   r   r@   G   s   

zCsmProcessor.__init__c                 C   s   | }|du s|du s|du s|du r|S t |||D ]U\}}}|d | d }	|| }
|
d }|
| }||	 |
 | d }t|d }|| | |
 }|| }|rW|
}|}n|| }|| | }|||d   d | d }q|S )a|  
        Compute the length of the encoded audio sequence.

        Args:
            audio_length (int): The length of the audio sequence.
            kernel_sizes (list[int]): The kernel sizes for the convolutional layers.
            strides (list[int]): The strides for the convolutional layers.
            use_causal_conv (bool): Whether to use causal convolutions.
        Nr$   r,   )zipmathceil)audio_lengthr-   r.   r/   r0   
cur_lengthkernel_sizestridedilationeffective_kernel_sizepadding_totalpadding_rightpadding_leftn_framesideal_lengthextra_paddingr   r   r   _get_encoded_length]   s&    z CsmProcessor._get_encoded_lengthaudiosaving_pathkwargsc           	      K   s   t  stdt|}t|ttfr|g}nt|ttfr&tdd |D s*t	dt
|t
|kr6t	d| jtfi |}|d }|d }t||D ]\}}t|tjr^|   }t||| qLd S )Nz/Please install `soundfile` to save audio files.c                 s   s    | ]
}t |ttfV  qd S N)
isinstancer   r   ).0pr   r   r   	<genexpr>   s    z*CsmProcessor.save_audio.<locals>.<genexpr>zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer   r1   )r   ImportErrorr	   rZ   r   r   listtupleall
ValueErrorlen_merge_kwargsr   rF   torchTensorcpufloatnumpysfwrite)	rA   rV   rW   rX   output_kwargsr   r1   audio_valuer\   r   r   r   
save_audio   s*    zCsmProcessor.save_audioF      ?textoutput_labelsdepth_decoder_labels_ratioc              
      s  j tfdjji|}|d }|d }|dd}	|	dkr(tjj dt|t	r1|g}nt|t
tfrAtdd	 |D sEtd
fdd|D }
d}|dur\t|}t|}t|
dkr{|t|
kr{|du rptdtd|
 d| d|dur|di   fdd|D }| }g }|D ]<}g }j|v r|d}j| }|| |jdd}j|v sd|v r|d|dd}d|v s|| q|}j|fi |}i }|| |durs|dd g g }}d}|
D ]I}|dkr|td |tdg q|tjdd ||||  D dd |tdd ||||  D jdd ||7 }qj|fi |}|dd || tdd	 |D fdd|D }tj|dd|d< |r|d  jk  }|j!d }|d!krt"|dt#|d|   }|| }n|}t$|d  jk|d  j%kB |d  d"}d#||dddf |dddf f< ||d$< t&||	d%S )&a  
        output_labels (bool, *optional*, default=False):
            Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`.
            - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
            - `-100` will be ignored in the loss computation
            - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)
        depth_decoder_labels_ratio (float, *optional*, default=1.0):
            The ratio of audio frames to keep for the depth decoder labels.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **labels** -- List of labels for the audio frames. Returned when `output_labels=True`.
        tokenizer_init_kwargsr4   r   r2   Nr3   z% only supports `return_tensors='pt'`.c                 s   s    | ]}t |tV  qd S rY   )rZ   r   r[   tr   r   r   r]          z(CsmProcessor.__call__.<locals>.<genexpr>zAInvalid input text. Please provide a string, or a list of stringsc                    s   g | ]}|  jqS r   )countr8   rt   )rA   r   r   
<listcomp>       z)CsmProcessor.__call__.<locals>.<listcomp>r   z@No audio were provided, but there are audio tokens in the promptz)The number of audio tokens in each text (z7) should be the same as the number of provided audios (z).r   c                    s$   g | ]}j |jd  fi  qS )rU   shape)r[   audio_array)r   rA   r   r   rx      s    z<placeholder>r$   return_attention_maskr{   c                 S   s(   g | ]}t |tjr|  n|qS r   )rZ   re   rf   rg   ri   r[   elr   r   r   rx   
  s    )axisc                 S   s   g | ]}|j d  qS rz   r|   r   r   r   r   rx     s    )dimpadding_maskc                 s   s    | ]}|j d  V  qdS )r{   Nr   r[   cut_idxsr   r   r   r]     rv   c                    s.   g | ]}t jjj|d  |jd  fddqS )r   r{   )value)re   nn
functionalpadr|   r   )max_lenr   r   rx     s     input_values_cutoffs	input_idsro   iilabels)datatensor_type)'rd   r   rC   init_kwargsgetrb   rE   r   rZ   r   r_   r`   ra   r	   rc   sumpopcopyr8   appendreplaceupdatenpzerosre   tensorconcatenatecumsumrB   maxstackr=   nonzeror|   randpermintwherer>   r
   )rA   rp   rV   rq   rr   rX   rl   r4   r   r2   n_audio_in_textn_audionum_audio_tokens_listnum_audio_tokens_list_copyexpanded_textsamplereplace_strnum_audio_tokensexpanded_audio_tokenencodingr   concatenated_audior   offsetaudio_inputsaudio_frame_idxsn_audio_frames	rand_idxsskip_frames_idxsr   r   )r   r   rA   r   __call__   s   
 








	&





$zCsmProcessor.__call__c                 C   s0   | j j}| jj}dd |D }t|| dg S )Nc                 S   s   g | ]}|d kr|qS )r   r   )r[   namer   r   r   rx   >  ry   z2CsmProcessor.model_input_names.<locals>.<listcomp>r   )rC   model_input_namesrB   r_   )rA   tokenizer_input_namesfeature_extractor_input_namesr   r   r   r   7  s   zCsmProcessor.model_input_namesrY   )NNNN)NFro   )r   r   r   r@   staticmethodrU   r   r   r   r_   r   r   rn   r   r   r   boolrh   r   propertyr   __classcell__r   r   rD   r   r7   E   s<    %
" r7   )rG   pathlibr   typingr   ri   r   utilsr   r   r   re   	soundfilerj   audio_utilsr   r	   feature_extraction_utilsr
   processing_utilsr   r   r   r   tokenization_utils_baser   r   r   r   r7   __all__r   r   r   r   <module>   s&    
}