o
    iN                     @   s   d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	 ddl
mZ ddlmZmZmZmZ dd	lmZmZ e r=ddlZe rDddlZG d
d deddZG dd deddZG dd deZdgZdS )zProcessor class for Dia    N)Path)OptionalUnion   )
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)is_soundfile_availableis_torch_availablec                   @   s:   e Zd ZU eed< eed< eed< ee ed< eed< dS )DiaAudioKwargsbos_token_ideos_token_idpad_token_iddelay_pattern
generationN)__name__
__module____qualname__int__annotations__listbool r   r   c/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/dia/processing_dia.pyr   "   s   
 r   F)totalc                   @   s@   e Zd ZU eed< dddddddg d	dd
dddidZdS )DiaProcessorKwargsaudio_kwargsTrightF)paddingpadding_sideadd_special_tokensi   i  i  )	r      	   
                  iD  )r   r   r   r   r   sampling_ratereturn_tensorspt)text_kwargsr    common_kwargsN)r   r   r   r   r   	_defaultsr   r   r   r   r   *   s   
 
r   c                       sV  e Zd ZdZdZdZdZ fddZ		d*d	ee	e
e	 f d
ee dee dee fddZ	d+dddee dee de
d fddZ	d+dddee dee ddfddZdddee defddZd
edee	ee
ee	ef  f dee fddZe	d,dededed e
e d!eded" fd#d$Zed
dd%ed&ed'ed" ddf
d(d)Z  ZS )-DiaProcessora  
    Constructs a Dia processor which wraps a [`DiaFeatureExtractor`], [`DiaTokenizer`], and a [`DacModel`] into
    a single processor. It inherits, the audio feature extraction, tokenizer, and audio encode/decode functio-
    nalities. See [`~DiaProcessor.__call__`], [`~DiaProcessor.encode`], and [`~DiaProcessor.decode`] for more
    information.

    Args:
        feature_extractor (`DiaFeatureExtractor`):
            An instance of [`DiaFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`DiaTokenizer`):
            An instance of [`DiaTokenizer`]. The tokenizer is a required input.
        audio_tokenizer (`DacModel`):
            An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input.
    DiaFeatureExtractorDiaTokenizerDacModelc                    s   t  j|||d d S )N)audio_tokenizer)super__init__)selffeature_extractor	tokenizerr7   	__class__r   r   r9   R   s   zDiaProcessor.__init__NFtextaudiooutput_labelskwargsc           '   	   K   s  t  std|du rtd| jtfi |}|d }|d }|d }|dd}	|	dkr7t| jj d	i }
t|trB|g}nt|t	t
frRtd
d |D sVtd| j|fi |}|
| |dd}|dd}|dd}|dd}|dd}|du s|du s|du s|du rtd|r|rtd| d| d|
d jd }t|}t|}|durt|}| j|fi |}t| jjj}|d d jd | }g }g }t|d |d D ]\}}| jj}t|jdd| | }|| }|| }t ! |ddd|f | jj}| j |j!"dd}W d   n	1 s,w   Y  |s?tj#j$j%|d d!|d"}tj#j$j%|dd|d dddfd!|d"}|d | }||r^dnd7 }tj&dg| dg|  tj'd#dddf } |(| |(|  qtj)|dd}tj)|dd}n |rtj*|d|f|tj'd#}tj+|d| ftj'd$}ntd%||jd krtd&| d'|jd  d(|jd }!|!| }"| j,||!||d)d*}#tj*||!|f|tj-d+}$||$ddd|"f< | j.|$|||#d,}%|
|%|d- |rL|
d. / ddddf }&d/|&|&|k< d/|&|&|k< |&"dd0|| d1 ' |
d0< |
d. ddddf |
d.< |
d1 ddddf |
d1< t2|
|	d2S )3a  
        Main method to prepare text(s) and audio to be fed as input to the model. The `audio` argument is
        forwarded to the DiaFeatureExtractor's [`~DiaFeatureExtractor.__call__`] and subsequently to the
        DacModel's [`~DacModel.encode`]. The `text` argument to [`~DiaTokenizer.__call__`]. Please refer
        to the docstring of the above methods for more information.
        zThe `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't find it in your environment. You can install torch via `pip install torch`.Nz0You need to specify the `text` input to process.r0   r    r1   r.   r/   z% only supports `return_tensors='pt'`.c                 s   s    | ]}t |tV  qd S N)
isinstancestr).0tr   r   r   	<genexpr>}   s    z(DiaProcessor.__call__.<locals>.<genexpr>zAInvalid input text. Please provide a string, or a list of stringsr   r   r   r   r   TzTo enable processing for Dia, we need the `bos_token_id`, `eos_token_id`, `pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those.z9Labels with `generation` is incompatible, got generation=z, output_labels=.	input_idsr   padding_maskinput_valuesdim.      )r   r   r   rP   r   r   constant)padmodevaluedtype)sizerW   z;If you try to train, you should provide audio data as well.zNNeed the same amount of samples for both text and audio, but got text samples=z and audio samples = z	 instead.Fbszseq_lennum_channelsr   revert)
fill_valuerW   r@   r   r   precomputed_idx)decoder_input_idsdecoder_attention_maskra   ilabelsrb   )datatensor_type)3r   
ValueError_merge_kwargsr   popr>   r   rD   rE   r   tupleallr<   updateshapelenmaxr   r;   mathprodr7   configdownsampling_ratioszip
hop_lengthceilsumtorchno_gradtodeviceencodeaudio_codes	transposenn
functionalrS   tensorlongappendcatfullonesbuild_indicesr   apply_audio_delayclonereshape
contiguousr   )'r:   r?   r@   rA   rB   output_kwargsr0   r    r1   r.   rd   	encodingsr   audio_bos_token_idaudio_eos_token_idaudio_pad_token_idr   
batch_sizer\   	max_delayinput_audioscompression_ratemax_encoded_sequence_lenra   rb   rK   base_pad_lencurrent_audio_lenencoded_sequence_lenpadding_lenrJ   num_valid_inputsattention_maskmax_seq_lenmax_audio_lenr`   prefilldelayed_decoder_input_idsrc   r   r   r   __call__U   s   
 


,


$zDiaProcessor.__call__ra   torch.Tensoraudio_prompt_lenreturnc                 K   s  | j tfi |}|d }|dd}|dd}|dd}|du s+|du s+|du r/td|durHtj||jtjd}|d |j	d }	n|dddddf |kj
d	d
}	|j	d |dddddf |kj
d	d
 d }
|j	\}}}| j||||dd}| j|d	d	|ddd}g }t < t|	j	d D ]+}||dd|	| |
| f d }|| jj}| jj|dj  }|| qW d   |S 1 sw   Y  |S )a  
        Decodes a batch of audio codebook sequences into their respective audio waveforms via the
        `audio_tokenizer`. See [`~DacModel.decode`] for more information.

        Args:
            decoder_input_ids (`torch.Tensor`): The complete output sequence of the decoder.
            audio_prompt_len (`int`): The audio prefix length (e.g. when using voice cloning).
        r    r   Nr   r   zTo enable decoding for Dia, we need the `bos_token_id`, `pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those.)rz   rW   r   rL   rN   rP   TrY   r_   rQ   )N.)r|   )rg   r   rh   rf   rw   r   rz   r   expandrl   rv   r   r   r}   rx   rangery   r7   decodeaudio_valuescpusqueezer   )r:   ra   r   rB   r   r    r   r   r   start_of_generation_idxend_of_generation_idxrZ   r[   r\   r`   output_sequencesaudiosioutput_iaudio_ir   r   r   batch_decode  s^   ".

"
zDiaProcessor.batch_decodec                 K   s<   |j d dkrtd|j d  d| j||fi |d S )z
        Decodes a single sequence of audio codebooks into the respective audio waveform via the
        `audio_tokenizer`. See [`~DacModel.decode`] and [`~DiaProcessor.batch_decode`] for more information.
        r   rP   z5Expecting a single output to be decoded but received z samples instead.)rl   rf   r   )r:   ra   r   rB   r   r   r   r   I  s
   
zDiaProcessor.decoderb   c                 K   sH   | j tfi |}|d }|dd}|du rtd|jd t| S )z0Utility function to get the audio prompt length.r    r   NzTo enable the utility of retrieving the prompt length for Dia, we need the `delay_pattern`. You may have accidentally overwritten this.rP   )rg   r   rh   rf   rl   rn   )r:   rb   rB   r   r    r   r   r   r   get_audio_prompt_lenZ  s   z!DiaProcessor.get_audio_prompt_lensaving_pathc           	      K   s   t  stdt|}t|ttfr|g}nt|ttfr&tdd |D s*t	dt
|t
|kr6t	d| jtfi |}|d }|d }t||D ]\}}t|tjr^|   }t||| qLd S )Nz/Please install `soundfile` to save audio files.c                 s   s    | ]
}t |ttfV  qd S rC   )rD   rE   r   )rF   pr   r   r   rH     s    z*DiaProcessor.save_audio.<locals>.<genexpr>zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer    r-   )r   ImportErrorr   rD   rE   r   r   ri   rj   rf   rm   rg   r   rs   rw   Tensorr   floatnumpysfwrite)	r:   r@   r   rB   r   r    r-   audio_valuer   r   r   r   
save_audioo  s*    zDiaProcessor.save_audiorZ   r[   r\   r   r]   )r   r   c                 C   s   t j|t jd}t j|t jddddf | |d }|s*||ddddf  }n||ddddf  }t |d|d }t j| t jdddddf | ||}t j|t jdddddf | ||}	t j|d|d|	dgdd }
||
fS )a  
        Precompute (sequence_idx, all_idx) so that out[seq, channel] = in[seq - delay[channel], channel]
        or in[seq, channel] = out[seq + delay[channel], channel] if `revert`.
        Negative sequence_idx => BOS; sequence_idx >= seq_len => PAD.
        rV   N).Nr   rP   rL   rN   )	rw   r   int32aranger   clampstackr   r   )rZ   r[   r\   r   r]   delay_arraysequence_idxvalid_sequence_idx	batch_idxchannel_idxall_idxr   r   r   r     s   (((zDiaProcessor.build_indicesr   r   r`   c              	   C   s   | j }|\}}||}||}tj|dd\}}}	| |||	f |  }
|dk }|| jd k}t||t|||
}|S )a  
        Applies or reverts the delay pattern to batched audio tokens using precomputed indices,
        inserting BOS where sequence_idx < 0 and PAD where sequence_idx >= seq_len.

        Args:
            audio: audio tokens of shape [bsz, seq_len, num_channels]
            pad_token_id: the PAD token
            bos_token_id: the BOS token
            precomputed_idx: from `build_indices`

        Returns:
            final_audio: delayed or reverted audio tokens of shape [bsz, seq_len, num_channels]
        rL   rN   r   rP   )rz   ry   rw   unbindviewrX   rl   where)r@   r   r   r`   rz   r   r   r   r   r   gathered_audiomask_bosmask_padfinal_audior   r   r   r     s   

zDiaProcessor.apply_audio_delay)NFrC   )F)r   r   r   __doc__feature_extractor_classtokenizer_classaudio_tokenizer_classr9   r   rE   r   r   r   r   r   r   r   r   r   r   r   r   r   staticmethodri   r   r   __classcell__r   r   r=   r   r3   >   s    
 1
J


""r3   )r   ro   pathlibr   typingr   r   audio_utilsr   r   feature_extraction_utilsr   processing_utilsr	   r
   r   r   utilsr   r   rw   	soundfiler   r   r   r3   __all__r   r   r   r   <module>   s&      
