o
    ei                     @   s:   d Z ddlmZ ddlmZ eG dd deZdgZdS )z$Speech processor class for SpeechT5.   )ProcessorMixin)auto_docstringc                       s0   e Zd Z fddZedd Zdd Z  ZS )SpeechT5Processorc                    s   t  || d S )N)super__init__)selffeature_extractor	tokenizer	__class__ n/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/speecht5/processing_speecht5.pyr      s   zSpeechT5Processor.__init__c                 O   sd  | dd }| dd }| dd }| dd }| dd }|d ur*|d ur*td|d ur6|d ur6td|d u rJ|d u rJ|d u rJ|d u rJtd|d ur]| j|g|R d|i|}n|d urk| j|fi |}nd }|d ur| j|||d	|}	|	d
 }
n|d ur| j|fi |}	|	d }
nd }	|d u r|	S |	d ur|
|d< |	d}|d ur||d< |S )Naudiotexttext_targetaudio_targetsampling_ratez\Cannot process both `audio` and `text` inputs. Did you mean `audio_target` or `text_target`?z\Cannot process both `audio_target` and `text_target` inputs. Did you mean `audio` or `text`?zaYou need to specify either an `audio`, `audio_target`, `text`, or `text_target` input to process.)r   r   input_values	input_idslabelsattention_maskdecoder_attention_mask)pop
ValueErrorr   r	   get)r   argskwargsr   r   r   r   r   inputstargetsr   r   r   r   r   __call__   sJ    


zSpeechT5Processor.__call__c           
      O   sd  | dd}| dd}| dd}|dur|durtd|du r.|du r.|du r.td|dur@| jj|g|R i |}n|durO| jj|fi |}nd}|durd|v sdt|trsd|d v rs| jj|fi |}|d }n"| jj}| jj| j_| jj|g|R i |}|| j_|d }nd}|du r|S |dur||d< |	d}	|	dur|	|d	< |S )
au  
        Collates the audio and text inputs, as well as their targets, into a padded batch.

        Audio inputs are padded by SpeechT5FeatureExtractor's [`~SpeechT5FeatureExtractor.pad`]. Text inputs are padded
        by SpeechT5Tokenizer's [`~SpeechT5Tokenizer.pad`].

        Valid input combinations are:

        - `input_ids` only
        - `input_values` only
        - `labels` only, either log-mel spectrograms or text tokens
        - `input_ids` and log-mel spectrogram `labels`
        - `input_values` and text `labels`

        Please refer to the docstring of the above two methods for more information.
        r   Nr   r   z:Cannot process both `input_values` and `input_ids` inputs.zZYou need to specify either an `input_values`, `input_ids`, or `labels` input to be padded.    r   r   )
r   r   r   padr	   
isinstancelistfeature_sizenum_mel_binsr   )
r   r   r   r   r   r   r   r   feature_size_hackr   r   r   r   r!   J   s@   


zSpeechT5Processor.pad)__name__
__module____qualname__r   r   r   r!   __classcell__r   r   r
   r   r      s
    
0r   N)__doc__processing_utilsr   utilsr   r   __all__r   r   r   r   <module>   s   
r