o
    eiI                     @   sf   d Z ddlZddlmZmZ ddlmZmZ ddl	m
Z
mZmZmZmZ eeZG dd deZdS )	zZ
Sequence feature extraction class for common feature extractors to preprocess sequences.
    N   )is_valid_audio
load_audio)BatchFeatureFeatureExtractionMixin)PaddingStrategy
TensorTypeis_torch_tensorloggingto_numpyc                       sf  e Zd ZdZdededef fddZ							dd
eee B e	e
ef B e	e
ee f B ee	e
ef  B dee
B eB dedB dededB dedB de
eB dB defddZdejddfd
e	e
ejf eB dedB dededB dedB de	fddZ			dd
e	e
ejf eB dedB dedB dedB fddZd ddZde
ee
 B eee
  B fddZ  ZS )!SequenceFeatureExtractora  
    This is a general feature extraction class for speech recognition.

    Args:
        feature_size (`int`):
            The feature dimension of the extracted features.
        sampling_rate (`int`):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`):
            The value that is used to fill the padding values / vectors.
    feature_sizesampling_ratepadding_valuec                    sD   || _ || _|| _|dd| _|dd| _t jdi | d S )Npadding_siderightreturn_attention_maskT )r   r   r   popr   r   super__init__)selfr   r   r   kwargs	__class__r   l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/feature_extraction_sequence_utils.pyr   )   s   z!SequenceFeatureExtractor.__init__TNFprocessed_featurespadding
max_length
truncationpad_to_multiple_ofr   return_tensorsreturnc                    s  t ttfrt d ttfrfddd  D jd vr5tdjd  dt  jd  }|durB|nj}t	|dkrS|rQg d< S |d }	t |	ttfrd}
t	||
 dkrt|
d7 }
t	||
 dksh|
t	|k r||
 d }	|du rt
|	rd	}nt |	tttttjfrd
}ntd|	 dt|	 d D ]\}}t |d ttfrt||< qdd |D |< qj||d}jd  }t	| t fdd D stdg }t D ]fdd D }j||||d}|| q|tjkr!tfdd|D }tj}i }t D ]9j| ||||d}| D ]%\}}||vrFg ||< |jttju rV|tj }|| | q9q't||dS )a2  
        Pad input values / input vectors or a batch of input values / input vectors up to predefined length or to the
        max sequence length in the batch.

        Padding side (left/right) padding values are defined at the feature extractor level (with `self.padding_side`,
        `self.padding_value`)

        <Tip>

        If the `processed_features` passed are dictionary of numpy arrays or PyTorch tensors  the
        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
        PyTorch tensors, you will lose the specific device of your tensors however.

        </Tip>

        Args:
            processed_features ([`BatchFeature`], list of [`BatchFeature`], `dict[str, list[float]]`, `dict[str, list[list[float]]` or `list[dict[str, list[float]]]`):
                Processed inputs. Can represent one input ([`BatchFeature`] or `dict[str, list[float]]`) or a batch of
                input values / vectors (list of [`BatchFeature`], *dict[str, list[list[float]]]* or *list[dict[str,
                list[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
                collate function.

                Instead of `list[float]` you can have tensors (numpy arrays or PyTorch tensors),
                see the note above for the return type.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
        r   c                    s    i | ]   fd dD qS )c                    s   g | ]}|  qS r   r   ).0examplekeyr   r   
<listcomp>|       z;SequenceFeatureExtractor.pad.<locals>.<dictcomp>.<listcomp>r   )r#   )r   r%   r   
<dictcomp>{   s    z0SequenceFeatureExtractor.pad.<locals>.<dictcomp>zYou should supply an instance of `transformers.BatchFeature` or list of `transformers.BatchFeature` to this method that includes z, but you provided Nattention_maskr   ptnpztype of z
 unknown: z6. Should be one of a python, numpy, or pytorch object.c                 S   s   g | ]}t |qS r   )r   r#   vr   r   r   r'      r(   z0SequenceFeatureExtractor.pad.<locals>.<listcomp>)r   r   c                 3   s    | ]	}t | kV  qd S )N)lenr-   )
batch_sizer   r   	<genexpr>   s    z/SequenceFeatureExtractor.pad.<locals>.<genexpr>zLSome items in the output dictionary have a different batch size than others.c                    s   i | ]	\}}||  qS r   r   )r#   kr.   )ir   r   r)      s    )r   r    r   c                 3   s"    | ]}t | jd   V  qdS )r   N)r/   model_input_names)r#   input_slicer   r   r   r1      s     )r   padding_strategyr    r   )tensor_type)!
isinstancelisttupledictr   keysr4   
ValueErrorr   r/   r	   intfloatr,   ndarraytypeitemsr   _get_padding_strategiesallvaluesrange	_truncateappendr   LONGESTmax
MAX_LENGTH_paddtypefloat64astypefloat32)r   r   r   r   r   r    r   r!   required_inputfirst_elementindexr&   valuer7   truncated_inputsinputsinputs_slicebatch_outputsoutputsr   )r0   r3   r   r   r   pad3   s    F



zSequenceFeatureExtractor.padr7   c           
      C   sx  || j d  }|tjkrt|}|dur&|dur&|| dkr&|| d | }|tjko0t||k }|rCd|vrCtjt|tjd|d< |r|t| }| jdkr~|r^t	|d d|f|d< | j
dkrid|fdfnd|f}	tj	||	d| jd	|| j d < |S | jd
kr|rt	|d |df|d< | j
dkr|dfdfn|df}	tj	||	d| jd	|| j d < |S tdt| j |S )a  
        Pad inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            processed_features (`Union[dict[str, np.ndarray], BatchFeature]`):
                Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
                of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see below)
            padding_strategy (`PaddingStrategy`, *optional*, default to `PaddingStrategy.DO_NOT_PAD`):
                PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The feature_extractor padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of (`int`, *optional*):
                Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to
                enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs
                which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Set to False to avoid returning attention mask (default: set to model specifics)
        r   Nr   r*   )rN   r   )r   r   constant)constant_valuesleftzInvalid padding strategy:)r4   r   rJ   r/   
DO_NOT_PADr,   onesint32r   r[   r   r   r>   str)
r   r   r   r7   r    r   rR   needs_to_be_padded
differencepadding_shaper   r   r   rM      s@   "




zSequenceFeatureExtractor._padc                 C   s   |s|S |r|du rt d|| jd  }|dur+|dur+|| dkr+|| d | }t||k}|rQ|| jd  d| || jd < d|v rQ|d d| |d< |S )a  
        Truncate inputs to predefined length or max length in the batch

        Args:
            processed_features(`Union[dict[str, np.ndarray], BatchFeature]`):
                Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
                of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
            max_length (`int`, *optional*):
                maximum length of the returned list and optionally padding length (see below)
            pad_to_multiple_of (`int`, *optional*) :
                Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to
                enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs
                which benefit from having sequence lengths be a multiple of 128.
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
        NzKWhen setting ``truncation=True``, make sure that ``max_length`` is defined.r   r   r*   )r>   r4   r/   )r   r   r   r    r   rR   needs_to_be_truncatedr   r   r   rH   %  s    z"SequenceFeatureExtractor._truncatec                 C   s   |dur|du rt j}nt|t st |}nt|t r|}nt j}|du r3|t jkr3tdt j d|t jkrA| jdu rAtd|S )z3
        Find the correct padding strategy
        FTNzWhen setting ``padding=z(``, make sure that max_length is definedzAsking to pad but the feature_extractor does not have a padding value. Please select a value to use as `padding_value`. For example: `feature_extractor.padding_value = 0.0`.)r   rJ   r9   r_   rL   r>   r   )r   r   r   r7   r   r   r   rD   P  s&   



z0SequenceFeatureExtractor._get_padding_strategiesaudio_url_or_urlsc                    sL   t |tr fdd|D S t |trt|S t|r|S tdt| )z
        Convert a single or a list of urls into the corresponding `np.ndarray` objects.

        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
        returned.
        c                    s   g | ]}  |qS r   )fetch_audio)r#   xr6   r   r   r'   x  s    z8SequenceFeatureExtractor.fetch_audio.<locals>.<listcomp>z=only a single or a list of entries is supported but got type=)r9   r:   rb   r   r   	TypeErrorrB   )r   rg   r   r6   r   rh   p  s   

z$SequenceFeatureExtractor.fetch_audio)TNFNNN)NNN)FN)__name__
__module____qualname____doc__r?   r@   r   r   r:   r<   rb   boolr   r   r[   r_   r,   rA   rM   rH   rD   rh   __classcell__r   r   r   r   r      s    

	


 .
K

+* r   )rn   numpyr,   audio_utilsr   r   feature_extraction_utilsr   r   utilsr   r   r	   r
   r   
get_loggerrk   loggerr   r   r   r   r   <module>   s   
