o
    ei7                     @   s   d Z ddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ ddlmZ d	d
lmZ eeZeG dd deZdgZdS )z
Processor class for Bark
    N   )BatchFeature)ProcessorMixin)BatchEncoding)auto_docstringlogging)cached_file   )AutoTokenizerc                       s   e Zd ZddddZd fdd	Ze	d dd	Z		
	d!def fddZdde	dB fddZ
ddedB fddZedefddZd"defddZe							d#defddZ  ZS )$BarkProcessor   r	   semantic_promptcoarse_promptfine_promptNc                    s   t  | || _dS )a*  
        speaker_embeddings (`dict[dict[str]]`, *optional*):
            Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g
            `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`
            embeddings. The values correspond to the path of the corresponding `np.ndarray`. See
            [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for
            a list of `voice_preset_names`.
        N)super__init__speaker_embeddings)self	tokenizerr   	__class__ f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/bark/processing_bark.pyr   *   s   	
zBarkProcessor.__init__speaker_embeddings_path.jsonc           	      K   s   | d}|dur`t|||dd|dd|dd|dd|dd||d	ddddd
}|du rFtdtj|| d d}nt|}t	
|}W d   n1 sZw   Y  nd}|durnd|v rn||d< tj|fi |}| ||dS )a  
        Instantiate a Bark processor associated with a pretrained model.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained [`BarkProcessor`] hosted inside a model repo on
                  huggingface.co.
                - a path to a *directory* containing a processor saved using the [`~BarkProcessor.save_pretrained`]
                  method, e.g., `./my_model_directory/`.
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file containing the speaker_embeddings dictionary located in
                `pretrained_model_name_or_path`. If `None`, no speaker_embeddings is loaded.
            **kwargs
                Additional keyword arguments passed along to both
                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
        tokenN	subfolder	cache_dirforce_downloadFproxieslocal_files_onlyrevision
r   r   r   r   r    r   r!    _raise_exceptions_for_gated_repo%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errors`z` does not exists
                    , no preloaded speaker embeddings will be used - Make sure to provide a correct path to the json
                    dictionary if wanted, otherwise set `speaker_embeddings_dict_path=None`.repo_or_path)r   r   )getr   poploggerwarningospathjoinopenjsonloadr
   from_pretrained)	cls!pretrained_processor_name_or_pathspeaker_embeddings_dict_pathkwargsr   speaker_embeddings_pathr   speaker_embeddings_jsonr   r   r   r   r2   7   s>   







zBarkProcessor.from_pretrainedr   Fpush_to_hubc              
      s  | j durwtjtj||ddd i }||d< | jD ];}| |}i }	| j | D ](}
tjtj|d || d|
 ||
 dd tj|| d|
 d	|	|
< q)|	||< qt	tj||d
}t
|| W d   n1 srw   Y  t j||fi | dS )a|  
        Saves the attributes of this processor (tokenizer...) in the specified directory so that it can be reloaded
        using the [`~BarkProcessor.from_pretrained`] method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the tokenizer files and the speaker embeddings will be saved (directory will be created
                if it does not exist).
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file that will contains the speaker_embeddings nested path dictionary, if it
                exists, and that will be located in `pretrained_model_name_or_path/speaker_embeddings_directory`.
            speaker_embeddings_directory (`str`, *optional*, defaults to `"speaker_embeddings/"`):
                The name of the folder in which the speaker_embeddings arrays will be saved.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs:
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        Nv2T)exist_okr'   _F)allow_picklez.npyw)r   r,   makedirsr-   r.   available_voice_presets_load_voice_presetnpsaver/   r0   dumpr   save_pretrained)r   save_directoryr5   speaker_embeddings_directoryr9   r6   embeddings_dict
prompt_keyvoice_presettmp_dictkeyfpr   r   r   rE   q   s*   


 
zBarkProcessor.save_pretrainedrJ   c                 K   s   | j | }i }|d}dD ]b}||vrtd| d| dt| j dd|| |dd |d	d |d
d|dd |dd||dd dddd}|d u ritdtj| j dd||  d| dt	|||< q|S )Nr   r   #Voice preset unrecognized, missing z% as a key in self.speaker_embeddings[z].r'   /r   r   r   Fr   r    r!   r"   r&   z{` does not exists
                    , no preloaded voice preset will be used - Make sure to provide correct paths to the z 
                    embeddings.)
r   r(   
ValueErrorr   r)   r,   r-   r.   rB   r1   )r   rJ   r6   voice_preset_pathsvoice_preset_dictr   rL   r-   r   r   r   rA      s<   







 z BarkProcessor._load_voice_presetc                 C   s   dD ]@}||vrt d| dt|| tjs't| dt| j|  dt|| j| j| krBt | dt| j|  dqd S )Nr   rN   z
 as a key.z voice preset must be a z
D ndarray.)	rP   
isinstancerB   ndarray	TypeErrorstrpreset_shapelenshape)r   rJ   rL   r   r   r   _validate_voice_preset_dict   s   z)BarkProcessor._validate_voice_preset_dictreturnc                 C   s2   | j du rg S t| j  }d|v r|d |S )z
        Returns a list of available voice presets.

        Returns:
            `list[str]`: A list of voice preset names.
        Nr'   )r   listkeysremove)r   voice_presetsr   r   r   r@      s   

z%BarkProcessor.available_voice_presetsTremove_unavailablec              	   C   s   g }| j d urC| jD ]}z| |}W n ty!   || Y q
w | | q
|r8tdt| d| d |rE|D ]
}| j |= q<d S d S d S )NzThe following z' speaker embeddings are not available: zU If you would like to use them, please check the paths or try downloading them again.)	r   r@   rA   rP   appendrZ   r*   r+   rX   )r   r`   unavailable_keysrJ   rR   r   r   r   _verify_speaker_embeddings   s(   



z(BarkProcessor._verify_speaker_embeddingspt   c           
   	   K   s   |dur1t |ts1t |tr| jdur|| jv r| |}nt |tr,|ds,|d }t|}|durD| j|fi | t	||d}| j
|f|d||||d|}	|dur\||	d< |	S )a  
        voice_preset (`str`, `dict[np.ndarray]`):
            The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g
            `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or
            it can be a valid file name of a local `.npz` single voice preset containing the keys
            `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] object containing the output of the `tokenizer`.
            If a voice preset is provided, the returned object will include a `"history_prompt"` key
            containing a [`BatchFeature`], i.e the voice preset with the right tensors type.
        Nz.npz)datatensor_type
max_length)return_tensorspaddingrh   return_attention_maskreturn_token_type_idsadd_special_tokenshistory_prompt)rS   dictrV   r   rA   endswithrB   r1   rZ   r   r   )
r   textrJ   ri   rh   rm   rk   rl   r6   encoded_textr   r   r   __call__   s6   


zBarkProcessor.__call__)N)r   )r   r   F)T)NNrd   re   FTF)__name__
__module____qualname__rW   r   classmethodr2   boolrE   rV   rA   ro   rZ   propertyr\   r@   rc   r   r   rs   __classcell__r   r   r   r   r   "   s>    <8$
r   )__doc__r0   r,   numpyrB   feature_extraction_utilsr   processing_utilsr   tokenization_utils_baser   utilsr   r   	utils.hubr   autor
   
get_loggerrt   r*   r   __all__r   r   r   r   <module>   s    
  
