o
    wi                     @   s   d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d d	lmZ d d
lmZ defddZdejfdedefddZedd ZdejjfddZddejjdefddZdS )    )contextmanager)PathN)	open_dict)	PeftModel)
AutoConfigAutoModelForCausalLM)ASRModel)AudioPerceptionModule)fp32_precision)AudioCodecModelmodel_path_or_namec                 C   s*   t | r|dr| |S | |S )z
    Load pretrained NeMo 1.0 model (inheriting from ModelPT). Works with ASR, TTS, codec models.

    Setting ``pretrained_weights=False`` returns a model that has identical architecture with the checkpoint,
    but is randomly initialized.
    z.nemo)r   existsendswithrestore_fromfrom_pretrained)clsr    r   h/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/speechlm2/parts/pretrained.pyload_pretrained_nemo   s   

r   Tpretrained_weightsc                 C   s*   |r	t j| |dS t| }t j||dS )z
    Load pretrained HuggingFace AutoModelForCausalLM.

    Setting ``pretrained_weights=False`` returns a model that has identical architecture with the checkpoint,
    but is randomly initialized.
    )torch_dtype)r   r   r   from_config)r   r   dtypeconfigr   r   r   load_pretrained_hf*   s   
r   c                 c   sZ    t | jtr| j| jjjj_n| j| jj_dV  t | jtr'| jjjj`dS | jj`dS )zKTemporarily restores the embedding layer into HF LLM. Supports LoRA models.N)
isinstancellmr   embed_tokens
base_modelmodel)r   r   r   r   move_embedding8   s   r    r   c                 C   s~   t | drt| j jtjkrdS t  tt	| j
j | _W d   n1 s*w   Y  | j D ]}d|_q4| j`dS )z
    Sets up an ``AudioCodecModel``, initializing it from pretrained weights.
    The result is assigned to ``model.audio_codec`` attribute.

    Includes a workaround for PTL auto-downcasting the codec model to bf16 with bf16-true precision.
    audio_codecNF)hasattrnextr!   
parametersr   torchfloatr
   r   r   cfgpretrained_audio_codecevalrequires_graddiscriminator)r   pr   r   r   setup_audio_codecF   s    
r-   c                 C   s   |rKt t| jj }t| j |jj| jj_|jj| jj_| j	j
j| jj_W d   n1 s1w   Y  t| jj | _| jj| dd dS t| jj | _dS )z
    Sets up an ``AudioPerceptionModule``, initializing its ``encoder`` and ``preprocessor``
    with a pretrained NeMo ``ASRModel``.
    The result is assigned to ``model.perception`` attribute and is trainable.
    NF)strict)r   r   r'   pretrained_asrr)   r   preprocessor
perceptionencoderr   r   hidden_size
output_dimr	   trainload_state_dict
state_dict)r   r   asrr   r   r   setup_speech_encoderV   s   r9   )T)
contextlibr   pathlibr   r%   	omegaconfr   peftr   transformersr   r   nemo.collections.asr.modelsr   "nemo.collections.speechlm2.modulesr	   *nemo.collections.speechlm2.parts.precisionr
   nemo.collections.tts.modelsr   strr   float32boolr   r    nnModuler-   r9   r   r   r   r   <module>   s    
