o
    }oi                     @   sz   d dl mZmZmZ d dlZd dlmZ d dlmZm	Z	m
Z
mZmZ G dd dejZG dd deZG dd	 d	eZdS )
    )ListOptionalUnionN)CLIPTextModelCLIPTokenizerT5ConfigT5EncoderModelT5Tokenizerc                	       s
  e Zd ZdZ			ddedeee  deee  ddf fdd	Ze	defd
dZ
e	deeejf fddZe	defddZe
jdefddZ
ejdeeejf fddZejdefddZe
jdd Z
ejdd Zejdd Zdd Zdd Z  ZS )AbstractEmbModelz 
    Abstract encoder model
    FNenable_lora_finetunetarget_blocktarget_modulereturnc                    sB   t    d | _d | _d | _|pg | _|pg | _|rg | _d S d S N)super__init___is_trainable	_ucg_rate
_input_keyTARGET_BLOCKTARGET_MODULElora_layers)selfr   r   r   	__class__ c/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/diffusion/encoders/conditioner.pyr      s   



zAbstractEmbModel.__init__c                 C      | j S r   r   r   r   r   r   is_trainable,      zAbstractEmbModel.is_trainablec                 C   r   r   r   r   r   r   r   ucg_rate0   r!   zAbstractEmbModel.ucg_ratec                 C   r   r   r   r   r   r   r   	input_key4   r!   zAbstractEmbModel.input_keyvaluec                 C   
   || _ d S r   r   r   r&   r   r   r   r    8      
c                 C   r'   r   r"   r(   r   r   r   r#   <   r)   c                 C   r'   r   r$   r(   r   r   r   r%   @   r)   c                 C      | ` d S r   r   r   r   r   r   r    D      c                 C   r*   r   r"   r   r   r   r   r#   H   r+   c                 C   r*   r   r$   r   r   r   r   r%   L   r+   c                 O      t r   NotImplementedError)r   argskwargsr   r   r   encodeP      zAbstractEmbModel.encodec                 C   r,   r   r-   )r   
lora_modelr   r   r   _enable_loraS   r2   zAbstractEmbModel._enable_lora)FNN)__name__
__module____qualname____doc__boolr   r   strr   propertyr    r   floattorchTensorr#   r%   setterdeleterr1   r4   __classcell__r   r   r   r   r
      sD    




r
   c                       sV   e Zd ZdZg dZdddddddejf fd	d
	Zdd ZdddZ	dd Z
  ZS )FrozenCLIPEmbedderz>Uses the CLIP transformer encoder for text (from Hugging Face))lastpooledhiddenopenai/clip-vit-large-patch14cudaM   FrC   Nc	           	         s   t  j|ddgdgd td| _tj||d|| _|| _|| _	| 
  |r:| | j tdt| j d || _|| _|| _|d	kr]|d usMJ d
t|  krZdks_J  J d S d S )NCLIPAttentionCLIPMLPLinear)r   r   rF   torch_dtypezCLIP transformer encoder add z lora layers.rE   r      )r   r   r   from_pretrained	tokenizerr   totransformerdevice
max_lengthfreezer4   printlenr   layer	layer_idxreturn_pooledabs)	r   versionrS   rT   r   rX   rY   always_return_pooleddtyper   r   r   r   \   s"    zFrozenCLIPEmbedder.__init__c                 C   $   | j  | _ |  D ]}d|_q
d S NFrR   eval
parametersrequires_gradr   paramr   r   r   rU   x      zFrozenCLIPEmbedder.freezec              	   C   s   | j |d|r|n| jddddd}|d j| jjdd}| j|| jdkd	}| jd
kr/|j}n| jdkrA|jd d d d d f }n|j| j	 }|j
d d d d d }tjjj|ddd||j
d  fdd}| jro||jfS |S )NTFrT   pt
truncationrT   return_lengthreturn_overflowing_tokenspaddingreturn_tensors	input_idsnon_blockingrE   ro   output_hidden_statesrC   rD         r   g        )r&   )rP   rT   rQ   rR   rS   rX   last_hidden_statepooler_outputhidden_statesrY   shaper=   nn
functionalpadrZ   )r   textmax_sequence_lengthbatch_encodingtokensoutputszseq_lenr   r   r   forward}   s*   	

&
zFrozenCLIPEmbedder.forwardc                 C   s   | |S r   r   )r   r}   r   r   r   r1      s   zFrozenCLIPEmbedder.encoder   )r5   r6   r7   r8   LAYERSr=   r<   r   rU   r   r1   rA   r   r   r   r   rB   W   s    
rB   c                       s@   e Zd ZdZdddejdf fdd	Zdd	 ZdddZ  Z	S )FrozenT5Embedderz(
    FrozenT5 encoder model from HF
    google/t5-v1_1-xxli   rG   Fc                    sp   t    tjd|d| _|rtd t|}t|| _ntj||d	|| _|| _
|   || _|| _d S )Nr   )rT   z9T5 will be randomly initialized for testing purpose only!rL   )r   r   r	   rO   rP   rV   r   r   rR   rQ   rT   rU   rS   r^   )r   r\   rT   rS   r^   load_config_onlyconfigr   r   r   r      s   


zFrozenT5Embedder.__init__c                 C   r_   r`   ra   re   r   r   r   rU      rg   zFrozenT5Embedder.freezeNc              	   C   sL   | j |d|r|n| jddddd}|d j| jjdd}| j|d d}|jS )	NTFrT   rh   ri   ro   rp   rr   )rP   rT   rQ   rR   rS   rv   )r   r}   r~   r   r   r   r   r   r   r      s   
zFrozenT5Embedder.forwardr   )
r5   r6   r7   r8   r=   r<   r   rU   r   rA   r   r   r   r   r      s    r   )typingr   r   r   r=   torch.nnrz   transformersr   r   r   r   r	   Moduler
   rB   r   r   r   r   r   <module>   s   @E