o
    ߥi                     @   sX   d dl Zd dlZd dlZd dlmZ d dlmZ G dd dej	Z
G dd dej	ZdS )    Nc                       sh   e Zd ZdZddgZ						d fd	d
	Zdd Zdd Zdd Zdde	j
fddZdd Z  ZS )FrozenOpenCLIPEmbedder8
    Uses the OpenCLIP transformer encoder for text
    lastpenultimateViT-H-14laion2b_s32b_b79kcudaM   Tc           	         s   t    || jv sJ tj|td|d\}}}|`|| _|| _|| _	|r+| 
  || _| jdkr8d| _d S | jdkrBd| _d S t )Ncpudevice
pretrainedr   r   r      )super__init__LAYERS	open_clipcreate_model_and_transformstorchr   visualmodel
max_lengthfreezelayer	layer_idxNotImplementedError)	selfarchr   r   r   r   r   r   _	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/videocomposer/clip.pyr      s"   




zFrozenOpenCLIPEmbedder.__init__c                 C   $   | j  | _ |  D ]}d|_q
d S NFr   eval
parametersrequires_gradr   paramr!   r!   r"   r   *      zFrozenOpenCLIPEmbedder.freezec                 C   s    t |}| || j}|S N)r   tokenizeencode_with_transformertor   )r   texttokenszr!   r!   r"   forward/   s   
zFrozenOpenCLIPEmbedder.forwardc                 C   V   | j |}|| j j }|ddd}| j|| j jd}|ddd}| j |}|S Nr   r      	attn_maskr   token_embeddingpositional_embeddingpermutetext_transformer_forwardr8   ln_finalr   r0   xr!   r!   r"   r.   4      z.FrozenOpenCLIPEmbedder.encode_with_transformerNr@   c                 C   h   t | jjjD ]*\}}|t| jjj| j kr |S | jjjr+tj	 s+t
|||}q|||d}q|S Nr7   	enumerater   transformer	resblockslenr   grad_checkpointingr   jitis_scripting
checkpointr   r@   r8   irr!   r!   r"   r=   =      z/FrozenOpenCLIPEmbedder.text_transformer_forwardc                 C      | |S r,   r!   r   r0   r!   r!   r"   encodeH      zFrozenOpenCLIPEmbedder.encode)r   r   r   r	   Tr   r,   __name__
__module____qualname____doc__r   r   r   r3   r.   r   Tensorr=   rS   __classcell__r!   r!   r   r"   r   
   s    	r   c                       sj   e Zd ZdZddgZ								d fd
d	Zdd Zdd Zdd Zdde	j
fddZdd Z  ZS )FrozenOpenCLIPVisualEmbedderr   r   r   r   r   r   r	   T   r^      c                    s   t    || jv sJ tj|td|d\}}	}
|`|| _t	j
|t	jdd }|
t |d| _|
| _|| _|| _|rD|   || _| jdkrQd| _d S | jdkr[d| _d S t )	Nr
   r   )dtype   r   r   r   r   )r   r   r   r   r   r   r   rF   r   nponesuint8T
ToPILImage	unsqueezeblack_image
preprocessr   r   r   r   r   )r   r   r   r   r   r   r   input_shaper   r   ri   
data_whiter   r!   r"   r   R   s(   




z%FrozenOpenCLIPVisualEmbedder.__init__c                 C   r#   r$   r%   r)   r!   r!   r"   r   p   r+   z#FrozenOpenCLIPVisualEmbedder.freezec                 C   s   | j || j}|S r,   )r   encode_imager/   r   )r   imager2   r!   r!   r"   r3   u   s   z$FrozenOpenCLIPVisualEmbedder.forwardc                 C   r4   r5   r9   r?   r!   r!   r"   r.   z   rA   z4FrozenOpenCLIPVisualEmbedder.encode_with_transformerNr@   c                 C   rB   rC   rD   rM   r!   r!   r"   r=      rP   z5FrozenOpenCLIPVisualEmbedder.text_transformer_forwardc                 C   rQ   r,   r!   rR   r!   r!   r"   rS      rT   z#FrozenOpenCLIPVisualEmbedder.encode)r   r   r   r	   Tr   r]   r,   rU   r!   r!   r   r"   r\   L   s     	r\   )numpyrb   r   r   torch.nnnntorchvision.transforms
transformsre   Moduler   r\   r!   r!   r!   r"   <module>   s   B