o
    ߥi                     @   s   d Z ddlZddlZddlZddlm  mZ ddlmZ ddlm	Z	m
Z
 G dd dejZG dd dejZG d	d
 d
ejZdS )z* Generative Multimodal Model Architecture.    N)nn)	gemm_base	tokenizerc                       s*   e Zd ZdZ fddZdddZ  ZS )ImageEncoderz4Image Feature Encoder
    ViT Style Transformer
    c              	      sB   t    |d d \}}}}}tj|||||d |dd| _d S )N   @   F)input_resolution
patch_sizewidthlayersheads
output_dimuse_gc)super__init__r   VisualTransformervisual)selfconfigs	embed_dimimage_resolutionvision_layersvision_widthvision_patch_size	__class__ \/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/rleg/model.pyr      s   


zImageEncoder.__init__Fc                 C   sJ   |  |}|d d dd d d f }|d d dd d f }|r#||fS |S )N   r   )r   )r   imagereturn_tokensfeaturestokens	embeddingr   r   r   forward%   s   
zImageEncoder.forwardF)__name__
__module____qualname____doc__r   r$   __classcell__r   r   r   r   r      s    r   c                       s4   e Zd ZdZ fddZd
ddZddd	Z  ZS )TextEncoderz4Text Feature Encoder
    BERT style transformer
    c                    s   t    |dd  \}}}}}tj|||| |d| _t||| _t	t
||| _t|| _t	t
||d | _d S )N)r
   r   r   	attn_maskr   )r   r   r   Transformerbuild_attention_masktransformerr   	Embeddingtoken_embedding	Parametertorchemptypositional_embedding	LayerNormln_finaltext_projection)r   r   context_length
vocab_sizemodel_widthmodel_headsmodel_layersr   r   r   r   1   s$   




zTextEncoder.__init__Nc                 C   s   t ||d }|d |S )Ng     r   )r4   onestriu_)r   
seq_lengthmaskr   r   r   r/   D   s   
z TextEncoder.build_attention_maskFc                 C   s|   |  |}|| j }|ddd}| |}|ddd}| |}|t|jd |jdddf | j	 }|r<||fS |S )Nr   r      )dim.)
r2   r6   permuter0   r8   r4   arangeshapeargmaxr9   )r   textr    xr#   r   r   r   r$   I   s   



zTextEncoder.forwardNr%   )r&   r'   r(   r)   r   r/   r$   r*   r   r   r   r   r+   ,   s
    
r+   c                       sR   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Ze	
 dddZ  ZS )	RLEGModelz Generative multi-modal model, trained with RLEG method.
    It takes image or text or both of them as input, and produce
    the corresponding features of inputs.
    c                    s   t    td|ddd}t| }W d    n1 s!w   Y  t| d }|| }t	j
|d}t|| _t|| _t|| _ttg | _d S )Nz{}/encoder_config.jsonrzutf-8)encodingr   zbpe_vocab_16e6.txt.gz)r   r   openformatjsonloadsreadlistkeysospathjoinr   SimpleTokenizerr   image_encoderr+   text_encoderr   r3   r4   r?   logit_scale)r   	model_dirfmodel_config
model_nameconfig_argsbpe_pathr   r   r   r   \   s   



zRLEGModel.__init__c                 C   s   t | j |gd }|S )Nr   )r   clip_tokenize)r   text_strtext_tensorr   r   r   tokenizek   s   zRLEGModel.tokenizec                 C      |  |}tj|ddd}|S NrC   rD   )prE   )r\   F	normalize)r   rJ   featurer   r   r   encode_texto      
zRLEGModel.encode_textc                 C   rh   ri   )r[   rk   rl   )r   r   rm   r   r   r   encode_imaget   ro   zRLEGModel.encode_imagec                 C   s   |   }|S rL   )cpunumpy)r   featoutr   r   r   
parse_featy   s   zRLEGModel.parse_featNc                 C   sF   d\}}|dur|  | |}|dur|  | |}||d}|S )zW It takes image or text as input,
        and extracts the features as output.
        NNN)image_featuretext_feature)ru   rp   rn   )r   r   rJ   img_featurerx   rt   r   r   r   r$   }   s   zRLEGModel.forwardrv   )r&   r'   r(   r)   r   rg   rn   rp   ru   r4   no_gradr$   r*   r   r   r   r   rM   V   s    rM   )r)   rW   rR   r4   torch.nn.functionalr   
functionalrk   "modelscope.models.multi_modal.gemmr   r   Moduler   r+   rM   r   r   r   r   <module>   s   *