from dataclasses import dataclass
from typing import Callable, Dict, Optional

import lightning.pytorch as L
import numpy as np
import torch
import torch.distributed
import torch.nn.functional as F
from megatron.core.enums import ModelType

try:
    from megatron.core.extensions.transformer_engine import TENorm
except ImportError:
    from nemo.utils import logging

    # Transformer Engine is optional: NeMo Run imports this module without it installed.
    logging.warning(
        "Failed to import Transformer Engine dependencies. "
        "`from megatron.core.extensions.transformer_engine import TENorm` "
        "If using NeMo Run, this is expected. Otherwise, please verify the Transformer Engine installation."
    )

from megatron.core.models.gpt import GPTModel as MCoreGPTModel
from megatron.core.models.vision.clip_vit_model import CLIPViTModel as MCoreCLIPViTModel
from megatron.core.optimizer import OptimizerConfig
from megatron.core.transformer import MegatronModule
from megatron.core.transformer.enums import AttnMaskType as MCoreAttnMaskType
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_config import TransformerConfig
from tqdm import tqdm

from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
from nemo.collections.llm import fn
from nemo.collections.llm.gpt.model import transformer_engine_layer_spec
from nemo.collections.llm.gpt.model.base import default_layer_spec
from nemo.collections.multimodal.data.clip.clip_dataset import build_imagenet_validation_dataloader_params
from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group
from nemo.collections.vlm.clip.loss.clip_loss import ClipMegatronLoss
from nemo.lightning import MegatronOptimizerModule, OptimizerModule, get_vocab_size, io
from nemo.utils import logging


def clip_forward_step(model, batch) -> torch.Tensor:
    """Forward step for CLIP: feed images and captions from the batch to the model."""
    forward_args = {"images": batch["images"], "captions": batch["captions"]}
    return model(**forward_args)


def clip_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
    """Fetch the next batch and move all tensors to the GPU."""
    batch = next(dataloader_iter)

    # The iterator may yield (batch, batch_idx, dataloader_idx) triples.
    if isinstance(batch, tuple) and len(batch) == 3:
        _batch = batch[0]
    else:
        _batch = batch

    # Squeeze out the extra leading dimension some caption pipelines add.
    if "captions" in _batch and len(_batch["captions"].shape) == 3:
        _batch["captions"] = _batch["captions"].squeeze()

    _batch = {key: val.cuda(non_blocking=True) if val is not None else None for key, val in _batch.items()}
    return _batch


def set_input_tensor(self, tensor):
    """No-op placeholder; these CLIP towers do not consume pipeline-parallel input tensors."""
    pass


@dataclass
class CLIPViTConfig(TransformerConfig, io.IOMixin):
    """Clip ViT model config"""

    output_dim: int = 512
    add_class_token: bool = True
    class_token_len: int = 8
    patch_dim: int = 16
    img_h: int = 224
    img_w: int = 224
    vision_model_type: str = "clip"
    transformer_layer_spec: ModuleSpec = transformer_engine_layer_spec
    gated_linear_unit: bool = False
    attention_softmax_in_fp32: bool = False

    # Placeholders so TransformerConfig validation passes; concrete configs set the real sizes.
    num_layers: int = 1
    num_attention_heads: int = 8

    def configure_model(self) -> "CLIPViTModel":
        transformer_layer_spec = self.transformer_layer_spec
        if not isinstance(transformer_layer_spec, ModuleSpec):
            from nemo.collections.vlm.layer_specs import get_layer_spec_te

            transformer_layer_spec = get_layer_spec_te(is_vit=True)
            # The ViT attends over all patches, so no attention mask is applied.
            transformer_layer_spec.submodules.self_attention.params['attn_mask_type'] = MCoreAttnMaskType.no_mask
            self.transformer_layer_spec = transformer_layer_spec
        return CLIPViTModel(
            self,
            transformer_layer_spec,
            add_class_token=self.add_class_token,
            class_token_len=self.class_token_len,
            patch_dim=self.patch_dim,
            img_h=self.img_h,
            img_w=self.img_w,
            model_subtype=self.vision_model_type,
            output_dim=self.output_dim,
        )


class CLIPViTModel(MCoreCLIPViTModel):
    """Clip ViT model"""

    def __init__(
        self,
        transformer_config: TransformerConfig,
        transformer_layer_spec: ModuleSpec,
        add_class_token: bool = True,
        class_token_len: int = 8,
        patch_dim: int = 16,
        img_h: int = 224,
        img_w: int = 224,
        model_subtype: str = "clip",
        output_dim: int = 1024,
    ):
        self.output_dim = output_dim
        super().__init__(
            transformer_config=transformer_config,
            transformer_layer_spec=transformer_layer_spec,
            add_class_token=add_class_token,
            class_token_len=class_token_len,
            patch_dim=patch_dim,
            img_h=img_h,
            img_w=img_w,
            model_subtype=model_subtype,
        )
        self.final_layernorm = TENorm(config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon)
        self.head = torch.nn.Linear(self.config.hidden_size, self.output_dim, bias=False)

    def set_input_tensor(self, tensor):
        pass

    def forward(self, x):
        x = super().forward(x)
        x = self.final_layernorm(x)
        x = x[:, 0]  # the class-token embedding is the image representation
        x = self.head(x)
        return x


@dataclass
class CLIPTextModelConfig(TransformerConfig, io.IOMixin):
    """Clip text model config"""

    output_dim: int = 512
    make_vocab_size_divisible_by: int = 128
    max_seq_length: int = 1024
    share_embeddings_and_output_weights: bool = False
    use_transformer_engine_full_layer_spec: bool = False
    transformer_layer_spec: ModuleSpec = transformer_engine_layer_spec

    def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "CLIPTextModel":
        transformer_layer_spec = self.transformer_layer_spec
        if not isinstance(transformer_layer_spec, ModuleSpec):
            transformer_layer_spec = transformer_layer_spec(self)

        if hasattr(self, 'vocab_size'):
            vocab_size = self.vocab_size
            if tokenizer is not None:
                logging.info(
                    f"Use preset vocab_size: {vocab_size}, original vocab_size: {tokenizer.vocab_size}, "
                    f"dummy tokens: {vocab_size - tokenizer.vocab_size}."
                )
        else:
            vocab_size = get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by)

        return CLIPTextModel(
            transformer_config=self,
            transformer_layer_spec=transformer_layer_spec,
            vocab_size=vocab_size,
            max_sequence_length=self.max_seq_length,
            output_dim=self.output_dim,
            share_embeddings_and_output_weights=self.share_embeddings_and_output_weights,
        )


class CLIPTextModel(MCoreGPTModel):
    """Clip text model"""

    def __init__(
        self,
        transformer_config: TransformerConfig,
        transformer_layer_spec: ModuleSpec,
        vocab_size: int,
        max_sequence_length: int,
        output_dim: int = 1024,
        share_embeddings_and_output_weights: bool = False,
    ):
        self.output_dim = output_dim
        super().__init__(
            transformer_config,
            transformer_layer_spec,
            vocab_size,
            max_sequence_length,
            pre_process=True,
            post_process=False,
            share_embeddings_and_output_weights=share_embeddings_and_output_weights,
        )
        self.final_layernorm = TENorm(config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon)
        self.head = torch.nn.Linear(self.config.hidden_size, self.output_dim, bias=False)
        self.position_ids = None
        if self.pre_process:
            self.position_ids = torch.arange(max_sequence_length).expand(1, -1).cuda()

    def forward(self, input_ids):
        x = super().forward(input_ids, self.position_ids, attention_mask=None)
        x = self.final_layernorm(x)
        # Pool at the EOT token (the highest token id in each sequence).
        x = x[input_ids.argmax(dim=-1), torch.arange(x.shape[1])]
        x = self.head(x)
        return x

    def set_input_tensor(self, tensor):
        pass


@dataclass
class CLIPConfig(TransformerConfig, io.IOMixin):
    """Clip model config"""

    text_transformer_config: Optional[CLIPTextModelConfig] = None
    vision_transformer_config: Optional[CLIPViTConfig] = None
    get_attention_mask_from_fusion: bool = True
    forward_step_fn: Callable = clip_forward_step
    data_step_fn: Callable = clip_data_step

    # Placeholders so TransformerConfig validation passes; the tower sizes live in the sub-configs above.
    num_layers: int = 1
    num_attention_heads: int = 8
    hidden_size: int = 768
    seq_length: int = 80

    def configure_model(self, tokenizer, pre_process=True, post_process=True) -> "MCoreClipModel":
        print(self.kv_channels)
        return MCoreClipModel(self, tokenizer=tokenizer, pre_process=pre_process, post_process=post_process)


class MCoreClipModel(MegatronModule):
    """Clip model"""

    def __init__(self, config: CLIPConfig, tokenizer=None, pre_process: bool = True, post_process: bool = True) -> None:
        super().__init__(config=config)
        self.pre_process = pre_process
        self.post_process = post_process
        vision_transformer_config = config.vision_transformer_config
        text_transformer_config = config.text_transformer_config
        self.output_dim = config.vision_transformer_config.output_dim
        self.vision_model = vision_transformer_config.configure_model()
        self.text_model = text_transformer_config.configure_model(
            tokenizer=tokenizer, pre_process=pre_process, post_process=post_process
        )
        # Learnable softmax temperature, initialized to log(1 / 0.07) as in the original CLIP recipe.
        self.logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
        self.model_type = ModelType.encoder_or_decoder

    def forward(self, images: torch.Tensor, captions: torch.Tensor):
        image_features = self.vision_model(images)
        text_features = self.text_model(captions)
        if self.post_process:
            return F.normalize(image_features, dim=-1), F.normalize(text_features, dim=-1), self.logit_scale.exp()
        return image_features, text_features

    def set_input_tensor(self, tensor):
        pass


class CLIPModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin):
    """
    CLIPModel is the base class for all CLIP models.

    Args:
        config: CLIPConfig. The configuration of the CLIP model. Please see the `CLIPConfig` for details.
        optim: OptimizerModule. Used only for initialization; the actual optimizer is created via the trainer API.
        tokenizer: TokenizerSpec. This module is used for deciding the output length of the language model.

        # These parameters are just for imagenet validation
        imagenet_val: Optional[str] = None: Optional path to imagenet validation dataset.
        mbs: int = 8: Micro batch size for imagenet validation.
        gbs: int = 8: Global batch size for imagenet validation.
        max_workers: int = 4: Maximum number of workers used for imagenet validation.
    """

    def __init__(
        self,
        config: CLIPConfig,
        optim: Optional[OptimizerModule] = None,
        tokenizer: Optional["TokenizerSpec"] = None,
        imagenet_val: Optional[str] = None,
        mbs: int = 8,
        gbs: int = 8,
        max_workers: int = 4,
    ):
        super().__init__()
        self.config = config
        self.tokenizer = tokenizer
        self.optim = optim or MegatronOptimizerModule(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True))
        self.optim.connect(self)

        self._training_loss_reduction = None
        self._validation_loss_reduction = None

        self.imagenet_val = imagenet_val
        self.mbs = mbs
        self.gbs = gbs
        self.max_workers = max_workers

    def on_fit_start(self):
        """Initialize the dataloader parameters for imagenet validation"""
        if self.imagenet_val is not None:
            self.imagenet_val = build_imagenet_validation_dataloader_params(
                self.imagenet_val,
                self.config.vision_transformer_config.img_h,
                self.config.vision_transformer_config.img_w,
                self.mbs,
                self.gbs,
                num_workers=self.max_workers,
                max_position_embedding=self.config.text_transformer_config.max_seq_length,
                tokenizer=self.tokenizer,
            )

    def configure_model(self) -> None:
        """Configure the model"""
        if not hasattr(self, "module"):
            self.module = self.config.configure_model(self.tokenizer)

    def forward(self, images: torch.Tensor, captions: torch.Tensor):
        return self.module(images, captions)

    def data_step(self, dataloader_iter) -> Dict[str, torch.Tensor]:
        return self.config.data_step_fn(dataloader_iter)

    def forward_step(self, batch) -> torch.Tensor:
        return self.config.forward_step_fn(self, batch)

    def training_step(self, batch, batch_idx=None) -> torch.Tensor:
        """In mcore the loss-function is part of the forward-pass (when labels are provided)"""
        return self.forward_step(batch)

    def validation_step(self, batch, batch_idx=None) -> torch.Tensor:
        """In mcore the loss-function is part of the forward-pass (when labels are provided)"""
        return self.forward_step(batch)

    def zero_shot_classifier(self):
        """Zero shot classifier for imagenet validation"""
        # Unwrap the Lightning / DDP / precision wrappers around the Megatron module.
        text_encoder = self.module.module.module.text_model

        with torch.no_grad():
            zeroshot_weights = []
            for texts in self.imagenet_val["texts"]:
                texts = texts.cuda(non_blocking=True)
                with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16):
                    # Average the embeddings of all prompt templates for one class,
                    # then re-normalize to get a single classifier direction.
                    class_embeddings = text_encoder(texts)
                    class_embedding = F.normalize(class_embeddings, dim=-1).mean(dim=0)
                    class_embedding /= class_embedding.norm()
                zeroshot_weights.append(class_embedding)
            zeroshot_weights = torch.stack(zeroshot_weights, dim=1)
        return zeroshot_weights

    def zero_shot_eval(self):
        """Zero shot evaluation for imagenet validation"""

        def accuracy(output, target, topk=(1,)):
            """Compute top-k correct-prediction counts for the given logits and labels."""
            pred = output.topk(max(topk), 1, True, True)[1].t()
            correct = pred.eq(target.view(1, -1).expand_as(pred))
            return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]

        logging.info('Starting zero-shot imagenet.')
        logging.info('Building zero-shot classifier')
        classifier = self.zero_shot_classifier()
        logging.info('Using classifier')

        vision_encoder = self.module.module.module.vision_model
        with torch.no_grad():
            top1, top5, n = 0.0, 0.0, 0.0
            for images, target in tqdm(self.imagenet_val['images'], desc="Imagenet Zero-shot Evaluation", leave=False):
                if images is None or target is None:
                    continue

                images = images.cuda(non_blocking=True).to(torch.bfloat16)
                target = target.cuda(non_blocking=True)

                with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16):
                    image_features = vision_encoder(images)
                    image_features = F.normalize(image_features, dim=-1)
                    logits = 100.0 * image_features @ classifier

                acc1, acc5 = accuracy(logits, target, topk=(1, 5))
                top1 += acc1
                top5 += acc5
                n += images.size(0)

        logging.info('Finished zero-shot imagenet.')
        top1 = top1 / n
        top5 = top5 / n
        return top1, top5

    def on_validation_epoch_end(self):
        """Run zero shot evaluation for imagenet validation"""
        if self.imagenet_val is not None:
            imagenet_metric = torch.zeros(2).cuda()
            imagenet_metric[0], imagenet_metric[1] = self.zero_shot_eval()
            imagenet_metric = average_losses_across_data_parallel_group(imagenet_metric)
            self.log('imagenet_top1', imagenet_metric[0], prog_bar=True, rank_zero_only=True, batch_size=1)
            self.log('imagenet_top5', imagenet_metric[1], prog_bar=True, rank_zero_only=True, batch_size=1)

    @property
    def training_loss_reduction(self) -> ClipMegatronLoss:
        if not self._training_loss_reduction:
            self._training_loss_reduction = ClipMegatronLoss()
        return self._training_loss_reduction

    @property
    def validation_loss_reduction(self) -> ClipMegatronLoss:
        if not self._validation_loss_reduction:
            self._validation_loss_reduction = ClipMegatronLoss()
        return self._validation_loss_reduction
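

# Usage sketch (illustrative only; this helper is not part of the original public
# API and is never called on import). It shows how the two tower configs compose
# into a trainable CLIPModel; the layer/hidden sizes below are arbitrary example
# values, not defaults shipped with this module.
def _example_build_clip_model() -> "CLIPModel":
    """Build a small CLIPModel purely as a documentation example."""
    vision_config = CLIPViTConfig(
        num_layers=12, num_attention_heads=12, hidden_size=768, ffn_hidden_size=3072, output_dim=512
    )
    text_config = CLIPTextModelConfig(
        num_layers=12, num_attention_heads=8, hidden_size=512, ffn_hidden_size=2048, output_dim=512
    )
    clip_config = CLIPConfig(vision_transformer_config=vision_config, text_transformer_config=text_config)
    # Tokenizer, data module, and trainer wiring follow the usual NeMo 2.0 recipe
    # and are omitted here; the model's optimizer module is created by default.
    return CLIPModel(clip_config)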