o
    wi#_                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZmZ d dlmZmZ d dlmZ d d	lmZmZmZ d d
lmZ d dlmZ erdd dlmZ d dlmZ eG dd deZeG dd deZ eG dd deZ!eG dd deZ"eG dd de"Z#eG dd de"Z$G dd deZ%e&e%dG dd dej'def Z(e)e%dG d d! d!ej'ed"f Z*ej+d#d$d%d&ej,fd'd(Z-ej+d)d*d%d&ej,fd+d,Z.ej+d-d.d%d&ej,fd/d0Z/ej+d1d2d%d&ej,fd3d4Z0ej+d5d$d%d&ej,fd6d7Z1ej+d8d*d%d&ej,fd9d:Z2ej+d;d.d%d&ej,fd<d=Z3ej+d$d5d%d&ej,fd>d?Z4ej+d*d8d%d&ej,fd@dAZ5ej+d.dBd%d&ej,fdCdDZ6dS )E    N)	dataclass)Path)TYPE_CHECKING	AnnotatedCallableOptional)nn)AutoTokenizerTokenizerSpec)
BertConfig	BertModel)Config)OptimizerModuleioteardown)dtype_from_hf)loggingr   r   c                   @   s~   e Zd ZU dZdZeed< dZeed< dZ	e
ed< dZe
ed	< d
Ze
ed< dZe
ed< dZeed< dZeed< dZeed< dS )MegatronBertConfigz/Configs for training megatron-style Bert Model.megatron	bert_typeF
add_pooler{Gz?init_method_std皙?hidden_dropout	LayerNormnormalizationh㈵>layernorm_epsilonapply_query_key_layer_scalinglearned_absoluteposition_embedding_typebert_binary_headN)__name__
__module____qualname____doc__r   str__annotations__r   boolr   floatr   r   r    r!   r#   r$    r-   r-   a/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/bert/model/bert.pyr   $   s   
 r   c                   @   B   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
S )MegatronBertLargeConfigz)Configs for Bert-Large in megatron style.   
num_layers   hidden_size   ffn_hidden_size   num_attention_headsN
r%   r&   r'   r(   r2   intr*   r4   r6   r8   r-   r-   r-   r.   r0   3      
 r0   c                   @   B   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< d	S )
MegatronBertBaseConfigz(Configs for Bert-Base in megatron style.   r2      r4      r6   r8   Nr9   r-   r-   r-   r.   r=   =   r;   r=   c                   @   sr   e Zd ZU dZdZeed< dZeed< dZ	e
ed< dZe
ed	< d
Ze
ed< dZe
ed< dZeed< dZeed< dS )HuggingFaceBertConfigz8Configs for models in https://huggingface.co/google-berthuggingfacer   Tr   r   r   r   r   r   r   r   r    Fr!   r"   r#   N)r%   r&   r'   r(   r   r)   r*   r   r+   r   r,   r   r   r    r!   r#   r-   r-   r-   r.   rA   G   s   
 rA   c                   @   r<   )
HuggingFaceBertBaseConfigzIConfigs for model in https://huggingface.co/google-bert/bert-base-uncasedr>   r2   r?   r4   r@   r6   r8   Nr9   r-   r-   r-   r.   rC   U   r;   rC   c                   @   r/   )HuggingFaceBertLargeConfigzJConfigs for model in https://huggingface.co/google-bert/bert-large-uncasedr1   r2   r3   r4   r5   r6   r7   r8   Nr9   r-   r-   r-   r.   rD   _   r;   rD   c                       sf   e Zd ZdZ				d
deee ee f dee ded dee	e
jge
jf  f fdd	Z  ZS )HuggingFaceBertModelzGoogle Bert Model.Nconfigoptim	tokenizerr
   model_transformc                    s   t  j|pt |||d d S )N)rG   rH   rI   )super__init__r   )selfrF   rG   rH   rI   	__class__r-   r.   rK   l   s   zHuggingFaceBertModel.__init__)NNNN)r%   r&   r'   r(   r   r   r   r   r   r   r   ModulerK   __classcell__r-   r-   rM   r.   rE   i   s    rE   hfc                       sh   e Zd ZdZ fddZdefddZdedefdd	Zd
d Z	e
dddZe
defddZ  ZS )HuggingFaceBertImporterz>Importer Connector for converting HF Google Bert Model to NeMoc                    s(   t jdkrt j|  |dd| _d S )N)      typemodel)sysversion_inforJ   rK   getrU   )rL   argskwargsrM   r-   r.   rK   z   s   
z HuggingFaceBertImporter.__init__returnc                 C   s   t | j| jdS )N)rH   )rE   rF   rH   rL   r-   r-   r.   init   s   zHuggingFaceBertImporter.initoutput_pathc           	      C   s  ddl m}m}m}m} |jt| dd}| jdkr$|jt| dd}n,| jdkr3|jt| dd}n| jdkrB|jt| dd}n| jdkrP|jt| dd}t	d	| j
j d
| j
j d| j
j  |  }| |}| || | || t	d|  t|| ~~|S )Nr   )BertForMaskedLMBertForNextSentencePredictionBertForPreTrainingr   autotorch_dtyperV   pretrainingmaskedclassificationz$Initializing Bert Model with pooler=z	 lm_head=z  binary_head=z-Converted Bert model to Nemo, model saved to )transformersr`   ra   rb   r   from_pretrainedr)   rU   r   inforF   r   add_lm_headr$   r^   
nemo_setupconvert_state	nemo_saver   )	rL   r_   r`   ra   rb   r   sourcetargettrainerr-   r-   r.   apply   s2   





zHuggingFaceBertImporter.applyc                 C   s   ddddddddd	d
ddddd}| j jr|ddd | jdkr(tttg}nttt	g}| jdks7| jdkr<|
t | jdkrJdd | D }| j jrX|ddddd | j jrd|ddd tj||||d S )!z,Converting HF state dict to NeMo state dict.$embedding.position_embeddings.weight%embedding.tokentype_embeddings.weight encoder.initial_layernorm.weightencoder.initial_layernorm.bias2encoder.layers.*.self_attention.linear_proj.weight0encoder.layers.*.self_attention.linear_proj.bias*encoder.layers.*.post_att_layernorm.weight(encoder.layers.*.post_att_layernorm.bias&encoder.layers.*.mlp.linear_fc1.weight$encoder.layers.*.mlp.linear_fc1.bias&encoder.layers.*.mlp.linear_fc2.weight$encoder.layers.*.mlp.linear_fc2.bias*encoder.layers.*.post_mlp_layernorm.weight(encoder.layers.*.post_mlp_layernorm.bias)%embeddings.position_embeddings.weight'embeddings.token_type_embeddings.weightembeddings.LayerNorm.weightembeddings.LayerNorm.bias-encoder.layer.*.attention.output.dense.weight+encoder.layer.*.attention.output.dense.bias1encoder.layer.*.attention.output.LayerNorm.weight/encoder.layer.*.attention.output.LayerNorm.bias)encoder.layer.*.intermediate.dense.weight'encoder.layer.*.intermediate.dense.bias#encoder.layer.*.output.dense.weight!encoder.layer.*.output.dense.bias'encoder.layer.*.output.LayerNorm.weight%encoder.layer.*.output.LayerNorm.biaspooler.dense.weightpooler.dense.biasr   r   rV   rf   rg   c                 S   s   i | ]
\}}d | |qS )zbert.r-   ).0kvr-   r-   r.   
<dictcomp>   s    z9HuggingFaceBertImporter.convert_state.<locals>.<dictcomp>zlm_head.dense.weightzlm_head.dense.biaszlm_head.layer_norm.weightzlm_head.layer_norm.bias)z&cls.predictions.transform.dense.weightz$cls.predictions.transform.dense.biasz*cls.predictions.transform.LayerNorm.weightz(cls.predictions.transform.LayerNorm.biaszbinary_head.weightzbinary_head.bias)zcls.seq_relationship.weightzcls.seq_relationship.biasmapping
transforms)rF   r   updaterU   _import_qkv_2_import_qkv_bias_2_import_embedding_2_import_qkv_import_qkv_bias_import_embeddingappend_import_output_biasitemsrl   r$   r   apply_transforms)rL   rp   rq   r   r   r-   r-   r.   rn      sV   
	


z%HuggingFaceBertImporter.convert_stater	   c                 C   s   ddl m} || t| S )Retrieve Tokenizer from HFr   )r	   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr	   save_hf_tokenizer_assetsr)   )rL   r	   r-   r-   r.   rH      s   z!HuggingFaceBertImporter.tokenizerc                 C   s   ddl m} |t| }tdi ddd|jd|jd|jd|jd	|j	d
|j
d|jd| jdkp9| jdkd| jdkpj| jdkd| jdkdddddt|tjkdt|tjkdt|}|S d| jdkdddddt|tjkdt|tjkdt|}|S )z'Generate NeMo Config based on HF configr   r   r   rB   r2   r4   r6   r8   r   r    
seq_lengthrl   rf   rg   r$   rh   r   #share_embeddings_and_output_weightsTnum_tokentypes   fp16bf16params_dtypeNr-   )ri   r   rj   r)   rA   num_hidden_layersr4   intermediate_sizer8   initializer_rangelayer_norm_epsmax_position_embeddingsrU   r   torchfloat16bfloat16)rL   HFBertConfigrp   outputr-   r-   r.   rF      sh   	


zHuggingFaceBertImporter.config)r\   r	   )r%   r&   r'   r(   rK   rE   r^   r   rs   rn   propertyrH   r   rF   rP   r-   r-   rM   r.   rR   v   s    ;rR   r`   c                   @   sT   e Zd ZdZejfdddZdedefddZe	d	d
 Z
dd Ze	dddZdS )HuggingFaceBertExporterz7Exporter Connector for converting NeMo Bert Model to HFr\   r   c                 C   sR   ddl m} ddlm} |  |j| j|dW  d    S 1 s"w   Y  d S )Nr   r   )no_init_weightsrd   )ri   r   transformers.modeling_utilsr   _from_configrF   )rL   dtyper   r   r-   r-   r.   r^     s
   $zHuggingFaceBertExporter.initr_   c                 C   sL   |  t| \}}| |j}| ||}| }|| | j| |S N)	nemo_loadr)   r^   r   rn   cpusave_pretrainedrH   )rL   r_   rp   _rq   r-   r-   r.   rs     s   
zHuggingFaceBertExporter.applyc                 C   s   t t| jjjS )r   )r   load_contextr)   rV   rH   r]   r-   r-   r.   rH     s   z!HuggingFaceBertExporter.tokenizerc                 C   sR   ddddddddd	d
ddddd}|j jr|ddd tj|||tttgdS )z#Convert NeMo state dict to HF styler   r   r   r   r   r   r   r   r   r   r   r   r   r   )rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   )rF   r   r   r   r   _export_qkv_export_qkv_bias_export_embedding)rL   rp   rq   r   r-   r-   r.   rn     s6   z%HuggingFaceBertExporter.convert_stater   c              
   C   sH   t jt| dd}ddlm} ||j|j|j|j|j	|j
|j| jjdS )z'Generate HF Config based on NeMo configzmodel.config)subpathr   r   )r   r4   r   r8   r   r   r   
vocab_size)r   r   r)   ri   r   r2   r4   r6   r8   r   r   r    rH   r   )rL   rp   r   r-   r-   r.   rF   ?  s   zHuggingFaceBertExporter.configN)r\   r   )r\   r   )r%   r&   r'   r(   r   r   r^   r   rs   r   rH   rn   rF   r-   r-   r-   r.   r     s    
"r   r   )z0bert.encoder.layer.*.attention.self.query.weightz.bert.encoder.layer.*.attention.self.key.weightz0bert.encoder.layer.*.attention.self.value.weightz1encoder.layers.*.self_attention.linear_qkv.weight)
source_key
target_keyctxc              	   C   &  | j j}|j}|j}t|d|j|j }| }||f|dd   }	|j|	 }|j|	 }|j|	 }td|f|dd   j	|j
d}
t|D ]>}t|
|||d d d d d f f}
t|
|||d d d d d f f}
t|
|||d d d d d f f}
qG|
|d|  |g}
|
S Nkv_channels   r   r   rS   rq   rF   r8   r4   getattrsizeviewr   emptytor   rangecatreshaper   qr   r   megatron_confighead_numr4   	head_sizeold_tensor_shapenew_q_tensor_shapeqkv_weightsir-   r-   r.   r   R  $   



$((*r   )z.bert.encoder.layer.*.attention.self.query.biasz,bert.encoder.layer.*.attention.self.key.biasz.bert.encoder.layer.*.attention.self.value.biasz/encoder.layers.*.self_attention.linear_qkv.biasc                 C      | j j}|j}t|d|j|j }||f}|j| }|j| }	|j| }
td|fj|j	d}t
|D ])}t||||d  f}t||	||d  f}t||
||d  f}q4||d|  g}|S Nr   r   r   r   rS   rq   rF   r8   r   r4   r   r   r   r   r   r   r   r   r   qbkbvbr   r   r   new_q_tensor_shape_biasbias_qbias_kbias_v
qkv_biasesr   r-   r-   r.   r   u      



r   )z&bert.embeddings.word_embeddings.weightz embedding.word_embeddings.weightc                 C   v   | j jj}|d}tt|| | }||kr9tj|| |d|j	|j
dj|j	d}tj||fdd}|S |S Nr   r   r   devicer   dimrq   rF   make_vocab_size_divisible_byr   r:   mathceilr   zerosr   r   r   r   r   	embedding	divisibleemb_sizepadded_emb_sizezeros_to_addpadded_embeddingr-   r-   r.   r        

r   )zcls.predictions.decoder.biaszoutput_layer.biasc                 C   sd   | j jj}|d}tt|| | }||kr0tj|| |j	|j
d}tj||fdd}|S |S )Nr   r   r   )rq   rF   r   r   r:   r   r   r   r   r   r   r   )r   biasr   	bias_sizepadded_bias_sizer  r  r-   r-   r.   r     s   

r   )z+encoder.layer.*.attention.self.query.weightz)encoder.layer.*.attention.self.key.weightz+encoder.layer.*.attention.self.value.weightc              	   C   r   r   r   r   r-   r-   r.   r     r   r   )z)encoder.layer.*.attention.self.query.biasz'encoder.layer.*.attention.self.key.biasz)encoder.layer.*.attention.self.value.biasc                 C   r   r   r   r   r-   r-   r.   r     r   r   )!embeddings.word_embeddings.weightc                 C   r   r   r   r   r-   r-   r.   r     r  r   c                    s   | j j}|j}|}||  |j}t|d|j|j }|d|  }||||g}t fddt|D }t	 | d }	t	 d | d }
|| d|
 }||	 d|
 }||
 d|
 }|||fS )Nr   r   c                    ,   g | ]}t  d  |  d  |   qS r   r   aranger   r   heads_per_groupr-   r.   
<listcomp>0      z_export_qkv.<locals>.<listcomp>r   )rq   rF   r8   r4   r   r   r   r   r   r  r   )r   
linear_qkvr   r   num_query_groupsr4   r   qkv_total_dimq_slicek_slicev_sliceq_projk_projv_projr-   r  r.   r     s*   


r   c                    s   | j j}|j}|}||  t|d|j|j }|d|  }|||g}t fddt|D }t	 | d }t	 d | d }	|| d
 }
|| d
 }||	 d
 }|
||fS )Nr   r   c                    r	  r
  r  r  r  r-   r.   r  T  r  z$_export_qkv_bias.<locals>.<listcomp>r   r  )rp   rF   r8   r   r4   r   r   r   r   r  r   )r   qkv_biasr   r   r  r   r  r  r  r  q_biask_biasv_biasr-   r  r.   r   ?  s(   	

r   r  c                 C   s   | j j}|d |jd d f S r   )rq   rF   r   )r   r   r   r-   r-   r.   r   c  s   r   )7r   rW   dataclassesr   pathlibr   typingr   r   r   r   r   r   "nemo.collections.common.tokenizersr	   r
   $nemo.collections.llm.bert.model.baser   r   nemo.collections.llm.utilsr   nemo.lightningr   r   r   nemo.lightning.pytorch.utilsr   
nemo.utilsr   ri   r   r   r0   r=   rA   rC   rD   rE   model_importerModelConnectorrR   model_exporterr   state_transformTransformCTXr   r   r   r   r   r   r   r   r   r   r-   r-   r-   r.   <module>   s   				
 
O