o
    i*                     @   s  d dl mZmZ d dlZd dlZddlmZ ddlmZ ddl	m
Z
mZ ddlmZ dd	lmZ dd
lmZ g dZeeefZdefddZde
e deeejf deeejf fddZdddeeejf fddZdeeejf deeejf fddZdeeejf deeejf fddZdeeejf deeejf fddZdeeejf deeejf fdd Zdeeejf deeejf fd!d"Zdeeejf deeejf fd#d$Zdeeejf deeejf fd%d&ZdS )'    )DictOrderedDictN   )AlbertEncoder)BertEncoder)CuratedTransformerCuratedEncoderT)RobertaEncoder   )transformers)Errorsalbertbert	camembertrobertazxlm-roberta
model_typec                 C   s    | t vrttjj| t dd S )N)unsupported_modelsupported_models)SUPPORTED_MODEL_TYPES
ValueErrorr   E007format)r    r   W/home/ubuntu/.local/lib/python3.10/site-packages/curated_transformers/models/hf_util.py_check_supported_hf_models   s   r   transformerparamsreturnc                 C   st   t |}| j}t|trt|}t|S t|tr!t|}t|S t|tr.t|}t|S t	t
jjt|td)zConverts parameters from a compatible pre-trained model to
    parameters that can be consumed by the given curated transformer.

    Returns the state_dict that can be directly loaded by the curated
    transformer.
    )unsupported_encodersupported_encoders)_rename_old_hf_namescurated_encoder
isinstancer   _convert_albert_base_stater   _convert_bert_base_stater	   _convert_roberta_base_state	TypeErrorr   E026r   typeSUPPORTED_CURATED_ENCODERS_add_curated_encoder_prefix)r   r   encoder	convertedr   r   r   $convert_pretrained_model_for_encoder   s"   



	r.   hf_modelztransformers.PreTrainedModelc                 C   s8   t | jj tttttd}|| jj |  }t|S )zConverts HF model parameters to parameters that can be consumed by
    our implementation of the Transformer.

    Returns the state_dict that can be directly loaded by one of the
    curated transformers.
    r   )r   configr   r$   r%   r&   
state_dictr+   )r/   
convertersr-   r   r   r   &convert_hf_pretrained_model_parameters8   s   	r3   r-   c                 C   s   dd |   D S )Nc                 S   s   i | ]
\}}d | |qS )zcurated_encoder.r   .0kvr   r   r   
<dictcomp>S   s    z/_add_curated_encoder_prefix.<locals>.<dictcomp>)items)r-   r   r   r   r+   P   s   r+   c                 C   s@   t  }|  D ]\}}tdd|}tdd|}|||< q|S )Nz\.gamma$z.weightz\.beta$z.bias)r   r9   resub)r   outname	parameterr   r   r   r!   V   s   
r!   c                 C   s  dd |   D }i }|  D ]L\}}d|vrqtdd|}tdd|}tdd	|}td
d|}tdd|}tdd|}tdd|}tdd|}tdd|}|||< q|d |d< |d |d< |d |d< |d |d< |d |d< |d |d< |d |d < t|S )!Nc                 S       i | ]\}}t d d||qS )z	^albert\. r:   r;   r4   r   r   r   r8   e        z._convert_albert_base_state.<locals>.<dictcomp>zencoder.albert_layer
^encoder\.r@   z^albert_layer_groups\.groups.z\.albert_layers\..group_layers.z\.attention\.z.mha.z\.mha\.LayerNorm.attn_output_layernormz\.mha\.dense\.z.mha.output.z\.ffn\.z.ffn.intermediate.z\.ffn_output\.z.ffn.output.z\.full_layer_layer_norm\.z.ffn_output_layernorm.!embeddings.word_embeddings.weight'embeddings.token_type_embeddings.weight%embeddings.position_embeddings.weightembeddings.LayerNorm.weightembeddings.layer_norm.weightembeddings.LayerNorm.biasembeddings.layer_norm.biasz*encoder.embedding_hidden_mapping_in.weightzembeddings.projection.weightz(encoder.embedding_hidden_mapping_in.biaszembeddings.projection.bias)r9   r:   r;   _merge_qkv_albert)r   stripped_paramsr<   r=   r>   r   r   r   r$   a   sJ   	
r$   c                 C   s   i }dd |   D }|  D ]E\}}d|vrqtdd|}tdd|}tdd	|}td
d	|}tdd|}tdd|}tdd|}tdd|}|||< q|d |d< |d |d< |d |d< |d |d< |d |d< t|S )Nc                 S   r?   )z^bert\.r@   rA   r4   r   r   r   r8      rB   z,_convert_bert_base_state.<locals>.<dictcomp>encoder.layer.rC   r@   ^layerlayers$\.attention\.self\.(query|key|value).mha.\1\.attention\.(output)\.dense\.attention\.output\.LayerNormrF   \.(intermediate)\.dense.ffn.\1(\.\d+)\.output\.LayerNorm\1.ffn_output_layernorm(\.\d+)\.(output)\.dense	\1.ffn.\2rG   rH   rI   rJ   rK   rL   rM   r9   r:   r;   
_merge_qkvr   r<   rO   r=   r>   r   r   r   r%      s8   
r%   c                 C   s   i }dd |   D }|  D ]E\}}d|vrqtdd|}tdd|}tdd	|}td
d	|}tdd|}tdd|}tdd|}tdd|}|||< q|d |d< |d |d< |d |d< |d |d< |d |d< t|S )Nc                 S   r?   )z
^roberta\.r@   rA   r4   r   r   r   r8      rB   z/_convert_roberta_base_state.<locals>.<dictcomp>rP   rC   r@   rQ   rR   rS   rT   rU   rV   rF   rW   rX   rY   rZ   r[   r\   rG   z'embeddings.inner.word_embeddings.weightrH   z-embeddings.inner.token_type_embeddings.weightrI   z+embeddings.inner.position_embeddings.weightrJ   z"embeddings.inner.layer_norm.weightrL   z embeddings.inner.layer_norm.biasr]   r_   r   r   r   r&      s@   
r&   c              
   C   s   i }|   D ]@\}}td|}|rBd|v rAd|d  d}t|| | d|d   | | d|d   g|| d	|d  < q|||< q|S )
NzMlayers\.(?P<layer>[0-9]+)\.mha\.(query|key|value).(?P<param_type>weight|bias)queryzlayers.layer.mha.key.
param_type.value..input.r9   r:   matchtorchcatr   r<   r=   r>   mbaser   r   r   r^     s$   
r^   c              
   C   s   i }|   D ]E\}}td|}|rGd|v rFd|d  d|d  d}t|| | d|d	   | | d
|d	   g|| d|d	  < q|||< q|S )Nzngroups\.(?P<group>[0-9]+)\.group_layers\.(?P<layer>[0-9]+)\.mha\.(query|key|value).(?P<param_type>weight|bias)r`   rD   grouprE   ra   rb   rc   rd   re   rf   rg   rk   r   r   r   rN     s$   
rN   ) typingr   r   ri   r:   albert.encoderr   bert.encoderr   curated_transformerr   r   roberta.encoderr	   _compatr   errorsr   r   r*   strr   Tensorr.   r3   r+   r!   r$   r%   r&   r^   rN   r   r   r   r   <module>   sf    
	




E
/
&3*