o
    i k                  /   @   s  d dl mZ d dlmZ d dlmZmZmZmZm	Z	m
Z
mZ d dlZd dlmZmZ d dlmZmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m"Z" d dl+m,Z, d dl-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z: ddl;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZE ddlFmFZF dddddddd d d!dd!d dd"d"e  dd#d$eGd%ee>ge<f d&e2d'eHd(eGd)eId*eHd+eGd,eGd-eHd.eGd/eGd0eGd1eGd2eGd3eGd4eGd5eJd6eJd7eKd8eeB d9e
eCeEf f,d:d;ZLd<d=d<dddd d d!d!d dd"d"e  dd>d$eGd%ee>ge<f d&e2d'eHd)eId*eHd+eGd,eGd-eHd.eGd/eGd0eGd2eGd3eGd4eGd5eJd6eJd7eKd8eeB d9e
eCeEf f(d?d@ZMd<d=d<dddAdBd d!d!ddd"d"e  ddCd$eGd%ee>ge<f d&e2d'eHd)eId*eHd+eGd,eGd-eHd.eGd/eGd0eGd2eGd3eGd4eGd6eJd7eKd8eeB d9e
eCeEf f&dDdEZNd<d=d<dddAdBd d!d!ddd"d"e  dd>d$eGd%ee>ge<f d&e2d'eHd)eId*eHd+eGd,eGd-eHd.eGd/eGd0eGd2eGd3eGd4eGd5eJd6eJd7eKd8eeB d9e
eCeEf f(dFdGZOd<d=d<dddAdBd d!d!ddd"d"e  dd>d$eGd%ee>ge<f d&e2d'eHd)eId*eHd+eGd,eGd-eHd.eGd/eGd0eGd2eGd3eGd4eGd5eJd6eJd7eKd8eeB d9e
eCeEf f(dHdIZPd%ee>ge<f dJe>d&e2d8eeB d9e
eCeEf f
dKdLZQd%ee>ge<f dJe>d&e2d9eCfdMdNZRdOeCdPeAdQeJd9e	eDe@f fdRdSZS	dfdOeCdTeeA d9e"fdUdVZTd"e  dWdXed+eGd6eJd7eKd9e>f
dYdZZUd/eGd3eGd9e"fd[d\ZVdOe"dTe=dQeJd]eGd3eGd9e	e.eegee0 f f fd^d_ZWdOe"d`e	e=ef dQeJd9e	e?eeee/  ge.f f fdadbZXdced9ee>eee  eee  ge>f fdddeZYdS )g    )partial)Path)AnyCallableListOptionalTupleUnioncastN)AlbertConfigAlbertEncoder)
BertConfigBertEncoder)CuratedEncoderTCuratedTransformer)$convert_pretrained_model_for_encoder)PyTorchTransformerOutput)RobertaConfigRobertaEncoder)Doc)SimpleFrozenDict)ModelPyTorchWrapper_v2TorchScriptWrapper_v1get_torch_default_devicetorch2xpxp2torch)chain)r   )PyTorchGradScaler)
ArgsKwargsFloats2dInts1d   )Tok2PiecesModelT   )WrappedTransformerAndListenerreplace_listener_callbackreplace_listener_cfg_callback)TransformerModelOutput)remove_bos_eos)
SpanExtractorModelTTorchTransformerInTTorchTransformerModelTTorchTransformerOutTTransformerBackpropTTransformerInTTransformerListenerModelTTransformerModelTTransformerOutT#WrappedTransformerAndListenerModelT)with_non_ws_tokensg           gelu_newi   i   g-q=i      F)attention_probs_dropout_probembedding_width
hidden_acthidden_dropout_probhidden_widthintermediate_widthlayer_norm_epsmax_position_embeddingsmodel_max_lengthnum_attention_headsnum_hidden_groupsnum_hidden_layerspadding_idxtype_vocab_sizetorchscriptmixed_precisiongrad_scaler_configwrapped_listener
vocab_size
with_spanspiece_encoderr8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   returnc                 C   s`   t |||||||||| ||
||	|d}|rt||d}nt|}t||||d}t||||dS )a  Construct an ALBERT transformer model.

    vocab_size (int):
        Vocabulary size.
    with_spans (Callable):
        Callback that constructs a span generator model.
    piece_encoder (Model)
        The piece encoder to segment input tokens.
    attention_probs_dropout_prob (float):
        Dropout probabilty of the self-attention layers.
    embedding_width (int):
        Width of the embedding representations.
    hidden_act (str):
        Activation used by the point-wise feed-forward layers.
    hidden_dropout_prob (float):
        Dropout probabilty of the point-wise feed-forward and
        embedding layers.
    hidden_width (int):
        Width of the final representations.
    intermediate_width (int):
        Width of the intermediate projection layer in the
        point-wise feed-forward layer.
    layer_norm_eps (float):
        Epsilon for layer normalization.
    max_position_embeddings (int):
        Maximum length of position embeddings.
    model_max_length (int):
        Maximum length of model inputs.
    num_attention_heads (int):
        Number of self-attention heads.
    num_hidden_groups (int):
        Number of layer groups whose constituents share parameters.
    num_hidden_layers (int):
        Number of hidden layers.
    padding_idx (int):
        Index of the padding meta-token.
    type_vocab_size (int):
        Type vocabulary size.
    torchscript (bool):
        Set to `True` when loading TorchScript models, `False` otherwise.
    mixed_precision (bool):
        Use mixed-precision training.
    grad_scaler_config (dict):
        Configuration passed to the PyTorch gradient scaler.
    wrapped_listener (Optional[TransformerListenerModelT]):
        Optional listener to wrap. Only used when replacing listeners
        in downstream components.
    )r9   r<   r=   rA   rB   rC   r8   r;   r:   rJ   rE   r?   r@   r>   rD   r@   rD   r<   rG   rH   rK   rL   transformerrI   )r   _torchscript_encoderr   _pytorch_encoder(build_transformer_or_listener_wrapper_v1)rJ   rK   rL   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   configrQ   encoder rW   c/home/ubuntu/.local/lib/python3.10/site-packages/spacy_curated_transformers/models/architectures.py!build_albert_transformer_model_v15   sD   KrY   g?gelu)r8   r:   r;   r<   r=   r>   r?   r@   rA   rC   rD   rE   rF   rG   rH   rI   c                 C   ^   t ||||||||| ||	|
||d}|rt|
|d}nt|}t||||d}t||||dS )aj  Construct a BERT transformer model.

    vocab_size (int):
        Vocabulary size.
    with_spans (Callable):
        Callback that constructs a span generator model.
    piece_encoder (Model)
        The piece encoder to segment input tokens.
    attention_probs_dropout_prob (float):
        Dropout probabilty of the self-attention layers.
    hidden_act (str):
        Activation used by the point-wise feed-forward layers.
    hidden_dropout_prob (float):
        Dropout probabilty of the point-wise feed-forward and
        embedding layers.
    hidden_width (int):
        Width of the final representations.
    intermediate_width (int):
        Width of the intermediate projection layer in the
        point-wise feed-forward layer.
    layer_norm_eps (float):
        Epsilon for layer normalization.
    max_position_embeddings (int):
        Maximum length of position embeddings.
    model_max_length (int):
        Maximum length of model inputs.
    num_attention_heads (int):
        Number of self-attention heads.
    num_hidden_layers (int):
        Number of hidden layers.
    padding_idx (int):
        Index of the padding meta-token.
    type_vocab_size (int):
        Type vocabulary size.
    torchscript (bool):
        Set to `True` when loading TorchScript models, `False` otherwise.
    mixed_precision (bool):
        Use mixed-precision training.
    grad_scaler_config (dict):
        Configuration passed to the PyTorch gradient scaler.
    wrapped_listener (Optional[TransformerListenerModelT]):
        Optional listener to wrap. Only used when replacing listeners
        in downstream components.
    r9   r<   r=   rA   rC   r8   r;   r:   rJ   rE   r?   r@   r>   rD   rN   rO   rP   )r   rR   r   rS   rT   rJ   rK   rL   r8   r:   r;   r<   r=   r>   r?   r@   rA   rC   rD   rE   rF   rG   rH   rI   rU   rQ   rV   rW   rW   rX   build_bert_transformer_model_v1   B   Er^   gh㈵>i  )r8   r:   r;   r<   r=   r>   r?   r@   rA   rC   rD   rE   rG   rF   rH   rI   c                 C   s^   t ||||||||| ||	|
||d}|rt|
|d}nt|}t||||d}t||||dS )ao  Construct a CamemBERT transformer model.

    vocab_size (int):
        Vocabulary size.
    with_spans (Callable):
        Callback that constructs a span generator model.
    piece_encoder (Model)
        The piece encoder to segment input tokens.
    attention_probs_dropout_prob (float):
        Dropout probabilty of the self-attention layers.
    hidden_act (str):
        Activation used by the point-wise feed-forward layers.
    hidden_dropout_prob (float):
        Dropout probabilty of the point-wise feed-forward and
        embedding layers.
    hidden_width (int):
        Width of the final representations.
    intermediate_width (int):
        Width of the intermediate projection layer in the
        point-wise feed-forward layer.
    layer_norm_eps (float):
        Epsilon for layer normalization.
    max_position_embeddings (int):
        Maximum length of position embeddings.
    model_max_length (int):
        Maximum length of model inputs.
    num_attention_heads (int):
        Number of self-attention heads.
    num_hidden_layers (int):
        Number of hidden layers.
    padding_idx (int):
        Index of the padding meta-token.
    type_vocab_size (int):
        Type vocabulary size.
    torchscript (bool):
        Set to `True` when loading TorchScript models, `False` otherwise.
    mixed_precision (bool):
        Use mixed-precision training.
    grad_scaler_config (dict):
        Configuration passed to the PyTorch gradient scaler.
    wrapped_listener (Optional[TransformerListenerModelT]):
        Optional listener to wrap. Only used when replacing listeners
        in downstream components.
    r\   rN   rO   rP   r   rR   r   rS   rT   )rJ   rK   rL   r8   r:   r;   r<   r=   r>   r?   r@   rA   rC   rD   rE   rG   rF   rH   rI   rU   rQ   rV   rW   rW   rX   $build_camembert_transformer_model_v1  r_   ra   c                 C   r[   )am  Construct a RoBERTa transformer model.

    vocab_size (int):
        Vocabulary size.
    with_spans (Callable):
        Callback that constructs a span generator model.
    piece_encoder (Model)
        The piece encoder to segment input tokens.
    attention_probs_dropout_prob (float):
        Dropout probabilty of the self-attention layers.
    hidden_act (str):
        Activation used by the point-wise feed-forward layers.
    hidden_dropout_prob (float):
        Dropout probabilty of the point-wise feed-forward and
        embedding layers.
    hidden_width (int):
        Width of the final representations.
    intermediate_width (int):
        Width of the intermediate projection layer in the
        point-wise feed-forward layer.
    layer_norm_eps (float):
        Epsilon for layer normalization.
    max_position_embeddings (int):
        Maximum length of position embeddings.
    model_max_length (int):
        Maximum length of model inputs.
    num_attention_heads (int):
        Number of self-attention heads.
    num_hidden_layers (int):
        Number of hidden layers.
    padding_idx (int):
        Index of the padding meta-token.
    type_vocab_size (int):
        Type vocabulary size.
    torchscript (bool):
        Set to `True` when loading TorchScript models, `False` otherwise.
    mixed_precision (bool):
        Use mixed-precision training.
    grad_scaler_config (dict):
        Configuration passed to the PyTorch gradient scaler.
    wrapped_listener (Optional[TransformerListenerModelT]):
        Optional listener to wrap. Only used when replacing listeners
        in downstream components.
    r\   rN   rO   rP   r`   r]   rW   rW   rX   "build_roberta_transformer_model_v1}  r_   rb   c                 C   r[   )aq  Construct a XLM-RoBERTa transformer model.

    vocab_size (int):
        Vocabulary size.
    with_spans (Callable):
        Callback that constructs a span generator model.
    piece_encoder (Model)
        The piece encoder to segment input tokens.
    attention_probs_dropout_prob (float):
        Dropout probabilty of the self-attention layers.
    hidden_act (str):
        Activation used by the point-wise feed-forward layers.
    hidden_dropout_prob (float):
        Dropout probabilty of the point-wise feed-forward and
        embedding layers.
    hidden_width (int):
        Width of the final representations.
    intermediate_width (int):
        Width of the intermediate projection layer in the
        point-wise feed-forward layer.
    layer_norm_eps (float):
        Epsilon for layer normalization.
    max_position_embeddings (int):
        Maximum length of position embeddings.
    model_max_length (int):
        Maximum length of model inputs.
    num_attention_heads (int):
        Number of self-attention heads.
    num_hidden_layers (int):
        Number of hidden layers.
    padding_idx (int):
        Index of the padding meta-token.
    type_vocab_size (int):
        Type vocabulary size.
    torchscript (bool):
        Set to `True` when loading TorchScript models, `False` otherwise.
    mixed_precision (bool):
        Use mixed-precision training.
    grad_scaler_config (dict):
        Configuration passed to the PyTorch gradient scaler.
    wrapped_listener (Optional[TransformerListenerModelT]):
        Optional listener to wrap. Only used when replacing listeners
        in downstream components.
    r\   rN   rO   rP   r`   r]   rW   rW   rX   build_xlmr_transformer_model_v1  r_   rc   rQ   c                 C   s$   t | ||d}|d urt||S |S )N)rK   rQ   rL   )build_transformer_model_v1r%   )rK   rQ   rL   rI   thinc_transformerrW   rW   rX   rT   S  s   

rT   c                 C   sH   t t|| |t g}||d}tdtt||ttdd|didS )N)rL   rQ   transformer_model)replace_listenerreplace_listener_cfgnO)initlayersrefsattrsdims)	r4   r   r)   r   transformer_model_forwardtransformer_model_initr&   r'   get_dim)rK   rQ   rL   rk   rl   rW   rW   rX   rd   f  s$   rd   modeldocsis_trainc                    s*   | j d ||d\}  fdd}||fS )Nr   )rt   c                    s    |  g S NrW   )dYbackprop_layerrW   rX   backprop  s   z+transformer_model_forward.<locals>.backprop)rk   )rr   rs   rt   Yry   rW   rw   rX   ro     s   ro   Xc                 C   s   | j d || | S )Nr   )rk   
initialize)rr   r{   rz   rW   rW   rX   rp     s   rp   )rG   rH   rV   c             	   C   sf   t |tri }d|vr||d< tt| tt| j| jdt|t	di |d}|
d| d|jd< |S )Nenabledmax_model_seq_lenrD   )convert_inputsconvert_outputsrG   grad_scalerri   T_all_layer_outputsrW   )
isinstancer   r   r   r   _convert_inputsmax_seq_lenrD   _convert_outputsr   set_dimrm   )rV   r<   rG   rH   rr   rW   rW   rX   rS     s$   

rS   c                 C   s   t tt| |dtdS )Nr~   )r   r   )r   r   r   r   rN   rW   rW   rX   rR     s   rR   r   c                   s   | j tdd  D }||krtd| d| djt |f|}tt D ]} | }|jd }	|||d |	f< q,t|}dt	f fdd	}
t
|fi d
}||
fS )Nc                 s   s    | ]}|j V  qd S ru   )size.0xrW   rW   rX   	<genexpr>  s    z"_convert_inputs.<locals>.<genexpr>zAAt least one sequence in the transformer's input has a length of z>, which is larger than the model's maximum sequence length of z tokensr   d_inputsc                    s   fdd D S )Nc                    s   g | ]
}  |jd  qS r   )alloc1fshaper   opsrW   rX   
<listcomp>  s    zH_convert_inputs.<locals>.convert_from_torch_backward.<locals>.<listcomp>rW   )r   r{   r   rW   rX   convert_from_torch_backward  s   z4_convert_inputs.<locals>.convert_from_torch_backwardargskwargs)r   max
ValueErrorxpfulllenranger   r   r   r   )rr   r{   rt   r   rD   r   Xtispanspan_lenr   outputrW   r   rX   r     s&   
r   inputs_outputsc           	         s   |\}| j | jd }dd |D }|r!fddt|D  nfddt|D  fdd D }t|| d}dttt  f fd	d
}||fS )Nr   c                 S   s   g | ]}|j d  qS r   )r   r   rW   rW   rX   r     s    z$_convert_outputs.<locals>.<listcomp>c                    s&   g | ]\  fd dj D qS )c                    s"   g | ]}| d d d f qS ru   rW   r   r   r   r   rW   rX   r     s   " /_convert_outputs.<locals>.<listcomp>.<listcomp>all_outputs)r   model_outputsr   rX   r     s    c                    s.   g | ]\}} j d  |d|ddf gqS )Nr   )r   r   r   r   rW   rX   r     s    c                    s   g | ]} fd d|D qS )c                    s   g | ]}t tt| d qS )r   )r
   r    r   )r   layerr   rW   rX   r         r   rW   r   r   rW   rX   r     s    )outputslast_layer_onlyrv   c                    sB   dd  D }dd | D }t |t |ksJ t|fd|idS )Nc                 S   s   g | ]	}|D ]}|qqS rW   rW   r   inneryrW   rW   rX   r     s    zH_convert_outputs.<locals>.convert_for_torch_backward.<locals>.<listcomp>c                 S   s   g | ]}|D ]}t |qqS rW   )r   r   rW   rW   rX   r   	  r   grad_tensorsr   )r   r   )rv   Yt_flatdYt_flat)YtrW   rX   convert_for_torch_backward  s   z4_convert_outputs.<locals>.convert_for_torch_backward)r   rm   	enumerater(   r   r    )	rr   r   rt   model_inputsall_layer_outputs
input_lensrz   r   r   rW   )r   r   r   rX   r     s"   




r   pathc                    s   d fdd	}|S )zConstruct a callback that initializes a supported transformer
    model with weights from a PyTorch checkpoint.

    path (Path):
        Path to the PyTorch checkpoint.
    Nc                    sB   | j d j}t }tj |d}t||}|| || | S )Nr   )map_location)shims_modelr   torchloadr   load_state_dictto)rr   r{   rz   rV   deviceparamsr   rW   rX   r     s   


z0build_pytorch_checkpoint_loader_v1.<locals>.loadNNrW   )r   r   rW   r   rX   "build_pytorch_checkpoint_loader_v1  s   	r   r   )Z	functoolsr   pathlibr   typingr   r   r   r   r   r	   r
   r   "curated_transformers.models.albertr   r    curated_transformers.models.bertr   r   /curated_transformers.models.curated_transformerr   r   #curated_transformers.models.hf_utilr   "curated_transformers.models.outputr   #curated_transformers.models.robertar   r   spacy.tokensr   
spacy.utilr   	thinc.apir   r   r   r   r   r   thinc.layersr   thinc.modelthinc.shims.pytorch_grad_scalerr   thinc.typesr   r    r!   tokenization.typesr#   	listenersr%   r&   r'   r   r(   remove_eos_bosr)   typesr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   intfloatstrbooldictrY   r^   ra   rb   rc   rT   rd   ro   rp   rS   rR   r   r   r   rW   rW   rW   rX   <module>   sD   $ 0	


z	


s	


s	


s	


k
	

"



!
!

)