o
    Ti                     @   s  d Z ddlmZmZmZ ddlZddlmZm	Z	m
Z
 G dd de
jZG dd de
jZG d	d
 d
e
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZdS )z0Declares specification of the Transformer model.    )OptionalTupleUnionN)attention_speccommon_spec
model_specc                /   @   s   e Zd Zddejjdejjddddddddddddddddfdedede	d	e	d
ejdedejde	de	de	de	de	de	de
e de
e de
e de	de
ej dedede
e de
e	 de	f.ddZdS )TransformerEncoderSpecTF   N'  
num_layers	num_headspre_normno_final_norm
activationnum_source_embeddingsembeddings_mergelayernorm_embeddingrelative_positionrelative_attention_biasffn_glurms_normmulti_query_attentionnum_heads_kvhead_dim
rotary_dimrotary_interleaverotary_scaling_typerotary_scaling_factorrotary_basesliding_windowqk_normpre_post_layer_normc                    s  |rdurdkrt dd|| _td|| _|| _td|| _td|| _dd t	|D | _
d| _sFsFt | _|rQ|sQtjd	| _|rZtjd	| _|durgtd
|| _ 	
fddt	|D | _dS )a'  Initializes a Transformer encoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of dimensions per attention head.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          sliding_window: Max sequence length to retain in KV Cache.
          qk_norm: Apply layer normalization to the query and key projections.
          pre_post_layer_norm: Add post layer norm for each pre norm layer.
        Nr	   5Enabling multi_query_attention implies num_heads_kv=1int16int8c                 S   s   g | ]}t  qS  )r   EmbeddingsSpec.0_r%   r%   V/home/ubuntu/.local/lib/python3.10/site-packages/ctranslate2/specs/transformer_spec.py
<listcomp>S   s    z3TransformerEncoderSpec.__init__.<locals>.<listcomp>Tr   int32c                    s.   g | ]}t  	
d qS ))r   r   r   r   r   r   r   r   r   r   r   r    r!   )TransformerEncoderLayerSpecr'   r   r   r   r!   r    r   r   r   r   r   r   r   r   r%   r*   r+   `   s$    )
ValueErrorr   npdtypetyper   r   r   r   range
embeddingsscale_embeddingsPositionEncoderSpecposition_encodingsr   LayerNormSpec
layer_normr   r   layer)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r%   r/   r*   __init__   s4   <"zTransformerEncoderSpec.__init__)__name__
__module____qualname__r   
ActivationRELUEmbeddingsMergeCONCATintboolr   r   RotaryScalingTypefloatr=   r%   r%   r%   r*   r   
   s    	
r   c                J   @   s&  e Zd Zdejjddddddddddddddddddddddddddddddddf"ded	ed
edejdedededededededededededededee dedee	j
 dedededed ed!ed"ed#ed$ee d%ee d&ee d'eej d(ee d)ee d*ed+ee fHd,d-Zed.d/ ZdS )0TransformerDecoderSpecTFr	   Nr
   r   r   r   r   r   r   with_encoder_attentionr   project_in_outr   r   alignment_layeralignment_headsr   r   alibialibi_use_positive_positionsscale_alibir   r   r   r   r    original_max_position_embeddingsmax_position_embeddingsparallel_residualshared_layer_normr!   r   r   r   r   
quant_typequant_group_size
quant_bitsr     external_pre_post_encoder_layersc%           %         s  t  | _r|stdrtd|r"dur dkr tddtd|| _|| _td|| _td|| _	td|| _
t | _d| _tj| _|| _|| _|| _durktd	| _
sy	sy|sydu ryt | _|r|stjd
| _|rtjd
| _t | _ 	
fddt|D | _d| _|p|k| jd< |rt | _ t | _!| r| | jd< |"| jd< |!| jd< dS dS )a.  Initializes a Transformer decoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          with_encoder_attention: Enable the encoder attention sublayers.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          project_in_out: Add linear transformations after the embedding layer and before
            the final layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: Add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          sliding_window: Max sequence length to retain in KV Cache.
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
          external_pre_post_encoder_layers: if the encoder attention pre and processing
            is done outside the attention.
        z/The GPT-J block expects a pre-norm architecturez-The GPT-J block does not have cross attentionNr	   r"   r#   r$   Tr-   r,   c                    s   g | ]C}t di d d
d	ddddddd	d
ddddddddd qS )rK   r   r   r   r   r   r   r   r   r   rR   rS   rT   rU   r!   r   r   r   r    rY   r%   )TransformerDecoderLayerSpecr'   rY   r   r   rS   r   rR   rT   r!   r    r   r   r   r   r   r   r   r   rU   r   rK   r%   r*   r+      sX    	
z3TransformerDecoderSpec.__init__.<locals>.<listcomp>Fr   quantization_typequantization_bitsquantization_group_size)"dict_configr0   r1   r2   r3   r   r   r   rM   rN   r   r&   r5   r6   r   OPTIONALscale_outputsrO   rP   rQ   r   r7   r8   r9   r:   r   
LinearSpec
projectionr4   r;   start_from_zero_embedding
project_inproject_out)%r<   r   r   r   r   r   rK   r   rL   r   r   rM   rN   r   r   rO   rP   rQ   r   r   r   r   r   rR   rS   rT   rU   r!   r   r   r   r   rV   rW   rX   r    rY   r%   r[   r*   r=   u   sj   X

0



zTransformerDecoderSpec.__init__c                 C   s   | j S N)r`   r<   r%   r%   r*   config  s   zTransformerDecoderSpec.config)r>   r?   r@   r   rA   rB   rE   rF   r   r   rG   rH   Quantizationr=   propertyrj   r%   r%   r%   r*   rI   t   s    	
 !"#$%
 'rI   c                   @   sV   e Zd Z														ddee dedeej d	ed
edefddZ	dS )r.   FNTr	   r
   r   r   r   r   r   r!   c                 C   s   t jd||||||||	|
|||d| _t||d| _|rFtj|d| _tj|d| _tj|d| _	tj|d| _
t| jd t| jd d S d S )NT)self_attentionr   r   r   r   r   r   r   r   r   r   r   r    glur   r,   r:   )r   MultiHeadAttentionSpecrm   FeedForwardSpecffnr   r9   input_layer_normpost_attention_layer_normpre_feedforward_layer_normpost_feedforward_layer_normdelattr)r<   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r%   r%   r*   r=   !  s<   z$TransformerEncoderLayerSpec.__init__)FFFFNNNNTNr	   r
   FF)
r>   r?   r@   r   rE   rF   r   rG   rH   r=   r%   r%   r%   r*   r.      s8    	
r.   c                   @   s>   e Zd Z																				d	ddZdS )
rZ   TFNr	   r
   r   c                 C   s  t jd|||||||	|
||||||d| _|r%t j||||||du d| _t||d| _|rL|r6t | _n
t | _	t | _
t| jd t| jd |rtj|d| _	tj|d| _
|rn|rntj|d| _tj|d| _tj|d| _tj|d| _t| jd t| jd d S d S )NT)rm   r   r   r   r   r   r   r   r   rR   rS   r   r   r   r    F)r   r   r   r   r    has_normrn   r:   r,   )r   rp   rm   	attentionrq   rr   r   r9   rU   rs   rt   rw   *external_post_encoder_attention_layer_norm)external_pre_encoder_attention_layer_normru   rv   )r<   rK   r   r   r   r   r   r   r   r   r   rR   rS   rT   rU   r!   r   r   r   r    rY   r%   r%   r*   r=   T  sj   	



z$TransformerDecoderLayerSpec.__init__)TFFFFNTNr	   r
   r   r   FFFNNNFFr>   r?   r@   r=   r%   r%   r%   r*   rZ   S  s,    rZ   c                   @   s   e Zd ZdddZdS )rq   Fc                 C   s8   t j|d| _t  | _t  | _|rt  | _d S d S )Nr,   )r   r9   r:   rc   linear_0linear_1linear_0_noact)r<   ro   r   r%   r%   r*   r=     s   

zFeedForwardSpec.__init__N)FFr|   r%   r%   r%   r*   rq     s    rq   c                   @   s   e Zd Zdd ZdS )r7   c                 C   s   t j| _d S rh   )r   ra   	encodingsri   r%   r%   r*   r=     s   zPositionEncoderSpec.__init__Nr|   r%   r%   r%   r*   r7     s    r7   c                       ,   e Zd ZdZddee f fddZ  ZS )TransformerConfigz%Configuration for Transformer models.Nlayer_norm_epsilonc                       t  jdd|i| dS )zInitializes the configuration for Transformer models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   Nr%   superr=   r<   r   kwargs	__class__r%   r*   r=        zTransformerConfig.__init__rh   r>   r?   r@   __doc__r   rH   r=   __classcell__r%   r%   r   r*   r          r   c                        s   e Zd ZdZdedef fddZedddej	j
dd	d	ejjdddddfd
eeeeef f dededededej	dedededejdededededefddZedd Zedd Zdd  Zd!d" Zd#d$ Z  ZS )%TransformerSpeczDescribes a Transformer model.

    The specification is invariant to hidden dimensions but requires to
    explicitly set the number of layers and attention heads.
    encoderdecoderc                    sP   t |ts	tdt |tstdt   || _|| _| j	d| jj
 dS )zInitializes a Transformer model specification.

        Args:
          encoder: The encoder specification.
          decoder: The decoder specification.
        1encoder argument must be a TransformerEncoderSpec1decoder argument must be a TransformerDecoderSpecr   N)
isinstancer   	TypeErrorrI   r   r=   r   r   r`   add_attributer   )r<   r   r   r   r%   r*   r=     s   
	

zTransformerSpec.__init__FTrJ   r	   r   r   with_relative_positionr   r   r   rM   rN   r   r   r   r   r   r   r   c                 C   sp   t |ttfr|\}}n||}}t||||||	|
||||||d}t|||||||||||||d}| ||S )a  Creates a Transformer model specification.

        Args:
          num_layers: Number of encoder and decoder layers, or a 2-tuple if the
            number is different.
          num_heads: Number of attention heads.
          with_relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layer as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention.
        )r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   rM   rN   r   r   r   )r   listtupler   rI   )clsr   r   r   r   r   r   rM   rN   r   r   r   r   r   r   r   num_encoder_layersnum_decoder_layersr   r   r%   r%   r*   from_config  sD   +


zTransformerSpec.from_configc                 C      dS )Nr   r%   ri   r%   r%   r*   name7     zTransformerSpec.namec                 C   r   )N   r%   ri   r%   r%   r*   revision;  r   zTransformerSpec.revisionc                 C      t  S rh   )r   ri   r%   r%   r*   get_default_config?     z"TransformerSpec.get_default_configc                 C   s   dd | j jD S )Nc                 S   s   g | ]}|j jd  qS )r   )weightshape)r(   specr%   r%   r*   r+   C  s    z>TransformerSpec.get_source_vocabulary_size.<locals>.<listcomp>)r   r5   ri   r%   r%   r*   get_source_vocabulary_sizeB  s   z*TransformerSpec.get_source_vocabulary_sizec                 C      | j jjjd S Nr   r   r5   r   r   ri   r%   r%   r*   get_target_vocabulary_sizeE     z*TransformerSpec.get_target_vocabulary_size)r>   r?   r@   r   r   rI   r=   classmethodr   rA   rB   rC   rD   r   rE   r   rF   r   rl   r   r   r   r   r   r   r%   r%   r   r*   r     sv    	
Q

r   c                       r   )TransformerDecoderModelConfigz-Configuration for Transformer decoder models.Nr   c                    r   )zInitializes the configuration for Transformer decoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   Nr%   r   r   r   r%   r*   r=   L  r   z&TransformerDecoderModelConfig.__init__rh   r   r%   r%   r   r*   r   I  r   r   c                @       s>  e Zd ZdZdef fddZedejj	dddddddddddddd	d
d
dddddddddddfde
de
dedejdedededededededededee
 dedeej dedede
de
ded ed!ed"ed#ee
 d$ee
 d%ee
 d&eej d'ee
 d(ee
 d)ef>d*d+Zed,d- Zed.d/ Zd0d1 Zd2d3 Z  ZS )4TransformerDecoderModelSpecz3Describes a Transformer decoder model (e.g. GPT-2).r   c                    sJ   t |ts	tdt   || _| jj D ]\}}| j	|| qdS )z|Initializes a Transformer decoder model specification.

        Args:
          decoder: The decoder specification.
        r   N)
r   rI   r   r   r=   r   rj   itemsr`   r   )r<   r   keyvaluer   r%   r*   r=   Y  s   

z$TransformerDecoderModelSpec.__init__TFNr	   r
   r   r   r   r   r   r   r   rL   r   r   r   rO   rP   rQ   r   r   r   r   r   rR   rS   rT   rU   r!   r   r   r   r   rV   rW   rX   r    c            !      C   s   t ||fi d|d|d|ddd|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|} | | S ) a!
  Creates a Transformer decoder model specification.

        Args:
          num_layers: Number of decoder layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          no_final_norm: Do not apply layer normalization after the last decoder block.
          project_in_out: Add a linear layer after the embedding layer and another one
            before the final output projection.
          with_relative_position: Enable relative position representations modules.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of head
          sliding_window: max sequence length to retain KV cache
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
        r   r   r   rK   Fr   rL   r   r   r   rO   rP   rQ   r   r   r   r   r   rR   rS   rT   rU   r!   r   r   r   r   rV   rW   rX   r    )rI   )!r   r   r   r   r   r   r   rL   r   r   r   rO   rP   rQ   r   r   r   r   r   rR   rS   rT   rU   r!   r   r   r   r   rV   rW   rX   r    r   r%   r%   r*   r   g  s   L	
 #z'TransformerDecoderModelSpec.from_configc                 C   r   )NrI   r%   ri   r%   r%   r*   r     r   z TransformerDecoderModelSpec.namec                 C   r   )N   r%   ri   r%   r%   r*   r     r   z$TransformerDecoderModelSpec.revisionc                 C   r   rh   )r   ri   r%   r%   r*   r     r   z.TransformerDecoderModelSpec.get_default_configc                 C   r   r   r   ri   r%   r%   r*   get_vocabulary_size  r   z/TransformerDecoderModelSpec.get_vocabulary_size)r>   r?   r@   r   rI   r=   r   r   rA   rB   rE   rF   r   r   rG   rH   rk   r   rl   r   r   r   r   r   r%   r%   r   r*   r   V  s    	
 p

r   c                       r   )TransformerEncoderModelConfigz-Configuration for Transformer encoder models.Nr   c                    r   )zInitializes the configuration for Transformer encoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   Nr%   r   r   r   r%   r*   r=     r   z&TransformerEncoderModelConfig.__init__rh   r   r%   r%   r   r*   r     r   r   c                       sb   e Zd ZdZdejjfdededejf fddZ	e
dd	 Ze
d
d Zdd Zdd Z  ZS )TransformerEncoderModelSpecz2Describes a Transformer encoder model (e.g. BERT).Fr   pooling_layerpooling_activationc                    s\   t |ts	tdt   || _| jd| jj |r,t	
 | _td|| _dS dS )zInitializes a Transformer encoder model specification.

        Args:
          encoder: The encoder specification.
          pooling_layer: Add the pooling layer.
          pooling_activation: The activation to apply after the pooling layer.
        r   r   r$   N)r   r   r   r   r=   r   r`   r   r   r   rc   pooler_denser1   r2   r3   pooler_activation)r<   r   r   r   r   r%   r*   r=     s   


z$TransformerEncoderModelSpec.__init__c                 C   r   )Nr   r%   ri   r%   r%   r*   r     r   z TransformerEncoderModelSpec.namec                 C   r   )Nr	   r%   ri   r%   r%   r*   r     r   z$TransformerEncoderModelSpec.revisionc                 C   r   rh   )r   ri   r%   r%   r*   r     r   z.TransformerEncoderModelSpec.get_default_configc                 C   s   | j jd jjd S r   )r   r5   r   r   ri   r%   r%   r*   r     s   z/TransformerEncoderModelSpec.get_vocabulary_size)r>   r?   r@   r   r   rA   Tanhr   rF   r=   rl   r   r   r   r   r   r%   r%   r   r*   r     s"    

r   )r   typingr   r   r   numpyr1   ctranslate2.specsr   r   r   	LayerSpecr   rI   r.   rZ   rq   r7   SequenceToSequenceModelConfigr   SequenceToSequenceModelSpecr   LanguageModelConfigr   LanguageModelSpecr   r   r   r%   r%   r%   r*   <module>   s&    j -3[	  