o
    ir                     @   s  d Z ddlmZmZmZ ddlZddlmZm	Z	m
Z
 G dd de
jZG dd de
jZG d	d
 d
e
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZdS )z0Declares specification of the Transformer model.    )OptionalTupleUnionN)attention_speccommon_spec
model_specc                   @   sn   e Zd Zddejjdejjddddddfdedede	de	dejd	ed
ejde	de	de	de	de	de	fddZ
dS )TransformerEncoderSpecTF   
num_layers	num_headspre_normno_final_norm
activationnum_source_embeddingsembeddings_mergelayernorm_embeddingrelative_positionrelative_attention_biasffn_glurms_normmulti_query_attentionc                    s   | _ td|| _|| _td|| _td|| _dd t|D | _	d| _
s6s6t | _|rA|sAtjd| _|rJtjd| _ fddt|D | _dS )	a  Initializes a Transformer encoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention.
        int16int8c                 S   s   g | ]}t  qS  )r   EmbeddingsSpec.0_r   r   _/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/ctranslate2/specs/transformer_spec.py
<listcomp>5   s    z3TransformerEncoderSpec.__init__.<locals>.<listcomp>Tr   c              	      s&   g | ]}t  rd nddqS )r	   N)r   r   r   r   num_heads_kv)TransformerEncoderLayerSpecr   r   r   r   r   r   r   r   r   ?   s    
N)r   npdtypetyper   r   r   r   range
embeddingsscale_embeddingsPositionEncoderSpecposition_encodingsr   LayerNormSpec
layer_normr   layer)selfr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r#   r   __init__   s$   %zTransformerEncoderSpec.__init__N)__name__
__module____qualname__r   
ActivationRELUEmbeddingsMergeCONCATintboolr0   r   r   r   r   r   
   sN    	
r   c                F   @   s  e Zd Zdejjddddddddddddddddddddddddddddddf ded	ed
edejdedededededededededededededee dedee	j
 dedededed ed!ed"ed#ed$ee d%ee d&ee d'eej d(ee d)ee fDd*d+Zed,d- ZdS ).TransformerDecoderSpecTFr	   N'  r   r
   r   r   r   r   with_encoder_attentionr   project_in_outr   r   alignment_layeralignment_headsr   r   alibialibi_use_positive_positionsscale_alibi
rotary_dimrotary_interleaverotary_scaling_typerotary_scaling_factorrotary_base original_max_position_embeddingsmax_position_embeddingsparallel_residualshared_layer_normpre_post_layer_normr   r!   head_dimsliding_window
quant_typequant_group_size
quant_bitsc#           #         s  t  | _r|stdrtd|r"dur dkr tddr1dd|fvr1td td|| _|| _td|| _td|| _	td|| _
t | _d	| _tj| _|| _|| _|| _durztd
| _ss|sdu rt | _|r|stj	d| _|rtj	d| _t | _ 	
fddt|D | _d| _|pƈ|k| jd< |rt | _ t | _!| dur| | jd< |"| jd< |!| jd< dS dS )a  Initializes a Transformer decoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          with_encoder_attention: Enable the encoder attention sublayers.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          project_in_out: Add linear transformations after the embedding layer and before
            the final layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: Add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          sliding_window: Max sequence length to retain in KV Cache.
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
        z/The GPT-J block expects a pre-norm architecturez-The GPT-J block does not have cross attentionNr	   z5Enabling multi_query_attention implies num_heads_kv=1z>num_heads_kv=%d is not supported in the cross-attention layersr   r   Tint32r    c                    s   g | ]=}t di d ddd d	ddddd	
d
dddddddqS )r=   r   r   r   r   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   r!   rN   rO   r   )TransformerDecoderLayerSpecr   r   rN   rJ   r!   rI   rK   rM   r   r   r   rH   rD   rE   rG   rF   rL   rO   r=   r   r   r      sP    	
z3TransformerDecoderSpec.__init__.<locals>.<listcomp>Fr   quantization_typequantization_bitsquantization_group_size)"dict_config
ValueErrorr$   r%   r&   r   r   r   r?   r@   r   r   r(   r)   r   OPTIONALscale_outputsrA   rB   rC   rO   r*   r+   r,   r-   r   
LinearSpec
projectionr'   r.   start_from_zero_embedding
project_inproject_out)#r/   r
   r   r   r   r   r=   r   r>   r   r   r?   r@   r   r   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   r   r!   rN   rO   rP   rQ   rR   r   rU   r   r0   L   sv   T

,



zTransformerDecoderSpec.__init__c                 C   s   | j S N)rZ   r/   r   r   r   config   s   zTransformerDecoderSpec.config)r1   r2   r3   r   r4   r5   r8   r9   r   r   RotaryScalingTypefloatQuantizationr0   propertyre   r   r   r   r   r:   K   s    	
 !"#
 'r:   c                   @   s"   e Zd Z						dddZdS )r"   FNc                 C   s*   t jd|||||d| _t||d| _d S )NT)self_attentionr   r   r   r!   rO   glur   )r   MultiHeadAttentionSpecrj   FeedForwardSpecffn)r/   r   r   r   r   r!   rO   r   r   r   r0      s   	z$TransformerEncoderLayerSpec.__init__)FFFFNNr1   r2   r3   r0   r   r   r   r   r"      s    r"   c                   @   s:   e Zd Z																		d	ddZdS )
rT   TFNr	   r<   r   c                 C   s   t jd|||||||	|
|||||d| _|rt j|||d| _t||d| _|rF|r0t | _n
t | _	t | _
t| jd t| jd |rrtj|d| _	tj|d| _
tj|d| _tj|d| _t| jd t| jd d S d S )NT)rj   r   r   r   rD   rE   rF   rG   rH   rI   rJ   r!   rN   rO   )r   r!   rO   rk   r-   r    )r   rm   rj   	attentionrn   ro   r   r,   rL   input_layer_normpost_attention_layer_normdelattrpre_feedforward_layer_normpost_feedforward_layer_norm)r/   r=   r   r   r   r   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   r!   rN   rO   r   r   r   r0     sX   

z$TransformerDecoderLayerSpec.__init__)TFFFFNTNr	   r<   r   r   FFFNNNrp   r   r   r   r   rT     s(    rT   c                   @   s   e Zd ZdddZdS )rn   Fc                 C   s8   t j|d| _t  | _t  | _|rt  | _d S d S )Nr    )r   r,   r-   r^   linear_0linear_1linear_0_noact)r/   rl   r   r   r   r   r0   W  s   

zFeedForwardSpec.__init__N)FFrp   r   r   r   r   rn   V  s    rn   c                   @   s   e Zd Zdd ZdS )r*   c                 C   s   t j| _d S rc   )r   r\   	encodingsrd   r   r   r   r0   `  s   zPositionEncoderSpec.__init__Nrp   r   r   r   r   r*   _  s    r*   c                       ,   e Zd ZdZddee f fddZ  ZS )TransformerConfigz%Configuration for Transformer models.Nlayer_norm_epsilonc                       t  jdd|i| dS )zInitializes the configuration for Transformer models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r}   Nr   superr0   r/   r}   kwargs	__class__r   r   r0   g     zTransformerConfig.__init__rc   r1   r2   r3   __doc__r   rg   r0   __classcell__r   r   r   r   r|   d       r|   c                        s   e Zd ZdZdedef fddZedddej	j
dd	d	ejjdddddfd
eeeeef f dededededej	dedededejdededededefddZedd Zedd Zdd  Zd!d" Zd#d$ Z  ZS )%TransformerSpeczDescribes a Transformer model.

    The specification is invariant to hidden dimensions but requires to
    explicitly set the number of layers and attention heads.
    encoderdecoderc                    sP   t |ts	tdt |tstdt   || _|| _| j	d| jj
 dS )zInitializes a Transformer model specification.

        Args:
          encoder: The encoder specification.
          decoder: The decoder specification.
        1encoder argument must be a TransformerEncoderSpec1decoder argument must be a TransformerDecoderSpecr   N)
isinstancer   	TypeErrorr:   r   r0   r   r   rZ   add_attributer   )r/   r   r   r   r   r   r0   x  s   
	

zTransformerSpec.__init__FTr;   r	   r
   r   with_relative_positionr   r   r   r?   r@   r   r   r   r   r   r   r   c                 C   sp   t |ttfr|\}}n||}}t||||||	|
||||||d}t|||||||||||||d}| ||S )a  Creates a Transformer model specification.

        Args:
          num_layers: Number of encoder and decoder layers, or a 2-tuple if the
            number is different.
          num_heads: Number of attention heads.
          with_relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layer as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention.
        )r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r?   r@   r   r   r   )r   listtupler   r:   )clsr
   r   r   r   r   r   r?   r@   r   r   r   r   r   r   r   num_encoder_layersnum_decoder_layersr   r   r   r   r   from_config  sD   +


zTransformerSpec.from_configc                 C      dS )Nr   r   rd   r   r   r   name     zTransformerSpec.namec                 C   r   )N   r   rd   r   r   r   revision  r   zTransformerSpec.revisionc                 C      t  S rc   )r|   rd   r   r   r   get_default_config     z"TransformerSpec.get_default_configc                 C   s   dd | j jD S )Nc                 S   s   g | ]}|j jd  qS )r   )weightshape)r   specr   r   r   r     s    z>TransformerSpec.get_source_vocabulary_size.<locals>.<listcomp>)r   r(   rd   r   r   r   get_source_vocabulary_size  s   z*TransformerSpec.get_source_vocabulary_sizec                 C      | j jjjd S Nr   r   r(   r   r   rd   r   r   r   get_target_vocabulary_size     z*TransformerSpec.get_target_vocabulary_size)r1   r2   r3   r   r   r:   r0   classmethodr   r4   r5   r6   r7   r   r8   r   r9   r   ri   r   r   r   r   r   r   r   r   r   r   r   q  sv    	
Q

r   c                       r{   )TransformerDecoderModelConfigz-Configuration for Transformer decoder models.Nr}   c                    r~   )zInitializes the configuration for Transformer decoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r}   Nr   r   r   r   r   r   r0     r   z&TransformerDecoderModelConfig.__init__rc   r   r   r   r   r   r     r   r   c                ?       s8  e Zd ZdZdef fddZedejj	dddddddddddddd	d
d
ddddddddddfde
de
dedejdedededededededededee
 dedeej dedede
de
ded ed!ed"ed#ee
 d$ee
 d%ee
 d&eej d'ee
 d(ee
 f<d)d*Zed+d, Zed-d. Zd/d0 Zd1d2 Z  ZS )3TransformerDecoderModelSpecz3Describes a Transformer decoder model (e.g. GPT-2).r   c                    sJ   t |ts	tdt   || _| jj D ]\}}| j	|| qdS )z|Initializes a Transformer decoder model specification.

        Args:
          decoder: The decoder specification.
        r   N)
r   r:   r   r   r0   r   re   itemsrZ   r   )r/   r   keyvaluer   r   r   r0     s   

z$TransformerDecoderModelSpec.__init__TFNr	   r<   r   r
   r   r   r   r   r   r>   r   r   r   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   r   r!   rN   rO   rP   rQ   rR   c                  C   s   t ||fi d|d|d|ddd|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|}| |S )a!
  Creates a Transformer decoder model specification.

        Args:
          num_layers: Number of decoder layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          no_final_norm: Do not apply layer normalization after the last decoder block.
          project_in_out: Add a linear layer after the embedding layer and another one
            before the final output projection.
          with_relative_position: Enable relative position representations modules.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of head
          sliding_window: max sequence length to retain KV cache
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
        r   r   r   r=   Fr   r>   r   r   r   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   r   r!   rN   rO   rP   rQ   rR   )r:   ) r   r
   r   r   r   r   r   r>   r   r   r   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   r   r!   rN   rO   rP   rQ   rR   r   r   r   r   r     s~   K	
"z'TransformerDecoderModelSpec.from_configc                 C   r   )Nr:   r   rd   r   r   r   r   ~  r   z TransformerDecoderModelSpec.namec                 C   r   )N   r   rd   r   r   r   r     r   z$TransformerDecoderModelSpec.revisionc                 C   r   rc   )r   rd   r   r   r   r     r   z.TransformerDecoderModelSpec.get_default_configc                 C   r   r   r   rd   r   r   r   get_vocabulary_size  r   z/TransformerDecoderModelSpec.get_vocabulary_size)r1   r2   r3   r   r:   r0   r   r   r4   r5   r8   r9   r   r   rf   rg   rh   r   ri   r   r   r   r   r   r   r   r   r   r     s    	
n

r   c                       r{   )TransformerEncoderModelConfigz-Configuration for Transformer encoder models.Nr}   c                    r~   )zInitializes the configuration for Transformer encoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r}   Nr   r   r   r   r   r   r0     r   z&TransformerEncoderModelConfig.__init__rc   r   r   r   r   r   r     r   r   c                       sb   e Zd ZdZdejjfdededejf fddZ	e
dd	 Ze
d
d Zdd Zdd Z  ZS )TransformerEncoderModelSpecz2Describes a Transformer encoder model (e.g. BERT).Fr   pooling_layerpooling_activationc                    s\   t |ts	tdt   || _| jd| jj |r,t	
 | _td|| _dS dS )zInitializes a Transformer encoder model specification.

        Args:
          encoder: The encoder specification.
          pooling_layer: Add the pooling layer.
          pooling_activation: The activation to apply after the pooling layer.
        r   r   r   N)r   r   r   r   r0   r   rZ   r   r   r   r^   pooler_denser$   r%   r&   pooler_activation)r/   r   r   r   r   r   r   r0     s   


z$TransformerEncoderModelSpec.__init__c                 C   r   )Nr   r   rd   r   r   r   r     r   z TransformerEncoderModelSpec.namec                 C   r   )Nr	   r   rd   r   r   r   r     r   z$TransformerEncoderModelSpec.revisionc                 C   r   rc   )r   rd   r   r   r   r     r   z.TransformerEncoderModelSpec.get_default_configc                 C   s   | j jd jjd S r   )r   r(   r   r   rd   r   r   r   r     s   z/TransformerEncoderModelSpec.get_vocabulary_size)r1   r2   r3   r   r   r4   Tanhr   r9   r0   ri   r   r   r   r   r   r   r   r   r   r     s"    

r   )r   typingr   r   r   numpyr$   ctranslate2.specsr   r   r   	LayerSpecr   r:   r"   rT   rn   r*   SequenceToSequenceModelConfigr|   SequenceToSequenceModelSpecr   LanguageModelConfigr   LanguageModelSpecr   r   r   r   r   r   r   <module>   s&    A -J	  