o
    TiX                     @   s   d dl mZmZmZ d dlZd dlmZ ddlm	Z	 ddl
mZmZ ddlT ddlT ddlT ddlmZ d	d
lmZmZ G dd deZdS )    )IterableOptionalTupleN   )
empty_from)ActivationType	DtypeEnum   )*)RaggedBatchWrapper   )FalconNonTransformerContainerFalconTransformerContainerc                   @   s  e Zd ZU dZee ed< 	 eee  ed< 	 	 e	de
fddZ	 e	de
fddZe	de
fd	d
Ze	de
fddZe	de
fddZe	de
fddZe	de
fddZe	de
fddZe	defddZe	defddZe	defddZe	defddZe	defddZ	 dedejfd d!Z d"e
d#ejd$ejd%ede!ejejf f
d&d'Z"d$ejd%edejfd(d)Z#d*edejfd+d,Z$d-S ).FalconInferenceModelzP
    Inference model implementation for ragged batching for Llama-2 models.
    _non_transformer_transformerreturnc                 C      | j jS N)_configmax_seq_lengthself r   m/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/inference/v2/model_implementations/falcon/model.pymax_sequence_length(      z(FalconInferenceModel.max_sequence_lengthc                 C   r   r   )r   num_hidden_layersr   r   r   r   
num_layers0   r   zFalconInferenceModel.num_layersc                 C   r   r   r   hidden_sizer   r   r   r   	model_dim4   r   zFalconInferenceModel.model_dimc                 C   r   r   )r   
vocab_sizer   r   r   r   r"   8   r   zFalconInferenceModel.vocab_sizec                 C   s   | j | j S r   )r!   n_headsr   r   r   r   	head_size<      zFalconInferenceModel.head_sizec                 C   r   r   )r   num_attention_headsr   r   r   r   r#   @   r   zFalconInferenceModel.n_headsc                 C   s   d| j j S )N   r   r   r   r   r   intermediate_dimD   r%   z%FalconInferenceModel.intermediate_dimc                 C   s   | j js| j js| j jS dS )Nr   )r   new_decoder_architecturemulti_querynum_kv_headsr   r   r   r   
n_heads_kvH   s
   
zFalconInferenceModel.n_heads_kvc                 C   s0   | j jtjkr
tjS | j jtjkrtjS td)Nz Only fp16 and bf16 are supported)	r   torch_dtypetorchfloat16r   fp16bfloat16bf16NotImplementedErrorr   r   r   r   activation_dtypeM   s
   z%FalconInferenceModel.activation_dtypec                 C      t jS r   )r   GELUr   r   r   r   mlp_activation_fnV      z&FalconInferenceModel.mlp_activation_fnc                 C   r5   r   )NormTypeEnum	LayerNormr   r   r   r   	norm_typeZ   r8   zFalconInferenceModel.norm_typec                 C   r5   r   )PositionalEmbeddingTyperotate_halfr   r   r   r   positional_embedding_type^   r8   z.FalconInferenceModel.positional_embedding_typec                 C   s   t  S )zG
        The positional embedding configuration for the model.
        )RotateHalfConfigr   r   r   r   positional_embedding_configb   s   z0FalconInferenceModel.positional_embedding_configragged_batchc                 C   s<   |  || jj}|jd | jkrtd|j d| j |S )z
        Performs the embedding lookup prior to running the transformer of the model.

        Arguments:
            ragged_batch (RaggedBatchWrapper): The batch to embed.

        Returns:
            torch.Tensor: The embedded batch.
        zEmbedding output shape z does not match model_dim )embedr   word_embshaper!   
ValueError)r   rA   rC   r   r   r   _forward_embedm   s   
z#FalconInferenceModel._forward_embed	layer_idxresidualhidden_statesragged_batch_infoc                 C   s  | j jsJ d| j| }| j|}|}| j||jdd}| |||}| j||j	dd}	| j j
r@| j|d|j|jd\}}
n|}
| j|
|jdd}| j||jdd}||	 | jdkrftj|| jd || jd kr| j|d  }| j|||j|jd\}}||fS || ||fS )aL  
        Executes one (slightly offset) layer of the transformer. This implementation does a peak-ahead
        optimization to fuse the layer norm of the next layer into the current layer.

        Arguments:
            layer_idx (int): The index of the layer to execute.
            residual (torch.Tensor): The residual tensor from the previous layer.
            hidden_states (torch.Tensor): The hidden states from the previous layer. This is the
                hidden states after pre normalization.
            ragged_batch_info (RaggedBatchWrapper): The batch metadata.
        z3Only parallel attention implementation is supportedN)bgammabetar   group)rO   )configparallel_attnr   state_manager	get_cacheqkvqkv_wattnattn_out
attn_out_wr)   normln_mlp_gammaln_mlp_betamlp_1mlp_1_wmlp_2mlp_2_wadd_tp_sizedist
all_reduce_base_mp_groupr   ln_attn_gammaln_attn_beta)r   rH   rI   rJ   rK   
cur_paramskv_cacheattn_ln_outattn_hidden_stateattention_output
mlp_ln_outmlp_hidden_state
mlp_outputnext_paramsr   r   r   _forward_transformer_layer~   s:   



	
z/FalconInferenceModel._forward_transformer_layerc                 C   s   | j || jj|| jj| jjd}| jdkrNt| j| j|jd |jd f}t| j	|jd | j
f}tj||| jd ||ddd|jd | j
 |S |S )z
        Performs unembedding of the hidden states to logits. This will only sample the final
        token of each sequence.
        rM   r   r   rP   r	   )unembedr   word_unembedfinal_norm_gammafinal_norm_betarc   r   _comm_logitsrE   _return_logitsr"   rd   all_gather_into_tensorrf   copy_permutereshape)r   rJ   rK   logitscomm_bufferfull_logitsr   r   r   _forward_unembed   s   
 $z%FalconInferenceModel._forward_unembedwrapped_batchc                 C   s`   |  |}| j|d | jd j| jd jd\}}t| jD ]}| ||||\}}q| ||S )Nr   rM   )	rG   r[   r   rg   rh   ranger   rr   r   )r   r   rI   rJ   rH   r   r   r   forward   s   





zFalconInferenceModel.forwardN)%__name__
__module____qualname____doc__r   r   __annotations__r   r   propertyintr   r   r!   r"   r$   r#   r(   r,   r   r4   r   r7   r9   r;   r<   r>   r?   r@   r   r.   TensorrG   r   rr   r   r   r   r   r   r   r      sV   
 
4r   )typingr   r   r   r.   deepspeed.commcommrd   	allocatorr   inference_utilsr   r    modules.configsmodules.interfacesraggedr   	containerr   r   DSTransformerModelBaser   r   r   r   r   <module>   s   