# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import torch
import torch.nn.functional as F
import enum
import deepspeed.comm as dist
from .async_linear import DominoAsyncColumnParallelLinear, RowParallelLinearNoComm


class LayerType(enum.Enum):
    encoder = 1
    decoder = 2


class AttnType(enum.Enum):
    self_attn = 1
    cross_attn = 2


class AttnMaskType(enum.Enum):
    padding = 1
    causal = 2


class ModelType(enum.Enum):
    encoder_or_decoder = 1
    encoder_and_decoder = 2


class DominoUtil:
    # Keys under which the async communication handles of the two
    # half-batches are registered.
    BATCH_0 = "BATCH0"
    BATCH_1 = "BATCH1"
    HANDLE_DIC = {}


class DominoModule(torch.nn.Module):
    """extensions of torch Module."""

    def __init__(self):
        super(DominoModule, self).__init__()
def _Wait_bwd_comm(input_, dic_, h_id):
    return NoOper.apply(input_, dic_, h_id)


class NoOper(torch.autograd.Function):
    """Identity op in the forward pass; in the backward pass, blocks on the
    async communication handle registered under `h_id` before the gradient
    is consumed."""

    @staticmethod
    def symbolic(graph, input_, handle_dic, h_id):
        return input_

    @staticmethod
    def forward(ctx, input_, handle_dic, h_id):
        ctx.handle_dic = handle_dic
        ctx.h_id = h_id
        return input_

    @staticmethod
    def backward(ctx, grad_output):
        handle = ctx.handle_dic[ctx.h_id]
        handle.wait()
        return grad_output, None, None
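# A minimal sketch of the protocol NoOper takes part in (the producer side
# lives in .async_linear; the variable names here are illustrative only):
#
#   handle = dist.all_reduce(grad, group=tp_group, async_op=True)   # producer
#   DominoUtil.HANDLE_DIC[DominoUtil.BATCH_0] = handle
#   ...
#   x = _Wait_bwd_comm(x, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_0)  # consumer
#
# The marker is an identity in forward; its backward waits on the recorded
# handle, so the async all-reduce is guaranteed to have completed before any
# gradient upstream of the marker is computed.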

class CoreAttention(DominoModule):

    def __init__(self, config, tp_world_size, attn_mask_type=AttnMaskType.padding):
        super(CoreAttention, self).__init__()
        self.attn_mask_type = attn_mask_type
        projection_size = config.kv_channels * config.num_attention_heads
        assert projection_size % tp_world_size == 0, \
            f"projection size {projection_size} should be multiple of TP world size {tp_world_size}"
        # Per tensor-parallel partition hidden size.
        self.hidden_size_per_partition = projection_size // tp_world_size
        self.attention_dropout_rate = config.attention_dropout

    def forward(self, query_layer, key_layer, value_layer, attention_mask):
        # q/k/v: [b, np, sq, hn]. Only causal masking is supported, so the
        # incoming attention_mask is not forwarded to SDPA.
        context_layer = torch.nn.functional.scaled_dot_product_attention(
            query_layer,
            key_layer,
            value_layer,
            attn_mask=None,
            dropout_p=self.attention_dropout_rate,
            is_causal=True,
            scale=None)

        # [b, np, sq, hn] --> [sq, b, np, hn]
        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()

        # [sq, b, np, hn] --> [sq, b, hp]
        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
        context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer
class ShardedAttention(DominoModule):
    """Sharded self-attention layer class.
    Only support self attention and causal attention mask for now.
    """

    def __init__(self,
                 config,
                 mpu,
                 apply_rotary_pos_emb,
                 layer_number,
                 attention_type=AttnType.self_attn,
                 attn_mask_type=AttnMaskType.padding):
        super(ShardedAttention, self).__init__()
        assert attention_type == AttnType.self_attn, "Only support self_attn for now!"
        self.layer_number = max(1, layer_number)
        self.attention_type = attention_type
        self.attn_mask_type = attn_mask_type
        self.params_dtype = config.params_dtype
        self.apply_rotary_pos_emb = apply_rotary_pos_emb

        query_projection_size = config.kv_channels * config.num_attention_heads
        kv_projection_size = config.kv_channels * config.num_attention_heads
        tp_world_size = mpu.get_tensor_model_parallel_world_size()
        self.hidden_size_per_attention_head = query_projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads // tp_world_size

        qkv_projection_per_partition = (query_projection_size + 2 * kv_projection_size) // tp_world_size
        self.query_key_value = DominoAsyncColumnParallelLinear(config.hidden_size,
                                                               qkv_projection_per_partition,
                                                               mpu.get_tensor_model_parallel_group(),
                                                               config=config,
                                                               init_method=config.init_method,
                                                               bias=config.add_bias_linear)

        self.core_attention = CoreAttention(config, tp_world_size, self.attn_mask_type)

        query_projection_size_per_partition = query_projection_size // tp_world_size
        self.dense = RowParallelLinearNoComm(query_projection_size_per_partition,
                                             config.hidden_size,
                                             config=config,
                                             init_method=config.output_layer_init_method,
                                             bias=config.add_bias_linear,
                                             skip_bias_add=True)

    def forward(self, hidden_states, attention_mask, micro_batch_num, rotary_pos_emb=None):
        # hidden_states: [sq, b, h]
        mixed_x_layer, _ = self.query_key_value(hidden_states, DominoUtil.HANDLE_DIC, micro_batch_num)

        # [sq, b, hp] --> [sq, b, np, 3 * hn]
        new_tensor_shape = mixed_x_layer.size()[:-1] + (
            self.num_attention_heads_per_partition,
            3 * self.hidden_size_per_attention_head,
        )
        mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

        # [sq, b, np, 3 * hn] --> [b, np, sq, 3 * hn]
        mixed_x_layer = mixed_x_layer.permute(1, 2, 0, 3).contiguous()

        # [b, np, sq, 3 * hn] --> 3 x [b, np, sq, hn]
        (query_layer, key_layer, value_layer) = torch.split(mixed_x_layer, [
            self.hidden_size_per_attention_head, self.hidden_size_per_attention_head,
            self.hidden_size_per_attention_head
        ], dim=3)
        query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1,
                                       self.hidden_size_per_attention_head)

        if rotary_pos_emb is not None:
            if isinstance(rotary_pos_emb, tuple):
                rotary_pos_emb = rotary_pos_emb
            else:
                rotary_pos_emb = ((rotary_pos_emb,) * 2)
            q_pos_emb, k_pos_emb = rotary_pos_emb
            query_layer = self.apply_rotary_pos_emb(query_layer, q_pos_emb)
            key_layer = self.apply_rotary_pos_emb(key_layer, k_pos_emb)

        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)

        # [sq, b, hp] --> [sq, b, h]; bias is returned unapplied (skip_bias_add).
        output, bias = self.dense(context_layer)

        return output, bias

    def domino_core_attention_forward(self, mixed_x_layer, attention_mask, rotary_pos_emb=None):
        # Same reshaping as forward(), but starting from an already-projected
        # QKV tensor and stopping before the output projection.
        new_tensor_shape = mixed_x_layer.size()[:-1] + (
            self.num_attention_heads_per_partition,
            3 * self.hidden_size_per_attention_head,
        )
        mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
        mixed_x_layer = mixed_x_layer.permute(1, 2, 0, 3).contiguous()

        (query_layer, key_layer, value_layer) = torch.split(mixed_x_layer, [
            self.hidden_size_per_attention_head, self.hidden_size_per_attention_head,
            self.hidden_size_per_attention_head
        ], dim=3)
        query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1,
                                       self.hidden_size_per_attention_head)

        if rotary_pos_emb is not None:
            if isinstance(rotary_pos_emb, tuple):
                rotary_pos_emb = rotary_pos_emb
            else:
                rotary_pos_emb = ((rotary_pos_emb,) * 2)
            q_pos_emb, k_pos_emb = rotary_pos_emb
            query_layer = self.apply_rotary_pos_emb(query_layer, q_pos_emb)
            key_layer = self.apply_rotary_pos_emb(key_layer, k_pos_emb)

        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)

        return context_layer
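# Note: the row-parallel output projections in this file are built with
# skip_bias_add=True, so they return (output, bias) with the bias unapplied.
# The bias may only be added after the tensor-parallel all-reduce of the
# output (otherwise it would be accumulated once per rank); it is therefore
# fused into the dropout + residual epilogue implemented just below.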
class bias_dropout_add(torch.nn.Module):

    def __init__(self, prob: float):
        super(bias_dropout_add, self).__init__()
        self.dropout = torch.nn.Dropout(prob)

    def forward(self, x: torch.Tensor, bias: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
        if bias is not None:
            x = x + bias
        out = self.dropout(x)
        out = residual + out
        return out
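# DominoTransformerLayer below consumes hidden_states as a (batch0, batch1)
# pair: the caller splits the input along the batch dimension so that the
# tensor-parallel all-reduce of one half can run asynchronously while the
# other half is being computed.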
class DominoTransformerLayer(DominoModule):
    """A domino single transformer layer.
    [s, b, h] -> [s, b, h]
    """

    def __init__(self,
                 config,
                 mpu,
                 apply_rotary_pos_emb,
                 layer_number,
                 layer_type=LayerType.encoder,
                 self_attn_mask_type=AttnMaskType.padding,
                 drop_path_rate=0.):
        super(DominoTransformerLayer, self).__init__()
        self.layer_number = layer_number
        self.layer_type = layer_type
        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
        self.llama_model = False

        self.input_layernorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layernorm_epsilon)
        self.self_attention = ShardedAttention(config,
                                               mpu,
                                               apply_rotary_pos_emb,
                                               layer_number,
                                               attention_type=AttnType.self_attn,
                                               attn_mask_type=self_attn_mask_type)
        self.hidden_dropout = config.hidden_dropout
        self.post_attention_layernorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layernorm_epsilon)

        ffn_hidden_size = config.ffn_hidden_size
        if config.gated_linear_unit:
            ffn_hidden_size *= 2
        self.output_size_c = config.ffn_hidden_size
        self.input_size_c = config.hidden_size
        self.input_size_r = config.ffn_hidden_size
        self.output_size_r = self.input_size_c

        tp_world_size = mpu.get_tensor_model_parallel_world_size()
        self.TP_group = mpu.get_tensor_model_parallel_group()
        self.output_size_per_partition = self.output_size_c // tp_world_size
        self.input_size_per_partition = self.input_size_r // tp_world_size

        self.linear_fc1 = DominoAsyncColumnParallelLinear(self.input_size_c,
                                                          self.output_size_per_partition,
                                                          mpu.get_tensor_model_parallel_group(),
                                                          config=config,
                                                          init_method=config.init_method,
                                                          bias=config.add_bias_linear)
        self.mlp_activation_func = F.gelu
        self.linear_fc2 = RowParallelLinearNoComm(self.input_size_per_partition,
                                                  self.output_size_r,
                                                  config=config,
                                                  init_method=config.output_layer_init_method,
                                                  bias=config.add_bias_linear,
                                                  skip_bias_add=True)

        self.bias_dropout_add_func = bias_dropout_add(self.hidden_dropout)

    def forward(self, hidden_states, attention_mask, rotary_pos_emb=None):
        # hidden_states: a pair ([s, b/2, h], [s, b/2, h])
        hidden_states0, hidden_states1 = hidden_states

        # Attention on batch 0; its TP all-reduce is launched async so it can
        # overlap with the attention compute of batch 1.
        layernorm_output0 = self.input_layernorm(hidden_states0)
        layernorm_output0 = _Wait_bwd_comm(layernorm_output0, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_0)
        attention_output0, attention_bias0 = self.self_attention(layernorm_output0,
                                                                 attention_mask,
                                                                 DominoUtil.BATCH_0,
                                                                 rotary_pos_emb=rotary_pos_emb)
        fwd_handle0 = dist.all_reduce(attention_output0, group=self.TP_group, async_op=True)

        # Attention on batch 1.
        layernorm_output1 = self.input_layernorm(hidden_states1)
        layernorm_output1 = _Wait_bwd_comm(layernorm_output1, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_1)
        attention_output1, attention_bias1 = self.self_attention(layernorm_output1,
                                                                 attention_mask,
                                                                 DominoUtil.BATCH_1,
                                                                 rotary_pos_emb=rotary_pos_emb)
        fwd_handle1 = dist.all_reduce(attention_output1, group=self.TP_group, async_op=True)

        # Batch 0: residual + dropout, then first MLP GEMM.
        fwd_handle0.wait()
        if self.apply_residual_connection_post_layernorm:
            residual0 = layernorm_output0
        else:
            residual0 = hidden_states0
        layernorm_input0 = self.bias_dropout_add_func(attention_output0, attention_bias0, residual0)
        layernorm_output0 = self.post_attention_layernorm(layernorm_input0)
        layernorm_output0 = _Wait_bwd_comm(layernorm_output0, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_0)
        if self.apply_residual_connection_post_layernorm:
            residual0 = layernorm_output0
        else:
            residual0 = layernorm_input0
        output0, _ = self.linear_fc1(layernorm_output0, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_0)
        output0 = self.mlp_activation_func(output0)

        # Batch 1: residual + dropout, then first MLP GEMM.
        fwd_handle1.wait()
        if self.apply_residual_connection_post_layernorm:
            residual1 = layernorm_output1
        else:
            residual1 = hidden_states1
        layernorm_input1 = self.bias_dropout_add_func(attention_output1, attention_bias1, residual1)
        layernorm_output1 = self.post_attention_layernorm(layernorm_input1)
        layernorm_output1 = _Wait_bwd_comm(layernorm_output1, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_1)
        if self.apply_residual_connection_post_layernorm:
            residual1 = layernorm_output1
        else:
            residual1 = layernorm_input1

        # Second MLP GEMMs; each half's all-reduce overlaps the other's compute.
        output0, last_mlp_bias = self.linear_fc2(output0)
        fwd_handle0 = dist.all_reduce(output0, group=self.TP_group, async_op=True)

        output1, _ = self.linear_fc1(layernorm_output1, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_1)
        output1 = self.mlp_activation_func(output1)
        output1, last_mlp_bias = self.linear_fc2(output1)
        fwd_handle1 = dist.all_reduce(output1, group=self.TP_group, async_op=True)

        fwd_handle0.wait()
        output0 = self.bias_dropout_add_func(output0, last_mlp_bias, residual0)
        fwd_handle1.wait()
        output1 = self.bias_dropout_add_func(output1, last_mlp_bias, residual1)

        return output0, output1
class DominoTransformer(DominoModule):
    """Transformer class."""

    def __init__(self,
                 config,
                 mpu,
                 apply_rotary_pos_emb,
                 model_type,
                 layer_type=LayerType.encoder,
                 self_attn_mask_type=AttnMaskType.padding,
                 pre_process=True,
                 post_process=True,
                 post_layer_norm=True,
                 drop_path_rate=0.0):
        super(DominoTransformer, self).__init__()

        self.layer_type = layer_type
        self.model_type = model_type
        self.post_layer_norm = post_layer_norm
        self.post_process = post_process
        self.input_tensor = None
        self.drop_path_rate = drop_path_rate
        self.TP_group = mpu.get_tensor_model_parallel_group()

        if not dist.is_initialized():
            dist.init_distributed()
            assert dist.is_initialized(), "deepspeed.comm failed to initialize!"

        self.num_layers = config.num_layers
        self.drop_path_rates = [rate.item() for rate in torch.linspace(0, self.drop_path_rate, self.num_layers)]

        def build_layer(layer_number):
            current_layer_type = layer_type
            return DominoTransformerLayer(config,
                                          mpu,
                                          apply_rotary_pos_emb,
                                          layer_number,
                                          layer_type=current_layer_type,
                                          drop_path_rate=self.drop_path_rates[layer_number - 1])

        self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])

        if self.post_process and self.post_layer_norm:
            self.final_layernorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layernorm_epsilon)

        self._forward_impl = self.inter_layer_overlap_forward
        if config.domino_intra_layer_overlap:
            self._forward_impl = self.intra_layer_overlap_forward

    def forward(self, hidden_states, attention_mask, rotary_pos_emb=None):
        return self._forward_impl(hidden_states, attention_mask, rotary_pos_emb)

    def inter_layer_overlap_forward(self, hidden_states, attention_mask, rotary_pos_emb=None):
        hidden_states0, hidden_states1 = torch.chunk(hidden_states, chunks=2, dim=1)
        last_mlp_bias = None
        fwd_handle0, fwd_handle1 = None, None
        residual0, residual1 = None, None

        layernorm_output0 = self.layers[0].input_layernorm(hidden_states0)
        layernorm_output0 = _Wait_bwd_comm(layernorm_output0, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_0)

        for index in range(self.num_layers):
            # Batch 0 attention: async QKV projection + core attention.
            mixed_x_layer0, _ = self.layers[index].self_attention.query_key_value(
                layernorm_output0, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_0)
            attention_output0 = self.layers[index].self_attention.domino_core_attention_forward(
                mixed_x_layer0, attention_mask, rotary_pos_emb=rotary_pos_emb)

            # Finish batch 1 of the previous layer while batch 0 computes.
            if index > 0:
                fwd_handle1.wait()
                hidden_states1 = self.layers[index - 1].bias_dropout_add_func(output1, last_mlp_bias, residual1)

            layernorm_output1 = self.layers[index].input_layernorm(hidden_states1)
            layernorm_output1 = _Wait_bwd_comm(layernorm_output1, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_1)

            attention_output0, attention_bias0 = self.layers[index].self_attention.dense(attention_output0)
            fwd_handle0 = dist.all_reduce(attention_output0, group=self.TP_group, async_op=True)

            # Batch 1 attention overlaps with batch 0's all-reduce.
            mixed_x_layer1, _ = self.layers[index].self_attention.query_key_value(
                layernorm_output1, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_1)
            attention_output1 = self.layers[index].self_attention.domino_core_attention_forward(
                mixed_x_layer1, attention_mask, rotary_pos_emb=rotary_pos_emb)

            fwd_handle0.wait()
            if self.layers[index].apply_residual_connection_post_layernorm:
                residual0 = layernorm_output0
            else:
                residual0 = hidden_states0
            layernorm_input0 = self.layers[index].bias_dropout_add_func(attention_output0, attention_bias0,
                                                                        residual0)
            layernorm_output0 = self.layers[index].post_attention_layernorm(layernorm_input0)
            layernorm_output0 = _Wait_bwd_comm(layernorm_output0, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_0)
            if self.layers[index].apply_residual_connection_post_layernorm:
                residual0 = layernorm_output0
            else:
                residual0 = layernorm_input0

            attention_output1, attention_bias1 = self.layers[index].self_attention.dense(attention_output1)
            fwd_handle1 = dist.all_reduce(attention_output1, group=self.TP_group, async_op=True)

            # Batch 0 MLP first GEMM overlaps with batch 1's attention all-reduce.
            output0, _ = self.layers[index].linear_fc1(layernorm_output0, DominoUtil.HANDLE_DIC,
                                                       DominoUtil.BATCH_0)
            output0 = self.layers[index].mlp_activation_func(output0)

            fwd_handle1.wait()
            if self.layers[index].apply_residual_connection_post_layernorm:
                residual1 = layernorm_output1
            else:
                residual1 = hidden_states1
            layernorm_input1 = self.layers[index].bias_dropout_add_func(attention_output1, attention_bias1,
                                                                        residual1)
            layernorm_output1 = self.layers[index].post_attention_layernorm(layernorm_input1)
            layernorm_output1 = _Wait_bwd_comm(layernorm_output1, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_1)
            if self.layers[index].apply_residual_connection_post_layernorm:
                residual1 = layernorm_output1
            else:
                residual1 = layernorm_input1

            output0, last_mlp_bias = self.layers[index].linear_fc2(output0)
            fwd_handle0 = dist.all_reduce(output0, group=self.TP_group, async_op=True)

            output1, _ = self.layers[index].linear_fc1(layernorm_output1, DominoUtil.HANDLE_DIC,
                                                       DominoUtil.BATCH_1)
            output1 = self.layers[index].mlp_activation_func(output1)

            fwd_handle0.wait()
            hidden_states0 = self.layers[index].bias_dropout_add_func(output0, last_mlp_bias, residual0)

            # Pre-compute the next layer's input layernorm for batch 0 to
            # extend the overlap window.
            if index < self.num_layers - 1:
                layernorm_output0 = self.layers[index + 1].input_layernorm(hidden_states0)
                layernorm_output0 = _Wait_bwd_comm(layernorm_output0, DominoUtil.HANDLE_DIC, DominoUtil.BATCH_0)

            output1, last_mlp_bias = self.layers[index].linear_fc2(output1)
            fwd_handle1 = dist.all_reduce(output1, group=self.TP_group, async_op=True)

        if self.post_process and self.post_layer_norm:
            hidden_states0 = self.final_layernorm(hidden_states0)

        # Drain the last pending half-batch.
        index = self.num_layers - 1
        fwd_handle1.wait()
        hidden_states1 = self.layers[index].bias_dropout_add_func(output1, last_mlp_bias, residual1)
        if self.post_process and self.post_layer_norm:
            hidden_states1 = self.final_layernorm(hidden_states1)

        hidden_states = torch.cat([hidden_states0, hidden_states1], dim=1)
        return hidden_states

    def intra_layer_overlap_forward(self, hidden_states, attention_mask, rotary_pos_emb=None):
        hidden_states = torch.chunk(hidden_states, chunks=2, dim=1)

        for index in range(self.num_layers):
            layer = self.layers[index]
            hidden_states = layer(hidden_states, attention_mask, rotary_pos_emb)

        hidden_states0, hidden_states1 = hidden_states
        if self.post_process and self.post_layer_norm:
            hidden_states0 = self.final_layernorm(hidden_states0)
            hidden_states1 = self.final_layernorm(hidden_states1)
        hidden_states = torch.cat([hidden_states0, hidden_states1], dim=1)
        return hidden_states
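# A minimal usage sketch. `config` and `mpu` are hypothetical stand-ins here;
# in practice Megatron-DeepSpeed supplies the transformer config, the tensor
# model-parallel utility module, and the rotary-embedding function:
#
#   model = DominoTransformer(config, mpu, apply_rotary_pos_emb,
#                             ModelType.encoder_or_decoder)
#   output = model(hidden_states, attention_mask)   # [s, b, h] -> [s, b, h]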