o
    iH                     @   s
  d dl mZmZmZ e rddlZddlmZ e r ddlmZ ddlZddl	m
Z
 eeZg dZe
dd	 Zd
d Zdd ZejdddejdedejfddZG dd dejZdd Zdd Zdd Zdd Zdd  Z				!	d&d"d#Z				d'd$d%ZdS )(   )is_accelerate_availableis_torch_availablelogging    N)nn)init_empty_weights)contextmanager)g        g      ?g      ?g      ?g       @g      @g      @g      @g       g      g      g      g       g      g      g      c                 c   s    t  redd l}t| |jr| j} n
t| tr|| } t| dd }|dkrA|j|  d V  	 W d    d S 1 s<w   Y  |dkret|dre|j	|  d V  	 W d    d S 1 s`w   Y  d V  d S )Nr   typecudaxpu)
r   torch
isinstanceTensordevicestrgetattrr
   hasattrr   )devr   dev_type r   \/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/integrations/mxfp4.py	on_device3   s&   

  
r   c                 C   s.   |j jj}|| tjtjdd\} }| |fS )N   )axis)numerics_detailsmxfpdowncast_to_mxfp_torchtor   bfloat16uint8)wtriton_kernels_hubr   w_scaler   r   r   quantize_to_mxfp4J   s   
r#   c           
      C   sn   |j j|j j|j j}}}|jj}|jjj}|jdd\}}	||| |d|fi |	} ||||}| |fS )zE
    Changes the layout of the tensors depending on the hardware
    r   )mx_axisdtype)tensorFP4convert_layoutwrap_torch_tensortensor_detailslayoutStridedLayout"make_default_matmul_mxfp4_w_layout)
r    r"   r!   r(   r)   r*   r,   r-   value_layoutvalue_layout_optsr   r   r   swizzle_mxfp4P   s   

r1   i   )r&   rows_per_chunkr&   r2   returnc                C   s  ddl }| jstj r|  } | }|tjd }| jdd |jks6J d| jdd d|jtjt	|| j
d}| j^ }}}||| }	| |	|} ||	d}tj|	|d	 || j
d}
td|	|D ]R}t|| |	}| || }||| }|d
@ tj}|d? tj}|
|| }|| |ddddd	f< || |ddddd	f< tj|||d ~~~~~qk|
jg |||d	 R  jg ||| d	 R  }
~ ~~|
dd	 S )zw
    Convert the mxfp4 weights again, dequantizing and makes them compatible with the forward
    pass of GPT_OSS.
    r   N   zblocks.shape[:-1]=z does not match scales.shape=)r&   r   r   r         )out)mathis_cudar   r
   is_availabler   int32shaper'   
FP4_VALUESr   prodreshapeemptyrangeminlongldexpview	transpose
contiguous)blocksscalesr&   r2   r9   lutprefix_shapeGB
rows_totalr8   r0r1blkexpidx_loidx_hisubr   r   r   convert_moe_packed_tensorsd   s4   44rW   c                       s2   e Zd Z fddZdejdejfddZ  ZS )Mxfp4GptOssExpertsc                    sR  t    |j| _|j| _|j| _tjtj	| jd| j | jd dtj
ddd| _tjtj	| jd| j | jd tj
ddd| _tjtj	| jd| j tjddd| _tjtj	| j| j| jd dftj
ddd| _tjtj	| j| j| jd tj
ddd| _tjtj	| j| jtjddd| _d| _t|dd	| _d | _d | _t|dd	| _d S )
Nr          r%   Frequires_gradgZd;?swiglu_limitg      @)super__init__num_local_expertsnum_expertsintermediate_sizehidden_sizer   	Parameterr   zerosr   gate_up_proj_blocksgate_up_proj_scalesfloat32gate_up_proj_biasdown_proj_blocksdown_proj_scalesdown_proj_biasalphar   limitgate_up_proj_precision_configdown_proj_precision_config)selfconfig	__class__r   r   r_      s>   
"  zMxfp4GptOssExperts.__init__hidden_statesr3   c                 C   s   t jjt jjt jj}}}t jj}t|j= ||d|d| j| j	fd}	||| j
| jtj||| jd |	d}
||
| j| jtj||| j|jd}W d    |S 1 sWw   Y  |S )Nswiglu)rm   rn   r   )gather_indxprecision_configgammasfused_activation)scatter_indxrx   ry   )r!   
matmul_ogsFnSpecsFusedActivationrv   	swiglu_fnr   r   rm   rn   gate_up_projri   r   r   rh   ro   	down_projrl   rp   	gate_scal)rq   ru   routing_data
gather_idxscatter_idxr}   r~   r|   r   actintermediate_cache1intermediate_cache3r   r   r   forward   s<   

zMxfp4GptOssExperts.forward)__name__
__module____qualname__r_   r   r   r   __classcell__r   r   rs   r   rX      s    $rX   c                 C   s
  dd l }tjjtjjtjjtjjf\}}}}t| j t	j
 }t|jdd}d}	| jd }
| jd }|| }|| }|d | }|
| }dd }|| |\}}t	j|dd}t	j|dd\}}t	|d|}|d}t	j|||d d	|| }|dt	j}d
}t	||k ||}t	j|ddt	j}t	|t	j}t	||k ||	}t	||k||	}t	||	k|	|}|| }t	|| |	k|	|}|| | d}|| | d}||||}|}W d    n1 sw   Y  ||||||||fS )Nr   
LOCAL_RANK0r5   r   c                 S   sF   t j|  dddd d d |f }| }t j| |dd}|| fS )Nr   T)dimstabler   )r   argsortrD   take_along_dimint)valsktk_indxtk_valr   r   r   topk   s   "z routing_torch_dist.<locals>.topkr   )binsmaxi  T)r   )src_indxdst_indx)osr!   routing
GatherIndxRoutingDataScatterIndxcompute_expt_data_torchr   r   r   distributedget_world_sizer   environgetr=   softmaxsortgatherr@   histcrF   r   r<   wherer   )logitsn_expts_actr   r   r   r   r   
world_sizerankreplace_valuen_tokensn_expts_totn_local_expertslocal_expert_startlocal_expert_endn_gates_padr   	expt_scal	expt_indxsort_indiceshistvar	topk_indx	gate_indxr   rw   r{   	expt_datahit_expertsr   r   r   routing_torch_dist   sN   



4r   c           
      C   s   dd l m} | r| rt| drt}ntjj}|jd }|	d| j
j}tj|| j
j| j
j}t|j ||| j
j\}}}W d    n1 sMw   Y  | ||||}	|		|d| j
j}	|	|fS )Nr   
_is_hookedr5   )torch.distributedr   r;   is_initializedr   r   r!   r   r=   r@   router
hidden_dimr   
functionallinearweightbiasr   r   top_kexperts)
rq   ru   distr   
batch_sizerouter_logitsr   r   r   
routed_outr   r   r   mlp_forward'  s   
r   c                    s(   d |  t fdd|D sdS dS )N.c                 3   s0    | ]}t | d  pt |  V  qdS )z\.N)rematch).0keycurrent_key_name_strr   r   	<genexpr>=  s     
z(should_convert_module.<locals>.<genexpr>TF)joinany)current_key_namepatternsr   r   r   should_convert_module;  s   
r   c              
   K   s  ddl m} |d}|d}|d}	|d}
|d}|d}d	D ]e}||v r|d ur;||||||	|
||}| d
}| d}t| |ddd | t| |rt| |rtt| |t| |}|dkrttj	
 rttj	  t| |tj|| t| | t| | q&d S )Nr   shard_and_distribute_modulemodelempty_paramcasting_dtypeto_contiguousr   device_mesh)r   r   _blocks_scalesr   r   cpu)integrations.tensor_parallelr   r   setattrrsplitr   rW   r   r   r
   r;   empty_cacher   rd   r   delattr)module
param_nameparam_valuetarget_devicedq_param_namekwargsr   r   r   r   r   r   r   projblocks_attrscales_attrdequantizedr   r   r   
dequantizeD  s@   











r   c              	   K   sl  |j j|j j|j j}}}ddlm}	 |d}
|d}|d}|d}|d}|d}d	|v rB|d
d dd }d|v rR|d
d dd }|durb|	|
||||||| nt| |	d
dd t
jj|dd | d}| d}t| |}t| |}|jjdkr2|jjdkr4|d}|dkr||| jd d}n
||d| jd }t|d|dkrd}|| }|| }t| t|dd|dd|\}}W d   n1 sw   Y  |dkrt
|| j| jd g|_nt
|| j| jg|_t| || t| | d|||| dd t| | t| | ~dS dS dS )zq
    This transforms the weights obtained using `convert_gpt_oss.py` to load them into `Mxfp4GptOssExperts`.
    r   r   r   r   r   r   r   r   rI   r   r5   r   r   rJ   r   Nr   Fr[   metar   r	   r   r
   _precision_config)rhs_data)weight_scaleflex_ctx)r|   PrecisionConfigFlexCtx
InFlexDatar   r   r   splitr   r   r   r   rd   r   r   r	   sizer@   rb   r   rH   r   r1   rG   Sizerc   r=   r   )r   r   r   r   r!   r   r   r  r  r   r   r   r   r   r   r   r   r   r   rI   rJ   local_expertstriton_weight_tensorr   r   r   r   load_and_swizzle_mxfp4g  sf   






$









r  Fc           
   	   C   s   |d u rg }|   D ]i\}}|| t||s|d q
|jjdkrC|jsCt  t|| j	|< d}W d    n1 s>w   Y  |jjdkrX|jsXddl
m} |t||_tt| dkrnt||||||d\}	}|d q
| |fS )Nr5   GptOssExpertsT	GptOssMLPr   )
MethodType)has_been_replacedrr   )named_childrenappendr   poprt   r   r   r   rX   _modulestypesr  r   r   lenlistchildren_replace_with_mxfp4_linear)
r   modules_to_not_convertr   quantization_configr  rr   namer   r  _r   r   r   r    s4   



r  c                 C   sz   |j r| S ddlm} |da|d u rdgn|}|jd ur#||j tt|}t| ||||d\} }|s;t	
d | S )Nr   )
get_kernelz kernels-community/triton_kernelslm_head)rr   zYou are loading your model using mixed-precision FP4 quantization but no linear modules were found in your model. Please double check your model architecture, or submit an issue on github if you think this is a bug.)r   kernelsr  r!   r  extendr  setr  loggerwarning)r   r  r   r  rr   r  r  r   r   r   replace_with_mxfp4_linear  s(   

r!  )NNNFN)NNNN) utilsr   r   r   r   r   
accelerater   r   
contextlibr   
get_loggerr   r  r>   r   r#   r1   r   r&   r   r   rW   ModulerX   r   r   r   r   r  r  r!  r   r   r   r   <module>   sP   


6ID	#E
'