o
    Tiό                     @   sp  d dl Z d dlZd dl mZ d dlmZ d dlmZ ddlm	Z	m
Z
mZmZmZ d dlmZ daG dd dejZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZdd Zd.ddZdd Zdd ZG dd de jj Z!G dd de jj Z"G dd de jj Z#G d d! d!e jj Z$d"d# Z%d$d% Z&d&d' Z'd(d) Z(G d*d+ d+eZ)G d,d- d-eZ*dS )/    N)nn)init   )TopKBinarizerSymQuantizerAsymQuantizerTernaryQuantizerBinaryQuantizer)loggerc                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	QuantActa  
    Class to quantize given activations. Note that when using this function, the input activation quantization range will be fixed for all
    tokens/images for inference. This generally will affect some accuracy but achieve better latency performance.
    Parameters:
    ----------
    act_range_momentum : float, default 0.95
        Momentum for updating the activation quantization range.
    quant_mode : str, default 'symmetric'
    ffffff?	symmetricc                    sJ   t t|   || _|| _|dkrtj| _ntj| _| 	dt
d d S )Nr   	x_min_max   )superr   __init__act_range_momentum
quant_moder   applyact_functionr   register_buffertorchzeros)selfr   r   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/compression/basic_layer.pyr      s   
zQuantAct.__init__c                 G   s   | j rE|j }|j }| jd | jd kr!|| jd< || jd< | jd | j |d| j   | jd< | jd | j |d| j   | jd< | ||| jd | jd }|S )a  
        x: the activation that we need to quantize
        num_bits: the number of bits we need to quantize the activation to
        *args: some extra arguments that are useless but needed for align with the interface of other quantization functions
        r   r   )trainingdataminmaxr   r   r   )r   xnum_bitsargsx_minx_maxx_qr   r   r   forward(   s   



$$zQuantAct.forward)r   r   )__name__
__module____qualname____doc__r   r(   __classcell__r   r   r   r   r      s    
r   c                       s<   e Zd Z fddZdd Zdd Zdd Zd	d
 Z  ZS )Embedding_Compressc                    s8   t t| j|  d | j_d | j_d | j_d| _d| _d S NF)	r   r.   r   weight
start_bitstarget_bitsq_period&weight_quantization_enabled_in_forwardweight_quantization_enabledr   kargsr   r   r   r   C   s   
zEmbedding_Compress.__init__c                 C   s   d | j| j| jjS )Nz;num_embeddings={}, embedding_dim={}, weight_quantization={})formatnum_embeddingsembedding_dimr0   r2   r   r   r   r   
extra_reprK   s   zEmbedding_Compress.extra_reprc                 C   s   || j _|| j _|| j _|| _| jrYtd | j jdkr+|dkr&tj| _	n*t
j| _	n%| j jdkr>|dks9J dtj| _	n| j jdkrP|dksLJ dtj| _	| j d| _d S d S )	N************ A lot of MoQ features are not supported in quantize_weight_in_forward mode, please consider to use DS-FP16 optimizer************   r   r   HOnly symmetric quantization is supported for ternary weight quantizationr   GOnly symmetric quantization is supported for binary weight quantizationr   )r0   r1   r2   r3   r4   r
   warningr   r   weight_quantizerr   r   r	   sizeweight_quantize_num_groupsr   r1   r2   quantization_periodr4   quantization_type
num_groupsr   r   r   enable_weight_quantizationO   s(   


z-Embedding_Compress.enable_weight_quantizationc                 C   *   |  | j| jjd d | jj| j_d| _d S r/   rB   r0   r2   rD   r   r4   r;   r   r   r   fix_weight_quantizationg      z*Embedding_Compress.fix_weight_quantizationc              	   C   sT   | j r| jr| | j| jjd d | j}n| j}tj||| j	| j
| j| j| j}|S N)r4   r5   rB   r0   r2   rD   r   
functional	embeddingpadding_idxmax_norm	norm_typescale_grad_by_freqsparse)r   inputr0   outr   r   r   r(   m   s   zEmbedding_Compress.forward)	r)   r*   r+   r   r<   rI   rL   r(   r-   r   r   r   r   r.   A   s    r.   c                       s   e Zd ZdZdd fdd
Zdd Zdd	 Zd
d Zdd Zdd Z	d#ddZ
d$ddZd%ddZdd Zdd Zdd Zdd  Zd&d!d"Z  ZS )'LinearLayer_Compressz(
    Linear layer with compression.
    Tbiasc                   sn   t t| j|d|i d | _d | _d | _d | _d | j_d | j_	d | j_
d| _d| _d| _d| _d| _d| _d S )NrZ   F)r   rX   r   sparse_pruning_methodrow_pruning_methodhead_pruning_methodactivation_quantization_methodr0   r1   r2   r3   r4   r5   sparse_pruning_enabledrow_pruning_enabledhead_pruning_enabledactivation_quantization_enabled)r   rZ   r7   r   r   r   r   ~   s   
zLinearLayer_Compress.__init__c              
   C   s>   d | j| j| jd u| jd u| jd u| jd u| jd u| jj	S )Nzin_features={}, out_features={}, bias={}, sparse pruning={}, row pruning={}, head pruning={}, activation quantization={}, weight_quantization={})
r8   in_featuresout_featuresrZ   r[   r\   r]   r^   r0   r2   r;   r   r   r   r<      s   zLinearLayer_Compress.extra_reprc                 C      || _ || _|dkr)t| jj}t|| j d}|| j	 }|
| jj}n+|dkrRtt| j	 | _| jj
| jj| j_tj| jtdd d }nt| d| d S Nl1Ftopk   asparse_pruning_masksparse_pruning_ratior[   r   absr0   r   r   r   viewrC   todevicer   	ParameterTensorsparse_mask_scoresr   kaiming_uniform_mathsqrtNotImplementedErrorr   r   ratiomethodweight_normmaskr   r   r   enable_sparse_pruning   s   z*LinearLayer_Compress.enable_sparse_pruningc                 C   s   || _ || _|dkr+tjj| jjddd}t|| j d}|	dd}|
| jj}n-|dkrVtt| jdd| _| jj
| jj| j_tj| jtdd	 d }nt| d
| d S )Nrg   r   orddimFrh   r   ri   rj   row_pruning_mask)row_pruning_ratior\   r   linalgnormr0   r   r   r   rp   rq   rr   r   rs   rt   rC   row_mask_scoresr   rv   rw   rx   ry   r   rz   r   r   r   enable_row_pruning   s   z'LinearLayer_Compress.enable_row_pruningc                 C   sj   || _ || _|| _|dvrt|| _ttd| j | _| jj	
| jj| j_	tj| jtdd d S )N)rh   r   ri   rj   )	num_headshead_pruning_ratior]   ry   r   rs   r   rt   head_pruning_scoresr   rq   r0   rr   r   rv   rw   rx   )r   r{   r|   r   r   r   r   enable_head_pruning   s   
z(LinearLayer_Compress.enable_head_pruningc                 C   >   | j dd}| jj| | j_| `| jdkr| `d | _d| _d S NrU   pruning_typerh   Fget_maskr0   r   rl   r[   ru   r_   r   r~   r   r   r   fix_sparse_pruning_helper      
z.LinearLayer_Compress.fix_sparse_pruning_helperNFc                 C   sb  |d u rz| j dd }|rQ| jj}| jj}| jj}t| jj|	dd d f | _|| j_|| j_|| j_| j
d urIt| j
j|	d | _
| jd| _n| jj|	dd | j_| j
d urm| j
j|	d | j
_| `| jdkrv| `d | _n2| jj}| jj}| jj}t| jjd d |	df | _|| j_|| j_|| j_| jd| _d }d| _|S )Nrowr   r   r   r   rh   F)r   boolr0   r1   r2   r3   r   rs   r   rp   rZ   rC   rd   r   r\   r   rc   r`   r   r~   dim_reductionr1   r2   r3   r   r   r   fix_row_col_pruning_helper   s>   "


"z/LinearLayer_Compress.fix_row_col_pruning_helperc                 C   s  |r|n| j }|d u r| jdkr| jdd }|rV| jd}| jj}| jj}| jj}t	
| jj |d|dd d f d| | _|| j_|| j_|| j_n"| j }| jj | j d|dd |d |d  | j_| jdkr| `d | _nVt| jj}| jj}| jj}| jd}t	
| jj|d|dd d f d|| _|| j_|| j_|| j_| jd urt	
| jj|d|dd d f d| _d| _|S )Nrh   headr   r   r   r   F)r   r]   r   r   r0   rC   r1   r2   r3   r   rs   r   treshaperp   r   ry   rZ   ra   )r   r~   r   r   shaper1   r2   r3   r   r   r   fix_head_pruning_helper   sX   


"
2
0z,LinearLayer_Compress.fix_head_pruning_helperr   c                 C   s   |dkr!| j dkr| j| jjS | j dkrt| j| jdS t	|dkrB| j
dkr2| j| jjS | j
dkr@t| j| jdS t	|dkrV| jdkrTt| j| jdS t	t	)NrU   rg   rh   Fr   r   )r[   rl   rq   r0   rr   r   r   ru   rn   ry   r\   r   r   r   r]   r   r   r   r   r   r   r   r   (  s"   




zLinearLayer_Compress.get_maskc                 C   s   || j _|| j _|| j _|| _| jrUtd | j jdkr+|dkr&tj| _	n*t
j| _	n%| j jdkr>|dks9J dtj| _	n| j jdkrP|dksLJ dtj| _	|| _d S d S )Nr=   r>   r   r   r?   r   r@   )r0   r1   r2   r3   r4   r
   rA   r   r   rB   r   r   r	   rD   rE   r   r   r   rI   ?  s(   



z/LinearLayer_Compress.enable_weight_quantizationc                 C   rJ   r/   rK   r;   r   r   r   rL   V  rM   z,LinearLayer_Compress.fix_weight_quantizationc                 C   ^   |dv sJ d|| _ | d| | _|dkrt|d| _d S |dkr)tj| _d S tj| _d S N)      z;Only 4/8 bits activation quantization are supported for now_static)r   r   activation_quantization_bitsr^   r   activation_quantizerr   r   r   r   bitsrG   range_calibrationr   r   r   enable_activation_quantization\     z3LinearLayer_Compress.enable_activation_quantizationc                 C   s8   |j }| | jd|dd |d |d  S )Nr   r   r   )r   r   r   r   rp   )r   wr~   r   r   r   r   head_pruning_reshapeh  s   2z)LinearLayer_Compress.head_pruning_reshapec                 C   s4  | j r| jr| | j| jjd d | j}| j}n| j}| j}| jr3| jr3| j	dd}||
| j  }| jrR| jrR| j	dd}||
dd }|d urR||
d }| jrd| jrd| j	dd}| ||}| jrd| jv rv| |d }nd}| || jd d |}|rtj||d }||fS tj|||}|S )NrU   r   r   r   r   r   dynamic)r4   r5   rB   r0   r2   rD   rZ   r_   r[   r   rp   rC   r`   r\   ra   r]   r   rb   r^   numelr   r   r   rO   linear)r   rV   skip_bias_addr0   rZ   r~   rH   outputr   r   r   r(   l  s8   
zLinearLayer_Compress.forwardr/   )NNF)r   F)r)   r*   r+   r,   r   r<   r   r   r   r   r   r   r   rI   rL   r   r   r(   r-   r   r   r   r   rX   y   s     


*
*rX   c                       st   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d ZdddZ	dddZ
dd Zdd Zdd Zdd Z  ZS )Conv2dLayer_Compressz(
    Conv2D layer with compression.
    c                    sV   t t| j|  d | _d | _d | _d | j_d | j_d | j_	d| _
d| _d| _d| _d S r/   )r   r   r   r[   channel_pruning_methodr^   r0   r1   r2   r3   r4   r_   channel_pruning_enabledrb   r6   r   r   r   r     s   
zConv2dLayer_Compress.__init__c                 C   s   d}| j dt| j  kr|d7 }| jdt| j kr|d7 }| jdt| j kr,|d7 }| jdkr5|d7 }| jd u r>|d	7 }| jd
krG|d7 }|jdi | j}|d| j	d u| j
d u| jd u| jj S )NzI{in_channels}, {out_channels}, kernel_size={kernel_size}, stride={stride})r   z, padding={padding})r   z, dilation={dilation}z!, output_padding={output_padding}r   z, groups={groups}z, bias=Falser   z, padding_mode={padding_mode}zZ sparse pruning={}, channel pruning={}, activation quantization={}, weight_quantization={}r   )paddinglendilationoutput_paddinggroupsrZ   padding_moder8   __dict__r[   r   r^   r0   r2   )r   sr   r   r   r   __repr__  s$   


zConv2dLayer_Compress.__repr__c                 C   re   rf   rm   rz   r   r   r   r     s   z*Conv2dLayer_Compress.enable_sparse_pruningc                 C   s   || _ || _|dkr/tjj| jjdg dd}t|| j d}|	dddd}|
| jj}n/|dkr\tt| jdddd| _| jj
| jj| j_tj| jtd	d
 d }nt| d| d S )Nrg   r   )r   r   r>   r   Fr   rh   r   ri   rj   channel_pruning_mask)channel_pruning_ratior   r   r   r   r0   r   r   r   rp   rq   rr   r   rs   rt   rC   channel_mask_scoresr   rv   rw   rx   ry   r   rz   r   r   r   enable_channel_pruning  s    z+Conv2dLayer_Compress.enable_channel_pruningc                 C   r   r   r   r   r   r   r   r     r   z.Conv2dLayer_Compress.fix_sparse_pruning_helperNFc                 C   sV  |d u rz| j dv rx| jdd }|rM| jj}| jj}| jj}t| jj	|
ddf | _|| j_|| j_|| j_| jd urLt| jj	|
d | _n| jj	|
dddd | j_	| jd urk| jj	|
d | j_	| `| j dkrt| `d | _ n.t| jj}| jj}| jj}t| jj	d d |
ddf | _|| j_|| j_|| j_d }d| _|S )	N)rg   rh   channelr   r   .r   rh   F)r   r   r   r0   r1   r2   r3   r   rs   r   rp   rZ   r   r   ry   r   r   r   r   r   fix_channel_pruning_helper  s@   



$z/Conv2dLayer_Compress.fix_channel_pruning_helperrU   c                 C   s   |dkr!| j dkr| j| jjS | j dkrt| j| jdS t	|dkrB| j
dkr2| j| jjS | j
dkr@t| j| jdS t	t	)NrU   rg   rh   Fr   )r[   rl   rq   r0   rr   r   r   ru   rn   ry   r   r   r   r   r   r   r   r   r     s   



zConv2dLayer_Compress.get_maskc                 C   rJ   r/   rK   r;   r   r   r   rL   "  rM   z,Conv2dLayer_Compress.fix_weight_quantizationc                 C   sj   || j _|| j _|| j _|| _| jr3| j jdksJ dtd |dkr*tj| _	nt
j| _	|| _d S d S )Nr   zKOnly >=4 bits weight quantization are supported during forward pass for nowr=   r   )r0   r1   r2   r3   r4   r
   rA   r   r   rB   r   rD   rE   r   r   r   rI   (  s   

z/Conv2dLayer_Compress.enable_weight_quantizationc                 C   r   r   r   r   r   r   r   r   9  r   z3Conv2dLayer_Compress.enable_activation_quantizationc              	   C   s   | j r| jr| | j| jjd d | j}| j}n| j}| j}| jr3| jr3| j	dd}||
| j  }| jrQ| j	dd}||
dddd }|d urQ||
d }| jrpd| jv rd| |d   }nd}| || jd d |}tj|||| j| j| j| jS )NrU   r   r   r   r   r   r   )r4   r5   rB   r0   r2   rD   rZ   r_   r[   r   rp   rC   r   rb   r^   r   r   r   r   rO   conv2dstrider   r   r   )r   rV   r0   rZ   r~   rH   r   r   r   r(   E  s*   
 zConv2dLayer_Compress.forwardr/   )rU   )r)   r*   r+   r,   r   r   r   r   r   r   r   rL   rI   r   r(   r-   r   r   r   r   r     s    


$r   c                   @   s   e Zd ZdddZdS )BNLayer_CompressTc                 C   s\   t | jj|d | _t | jj|d | _| j|d | _| j|d | _d S )Nr   )r   rs   r0   r   rp   rZ   running_meanrunning_var)r   r~   r   r   r   r   r   e  s   z+BNLayer_Compress.fix_channel_pruning_helperN)T)r)   r*   r+   r   r   r   r   r   r   c  s    r   c                 C   s.   t  }tj|ddkr| S tj| |d | S )z8All-reduce the input tensor across model parallel group.groupr   )g_mpuget_model_parallel_groupdistget_world_size
all_reduce)input_r   r   r   r   _reducel  s
   r   Fc                 C   s^   |   d }|  | | dksJ |  | | }tj| ||d}|r-tdd |D S |S )a  Split a tensor along its last dimension.
    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.
    r   r   r   c                 s   s    | ]}|  V  qd S rN   )
contiguous).0chunkr   r   r   	<genexpr>  s    z.split_tensor_along_last_dim.<locals>.<genexpr>)r   rC   r   splittuple)tensornum_partitionscontiguous_split_chunkslast_dimlast_dim_sizetensor_listr   r   r   split_tensor_along_last_dimz  s   	r   c                 C   sN   t  }tj|ddkr| S tj|d}t| |}tj|d}||  }|S )zOSplit the tensor along its last dimension and keep the
    corresponding slice.r   r   )r   r   r   r   r   get_rankr   )r   r   
world_size
input_listrankr   r   r   r   _split  s   
r   c                    s   t  }tj|ddkr S   d }tj|d}tj|d} fddt|D } ||< tj| |d tj	||d
 }|S )z8Gather tensors and concatenate along the last dimension.r   r   c                    s   g | ]}t  qS r   )r   
empty_like)r   r   r   r   r   
<listcomp>  s    z_gather.<locals>.<listcomp>r   )r   r   r   r   r   r   range
all_gatherr   catr   )r   r   r   r   r   r   r   r   r   r   _gather  s   r   c                   @   (   e Zd ZdZedd Zedd ZdS )_CopyToModelParallelRegionz,Pass the input to the model parallel region.c                 C      |S rN   r   ctxr   r   r   r   r(        z"_CopyToModelParallelRegion.forwardc                 C      t |S rN   r   r   grad_outputr   r   r   backward     z#_CopyToModelParallelRegion.backwardNr)   r*   r+   r,   staticmethodr(   r   r   r   r   r   r         
r   c                   @   r   )_ReduceFromModelParallelRegionz4All-reduce the input from the model parallel region.c                 C   r   rN   r   r   r   r   r   r(     r   z&_ReduceFromModelParallelRegion.forwardc                 C   r   rN   r   r   r   r   r   r     r   z'_ReduceFromModelParallelRegion.backwardNr   r   r   r   r   r    r   r  c                   @   r   )_ScatterToModelParallelRegionzBSplit the input and keep only the corresponding chuck to the rank.c                 C   r   rN   r   r   r   r   r   r(     r   z%_ScatterToModelParallelRegion.forwardc                 C   r   rN   r   r   r   r   r   r     r   z&_ScatterToModelParallelRegion.backwardNr   r   r   r   r   r    r   r  c                   @   r   )_GatherFromModelParallelRegionz<Gather the input from model parallel region and concatenate.c                 C   r   rN   r  r   r   r   r   r(     r   z&_GatherFromModelParallelRegion.forwardc                 C   r   rN   r  r   r   r   r   r     r   z'_GatherFromModelParallelRegion.backwardNr   r   r   r   r   r    r   r  c                 C   
   t | S rN   )r   r   r   r   r   r   copy_to_model_parallel_region     
r  c                 C   r  rN   )r  r   r   r   r   r   !reduce_from_model_parallel_region  r  r	  c                 C   r  rN   )r  r   r   r   r   r    scatter_to_model_parallel_region  r  r
  c                 C   r  rN   )r  r   r   r   r   r   !gather_from_model_parallel_region  r  r  c                       *   e Zd Zd fdd	Z fddZ  ZS )ColumnParallelLinear_CompressTFc                    s\   |a || _|| _|| _|| _| }|| dksJ || | _tt| j	| j| j|d d S Nr   rY   )
r   
input_sizeoutput_sizegather_outputr   get_model_parallel_world_sizeoutput_size_per_partitionr   r  r   )r   mpur  r  rZ   r  r   r   r   r   r   r        
z&ColumnParallelLinear_Compress.__init__c                    sT   t |}| jrt |d\}}nt |}d }| jr$t|}||fS |}||fS NT)r  r   r   r(   r  r  )r   r   input_paralleloutput_parallelrZ   r   r   r   r   r(     s   z%ColumnParallelLinear_Compress.forward)TTFr)   r*   r+   r   r(   r-   r   r   r   r   r        r  c                       r  )RowParallelLinear_CompressTFc                    s\   |a || _|| _|| _|| _| }|| dksJ || | _tt| j	| j| j|d d S r  )
r   r  r  input_is_parallelr   r  input_size_per_partitionr   r  r   )r   r  r  r  rZ   r  r   r   r   r   r   r   $  r  z#RowParallelLinear_Compress.__init__c                    sf   | j r|}nt|}t |d\}}t|}| js+|d ur#|| }n|}d }||fS |}|}||fS r  )r  r
  r   r(   r	  r   )r   r   r  r  rZ   output_r   output_biasr   r   r   r(   4  s   
z"RowParallelLinear_Compress.forward)TFFr  r   r   r   r   r  "  r  r  r   )+r   rw   r   torch.nnr   deepspeed.commcommr   utilsr   r   r   r   r	   deepspeed.utilsr
   r   Moduler   	Embeddingr.   LinearrX   Conv2dr   BatchNorm2dr   r   r   r   r   autogradFunctionr   r  r  r  r  r	  r
  r  r  r  r   r   r   r   <module>   s<   08   P	
#