# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import torch
import math
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import is_dataclass
from deepspeed.accelerator import get_accelerator
import deepspeed.comm as dist

from .config import LoRAConfig, QuantizationConfig
from .quantization import QuantizedParameter, QuantizedLinear


class OptimizedLinear(nn.Module):
    """
    Optimized version of nn.Linear that adds features such as:
      * LoRA w. base weight sharding
      * FP [6,8,12] quantization

    Arguments:
        input_dim: Required: size of each input sample
        output_dim: Required: size of each output sample
        bias: Optional: If set to False, the layer will not learn an additive bias. Default: False
        lora_config: Optional: LoRAConfig defining lora features and base-weight-sharding degree
        quantization_config: Optional: QuantizationConfig defining quantization features
        dtype: Optional: parameter dtype, only supports bfloat16 currently

    Returns:
        Returns a new nn.Module depending on the input config. Either native
        torch.nn.Linear, QuantizedLinear, or the full-featured DSOptimizedLinear.
    """

    def __new__(self,
                input_dim: int,
                output_dim: int,
                bias: bool = False,
                lora_config: LoRAConfig = None,
                quantization_config: QuantizationConfig = None,
                device=None,
                dtype=torch.bfloat16):

        if quantization_config is not None and not is_dataclass(quantization_config):
            raise ValueError(f"Expecting QuantizationConfig but received {type(quantization_config)}")
        if lora_config is not None and not is_dataclass(lora_config):
            raise ValueError(f"Expecting LoRAConfig but received {type(lora_config)}")

        if lora_config is None and quantization_config is None:
            # everything disabled, fall back to normal nn.Linear
            self = nn.Linear(input_dim, output_dim, bias=bias, dtype=dtype, device=device)
        elif lora_config:
            # lora enabled, quantization may or may not be
            self = LoRAOptimizedLinear(input_dim=input_dim,
                                       output_dim=output_dim,
                                       bias=bias,
                                       lora_config=lora_config,
                                       quantization_config=quantization_config,
                                       dtype=dtype,
                                       device=device)
        elif quantization_config:
            # only quantization enabled, no lora
            self = QuantizedLinear(input_dim=input_dim,
                                   output_dim=output_dim,
                                   bias=bias,
                                   quantization_config=quantization_config,
                                   dtype=dtype)
        return self


class LoRAOptimizedLinear(nn.Module):

    def __init__(self,
                 input_dim: int,
                 output_dim: int,
                 bias: bool = False,
                 lora_config: LoRAConfig = None,
                 quantization_config: QuantizationConfig = None,
                 device=None,
                 dtype=torch.bfloat16):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.bias = bias
        self.lora_config = lora_config
        self.quantization_config = quantization_config
        self.device = get_accelerator().current_device_name() if device is None else device
        self.dtype = dtype
        assert self.lora_config is not None, "DSOptimizedLinear requires a LoRA config"
        assert not self.bias, "bias=True is not supported by LoRAOptimizedLinear"

        self.zero_shards = self.lora_config.base_weight_sharding
        self.sharded_weight_size = int(float(self.input_dim) // self.zero_shards)
        if self.zero_shards > 1:
            assert self.zero_shards == dist.get_world_size(), \
                "base weight sharding is only supported across world size"
            w = torch.nn.Parameter(torch.empty(self.sharded_weight_size * self.output_dim, dtype=dtype),
                                   requires_grad=False)
        else:
            w = torch.nn.Parameter(torch.empty((self.output_dim, self.input_dim), dtype=dtype), requires_grad=False)
        torch.nn.init.xavier_uniform_(w.reshape(self.sharded_weight_size, self.output_dim))

        if self.quantization_config is not None:
            assert dtype == torch.bfloat16, "only bfloat16 is supported when using quantization"
            self.weight = QuantizedParameter(w, quantization_config=quantization_config)
        else:
            self.weight = w

        self.disabled = False
        self._initialized = False
        if not self.lora_config.delay_lora_init:
            self.init_lora()
zLoRAOptimizedLinear.__init__c                 C   s0   d| _ tjjtj| j| jf| jddd| _d S )NTr#   Fr$   )	r6   r   r   r0   r1   r   r
   r   r5   r   r   r   r   disablex   s   zLoRAOptimizedLinear.disablec                 C   s   | j rd S | jd urt| jtst| j| jd| _d| _d| j_d| j_| jj	| jj
 | _| j| j| jj
| j| j| jd| _| j| jj
| j| j| j| jd| _tjj| jjtdd tj| jj d| jj_d| jj_d S )Nr&   TF)r   r   r      )a)r6   r   
isinstancer5   r   r7   r%   ds_optim_paramr   
lora_alphalora_rlora_scaling_factorr   r
   r   r   r   lora_weight_1r   lora_weight_2r   r2   kaiming_uniform_mathsqrtzeros_r=   r   r   r   r9   }   s2   


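    # Scaling example (illustrative): lora_alpha=32 with lora_r=16 gives
    # lora_scaling_factor = 2.0; because lora_weight_2 ("B") starts at zero,
    # the first forward pass reproduces the base projection exactly.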
zLoRAOptimizedLinear.init_lorac              	      s   t  fdd| jjD s|   t | |||||S | jdkrJt s*t	dt
 }| j| j }	  d}
||
 }| d||	 |	||
< t | |||||S )Nc                    s   g | ]}| v qS r   r   ).0targetprefixr   r   
<listcomp>   s    z=LoRAOptimizedLinear._load_from_state_dict.<locals>.<listcomp>r   ztattempting to use optimized linear base weight sharding but torch-distributed is not initialized, please init first.r5   r   )anyr   target_modsr>   r'   _load_from_state_dictr+   r.   is_initializedRuntimeErrorget_rankr   r-   flattennarrow)r   
state_dictrO   local_metadatastrictmissing_keysunexpected_keys
error_msgsrankshape_localbase_weight_nameincoming_paramr;   rN   r   rS      s$   

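    # Slicing example (illustrative): with output_dim=4, sharded_weight_size=2
    # and 2 ranks, shape_local = 8, so rank 0 loads flat[0:8] and rank 1 loads
    # flat[8:16] of the checkpoint's full base weight via narrow().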
    def full_weight(self):
        base_weight = self.weight
        if getattr(base_weight, 'ds_offload', False):
            # move to gpu so we can dequant and all-gather
            assert base_weight.device == torch.device('cpu'), \
                f"expected base weight on cpu but found {base_weight.device}"
            base_weight.offload(revert=True)
            local_weight = base_weight.dequantized() if isinstance(base_weight, QuantizedParameter) else base_weight
            base_weight.offload()
        else:
            local_weight = base_weight.dequantized() if isinstance(base_weight, QuantizedParameter) else base_weight

        tensor_out = torch.empty(self.output_dim * self.input_dim,
                                 dtype=local_weight.dtype,
                                 device=local_weight.device)
        dist.all_gather_into_tensor(tensor_out, local_weight)
        return tensor_out.reshape(self.output_dim, self.input_dim)

    def linear_without_F_linear(self, input, weight):
        output = torch.mm(input.reshape(-1, input.shape[-1]), weight)
        output = output.view(*input.shape[:-1], weight.shape[-1])
        return output

    def forward(self, input_tensor):
        if self.disabled:
            return F.linear(input_tensor, self.weight)
        assert self._initialized, "init_lora was never called, please initialize before proceeding"

        # gather the sharded base weight if necessary
        if self.zero_shards > 1:
            with torch.no_grad():
                base_weight = self.full_weight()
        elif self.quantization_config:
            base_weight = self.weight.dequantized()
        else:
            base_weight = self.weight

        base_weight_output = F.linear(input_tensor, base_weight)
        lora_output = self.lora_weight_2(self.lora_weight_1(input_tensor))
        return base_weight_output + self.lora_scaling_factor * lora_output
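
# Minimal end-to-end sketch (commented out; assumes default LoRAConfig fields
# with base_weight_sharding == 1 and bfloat16 inputs on the layer's device):
#
#   import torch
#   from deepspeed.linear import OptimizedLinear, LoRAConfig
#
#   layer = OptimizedLinear(128, 256, lora_config=LoRAConfig())
#   x = torch.randn(4, 128, dtype=torch.bfloat16, device=layer.device)
#   y = layer(x)  # (4, 256); matches the base projection until the LoRA "B"
#                 # matrix receives gradient updates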