o
    
۾iY                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZ d dlmZ dd	lmZ dd
lmZmZ d ddZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZdS )!    N)PretrainedConfig)
LoRAConfig) tensor_model_parallel_all_gather)divide)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinear)current_platform   )BaseLinearLayerWithLoRA)_fully_sharded_can_replace_not_fully_sharded_can_replacelayerColumnParallelLinearWithLoRAc                 C   s  |j t|j  krt|j  krt|jksJ  J |jj|j| |}| d| j	d } |d|j	d |j	}}t
j|j | j	d |jd j	d ft
j| jd}|j|| |jd}t sd|}t|}|jj|||j|jddd}t s||}|j| }|S )z
    For `ColumnParallelLinearWithLoRA` or classes that inherit from
    `ColumnParallelLinearWithLoRA`, they share the same `apply` logic.
    r      dtypedeviceg      ?T)offset_start	add_input)n_sliceslenlora_a_stackedlora_b_stackedoutput_slices
base_layerquant_methodapplyviewshapetorchzerosfloat32r   punica_wrapper
add_shrinkr	   can_update_inplacer   
add_expand)xbiasr   outputout_orig_shapebuffersshrunk_bufferslora_output r/   [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/lora/layers/column_parallel_linear.py
_mcp_apply   sB   
	
r1   c                       s   e Zd ZdZdeddf fddZdejdejfdd	Zd
ejdejfddZ	dejdeje
ejejdB f B fddZee	ddejdedededB def
ddZ  ZS )r   a$  
    LoRA on top of ColumnParallelLinear layer.
    LoRA B is sliced for tensor parallelism.
    There are two types for the `base_layer`:
    1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`.
    2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`.
    r   returnNc                    s.   t  | t|tu | _| jj| _d| _d S Nr
   )	super__init__typer   is_merged_col_linearr   output_size_per_partitionoutput_sizer   selfr   	__class__r/   r0   r5   S   s   

z%ColumnParallelLinearWithLoRA.__init__lora_ac                 C      |S Nr/   r;   r>   r/   r/   r0   slice_lora_a]   s   z)ColumnParallelLinearWithLoRA.slice_lora_alora_bc                 C   s   | j rB| jd }|jd d }|| j| | jd | d d f }||| j|  || jd |  d d f }tj||gdd}|S | j}| j| }| jd | }|||d d f }|S )Nr   r   r
   dim)r7   r9   r    tp_rankr!   cat)r;   rC   
shard_sizeoffsetleft_weightright_weight	start_idxend_idxr/   r/   r0   slice_lora_b`   s(   

z)ColumnParallelLinearWithLoRA.slice_lora_binput_c                 C   sh   | j js| j jnd}| ||}| j jr| jdkrt|}n|}| j js&|S | j jr.| j jnd}||fS )zForward of ColumnParallelLinear

        Args:
            input_: Tensor whose last dimension is `input_size`.

        Returns:
            - output
            - bias
        Nr
   )r   skip_bias_addr)   r   gather_outputtp_sizer   return_bias)r;   rO   r)   output_parallelr*   output_biasr/   r/   r0   forwardy   s   
z$ColumnParallelLinearWithLoRA.forwardsource_layerlora_configpacked_modules_listmodel_configc                 C   sJ   t |tu rdS t |tu r#t|dkrdS t|do!t|jdk S dS )NTr
   Foutput_sizes   )r6   r   r   r   hasattrr[   clsrW   rX   rY   rZ   r/   r/   r0   can_replace_layer   s   	
z.ColumnParallelLinearWithLoRA.can_replace_layerr@   )__name__
__module____qualname____doc__r   r5   r!   TensorrB   rN   tuplerV   classmethodr   nnModuler   listr   boolr`   __classcell__r/   r/   r<   r0   r   J   s0    

c                       s   e Zd ZdZdeeB ddf fddZ	ddeded	e	dB ddfd
dZ
deejdB  deejdB  fddZdeejdB  deejdB  fddZdedejeej B dejeej B fddZee	ddejdeded	e	dB def
ddZ  ZS )"MergedColumnParallelLinearWithLoRAzColumnParallelLinear layer that is composed of 2 sublayers (slices)
    packed together (e.g. gate_proj + up_proj -> gate_up_proj).

    This means we have 2 LoRAs, each applied to one half of the layer.

    Both slices must have the same size.
    r   r2   Nc                    sL   t  |  jj}t fdd|D  _t j _ jf j  _	d S )Nc                 3   s    | ]	}t | jV  qd S r@   )r   rR   .0r9   r;   r/   r0   	<genexpr>   s    
z>MergedColumnParallelLinearWithLoRA.__init__.<locals>.<genexpr>)
r4   r5   r   r[   rf   r   r   r   rF   
output_ids)r;   r   r[   r<   rp   r0   r5      s   
z+MergedColumnParallelLinearWithLoRA.__init__	max_lorasrX   rZ   c                    sf   _ js	jntjj t fddtjD _tfddj	D _
dS )zk
        The main reason for overriding this function is to enhance  code
        maintainability.
        c              	   3   s,    | ]}t jd  jjjdV  qdS r
   r   N)r!   r"   
input_size
lora_dtyper   )ro   _ lora_a_output_size_per_partitionrX   rs   r;   r/   r0   rq          	
zIMergedColumnParallelLinearWithLoRA.create_lora_weights.<locals>.<genexpr>c              	   3   s,    | ]}t jd | j jjdV  qdS rt   )r!   r"   max_lora_rankrv   r   rn   )rX   rs   r;   r/   r0   rq      rz   N)rX   fully_sharded_lorasr{   r   rR   rf   ranger   r   r   r   r;   rs   rX   rZ   r/   rx   r0   create_lora_weights   s   
	
	z6MergedColumnParallelLinearWithLoRA.create_lora_weightsr>   c                 C   r?   r@   r/   rA   r/   r/   r0   rB      s   z/MergedColumnParallelLinearWithLoRA.slice_lora_arC   c                 C   sd   d g| j  }tt| j| jD ] \}\}}||  }d ur/||| ||d  d d f ||< q|S r3   )r   	enumerateziprr   r   )r;   rC   sliced_lora_bishard_idrH   lora_b_ir/   r/   r0   rN      s   z/MergedColumnParallelLinearWithLoRA.slice_lora_bindexc                 C   s   |  | | jdkr| |}| |}t| jD ]F}||  }d ur=| j| |dd |jd d |jd f j|dd ||  }d ur_| j	| |dd |jd d |jd f j|dd qd S )Nr
   r   T)non_blocking)

reset_lorarR   rB   rN   r}   r   r   r    copy_r   )r;   r   r>   rC   r   lora_a_ir   r/   r/   r0   set_lora   s&   



z+MergedColumnParallelLinearWithLoRA.set_lorarW   rY   c                 C      t |tu ot|dkS )Nr   )r6   r   r   r^   r/   r/   r0   r`     s   

z4MergedColumnParallelLinearWithLoRA.can_replace_layerr@   )ra   rb   rc   rd   r   r   r5   intr   r   r   rj   r!   re   rB   rN   r   rg   r   rh   ri   rk   r`   rl   r/   r/   r<   r0   rm      s`    
)


rm   c                       sp   e Zd ZdZdeddf fddZdejdejfdd	Ze	e
	dd
ejdedededB def
ddZ  ZS )QKVParallelLinearWithLoRAa  
    ColumnParallelLinear layer that is specifically designed for
    qkv_proj. Certain models, such as chatglm3 and baichuan-7b,
    only contains a single LoRA within their qkv_proj layer.

    During inference with Tensor Parallel, the weights of lora_b
    must be accurately partitioned according to the respective ranks.

    Q slice may have different shape than K and V slices (which both have
    the same shape).
    r   r2   Nc                    s^   t  | | jj| jj | _| jj| jj | _| jj| jj | _	| jj
| jj | _d| _d S r3   )r4   r5   r   total_num_heads	head_sizeq_proj_total_size	num_headsq_proj_shard_sizenum_kv_headskv_proj_shard_sizetotal_num_kv_headskv_proj_total_sizer   r:   r<   r/   r0   r5   1  s   
z"QKVParallelLinearWithLoRA.__init__rC   c                 C   s   | j | _| j | jj | _|| j| j | j| jd  d d f }| j}||| j| j  || j| jd   d d f }|| j }||| j| j  || j| jd   d d f }t	j
|||gdd}|S )Nr
   r   rD   )rF   
q_shard_idr   num_kv_head_replicaskv_shard_idr   r   r   r   r!   rG   )r;   rC   lora_b_qk_offsetlora_b_kv_offsetlora_b_vr/   r/   r0   rN   @  s6   
z&QKVParallelLinearWithLoRA.slice_lora_brW   rX   rY   rZ   c                 C   r   r3   r6   r   r   r^   r/   r/   r0   r`   W     	z+QKVParallelLinearWithLoRA.can_replace_layerr@   )ra   rb   rc   rd   r   r5   r!   re   rN   rg   r   rh   ri   r   rj   r   rk   r`   rl   r/   r/   r<   r0   r   $  s$    r   c                       s   e Zd ZdZdeddf fddZ	ddeded	edB ddf fd
dZ	e
e	ddejdeded	edB def
ddZ  ZS )MergedQKVParallelLinearWithLoRAaK  MergedColumnParallelLinear layer that is composed of 3 sublayers (slices)
    packed together in qkv proj fashion
    (q_proj + k_proj + v_proj -> qkv_proj).

    This means we have 3 LoRAs, each applied to one slice of the layer.

    Q slice may have different shape than K and V slices (which both have
    the same shape).
    r   r2   Nc                    s~   t  | t| jj| _| jj| jj | _| jj	| jj | _
| j| _| j| jj | _| j| j
| j
f| _| j| j| jf| _d S r@   )r4   r5   r   r   r[   r   r   r   r   r   r   rF   r   r   r   r   rr   r:   r<   r/   r0   r5   n  s   
z(MergedQKVParallelLinearWithLoRA.__init__rs   rX   rZ   c                    s   t  ||| dS )z
        The main reason for overloading this function is to handle inconsistent
        weight dimensions in qkv lora.
        N)r4   r   r~   r<   r/   r0   r     s   
z3MergedQKVParallelLinearWithLoRA.create_lora_weightsrW   rY   c                 C   r   )Nr\   r   r^   r/   r/   r0   r`     r   z1MergedQKVParallelLinearWithLoRA.can_replace_layerr@   )ra   rb   rc   rd   r   r5   r   r   r   r   rg   r   rh   ri   rj   rk   r`   rl   r/   r/   r<   r0   r   c  s6    
r   c                          e Zd ZdZdejdejfddZddejdejdB dejfd	d
Zee		dde
jdedededB def
 fddZ  ZS )#ColumnParallelLinearWithShardedLoRAz
    Differs from ColumnParallelLinearWithLoRA by slicing LoRA A also.

    Based on S-LoRA, slicing happens along the rank dim.
    r>   r2   c                 C   6   | j d jd }| j| }|||| d d f }|S Nr   r   r   r    rF   r;   r>   rH   rL   r/   r/   r0   rB        
z0ColumnParallelLinearWithShardedLoRA.slice_lora_aNr(   r)   c                 C      t ||| S r@   r1   r;   r(   r)   r/   r/   r0   r        z)ColumnParallelLinearWithShardedLoRA.applyrW   rX   rY   rZ   c                       t  j||||ddS NF)rW   rX   rY   rZ   decorater4   r`   r^   r<   r/   r0   r`        
z5ColumnParallelLinearWithShardedLoRA.can_replace_layerr@   ra   rb   rc   rd   r!   re   rB   r   rg   r   rh   ri   r   rj   r   rk   r`   rl   r/   r/   r<   r0   r     s$    "r   c                          e Zd ZdZdeejdB  deejdB  fddZddejdejdB dejfd	d
Ze	e
	ddejdedededB def
 fddZ  ZS ))MergedColumnParallelLinearWithShardedLoRAz
    Differs from MergedColumnParallelLinearWithLoRA by slicing the
    LoRA A's also.

    Based on S-LoRA, slicing happens along the rank dim.
    r>   Nr2   c                 C   s|   | j d jd }| j| }|d d ur!|d ||| d d f nd |d d ur9|d ||| d d f g}|S d g}|S )Nr   r   r
   r   )r;   r>   output_shard_sizeoutput_start_idxr/   r/   r0   rB     s   
z6MergedColumnParallelLinearWithShardedLoRA.slice_lora_ar(   r)   c                 C   r   r@   r   r   r/   r/   r0   r     r   z/MergedColumnParallelLinearWithShardedLoRA.applyrW   rX   rY   rZ   c                    r   r   r   r^   r<   r/   r0   r`     r   z;MergedColumnParallelLinearWithShardedLoRA.can_replace_layerr@   ra   rb   rc   rd   rj   r!   re   rB   r   rg   r   rh   ri   r   r   rk   r`   rl   r/   r/   r<   r0   r     s,    
"r   c                       r   ) QKVParallelLinearWithShardedLoRAz
    Differs from QKVParallelLinearWithLoRA by slicing the
    LoRA A's also.

    Based on S-LoRA, slicing happens along the rank dim.
    r>   r2   c                 C   r   r   r   r   r/   r/   r0   rB      r   z-QKVParallelLinearWithShardedLoRA.slice_lora_aNr(   r)   c                 C   r   r@   r   r   r/   r/   r0   r     r   z&QKVParallelLinearWithShardedLoRA.applyrW   rX   rY   rZ   c                    r   r   r   r^   r<   r/   r0   r`   	  r   z2QKVParallelLinearWithShardedLoRA.can_replace_layerr@   r   r/   r/   r<   r0   r     s$    "r   c                       r   )&MergedQKVParallelLinearWithShardedLoRAz
    Differs from MergedQKVParallelLinearWithLoRA by slicing the
    LoRA A's also.

    Based on S-LoRA, slicing happens along the rank dim.
    r>   Nr2   c                    s    fddt dD  fddt dD }|d d ur1|d |d |d d  d d f nd |d d urL|d |d |d d  d d f nd |d d urj|d |d |d d  d d f g}|S d g}|S )Nc                    s   g | ]
} j | jd  qS )r   )r   r    ro   r   rp   r/   r0   
<listcomp>(  s    zGMergedQKVParallelLinearWithShardedLoRA.slice_lora_a.<locals>.<listcomp>r\   c                    s   g | ]	} j |  qS r/   )rF   r   r;   rH   r/   r0   r   )  s    r   r
   r   )r}   )r;   r>   rL   r/   r   r0   rB   $  s   ((&z3MergedQKVParallelLinearWithShardedLoRA.slice_lora_ar(   r)   c                 C   r   r@   r   r   r/   r/   r0   r   7  r   z,MergedQKVParallelLinearWithShardedLoRA.applyrW   rX   rY   rZ   c                    r   r   r   r^   r<   r/   r0   r`   :  r   z8MergedQKVParallelLinearWithShardedLoRA.can_replace_layerr@   r   r/   r/   r<   r0   r     s,    
"r   c                       sv   e Zd ZdZee	ddejdede	de
dB def
dd	Zd
edeje	ej B deje	ej B f fddZ  ZS )/MergedColumnParallelLinearVariableSliceWithLoRAzMergedColumnParallelLinear with variable number of slices (3+).

    This handles cases where the checkpoint has a single weight for the whole
    module (not split into slices), but the layer itself has multiple slices.
    NrW   rX   rY   rZ   r2   c                 C   sH   t |turdS t|dkrdS t|dkrdS t|do#t|jdkS )NFr\   Tr   r[   )r6   r   r   r]   r[   r^   r/   r/   r0   r`   V  s   
zAMergedColumnParallelLinearVariableSliceWithLoRA.can_replace_layerr   r>   rC   c           	         s   |  | t|tjr|g| j }t|tjr9| jj}g }d}|D ]}|| }||||ddf  |}q!|}t 	||| dS )zSOverride to handle single tensor weights
        that need to be split into slices.r   N)
r   
isinstancer!   re   r   r   r[   appendr4   r   )	r;   r   r>   rC   r[   lora_b_listrL   r9   rM   r<   r/   r0   r   v  s   
z8MergedColumnParallelLinearVariableSliceWithLoRA.set_lorar@   )ra   rb   rc   rd   rg   r   rh   ri   r   rj   r   rk   r`   r   r!   re   r   rl   r/   r/   r<   r0   r   M  s.    r   )r   r   ) r!   torch.nnrh   transformersr   vllm.config.lorar   vllm.distributedr   vllm.distributed.utilsr   !vllm.model_executor.layers.linearr   r   r   vllm.platformsr	   base_linearr   utilsr   r   r1   r   rm   r   r   r   r   r   r   r   r/   r/   r/   r0   <module>   s,   
3cw??(.$
1