o
    ۷iW                     @   s  d dl Z d dlZd dlmZ d dlZd dlZd dlmZ ddlm	Z	m
Z
 e	 r9d dlZd dlmZ d dlmZmZ edd d	v oPej oPej d  d
kZerae
 rad dlmZ edZndZejjejjejjhZejjejjejj ejj!ejj"ejj#hZ$ejj%ejj&ejj'ejj(ejj)hZ*ejj+ejj,ejj-ejj.ejj/ejj0ejj1ejj2ejj3h	Z4e$e*B e4B Z5e$e*B e4B Z6e$e*B Z7dej8dej8de9dej8fddZ:dd Z;dg fddZ<g fddZ=dZ>dZ?dd Z@dd ZAd d! ZBdDd"d#ZCdDd$d%ZDdDd&d'ZEdDd(d)ZFdDd*d+ZGdDd,d-ZHdDd.d/ZIdDd0d1ZJdDd2d3ZKdDd4d5ZLdDd6d7ZMdDd8d9ZNdDd:d;ZOejPZPejj3eNejj2eOejjeMejj"eCejj!eDejj eEejjeFejjeGejj)eHejj(eIejj'eJejj&eKejj%eLiZQeReQS ZTd<d= ZUd>d? ZVG d@dA dAejjWZXG dBdC dCejYZZdS )E    N)nullcontext   )is_accelerate_availableis_kernels_available)init_empty_weights)add_hook_to_moduleremove_hook_from_moduleDIFFUSERS_GGUF_CUDA_KERNELSfalse)1trueyes   )
get_kernelzIsotr0py/ggmlxqweightqweight_typereturnc                 C   s   |t v rt|}| |j S |tv r=tj| \}}|jd |jd | | f}tj||g|R  }| |	| j
j }| S t|}td| )Nr      z$Unsupported GGUF quantization type: )UNQUANTIZED_TYPESdequantize_gguf_tensorTDEQUANT_TYPESggufGGML_QUANT_SIZESshapeopsggml_dequantizetodtypeGGMLQuantizationTypeNotImplementedError	as_tensor)r   r   r   weight
block_size	type_sizer   y r'   U/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/quantizers/gguf/utils.py_fused_mul_mat_ggufO   s   

r)   c                 C   s\   t tj| jj}| j}i }t|j}|	 D ]}||j
v r$|| ||< q|di |}|S )a  
    Creates a new hook based on the old hook. Use it only if you know what you are doing ! This method is a copy of:
    https://github.com/huggingface/peft/blob/748f7968f3a31ec06a1c2b0328993319ad9a150a/src/peft/utils/other.py#L245 with
    some changes
    Nr'   )getattr
acceleratehooks	__class____name____dict__inspect	signature__init__keys
parameters)old_hookold_hook_clsold_hook_attrfiltered_old_hook_attrold_hook_init_signatureknew_hookr'   r'   r(   _create_accelerate_new_hookn   s   
r<    c              	   C   s   dd }t |  }|sd S |  D ][\}}|| d }	t||||	| t|tjrm|||	rm||vrmt r8tnt	}
|
  t
|j|j|jd u|d| j|< W d    n1 sXw   Y  t|| j| _| j| d q| S )Nc                 S   s   |d }|| v ot | | tS )Nr#   )
isinstanceGGUFParameter)
state_dictprefix
weight_keyr'   r'   r(   _should_convert_to_gguf   s   z:_replace_with_gguf_linear.<locals>._should_convert_to_gguf.)compute_dtypeF)listchildrennamed_children_replace_with_gguf_linearr>   nnLinearr   r   r   
GGUFLinearin_featuresout_featuresbias_modulestype
source_clsrequires_grad_)modelrE   r@   rA   modules_to_not_convertrC   has_childrennamemodulemodule_prefixctxr'   r'   r(   rI      s2   
rI   c              	   C   s  |   D ]{\}}t|trr||vrr|jj}t|dd }t r tnt}|  t	j
|j|j|jd u|d}W d    n1 s>w   Y  t	t|j|_|d urS||_t|drh|j}t|}	t| t||	 || || j|< t| }
|
rt|| q| S )NrO   )device_hf_hook)rH   r>   rL   r#   r[   r*   r   r   r   rJ   rK   rM   rN   rO   	Parameterr   hasattrr\   r<   r   r   r   rP   rF   rG   #_dequantize_gguf_and_restore_linear)rT   rU   rW   rX   r[   rO   rZ   
new_moduler5   r;   rV   r'   r'   r(   r_      s8   




r_         c                 C   sf   |  tjtj} | d d df | d d df d> B | d d df d> B | d d df d> B dS )Nr   r            r      )viewtorchuint8r   int32	unsqueeze)r   r'   r'   r(   	to_uint32   s   Rrl   c                 G   s0   | j d }t||t| g }tj| |ddS )Nr   dim)r   rF   sumrh   split)blocksargsn_maxdimsr'   r'   r(   split_block_dims   s   
ru   c                 C   s   | j d }| tj} | |ddf} tj| | j d d dd\}}}tj|d@ |d@ |d? d	@ B gd
d}tj|d@ |d? |d? d	@ B gd
d}||df||dffS )Nr   r      rm   ?      rd   0   rc   )r   rg   rh   ri   reshaperp   cat)scalesn_blocksdmm_dscminr'   r'   r(   get_scale_min   s   
 &&r   c                 C   s4   t | d\}}|tj|}|tj}|| S )Nrd   )ru   rg   rh   float16r   int8)rq   r$   r%   r   r   r   r'   r'   r(   dequantize_blocks_Q8_0   s   r   c           
      C   s   | j d }t| ddd\}}}}|tj|}|tj|}t|}||dftjd|j	tj
ddd? }||dd|d ftjddg|j	tjddddd? }	|d@ tj}|	d@ |df}	|	|d> B }|| | S )	Nr   rd   rv   r       r[   r   r{   ry   )r   ru   rg   rh   r   r   rl   r|   aranger[   rj   tensorri   )
rq   r$   r%   r   r   r   r   qhqsqlr'   r'   r(   dequantize_blocks_Q5_1   s   
*r   c           	      C   s   | j d }t| dd\}}}|tj|}t|}||dtjd|j	tj
ddd? }||dd|d tjddg|j	tjddddd? }|d@ tj}|d@ |d}||d> B tjd	 }|| S )
Nr   rd   rv   r   r   r   r{   ry   re   )r   ru   rg   rh   r   r   rl   r|   r   r[   rj   r   ri   r   )	rq   r$   r%   r   r   r   r   r   r   r'   r'   r(   dequantize_blocks_Q5_0   s   
(r   c                 C   s   | j d }t| dd\}}}|tj|}|tj|}||dd|d ftjddg|jtj	ddddd? }|d@ |d}|| | S )Nr   rd   r{   r   rv   r   ry   )
r   ru   rg   rh   r   r   r|   r   r[   ri   )rq   r$   r%   r   r   r   r   r   r'   r'   r(   dequantize_blocks_Q4_1  s   
r   c                 C   s   | j d }t| d\}}|tj|}||dd|d ftjddg|jtj	dd? }|d@ |dftj
d	 }|| S )
Nr   rd   r{   r   rv   r   r   r   rd   r   ry   rc   )r   ru   rg   rh   r   r   r|   r   r[   ri   r   )rq   r$   r%   r   r   r   r   r'   r'   r(   dequantize_blocks_Q4_0!  s   
r   c           
      C   s.  | j d }t| td td td \}}}}|tj|}|tj|}|| |td df}||dddftj	ddg|j
tjdd	? }|d
@ |ddf}||dddftj	g d|j
tjdd? }|d@ |ddf}||d> B tjd }	|	|td df}	||	 |tfS )Nr   rd   rv   re   r   r{   @   r   r   ry   r   r   rd   rv      r   r   rv   r   r   )r   ru   QK_Krg   rh   r   r   r   r|   r   r[   ri   )
rq   r$   r%   r   r   r   r   r~   r   qr'   r'   r(   dequantize_blocks_Q6_K.  s*   
((r   c                 C   s(  | j d }t| ddttd \}}}}}	|tj|}|tj|}t|\}
}||
 	|ddf}|| 	|ddf}|		|dddftj
ddg|jtjd	d	? }|	|dddftjdd|jtjd	d
? }|d@ 	|ddf}|d@ 	|ddf}||d> B }|| | 	|tfS )Nr   rd   rc   r{   r   r   rv   r   r   r   r   rc   r   ry   )r   ru   K_SCALE_SIZEr   rg   rh   r   r   r   r|   r   r[   ri   r   )rq   r$   r%   r   r   r   dminr~   r   r   r   r   dmr   r   r'   r'   r(   dequantize_blocks_Q5_KJ  s"   
(&r   c                 C   s   | j d }t| ddt\}}}}|tj|}|tj|}t|\}	}
||	 |ddf}||
 |ddf}||dddftj	ddg|j
tjdd? }|d	@ |ddf}|| | |tfS )
Nr   rd   r{   r   r   rv   r   r   ry   )r   ru   r   rg   rh   r   r   r   r|   r   r[   ri   r   )rq   r$   r%   r   r   r   r   r~   r   r   r   r   r'   r'   r(   dequantize_blocks_Q4_Kd  s   
(r   c                 C   s  | j d }t| td td d\}}}}|tj|}|d d d df |d d dd f }	}
|	|ddftjddg|j	tj
dd? }	|	|df}	|
|ddftjg d	|j	tj
dd
? }
|
|df}
|	d@ |
d@ d> B }|tjd }|| |ddf}||dddftjg d	|j	tj
dd? }||dddtjdd|j	tj
dd? }||dtd fd@ }||dtd fd@ dA }|tj|d> tj }|| |tfS )Nr   rc   rv   rb   r   r   )r   rd   r   re   r   )r   rv   r   ry   r   r   r{   r   r   rd   )r   ru   r   rg   rh   r   r   r|   r   r[   ri   r   r   )rq   r$   r%   r   r   hmaskr   r~   r   lscaleshscalesdlr   r   r   r'   r'   r(   dequantize_blocks_Q3_Kx  s6   
*&($r   c                 C   s   | j d }t| td td d\}}}}|tj|}|tj|}||d@  |td df}	||d?  |td df}
tjg d|j	tj
dd	}||d
ddf|? d@ }||td df}|	| |
 }||d
fS )Nr   re   rv   rd   ry   r   r   r   r   r{   r   r   )r   ru   r   rg   rh   r   r   r|   r   r[   ri   )rq   r$   r%   r   r   r~   r   r   r   r   mlshiftr'   r'   r(   dequantize_blocks_Q2_K  s   
r   c                 C   s    |  tjtjd>  tjS )Nre   )rg   rh   int16r   rj   float32)rq   r$   r%   r   r'   r'   r(   dequantize_blocks_BF16  s    r   c                 C   s   t jg dt j| jd}| jd }t| d\}}|t j|}|	|dd|d ft jddg| jt j
d	d	? }|d
@ 	|dft j}|ddd}|d}t ||jd |jd dd|}|d|}|| S )Niiiiiiiir         &   5   E   Y   q   r   r[   r   rd   r{   r   rv   r   r   ry   re   )rh   r   r   r[   r   ru   rg   r   r   r|   ri   int64rk   gatherexpandsqueeze)rq   r$   r%   r   kvaluesr   r   r   r'   r'   r(   dequantize_blocks_IQ4_NL  s&   

$r   c                 C   s  t jg dt j| jd}| jd }t| ddtd \}}}}	|t j	|}|t j
}||ddft jddg| jt jd	d
? }||ddft jdd ttd D | jt jd	d? }||dfd@ }||dfd@ }||d> B d }
||
	| |ddf}t jddg| jt jd	dddd}|	|dddf|? }	|	d@ |ddf	t j}	|dddd}|	d}	t ||	jd |	jd |	jd dd|	}	|	d	|}	||	 |dS )Nr   r   r   rd   r   r{   r   rv   r   )r   r   rd   c                 S   s   g | ]}d | qS )rd   r'   ).0ir'   r'   r(   
<listcomp>  s    z,dequantize_blocks_IQ4_XS.<locals>.<listcomp>r   )r   r{   r   ry   r   re   )rh   r   r   r[   r   ru   r   rg   r   r   r   r|   ri   ranger   rk   r   r   r   )rq   r$   r%   r   r   r   r   scales_hscales_lr   r~   r   shifts_qr'   r'   r(   dequantize_blocks_IQ4_XS  s>   
$
,r   c                 C   s"   g | d d | d | | R S )Nr{   r'   )r   r%   r$   r'   r'   r(   _quant_shape_from_byte_shape  s   "r   c           	      C   s~   t | ds| S | j}t| }t| \}}|  } | tj} t| j	||}| 
 | }| ||f}||||}||}|S )N
quant_type)r^   r   dequantize_functionsr   r"   rg   rh   ri   r   r   numelr|   )	r   r   
dequant_fnr$   r%   r   r   rq   dequantr'   r'   r(   r     s   

r   c                       s@   e Zd ZdddZdd Zedd Zed fd
d	Z  Z	S )r?   FNc                 C   sL   |d ur|nt d}t j| ||}||_t| \}}t|j|||_|S Nr   )	rh   emptyTensor_make_subclassr   r   r   r   quant_shape)clsdatarequires_gradr   selfr$   r%   r'   r'   r(   __new__  s   zGGUFParameter.__new__c                 C   s   t jt j| | jS N)rh   r   r   r   )r   r'   r'   r(   r"      s   zGGUFParameter.as_tensorc                 C   sH   | D ]}t |trt |d tr|d j  S t |tr!|j  S qd S r   )r>   rF   r?   r   )rr   argr'   r'   r(   _extract_quant_type#  s   

z!GGUFParameter._extract_quant_typer'   c                    s~   |d u ri }t  ||||}t|tjr  | |dS t|ttfv r= | fdd|D }t||S |S )Nr   c                    s(   g | ]}t |tjr |d n|qS )r   )r>   rh   r   )r   r   r   r   r'   r(   r   =  s   ( z4GGUFParameter.__torch_function__.<locals>.<listcomp>)	super__torch_function__r>   rh   r   r   rQ   rF   tuple)r   functypesrr   kwargsresultwrappedr-   r   r(   r   /  s   

z GGUFParameter.__torch_function__)FN)r'   N)
r.   
__module____qualname__r   r"   staticmethodr   classmethodr   __classcell__r'   r'   r   r(   r?     s    
	
r?   c                       sX   e Zd Z			d	d fddZdejfddZdejfd	d
ZdejfddZ  Z	S )rL   FNr   c                    s"   t  |||| || _|| _d S r   )r   r2   rE   r[   )r   rM   rN   rO   rE   r[   r   r'   r(   r2   D  s   
zGGUFLinear.__init__inputsc                 C   s*   t d ur| jjr|jr| |S | |S r   )r   r#   is_cudaforward_cudaforward_native)r   r   r'   r'   r(   forwardP  s   

zGGUFLinear.forwardc                 C   sH   t | j}|| j}| jd ur| j| jnd }tjj|||}|S r   )	r   r#   r   rE   rO   rh   rJ   
functionallinear)r   r   r#   rO   outputr'   r'   r(   r   U  s
   
zGGUFLinear.forward_nativec                 C   s>   | j j}t|| j| j |}| jd ur|| j| j7 }|S r   )r#   r   r)   r   rE   rO   )r   r   r   r   r'   r'   r(   r   ]  s
   
zGGUFLinear.forward_cuda)FNN)r   N)
r.   r   r   r2   rh   r   r   r   r   r   r'   r'   r   r(   rL   C  s    rL   r   )[r0   os
contextlibr   r   rh   torch.nnrJ   utilsr   r   r+   r   accelerate.hooksr   r   getenvlowercudais_availableget_device_capabilitycan_use_cuda_kernelskernelsr   r   r    F32F16BF16r   Q4_0Q4_1Q5_0Q5_1Q8_0Q8_1STANDARD_QUANT_TYPESQ2_KQ3_KQ4_KQ5_KQ6_KKQUANT_TYPESIQ1_MIQ1_SIQ2_XXSIQ2_XSIQ2_SIQ3_XXSIQ3_SIQ4_XSIQ4_NLIMATRIX_QUANT_TYPESr   MMVQ_QUANT_TYPESMMQ_QUANT_TYPESr   intr)   r<   rI   r_   r   r   rl   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rF   r3   SUPPORTED_GGUF_QUANT_TYPESr   r   r]   r?   rK   rL   r'   r'   r'   r(   <module>   s   

	 !)









!


-