o
    㥵i@                     @   s  d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlmZ d dl	m  m
Z d dlmZ d dlmZ dd ZdDd	d
Zdd Zdd ZdDddZdDddZ	dDddZdDddZG dd dZdd ZG dd dZG dd dejjZdd  Zd!d" ZdEd$d%Zd&d' Z G d(d) d)Z!G d*d+ d+ejjZ"d,d- Z#e$ ej%d.ejed/d0d1d2ej%d3e&d4d5d6ej%d7e'dd8d6ej%d9e&d:d;d6d<ed=e&d>e'd?e&d@df
dAdBZ(e)dCkre(  dS dS )F    N)Path)
load_model)find_multiplec                 C   s   t t jj}t j| dd\}}t |t |}t |t |}|j}	t | |}|t	|| d  }
t j
|
|d| j}
t j| t j|	d}| |
d }t |}||d }t 
||||}||
|fS )N   )dim   min)dtypedevice)torchfinfofloat32epsaminmaxr	   
zeros_likemaxr   floatclamptor
   zerossizeint64	unsqueezeround)x	quant_min	quant_maxtarget_dtyper   min_valmax_valmin_val_negmax_val_posr   scaleszero_pointsx_divx_roundx_zpquant r*   H/home/ubuntu/.local/lib/python3.10/site-packages/tools/llama/quantize.py dynamically_quantize_per_channel   s   

r,         c           	      C   s   || j d kr| j d }|dksJ | j d | dksJ |  dks%J | d|}t| dks6J |jddd}|jddd}d| d }|| jdd| }||d|d    }|	tj
| j d d|	tj
| j d dfS )	Nr   r   r   r   T)r   keepdimgư>r   )shaper   reshaper   isnansumamaxaminr   r   bfloat16)	wn_bit	groupsizeto_quantr!   r    max_intr$   r   r*   r*   r+   get_group_qparams9   s"   
r<   c              	   C   sx   | j |j ksJ | jtjksJ |jtjksJ t| | d| dd||d|ddgddd S )Nr   r   r   )	r0   r
   r   r6   catr1   r   	transpose
contiguous)r$   r   r*   r*   r+   pack_scales_and_zerosN   s   r@   c                 C   sF   t | jdkr| jd dksJ | jtjksJ t| ddddS )N   r   r   r   )lenr0   r
   r   r   splitr>   )scales_and_zerosr*   r*   r+   unpack_scales_and_zeros_   s    rE   c           
      C   s   |dksJ || j d kr|j d dkr| j d }| j d | dks$J |  dks,J | d|}t| dks=J |dd}|dd}||d|d    }d| d }d}||| 	||
tj| }	|	S Nr   r   r   r   )r0   r   r1   r   r2   r3   subdivr   clamp_r   int32
reshape_as)
r7   r$   r   r8   r9   r:   r    r;   min_intw_int32r*   r*   r+   "group_quantize_tensor_from_qparamse   s(   
	rN   c                 C   s2   t | ||\}}t| ||||}t||}||fS N)r<   rN   r@   )r7   r8   r9   r$   r   rM   rD   r*   r*   r+   group_quantize_tensor   s   
rP   c                 C   s   |dksJ || j d kr|j d dkr| j d }| j d | dks$J |  dks,J | d|}|dd}|dd}|d|d  ||| }|S rF   )r0   r   r1   rG   muladdrK   )rM   r$   r   r8   r9   w_int32_groupedw_dqr*   r*   r+   $group_dequantize_tensor_from_qparams   s   
"rU   c                 C   s   t |\}}t| ||||S rO   )rE   rU   )rM   rD   r8   r9   r$   r   r*   r*   r+   group_dequantize_tensor   s   
rV   c                   @   s(   e Zd Zdd ZdddZddd	Zd
S )QuantHandlerc                 C   
   || _ d S rO   modselfrZ   r*   r*   r+   __init__      
zQuantHandler.__init__return	StateDictc                 C      d S rO   r*   r\   r*   r*   r+   create_quantized_state_dict      z(QuantHandler.create_quantized_state_dict	nn.Modulec                 C   ra   rO   r*   rb   r*   r*   r+   convert_for_runtime   rd   z QuantHandler.convert_for_runtimeN)r_   r`   )r_   re   )__name__
__module____qualname__r]   rc   rf   r*   r*   r*   r+   rW      s    
rW   c                 C   sB   |   D ]\}}t|tjrt| |t|j|j qt| qd S rO   )	named_children
isinstancennLinearsetattrWeightOnlyInt8Linearin_featuresout_features+replace_linear_weight_only_int8_per_channel)modulenamechildr*   r*   r+   rr      s   
rr   c                   @   s,   e Zd Zdd Ze dd Zdd ZdS )WeightOnlyInt8QuantHandlerc                 C   rX   rO   rY   r[   r*   r*   r+   r]      r^   z#WeightOnlyInt8QuantHandler.__init__c                 C   sr   | j  }| j  D ],\}}t|tjjr6t|j	 ddtj
\}}}||| d< ||jj|| d< q
|S )Ni   .weightz.scales)rZ   
state_dictnamed_modulesrk   r   rl   rm   r,   weightr   int8r   r
   )r\   cur_state_dictfqnrZ   int8_weightr$   _r*   r*   r+   rc      s   

z6WeightOnlyInt8QuantHandler.create_quantized_state_dictc                 C   s   t | j | jS rO   )rr   rZ   rb   r*   r*   r+   rf      s   
z.WeightOnlyInt8QuantHandler.convert_for_runtimeNrg   rh   ri   r]   r   no_gradrc   rf   r*   r*   r*   r+   rv      s
    
rv   c                	       sp   e Zd ZU ddgZeed< eed< ejed< 			ddedededdf fdd	Z	d
ejdejfddZ
  ZS )ro   rp   rq   r{   TNbiasr_   c                    sX   ||d}t    || _|| _| dtj||ftjd | dtj|tj	d d S )N)r   r
   r{   r
   r$   )
superr]   rp   rq   register_bufferr   emptyr|   onesr6   )r\   rp   rq   r   r   r
   factory_kwargs	__class__r*   r+   r]      s   

zWeightOnlyInt8Linear.__init__inputc                 C   s   t || jj|jd| j S )Nr   )Flinearr{   r   r
   r$   )r\   r   r*   r*   r+   forward   s   zWeightOnlyInt8Linear.forward)TNNrg   rh   ri   __constants__int__annotations__r   Tensorboolr]   r   __classcell__r*   r*   r   r+   ro      s$   
 
ro   c                 C   s*   t | d|d\}}tjj||}||fS )Nr-   )r8   r9   )rP   r   opsaten_convert_weight_to_int4pack)weight_bf16r9   inner_k_tilesweight_int32rD   weight_int4packr*   r*   r+   (prepare_int4_weight_and_scales_and_zeros   s   
r   c                 C   sL   |   }| d|d } tjj| |||}|d d |f }||}|S )Nr   )r   r1   r   r   r   _weight_int4pack_mm)r   r   rD   rq   r9   origin_x_sizec	new_shaper*   r*   r+   linear_forward_int4   s   
r   r   c                 C   s   | | dko| |d  dkS )Nr      r*   )kr9   r   r*   r*   r+   _check_linear_int4_k  s   r   c                 C   s   |   D ]<\}}t|tjr9t|j||r&t| |t|j|jd||dd q|r8t| |t|j|jd||dd qt	|||| qd S )NF)r   r9   r   paddingT)
rj   rk   rl   rm   r   rp   rn   WeightOnlyInt4Linearrq   replace_linear_int4)rs   r9   r   r   rt   ru   r*   r*   r+   r     s>   r   c                   @   s.   e Zd ZdddZe dd Zdd	 Zd
S )WeightOnlyInt4QuantHandlerr.      Tc                 C   s4   || _ || _|| _|| _|dv sJ |dv sJ d S )N)    @   r.      )r   r-   r   )rZ   r9   r   r   )r\   rZ   r9   r   r   r*   r*   r+   r]   -  s   z#WeightOnlyInt4QuantHandler.__init__c                 C   s6  | j  }| j  D ]\}}t|tjjr|jrJ |j}|j	}|d dks*J dt
d| d| d|  |jj}t|| j| jss| jrhdd lm  m} t
d| d t|d	}|j|d|| fd
}nt
d| dd  q
t|tjd| j| j\}	}
|	d|| d< |
d|| d< q
|S )Nr   r   require out_features % 8 == 0zlinear: z, in=z, out=z	warning: z- is padded to satisfy in_features % 1024 == 0   padzP is skipped, int4 requires that in_features is 32, 64, or is divisible by 1024, z=and that groupsize and inner_k_tiles*16 evenly divide into itcudacpurx   z.scales_and_zeros)rZ   ry   rz   rk   r   rl   rm   r   rq   rp   printr{   datar   r9   r   r   torch.nn.functional
functionalr   r   r   r   r6   )r\   r}   r~   rZ   rq   rp   r{   r   padded_in_featuresr   rD   r*   r*   r+   rc   5  sN   





z6WeightOnlyInt4QuantHandler.create_quantized_state_dictc                 C   s   t | j| j| j| j | jS rO   )r   rZ   r9   r   r   rb   r*   r*   r+   rf   a  s   z.WeightOnlyInt4QuantHandler.convert_for_runtimeN)r.   r   Tr   r*   r*   r*   r+   r   ,  s
    

+r   c                       s~   e Zd ZU ddgZeed< eed< ejed< 						ddededed	ed
eddf fddZ	dejdejfddZ
  ZS )r   rp   rq   r{   TNr.   r   r9   r   r   r_   c	           	   	      s   t    || _|r|| _t|d}|| _|| _|rJ d|| _|| _|d dks.J d||d  dks:J d| 	dt
j|d ||d  d	|d
 ft
jd | 	dt
j|| |d
ft
jd d S )Nr   zrequire bias=Falser   r   r   r   z-require in_features % (innerKTiles * 16) == 0r{   r   r   r   rD   )r   r]   r   origin_in_featuresr   rp   rq   r9   r   r   r   r   rJ   r6   )	r\   rp   rq   r   r   r
   r9   r   r   r   r*   r+   r]   l  s>   


zWeightOnlyInt4Linear.__init__r   c                 C   sV   | tj}| jrdd lm  m} |j|d| j| j	 fd}t
|| j| j| j| jS )Nr   r   )r   r   r6   r   r   rl   r   r   rp   r   r   r{   rD   rq   r9   )r\   r   r   r*   r*   r+   r     s   zWeightOnlyInt4Linear.forward)TNNr.   r   Tr   r*   r*   r   r+   r   f  s2   
 
	
.r   c                  C   s   t j  } | d}|S )Nz%Y%m%d_%H%M%S)datetimenowstrftime)r   folder_namer*   r*   r+   generate_folder_name  s   

r   z--checkpoint-pathT)	path_typeexistszcheckpoints/fish-speech-1.4)typedefaultz--moder|   ztype of quantization to perform)r   r   helpz--groupsizez!Group size for int4 quantization.z--timestampNonezWhen to do quantizationcheckpoint_pathmoder9   	timestampr_   c                 C   s  d}t j}td t }t| ||dd\}}d}	|dkr|nt }
|dkrZtd t|}| }| }td	|
 }t	
t| t|  ||	  rU||	   |d
 }nD|dkrtd t||}| }| }td| d|
 }t	
t| t|  ||	  r||	   |d
 }ntd| dtd|  |jdd t || tdt | dd d S )Nr   zLoading model ...F)r   r   	precisioncompilez	codec.pthr   r|   zPQuantizing model weights for int8 weight-only symmetric per-channel quantizationzcheckpoints/fs-1.2-int8-z	model.pthint4zWQuantizing model weights for int4 weight-only affine per-channel groupwise quantizationzcheckpoints/fs-1.2-int4-g-zInvalid quantization mode z, needs to be one of [int8, int4, int4-gpptq]zWriting quantized weights to T)
missing_okzQuantization complete took z.02fz seconds)r   r6   r   timer   r   rv   rc   r   shutilcopytreestrresolver   unlinkr   
ValueErrorsave)r   r   r9   r   r   r   t0modelr   vq_modelr   quant_handlerquantized_state_dictdir_namedst_namequantize_pathr*   r*   r+   quantize  sV   




r   __main__)r-   r.   )r   r   )*r   r   r   pathlibr   clickr   torch.nnrl   r   r   r   *fish_speech.models.text2semantic.inferencer   &fish_speech.models.text2semantic.llamar   r,   r<   r@   rE   rN   rP   rU   rV   rW   rr   rv   Modulero   r   r   r   r   r   r   r   commandoptionr   r   r   rg   r*   r*   r*   r+   <module>   s^   
#





!:?(8
