o
    ॵink                     @   s  d dl Z d dlZd dlmZ d dlZd dlm  mZ d dlZd dl	m	Z	 ddl
mZmZmZmZ dZdd Zd	d
 Z	 dZdZdZdZd Ze Ze Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd(ddZ 			d)ddZ!d d! Z"d"d# Z#G d$d% d%Z$G d&d' d'Z%dS )*    N)time)tqdm   )deviceget_optimal_devicetest_for_nanstorch_gcFc                  C   s\   t j r*t jtjd } | dkrd}|S | dkrd}|S | dkr&d}|S d}|S d	}|S )
N   >  i   .  i   @  i   i  i   torchcudais_availableget_device_propertiesr   total_memory)r   ENCODER_TILE_SIZE r   n/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/pipelines/multi_modal/diffusers_wrapped/vaehook.pyget_recommend_encoder_tile_size   s&   
	r   c                  C   sl   t j r2t jtjd } | dkrd}|S | dkrd}|S | dkr&d}|S | dkr.d	}|S d
}|S d
}|S )Nr	   i0u     r
      r      r   `   @   r   )r   DECODER_TILE_SIZEr   r   r   get_recommend_decoder_tile_size#   s,   
	r   zglobal constTc                 C   s   t j| ddS )NT)inplace)Fsiluxr   r   r   inplace_nonlinearityA   s   r#   c                 C   s   |j \}}}}||||| dd}d }d }|j \}}	}
| ||	|}| |}|d u r1|}n| jr9| |}| |}| |}| 	|}| 	|}| 	|}| 
|||}t||}| |}| jd |}| jd |}|dd||||}|S )Nr      r   )shapeview	transposeprepare_attention_maskto_q
norm_crossnorm_encoder_hidden_statesto_kto_vhead_to_batch_dimget_attention_scoresr   bmmbatch_to_head_dimto_outreshape)selfh_
batch_sizechannelheightwidthhidden_statesattention_maskencoder_hidden_statessequence_length_querykeyvalueattention_probsr   r   r   attn_forward_newJ   sB   






rE   c                 C   sJ   |  ddd f |  d|jf |  d|fddf |  dd g d S )N	store_resc                 S      | S Nr   r!   r   r   r   <lambda>t       zattn2task.<locals>.<lambda>pre_normattnc                 S   s
   t || S rH   )rE   )r"   netr   r   r   rI   v   s   
 add_res)append
group_norm)
task_queuerM   r   r   r   	attn2tasks   s   rR   c                 C   s   |j |jkr2tr|jr| d|jf n'| d|jf n|jr)| d|jf n| d|jf n	| ddd f | d|jf | dt	f | d|j
f | d|jf | dt	f | d|jf | dd	g d	S )
z
    Turn a ResNetBlock into a sequence of tasks and append to the task queue

    @param queue: the target task queue
    @param block: ResNetBlock

    rF   c                 S   rG   rH   r   r!   r   r   r   rI      rJ   zresblock2task.<locals>.<lambda>rK   r    conv1conv2rN   N)in_channelsout_channelssd_flaguse_conv_shortcutrO   conv_shortcutnin_shortcutuse_in_shortcutnorm1r#   rS   norm2rT   )queueblockr   r   r   resblock2taskz   s    r`   c           
      C   s  |rbt r1t| |jj t| |jj t|  t| |jj tt	|j
}|jd }d}|j}d}nCt| |jjd  t| |jjd  t| |jjd  t	t|j}d}t|jd }|j}d}nt	|j
}|j}|j
d }|j}d}|D ]?}t	|D ]}	t rt| || j|	  q|t| || j|	  q|||krt r| |t|| |f qv| ||| jd f qv|st rt| |jj t| |jj t| |jj dS t| |jjd  t| |jjd  t| |jjd  dS dS )z
    Build the sampling part of a task queue
    @param task_queue: the target task queue
    @param net: the network
    @param is_decoder: currently building decoder or encoder
    r   r   upsample   
upsamplers
downsampleN)rW   r`   midblock_1rR   attn_1printblock_2reversedrangenum_resolutionsnum_res_blocksup	mid_blockresnets
attentionslen	up_blocksdownr_   rO   getattrrc   )
rQ   rM   
is_decoderresolution_iter	block_ids	conditionmodule	func_namei_leveli_blockr   r   r   build_sampling   sZ   


r~   c                 C   s   g }| d| jf t|| | |rtsd| _d| _|r| jsNtr*| d| jf n| d| jf | dtf | d| j	f |rN| jrN| dt
jf |S )z
    Build a single task queue for the encoder or decoder
    @param net: the VAE decoder or encoder network
    @param is_decoder: currently building decoder or encoder
    @return: the task queue
    conv_inFrK   r    conv_outtanh)rO   r   r~   rW   give_pre_endtanh_outnorm_outconv_norm_outr#   r   r   r   )rM   rv   rQ   r   r   r   build_task_queue   s   

r   c                 C   s   dd | D S )zr
    Clone a task queue
    @param task_queue: the task queue to be cloned
    @return: the cloned task queue
    c                 S   s   g | ]	}d d |D qS )c                 S   s   g | ]}|qS r   r   ).0itemr   r   r   
<listcomp>   s    z/clone_task_queue.<locals>.<listcomp>.<listcomp>r   )r   taskr   r   r   r      s    z$clone_task_queue.<locals>.<listcomp>r   )rQ   r   r   r   clone_task_queue   s   r   ư>c           	      C   sp   |  d|  d}}t|| }|  jdt|| |g|   dd R  }tj|g ddd\}}||fS )z)
    Get mean and var for group norm
    r   r   r$   N)r   r$   rb      F)dimunbiased)sizeint
contiguousr(   r   var_mean)	input
num_groupsepsbcchannel_in_groupinput_reshapedvarmeanr   r   r   get_var_mean   s   

r   c              
   C   s   |  d|  d}}t|| }	|  jdt|| |	g|   dd R  }
tj|
||dddd|d}|j||g|   dd R  }|durS||dddd9 }|dura||dddd7 }|S )a  
    Custom group norm with fixed mean and var

    @param input: input tensor
    @param num_groups: number of groups. by default, num_groups = 32
    @param mean: mean, must be pre-calculated by get_var_mean
    @param var: var, must be pre-calculated by get_var_mean
    @param weight: weight, should be fetched from the original group norm
    @param bias: bias, should be fetched from the original group norm
    @param eps: epsilon, by default, eps = 1e-6 to match the original group norm

    @return: normalized tensor
    r   r   r$   NF)weightbiastrainingmomentumr   r%   )r   r   r   r(   r   
batch_norm)r   r   r   r   r   r   r   r   r   r   r   outr   r   r   custom_group_norm	  s.    
r   c                    sn    fdd|D fddt dD }| dddd|d | d|d  |d | d|d	  f S )
z
    Crop the valid region from the tile
    @param x: input tile
    @param input_bbox: original input bounding box
    @param target_bbox: output bounding box
    @param scale: scale factor
    @return: cropped tile
    c                    s    g | ]} r
|d  n|d  qS    r   r   irv   r   r   r   @  s     z%crop_valid_region.<locals>.<listcomp>c                    s   g | ]
}|  |  qS r   r   r   )padded_bboxtarget_bboxr   r   r   A  s    r   Nr$   rb   r   r   )rk   r   )r"   
input_bboxr   rv   marginr   )rv   r   r   r   crop_valid_region7  s
   	&r   c                    s    fdd}|S )Nc                     s   t  }tj rtjt t  t   | i |}t  t  tj rHtj	td }tjt t
dt  | dd|dd |S t
dt  | dd |S )Nr	   z[Tiled VAE]: Done in z.3fzs, max VRAM alloc z MBs)r   r   r   r   reset_peak_memory_statsr   r   gccollectmax_memory_allocatedrh   )argskwargstsretvramfnr   r   wrapperK  s"   

zperfcount.<locals>.wrapperr   )r   r   r   r   r   	perfcountI  s   r   c                   @   s0   e Zd Zdd Zdd Zdd Zedd Zd	S )
GroupNormParamc                 C   s"   g | _ g | _g | _d | _d | _d S rH   )var_list	mean_list
pixel_listr   r   r6   r   r   r   __init__j  s
   
zGroupNormParam.__init__c                 C   s   t |d\}}|jtjkr|  r| }t |d\}}| j| | j	| | j
|jd |jd   t|drG|j| _|j| _d S d | _d | _d S )N    r$   rb   r   )r   dtyper   float16isinfanyfloatr   rO   r   r   r'   hasattrr   r   )r6   tilelayerr   r   	fp32_tiler   r   r   add_tileq  s   

zGroupNormParam.add_tilec                    s   t jdkr	dS tjtj tj}tjjtjt	d| }t
|}|d| }tj
| ddtj
 | dd  fddS )zm
        summarize the mean and var and return a function
        that apply group norm on each tile
        r   N)r   r   r   )r   c                    s   t | d jjS )Nr   )r   r   r   r!   r   r6   r   r   r   rI     s    z(GroupNormParam.summary.<locals>.<lambda>)rr   r   r   vstackr   maxr   tensorfloat32r   sum	unsqueeze)r6   	max_valuepixels
sum_pixelsr   r   r   summary  s   


zGroupNormParam.summaryc                 C   s   t | d\}}|jtjkr3|  r3|  }t |d\}}|jjdkr3t	|dd}|
 }|
 }t|dr?|j}|j}nd}d}||||fdd}|S )	zF
        create a function from a single tile without summary
        r   mpsr   i`  r   Nc                 S   s   t | d||||dS )Nr   r   )r   )r"   r   r   r   r   r   r   r   group_norm_func  s   z1GroupNormParam.from_tile.<locals>.group_norm_func)r   r   r   r   r   r   r   r   typeclamphalfr   r   r   )r   normr   r   r   r   r   r   r   r   r   	from_tile  s   
zGroupNormParam.from_tileN)__name__
__module____qualname__r   r   r   staticmethodr   r   r   r   r   r   h  s    r   c                   @   sT   e Zd Z	dddZdd Zdd Zdd	 Ze d
d Z	e
e dd ZdS )VAEHookFc                 C   sP   || _ || _|| _|r| p|o|| _|o| | _|| _|r#d| _d S d| _d S )N   r   )rM   	tile_sizerv   	fast_mode	color_fixto_gpupad)r6   rM   r   rv   fast_decoderfast_encoderr   r   r   r   r   r     s   zVAEHook.__init__c              	   C   s   |j \}}}}t| j j}z4| jr| jt  t||| j	d | j
 kr8td | j|W | j| S | |W | j| S | j| w )Nr$   z<[Tiled VAE]: the input size is tiny and unnecessary to tile.)r'   nextrM   
parametersr   r   tor   r   r   r   rh   original_forwardvae_tile_forward)r6   r"   BCHWoriginal_devicer   r   r   __call__  s   
zVAEHook.__call__c                 C   sL   d}|dkr$|| }|dkr|S || | }||kr|S |d }|dks|S )z7
        Get the best tile size for GPU memory
        r   r$   r   r   )r6   
lowerbound
upperbounddividerremainer	candidater   r   r   get_best_tile_size  s   zVAEHook.get_best_tile_sizec                    s  g g }} j } j}t|d|  | }t|d|  | }t|d}t|d}t|d|  | }	t|d|  | }
 |	|}	 |
|}
td| d| d||  dd|
 d|	 d| d|  t|D ]}t|D ]}|||
  t||d |
  ||||	  t||d |	  |g}|d	 |kr|d	 nd	|d || k r|d n||d |kr|d nd	|d
 || k r|d
 n|g} fdd|D }|	| |	td	|d	 | t||d | td	|d | t||d
 | g qtqn||fS )z
        Tool function to split the image into tiles
        @param h: height of the image
        @param w: width of the image
        @return: tile_input_bboxes, tile_output_bboxes
        r$   r   z[Tiled VAE]: split to r"   =z tiles.zOptimal tile size z, original tile size r   rb   c                    s"   g | ]} j r|d  n|d  qS r   r   )r   r"   r   r   r   r     s    z'VAEHook.split_tiles.<locals>.<listcomp>)
r   r   mathceilr   r   rh   rk   minrO   )r6   hwtile_input_bboxestile_output_bboxesr   r   num_height_tilesnum_width_tilesreal_tile_heightreal_tile_widthr   jr   output_bboxr   r   r   split_tiles  sN   




	

 zVAEHook.split_tilesc                 C   s  |j }|}t|d }|dkr'|| d dkr'|d8 }|dkr'|| d dks|dks3|| d dkr7tdt|d D ]}|| }|d dkrct||d }	d|	f||< ||kr^ dS |	|}nw|d dkr|d }
|
|k r||
 d dkr|
d7 }
|
|k r||
 d dksy|
|krq=|d |||
 d< nA|d dkr||d |7 }d |d< n-|r|d d	krt||d D ]}|| d dkrd
|| d f||< q dS |d |}zt|d W q= ty } zt	| d W Y d }~ dS d }~ww t
d)Nr   r   rK   z%No group norm found in the task queue
apply_normTrF   rN   rd   store_res_cpuvaez;. Nan detected in fast mode estimation. Fast mode disabled.FzShould not reach here)r   rr   
ValueErrorrk   r   r   r   r   	Exceptionrh   
IndexError)r6   zrQ   r   r   r   last_idr   r   r   task_idr
  er   r   r   estimate_group_norm&  sX   

zVAEHook.estimate_group_normc           $   	      s  t | j j}| j}| j}| j}| }|jd |jd |jd }}}|j|_t	d|j d| d| j
  | ||\}	}
g }|	D ]!}|dddd|d |d |d |d f  }|| qEt|}d}t|| | jr|t|| }||}tj||d	d
}t	d|jd  d|jd  d tj|g ddd\}}tj|g ddd\}}|| | | | }~~~~tj|| | d}t }| j||| jdr| ~ fddt|D }d}d}~t|t|d  d|rdnd dd}d}d}	 t }|rt|ntt|D ]-}|| |}|	| }|| }d}t|dkr| d} | d dkrE|!|| d  ni| d dksS| d dkrd}!| d |}"| jrf| d dkrj|" }"||! d dkr|!d7 }!||! d dkss|"||! d< n| d dkr|| d |7 }d| d< n| d |}|"d t|dks0|r nt#|d t|dkrd||< |d7 }|du rtj$||jd |r|d  n|d  |r|d  n|d  f|dd!}t%||	| |
| ||dddd|
| d |
| d |
| d |
| d f< ~q||d kr+|r+d}|||< q|dkr;|s;d}|||< q| ||< ~q|rHn%||krNn|& }#|#durkt|D ]}|| }|'dd"|#f q[q|(  |durx|S ||S )#z
        Decode a latent vector z into an image in a tiled manner.
        @param z: latent vector
        @return: image
        r   r$   rb   z[Tiled VAE]: input_size: z, tile_size: z, padding: Nr   znearest-exact)scale_factormodezX[Tiled VAE]: Fast mode enabled, estimating group norm parameters on                     z x z image)r   r$   rb   T)r   keepdim)r  r   )r   c                    s   g | ]}t  qS r   )r   )r   r@   single_task_queuer   r   r     s    z,VAEHook.vae_tile_forward.<locals>.<listcomp>z[Tiled VAE]: Executing DecoderEncoderz Task Queue: )totaldescFrK   rF   r  rN   r  r   )r   requires_gradr  ))r   rM   r   r   r   rv   detachr'   last_z_shaperh   r   r  cpurO   rr   r   r   r   r   r   interpolater   std_meanclamp_r  r   r  r   rk   r   r   rj   popr   updater   zerosr   r   insertclose)$r6   r  r   rM   r   rv   Nr:   r;   	in_bboxes
out_bboxestilesr   r   	num_tilesnum_completedr  downsampled_zstd_oldmean_oldstd_newmean_newestimate_task_queuetask_queuesresultresult_approxpbarforwardinterruptedgroup_norm_paramr   rQ   r   r  resr   r   r  r   r   S  s   "












$

MzVAEHook.vae_tile_forwardN)F)r   r   r   r   r   r   r  r   no_gradr  r   r   r   r   r   r   r     s    	
?
,r   )r   )NNr   )&r   r   r   r   torch.nn.functionalnn
functionalr   torch.versionr   devicesr   r   r   r   rW   r   r   DEFAULT_ENABLEDDEFAULT_MOVE_TO_GPUDEFAULT_FAST_ENCODERDEFAULT_FAST_DECODERDEFAULT_COLOR_FIXDEFAULT_ENCODER_TILE_SIZEDEFAULT_DECODER_TILE_SIZEr#   rE   rR   r`   r~   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sF   	);
	
.M