o
    ߥiYS                     @   s  d dl Zd dlZd dlmZ d dlm  mZ dgZdd Z	dddZ
G dd	 d	eZG d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejjZdS )    NAutoencoderKLc                 C   s   | t |  S N)torchsigmoid)x r   k/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/videocomposer/autoencoder.pynonlinearity   s   r	       c                 C   s   t jj|| dddS )Ngư>T)
num_groupsnum_channelsepsaffine)r   nn	GroupNorm)in_channelsr   r   r   r   	Normalize   s   r   c                   @   s@   e Zd ZdddZdd ZdddZg d	fd
dZdd ZdS )DiagonalGaussianDistributionFc                 C   s   || _ tj|ddd\| _| _t| jdd| _|| _td| j | _t| j| _	| jr@t
| jj| j jd | _	| _d S d S )N      dimg      >g      4@      ?device)
parametersr   chunkmeanlogvarclampdeterministicexpstdvar
zeros_liketor   )selfr   r    r   r   r   __init__   s   z%DiagonalGaussianDistribution.__init__c                 C   s*   | j | jt| j jj| jjd  }|S )Nr   )r   r"   r   randnshaper%   r   r   r&   r   r   r   r   sample"   s   
z#DiagonalGaussianDistribution.sampleNc                 C   s   | j r	tdgS |d u r%dtjt| jd| j d | j g dd S dtjt| j|j d|j | j|j  d | j |j g dd S )N        r   r         ?r   r      r   )r    r   Tensorsumpowr   r#   r   )r&   otherr   r   r   kl'   s&   
zDiagonalGaussianDistribution.klr.   c                 C   sR   | j r	tdgS tdtj }dtj|| j t|| j	 d| j
  |d S )Nr,          @r   r   r   )r    r   r0   nplogpir1   r   r2   r   r#   )r&   r+   dimslogtwopir   r   r   nll5   s   z DiagonalGaussianDistribution.nllc                 C   s   | j S r   )r   r&   r   r   r   mode>   s   z!DiagonalGaussianDistribution.mode)Fr   )__name__
__module____qualname__r'   r+   r4   r;   r=   r   r   r   r   r      s    

	r   c                       $   e Zd Z fddZdd Z  ZS )
Downsamplec                    6   t    || _| jrtjj||dddd| _d S d S Nr/   r   r   kernel_sizestridepaddingsuperr'   	with_convr   r   Conv2dconvr&   r   rK   	__class__r   r   r'   D      

Downsample.__init__c                 C   F   | j rd}tjjj||ddd}| |}|S tjjj|ddd}|S N)r   r   r   r   constantr   )r=   valuer   )rF   rG   rK   r   r   
functionalpadrM   
avg_pool2dr&   r   rY   r   r   r   forwardL      
Downsample.forwardr>   r?   r@   r'   r\   __classcell__r   r   rO   r   rB   B       rB   c                       s.   e Zd Zdddd fdd
Zdd Z  ZS )	ResnetBlockNFi   )out_channelsconv_shortcuttemb_channelsc                   s   t    || _|d u r|n|}|| _|| _t|| _tjj	||dddd| _
|dkr3tj||| _t|| _tj|| _tjj	||dddd| _| j| jkrp| jrbtjj	||dddd| _d S tjj	||dddd| _d S d S )Nr/   r   rE   r   )rJ   r'   r   rc   use_conv_shortcutr   norm1r   r   rL   conv1Linear	temb_projnorm2Dropoutdropoutconv2rd   nin_shortcut)r&   r   rc   rd   rm   re   rO   r   r   r'   X   sB   




zResnetBlock.__init__c                 C   s   |}|  |}t|}| |}|d ur'|| t|d d d d d d f  }| |}t|}| |}| |}| j| jkrQ| j	rL| 
|}|| S | |}|| S r   )rg   r	   rh   rj   rk   rm   rn   r   rc   rf   rd   ro   )r&   r   tembhr   r   r   r\   ~   s    

&




zResnetBlock.forwardr_   r   r   rO   r   rb   V   s    &rb   c                       rA   )	AttnBlockc                    ~   t    || _t|| _tjj||dddd| _tjj||dddd| _	tjj||dddd| _
tjj||dddd| _d S Nr   r   rE   rJ   r'   r   r   normr   r   rL   qkvproj_outr&   r   rO   r   r   r'         





AttnBlock.__init__c                 C      |}|  |}| |}| |}| |}|j\}}}}	|||||	 }|ddd}|||||	 }t||}
|
t	|d  }
tj
jj|
dd}
|||||	 }|
ddd}
t||
}|||||	}| |}|| S Nr   r   r   g      r   rv   rw   rx   ry   r)   reshapepermuter   bmmintr   rX   softmaxrz   r&   r   h_rw   rx   ry   bcrq   ww_r   r   r   r\      (   




AttnBlock.forwardr_   r   r   rO   r   rr          rr   c                       rA   )rr   c                    rs   rt   ru   r{   rO   r   r   r'      r|   r}   c                 C   r~   r   r   r   r   r   r   r\      r   r   r_   r   r   rO   r   rr      r   c                       rA   )Upsamplec                    s6   t    || _| jrtjj||dddd| _d S d S )Nr/   r   rE   rI   rN   rO   r   r   r'      s   

zUpsample.__init__c                 C   s(   t jjj|ddd}| jr| |}|S )Nr5   nearest)scale_factorr=   )r   r   rX   interpolaterK   rM   r*   r   r   r   r\      s   
zUpsample.forwardr_   r   r   rO   r   r      s    r   c                       rA   )rB   c                    rC   rD   rI   rN   rO   r   r   r'      rQ   rR   c                 C   rS   rT   rW   r[   r   r   r   r\     r]   r^   r_   r   r   rO   r   rB      ra   c                       s4   e Zd Zddddddd fdd
Zd	d
 Z  ZS )Encoderr   r         r,   TFvanilla)ch_multrm   resamp_with_convdouble_zuse_linear_attn	attn_typec             
      s  t    || _d| _t|| _|| _|	| _|| _t	j
j|| jdddd| _|	}dt| }|| _t
 | _t| jD ]X}t
 }t
 }|||  }|||  }t| jD ]}|t||| j|d |}||v rq|t| qVt
 }||_||_|| jd krt|||_|d }| j| q;t
 | _t||| j|d| j_t|| j_t||| j|d| j_t|| _t	j
j||rd|
 n|
dddd| _ d S )Nr   r/   r   rE   )r   r   rc   re   rm   r   )!rJ   r'   chtemb_chlennum_resolutionsnum_res_blocks
resolutionr   r   r   rL   conv_intuple
in_ch_mult
ModuleListdownrangeappendrb   rr   ModuleblockattnrB   
downsamplemidblock_1attn_1block_2r   norm_outconv_out)r&   r   out_chr   r   attn_resolutionsrm   r   r   r   
z_channelsr   r   r   ignore_kwargscurr_resr   i_levelr   r   block_in	block_outi_blockr   rO   r   r   r'     sz   






zEncoder.__init__c                 C   s   d }|  |g}t| jD ]D}t| jD ](}| j| j| |d |}t| j| jdkr7| j| j| |}|| q|| jd krQ|| j| 	|d  q|d }| j
||}| j
|}| j
||}| |}t|}| |}|S )Nr   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   r   )r&   r   rp   hsr   r   rq   r   r   r   r\   b  s&   

zEncoder.forwardr_   r   r   rO   r   r     s    Nr   c                       s6   e Zd Zdddddddd fdd
Zd	d
 Z  ZS )Decoderr   r,   TFr   )r   rm   r   give_pre_endtanh_outr   r   c             
      s  t    || _d| _t|| _|| _|	| _|| _|| _	|| _
||| jd   }|	d| jd   }d|
||f| _td| jt| j tjj|
|dddd| _t | _t||| j|d| j_t|| j_t||| j|d| j_t | _tt| jD ]R}t }t }|||  }t| jd D ]}|t||| j|d |}||v r|t| qt }||_ ||_!|dkrt"|||_#|d }| j$d| q~t%|| _&tjj||dddd| _'d S )Nr   r   r   z+Working with z of shape {} = {} dimensions.r/   rE   r   )(rJ   r'   r   r   r   r   r   r   r   r   r   z_shapeprintformatr6   prodr   r   rL   r   r   r   rb   r   rr   r   r   r   upreversedr   r   r   r   r   upsampleinsertr   r   r   )r&   r   r   r   r   r   rm   r   r   r   r   r   r   r   r   ignorekwargsr   r   r   r   r   r   r   r   rO   r   r   r'     sz   








zDecoder.__init__c                 C   s   |j | _d }| |}| j||}| j|}| j||}tt| j	D ]7}t| j
d D ]!}| j| j| ||}t| j| jdkrP| j| j| |}q/|dkr]| j| |}q&| jrc|S | |}t|}| |}| jryt|}|S )Nr   r   )r)   last_z_shaper   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   r   r   r   tanh)r&   zrp   rq   r   r   r   r   r   r\     s.   



zDecoder.forwardr_   r   r   rO   r   r   ~  s    Qr   c                       s   e Zd Zdg dddddf fdd	Ze fddZe fdd	Zd
d Zdd Zdd Z	dddZ
dd Zdd Ze dddZdd Z  ZS )r   NimageFc
           
   	      s   t    |	| _|| _tdi || _tdi || _|d s!J tj	
d|d  d| d| _tj	
||d d| _|| _|d urVt|tksJJ | dtd|dd |d ur]|| _|d u| _|d uro| j||d d S d S )	Nr   r   r   r   colorizer/   )ignore_keysr   )rJ   r'   learn_logvar	image_keyr   encoderr   decoderr   r   rL   
quant_convpost_quant_conv	embed_dimtyper   register_bufferr(   monitoruse_emainit_from_ckpt)
r&   ddconfigr   	ckpt_pathr   r   colorize_nlabelsr   	ema_decayr   rO   r   r   r'     s0   


zAutoencoderKL.__init__c           
      C   s   t j|ddd }t| }|D ]
}t||| j qdd l}| }|D ]}|ddkr<|	dd }	|| ||	< q&| j
|dd	 td
|  d S )Ncpumap_location
state_dictr   first_stage_modelzfirst_stage_model.r   TstrictRestored from )r   loadlistkeysr   r)   collectionsOrderedDictfindsplitload_state_dict)
r&   pathr   sdr   keyr   sd_newrx   k_newr   r   r   r     s   zAutoencoderKL.init_from_ckptc                 C   st   t j|ddd }t| }t |D ]}|D ]}||r(td| ||= qq| j|dd td|  d S )Nr   r   r   z Deleting key {} from state_dict.Fr   r   )	r   r   r   r   r   
startswithr   r   r   )r&   r   r   r   r   rx   ikr   r   r   init_from_ckpt2#  s   
zAutoencoderKL.init_from_ckpt2c                 O   s   | j r
| |  d S d S r   )r   	model_ema)r&   argskwargsr   r   r   on_train_batch_end0  s   z AutoencoderKL.on_train_batch_endc                 C   s    |  |}| |}t|}|S r   )r   r   r   )r&   r   rq   moments	posteriorr   r   r   encode4  s   

zAutoencoderKL.encodec                 C   s   |  |}| |}|S r   )r   r   )r&   r   decr   r   r   decode:  s   

zAutoencoderKL.decodeTc                 C   s2   |  |}|r| }n| }| |}||fS r   )r   r+   r=   r  )r&   inputsample_posteriorr   r   r  r   r   r   r\   ?  s   


zAutoencoderKL.forwardc                 C   s@   || }t |jdkr|d }|ddddjtjd }|S )Nr/   ).Nr   r   r   )memory_format)r   r)   r   r%   r   contiguous_formatfloat)r&   batchrx   r   r   r   r   	get_inputH  s   
zAutoencoderKL.get_inputc                 C   s
   | j jjS r   )r   r   weightr<   r   r   r   get_last_layerP  s   
zAutoencoderKL.get_last_layerc                 K   s  t  }| || j}|| j}|s| |\}}|jd dkr2|jd dks(J | |}| |}| t	|
 |d< ||d< |sG| jr|  3 | |\}	}
|jd dkrg|	jd dksbJ | |	}	| t	|

 |d< |	|d< W d    n1 sw   Y  ||d< |S )Nr   r/   samplesreconstructionssamples_emareconstructions_emainputs)dictr	  r   r%   r   r)   to_rgbr  r   
randn_liker+   r   	ema_scope)r&   r  only_inputslog_emar   r7   r   xrecr   xrec_emaposterior_emar   r   r   
log_imagesS  s0   





	zAutoencoderKL.log_imagesc              	   C   st   | j dksJ t| ds| dtd|jd dd| tj|| j	d}d||
   | |
   d }|S )Nsegmentationr   r/   r   )r
  r5   r-   )r   hasattrr   r   r(   r)   r%   Fconv2dr   minmaxr*   r   r   r   r  n  s   
$zAutoencoderKL.to_rgb)T)FF)r>   r?   r@   r'   r   r   r   r   r   r  r\   r	  r  r   no_gradr  r  r`   r   r   rO   r   r     s(    
	c                       sB   e Zd Zdd fdd
Zdd Zdd Zd	d
 Zdd Z  ZS )IdentityFirstStageFvq_interfacec                   s   || _ t   d S r   )r$  rJ   r'   )r&   r$  r   r   rO   r   r   r'   z  s   zIdentityFirstStage.__init__c                 O      |S r   r   r&   r   r   r   r   r   r   r   ~     zIdentityFirstStage.encodec                 O   r%  r   r   r&  r   r   r   r    r'  zIdentityFirstStage.decodec                 O   s   | j r
|d g dfS |S )N)NNNr#  r&  r   r   r   quantize  s   zIdentityFirstStage.quantizec                 O   r%  r   r   r&  r   r   r   r\     r'  zIdentityFirstStage.forward)	r>   r?   r@   r'   r   r  r(  r\   r`   r   r   rO   r   r"  x  s    r"  )r
   )numpyr6   r   torch.nnr   torch.nn.functionalrX   r  __all__r	   r   objectr   r   rB   rb   rr   r   r   r   r   r"  r   r   r   r   <module>   s&   
-?,,lv 