o
    ߥiS                     @   s  d dl Z d dlmZmZ d dl mZ d dlmZmZ d dlZd dl	Z	d dl
Z
d dlm  mZ d dlmZ d dlmZ d dlm  m  m  mZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z: ddl;m<Z< dgZ=ej>e:j?ejdG dd deZ@dS )    N)copydeepcopy)path)AnyDict)	rearrange)Models)
TorchModel)MODELS)pidinet_bsdsketch_simplification_gan)AutoencoderKL)FrozenOpenCLIPEmbedderFrozenOpenCLIPVisualEmbedder)GaussianDiffusionbeta_schedule)get_first_stage_encodingmake_masked_imagesprepare_model_kwargssave_with_model_kwargs)UNetSD_temporal)Config)find_free_port
setup_seed	to_device)
OutputKeys)
load_image)	ModelFileTasks   )cfgVideoComposer)module_namec                       s6   e Zd ZdZ fddZdeeef fddZ  Z	S )r!   a$  
    task for video composer.

    Attributes:
        sd_model: denosing model using in this task.
        diffusion: diffusion model for DDIM.
        autoencoder: decode the latent representation into visual space with VQGAN.
        clip_encoder: encode the text into text embedding.
    c                    sV  t  j|d|i| tj rtdntd| _|dd| _|dd}|dd	}|d
d}tddd||d}t	
|j tt	j}tt	j}	t	jd||	  |	  t	_t	jtt	j t	_t	| _	dtjvrudtjd< t tjd< ttdd| j	_ttdd| j	_t| j	j |dd| _|dd| _|dd| _|dd| _|dg d| _ | j	j| _!t"dtj#$||d| _%| j%&| j| _%t'dtj#$||d| _(| j(j)&| j ddd d!d!d"g d#d$g d%d&
}
t*|
dtj#$||d'| _+| %d(, | _-| (| j(j./d}t0|}| j+1  | j+2 D ]}d|_3q| j+  t4dNi d)| j	d*| j	j5d+| j	j6d,| j	j7d-| j	j8d.| j	j9d/| j	j:d0| j	j;d1| j	j<d2| j	j=d3| j	j>d4| j	j?d5| j	j@d6| j	jAd7| j	jBd8| j	jCd9| j	jDd:| j	jEd| j	j d;| j	jFd<| j	jGd=| j	jGd>| j-d?|&| j| _)| j	jHr| j	jIrtJ| j	d@r| j	jKrt	jILdAdB }tMtj#$| jNt	jI}dCdD |O D }| j)jP|ddE nt	jILdAdB }| j)jPtjMtj#$| jN|ddFddE tjQ  n
tRdG| j	jI dHtSdI| j	jTdJdKdL}tU|| j	jV| j	jW| j	jXddM| _YdS )Oa8  
        Args:
            model_dir (`str` or `os.PathLike`)
                Can be either:
                    - A string, the *model id* of a pretrained model hosted inside a model repo on modelscope
                      or modelscope.cn. Valid model ids can be located at the root-level, like `bert-base-uncased`,
                      or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                    - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g,
                      `./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set to
                      `True`.
        	model_dircudacpuduration   clip_checkpointzopen_clip_pytorch_model.binsd_checkpointzv2-1_512-ema-pruned.ckptcfg_file_namezexp06_text_depths_vs_style.yamlTN)loadcfg_dict	cfg_levelr#   r*   r   MASTER_ADDR	localhostMASTER_PORTRANK
WORLD_SIZEr   
read_imageF
read_styleread_sketchsave_origin_videovideo_compositions)textmaskdepthmapsketchmotionimagelocal_imagesingle_sketchpenultimate)layer
pretrained            )r      rC   rC   rG           )
double_z
z_channels
resolutionin_channelsout_chchch_multnum_res_blocksattn_resolutionsdropout)	ckpt_path r    in_dim
concat_dimdimy_dimcontext_dimout_dimdim_mult	num_headshead_dimrP   attn_scalesrR   temporal_attentiontemporal_attn_timesuse_checkpointuse_fps_conditionuse_sim_maskmisc_dropout
p_all_zero
p_all_keepzero_yblack_image_featuretext_to_video_pretrain/c                 S   s   i | ]\}}d |vr||qS )zinput_blocks.0.0 ).0keyprl   rl   s/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/videocomposer/videocomposer_model.py
<dictcomp>   s    z*VideoComposer.__init__.<locals>.<dictcomp>)strict)map_locationzThe checkpoint file z
 is wrong 	linear_sdg_QK?g~jt?)	init_beta	last_beta)betas	mean_typevar_type	loss_typerescale_timestepsrl   )Zsuper__init__torchr$   is_availabledevicepopr&   r   r    updater,   len
frame_lensfeature_framerates
max_framesbatch_sizesstr
batch_sizeosenvironr   intgetenvpmi_rankpmi_world_sizer   seedr3   r4   r5   r6   r7   viz_numr   r   joinclip_encodertor   clip_encoder_visualmodelr   autoencoderdetachrg   black_image	unsqueeze
zeros_likeeval
parametersrequires_gradr   unet_in_dimunet_concat_dimunet_dim
unet_y_dimunet_context_dimunet_out_dimunet_dim_multunet_num_headsunet_head_dimunet_res_blocksunet_attn_scalesunet_dropoutr_   r`   ra   rb   rc   rd   re   resumeresume_checkpointhasattrri   splitr+   r#   itemsload_state_dictempty_cache
ValueErrorr   num_timestepsr   rx   ry   rz   	diffusion)selfr#   argskwargsr(   r)   r*   _cfgl1l2ddconfigrh   paramcheckpoint_namessrw   	__class__rl   rp   r}   7   s@  








	
zVideoComposer.__init__inputc           ;         s	  d }| j r|d }t|}t|g}d }| jr$| jj}t|}t|g}d }| jr/t|d }d| jv rLtj	d| j
d djtjd | j}d| jv rTt  d| jv rt| j
ddd	 d| j}	t| j
dd
 d| j}
t| jjdddd| j}t| jjdddd| j}d }| j  |d }| jjdkr| jjr|d }|d }|d }|d }|d }tj| jjg| jj tj| jd}n%|d }|d }|d }|d }|d }tj| jjg| jj tj| jd}t|}t |d}g }d| jjv rt |d}g }d| jjv r,t!|"d#d|}t |d}g }d| jjv rm|j$d }|j$d }| jj rS|%d&||ddd' }n|d d d df ( &d|ddd}t |d|d}|j$d }t |d}t |d}tj)||j$d | jj* dd}tj)||j$d | jj* dd}t+  g }|D ]}| j,-|} t.| / }!|0|! qtj1|dd}t |d|d}g }"d| jjv r|D ]%}#||#"d#djtjd }$|$| jj2 3d| jj4}$|"0|$ qtj1|"dd}"t |"d|d}"g }%d| jjv r@|D ] }#t |#( d}#t5 fdd|#D }&t |&d }&|%0|& qtj1|%dd}%t |%d|d}%g }'d| jjv r|}(| jjr]|&|ddd' })|)g}(|(D ]}#|	|#"|#|}*d!|
d!|*  }*|'0|* q_tj1|'dd}'t |'d|d}'g }+d"| jjv r|'( d d d d d df &dd|dd}+W d    n	1 sw   Y  | 6|/ },|,( }-g }.d#| jjv rt+ 4 | jjr| 7| j78|%d' %d}.|.( }/n|9d}| 7|%d}.|.( }/W d    n	1 sw   Y  t+  t:;  t<j=| jj>d$ | jj?rN|j$\}0}1}2}3}4tj@| jA|1|3|4f| jd%}5|5jB|2dd&}5t |5d'| jAd}5|5C }5n
tD|d | jA }5|-d | jA tE|dkrgd n|d | jA tE|.dkrvd n|/d | jA tE|"dkrd n|"d | jA tE|%dkrd n|%d | jA tE|'dkrd n|'d | jA tE|dkrd n|d | jA tE|dkrd n|d | jA tE|+dkrd n|+d | jA |d | jA d(
| jjFs| jG&| jAddn	tH|-d | jA tE|dkrd n|d | jA tE|.dkrd n	tH|/d | jA tE|"dkrd n|"d | jA tE|%dkr.d n|%d | jA tE|'dkr=d n|'d | jA tE|dkrLd n|d | jA tE|dkr[d n|d | jA tE|+dkrjd n|+d | jA |d | jA d(
g}6| jjI}7|5( }8tJ|7|6| jjFd)}9| jKjL|8| j |9d*| jjMd+d,}:tN|9|:| j,|| jAd||| j| jOd-
 W d    n	1 sw   Y  W d    n	1 sw   Y  |:PtjQR | jd.S )/Nstyle_imager:   T)rB   r#   F)memory_formatcannyr;   )rB   vanilla_cnn)rB   r   rk   cap_txt	ref_frame
video_data	misc_datar9   mv_data)dtyper   zb f c h w -> b c f h wr<   g      ?r>   r   )bzb f c h w -> (b f) c h w)rW   z(b f) c h w -> b c f h wzk c h w -> k h w cc                    s   g | ]} |qS rl   rl   )rm   misc_imgcanny_detectorrl   rp   
<listcomp>Q  s    z)VideoComposer.forward.<locals>.<listcomp>zk h w c-> k c h wg      ?r?   r=   )enabled)r   )repeatsrW   z(b f) c h w->b c f h w)
yr>   r=   depthr   r;   maskedr<   r?   fps)partial_keysfull_model_kwargsrb   g      "@rH   )noiser   model_kwargsguide_scaleddim_timestepseta)
r   r   r   	ori_videor   stepcapspaletter    r&   )video
video_path)Sr3   r   misc_transformsr5   r    sketch_pathr4   r7   modelsmidas_v3r#   r   requires_grad_r   r~   channels_lasthalfr   CannyDetectorr   r   tensorsketch_meanview
sketch_stdr   r   use_image_datasetfeature_framerater   longr   r   r   subdiv_shaper   repeatr$   clonechunk
chunk_sizeno_gradr   encoder   r   appendcat	depth_stdclamp_depth_clampstackr   r   
preprocesssqueezepynvmlnvmlInitampautocastuse_fp16share_noiserandnr   repeat_interleave
contiguous
randn_liker   rb   rg   r   	guidancesr   r   ddim_sample_loopr   r   r&   typefloat32r%   );r   r   frame_in	image_keyframeframe_sketch
sketch_keyframe_stylemidaspidinetcleaner	pidi_meanpidi_stdr   r   ref_imgsr   r   r9   r   r   misc_backupsmv_data_videomasked_videoimage_local
frames_numbs_vd_localbs_vdvideo_data_listmisc_data_listdecode_datavd_dataencoder_posteriortmp
depth_data	misc_imgsr   
canny_datacanny_conditionsketch_datasketch_listsketch_repeatr;   single_sketch_datar   y0y_visual	y_visual0r   cfhwr   r   r   noise_motionr   video_outputrl   r   rp   forward   sJ  




















;






>izVideoComposer.forward)
__name__
__module____qualname____doc__r}   r   r   r   r<  __classcell__rl   rl   r   rp   r!   *   s
    
 )Ar   r   r   r   osptypingr   r   	open_clipr  r~   torch.cuda.ampr$   r  torch.nnnneinopsr   2modelscope.models.multi_modal.videocomposer.modelsr   multi_modalvideocomposermodelscope.metainfor   modelscope.modelsr	   modelscope.models.builderr
   <modelscope.models.multi_modal.videocomposer.annotator.sketchr   r   7modelscope.models.multi_modal.videocomposer.autoencoderr   0modelscope.models.multi_modal.videocomposer.clipr   r   5modelscope.models.multi_modal.videocomposer.diffusionr   r   5modelscope.models.multi_modal.videocomposer.ops.utilsr   r   r   r   3modelscope.models.multi_modal.videocomposer.unet_sdr   8modelscope.models.multi_modal.videocomposer.utils.configr   7modelscope.models.multi_modal.videocomposer.utils.utilsr   r   r   modelscope.outputsr   modelscope.preprocessors.imager   modelscope.utils.constantr   r   configr    __all__register_moduletext_to_video_synthesisr!   rl   rl   rl   rp   <module>   s>   