o
    ॵi{7                     @   s  d dl Zd dlmZmZmZmZ d dlZd dl	Z	d dl
m  mZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& e& Z'dgZ(ej)e$j*ej*dG dd de Z+G dd deZ,dS )    N)AnyDictOptionalUnion)DDIMSchedulerStableDiffusionPipeline)Image)
transforms)tqdm)	Pipelines)MutualSelfAttentionControl#register_attention_editor_diffusers)
OutputKeys)	PIPELINES)DiffusersPipeline)	LoadImage)Tasks)
get_loggerImageEditingPipeline)module_namec                       s   e Zd Zedf fdd	Zdeeef deeef fddZdeeef deeef fdd	Zdeeef deeef fd
dZ	  Z
S )r   Nc                    s   t  jd||d| |dtj}t|dttj rdnd| _	t
d tjtj|ddd	}tjtj|d||d
d| j	| _dS )a    MasaCtrl Image Editing Pipeline.

        Examples:

        >>> import cv2
        >>> from modelscope.pipelines import pipeline
        >>> from modelscope.utils.constant import Tasks

        >>> prompts = [
        >>>     "",                           # source prompt
        >>>     "a photo of a running corgi"  # target prompt
        >>> ]
        >>> output_image_path = './result.png'
        >>> img = 'https://public-vigen-video.oss-cn-shanghai.aliyuncs.com/public/ModelScope/test/images/corgi.jpg'
        >>> input = {'img': img, 'prompts': prompts}
        >>>
        >>> pipe = pipeline(
        >>>     Tasks.image_editing,
        >>>     model='damo/cv_masactrl_image-editing')
        >>>
        >>> output = pipe(input)['output_img']
        >>> cv2.imwrite(output_image_path, output)
        >>> print('pipeline: the output image path is {}'.format(output_image_path))
        )modelpreprocessortorch_dtypedevicecudacpuz load image editing pipeline donezstable-diffusion-v1-4	scheduler)	subfolderT)r   r   use_safetensorsN )super__init__gettorchfloat32getattrr   r   is_available_deviceloggerinfor   from_pretrainedospathjoin_MasaCtrlPipelinetopipeline)selfr   r   kwargsr   r   	__class__r   b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/pipelines/cv/image_editing_pipeline.pyr!   !   s&   
zImageEditingPipeline.__init__inputreturnc                 C   s\   t |d}tt tdgdgg}||d}t	|d}|
| j|d< |S )Nimg      ?r   )   r:   )r   convert_to_imgr"   r	   ComposeToTensor	Normalize	unsqueezeFinterpolater/   r'   )r1   r6   r8   test_transformsr   r   r5   
preprocessJ   s   zImageEditingPipeline.preprocessc           	      C   s   t |tstdt| |d}| jj|d|d dddd\}}|t|d	d	d	}d
\}}t	||}t
| j| | j|||dddd	d  }d|iS )Nz/Expected the input to be a dictionary, but got promptsr8   r         @2   T)guidance_scalenum_inference_stepsreturn_intermediates)   
   rG   )latentsrG   output_tensor)
isinstancedict
ValueErrortyper"   r0   invertexpandlenr   r   )	r1   r6   rD   
start_codelatents_listSTEPLAYEReditoroutputr   r   r5   forwardT   s0   




zImageEditingPipeline.forwardc                 C   sL   |d  dd  ddd d}tj|d d d d d d df iS )NrN   r            uint8rJ   )squeezer   permutenumpyastyper   
OUTPUT_IMG)r1   r6   
output_imgr   r   r5   postprocesso   s   
"z ImageEditingPipeline.postprocess)__name__
__module____qualname__strr!   r   r   rC   r\   rg   __classcell__r   r   r3   r5   r      s
    ")"
*c                	   @   s   e Zd Z		ddejdedejfddZ		ddejdedejd	efd
dZe	 dd Z
e	 dddZe	 											dddZe	 				ddejfddZdS ) r.   r   Fmodel_outputtimestepxc                 C   s   |rt d| |}t|| jjj| jj  d}|dkr!| jj| n| jj}| jj| }d| }	||	d |  |d  }
d| d | }|d |
 | }||
fS )zL
        Inverse sampling for DDIM Inversion
        x_t -> x_(t+1)
        z
timestep: i  r   r^   r9   )printminr   confignum_train_timestepsrH   alphas_cumprodfinal_alpha_cumprod)r1   rm   rn   ro   etaverbose	next_stepalpha_prod_talpha_prod_t_nextbeta_prod_tpred_x0pred_dirx_nextr   r   r5   rx   w   s*   

z_MasaCtrlPipeline.next_step        rv   c                 C   s   || j jj| j j  }| j j| }|dkr| j j| n| j j}d| }	||	d |  |d  }
d| d | }|d |
 | }||
fS )za
        predict the sample the next step in the denoise process.
        x_t -> x_(t-1)
        r   r^   r9   )r   rr   rs   rH   rt   ru   )r1   rm   rn   ro   rv   rw   prev_timestepry   alpha_prod_t_prevr{   r|   r}   x_prevr   r   r5   step   s   z_MasaCtrlPipeline.stepc                 C   sj   | j }t|tu r&t|}t| d d }|ddd	d
|}| j|d j}|d }|S )Ng     _@r^   r_   r   latent_distg{P?)_execution_devicerR   r   nparrayr#   
from_numpyfloatrb   r?   r/   vaeencodemean)r1   imageDEVICErM   r   r   r5   image2latent   s   
z_MasaCtrlPipeline.image2latentptc                 C   s   d|   }| j|d }|dkr4|d d dd}| dddd d }|d	 tj	}|S |d
krB|d d dd}|S )Ng!ޅ@sampler   r_   r9   r   r^      r]   r   )
detachr   decodeclampr   rb   rc   rd   r   r`   )r1   rM   return_typer   r   r   r5   latent2image   s   z_MasaCtrlPipeline.latent2imager^   r:   rF   rE   Nc           "         s   j }t|trt|}nt|tr|dkr|g| } j|dddd} |j|d }t	d|j
 | jj|d |d f}|d u rMtj||d	}n|j
|ks[J d
|j
 d|dkr|
rd|
}nd} j|g| dddd} |j|d }tj||gdd}t	d|j
  j| |g}|g}tt jjddD ]z\}}|d ur|d|  }|d\}}t||g}|dkrt|gd }n|}|	d urt|	tr|d\}}t|	| j|j
 |g} j|||dj}|dkr	|jddd\}}||||   } |||\}} || ||  q j|dd}!|r> fdd|D } fdd|D }|!||fS |!S )Nr^   
max_lengthM   r   paddingr   return_tensorsr   input text embeddings :   )r   z!The shape of input latent tensor z  should equal to predefined one.      ? dimlatents shape: zDDIM SamplerdescrJ   r_   encoder_hidden_statesr   c                       g | ]	} j |d dqS r   r   r   .0r8   r1   r   r5   
<listcomp>      z._MasaCtrlPipeline.__call__.<locals>.<listcomp>c                    r   r   r   r   r   r   r5   r      r   )r   rO   listrU   rk   	tokenizertext_encoder	input_idsr/   rp   shapeunetin_channelsr#   randncatr   set_timesteps	enumerater
   	timestepschunkrT   r   r   appendr   )"r1   prompt
batch_sizeheightwidthrH   rG   rv   rM   unconditioning
neg_promptref_intermediate_latentsrI   kwdsr   
text_inputtext_embeddingslatents_shapeuc_textunconditional_inputunconditional_embeddingsrW   pred_x0_listitlatents_ref_latents_curmodel_inputs
noise_prednoise_pred_unconnoise_pred_conr|   r   r   r   r5   __call__   s   









z_MasaCtrlPipeline.__call__r   c                 K   s  | j }|jd }	t|tr|	dkr|t|ddd}nt|tr*|	dkr*|g|	 }| j|dddd}
| |
j	
|d }td|j | |}|}|d	kro| jd
g|	 dddd}| |j	
|d }tj||gdd}td|j | j| tdt| jj |g}|g}ttt| jjddD ]D\}}|d	krt|gd }n|}| j|||dj}|d	kr|jddd\}}||||   }| |||\}}|| || q|r||fS ||fS )zT
        invert a real image into noise map with determinisc DDIM inversion
        r   r^   rJ   r   r   r   r   r   r   r   r   r   zValid timesteps: zDDIM Inversionr   r_   r   )r   r   rO   r   rT   rU   rk   r   r   r   r/   rp   r   r#   r   r   r   reversedr   r   r
   r   r   r   rx   r   )r1   r   r   rH   rG   rv   rI   r   r   r   r   r   rM   start_latentsr   r   rW   r   r   r   r   r   r   r   r|   r   r   r5   rS   '  sz   







z_MasaCtrlPipeline.invert)r   F)r   F)r   )r^   r:   r:   rF   rE   r   NNNNF)rF   rE   r   F)rh   ri   rj   r#   FloatTensorintrx   r   r   no_gradr   r   r   TensorrS   r   r   r   r5   r.   u   s\    
 

er.   )-os.pathr+   typingr   r   r   r   rc   r   r#   torch.nn.functionalnn
functionalr@   	diffusersr   r   PILr   torchvisionr	   r
   modelscope.metainfor   "modelscope.models.cv.image_editingr   r   modelscope.outputsr   modelscope.pipelines.builderr   Emodelscope.pipelines.multi_modal.diffusers_wrapped.diffusers_pipeliner   modelscope.preprocessorsr   modelscope.utils.constantr   modelscope.utils.loggerr   r(   __all__register_moduleimage_editingr   r.   r   r   r   r5   <module>   s0   V