o
    Gi_                     @   s  d dl mZ d dlmZ d dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlm Z  e rkd dl!m"  m#Z$ dZ%ndZ%e&e'Z(dZ)eG dd deZ*G dd deZ+dS )    )	dataclass)AnyN)Image)tqdm)CLIPTextModelCLIPTokenizer   )PipelineImageInput)AutoencoderKLUNet2DConditionModel)DDIMSchedulerLCMScheduler)
BaseOutputis_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipeline   )MarigoldImageProcessorTFa  
Examples:
```py
>>> import diffusers
>>> import torch

>>> pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
...     "prs-eth/marigold-iid-appearance-v1-1", variant="fp16", torch_dtype=torch.float16
... ).to("cuda")

>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
>>> intrinsics = pipe(image)

>>> vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
>>> vis[0]["albedo"].save("einstein_albedo.png")
>>> vis[0]["roughness"].save("einstein_roughness.png")
>>> vis[0]["metallicity"].save("einstein_metallicity.png")
```
```py
>>> import diffusers
>>> import torch

>>> pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
...     "prs-eth/marigold-iid-lighting-v1-1", variant="fp16", torch_dtype=torch.float16
... ).to("cuda")

>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
>>> intrinsics = pipe(image)

>>> vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
>>> vis[0]["albedo"].save("einstein_albedo.png")
>>> vis[0]["shading"].save("einstein_shading.png")
>>> vis[0]["residual"].save("einstein_residual.png")
```
c                   @   sD   e Zd ZU dZejejB ed< dejB ejB ed< dejB ed< dS )MarigoldIntrinsicsOutputuS  
    Output class for Marigold Intrinsic Image Decomposition pipeline.

    Args:
        prediction (`np.ndarray`, `torch.Tensor`):
            Predicted image intrinsics with values in the range [0, 1]. The shape is `(numimages * numtargets) × 3 ×
            height × width` for `torch.Tensor` or `(numimages * numtargets) × height × width × 3` for `np.ndarray`,
            where `numtargets` corresponds to the number of predicted target modalities of the intrinsic image
            decomposition.
        uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is `(numimages *
            numtargets) × 3 × height × width` for `torch.Tensor` or `(numimages * numtargets) × height × width × 3` for
            `np.ndarray`.
        latent (`None`, `torch.Tensor`):
            Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
            The shape is `(numimages * numensemble) × (numtargets * 4) × latentheight × latentwidth`.
    
predictionNuncertaintylatent)	__name__
__module____qualname____doc__npndarraytorchTensor__annotations__ r$   r$   m/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.pyr   _   s
   
 r   c                !       s"  e Zd ZdZdZdZ				d7dededee	B de
d	ed
edB deeef dB dedB dedB f fddZedd Zdedededededededeeef dB dejdB dejeej B dB dededefdd Zejjd8d"d#Ze ee		$		!	%	%	$				&	'	'	!d9dededB dededB d(ededededeeef dB dejeej B dB dejeej B dB deded)ed*efd+d,Z dejdejdB dejdB dedede!ejejf fd-d.Z"d/ejdejfd0d1Z#e$	'	2d:d3ejded4ede!ejejdB f fd5d6Z%  Z&S );MarigoldIntrinsicsPipelinea	  
    Pipeline for Intrinsic Image Decomposition (IID) using the Marigold method:
    https://marigoldcomputervision.github.io.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        unet (`UNet2DConditionModel`):
            Conditional U-Net to denoise the targets latent, conditioned on image latent.
        vae (`AutoencoderKL`):
            Variational Auto-Encoder (VAE) Model to encode and decode images and predictions to and from latent
            representations.
        scheduler (`DDIMScheduler` or `LCMScheduler`):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
        text_encoder (`CLIPTextModel`):
            Text-encoder, for empty text embedding.
        tokenizer (`CLIPTokenizer`):
            CLIP tokenizer.
        prediction_type (`str`, *optional*):
            Type of predictions made by the model.
        target_properties (`dict[str, Any]`, *optional*):
            Properties of the predicted modalities, such as `target_names`, a `list[str]` used to define the number,
            order and names of the predicted modalities, and any other metadata that may be required to interpret the
            predictions.
        default_denoising_steps (`int`, *optional*):
            The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
            quality with the given model. This value must be set in the model config. When the pipeline is called
            without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
            reasonable results with various model flavors compatible with the pipeline, such as those relying on very
            short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
        default_processing_resolution (`int`, *optional*):
            The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
            the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
            default value is used. This is required to ensure reasonable results with various model flavors trained
            with varying optimal processing resolution values.
    ztext_encoder->unet->vae)
intrinsicsNunetvae	schedulertext_encoder	tokenizerprediction_typetarget_propertiesdefault_denoising_stepsdefault_processing_resolutionc
           
         s   t    || jvrtd| d| j d | j|||||d | j||||	d t| dd r;dt| j	j
jd  nd	| _|| _|| _|	| _d | _t| jd
| _d S )Nz*Potentially unsupported `prediction_type='z&'`; values supported by the pipeline: .)r(   r)   r*   r+   r,   )r-   r.   r/   r0   r)   r   r      )vae_scale_factor)super__init__supported_prediction_typesloggerwarningregister_modulesregister_to_configgetattrlenr)   configblock_out_channelsr3   r.   r/   r0   empty_text_embeddingr   image_processor)
selfr(   r)   r*   r+   r,   r-   r.   r/   r0   	__class__r$   r%   r5      s4   

(z#MarigoldIntrinsicsPipeline.__init__c                 C   s   | j jj| jjj S )N)r(   r=   out_channelsr)   latent_channels)rA   r$   r$   r%   	n_targets   s   z$MarigoldIntrinsicsPipeline.n_targetsimagenum_inference_stepsensemble_sizeprocessing_resolutionresample_method_inputresample_method_output
batch_sizeensembling_kwargslatents	generatoroutput_typeoutput_uncertaintyreturnc              	      s  dt | jjjd  }|| jkrtd| j d| d|d u r$td|dk r,td|dk r4td|dkr=td	 |dkrG|rGtd
|d u rOtd|dk rWtd|| j dkrgtd| j d|dvrotd|dvrwtd|dk rtd|dvrtd|	d ur d urtd|d urt|t	stdd|v r|d dvrtdd}d\}}t|t
s|g}t|D ]{\}}t|tjst|r|jdvrtd| d|j d|jdd  \}}d}|jd kr|jd }nt|tjr|j\}}d}ntd!| d"t| d|d u r||}}n||f||fkr7td#| d$||f d%||f ||7 }q|	d urt|	sKtd&|	 d kr[td'|	j d|dkrt||}|| | }|| | }|dks{|dkrtd(| d)| d*||}}|| j d | j }|| j d | j }|| | jjj||f}|	j|krtd+|	j d,| d d urt t
rt  || krtd-t fd.d/ D std0|S t tjstd1t  d|S )2Nr   r   z/`vae_scale_factor` computed at initialization (z) differs from the actual one (z).zW`num_inference_steps` is not specified and could not be resolved from the model config.z'`num_inference_steps` must be positive.z!`ensemble_size` must be positive.zk`ensemble_size` == 2 results are similar to no ensembling (1); consider increasing the value to at least 3.zpComputing uncertainty by setting `output_uncertainty=True` also requires setting `ensemble_size` greater than 1.zY`processing_resolution` is not specified and could not be resolved from the model config.r   zx`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for downsampled processing.z.`processing_resolution` must be a multiple of r1   )nearestznearest-exactbilinearbicubicareazy`resample_method_input` takes string values compatible with PIL library: nearest, nearest-exact, bilinear, bicubic, area.zz`resample_method_output` takes string values compatible with PIL library: nearest, nearest-exact, bilinear, bicubic, area.z`batch_size` must be positive.)ptr   z*`output_type` must be one of `pt` or `np`.z2`latents` and `generator` cannot be used together.z)`ensembling_kwargs` must be a dictionary.	reductionmedianmeanzF`ensembling_kwargs['reduction']` can be either `'median'` or `'mean'`.)NN)r   r      z`image[z(]` has unsupported dimensions or shape: r]   zUnsupported `image[z	]` type: zInput `image[z]` has incompatible dimensions z with the previous images z!`latents` must be a torch.Tensor.z/`latents` has unsupported dimensions or shape: z*Extreme aspect ratio of the input image: [z x ]z`latents` has unexpected shape=z
 expected=z^The number of generators must match the total number of ensemble members for all input images.c                 3   s$    | ]}|j j d  j jkV  qdS )r   N)devicetype).0grP   r$   r%   	<genexpr>M  s   " z:MarigoldIntrinsicsPipeline.check_inputs.<locals>.<genexpr>z;`generator` device placement is not consistent in the list.zUnsupported generator type: )r<   r)   r=   r>   r3   
ValueErrorr7   r8   
isinstancedictlist	enumerater   r    r!   	is_tensorndimshaper   sizera   dimmaxr(   rD   all	Generator)rA   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   actual_vae_scale_factor
num_imagesWHiimgH_iW_iN_imax_orignew_Hnew_Wwhshape_expectedr$   rd   r%   check_inputs   s   













z'MarigoldIntrinsicsPipeline.check_inputsTc                 C   s   t | ds	i | _nt| jtstdt| j dtdi | j}|d||d< |d||d< |d ur>t|fi |S |d urKtdd|i|S td)	N_progress_bar_configz=`self._progress_bar_config` should be of type `dict`, but is r1   descleavetotalz/Either `total` or `iterable` has to be defined.r$   )hasattrr   rg   rh   rf   ra   getr   )rA   iterabler   r   r   progress_bar_configr$   r$   r%   progress_barT  s   
z'MarigoldIntrinsicsPipeline.progress_barr   rU   r   Fmatch_input_resolutionoutput_latentreturn_dictc           %         sZ  j }j}|du rj}|du rj}|||||| |
||}jdu rEd}j|djjddd}|j	|}
|d _j|||||\}}}||
|| \}}~jj	||d d	d	}g }jtd||  dd
dD ]\}|||   }|||   }|jd }|d| } jj||d jjjdddD ]'}!tj||gd	d}"j|"|!| ddd }#jj|#|!||dj}trt  q|| q{tj|dd}~~~~~~ ~"~#|j|| j j j!j"g|jdd R  tj fddtdjd  D dd|sd}j#|d}$|d	krbj||jgjd	d R  fddt|D t$ \}$tjddr`tj|$dd}$nd}$|rjj%||dd|$durrjj%|$||dd}$|dkrj&|$durrj&|$}$'  |s|$|fS t(|$|dS )aj  
        Function invoked when calling the pipeline.

        Args:
            image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`),
                `list[torch.Tensor]`: An input image or images used as an input for the intrinsic decomposition task.
                For arrays and tensors, the expected value range is between `[0, 1]`. Passing a batch of images is
                possible by providing a four-dimensional array or a tensor. Additionally, a list of images of two- or
                three-dimensional arrays or tensors can be passed. In the latter case, all list elements must have the
                same width and height.
            num_inference_steps (`int`, *optional*, defaults to `None`):
                Number of denoising diffusion steps during inference. The default value `None` results in automatic
                selection.
            ensemble_size (`int`, defaults to `1`):
                Number of ensemble predictions. Higher values result in measurable improvements and visual degradation.
            processing_resolution (`int`, *optional*, defaults to `None`):
                Effective processing resolution. When set to `0`, matches the larger input image dimension. This
                produces crisper predictions, but may also lead to the overall loss of global context. The default
                value `None` resolves to the optimal value from the model config.
            match_input_resolution (`bool`, *optional*, defaults to `True`):
                When enabled, the output prediction is resized to match the input dimensions. When disabled, the longer
                side of the output will equal to `processing_resolution`.
            resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
                Resampling method used to resize input images to `processing_resolution`. The accepted values are:
                `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
            resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
                Resampling method used to resize output predictions to match the input resolution. The accepted values
                are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
            batch_size (`int`, *optional*, defaults to `1`):
                Batch size; only matters when setting `ensemble_size` or passing a tensor of images.
            ensembling_kwargs (`dict`, *optional*, defaults to `None`)
                Extra dictionary with arguments for precise ensembling control. The following options are available:
                - reduction (`str`, *optional*, defaults to `"median"`): Defines the ensembling function applied in
                  every pixel location, can be either `"median"` or `"mean"`.
            latents (`torch.Tensor`, *optional*, defaults to `None`):
                Latent noise tensors to replace the random initialization. These can be taken from the previous
                function call's output.
            generator (`torch.Generator`, or `list[torch.Generator]`, *optional*, defaults to `None`):
                Random number generator object to ensure reproducibility.
            output_type (`str`, *optional*, defaults to `"np"`):
                Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
                values are: `"np"` (numpy array) or `"pt"` (torch tensor).
            output_uncertainty (`bool`, *optional*, defaults to `False`):
                When enabled, the output's `uncertainty` field contains the predictive uncertainty map, provided that
                the `ensemble_size` argument is set to a value above 2.
            output_latent (`bool`, *optional*, defaults to `False`):
                When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
                within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
                `latents` argument.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.marigold.MarigoldIntrinsicsOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.marigold.MarigoldIntrinsicsOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.marigold.MarigoldIntrinsicsOutput`] is returned, otherwise a
                `tuple` is returned where the first element is the prediction, the second element is the uncertainty
                (or `None`), and the third is the latent (or `None`).
        N 
do_not_padTrX   )padding
max_length
truncationreturn_tensorsr   )r`   dtyper   zMarigold predictions...)r   r   )r`   FzDiffusion steps...ro   )encoder_hidden_statesr   rd   r   c                    s"   g | ]} ||   qS r$   )decode_predictionrb   rw   )rM   pred_latent_for_decodingrA   r$   r%   
<listcomp>1  s    z7MarigoldIntrinsicsPipeline.__call__.<locals>.<listcomp>c                    s(   g | ]}j | fi  pi qS r$   )ensemble_intrinsicsr   )rN   rR   r   rA   r$   r%   r   I      )is_aar   )r   r   r   ))_execution_devicer   r/   r0   r   r?   r,   model_max_length	input_idstor+   r@   
preprocessprepare_latentsrepeatr   rangerm   r*   set_timesteps	timestepsr!   catr(   stepprev_sampleXLA_AVAILABLExm	mark_stepappendreshaperF   r)   r=   rE   unpad_imagezipresize_antialiaspt_to_numpymaybe_free_model_hooksr   )%rA   rG   rH   rI   rJ   r   rK   rL   rM   rN   rO   rP   rQ   rR   r   r   r`   r   rt   prompttext_inputstext_input_idsr   original_resolutionimage_latentpred_latentbatch_empty_text_embeddingpred_latentsrw   batch_image_latentbatch_pred_latenteffective_batch_sizetexttbatch_latentnoiser   r$   )rM   rN   rR   r   r   rA   r%   __call__g  s   R
	








z#MarigoldIntrinsicsPipeline.__call__c                    s   dd t j fddtdjd  D dd}|jjj }|j|dd}|j\}}}	}
|}|d u rHt|j	| |	|
f||j
|jd}||fS )Nc                 S   s,   t | dr
| j S t | dr| jS td)Nlatent_distrO   z3Could not access latents of provided encoder_output)r   r   moderO   AttributeError)encoder_outputr$   r$   r%   retrieve_latents{  s
   


zDMarigoldIntrinsicsPipeline.prepare_latents.<locals>.retrieve_latentsc              	      s(   g | ]}j ||   qS r$   )r)   encoder   rM   rG   r   rA   r$   r%   r     r   z>MarigoldIntrinsicsPipeline.prepare_latents.<locals>.<listcomp>r   r   )rP   r`   r   )r!   r   r   rm   r)   r=   scaling_factorrepeat_interleaver   rF   r`   r   )rA   rG   rO   rP   rI   rM   r   N_ECrv   ru   r   r$   r   r%   r   s  s&   z*MarigoldIntrinsicsPipeline.prepare_latentsr   c                 C   sz   |  dks|jd | jjjkrtd| jjj d|j d| jj|| jjj ddd }t	|d	d
}|d
 d }|S )Nr]   r   z Expecting 4D tensor of shape [B,z,H,W]; got r1   F)r   r   g      g      ?g       @)
ro   rm   r)   r=   rE   rf   decoder   r!   clip)rA   r   r   r$   r$   r%   r     s    z,MarigoldIntrinsicsPipeline.decode_predictionr[   targetsrY   c           
      C   s   |   dks| jd dkrtd| j d|dvr"td| d| j\}}}}}d}|d	krDtj| d
d}	|r@tj| d
d}|	|fS |dkrktj| d
ddj}	|rbt| |	 }tj|d
dj}|		d
}	|	|fS td| d)a  
        Ensembles the intrinsic decomposition represented by the `targets` tensor with expected shape `(B, T, 3, H,
        W)`, where B is the number of ensemble members for a given prediction of size `(H x W)`, and T is the number of
        predicted targets.

        Args:
            targets (`torch.Tensor`):
                Input ensemble of intrinsic image decomposition maps.
            output_uncertainty (`bool`, *optional*, defaults to `False`):
                Whether to output uncertainty map.
            reduction (`str`, *optional*, defaults to `"mean"`):
                Reduction method used to ensemble aligned predictions. The accepted values are: `"median"` and
                `"mean"`.

        Returns:
            A tensor of aligned and ensembled intrinsic decomposition maps with shape `(T, 3, H, W)` and optionally a
            tensor of uncertainties of shape `(T, 3, H, W)`.
           r   r   z.Expecting 4D tensor of shape [B,T,3,H,W]; got r1   rZ   zUnrecognized reduction method: Nr\   r   r   r[   T)ro   keepdim)
ro   rm   rf   r!   r\   stdr[   valuesabssqueeze)
r   rR   rY   BT_rv   ru   r   r   r$   r$   r%   r     s&   	
z.MarigoldIntrinsicsPipeline.ensemble_intrinsics)NNNN)NNNT)Nr   NTrU   rU   r   NNNr   FFT)Fr[   )'r   r   r   r   model_cpu_offload_seqr6   r   r
   r   r   r   r   strrh   r   intr5   propertyrF   r	   r!   r"   rr   ri   boolr   compilerdisabler   no_gradr   EXAMPLE_DOC_STRINGr   tupler   r   staticmethodr   __classcell__r$   r$   rB   r%   r&   x   s   &		
,
	

 	
  
&r&   ),dataclassesr   typingr   numpyr   r!   PILr   	tqdm.autor   transformersr   r   r@   r	   modelsr
   r   
schedulersr   r   utilsr   r   r   r   utils.torch_utilsr   pipeline_utilsr   marigold_image_processingr   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   r7   r   r   r&   r$   r$   r$   r%   <module>   s.   
%