o
    Gi4                     @   s   d dl Z d dlmZ d dlZd dlZd dlZd dlm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZmZmZ ddlmZ d	d
lmZ ddlmZ e rWd dlm  mZ dZndZee Z!dZ"eG dd deZ#G dd deZ$dS )    N)	dataclass)CLIPTextModelWithProjectionCLIPTokenizer   )PriorTransformer)HeunDiscreteScheduler)
BaseOutputis_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipeline   )ShapERendererTFa  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import DiffusionPipeline
        >>> from diffusers.utils import export_to_gif

        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        >>> repo = "openai/shap-e"
        >>> pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16)
        >>> pipe = pipe.to(device)

        >>> guidance_scale = 15.0
        >>> prompt = "a shark"

        >>> images = pipe(
        ...     prompt,
        ...     guidance_scale=guidance_scale,
        ...     num_inference_steps=64,
        ...     frame_size=256,
        ... ).images

        >>> gif_path = export_to_gif(images[0], "shark_3d.gif")
        ```
c                   @   s4   e Zd ZU dZeeejj  eeej  B e	d< dS )ShapEPipelineOutputz
    Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].

    Args:
        images (`torch.Tensor`)
            A list of images for 3D rendering.
    imagesN)
__name__
__module____qualname____doc__listPILImagenpndarray__annotations__ r   r   ^/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/shap_e/pipeline_shap_e.pyr   J   s   
 &r   c                       s   e Zd ZdZdZdgZdededede	de
f
 fdd	Zd
d Zdd Ze ee								d dedededejeej B dB dejdB dedededB defddZ  ZS )!ShapEPipelinea  
    Pipeline for generating latent representation of a 3D asset and rendering with the NeRF method.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        text_encoder ([`~transformers.CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        tokenizer ([`~transformers.CLIPTokenizer`]):
             A `CLIPTokenizer` to tokenize text.
        scheduler ([`HeunDiscreteScheduler`]):
            A scheduler to be used in combination with the `prior` model to generate image embedding.
        shap_e_renderer ([`ShapERenderer`]):
            Shap-E renderer projects the generated latents into parameters of a MLP to create 3D objects with the NeRF
            rendering method.
    ztext_encoder->priorshap_e_rendererpriortext_encoder	tokenizer	schedulerc                    s"   t    | j|||||d d S )N)r!   r"   r#   r$   r    )super__init__register_modules)selfr!   r"   r#   r$   r    	__class__r   r   r&   o   s   

zShapEPipeline.__init__c                 C   sR   |d u rt ||||d}n|j|krtd|j d| ||}||j }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r(   r.   r-   r,   r+   latentsr$   r   r   r   prepare_latents   s   


zShapEPipeline.prepare_latentsc                 C   s  t |tr	t|nd d| j_| j|d| jjddd}|j}| j|dddj}|jd	 |jd	 krVt	||sV| j
|d d | jjd d	f }td
| jj d|  | ||}	|	j}
|
j|dd}
|
tjj|
d	dd }
|rt|
}t||
g}
t|
jd |
 }
|
S )Nr   r   
max_lengthTpt)paddingr4   
truncationreturn_tensorslongest)r6   r8   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: dim)r<   keepdim)
isinstancer   lenr#   pad_token_idmodel_max_length	input_idsr.   torchequalbatch_decodeloggerwarningr"   r0   text_embedsrepeat_interleavelinalgnorm
zeros_likecatmathsqrt)r(   promptr,   num_images_per_promptdo_classifier_free_guidancetext_inputstext_input_idsuntruncated_idsremoved_texttext_encoder_outputprompt_embedsnegative_prompt_embedsr   r   r   _encode_prompt   s<    $
zShapEPipeline._encode_promptr      N      @@   pilTrP   rQ   num_inference_stepsr+   r2   guidance_scale
frame_sizeoutput_typereturn_dictc
                    s\  t |trd}
nt |trt|}
n	tdt|  j}|
| }
|dk} ||||} jj	||d  jj
} jjj} jjj} |
|| f|j||| j}||jd ||}t |D ]M\}}|rst|gd n|} j||} j|||dj}|j|jd dd\}}|r|d\}}||||   } jj|||d	j}trt  qe    |d
vrtd| |dkrt!|dS g }|dkrt|D ]\}} j"#|dddf |}|$| qn6t|D ]\}} j"j%|dddf ||d}|$| qt&|}|' ( }|dkr# fdd|D }|	s)|fS t!|dS )aq	  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide the image generation.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            frame_size (`int`, *optional*, default to 64):
                The width and height of each image frame of the generated 3D output.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`), `"latent"` (`torch.Tensor`), or mesh ([`MeshDecoderOutput`]).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] instead of a plain
                tuple.

        Examples:

        Returns:
            [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] is returned,
                otherwise a `tuple` is returned where the first element is a list with the generated images.
        r   z2`prompt` has to be of type `str` or `list` but is g      ?)r,   r   r   )timestepproj_embeddingr;   )rd   sample)r   r^   latentmeshzUOnly the output types `pil`, `np`, `latent` and `mesh` are supported not output_type=rg   )r   rh   N)sizer^   c                    s   g | ]}  |qS r   )numpy_to_pil).0imager(   r   r   
<listcomp>T  s    z*ShapEPipeline.__call__.<locals>.<listcomp>))r>   strr   r?   r/   type_execution_devicerZ   r$   set_timesteps	timestepsr!   confignum_embeddingsembedding_dimr3   r-   reshaper.   	enumerateprogress_barrC   rM   scale_model_inputpredicted_image_embeddingsplitchunkstepprev_sampleXLA_AVAILABLExm	mark_stepmaybe_free_model_hooksr   r    decode_to_meshappenddecode_to_imagestackcpunumpy)r(   rP   rQ   r_   r+   r2   r`   ra   rb   rc   
batch_sizer,   rR   rX   rs   ru   rv   itlatent_model_inputscaled_model_input
noise_pred_noise_pred_uncondr   rg   rh   rl   r   rm   r   __call__   s   
4








	


zShapEPipeline.__call__)r   r[   NNr\   r]   r^   T)r   r   r   r   model_cpu_offload_seq_exclude_from_cpu_offloadr   r   r   r   r   r&   r3   rZ   rC   no_gradr   EXAMPLE_DOC_STRINGro   int	Generatorr   Tensorfloatboolr   __classcell__r   r   r)   r   r   W   s\    1	
r   )%rN   dataclassesr   r   r   	PIL.Imager   rC   transformersr   r   modelsr   
schedulersr   utilsr   r	   r
   r   utils.torch_utilsr   pipeline_utilsr   rendererr   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rF   r   r   r   r   r   r   r   <module>   s*   
