o
    ۷is                     @   s   d dl mZ d dlZd dlmZ ddlmZmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZ eeZG dd dejZG dd dZdS )    )CallableN   )BasicTransformerBlockFreeNoiseTransformerBlock)Downsample2DResnetBlock2D
Upsample2D)Transformer2DModel)AnimateDiffTransformer3DCrossAttnDownBlockMotionDownBlockMotionUpBlockMotion)DiffusionPipeline)logging)randn_tensorc                       s`   e Zd ZdZdddgfdejdededee d	d
f
 fddZ	d	e
jee
j B fddZ  ZS )SplitInferenceModuleap  
    A wrapper module class that splits inputs along a specified dimension before performing a forward pass.

    This module is useful when you need to perform inference on large tensors in a memory-efficient way by breaking
    them into smaller chunks, processing each chunk separately, and then reassembling the results.

    Args:
        module (`nn.Module`):
            The underlying PyTorch module that will be applied to each chunk of split inputs.
        split_size (`int`, defaults to `1`):
            The size of each chunk after splitting the input tensor.
        split_dim (`int`, defaults to `0`):
            The dimension along which the input tensors are split.
        input_kwargs_to_split (`list[str]`, defaults to `["hidden_states"]`):
            A list of keyword arguments (strings) that represent the input tensors to be split.

    Workflow:
        1. The keyword arguments specified in `input_kwargs_to_split` are split into smaller chunks using
        `torch.split()` along the dimension `split_dim` and with a chunk size of `split_size`.
        2. The `module` is invoked once for each split with both the split inputs and any unchanged arguments
        that were passed.
        3. The output tensors from each split are concatenated back together along `split_dim` before returning.

    Example:
        ```python
        >>> import torch
        >>> import torch.nn as nn

        >>> model = nn.Linear(1000, 1000)
        >>> split_module = SplitInferenceModule(model, split_size=2, split_dim=0, input_kwargs_to_split=["input"])

        >>> input_tensor = torch.randn(42, 1000)
        >>> # Will split the tensor into 21 slices of shape [2, 1000].
        >>> output = split_module(input=input_tensor)
        ```

    It is also possible to nest `SplitInferenceModule` across different split dimensions for more complex
    multi-dimensional splitting.
       r   hidden_statesmodule
split_size	split_diminput_kwargs_to_splitreturnNc                    s*   t    || _|| _|| _t|| _d S )N)super__init__r   r   r   setr   )selfr   r   r   r   	__class__ Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/free_noise_utils.pyr   N   s
   
zSplitInferenceModule.__init__c           	         s   i }t | D ]!}| jvst|| sqt||  j j||< || qg }t	|
  D ]}tt	| |}||  j|i |}|| q2t|d tjr`tj| jdS t|d trtt fddt	| D S td)a  Forward method for the `SplitInferenceModule`.

        This method processes the input by splitting specified keyword arguments along a given dimension, running the
        underlying module on each split, and then concatenating the results. The splitting is controlled by the
        `split_size` and `split_dim` parameters specified during initialization.

        Args:
            *args (`Any`):
                Positional arguments that are passed directly to the `module` without modification.
            **kwargs (`dict[str, torch.Tensor]`):
                Keyword arguments passed to the underlying `module`. Only keyword arguments whose names match the
                entries in `input_kwargs_to_split` and are of type `torch.Tensor` will be split. The remaining keyword
                arguments are passed unchanged.

        Returns:
            `torch.Tensor | tuple[torch.Tensor]`:
                The outputs obtained from `SplitInferenceModule` are the same as if the underlying module was inferred
                without it.
                - If the underlying module returns a single tensor, the result will be a single concatenated tensor
                along the same `split_dim` after processing all splits.
                - If the underlying module returns a tuple of tensors, each element of the tuple will be concatenated
                along the `split_dim` across all splits, and the final result will be a tuple of concatenated tensors.
        r   dimc                    s   g | ]
}t j| jd qS )r!   )torchcatr   .0xr   r   r    
<listcomp>   s    z0SplitInferenceModule.forward.<locals>.<listcomp>zIn order to use the SplitInferenceModule, it is necessary for the underlying `module` to either return a torch.Tensor or a tuple of torch.Tensor's.)listkeysr   r#   	is_tensorsplitr   r   popzipvaluesdictupdater   append
isinstanceTensorr$   tuple
ValueError)	r   argskwargssplit_inputskeyresultssplit_inputinputs#intermediate_tensor_or_tensor_tupler   r(   r    forward\   s&   
zSplitInferenceModule.forward)__name__
__module____qualname____doc__nnModuleintr*   strr   r#   r5   r6   r@   __classcell__r   r   r   r    r   %   s"    +"r   c                   @   s  e Zd ZdZdeeB eB fddZdeeB eB fddZ		dFd	d
Z						dGde
eee
f B dedejdedede
eee
f B dB dejdB dejdB dedB dedB dejfddZ		dHdedededededejdejdejdB dejdB fddZd ed!ed"ejd#ejdejf
d$d%Z	&	'	(	)	dId*edB d+ed,e
d-e
d.eeeeejejgejf dB ddfd/d0ZdFd1d2Zd3ee d4eddfd5d6Zd7ee d8eddfd9d:Zd;ee  d8eddfd<d=Z!d>ee" ee# B d8eddfd?d@Z$dJd4ed8eddfdBdCZ%e&dDdE Z'dS )KAnimateDiffFreeNoiseMixinzFMixin class for [FreeNoise](https://huggingface.co/papers/2310.15169).blockc                 C   s   |j D ]t}t|j}t|D ]h}t|j| tr&|j| | j| j| j	 qt|j| t
s0J |j| }t|j|j|j|j|j|j|j|j|j|j|j| j| j| j	dj| j| jd|j|< |j| j| dd |j| |j|j qqdS )z:Helper function to enable FreeNoise in transformer blocks.)r"   num_attention_headsattention_head_dimdropoutcross_attention_dimactivation_fnattention_biasonly_cross_attentiondouble_self_attentionpositional_embeddingsnum_positional_embeddingscontext_lengthcontext_strideweighting_schemedevicedtypeTstrictN)motion_moduleslentransformer_blocksranger4   r   set_free_noise_properties_free_noise_context_length_free_noise_context_stride_free_noise_weighting_schemer   r"   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   torZ   r[   load_state_dict
state_dictset_chunk_feed_forward_chunk_size
_chunk_dim)r   rK   motion_modulenum_transformer_blocksibasic_transfomer_blockr   r   r    _enable_free_noise_in_block   sJ   






z5AnimateDiffFreeNoiseMixin._enable_free_noise_in_blockc                 C   s   |j D ]V}t|j}t|D ]J}t|j| trX|j| }t|j|j|j	|j
|j|j|j|j|j|j|jdj| j| jd|j|< |j| j| dd |j| |j|j qqdS )z;Helper function to disable FreeNoise in transformer blocks.)r"   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rY   Tr\   N)r^   r_   r`   ra   r4   r   r   r"   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rf   rZ   r[   rg   rh   ri   rj   rk   )r   rK   rl   rm   rn   free_noise_transfomer_blockr   r   r    _disable_free_noise_in_block   s:   





z6AnimateDiffFreeNoiseMixin._disable_free_noise_in_blockr   Nc           
      C   s   t |ttfstdt||d ur$t |ttfs$tdt||d us,|d ur0tddd | D }dd | D }tt| }t	t| }	t
|sZtdt
|sbtd|d	krjtd
|	|krvtd|dd S )NzFExpected `prompt` to have type `str` or `dict` but found type(prompt)=zXExpected `negative_prompt` to have type `str` or `dict` but found type(negative_prompt)=zO`prompt_embeds` and `negative_prompt_embeds` is not supported in FreeNoise yet.c                 S      g | ]}t |tqS r   )r4   rG   r%   r   r   r    r)          zFAnimateDiffFreeNoiseMixin._check_inputs_free_noise.<locals>.<listcomp>c                 S   rs   r   )r4   rH   r%   r   r   r    r)      rt   z5Expected integer keys in `prompt` dict for FreeNoise.z3Expected str values in `prompt` dict for FreeNoise.r   zUThe minimum frame index in `prompt` dict must be 0 as a starting prompt is necessary.zHThe maximum frame index in `prompt` dict must be lesser than num_frames=z and follow 0-based indexing.)r4   rH   r1   r7   typer+   r0   minr*   maxall)
r   promptnegative_promptprompt_embedsnegative_prompt_embeds
num_framesframe_indicesframe_prompts	min_frame	max_framer   r   r    _check_inputs_free_noise   s0   
z2AnimateDiffFreeNoiseMixin._check_inputs_free_noisery   r}   rZ   num_videos_per_promptdo_classifier_free_guidancerz   r{   r|   
lora_scale	clip_skipc                 C   s<  |d u rd}t |trd|i}t |trd|i}| ||||| tt| }tt| }|t| d  ||d < |t| d  ||d < t| }t| }t| }t| }| j	|||dd d d |	|
d	\}}|g|j
dd  R }||}tt|d D ]*}|| }||d  }|| d}||d  d}| |||||||d < qd }d }|r| j	dgt| ||d|d d |	|
d	\}}||}tt|d D ]*}|| }||d  }|| d}||d  d}| |||||||d < q|}|}|rt||g}||fS )N r   r   F)	ry   rZ   num_images_per_promptr   rz   r{   r|   r   r   T)r4   rH   r   r1   sorteditemsr*   r+   r0   encode_promptshape	new_zerosra   r_   	unsqueeze)_free_noise_prompt_interpolation_callbackr#   r$   )r   ry   r}   rZ   r   r   rz   r{   r|   r   r   r~   r   frame_negative_indicesframe_negative_prompts_r   prompt_interpolation_embedsrn   start_frame	end_framestart_tensor
end_tensor$negative_prompt_interpolation_embedsr   r   r    _encode_prompt_free_noise   s~   





z3AnimateDiffFreeNoiseMixin._encode_prompt_free_noise
batch_sizenum_channels_latentsheightwidthr[   	generatorlatentsc
              	   C   s.  t |trt||krtdt| d| d| jdkr | jn|}
|||
|| j || j f}|	d u rCt||||d}	| jdkrB|	S n'|	d|krL|	S |	d| jkretd| d	| j d
|	d |		|}	| jdkrt
| j|| jD ]u}td|| j }t||| j }|| }|dkr nZttt
||}|tj||d }|}t||| }||| kr|	d d d d |f |	d d d d ||f< qw|| }|d | }|	d d d d |f |	d d d d ||f< qwn| jdkr|| j d | j }tj|	g| dd}	|	d d d d d |f }	|	S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.repeat_context)r   rZ   r[   randomr   z_You have passed `latents` as a parameter to FreeNoise. The expected number of frames is either z or z, but found shuffle_contextr   )r   r   r!   )r4   r*   r_   r7   rc   vae_scale_factorr   _free_noise_noise_typesizerf   ra   rd   rw   rv   r#   
LongTensorrandpermr$   )r   r   r   r}   r   r   r[   rZ   r   r   context_num_framesr   rn   window_start
window_endwindow_lengthindicesshuffled_indicescurrent_startcurrent_endprefix_lengthnum_repeatsr   r   r    _prepare_latents_free_noised  s`   


..z5AnimateDiffFreeNoiseMixin._prepare_latents_free_noisestart_index	end_indexr   r   c           
      C   sV   || d }g }t |D ]}||d  }d| | ||  }	||	 qt|}|S )Nr   )ra   r3   r#   r$   )
r   r   r   r   r   num_indicesinterpolated_tensorsrn   alphainterpolated_tensorr   r   r    _lerp  s   
zAnimateDiffFreeNoiseMixin._lerp      pyramidr   rV   rW   rX   
noise_typeprompt_interpolation_callbackc           
      C   s   g d}g d}|| j jjkrtd|d| j jjd ||vr,td| d|||vr:td| d	||p@| j jj| _|| _|| _|| _	|pO| j
| _t| jjd
rgg | jj| jj| jj}n
g | jj| jj}|D ]}	| |	 qsdS )ad	  
        Enable long video generation using FreeNoise.

        Args:
            context_length (`int`, defaults to `16`, *optional*):
                The number of video frames to process at once. It's recommended to set this to the maximum frames the
                Motion Adapter was trained with (usually 16/24/32). If `None`, the default value from the motion
                adapter config is used.
            context_stride (`int`, *optional*):
                Long videos are generated by processing many frames. FreeNoise processes these frames in sliding
                windows of size `context_length`. Context stride allows you to specify how many frames to skip between
                each window. For example, a context length of 16 and context stride of 4 would process 24 frames as:
                    [0, 15], [4, 19], [8, 23] (0-based indexing)
            weighting_scheme (`str`, defaults to `pyramid`):
                Weighting scheme for averaging latents after accumulation in FreeNoise blocks. The following weighting
                schemes are supported currently:
                    - "flat"
                       Performs weighting averaging with a flat weight pattern: [1, 1, 1, 1, 1].
                    - "pyramid"
                        Performs weighted averaging with a pyramid like weight pattern: [1, 2, 3, 2, 1].
                    - "delayed_reverse_sawtooth"
                        Performs weighted averaging with low weights for earlier frames and high-to-low weights for
                        later frames: [0.01, 0.01, 3, 2, 1].
            noise_type (`str`, defaults to "shuffle_context"):
                Must be one of ["shuffle_context", "repeat_context", "random"].
                    - "shuffle_context"
                        Shuffles a fixed batch of `context_length` latents to create a final latent of size
                        `num_frames`. This is usually the best setting for most generation scenarios. However, there
                        might be visible repetition noticeable in the kinds of motion/animation generated.
                    - "repeated_context"
                        Repeats a fixed batch of `context_length` latents to create a final latent of size
                        `num_frames`.
                    - "random"
                        The final latents are random without any repetition.
        )flatr   delayed_reverse_sawtooth)r   r   r   zYou have set context_length=zH which is greater than self.motion_adapter.config.motion_max_seq_length=z*. This can lead to bad generation results.z0The parameter `weighting_scheme` must be one of z, but got weighting_scheme=z*The parameter `noise_type` must be one of z, but got noise_type=r^   N)motion_adapterconfigmotion_max_seq_lengthloggerwarningr7   rc   rd   re   r   r   r   hasattrunet	mid_blockdown_blocks	up_blocksrp   )
r   rV   rW   rX   r   r   allowed_weighting_schemeallowed_noise_typeblocksrK   r   r   r    enable_free_noise  s.   /z+AnimateDiffFreeNoiseMixin.enable_free_noisec                 C   sz   d| _ t| jjdrg | jj| jj| jj}n
g | jj| jj}g | jj| jj| jj}|D ]}| | q3dS )z)Disable the FreeNoise sampling mechanism.Nr^   )rc   r   r   r   r   r   rr   )r   r   rK   r   r   r    disable_free_noise  s   z,AnimateDiffFreeNoiseMixin.disable_free_noiser^   spatial_split_sizec                 C   sh   |D ]/}t |j|ddg|_tt|jD ]}t |j| |dddg|j|< qt |j|ddg|_qd S )Nr   inputr   encoder_hidden_states)r   proj_inra   r_   r`   proj_out)r   r^   r   rl   rn   r   r   r    '_enable_split_inference_motion_modules_  s   zAAnimateDiffFreeNoiseMixin._enable_split_inference_motion_modules_
attentionstemporal_split_sizec                 C   0   t t|D ]}t|| |dddg||< qd S )Nr   r   r   ra   r_   r   )r   r   r   rn   r   r   r    #_enable_split_inference_attentions_$  s
   
z=AnimateDiffFreeNoiseMixin._enable_split_inference_attentions_resnetsc                 C   r   )Nr   input_tensortembr   )r   r   r   rn   r   r   r     _enable_split_inference_resnets_,  s   z:AnimateDiffFreeNoiseMixin._enable_split_inference_resnets_samplersc                 C   s.   t t|D ]}t|| |ddg||< qd S )Nr   r   r   )r   r   r   rn   r   r   r    !_enable_split_inference_samplers_0  s   z;AnimateDiffFreeNoiseMixin._enable_split_inference_samplers_   c                 C   s   g | j j| j j| j j}|D ]M}t|dddur!| |j| t|dddur0| |j| t|dddur?| 	|j
| t|dddurN| |j| t|dddur]| |j| qdS )az  
        Enable FreeNoise memory optimizations by utilizing
        [`~diffusers.pipelines.free_noise_utils.SplitInferenceModule`] across different intermediate modeling blocks.

        Args:
            spatial_split_size (`int`, defaults to `256`):
                The split size across spatial dimensions for internal blocks. This is used in facilitating split
                inference across the effective batch dimension (`[B x H x W, F, C]`) of intermediate tensors in motion
                modeling blocks.
            temporal_split_size (`int`, defaults to `16`):
                The split size across temporal dimensions for internal blocks. This is used in facilitating split
                inference across the effective batch dimension (`[B x F, H x W, C]`) of intermediate tensors in spatial
                attention, resnets, downsampling and upsampling blocks.
        r^   Nr   r   downsamplers
upsamplers)r   r   r   r   getattrr   r^   r   r   r   r   r   r   r   )r   r   r   r   rK   r   r   r    !enable_free_noise_split_inference6  s   z;AnimateDiffFreeNoiseMixin.enable_free_noise_split_inferencec                 C   s   t | do	| jd uS )Nrc   )r   rc   r(   r   r   r    free_noise_enabledS  s   z,AnimateDiffFreeNoiseMixin.free_noise_enabled)r   N)NNNNN)NN)r   r   r   r   N)r   r   )(rA   rB   rC   rD   r   r   r   rp   rr   r   rH   r1   rG   r#   rZ   boolr5   floatr   r[   	Generatorr   r   r   r   r   r   r*   r
   r   r	   r   r   r   r   r   r   r   propertyr   r   r   r   r    rJ      s    )
+	

m	

J



K


rJ   )typingr   r#   torch.nnrE   models.attentionr   r   models.resnetr   r   r   "models.transformers.transformer_2dr	   models.unets.unet_motion_modelr
   r   r   r   pipelines.pipeline_utilsr   utilsr   utils.torch_utilsr   
get_loggerrA   r   rF   r   rJ   r   r   r   r    <module>   s   
l