o
    GiV                  
   @   s>  d dl Z d dlZd dlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZmZmZ d
dlmZ e	eZdededefddZ				d#dedB deejB dB dee dB dee dB fddZG dd deZG dd deZG dd deZ G dd deZ!G dd  d eZ"G d!d" d"eZ#dS )$    N   )Flux2Transformer2DModel)FlowMatchEulerDiscreteScheduler)logging)randn_tensor   )ModularPipelineBlocksPipelineState)ComponentSpec
InputParamOutputParam   )Flux2ModularPipelineimage_seq_len	num_stepsreturnc                 C   sp   d\}}d\}}| dkr||  | }t |S ||  | }||  | }|| d }	|d|	  }
|	| |
 }t |S )z3Compute empirical mu for Flux2 timestep scheduling.)gT	?gŒ_?)g w:/&?gDw:?i  g     g@g      i@)float)r   r   a1b1a2b2mum_200m_10ab r   d/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/modular_pipelines/flux2/before_denoise.pycompute_empirical_mu    s   r   num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr!   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r!   r    r"   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r"   r    r    r   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r!   len)	schedulerr   r    r!   r"   kwargsaccepts_timestepsaccept_sigmasr   r   r   retrieve_timesteps4   s2   r0   c                   @   s   e Zd ZdZedee fddZedefddZ	edee
 fddZedee fd	d
Ze dededefddZdS )Flux2SetTimestepsStepflux2r   c                 C   s   t dtt dtgS )Nr,   transformer)r
   r   r   selfr   r   r   expected_componentsr   s   z)Flux2SetTimestepsStep.expected_componentsc                 C      dS )Nz[Step that sets the scheduler's timesteps for Flux2 inference using empirical mu calculationr   r4   r   r   r   descriptiony      z!Flux2SetTimestepsStep.descriptionc              	   C   s:   t dddt dt dt dtjdt dtdt d	tdgS )
Nr   2   defaultr!   r"   latents	type_hintheightwidth)r   torchTensorintr4   r   r   r   inputs}   s   


zFlux2SetTimestepsStep.inputsc                 C   s   t dtjddt dtddgS )Nr!   z"The timesteps to use for inferencer?   r8   r   z:The number of denoising steps to perform at inference time)r   rB   rC   rD   r4   r   r   r   intermediate_outputs   s   z*Flux2SetTimestepsStep.intermediate_outputs
componentsstatec                 C   s  |  |}|j}|j}|jp|j}|jp|j}|j}dt||d   }	dt||d   }
|	d |
d  }|j	}|j
}|j}|d u rP|d u rPtdd| |}t|jdr\|jjr\d }t||d}t||||||d\}}||_||_	|jd | || ||fS )Nr   g      ?r   use_flow_sigmas)r   r   )r!   r"   r   r   )get_block_state_execution_devicer,   r@   default_heightrA   default_widthvae_scale_factorrD   r   r"   r!   nplinspacehasattrconfigrJ   r   r0   set_begin_indexset_block_state)r5   rH   rI   block_stater    r,   r@   rA   rO   latent_heightlatent_widthr   r   r"   r!   r   r   r   r   __call__   s<   

zFlux2SetTimestepsStep.__call__N)__name__
__module____qualname__
model_namepropertylistr
   r6   strr8   r   rE   r   rG   rB   no_gradr   r	   rY   r   r   r   r   r1   o   s    

r1   c                   @   s   e Zd ZdZedee fddZedefddZ	edee
 fddZedee fd	d
Zedd ZedejfddZedd Ze	dddZe dededefddZdS )Flux2PrepareLatentsStepr2   r   c                 C   s   g S Nr   r4   r   r   r   r6      r9   z+Flux2PrepareLatentsStep.expected_componentsc                 C   r7   )Nz_Prepare latents step that prepares the initial noise latents for Flux2 text-to-image generationr   r4   r   r   r   r8      r9   z#Flux2PrepareLatentsStep.descriptionc                 C   sV   t dtdt dtdt dtjd B dt dtddt dt d	d
tddt dtjddgS )Nr@   r>   rA   r=   num_images_per_promptr   )r?   r<   	generator
batch_sizeTgNumber of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.requiredr?   r8   dtypezThe dtype of the model inputsrF   )r   rD   rB   rC   rj   r4   r   r   r   rE      s   

zFlux2PrepareLatentsStep.inputsc                 C       t dtjddt dtjddgS )Nr=   z4The initial latents to use for the denoising processrF   
latent_idsz'Position IDs for the latents (for RoPE)r   rB   rC   r4   r   r   r   rG      s
   z,Flux2PrepareLatentsStep.intermediate_outputsc              	   C   sp   | j }|jd ur|j|d  dks|jd ur4|j|d  dkr6td|d  d|j d|j d d S d S d S )Nr   r   z-`height` and `width` have to be divisible by z	 but are z and .)rO   r@   rA   loggerwarning)rH   rV   rO   r   r   r   check_inputs   s   z$Flux2PrepareLatentsStep.check_inputsr=   c           
      C   s^   | j \}}}}td}t|}t|}td}t||||}	|	d|dd}	|	S )z
        Generates 4D position coordinates (T, H, W, L) for latent tensors.

        Args:
            latents: Latent tensor of shape (B, C, H, W)

        Returns:
            Position IDs tensor of shape (B, H*W, 4)
        r   r   )shaperB   arangecartesian_prod	unsqueezeexpand)
r=   rf   _r@   rA   thwlrl   r   r   r   _prepare_latent_ids   s   



z+Flux2PrepareLatentsStep._prepare_latent_idsc                 C   .   | j \}}}}| |||| ddd} | S zePack latents: (batch_size, num_channels, height, width) -> (batch_size, height * width, num_channels)r   r   r   rs   reshapepermuter=   rf   num_channelsr@   rA   r   r   r   _pack_latents     z%Flux2PrepareLatentsStep._pack_latentsNc	           
      C   s   dt || jd   }dt || jd   }||d |d |d f}	t|tr:t||kr:tdt| d| d|d u rHt|	|||d}|S |j||d}|S )Nr      z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)re   r    rj   r    rj   )rD   rO   
isinstancer_   r+   r#   r   to)
comprf   num_channels_latentsr@   rA   rj   r    re   r=   rs   r   r   r   prepare_latents  s   z'Flux2PrepareLatentsStep.prepare_latentsrH   rI   c                 C   s   |  |}|jp
|j|_|jp|j|_|j|_|j|_| || |j	|j
 }| |||j|j|j|j|j|j|j	}| |}||j}| |}||_||_| || ||fS rc   )rK   r@   rM   rA   rN   rL   r    r   rq   rf   rd   r   rj   re   r=   r}   r   r   rl   rU   )r5   rH   rI   rV   rf   r=   rl   r   r   r   rY   '  s2   


z Flux2PrepareLatentsStep.__call__rc   )rZ   r[   r\   r]   r^   r_   r
   r6   r`   r8   r   rE   r   rG   staticmethodrq   rB   rC   r}   r   r   ra   r   r	   rY   r   r   r   r   rb      s*    
	

rb   c                   @      e Zd ZdZedefddZedee fddZ	edee
 fddZedd
ejdejd	B fddZdededefddZd	S )Flux2RoPEInputsStepr2   r   c                 C   r7   )NzStep that prepares the 4D RoPE position IDs for Flux2 denoising. Should be placed after text encoder and latent preparation steps.r   r4   r   r   r   r8   M  r9   zFlux2RoPEInputsStep.descriptionc                 C   s   t dddgS )Nprompt_embedsTnameri   r   r4   r   r   r   rE   Q  s   
zFlux2RoPEInputsStep.inputsc                 C   s   t ddtjddgS )Ntxt_idsdenoiser_input_fieldsH4D position IDs (T, H, W, L) for text tokens, used for RoPE calculation.r   kwargs_typer?   r8   rm   r4   r   r   r   rG   W  s   z(Flux2RoPEInputsStep.intermediate_outputsNxt_coordc                 C   z   | j \}}}g }t|D ]+}|du rtdn|| }td}td}	t|}
t|||	|
}|| qt|S z(Prepare 4D position IDs for text tokens.Nr   rs   rangerB   rt   ru   appendstackr   r   BLrx   out_idsiry   rz   r{   seq_lcoordsr   r   r   _prepare_text_idsb     



z%Flux2RoPEInputsStep._prepare_text_idsrH   rI   c                 C   sD   |  |}|j}|j}| ||_|j||_| || ||fS rc   )rK   r   r    r   r   r   rU   r5   rH   rI   rV   r   r    r   r   r   rY   s  s   
zFlux2RoPEInputsStep.__call__rc   rZ   r[   r\   r]   r^   r`   r8   r_   r   rE   r   rG   r   rB   rC   r   r   r	   rY   r   r   r   r   r   J  s    
r   c                   @   r   )Flux2KleinBaseRoPEInputsStepzflux2-kleinr   c                 C   r7   )NzStep that prepares the 4D RoPE position IDs for Flux2-Klein base model denoising. Should be placed after text encoder and latent preparation steps.r   r4   r   r   r   r8     r9   z(Flux2KleinBaseRoPEInputsStep.descriptionc                 C   s   t dddt dddgS )Nr   Tr   negative_prompt_embedsFr   r4   r   r   r   rE     s   

z#Flux2KleinBaseRoPEInputsStep.inputsc                 C   s$   t ddtjddt ddtjddgS )Nr   r   r   r   negative_txt_idszQ4D position IDs (T, H, W, L) for negative text tokens, used for RoPE calculation.rm   r4   r   r   r   rG     s   z1Flux2KleinBaseRoPEInputsStep.intermediate_outputsNr   r   c                 C   r   r   r   r   r   r   r   r     r   z.Flux2KleinBaseRoPEInputsStep._prepare_text_idsrH   rI   c                 C   sp   |  |}|j}|j}| ||_|j||_d |_|jd ur.| |j|_|j||_| || ||fS rc   )	rK   r   r    r   r   r   r   r   rU   r   r   r   r   rY     s   

z%Flux2KleinBaseRoPEInputsStep.__call__rc   r   r   r   r   r   r     s    r   c                   @   s   e Zd ZdZedefddZedee fddZ	edee
 fddZedd
eej defddZedd Ze dededefddZdS )Flux2PrepareImageLatentsStepr2   r   c                 C   r7   )NzUStep that prepares image latents and their position IDs for Flux2 image conditioning.r   r4   r   r   r   r8     r9   z(Flux2PrepareImageLatentsStep.descriptionc                 C   s,   t dttj dt ddtdt ddtdgS )	Nimage_latentsr>   rf   T)ri   r?   rd   r   )r<   r?   )r   r_   rB   rC   rD   r4   r   r   r   rE     s   z#Flux2PrepareImageLatentsStep.inputsc                 C   rk   )Nr   z%Packed image latents for conditioningrF   image_latent_idszPosition IDs for image latentsrm   r4   r   r   r   rG     s   z1Flux2PrepareImageLatentsStep.intermediate_outputs
   r   scalec           
   	      s   t | tstdt|  d fddtdt| D }dd |D }g }t| |D ]%\}}|d}|j	\}}}t
|t|t|td}	||	 q,tj|dd}|d}|S )	ak  
        Generates 4D time-space coordinates (T, H, W, L) for a sequence of image latents.

        Args:
            image_latents: A list of image latent feature tensors of shape (1, C, H, W).
            scale: Factor used to define the time separation between latents.

        Returns:
            Combined coordinate tensor of shape (1, N_total, 4)
        z+Expected `image_latents` to be a list, got rn   c                    s   g | ]}  |  qS r   r   .0ry   r   r   r   
<listcomp>  s    zCFlux2PrepareImageLatentsStep._prepare_image_ids.<locals>.<listcomp>r   c                 S   s   g | ]}| d qS )rr   )viewr   r   r   r   r     s    r   dim)r   r_   r#   typerB   rt   r+   zipsqueezers   ru   r   catrv   )
r   r   t_coordsr   r   ry   rx   r@   rA   x_idsr   r   r   _prepare_image_ids  s   

"
z/Flux2PrepareImageLatentsStep._prepare_image_idsc                 C   r~   r   r   r   r   r   r   r     r   z*Flux2PrepareImageLatentsStep._pack_latentsrH   rI   c                 C   s   |  |}|j}|d u rd |_d |_| || ||fS |j}|j|j }| |}g }|D ]}	| |	}
|
	d}
|
|
 q.tj|dd}|d}||dd}||dd}||}||_||_| || ||fS )Nr   r   r   )rK   r   r   rU   rL   rf   rd   r   r   r   r   rB   r   rv   repeatr   )r5   rH   rI   rV   r   r    rf   r   packed_latentslatentpackedr   r   r   rY     s0   





z%Flux2PrepareImageLatentsStep.__call__N)r   )rZ   r[   r\   r]   r^   r`   r8   r_   r   rE   r   rG   r   rB   rC   rD   r   r   ra   r   r	   rY   r   r   r   r   r     s    
r   c                   @   sl   e Zd ZdZedefddZedee fddZ	edee
 fddZe d	ed
edefddZdS )Flux2PrepareGuidanceStepr2   r   c                 C   r7   )Nz@Step that prepares the guidance scale tensor for Flux2 inferencer   r4   r   r   r   r8   .  r9   z$Flux2PrepareGuidanceStep.descriptionc                 C   s&   t dddt dddt ddtdd	gS )
Nguidance_scaleg      @r;   rd   r   rf   Trg   rh   )r   rD   r4   r   r   r   rE   2  s   

zFlux2PrepareGuidanceStep.inputsc                 C   s   t dtjddgS )NguidancezGuidance scale tensorrF   rm   r4   r   r   r   rG   ?  s   z-Flux2PrepareGuidanceStep.intermediate_outputsrH   rI   c                 C   sX   |  |}|j}|j|j }tjdg|j|tjd}||}||_	| 
|| ||fS )Nr   r   )rK   rL   rf   rd   rB   fullr   float32rw   r   rU   )r5   rH   rI   rV   r    rf   r   r   r   r   rY   E  s   

z!Flux2PrepareGuidanceStep.__call__N)rZ   r[   r\   r]   r^   r`   r8   r_   r   rE   r   rG   rB   ra   r   r	   rY   r   r   r   r   r   +  s    r   )NNNN)$r%   numpyrP   rB   modelsr   
schedulersr   utilsr   utils.torch_utilsr   modular_pipeliner   r	   modular_pipeline_utilsr
   r   r   r   
get_loggerrZ   ro   rD   r   r   r`   r    r_   r0   r1   rb   r   r   r   r   r   r   r   r   <module>   s@   



;O 6Bi