o
    iL                  "   @   s~  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ d dlmZmZ e eZdedB dejdB dedejdededB fddZdejdedB dedB dededede edB edB f fddZ!	d9dejdedB dedB dededede"de edB edB f fddZ#dejdejfd d!Z$dejd"ej%dejfd#d$Z&	%	&d:ded'ejd(ejd)ej%d*eejej%gejf dedejded+e'd,e"dejfd-d.Z(d/dd&d0d1e&ej)d0fdejdedB dedB dededed2ed3edB d,e"d4e'd5ed*eejej%gejf d6ej*d+e'de edB edB f fd7d8Z+dS );    N)replace)partial)Callable)tqdm)Res2sDiffusionStep)DiffusionStepProtocol)X0Model)to_denoisedto_velocity)post_process_latenttimesteps_from_mask)get_res2s_coefficients)DenoiserLatentStatestatedenoisedsteppersigmasstep_idxreturnc                 C   s>   | du s|du r
| S t || j| j}t| || j|||dS )zOAdvance one diffusion step for a single modality, or return ``None`` if absent.Nlatent)r   denoise_maskclean_latentr   stepr   )r   r   r   r   r    r   M/home/ubuntu/LTX-2/packages/ltx-pipelines/src/ltx_pipelines/utils/samplers.py_step_state   s   r   video_stateaudio_statetransformerdenoiserc           
      C   sZ   t t| dd D ]\}}||||| |\}}	t|||| |}t||	|| |}q
||fS )a  
    Perform the joint audio-video denoising loop over a diffusion schedule.
    Either ``video_state`` or ``audio_state`` may be ``None`` for absent
    modalities; the absent modality is passed through unchanged.
    ### Parameters
    sigmas:
        A 1D tensor of noise levels (diffusion sigmas) defining the sampling
        schedule. All steps except the last element are iterated over.
    video_state:
        The current video :class:`LatentState`, or ``None`` if video is absent.
    audio_state:
        The current audio :class:`LatentState`, or ``None`` if audio is absent.
    stepper:
        An implementation of :class:`DiffusionStepProtocol` that updates a
        latent given the current latent, its denoised estimate, the full
        ``sigmas`` schedule, and the current step index.
    transformer:
        The diffusion model passed to the denoiser at each step.
    denoiser:
        A callable implementing :class:`Denoiser`. It is invoked as
        ``denoiser(transformer, video_state, audio_state, sigmas, step_index)``
        and must return ``(denoised_video, denoised_audio)``.
    ### Returns
    tuple[LatentState | None, LatentState | None]
        Final ``(video_state, audio_state)`` after the denoising loop.
    N)	enumerater   r   )
r   r   r   r   r    r!   r   _denoised_videodenoised_audior   r   r   euler_denoising_loop"   s
   "r'          @ge_gammac                    s  d}d}dt jdt jdtdt jdB dtt jt jf f
 fdd}	tt| dd	 D ]\}
}||||| |
\}}|durH|durHt||j|j}|durX|durXt||j|j}| |
d
  dkr|durn|durnt	||d}|dur||dur|t	||d}||f  S |dur|dur|	|j
|| |
 |\}}t	|||j
|| |
d}|dur|dur|	|j
|| |
 |\}}t	|||j
|| |
d}q*||fS )a  
    Perform the joint audio-video denoising loop using gradient-estimation sampling.
    Same interface as :func:`euler_denoising_loop` with an additional
    ``ge_gamma`` parameter for velocity correction.
    ### Parameters
    ge_gamma:
        Gradient estimation coefficient controlling the velocity correction term.
        Default is 2.0. Paper: https://openreview.net/pdf?id=o2ND9v0CeK
    ### Returns
    tuple[LatentState | None, LatentState | None]
        See :func:`euler_denoising_loop` for return value description.
    Nnoisy_sampledenoised_samplesigmaprevious_velocityr   c                    s<   t | ||}|d ur|| } | | }t| ||}||fS )N)r
   r	   )r*   r+   r,   r-   current_velocitydelta_vtotal_velocityr)   r   r   update_velocity_and_samplef   s   zLgradient_estimating_euler_denoising_loop.<locals>.update_velocity_and_sampler"      r   r   )torchTensorfloattupler#   r   r   r   r   r   r   r   )r   r   r   r   r    r!   r)   previous_audio_velocityprevious_video_velocityr2   r   r$   r%   r&   r   r1   r   (gradient_estimating_euler_denoising_loopM   sR   
r:   xc                 C   s$   |  | jddd| jdddS )N)r"   T)dimkeepdim)sub_meandiv_std)r;   r   r   r   _channelwise_normalize   s   $rC   	generatorc                 C   s4   t j| j|t j|jd}||  |  }t|S )N)rD   dtypedevice)r4   randnshapefloat64rF   r@   rB   rC   )r;   rD   noiser   r   r   _get_new_noise   s   rK   F      ?sampler+   step_noise_generatornew_noise_fnlegacy_modeetac
                 C   s   |  }
|| j|}|s/t| j |
|  }t| j |
|d   }t||g}d}|j||||||	d}|rDt|| j| j	}|S )Nr3   r   )rM   r+   r   
step_indexrJ   rQ   )
cloner   r   r   doubler4   stackr   r   r   )r   rM   r+   rN   rO   r   r   r   rP   rQ   sigmas_copy	new_noise	timestepsnext_timestepsx_nextr   r   r   _inject_sde_noise   s$   	r[   r"   Td   
noise_seednoise_seed_substepbongmathbongmath_max_itermodel_dtypec           2   	   C   s  |p|}|du rt d|jj}|du r|d }tj|d|}tj|d|}tt|||d}t|||d}t||dd}t|t	sIt dt
| d	 }| d
 dkrjtj| dd
 tjddg| jdgdd} t| d	d   | dd
     }i }d}tt|D ]}| |  }| |d	   }|dur|j  nd}|dur|j  nd}||||| |\}}|dur|durt||j|j}|dur|durt||j|j}||  } t| ||\}!}"}#t|| }$|dur|dur| | }%| | |! |%  }&nd}%d}&|dur1|dur1| | }'| | |! |'  }(nd}'d}(|&durM|durM||||&t||$gdd}&|(dure|dure||||(t||$gdd}(|	r| dk r|dkrt|
D ]3})|&dur|%dur|&| |! |%  }| | }%|(dur|'dur|(| |! |'  }| | }'qv|dur|&durt||&|dnd}*|dur|(durt||(|dnd}+|||*|+t|$g| jdd\},}-|dur|,durt|,|j|j},|dur|-durt|-|j|j}-|dur-|%dur-|,dur-|, | }.|| |"|% |#|.    }/nd}/|durQ|'durQ|-durQ|- | }0|| |"|' |#|0    }1nd}1|/durf|durf||||/| |d}/|1dury|dury||||1| |d}1|dur|/durt||/|d}|dur|1durt||1|d}q| d
 dkr||||| |\}}|dur|durt||j|j}t|||d}|dur|durt||j|j}t|||d}||fS )a  
    Joint audio-video denoising loop using the res_2s second-order sampler.
    Iterates over the diffusion schedule with a two-stage Runge-Kutta step:
    evaluates the denoiser at the current point and at a midpoint (with SDE
    noise), then combines both with RK coefficients. Supports anchor-point
    refinement (bong iteration) and optional SDE noise injection. Requires
    :class:`Res2sDiffusionStep` as ``stepper``.
    Either modality may be ``None`` (absent).
    ### Parameters
    transformer:
        The diffusion model passed to the denoiser at each step.
    denoiser:
        Callable implementing :class:`Denoiser`.
    noise_seed:
        Seed for step-level SDE noise; substep seed defaults to ``noise_seed + 10000``.
    noise_seed_substep:
        Optional seed for substep SDE noise; if None, derived from ``noise_seed``.
    eta:
        Controls stochastic noise injection strength (0=deterministic, 1=maximum).
        Applies to main diffusion steps; substeps always use 0.5. Default 0.5.
    bongmath:
        Whether to run iterative anchor refinement (bong iteration) when step size is small.
    bongmath_max_iter:
        Max iterations for bong refinement when enabled.
    new_noise_fn:
        Callable ``(latent, generator) -> noise`` for SDE injection.
    model_dtype:
        Dtype for latent state updates (e.g. bfloat16).
    ### Returns
    tuple[LatentState | None, LatentState | None]
        Final ``(video_state, audio_state)`` after the denoising loop.
    Nz;At least one of video_state or audio_state must be providedi'  )rF   )r   rO   rP   )rN   rQ   rL   z1stepper must be an instance of Res2sDiffusionStepr3   r"   r   g/nR?g        )r=   )r   rM   r+   r   r   gQ?r   )r   r   r   rR   )
ValueErrorr   rF   r4   	Generatormanual_seedr   r[   
isinstancer   lencattensorlogrT   cpur   rangerS   r   r   r   itemr   sqrtrU   r   to)2r   r   r   r   r    r!   r]   r^   rQ   r_   r`   rO   ra   rP   present_statestate_devicerN   substep_noise_generatorsde_noise_injecting_fnstep_noise_injecting_fnsubstep_noise_injecting_fnn_full_stepshs	phi_cachec2r   r,   
sigma_nextx_anchor_videox_anchor_audiodenoised_video_1denoised_audio_1ha21b1b2	sub_sigmaeps_1_videox_mid_videoeps_1_audiox_mid_audior$   mid_video_statemid_audio_statedenoised_video_2denoised_audio_2eps_2_videox_next_videoeps_2_audiox_next_audior   r   r    res2s_audio_video_denoising_loop   s   1
*0
	r   )r(   )FrL   ),loggingdataclassesr   	functoolsr   typingr   r4   r   #ltx_core.components.diffusion_stepsr   ltx_core.components.protocolsr   ltx_core.model.transformerr   ltx_core.utilsr	   r
   ltx_pipelines.utils.helpersr   r   ltx_pipelines.utils.res2sr   ltx_pipelines.utils.typesr   r   	getLogger__name__loggerr5   intr   r7   r'   r6   r:   rC   rc   rK   boolr[   bfloat16rE   r   r   r   r   r   <module>   s    


2
E	

)	
