o
    iv[                  
   @   s  d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZmZmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZ d dlmZmZmZ d dlmZmZmZm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0m1Z1m2Z2 d dl3m4Z4m5Z5m6Z6 d dl7m8Z8 G dd dZ9e: d"ddZ;de<de=de=de=dej>f
ddZ?de<de=fdd Z@eAd!kre;  dS dS )#    N)Iterator)	rearrange)	safe_open)GaussianNoiser)ConditioningItem(ConditioningItemAttentionStrengthWrapperVideoConditionByReferenceLatent)LoraPathStrengthAndSDOps)Registry)TilingConfigVideoEncoderget_video_chunks_number)QuantizationPolicy)AudioVideoLatentShapeVideoPixelShape)ImageConditioningInputVideoConditioningActionVideoMaskConditioningAction$default_2_stage_distilled_arg_parserdetect_checkpoint_path)AudioDecoderDiffusionStageImageConditionerPromptEncoderVideoDecoderVideoUpsampler)DISTILLED_SIGMA_VALUESSTAGE_2_DISTILLED_SIGMA_VALUESdetect_params)SimpleDenoiser)assert_resolutioncombined_image_conditionings
get_device)decode_video_by_frameencode_videovideo_preprocess)ModalitySpecc                !   @   sB  e Zd ZdZ				d'dedededee dejdB d	e	dB d
e
dB defddZ						d(dededededededee deeeef  dededB dededejdB dedB deeej ef fddZ		d)dee deeeef  dededed ededejdB dee fd!d"Zed#ejd$edejfd%d&ZdS )*ICLoraPipelinea  
    Two-stage video generation pipeline with In-Context (IC) LoRA support.
    Allows conditioning the generated video on control signals such as depth maps,
    human pose, or image edges via the video_conditioning parameter.
    The specific IC-LoRA model should be provided via the loras parameter.
    Stage 1 generates video at half of the target resolution, then Stage 2 upsamples
    by 2x and refines with additional denoising steps for higher quality output.
    Both stages use distilled models for efficiency.
    NFdistilled_checkpoint_pathspatial_upsampler_path
gemma_rootlorasdevicequantizationregistrytorch_compilec	              	   C   s  |pt  | _tj| _t||| j| j|d| _t|| j| j|d| _t	|| j| jt
||||d| _t	|| j| jd|||d| _t||| j| j|d| _t|| j| j|d| _t|| j| j|d| _d| _|D ]%}	t|	j}
|
dkr| jd|
fvrtd| j d|	j d|
 d|
| _qfd S )	N)r/   )r,   r.   r/   r0       zEConflicting reference_downscale_factor values in LoRAs: already have z, but z specifies z7. Cannot combine LoRAs with different reference scales.)r#   r-   torchbfloat16dtyper   prompt_encoderr   image_conditionerr   tuplestage_1stage_2r   	upsamplerr   video_decoderr   audio_decoderreference_downscale_factor%_read_lora_reference_downscale_factorpath
ValueError)selfr)   r*   r+   r,   r-   r.   r/   r0   lorascaler1   r1   F/home/ubuntu/LTX-2/packages/ltx-pipelines/src/ltx_pipelines/ic_lora.py__init__8   s\   		
zICLoraPipeline.__init__      ?promptseedheightwidth
num_frames
frame_rateimagesvideo_conditioningenhance_prompttiling_configconditioning_attention_strengthskip_stage_2conditioning_attention_maskstreaming_prefetch_countreturnc                    s  t ||dd d  krdksn td tjjd|}t|d}j|g|	tdkr9d d nd	||d
\}|j	|j
}}td|d |d |d fdd}ttj}jt||||jj|t||dt|d|d
\}}|rtd |j|
|}|j}||fS |jd	d }ttj}td|||dfdd}jt|||||||t|||d  |dt||d  |jd|d
\}}|j|
|}|j}||fS )a0  
        Generate video with IC-LoRA conditioning.
        Args:
            prompt: Text prompt for video generation.
            seed: Random seed for reproducibility.
            height: Output video height in pixels (must be divisible by 64).
            width: Output video width in pixels (must be divisible by 64).
            num_frames: Number of frames to generate.
            frame_rate: Output video frame rate.
            images: List of (path, frame_idx, strength) tuples for image conditioning.
            video_conditioning: List of (path, strength) tuples for IC-LoRA video conditioning.
            enhance_prompt: Whether to enhance the prompt using the text encoder.
            tiling_config: Optional tiling configuration for VAE decoding.
            conditioning_attention_strength: Scale factor for IC-LoRA conditioning attention.
                Controls how strongly the conditioning video influences the output.
                0.0 = ignore conditioning, 1.0 = full conditioning influence. Default 1.0.
                When conditioning_attention_mask is provided, the mask is multiplied by
                this strength before being passed to the conditioning items.
            skip_stage_2: If True, skip Stage 2 upsampling and refinement. Output will be
                at half resolution (height//2, width//2). Default is False.
            conditioning_attention_mask: Optional pixel-space attention mask with the same
                spatial-temporal dimensions as the input reference video. Shape should be
                (B, 1, F, H, W) or (1, 1, F, H, W) where F, H, W match the reference
                video's pixel dimensions. Values in [0, 1].
                The mask is downsampled to latent space using VAE scale factors (with
                causal temporal handling for the first frame), then multiplied by
                conditioning_attention_strength.
                When None (default): scalar conditioning_attention_strength is used
                directly.
        Returns:
            Tuple of (video_iterator, audio_tensor).
        T)rJ   rK   is_two_stage        rG   z;conditioning_attention_strength must be in [0.0, 1.0], got )r-   )	generatorr   N)enhance_first_promptenhance_prompt_imageenhance_prompt_seedrU   r2      )batchframesrK   rJ   fpsc              
      s   j jj|  dS )N)rN   rO   rJ   rK   video_encoderrL   rR   rT   )_create_conditioningsrJ   rK   enc)rT   rR   rN   rL   rB   stage_1_output_shaperO   r1   rE   <lambda>   s    z)ICLoraPipeline.__call__.<locals>.<lambda>)contextconditionings)rg   )
denoisersigmasnoiserrK   rJ   r_   r`   videoaudiorU   z3[IC-LoRA] Skipping Stage 2 (--skip-stage-2 enabled)c                    s   t  jj| jjdS )NrN   rJ   rK   ra   r5   r-   )r"   rJ   rK   r5   r-   rc   )rN   rB   stage_2_output_shaper1   rE   rf      s    )rg   rh   noise_scaleinitial_latent)rg   rp   rq   )r!   rA   r3   	Generatorr-   manual_seedr   r6   lenvideo_encodingaudio_encodingr   r7   Tensorr   tor9   r    rK   rJ   r'   logginginfor<   latentr=   r;   r   r:   item)rB   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rY   rk   ctx_pvideo_contextaudio_contextstage_1_conditioningsstage_1_sigmasvideo_stateaudio_statedecoded_videodecoded_audioupscaled_video_latentdistilled_sigmasstage_2_conditioningsr1   )rT   rR   rN   rL   rB   re   ro   rO   rE   __call__q   s   1
	




zICLoraPipeline.__call__ra   c	                 C   s,  t ||||| j| jd}	| j}
|
dkr-||
 dks||
 dkr-td| d| d|
 d||
 }||
 }|D ]O\}}t||| jd}t|||| j| j}||}t|j	}|d	urg| j
||d
}|| }n	|dk rn|}nd	}t||
|d}|d	urt||d}|	| q7|rtdt| d |	S )a!  
        Create conditioning items for video generation.
        Args:
            conditioning_attention_strength: Scalar attention weight in [0, 1].
                If conditioning_attention_mask is also provided, the downsampled mask
                is multiplied by this strength. Otherwise this scalar is passed
                directly as the attention mask.
            conditioning_attention_mask: Optional pixel-space attention mask with shape
                (B, 1, F_pixel, H_pixel, W_pixel) matching the reference video's
                pixel dimensions. Downsampled to latent space with causal temporal
                handling, then multiplied by conditioning_attention_strength.
        Returns:
            List of conditioning items. IC-LoRA conditionings are appended last.
        rn   r2   r   zOutput dimensions (xz3) must be divisible by reference_downscale_factor ()r@   	frame_capr-   N)masktarget_latent_shaperG   )r{   downscale_factorstrength)attention_maskz[IC-LoRA] Added z video conditioning(s))r"   r5   r-   r>   rA   r$   r&   r   from_torch_shapeshape_downsample_mask_to_latentr   r   appendry   rz   rt   )rB   rN   rO   rJ   rK   rL   ra   rR   rT   rh   rD   
ref_height	ref_width
video_pathr   	frame_genrl   encoded_videoreference_video_shapelatent_mask	attn_maskcondr1   r1   rE   rb     sP    
z$ICLoraPipeline._create_conditioningsr   r   c                 C   s   | j d }|j}|j}|j}| j d }tjjjt| d||fdd}t|d|d}|dddddd	ddddf }|d	kr|d	kr|d	 |d	  }	|d	 |d	  dksaJ d
| d| dt|ddddd	dddddf d|	d}
|
j	dd}
tj
||
gdd}n|}t|dS )a  
        Downsample a pixel-space mask to latent space using VAE scale factors.
        Handles causal temporal downsampling: the first frame is kept separately
        (temporal scale factor = 1 for the first frame), while the remaining
        frames are downsampled by the VAE's temporal scale factor.
        Args:
            mask: Pixel-space mask of shape (B, 1, F_pixel, H_pixel, W_pixel).
                Values in [0, 1].
            target_latent_shape: Expected latent shape after VAE encoding.
                Used to determine the target (F_latent, H_latent, W_latent).
        Returns:
            Flattened latent-space mask of shape (B, F_lat * H_lat * W_lat),
            matching the patchifier's token ordering (f, h, w).
        r   r]   zb 1 f h w -> (b f) 1 h warea)sizemodez(b f) 1 h w -> b 1 f h w)bNr2   zPixel frames (z%) not compatible with latent frames (z/): (f_pix - 1) must be divisible by (f_lat - 1)zb 1 (f t) h w -> b 1 f t h w)t   )dimzb 1 f h w -> b (f h w))r   r_   rJ   rK   r3   nn
functionalinterpolater   meancat)r   r   r   f_lath_latw_latf_pixspatial_downfirst_framer   restr   r1   r1   rE   r   _  s,   

&0
z)ICLoraPipeline._downsample_mask_to_latent)NNNF)FNrG   FNN)rG   N)__name__
__module____qualname____doc__strlistr	   r3   r-   r   r
   boolrF   intfloatr   r8   r   rw   r   r   r   r   r   rb   staticmethodr   r   r1   r1   r1   rE   r(   -   s    	
C	

 (	

Or(   rV   c                  C   sB  t  t j tdd} t| }t|d}|jdtdddd |jdt	dd	d d
d |jdddd |
 }d }d}|jd urV|j\}}|}t||jd |jd |jd}t|j|j|j|jret|jnd|j|jd}t }	t|j|	}
||j|j|j|j|j|j|j|j|	||j ||j!d\}}t"||j||j#|
d d S )NT)	distilled)paramsz--video-conditioningr]   )PATHSTRENGTH)actionnargsmetavarrequiredz--conditioning-attention-mask)	MASK_PATHr   a  Optional spatial attention mask: path to a grayscale mask video and attention strength. The mask video pixel values in [0,1] control per-region conditioning attention strength. The strength scalar is multiplied with the spatial mask. 0.0 = ignore IC-LoRA conditioning, 1.0 = full conditioning influence. When not provided, full conditioning strength (1.0) is used. Example: --conditioning-attention-mask path/to/mask.mp4 0.5)r   r   r   defaulthelpz--skip-stage-2
store_truezSkip Stage 2 upsampling and refinement. Output will be at half resolution (height//2, width//2). Useful for faster iteration or when GPU memory is limited.)r   r   rG   )	mask_pathrJ   rK   rL   r1   )r)   r*   r+   r,   r.   r0   )rH   rI   rJ   rK   rL   rM   rN   rO   rQ   rR   rS   rT   rU   )rl   r`   rm   output_pathvideo_chunks_number)$ry   	getLoggersetLevelINFOr   r   r   add_argumentr   r   
parse_argsrT   _load_mask_videorJ   rK   rL   r(   r)   r*   r+   rC   r8   r.   compiler   r   r   rH   rI   rM   rN   rO   rS   rU   r%   r   )checkpoint_pathr   parserargsrT   rR   r   mask_strengthpipelinerQ   r   rl   rm   r1   r1   rE   main  s   





r   r   rJ   rK   rL   c                 C   sL   t  }t| ||d}t|||tj|}|jddd}|d d }|ddS )a  Load a mask video and return a pixel-space tensor of shape (1, 1, F, H, W).
    The mask video is loaded, resized to (height, width), converted to
    grayscale, and normalised to [0, 1].
    Args:
        mask_path: Path to the mask video file.
        height: Target height in pixels.
        width: Target width in pixels.
        num_frames: Maximum number of frames to load.
    Returns:
        Tensor of shape ``(1, 1, F, H, W)`` with values in ``[0, 1]``.
    r   r2   T)r   keepdimrG   g       @rX   )r#   r$   r&   r3   r4   r   clamp)r   rJ   rK   rL   r-   r   
mask_videor   r1   r1   rE   r     s   r   	lora_pathc              
   C   s   z(t | dd}| pi }t|ddW  d   W S 1 s!w   Y  W dS  tyF } ztd|  d|  W Y d}~dS d}~ww )a]  Read reference_downscale_factor from LoRA safetensors metadata.
    Some IC-LoRA models are trained with reference videos at lower resolution than
    the target output. This allows for more efficient training and can improve
    generalization. The downscale factor indicates the ratio between target and
    reference resolutions (e.g., factor=2 means reference is half the resolution).
    Args:
        lora_path: Path to the LoRA .safetensors file
    Returns:
        The reference downscale factor (1 if not specified in metadata, meaning
        reference and target have the same resolution)
    pt)	frameworkr>   r2   Nz(Failed to read metadata from LoRA file 'z': )r   metadatar   get	Exceptionry   warning)r   fr   er1   r1   rE   r?     s   (r?   __main__)rV   N)Bry   collections.abcr   r3   einopsr   safetensorsr   ltx_core.components.noisersr   ltx_core.conditioningr   r   r   ltx_core.loaderr	   ltx_core.loader.registryr
   ltx_core.model.video_vaer   r   r   ltx_core.quantizationr   ltx_core.typesr   r   r   ltx_pipelines.utils.argsr   r   r   r   r   ltx_pipelines.utils.blocksr   r   r   r   r   r   ltx_pipelines.utils.constantsr   r   r   ltx_pipelines.utils.denoisersr    ltx_pipelines.utils.helpersr!   r"   r#   ltx_pipelines.utils.media_ior$   r%   r&   ltx_pipelines.utils.typesr'   r(   inference_moder   r   r   rw   r   r?   r   r1   r1   r1   rE   <module>   sN       jV

