o
    i>U                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZ d dlm	Z	 d dl
mZmZmZmZmZmZ G dd deZG d	d
 d
e jZG dd de jZG dd de jZG dd de jZdedefddZdZG dd de jZd,dedefddZedfdedede jfddZedfdedede jfdd Z	!d-dede jfd"d#Z efdede jfd$d%Z!efdede jfd&d'Z"efdede jfd(d)Z#efdede jfd*d+Z$dS ).    N)Path)
NamedTuple)LTXV_LORA_COMFY_RENAMING_MAPLoraPathStrengthAndSDOps)QuantizationPolicy)DEFAULT_IMAGE_CRFDEFAULT_LORA_STRENGTHDEFAULT_NEGATIVE_PROMPTLTX_2_3_HQ_PARAMSLTX_2_3_PARAMSPipelineParamsc                   @   s2   e Zd ZU eed< eed< eed< eZeed< dS )ImageConditioningInputpath	frame_idxstrengthcrfN)	__name__
__module____qualname__str__annotations__intfloatr   r    r   r   I/home/ubuntu/LTX-2/packages/ltx-pipelines/src/ltx_pipelines/utils/args.pyr      s
   
 r   c                   @   :   e Zd Z	d	dejdejdee dedB ddf
ddZdS )
VideoConditioningActionNparser	namespacevaluesoption_stringreturnc           
      C   sH   |\}}t |}t|}t|| jpg }	|	||f t|| j|	 d S N)resolve_pathr   getattrdestappendsetattr)
selfr   r   r   r    r   strength_strresolved_pathr   currentr   r   r   __call__   s   z VideoConditioningAction.__call__r"   	r   r   r   argparseArgumentParser	Namespacelistr   r,   r   r   r   r   r          r   c                   @   s>   e Zd ZdZ	d
dejdejdee dedB ddf
dd	Z	dS )VideoMaskConditioningActionaO  Parse ``--conditioning-attention-mask PATH STRENGTH``.
    Stores a ``(mask_path, strength)`` tuple on the namespace.  The mask video
    should be grayscale with pixel values in [0, 1] controlling per-region
    conditioning attention strength.  The scalar *STRENGTH* is multiplied with
    the spatial mask before it is applied.
    Nr   r   r   r    r!   c                 C   sX   t |dkr| dt | }t| |t|d }t|d }t|| j||f d S )N   z8 requires exactly 2 arguments (MASK_PATH STRENGTH), got r      )lenr.   ArgumentErrorr#   r   r'   r%   )r(   r   r   r   r    msg	mask_pathr   r   r   r   r,   0   s   z$VideoMaskConditioningAction.__call__r"   )
r   r   r   __doc__r.   r/   r0   r1   r   r,   r   r   r   r   r3   (   s    r3   c                   @   r   )
ImageActionNr   r   r   r    r!   c                 C   s   t |dvr| dt | }t| |tt|d t|d t|d t |dkr1t|d ntd}t|| j	p<g }|
| t|| j	| d S )N)      z@ requires 3 or 4 arguments (PATH FRAME_IDX STRENGTH [CRF]), got r   r5   r4   r<   )r   r   r   r   )r6   r.   r7   r   r#   r   r   r   r$   r%   r&   r'   )r(   r   r   r   r    r8   conditioningr+   r   r   r   r,   A   s   



zImageAction.__call__r"   r-   r   r   r   r   r;   @   r2   r;   c                   @   r   )

LoraActionNr   r   r   r    r!   c                 C   s   t |dkr| dt | d}t| ||d }t |dkr$|d ntt}t|}t|}	t|| jp7g }
|
	t
||	t t|| j|
 d S )Nr4   z? accepts at most 2 arguments (PATH and optional STRENGTH), got  valuesr   r5   )r6   r.   r7   r   r   r#   r   r$   r%   r&   r   r   r'   )r(   r   r   r   r    r8   r   r)   r*   r   r+   r   r   r   r,   X   s   zLoraAction.__call__r"   r-   r   r   r   r   r?   W   r2   r?   r   r!   c                 C   s   t t|    S r"   )r   r   
expanduserresolveas_posix)r   r   r   r   r#   n   s   r#   )fp8-castfp8-scaled-mmc                   @   r   )
QuantizationActionNr   r   r   r    r!   c           	      C   s   t |dkr| dt | d}t| ||d }|tvr/d| ddt }t| ||dkrIt |d	krD| d
}t| |t }n|dkr`t |d	krYt|d	 nd }t|}t	|| j
| d S )Nr4   zB accepts at most 2 arguments (POLICY and optional AMAX_PATH), got r@   r   zUnknown quantization policy 'z'. Choose from: , rD   r5   z. fp8-cast does not accept additional argumentsrE   )r6   r.   r7   QUANTIZATION_POLICIESjoinr   fp8_castr#   fp8_scaled_mmr'   r%   )	r(   r   r   r   r    r8   policy_namepolicy	amax_pathr   r   r   r,   v   s"   


zQuantizationAction.__call__r"   r-   r   r   r   r   rF   u   r2   rF   F	distilledc                 C   sD   t jdd}| r
dnd}|j|tdd | \}}| r|jS |jS )zNPre-parse argv to extract the checkpoint path before building the full parser.F)add_help--distilled-checkpoint-path--checkpoint-pathT)typerequired)r.   r/   add_argumentr#   parse_known_argsdistilled_checkpoint_pathcheckpoint_path)rO   preflagknown_r   r   r   detect_checkpoint_path   s
   r]   paramsc                 C   sF  t  }|r|jdtddd n|jdtddd |jdt| jd| j d	d
 |jdtddd |jdtddd |jdtddd |jdt| jd| j d	d
 |jddtddg dt	 dd |jddd dtdtfdd }|jd!|d d"d#d$ |jd%|d&d"d'd$ |jd(d)t
dd*d d+d,t d-d |jd.dd/d0 |S )1NrQ   Tz=Path to LTX-2 distilled model checkpoint (.safetensors file).rS   rT   helprR   z3Path to LTX-2 model checkpoint (.safetensors file).z--num-inference-stepszNumber of denoising steps in the diffusion sampling process. Higher values improve quality but increase generation time (default: ).rS   defaultr`   z--gemma-rootzIPath to the root directory containing the Gemma text encoder model files.z--promptzNText prompt describing the desired video content to be generated by the model.z--output-pathz+Path to the output video file (MP4 format).z--seedz2Random seed for reproducible generation (default: z--loralora+PATHSTRENGTHz^LoRA (Low-Rank Adaptation) model: path to model file and optional strength (default strength: zr). Can be specified multiple times. Example: --lora path/to/lora1.safetensors 0.8 --lora path/to/lora2.safetensorsr%   actionnargsmetavarrc   r`   z--enhance-prompt
store_true)rj   valuer!   c              
   S   sL   zt | }|dk rtd|W S  ty% } z	td|  |d }~ww )Nr5   zmust be >= 1zmust be an integer, got )r   r.   ArgumentTypeError
ValueError)rn   	int_valueer   r   r   _positive_int   s   
z'basic_arg_parser.<locals>._positive_intz--streaming-prefetch-countNzEnable layer streaming prefetching N layers ahead. At most 1 + N layers reside on GPU at once. Must be >= 1. Example: --streaming-prefetch-count 2)rS   rc   rl   r`   z--max-batch-sizer5   a  Maximum batch size per transformer forward pass. Guided denoisers batch up to 4 guidance passes into a single call. Default 1 runs passes sequentially. Set to 4 to batch all passes together, which reduces layer-streaming PCIe transfers. Example: --max-batch-size 4z--quantizationquantization)POLICY	AMAX_PATHzQuantization policy: rG   z. fp8-cast uses FP8 casting with upcasting during inference. fp8-scaled-mm uses FP8 scaled matrix multiplication (optionally provide amax calibration file path). Example: --quantization fp8-cast or --quantization fp8-scaled-mm /path/to/amax.jsonz	--compilezDEnable torch.compile for transformer blocks to optimize performance.)rj   r`   )r.   r/   rU   r#   r   num_inference_stepsr   seedr?   r   rF   rI   rH   )r^   rO   r   rs   r   r   r   basic_arg_parser   s   	
rz   c              
   C   s   t | |d}|jdt| jd| j dd |jdt| jd| j dd |jdt| jd	| j dd |jd
t| jd| j dd |jddtddg dt	 dd |S )Nr^   rO   --heightz2Video height in pixels, divisible by 32 (default: ra   rb   --widthzLWidth of the generated video in pixels, should be divisible by 32 (default: z--num-frameszNumber of frames to generate in the output video sequence, num-frames = (8 x K) + 1, where k is a non-negative integer (default: z--frame-ratez2Frame rate of the generated video (fps) (default: z--imageimagesre   ARGzImage conditioning input: PATH FRAME_IDX STRENGTH [CRF]. PATH is the image file, FRAME_IDX is the target frame index, STRENGTH is the conditioning strength (all three required). CRF is the optional H.264 compression quality (0=lossless, default: zr). Can be specified multiple times. Example: --image path/to/image1.jpg 0 0.8 --image path/to/image2.jpg 160 0.9 0ri   )
rz   rU   r   stage_1_heightstage_1_width
num_framesr   
frame_rater;   r   )r^   rO   r   r   r   r   new_video_gen_arg_parser  sN   r   Tc                 C   sD   t | d}|jdtddd |jdtddd |jdtdd	d |S )
a6  Base argument parser for video-editing pipelines (retake, extension, inpainting, sticker movement).
    Uses the same actions and conventions as basic_arg_parser but only the args needed for editing
    (no height/width/num-frames; resolution comes from input video). Default is distilled checkpoint only.
    )rO   z--video-pathTzPath to the source video.r_   z--start-timez+Start time of the region to regenerate (s).z
--end-timez)End time of the region to regenerate (s).)rz   rU   r#   r   )rO   r   r   r   r   video_editing_arg_parserM  s
   
r   c                 C   s  | j }| j}t| d}|jdttdd |jdt|jd|j dd |jdt|jd	|j dd |jd
t|j	d|j	 dd |jdt
d|jd|j dd |jdt|jd|j dd |jdt
|jd|j dd |jdt|jd|j dd |jdt|jd|j dd |jdt|j	d|j	 dd |jdt
d|jd|j dd |jdt|jd|j dd |jdt
|jd |j dd |S )!Nr^   z--negative-promptzNegative prompt describing what should not appear in the generated video, used to guide the diffusion process away from unwanted content. Default: a comprehensive negative prompt covering common artifacts and quality issues.rb   z--video-cfg-guidance-scalezClassifier-free guidance (CFG) scale controlling how strongly the model adheres to the video prompt. Higher values increase prompt adherence but may reduce diversity. 1.0 means no effect (default: ra   z--video-stg-guidance-scalezSTG (Spatio-Temporal Guidance) scale controlling how strongly the model reacts to the perturbation of the video modality. Higher values increase the effect but may reduce quality. 0.0 means no effect (default: z--video-rescale-scalezRescale scale controlling how strongly the model rescales the video modality after applying other guidance. Higher values tend to decrease oversaturation effects. 0.0 means no effect (default: z--video-stg-blocks*z6Which transformer blocks to perturb for STG. Default: .)rS   rk   rc   r`   z--a2v-guidance-scalezA2V (Audio-to-Video) guidance scale controlling how strongly the model reacts to the perturbation of the audio-to-video cross-attention. Higher values may increase lipsync quality. 1.0 means no effect (default: z--video-skip-stepzVideo skip step N controls periodic skipping during the video diffusion process: only steps where step_index % (N + 1) == 0 are processed, all others are skipped (e.g., 0 = no skipping; 1 = skip every other step; 2 = skip 2 of every 3 steps; default: z--audio-cfg-guidance-scalezAudio CFG (Classifier-free guidance) scale controlling how strongly the model adheres to the audio prompt. Higher values increase prompt adherence but may reduce diversity. 1.0 means no effect (default: z--audio-stg-guidance-scalezAudio STG (Spatio-Temporal Guidance) scale controlling how strongly the model reacts to the perturbation of the audio modality. Higher values increase the effect but may reduce quality. 0.0 means no effect (default: z--audio-rescale-scalezAudio rescale scale controlling how strongly the model rescales the audio modality after applying other guidance. Experimental. 0.0 means no effect (default: z--audio-stg-blocksz<Which transformer blocks to perturb for Audio STG. Default: z--v2a-guidance-scalezV2A (Video-to-Audio) guidance scale controlling how strongly the model reacts to the perturbation of the video-to-audio cross-attention. Higher values may increase lipsync quality. 1.0 means no effect (default: z--audio-skip-stepzAudio skip step N controls periodic skipping during the audio diffusion process: only steps where step_index % (N + 1) == 0 are processed, all others are skipped (e.g., 0 = no skipping; 1 = skip every other step; 2 = skip 2 of every 3 steps; default: )video_guider_paramsaudio_guider_paramsr   rU   r   r	   r   	cfg_scale	stg_scalerescale_scaler   
stg_blocksmodality_scale	skip_step)r^   video_guideraudio_guiderr   r   r   r   default_1_stage_arg_parser[  s   





r   c              
   C   s   t | d}|j| j| jd |jD ]}d|jv r d| j d|_d|jv r-d| j d|_q|jdd	td
dddt	 dd |jdt
ddd |S )Nr   heightwidthr|   MHeight of the generated video in pixels, should be divisible by 64 (default: ra   r}   LWidth of the generated video in pixels, should be divisible by 64 (default: z--distilled-loradistilled_lorare   rf   TzDistilled LoRA (Low-Rank Adaptation) model used in the second stage (upscaling and refinement): path to model file and optional strength (default strength: a5  ). The second stage upsamples the video by 2x resolution and refines it using a distilled denoising schedule (fewer steps, no CFG). The distilled LoRA is specifically trained for this refinement process to improve quality at higher resolutions. Example: --distilled-lora path/to/distilled_lora.safetensors 0.8)r%   rj   rk   rl   rT   r`   --spatial-upsampler-pathoPath to the spatial upsampler model used to increase the resolution of the generated video in the latent space.r_   )r   set_defaultsstage_2_heightstage_2_width_actionsoption_stringsr`   rU   r?   r   r#   r^   r   rj   r   r   r   default_2_stage_arg_parser  s<   



	r   c                 C   sB   t | d}|jdtddd dd |jdtdd	d dd |S )
Nr   z!--distilled-lora-strength-stage-1g      ?zAStrength of the distilled LoRA used in the first stage (default: ra   rb   z!--distilled-lora-strength-stage-2g      ?zBStrength of the distilled LoRA used in the second stage (default: )r   rU   r   )r^   r   r   r   r   hq_2_stage_arg_parser  s   


r   c                 C   st   t | dd}|j| j| jd |jD ]}d|jv r!d| j d|_d|jv r.d| j d|_q|jd	tdd
d |S )NTr{   r   r|   r   ra   r}   r   r   r   r_   )	r   r   r   r   r   r   r`   rU   r#   r   r   r   r   $default_2_stage_distilled_arg_parser  s&   


	r   )F)T)%r.   pathlibr   typingr   ltx_core.loaderr   r   ltx_core.quantizationr   ltx_pipelines.utils.constantsr   r   r	   r
   r   r   r   Actionr   r3   r;   r?   r   r#   rH   rF   boolr]   r/   rz   r   r   r   r   r   r   r   r   r   r   <module>   sX     



3
 
*