o
    xiA                     @  s   d dl mZ d dlZddlmZ dRddZ				dSdTddZ				dSdTddZdUdd ZdVd!d"Z	dWd%d&Z
dXd*d+Z	dYdZd2d3Ze 	4	5	6	7	8		 					9			d[d\dPdQZdS )]    )annotationsN   )TextToLatentRFDiTseedintdevicetorch.devicereturn$tuple[torch.Generator, torch.device]c                 C  sH   zt j|d| |fW S  ty#   t jdd| t df Y S w )Nr   cpu)torch	Generatormanual_seedRuntimeErrorr   r   r    r   */home/ubuntu/Irodori-TTS/irodori_tts/rf.py	_make_rng   s
    r                 ?MbP?+?
batch_sizemeanfloatstdt_mint_maxtorch.Tensorc                 C  s.   t j| |d| | }t |}|j||dS )Nr   minmax)r   randnsigmoidclamp)r   r   r   r   r   r   ztr   r   r   sample_logit_normal_t   s   
r(   c           	      C  s   | dkrt jd|dS t j| |t jdt j| |d t|  }|dd}t d| d d	 }|| | }t |}|t j	| |d }|j||d
S )z
    Stratified sampling for logit-normal timesteps.

    u ~ stratified U(0, 1), z = mean + std * Phi^{-1}(u), t = sigmoid(z)
    r   )r   r   r   dtypeư>g!?g       @r   g;f?r    )
r   emptyarangefloat32randr   r%   erfinvr$   randperm)	r   r   r   r   r   r   ur&   r'   r   r   r    sample_stratified_logit_normal_t   s   
r3   x0noiser'   c                 C  s0   d|d d d d f  |  |d d d d f |  S )Nr   r   )r4   r5   r'   r   r   r   rf_interpolate:   s   0r6   c                 C  s   ||  S Nr   )r4   r5   r   r   r   rf_velocity_target?   s   r8   x_tv_predc                 C  s   | |d d d d f |  S r7   r   )r9   r:   r'   r   r   r   rf_predict_x0D   s   r;   float | torch.Tensor	rescale_krescale_sigmac           
      C  s   t |tjrt| nt|}|dkr| S d| }|| ||  }t|t| }|| d || t| d  }	|	||  |  | | S )zI
    Temporal score rescaling from https://arxiv.org/pdf/2510.01184.
    r   )
isinstancer   Tensorr   item)
r:   r9   r'   r=   r>   t_valueone_minus_tsnrsigma_sqratior   r   r   temporal_score_rescaleI   s    
 rG   context_kv_cacheClist[tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]]scale
max_layers
int | NoneNonec                 C  s`   |du r	t | }ntdtt|t | }t|D ]}| | \}}}}|| || qdS )zO
    In-place scaling of speaker K/V tensors in precomputed context cache.
    Nr   )lenr"   r!   r   rangemul_)rH   rJ   rK   n_layersi_	k_speaker	v_speakerr   r   r   scale_speaker_kv_cache]   s   

rV   (         @      @independent      ?Tmodelr   text_input_ids	text_mask
ref_latentref_masksequence_length	num_stepscfg_scale_textcfg_scale_speakercfg_guidance_modestr	cfg_min_t	cfg_max_t	cfg_scalefloat | Nonetruncation_factoruse_context_kv_cacheboolspeaker_kv_scalespeaker_kv_max_layersspeaker_kv_min_tc           I   
   C  s  | j }| j}|jd }| jj}t||d\}}tj|||f|||d}||kr-|j|d}|dur7|t	| }|durCt	|}t	|}t
|	  }	|	dvrWtd|	dd	}tjd
d|d |d| }|dk}|dk}|	dk} |	dk}!|	dk}"| j||||d\}#}$}%}&t|#}'t|$}(t|%})t|&}*d}+|#},|$}-|%}.|&}/| r*|r|rd}+tj|#|'|#gdd},tj|$|(|$gdd}-tj|%|%|)gdd}.tj|&|&|*gdd}/nS|rd}+tj|#|'gdd},tj|$|(gdd}-tj|%|%gdd}.tj|&|&gdd}/n)|r*d}+tj|#|#gdd},tj|$|$gdd}-tj|%|)gdd}.tj|&|*gdd}/t|p1|du}0d}1d}2d}3d}4d}5|0r| j|#|%d}1| rW|+dkrW| j|,|.d}2n(|!rh|s`|rg| j|'|)d}5n|"r|ru| j|'|%d}3|r| j|#|)d}4|durt|1t	||d |2durt|2t	||d |3durt|3t	||d |du}6t|D ]}7||7 }8||7d  }9tj|f|8||d}:|dks|dko|
|8   ko|kn  };|;r| rKtj|g|+ dd|}<|:|+}=| j|<|=|,|-|.|/|2d}>|r$|r$|>jddd\}?}@}A|?||?|@   ||?|A   }Bn|r9|>jddd\}?}@|?||?|@   }Bn|>jddd\}?}A|?||?|A   }Bn| j|||:|#|$|%|&|1d}?|!r|ry|rytt	|t	| dkrttdt	|}Cn|rt	|}Cnt	|}C| j|||:|'|(|)|*|5d}D|?|C|?|D   }Bn[|"r|r|r|7d dk}En|}E|Ert	|}F| j|||:|'|(|%|&|3d}Gnt	|}F| j|||:|#|$|)|*|4d}G|?|F|?|G   }Bntd|	 | j|||:|#|$|%|&|1d}B|dur|durt|B||8t	|t	|d}B|6rH|durH|9|k rH|8|krHd
t	| }Ht|1|H|d |2dur:t|2|H|d |3durFt|3|H|d d}6||B|9|8   }q|S )z
    Euler sampling over RF ODE with text+reference conditioning CFG.

    Returns:
      latent sequence in patched space, shape (B, sequence_length, patched_latent_dim)
    r   r   )r   r*   	generatorr   N>   jointalternatingrZ   zUnsupported cfg_guidance_mode=z3. Expected one of: independent, joint, alternating.r   r   r   r   rZ   rr   rs   )r]   r^   r_   r`      )dim   )
text_statespeaker_state)rH   rJ   rK   r)   )r9   r'   rw   r^   rx   speaker_maskrH   r+   zlcfg_guidance_mode='joint' expects a single guidance scale; set equal text/speaker scales or use --cfg-scale.zUnexpected cfg_guidance_mode: )r:   r9   r'   r=   r>   F)r   r*   shapecfgpatched_latent_dimr   r   r#   tor   rf   striplower
ValueErrorlinspaceencode_conditions
zeros_likecatrm   build_context_kv_cacherV   rO   fullrA   repeatforward_with_encoded_conditionschunkabsr   rG   )Ir\   r]   r^   r_   r`   ra   rb   rc   rd   re   rg   rh   r   ri   rk   r=   r>   rl   rn   ro   rp   r   r*   r   
latent_dimrng
rng_devicer9   
init_scale
t_schedulehas_text_cfghas_speaker_cfguse_independent_cfguse_joint_cfguse_alternating_cfgtext_state_condtext_mask_condspeaker_state_condspeaker_mask_condtext_state_uncondtext_mask_uncondspeaker_state_uncondspeaker_mask_uncondcfg_batch_multtext_state_cfgtext_mask_cfgspeaker_state_cfgspeaker_mask_cfgeffective_use_context_kv_cachecontext_kv_condcontext_kv_cfgcontext_kv_uncond_textcontext_kv_uncond_speakercontext_kv_uncond_jointspeaker_kv_activerR   r'   t_nextttuse_cfgx_t_cfgtt_cfgv_outv_condv_uncond_textv_uncond_speakervjoint_scalev_uncond_jointuse_text_uncond	alt_scalev_uncond_alt	inv_scaler   r   r   sample_euler_rf_cfgo   s  















	

	
	
	




r   )r   r   r   r   r	   r
   )r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r	   r   )r4   r   r5   r   r'   r   r	   r   )r4   r   r5   r   r	   r   )r9   r   r:   r   r'   r   r	   r   )r:   r   r9   r   r'   r<   r=   r   r>   r   r	   r   r7   )rH   rI   rJ   r   rK   rL   r	   rM   )rW   rX   rY   rZ   r[   r   r   NNNNTNNN),r\   r   r]   r   r^   r   r_   r   r`   r   ra   r   rb   r   rc   r   rd   r   re   rf   rg   r   rh   r   r   r   ri   rj   rk   rj   r=   rj   r>   rj   rl   rm   rn   rj   ro   rL   rp   rj   r	   r   )
__future__r   r   r\   r   r   r(   r3   r6   r8   r;   rG   rV   inference_moder   r   r   r   r   <module>   sJ    




