o
    xi[                     @  s,  d dl mZ d dlmZ d dlZd dlmZ d dlm  mZ	 ddl
mZ d1d2ddZd3ddZd4ddZG dd dejZG dd dejZd5dd ZG d!d" d"ejZG d#d$ d$ejZG d%d& d&ejZG d'd( d(ejZG d)d* d*ejZG d+d, d,ejZG d-d. d.ejZG d/d0 d0ejZdS )6    )annotations)asdictN   )ModelConfig     @dimintendthetafloatreturntorch.Tensorc                 C  sT   d|t jd| dt jd|    }t j|t jd}t ||}t t |t |S )N      ?r      dtype)torcharangefloat32outercomplexcossin)r   r	   r
   freqst r   -/home/ubuntu/Irodori-TTS/irodori_tts/model.pyprecompute_freqs_cis   s    r   x	freqs_cisc                 C  sb   t |  jg | jd d ddR  }||d d d d d d f  }t || }|| S )N   r   )r   view_as_complexr   reshapeshapeview_as_real
reshape_astype_as)r   r   x_r   r   r   apply_rotary_emb   s   ,
r)   timestepc              
   C  s   |d dksJ |d }dt t t jd| jt jd t j|| jt jd |  }| d d d f  |d d d f  }t jt 	|t 
|gdd| jS )Nr   r   g     @@r   )devicer   r!   r   )r   explogtensorr+   r   r   r   catr   r   tor   )r*   r   halfr   argsr   r   r   get_timestep_embedding   s   $&r4   c                      s*   e Zd Zdd fddZdddZ  ZS )RMSNormư>r   int | tuple[int, ...]epsr   c                   s6   t    t|tr|f}tt|| _|| _	d S N)
super__init__
isinstancer   nn	Parameterr   onesweightr8   )selfr   r8   	__class__r   r   r;   (   s
   


zRMSNorm.__init__r   r   r   c                 C  s@   |j }| }|t|| jddd| j  }|| j |S )Nr!   Tr   keepdim)r   r   r   rsqrtmeanr8   r@   r1   )rA   r   x_dtyper   r   r   forward/   s   "zRMSNorm.forward)r6   )r   r7   r8   r   r   r   r   r   __name__
__module____qualname__r;   rI   __classcell__r   r   rB   r   r5   '   s    r5   c                      s,   e Zd ZdZd fddZdddZ  ZS )LowRankAdaLNz`
    Echo-style low-rank AdaLN that returns both modulated activations and a residual gate.
    	model_dimr   rankr8   r   c                   s   t    tdtt|t|}|| _tj||dd| _tj||dd| _	tj||dd| _
tj||dd| _tj||dd| _tj||dd| _d S )Nr   FbiasT)r:   r;   maxminr   r8   r=   Linear
shift_down
scale_down	gate_downshift_upscale_upgate_up)rA   rQ   rR   r8   rB   r   r   r;   ;   s   
zLowRankAdaLN.__init__r   r   
cond_embedr   !tuple[torch.Tensor, torch.Tensor]c                 C  s   |j ddd\}}}| | t|| }| | t|| }| | t|| }|j	}|
 }|t|| jddd| j  }|d|  | }t|}|||fS )Nr    r!   r,   TrD   r   )chunkr[   rX   Fsilur\   rY   r]   rZ   r   r   r   rF   rG   r8   tanhr1   )rA   r   r^   shiftscalegaterH   r   r   r   rI   F   s   "
zLowRankAdaLN.forward)rQ   r   rR   r   r8   r   )r   r   r^   r   r   r_   )rL   rM   rN   __doc__r;   rI   rO   r   r   rB   r   rP   6   s    rP   seqmask
patch_sizer_   c                 C  s  |dkr| |fS | j dks|j dkr"tdt| j dt|j | jd |jd ks6| jd |jd krGtdt| j dt|j d	| j\}}}|| | }|dkratd
| d| | ddd|f ||| || } |ddd|f ||| |jdd}| |fS )a  
    Patch along sequence axis:
      seq: (B, S, D) -> (B, S//patch, D*patch)
      mask: (B, S) -> (B, S//patch) with all() over patch window.

    Note:
      For speaker conditioning in this project, `seq` is already in
      latent-patched space (D = latent_dim * latent_patch_size).
      This helper applies an additional sequence patching for
      `speaker_patch_size`.
    r   r    r   z*Expected seq=(B,S,D), mask=(B,S), got seq=z mask=r   z"Sequence/mask shape mismatch: seq=z, mask=z. Expected matching (B,S).z4Reference sequence too short for speaker_patch_size=z
: seq_len=Nr!   r,   )ndim
ValueErrortupler$   r#   all)rh   ri   rj   bszseq_lenr   usabler   r   r   patch_sequence_with_maskV   s&   (&*rr   c                      s(   e Zd Zd fddZdddZ  ZS )SelfAttentionr   r   headsnorm_epsr   c                   s   t    || dkrtd| d| || d dkr!td|| _|| _|| | _tj||dd| _tj||dd| _	tj||dd| _
tj||dd| _tj||dd| _t| j| jf|d| _t| j| jf|d| _d S 	Nr   zdim=z must be divisible by heads=r   zhead_dim must be even for RoPEFrS   r8   )r:   r;   rl   r   rt   head_dimr=   rW   wqwkwvworf   r5   q_normk_norm)rA   r   rt   ru   rB   r   r   r;   }   s   

zSelfAttention.__init__r   r   key_masktorch.Tensor | Noner   r   c                 C  s  |j \}}}| |||| j| j}| |||| j| j}| |||| j| j}	| |}
| |}| 	|}t
||d | }t
||d | }d }|d ur`|d d d d d d f }tj|dd|dd|	dd|dddd}|||| j}|t|
 }| |S )Nr   r   F	attn_mask	is_causal)r$   ry   r#   rt   rx   rz   r{   rf   r}   r~   r)   ra   scaled_dot_product_attention	transposer   r   sigmoidr|   )rA   r   r   r   ro   rp   _qkvrf   r   yr   r   r   rI      s0   






zSelfAttention.forward)r   r   rt   r   ru   r   )r   r   r   r   r   r   r   r   rK   r   r   rB   r   rs   |   s    rs   c                      sF   e Zd ZdZd fd	d
Zd ddZd!ddZ		d"d#ddZ  ZS )$JointAttentionz^
    Echo-style joint attention over latent self tokens + text context + speaker context.
    r   r   rt   text_ctx_dimspeaker_ctx_dimru   r   c                   s*  t    || dkrtd| d| || d dkr!td|| _|| _|| | _tj||dd| _tj||dd| _	tj||dd| _
tj||dd| _tj||dd| _tj||dd| _tj||dd| _tj||dd| _tj||dd| _t| j| jf|d| _t| j| jf|d| _d S rv   )r:   r;   rl   r   rt   rx   r=   rW   ry   rz   r{   wk_textwv_text
wk_speaker
wv_speakerrf   r|   r5   r}   r~   )rA   r   rt   r   r   ru   rB   r   r   r;      s&   

zJointAttention.__init__r   r   r   r   c                 C  s.   |j ddd\}}t||}tj||gddS )Nr   r,   )r`   r)   r   r0   )rA   r   r   x_rotx_passthroughr   r   r   _apply_rotary_half   s   
z!JointAttention._apply_rotary_halftext_contextspeaker_context=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]c                 C  s   |j d }|j d |krtdt|j  dt|j  | |||j d | j| j}| |||j d | j| j}| |||j d | j| j}| 	|||j d | j| j}| 
|}| 
|}||||fS )zQ
        Precompute text/speaker KV projections for static conditioning.
        r   z,Batch mismatch for context projection: text=z	 speaker=r   )r$   rl   rm   r   r#   rt   rx   r   r   r   r~   )rA   r   r   ro   k_textv_text	k_speaker	v_speakerr   r   r   project_context_kv   s(   






z!JointAttention.project_context_kvN	text_maskr   speaker_mask	self_mask
context_kvDtuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] | Nonec	                 C  s  |j \}	}
}| ||	|
| j| j}| ||	|
| j| j}| ||	|
| j| j}|d u r=| j||d\}}}}n|\}}}}| |}| 	|}| 
||d |
 }| 
||d |
 }|d u rqtj|	|
ftj|jd}|d u rtj|	|j d ftj|jd}|d u rtj|	|j d ftj|jd}tj|||gdd}tj|||gdd}tj|||gdd}|d d d d d d f }tj|dd|dd|dd|dddd}||	|
| j}|t| | }| |S )Nr   r   )r   r+   r   r,   r   Fr   )r$   ry   r#   rt   rx   rz   r{   r   r}   r~   r   r   r?   boolr+   r0   ra   r   r   r   r   rf   r|   )rA   r   r   r   r   r   r   r   r   ro   rp   r   r   k_selfv_selfr   r   r   r   r   r   r   r   r   r   r   rI      sX   





zJointAttention.forward)
r   r   rt   r   r   r   r   r   ru   r   r   r   r   r   r   r   )r   r   r   r   r   r   NN)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )	rL   rM   rN   rg   r;   r   r   rI   rO   r   r   rB   r   r      s    

%r   c                      s(   e Zd Zd fddZdd	d
Z  ZS )SwiGLUr   r   
hidden_dimc                   sD   t    tj||dd| _tj||dd| _tj||dd| _d S )NFrS   )r:   r;   r=   rW   w1w2w3)rA   r   r   rB   r   r   r;   3  s   
zSwiGLU.__init__r   r   r   c                 C  s    |  t| || | S r9   )r   ra   rb   r   r   )rA   r   r   r   r   rI   9  s    zSwiGLU.forward)r   r   r   r   rJ   rK   r   r   rB   r   r   2  s    r   c                      s(   e Zd Zd fdd	ZdddZ  ZS )	TextBlockr   r   rt   	mlp_ratior   ru   dropoutc                   sZ   t    t||d| _t|||d| _t||d| _t|t|| | _	t
|| _d S )Nrw   ru   )r:   r;   r5   attention_normrs   	attentionmlp_normr   r   mlpr=   Dropoutr   )rA   r   rt   r   ru   r   rB   r   r   r;   >  s   
zTextBlock.__init__r   r   ri   r   r   c                 C  s>   ||  | j| |||d }||  | | | }|S )N)r   r   )r   r   r   r   r   )rA   r   ri   r   r   r   r   rI   F  s
   zTextBlock.forward)
r   r   rt   r   r   r   ru   r   r   r   )r   r   ri   r   r   r   r   r   rK   r   r   rB   r   r   =  s    r   c                      s2   e Zd Zd fddZdddZdddZ  ZS )TextEncodercfgr   c                   st   t    t j j| _ jt fddt	 j
D | _ j j | _| jdtjddtjddd d S )Nc                 3  *    | ]}t  j j j jd V  qdS )r   rt   r   ru   r   N)r   text_dim
text_headsru   r   .0r   r   text_mlp_ratior   r   	<genexpr>S      
z'TextEncoder.__init__.<locals>.<genexpr>_freqs_cis_cacher   r   F
persistent)r:   r;   r=   	Embeddingtext_vocab_sizer   text_embeddingtext_mlp_ratio_resolved
ModuleListrangetext_layersblocksr   rx   register_bufferr   empty	complex64rA   r   rB   r   r   r;   O  s   



zTextEncoder.__init__rp   r   r+   torch.devicer   r   c                 C  B   | j }|j|ks|jd |k rt| j||}|| _ |d | S Nr   r   r+   r$   r   rx   r1   rA   rp   r+   cacher   r   r   _rope_freqsb  
   zTextEncoder._rope_freqs	input_idsri   c                 C  sd   |  |}|dj|jd}|| }| |jd |j}| jD ]}||||d}|| }q || S )Nr!   r   r   ri   r   )r   	unsqueezer1   r   r   r$   r+   r   )rA   r   ri   r   mask_fr   blockr   r   r   rI   i  s   


zTextEncoder.forwardr   r   rp   r   r+   r   r   r   )r   r   ri   r   r   r   )rL   rM   rN   r;   r   rI   rO   r   r   rB   r   r   N  s    
r   c                      s6   e Zd ZdZd fddZdddZdddZ  ZS )ReferenceLatentEncoderzK
    Encoder for reference latents used as speaker/style conditioning.
    r   r   c                   sx   t    tj j jdd| _ jt fddt	 j
D | _ j j | _| jdtjddtjddd	 d S )
NTrS   c                 3  r   r   )r   speaker_dimspeaker_headsru   r   r   r   speaker_mlp_ratior   r   r   ~  r   z2ReferenceLatentEncoder.__init__.<locals>.<genexpr>r   r   r   Fr   )r:   r;   r=   rW   speaker_patched_latent_dimr   in_projspeaker_mlp_ratio_resolvedr   r   speaker_layersr   r   rx   r   r   r   r   r   rB   r   r   r;   z  s   



zReferenceLatentEncoder.__init__rp   r   r+   r   r   r   c                 C  r   r   r   r   r   r   r   r     r   z"ReferenceLatentEncoder._rope_freqslatentri   c                 C  sl   |  |}|d }|dj|jd}|| }| |jd |j}| jD ]}||||d}|| }q$|| S )Ng      @r!   r   r   r   )r   r   r1   r   r   r$   r+   r   )rA   r   ri   r   r   r   r   r   r   r   rI     s   


zReferenceLatentEncoder.forwardr   r   )r   r   ri   r   r   r   )rL   rM   rN   rg   r;   r   rI   rO   r   r   rB   r   r   u  s
    
r   c                      s.   e Zd Zd fddZ		ddddZ  ZS )DiffusionBlockr   r   c                   s   t    t|j|j|j|j|jd| _t	|jt
|j|j | _tdtt
|jt
|j}t|j||jd| _t|j||jd| _t|j| _d S )Nr   r   )rQ   rR   r8   )r:   r;   r   rQ   	num_headsr   r   ru   r   r   r   r   r   rU   rV   
adaln_rankrP   attention_adaln	mlp_adalnr=   r   r   )rA   r   r   rB   r   r   r;     s*   
zDiffusionBlock.__init__Nr   r   r^   
text_stater   speaker_stater   r   r   r   r   r   r   c
                 C  sd   |  ||\}
}|| || j|
|||||||	d  }| ||\}
}|| || |
  }|S )N)r   r   r   r   r   r   r   r   )r   r   r   r   r   )rA   r   r^   r   r   r   r   r   r   r   hattention_gatemlp_gater   r   r   rI     s$   zDiffusionBlock.forwardr   r   )r   r   r^   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rK   r   r   rB   r   r     s
     r   c                      s   e Zd ZdZd/ fddZd0ddZ	d1d2ddZ		d3d4d d!Z		d3d5d"d#Zd6d%d&Z	e
d7d'd(Ze
d8d*d+Zd9d-d.Z  ZS ):TextToLatentRFDiTz
    Text + reference-latent conditioned RF diffusion model over patched DACVAE latent sequences.

    Input x_t shape: (B, S, latent_dim * latent_patch_size)
    Output v_pred shape: same as input.
    r   r   c                   s`  t     | _t | _t | _t j j	d| _
t j j	d| _ttj j jddt tj j jddt tj j jd dd| _t j j| _t fddt jD | _t j j	d| _t j j| _tj| jj | jjd urtj| jj  j j  | _!| j!d dkrt"d	| j#d
t$j%ddt$j&ddd d S )Nrw   FrS   r    c                 3  s    | ]}t  V  qd S r9   )r   r   r   r   r   r     s    z-TextToLatentRFDiT.__init__.<locals>.<genexpr>r   r   z$model head_dim must be even for RoPEr   r   r   )'r:   r;   r   r   text_encoderr   speaker_encoderr5   r   ru   	text_normr   speaker_normr=   
SequentialrW   timestep_embed_dimrQ   SiLUcond_modulepatched_latent_dimr   r   r   
num_layersr   out_normout_projinitzeros_r@   rT   r   rx   rl   r   r   r   r   r   rB   r   r   r;     s4   


 
zTextToLatentRFDiT.__init__rp   r   r+   r   r   r   c                 C  r   r   r   r   r   r   r   r     r   zTextToLatentRFDiT._rope_freqsNtext_input_idsr   
ref_latentref_maskcondition_dropoutr   r   c                 C  sv   |d ur|  }|  }d||< d||< t||| jjd\}}| ||}| ||}| |}| |}||||fS )NF)rh   ri   rj   )clonerr   r   speaker_patch_sizer   r   r   r   )rA   r  r   r  r  r	  r   	ref_stater   r   r   encode_conditions  s   


z#TextToLatentRFDiT.encode_conditionsx_tr   r   r   r   latent_maskcontext_kv_cacheJlist[tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] | Nonec	                 C  s   t || jjj|jd}	| |	}
|
d d d d d f }
| |}| |jd |j	}t
| jD ]\}}|||
|||||||d urE|| nd d	}q0| |}| |}|j|jdS )Nr   r   )	r   r^   r   r   r   r   r   r   r   )r4   r   r   r1   r   r   r   r   r$   r+   	enumerater   r  r  )rA   r  r   r   r   r   r   r  r  t_embedr^   r   r   ir   r   r   r   forward_with_encoded_conditions!  s(   



z1TextToLatentRFDiT.forward_with_encoded_conditionsc	              	   C  s4   | j |||||d\}	}}
}| j|||	||
||dS )N)r  r   r  r  r	  )r  r   r   r   r   r   r  )r  r  )rA   r  r   r  r   r  r  r  r	  r   r   r   r   r   r   rI   D  s    zTextToLatentRFDiT.forwardClist[tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]]c                   s    fdd| j D S )zg
        Build per-layer projected text/speaker KV tensors for faster repeated sampling steps.
        c                   s   g | ]
}|j j d qS )r   )r   r   )r   r   r   r   r   r   
<listcomp>h  s    z<TextToLatentRFDiT.build_context_kv_cache.<locals>.<listcomp>)r   )rA   r   r   r   r  r   build_context_kv_cache`  s   z(TextToLatentRFDiT.build_context_kv_cachec                 C     t |  jS r9   )next
parametersr+   rA   r   r   r   r+   p     zTextToLatentRFDiT.devicetorch.dtypec                 C  r  r9   )r  r  r   r  r   r   r   r   t  r  zTextToLatentRFDiT.dtypedictc                 C  s
   t | jS r9   )r   r   r  r   r   r   as_dictx  s   
zTextToLatentRFDiT.as_dictr   r   r9   )r  r   r   r   r  r   r  r   r	  r   r   r   r   )r  r   r   r   r   r   r   r   r   r   r   r   r  r   r  r  r   r   )r  r   r   r   r  r   r   r   r  r   r  r   r  r   r	  r   r   r   )r   r   r   r   r   r  )r   r   )r   r  )r   r   )rL   rM   rN   rg   r;   r   r  r  rI   r  propertyr+   r   r!  rO   r   r   rB   r   r     s$    
 !+
r   )r   )r   r   r	   r   r
   r   r   r   r   )r*   r   r   r   r   r   )rh   r   ri   r   rj   r   r   r_   )
__future__r   dataclassesr   r   torch.nnr=   torch.nn.functional
functionalra   configr   r   r)   r4   Moduler5   rP   rr   rs   r   r   r   r   r   r   r   r   r   r   r   <module>   s*    


 &5 ',8