o
    xi                     @  sf   d dl mZ d dlZd dlmZ d dlmZ d dlZd dlZdd
dZ	dddZ
eG dd dZdS )    )annotationsN)	dataclass)Pathlatenttorch.Tensor
patch_sizeintreturnc                 C  sR   |dkr| S | j \}}}|| | }| ddd|f } | ||| || } | S )ze
    Convert latent from (B, T, D) -> (B, T//patch, D*patch).
    Extra tail tokens are dropped.
       N)shapereshape)r   r   bszseq_lendimusable r   -/home/ubuntu/Irodori-TTS/irodori_tts/codec.pypatchify_latent   s   r   patched
latent_dimc                 C  s*   |dkr| S |  | jd | jd | |S )zE
    Convert latent from (B, T_p, D*patch) -> (B, T_p*patch, D).
    r
   r   )r   r   )r   r   r   r   r   r   unpatchify_latent   s   r   c                   @  s   e Zd ZU ded< ded< ded< ded< ded	< d
ed< ded< e					d&d'ddZe dddd(ddZe d)d d!Z	d*d$d%Z
dS )+DACVAECodecztorch.nn.Modulemodelr   sample_rater   ztorch.devicedeviceztorch.dtypedtypeboolenable_watermarkfloat | Nonewatermark_alphafacebook/dacvae-watermarkedcudaNFrepo_idstrtorch.dtype | Noner	   c              	   C  s  zddl m} W n' ty/   tt jd d }| r'tj	
dt| ddl m} Y nw || |}|d urD|j|d}d }	d}
t|dd }|d urt|drt|j}|d uret|}n|rj|}nd	}t||_t|j}	|	d	k}
|
st|d
rd |fddd}||_t| j}tjddd||d}t  ||}W d    n1 sw   Y  | |t|jt|jd t|||
|	dS )Nr   )DACVAE   dacvaer   Fdecoderalphag        wm_modelxr   messagetorch.Tensor | Noner	   c                 S  s   ~|j j| S )N)r+   encoder_blockforward_no_conv)r,   r-   _decoderr   r   r   _watermark_passthroughR   s   z0DACVAECodec.load.<locals>._watermark_passthroughr
   i   r   r   )r   r   r   r   r   r   r   )r,   r   r-   r.   r	   r   )r'   r%   ImportErrorr   __file__resolveparentsexistssyspathinsertr#   loadevaltogetattrhasattrfloatr*   	watermarknext
parametersr   torchzerosinference_modeencoder   r   r   r   )clsr"   r   r   r   r   r%   
local_repor   configured_watermark_alphaconfigured_enable_watermarkr)   default_alphatarget_alphar2   model_dtypedummyzr   r   r   r<   ,   sV   





zDACVAECodec.load)normalize_db
ensure_maxwaveformr   rR   rS   c                C  sz  |j dkr
|d}|j dkrtdt|j |jd dkr'|jddd}|| jkr5tj	||| j}|j
tjd}|d	urzBtj|| j}|j dkrS|d}t|| jd
dd}ttjd|j|jd|d ddd}t|}t||| |}W n	 ty   Y nw |r| jdddjddd}	|	jdd}
||
 }|j
| j| jd}| j|}|dd S )a  
        Input:
          waveform: (B, C, T) or (C, T)
          normalize_db: Optional target loudness (LUFS-like dB) applied before encode
          ensure_max: If True, scale down only when abs peak exceeds 1.0
        Output:
          latent: (B, T_latent, D_latent)
        r&   r      z$Expected waveform ndim=3, got shape=r
   T)r   keepdimr(   Ng      Tg      T@)minmaxg      $@r3   g      4@g      ?)rW   )ndim	unsqueeze
ValueErrortupler   meanr   
torchaudio
functionalresampler>   rE   float32loudnessrA   clamppowtensorr   r   viewisfinitewhere	Exceptionabsamaxr   rH   	transpose
contiguous)selfrT   r   rR   rS   rc   gain_dbgainfinite_maskpeak	safe_peakencodedr   r   r   encode_waveformk   sB   







zDACVAECodec.encode_waveformr   c                 C  sH   |j dkrtdt|j |dd j| j| jd}| j	
|S )ze
        Input:
          latent: (B, T, D)
        Output:
          audio: (B, 1, samples)
        rU   z"Expected latent ndim=3, got shape=r
   r&   r(   )rZ   r\   r]   r   rm   rn   r>   r   r   r   decode)ro   r   rQ   r   r   r   decode_latent   s   
zDACVAECodec.decode_latentr:   
str | Pathc                 C  s   zt t|\}}W n+ ty6   dd l}|jt|dd\}}t|}|jdkr1|	d}n|j
}Y nw |	d}| || S )Nr   rb   r(   r
   )r_   r<   r#   RuntimeError	soundfilereadrE   
from_numpyrZ   r[   Trv   cpu)ro   r:   wavsrsfdatar   r   r   encode_file   s   


	zDACVAECodec.encode_file)r    r!   NFN)r"   r#   r   r#   r   r$   r   r   r   r   r	   r   )
rT   r   r   r   rR   r   rS   r   r	   r   )r   r   r	   r   )r:   ry   r	   r   )__name__
__module____qualname____annotations__classmethodr<   rE   rG   rv   rx   r   r   r   r   r   r   "   s,   
 >4r   )r   r   r   r   r	   r   )r   r   r   r   r   r   r	   r   )
__future__r   r9   dataclassesr   pathlibr   rE   r_   r   r   r   r   r   r   r   <module>   s    

	