o
    㥵i4                     @   s   d dl Z d dlZd dlmZ d dlZd dlmZ d dlm  m	Z
 d dlmZ d dlmZ d dlmZ dejdejeef fdd	Z	 d,dejd
edededef
ddZ		d-dejdejeef dedefddZG dd dejZG dd dejZG dd dejZeG dd dZG dd dejZed kred!d"d"d#d$d%d%gd&Z e !  e"d%d!d'Z#e e#Z$e%e  e%e$j&j'e$j(j'e$j)j' e e#dddddd(f Z*e%e*j&j'e*j(j'e*j)j' ej+e$j)dddddd(f e*j)d)d*sJ e%d+ dS dS ).    N)	dataclass)ResidualVectorQuantize)weight_norm)remove_parametrizationsxpaddingsc                 C   sX   |\}}|dkr|dksJ ||f|| | j d ksJ | j d | }| d||f S )zCRemove padding from x, handling properly zero padding. Only for 1d!r   .)shape)r   r   padding_leftpadding_rightend r   N/home/ubuntu/.local/lib/python3.10/site-packages/fish_speech/models/dac/rvq.pyunpad1d   s
   r   kernel_sizestridepadding_totalreturnc                 C   s@   | j d }|| | | d }t|d | ||  }|| S )zSee `pad_for_conv1d`.r      )r	   mathceil)r   r   r   r   lengthn_framesideal_lengthr   r   r   get_extra_padding_for_conv1d   s   
r   zeros        modevaluec                 C   s   | j d }|\}}|dkr|dksJ ||f|dkrKt||}d}||kr4|| d }t| d|f} t| |||}	|	j d | }
|	dd|
f S t| |||S )zTiny wrapper around F.pad, just to allow for reflect padding on small input.
    If this is the case, we insert extra 0 padding to the right
    before the reflection happen.
    r   r   reflectr   .N)r	   maxFpad)r   r   r   r   r   r
   r   max_pad	extra_padpaddedr   r   r   r   pad1d    s   


r&   c                       s@   e Zd Z				d fdd	Zdd Zdd	d
Zdd Z  ZS )CausalConvNetr   Nc                    sV   t t|   tj||||||d| _|| _|d | d | _|| _| j| j | _	d S )N)r   dilationgroupsr   )
superr'   __init__nnConv1dconvr   r   r(   padding)selfin_channelsout_channelsr   r(   r   r)   r/   	__class__r   r   r+   ;   s   
zCausalConvNet.__init__c                 C   s:   | j }t|| j| j|}t|||fddd}| | S )Nconstantr   )r   r   )r/   r   r   r   r&   r.   
contiguous)r0   r   r"   extra_paddingr   r   r   forwardS   s   zCausalConvNet.forwardweightr   c                 C      t | j||d| _| S N)namedimr   r.   r0   r<   r=   r   r   r   r   [      zCausalConvNet.weight_normc                 C      t | j| _| S Nr   r.   r0   r   r   r   remove_weight_norm_      z CausalConvNet.remove_weight_norm)r   r   r   Nr9   r   __name__
__module____qualname__r+   r8   r   rE   __classcell__r   r   r3   r   r'   :   s    
r'   c                       s:   e Zd Z	d fdd	Zdd Zdd	d
Zdd Z  ZS )CausalTransConvNetr   Nc                    s4   t t|   tj|||||d| _|| _|| _d S )N)r   r(   )r*   rM   r+   r,   ConvTranspose1dr.   r   r   )r0   r1   r2   r   r(   r   r/   r3   r   r   r+   e   s   

zCausalTransConvNet.__init__c                 C   s>   |  |}| j| j }t|}|| }t|||f}| S rB   )r.   r   r   r   r   r   r6   )r0   r   r"   r   r
   r   r   r   r8   o   s   

zCausalTransConvNet.forwardr9   r   c                 C   r:   r;   r>   r?   r   r   r   r   w   r@   zCausalTransConvNet.weight_normc                 C   rA   rB   rC   rD   r   r   r   rE   {   rF   z%CausalTransConvNet.remove_weight_norm)r   r   NrG   rH   r   r   r3   r   rM   d   s    

rM   c                       sP   e Zd ZdZ				ddededed	ed
ef
 fddZddefddZ  Z	S )ConvNeXtBlocka  ConvNeXt Block. There are two equivalent implementations:
    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
    We use (2) as we find it slightly faster in PyTorch
    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
        kernel_size (int): Kernel size for depthwise conv. Default: 7.
        dilation (int): Dilation for depthwise conv. Default: 1.
    ư>      @   r   r=   layer_scale_init_value	mlp_ratior   r(   c                    s   t    t}||||||d| _tj|dd| _t|t|| | _	t
 | _tt|| || _|dkrGtj|t| dd| _d S d | _d S )N)r   r)   r(   rP   )epsr   T)requires_grad)r*   r+   r'   dwconvr,   	LayerNormnormLinearintpwconv1GELUactpwconv2	Parametertorchonesgamma)r0   r=   rS   rT   r   r(   convnet_typer3   r   r   r+      s(   

zConvNeXtBlock.__init__Tapply_residualc                 C   sv   |}|  |}|ddd}| |}| |}| |}| |}| jd ur,| j| }|ddd}|r9|| }|S )Nr      r   )rW   permuterY   r\   r^   r_   rc   )r0   r   re   inputr   r   r   r8      s   






zConvNeXtBlock.forward)rP   rQ   rR   r   )T)
rI   rJ   rK   __doc__r[   floatr+   boolr8   rL   r   r   r3   r   rO      s$    rO   c                   @   sR   e Zd ZU ejed< ejed< ejed< ejed< ejed< dZejdB ed< dS )VQResultzcodeslatentscodebook_losscommitment_lossNsemantic_distill_z)rI   rJ   rK   ra   Tensor__annotations__rr   r   r   r   r   rl      s   
 




rl   c                       s   e Zd Z											dded	ed
ededededee dee dB dejdB dejdB dejdB f fddZdd Z		ddede
jfddZde
jfddZ  ZS )  DownsampleResidualVectorQuantize   	            ?   rf   rf   N	input_dimn_codebookscodebook_dimquantizer_dropoutcodebook_sizesemantic_codebook_sizedownsample_factordownsample_dims
pre_modulepost_modulesemantic_predictor_modulec                    s  t    |d u rfddtt|D }ft|  td||dd| _t||||d| _|| _|| _	t
ttj fddt|D  | _tj fddttt|D  | _| | j |	d urk|	nt | _|
d urv|
nt | _|d ur|| _d S t | _d S )Nc                    s   g | ]} qS r   r   ).0_)r|   r   r   
<listcomp>   s    z=DownsampleResidualVectorQuantize.__init__.<locals>.<listcomp>r   r   )r|   r}   r   r~   r   c              
      sB   g | ]\}}t  |  |d   ||dt |d   dqS r   )r   r   r=   r,   
SequentialrO   r   idxfactor)all_dimsrd   r   r   r      s    

c              
      s>   g | ]\}}t  |d    | ||dt | dqS r   r   r   )r   transconvnet_typer   r   r     s    

)r*   r+   rangelentupler   semantic_quantizer	quantizerr   r   r'   rM   r,   r   	enumerate
downsamplereversedlistupsampleapply_init_weightsIdentityr   r   r   )r0   r|   r}   r~   r   r   r   r   r   r   r   r   r3   )r   rd   r|   r   r   r+      sR   


z)DownsampleResidualVectorQuantize.__init__c                 C   s<   t |tjtjfrtjj|jdd tj|jd d S d S )Ng{Gz?)stdr   )	
isinstancer,   r-   rZ   inittrunc_normal_r9   	constant_bias)r0   mr   r   r   r      s   z.DownsampleResidualVectorQuantize._init_weightsn_quantizerssemantic_lenc                 K   s"  |j }|d u rt|j d g}| |}| |}| |\}}}}	}
|| }| j||d\}}}}}|| }||	 }||
 }tj||gdd}tj||gdd}| |}| 	|}|d |j d  }d}t
|| }|dkrzt|||f}n|dk r|d|d f }t|||||d}|S )Nr   )r   r   r   r   .)rm   rn   ro   rq   rp   )r	   ra   
LongTensorr   r   r   r   catr   r   absr!   r"   rl   )r0   rm   r   r   kwargsoriginal_shape
semantic_zsemantic_codessemantic_latentssemantic_commitment_losssemantic_codebook_loss
residual_zrn   ro   rq   rp   diffrightleftresultsr   r   r   r8   %  sL   



z(DownsampleResidualVectorQuantize.forwardindicesc                 C   s   t |}t j|d d df | jjd d|d d df< t j|d d dd f | jjd d|d d dd f< | j|d d d df d }| j|d d dd f d }|| }| |}| |}|S )Nr   r   )r    )	ra   
zeros_likeclampr   r   r   
from_codesr   r   )r0   r   new_indicesz_q_semanticz_q_residualz_qr   r   r   decode`  s   
  

z'DownsampleResidualVectorQuantize.decode)rv   rw   rx   ry   rv   rz   r{   NNNN)NN)rI   rJ   rK   r[   rj   r   r,   Moduler+   r   ra   rs   r8   r   rL   r   r   r3   r   ru      sV    
	
S
;ru   __main__i   rx   rv   ry   rf   )r|   r}   r~   r   r   r   i  (   g:0yE>)atolSuccess)r   )r   r   ),r   typingtpdataclassesr   ra   torch.nnr,   torch.nn.functional
functionalr!   dac.nn.quantizer   torch.nn.utils.parametrizationsr   torch.nn.utils.parametrizer   rs   Tupler[   r   r   strrj   r&   r   r'   rM   rO   rl   ru   rI   rvqevalrandnr   resultprintro   r	   rn   rm   result1allcloser   r   r   r   <module>   sv    


*A	 /.