o
    Si                     @   s   d dl mZmZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ G dd	 d	ejZG d
d dejZG dd dejZG dd dejZdS )    )ListOptionalTupleN)	rearrange)nn)Conv2d)weight_norm)Spectrogramc                       s   e Zd ZdZddeedf dee f fddZ	dd	ej	d
ej	deej	 dee
ej	 e
ej	 e
e
ej	  e
e
ej	  f fddZ  ZS )MultiPeriodDiscriminatora  
    Multi-Period Discriminator module adapted from https://github.com/jik876/hifi-gan.
    Additionally, it allows incorporating conditional information with a learned embeddings table.

    Args:
        periods (tuple[int]): Tuple of periods for each discriminator.
        num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
            Defaults to None.
                   Nperiods.num_embeddingsc                    s(   t    t fdd|D | _d S )Nc                       g | ]}t | d qS ))periodr   )DiscriminatorP).0pr    H/home/ubuntu/.local/lib/python3.10/site-packages/vocos/discriminators.py
<listcomp>       z5MultiPeriodDiscriminator.__init__.<locals>.<listcomp>super__init__r   
ModuleListdiscriminators)selfr   r   	__class__r   r   r      s   
z!MultiPeriodDiscriminator.__init__yy_hatbandwidth_idreturnc                 C   p   g }g }g }g }| j D ]&}|||d\}	}
|||d\}}||	 ||
 || || q||||fS N)xcond_embedding_idr!   appendr"   r%   r&   r'   y_d_rsy_d_gsfmap_rsfmap_gsdy_d_rfmap_ry_d_gfmap_gr   r   r   forward   s   



z MultiPeriodDiscriminator.forward)r   NN)__name__
__module____qualname____doc__r   intr   r   torchTensorr   r9   __classcell__r   r   r#   r   r
      s    $
.r
   c                       sx   e Zd Z					ddededed	ed
edee f fddZ	ddejdeej de	eje
ej f fddZ  ZS )r      r   r   皙?Nr   in_channelskernel_sizestridelrelu_sloper   c                    s  t    || _ttt|d|df|df|d dfdttdd|df|df|d dfdttdd|df|df|d dfdttdd|df|df|d dfdttdd|dfd	|d dfdg| _|d ur|tjj	|dd
| _
tjj| j
j ttdddddd| _|| _d S )N    rC   r   r   padding         rC   rC   r   embedding_dim)r   rC   )rC   r   )r   r   r   r   r    r   r   convsr@   	Embeddingembinitzeros_weight	conv_postrH   )r"   r   rE   rF   rG   rH   r   r#   r   r   r   -   s   
	$$$$ 	
zDiscriminatorP.__init__r+   r,   r(   c                 C   s  | d}g }|j\}}}|| j dkr+| j|| j  }tjj|d|fd}|| }||||| j | j}t| j	D ]\}}	|	|}tjj
|| j}|dkrV|| q<|d urp| |}
|
dddd| jddd}nd}| |}|| ||7 }t|dd}||fS )NrC   r   reflectTdimkeepdims)	unsqueezeshaper   r@   r   
functionalpadview	enumeraterR   
leaky_relurH   r.   rT   sumrX   flatten)r"   r+   r,   fmapbctn_padilrT   hr   r   r   r9   H   s.   


 

zDiscriminatorP.forward)rC   r   r   rD   Nr:   )r;   r<   r=   r?   floatr   r   r@   rA   r   r   r9   rB   r   r   r#   r   r   ,   s6    r   c                       s   e Zd Z		ddeedf dee f fddZ	ddejd	ejd
ejdee	ej e	ej e	e	ej  e	e	ej  f fddZ
  ZS )MultiResolutionDiscriminatori   rN   rM   N	fft_sizes.r   c                    s(   t    t fdd|D | _dS )a  
        Multi-Resolution Discriminator module adapted from https://github.com/descriptinc/descript-audio-codec.
        Additionally, it allows incorporating conditional information with a learned embeddings table.

        Args:
            fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
            num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
                Defaults to None.
        c                    r   ))window_lengthr   )DiscriminatorR)r   wr   r   r   r   y   r   z9MultiResolutionDiscriminator.__init__.<locals>.<listcomp>Nr   )r"   rr   r   r#   r   r   r   h   s   

z%MultiResolutionDiscriminator.__init__r%   r&   r'   r(   c                 C   r)   r*   r-   r/   r   r   r   r9   |   s   



z$MultiResolutionDiscriminator.forward)rq   Nr:   )r;   r<   r=   r   r?   r   r   r@   rA   r   r9   rB   r   r   r#   r   rp   g   s$    
.rp   c                       sp   e Zd Z				ddedee deded	eeeef d
f f
 fddZdd Zdde	j
de	j
fddZ  ZS )rt   NrI         ?)g        rD   )rD   rv   )rv         ?)rx         ?)ry   g      ?rs   r   channels
hop_factorbands.c                    s   t    || _|| _t|t|| |d d| _|d d fdd|D }|| _ fddt	fddt
t| jD | _|d urXtjj| d	| _tjj| jj ttj dd
ddd| _d S )N)n_fft
hop_length
win_lengthpowerr   rC   c                    s,   g | ]}t |d    t |d   fqS )r   rC   )r?   r   rh   )r}   r   r   r      s   , z+DiscriminatorR.__init__.<locals>.<listcomp>c                      sx   t tt jd ddddtt j  ddddtt j  ddddtt j  ddddtt j  ddddgS )Nr   )r   	   rO   )rC      rJ   )rC   r   r   r   )r   r    r   r   r   )rz   r   r   <lambda>   s    z)DiscriminatorR.__init__.<locals>.<lambda>c                    s   g | ]}  qS r   r   )r   _)rR   r   r   r      s    rP   r   rO   rJ   )r   r   rs   r{   r	   r?   spec_fnr|   r   r    rangelen
band_convsr@   rS   rT   rU   rV   rW   r   r   rX   )r"   rs   r   rz   r{   r|   r#   )rz   rR   r}   r   r      s   
$	zDiscriminatorR.__init__c                    sj     j ddd  d    jdddd d   |   t  t d  fd	d
| jD }|S )NrZ   Tr[   g?)r\   keepdimr   g&.>zb f t c -> b c t fc                    s$   g | ]} d |d |d f qS ).r   rC   r   r   r+   r   r   r      s   $ z.DiscriminatorR.spectrogram.<locals>.<listcomp>)meanabsmaxr   r@   view_as_realr   r|   )r"   r+   x_bandsr   r   r   spectrogram   s   "


zDiscriminatorR.spectrogramr+   r,   c                 C   s   |  |}g }g }t|| jD ]'\}}t|D ]\}}||}tjj|d}|dkr0|| q|| qtj	|dd}|d urW| 
|}	|	dddd| jddd}
nd}
| |}|| ||
7 }||fS )NrD   r   rZ   )r\   rC   Tr[   )r   zipr   rc   r@   r   r`   rd   r.   catrT   rb   re   rX   )r"   r+   r,   r   rg   bandstackrl   layerrT   rn   r   r   r   r9      s(   


 

zDiscriminatorR.forward)NrI   rv   rw   r:   )r;   r<   r=   r?   r   ro   r   r   r   r@   rA   r9   rB   r   r   r#   r   rt      s$    " rt   )typingr   r   r   r@   einopsr   r   torch.nnr   torch.nn.utilsr   torchaudio.transformsr	   Moduler
   r   rp   rt   r   r   r   r   <module>   s    !;(