o
    9wixj                     @   sp  d dl Z d dlZd dlZd dlmZ d dlm  mZ d dl	Z
d dlZd dlmZ 	d6dededefdd	Zd7d
dZ					d8dedededededededededefddZdd Zdd Zdd ZG dd  d ejZG d!d" d"ejZG d#d$ d$ejZG d%d& d&ejZG d'd( d(ejZG d)d* d*ejZ G d+d, d,ejZ!d-d. Z"d/d0 Z#d1d2 Z$d9d4d5Z%dS ):    N)	rearrangecpu'  lengthdimshiftc           	      C   s   |d dksJ |t j| |dddd }|d }t j|d |dddd}||||d    }t jt |t |gddS )N   r   device   r   )torcharangeviewcatcossin)	r   r   r   r
   
max_periodposhalf_dimadimphase r   O/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/demucs/transformer.pycreate_sin_embedding   s   r   c           	      C   s  | d dkrt d| t| ||}t| d } ttd| dt||    }td|	d}td|	d}t
|| dd	dd|d|d| dddddf< t|| dd	dd|d|d| dddddf< t
|| dd	ddd||| ddddddf< t|| dd	ddd||| d ddddddf< |dddf |S )z
    :param d_model: dimension of the model
    :param height: height of the positions
    :param width: width of the positions
    :return: d_model*height*width position matrix
       r   zHCannot use sin/cos positional encoding with odd dimension (got dim={:d})r           r   N)
ValueErrorformatr   zerosintexpr   mathlog	unsqueezer   	transposerepeatr   to)	d_modelheightwidthr
   r   pediv_termpos_wpos_hr   r   r   create_2d_sin_embedding%   s*   $$$$r0   r         ?     @
batch_sizemean_normalizeaugmentmax_global_shiftmax_local_shift	max_scaler
   r   c
                 C   s,  |d dksJ dt | ddd }
|
d|d}
|r'|
t j|
ddd8 }
|rdtjj| |
 d|dgd}tjj| |
 | |dgd}tjjt| t|
 d|dgd}|
| | t	| }
|

|}
|d }t j|d |d	ddd}|
|	||d    }t jt |t |gdd
 S )Nr   r   r1   r   r   T)r   keepdim)sizer	   r   )r   r   r   r'   nanmeannprandomuniformr$   r"   r(   r   r   r   float)r   r   r3   r4   r5   r6   r7   r8   r
   r   r   deltadelta_locallog_lambdasr   r   r   r   r   r   create_sin_embedding_capeI   s8   
rC   c                 C   s   t | }||d d d f kS N)r   r   )r   r   r   r   r   get_causal_maskv   s   
rE   c                 C   s
  |dv sJ |dkr/t j|| t jd}d|ddd|f< t|| |  }	d|d|	ddf< |dkrot j|| t jd}t |dddf }
| | |
 t | |d   d| d }|d|t jdt jd	| n|d	krt j|d
 | d
 t jd}t |d
 dddf }
t dtd
|  d d }||d  d
  }t 
|ddd  |g}| | |
 |  d| d }|d|t jdt jd	| |ddddf }n|dkrt j|d}|| t j| | ||d|| |k}||}|S )zn
    When the input of the Decoder has length T1 and the output T2
    The mask matrix has shape (T2, T1)
    )diagjmaskr=   globalrH   )dtypeTNrF   r   r   rG   r         ?r   r=   r	   )	generatorr
   )r   r    boolr!   r   longclampscatter_ones	expand_asr   flip	Generatormanual_seedrandreshaper(   )T1T2	mask_typesparse_attn_windowglobal_windowmask_random_seedsparsityr
   maskline_windowrowscolstgener   r   r   get_elementary_mask{   s@   "  

rd   c                    sX   ddl m} |d}	 fdd|	D }
t|
jdddk}||d S )z
    Return a SparseCSRTensor mask that is a combination of elementary masks
    mask_type can be a combination of multiple masks: for instance "diag_jmask_random"
    r   )SparseCSRTensor_c                    s"   g | ]}t  |qS r   )rd   ).0r^   rW   rX   r
   r[   r\   rZ   r]   r   r   
<listcomp>   s    zget_mask.<locals>.<listcomp>)axisN)xformers.sparsere   splitr   stacksum
from_dense)rW   rX   rY   rZ   r[   r\   r]   r
   re   
mask_types	all_masks
final_maskr   rh   r   get_mask   s   
rs   c                	       sH   e Zd Z		ddedededef fddZed	d
 Zdd Z  Z	S )ScaledEmbeddingr1         @num_embeddingsembedding_dimscaleboostc                    s8   t    t||| _| jj j|| 9  _|| _d S rD   )super__init__nn	Embedding	embeddingweightdatary   )selfrv   rw   rx   ry   	__class__r   r   r{      s   

zScaledEmbedding.__init__c                 C   s   | j j| j S rD   )r~   r   ry   )r   r   r   r   r      s   zScaledEmbedding.weightc                 C   s   |  || j S rD   )r~   ry   r   xr   r   r   forward   s   zScaledEmbedding.forward)r1   ru   )
__name__
__module____qualname__r!   r?   r{   propertyr   r   __classcell__r   r   r   r   rt      s    
rt   c                       s4   e Zd ZdZd
dedef fddZdd	 Z  ZS )
LayerScalezLayer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
    This rescales diagonaly residual outputs close to 0 initially, then learnt.
    r   Fchannelsinitc                    s:   t    || _ttj|dd| _|| jjdd< dS )z
        channel_last = False corresponds to (B, C, T) tensors
        channel_last = True corresponds to (T, B, C) tensors
        T)requires_gradN)	rz   r{   channel_lastr|   	Parameterr   r    rx   r   )r   r   r   r   r   r   r   r{      s   
zLayerScale.__init__c                 C   s&   | j r| j| S | jd d d f | S rD   )r   rx   r   r   r   r   r      s   
zLayerScale.forward)r   F)	r   r   r   __doc__r!   r?   r{   r   r   r   r   r   r   r      s    
r   c                       s(   e Zd Z fddZ fddZ  ZS )MyGroupNormc                    s   t  j|i | d S rD   )rz   r{   )r   argskwargsr   r   r   r{     s   zMyGroupNorm.__init__c                    s    | dd}t | ddS )zh
        x: (B, T, C)
        if num_groups=1: Normalisation on all T and C together for each B
        r   r   )r&   rz   r   r   r   r   r   r     s   zMyGroupNorm.forwardr   r   r   r{   r   r   r   r   r   r   r     s    r   c                       sP   e Zd Zddejddddddddddd	d
ddddf fdd	ZdddZ  ZS )MyTransformerEncoderLayer   皙?r   Fh㈵>-C6?NrF   *     2   ffffff?c                    s*  ||d}t  j||||||	||||d
 || _|| _|r+|s(|| _|| _|| _|| _|rItt	||fd|	i|| _
tt	||fd|	i|| _d | _| j|@ rZtt	||d| _|
rbt||dnt | _|
rot||dnt | _|rt|||||r|ndd| _| dtd	d	 || _d S d S )
Nr
   rI   )
r)   nheaddim_feedforwarddropout
activationlayer_norm_epsbatch_first
norm_firstr
   rI   eps
num_groupsnum_channelsTr   r   r   auto_sparsitysrc_maskr   )rz   r{   sparser   rY   rZ   r[   r]   r   r!   norm1norm2norm_outr   r   r|   Identitygamma_1gamma_2MultiheadAttention	self_attn__setattr__r   r    r\   )r   r)   r   r   r   r   
group_normr   r   r   layer_scaleinit_valuesr
   rI   r   rY   r\   rZ   r[   r   r]   r   factory_kwargsr   r   r   r{     sN   



z"MyTransformerEncoderLayer.__init__c           	   
   C   s   |j }|}|j\}}}| jr7| js7|du sJ | j}|jd |kr7t||| j| j| j| j	| j
|}| d| | jr`|| | | ||| }|| | | | }| jr^| |}|S | || | ||| }| || | | }|S )zw
        if batch_first = False, src shape is (T, B, C)
        the case where batch_first=True is not covered
        Nr   r   )r
   shaper   r   r   rs   rY   rZ   r[   r\   r]   r   r   r   	_sa_blockr   r   	_ff_blockr   r   )	r   srcr   src_key_padding_maskr
   r   TBCr   r   r   r   S  s>   

z!MyTransformerEncoderLayer.forward)NN)r   r   r   Frelur{   r   r   r   r   r   r   r     s,    Cr   c                       s   e Zd Zddejddddddddddd	d
ddddfdedededededededededef fddZd ddZ	d ddZ
dd Zdd Z  ZS )!CrossTransformerEncoderLayerr   r   r   Fr   rF   r   r   r   r   Nr)   r   r   r   r   r   r   r   r   r   c                    s  ||d}t    || _|| _|r |s|| _|| _|| _|| _|  tj	||||d| _
tj||fi || _t|| _tj||fi || _|	| _|  |  |  |
r~tt|
|fd|i|| _tt|
|fd|i|| _tt|
|fd|i|| _n$tj|fd|i|| _tj|fd|i|| _tj|fd|i|| _d | _| j|@ rtt||d| _|rt||dnt | _|rt||dnt | _t|| _t|| _t|tr|  || _!n|| _!|rt	|||||r|ndd| _
|s| "dt#$d	d	 || _%d S d S d S )
Nr   )r   r   r   r   Tr   r   r^   r   )&rz   r{   r   r   rY   rZ   r[   r]   r|   r   
cross_attnLinearlinear1Dropoutr   linear2r   r   r!   r   r   norm3	LayerNormr   r   r   r   r   dropout1dropout2
isinstancestr_get_activation_fnr   r   r   r    r\   )r   r)   r   r   r   r   r   r   r   r   r   r   r   rY   r\   rZ   r[   r]   r   r
   rI   r   r   r   r   r   r{   }  sd   





z%CrossTransformerEncoderLayer.__init__c           
   
   C   s  |j }|j\}}}|j\}}}| jrB| jsB|du sJ | j}|jd |ks,|jd |krBt||| j| j| j| j	| j
|}| d| | jrn|| | | || || }	|	| | | |	 }	| jrl| |	}	|	S | || | ||| }	| |	| | |	 }	|	S )z
        Args:
            q: tensor of shape (T, B, C)
            k: tensor of shape (S, B, C)
            mask: tensor of shape (T, S)

        Nr   r^   )r
   r   r   r   r^   rs   rY   rZ   r[   r\   r]   r   r   r   	_ca_blockr   r   r   r   r   r   )
r   qkr^   r
   r   r   r   Sr   r   r   r   r     s6   
$
z$CrossTransformerEncoderLayer.forwardc                 C   s"   | j ||||ddd }| |S )NF)	attn_maskneed_weightsr   )r   r   )r   r   r   r   r   r   r   r   r     s   
z&CrossTransformerEncoderLayer._ca_blockc              	   C   s&   |  | | | |}| |S rD   )r   r   r   r   r   r   r   r   r   r     s   
z&CrossTransformerEncoderLayer._ff_blockc                 C   s*   |dkrt jS |dkrt jS td|)Nr   geluz&activation should be relu/gelu, not {})r   r   r   RuntimeErrorr   )r   r   r   r   r   r     s
   z/CrossTransformerEncoderLayer._get_activation_fnrD   )r   r   r   r   r   r!   r?   rL   r{   r   r   r   r   r   r   r   r   r   r   |  sZ    	

U
'r   c                ?       s   e Zd Zddddddddddddd	dd
ddddddg dddddddddfdededededededededededededed ed!ed"eje d#ed$ed%ed&ed'ed(ed)e	d*ed+ed,ed-ed.ed/ed0ed1ef> fd2d3Z
d4d5 Zd6d7 Zd8d9 Z  ZS ):CrossTransformerEncoderr   g      @      Fr   i  Tr2   Nr   r1   )g     @r1   gffffff?rF   r   r   r   r   r   embhidden_scale	num_heads
num_layerscross_firstr   max_positionsnorm_innorm_in_groupr   r   r   r   weight_decaylrr   r   sin_random_shiftweight_pos_embedcape_mean_normalizecape_augmentcape_glob_loc_scalesparse_self_attnsparse_cross_attnrY   r\   rZ   r[   r   r]   c            &         s   t    	 || dksJ t|| } || _|rdnd| _|| _|| _|| _|| _|| _	|dkr:|| _
|| _|| _|dkrFt||dd| _|| _|rNtjntj}!|  |  |	rdt|| _t|| _n|
rwtt|
|| _tt|
|| _n
t | _t | _t | _t | _i d|d|d	| d
|d|!d|d|d|d|d|d|d|d|d|d|dd}"t|"}#|#d|i t|"}$|$d|i t|D ]6}%|%d | jkr| jt di |# | jt di |# q| jt!di |$ | jt!di |$ qd S )Nr   r   capescaledg?)rx   r)   r   r   r   r   r   r   r   r   rY   r\   rZ   r[   r]   r   r   Tr   r   r   )"rz   r{   r!   r   classic_parityr   r   r   r   r   r   r   r   rt   position_embeddingsr   r   r   r   r|   r   r   	norm_in_tr   r   
ModuleListlayerslayers_tdictupdaterangeappendr   r   )&r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rY   r\   rZ   r[   r   r]   
hidden_dimr   kwargs_commonkwargs_classic_encoderkwargs_cross_encoderidxr   r   r   r{     s   
"



	
z CrossTransformerEncoder.__init__c                 C   s  |j \}}}}t||||j| j}t|d}t|d}| |}|| j|  }|j \}}}t|d}| ||||j}	t|	d}	| |}|| j|	  }t	| j
D ]*}
|
d | jkri| j|
 |}| j|
 |}qQ|}| j|
 ||}| j|
 ||}qQt|d|d}t|d}||fS )Nzb c fr t1 -> b (t1 fr) czb c t2 -> b t2 czt2 b c -> b t2 cr   zb (t1 fr) c -> b c fr t1)t1zb t2 c -> b c t2)r   r0   r
   r   r   r   r   _get_pos_embeddingr   r   r   r   r   r   )r   r   xtr   r   FrrW   
pos_emb_2drX   pos_embr  old_xr   r   r   r     s0   






zCrossTransformerEncoder.forwardc                 C   s   | j dkrt| jd }t||||| jd}|S | j dkrL| jr=t||||| j| j| j	| j
d | j
d | j
d d
}|S t||||| j| jdd	}|S | j d
krctj||d}| |d d d f }|S )Nr   r   )r   r
   r   r   r   r   )r
   r   r4   r5   r6   r7   r8   F)r
   r   r4   r5   r   r	   )r   r=   	randranger   r   r   trainingrC   r   r   r   r   r   r   )r   r   r   r   r
   r   r  r   r   r   r   r    sF   
 

z*CrossTransformerEncoder._get_pos_embeddingc                 C   s,   t |  | jd}| jd ur| j|d< |S )N)paramsr   r   )list
parametersr   r   )r   groupr   r   r   make_optim_group  s   

z(CrossTransformerEncoder.make_optim_group)r   r   r   r!   r   r?   rL   tpOptionalr  r{   r   r  r  r   r   r   r   r   r     s    	
 y%r   c                       s@   e Zd Z								d	 fdd	Z				d
ddZ  ZS )r   r   TFNc                    s   t    |
d usJ d|| _tjj|||d| _tjj|||d| _tjj|||d| _tj	|| _
tj|||| _tj	|| _|	| _|
| _d S )Nzsanity check)bias)rz   r{   r   r   r|   r   r   r   vr   	attn_dropproj	proj_dropr   r   )r   	embed_dimr   r   r  add_bias_kvadd_zero_attnkdimvdimr   r   r   r   r   r{     s   

zMultiheadAttention.__init__c                 C   s  | j s|ddd}|ddd}|ddd}|j\}}	}
|j\}}}
| |||	| j|
| j dddd}|dd}| |||| j|
| j dddd}|dd}| |||| j|
| j dddd}|dd}| j	r|d u s~J t
|||| j	d}n
t||||| jd}||| j|	|
| j }|dd||	|
}| |}| |}| j s|ddd}|d fS )Nr   r   r      )r]   )r   )r   permuter   r   rV   r   flattenr   r  r   dynamic_sparse_attentionscaled_dot_product_attentionr  r&   r  r  )r   querykeyvaluekey_padding_maskr   r   average_attn_weightsr   N_qr   N_kr   r   r  r   r   r   r   r     s@   

zMultiheadAttention.forward)r   TFFNNFN)NTNTr   r   r   r   r   r     s    r   c                 C   sF   ddl m} | |dd  } || |dd|}tjj|d}|S )Nr   )masked_matmulr   rJ   r   )xformers.opsr(  r:   r&   r   r|   
functionalsoftmax)r   r   att_maskr(  attr   r   r   scaled_query_key_softmax#  s
   r.  c                 C   s"   t | ||d}||}|| }|S )N)r,  )r.  )r   r   r  r,  r   r-  yr   r   r   r   +  s   r   c                 C   sD   t d| |}t j|| gdd}|jdd}|ddd  S )Nzbtf,bfhi->bhtir   r   r   r   r   )r   einsumr   argmaxr  byte
contiguous)r   Rqqbucketsr   r   r   _compute_buckets2  s   r7  Tc                 C   s   ddl m}m} d}d}	dd | ||fD \} }}t + tjd| jd ||	d	 | jd
}
t| |
}t||
}|||||\}}W d    n1 sKw   Y  || |||||S )Nr   )find_locations!sparse_memory_efficient_attention    r   c                 S   s   g | ]}|  qS r   )r3  )rg   r   r   r   r   ri   ?  s    z,dynamic_sparse_attention.<locals>.<listcomp>r   r   r   r	   )	r)  r8  r9  r   no_gradrandnr   r
   r7  )r!  r"  r#  r]   infer_sparsity	attn_biasr8  r9  n_hashes	proj_sizer4  bucket_query
bucket_keyrow_offsetscolumn_indicesr   r   r   r  :  s   
 


r  )r   r   r   )r   r   )r   r   r1   r   r2   )TN)&r=   typingr  r   torch.nnr|   torch.nn.functionalr*  r   numpyr<   r#   einopsr   r!   r   r0   rL   r?   r   rC   rE   rd   rs   Modulert   r   	GroupNormr   TransformerEncoderLayerr   r   r   r   r.  r   r7  r  r   r   r   r   <module>   sv   

*	

-7%m  HN