o
    :i                     @   s  d dl Z d dlZd dlZd dlZd dlmZmZmZ zd dlZW n e	y; Z
 ze de
 d W Y dZ
[
ndZ
[
ww d dlZd dlmZ d dlmZ ej rTdZnejj r]dZndZd	d
 Zdd ZeejdrxdZd dlmZmZ n
dZd dlmZmZ eeeZeeeZdededefddZdededefddZG dd de Z!G dd dejj"Z#ee$e#f Z%G dd dZ&G dd dej'j(Z)ded e*fd!d"Z+G d#d$ d$ej'j(Z,G d%d& d&ejj"Z-d'd(d)e$dej.fd*d+Z/G d,d- d-ej'j(Z0G d.d/ d/ejj"Z1	d{ded0e$d1e$d2e2def
d3d4Z3defd5d6Z4ded7e*fd8d9Z5G d:d; d;ej'j(Z6G d<d= d=ej"Z7G d>d? d?ej'j(Z8d@dA Z9G dBdC dCej'j(Z:	d|dedEe$dFe$dGe$dHe;f
dIdJZ<dedefdKdLZ=G dMdN dNejj"Z>G dOdP dPej"Z?G dQdR dRej'j(Z@G dSdT dTej"ZAG dUdV dVej'j(ZBG dWdX dXejj"ZCG dYdZ dZejj"ZDG d[d\ d\ej'j(ZEG d]d^ d^ejj"ZFG d_d` d`ejj"ZGdefdadbZHdefdcddZIG dedf dfej'j(ZJG dgdh dhejj"ZKdidj ZLdkdl ZMdmdn ZNdodp ZOdqdr ZPdsdt ZQdudv ZRdwdx ZSeTdykrDe U Ve jW eXdz eYdz eR  eQ  eL  eM  eN  eP  eO  eS  dS dS )}    N)OptionalTupleUnionzFailed import k2 with error zr. Swoosh functions will fallback to PyTorch implementation, leading to slower speed and higher memory consumption.)Tensorcudampscpuc                   C   s(   t dkr	tj S t dkrtj S dS )Nr   r   r   )DEVICE_TYPEtorchr   memory_allocatedr   current_allocated_memory r   r   6/home/ubuntu/LuxTTS/zipvoice/models/modules/scaling.pyget_memory_allocated+   s
   

r   c                    s    fdd}|S )Nc                    s    s| S t d| S )N)device_type)r	   )funccuda_amp_deprecateddecr   r   	decorator4   s   z'custom_amp_decorator.<locals>.decoratorr   )r   r   r   r   r   r   custom_amp_decorator3   s   r   
custom_fwdT)
custom_bwdr   Fxyreturnc                 C   s0   t | |}t | | }|t t |  S N)r
   maxabslog1pexp)r   r   	max_valuediffr   r   r   logaddexp_onnxE   s   r#   c                 C   s6   t j rt | |S t j rt| |S t | |S r   )r
   jitis_scripting	logaddexponnxis_in_onnx_exportr#   )r   r   r   r   r   r&   Q   s
   


r&   c                   @   sd   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zddd defddZdS )PiecewiseLinearz
    Piecewise linear function, from float to float, specified as nonempty list of (x,y)
    pairs with the x values in order.  x values <[initial x] or >[final x] are map to
    [initial y], [final y] respectively.
    c                 G   s   t |dksJ t |t |dkr"t|d tr"t|d j| _ndd |D | _| jD ]\}}t|ttfs>J t|t|ttfsKJ t|q-tt | jd D ]!}| j|d  d | j| d ksvJ || j| | j|d  fqUd S )N   r   c                 S   s    g | ]\}}t |t |fqS r   )float.0r   r   r   r   r   
<listcomp>p        z,PiecewiseLinear.__init__.<locals>.<listcomp>)	len
isinstancer)   listpairsr+   inttyperange)selfargsr   r   ir   r   r   __init__k   s   "zPiecewiseLinear.__init__c                 C   s   dt | jdd  dS )NzPiecewiseLinear(r*   ))strr3   r7   r   r   r   __str__|   s   zPiecewiseLinear.__str__c                 C   s   || j d d kr| j d d S || j d d kr | j d d S | j d \}}tdt| j D ]&}| j | \}}||krP||krP||| ||  ||     S ||}}q/J )Nr   r*   r;   )r3   r6   r0   )r7   r   cur_xcur_yr9   next_xnext_yr   r   r   __call__   s    zPiecewiseLinear.__call__c                    s   t  fdd| jD  S )Nc                    s   g | ]
\}}||  fqS r   r   r,   alphar   r   r.      s    z+PiecewiseLinear.__mul__.<locals>.<listcomp>)r)   r3   )r7   rF   r   rE   r   __mul__   s   zPiecewiseLinear.__mul__c                    sP   t  ttfrt fdd| jD  S |  \} tdd t|j jD  S )Nc                    s    g | ]}|d  |d   fqS r   r*   r   )r-   pr   r   r   r.      r/   z+PiecewiseLinear.__add__.<locals>.<listcomp>c                 S   s(   g | ]\}}|d  |d |d  fqS rH   r   r-   spxpr   r   r   r.      s   ( )r1   r+   r4   r)   r3   get_common_basiszipr7   r   sr   rJ   r   __add__   s   zPiecewiseLinear.__add__c                 C   sH   t |ttfrtd|f}| j|dd\}}tdd t|j|jD  S )Nr   Tinclude_crossingsc                 S   *   g | ]\}}|d  t |d |d fqS rH   )r   rK   r   r   r   r.         * z'PiecewiseLinear.max.<locals>.<listcomp>r1   r+   r4   r)   rN   rO   r3   rP   r   r   r   r      s   zPiecewiseLinear.maxc                 C   sN   t |ts
t |trtd|f}| j|dd\}}tdd t|j|jD  S )Nr   TrS   c                 S   rU   rH   minrK   r   r   r   r.      rV   z'PiecewiseLinear.min.<locals>.<listcomp>rW   rP   r   r   r   rY      s   zPiecewiseLinear.minc                 C   s   | j |j kS r   )r3   )r7   otherr   r   r   __eq__   s   zPiecewiseLinear.__eq__FrI   rT   c                    sj  t  tsJ t ttdd jD dd  jD  }fdd|D } fdd|D }|rg }tt|d D ]I}|| || k||d  ||d  kkrt|| ||  }t||d  ||d   }	|||	  }
|| |
||d  ||    }|	| q=t|dkrtt|| }fdd|D } fdd|D }tt
|| tt
|| fS )	aI  
        Returns (self_mod, p_mod) which are equivalent piecewise linear
        functions to self and p, but with the same x values.

          p: the other piecewise linear function
          include_crossings: if true, include in the x values positions
              where the functions indicate by this and p crosss.
        c                 S   s   g | ]\}}|qS r   r   )r-   r   _r   r   r   r.          z4PiecewiseLinear.get_common_basis.<locals>.<listcomp>c                       g | ]} |qS r   r   r-   r   r>   r   r   r.      r]   c                    r^   r   r   r_   rI   r   r   r.      r]   r*   r   c                    r^   r   r   r_   r>   r   r   r.      r]   c                    r^   r   r   r_   r`   r   r   r.      r]   )r1   r)   r5   sortedsetr3   r6   r0   r   appendrO   )r7   rI   rT   x_valsy_vals1y_vals2extra_x_valsr9   diff_cur	diff_nextposextra_x_valr   )rI   r7   r   rN      s*   	(( 
z PiecewiseLinear.get_common_basisN)F)__name__
__module____qualname____doc__r:   r?   rD   rG   rR   r   rY   r[   boolrN   r   r   r   r   r)   d   s    r)   c                       sR   e Zd ZdZdddef fddZdefdd	Zd
d Zdd Z	dd Z
  ZS )ScheduledFloataK  
    This object is a torch.nn.Module only because we want it to show up in
    [top_level module].modules(); it does not have a working forward() function.
    You are supposed to cast it to float, as in, float(parent_module.whatever), and use
    it as something like a dropout prob.

    It is a floating point value whose value changes depending on the batch count of the
    training loop.  It is a piecewise linear function where you specify the (x,y) pairs
    in sorted order on x; x corresponds to the batch index.  For batch-index values
    before the first x or after the last x, we just use the first or last y value.

    Example:
       self.dropout = ScheduledFloat((0.0, 0.2), (4000.0, 0.0), default=0.0)

    `default` is used when self.batch_count is not set or not in training mode or in
     torch.jit scripting mode.
            defaultrt   c                   s*   t    d | _d | _|| _t| | _d S r   )superr:   batch_countnamert   r)   schedule)r7   rt   r8   	__class__r   r   r:      s
   
zScheduledFloat.__init__r   c                 C   s"   d| j  dt| jjdd  S )Nzbatch_count=z, schedule=r*   r;   )rv   r=   rx   r3   r>   r   r   r   
extra_repr   s    zScheduledFloat.extra_reprc                 C   sn   | j }|d u s| jrtj stj rt| jS | | j }t		 dk r5t
d| j d| j  d|  |S )Ng-C6*?zScheduledFloat: name=z, batch_count=z, ans=)rv   trainingr
   r$   r%   
is_tracingr+   rt   rx   randomloggingdebugrw   )r7   rv   ansr   r   r   	__float__   s&   

zScheduledFloat.__float__c                 C   sD   t |ts
t |trt| j| | jdS t| j|j | j|j dS Nrs   )r1   r+   r4   rq   rx   rt   r7   r   r   r   r   rR     s
   zScheduledFloat.__add__c                 C   sJ   t |ts
t |trt| j|| jdS t| j|jt| j|jdS r   )r1   r+   r4   rq   rx   r   rt   r   r   r   r   r     s   zScheduledFloat.max)rl   rm   rn   ro   r+   r:   r=   r{   r   rR   r   __classcell__r   r   ry   r   rq      s    rq   c                   @   s0   e Zd ZdZdefddZdedefddZd	S )
CutoffEstimatorz
    Estimates cutoffs of an arbitrary numerical quantity such that a specified
    proportion of items will be above the cutoff on average.

      p is the proportion of items that should be above the cutoff.
    rI   c                 C   s   || _ d| _d| _d| _d S Nr   )rI   countcount_abovecutoffr7   rI   r   r   r   r:   !  s   
zCutoffEstimator.__init__r   r   c                 C   sp   || j k}|  jd7  _|r|  jd7  _| j| j }|| j }|dk|kr6t|}|| | j d|   | _ |S )z8
        Returns true if x is above the cutoff.
        r*   r   )r   r   r   rI   r   )r7   r   r   cur_pdelta_pqr   r   r   rD   *  s   

zCutoffEstimator.__call__N)rl   rm   rn   ro   r+   r:   rp   rD   r   r   r   r   r     s    	r   c                   @   s8   e Zd ZdZededefddZedefddZd	S )
SoftmaxFunctionz
    Tries to handle half-precision derivatives in a randomized way that should
    be more accurate for training than the default behavior.
    r   dimc                 C   s<   |j |d}t r|tj}| | |j| _|| _|S Nr   )	softmaxr
   is_autocast_enabledtofloat16save_for_backwarddtypex_dtyper   )ctxr   r   r   r   r   r   forward@  s   
zSoftmaxFunction.forwardans_gradc                 C   s|   | j \}tjjtdd( |tj}|tj}|| }|||j| jdd  }|d fW  d    S 1 s7w   Y  d S )NFenabledTr   keepdim)	saved_tensorsr
   ampautocastr	   r   float32sumr   r   r   r   x_gradr   r   r   backwardM  s   $zSoftmaxFunction.backwardN)	rl   rm   rn   ro   staticmethodr   r4   r   r   r   r   r   r   r   :  s    r   r   c                 C   s2   | j rtj stj r| j|dS t| |S r   )requires_gradr
   r$   r%   r}   r   r   apply)r   r   r   r   r   r   X  s   r   c                   @   sH   e Zd ZededededededefddZed	edefd
dZdS )BiasNormFunctionr   bias	log_scalechannel_dimstore_output_for_backpropr   c           	      C   s   |j dksJ |dk r||j  }|| _|| _t|d |j D ]}|d}qtj|| d |ddd |  }|| }| |rD|	 n||	 |	 |	  |S )Nr*   r   r;      Tr         )
ndimr   r   r6   	unsqueezer
   meanr    r   detach)	r   r   r   r   r   r   r\   scalesr   r   r   r   r   f  s$   	
zBiasNormFunction.forwardr   c                 C   s   | j \}}}}| jr|| }n|}| }d|_d|_d|_t % tj|| d | jddd |  }|| }|j	|d W d    n1 sJw   Y  |j
|j
 |j
d d fS )NTr   r   r   gradient)r   r   r   r   r
   enable_gradr   r   r    r   gradflatten)r   r   ans_or_xr   r   r   r   r   r   r   r   r     s    

zBiasNormFunction.backwardN)	rl   rm   rn   r   r   r4   rp   r   r   r   r   r   r   r   _  s"    r   c                       s\   e Zd ZdZ					ddeded	ed
edededdf fddZdedefddZ	  Z
S )BiasNorma  
    This is intended to be a simpler, and hopefully cheaper, replacement for
    LayerNorm.  The observation this is based on, is that Transformer-type
    networks, especially with pre-norm, sometimes seem to set one of the
    feature dimensions to a large constant value (e.g. 50), which "defeats"
    the LayerNorm because the output magnitude is then not strongly dependent
    on the other (useful) features.  Presumably the weight and bias of the
    LayerNorm are required to allow it to do this.

    Instead, we give the BiasNorm a trainable bias that it can use when
    computing the scale for normalization.  We also give it a (scalar)
    trainable scale on the output.


    Args:
       num_channels: the number of channels, e.g. 512.
       channel_dim: the axis/dimension corresponding to the channel,
         interpreted as an offset from the input's ndim if negative.
         This is NOT the num_channels; it should typically be one of
         {-2, -1, 0, 1, 2, 3}.
      log_scale: the initial log-scale that we multiply the output by; this
         is learnable.
      log_scale_min: FloatLike, minimum allowed value of log_scale
      log_scale_max: FloatLike, maximum allowed value of log_scale
      store_output_for_backprop: only possibly affects memory use; recommend
         to set to True if you think the output of this module is more likely
         than the input of this module to be required to be stored for the
         backprop.
    r;         ?            ?Fnum_channelsr   r   log_scale_minlog_scale_maxr   r   Nc                    sT   t t|   || _|| _tt|| _	tt
|| _|| _|| _|| _d S r   )ru   r   r:   r   r   nn	Parameterr
   tensorr   zerosr   r   r   r   )r7   r   r   r   r   r   r   ry   r   r   r:     s   	
zBiasNorm.__init__r   c                 C   s   |j | j | jksJ tj stj rK| j}|dk r!||j7 }| j}t	|d |jD ]}|
d}q,tj|| d |ddd | j  }|| S t| jt| jt| j| jd}t|| j|| j| jS )	Nr   r*   r;   r   Tr   r   )rY   r   r|   )shaper   r   r
   r$   r%   r}   r   r   r6   r   r   r   r    limit_param_valuer+   r   r   r|   r   r   r   )r7   r   r   r   r\   r   r   r   r   r   r     s2   
zBiasNorm.forward)r;   r   r   r   F)rl   rm   rn   ro   r4   r+   rp   r:   r   r   r   r   r   ry   r   r     s.    !r   r   )initial_scaler   c                 O   s   t j|i |}t / |jdd  | 9  < |jdur3tj j|jd|  d|   W d   |S W d   |S 1 s>w   Y  |S )aT  
    Behaves like a constructor of a modified version of nn.Linear
    that gives an easy way to set the default initial parameter scale.

    Args:
        Accepts the standard args and kwargs that nn.Linear accepts
        e.g. in_features, out_features, bias=False.

        initial_scale: you can override this if you want to increase
           or decrease the initial magnitude of the module's output
           (affects the initialization of weight_scale and bias_scale).
           Another option, if you want to do something like this, is
           to re-initialize the parameters.
    Ng皙?)r   Linearr
   no_gradweightr   inituniform_)r   r8   kwargsr   r   r   r   ScaledLinear  s   



r   c                   @   s`   e Zd Zededededededededefd	d
Zededeedddddf fddZ	dS )BalancerFunctionr   min_meanmax_meanmin_rmsmax_rms
grad_scaler   r   c                 C   s8   |dk r	||j 7 }|| _| | ||||||f| _|S r   )r   r   r   config)r   r   r   r   r   r   r   r   r   r   r   r     s   

zBalancerFunction.forwardr   Nc              
      s  | j \}| j\}}}}} zt  tjjtdd |tj}|	 }d|_
 fddt|jD }|d j|dd}	|j|dd}
|	|
|
  jdd	 }|	jdd	 }|
| }||j||d
  }|j||d
}||   }|| }|jt|d |j}|d j|dd jdd	}|||  }|tj}|| |  }||j}W d    n1 sw   Y  W d    n1 sw   Y  W n! ty } ztd| dt|j d W Y d }~nd }~ww |d d d d d d fS )NFr   Tc                    s   g | ]}| kr|qS r   r   )r-   r9   r   r   r   r.   ,  s    z-BalancerFunction.backward.<locals>.<listcomp>r   r   #B;rX   )rY   r   r   z'Caught exception in Balancer backward: , size=, will continue.)r   r   r
   r   r   r   r	   r   r   r   r   r6   r   r   clampsqrtr   logr   	ones_liker   r   	Exceptionr   infor2   r   )r   r   r   r   r   r   r   r   	mean_dimsuncentered_varr   stddevrmsmm_lossrms_clampedr_lossloss	loss_gradloss_grad_rmsx_grad_float
x_grad_moder   r   r   r     sf   
(zBalancerFunction.backward)
rl   rm   rn   r   r   r+   r4   r   r   r   r   r   r   r   r      s*    	(r   c                       sf   e Zd ZdZ						dded	ed
edededededee f fddZdedefddZ	  Z
S )Balancera*  
    Modifies the backpropped derivatives of a function to try to encourage, for
    each channel, that it is positive at least a proportion `threshold` of the
    time.  It does this by multiplying negative derivative values by up to
    (1+max_factor), and positive derivative values by up to (1-max_factor),
    interpolated from 1 at the threshold to those extremal values when none
    of the inputs are positive.

    Args:
           num_channels: the number of channels
           channel_dim: the dimension/axis corresponding to the channel, e.g.
               -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative.
           min_positive: the minimum, per channel, of the proportion of the time
               that (x > 0), below which we start to modify the derivatives.
           max_positive: the maximum, per channel, of the proportion of the time
               that (x > 0), above which we start to modify the derivatives.
           scale_gain_factor: determines the 'gain' with which we increase the
              change in gradient once the constraints on min_abs and max_abs
              are violated.
           min_abs:  the minimum average-absolute-value difference from the mean
               value per channel, which we allow, before we start to modify
               the derivatives to prevent this.
           max_abs:  the maximum average-absolute-value difference from the mean
               value per channel, which we allow, before we start to modify
               the derivatives to prevent this.
         prob: determines the minimum probability with which we modify the
             gradients for the {min,max}_positive and {min,max}_abs constraints,
             on each forward().  This is done randomly to prevent all layers
             from doing it at the same time.
    皙?ffffff?皙?      Y@{Gz?Nr   r   min_positivemax_positivemin_absmax_absr   probc	           	         s^   t    |d u rtdddd}|| _td| _|| _|| _|| _|| _	|| _
|| _|| _d S )N)rr         ?)g     @@g      ?g?rs   r   )ru   r:   rq   r   r   
mem_cutoffr   r   r   r   r   r   r   )	r7   r   r   r   r   r   r   r   r   ry   r   r   r:   x  s   


zBalancer.__init__r   r   c           
   	   C   s   t j s|jr|js|jjdkr| t rt	|S t
| j}t |k rfdd }dd }|t
| j}|t
| j}|t
| j}|t
| j}t
| j}	|j| j | jksZJ t||||||	| jS t	|S )Nr   c                 S   s   d|  S )Ng/v?r   rJ   r   r   r   _abs_to_rms  s   z%Balancer.forward.<locals>._abs_to_rmsc                    s(   dd   fdd}dd|   } || S )Nc                 S   s,   d}t d|  | t d|  |  d S )Ng|=r*          @)mathr   )r   epsr   r   r   _atanh  s   (zFBalancer.forward.<locals>._proportion_positive_to_mean.<locals>._atanhc                    s   d |  S )Ng?=?r   rJ   r  r   r   _approx_inverse_erf  s   
zSBalancer.forward.<locals>._proportion_positive_to_mean.<locals>._approx_inverse_erfr;   r   r   )r   r  r   r  r   _proportion_positive_to_mean  s   z6Balancer.forward.<locals>._proportion_positive_to_mean)r
   r$   r%   r   is_cudadevicer5   r   r   _no_opr+   r   r~   r   r   r   r   r   r   r   r   r   r   )
r7   r   r   r   r  r   r   r   r   r   r   r   r   r     s8   


zBalancer.forward)r   r   r   r   r   N)rl   rm   rn   ro   r4   	FloatLiker   r:   r   r   r   r   r   ry   r   r   X  s4    #	r   limitpenaltyrw   c                 C   s@   |   }|  | dk}||| tj|   }t| ||} | S )aH  
    Returns x unmodified, but in backprop will put a penalty for the excess of
    the absolute values of elements of x over the limit "limit".  E.g. if
    limit == 10.0, then if x has any values over 10 it will get a penalty.

    Caution: the value of this penalty will be affected by grad scaling used
    in automatic mixed precision training.  For this reasons we use this,
    it shouldn't really matter, or may even be helpful; we just use this
    to disallow really implausible values of scores to be given to softmax.

    The name is for randomly printed debug info.
    r   )signr   r   r
   int8	with_loss)r   r  r  rw   x_sign
over_limitaux_lossr   r   r   penalize_abs_values_gt  s
   r  c                 C   s^   | j dkr	|  S | j\}}}| ||| } | d d d d |d f } | j||fks-J | S )Nr   r*   )r   diagr   reshape)r   batchr   r   r   r   _diag  s   
r  
num_groupsc           	      C   s   | j tjksJ | d| jd } | j\}}|| dksJ || }| |||dd} | | jddd } t| dd| }t| }|d 	 ||  }||d d  }|S )a  
    Computes the "whitening metric", a value which will be 1.0 if all the eigenvalues of
        of the centered feature covariance are the same within each group's covariance
        matrix and also between groups.
    Args:
        x: a Tensor of shape (*, num_channels)
     num_groups:  the number of groups of channels, a number >=1 that divides
        num_channels
    Returns:
        Returns a scalar Tensor that will be 1.0 if the data is "perfectly white" and
    greater than 1.0 otherwise.
    r;   r   r*   Tr   r   r   )
r   r
   r   r  r   	transposer   matmulr  r   )	r   r  
num_framesr   channels_per_groupx_covarx_covar_mean_diagx_covarsq_mean_diagmetricr   r   r   _whitening_metric   s   
r"  c                   @   s:   e Zd ZededejdefddZedefddZd	S )
WhiteningPenaltyFunctionr   moduler   c                 C   s   |  | || _|S r   )r   r$  )r   r   r$  r   r   r   r   &  s   
z WhiteningPenaltyFunction.forwardr   c           	      C   s  | j \}| j}zt  tjjtdd |tj	 }d|_
t||j}t dk s1tdkrPtd|j d|j d|jd	  d
| ddt|j 
 |t|jk rn|j|_|d fW  d    W  d    W S |j|_|  |j}|j|tj | d   }|| }|||j d fW  d    W  d    W S 1 sw   Y  W d    n1 sw   Y  W |d fS W |d fS  ty } ztd| dt |j d W Y d }~|d fS d }~ww )NFr   T{Gzt?__main__zWhitening: name=z, num_groups=z,num_channels=r;   z	, metric=z.2fz vs. limit=r   z%Caught exception in Whiten backward: r   r   )!r   r$  r
   r   r   r   r	   r   r   r   r   r"  r  r~   rl   r   r   rw   r   itemr+   whitening_limitmin_probr   max_probr   r   r   normr   r   r   r2   )	r   r   x_origw
x_detachedr!  penalty_gradscaler   r   r   r   r   ,  sb   

!!z!WhiteningPenaltyFunction.backwardN)	rl   rm   rn   r   r   r   Moduler   r   r   r   r   r   r#  %  s
    r#  c                
       sP   e Zd Zdededeeeeef f def fddZde	de	fd	d
Z
  ZS )Whitenr  r(  r   r   c                    s   t t|   |dksJ t|dksJ |dksJ || _|| _|| _t|tr-||f}|\| _| _	d| j  k rE| j	  krEdksHJ  J | j	| _
d| _dS )a  
        Args:
          num_groups: the number of groups to divide the channel dim into before
            whitening.  We will attempt to make the feature covariance
            within each group, after mean subtraction, as "white" as possible,
            while having the same trace across all groups.
         whitening_limit: a value greater than 1.0, that dictates how much
           freedom we have to violate the constraints.  1.0 would mean perfectly
           white, with exactly the same trace across groups; larger values
           give more freedom.  E.g. 2.0.
         prob: the probability with which we apply the gradient modification
           (also affects the grad scale).  May be supplied as a float,
           or as a pair (min_prob, max_prob)

          grad_scale: determines the scale on the gradient term from this object,
            relative to the rest of the gradient on the attention weights.
            E.g. 0.02 (you may want to use smaller values than this if prob is large)
        r*   r   N)ru   r2  r:   r+   r  r(  r   r1   r)  r*  r   rw   )r7   r  r(  r   r   ry   r   r   r:   W  s   
*
zWhiten.__init__r   r   c                 C   s:   t | j}|jrt | jks|dkrt|S t|| S )a  
        In the forward pass, this function just returns the input unmodified.
        In the backward pass, it will modify the gradients to ensure that the
        distribution in each group has close to (lambda times I) as the covariance
        after mean subtraction, with the same lambda across groups.
        For whitening_limit > 1, there will be more freedom to violate this
        constraint.

        Args:
           x: the input of shape (*, num_channels)

        Returns:
            x, unmodified.   You should make sure
        you use the returned value, or the graph will be freed
        and nothing will happen in backprop.
        r   )r+   r   r   r~   r   r
  r#  r   )r7   r   r   r   r   r   r     s   
zWhiten.forward)rl   rm   rn   r4   r  r   r+   r   r:   r   r   r   r   r   ry   r   r2  V  s    (r2  c                   @   s8   e Zd ZedededefddZedefddZd	S )
WithLossr   r   rw   c                 C   sD   |j | _t dk r |d ur |  }td| d|d |S )NgMb`?zWithLoss: name=z, loss-sum=z.3e)r   y_shaper~   r   r'  r   r   )r   r   r   rw   loss_sumr   r   r   r     s
   zWithLoss.forwardr   c                 C   s   |t j| j|j|jdd fS )Nr   r	  )r
   onesr4  r   r	  )r   r   r   r   r   r     s   zWithLoss.backwardN)rl   rm   rn   r   r   r=   r   r   r   r   r   r   r3    
    r3  c                 C   s   t | ||S r   )r3  r   )r   r   rw   r   r   r   r    s   r  c                   @   s8   e Zd ZedededefddZedefddZd	S )
LimitParamValuer   rY   r   c                 C   s&   |  | ||ksJ || _|| _|S r   )r   rY   r   )r   r   rY   r   r   r   r   r     s
   
zLimitParamValue.forwardr   c                 C   sZ   | j \}|tt|dk|| jk dd }|tt|dk || jkdd9 }|d d fS )Nr         r   )r   r
   wherelogical_andrY   r   )r   r   r   r   r   r   r     s   $
zLimitParamValue.backwardN)rl   rm   rn   r   r   r+   r   r   r   r   r   r   r9    r8  r9  333333?rY   r   r   r|   c                 C   s"   |rt   |k rt| ||S | S r   )r~   r9  r   )r   rY   r   r   r|   r   r   r   r     s   r   c                 C   s*   t j s
t j r| S | jdddd S )Nr*   r;   r   r   )r
   r$   r%   r}   chunkrJ   r   r   r   r
    s   r
  c                       s$   e Zd Z fddZdd Z  ZS )Identityc                    s   t t|   d S r   )ru   r?  r:   r>   ry   r   r   r:     s   zIdentity.__init__c                 C   s   t |S r   )r
  r   r   r   r   r     s   zIdentity.forward)rl   rm   rn   r:   r   r   r   r   ry   r   r?    s    r?  c                       s4   e Zd Zdef fddZdedefddZ  ZS )Dropout2rI   c                    s   t    || _d S r   )ru   r:   rI   r   ry   r   r   r:     s   

zDropout2.__init__r   r   c                 C   s   t jjj|t| j| jdS )N)rI   r|   )r
   r   
functionaldropoutr+   rI   r|   r   r   r   r   r     s   zDropout2.forward)rl   rm   rn   r  r:   r   r   r   r   r   ry   r   r@    s    r@  c                   @   s,   e Zd Zeedd Zeedd ZdS )MulForDropout3c                 C   s*   |j rJ || | }| | || _|S r   )r   r   rF   )r   r   r   rF   r   r   r   r   r     s
   

zMulForDropout3.forwardc                 C   s$   | j \}| j| |dk }|d d fS r   )r   rF   r   r   r   r   r     s   
zMulForDropout3.backwardN)rl   rm   rn   r   r   r   r   r   r   r   r   r   rC    s    rC  c                       s8   e Zd Zdedef fddZdedefddZ  ZS )	Dropout3rI   
shared_dimc                    s   t    || _|| _d S r   )ru   r:   rI   rE  )r7   rI   rE  ry   r   r   r:     s   

zDropout3.__init__r   r   c                 C   sh   t | j}| jr|dkrt|S dd|  }t|j}d|| j< tj|d|j	i|k}t
|||}|S )Nr   r   r*   r	  )r+   rI   r|   r
  r2   r   rE  r
   randr	  rC  r   )r7   r   rI   r0  
rand_shapemaskr   r   r   r   r     s   


zDropout3.forward)	rl   rm   rn   r  r4   r:   r   r   r   r   r   ry   r   rD    s    rD  c                   @   <   e Zd ZdZededefddZededefddZd	S )
SwooshLFunctionz;
    swoosh_l(x) =  log(1 + exp(x-4)) - 0.08*x - 0.035
    r   r   c              	   C   s  |j }|jtjkr|tj}tjd|j|jd}d}tjj	t
dd t  | }d|_ t||d ||  d }|sP|W  d    W  d    S |jt|d	 |j}|}d
| d }|| d||   t| }	tdkr|	 dksJ |	 dk sJ |	tj}
| |
 |jtjkst r|tj}|W  d    W  d    S 1 sw   Y  W d    d S 1 sw   Y  d S )Nrr   r6  {GzFr   T      @Q?r   r   r%       o@r&        p@r   r   r
   r   r   r   r   r	  r   r   r	   r   r   r&   r   r   r   	rand_likerl   rY   r   uint8r   r   )r   r   r   zerocoeffr   r   floorceild_scaledd_intr   r   r   r     sB   
	
"zSwooshLFunction.forwardy_gradc                 C   s8   | j \}d}|}d| d }||| d  | }|| S )NrK  r   r%  rN  r   )r   rY  drT  rU  rV  r   r   r   r   D  s   zSwooshLFunction.backwardNrl   rm   rn   ro   r   r   r   r   r   r   r   r   rJ    s    &rJ  c                   @      e Zd ZdedefddZdS )SwooshLr   r   c                 C   p   t j s
t j r!t jd|j|jd}t||d d|  d S dtj	vr+t
|S |js3t|S t|S )Return Swoosh-L activation.rr   r6  rL  {Gz?rM  k2)r
   r$   r%   r}   r   r   r	  r&   sysmodulesrJ  r   r   rb  swoosh_l_forwardswoosh_lr7   r   rS  r   r   r   r   P     



zSwooshL.forwardNrl   rm   rn   r   r   r   r   r   r   r^  O      r^  c                   @   r]  )SwooshLOnnxr   r   c                 C   .   t jd|j|jd}t||d d|  d S )r`  rr   r6  rL  ra  rM  r
   r   r   r	  r#   rg  r   r   r   r   _     zSwooshLOnnx.forwardNri  r   r   r   r   rk  ^  rj  rk  c                   @   rI  )
SwooshRFunctionzo
     swoosh_r(x) =  log(1 + exp(x-1)) - 0.08*x - 0.313261687

    derivatives are between -0.08 and 0.92.
    r   r   c           
   	   C   s  |j }|jtjkr|tj}tjd|j|jd}tjj	t
dd t  | }d|_ t||d d|  d }|sN|W  d    W  d    S |jt|d	 |j}d
}d}|| d||   t| }tdkr| dksyJ | dk sJ |tj}	| |	 |jtjkst r|tj}|W  d    W  d    S 1 sw   Y  W d    d S 1 sw   Y  d S )Nrr   r6  Fr   Tr   ra  tN0z?r   rK  皙?rN  r&  rO  rP  )
r   r   r   rS  r   r   rU  rV  rW  rX  r   r   r   r   l  s@   

"zSwooshRFunction.forwardrY  c                 C   s,   | j \}d}d}||| d  | }|| S )NrK  rq  rN  rZ  )r   rY  r[  rU  rV  r   r   r   r     s
   zSwooshRFunction.backwardNr\  r   r   r   r   ro  e  s    $ro  c                   @   r]  )SwooshRr   r   c                 C   r_  )Return Swoosh-R activation.rr   r6  r   ra  rp  rb  )r
   r$   r%   r}   r   r   r	  r&   rc  rd  ro  r   r   rb  swoosh_r_forwardswoosh_rrg  r   r   r   r     rh  zSwooshR.forwardNri  r   r   r   r   rr    rj  rr  c                   @   r]  )SwooshROnnxr   r   c                 C   rl  )rs  rr   r6  r   ra  rp  rm  rg  r   r   r   r     rn  zSwooshROnnx.forwardNri  r   r   r   r   rv    rj  rv  c                 C   s   t jjtdd1 | t j} | d }d|   | j}t 	|t
dk||}|d|   d W  d    S 1 s<w   Y  d S )NFr   rL  r   infra  rM  r
   r   r   r	   r   r   r    r   r   r;  r+   r   x_offsetlog_sumr   r   r   SwooshLForward     $r|  c                 C   s   t jjtdd1 | t j} | d }d|   | j}t 	|t
dk||}|d|   d W  d    S 1 s<w   Y  d S )NFr   r   rw  ra  rp  rx  ry  r   r   r   SwooshRForward  r}  r~  c                   @   sT   e Zd Zeedededee dededee	 fddZ
eed	efd
dZdS )"ActivationDropoutAndLinearFunctionr   r   r   
activation	dropout_pdropout_shared_dimc                 C   s   |dkr%t |j}|d urd||< dd|  tj||j|jd|k }nd }| |||| || _tj	tj
d}	|	| }
|
|}|d urI|| }tjj|||}|S )Nrr   r*   r   )r	  r   r^  rr  )r2   r   r
   rF  r	  r   r   r  rb  re  rt  r   rA  linear)r   r   r   r   r  r  r  dropout_shapedropout_maskforward_activation_dictactivation_funcr   r   r   r     s&   

z*ActivationDropoutAndLinearFunction.forwardr   c                 C   s   | j }|\}}}}tjtjd}|| j }||\}	}
|d ur#|	| }	|j\}}|	jd }|d|}t|	 |	d|}t||}|d u rKd n|j
dd}||
 }|d ur]|| }|||d d d fS )Nr  r;   r   r   )r   rb  swoosh_l_forward_and_derivswoosh_r_forward_and_derivr  r   r  r
   r  tr   )r   r   savedr   r   r   r  !forward_and_deriv_activation_dictr   r   
func_derivout_channelsin_channelsgweight_derivy_deriv
bias_derivx_derivr   r   r   r     s&   


z+ActivationDropoutAndLinearFunction.backwardN)rl   rm   rn   r   r   r   r   r=   r+   r4   r   r   r   r   r   r   r   r    s&    %r  c                       s\   e Zd ZdZ					ddeded	ed
ededee de	f fddZ
defddZ  ZS )ActivationDropoutAndLineara2  
     This merges an activation function followed by dropout and then a nn.Linear module;
     it does so in a memory efficient way so that it only stores the input to the whole
     module.  If activation == SwooshL and dropout_shared_dim != None, this will be
     equivalent to:
       nn.Sequential(SwooshL(),
                     Dropout3(dropout_p, shared_dim=dropout_shared_dim),
                     ScaledLinear(in_channels, out_channels, bias=bias,
                                  initial_scale=initial_scale))
    If dropout_shared_dim is None, the dropout would be equivalent to
    Dropout2(dropout_p).  Note: Dropout3 will be more memory efficient as the dropout
    mask is smaller.

     Args:
        in_channels: number of input channels, e.g. 256
        out_channels: number of output channels, e.g. 256
        bias: if true, have a bias
        activation: the activation function, for now just support SwooshL.
        dropout_p: the dropout probability or schedule (happens after nonlinearity).
        dropout_shared_dim: the dimension, if any, across which the dropout mask is
             shared (e.g. the time dimension).  If None, this may be less memory
             efficient if there are modules before this one that cache the input
             for their backprop (e.g. Balancer or Whiten).
    Tr^  rr   r;   r   r  r  r   r  r  r  r   c           	         sF   t    t||||d}|j| _| d|j || _|| _|| _d S )Nr   r   r   )	ru   r:   r   r   register_parameterr   r  r  r  )	r7   r  r  r   r  r  r  r   lry   r   r   r:   +  s   


z#ActivationDropoutAndLinear.__init__r   c                 C   s   t j st j sdtjvr3| jdkrt|}n| jdkr#t|}nJ | jt j	j
|| j| jS t|| j| j| jt| j| jS )Nrb  r^  rr  )r
   r$   r%   r}   rc  rd  r  r|  r~  r   rA  r  r   r   r  r   r+   r  r  r   r   r   r   r   G  s&   





z"ActivationDropoutAndLinear.forward)Tr^  rr   r;   r   )rl   rm   rn   ro   r4   rp   r=   r  r   r+   r:   r   r   r   r   r   ry   r   r    s.    r  c                  C   s   dD ]_} t d|   tdd}td}tdd}|| | | 7 }d|_tdddd	d
}tdD ]}||}q4t|}|j|d | dk rTt	|j
|sSJ q| dkrat	|j
|raJ qd S )N)r   r         $@z_test_whiten(): proportion = d      r*   T      @r   r   )r   r      r   r   )r   r   r
   randnr   r2  r6   
randn_liker   allcloser   )
proportionr   	directioncoeffsr   r\   r   rY  r   r   r   _test_whiten_  s(   


r  c                  C   s   t ddd} d}ddt |  || dk  d  }| }d|_t|  dd	d
ddd}t t 	|  |}||}|j
|d td| td| td|j d S )Nr   r*   {Gz?  r   r  r;   Tr   r   rr   )r   r   r   r   r   r   z_test_balancer_sign: x = z_test_balancer_sign: y grad = z_test_balancer_sign: x grad = )r
   arangerF  numelr   r   r   r   r  r  r   printr   )probsNr   r   rY  r   r   r   r   _test_balancer_signy  s&   &	

r  c               	   C   s   t ddd} d}t t |  || d }| }d|_t|  dddd	d
dd}t t |  |}||}|j	|d t
d| t
d| t
d|j d S )Nr   r*   r  r  r;   Trr   r   r   ffffff?)r   r   r   r   r   r   r   z_test_balancer_magnitude: x = z#_test_balancer_magnitude: y grad = z#_test_balancer_magnitude: x grad = )r
   r  r  r  r  r   r   r   r   r   r  r   )
magnitudesr  r   r   rY  r   r   r   r   _test_balancer_magnitude  s(    


r  c                  C   b   t jddt jdd } d| _t }d}t jj|| |dd t jd	d	t jdd } d| _|| }|S 
N
      r   g      @Tgp?r  )atolr  r  )r
   r  doubler   r^  autograd	gradcheckr   r   tolr   r   r   r   _test_swooshl_deriv     r  c                  C   r  r  )r
   r  r  r   rr  r  r  r  r   r   r   _test_swooshr_deriv  r  r  c                  C   s   t jddt jd} |  }d| _d|_| jddd d df    td| j	 t|ddd d df    td	|j	 t 
| j	|j	sIJ d S )
Nr   r  r  Tr*   r   r   z	a grad = z	b grad = )r
   r  float64cloner   r   r   r   r  r   r  abr   r   r   _test_softmax  s     r  c                  C   s<  t d} dD ]
}| |dksJ qt dd} dD ]\}}td|| | ||ks1J || ||fqt dd}g d	}| |}|D ]}t| |||}||}t|| d
k s[J qB| |}|D ]}t| |||}||}t|| d
k s|J qc| | }|D ]}| ||| }||}t|| d
k sJ qd S )Nr   r  )r   r  r  r*   rr   ))r  r  r  )r   r  r  )r   rr   zx, y = )r   g      .@)r=  r   )
r:  rr   r   r   r   r=  r  ?r   r  MbP?)r)   r  r   r   rY   )rI   r   r   r   rd   pqy1y2r   r   r   _test_piecewise_linear  s2   
 


r  c                  C   s  d} d}dD ]}dD ]}dD ]}t |dkrt nt t|ddt| ||d	d
}t| ||d	||d}t  |d j	|j	d d < |rN|d j
|j
d d < W d    n1 sXw   Y  td| }d|_tjt|t|ddsuJ |  }d|_d}	t|	 ||}
t|
}|
j|d t|	 ||}|j|d td| d| d|  td|
 td| tj|
|ddsJ tj|d j	j|j	jddsJ |rtj|d j
j|j
jddsJ td|j td|j dd }||j|jsJ qq
qd S )N      )TF)rr   r  r^  r;   )rI   rE  r   r  )r   r   r  r  r   r  Tr  )r  r   zbias = z, dropout_p = z, activation = zy1 = zy2 = g{Gz?gh㈵>z
x1.grad = z
x2.grad = c                 S   s,   | |   d| d   |d      kS )Nr  r   )r   r   r  r   r   r   isclose.  s   z4_test_activation_dropout_and_linear.<locals>.isclose)r   
Sequentialr^  rr  rD  r   r  r
   r   r   r   r  r   r  ro  r   r~  r  r   manual_seedr  r   r  r   )r  r  r   r  r  m1m2x1x2seedr  rY  r  r  r   r   r   #_test_activation_dropout_and_linear  sr   







  r  r&  r*   r   )r=  T)Zr   r  r~   rc  typingr   r   r   rb  r   r   warningr
   torch.nnr   r   r   is_availabler	   backendsr   r   r   hasattrr   
deprecated	torch.ampr   r   torch.cuda.ampr#   r&   objectr)   r1  rq   r+   r  r   r  Functionr   r4   r   r   r   r   r   r   r   r=   r  r  r"  r#  r2  r3  r  r9  rp   r   r
  r?  r@  rC  rD  rJ  r^  rk  ro  rr  rv  r|  r~  r  r  r  r  r  r  r  r  r  r  rl   	getLoggersetLevelDEBUGset_num_threadsset_num_interop_threadsr   r   r   r   <module>   s   



nD!8RX~
 %1A


	76		JN
I


