o
    i(.                     @   s   d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z
 ddlmZ ddlmZ ddlmZmZmZ dd	 ZG d
d dejZG dd dejjZG dd dejZG dd deZdS )z7Class Declaration of Transformer's Training Subprocess.    N)cuda)	functions)training)	extension)gather_gradsgather_paramsscatter_gradsc              	   C   s   t t}| D ].}t|}|dur&| }||}|t|  |7  < W d   n1 s0w   Y  qtdd t	
|D S )zCalculate the norm of the array.

    Args:
        arr (numpy.ndarray)

    Returns:
        Float: Sum of the norm calculated from the given array.

    Nc                 S   s   g | ]}t |qS  )float).0ir	   r	   d/home/ubuntu/.local/lib/python3.10/site-packages/espnet/nets/chainer_backend/transformer/training.py
<listcomp>'   s    zsum_sqnorm.<locals>.<listcomp>)collectionsdefaultdictr
   r   get_device_from_arrayraveldotintsumsix
itervalues)arrsq_sumxdevsr	   r	   r   
sum_sqnorm   s   


r   c                       2   e Zd ZdZd	 fdd	Zdd Zdd Z  ZS )
CustomUpdatera%  Custom updater for chainer.

    Args:
        train_iter (iterator | dict[str, iterator]): Dataset iterator for the
            training dataset. It can also be a dictionary that maps strings to
            iterators. If this is just an iterator, then the iterator is
            registered by the name ``'main'``.
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters. It can also be a dictionary that maps strings to
            optimizers. If this is just an optimizer, then the optimizer is
            registered by the name ``'main'``.
        converter (espnet.asr.chainer_backend.asr.CustomConverter): Converter
            function to build input arrays. Each batch extracted by the main
            iterator and the ``device`` option are passed to this function.
            :func:`chainer.dataset.concat_examples` is used by default.
        device (int or dict): The destination device info to send variables. In the
            case of cpu or single gpu, `device=-1 or 0`, respectively.
            In the case of multi-gpu, `device={"main":0, "sub_1": 1, ...}`.
        accum_grad (int):The number of gradient accumulation. if set to 2, the network
            parameters will be updated once in twice,
            i.e. actual batchsize will be doubled.

       c                    s>   t t| j||||d || _d| _d| _|| _td dS )zInitialize Custom Updater.)	converterdevicer   Tz&using custom converter for transformerN)	superr   __init__
accum_gradforward_countstartr"   loggingdebug)self
train_iter	optimizerr!   r"   r%   	__class__r	   r   r$   C   s   
zCustomUpdater.__init__c                 C   s   |  d}| d}| }| || j}| jr |j  d| _|j| | j }|	  |  j
d7  _
| j
| jkr;dS d| _
ttdd |jdD }td| t|ratd	 n|  |j  dS )
z/Process main update routine for Custom Updater.mainFr    Nr   c                 S      g | ]}|j qS r	   gradr   pr	   r	   r   r   e       z-CustomUpdater.update_core.<locals>.<listcomp>grad norm={}&grad norm is nan. Do not update model.)get_iteratorget_optimizernextr!   r"   r'   target
cleargradsr%   backwardr&   npsqrtr   paramsr(   infoformatmathisnanwarningupdate)r*   r+   r,   batchr   loss	grad_normr	   r	   r   update_coreO   s*   



zCustomUpdater.update_corec                 C   (   |    | jdkr|  jd7  _dS dS )zUpdate step for Custom Updater.r   r    NrJ   r&   	iterationr*   r	   r	   r   rF   n      
zCustomUpdater.updater    __name__
__module____qualname____doc__r$   rJ   rF   __classcell__r	   r	   r-   r   r   *   s
    r   c                       r   )
CustomParallelUpdatera  Custom Parallel Updater for chainer.

    Defines the main update routine.

    Args:
        train_iter (iterator | dict[str, iterator]): Dataset iterator for the
            training dataset. It can also be a dictionary that maps strings to
            iterators. If this is just an iterator, then the iterator is
            registered by the name ``'main'``.
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters. It can also be a dictionary that maps strings to
            optimizers. If this is just an optimizer, then the optimizer is
            registered by the name ``'main'``.
        converter (espnet.asr.chainer_backend.asr.CustomConverter): Converter
            function to build input arrays. Each batch extracted by the main
            iterator and the ``device`` option are passed to this function.
            :func:`chainer.dataset.concat_examples` is used by default.
        device (torch.device): Device to which the training data is sent. Negative value
            indicates the host memory (CPU).
        accum_grad (int):The number of gradient accumulation. if set to 2, the network
            parameters will be updated once in twice,
            i.e. actual batchsize will be doubled.

    r    c                    sD   ddl m} tt| j||||d || _d| _|| _td dS )z#Initialize custom parallel updater.r   )nccl)r!   devicesz-using custom parallel updater for transformerN)		cupy.cudarX   r#   rW   r$   r%   r&   r(   r)   )r*   train_itersr,   r!   rY   r%   rX   r-   r	   r   r$      s   
zCustomParallelUpdater.__init__c           	   
   C   s  |    | d t| jd  | d}| d }| || jd }| j	| | j
 }|  tjj}| jdur^t| j	}| j|jj|jj|j| jj| jjd|j t| j	| ~|  jd7  _| j| j
krt	 W d   dS d| _ttdd |jdD }td	 | t!"|rt#d
 n|$  | j	%  | jdurt&| j	}| j'|jj|j| jjd|j W d   dS W d   dS 1 sw   Y  dS )z8Process main update routine for Custom Parallel Updater.)rF   Nr   r/   Nr    c                 S   r0   r	   r1   r3   r	   r	   r   r      r5   z5CustomParallelUpdater.update_core.<locals>.<listcomp>Fr6   r7   )(setup_workers_send_messager   Device_devicesr9   r8   r:   r!   _masterr%   r=   Streamnullcommr   reducedataptrsizerX   
NCCL_FLOATNCCL_SUMr   r&   r>   r?   r   r;   r@   r(   rA   rB   rC   rD   rE   rF   r<   r   bcast)	r*   r,   rG   r   rH   null_streamggrI   gpr	   r	   r   rJ      sV   



	



+"z!CustomParallelUpdater.update_corec                 C   rK   )z(Update step for Custom Parallel Updater.r   r    NrL   rN   r	   r	   r   rF      rO   zCustomParallelUpdater.updaterP   rQ   r	   r	   r-   r   rW   u   s
    6rW   c                   @   sL   e Zd ZdZ					dddZdd Zd	d
 Zdd Zdd Zdd Z	dS )VaswaniRulea  Trainer extension to shift an optimizer attribute magically by Vaswani.

    Args:
        attr (str): Name of the attribute to shift.
        rate (float): Rate of the exponential shift. This value is multiplied
            to the attribute at each call.
        init (float): Initial value of the attribute. If it is ``None``, the
            extension extracts the attribute at the first call and uses it as
            the initial value.
        target (float): Target value of the attribute. If the attribute reaches
            this value, the shift stops.
        optimizer (~chainer.Optimizer): Target optimizer to adjust the
            attribute. If it is ``None``, the main optimizer of the updater is
            used.

      N      ?c                 C   s@   || _ |d | | _|d | _|| _|| _|| _d| _d| _dS )z"Initialize Vaswani rule extension.      g      r   N)_attr_d_inv05_warmup_steps_inv15_init_target
_optimizer_t_last_value)r*   attrdwarmup_stepsinitr;   r,   scaler	   r	   r   r$      s   

zVaswaniRule.__init__c                 C   sT   |  |}| jdu r| jd| j  | _| jdur!| || j dS | || j dS )zInitialize Optimizer values.Nrp   )_get_optimizerru   rs   rt   ry   _update_value)r*   trainerr,   r	   r	   r   
initialize   s   


zVaswaniRule.initializec                 C   sF   |  j d7  _ | |}| jt| j d | j | j  }| || dS )zForward extension.r    rq   N)rx   r   rs   minrt   r   )r*   r   r,   valuer	   r	   r   __call__
  s   
zVaswaniRule.__call__c                 C   s    |d| j | _ |d| j| _dS )zSerialize extension.rx   ry   N)rx   ry   )r*   
serializerr	   r	   r   	serialize  s   zVaswaniRule.serializec                 C   s   | j p|jdS )zObtain optimizer from trainer.r/   )rw   updaterr9   )r*   r   r	   r	   r   r     s   zVaswaniRule._get_optimizerc                 C   s   t || j| || _dS )z!Update requested variable values.N)setattrrr   ry   )r*   r,   r   r	   r	   r   r     s   
zVaswaniRule._update_value)ro   NNNrp   )
rR   rS   rT   rU   r$   r   r   r   r   r   r	   r	   r	   r   rn      s    
	rn   c                   @   s    e Zd ZdZdd Zdd ZdS )CustomConverterz\Custom Converter.

    Args:
        subsampling_factor (int): The subsampling factor.

    c                 C   s   dS )zInitialize subsampling.Nr	   rN   r	   r	   r   r$   *  s   zCustomConverter.__init__c                 C   sP   t |dksJ |d \}}tj|ddj}tjdd |D tjd}|||fS )a  Perform subsampling.

        Args:
            batch (list): Batch that will be sabsampled.
            device (chainer.backend.Device): CPU or GPU device.

        Returns:
            chainer.Variable: xp.array that are padded and subsampled from batch.
            xp.array: xp.array of the length of the mini-batches.
            chainer.Variable: xp.array that are padded and subsampled from batch.

        r    r   )paddingc                 S   s   g | ]}|j d  qS )r   )shape)r   r   r	   r	   r   r   A  s    z,CustomConverter.__call__.<locals>.<listcomp>)dtype)lenFpad_sequencere   r>   arrayint32)r*   rG   r"   xsysilensr	   r	   r   r   .  s
   
zCustomConverter.__call__N)rR   rS   rT   rU   r$   r   r	   r	   r	   r   r   "  s    r   )rU   r   r(   rC   numpyr>   r   chainerr   r   r   r   chainer.trainingr   7chainer.training.updaters.multiprocess_parallel_updaterr   r   r   r   StandardUpdaterr   updatersMultiprocessParallelUpdaterrW   	Extensionrn   objectr   r	   r	   r	   r   <module>   s    KdI