o
    ig.                     @   s$  U d dl Z d dlmZ d dlmZmZmZmZmZm	Z	 ddl
mZ ddlmZ ddlmZmZ eeef Zeeee ef Zeeee ef Zdd	d
dZeeeeeef f ed< dddded ed d	dZeeeeeef f ed< eded fed ed ed ed e	eed ed d	ddedededededededefddZeded fed ed ed ed ed e	eed d	d dedededededededefd!d"Zed#ed ed e	eed d	d$dededededef
d%d&ZG d'd( d(eZ g d)Z!dS )*    N)defaultdict)DictListOptionalTupleUnioncast   )get_array_ops)registry)FloatsXd	Generator        T      ?)L2L2_is_weight_decay	grad_clipSGD_DEFAULTSgMbP?g?g+?g:0yE>r   r   )
learn_ratebeta1beta2epsr   r   r   ADAM_DEFAULTSzRAdam.v1r   r   r   r   r   )r   r   r   r   r   r   use_averagesr   c                C   s   t | |||||||dd	S )NT)r   r   r   r   r   r   r   	use_radam	Optimizer)r   r   r   r   r   r   r   r    r   D/home/ubuntu/.local/lib/python3.10/site-packages/thinc/optimizers.pyRAdam      r   zAdam.v1)r   r   r   r   r   r   r   c                C   s   t | |||||||dd	S )NF)r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   Adam8   r    r!   zSGD.v1)r   r   r   r   c             	   C   s   t | |||dd|dS )Nr   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   SGDQ   s   	r"   c                   @   sp  e Zd ZU dZeeef ed< eeef ed< eeeef  ed< ee	e
f ed< eeef ed< eeef ed< eed< eed	< eed
< eed< eed< eed< eed< eed< eeee   ed< g dZed ed ed ed ed ddddd	ededededededededefddZdd Zdd Zddd eee	f d!ed"ed#efd$d%Zd&d' Zd(d) Zd*S )+r   zDo various flavours of stochastic gradient descent, with first and
    second order momentum. Currently support 'vanilla' SGD, Adam, and RAdam.
    mom1mom2averages	schedules	nr_update	last_seenr   r   b1b2r   r   r   r   _radam_buffer)r#   r$   r%   r&   r'   r(   r   r   r)   r*   r   r   r   r   r+   r   r   TF)r   r   r   r   r   r   r   r   r   c          
      C   s   i | _ i | _|ri | _nd| _i | _tt| _tt| _| d| | d| | d| | d| | d| | d| || _	|	| _
dd	 td
D | _dS )al  
        Initialize an optimizer.

        learn_rate (float): The initial learning rate.
        L2 (float): The L2 regularization term.
        beta1 (float): First-order momentum.
        beta2 (float): Second-order momentum.
        eps (float): Epsilon term for Adam etc.
        grad_clip (float): Gradient clipping.
        use_averages (bool): Whether to track moving averages of the parameters.
        use_radam (bool): Whether to use the RAdam optimizer.
        L2_is_weight_decay (bool): Whether to interpret the L2 parameter as a
            weight decay term, in the style of the AdamW optimizer.
        Nr   r   r)   r*   r   r   c                 S   s   g | ]}g d qS ))NNNr   ).0_r   r   r   
<listcomp>   s    z&Optimizer.__init__.<locals>.<listcomp>
   )r#   r$   r%   r&   r   intr'   r(   _set_attr_or_scheduler   r   ranger+   )
selfr   r   r   r   r   r   r   r   r   r   r   r   __init__   s"   

zOptimizer.__init__c              
   C   s   t |tttfrt| || d S t |trt|}|| j|< zt| |t| W d S  t	t
fyH } zd| dt| d| }t|d }~ww )NzInvalid schedule for 'z' (z)
)
isinstancefloatboolr0   setattrlistiterr&   nextStopIteration	TypeErrortype
ValueError)r3   namevalueeerrr   r   r   r1      s   

zOptimizer._set_attr_or_schedulec              	   C   sN   | j  D ]\}}zt|}W n ty   t| |}Y nw t| || qd S )N)r&   itemsr;   r<   getattrr8   )r3   keyschedulerA   r   r   r   step_schedules   s   zOptimizer.step_schedulesr   )lr_scalerF   weightsgradientrI   c                C   sN  t |dk r
||fS t|}| j|  d7  < | j| }| jdkr+| js+|| j| 7 }| jr5||| j}| jrE| ||||||\}}n'| j	dkr\| j
dkr\| ||||||\}}n| j
dkrct||| j | 8 }|d9 }| jdkr| jr||| j | j | 8 }| jdur|| jvr|j|jdd| j|< || j| || ||fS )zCall the optimizer with weights and a gradient. The key is the
        identifier for the parameter, usually the node ID and parameter name.
        r	   r   r   Nfloat32)dtype)lenr
   r'   r   r   r   clip_gradientr   _radamr)   r*   _adamNotImplementedErrorr   r%   allocshapeupdate_averages)r3   rF   rJ   rK   rI   opsnr_updr   r   r   __call__   s8   





zOptimizer.__call__c                 C   s  || j vr||j| j |< || jvr||j| j|< |||j}|||j}| j| | j | | j| d}	| j| j| jg| j	d| j
d}
d}|	d |	d }}|
d \}}||9 }|d| |d	  7 }||9 }|d| | 7 }|	d
  d7  < |
d t|	d
 d  }|	d
 |d kr|d |d	 }}na|	d
 |d< ||	d
  }d	d|  d }|d	|	d
  | d|   }||d< |dkrtd| |d  |d  |d	  | | |d	  d||	d
    }n|rdd||	d
    }nd}||d	< |dkr+|
d dkr||
d  |
d  | 7 }|j||
d  }|| |
d  ||  7 }n$|dkrO|
d dkrD||
d  |
d  | 7 }|| |
d  | 7 }|||j|||jfS )N)stepexp_avg
exp_avg_sqr   )lrbetasr   weight_decaybufferTrZ   r[   r]   r	      rY   r_   r/   r         r   r^   r\   r   )r#   alloc1fsizer$   	reshape1fr'   r   r)   r*   r   r+   r0   mathsqrtxp	reshape_frT   )r3   rV   rJ   gradrI   rF   rW   
weights_1Dgradient_1Dstategroupdegenerated_to_sgdrZ   r[   r   r   bufferedN_sma	step_sizebeta2_t	N_sma_maxdenomr   r   r   rP      s   


	

zOptimizer._radamc                 C   s   | ||j}| ||j}|| jvr||j| j|< || jvr*||j| j|< | j| }	| j| }
| j}| j}d||  }d||  }| j|d  | }| j}|	|||	|
||||| \}}}	}
|	| j|< |
| j|< |
||j|
||jfS )Nr   g      ?)rf   re   r#   rd   r$   r)   r*   r   r   adamrj   rT   )r3   rV   rJ   rK   rI   rF   rW   rl   rm   r#   r$   r)   r*   fix1fix2r\   r   r   r   r   rQ   H  s,   





zOptimizer._adamN)__name__
__module____qualname____doc__r   KeyTr   __annotations__r   strr   r0   r6   r7   r   	__slots__r   
FloatOrSeqr4   r1   rH   r   rX   rP   rQ   r   r   r   r   r   e   sv   
 	

.

)Nr   )r!   r   r"   r   r   r   )"rg   collectionsr   typingr   r   r   r   r   r   backendsr
   configr   typesr   r   r0   r   r~   r6   r   IntOrSeqr   r7   r   r   
optimizersr   r!   r"   objectr   __all__r   r   r   r   <module>   s   
    		 