o
    i                     @   sD   d dl mZmZ ddlmZ dddZdd Zd	d
 ZdddZdS )    )Modelnormal_init   )registry皙?c              	   C   s.   t dtt| |||dd d d dd|id}|S )Nprecomputable_affine)nOnInFnP)Wbpaddropout_rate)initdimsparamsattrs)r   forwardr   )r   r	   r
   r   dropoutmodel r   R/home/ubuntu/.local/lib/python3.10/site-packages/spacy/ml/_precomputable_affine.pyPrecomputableAffine   s   
r   c                    s    d d d dd jjjd d   dd	}jj   fd
|dd  d ||jd f}jjdd|d<  fdd}||fS )Nr
   r   r   r	   r   r      F)zerosT)trans2outr   c                    s8  | \}}|j dksJ |jd ksJ |j|jd ks#J |jdt|| | }||jd  f}d|jdd ||jd  f} d}|  f}j||jd  f|}jj||d	d
}|f}|d}d| ||jd fS )N   r   r   r   r   r   )axis)r   r   r   r   Ttrans1)r   r   r   r   r   )	ndimshapeinc_grad&_backprop_precomputable_affine_paddingreshapesum	transposeopsgemm)dY_idsdYidsXfWopfidXfdWopfir   Xr   r
   r	   r   r   r   r   backward"   s"   
"
zforward.<locals>.backward)	get_dim	get_paramr)   alloc2fr#   r*   r&   xpsqueeze)r   r3   is_trainYfr4   r   r2   r   r      s   




$.%r   c           	      C   sl   |j d }| d}| d}| d}| jj|dk dd}| jj||||| dd}|d	|||fS )
Nr   r
   r   r   fdtypeTr    r   )r#   r5   r)   asarrayr*   r&   )	r   r,   r-   nBr
   r   r   maskd_padr   r   r   r%   J   s   



r%   Nc                    s   drd rdS dddd}j|}j jd}j}t||jt	|j
d | d	}t||jdd	}d| d
  d| |jdfdd}||j
jdd|j7 }|j|dd}|jd|fdd}||j
jjdd|jd|j7 } fdd}	d}
d}d}d }d
  t|D ]A}|	||}jj
|}jj
|}t|d |
kr|jj
| }d| qt||kr |8  d
  q dS dS )a  This is like the 'layer sequential unit variance', but instead
    of taking the actual inputs, we randomly generate whitened data.

    Why's this all so complicated? We have a huge number of inputs,
    and the maxout unit makes guessing the dynamics tricky. Instead
    we set the maxout weights to values that empirically result in
    whitened outputs given whitened inputs.
    r   Nr
   r   r   r	   r   g      ?)meanr   r   i  r<   r=   r   i  ig        )locscalesizec                    s    |d d }jj| jd  fdd}||jd   f}j||  | ||jd f}| 7 }j|}dkrQj|d S ||dk S )Nr   r<   r=   r   )	predictr)   allocr#   r&   scatter_addflattenr?   maxout)r-   tokvecshiddensvectorsr   r   r
   r   r   r   r   rI      s   zinit.<locals>.predictg{Gz?
   )	has_paramr6   anyr5   r)   alloc4fr7   r   r#   floatr8   sqrt	set_paramrJ   randomuniformr?   normalrG   r&   copyrangevarrC   abs)r   r3   Yr	   r   r   r)   r-   rN   rI   tol_vartol_meant_maxt_iacts1r^   rC   r   rQ   r   r   a   sP   	



$
r   )r   )NN)		thinc.apir   r   utilr   r   r   r%   r   r   r   r   r   <module>   s    
8