o
    Qi                     @   sx   d dl Zd dlmZ d dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ dd	 Zd
d ZdddZdddZdS )    N)
csr_matrix)diags)
dok_matrix)pairwise_distances)get_process_memory)sent_to_word_contexts_matrixc                    s8   t |  d }t fdd|jd D |jd< |S )Nr   c                    $   g | ]}|d kr
d nd|   qS r       .0valphar   D/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/word/_pmi.py
<listcomp>      $ z_as_diag.<locals>.<listcomp>)r   tolistnpasarraydata)pxr   px_diagr   r   r   _as_diag	   s   $r   c           	      C   sl   | j \}}|  \}}| j}t||kd }|| }|| }|| }t|}t|||ff||fd}|S )Nr   )shape)r   nonzeror   r   wherelogr   )	exp_pmimin_exp_pminmrowscolsr   indicesexp_pmi_r   r   r   _logarithm_and_ppmi   s   

r'           r
   c                 C   s   d|  k rdksJ  J t | jdd|   d}|du r2t | jdd|   d}|dk r@|| }||  }| |   }t|d}t||}|||}	|dkr^dnt |}
t|	|
}|||fS )az  
    :param X: scipy.sparse.csr_matrix
        (word, contexts) sparse matrix
    :param py: numpy.ndarray
        (1, word) shape, probability of context words.
    :param min_pmi: float
        Minimum value of pmi. all the values that smaller than min_pmi
        are reset to zero.
        Default is zero.
    :param alpha: float
        Smoothing factor. pmi(x,y; alpha) = p_xy /(p_x * (p_y + alpha))
        Default is 0.0
    :param beta: float
        Smoothing factor. pmi(x,y) = log ( Pxy / (Px x Py^beta) )
        Default is 1.0

    Returns
    ----------
    pmi : scipy.sparse.csr_matrix
        (word, contexts) pmi value sparse matrix
    px : numpy.ndarray
        Probability of rows (items)
    py : numpy.ndarray
        Probability of columns (features)

    Usage
    -----
        >>> pmi, px, py = pmi_memory_friendly(X, py=None, min_pmi=0, alpha=0, beta=1.0)
    r   r
   axisN)r   r   sumreshaper   dotexpr'   )Xpymin_pmir   betar   pxyr   py_diagr   r    pmir   r   r   r6   !   s     



r6         ?Fc                    s  d|  k rdksJ  J | j dd|    d}|du r,| j dd|    d}| |    }|jd |jd ks>J |dk rL|| }||   }t| d }t| d }	tdd |jd D |jd< t fdd|	jd D |	jd< |||	}
|dkrdnt	|}t
|
j|kd }t|
j}|
 \}}|
j}t|D ]0\}}|r|d	 dkrtd
d| |jd  dt  ddd t|| ||| || f< q|rtdddd |||fS )a  
    :param X: scipy.sparse.csr_matrix
        (word, contexts) sparse matrix
    :param py: numpy.ndarray
        (1, word) shape, probability of context words.
    :param min_pmi: float
        Minimum value of pmi. all the values that smaller than min_pmi
        are reset to zero.
        Default is zero.
    :param alpha: float
        Smoothing factor. pmi(x,y; alpha) = p_xy /(p_x * (p_y + alpha))
        Default is 0.0
    :param beta: float
        Smoothing factor. pmi(x,y) = log ( Pxy / (Px x Py^beta) )
        Default is 1.0
    :param verbose: Boolean
        If True, verbose mode on

    Returns
    ----------
    pmi : scipy.sparse.dok_matrix
        (word, contexts) pmi value sparse matrix
    px : numpy.ndarray
        Probability of rows (items)
    py : numpy.ndarray
        Probability of columns (features)

    Usage
    -----
        >>> pmi, px, py = pmi_memory_friendly(X, py=None, min_pmi=0, alpha=0, beta=1.0)
    r   r
   r)   r+   Nc                 S   s    g | ]}|d kr
d nd| qS r	   r   r   r   r   r   r      s     z'pmi_memory_friendly.<locals>.<listcomp>c                    r   r	   r   r   r   r   r   r      r   i'  z%computing pmi {:.3} %  mem={} Gb    d   z%.3fT )flushendzcomputing pmi was done{}z                              )r:   )r,   r-   r   r   r   r   r   r   r.   r/   r   r   r   	enumerateprintformatr   r   )r0   r1   r2   r   r3   verboser   r4   r   r5   r   r    r%   pmi_dokr#   r$   r   _n_idxidxr   r   r   pmi_memory_friendlyX   s<   ! $
 
rC   )Nr   r(   r
   )Nr   r(   r7   F)numpyr   scipy.sparser   r   r   sklearn.metricsr   soynlp.utilsr   soynlp.vectorizerr   r   r'   r6   rC   r   r   r   r   <module>   s    
7