o
    ߥi                     @   sN   d dl Z d dlZd dlZd dlZd
ddZd
ddZdd ZG dd	 d	ZdS )    Nc                 C   sP   i }t  |d< tj |d< t |d< | d ur&| jdkr&t j| |d< |S Nrng_state_torchrng_state_nprng_state_rndcudarng_state_torch_cuda)torchget_rng_statenprandom	get_stategetstatetyper   )devicerandom_states r   X/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/peer/sas_utils.pyget_random_states   s   r   c                 C   sZ   t | d  tj| d  t| d  |d ur)|jdkr+t j| d  d S d S d S r   )r   set_rng_stater
   r   	set_statesetstater   r   )r   r   r   r   r   set_random_states   s   r   c                 C   s  | d u rd S ddg}t | r&t |  rd|d< t |  r%d|d< nTt| tu rztt| D ]+}t | | r]t | |  rL|d  d7  < t | |  r]|d  d7  < q2|d dkrl|d  d7  < |d dkrz|d  d7  < t	|dkr|S d S )Nr      
   )
r   	is_tensorisnananyisinfr   tuplerangelensum)dataresultir   r   r   check_nan_inf%   s,   
r%   c                   @   s&   e Zd ZdddZdd Zdd ZdS )	SequenceSideInfoNc                    s   |d ur| _ nddlm} |d _ tjj  _ fddt	 j j
D tfddt	tD  _dd D }tj|tjd _d S )	Nr   )ElectraTokenizerzgoogle/electra-small-generatorc                    s   g | ]	} j |gqS r   )	tokenizerdecode.0r$   )selfr   r   
<listcomp>L   s    z-SequenceSideInfo.__init__.<locals>.<listcomp>c                    s$   g | ]} | d d dkr|qS )r      ##r   r*   )tokensr   r   r-   Q   s   $ c                 S   s\   g | ]*}|d  dkr|d dkrd n|d d dkr%dt dt|d  nt dt|qS )r   []r.   r/   r      )minr    )r+   tr   r   r   r-   R   s    
dtype)r(   transformersr'   from_pretrainednltktokenizepunktPunktSentenceTokenizersen_tokenizerr   
vocab_sizesetr    ind_subtokensr   tensorint8
len_tokens)r,   r(   r'   tmpr   )r,   r0   r   __init__B   s"   

zSequenceSideInfo.__init__c                    s   | j |}tdd | j|d D  | dd     d< t fddtt	 D }t fddtt	 D }t||fS )Nc                 S   s   g | ]}t |d  qS )r.   )r    )r+   xr   r   r   r-   ]   s    
z1SequenceSideInfo.getSenTokIdx.<locals>.<listcomp>	input_idsr   r   c                    s$   g | ]}|t j | t jd  qS r7   )r
   onesrD   r*   sen_lengthsr   r   r-   d   s    c                    s    g | ]}t j | t jd qS rJ   )r
   arangerD   r*   rL   r   r   r-   h   s    )
r?   r<   r
   arrayr(   batch_encode_plusr!   concatenater   r    )r,   sentence_position_embedding
inputs_strseq_len_total	sentencesidx_senidx_tokr   rL   r   getSenTokIdxZ   s   


zSequenceSideInfo.getSenTokIdxc                    s  d}t  d ttjfrd}t  tjdr4j }tjt	 fdd|D  j
d}ntjt	 fdd  D  j
d}t } jd	 }|d d d|f |d
< |d d d	| d| f |d< dkrtj dd\}}	jt|}
t|
dkrt fdd|
D jdd }n	tj jtjd}d|d d df< |d d d	d f }td	dD ]"}t||k|d d ddf |k}t|dkr n|d	 ||< q||d< j   }||d< |r| D ]
}||  ||< q|S )NFr   Tbatch_decodec                    s    g | ]} | jd  qS r   )rX   shape)r+   	input_str	inputs_idr,   rR   r   r   r-   x   s    z;SequenceSideInfo.generate_seq_side_info.<locals>.<listcomp>)r   c                    s(   g | ]} j| jd  qS rZ   )rX   r(   r)   r[   )r+   	input_orir]   r   r   r-      s    
r    ss_sentence_position_in_sequencer.   ss_token_position_in_sentence)return_inversec                    s   g | ]} |kqS r   r   )r+   st)r^   r   r   r-      s    )axisr7      r2   ss_token_position_in_whole_wordss_token_string_length)
isinstancelistr
   ndarrayr   rC   hasattrr(   rY   rO   r   numpydictr[   uniquerB   intersectionrA   r    stackr   charzerosrD   r   logical_andrE   longkeys)r,   rR   r^   is_np_arrayrS   sen_tok_idxside_info_dict
seq_lengthrn   _rB   
idx_tok_wwidx_tok_ww_1r$   posinputs_str_lenkeyr   r]   r   generate_seq_side_infoo   st   
	

z'SequenceSideInfo.generate_seq_side_infoN)__name__
__module____qualname__rG   rX   r   r   r   r   r   r&   @   s    
r&   r   )	r   r;   rl   r
   r   r   r   r%   r&   r   r   r   r   <module>   s    

