o
    yi-                     @   s  d dl Z d dlZd dlmZmZ d dlmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZmZ d dlmZmZ erMd dlmZmZmZmZ nd ZZerWd dlZded	efd
dZ	d1deeef de
eeejf  d	eeef fddZdedede d	eeef fddZ!deded	eeeef fddZ"				d2de	e dede de#de#de#d	eeeef e
e f fd d!Z$d3d"ed#e#d	eed$f fd%d&Z%d'eded	dfd(d)Z&	d1d*eeej'f de
eeejf  d	eeef fd+d,Z(G d-d. d.eZ)G d/d0 d0e)Z*dS )4    N)Counterdefaultdict)AnyCallableDictListOptionalSetTupleUnion)Tensor)
DataLoaderDataset)_TQDM_AVAILABLE_TRANSFORMERS_AVAILABLE)AutoModelForMaskedLMAutoTokenizerPreTrainedModelPreTrainedTokenizerBaseattention_maskreturnc                 C   sD   d| dddf< | d  dd}d| t| d |f< | S )a(  Process attention mask to be zero for special [CLS] and [SEP] tokens as they're not included in a
    calculation for BERT score.

    Args:
        attention_mask: An attention mask to be returned, for example, by a `transformers` tokenizer.

    Return:
        A processed attention mask.
    r   Ng?)cumsumargmaxtorcharangesizelong)r   sep_token_position r   h/home/ubuntu/.local/lib/python3.10/site-packages/torchmetrics/functional/text/helper_embedding_metric.py*_process_attention_mask_for_special_tokens"   s   r!   batchdevicec                 C   sj   t | d d  }| d ddd|f |}| d ddd|f |}| ||d | S )z|Helper function that trims model inputs to the longest sequence within the batch and put the input on the
    proper device.r      	input_idsNr%   r   )intsummaxitemtoupdate)r"   r#   max_lenr%   r   r   r   r    _input_data_collator4   s
   r.   model_output
target_lenc                 C   s|   t | j}||d  |d< tj| tj|| jd| jgdd} tj|d |d |jd|j}tj||gdd}| |fS )zSHelper function that pads the model output and attention mask to the target length.   )dtype)dimr   r$   )listshaper   catzerosr2   r+   r#   )r/   r   r0   zeros_shaper7   r   r   r    _output_data_collator@   s   
"r9   r%   c                 C   s(   | d }| | } || }| ||fS )z=Sort tokenized sentence from the shortest to the longest one.r$   )r(   argsort)r%   r   sorted_indicesr   r   r    _sort_data_according_lengthL   s   
r<      TFtext	tokenizer
max_length
truncationsort_according_lengthown_tokenizerc              
   C   s   |s|| d||dd}nz|| |}W n t y& } zt d| d}~ww |r>t|d |d \}}	}
||	d}||
fS |d |d d}d}
||
fS )	a  Default text pre-processing function using `transformers` `AutoTokenizer` instance.

    Args:
        text:
            An iterable of sentences.
        tokenizer:
            Either `AutoTokenizer` instance from `transformers` package, or a user's own tokenizer.
        max_length:
            A maximum sequence length.
        truncation:
            An indication of whether tokenized sequences should be padded only to the length of the longest sequence.
        sort_according_length:
            An indication of whether tokenized sequences should be sorted from shortest to longest. This is appropriate
            to do for leveraging dynamic padding during embedding calculation and thereby to hasten inference.
        own_tokenizer:
            An indication of whether a non-default user's own tokenizer is used.

    Return:
        A dictionary of tokenized sentences including input_ids and attention_mask.

    Raises:
        BaseException:
            If a tokenization with a user's own tokenizer is not successful.
    r@   pt)paddingr@   rA   return_tensorsz!Tokenization was not successful: Nr%   r   r&   )BaseExceptionr<   )r>   r?   r@   rA   rB   rC   tokenized_dataer%   r   sorting_indices
input_dictr   r   r    _preprocess_textT   s&    


rL   
dataloaderverboseztqdm.auto.tqdmc                 C   s   |rt j | S | S )zHelper function returning either the dataloader itself when `verbose = False`, or it wraps the dataloader with
    `tqdm.auto.tqdm`, when `verbose = True` to display a progress bar during the embbeddings calculation.)tqdmauto)rM   rN   r   r   r    _get_progress_bar   s   rQ   outputc                 C   sb   |j dd \}}t| j dkp| j d |kp| j d |k}|r/td| d| d| j  d	dS )
z2Check if the shape of the user's own model output.Nr1      r   r$   zVThe model output must be `Tensor` of a shape `[batch_size, seq_len, model_dim]` i.e. [z, z. , `model_dim`], but got .)r5   len
ValueError)rR   r%   bsseq_leninvalid_out_shaper   r   r    _check_shape_of_model_output   s   *rZ   model_name_or_pathc                 C   s.   t | }t| }|  || ||fS )ag  Load HuggingFace `transformers`' tokenizer and model. This function also handle a device placement.

    Args:
        model_name_or_path:
            A name or a model path used to load `transformers` pretrained model.
        device:
            A device to be used for calculation.

    Return:
        Initialized `transformers`' tokenizer and model.
    )r   from_pretrainedr   evalr+   )r[   r#   r?   modelr   r   r    _load_tokenizer_and_model   s
   


r_   c                   @   s   e Zd ZdZdeddfdee dedede	ee eege
eeef eeeef ee f f f d	ed
eeeef  ddfddZdedeeef fddZdefddZdeeef fddZdefddZededefddZdS )TextDatasetzoPyTorch dataset class for storing tokenized sentences and other properties used for BERT score
    calculation.r=   FNr>   r?   r@   preprocess_text_fnidf
tokens_idfr   c                 C   st   ||||}t |tr|\| _| _n|| _| jd jd | _t|| _|| _i | _	|r8|dur1|n| 
 | _	dS dS )aX  
        Args:
            text:
                An iterable of sentences.
            tokenizer:
                `AutoTokenizer` instance from `transformers` package.
            max_length:
                A maximum sequence length.
            preprocess_text_fn:
                A function used for processing the input sentences.
            idf:
                An indication of whether calculate token inverse document frequencies to weight the model embeddings.
            tokens_idf:
                Inverse document frequencies (these should be calculated on reference sentences).
        r%   r$   N)
isinstancetupler>   rJ   r5   r@   rU   num_sentencesrb   rc   _get_tokens_idf)selfr>   r?   r@   ra   rb   rc   _textr   r   r    __init__   s   

zTextDataset.__init__idxc                    sd    j d |d d f } j d |d d f }||d} jr0t fdd| D }||d< |S )Nr%   r   r&   c                    s   g | ]} j | qS r   )rc   ).0	input_idxrh   r   r    
<listcomp>   s    z+TextDataset.__getitem__.<locals>.<listcomp>input_ids_idf)r>   rb   r   tensortolist)rh   rk   r%   r   inputs_dictrp   r   rn   r    __getitem__   s   
zTextDataset.__getitem__c                 C   s   | j S N)rf   rn   r   r   r    __len__   s   zTextDataset.__len__c                    sR   t  }t j jd D ]}|| qt j}| fdd| D  |S )zCalculate token inverse document frequences.

        Return:
            A python dictionary containing inverse document frequences for token ids.
        r%   c                    s*   i | ]\}}|t  jd  |d   qS )r$   mathlogrf   )rl   rk   
occurrencern   r   r    
<dictcomp>   s   * z/TextDataset._get_tokens_idf.<locals>.<dictcomp>)r   map_set_of_tokensr>   r,   r   _get_tokens_idf_default_valueitems)rh   token_countertokensrc   r   rn   r    rg      s   
zTextDataset._get_tokens_idfc                 C   s   t | jd d S )z9Helper function that ensures `defaultdict` to be pickled.r$   rw   rn   r   r   r    r~      s   z)TextDataset._get_tokens_idf_default_valuer%   c                 C   s   t |  S )zAReturn set of tokens from the `input_ids` :class:`~torch.Tensor`.)setrr   )r%   r   r   r    r}      s   zTextDataset._set_of_tokens)__name__
__module____qualname____doc__rL   r   strr   r'   r   r   r   r   r
   r   boolfloatrj   rt   rv   rg   r~   staticmethodr	   r}   r   r   r   r    r`      s:    4	

&	r`   c                   @   s@   e Zd ZdZ		ddedededeeee	f  ddf
d	d
Z
dS )TokenizedDatasetzHThe child class of `TextDataset` class used with already tokenized data.FNr%   r   rb   rc   r   c                 C   s~   t tg dt||}|d| _t|| _t| jd | _| jd j	d | _
|| _i | _|r=|dur6|n|  | _dS dS )ah  
        Args:
            input_ids: Input indexes
            attention_mask: Attention mask
            idf:
                An indication of whether calculate token inverse document frequencies to weight the model embeddings.
            tokens_idf:
                Inverse document frequencies (these should be calculated on reference sentences).
        )r%   r   rJ   rJ   r%   r$   N)dictzipr<   poprJ   r.   r>   rU   rf   r5   r@   rb   rc   rg   )rh   r%   r   rb   rc   r>   r   r   r    rj     s   
zTokenizedDataset.__init__)FN)r   r   r   r   r   r   r   r   r'   r   rj   r   r   r   r    r      s    r   ru   )r=   TTF)F)+rx   oscollectionsr   r   typingr   r   r   r   r   r	   r
   r   r   r   torch.utils.datar   r   torchmetrics.utilities.importsr   r   transformersr   r   r   r   rO   r!   r   r#   r.   r'   r9   r<   r   rL   rQ   rZ   PathLiker_   r`   r   r   r   r   r    <module>   sn   (


" 
 6

P