o
    ei#                     @   s^   d Z ddlZddlZddlmZ dd ZG dd dejZG dd	 d	eZ	d
d Z
dd ZdS )z
A convenience wrapper for word embeddings retrieved out of
HuggingFace transformers (e.g. BERT)

Authors
* Artem Ploujnikov 2021
    N)nnc                 C   s   t |  dS Nr   )range)count r   ]/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/wordemb/transformer.py_last_n_layers   s   r   c                       s~   e Zd ZdZdZdZd fdd	ZdddZd	d
 Zdd Z	dd Z
dd Zdd Zdd ZdddZdd Zdd Z  ZS )TransformerWordEmbeddingsa=
  A wrapper to retrieve word embeddings out of a pretrained Transformer model
    from HuggingFace Transformers (e.g. BERT)

    Arguments
    ---------
    model: str|nn.Module
        the underlying model instance or the name of the model
        to download

    tokenizer: str|transformers.tokenization_utils_base.PreTrainedTokenizerBase
        a pretrained tokenizer - or the identifier to retrieve
        one from HuggingFace

    layers: int|list
        a list of layer indexes from which to construct an embedding or the number of layers

    device: str
        a torch device identifier. If provided, the model
        will be transferred onto that device

    Example
    -------
    NOTE: Doctests are disabled because the dependency on the
    HuggingFace transformer library is optional.

    >>> from transformers import AutoTokenizer, AutoModel # doctest: +SKIP
    >>> from speechbrain.wordemb.transformer import TransformerWordEmbeddings
    >>> model_name = "bert-base-uncased" # doctest: +SKIP
    >>> tokenizer = AutoTokenizer.from_pretrained(
    ...    model_name, return_tensors='pt') # doctest: +SKIP
    >>> model = AutoModel.from_pretrained(
    ...     model_name,
    ...     output_hidden_states=True) # doctest: +SKIP
    >>> word_emb = TransformerWordEmbeddings(
    ...     model=model,
    ...     layers=4,
    ...     tokenizer=tokenizer
    ... ) # doctest: +SKIP
    >>> embedding = word_emb.embedding(
    ...     sentence="THIS IS A TEST SENTENCE",
    ...     word="TEST"
    ... ) # doctest: +SKIP
    >>> embedding[:8] # doctest: +SKIP
    tensor([ 3.4332, -3.6702,  0.5152, -1.9301,  0.9197,  2.1628, -0.2841, -0.3549])
    >>> embeddings = word_emb.embeddings("This is cool") # doctest: +SKIP
    >>> embeddings.shape # doctest: +SKIP
    torch.Size([3, 768])
    >>> embeddings[:, :3] # doctest: +SKIP
    tensor([[-2.9078,  1.2496,  0.7269],
        [-0.9940, -0.6960,  1.4350],
        [-1.2401, -3.8237,  0.2739]])
    >>> sentences = [
    ...     "This is the first test sentence",
    ...     "This is the second test sentence",
    ...     "A quick brown fox jumped over the lazy dog"
    ... ]
    >>> batch_embeddings = word_emb.batch_embeddings(sentences) # doctest: +SKIP
    >>> batch_embeddings.shape # doctest: +SKIP
    torch.Size([3, 9, 768])
    >>> batch_embeddings[:, :2, :3] # doctest: +SKIP
    tensor([[[-5.0935, -1.2838,  0.7868],
             [-4.6889, -2.1488,  2.1380]],

            [[-4.4993, -2.0178,  0.9369],
             [-4.1760, -2.4141,  1.9474]],

            [[-1.0065,  1.4227, -2.6671],
             [-0.3408, -0.6238,  0.1780]]])
    z5'word' should be either a word or the index of a word   Nc                    s   t    |s
| j}t|trt|n|}t|| _t|tr3|d u r%|}t	|}t|tr2t
|}n	|d u r<t| j|| _|| _|d urR|| _| j|| _d S | jj| _d S N)super__init__DEFAULT_LAYERS
isinstanceintr   listlayersstr
_get_model_get_tokenizer
ValueErrorMSG_model	tokenizerdeviceto)selfr   r   r   r   	__class__r   r   r   \   s(   




z"TransformerWordEmbeddings.__init__c                 C   s   |r|  ||S | |S )a&  Retrieves a word embedding for the specified word within
        a given sentence, if a word is provided, or all word embeddings
        if only a sentence is given

        Arguments
        ---------
        sentence: str
            a sentence
        word: str|int
            a word or a word's index within the sentence. If a word
            is given, and it is encountered multiple times in a
            sentence, the first occurrence is used

        Returns
        -------
        emb: torch.Tensor
            the word embedding
        )	embedding
embeddings)r   sentencewordr   r   r   forwardt   s
   z!TransformerWordEmbeddings.forwardc                 C   s   | j j|dd}t  | jdi | |}W d   n1 s"w   Y  t|tr3| ||}nt|t	r;|}nt
| jt|j}| |||jdd}|S )a  Retrieves a word embedding for the specified word within
        a given sentence

        Arguments
        ---------
        sentence: str
            a sentence
        word: str|int
            a word or a word's index within the sentence. If a word
            is given, and it is encountered multiple times in a
            sentence, the first occurrence is used

        Returns
        -------
        emb: torch.Tensor
            the word embedding
        ptreturn_tensorsNr   )dimr   )r   encode_plustorchno_gradr   
_to_devicer   r   _get_word_idxr   r   MSG_WORDstackhidden_states_get_word_vectormean)r   r!   r"   encodedoutputidxstatesword_embeddingr   r   r   r      s   



z#TransformerWordEmbeddings.embeddingc                 C   s   | j j|dd}t  | jdi | |}W d   n1 s"w   Y  tjdd t| D | j	d}t
|j}| ||S )a  
        Returns the model embeddings for all words
        in a sentence

        Arguments
        ---------
        sentence: str
            a sentence

        Returns
        -------
        emb: torch.Tensor
            a tensor of all word embeddings

        r$   r%   Nc                 S   s   g | ]
\}}|d ur|qS r   r   ).0r4   word_idr   r   r   
<listcomp>   s
    z8TransformerWordEmbeddings.embeddings.<locals>.<listcomp>)r   r   )r   r(   r)   r*   r   r+   tensor	enumerateword_idsr   r.   r/   _get_hidden_states)r   r!   r2   r3   token_ids_wordr5   r   r   r   r       s   

z$TransformerWordEmbeddings.embeddingsc                 C   sf   | j j|ddd}t  | jdi | |}W d   n1 s#w   Y  t|j}| |S )a  Returns embeddings for a collection of sentences

        Arguments
        ---------
        sentences: List[str]
            a list of strings corresponding to a batch of
            sentences

        Returns
        -------
        emb: torch.Tensor
            a (B x W x E) tensor
            B - the batch dimensions (samples)
            W - the word dimension
            E - the embedding dimension
        Tr$   )paddingr&   Nr   )	r   batch_encode_plusr)   r*   r   r+   r.   r/   r=   )r   	sentencesr2   r3   r5   r   r   r   batch_embeddings   s   

z*TransformerWordEmbeddings.batch_embeddingsc                    s    fdd|  D S )Nc                    s   i | ]
\}}|  |qS r   )_tensor_to_device)r7   keyvaluer   r   r   
<dictcomp>   s    z8TransformerWordEmbeddings._to_device.<locals>.<dictcomp>)items)r   r2   r   rF   r   r+      s   
z$TransformerWordEmbeddings._to_devicec                 C   s   t |tjr|| jS |S r   )r   r)   Tensorr   r   )r   rE   r   r   r   rC      s   z+TransformerWordEmbeddings._tensor_to_devicec                 C   s   | d|S )N )splitindex)r   sentr"   r   r   r   r,      s   z'TransformerWordEmbeddings._get_word_idxc                 C   sF   || j  d }|d ur|| }|S |d d ddd d f }|S )Nr      )r   sumsqueeze)r   r5   r>   r3   r   r   r   r=      s   z,TransformerWordEmbeddings._get_hidden_statesc                 C   s6   t tt| |kd | j}| ||S r   )	r)   
from_numpynpwherearrayr<   r   r   r=   )r   r2   r5   r4   r>   r   r   r   r0      s   z*TransformerWordEmbeddings._get_word_vectorc                 C   s   || _ | j|| _| S )z3Transfers the model to the specified PyTorch device)r   r   r   )r   r   r   r   r   r     s   zTransformerWordEmbeddings.to)NNNr   )__name__
__module____qualname____doc__r-   r   r   r#   r   r    rB   r+   rC   r,   r=   r0   r   __classcell__r   r   r   r   r	      s    F
" 
r	   c                       s$   e Zd ZdZdZ fddZ  ZS )MissingTransformersErrorz5Thrown when HuggingFace Transformers is not installedz-This module requires HuggingFace Transformersc                    s   t  | j d S r   )r   r   MESSAGErF   r   r   r   r     s   z!MissingTransformersError.__init__)rV   rW   rX   rY   r\   r   rZ   r   r   r   r   r[     s    r[   c                 C   s2   zddl m} |j| ddW S  ty   t w )z5Tries to retrieve a pretrained model from Huggingfacer   )	AutoModelT)output_hidden_states)transformersr]   from_pretrainedImportErrorr[   )
identifierr]   r   r   r   r     s   r   c                 C   s.   zddl m} || W S  ty   t w )z9Tries to retrieve a pretrained tokenizer from HuggingFacer   )AutoTokenizer)r_   rc   r`   ra   r[   )rb   rc   r   r   r   r     s   r   )rY   numpyrS   r)   r   r   Moduler	   	Exceptionr[   r   r   r   r   r   r   <module>   s     {	
