o
    ½e¦i`  ã                   @   sn   d Z ddlmZmZmZmZ ddlZddlZdeeee f fdd„Z	deej
j fdd	„ZG d
d„ dƒZdS )zaModels and tooling for natural language processing using spaCy

Authors
* Sylvain de Langen 2024
é    )ÚIterableÚIteratorÚListÚUnionNÚsentencec                 C   s   t | tƒr| S d | ¡S )ax  Ensures that a sentence is a `str` rather than a list of `str` tokens to
    be passed to spaCy pipelines correctly.

    Arguments
    ---------
    sentence: str or list of str
        Sentence to return or list of tokens.

    Returns
    -------
    str
        The sentence, returned from the `sentence` argument as-is or joined with
        spaces from a list of tokens.ú )Ú
isinstanceÚstrÚjoin)r   © r   ú`/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/spacy/nlp.pyÚ_as_sentence   s   

r   Údocsc                 C   s   dd„ | D ƒS )a>  Returns a batch of list of lemmas from a list of Doc (as returned by the
    pipeline).

    Arguments
    ---------
    docs: iterable of Doc
        Documents, typically as returned by `nlp.pipe`.

    Returns
    -------
    list of list of str
        For each sentence, the sequence of extracted lemmas as `str`s.c                 S   s   g | ]	}d d„ |D ƒ‘qS )c                 S   s   g | ]}|j ‘qS r   )Úlemma_)Ú.0Útokr   r   r   Ú
<listcomp>/   s    z._extract_lemmas.<locals>.<listcomp>.<listcomp>r   )r   Údocr   r   r   r   /   s    z#_extract_lemmas.<locals>.<listcomp>r   )r   r   r   r   Ú_extract_lemmas"   s   r   c                   @   sŠ   e Zd ZdZdejjfdd„Zedd„ ƒZ	de
ee eee  f deejj fd	d
„Zde
ee eee  f deee  fdd„ZdS )ÚSpacyPipelineaV  Wraps a `spaCy pipeline <https://spacy.io/usage/processing-pipelines>`_
    with methods that makes it easier to deal with SB's typical sentence format,
    and adds some convenience functions if you only care about a specific task.

    Arguments
    ---------
    nlp : spacy.language.Language
        spaCy text processing pipeline to use.Únlpc                 C   s
   || _ d S )N)r   )Úselfr   r   r   r   Ú__init__<   s   
zSpacyPipeline.__init__c                 O   s   t tj| g|¢R i |¤ŽƒS )aR  Create a pipeline by loading a model using `spacy.load`.
        Unlike other toolkits, you must explicitly download the model if you
        want to use a remote model (e.g. `spacy download fr_core_news_md`)
        rather than just specifying a HF hub name.

        .. note::
            If you only need a subset of modules enabled in the pipeline,
            e.g. for lemmatization, consider
            `excluding <https://spacy.io/usage/processing-pipelines#disabling>_`
            using the `exclude=[...]` argument.

        Arguments
        ---------
        name: str | Path
            Package name or model path.
        *args
            Extra positional arguments passed to `spacy.load`.
        **kwargs
            Extra keyword arguments passed to `spacy.load`.

        Returns
        -------
        New SpacyPipeline
        )r   ÚspacyÚload)ÚnameÚargsÚkwargsr   r   r   Ú	from_name?   s   zSpacyPipeline.from_nameÚinputsÚreturnc                 C   s   | j  tt|ƒ¡S )aQ  Processes a batch of sentences into an iterator of spaCy documents.

        Arguments
        ---------
        inputs: list of sentences (str or list of tokens)
            Sentences to process, in the form of batches of lists of tokens
            (list of str) or a str.
            In the case of token lists, tokens do *not* need to be already
            tokenized for this specific sequence tagger, and they will be joined
            with spaces instead.

        Returns
        -------
        iterator of spacy.tokens.Doc
            Iterator of documents for the passed sentences.)r   ÚpipeÚmapr   ©r   r   r   r   r   Ú__call__\   s   zSpacyPipeline.__call__c                 C   s   t | |ƒƒS )a„  Lemmatize a batch of sentences by processing the input sentences,
        discarding other irrelevant outputs.

        Arguments
        ---------
        inputs: list of sentences (str or list of tokens)
            Sentences to lemmatize, in the form of batches of lists of tokens
            (list of str) or a str.
            In the case of token lists, tokens do *not* need to be already
            tokenized for this specific sequence tagger, and they will be joined
            with spaces instead.

        Returns
        -------
        list of list of str
            For each sentence, the sequence of extracted lemmas as `str`s.)r   r#   r   r   r   Ú	lemmatizeq   s   zSpacyPipeline.lemmatizeN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   ÚlanguageÚLanguager   Ústaticmethodr   r   r   r	   r   ÚtokensÚDocr$   r%   r   r   r   r   r   2   s    	
ÿ

þÿ
þr   )r)   Útypingr   r   r   r   r   Úspacy.tokensr	   r   r-   r.   r   r   r   r   r   r   Ú<module>   s    