o
    wi#                     @   s  d dl Zd dlZd dlZd dlmZ zd dlmZ W n e	y)   d dlm
Z Y nw d dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZ ed
dG dd dZdededejfddZdededefddZdedefddZ G dd de!Z"dS )    N)	dataclass)_read_fileobject)_validate_fileobject_and_memmap)LogisticRegression)Pipeline)StandardScaler)ConfidenceConfigConfidenceMethodConfigget_confidence_aggregation_bankget_confidence_measure_bank)
HypothesisT)frozenc                   @   s<   e Zd ZU eed< eed< eed< eed< defddZdS )	ConfidenceSpecexclude_blankaggregationconfidence_typealphareturnc              	   C   sJ   | j dkrd}d}d}n	| j d\}}}t| j| jt||| j|ddS )aI  Converts confidence spec to the confidence config.

        Internally, the tuning procedure uses this "spec" objects as they
        are more aligned with how things are implemented. But when it's time
        to save the models or call transcribe, we need to use the proper
        object of type ``ConfidenceConfig``.
        max_probtsallislin_)nameentropy_typer   entropy_norm)r   r   
method_cfg)r   splitr   r   r   r	   r   )selfr   r   r    r   l/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/asr/models/confidence_ensemble.pyto_confidence_config0   s   
z#ConfidenceSpec.to_confidence_configN)	__name__
__module____qualname__bool__annotations__strfloatr   r    r   r   r   r   r   )   s   
 r   
hypothesisr   r   c                 C   s$  t | jtrKg }| jD ]'}|D ]"}|s||d  q|d  |d jd d kr1||d  qq|s<||d  t|}tj	 rJ| }n/| j
}tj	 rW| }|rx|jdd}|||jd d k }|jd dkrw|dd }n|}t|d   tdstj|dd}|S )a  Returns logprobs from the hypothesis object with optional blanks filter.

    This function supports both CTC and Transducer hypotheses. Will place the
    logprobs on GPU if it's available.

    Args:
        hypothesis: generated hypothesis as returned from the transcribe
            method of the ASR model.
        exclude_blank: whether to filter out all ``<blank>`` tokens.

    Returns:
        torch.Tensor: of shape [S, V], where S is (filtered) sequence length and
        V is the vocabulary size.
    r      )dimN      ?)
isinstance
alignmentslistappenditemshapetorchstackcudais_available
y_sequenceargmaxallcloseexpsumtensorlog_softmax)r(   r   filtered_logprobs	alignment
align_elemlogprobslabelsr   r   r   get_filtered_logprobsJ   s:   



rC   confidence_cfgc           	      C   s   t | |j}|jd }t |j }|jjdkrd}d}nd|jj d|jj }|jj	}t
 | }|||||d  }|S )aU  Computes confidence score of the full utterance from a given hypothesis.

    This is essentially a re-implementation of the built-in confidence
    computation in NeMo. The difference is that we aggregate full-utterance
    scores, while core functionality only supports word and token level
    aggregations.

    Args:
        hypothesis: generated hypothesis as returned from the transcribe
            method of the ASR model.
        confidence_cfg: confidence config specifying what kind of
            method/aggregation should be used.

    Returns:
        float: confidence score.

    r)   r   r,   entropy_r   )vt)rC   r   r2   r
   r   r   r   r   r   r   r   cpur1   )	r(   rD   r>   
vocab_size	aggr_func	conf_typer   	conf_func
conf_valuer   r   r   compute_confidencey   s   

rN   	file_pathc           	         s  t j| std|  h d G  fdddtj}zt  td zt	| dO}t
|| dd8}t|tr?|d	 }t|tr`t	|d}|| }W d   n1 sZw   Y  n|| }W d   n1 spw   Y  W d   n1 sw   Y  t|tstd
|j D ]\}}t|ttfstdt| qW n tjtfy } ztd| d}~ww W d   |W S 1 sw   Y  |W S  ty } z	tdt| d}~ww )ab  
    Safely load a joblib file containing a scikit-learn pipeline.

    Args:
        file_path: Path to the joblib file

    Returns:
        Pipeline: A scikit-learn pipeline object

    Raises:
        ValueError: If the file doesn't exist or contains unauthorized content
        SecurityError: If the file contains potentially malicious content
    zModel file not found: >   numpy.dtypenumpy.ndarraysklearn.pipeline.Pipeline*sklearn.preprocessing._data.StandardScaler1sklearn.linear_model._logistic.LogisticRegressionnumpy._picklec                       s   e Zd Z fddZ  ZS )z-safe_joblib_load.<locals>.RestrictedUnpicklerc                    sN   | d| }| v r|dkrdd l }t||S t ||S td| d)N.rU   r   zUnauthorized class z in joblib file)numpygetattrsuper
find_classSecurityError)r   moduler   
class_pathnp)ALLOWED_CLASSES	__class__r   r   rZ      s   
z8safe_joblib_load.<locals>.RestrictedUnpickler.find_class)r!   r"   r#   rZ   __classcell__r   r_   )r`   r   RestrictedUnpickler   s    rc   ignorerbN)	mmap_moder   z,Loaded model must be a scikit-learn PipelinezUnauthorized pipeline step: zFailed to safely load model: )ospathexists
ValueErrorpickle	Unpicklerwarningscatch_warningssimplefilteropen_validate_joblib_filer-   tupler&   loadr   named_stepsitemsr   r   typeUnpicklingErrorAttributeErrorr[   	Exception)	rO   rc   rawfstreamfmodel	step_namestep_objer   rb   r   safe_joblib_load   sT   	





r   c                   @   s   e Zd ZdZdS )r[   z-Custom exception for security-related errors.N)r!   r"   r#   __doc__r   r   r   r   r[      s    r[   )#os.pathrg   rk   rm   dataclassesr   joblib.numpy_pickle_utilsr   rq   ImportErrorr   r3   sklearn.linear_modelr   sklearn.pipeliner   sklearn.preprocessingr   5nemo.collections.asr.parts.utils.asr_confidence_utilsr   r	   r
   r   +nemo.collections.asr.parts.utils.rnnt_utilsr   r   r$   TensorrC   r'   rN   r&   r   ry   r[   r   r   r   r   <module>   s*    /"K