o
    i.                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlmZ d d	lmZ d d
lmZ eeZG dd deZdS )    N)Optional)Union)
get_logger)EVALUATION_KIND_METADATA)EVALUATION_SPAN_METADATA)#FAITHFULNESS_DISAGREEMENTS_METADATA)IS_EVALUATION_SPAN)BaseRagasEvaluator)_get_ml_app_for_ragas_tracec                       s   e Zd ZdZdZdZ fddZdee fddZ	d	e
deeeef ee
 f fd
dZdededeee  fddZdedee fddZdd Zdedee fddZdefddZ  ZS )RagasFaithfulnessEvaluatorzA class used by EvaluatorRunner to conduct ragas faithfulness evaluations
    on LLM Observability span events. The job of an Evaluator is to take a span and
    submit evaluation metrics based on the span's attributes.
    ragas_faithfulnessscorec                    sZ   t  | |  | _| jj| jjd| _| jj| jjd| _	| jj
| jjjdd| _dS )a9  
        Initialize an evaluator that uses the ragas library to generate a faithfulness score on finished LLM spans.

        Faithfulness measures the factual consistency of an LLM's output against a given context.
        There are two LLM calls required to generate a faithfulness score - one to generate a set of statements from
        the answer, and another to measure the faithfulness of those statements against the context using natural
        language entailment.

        For more information, see https://docs.ragas.io/en/latest/concepts/metrics/faithfulness/

        The `ragas.metrics.faithfulness` instance is used for faithfulness scores. If there is no llm attribute set
        on this instance, it will be set to the default `llm_factory()` which uses openai.

        :param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and
                                      submitting evaluation metrics.

        Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported.
        )pydantic_objectF)languagecleanN)super__init___get_faithfulness_instanceragas_faithfulness_instanceragas_dependenciesRagasoutputParserStatementsAnswers*llm_output_parser_for_generated_statementsStatementFaithfulnessAnswers(llm_output_parser_for_faithfulness_scoreget_segmenternli_statements_messager   split_answer_into_sentences)selfllmobs_service	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/ddtrace/llmobs/_evaluators/ragas/faithfulness.pyr      s   

z#RagasFaithfulnessEvaluator.__init__returnc                 C   s.   | j jdu rdS | j j}|js| j  |_|S )z
        This helper function ensures the faithfulness instance used in
        ragas evaluator is updated with the latest ragas faithfulness
        instance AND has an non-null llm
        N)r   faithfulnessllmllm_factory)r   r   r"   r"   r#   r   :   s   z5RagasFaithfulnessEvaluator._get_faithfulness_instance
span_eventc                 C   s  |   | _| jsdi fS tdi}tjdddddf\}}}}}}| jjdt|d%}	|	t	d z| jj
|	d|t< | |}
|
du rktd d	|fW | jj|	||||dur]| ndd
d W  d   S |
d }|
d }d|
d }| ||}|du rtd d|fW | jj|	||||dur| ndd
d W  d   S | ||}|du rtd d|fW | jj|	||||dur| ndd
d W  d   S dd |jD |t< | |}t|rtd d|fW | jj|	||||dur| ndd
d W  d   S ||fW | jj|	||||dur*| ndd
d W  d   S | jj|	||||durH| ndd
d w 1 sSw   Y  dS )aS  
        Performs a faithfulness evaluation on a span event, returning either
            - faithfulness score (float) OR
            - failure reason (str)
        If the ragas faithfulness instance does not have `llm` set, we set `llm` using the `llm_factory()`
        method from ragas which defaults to openai's gpt-4o-turbo.
        fail_faithfulness_is_noner%   Nzdd-ragas.faithfulness)ml_appT)spanzYFailed to extract evaluation inputs from span sampled for `ragas_faithfulness` evaluation fail_extract_faithfulness_inputs)
statementsfaithfulness_list)r+   
input_dataoutput_datametadataquestionanswer contextszJFailed to create statements from answer for `ragas_faithfulness` evaluatorstatements_is_nonezAFailed to create faithfulness list `ragas_faithfulness` evaluator#statements_create_faithfulness_listc                 S   s    g | ]}|j d krd|jiqS )r   answer_quote)verdict	statement.0r3   r"   r"   r#   
<listcomp>}   s    z7RagasFaithfulnessEvaluator.evaluate.<locals>.<listcomp>zAScore computation returned NaN for `ragas_faithfulness` evaluatorstatements_compute_score)r   r   r   mathnanr   workflowr
   _set_ctx_itemr   export_spanr   $_extract_evaluation_inputs_from_spanloggerdebugannotatedictsjoin_create_statements_create_verdicts__root__r   _compute_scoreisnan)r   r(   evaluation_metadatar   r2   r3   contextr-   r.   ragas_faithfulness_workflowfaithfulness_inputsr"   r"   r#   evaluateG   s   
	






')z#RagasFaithfulnessEvaluator.evaluater2   r3   c                 C   s   | j da | j j||dd | j||d}	 | jj|}| j|j	d d j
}|d u r9	 W d    d S dd | D }dd |D }| j j|d	 t|ts^	 W d    d S |W  d    S 1 sjw   Y  d S )
Nzdd-ragas.create_statements)r2   r3   )r/   )r3   r2   r   c                 S   s   g | ]}|d  qS )simpler_statementsr"   )r<   itemr"   r"   r#   r=      s    zARagasFaithfulnessEvaluator._create_statements.<locals>.<listcomp>c                 S   s   g | ]	}|D ]}|qqS r"   r"   )r<   sublistrU   r"   r"   r#   r=      s    )r0   )r   rA   rG   _create_statements_promptr   r&   generate_textr   parsegenerationstextrH   
isinstancelist)r   r2   r3   statements_promptr-   r"   r"   r#   rJ      s*   
$z-RagasFaithfulnessEvaluator._create_statementsrP   r-   c           	         sp  j d}j j||d 	 jj|| t jdkr+	 W d   dS t	jdd} fddt
|D }d	d fd
d|D D }t|dkrZ	 W d   dS jj|d}zBzjj|W W j j||d W  d   S  ty } ztjd|d W Y d}~W j j||d W d   dS d}~ww j j||d w 1 sw   Y  dS )zv
        Returns: `StatementFaithfulnessAnswers` model detailing which statements are faithful to the context
        zdd-ragas.create_verdicts)r+   r/   r   N_reproducibility   c                    s   g | ]
} j d  | jqS )r   )rZ   r[   )r<   i)raw_nli_resultsr"   r#   r=      s    z?RagasFaithfulnessEvaluator._create_verdicts.<locals>.<listcomp>c                 S   s   g | ]
}|d ur|  qS )N)rH   )r<   faithr"   r"   r#   r=      s
    c                    s   g | ]} j |qS r"   )r   rY   )r<   r[   )r   r"   r#   r=      s    r9   )r+   r0   z!Failed to parse faithfulness_list)exc_info)r   rA   rG   r   r&   rX   )_create_natural_language_inference_promptlenrZ   getattrranger   	ensemblerfrom_discreter   	parse_obj	ExceptionrE   rF   )	r   rP   r-   create_verdicts_workflowreproducibilityraw_nli_results_textsraw_faithfulness_listr.   er"   )rb   r   r#   rK      sZ   

z+RagasFaithfulnessEvaluator._create_verdictsc                 C   sv   | j d+ | j|}dd |D }ddd t|D }| jjj|||dW  d    S 1 s4w   Y  d S )Nz!dd-ragas.create_statements_promptc                 S   s   g | ]}|  d r|qS ).)stripendswith)r<   sentencer"   r"   r#   r=          zHRagasFaithfulnessEvaluator._create_statements_prompt.<locals>.<listcomp>
c                 S   s   g | ]\}}| d | qS ):r"   )r<   ra   xr"   r"   r#   r=      rv   )r2   r3   	sentences)	r   taskr   segmentrI   	enumerater   statement_promptformat)r   r3   r2   rz   r"   r"   r#   rW      s   $z4RagasFaithfulnessEvaluator._create_statements_promptcontext_strc                 C   sL   | j d | jjj|t|d}|W  d    S 1 sw   Y  d S )Nz1dd-ragas.create_natural_language_inference_prompt)rP   r-   )r   r{   r   r   r   jsondumps)r   r   r-   prompt_valuer"   r"   r#   re      s   
$zDRagasFaithfulnessEvaluator._create_natural_language_inference_promptc                 C   s|   | j d. tdd |jD }t|j}|r|| }ntj}| j j||d|d |W  d   S 1 s7w   Y  dS )z
        Args:
            faithfulness_list (StatementFaithfulnessAnswers): a list of statements and their faithfulness verdicts
        zdd-ragas.compute_scorec                 s   s    | ]
}|j r	d ndV  qdS )r`   r   N)r9   r;   r"   r"   r#   	<genexpr>   s    z<RagasFaithfulnessEvaluator._compute_score.<locals>.<genexpr>)faithful_statementsnum_statements)r1   r0   N)r   r{   sumrL   rf   r?   r@   rG   )r   r.   r   r   r   r"   r"   r#   rM      s   

$z)RagasFaithfulnessEvaluator._compute_score)__name__
__module____qualname____doc__LABELMETRIC_TYPEr   r   objectr   dicttupler   floatstrrS   r]   rJ   rK   rW   re   rM   __classcell__r"   r"   r    r#   r      s    &K,
r   )r   r?   typingr   r   ddtrace.internal.loggerr   ddtrace.llmobs._constantsr   r   r   r   %ddtrace.llmobs._evaluators.ragas.baser	   r
   r   rE   r   r"   r"   r"   r#   <module>   s    