o
    i{0                 
   @   sX  U d dl mZ d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	Z	d dl
Z
d dlZd dlZd dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ zd dlmZ W n ey   d dlmZ Y nw d dlZzd dlmZ d dlmZ W n ey   dZdZY nw d dl m!Z! d dl"m#Z# d dl"m$Z$ d dl"m%Z% d dl&m'Z' d dl(m)Z) d dl(m*Z* d dl(m+Z+ d dl(m,Z, d dl-m.Z. d dl-m/Z/ d d l0m1Z1 erd d!l2m3Z3 d d"l4m5Z5 d d#l4m6Z6 d d$l7m8Z8 e'e9Z:ee;e<e=e>de?d% e@e;d%f f ZAee;e<e=e>e?eA e@e;eAf f ZBe@e;eAf ZCe@e;eBf ZDeeDeeC geAf ZEeeDeeC geeA f ZFG d&d' d'ZGd(e;d)dfd*d+ZHed,d-G d.d/ d/ZIed,d-G d0d1 d1ZJG d2d3 d3eZKG d4d5 d5eZLG d6d7 d7eZMG d8d9 d9eZNedureduree?e e?e f ZOeePd:< eeeDeAeAgeeAd'f f eKeOf ZQeePd;< eeeDeAeAgeeeAd'f  f eMeOf ZReePd<< n,eeeDeAeAgeeAd'f f eKf ZQeePd;< eeeDeAeAgeeeAd'f  f eMf ZReePd<< eeeeD eeA eeA e@e;eeA f geAf eLf ZSeeeeD eeA eeA e@e;eeA f geeA f eNf ZTd=ed)e>fd>d?ZUd=ed)e>fd@dAZVd=ed)e>fdBdCZWd=ed)e>fdDdEZXeduredurdqd=edGe>d)efdHdIZYd=ed)efdJdKZZndqd=edGe>d)efdLdIZYd=ed)efdMdKZZG dNdO dOeZ[G dPdQ dQedFdRZ\G dSdT dTe\Z]G dUdV dVedFdRZ^G dWdX dXe^Z_G dYdZ dZedFdRZ`G d[d\ d\e]e`ZaG d]d^ d^eZbG d_d` d`eZcG dadb dbZdG dcdd ddeZeG dedf dfZfG dgdh dheZgG didj djZhG dkdl dlZiG dmdn dnZjd)e;fdodpZkdS )r    )ABC)abstractmethodNdeepcopy)	dataclass)field)TYPE_CHECKING)Any)	Awaitable)Callable)Iterator)Optional)Sequence)	TypedDict)Union)cast)overload)	TypeAlias)BaseConversationalMetric)
BaseMetric)config)	ERROR_MSG)ERROR_STACK)
ERROR_TYPE)
get_logger)DD_SITES_NEEDING_APP_SUBDOMAIN)EXPERIMENT_CONFIG)EXPERIMENT_EXPECTED_OUTPUT)EXPERIMENT_RECORD_METADATA)convert_tags_dict_to_list)	safe_json)__version__)LLMObs)LLMObsExperimentEvalMetricEvent)LLMObsExperimentsClient)ExportedLLMObsSpanJSONTypec                   @   s\   e Zd ZdZ				ddedee dee deeeef  deeeef  ddfd	d
ZdS )EvaluatorResulta  Container for evaluator results with additional metadata.

    This class allows evaluators to return not just a value, but also
    reasoning, assessment, metadata, and tags alongside the evaluation result.

    Example::

        def my_evaluator(input_data, output_data, expected_output):
            score = calculate_score(output_data, expected_output)
            return EvaluatorResult(
                value=score,
                reasoning="The output matches the expected format",
                assessment="pass" if score > 0.8 else "fail",
                metadata={"confidence": 0.95},
                tags={"category": "accuracy"}
            )
    Nvalue	reasoning
assessmentmetadatatagsreturnc                 C   s"   || _ || _|| _|| _|| _dS )a  Initialize an EvaluatorResult.

        :param value: The primary evaluation result (numeric, boolean, string, etc.)
        :param reasoning: Optional explanation of why this evaluation result was produced
        :param assessment: Optional categorical assessment (e.g., "pass", "fail", "good", "bad")
        :param metadata: Optional dictionary of additional metadata about the evaluation
        :param tags: Optional dictionary of tags to categorize or label the evaluation
        N)r(   r)   r*   r+   r,   )selfr(   r)   r*   r+   r,    r/   N/home/ubuntu/.local/lib/python3.10/site-packages/ddtrace/llmobs/_experiment.py__init__Z   s
   
zEvaluatorResult.__init__)NNNN)	__name__
__module____qualname____doc__r&   r   strdictr1   r/   r/   r/   r0   r'   G   s&    r'   namer-   c                 C   s>   t | ts	td| stdtd| std|  ddS )zValidate that evaluator name is valid.

    :param name: The evaluator name to validate
    :raises TypeError: If the name is not a string
    :raises ValueError: If the name is empty or contains invalid characters
    zEvaluator name must be a stringzEvaluator name cannot be emptyz^[a-zA-Z0-9_-]+$zEvaluator name 'zW' is invalid. Name must contain only alphanumeric characters, underscores, and hyphens.N)
isinstancer6   	TypeError
ValueErrorrematch)r8   r/   r/   r0   _validate_evaluator_nameq   s   

r>   T)frozenc                   @   st   e Zd ZU dZeeef ed< eed< dZe	e
 ed< eedZeeef ed< dZe	e ed< dZe	e ed	< dS )
EvaluatorContextaw  Context object containing all data needed for evaluation.

    This frozen dataclass wraps all metadata needed to run an evaluation,
    providing better state management and extensibility compared to individual parameters.

    :param input_data: The input data that was provided to the task (read-only).
                       Dictionary with string keys mapping to JSON-serializable values.
    :param output_data: The output data produced by the task (read-only).
                        Any JSON-serializable type.
    :param expected_output: The expected output for comparison, if available (read-only).
                            Optional JSON-serializable type.
    :param metadata: Additional metadata including dataset record metadata and experiment configuration (read-only).
                     Dictionary with string keys mapping to JSON-serializable values.
    :param span_id: The span ID associated with the task execution, if available (read-only).
                    Optional string.
    :param trace_id: The trace ID associated with the task execution, if available (read-only).
                     Optional string.
    
input_dataoutput_dataNexpected_outputdefault_factoryr+   span_idtrace_id)r2   r3   r4   r5   r7   r6   r	   __annotations__rC   r   r&   r   r+   rF   rG   r/   r/   r/   r0   r@      s   
 r@   c                   @   sh   e Zd ZU dZee ed< ee ed< ee ed< ee	ee f ed< e
edZeee	ef  ed< dS )	SummaryEvaluatorContextai  Context object containing all data needed for summary evaluation.

    :param inputs: list of all input data from the dataset records (read-only).
    :param outputs: list of all outputs produced by the task (read-only).
    :param expected_outputs: list of all expected outputs (read-only).
    :param evaluation_results: Dictionary mapping evaluator names to their results (read-only).
    :param metadata: list of metadata for each dataset record, each combined with experiment configuration (read-only).
                     Each element contains the record's metadata merged with {"experiment_config": ...}.
    inputsoutputsexpected_outputsevaluation_resultsrD   r+   N)r2   r3   r4   r5   listDatasetRecordInputTyperH   r&   r7   r6   r   r+   r	   r/   r/   r/   r0   rI      s   
 
"rI   c                   @   sz   e Zd ZdZddee fddZedede	e
ef fdd	Z		dd
edee deeeef  deeef fddZdS )BaseEvaluatora  This class provides a unified interface for evaluators.

    Subclasses must implement the `evaluate` method.

    **Evaluator Return Values**
    LLM Observability supports storing and representing the following evaluator return value types:
    - **Numeric**: int/float values
    - **Boolean**: pass/fail boolean values
    - **Null**: None values
    - **JSON serializable**: string/dict/list values, which will be serialized into strings
    - **EvaluatorResult**: Any of the above values plus optional associated reasoning, assessment, metadata, and tags

    Example (simple return)::

        class SemanticSimilarityEvaluator(BaseEvaluator):
            def __init__(self, threshold=0.8):
                super().__init__(name="semantic_similarity")
                self.threshold = threshold
                self.model = load_embedding_model()

            def evaluate(self, context: EvaluatorContext):
                score = self.model.compare(context.output_data, context.expected_output)
                return score

    Example (with EvaluatorResult)::

        class SemanticSimilarityEvaluator(BaseEvaluator):
            def __init__(self, threshold=0.8):
                super().__init__(name="semantic_similarity")
                self.threshold = threshold
                self.model = load_embedding_model()

            def evaluate(self, context: EvaluatorContext):
                score = self.model.compare(context.output_data, context.expected_output)
                return EvaluatorResult(
                    value=score,
                    reasoning=f"Similarity score: {score:.2f}",
                    assessment="pass" if score >= self.threshold else "fail",
                    metadata={"threshold": self.threshold},
                    tags={"type": "semantic"}
                )

    Note: The ``evaluate`` method may be called concurrently from multiple threads.
    Avoid modifying instance attributes inside ``evaluate()``; use local variables instead.
    Nr8   c                 C   ,   |dur	|  }n| jj}t| || _dS )zInitialize the evaluator.

        :param name: Optional custom name for the evaluator. If not provided,
                     the class name will be used.
                     Name must contain only alphanumeric characters and underscores.
        Nstrip	__class__r2   r>   r8   r.   r8   r/   r/   r0   r1      
   

zBaseEvaluator.__init__contextr-   c                 C      t d)ag  Perform evaluation.

        This method must be implemented by all subclasses.

        :param context: The evaluation context containing input, output, and metadata
        :return: Evaluation results - can be a JSONType value (dict, primitive, list, None)
                 or an EvaluatorResult object containing the value plus additional metadata
        -Subclasses must implement the evaluate methodNotImplementedErrorr.   rW   r/   r/   r0   evaluate      
zBaseEvaluator.evaluateml_app	eval_namevariable_mappingc                 C   rX   )Nz+This evaluator does not support publishing.rZ   )r.   r_   r`   ra   r/   r/   r0   _build_publish_payload   s   z$BaseEvaluator._build_publish_payloadN)NN)r2   r3   r4   r5   r   r6   r1   r   r@   r   r&   r'   r]   r7   r	   rb   r/   r/   r/   r0   rP      s     .
rP   c                   @   :   e Zd ZdZd
dee fddZedede	fdd	Z
dS )BaseSummaryEvaluatora"  Base class for summary evaluators that operate on aggregated experiment results.

    Summary evaluators receive all inputs, outputs, expected outputs, and per-row
    evaluation results at once, allowing them to compute aggregate metrics.

    Subclasses must implement the `evaluate` method.

    Example::

        class AverageScoreEvaluator(BaseSummaryEvaluator):
            def __init__(self, target_evaluator: str):
                super().__init__(name="average_score")
                self.target_evaluator = target_evaluator

            def evaluate(self, context: SummaryEvaluatorContext):
                scores = context.evaluation_results.get(self.target_evaluator, [])
                if not scores:
                    return None
                return sum(scores) / len(scores)
    Nr8   c                 C   rQ   )a   Initialize the summary evaluator.

        :param name: Optional custom name for the evaluator. If not provided,
                     the class name will be used.
                     Name must contain only alphanumeric characters and underscores.
        NrR   rU   r/   r/   r0   r1     rV   zBaseSummaryEvaluator.__init__rW   r-   c                 C   rX   )a  Perform summary evaluation on aggregated experiment results.

        This method must be implemented by all subclasses.

        :param context: The summary evaluation context containing all inputs, outputs,
                        expected outputs, and per-row evaluation results
        :return: Evaluation result as a JSON-serializable value (dict, primitive, list, None)
        rY   rZ   r\   r/   r/   r0   r]   +  r^   zBaseSummaryEvaluator.evaluaterc   r2   r3   r4   r5   r   r6   r1   r   rI   r&   r]   r/   r/   r/   r0   re     s
    re   c                   @   sB   e Zd ZdZd
dee fddZedede	e
ef fdd	ZdS )BaseAsyncEvaluatorz*Base class for async row-level evaluators.Nr8   c                 C   rQ   )zInitialize the async evaluator.

        :param name: Optional custom name for the evaluator. If not provided,
                     the class name will be used.
                     Name must contain only alphanumeric characters and underscores.
        NrR   rU   r/   r/   r0   r1   ;  rV   zBaseAsyncEvaluator.__init__rW   r-   c                    
   t d)zPerform async evaluation.rY   rZ   r\   r/   r/   r0   r]   J     zBaseAsyncEvaluator.evaluaterc   )r2   r3   r4   r5   r   r6   r1   r   r@   r   r&   r'   r]   r/   r/   r/   r0   rg   8  s
     rg   c                   @   rd   )BaseAsyncSummaryEvaluatorzVBase class for async summary evaluators that operate on aggregated experiment results.Nr8   c                 C   rQ   )a  Initialize the async summary evaluator.

        :param name: Optional custom name for the evaluator. If not provided,
                     the class name will be used.
                     Name must contain only alphanumeric characters and underscores.
        NrR   rU   r/   r/   r0   r1   S  rV   z"BaseAsyncSummaryEvaluator.__init__rW   r-   c                    rh   )zBPerform async summary evaluation on aggregated experiment results.rY   rZ   r\   r/   r/   r0   r]   b  ri   z"BaseAsyncSummaryEvaluator.evaluaterc   rf   r/   r/   r/   r0   rj   P  s
    rj   _DeepEvalListTypeEvaluatorTypeAsyncEvaluatorType	evaluatorc                 C   
   t | tS )zCheck if an evaluator is a class-based evaluator (inherits from BaseEvaluator).

    :param evaluator: The evaluator to check
    :return: True if it's a class-based evaluator, False otherwise
    )r9   rP   rn   r/   r/   r0   _is_class_evaluator     
rq   c                 C   s(   t du stdu r
dS t| t pt| tS )zCheck if an evaluator is a deep eval evaluator (inherits from BaseMetric or BaseConversationalMetric).

    :param evaluator: The evaluator to check
    :return: True if it's a class-based deepeval evaluator, False otherwise
    NF)r   r   r9   rp   r/   r/   r0   _is_deep_eval_evaluator  s   rs   c                 C   ro   )zCheck if an evaluator is a class-based summary evaluator (inherits from BaseSummaryEvaluator).

    :param evaluator: The evaluator to check
    :return: True if it's a class-based summary evaluator, False otherwise
    )r9   re   rp   r/   r/   r0   _is_class_summary_evaluator  rr   rt   c                 C   s"   t | t ot | t ot|  S )zCheck if an evaluator is a function-based evaluator.

    :param evaluator: The evaluator to check
    :return: True if it's a function evaluator, False otherwise
    )r9   rP   re   rs   rp   r/   r/   r0   _is_function_evaluator  s
   
ru   Fis_asyncc              	      sN   ddl m  	ddtttf dtdtt dtf fdd	}td
d|_	|S )zWrapper to run deep eval evaluators and convert their result to an EvaluatorResult.

        :param evaluator: The deep eval evaluator to run
        :return: A callable function that can be used as an evaluator
        r   LLMTestCaseNrA   rB   rC   r-   c           	         sX    t | t |t |d}| j}j}jrdnd}j}t||||d}|S )aO  Wrapper to run deep eval evaluators and convert their result to an EvaluatorResult.

            :param input_data: The input data
            :param output_data: The output data
            :param expected_output: The expected output
            :return: An EvaluatorResult containing the score, reasoning, and assessment
            inputactual_outputrC   passfailr(   r)   r*   r+   )r6   measurescorereasonsuccessscore_breakdownr'   	rA   rB   rC   deepEvalTestCaser   r)   r*   r+   eval_resultrx   rn   r/   r0   wrapped_evaluator  s"   
z7_deep_eval_evaluator_wrapper.<locals>.wrapped_evaluatorr8   deep_eval_evaluatorrc   
deepeval.test_caserx   r7   r6   r	   r   r&   r'   getattrr2   )rn   rv   r   r/   r   r0   _deep_eval_evaluator_wrapper  s   
r   c              	      sN   ddl m  	ddtttf dtdtt dtf fdd	}td
d|_	|S )zTSync factory that returns an async callable for use with await in async experiments.r   rw   NrA   rB   rC   r-   c           	         s`    t | t |t |d}|I d H  j}j}jr!dnd}j}t||||d}|S )Nry   r|   r}   r~   )r6   	a_measurer   r   r   r   r'   r   r   r/   r0   r     s$   z=_deep_eval_async_evaluator_wrapper.<locals>.wrapped_evaluatorr8   r   rc   r   )rn   r   r/   r   r0   "_deep_eval_async_evaluator_wrapper  s   
r   c                 C      | S )zDummy wrapper; should never be called but used to satisfy type checking.

        :param evaluator: The deep eval evaluator to run
        :return: A callable function that can be used as an evaluator
        r/   )rn   rv   r/   r/   r0   r     s   c                 C   r   )zHDummy wrapper; should never be called but used to satisfy type checking.r/   rp   r/   r/   r0   r   #  s   c                   @   s   e Zd ZU eed< eed< dS )Projectr8   _idNr2   r3   r4   r6   rH   r/   r/   r/   r0   r   (  s   
 r   c                   @      e Zd ZU ee ed< dS )_DatasetRecordRawOptionalr,   N)r2   r3   r4   rN   r6   rH   r/   r/   r/   r0   r   -     
 r   )totalc                   @   s.   e Zd ZU eed< eed< eeef ed< dS )DatasetRecordRawrA   rC   r+   N)	r2   r3   r4   rO   rH   r&   r7   r6   r	   r/   r/   r/   r0   r   1  s   
 r   c                   @   s:   e Zd ZU eed< eed< eeef ed< e	e ed< dS )_UpdatableDatasetRecordOptionalrA   rC   r+   r,   N)
r2   r3   r4   rO   rH   r&   r7   r6   r	   rN   r/   r/   r/   r0   r   7  s
   
 r   c                   @      e Zd ZU eed< dS )UpdatableDatasetRecord	record_idNr   r/   r/   r/   r0   r   >     
 r   c                   @   r   )_DatasetRecordOptionalcanonical_idN)r2   r3   r4   r   r6   rH   r/   r/   r/   r0   r   B  r   r   c                   @   r   )DatasetRecordr   Nr   r/   r/   r/   r0   r   F  r   r   c                   @   sZ   e Zd ZU eed< eed< eed< eed< eed< eeef ed< eeee f ed< dS )	
TaskResultidxrF   rG   	timestampoutputr+   errorN)	r2   r3   r4   intrH   r6   r&   r7   r   r/   r/   r/   r0   r   J  s   
 r   c                   @   s.   e Zd ZU eed< eeeeef f ed< dS )EvaluationResultr   evaluationsN)r2   r3   r4   r   rH   r7   r6   r&   r/   r/   r/   r0   r   T  s   
 r   c                   @   s   e Zd ZdefddZdS )_ExperimentRunInforun_interationc                 C   s   t  | _|d | _d S )N   )uuiduuid4r   _run_iteration)r.   r   r/   r/   r0   r1   Z  s   
z_ExperimentRunInfo.__init__N)r2   r3   r4   r   r1   r/   r/   r/   r0   r   Y  s    r   c                   @   s   e Zd ZU eed< ee ed< eed< eed< eed< eeef ed< e	ed< e	ed< eeeee	f f ed	< eee	f ed
< eeee f ed< dS )ExperimentRowResultr   r   rF   rG   r   rz   r   rC   r   r+   r   N)
r2   r3   r4   r   rH   r   r6   r7   NonNoneJSONTyper&   r/   r/   r/   r0   r   `  s   
 r   c                   @   s6   e Zd Zdedeeeeef f dee fddZ	dS )ExperimentRunrunsummary_evaluationsrowsc                 C   s(   |j | _|j| _|pi | _|pg | _d S rc   )r   run_idr   run_iterationr   r   )r.   r   r   r   r/   r/   r0   r1   o  s   
zExperimentRun.__init__N)
r2   r3   r4   r   r7   r6   r&   rN   r   r1   r/   r/   r/   r0   r   n  s    r   c                   @   s>   e Zd ZU eeeeef f ed< ee ed< ee	 ed< dS )ExperimentResultr   r   runsN)
r2   r3   r4   r7   r6   r&   rH   rN   r   r   r/   r/   r/   r0   r   {  s   
 r   c                   @   s(  e Zd ZU eed< eed< eee  ed< eed< ee ed< eed< eed< ded	< e	ee
f ed
< e	eef ed< ee ed< dZ	d;dedededee dededed	ddeee  ddfddZ			d<dededee fddZ			d<dededee defddZded e
ddfd!d"Zd e
ddfd#d$Zdee
 ddfd%d&Zdeddfd'd(Zedefd)d*Zedefd+d,Zedefd-d.Zdefd/d0Zededefd1d2Zededee fd3d2Zdeeef deeee f fd4d2Zdefd5d6Zde e fd7d8Z!d=d9d:Z"dS )>Datasetr8   descriptionfilter_tagsr   _records_version_latest_versionr$   _dne_client_new_records_by_record_id!_updated_record_ids_to_new_fields_deleted_record_idsi  P Nproject
dataset_idrecordslatest_versionversionr-   c
           
      C   sP   || _ || _|| _|	pg | _|| _|| _|| _|| _|| _i | _	i | _
g | _d S rc   )r8   r   r   r   r   r   r   r   r   r   r   r   )
r.   r8   r   r   r   r   r   r   r   r   r/   r/   r0   r1     s   

zDataset.__init__Tdeduplicatecreate_new_versionbulk_uploadc                 C   s   |  ||| dS )a  Pushes any local changes in this dataset since the last push.

        :param deduplicate:
            Wether to deduplicate the records or not. Does not deduplicate against existing
            data if bulk_upload is False.
        :param create_new_version:
            Whether to create a new version of the dataset when changes are detected, or update the
            existing version.
        :param bulk_upload:
            - True:
                Uploads all records in a single request. This method does not support deduplication
                against existing data and is best suited for initial uploads.
            - False:
                Splits the data into batches and uploads them individually. This method supports
                deduplication against existing records but does not provide transactional guarantees
                when the same dataset is modified concurrently by multiple clients.
            - None:
                The SDK chooses between the above two approaches using data size.
        N)_push)r.   r   r   r   r/   r/   r0   push  s   zDataset.pushc              	   C   sN  | j std| jstdd}|  }|s|d u r1|| jkr1td| | jj| j | j|d nntd| t	| j
 }| jj| j | jd t	| j || j||d\}}}	t	| j }
t|
||	D ]\}}}|| j| d	< |rz|| j| d
< | j|= qet|dkpt| jdk}|dkr|| _td|| j | j| _g | _i | _
|S )Nz}Dataset ID is required to push data to Experiments. Use LLMObs.create_dataset() or LLMObs.pull_dataset() to create a dataset.zLLMObs client is required to push data to Experiments. Use LLMObs.create_dataset() or LLMObs.pull_dataset() to create a dataset.Fz&dataset delta is %d, using bulk upload)r   z'dataset delta is %d, using batch updater   )r   
project_idinsert_recordsupdate_recordsdelete_record_idsr   r   r   r   r   z new_version %d latest_version %d)r   r;   r   _estimate_delta_sizeBATCH_UPDATE_THRESHOLDloggerdebugdataset_bulk_uploadr   rN   r   valuesdataset_batch_updater   r   r   keysziplenr   r   )r.   r   r   r   data_changed
delta_sizeupdated_recordsnew_versionnew_record_idsnew_canonical_idspending_keyskeyr   r   r/   r/   r0   r     sT   
zDataset._pushindexrecordc                    sx   t  fdddD rtd| j| d }i | j|d|i d|i| j|< i | j|  d|i| j|< d S )Nc                 3   s    | ]}| vV  qd S rc   r/   ).0kr   r/   r0   	<genexpr>  s    z!Dataset.update.<locals>.<genexpr>)rA   rC   r+   zhinvalid update, record should contain at least one of input_data, expected_output, or metadata to updater   )allr;   r   r   get)r.   r   r   r   r/   r   r0   update
  s&   zDataset.updatec                 C   s6   t  j}i ||d d}|| j|< | j| d S )N)r   r   )r   r   hexr   r   append)r.   r   r   rr/   r/   r0   r     s   

zDataset.appendc                 C   s   |D ]}|  | qd S rc   )r   )r.   r   r   r/   r/   r0   extend%  s   zDataset.extendc                 C   sz   | j | d }d}| j |= |d u s|dkrtd| d S || jv r&| j|= || jv r1| j|= d}|r;| j| d S d S )Nr   T z/encountered unexpected record_id on deletion %sF)r   r   warningr   r   r   r   )r.   r   r   should_append_to_be_deletedr/   r/   r0   delete)  s   

zDataset.deletec                 C      t   d| j S )Nz/llm/datasets/_get_base_urlr   r.   r/   r/   r0   url=     zDataset.urlc                 C      | j S rc   )r   r   r/   r/   r0   r   B     zDataset.latest_versionc                 C   r   rc   )r   r   r/   r/   r0   r   F  r   zDataset.versionc                 C   s,   t t| jt t| j }td| |S )zQrough estimate (in bytes) of the size of the next batch update call if it happenszestimated delta size %d)r   r    r   r   r   r   )r.   sizer/   r/   r0   r   J  s   zDataset._estimate_delta_sizec                 C      d S rc   r/   r.   r   r/   r/   r0   __getitem__P     zDataset.__getitem__c                 C   r   rc   r/   r   r/   r/   r0   r   S  r  c                 C   s   | j |S rc   )r   r   r   r/   r/   r0   r   V     c                 C   
   t | jS rc   )r   r   r   r/   r/   r0   __len__Y     
zDataset.__len__c                 C   r  rc   )iterr   r   r/   r/   r0   __iter__\  r  zDataset.__iter__c              
      s  zdd l }W n ty } ztd|d }~ww t }g }| jD ]}i  |di }t|trF| D ]\}}| d|f< |d|f q3n	| d< |d |di }	t|	trq|	 D ]\}
}| d|
f< |d|
f q^n	|	 d< |d |di }t|tr| D ]\}}| d|f< |d|f qnt	
dt| |  q g }|D ]  fd	d
|D }|| q|j||j|dS )Nr   z[pandas is required to convert dataset to DataFrame. Please install via `pip install pandas`rA   )rA   r   rC   )rC   r   r+   zunexpected metadata format %sc                    s   g | ]}  |d qS rc   )r   )r   colflat_recordr/   r0   
<listcomp>  s    z(Dataset.as_dataframe.<locals>.<listcomp>)datacolumns)pandasImportErrorsetr   r   r9   r7   itemsaddr   r   typer   	DataFrame
MultiIndexfrom_tuples)r.   pdecolumn_tuples	data_rowsr   rA   input_data_colinput_data_valrC   expected_output_colexpected_output_valr+   metadata_colmetadata_valrecords_listrowr/   r	  r0   as_dataframe_  sV   





zDataset.as_dataframerc   )TTN)r-   N)#r2   r3   r4   r6   rH   r   rN   r   r   r7   r   r   r   r   r1   boolr   r   r   r   r   r   propertyr   r   r   r   r   r   slicer   r  r   r  r#  r/   r/   r/   r0   r     s   
 	




E	&r   c                   @   st  e Zd ZU dZeeef ed< eee	e
f  ed< eeeef  ed< edd Z								dod
edeeef dedeee	e
f  dededeeeef  dee ded deeeeef   dee dee ddfddZedefddZdedee dee deee  def
d d!Z	"				dpd#ed$ed%ed&ed'ed(ed)ed*ee d+ee d,eeeef  deeeef  dd-fd.d/Z d0eded- fd1d2Z!d3ee defd4d5Z"d6e#deee$f fd7d8Z%dee d9ee de&ee' ee ee eeee$f  eeee f f fd:d;Z(dqd<ed=ee ddfd>d?Z)drdAedBeddfdCdDZ*	E				F	dsdGedHed3ee dIedJee+ege,f  de-fdKdLZ.e/dMede-fdNdOZ0dPe-defdQdRZ1e/dPe-defdSdTZ2dPe-ddfdUdVZ3dFdWdX fdYe&ee4f dedZe5j6dIedJe+ege,f dee fd[d\Z7d	ddFd]dX fdGededHed3ee dIedJe+ege,f dee fd^d_Z8d	dEdFd`dX fdee dHedGedIedJe+ege,f dee fdadbZ9			Edtdee d9ee dHedGedee f
dcddZ:	e			FdudGedHedfee de-fdgdhZ;							dvd#ed$ediedj dkee dlee d*ee d+ee d,eeeef  deeeef  ddfdmdnZ<dS )w
Experimenta  Async-native experiment supporting both sync and async tasks, evaluators, and summary evaluators.

    This is the core experiment class. Sync evaluators are run via asyncio.to_thread().
    Sync tasks are also supported and will be run via asyncio.to_thread().

    Use ``LLMObs.async_experiment()`` to create an instance directly (for async callers),
    or ``LLMObs.experiment()`` to get a ``SyncExperiment`` wrapper (for sync callers).
    _task_evaluators_summary_evaluatorsc                 C   s   dS )zQNo-op task used when initializing distributed experiment objects on remote hosts.Nr/   )clsrA   r   r/   r/   r0   _NO_OP_TASK  s   zExperiment._NO_OP_TASKr   NFr8   taskdataset
evaluatorsproject_namer   r,   r   _llmobs_instancer"   summary_evaluatorsr   is_distributedr-   c                 C   s   || _ || _|| _t|| _|
rt|
ng | _|| _|pi | _tt	| jd< || jd< |j | jd< || jd< |p9i | _
|jrGtt|j| j
d< |pJd| _|	| _|| _g | _|s[td|| _d | _d | _d | _d | _d S )Nzddtrace.versionr0  dataset_nameexperiment_namefiltered_record_tagsr   zproject_name must be provided for the experiment, either configured via the `DD_LLMOBS_PROJECT_NAME` environment variable, or an argument to `LLMObs.enable(project_name=...)`, or as an argument to `LLMObs.experiment(project_name=...)`.)r8   r(  _datasetrN   r)  r*  _description_tagsr6   r!   _configr   r   r&   _runsr1  _is_distributed_retriesr;   _project_name_project_idr   	_run_nameexperiment_span)r.   r8   r-  r.  r/  r0  r   r,   r   r1  r2  r   r3  r/   r/   r0   r1     s6   






zExperiment.__init__c                 C   r   )Nz/llm/experiments/r   r   r/   r/   r0   r     r   zExperiment.urlr   task_resultsr   r   c                 C   s   g }t |D ]R\}}|d }dttt t| ji}	|	|dp"i  | j| }
|| d }||dd|dd|dd	|
d
d|
d |
d |||	|d d}|	| qi }|rq|D ]}|d 
 D ]\}}|||< qgq_t|||S )Nr   r,   r+   r   rF   r   rG   r   r   r   rA   rC   r   )r   rF   rG   r   r   rz   rC   r   r   r+   r   )	enumerater   rN   r&   r   r9  r   r   r7  r   r  r   )r.   r   rB  r   r   experiment_resultsr   task_resultrB   r+   r   evals
exp_resultsummary_evalssummary_evaluationr8   	eval_datar/   r/   r0   _merge_results  s6   





zExperiment._merge_resultscustomr`   
eval_valueerrrF   rG   timestamp_nssourcer)   r*   r+   r#   c                 C   s   |d u rd}n"t |trd}nt |ttfrd}nt |tr!d}nd}t| }d|d|d|dt|d	 d
|d|| d|d|dt|d| ji
}|rO||d< |	rU|	|d< |
r[|
|d< |S )Ncategoricalbooleanr   jsonmetric_sourcerF   rG   timestamp_ms    .Ametric_typelabel_valuer   r,   experiment_idr)   r*   r+   )	r9   r$  r   floatr7   r6   lowerr   r   )r.   r`   rM  rN  rF   rG   rO  rP  r)   r*   r+   r,   rW  eval_metricr/   r/   r0    _generate_metric_from_evaluation  s8   


z+Experiment._generate_metric_from_evaluationexperiment_resultc                 C   s  g }d}|j D ]}|dpi }|dd}|dd}tt|dd}||kr+|}| D ]i\}	}
|
s6q/|
d}| j|	||
d|||t|
d	trUt|
d	nd t|
d
tret|
d
nd t|
dtrztttt	f |
dnd t|
dtrttttf |
dnd d
}|
| q/q|j D ]\}}|sq| j||d|ddd|dd}|
| q|S )Nr   r   rF   r   rG   r   r(   r   r)   r*   r+   r,   )r)   r*   r+   r,   summary)rP  )r   r   r   r   r  r^  r9   r6   r7   r&   r   r   )r.   r_  eval_metricslatest_timestamprG  r   rF   rG   rO  r`   rJ  rM  r]  r8   summary_eval_datar/   r/   r0   "_generate_metrics_from_exp_results)  s^   

 	z-Experiment._generate_metrics_from_exp_resultssample_sizec              
   C   sv   |dur8|t | jk r8dd | jjd| D }d|| jj}t|| jj| jj|| jj| jj	| jj
| jjdS | jS )zMGet dataset containing the first sample_size records of the original dataset.Nc                 S   s   g | ]}t |qS r/   r   )r   r   r/   r/   r0   r  `      z2Experiment._get_subset_dataset.<locals>.<listcomp>z[Test subset of {} records] {})r8   r   r   r   r   r   r   r   )r   r7  r   formatr8   r   r   r   r   r   r   r   )r.   re  subset_recordssubset_namer/   r/   r0   _get_subset_dataset]  s   
zExperiment._get_subset_datasetexcc                 C   sH   t  \}}}|d urt|jnd}dt|||}t|||dS )NzUnknown Exceptionr   )messager  stack)sysexc_infor  r2   join	tracebackformat_exceptionr6   )r.   rk  exc_type	exc_valueexc_tbexc_type_name	exc_stackr/   r/   r0   _build_evaluator_errorn  s   z!Experiment._build_evaluator_erroreval_resultsc                 C   s   g }g }g }g }i }t |D ]Q\}}	||	d  | j| }
||
d  ||
d  |
di }|i |d| ji || d }| D ]\}}||vrTg ||< || |d qHq|||||fS )Nr   rA   rC   r+   experiment_configr   r(   )rC  r   r7  r   r:  r  )r.   rB  ry  rJ   rK   rL   metadata_listeval_results_by_namer   rE  r   record_metadataeval_result_at_idx_by_namer8   rM  r/   r/   r0   _prepare_summary_evaluator_datax  s&   	
z*Experiment._prepare_summary_evaluator_datastatusr   c                 C   sX   | j r| jsd S z| j jjtt| j||d W d S  ty+   tjd|dd Y d S w )N)r  r   z(Failed to update experiment status to %sT)ro  )	r1  r   r   experiment_updater   r6   	Exceptionr   r   )r.   r  r   r/   r/   r0   _update_status  s   "zExperiment._update_statusTllmobs_not_enabled_errorensure_uniquec                 C   s   | j r| j jst|| j j| j}|dd| _| j| jd< | j j	| j
| jj| j| jj| jt| j| j| j|	\}}|| _t|| jd< || _d S )Nr   r   r   rZ  )r1  enabledr;   r   project_create_or_getr>  r   r?  r9  experiment_creater8   r7  r   r   r:  r   r8  r;  r6   r@  )r.   r  r  r   rZ  experiment_run_namer/   r/   r0   _setup_experiment  s,   
zExperiment._setup_experiment
   r   jobsraise_errorsmax_retriesretry_delayc              	      s  dt dtfdd}|du r|}nt|stdt|j|dk r'td|d	k r/td
| d g | _	| j
  d| _d| _| d zzht| jD ]`}t|}t|j| jd< t|j| jd< | j||||||dI dH }	| j|	||||dI dH }
| j|	|
||dI dH }| ||	|
|}| |}| jjtt| j|t| j | j	 | qNW n t!y   d| _| d  w W | "| j	}| #| n| "| j	}| #| w | jr| jd| $|d |S | d |S )a/  Run the experiment by executing the task on all dataset records and evaluating the results.

        :param jobs: Maximum number of concurrent task and evaluator executions (default: 10)
        :param raise_errors: Whether to raise exceptions on task or evaluator errors (default: False)
        :param sample_size: Optional number of dataset records to sample for testing
                            (default: None, uses full dataset)
        :param max_retries: Maximum number of retries for failed tasks and evaluators (default: 0)
        :param retry_delay: Callable that takes the attempt number (0-based) and returns the delay
                            in seconds before the next retry. Default: ``0.1 * (attempt + 1)``
        :return: ExperimentResult containing evaluation results and metadata
        attemptr-   c                 S      d| d  S Ng?r   r/   r  r/   r/   r0   _default_retry_delay  r  z,Experiment.run.<locals>._default_retry_delayNz&retry_delay must be a callable, got {}r   zjobs must be at least 1r   zmax_retries must be >= 0zLLMObs is not enabled. Ensure LLM Observability is enabled via `LLMObs.enable(...)` and create the experiment via `LLMObs.async_experiment(...)` before running the experiment.Frunningr   r   r  r  )r  r  r  r  )r  Tinterruptedfailed)r   	completed)%r   r[  callabler:   rg  r  r2   r;   r  _run_resultsr=  clear_interrupted_has_errorsr  ranger;  r   r6   r   r9  r   	_run_task_run_evaluators_run_summary_evaluatorsrK  rd  r1  r   experiment_eval_postr   r   r   BaseException_build_result_log_experiment_summary_build_error_summary)r.   r  r  re  r  r  r  r   r   rB  r   rH  
run_resultexperiment_evalsresultr/   r/   r0   r     sl   






zExperiment.runrun_resultsc                 C   s.   | r| d j ni | r| d j| dS g | dS )Nr   r   r   r   )r   r   )r  r/   r/   r0   r    s   zExperiment._build_resultr  c                 C   s  | dg }g }| jr|d| jt|| j t|D ]\}}|j}| jdkr2d|d | jnd}t	dd |D }i }	|D ]3}
|
 dpIi 
 D ]'\}}|	|d	d	d
}|d  d7  < t|trs| drs|d  d7  < qLqAd| j}|r|d|7 }|d|t|t|	 |r|d|t| |	
 D ](\}}|d r|d||d t||d  q|d||d t| qq| jr|dt| jd| j d|S )Nr   z1Experiment '{}' was interrupted after {}/{} runs.r   z	Run {}/{}r   c                 s   s2    | ]}t |d tr|d  drdV  qdS )r   rl  r   N)r9   r   r7   )r   r"  r/   r/   r0   r     s    
z8Experiment._format_experiment_summary.<locals>.<genexpr>r   r   )r   errorsr   r   r  zExperiment '{}'z - {}z{}: {} rows, {} evaluator(s).z  Task errors: {}/{}z"  {}: {}/{} evaluated, {} error(s)z  {}: {}/{} evaluatedzRetries ({}):
  {}z
  
)r   r  r   rg  r8   r   r;  rC  r   sumr  
setdefaultr9   r7   r=  rp  )r.   r  r   partsrun_idxr   r   	run_labeltask_error_count
eval_statsr"  r8   r  statsheaderr`   r/   r/   r0   _format_experiment_summary  sJ    
 
z%Experiment._format_experiment_summaryc                 C   s   g }|  dg D ]Y}|jD ]S}| d}t|tr-| dr-|d| dd|d  | dp3i  D ])\}}t|ts@q6| d}t|tr_| dr_|d|| dd|d  q6qq|sfd	S d
t|S )Nr   r   rl  z{}: {}r  Errorr   z{} ({}): {}zunknown errorz; )	r   r   r9   r7   r   rg  r  rp  r  )r  r  r   r"  rN  r8   r  eval_errr/   r/   r0   r  6  s$   



 
zExperiment._build_error_summaryc                 C   s8   |  |}|s	d S | jrtjntj}||ddid d S )Nproductllmobs)extra)r  r  r   r   info)r.   r  msglog_fnr/   r/   r0   r  H  s
   
z"Experiment._log_experiment_summaryc                 C   r  r  r/   r  r/   r/   r0   <lambda>U      zExperiment.<lambda>
idx_record	semaphorec                    sF  | j r| j js
dS |4 I dH  |\}}| j j| jj| jt|j|j| jj	| j
| j| j	dP}| j j|d}	| jr?|	| _|	rN|	dd}
|	dd}nd\}
}|d }|d	d}|d
}i | jt| jjt|t| jd}|dg }|D ]}d|v r|dd\}}|||< q}|r||d< d}d}td| D ]p}z"t| jr| || jI dH }nt| j|| jI dH }d}W  nL ty } z?t }||k r| jd||d |d | |  zt||I dH  W | I dH  n| I dH  w W Y d}~qd}~ww |dkrt||d< |r'd| _ |j!|  | j j"||||d |#t$|d  d|v rF|#t%|d  | jrQ|#t&| j ||
||j'||| j	| jj	d|(t)|(t*|(t+ddW  d   W  d  I dH  S 1 sw   Y  W d  I dH  dS 1 I dH sw   Y  dS )z%Process single record asynchronously.N)r8   rZ  r   r   r4  r0  r   r5  )spanrF   r   rG   )r   r   rA   r   r   )r   dataset_record_idrZ  r,   :r   dataset_record_canonical_idz%task row {}: attempt {}/{} failed: {}r   retriesT)rA   rB   r,   rC   r+   )dataset_record_indexr5  r4  )rl  rm  r  )r   rF   rG   r   r   r+   r   ),r1  r  _experimentr(  r2   r   r6   r   r7  r8   r>  r?  export_spanr<  rA  r   r9  splitr  asyncioiscoroutinefunctionr:  	to_threadr  rn  ro  r=  r   rg  releasesleepacquirer  set_exc_infoannotate_set_ctx_itemr   r   r   start_nsget_tagr   r   r   )r.   r  r   r  r  r  r   r   r  span_contextrF   rG   rA   r   r   r,   record_tagstagr   r(   rB   last_exc_infor  r  r/   r/   r0   _process_recordO  s   		


 


0zExperiment._process_recordc                 C   r  r  r/   r  r/   r/   r0   r    r  c                    s   j rj js
g S |}t| fddt|D }tj|ddiI d H }	g }
|	D ]A}t|tr>|r=|q2|sAq2|}|
	| |
dpNi }t|trs|
d}|
d}|
d}|rs|rstd	|d
 |||q2j   |
S )Nc              	      s    g | ]}j | d qS )r  )r  )r   r  r  r  r   r.   r  r/   r0   r    s    z(Experiment._run_task.<locals>.<listcomp>return_exceptionsTr   rl  rm  r  zError on record {}: {}
{}
{}r   )r1  r  rj  r  	SemaphorerC  gatherr9   r  r   r   r7   RuntimeErrorrg  flush)r.   r  r   r  re  r  r  subset_datasetcorosresultsrB  r  rE  err_dicterr_msg	err_stackerr_typer/   r  r0   r    s>   	








zExperiment._run_taskc                 C   r  r  r/   r  r/   r/   r0   r    r  c              
      s   t |dtdtdtttttf f ffdd  fddt|D }t j| I d H }g }t|D ]\}	}
|	|	|
d q8|S )	Nr   rE  r-   c                    s  4 I d H g j |  }|d }|d }|d }|di }i }jD ]<}d }	d }
d}td  D ]#}d }	d }
zt|trf|j}i |dji}t|||||d|d	d
}|	|I d H }n\t
|rx|j}||||I d H }nJt|r|j}i |dji}t|||||d|d	d
}t
|j	|I d H }nt|r|j}t
||||I d H }ntd| t|}d }i }t|tr|jr|j|d< |jr|j|d< |jr|j|d< |jr|j|d< |j}	n|}	W  ng tyX } zZi }|}
| k r=jd|| |d  d |   zt
|I d H  W   I d H  n  I d H  w W Y d }~q4d_!rNt"d| d|  |W Y d }~q4d }~ww |	|
d|||< q%|W  d   I d H  S 1 I d H svw   Y  d S )NrA   r   rC   r+   r   r   rz  rF   rG   )rA   rB   rC   r+   rF   rG   zHEvaluator %s is neither a BaseEvaluator instance nor a callable functionr)   r*   r,   z/evaluator '{}' row {}: attempt {}/{} failed: {}Tz
Evaluator z failed on row r(   r   )#r7  r   r)  r  r9   rg   r8   r:  r@   r]   r  r  r2   rq   r  ru   r   r   r6   r'   r)   r*   r+   r,   r(   r  rx  r=  r   rg  r  r  r  r  r  )r   rE  r   rA   rB   rC   r+   row_resultsrn   eval_result_valuer  evaluator_namer  combined_metadatarW   r   extra_return_valuesr  )r  r  r  r.   r  r/   r0   _evaluate_row  s   









 
2z1Experiment._run_evaluators.<locals>._evaluate_rowc                    s   g | ]	\}} ||qS r/   r/   )r   r   rE  )r  r/   r0   r  Y  s    z.Experiment._run_evaluators.<locals>.<listcomp>r   r   )
r  r  r   r   r7   r6   r&   rC  r  r   )r.   rB  r  r  r  r  r  r  r   r   r  r/   )r  r  r  r  r.   r  r0   r    s   
2qzExperiment._run_evaluatorsc              	      s    ||\t|dtdtttttf f ffdd  fddjD }tj	|d iI d H }g }i }t
|D ]%\}	}
t|
trSqIttttttf f |
\}}|||< ||	|d qI|S )	Nsummary_evaluatorr-   c                    sZ  4 I d H  d }d }d}z[t | tr)| j}t d}| |I d H }n=t| r<| j}|  I d H }n*t| rW| j}t d}t	| j|I d H }n| j}t	|  I d H }|}W n% t
y } zd_|}rtd| d|W Y d }~nd }~ww |||dfW  d   I d H  S 1 I d H sw   Y  d S )Nr   )rJ   rK   rL   rM   r+   TzSummary evaluator z failedr  )r9   rj   r8   rI   r]   r  r  r2   rt   r  r  r  rx  r  )r  r  r  r  rW   r   r  )r|  rL   rJ   r{  rK   r  r.   r  r/   r0   _evaluate_summary_singles  sh   



0zDExperiment._run_summary_evaluators.<locals>._evaluate_summary_singlec                    s   g | ]} |qS r/   r/   )r   r  )r  r/   r0   r    rf  z6Experiment._run_summary_evaluators.<locals>.<listcomp>r  r  )r  r  r  r	   tupler6   r7   r&   r*  r  rC  r9   r  r   r   )r.   rB  ry  r  r  r  r  r   
evals_dictr   r  r  rJ  r/   )	r  r|  rL   rJ   r{  rK   r  r.   r  r0   r  b  s2   

8
z"Experiment._run_summary_evaluatorsr   r   c           	         s   t |pd}t|j| jd< t|j| jd< | |||d I d H }| j|||dI d H }| |||g }| |}| j	j
tt| j|t| j i g |gdS )Nr   r   r   )r  r  r  )r   r6   r   r9  r   r  r  rK  rd  r1  r   r  r   r   )	r.   r  r  r   r   rB  r   r  r  r/   r/   r0   _run_task_single_iteration  s   
z%Experiment._run_task_single_iterationr  r%   rU  is_summary_evalc
                 C   s  | j std|dur$t|tr t|dtr t|dts$td|du r3|s3| jdu r3td|du r<|s<| j}|durFt|d ntt		 d }
| 
||d|r[|dd	nd	|rd|dd	nd	|
|rjd
nd||||	}| jjtt| j|gt| j dS )a  Submit an evaluation metric for a distributed experiment.

        :param eval_name: Name of the evaluation metric
        :param eval_value: Value of the evaluation metric
        :param span: Optional span context dict with span_id and trace_id. If None and not a
                     summary eval, uses the last span from _run_task_single_iteration.
        :param timestamp_ms: Optional timestamp in milliseconds
        :param is_summary_eval: Whether this is a summary-level evaluation
        :param reasoning: Optional reasoning string
        :param assessment: Optional assessment string
        :param metadata: Optional metadata dict
        :param tags: Optional tags dict
        z4this method is only used for distributed experimentsNrF   rG   z`span` must be a dictionary containing both span_id and trace_id keys. LLMObs.export_span() can be used to generate this dictionary from a given span.zCunexpected state, must supply span or must run the experiment firstrV  g    eAr   r`  rL  )r<  r;   r9   r7   r   r6   r:   rA  r   timer^  r1  r   r  r   r   r   r9  )r.   r`   rM  r  rU  r  r)   r*   r+   r,   rO  r]  r/   r/   r0   _submit_eval_metric  sB   $
zExperiment._submit_eval_metric)r   NNNNNF)rL  NNNNrc   )T)r  FNr   N)Fr  )r   Fr   )NNNNNNN)=r2   r3   r4   r5   r   TaskTypeAsyncTaskTyperH   r   rl   rm   SummaryEvaluatorTypeAsyncSummaryEvaluatorTypeclassmethodr,  r6   r   r   r7   
ConfigTyper   r$  r1   r%  r   r   rN   r   r   r   rK  r&   r^  rd  rj  r  r	   rx  r  rO   r  r  r  r   r[  r   r   staticmethodr  r  r  r  r   r  r  r  r  r  r  r  r  r/   r/   r/   r0   r'    s  
 	

	

/

-	

-
4


K*

h
.
 	
Z
	
r'  c                   @   s   e Zd ZdZ						ddedeeef dede	ee
ef  ded	ed
eeeef  dee ded dee	eeef   dee ddfddZ					d dededee dedeeegef  defddZedefddZdS )!SyncExperimentzThin synchronous wrapper around the async-native ``Experiment``.

    Provides a blocking ``run()`` method for callers that do not have an event loop.
    r   Nr8   r-  r.  r/  r0  r   r,   r   r1  r"   r2  r   r-   c                 C   s$   t |||||||||	|
|d| _d S )N)r8   r-  r.  r/  r0  r   r,   r   r1  r2  r   )r'  r  )r.   r8   r-  r.  r/  r0  r   r,   r   r1  r2  r   r/   r/   r0   r1     s   zSyncExperiment.__init__r   Fr   r  r  re  r  r  c           
      C   s   | j j|||||d}zt  W n ty   t| Y S w ddl}|jjdd}|tj|}	|		 W  d   S 1 sAw   Y  dS )a  Run the experiment synchronously.

        :param jobs: Maximum number of concurrent task and evaluator executions (default: 1)
        :param raise_errors: Whether to raise exceptions on task or evaluator errors (default: False)
        :param sample_size: Optional number of dataset records to sample for testing
                            (default: None, uses full dataset)
        :param max_retries: Maximum number of retries for failed tasks and evaluators (default: 0)
        :param retry_delay: Callable that takes the attempt number (0-based) and returns the delay
                            in seconds before the next retry. Default: ``0.1 * (attempt + 1)``
        :return: ExperimentResult containing evaluation results and metadata
        )r  r  re  r  r  r   Nr   )max_workers)
r  r   r  get_running_loopr  concurrent.futuresfuturesThreadPoolExecutorsubmitr  )
r.   r  r  re  r  r  coro
concurrentpoolfuturer/   r/   r0   r   3  s"   $zSyncExperiment.runc                 C   s   | j jS rc   )r  r   r   r/   r/   r0   r   X  s   zSyncExperiment.url)r   NNNNN)r   FNr   N)r2   r3   r4   r5   r6   r   r  r  r   r   rl   rm   r   r7   r   r  r  r   r1   r$  r   r[  r   r   r%  r   r/   r/   r/   r0   r    sj    
	


%r  c                  C   s"   d} t jtv r	d} d|  t j S )Nr   zapp.zhttps://)r   _dd_siter   )	subdomainr/   r/   r0   r   ]  s   
r   )F)labcr   r   r  copyr   dataclassesr   r   r<   rn  r  rq  typingr   r	   r
   r   r   r   r   r   r   r   r   r   r  typing_extensionsr   deepeval.metricsr   r   ddtracer   ddtrace.constantsr   r   r   ddtrace.internal.loggerr   ddtrace.llmobs._constantsr   r   r   r   ddtrace.llmobs._utilsr   r    ddtrace.versionr!   ddtrace.llmobsr"   ddtrace.llmobs._writerr#   r$   ddtrace.llmobs.typesr%   r2   r   r6   r   r[  r$  rN   r7   r&   r   r   rO   r  r  r'   r>   r@   rI   rP   re   rg   rj   rk   rH   rl   rm   r  r  rq   rs   rt   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r'  r  r   r/   r/   r/   r0   <module>   sR   " *S2
	
					) 
         L