File: //home/arjun/.local/lib/python3.10/site-packages/langsmith/evaluation/integrations/_langchain.py
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Callable, Optional, TypedDict, Union
from langsmith.evaluation.evaluator import DynamicRunEvaluator
from langsmith.run_helpers import traceable
from langsmith.schemas import Example, Run
if TYPE_CHECKING:
from langchain.evaluation.schema import StringEvaluator
from langsmith.evaluation.evaluator import RunEvaluator
class SingleEvaluatorInput(TypedDict):
"""The input to a `StringEvaluator`."""
prediction: str
"""The prediction string."""
    reference: Optional[Any]
    """The reference value, if the evaluator requires one."""
    input: Optional[str]
    """The input string, if the evaluator requires one."""
class LangChainStringEvaluator:
r"""A class for wrapping a LangChain StringEvaluator.
Requires the `langchain` package to be installed.
Attributes:
evaluator (StringEvaluator): The underlying StringEvaluator OR the name
of the evaluator to load.
Methods:
as_run_evaluator() -> RunEvaluator:
Convert the LangChainStringEvaluator to a RunEvaluator.
Examples:
Creating a simple LangChainStringEvaluator:
>>> evaluator = LangChainStringEvaluator("exact_match")
Converting a LangChainStringEvaluator to a RunEvaluator:
>>> from langsmith.evaluation import LangChainStringEvaluator
>>> from langchain_openai import ChatOpenAI
>>> evaluator = LangChainStringEvaluator(
... "criteria",
... config={
... "criteria": {
... "usefulness": "The prediction is useful if"
... " it is correct and/or asks a useful followup question."
... },
... "llm": ChatOpenAI(model="gpt-4o"),
... },
... )
>>> run_evaluator = evaluator.as_run_evaluator()
>>> run_evaluator # doctest: +ELLIPSIS
<DynamicRunEvaluator ...>
Customizing the LLM model used by the evaluator:
>>> from langsmith.evaluation import LangChainStringEvaluator
>>> from langchain_anthropic import ChatAnthropic
>>> evaluator = LangChainStringEvaluator(
... "criteria",
... config={
... "criteria": {
... "usefulness": "The prediction is useful if"
... " it is correct and/or asks a useful followup question."
... },
... "llm": ChatAnthropic(model="claude-3-opus-20240229"),
... },
... )
>>> run_evaluator = evaluator.as_run_evaluator()
>>> run_evaluator # doctest: +ELLIPSIS
<DynamicRunEvaluator ...>
Using the `evaluate` API with different evaluators:
>>> def prepare_data(run: Run, example: Example):
... # Convert the evaluation data into the format expected by the evaluator
... # Only required for datasets with multiple inputs/output keys
... return {
... "prediction": run.outputs["prediction"],
... "reference": example.outputs["answer"],
... "input": str(example.inputs),
... }
>>> import re
>>> from langchain_anthropic import ChatAnthropic
>>> import langsmith
>>> from langsmith.evaluation import LangChainStringEvaluator, evaluate
>>> criteria_evaluator = LangChainStringEvaluator(
... "criteria",
... config={
... "criteria": {
... "usefulness": "The prediction is useful if it is correct"
... " and/or asks a useful followup question."
... },
... "llm": ChatAnthropic(model="claude-3-opus-20240229"),
... },
... prepare_data=prepare_data,
... )
>>> embedding_evaluator = LangChainStringEvaluator("embedding_distance")
>>> exact_match_evaluator = LangChainStringEvaluator("exact_match")
>>> regex_match_evaluator = LangChainStringEvaluator(
... "regex_match", config={"flags": re.IGNORECASE}, prepare_data=prepare_data
... )
>>> scoring_evaluator = LangChainStringEvaluator(
... "labeled_score_string",
... config={
... "criteria": {
... "accuracy": "Score 1: Completely inaccurate\nScore 5: Somewhat accurate\nScore 10: Completely accurate"
... },
... "normalize_by": 10,
... "llm": ChatAnthropic(model="claude-3-opus-20240229"),
... },
... prepare_data=prepare_data,
... )
>>> string_distance_evaluator = LangChainStringEvaluator(
... "string_distance",
... config={"distance_metric": "levenshtein"},
... prepare_data=prepare_data,
... )
>>> from langsmith import Client
>>> client = Client()
>>> results = evaluate(
... lambda inputs: {"prediction": "foo"},
... data=client.list_examples(dataset_name="Evaluate Examples", limit=1),
... evaluators=[
... embedding_evaluator,
... criteria_evaluator,
... exact_match_evaluator,
... regex_match_evaluator,
... scoring_evaluator,
... string_distance_evaluator,
... ],
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
def __init__(
self,
evaluator: Union[StringEvaluator, str],
*,
config: Optional[dict] = None,
prepare_data: Optional[
Callable[[Run, Optional[Example]], SingleEvaluatorInput]
] = None,
):
"""Initialize a LangChainStringEvaluator.
See: https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.StringEvaluator.html#langchain-evaluation-schema-stringevaluator
Args:
evaluator (StringEvaluator): The underlying StringEvaluator.
"""
from langchain.evaluation.schema import StringEvaluator # noqa: F811
if isinstance(evaluator, StringEvaluator):
self.evaluator = evaluator
elif isinstance(evaluator, str):
from langchain.evaluation import load_evaluator # noqa: F811
self.evaluator = load_evaluator(evaluator, **(config or {})) # type: ignore[assignment, arg-type]
else:
raise NotImplementedError(f"Unsupported evaluator type: {type(evaluator)}")
self._prepare_data = prepare_data
def as_run_evaluator(
self,
) -> RunEvaluator:
"""Convert the LangChainStringEvaluator to a RunEvaluator.
This is the object used in the LangSmith `evaluate` API.
Returns:
RunEvaluator: The converted RunEvaluator.
"""
        input_str = (
            "\n        \"input\": example.inputs['input'],"
            if self.evaluator.requires_input
            else ""
        )
        reference_str = (
            "\n        \"reference\": example.outputs['expected'],"
            if self.evaluator.requires_reference
            else ""
        )
customization_error_str = f"""
def prepare_data(run, example):
return {{
"prediction": run.outputs['my_output'],{reference_str}{input_str}
}}
evaluator = LangChainStringEvaluator(..., prepare_data=prepare_data)
"""
@traceable
def prepare_evaluator_inputs(
run: Run, example: Optional[Example] = None
) -> SingleEvaluatorInput:
if run.outputs and len(run.outputs) > 1:
                raise ValueError(
                    f"Evaluator {self.evaluator} only supports a single prediction "
                    "key. Please ensure that the run has a single output,"
                    " or initialize with a custom prepare_data function:\n"
                    f"{customization_error_str}"
                )
if (
self.evaluator.requires_reference
and example
and example.outputs
and len(example.outputs) > 1
):
                raise ValueError(
                    f"Evaluator {self.evaluator} only supports a single reference "
                    "key. Please ensure that the example has a single output,"
                    " or initialize with a custom prepare_data function:\n"
                    f"{customization_error_str}"
                )
if (
self.evaluator.requires_input
and example
and example.inputs
and len(example.inputs) > 1
):
                raise ValueError(
                    f"Evaluator {self.evaluator} only supports a single input "
                    "key. Please ensure that the example has a single input,"
                    " or initialize with a custom prepare_data function:\n"
                    f"{customization_error_str}"
                )
return SingleEvaluatorInput(
prediction=next(iter(run.outputs.values())), # type: ignore[union-attr]
reference=(
next(iter(example.outputs.values()))
if (
self.evaluator.requires_reference
and example
and example.outputs
)
else None
),
input=(
next(iter(example.inputs.values()))
if (self.evaluator.requires_input and example and example.inputs)
else None
),
)
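        # Synchronous path: prepare the inputs (default or user-supplied) and
        # run the underlying StringEvaluator.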
@traceable(name=self.evaluator.evaluation_name)
def evaluate(run: Run, example: Optional[Example] = None) -> dict:
eval_inputs = (
prepare_evaluator_inputs(run, example)
if self._prepare_data is None
else self._prepare_data(run, example)
)
results = self.evaluator.evaluate_strings(**eval_inputs)
return {"key": self.evaluator.evaluation_name, **results}
@traceable(name=self.evaluator.evaluation_name)
async def aevaluate(run: Run, example: Optional[Example] = None) -> dict:
eval_inputs = (
prepare_evaluator_inputs(run, example)
if self._prepare_data is None
else self._prepare_data(run, example)
)
results = await self.evaluator.aevaluate_strings(**eval_inputs)
return {"key": self.evaluator.evaluation_name, **results}
return DynamicRunEvaluator(evaluate, aevaluate)