"""This is the metric to use an LLM as a judge for evaluating the performance of predicted answers."""
from typing import List, Dict, Any, Optional, Union, Literal, Tuple
from dataclasses import dataclass
import logging
from itertools import zip_longest
from adalflow.core.component import Component
from adalflow.optim.parameter import Parameter, ParameterType
from adalflow.core.model_client import ModelClient
from adalflow.eval.base import BaseEvaluator
from adalflow.eval.functional import confidence_interval
__all__ = ["DefaultLLMJudge", "LLMasJudge", "LLMJudgeEvalResult"]
log = logging.getLogger(__name__)
DEFAULT_LLM_EVALUATOR_PROMPT = r"""<START_OF_SYSTEM_PROMPT>
{# task desc #}
{{task_desc_str}}
{# examples #}
{% if examples_str %}
{{examples_str}}
{% endif %}
<END_OF_SYSTEM_PROMPT>
---------------------
<START_OF_USER>
{# question #}
{% if question_str is defined %}
Question: {{question_str}}
{% endif %}
{# ground truth answer #}
{% if gt_answer_str is defined %}
Ground truth answer: {{gt_answer_str}}
{% endif %}
{# predicted answer #}
Predicted answer: {{pred_answer_str}}
<END_OF_USER>
"""
DEFAULT_JUDGEMENT_QUERY = "Does the predicted answer contain the ground truth answer? Say True if yes, False if no."
DEFAULT_LLM_EVALUATOR_MODEL_KWARGS = {
"model": "gpt-3.5-turbo",
"temperature": 1,
"stream": False,
}
@dataclass
class LLMJudgeEvalResult:
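    """Aggregated result of an LLM-judge run: the average score, the per-item
    binary scores, and a confidence interval over those scores."""
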
avg_score: float
    judgement_score_list: List[int]  # 1 if the item was judged True, else 0
confidence_interval: Tuple[float, float]
class DefaultLLMJudge(Component):
__doc__ = r"""Demonstrate how to use an LLM/Generator to output True or False for a judgement query.
You can use any of your template to adapt to more tasks and sometimes you can directly ask LLM to output a score in range [0, 1] instead of only True or False.
A call on the LLM judge equalize to _compute_single_item method.
Args:
model_client (ModelClient): The model client to use for the generator.
model_kwargs (Dict[str, Any], optional): The model kwargs to pass to the model client. Defaults to {}. Please refer to :ref:`ModelClient<components-model_client>` for the details on how to set the model_kwargs for your specific model if it is from our library.
template (str, optional): The template to use for the LLM evaluator. Defaults to None.
jugement_query (str, optional): The judgement query string. Defaults to DEFAULT_JUDGEMENT_QUERY.
output_type (Literal["bool", "float"], optional): The output type of the judgement. Defaults to "bool".
use_cache (bool, optional): Whether to use cache for the LLM evaluator. Defaults to True.
Note:
Must use True/False instead of Yes/No in the judgement_query for response.
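
    Example (a minimal sketch; assumes OPENAI_API_KEY is set in the environment):

    .. code-block:: python

        judge = DefaultLLMJudge(
            model_kwargs={"model": "gpt-3.5-turbo", "temperature": 0, "stream": False}
        )
        verdict = judge.call(
            question="Is Beijing in China?",
            gt_answer="Yes",
            pred_answer="Yes, Beijing is the capital of China.",
        )
        # verdict is True (or 1.0 when output_type="float")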
"""
def __init__(
self,
model_client: Optional[ModelClient] = None,
model_kwargs: Optional[Dict[str, Any]] = None,
template: Optional[str] = None,
jugement_query: Optional[str] = None,
example_str: Optional[str] = None,
output_type: Literal["bool", "float"] = "bool",
use_cache: bool = True,
):
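        # Generator is imported lazily here, presumably to avoid a circular
        # import between the eval components and the core generator module.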
from adalflow.core.generator import Generator
super().__init__()
self.model_client = model_client
if model_client is None:
log.info("model_client is None, default to OpenAIClient.")
try:
from adalflow.components.model_client import OpenAIClient
except ImportError:
raise ImportError(
"OpenAIClient is not available. Please fix the import error or set your own choice of model_client and model_kwargs."
)
self.model_client = OpenAIClient()
self.model_kwargs = model_kwargs or DEFAULT_LLM_EVALUATOR_MODEL_KWARGS
self.template = template or DEFAULT_LLM_EVALUATOR_PROMPT
self._jugement_query = jugement_query or DEFAULT_JUDGEMENT_QUERY
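        # The prompt pieces are wrapped in Parameters so that an AdalFlow
        # training pipeline can treat the task description (PROMPT) and the
        # few-shot examples (DEMOS) as optimizable parts of the prompt.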
self.llm_evaluator = Generator(
model_client=self.model_client,
model_kwargs=self.model_kwargs,
template=self.template,
use_cache=use_cache,
prompt_kwargs={
"task_desc_str": Parameter(
data=f"""You are an evaluator. Given the question(optional), ground truth answer(optional), and predicted answer, {self._jugement_query}""",
param_type=ParameterType.PROMPT,
),
"examples_str": Parameter(
data=example_str, param_type=ParameterType.DEMOS
),
},
)
self.output_type = output_type
def call(
self,
question: str,
gt_answer: str,
pred_answer: str,
) -> Union[bool, float]:
r"""
Get the judgement of the predicted answer for a single question.
Args:
question (str): Question string.
gt_answer (str): Ground truth answer string.
pred_answer (str): Predicted answer string.
judgement_query (str): Judgement query string.
Returns:
bool: Judgement result.
"""
output = self.llm_evaluator(
prompt_kwargs={
"question_str": question,
"gt_answer_str": gt_answer,
"pred_answer_str": pred_answer,
}
)
        # Guard against a failed generation where raw_response is None.
        judgement = (output.raw_response or "").strip().lower()
        output = False if self.output_type == "bool" else 0.0
        if "true" in judgement:
            output = True if self.output_type == "bool" else 1.0
        elif "false" in judgement:
            output = False if self.output_type == "bool" else 0.0
        else:
            log.warning(f"Invalid judgement: {judgement!r}; defaulting to False/0.0.")
        return output
def _extra_repr(self) -> str:
s = f"judgement_query= {self._jugement_query}, "
return s
class LLMasJudge(BaseEvaluator):
r"""
LLM as judge for evaluating the performance of a LLM.
Args:
llm_evaluator (Component, optional): The LLM evaluator to use. Defaults to DefaultLLMJudge.
Examples:
>>> questions = [
"Is Beijing in China?",
"Is Apple founded before Google?",
"Is earth flat?",
]
>>> pred_answers = ["Yes", "Yes, Appled is founded before Google", "Yes"]
>>> gt_answers = ["Yes", "Yes", "No"]
>>> judgement_query = "For the question, does the predicted answer contain the ground truth answer?"
>>> llm_judge = LLMasJudge()
>>> avg_judgement, judgement_list = llm_judge.compute(
questions, gt_answers, pred_answers, judgement_query
)
>>> avg_judgement
2 / 3
>>> judgement_list
[True, True, False]
Customize the LLMJudge
.. code-block:: python
llm_judge = Def
"""
def __init__(
self,
llm_judge: Optional[Component] = None,
):
super().__init__()
self.llm_judge = llm_judge or DefaultLLMJudge()
def compute(
self,
*,
pred_answers: List[str],
questions: Optional[List[str]] = None,
gt_answers: Optional[List[str]] = None,
) -> LLMJudgeEvalResult:
r"""
Get the judgement of the predicted answer for a list of questions.
Args:
questions (List[str]): List of question strings.
gt_answers (List[str]): List of ground truth answer strings.
pred_answers (List[str]): List of predicted answer strings.
judgement_query (str): Judgement query string.
Returns:
LLMEvalResult: The evaluation result.
"""
judgement_list = []
questions = questions or [None] * len(pred_answers)
gt_answers = gt_answers or [None] * len(pred_answers)
for question, gt_answer, pred_answer in zip_longest(
questions, gt_answers, pred_answers, fillvalue=None
):
judgement = self.llm_judge(
question,
gt_answer,
pred_answer,
)
judgement_list.append(judgement)
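        # avg_score is the fraction of items judged True; the confidence
        # interval (from adalflow.eval.functional) is computed over the
        # binary score list.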
avg_score = judgement_list.count(True) / len(judgement_list)
judgement_score_list = [1 if judgement else 0 for judgement in judgement_list]
confidence = confidence_interval(judgement_score_list)
return LLMJudgeEvalResult(avg_score, judgement_score_list, confidence)
def __str__(self) -> str:
s = f"llm_judge={self.llm_judge}"
return s
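

if __name__ == "__main__":
    # Minimal smoke test; a sketch only. It assumes OPENAI_API_KEY is set in
    # the environment and that the OpenAI API is reachable.
    questions = [
        "Is Beijing in China?",
        "Is Apple founded before Google?",
        "Is the earth flat?",
    ]
    pred_answers = ["Yes", "Yes, Apple was founded before Google", "Yes"]
    gt_answers = ["Yes", "Yes", "No"]

    llm_judge = LLMasJudge()
    eval_rslt = llm_judge.compute(
        questions=questions, gt_answers=gt_answers, pred_answers=pred_answers
    )
    print(eval_rslt)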