"""This is the metric for QA generation. It compares the predicted answer with the ground truth answer."""
from typing import List, Literal
from adalflow.eval.base import BaseEvaluator, EvaluationResult
from adalflow.optim.parameter import Parameter
from adalflow.eval.utils import normalize_answer, f1_score
class AnswerMatchAcc(BaseEvaluator):
r"""
Metric for answer matching. It compares the predicted answer with the ground truth answer.
Args:
type (str): Type of matching evaluation. Can be "exact_match" or "fuzzy_match". "exact_match" requires the predicted answer to be exactly the same as the ground truth answer. "fuzzy_match" requires the predicted answer to contain the ground truth answer.
Examples:
>>> pred_answers = ["positive", "negative", "this is neutral"]
>>> gt_answers = ["positive", "negative", "neutral"]
>>> answer_match_acc = AnswerMatchAcc(type="exact_match")
>>> avg_acc, acc_list = answer_match_acc.compute(all_pred_answer, all_gt_answer)
>>> avg_acc
2 / 3
>>> acc_list
[1.0, 1.0, 0.0]
>>> answer_match_acc = AnswerMatchAcc(type="fuzzy_match")
>>> avg_acc, acc_list = answer_match_acc.compute(all_pred_answer, all_gt_answer)
>>> avg_acc
1.0
>>> acc_list
[1.0, 1.0, 1.0]
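
        Token-level F1 gives partial credit for overlapping tokens. A sketch, assuming
        the HotpotQA-style ``f1_score`` helper imported above (exact values depend on
        its normalization, hence the skipped doctest):

        >>> answer_match_acc = AnswerMatchAcc(type="f1_score")
        >>> result = answer_match_acc.compute(pred_answers, gt_answers)
        >>> result.per_item_scores  # doctest: +SKIP
        [1.0, 1.0, 0.5]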

    References:
        1. HotpotQA: https://github.com/hotpotqa/hotpot/blob/master/hotpot_evaluate_v1.py
    """

    def __init__(
        self,
        type: Literal[
            "exact_match",
            "fuzzy_match",
            "rouge_score",
            "bleu_score",
            "bert_score",
            "f1_score",
        ] = "exact_match",
    ):
        self.type = type
        # torchmetrics is an optional dependency, so import it lazily and only
        # instantiate the scorer that the chosen metric type actually needs.
        if self.type == "bert_score":
            from torchmetrics.text.bert import BERTScore

            self.bertscore = BERTScore()
        elif self.type == "rouge_score":
            from torchmetrics.text.rouge import ROUGEScore

            self.rougescore = ROUGEScore()
        elif self.type == "bleu_score":
            from torchmetrics.text.bleu import BLEUScore

            self.bleuscore = BLEUScore()

    def compute_single_item(
        self,
        y: object,
        y_gt: object,
    ) -> float:
        r"""
        Compute the match accuracy of the predicted answer for a single query.

        Any input type is accepted; ``Parameter`` inputs are unwrapped to their
        underlying data, and both values are converted to strings before evaluation.

        Args:
            y (object): Predicted answer.
            y_gt (object): Ground truth answer.

        Returns:
            float: Match accuracy.
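
        Example:
            An illustrative doctest (assumes the HotpotQA-style ``normalize_answer``,
            which lowercases and strips punctuation and articles):

            >>> AnswerMatchAcc(type="exact_match").compute_single_item("Paris", "paris")
            1.0
            >>> AnswerMatchAcc(type="fuzzy_match").compute_single_item("It is Paris.", "Paris")
            1.0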
"""
        if isinstance(y, Parameter):
            y = y.data
        if isinstance(y_gt, Parameter):
            y_gt = y_gt.data
        try:
            y = str(y).strip()
            y_gt = str(y_gt).strip()
        except Exception as e:
            raise ValueError(
                f"Error converting pred_answer and gt_answer to string: {e}"
            )

        if self.type == "exact_match":
            return 1.0 if normalize_answer(y) == normalize_answer(y_gt) else 0.0
        elif self.type == "fuzzy_match":
            y = normalize_answer(y)
            y_gt = normalize_answer(y_gt)
            return 1.0 if y_gt in y else 0.0
        elif self.type == "f1_score":
            return f1_score(y, y_gt)
        elif self.type == "bert_score":
            # Reuse the scorer created in __init__ and return the precision component.
            score = self.bertscore([y], [y_gt])
            return score["precision"].item()
        elif self.type == "rouge_score":
            # Reuse the scorer created in __init__ and return ROUGE-1 precision.
            score = self.rougescore([y], [y_gt])
            return score["rouge1_precision"].item()
        elif self.type == "bleu_score":
            # Reuse the scorer created in __init__; BLEUScore returns a scalar tensor.
            score = self.bleuscore([y], [y_gt])
            return score.item()
        else:
            raise NotImplementedError(f"Unsupported matching type: {self.type}")

    def compute(
        self, pred_answers: List[str], gt_answers: List[str]
    ) -> EvaluationResult:
        r"""
        Compute the match accuracy of the predicted answers for a list of queries.

        Args:
            pred_answers (List[str]): List of predicted answer strings.
            gt_answers (List[str]): List of ground truth answer strings.

        Returns:
            EvaluationResult: ``avg_score`` holds the average match accuracy and
            ``per_item_scores`` holds the per-query accuracies.
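
        Example:
            A minimal sketch (assumes ``EvaluationResult`` exposes ``avg_score`` and
            ``per_item_scores`` as attributes, matching how it is constructed below):

            >>> evaluator = AnswerMatchAcc(type="fuzzy_match")
            >>> evaluator.compute(["the answer is 42"], ["42"]).avg_score
            1.0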
"""
match_acc_list = []
for pred_answer, gt_answer in zip(pred_answers, gt_answers):
match = self.compute_single_item(pred_answer, gt_answer)
match_acc_list.append(match)
return EvaluationResult(
avg_score=sum(match_acc_list) / len(match_acc_list),
per_item_scores=match_acc_list,
)
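

if __name__ == "__main__":
    # Minimal usage sketch (illustrative): exact-match accuracy over a small batch.
    # Assumes EvaluationResult exposes avg_score / per_item_scores as attributes,
    # matching how it is constructed in compute() above.
    pred_answers = ["positive", "negative", "this is neutral"]
    gt_answers = ["positive", "negative", "neutral"]

    evaluator = AnswerMatchAcc(type="exact_match")
    result = evaluator.compute(pred_answers, gt_answers)
    print(result.avg_score)        # expected: 0.6666666666666666
    print(result.per_item_scores)  # expected: [1.0, 1.0, 0.0]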