Source code for eval.answer_match_acc

"""This is the metric for QA generation. It compares the predicted answer with the ground truth answer."""

from typing import List, Literal
from adalflow.eval.base import BaseEvaluator, EvaluationResult
from adalflow.optim.parameter import Parameter
from adalflow.eval.utils import normalize_answer, f1_score


[docs]
class AnswerMatchAcc(BaseEvaluator):
    r"""Metric for answer matching. It compares the predicted answer with the ground truth answer.

    Args:
        type (str): Type of matching evaluation. One of "exact_match", "fuzzy_match",
            "f1_score", "rouge_score", "bleu_score", or "bert_score".
            "exact_match" requires the predicted answer to be exactly the same
            (after normalization) as the ground truth answer, while "fuzzy_match"
            only requires the predicted answer to contain the ground truth answer.

    Examples:
        >>> pred_answers = ["positive", "negative", "this is neutral"]
        >>> gt_answers = ["positive", "negative", "neutral"]
        >>> answer_match_acc = AnswerMatchAcc(type="exact_match")
        >>> result = answer_match_acc.compute(pred_answers, gt_answers)
        >>> result.avg_score
        0.6666666666666666
        >>> result.per_item_scores
        [1.0, 1.0, 0.0]
        >>> answer_match_acc = AnswerMatchAcc(type="fuzzy_match")
        >>> result = answer_match_acc.compute(pred_answers, gt_answers)
        >>> result.avg_score
        1.0
        >>> result.per_item_scores
        [1.0, 1.0, 1.0]

    References:
        1. HotpotQA: https://github.com/hotpotqa/hotpot/blob/master/hotpot_evaluate_v1.py
    """

    def __init__(
        self,
        type: Literal[
            "exact_match",
            "fuzzy_match",
            "rouge_score",
            "bleu_score",
            "bert_score",
            "f1_score",
        ] = "exact_match",
    ):
        self.type = type
        # Lazily import the optional torchmetrics backends only for the chosen metric.
        if self.type == "bert_score":
            from torchmetrics.text.bert import BERTScore

            self.bertscore = BERTScore()
        elif self.type == "rouge_score":
            from torchmetrics.text.rouge import ROUGEScore

            self.rougescore = ROUGEScore()
        elif self.type == "bleu_score":
            from torchmetrics.text.bleu import BLEUScore

            self.bleuscore = BLEUScore()
[docs]
    def compute_single_item(
        self,
        y: object,
        y_gt: object,
    ) -> float:
        r"""Compute the match accuracy of the predicted answer for a single query.

        Any type of input is allowed for ``y`` and ``y_gt``; both are converted
        to strings before evaluation.

        Args:
            y (object): Predicted answer.
            y_gt (object): Ground truth answer.

        Returns:
            float: Match accuracy.
        """
        # Unwrap optimizer parameters to their underlying data.
        if isinstance(y, Parameter):
            y = y.data
        if isinstance(y_gt, Parameter):
            y_gt = y_gt.data
        try:
            y = str(y).strip()
            y_gt = str(y_gt).strip()
        except Exception as e:
            raise ValueError(
                f"Error converting pred_answer and gt_answer to string: {e}"
            )

        if self.type == "exact_match":
            return 1.0 if normalize_answer(y) == normalize_answer(y_gt) else 0.0
        elif self.type == "fuzzy_match":
            y = normalize_answer(y)
            y_gt = normalize_answer(y_gt)
            return 1.0 if y_gt in y else 0.0
        elif self.type == "f1_score":
            return f1_score(y, y_gt)
        elif self.type == "bert_score":
            # Reuse the BERTScore instance created in __init__.
            score = self.bertscore([y], [y_gt])
            return score["precision"].item()
        elif self.type == "rouge_score":
            # Reuse the ROUGEScore instance created in __init__.
            score = self.rougescore([y], [y_gt])
            return score["rouge1_precision"].item()
        elif self.type == "bleu_score":
            # Reuse the BLEUScore instance created in __init__.
            score = self.bleuscore([y], [y_gt])
            return score.item()
        else:
            raise NotImplementedError(f"Unsupported metric type: {self.type}")
[docs]
    def compute(
        self, pred_answers: List[str], gt_answers: List[str]
    ) -> EvaluationResult:
        r"""Compute the match accuracy of the predicted answers for a list of queries.

        Args:
            pred_answers (List[str]): List of predicted answer strings.
            gt_answers (List[str]): List of ground truth answer strings.

        Returns:
            EvaluationResult:
                - avg_score (float): Average match accuracy.
                - per_item_scores (List[float]): Match accuracy for each query.
        """
        match_acc_list = []
        for pred_answer, gt_answer in zip(pred_answers, gt_answers):
            match = self.compute_single_item(pred_answer, gt_answer)
            match_acc_list.append(match)

        return EvaluationResult(
            avg_score=sum(match_acc_list) / len(match_acc_list),
            per_item_scores=match_acc_list,
        )
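

# --- Usage sketch (illustrative only, not part of the library source) ---
# A minimal example of how this metric might be invoked, assuming `adalflow`
# is installed and this module is importable; the sample answers mirror the
# docstring example above.
if __name__ == "__main__":
    acc = AnswerMatchAcc(type="fuzzy_match")
    result = acc.compute(
        pred_answers=["positive", "negative", "this is neutral"],
        gt_answers=["positive", "negative", "neutral"],
    )
    print(result.avg_score)        # 1.0: every prediction contains its ground truth
    print(result.per_item_scores)  # [1.0, 1.0, 1.0]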