Source code for datasets.types

import uuid
from dataclasses import dataclass, field
from typing import Dict
from adalflow.core.base_data_class import DataClass



[docs]
@dataclass
class BaseData(DataClass):
    __doc__ = """A common dataclass for representing examples in a dataset."""
    # ``default_factory`` is called on every instantiation, so each example
    # that is created without an explicit ``id`` receives its own fresh UUID.
    # A plain ``default=str(uuid.uuid4())`` would be evaluated only once, when
    # the class body executes, and all instances would share that single id.
    id: str = field(
        metadata={"desc": "The unique identifier of the example", "type": "id"},
        default_factory=lambda: str(uuid.uuid4()),
    )




[docs]
@dataclass
class Example(DataClass):
    __doc__ = """A common dataclass for representing examples in a dataset."""
    # ``default_factory`` is called on every instantiation, so each example
    # that is created without an explicit ``id`` receives its own fresh UUID.
    # A plain ``default=str(uuid.uuid4())`` would be evaluated only once, when
    # the class body executes, and all instances would share that single id.
    id: str = field(
        metadata={"desc": "The unique identifier of the example"},
        default_factory=lambda: str(uuid.uuid4()),
    )
    # Both text fields default to ``None`` so an example can be built
    # incrementally or restored from a partial dict.
    question: str = field(
        metadata={"desc": "The question to be answered"}, default=None
    )
    answer: str = field(metadata={"desc": "The answer to the question"}, default=None)




[docs]
@dataclass
class GSM8KData(Example):
    __doc__ = """A dataclass for representing examples in the GSM8K dataset.

    You can reset the output fields:

    .. code-block:: python

        GSM8KData.set_output_fields(["answer"])
    """
    # Ground-truth reasoning that accompanies the answer.
    gold_reasoning: str = field(
        default=None,
        metadata={"desc": "The ground truth reasoning for the answer"},
    )
    # Reasoning produced by your own model.
    reasoning: str = field(
        default=None,
        metadata={"desc": "The reasoning for the answer"},
    )

    # Field-role declarations; these are the defaults and, per the class
    # documentation, the output fields can be reset via ``set_output_fields``.
    __input_fields__ = ["question"]
    __output_fields__ = ["reasoning", "answer"]




[docs]
@dataclass
class HotPotQAData(Example):
    __doc__ = """A dataclass for representing examples in the HotPotQA dataset."""
    # Titles of the supporting documents; ``None`` when not provided.
    gold_titles: set = field(
        default=None,
        metadata={"desc": "The set of titles that support the answer"},
    )
    # Context attached to the question, keyed by string; ``None`` when absent.
    context: Dict[str, object] = field(
        default=None,
        metadata={"desc": "The context of the question"},
    )

    # Field-role declarations: the question is the input, the answer the output.
    __input_fields__ = ["question"]
    __output_fields__ = ["answer"]


    # @staticmethod
    # def from_dict(d: Dict[str, Any]) -> "HotPotQAData":
    #     # Preprocess gold_titles
    #     if "gold_titles" in d and isinstance(d["gold_titles"], str):
    #         try:
    #             d["gold_titles"] = json.loads(d["gold_titles"])
    #         except json.JSONDecodeError:
    #             # Replace single quotes with double quotes
    #             fixed_str = d["gold_titles"].replace("'", '"')
    #             d["gold_titles"] = set(json.loads(fixed_str))

    #     # Preprocess context
    #     if "context" in d and isinstance(d["context"], str):
    #         try:
    #             d["context"] = json.loads(d["context"])
    #         except json.JSONDecodeError:
    #             fixed_str = d["context"].replace("'", '"')
    #             d["context"] = json.loads(fixed_str)

    #     return HotPotQAData(**d)



[docs]
@dataclass
class TrecData(BaseData):
    __doc__ = """A dataclass for representing examples in the TREC dataset."""
    # The text to classify.
    question: str = field(
        default=None,
        metadata={"desc": "The question to be classified"},
    )
    # Human-readable label of the class.
    class_name: str = field(
        default=None,
        metadata={"desc": "One of {ABBR, ENTY, DESC, HUM, LOC, NUM}"},
    )
    # Numeric label; the default of -1 lies outside the valid [0, 5] range.
    class_index: int = field(
        default=-1,
        metadata={"desc": "The class label, in range [0, 5]"},
    )

    # Field-role declarations; the order of the names is significant.
    __input_fields__ = ["question"]
    __output_fields__ = ["class_name", "class_index"]



if __name__ == "__main__":
    # Manual round-trip check for HotPotQAData:
    # object -> dict -> object, then dict -> JSON file -> dict -> object.
    sample = HotPotQAData(
        question="What is the capital of France?",
        answer="Paris",
        gold_titles={"Paris", "France"},
        context={"Paris": "The capital of France"},
    )

    # In-memory round trip through a plain dict.
    data_dict = sample.to_dict()
    print("data_dict", data_dict)
    rebuilt = HotPotQAData.from_dict(data_dict)
    print("data", rebuilt)

    from adalflow.utils.file_io import save_json, load_json

    # Persist the dict to disk and read it back.
    json_path = "task.json"
    save_json(data_dict, f=json_path)
    data_dict_loaded = load_json(f=json_path)

    print("data_dict_loaded", data_dict_loaded)

    # Rebuild the example from the dict that came back from disk.
    data_restored = HotPotQAData.from_dict(data_dict_loaded)
    print("data_restored", data_restored)