"""A base class that provides an easy way for data to interact with LLMs."""
from typing import List, Dict, Any, Optional, Union, Callable, Type
import collections
from collections import OrderedDict
import enum
from copy import deepcopy
from dataclasses import (
import json
import yaml
import logging
from adalflow.core.functional import (
# dataclass_obj_to_dict,
__all__ = [
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Register a custom YAML representer so that ``collections.OrderedDict`` values
# are serialized through ``represent_ordereddict`` when dumped with ``yaml``.
# NOTE(review): ``represent_ordereddict`` is presumably one of the names imported
# from ``adalflow.core.functional`` above — confirm.
yaml.add_representer(collections.OrderedDict, represent_ordereddict)
def required_field(name: Optional[str] = None) -> Callable[[], Any]:
    """Create a ``default_factory`` that marks a dataclass field as required.

    Dataclasses do not allow a field without a default to follow a field that
    has one. Passing the callable returned here as ``default_factory`` works
    around that: the field formally has a default, but the factory raises a
    TypeError as soon as it is invoked, i.e. when no value was provided.

    Args:
        name (Optional[str], optional): The name of the required field. When
            given, it is included in the error message. Defaults to None.

    Returns:
        Callable[[], Any]: A callable that raises TypeError when called,
        indicating a missing required field.

    Example:

    .. code-block:: python

        from dataclasses import dataclass, field
        from adalflow.core.base_data_class import required_field, DataClass

        @dataclass
        class Person(DataClass):
            name: str = field(default=None)
            # allow required field after optional field
            age: int = field(default_factory=required_field())
    """

    def required_field_error():
        """This function is returned by required_field and raises an error indicating the field is required."""
        if name is None:
            raise TypeError("This field is required and was not provided.")
        raise TypeError(f"The field '{name}' is required and was not provided.")

    # Set the function's name explicitly.
    # NOTE(review): presumably other code recognises required fields by this
    # ``__name__`` — confirm before changing it.
    required_field_error.__name__ = "required_field"
    return required_field_error
# Field selectors accepted by the serialization helpers in this module.
# - A list of field names applies to the dataclass the method is called on.
# - A dict maps a dataclass name to its field names and is meant for nested
#   dataclasses, e.g. {"Person": ["name", "age"], "Address": ["city"]}
ExcludeType = Optional[Union[List[str], Dict[str, List[str]]]]
IncludeType = Optional[Union[List[str], Dict[str, List[str]]]]
class DataClass:
    __doc__ = r"""The base data class for all data types that interact with LLMs.

    Please only exclude optional fields in the exclude dictionary.

    Designed to streamline the handling, serialization, and description of data within our applications, especially to LLM prompt.
    We explicitly handle this instead of relying on 3rd party libraries such as pydantic or marshmallow to have better
    transparency and to keep the order of the fields when they get serialized.

    How to create your own dataclass?

    1. Subclass DataClass and define the fields with the `field` function.
    2. Use the `metadata` argument and a `desc` key to describe the field.
    3. Keep the order of the fields as how you want them to be serialized and described to LLMs.
    4. A field with a default value is considered optional. A field without a default value and a field with default_factory=required_field are considered required.

    How to use it?

    We defined :class:`DataClassFormatType<core.types.DataClassFormatType>` to categorize DataClass description formats
    as input or output in LLM prompt.

    (1) For describing the class (data structure):

    `Signature` is more token efficient than schema. Also, because a schema is always a json string, describing the data
    structure with it can be misleading when you want LLMs to output yaml.

    - DataClassFormatType.SCHEMA: a more standard way to describe the data structure in Json string, :meth:`to_schema_str` as string and :meth:`to_schema` as dict.
    - DataClassFormatType.SIGNATURE_JSON: imitating a json object with field name as key and description as value, :meth:`to_json_signature` as string.
    - DataClassFormatType.SIGNATURE_YAML: imitating a yaml object with field name as key and description as value, :meth:`to_yaml_signature` as string.

    (2) For describing the class instance: this is helpful to do few-shot examples in LLM prompt.

    - DataClassFormatType.EXAMPLE_JSON: the json representation of the instance, :meth:`to_json` as string.
    - DataClassFormatType.EXAMPLE_YAML: the yaml representation of the instance, :meth:`to_yaml` as string.

    Overall, we have a unified class method :meth:`format_str` to generate formatted output based on the type of operation and class/instance context.

    Note:

    1. Avoid using Optional[Type] for the type of fields, as dataclass already distinguishes between optional and required fields using default value.
    2. If you need to customize, you can subclass and overwrite any method to fit your needs.

    Loading data:

    - :meth:`from_dict` is used to create a dataclass instance from a dictionary.

    Refer :ref:`DataClass<core-base_data_class_note>` for more detailed instructions.

    Examples:

    .. code-block:: python

        # Define a dataclass
        from adalflow.core import DataClass
        from dataclasses import dataclass, field

        @dataclass
        class MyOutputs(DataClass):
            age: int = field(metadata={"desc": "The age of the person", "prefix": "Age:"})
            name: str = field(metadata={"desc": "The name of the person", "prefix": "Name:"})

        # Create json signature
        print(MyOutputs.to_json_signature())
        # Output:
        # {
        #     "age": "The age of the person",
        #     "name": "The name of the person"
        # }

        # Create yaml signature
        print(MyOutputs.to_yaml_signature())
        # Output:
        # age: The age of the person
        # name: The name of the person

        # Create a dataclass instance
        my_instance = MyOutputs(age=25, name="John Doe")

        # Create json example
        print(my_instance.to_json())
        # Output:
        # {
        #     "age": 25,
        #     "name": "John Doe"
        # }

        # Create yaml example
        print(my_instance.to_yaml())
        # Output:
        # age: 25
        # name: John Doe
    """

    # Names of the fields treated as inputs; serialized first, in this order.
    __input_fields__: List[str] = []
    # Names of the fields treated as outputs; serialized after the input fields.
    __output_fields__: List[str] = []
def __post_init__(self):
    """Report every field that carries no description in its metadata.

    Runs after the dataclass-generated ``__init__``. A field counts as
    described when its metadata contains a ``desc`` or ``description`` key.
    Missing descriptions are only logged; nothing is raised.
    """
    for f in fields(self):
        if "desc" not in f.metadata and "description" not in f.metadata:
            # The message used to be built and then discarded; emit it.
            logger.debug(
                "Class %s Field %s is missing 'desc' in metadata",
                self.__class__.__name__,
                f.name,
            )
@classmethod
def get_task_desc(cls) -> str:
    """Return the task description of the dataclass.

    The task description is stored as the class docstring.

    Returns:
        str: The current value of ``cls.__doc__``.
    """
    task_desc = cls.__doc__
    return task_desc
@classmethod
def set_task_desc(cls, task_desc: str) -> None:
    """Replace the task description of the dataclass.

    The description is stored as the class docstring, so this overwrites
    ``cls.__doc__``.

    Args:
        task_desc (str): The task description to set.
    """
    setattr(cls, "__doc__", task_desc)
@classmethod
def get_output_fields(cls):
    """Return the list of output field names stored in ``__output_fields__``."""
    output_fields = cls.__output_fields__
    return output_fields
@classmethod
def set_output_fields(cls, output_fields: List[str]):
    """Set the output fields for the dataclass.

    When creating schema or instance, it will follow the input field and
    output field order.

    Args:
        output_fields (List[str]): The output fields to set.
    """
    setattr(cls, "__output_fields__", output_fields)
def to_dict(
    self,
    exclude: ExcludeType = None,
    include: IncludeType = None,
) -> Dict[str, Any]:
    """Convert a dataclass object to a dictionary.

    Supports nested dataclasses, lists, and dictionaries.
    Allows excluding keys for each dataclass object.

    Use cases:

    - Decide what information will be included to be serialized to JSON or YAML that can be used in LLM prompt.
    - Exclude sensitive information from the serialized output.
    - Serialize the dataclass instance to a dictionary for saving states.

    Note:
        Fields whose value is a ``Parameter`` are replaced in place, on this
        instance, by the parameter's ``data`` before conversion.

    Args:
        exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object.
            A plain list is applied to this class. Defaults to None.
        include (Optional[List[str]], optional): The fields of this class to keep; only unnested fields
            are supported. Defaults to None.

    Returns:
        Dict[str, Any]: The serialized fields, ordered as input fields, then
        output fields, then any remaining fields.

    Raises:
        ValueError: If called on an object that is not a dataclass instance,
            or if both ``include`` and ``exclude`` are given.

    Example:

    .. code-block:: python

        from dataclasses import dataclass
        from typing import List

        @dataclass
        class TrecData:
            question: str
            label: int

        @dataclass
        class TrecDataList(DataClass):
            data: List[TrecData]
            name: str

        trec_data = TrecData(question="What is the capital of France?", label=0)
        trec_data_list = TrecDataList(data=[trec_data], name="trec_data_list")

        trec_data_list.to_dict(exclude={"TrecData": ["label"], "TrecDataList": ["name"]})

        # Output:
        # {'data': [{'question': 'What is the capital of France?'}]}
    """
    if not is_dataclass(self):
        raise ValueError(
            f"to_dict() is not called on a dataclass instance: {self.__class__}. You might forget to use @dataclass decorator."
        )

    # Imported inside the method — presumably to avoid a circular import; confirm.
    from adalflow.optim.parameter import Parameter

    # Unwrap Parameter-valued fields so that plain data gets serialized.
    for dc_field in self.__dataclass_fields__.values():
        current_value = getattr(self, dc_field.name)
        if isinstance(current_value, Parameter):
            setattr(self, dc_field.name, current_value.data)

    # Only one of the two selectors may be used at a time.
    if include and exclude:
        raise ValueError("Either include or exclude can be used, not both.")

    # Normalise both selectors into a single {class name: [field names]} mapping.
    own_name = self.__class__.__name__
    excluded: Optional[Dict[str, List[str]]] = None
    if include:
        # Everything on this class that was not asked for is excluded.
        excluded = {
            own_name: [
                dc_field.name
                for dc_field in self.__dataclass_fields__.values()
                if dc_field.name not in include
            ]
        }
    elif exclude and isinstance(exclude, list):
        excluded = {own_name: exclude}
    elif exclude and isinstance(exclude, dict):
        # Copy so the caller's mapping cannot be modified downstream.
        excluded = deepcopy(exclude)

    serialized = custom_asdict(self, exclude=excluded)

    # Input fields first, then output fields, then whatever is left.
    preferred_order = [*self.get_input_fields(), *self.get_output_fields()]
    result = {
        field_name: serialized[field_name]
        for field_name in preferred_order
        if field_name in serialized
    }
    for field_name, value in serialized.items():
        result.setdefault(field_name, value)
    return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "DataClass":
    """Create a dataclass instance from a dictionary.

    Supports nested dataclasses, lists, and dictionaries.

    Example from the :meth:`to_dict` method.

    .. code-block:: python

        data_dict = trec_data_list.to_dict()
        restored_data = TrecDataList.from_dict(data_dict)

        assert str(restored_data.__dict__) == str(trec_data_list.__dict__)

    .. note::

        If any required field is missing, it will raise an error.
        Do not use the dict that has excluded required fields.

    Use cases:

    - Convert the json/yaml output from LLM prediction to a dataclass instance.
    - Restore the dataclass instance from the serialized output used for states saving.

    Args:
        data (Dict[str, Any]): The field names and values to load.

    Returns:
        DataClass: The instance built by ``dataclass_obj_from_dict``.

    Raises:
        ValueError: If building the instance fails with a TypeError. The
            original error is attached as ``__cause__``.
    """
    try:
        dclass = dataclass_obj_from_dict(cls, data)
    except TypeError as e:
        raise ValueError(f"Failed to load data: {e}") from e
    # Lazy %-formatting: the instance is only rendered when debug is enabled.
    logger.debug("Dataclass instance created from dict: %s", dclass)
    return dclass
@classmethod
def from_json(cls, json_str: str) -> "DataClass":
    """Create a dataclass instance from a JSON string.

    Args:
        json_str (str): The JSON string to convert to a dataclass instance.

    Returns:
        DataClass: The result of ``cls.from_dict`` on the parsed JSON.

    Raises:
        ValueError: If ``json_str`` is not valid JSON. The underlying
            ``json.JSONDecodeError`` is attached as ``__cause__``.

    Example:

    .. code-block:: python

        json_str = '{"question": "What is the capital of France?", "label": 0}'
        trec_data = TrecData.from_json(json_str)
    """
    # Only the parsing step is guarded; errors from from_dict propagate as-is.
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to load JSON string: {e}") from e
    return cls.from_dict(data)
def to_json_obj(
    self,
    exclude: ExcludeType = None,
    include: IncludeType = None,
) -> Any:
    r"""Convert the dataclass instance to a JSON object.

    The instance is first rendered with :meth:`to_json` (which keeps the field
    order, important for LLM prompts) and the resulting string is parsed back
    with ``json.loads``.

    Args:
        exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None.
        include (Optional[List[str]], optional): The fields to keep. Defaults to None.

    Returns:
        Any: The parsed JSON value.
    """
    json_text = self.to_json(exclude=exclude, include=include)
    return json.loads(json_text)
def to_json(
    self,
    exclude: ExcludeType = None,
    include: IncludeType = None,
) -> str:
    r"""Convert the dataclass instance to a JSON string.

    Uses :meth:`to_dict` together with ``sort_keys=False`` so that the order of
    the fields is maintained. This can be important to llm prompt.

    Args:
        exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None.
        include (Optional[List[str]], optional): The fields to keep. Defaults to None.

    Returns:
        str: The JSON text, indented by 4 spaces.
    """
    as_dict = self.to_dict(exclude=exclude, include=include)
    return json.dumps(as_dict, indent=4, sort_keys=False)
@classmethod
def from_yaml(cls, yaml_str: str) -> "DataClass":
    """Create a dataclass instance from a YAML string.

    Args:
        yaml_str (str): The YAML string to convert to a dataclass instance.

    Returns:
        DataClass: The result of ``cls.from_dict`` on the parsed YAML.

    Raises:
        ValueError: If ``yaml_str`` cannot be parsed. The underlying
            ``yaml.YAMLError`` is attached as ``__cause__``.

    Example:

    .. code-block:: python

        yaml_str = 'question: What is the capital of France?\nlabel: 0'
        trec_data = TrecData.from_yaml(yaml_str)
    """
    # Only the parsing step is guarded; errors from from_dict propagate as-is.
    try:
        data = yaml.safe_load(yaml_str)
    except yaml.YAMLError as e:
        raise ValueError(f"Failed to load YAML string: {e}") from e
    return cls.from_dict(data)
def to_yaml_obj(
    self,
    exclude: ExcludeType = None,
    include: IncludeType = None,
) -> Any:
    r"""Convert the dataclass instance to a YAML object.

    The instance is first rendered with :meth:`to_yaml` and the resulting
    string is parsed back with ``yaml.safe_load``.

    Args:
        exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None.
        include (Optional[List[str]], optional): The fields to keep. Defaults to None.

    Returns:
        Any: The parsed YAML value.
    """
    yaml_text = self.to_yaml(exclude=exclude, include=include)
    return yaml.safe_load(yaml_text)
def to_yaml(
    self,
    exclude: ExcludeType = None,
    include: IncludeType = None,
) -> str:
    r"""Convert the dataclass instance to a YAML string.

    Uses :meth:`to_dict` together with ``sort_keys=False`` so that the order of
    the fields is maintained.

    Args:
        exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None.
        include (Optional[List[str]], optional): The fields to keep. Defaults to None.

    Returns:
        str: The YAML text in block style, without surrounding whitespace.
    """
    # Same dump options as dict_to_yaml, so both helpers render identically.
    return yaml.dump(
        self.to_dict(exclude=exclude, include=include),
        default_flow_style=False,
        sort_keys=False,
    ).strip()
def dict_to_yaml(self, data: Dict[str, Any]) -> str:
    """Convert a dictionary to a YAML string.

    Keys keep their insertion order (``sort_keys=False``) and block style is
    used (``default_flow_style=False``).

    Args:
        data (Dict[str, Any]): The dictionary to convert to a YAML string.

    Returns:
        str: The YAML string representation of the dictionary, stripped of
        leading and trailing whitespace.
    """
    dumped = yaml.dump(data, default_flow_style=False, sort_keys=False)
    return dumped.strip()
@classmethod
def to_schema(
    cls,
    exclude: ExcludeType = None,
    include: IncludeType = None,
) -> Dict[str, Dict[str, Any]]:
    """Generate a Json schema which is more detailed than the signature.

    Args:
        exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object.
            A plain list is applied to this class. Defaults to None.
        include (Optional[List[str]], optional): The fields of this class to keep; only unnested fields
            are supported. Defaults to None.

    Returns:
        Dict[str, Dict[str, Any]]: The schema from ``get_dataclass_schema``
        with its ``properties`` reordered as input fields, then output fields,
        then any remaining fields.

    Raises:
        ValueError: If both ``include`` and ``exclude`` are given.
    """
    if include and exclude:
        raise ValueError("Either include or exclude can be used, not both.")

    # Normalise both selectors into a single {class name: [field names]} mapping.
    excluded: Optional[Dict[str, List[str]]] = None
    if include:
        # Everything on this class that was not asked for is excluded.
        excluded = {
            cls.__name__: [
                dc_field.name
                for dc_field in cls.__dataclass_fields__.values()
                if dc_field.name not in include
            ]
        }
    elif exclude and isinstance(exclude, list):
        excluded = {cls.__name__: exclude}
    elif exclude and isinstance(exclude, dict):
        # Copy so the caller's mapping cannot be modified downstream.
        excluded = deepcopy(exclude)

    schema = get_dataclass_schema(
        cls, excluded, getattr(cls, "__type_var_map__", None)
    )

    properties = schema.get("properties", {})
    preferred_order = (*cls.get_input_fields(), *cls.get_output_fields())

    # Input fields first, then output fields, then whatever is left.
    reordered = OrderedDict()
    for field_name in preferred_order:
        if field_name in properties:
            reordered[field_name] = properties[field_name]
    for field_name, field_schema in properties.items():
        reordered.setdefault(field_name, field_schema)

    schema["properties"] = reordered
    return schema
@classmethod
def to_schema_str(
    cls,
    exclude: ExcludeType = None,
    include: IncludeType = None,
) -> str:
    """Generate a Json schema string which is more detailed than the signature.

    Args:
        exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None.
        include (Optional[List[str]], optional): The fields to keep. Defaults to None.

    Returns:
        str: The result of :meth:`to_schema` dumped as JSON with an indent of 4.
    """
    schema_text = json.dumps(
        cls.to_schema(exclude=exclude, include=include), indent=4
    )
    return schema_text.strip()
@classmethod
def to_yaml_signature(
    cls,
    exclude: ExcludeType = None,
    include: IncludeType = None,
) -> str:
    r"""Generate a YAML signature for the class from desc in metadata.

    Used mostly as LLM prompt to describe the output data format.

    Args:
        exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None.
        include (Optional[List[str]], optional): The fields to keep. Defaults to None.

    Returns:
        str: One ``key: value`` line per signature entry, joined by newlines.
    """
    # NOTE: the yaml text is formatted by hand because yaml.dump will always
    # sort the keys, which can impact the final model output.
    signature = convert_schema_to_signature(
        cls.to_schema(exclude=exclude, include=include)
    )
    return "\n".join(f"{key}: {value}" for key, value in signature.items())
@classmethod
def to_json_signature(
    cls,
    exclude: ExcludeType = None,
    include: IncludeType = None,
) -> str:
    """Generate a JSON `signature`(json string) for the class from desc in metadata.

    Used mostly as LLM prompt to describe the output data format.

    Args:
        exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None.
        include (Optional[List[str]], optional): The fields to keep. Defaults to None.

    Returns:
        str: The signature dictionary dumped as JSON with an indent of 4.

    Example:

    >>> @dataclass
    >>> class MyOutputs(DataClass):
    >>>     age: int = field(metadata={"desc": "The age of the person", "prefix": "Age:"})
    >>>     name: str = field(metadata={"desc": "The name of the person", "prefix": "Name:"})

    >>> print(MyOutputs.to_json_signature())
    >>> # Output is a JSON string:
    >>> # '{
    >>> #    "age": "The age of the person (int) (required)",
    >>> #    "name": "The name of the person (str) (required)"
    >>> #}'
    """
    signature = convert_schema_to_signature(
        cls.to_schema(exclude=exclude, include=include)
    )
    return json.dumps(signature, indent=4)
@classmethod
def to_dict_class(
    cls,
    exclude: ExcludeType = None,
    include: IncludeType = None,
) -> Dict[str, Any]:
    """More of an internal used class method for serialization.

    Converts the dataclass to a dictionary, optionally excluding specified fields.
    Use this to save states of the class in serialization, not advised to use in LLM prompt.

    Args:
        exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None.
        include (Optional[List[str]], optional): The fields to keep. Defaults to None.

    Returns:
        Dict[str, Any]: Exactly what :meth:`to_schema` returns for the same arguments.
    """
    class_schema = cls.to_schema(exclude=exclude, include=include)
    return class_schema
def check_adal_dataclass(data_class: Type) -> None:
    """Check if the provided class is a valid dataclass for the AdalFlow framework.

    A valid class is a dataclass (decorated with ``@dataclass``) that also
    subclasses ``DataClass``.

    Args:
        data_class (Type): The class to check.

    Raises:
        TypeError: If ``data_class`` is not a class, is not a dataclass, or is
            not a subclass of ``DataClass``.
    """
    # is_dataclass() also accepts dataclass *instances*; reject them here so
    # they do not reach issubclass(), which would fail with an unrelated message.
    if not (isinstance(data_class, type) and is_dataclass(data_class)):
        raise TypeError(f"Provided class is not a dataclass: {data_class}")

    if not issubclass(data_class, DataClass):
        raise TypeError(f"Provided class is not a subclass of DataClass: {data_class}")
class DynamicDataClassFactory:
    """Build dataclass types, and instances of them, from plain dictionaries."""

    @staticmethod
    def from_dict(
        data: Dict[str, Any],
        base_class: Type = DataClass,
        class_name: str = "DynamicDataClass",
    ) -> DataClass:
        """Create an instance of a dataclass from a dictionary.

        The dictionary should have the following structure:

        .. code-block:: python

            {
                "field_name": field_value,
                ...
            }

        Each key becomes a field whose type is the type of its value and whose
        default is that value.

        Args:
            data (dict): The dictionary with field names and values.
            base_class (type): The base class to inherit from (default: DataClass).
            class_name (str): The name of the generated dataclass (default: DynamicDataClass).

        Returns:
            DataClass: An instance of the generated dataclass, created through
            its ``from_dict`` class method.
        """
        # Create field specifications for the dataclass
        fields_spec = []
        for key, value in data.items():
            field_type = type(value)
            if isinstance(value, (list, dict, set)) or field_type.__hash__ is None:
                # dataclasses rejects mutable (unhashable) defaults, so go
                # through a factory. The factory hands out a fresh copy each
                # time so instances never share one mutable default.
                fields_spec.append(
                    (
                        key,
                        field_type,
                        field(default_factory=lambda v=value: deepcopy(v)),
                    )
                )
            else:
                fields_spec.append((key, field_type, field(default=value)))

        # Create the dataclass
        dynamic_class = make_dataclass(class_name, fields_spec, bases=(base_class,))

        # Create an instance of the dataclass with the provided values
        return dynamic_class.from_dict(data)