Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Feat(dspy): from_pandas support #1176

Merged
merged 7 commits into from
Jun 27, 2024
63 changes: 45 additions & 18 deletions dspy/datasets/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@
from collections.abc import Mapping
from typing import List, Tuple, Union

import pandas as pd
from datasets import load_dataset

import dspy
from dspy.datasets.dataset import Dataset


class DataLoader(Dataset):
def __init__(self,):
def __init__(
self,
):
pass

def from_huggingface(
Expand All @@ -27,40 +30,64 @@ def from_huggingface(
raise ValueError("Invalid input keys provided. Please provide a tuple of input keys.")

dataset = load_dataset(dataset_name, *args, **kwargs)

if isinstance(dataset, list) and isinstance(kwargs["split"], list):
dataset = {split_name:dataset[idx] for idx, split_name in enumerate(kwargs["split"])}
dataset = {split_name: dataset[idx] for idx, split_name in enumerate(kwargs["split"])}

try:
returned_split = {}
for split_name in dataset.keys():
if fields:
returned_split[split_name] = [dspy.Example({field:row[field] for field in fields}).with_inputs(*input_keys) for row in dataset[split_name]]
returned_split[split_name] = [
dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys)
for row in dataset[split_name]
]
else:
returned_split[split_name] = [dspy.Example({field:row[field] for field in row.keys()}).with_inputs(*input_keys) for row in dataset[split_name]]
returned_split[split_name] = [
dspy.Example({field: row[field] for field in row.keys()}).with_inputs(*input_keys)
for row in dataset[split_name]
]

return returned_split
except AttributeError:
if fields:
return [dspy.Example({field:row[field] for field in fields}).with_inputs(*input_keys) for row in dataset]
return [
dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset
]
else:
return [dspy.Example({field:row[field] for field in row.keys()}).with_inputs(*input_keys) for row in dataset]
return [
dspy.Example({field: row[field] for field in row.keys()}).with_inputs(*input_keys)
for row in dataset
]

def from_csv(self, file_path:str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
def from_csv(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
dataset = load_dataset("csv", data_files=file_path)["train"]

if not fields:
fields = list(dataset.features)

return [dspy.Example({field:row[field] for field in fields}).with_inputs(*input_keys) for row in dataset]

def from_json(self, file_path:str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
return [dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset]

def from_pandas(
    self,
    df: pd.DataFrame,
    fields: list[str] = None,
    input_keys: tuple[str] = (),
) -> list[dspy.Example]:
    """Convert each row of a pandas DataFrame into a ``dspy.Example``.

    Args:
        df: Frame whose rows become examples.
        fields: Column names to copy into each example. When ``None``,
            every column of ``df`` is used.
        input_keys: Names forwarded to ``with_inputs`` on every example.

    Returns:
        One example per row, in the order ``df.iterrows()`` yields them.
    """
    selected = list(df.columns) if fields is None else fields

    examples = []
    for _, record in df.iterrows():
        # Copy only the requested columns out of the row.
        payload = {name: record[name] for name in selected}
        examples.append(dspy.Example(payload).with_inputs(*input_keys))
    return examples

def from_json(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
dataset = load_dataset("json", data_files=file_path)["train"]

if not fields:
fields = list(dataset.features)
return [dspy.Example({field:row[field] for field in fields}).with_inputs(*input_keys) for row in dataset]

return [dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset]

def from_parquet(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
dataset = load_dataset("parquet", data_files=file_path)["train"]
Expand All @@ -79,7 +106,7 @@ def sample(
) -> List[dspy.Example]:
if not isinstance(dataset, list):
raise ValueError(f"Invalid dataset provided of type {type(dataset)}. Please provide a list of examples.")

return random.sample(dataset, n, *args, **kwargs)

def train_test_split(
Expand Down Expand Up @@ -115,6 +142,6 @@ def train_test_split(
test_end = len(dataset_shuffled) - train_end

train_dataset = dataset_shuffled[:train_end]
test_dataset = dataset_shuffled[train_end:train_end + test_end]
test_dataset = dataset_shuffled[train_end : train_end + test_end]

return {'train': train_dataset, 'test': test_dataset}
return {"train": train_dataset, "test": test_dataset}
Loading