Skip to content

Commit

Permalink
Merge pull request #1176 from Anindyadeep/anindya/from_df
Browse files Browse the repository at this point in the history
Feat(dspy): from_pandas support
  • Loading branch information
krypticmouse committed Jun 27, 2024
2 parents 5ad63db + 4ae8049 commit deff8ec
Showing 1 changed file with 45 additions and 18 deletions.
63 changes: 45 additions & 18 deletions dspy/datasets/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@
from collections.abc import Mapping
from typing import List, Tuple, Union

import pandas as pd
from datasets import load_dataset

import dspy
from dspy.datasets.dataset import Dataset


class DataLoader(Dataset):
def __init__(self,):
def __init__(
self,
):
pass

def from_huggingface(
Expand All @@ -27,40 +30,64 @@ def from_huggingface(
raise ValueError("Invalid input keys provided. Please provide a tuple of input keys.")

dataset = load_dataset(dataset_name, *args, **kwargs)

if isinstance(dataset, list) and isinstance(kwargs["split"], list):
dataset = {split_name:dataset[idx] for idx, split_name in enumerate(kwargs["split"])}
dataset = {split_name: dataset[idx] for idx, split_name in enumerate(kwargs["split"])}

try:
returned_split = {}
for split_name in dataset.keys():
if fields:
returned_split[split_name] = [dspy.Example({field:row[field] for field in fields}).with_inputs(*input_keys) for row in dataset[split_name]]
returned_split[split_name] = [
dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys)
for row in dataset[split_name]
]
else:
returned_split[split_name] = [dspy.Example({field:row[field] for field in row.keys()}).with_inputs(*input_keys) for row in dataset[split_name]]
returned_split[split_name] = [
dspy.Example({field: row[field] for field in row.keys()}).with_inputs(*input_keys)
for row in dataset[split_name]
]

return returned_split
except AttributeError:
if fields:
return [dspy.Example({field:row[field] for field in fields}).with_inputs(*input_keys) for row in dataset]
return [
dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset
]
else:
return [dspy.Example({field:row[field] for field in row.keys()}).with_inputs(*input_keys) for row in dataset]
return [
dspy.Example({field: row[field] for field in row.keys()}).with_inputs(*input_keys)
for row in dataset
]

def from_csv(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
    """Load a CSV file and wrap each row as a ``dspy.Example``.

    The file is parsed with the Hugging Face ``datasets`` CSV loader
    (``train`` split). When ``fields`` is omitted, every column found in
    the file is kept. ``input_keys`` names the fields treated as inputs
    on each returned example.
    """
    rows = load_dataset("csv", data_files=file_path)["train"]

    # Default to all columns discovered by the datasets library.
    selected = fields if fields else list(rows.features)

    examples = []
    for row in rows:
        record = {name: row[name] for name in selected}
        examples.append(dspy.Example(record).with_inputs(*input_keys))
    return examples

def from_pandas(
    self,
    df: pd.DataFrame,
    fields: List[str] = None,
    input_keys: Tuple[str, ...] = (),
) -> List[dspy.Example]:
    """Convert a pandas DataFrame into a list of ``dspy.Example`` objects.

    Args:
        df: Source DataFrame; each row becomes one example.
        fields: Column names to copy into each example. Defaults to all
            columns of ``df``.
        input_keys: Field names marked as inputs on each example.

    Returns:
        One ``dspy.Example`` per DataFrame row, in row order.
    """
    # NOTE: annotations use typing.List/Tuple for consistency with the
    # rest of this module; tuple[str] in the original typed a 1-tuple.
    if fields is None:
        fields = list(df.columns)

    return [
        dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys)
        for _, row in df.iterrows()
    ]

def from_json(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
    """Load a JSON/JSONL file and wrap each record as a ``dspy.Example``.

    The file is parsed with the Hugging Face ``datasets`` JSON loader
    (``train`` split). When ``fields`` is omitted, every feature found in
    the file is kept. ``input_keys`` names the fields treated as inputs
    on each returned example.
    """
    rows = load_dataset("json", data_files=file_path)["train"]

    # Fall back to every feature the datasets library detected.
    selected = fields if fields else list(rows.features)

    examples = []
    for row in rows:
        record = {name: row[name] for name in selected}
        examples.append(dspy.Example(record).with_inputs(*input_keys))
    return examples

def from_parquet(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
dataset = load_dataset("parquet", data_files=file_path)["train"]
Expand All @@ -79,7 +106,7 @@ def sample(
) -> List[dspy.Example]:
if not isinstance(dataset, list):
raise ValueError(f"Invalid dataset provided of type {type(dataset)}. Please provide a list of examples.")

return random.sample(dataset, n, *args, **kwargs)

def train_test_split(
Expand Down Expand Up @@ -115,6 +142,6 @@ def train_test_split(
test_end = len(dataset_shuffled) - train_end

train_dataset = dataset_shuffled[:train_end]
test_dataset = dataset_shuffled[train_end:train_end + test_end]
test_dataset = dataset_shuffled[train_end : train_end + test_end]

return {'train': train_dataset, 'test': test_dataset}
return {"train": train_dataset, "test": test_dataset}

0 comments on commit deff8ec

Please sign in to comment.