Skip to content

Commit

Permalink
[pre-commit.ci] auto fixes from pre-commit.com hooks
Browse files Browse the repository at this point in the history
for more information, see https://pre-commit.ci
  • Loading branch information
pre-commit-ci[bot] committed Feb 12, 2023
1 parent 2c8d322 commit 614d403
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions elk/extraction/dataset_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,22 @@
def undersample(dataset: DatasetDict, label_column: str = "label"):
"""
Balance a dataset by undersampling the majority class.

Each distinct value found in `label_column` is reduced to as many rows as
the least frequent value has; the per-label subsets are then concatenated
and shuffled.

Args:
dataset (DatasetDict): The dataset to balance.
NOTE(review): the body indexes `dataset` by column name and chains
`.filter(...).select(...)`, which looks like single-split `Dataset`
usage -- confirm that the `DatasetDict` annotation is intended.
label_column (str, optional): The column containing the labels.
Defaults to "label".

Returns:
DatasetDict: The balanced dataset, in shuffled order (no seed is passed
to `shuffle()`).
NOTE(review): the value is whatever `concatenate_datasets(...).shuffle()`
returns -- confirm the documented type against that.
"""
# One count per distinct label value; min(counts) is the size every class is cut to.
labels, counts = np.unique(dataset[label_column], return_counts=True)

subsets = []
for label in labels:
# NOTE(review): this is a diff view whose +/- markers were lost. The next line
# appears to be the pre-formatting version and the wrapped call after it the
# reformatted replacement -- presumably only one of the two is in the file.
subsets.append(dataset.filter(lambda x: x[label_column] == label).select(range(min(counts))))
# Keep the rows matching this label, then take the first min(counts) of them.
subsets.append(
dataset.filter(lambda x: x[label_column] == label).select(
range(min(counts))
)
)

# NOTE(review): the return is likewise shown twice (presumably the old and new
# side of the diff); the two lines are textually identical.
return concatenate_datasets(subsets).shuffle()
return concatenate_datasets(subsets).shuffle()

0 comments on commit 614d403

Please sign in to comment.