Skip to content

Commit

Permalink
[pre-commit.ci] auto fixes from pre-commit.com hooks
Browse files (browse the repository at this point in the history)
for more information, see https://pre-commit.ci
  • Loading branch information
pre-commit-ci[bot] committed Feb 12, 2023
1 parent 940c2ac commit 7902f9b
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions elk/extraction/dataset_preprocessing.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,19 +8,21 @@ def undersample(dataset: DatasetDict, label_column: str = "label"):
Args:
dataset (DatasetDict): The dataset to balance.
label_column (str, optional):
label_column (str, optional):
The column containing the labels.
Defaults to "label".
Returns:
DatasetDict: The balanced dataset.
"""
labels, counts = np.unique(dataset[label_column], return_counts=True)

subsets = []
for label in labels:
subsets.append(dataset.filter(
lambda x: x[label_column] == label).select(range(min(counts)))
subsets.append(
dataset.filter(lambda x: x[label_column] == label).select(
range(min(counts))
)
)

return concatenate_datasets(subsets).shuffle()

0 comments on commit 7902f9b

Please sign in to comment.