Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Check for infinite values in categorical feature encoder (#277)
Browse files Browse the repository at this point in the history
Check for infinite values in categorical feature encoder. The current version crashes if the categorical features contain any infinite values. The desired behavior is to convert those values to NaN.
  • Loading branch information
melanibe committed Oct 12, 2020
1 parent d34ed28 commit edd5ac4
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 1 deletion.
5 changes: 4 additions & 1 deletion InnerEye/ML/utils/dataset_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ def __init__(self, columns_and_possible_categories: OrderedDict[str, List[str]])
self._encoder = {}
for col, value in columns_and_possible_categories.items():
# Fit only once during initialization with all possible values.
if np.inf in value:
value.remove(np.inf)
self._encoder[col] = OneHotEncoder(handle_unknown='ignore').fit(np.array(value).reshape(-1, 1))
self._feature_length[col] = len(value)

Expand All @@ -63,7 +65,8 @@ def encode(self, x: Dict[str, List[str]]) -> torch.Tensor:
"""
encoded: np.ndarray = np.empty(0)
for col in x:
encoded_col = self._encoder[col].transform(np.reshape(x[col], (-1, 1))).toarray()
input_col = np.reshape(x[col], (-1, 1)).astype(str)
encoded_col = self._encoder[col].transform(input_col).toarray()
# By default OneHotEncoder will set all values of the encoded vector to be 0 if an illegal column
# value was provided. Replace this with NaN.
encoded_col[np.where(~encoded_col.any(axis=1))[0]] = np.NaN
Expand Down
15 changes: 15 additions & 0 deletions Tests/ML/utils/test_dataset_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import numpy as np
import pandas as pd

from InnerEye.ML.utils.dataset_util import CategoricalToOneHotEncoder


def test_one_hot_encoder_with_infinite_values() -> None:
    """
    Check that encoding an infinite value yields an all-NaN result.

    The encoder is created from a dataframe whose single categorical column
    holds the values "F", "M" and np.inf; encoding np.inf for that column is
    then expected to produce only NaN entries.
    """
    column_name = "categorical"
    # Build the frame in two steps: declare the column first, then fill it.
    frame = pd.DataFrame(columns=[column_name])
    frame[column_name] = ["F", "M", np.inf]
    one_hot_encoder = CategoricalToOneHotEncoder.create_from_dataframe(frame, [column_name])
    encoded_inf = one_hot_encoder.encode({column_name: np.inf})
    assert np.isnan(encoded_inf).all()

0 comments on commit edd5ac4

Please sign in to comment.