
Commit

return dict instead of array for easier maintenance
MaximilianFranz committed Jan 7, 2020
1 parent 3a793c3 commit 14353af
Showing 3 changed files with 30 additions and 47 deletions.
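
The change in a nutshell, as a minimal runnable sketch: calc_scores now returns a dict keyed by metric name instead of a positional np.array, so score rows can be appended to the results DataFrame by column name. The pehe_score body below is a simplified stand-in (root mean squared ITE error) and the DataFrame handling is illustrative, not the library's own evaluation loop; DataFrame.append as used in this commit requires pandas < 2.0, where it was later removed in favour of pd.concat.

import numpy as np
import pandas as pd


def pehe_score(true, pred):
    # simplified stand-in: root mean squared error of the ITE estimates
    return np.sqrt(np.mean((true - pred) ** 2))


def calc_scores(true, pred, metrics):
    # mirrors the new calc_scores in this commit: named scores instead of an array
    if not isinstance(metrics, list):
        metrics = [metrics]
    return {metric.__name__: metric(true, pred) for metric in metrics}


row = calc_scores(np.full(100, 1.0), np.full(100, 0.0), pehe_score)

# before this commit: positional assignment of an unlabeled array
#   test_scores.loc[len(test_scores)] = np.array(list(row.values()))
# after this commit: append the dict, so values land in their columns by name
test_scores = pd.DataFrame(columns=["pehe_score"])
test_scores = test_scores.append(row, ignore_index=True)
print(test_scores)  # one row, column "pehe_score" == 1.0
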
26 changes: 13 additions & 13 deletions notebooks/example_evaluation.ipynb
@@ -130,13 +130,13 @@
" train_ite, test_ite = weighted_slearner(train, test)\n",
"\n",
" # Calculate the scores and append them to a dataframe\n",
" test_scores.loc[len(test_scores)] = calc_scores(test[Col.ite],\n",
" test_ite,\n",
" metrics)\n",
" test_scores = test_scores.append(calc_scores(\n",
" test[Col.ite], test_ite, metrics\n",
" ), ignore_index=True)\n",
"\n",
" train_scores.loc[len(train_scores)] = calc_scores(train[Col.ite],\n",
" train_ite,\n",
" metrics)\n",
" train_scores = train_scores.append(calc_scores(\n",
" train[Col.ite], train_ite, metrics\n",
" ), ignore_index=True)\n",
"\n",
"# Summarize the scores and save them in a dataframe\n",
"train_result, test_result = summarize_scores(train_scores), summarize_scores(test_scores)\n",
@@ -273,13 +273,13 @@
" train_ite, test_ite = method(train, test)\n",
"\n",
" # Calculate the scores and append them to a dataframe\n",
" test_scores.loc[len(test_scores)] = calc_scores(test[Col.ite],\n",
" test_ite,\n",
" metrics)\n",
" test_scores = test_scores.append(calc_scores(\n",
" test[Col.ite], test_ite, metrics\n",
" ), ignore_index=True)\n",
"\n",
" train_scores.loc[len(train_scores)] = calc_scores(train[Col.ite],\n",
" train_ite,\n",
" metrics)\n",
" train_scores = train_scores.append(calc_scores(\n",
" train[Col.ite], train_ite, metrics\n",
" ), ignore_index=True)\n",
"\n",
" # Summarize the scores and save them in a dataframe\n",
" train_result, test_result = summarize_scores(train_scores), summarize_scores(test_scores)\n",
@@ -784,7 +784,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
"version": "3.7.6"
}
},
"nbformat": 4,
47 changes: 14 additions & 33 deletions src/justcause/evaluation.py
@@ -123,7 +123,7 @@ def _evaluate_single_method(
train_size=0.8,
random_state=None,
):
"""Helper to evaluate method with multiple metrics on the given replications
"""Helper to evaluate method with multiple metrics on the given replications.
This is the standard variant of an evaluation loop, which the user can implement
manually to modify parts of it. Here, only ITE prediction and evaluation is
@@ -149,12 +149,12 @@
else:
train_ite, test_ite = default_predictions(method, train, test)

-test_scores.loc[len(test_scores)] = calc_scores(
-test[Col.ite], test_ite, metrics
+test_scores = test_scores.append(
+calc_scores(test[Col.ite], test_ite, metrics), ignore_index=True
)

-train_scores.loc[len(train_scores)] = calc_scores(
-train[Col.ite], train_ite, metrics
+train_scores = train_scores.append(
+calc_scores(train[Col.ite], train_ite, metrics), ignore_index=True
)

train_results = summarize_scores(train_scores, formats)
@@ -163,25 +163,29 @@
return train_results, test_results


-def calc_scores(true: np.array, pred: np.array, metrics):
+def calc_scores(
+true: np.array, pred: np.array, metrics: Union[List[Metric], Metric]
+) -> dict:
"""Compare ground-truth to predictions with given metrics for one replication
Call for train and test separately
-TODO: Also replace np.array with dict
Args:
-true: True ITE
+true: true ITE
pred: predicted ITE
metrics: metrics to evaluate on the ITEs
-Returns: a list of scores with length == len(metrics), i.e. the row to be added to
-the scores dataframe
+Returns:
+dict: a dict of (score_name, scores) pairs with len(metrics) entries
"""
# ensure metrics and replications are lists, even if with just one element
if not isinstance(metrics, list):
metrics = [metrics]

-return np.array([metric(true, pred) for metric in metrics])
+return {metric.__name__: metric(true, pred) for metric in metrics}


def default_predictions(
@@ -211,29 +215,6 @@ def default_predictions(
return train_ite, test_ite


-def get_default_callable(method):
-"""Helper to get an evaluation callable for standard methods
-Args:
-method: Method to use for the standard callable
-Returns: Callable for evaluation in custom loop
-"""
-
-def default_callable(train, test):
-train_X, train_t, train_y = train.np.X, train.np.t, train.np.y
-test_X, test_t, test_y = test.np.X, test.np.t, test.np.y
-
-method.fit(train_X, train_t, train_y)
-
-train_ite = method.predict_ite(train_X, train_t, train_y)
-test_ite = method.predict_ite(test_X, test_t, test_y)
-
-return train_ite, test_ite
-
-return default_callable


def summarize_scores(
scores_df: pd.DataFrame,
formats: Union[List[Format], Format] = (np.mean, np.median, np.std),
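
One upside of named scores, sketched below: downstream summarizing can aggregate per metric by column name instead of by array position. summarize_scores_sketch is a hypothetical, simplified stand-in; the real summarize_scores in justcause.evaluation takes a formats argument (see the signature in the context lines above), but its output format is not shown in this diff.

import numpy as np
import pandas as pd


def summarize_scores_sketch(scores_df, formats=(np.mean, np.median, np.std)):
    # one summary entry per (metric column, format) pair, keyed by readable names
    return {
        f"{column}-{fmt.__name__}": fmt(scores_df[column])
        for column in scores_df.columns
        for fmt in formats
    }


scores = pd.DataFrame(
    [{"pehe_score": 1.2, "mean_absolute": 0.4},
     {"pehe_score": 0.9, "mean_absolute": 0.5}]
)
print(summarize_scores_sketch(scores))
# e.g. {'pehe_score-mean': 1.05, 'pehe_score-median': 1.05, 'pehe_score-std': 0.15, ...}
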
4 changes: 3 additions & 1 deletion tests/test_evaluation.py
@@ -41,7 +41,9 @@ def test_summary():
def test_calc_scores():
true = np.full(100, 1)
pred = np.full(100, 0)
-assert calc_scores(true, pred, pehe_score)[0] == 1
+score_dict = calc_scores(true, pred, pehe_score)
+assert list(score_dict.values())[0] == 1
+assert "pehe_score" in score_dict.keys()


def test_setup_df():
