
Commit

Merge branch 'master' of github.com:inovex/justcause
Maximilian Franz committed Oct 1, 2019
2 parents 94e49d5 + 1c72b6d commit a239a49
Showing 4 changed files with 177 additions and 5 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -127,13 +127,17 @@ from the root directory of this repository.

# Further Work
Some steps to continue the work on this project would be:
- Implement a fully parametric DGP, following the dimensions roughly outlined in Chapter 4 of my thesis
- Rewrite the plot functions in `utils.py` to simply take `DataProvider` as inputs and handle the internals within
the functions.
- Implement within-sample and out-of-sample evaluation (switch between the two) as proposed in [this paper](https://arxiv.org/pdf/1606.03976.pdf).
- Implement a run-checker that ensures that all methods fit on the data and that no complications arise
  before expensive computation is started (e.g. the requested size is too big for the given DataProvider).
- Enable evaluation without `sacred` logging, only storing results.
- Ensure train/test split can be requested for all DataProviders
- Obviously, add more methods and reference datasets
- Implement the experiment as a module, which is given methods, data and settings of the experiment and returns the full results.
- Write tests ;)


2 changes: 1 addition & 1 deletion causaleval/data/data_provider.py
@@ -75,7 +75,7 @@ def set_train_test_split(self, train_size=0.8):
        :param train_size: fraction of the whole data to be used as training
        """
        length = self.x.shape[0]
-       self.train_selection = np.random.choice(length, size=int(train_size*length))
+       self.train_selection = np.random.choice(length, size=int(train_size*length), replace=False)
        full = np.arange(length)
        self.test_selection = full[~np.isin(full, self.train_selection)]

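The `replace=False` change matters because `np.random.choice` samples with replacement by default, so the old split could pick the same row index twice and the train and test selections would no longer partition the data. A minimal standalone sketch of the same logic (the function name `split_indices` is only for illustration and is not part of the package):

```python
import numpy as np

def split_indices(n, train_size=0.8, seed=0):
    """Sample train indices without replacement; everything else becomes the test set."""
    rng = np.random.default_rng(seed)
    train = rng.choice(n, size=int(train_size * n), replace=False)
    full = np.arange(n)
    test = full[~np.isin(full, train)]
    return train, test

train_idx, test_idx = split_indices(10)
# Without replacement every train index is unique, so train and test
# together cover the ten rows exactly once.
assert len(set(train_idx)) == len(train_idx)
assert len(train_idx) + len(test_idx) == 10
```

Sampling without replacement is what makes the later train/test size assertions in the new tests hold.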
85 changes: 83 additions & 2 deletions causaleval/methods/basics/outcome_regression.py
@@ -4,7 +4,7 @@
from causaleval.methods.causal_method import CausalMethod
from causaleval.utils import get_regressor_name

-class SingleOutcomeRegression(CausalMethod):
+class SLearner(CausalMethod):
"""
Implements a generic S-Learner
@@ -39,9 +39,90 @@ def fit(self, x, t, y, refit=False) -> None:
        self.regressor.fit(train, y)
        self.is_trained = True

class WeightedSLearner(CausalMethod):
    """
    Implements a propensity-weighted S-Learner
    :ref:
    [S-Learner](https://arxiv.org/pdf/1706.03461.pdf)
    """

    def __init__(self, propensity_regressor, regressor, dgp, seed=0):
        """
        :param propensity_regressor: a sklearn classifier with methods `fit` and `predict_proba`
        :param regressor: a sklearn regressor with methods `fit` and `predict`
        :param dgp: data provider exposing the true training propensities via `get_train_propensity()`
        """
        super().__init__(seed)
        self.propensity_regressor = propensity_regressor
        self.regressor = regressor
        self.dgp = dgp
        self.is_trained = False

    def __str__(self):
        return "Weighted S-Learner - " + get_regressor_name(self.regressor)

    @staticmethod
    def union(x, t):
        return np.c_[x, t]

    def predict_ite(self, x, t=None, y=None):
        return self.regressor.predict(self.union(x, np.ones(x.shape[0]))) - self.regressor.predict(self.union(x, np.zeros(x.shape[0])))

    def predict_ate(self, x, t=None, y=None):
        return np.mean(self.predict_ite(x))

    def fit(self, x, t, y, refit=False) -> None:
        self.propensity_regressor.fit(x, t)
        prob = self.propensity_regressor.predict_proba(x)
        # the estimated propensities are discarded in favour of the true ones from the DGP
        prob = self.dgp.get_train_propensity()

        weights = 1 / prob
        train = self.union(x, t)
        self.regressor.fit(train, y, sample_weight=weights)
        self.is_trained = True

class WeightedTLearner(CausalMethod):
    """
    Implements a propensity-weighted T-Learner
    :ref:
    [T-Learner](https://arxiv.org/pdf/1706.03461.pdf)
    """

    def __init__(self, propensity_regressor, regressor, dgp, seed=0):
        """
        :param propensity_regressor: a sklearn classifier with methods `fit` and `predict_proba`
        :param regressor: a sklearn regressor with methods `fit` and `predict`; it is deep-copied so that
            treated and control outcomes get separate models
        :param dgp: data provider exposing the true training propensities via `get_train_propensity()`
        """
        super().__init__(seed)
        self.propensity_regressor = propensity_regressor
        self.regressor_one = regressor
        self.regressor_two = copy.deepcopy(regressor)
        self.dgp = dgp
        self.is_trained = False

    def __str__(self):
        return "Weighted T-Learner - " + get_regressor_name(self.regressor_one)

    @staticmethod
    def union(x, t):
        return np.c_[x, t]

    def predict_ite(self, x, t=None, y=None):
        return self.regressor_one.predict(x) - self.regressor_two.predict(x)

    def fit(self, x, t, y, refit=False) -> None:
        self.propensity_regressor.fit(x, t)
        prob = self.propensity_regressor.predict_proba(x)
        # the estimated propensities are discarded in favour of the true ones from the DGP
        prob = self.dgp.get_train_propensity()
        weights = 1 / prob
        x_treatment = x[t == 1]
        x_control = x[t == 0]
        self.regressor_one.fit(x_treatment, y[t == 1], sample_weight=weights[t == 1])
        self.regressor_two.fit(x_control, y[t == 0], sample_weight=weights[t == 0])
        self.is_trained = True

-class DoubleOutcomeRegression(CausalMethod):
+class TLearner(CausalMethod):
"""
Implements a generic T-learner :py:meth:`.fit()`
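A rough usage sketch for the new weighted learners, assuming the package is importable as `causaleval`; the synthetic data and the `ConstantPropensity` stand-in for the `dgp` argument are made up here for illustration and are not part of the repository:

```python
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression

from causaleval.methods.basics.outcome_regression import WeightedSLearner

# Synthetic data: 500 units, 3 covariates, constant treatment effect of 2.
rng = np.random.default_rng(0)
x = rng.normal(size=(500, 3))
t = rng.binomial(1, 0.5, size=500)
y = x[:, 0] + 2 * t + rng.normal(scale=0.1, size=500)

class ConstantPropensity:
    """Hypothetical stand-in for the `dgp` argument: exposes the known propensity of 0.5."""
    def get_train_propensity(self):
        return np.full(500, 0.5)

learner = WeightedSLearner(LogisticRegression(), LinearRegression(),
                           dgp=ConstantPropensity())
learner.fit(x, t, y)
print(learner.predict_ate(x))  # should be close to the true effect of 2
```

`WeightedTLearner` takes the same constructor arguments but fits one outcome model per treatment arm instead of a single model on the joined covariates and treatment.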
91 changes: 89 additions & 2 deletions tests/tests.py
@@ -1,6 +1,93 @@
from causaleval.data.generators.toy import SWagerDataProvider
from causaleval.methods.basics.outcome_regression import SLearner
from causaleval.metrics import StandardEvaluation

from sklearn.linear_model import LinearRegression

import sacred
from sacred.observers import FileStorageObserver

from unittest import TestCase

class TestCausalForests(TestCase):

    def test_whole_experiment(self):
        """
        Currently, `sacred` is very interwoven with the whole experiment process and thus unit-testing is hard.
        """
        ex = sacred.Experiment('normal')
        ex.observers.append(FileStorageObserver.create('results'))

        @ex.main
        def run(_run):
            method = SLearner(LinearRegression())
            data = SWagerDataProvider()
            metric = StandardEvaluation(ex, sizes=None, num_runs=1)
            metric.evaluate(data, method)
            assert len(metric.output.index) == 8  # 4 scores on train/test each
            assert metric.output['score'][0] != 0

            # test multirun
            metric = StandardEvaluation(ex, sizes=None, num_runs=5)
            metric.evaluate(data, method)

            # test varying sizes
            metric = StandardEvaluation(ex, sizes=[100, 200], num_runs=5)
            metric.evaluate(data, method)
            assert len(metric.output.index) == 16

        ex.run()

class IntegrationTests(TestCase):

    def setUp(self):
        self.ex = sacred.Experiment('normal')
        self.ex.observers.append(FileStorageObserver.create('results'))

    def tearDown(self):
        pass

    def test_experiment(self):
        """
        Tests the integration of method, data and metric into an experiment, logged via sacred.
        Requires a directory `results` for the FileStorageObserver.
        :return:
        """
        TestCausalForests('test_whole_experiment').test_whole_experiment()


    def test_dataprovider(self):
        data = SWagerDataProvider()
        x, t, y = data.get_training_data(size=500)
        self.assertEqual(len(t), len(y))
        self.assertEqual(len(y), len(x))
        self.assertEqual(len(t), 500)

        with self.assertRaises(AssertionError):
            data.get_training_data(size=2001)

        data.set_train_test_split(train_size=0.5)
        x, t, y = data.get_training_data(size=1000)
        self.assertEqual(len(t), 0.5 * len(data.t))
        x_test, t_test, y_test = data.get_test_data()
        self.assertEqual(len(t_test), 1000)


    def test_rpy2(self):
        """
        Tests whether rpy2 is able to load the R environment and
        execute a causal forest
        """
        data = SWagerDataProvider()
        from causaleval.methods.causal_forest import CausalForest
        cf = CausalForest()
        cf.fit(*data.get_training_data())
        self.assertIsNotNone(cf.predict_ite(*data.get_test_data()))





