
Commit

Merge branch 'master' of github.com:inovex/justcause
Maximilian Franz committed Oct 1, 2019
2 parents 94e49d5 + 1c72b6d commit a239a49
Showing 4 changed files with 177 additions and 5 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -127,13 +127,17 @@ from the root directory of this repository.

# Further Work
Some steps to continue the work on this project would be:
- Implement a fully parametric DGP, following the dimensions roughly outlined in Chapter 4 of my thesis
- Rewrite the plot functions in `utils.py` to simply take `DataProvider` as inputs and handle the internals within
the functions.
- Implement within-sample and out-of-sample evaluation (switch between the two) as proposed in [this paper](https://arxiv.org/pdf/1606.03976.pdf).
- Implement a run-checker that ensures that all methods fit on the data and that no complications arise
  before expensive computation is started (e.g. the requested size is too big for the given DataProvider).
- Enable evaluation without `sacred` logging, only storing results.
- Ensure train/test split can be requested for all DataProviders
- Obviously, add more methods and reference datasets
- Implement the experiment as a module, which is given methods, data and settings of the experiment and returns the full results.
- Write tests ;)


2 changes: 1 addition & 1 deletion causaleval/data/data_provider.py
@@ -75,7 +75,7 @@ def set_train_test_split(self, train_size=0.8):
        :param train_size: fraction of the whole data to be used as training
        """
        length = self.x.shape[0]
-       self.train_selection = np.random.choice(length, size=int(train_size*length))
+       self.train_selection = np.random.choice(length, size=int(train_size*length), replace=False)
        full = np.arange(length)
        self.test_selection = full[~np.isin(full, self.train_selection)]

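The `replace=False` change matters because `np.random.choice` samples with replacement by default, so the old split could pick the same row index twice and the train and test selections would no longer partition the data. A minimal standalone sketch of the same logic (the function name `split_indices` is only for illustration and is not part of the package):

```python
import numpy as np

def split_indices(n, train_size=0.8, seed=0):
    """Sample train indices without replacement; everything else becomes the test set."""
    rng = np.random.default_rng(seed)
    train = rng.choice(n, size=int(train_size * n), replace=False)
    full = np.arange(n)
    test = full[~np.isin(full, train)]
    return train, test

train_idx, test_idx = split_indices(10)
# Without replacement every train index is unique, so train and test
# together cover the ten rows exactly once.
assert len(set(train_idx)) == len(train_idx)
assert len(train_idx) + len(test_idx) == 10
```

Sampling without replacement is what makes the later train/test size assertions in the new tests hold.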
85 changes: 83 additions & 2 deletions causaleval/methods/basics/outcome_regression.py
@@ -4,7 +4,7 @@
from causaleval.methods.causal_method import CausalMethod
from causaleval.utils import get_regressor_name

-class SingleOutcomeRegression(CausalMethod):
+class SLearner(CausalMethod):
"""
Implements a generic S-Learner
@@ -39,9 +39,90 @@ def fit(self, x, t, y, refit=False) -> None:
        self.regressor.fit(train, y)
        self.is_trained = True

class WeightedSLearner(CausalMethod):
    """
    Implements a propensity-weighted S-Learner
    :ref:
    [S-Learner](https://arxiv.org/pdf/1706.03461.pdf)
    """

    def __init__(self, propensity_regressor, regressor, dgp, seed=0):
        """
        :param propensity_regressor: a sklearn classifier with methods `fit` and `predict_proba`
        :param regressor: a sklearn regressor with methods `fit` and `predict`
        :param dgp: data provider exposing the true training propensities via `get_train_propensity()`
        """
        super().__init__(seed)
        self.propensity_regressor = propensity_regressor
        self.regressor = regressor
        self.dgp = dgp
        self.is_trained = False

    def __str__(self):
        return "Weighted S-Learner - " + get_regressor_name(self.regressor)

    @staticmethod
    def union(x, t):
        return np.c_[x, t]

    def predict_ite(self, x, t=None, y=None):
        return self.regressor.predict(self.union(x, np.ones(x.shape[0]))) - self.regressor.predict(self.union(x, np.zeros(x.shape[0])))

    def predict_ate(self, x, t=None, y=None):
        return np.mean(self.predict_ite(x))

    def fit(self, x, t, y, refit=False) -> None:
        self.propensity_regressor.fit(x, t)
        prob = self.propensity_regressor.predict_proba(x)
        # the estimated propensities are discarded in favour of the true ones from the DGP
        prob = self.dgp.get_train_propensity()

        weights = 1 / prob
        train = self.union(x, t)
        self.regressor.fit(train, y, sample_weight=weights)
        self.is_trained = True

class WeightedTLearner(CausalMethod):
    """
    Implements a propensity-weighted T-Learner
    :ref:
    [T-Learner](https://arxiv.org/pdf/1706.03461.pdf)
    """

    def __init__(self, propensity_regressor, regressor, dgp, seed=0):
        """
        :param propensity_regressor: a sklearn classifier with methods `fit` and `predict_proba`
        :param regressor: a sklearn regressor with methods `fit` and `predict`; it is deep-copied so that
            treated and control outcomes get separate models
        :param dgp: data provider exposing the true training propensities via `get_train_propensity()`
        """
        super().__init__(seed)
        self.propensity_regressor = propensity_regressor
        self.regressor_one = regressor
        self.regressor_two = copy.deepcopy(regressor)
        self.dgp = dgp
        self.is_trained = False

    def __str__(self):
        return "Weighted T-Learner - " + get_regressor_name(self.regressor_one)

    @staticmethod
    def union(x, t):
        return np.c_[x, t]

    def predict_ite(self, x, t=None, y=None):
        return self.regressor_one.predict(x) - self.regressor_two.predict(x)

    def fit(self, x, t, y, refit=False) -> None:
        self.propensity_regressor.fit(x, t)
        prob = self.propensity_regressor.predict_proba(x)
        # the estimated propensities are discarded in favour of the true ones from the DGP
        prob = self.dgp.get_train_propensity()
        weights = 1 / prob
        x_treatment = x[t == 1]
        x_control = x[t == 0]
        self.regressor_one.fit(x_treatment, y[t == 1], sample_weight=weights[t == 1])
        self.regressor_two.fit(x_control, y[t == 0], sample_weight=weights[t == 0])
        self.is_trained = True

-class DoubleOutcomeRegression(CausalMethod):
+class TLearner(CausalMethod):
"""
Implements a generic T-learner :py:meth:`.fit()`
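A rough usage sketch for the new weighted learners, assuming the package is importable as `causaleval`; the synthetic data and the `ConstantPropensity` stand-in for the `dgp` argument are made up here for illustration and are not part of the repository:

```python
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression

from causaleval.methods.basics.outcome_regression import WeightedSLearner

# Synthetic data: 500 units, 3 covariates, constant treatment effect of 2.
rng = np.random.default_rng(0)
x = rng.normal(size=(500, 3))
t = rng.binomial(1, 0.5, size=500)
y = x[:, 0] + 2 * t + rng.normal(scale=0.1, size=500)

class ConstantPropensity:
    """Hypothetical stand-in for the `dgp` argument: exposes the known propensity of 0.5."""
    def get_train_propensity(self):
        return np.full(500, 0.5)

learner = WeightedSLearner(LogisticRegression(), LinearRegression(),
                           dgp=ConstantPropensity())
learner.fit(x, t, y)
print(learner.predict_ate(x))  # should be close to the true effect of 2
```

`WeightedTLearner` takes the same constructor arguments but fits one outcome model per treatment arm instead of a single model on the joined covariates and treatment.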
91 changes: 89 additions & 2 deletions tests/tests.py
@@ -1,6 +1,93 @@
from causaleval.data.generators.toy import SWagerDataProvider
from causaleval.methods.basics.outcome_regression import SLearner
from causaleval.metrics import StandardEvaluation

from sklearn.linear_model import LinearRegression

import sacred
from sacred.observers import FileStorageObserver

from unittest import TestCase

class TestCausalForests(TestCase):

    def test_whole_experiment(self):
        """
        Currently, `sacred` is very interwoven with the whole experiment process and thus unit-testing is hard.
        """
        ex = sacred.Experiment('normal')
        ex.observers.append(FileStorageObserver.create('results'))

        @ex.main
        def run(_run):
            method = SLearner(LinearRegression())
            data = SWagerDataProvider()
            metric = StandardEvaluation(ex, sizes=None, num_runs=1)
            metric.evaluate(data, method)
            assert len(metric.output.index) == 8  # 4 scores on train/test each
            assert metric.output['score'][0] != 0

            # test multirun
            metric = StandardEvaluation(ex, sizes=None, num_runs=5)
            metric.evaluate(data, method)

            # test varying sizes
            metric = StandardEvaluation(ex, sizes=[100, 200], num_runs=5)
            metric.evaluate(data, method)
            assert len(metric.output.index) == 16

        ex.run()

class IntegrationTests(TestCase):

    def setUp(self):
        self.ex = sacred.Experiment('normal')
        self.ex.observers.append(FileStorageObserver.create('results'))

    def tearDown(self):
        pass

    def test_experiment(self):
        """
        Tests the integration of method, data and metric into an experiment, logged via sacred.
        Requires a directory `results` for the FileStorageObserver.
        :return:
        """
        TestCausalForests('test_whole_experiment').test_whole_experiment()


    def test_dataprovider(self):
        data = SWagerDataProvider()
        x, t, y = data.get_training_data(size=500)
        self.assertEqual(len(t), len(y))
        self.assertEqual(len(y), len(x))
        self.assertEqual(len(t), 500)

        with self.assertRaises(AssertionError):
            data.get_training_data(size=2001)

        data.set_train_test_split(train_size=0.5)
        x, t, y = data.get_training_data(size=1000)
        self.assertEqual(len(t), 0.5 * len(data.t))
        x_test, t_test, y_test = data.get_test_data()
        self.assertEqual(len(t_test), 1000)


    def test_rpy2(self):
        """
        Tests whether rpy2 is able to load the R environment and
        execute a causal forest
        """
        data = SWagerDataProvider()
        from causaleval.methods.causal_forest import CausalForest
        cf = CausalForest()
        cf.fit(*data.get_training_data())
        self.assertIsNotNone(cf.predict_ite(*data.get_test_data()))





