Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create estimation validation loop #136

Merged
merged 20 commits into from
May 6, 2020
Merged
Changes from 1 commit
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 37 additions & 24 deletions dowhy/causal_refuters/dummy_outcome_refuter.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,30 +105,41 @@ def refute_estimate(self):
self.logger.info("Refutation over {} simulated datasets".format(self._num_simulations) )
self.logger.info("The transformation passed: {}".format(self._transformations) )

data_chunks = self.preprocess_data_by_treatment()
groups = self.preprocess_data_by_treatment()
estimates = []
for chunk in data_chunks:

X_input = chunk[self._chosen_variables]
new_outcome = chunk['y']
X = self._data[self._chosen_variables]
for key_train, _ in groups:
Tanmay-Kulkarni101 marked this conversation as resolved.
Show resolved Hide resolved
X_train = groups.get_group(key_train)[self._chosen_variables].values
new_outcome_train = groups.get_group(key_train)['y'].values
Tanmay-Kulkarni101 marked this conversation as resolved.
Show resolved Hide resolved
validation_df = []
for key_validation, _ in groups:
if key_validation != key_train:
validation_df.append(groups.get_group(key_validation))

validation_df = pd.concat(validation_df)
X_validation = validation_df[self._chosen_variables].values
new_outcome_validation = validation_df['y'].values
Tanmay-Kulkarni101 marked this conversation as resolved.
Show resolved Hide resolved

Tanmay-Kulkarni101 marked this conversation as resolved.
Show resolved Hide resolved
for action, func_args in self._transformations:

if callable(action):
estimator = action(X_input, new_outcome, **func_args)
new_outcome = estimator(X)
estimator = action(X_train, new_outcome_train, **func_args)
new_outcome_train = estimator(X_train)
Tanmay-Kulkarni101 marked this conversation as resolved.
Show resolved Hide resolved
new_outcome_validation = estimator(X_validation)
elif action in DummyOutcomeRefuter.SUPPORTED_ESTIMATORS:
estimator = self._estimate_dummy_outcome(func_args, action, new_outcome, X_input)
new_outcome = estimator(X)
estimator = self._estimate_dummy_outcome(func_args, action, new_outcome_train, X_train)
Tanmay-Kulkarni101 marked this conversation as resolved.
Show resolved Hide resolved
new_outcome_train = estimator(X_train)
new_outcome_validation = estimator(X_validation)
elif action == 'noise':
new_outcome = self._noise(new_outcome, func_args)
new_outcome_train = self._noise(new_outcome_train, func_args)
new_outcome_validation = self._noise(new_outcome_validation, func_args)
Tanmay-Kulkarni101 marked this conversation as resolved.
Show resolved Hide resolved
elif action == 'permute':
new_outcome = self._permute(new_outcome, func_args)
new_outcome_train = self._permute(new_outcome_train, func_args)
Tanmay-Kulkarni101 marked this conversation as resolved.
Show resolved Hide resolved
new_outcome_validation = self._permute(new_outcome_validation, func_args)
elif action =='zero':
new_outcome = np.zeros(new_outcome.shape)

new_data = chunk.assign(dummy_outcome=new_outcome)
new_outcome_train = np.zeros(new_outcome_train.shape)
new_outcome_validation = np.zeros(new_outcome_train.shape)

new_data = validation_df.assign(dummy_outcome=new_outcome_validation)
new_estimator = CausalEstimator.get_estimator_object(new_data, identified_estimand, self._estimate)
new_effect = new_estimator.estimate_effect()
estimates.append(new_effect.value)
Expand Down Expand Up @@ -203,36 +214,38 @@ def refute_estimate(self):
return refute

def preprocess_data_by_treatment(self):
    """Group ``self._data`` by the value of the single treatment variable.

    The grouping strategy depends on the dtype of the treatment column:

    * bool: one group per truth value.
    * float / int (any bit width): the column is cut into 10 quantile bins
      with ``pd.qcut``. The bin label is stored in an extra ``'bins'``
      column on a copy of the data (``self._data`` itself is not modified)
      and that column is used as the grouping key.
    * categorical: one group per category that is present in the data.

    :returns: a pandas ``DataFrameGroupBy``; iterate it for ``(key, frame)``
        pairs or call ``get_group(key)``.
    :raises AssertionError: if ``self._treatment_name`` does not hold
        exactly one name.
    :raises ValueError: if the treatment dtype is not bool, float, int or
        categorical.
    """
    assert len(self._treatment_name) == 1, "At present, DoWhy supports a simgle treatment variable"

    treatment_variable_name = self._target_estimand.treatment_name[0]  # As we only have a single treatment
    variable_type = self._data[treatment_variable_name].dtypes
    dtype_name = variable_type.name

    if variable_type == bool:
        # True and False rows each form their own group.
        return self._data.groupby(treatment_variable_name)

    # Match on the dtype name so that every bit width (int32, int64,
    # float32, float64, ...) is accepted.
    if 'float' in dtype_name or 'int' in dtype_name:
        # Work on a copy so the helper 'bins' column never leaks into
        # self._data.
        # NOTE(review): pd.qcut raises ValueError when the quantile edges
        # are not unique (e.g. a numeric treatment with very few distinct
        # values) -- confirm whether such treatments must be supported.
        data_copy = self._data.copy()
        data_copy['bins'] = pd.qcut(data_copy[treatment_variable_name], 10)
        return data_copy.groupby('bins', observed=True)

    # pandas reports categorical columns with the dtype name 'category';
    # 'categorical' is kept so anything the old check accepted still passes.
    if 'category' in dtype_name or 'categorical' in dtype_name:
        # Group the original data directly: no binning is needed, and
        # observed=True skips categories that have no rows.
        return self._data.groupby(treatment_variable_name, observed=True)

    raise ValueError("Passed {}. Expected bool, float, int or categorical".format(dtype_name))

def _estimate_dummy_outcome(self, func_args, action, outcome, X_chunk):
estimator = self._get_regressor_object(action, func_args)
Expand Down