Implement sample weight #324

Open: wants to merge 25 commits into base master

Changes from 1 commit

Commits (25)
0bfe5f3  add manifest to simplify non-develop install  (peterfoley, Jan 23, 2019)
87423f8  record the number of iterations and convergence status  (peterfoley, Mar 1, 2019)
5ba9fab  test glmnet with nonzero reg_lambda, alpha  (peterfoley, Mar 5, 2019)
34be299  recalculate z every iteration in GLM._cdfast  (peterfoley, Mar 5, 2019)
2350c7e  flake8 fixes in test  (peterfoley, Mar 5, 2019)
48d6759  don't cache z outside _cdfast  (peterfoley, Mar 6, 2019)
42315f8  remove MANIFEST.in so it can be created properly in a later PR  (peterfoley, Mar 6, 2019)
d80b11f  remove a dangling creation of z cache  (peterfoley, Mar 6, 2019)
92238b2  resolved remaining flake8 issues by disabling checks  (peterfoley, Mar 6, 2019)
2c06fbb  resolve flake8 indentation error  (peterfoley, Mar 6, 2019)
a9209f5  update test_cdfast to remove z from _cdfast interface  (peterfoley, Mar 6, 2019)
2674333  fail test_glmnet based on loss increase runlength  (peterfoley, Mar 6, 2019)
5175d24  mkl dylibs are unavailable on travis  (peterfoley, Mar 6, 2019)
2e29239  add a test that uses sample_weight parameter  (peterfoley, Dec 13, 2018)
192d938  implement sample weights  (peterfoley, Dec 13, 2018)
bc9b9df  update cheatsheet with weighted loss and grad/hess calculations  (peterfoley, Dec 14, 2018)
832452b  typo fixes and formatting cleanup to reduce flake8 warnings/errors  (peterfoley, Jan 4, 2019)
d26abe9  have setuptools build package list  (peterfoley, Jan 23, 2019)
4e97389  remove math.inf for python 2.7 compatibility  (peterfoley, Mar 7, 2019)
6c250a0  merging cdfast convergence fixes  (peterfoley, Mar 7, 2019)
217dc5f  flake8 fixes  (peterfoley, Mar 7, 2019)
cc0bd9c  resolve indentation flake8 errors  (peterfoley, Mar 7, 2019)
8a6dec4  logger.warn is deprecated in favor of logger.warning  (peterfoley, Mar 7, 2019)
e8f038a  use scipy.special.comb instead of removed scipy.misc.comb  (May 17, 2019)
8255617  Merge pull request #1 from peterfoley605/fix_scipy  (May 17, 2019)
typo fixes and formatting cleanup to reduce flake8 warnings/errors
peterfoley committed Mar 7, 2019
commit 832452b066aaef0298c71916448fec4cf211b599
3 changes: 3 additions & 0 deletions pyglmnet/metrics.py
@@ -3,6 +3,7 @@
import numpy as np
from .pyglmnet import _logL


def deviance(y, yhat, sample_weight, distr):
"""Deviance metrics.

@@ -34,6 +35,7 @@ def deviance(y, yhat, sample_weight, distr):
score = -2 * (L1 - LS)
return score


def pseudo_R2(X, y, yhat, ynull_, sample_weight, distr):
"""Pseudo-R2 metric.

@@ -73,6 +75,7 @@ def pseudo_R2(X, y, yhat, ynull_, sample_weight, distr):
score = (1 - L1 / L0)
return score


def accuracy(y, yhat, sample_weight):
"""Accuracy as ratio of correct predictions.

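For readers skimming the metrics changes: deviance and pseudo_R2 delegate to the weighted _logL, so in the Gaussian case they reduce to simple weighted sums of squared errors. A minimal standalone sketch of what these scores compute (plain NumPy, illustrative function names, not the library's actual helpers):

    import numpy as np

    def gaussian_weighted_deviance(y, yhat, w):
        # deviance = -2 * (logL(model) - logL(saturated)); the saturated
        # Gaussian log-likelihood is 0 because it predicts y exactly
        logl_model = -0.5 * np.sum(w * (y - yhat) ** 2)
        return -2 * (logl_model - 0.0)

    def gaussian_weighted_pseudo_r2(y, yhat, ynull, w):
        # pseudo-R2 = 1 - logL(model) / logL(null), where the null model
        # predicts the weighted mean ynull for every sample
        logl_model = -0.5 * np.sum(w * (y - yhat) ** 2)
        logl_null = -0.5 * np.sum(w * (y - ynull) ** 2)
        return 1.0 - logl_model / logl_null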
41 changes: 24 additions & 17 deletions pyglmnet/pyglmnet.py
@@ -116,29 +116,31 @@ def _logL(distr, y, y_hat, w, z=None):
"""The log likelihood."""
if distr in ['softplus', 'poisson']:
eps = np.spacing(1)
logL = np.dot(w, y * np.log(y_hat + eps) - y_hat)
logL = np.sum(w * (y * np.log(y_hat + eps) - y_hat))
elif distr == 'gaussian':
logL = -0.5 * np.dot(w, (y - y_hat)**2)
logL = -0.5 * np.sum(w * ((y - y_hat) ** 2))
elif distr == 'binomial':

# prevents underflow
if z is not None:
logL = np.dot(w, y * z - np.log(1 + np.exp(z)))
logL = np.sum(w * (y * z - np.log(1 + np.exp(z))))
# for scoring
else:
logL = np.dot(w, y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
logL = np.sum(w * (y * np.log(y_hat) +
(1 - y) * np.log(1 - y_hat)))
elif distr == 'probit':
if z is not None:
pdfz, cdfz = norm.pdf(z), norm.cdf(z)
logL = np.dot(w, y * _probit_g1(z, pdfz, cdfz) +
(1 - y) * _probit_g2(z, pdfz, cdfz))
logL = np.sum(w * (y * _probit_g1(z, pdfz, cdfz) +
(1 - y) * _probit_g2(z, pdfz, cdfz)))
else:
logL = np.dot(w, y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
logL = np.sum(w * (y * np.log(y_hat) +
(1 - y) * np.log(1 - y_hat)))
elif distr == 'gamma':
# see
# https://www.statistics.ma.tum.de/fileadmin/w00bdb/www/czado/lec8.pdf
nu = 1. # shape parameter, exponential for now
logL = np.dot(w, nu * (-y / y_hat - np.log(y_hat)))
logL = np.sum(w * (nu * (-y / y_hat - np.log(y_hat))))
return logL


@@ -183,6 +185,7 @@ def _L1penalty(beta, group=None):
L1penalty += np.linalg.norm(beta[group == 0], 1)
return L1penalty


def _loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, group, beta):
"""Define the objective function for elastic net."""
n_samples = X.shape[0]
@@ -193,6 +196,7 @@ def _loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, group, beta):
J = -L + reg_lambda * P
return J


def _L2loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, group, beta):
"""Define the objective function for elastic net."""
n_samples = X.shape[0]
@@ -250,6 +254,7 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, beta):
g[1:] = grad_beta
return g


def _gradhess_logloss_1d(distr, xk, y, w, z, eta):
"""
Compute gradient (1st derivative)
@@ -281,7 +286,8 @@ def _gradhess_logloss_1d(distr, xk, y, w, z, eta):

grad_s = s * (1 - s)
grad_s_by_mu = grad_s / mu - s / (mu ** 2)
hk = np.sum(w * grad_s * xk ** 2) - np.sum(w * y * grad_s_by_mu * xk ** 2)
hk = np.sum(w * grad_s * xk ** 2) - \
np.sum(w * y * grad_s_by_mu * xk ** 2)

elif distr == 'poisson':
mu = _mu(distr, z, eta)
@@ -662,7 +668,7 @@ def _cdfast(self, X, y, w, z, ActiveSet, beta, rl):
beta[k], z = beta[k] - update, z - update * xk
return beta, z

def fit(self, X, y, sample_weight = None):
def fit(self, X, y, sample_weight=None):
"""The fit function.

Parameters
@@ -785,7 +791,7 @@ def fit(self, X, y, sample_weight = None):
# Update the estimated variables
self.beta0_ = beta[0]
self.beta_ = beta[1:]
self.ynull_ = np.sum(sample_weight * y)/np.sum(sample_weight)
self.ynull_ = np.sum(sample_weight * y) / np.sum(sample_weight)
return self

def predict(self, X):
@@ -845,7 +851,7 @@ def predict_proba(self, X):
yhat = np.asarray(yhat)
return yhat

def fit_predict(self, X, y, sample_weight):
def fit_predict(self, X, y, sample_weight=None):
"""Fit the model and predict on the same data.

Parameters
@@ -862,7 +868,7 @@ def fit_predict(self, X, y, sample_weight):
"""
return self.fit(X, y, sample_weight).predict(X)

def score(self, X, y, sample_weight = None):
def score(self, X, y, sample_weight=None):
"""Score the model.

Parameters
@@ -1088,7 +1094,7 @@ def copy(self):
"""
return deepcopy(self)

def fit(self, X, y, sample_weight = None):
def fit(self, X, y, sample_weight=None):
"""The fit function.
Parameters
----------
@@ -1109,7 +1115,8 @@ def fit(self, X, y, sample_weight = None):
sample_weight = np.ones_like(y)
else:
sample_weight /= np.mean(sample_weight)
self.ynull_ = np.sum(sample_weight * y)/np.sum(sample_weight)

self.ynull_ = np.sum(sample_weight * y) / np.sum(sample_weight)

if not type(int):
raise ValueError('cv must be int. We do not support scikit-learn '
@@ -1202,7 +1209,7 @@ def predict_proba(self, X):
"""
return self.glm_.predict_proba(X)

def fit_predict(self, X, y, sample_weight = None):
def fit_predict(self, X, y, sample_weight=None):
"""Fit the model and predict on the same data.

Parameters
@@ -1220,7 +1227,7 @@ def fit_predict(self, X, y, sample_weight = None):
self.fit(X, y, sample_weight)
return self.glm_.predict(X)

def score(self, X, y, sample_weight = None):
def score(self, X, y, sample_weight=None):
"""Score the model.

Parameters
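The recurring pattern in pyglmnet.py is that every per-sample log-likelihood term is multiplied by its sample weight before being summed, and that weights default to ones and are rescaled to have mean 1. A minimal sketch of that convention, assuming equal-length NumPy arrays y, y_hat, and sample_weight (illustrative helper names, not the library's API):

    import numpy as np

    def normalize_weights(y, sample_weight=None):
        # default to unit weights; otherwise rescale so the weights average
        # to 1, mirroring the handling shown in the fit hunks above
        if sample_weight is None:
            return np.ones_like(y, dtype=float)
        return sample_weight / np.mean(sample_weight)

    def weighted_gaussian_logl(y, y_hat, w):
        # weighted counterpart of -0.5 * sum((y - y_hat) ** 2)
        return -0.5 * np.sum(w * (y - y_hat) ** 2)

    def weighted_poisson_logl(y, y_hat, w):
        eps = np.spacing(1)  # guards against log(0)
        return np.sum(w * (y * np.log(y_hat + eps) - y_hat))

With unit weights these reduce to the unweighted expressions, which is why the default sample_weight=None leaves existing behavior unchanged. The weighted null model used for scoring is then ynull = np.sum(w * y) / np.sum(w), as in the fit hunks above.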
7 changes: 4 additions & 3 deletions tests/test_pyglmnet.py
@@ -27,7 +27,7 @@ def test_sample_weight_cv():
glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)
# check that cv and rest of sklearn interface works
cv_scores = cross_val_score(glm_normal, X, y, fit_params={'sample_weight': w}, cv=cv)
assert(len(scores) == 5)
assert(len(cv_scores) == 5)

param_grid = [{'alpha': np.linspace(0.01, 0.99, 2)},
{'reg_lambda': np.logspace(np.log(0.5), np.log(0.01),
@@ -341,7 +341,8 @@ def test_cdfast():
z = beta_[0] + np.dot(X, beta_[1:])
k = 1
xk = X[:, k - 1]
gk, hk = _gradhess_logloss_1d(glm.distr, xk, y, z, glm.eta)
w = np.ones_like(y)
gk, hk = _gradhess_logloss_1d(glm.distr, xk, y, z, w, glm.eta)

# test grad and hess
if distr != 'multinomial':
@@ -359,7 +360,7 @@

# test cdfast
ActiveSet = np.ones(n_features + 1)
beta_ret, z_ret = glm._cdfast(X, y, z,
beta_ret, z_ret = glm._cdfast(X, y, w, z,
ActiveSet, beta_, glm.reg_lambda)
assert(beta_ret.shape == beta_.shape)
assert(z_ret.shape == z.shape)
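For reference, the sample_weight interface exercised by test_sample_weight_cv looks roughly like this at the call sites. X, y, w, and the cv value below are placeholder data (the test's actual setup is not shown in this hunk); only the GLM constructor arguments and the fit_params routing come from the diff:

    import numpy as np
    from sklearn.model_selection import cross_val_score
    from pyglmnet import GLM

    # toy data standing in for the test fixtures
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 5
    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples)
    w = rng.uniform(0.5, 1.5, size=n_samples)  # per-sample weights

    glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)

    # weights can be passed directly to fit / score ...
    glm_normal.fit(X, y, sample_weight=w)
    print(glm_normal.score(X, y, sample_weight=w))

    # ... or routed through scikit-learn's cross-validation via fit_params
    cv_scores = cross_val_score(glm_normal, X, y,
                                fit_params={'sample_weight': w}, cv=5)
    assert len(cv_scores) == 5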