fixing bugs and adding class weight
Abhineet Agarwal authored and Abhineet Agarwal committed Sep 11, 2023
1 parent 3f789dc commit 1b7ed99
Showing 1 changed file with 38 additions and 18 deletions.
56 changes: 38 additions & 18 deletions imodels/linear_models/d_logistic_regression.py
@@ -28,6 +28,19 @@ def sigmoid(t):
def soft_thresh(x, u):
    """Soft thresholding of x at level u"""
    return np.maximum(0., np.abs(x) - u)


+def compute_sample_weight(y):
+    sample_weight = np.zeros(len(y))
+    one_count = pd.Series(y).value_counts()[1.0]
+    one_proportion = y.shape[0]/one_count
+    zero_proportion = y.shape[0]/(y.shape[0] - one_count)
+    for i in range(len(y)):
+        if y[i] == 1:
+            sample_weight[i] = one_proportion
+        else:
+            sample_weight[i] = zero_proportion
+    return sample_weight

class D_LogisticRegression():
    """
@@ -54,7 +67,7 @@ def __init__(self,alphas = [1.0],max_iter: int = 100, phases:dict=None, cv: int
        self.alphas = alphas
        self.cv = cv

-    def fit(self,X,y):
+    def fit(self,X,y,use_class_weight = True):
        """
        Fits dynamic logistic regression via coordinate descent.
        """
@@ -73,9 +86,11 @@ def fit(self,X,y):
            non_na_indices = np.where(np.sum(np.isnan(X_phase), axis=1) == 0)[0] #extract indices
            X_phase = X_phase[non_na_indices, :]
            y_phase = y_phase[non_na_indices]
+            #print(X_phase)
+            sample_weight_phase = compute_sample_weight(y_phase) if use_class_weight else np.ones(len(y_phase))
            X_phase = np.hstack((np.ones((len(X_phase), 1)), X_phase))
-            opt_alpha = self.get_cv_phase_alpha(X_phase,y_phase,w_phase,phase,phase_features,adjusted_phases)
-            w_phase = self.fit_phase(phase,phase_features,adjusted_phases,X_phase,y_phase,opt_alpha,w_phase)
+            opt_alpha = self.get_cv_phase_alpha(X_phase,y_phase,w_phase,phase,phase_features,adjusted_phases,sample_weight_phase)
+            w_phase = self.fit_phase(phase,phase_features,adjusted_phases,X_phase,y_phase,opt_alpha,w_phase,sample_weight_phase)

        self.weights = w_phase
        #w_phase_alphas = []
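Before each phase is fit, rows missing any of that phase's features are dropped and an intercept column is prepended. A tiny self-contained illustration of those three preprocessing lines (hypothetical data, same operations):

```python
import numpy as np

X_phase = np.array([[1.0, np.nan],
                    [2.0, 3.0],
                    [4.0, 5.0]])
y_phase = np.array([1, 0, 1])

# keep only rows with no missing phase features -> indices [1, 2]
non_na_indices = np.where(np.sum(np.isnan(X_phase), axis=1) == 0)[0]
X_phase = X_phase[non_na_indices, :]
y_phase = y_phase[non_na_indices]
# prepend a column of ones so weights[0] acts as the bias term
X_phase = np.hstack((np.ones((len(X_phase), 1)), X_phase))
```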
@@ -85,7 +100,7 @@ def fit(self,X,y):
        # refit with chosen alpha


-    def fit_phase(self,phase,phase_features,adjusted_phases,X_phase,y_phase,alpha,w_phase):
+    def fit_phase(self,phase,phase_features,adjusted_phases,X_phase,y_phase,alpha,w_phase,sample_weight):
        #w_phase = self.weights
        if phase == 1: # if current phase is 1st phase
            w_phase = np.zeros(len(phase_features) + 1)
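For phases after the first, `fit_phase` warm-starts from the previous phase's weights (the exact lines are collapsed in this view; `w_phase = w_new_phase` in the next hunk picks up the result). A sketch of such a zero-padded warm start, under the assumption that newly available features occupy the trailing columns as in the `__main__` example; `warm_start` is a hypothetical helper, not a name from the commit:

```python
import numpy as np

def warm_start(w_prev, n_new_features):
    # keep bias + previously fitted weights, start new features at zero
    w_new_phase = np.zeros(len(w_prev) + n_new_features)
    w_new_phase[:len(w_prev)] = w_prev
    return w_new_phase
```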
@@ -97,34 +112,35 @@ def fit_phase(self,phase,phase_features,adjusted_phases,X_phase,y_phase,alpha,w_phase):
            w_phase = w_new_phase
            new_features = set(phase_features).difference(adjusted_phases[phase - 1])
            new_features = list(new_features)
-        w_phase = self.coordinate_descent(X_phase,y_phase,w_phase,new_features,phase,alpha)
+        w_phase = self.coordinate_descent(X_phase,y_phase,w_phase,new_features,phase,alpha,sample_weight)

        return w_phase
        #self.weights = w_phase


-    def coordinate_descent(self,X_phase,y_phase,w_phase,new_features,phase,alpha):
+    def coordinate_descent(self,X_phase,y_phase,w_phase,new_features,phase,alpha,sample_weight):
        X_phase_w_phase = X_phase.dot(w_phase)
        num_phase_samples = len(X_phase)
        new_features.append(0) # adding bias term
        for t in range(self.max_iter):
            for j in new_features:
-                lips_const = sum([X_phase[i,j]**2 * np.exp(X_phase_w_phase[i]) * sigmoid(-X_phase_w_phase[i])**2 for i in range(num_phase_samples)])
+                #lips_const = sum([X_phase[i,j]**2 * np.exp(X_phase_w_phase[i]) * sigmoid(-X_phase_w_phase[i])**2 for i in range(num_phase_samples)])
+                lips_const = np.sum(X_phase[:,j]**2)/4
                old_w_phase_j = w_phase[j]
-                grad_j = -sum([y_phase[i] * X_phase[i,j] * sigmoid(-y_phase[i]*X_phase_w_phase[i]) for i in range(num_phase_samples)])
+                grad_j = -sum([y_phase[i] * X_phase[i,j] * sigmoid(-y_phase[i]*X_phase_w_phase[i]) * sample_weight[i] for i in range(num_phase_samples)])
                w_phase[j] = np.sign(w_phase[j] - grad_j/lips_const) * soft_thresh(w_phase[j] - grad_j/lips_const, alpha/lips_const)
                if old_w_phase_j != w_phase[j]:
                    X_phase_w_phase += w_phase[j] * X_phase[:,j] - old_w_phase_j * X_phase[:,j]
        return w_phase

-    def get_cv_phase_alpha(self,X_phase,y_phase,w_phase,phase,phase_features,adjusted_phases):
+    def get_cv_phase_alpha(self,X_phase,y_phase,w_phase,phase,phase_features,adjusted_phases,sample_weight):
        scores = np.zeros((self.cv,len(self.alphas)))
        kf = KFold(n_splits=self.cv, random_state=None)
        for i, (train_index , test_index) in enumerate(kf.split(X_phase)):
            X_train , X_test = X_phase[train_index,:],X_phase[test_index,:]
            y_train , y_test = y_phase[train_index] , y_phase[test_index]
            for j,alpha in enumerate(self.alphas):
-                w_alpha_phase = self.fit_phase(phase,deepcopy(phase_features),deepcopy(adjusted_phases),X_train,y_train,alpha,w_phase)
+                w_alpha_phase = self.fit_phase(phase,deepcopy(phase_features),deepcopy(adjusted_phases),X_train,y_train,alpha,w_phase,sample_weight[train_index]) # align weights with this fold's training rows
                y_alpha_preds = sigmoid(np.matmul(X_test,w_alpha_phase))
                scores[i,j] = roc_auc_score(y_test,y_alpha_preds)
        av_scores = scores.mean(axis = 1)
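The replaced `lips_const` line was both expensive and numerically fragile (`np.exp(X_phase_w_phase[i])` overflows for large margins). The fix uses the standard curvature bound for logistic loss: the second derivative of log(1 + e^{-z}) is at most 1/4, so a valid coordinate-wise Lipschitz constant is sum_i x_ij^2 / 4, and each coordinate update becomes a gradient step followed by soft thresholding. A vectorized sketch of one such update (illustrative, not the committed loop; note the gradient formula here, as above, assumes labels in {-1, +1}):

```python
import numpy as np

def sigmoid(t):
    return 1.0 / (1.0 + np.exp(-t))

def coordinate_update(X, y, w, j, alpha, sample_weight):
    margins = X.dot(w)
    # curvature bound: sigma'(z) <= 1/4  =>  L_j = sum_i x_ij^2 / 4
    lips_const = np.sum(X[:, j] ** 2) / 4.0
    # weighted coordinate gradient of the logistic loss (labels in {-1, +1})
    grad_j = -np.sum(sample_weight * y * X[:, j] * sigmoid(-y * margins))
    z = w[j] - grad_j / lips_const
    # proximal step for the l1 penalty: soft-threshold at alpha / L_j
    w[j] = np.sign(z) * max(0.0, abs(z) - alpha / lips_const)
    return w
```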
@@ -168,29 +184,33 @@ def predict_proba(self, X):

if __name__ == '__main__':

-    X,y = make_classification(n_samples=350, n_features=5, n_informative=2,n_redundant = 0,random_state = 0)
+    X,y = make_classification(n_samples=250, n_features=5, n_informative=2,n_redundant = 3,random_state = 1)
    #X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

    phases = {0: [0, 1, 2], 1: [0, 1, 2, 3, 4]}
    X[0:int(0.6 * X.shape[0]), [p for p in phases[1] if p not in phases[0]]] = np.nan

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)
    d_logistic_regression = D_LogisticRegression(alphas = [0.5,1.0,1.5],max_iter = 100, phases = phases)
-    d_logistic_regression.fit(X_train,y_train)
+    d_logistic_regression.fit(X_train,y_train,use_class_weight = True)

    print("Dynamic Logistic Regression - Weights:", d_logistic_regression.weights[1:])
    print("Dynamic Logistic Regression - Bias:", d_logistic_regression.weights[0])


-    #sklearn_logreg = LogisticRegression(penalty = 'l1', C = 1.0,solver = 'liblinear',max_iter = 100)
-    #sklearn_logreg.fit(X_train,y_train)
+    sklearn_logreg = LogisticRegressionCV(penalty = 'l1', solver = 'liblinear',max_iter = 100)
+    idx = pd.DataFrame(X_train).notna().all(axis=1)
+    X_train_no_na = X_train[idx,:]
+    idx_test = pd.DataFrame(X_test).notna().all(axis=1)
+    X_test_no_na = X_test[idx_test,:]
+    sklearn_logreg.fit(X_train_no_na,y_train[idx])

-    #print("Scikit-learn - Weights:", sklearn_logreg.coef_)
-    #print("Scikit-learn - Bias:", sklearn_logreg.intercept_)
+    print("Scikit-learn - Weights:", sklearn_logreg.coef_)
+    print("Scikit-learn - Bias:", sklearn_logreg.intercept_)


-    print("Dynamic Logistic Regression - AUROC:",roc_auc_score(y_test,d_logistic_regression.predict_proba(X_test)[:,1]))
-    #print("Scikit-learn - AUROC:",roc_auc_score(y_test,sklearn_logreg.predict_proba(X_test)[:,1]))
+    print("Dynamic Logistic Regression - AUROC:",roc_auc_score(y_test[idx_test],d_logistic_regression.predict_proba(X_test[idx_test,:])[:,1]))
+    print("Scikit-learn - AUROC:",roc_auc_score(y_test[idx_test],sklearn_logreg.predict_proba(X_test[idx_test,:])[:,1]))


