From 66a82269420406cb1abfb92f6ec716fd815cea85 Mon Sep 17 00:00:00 2001 From: plkmo Date: Wed, 11 Mar 2020 15:46:17 +0800 Subject: [PATCH] updated DGI for clustering --- classify.py | 3 +- cluster.py | 39 ++++++++ nlptoolkit/clustering/models/DGI/DGI.py | 39 +++++--- nlptoolkit/clustering/models/DGI/__init__.py | 2 +- .../models/DGI/preprocessing_funcs.py | 4 +- .../clustering/models/DGI/train_funcs.py | 70 ++++++++----- nlptoolkit/clustering/models/DGI/trainer.py | 98 +++++-------------- 7 files changed, 138 insertions(+), 117 deletions(-) create mode 100644 cluster.py diff --git a/classify.py b/classify.py index 80d0ca2..8e725e8 100644 --- a/classify.py +++ b/classify.py @@ -55,7 +55,8 @@ \n2: XLNet, \n3: Graph Attention Network (GAT)) \n4: ALBERT - \n5: XLMRoBERTa''') + \n5: XLMRoBERTa + \n6: GIN''') parser.add_argument("--train", type=int, default=1, help="Train model on dataset") parser.add_argument("--infer", type=int, default=1, help="Infer input sentence labels from trained model") diff --git a/cluster.py b/cluster.py new file mode 100644 index 0000000..a57fbd2 --- /dev/null +++ b/cluster.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Mar 11 09:48:49 2020 + +@author: weetee +""" +from nlptoolkit.utils.misc import save_as_pickle +import logging +from argparse import ArgumentParser + +logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \ + datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) +logger = logging.getLogger('__file__') + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--train_data", type=str, default="./data/train.csv", \ + help="training data csv file path") + parser.add_argument("--max_vocab_len", type=int, default=7000, help="GCN encoder: Max vocab size to consider based on top frequency tokens") + parser.add_argument("--hidden_size_1", type=int, default=330, help="Size of first GCN encoder hidden weights") + parser.add_argument("--batch_size", type=int, default=32, help="Training batch size") + parser.add_argument("--gradient_acc_steps", type=int, default=2, help="No. 
of steps of gradient accumulation") + parser.add_argument("--max_norm", type=float, default=1.0, help="Clipped gradient norm") + parser.add_argument("--num_epochs", type=int, default=7000, help="No of epochs") + parser.add_argument("--lr", type=float, default=0.001, help="learning rate") + parser.add_argument("--model_no", type=int, default=0, help='''Model ID: (0: Deep Graph Infomax (DGI)), + ''') + + parser.add_argument("--train", type=int, default=1, help="Train model on dataset") + parser.add_argument("--infer", type=int, default=1, help="Infer input sentence labels from trained model") + args = parser.parse_args() + save_as_pickle("args.pkl", args) + + if args.train: + if args.model_no == 0: + from nlptoolkit.clustering.models.DGI.trainer import train_and_fit + + output = train_and_fit(args) \ No newline at end of file diff --git a/nlptoolkit/clustering/models/DGI/DGI.py b/nlptoolkit/clustering/models/DGI/DGI.py index 4bd8964..3790262 100644 --- a/nlptoolkit/clustering/models/DGI/DGI.py +++ b/nlptoolkit/clustering/models/DGI/DGI.py @@ -9,11 +9,8 @@ import torch.nn.functional as F class GCN(nn.Module): - def __init__(self, X_size, A_hat, cuda, args, bias=True): # X_size = num features + def __init__(self, X_size, args, bias=True): # X_size = num features super(GCN, self).__init__() - self.A_hat = torch.tensor(A_hat, requires_grad=False).float() - if cuda: - self.A_hat = self.A_hat.cuda() self.weight = nn.parameter.Parameter(torch.zeros(size=(X_size, args.hidden_size_1))) var = 2./(self.weight.size(1) + self.weight.size(0)) self.weight.data.normal_(0, var) @@ -24,19 +21,39 @@ def __init__(self, X_size, A_hat, cuda, args, bias=True): # X_size = num feature else: self.register_parameter("bias", None) - def forward(self, X): ### 1-layer GCN architecture + def forward(self, X, A_hat): ### 1-layer GCN architecture X = torch.mm(X, self.weight) if self.bias is not None: X = (X + self.bias) - X = F.relu(torch.mm(self.A_hat, X)) + X = F.relu(torch.mm(A_hat, X)) return X class DGI(nn.Module): - def __init__(self, X_size, A_hat, cuda, args, bias=True): + def __init__(self, X_size, args, bias=True): super(DGI, self).__init__() - self.encoder = GCN(X_size, A_hat, cuda, args, bias=bias) - self.D_weight = nn.parameter.Parameter(torch.zeros(size=(X_size, X_size))) # nodes X features + self.encoder = GCN(X_size, args, bias=bias) + self.D_weight = nn.parameter.Parameter(torch.zeros(size=(args.hidden_size_1,\ + args.hidden_size_1))) # features X features def summarize_patch(self, X): - X = torch.sigmoid(X.mean(dim=1)) - return X \ No newline at end of file + X = torch.sigmoid(X.mean(dim=0)) + return X + + def forward(self, X, A_hat, X_c): + X = self.encoder(X, A_hat) # nodes X features + X_c = self.encoder(X_c, A_hat) # nodes X features + s = self.summarize_patch(X) # s = features + fac = torch.mm(self.D_weight, s.unsqueeze(-1)) # fac = features X 1 + + pos_D = [] + for i in range(X.shape[0]): + pos_d_i = torch.sigmoid(torch.mm(X[i, :].unsqueeze(0), fac)) + pos_D.append(pos_d_i) + pos_D = torch.tensor(pos_D, requires_grad=True) + + neg_D = [] + for i in range(X_c.shape[0]): + neg_d_i = torch.sigmoid(torch.mm(X_c[i, :].unsqueeze(0), fac)) + neg_D.append(neg_d_i) + neg_D = torch.tensor(neg_D, requires_grad=True) + return pos_D, neg_D \ No newline at end of file diff --git a/nlptoolkit/clustering/models/DGI/__init__.py b/nlptoolkit/clustering/models/DGI/__init__.py index bd4380d..a6220c1 100644 --- a/nlptoolkit/clustering/models/DGI/__init__.py +++ b/nlptoolkit/clustering/models/DGI/__init__.py @@ -1,4 +1,4 
@@ from . import preprocessing_funcs from . import trainer from . import train_funcs -from . import GCN +from . import DGI diff --git a/nlptoolkit/clustering/models/DGI/preprocessing_funcs.py b/nlptoolkit/clustering/models/DGI/preprocessing_funcs.py index 5151ae7..62caaee 100644 --- a/nlptoolkit/clustering/models/DGI/preprocessing_funcs.py +++ b/nlptoolkit/clustering/models/DGI/preprocessing_funcs.py @@ -60,12 +60,10 @@ def word_word_edges(p_ij): word_word.append((w1,w2,{"weight":p_ij.loc[w1,w2]})) return word_word -def generate_text_graph(train_data, infer_data, max_vocab_len, window=10): +def generate_text_graph(train_data, max_vocab_len, window=10): """ generates graph based on text corpus (columns = (text, label)); window = sliding window size to calculate point-wise mutual information between words """ logger.info("Preparing data...") df = pd.read_csv(train_data) - #infer_idx_start = len(df) - #df = pd.concat((df, pd.read_csv(infer_data)), ignore_index=True) df.dropna(inplace=True) stopwords = list(set(nltk.corpus.stopwords.words("english"))) diff --git a/nlptoolkit/clustering/models/DGI/train_funcs.py b/nlptoolkit/clustering/models/DGI/train_funcs.py index fe3986d..dc940ac 100644 --- a/nlptoolkit/clustering/models/DGI/train_funcs.py +++ b/nlptoolkit/clustering/models/DGI/train_funcs.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd import torch +import torch.nn as nn from .preprocessing_funcs import load_pickle, save_as_pickle, generate_text_graph import logging @@ -16,8 +17,42 @@ datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) logger = logging.getLogger(__file__) +def get_X_A_hat(G, corrupt=False): + A = nx.to_numpy_matrix(G, weight="weight") + A = A + np.eye(G.number_of_nodes()) + X = np.eye(G.number_of_nodes()) # Features are just identity matrix + + if corrupt: + np.random.shuffle(X) + + degrees = [] + for d in G.degree(weight=None): + if d == 0: + degrees.append(0) + else: + degrees.append(d[1]**(-0.5)) + degrees = np.diag(degrees) + A_hat = degrees@A@degrees + + X = torch.from_numpy(X).float() + A_hat = torch.tensor(A_hat).float() + return X, A_hat + +class JSdiv_Loss(nn.Module): + def __init__(self): + super(JSdiv_Loss, self).__init__() + self.BCE_pos = nn.BCELoss(reduction='mean') + self.BCE_neg = nn.BCELoss(reduction='mean') + + def forward(self, D_pos, D_neg): + label_pos = torch.ones(D_pos.shape[0]) + label_neg = torch.zeros(D_neg.shape[0]) + pos_loss = self.BCE_pos(D_pos, label_pos) + neg_loss = self.BCE_neg(D_neg, label_neg) + total_loss = 0.5*(pos_loss + neg_loss) + return total_loss -def load_datasets(args, train_test_split=0): +def load_datasets(args): """Loads dataset and graph if exists, else create and process them from raw data Returns ---> f: torch tensor input of GCN (Identity matrix) @@ -33,28 +68,13 @@ def load_datasets(args, train_test_split=0): graph_path = "./data/text_graph.pkl" if not os.path.isfile(df_data_path) or not os.path.isfile(graph_path): logger.info("Building datasets and graph from raw data... 
Note this will take quite a while...") - generate_text_graph(args.train_data, args.infer_data, args.max_vocab_len) + generate_text_graph(args.train_data, args.max_vocab_len) G_dict = load_pickle("text_graph.pkl") G = G_dict["graph"] del G_dict - - logger.info("Building adjacency and degree matrices...") - A = nx.to_numpy_matrix(G, weight="weight"); A = A + np.eye(G.number_of_nodes()) - degrees = [] - for d in G.degree(weight=None): - if d == 0: - degrees.append(0) - else: - degrees.append(d[1]**(-0.5)) - degrees = np.diag(degrees) - X = np.eye(G.number_of_nodes()) # Features are just identity matrix - A_hat = degrees@A@degrees - f = X # (n X n) X (n X n) x (n X n) X (n X n) input of net - - f = torch.from_numpy(f).float() - - return f, X, A_hat + + return G def load_state(net, optimizer, scheduler, model_no=0, load_best=False): """ Loads saved model and optimizer states if exists """ @@ -81,16 +101,12 @@ def load_state(net, optimizer, scheduler, model_no=0, load_best=False): def load_results(model_no=0): """ Loads saved results if exists """ losses_path = "./data/test_losses_per_epoch_%d.pkl" % model_no - accuracy_path = "./data/test_accuracy_per_epoch_%d.pkl" % model_no - train_accuracy_path = "./data/train_accuracy_per_epoch_%d.pkl" % model_no - if os.path.isfile(losses_path) and os.path.isfile(accuracy_path) and os.path.isfile(train_accuracy_path): - losses_per_epoch = load_pickle("test_losses_per_epoch_%d.pkl" % model_no) - accuracy_per_epoch = load_pickle("test_accuracy_per_epoch_%d.pkl" % model_no) - train_accuracy_per_epoch = load_pickle("train_accuracy_per_epoch_%d.pkl" % model_no) + if os.path.isfile(losses_path): + losses_per_epoch = load_pickle("train_losses_per_epoch_%d.pkl" % model_no) logger.info("Loaded results buffer") else: - losses_per_epoch, train_accuracy_per_epoch, accuracy_per_epoch = [], [], [] - return losses_per_epoch, train_accuracy_per_epoch, accuracy_per_epoch + losses_per_epoch = [] + return losses_per_epoch def evaluate(output, labels_e): if len(labels_e) == 0: diff --git a/nlptoolkit/clustering/models/DGI/trainer.py b/nlptoolkit/clustering/models/DGI/trainer.py index bfacc6e..58f2eb5 100644 --- a/nlptoolkit/clustering/models/DGI/trainer.py +++ b/nlptoolkit/clustering/models/DGI/trainer.py @@ -7,10 +7,10 @@ import os import numpy as np import torch -import torch.nn as nn import torch.optim as optim -from .train_funcs import load_datasets, load_state, load_results, evaluate, infer -from .GCN import gcn +from .train_funcs import load_datasets, get_X_A_hat, JSdiv_Loss,\ + load_state, load_results, infer +from .DGI import DGI from .preprocessing_funcs import load_pickle, save_as_pickle import matplotlib.pyplot as plt import logging @@ -22,63 +22,45 @@ def train_and_fit(args): cuda = torch.cuda.is_available() - f, X, A_hat, selected, labels_selected, labels_not_selected, test_idxs = load_datasets(args, train_test_split=args.train_test_split) - targets = torch.tensor(labels_selected).long() + G = load_datasets(args) + X, A_hat = get_X_A_hat(G, corrupt=False) #print(labels_selected, labels_not_selected) - net = gcn(X.shape[1], A_hat, cuda, args) - criterion = nn.CrossEntropyLoss() + net = DGI(X.shape[1], args) + criterion = JSdiv_Loss() optimizer = optim.Adam(net.parameters(), lr=args.lr) scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1000,2000,3000,4000,5000,6000], gamma=0.77) start_epoch, best_pred = load_state(net, optimizer, scheduler, model_no=args.model_no, load_best=False) - losses_per_epoch, evaluation_trained, evaluation_untrained = 
load_results(model_no=args.model_no) + losses_per_epoch = load_results(model_no=args.model_no) if cuda: net.cuda() optimizer = optim.Adam(net.parameters(), lr=args.lr) - f = f.cuda() - targets = targets.cuda() logger.info("Starting training process...") net.train() for e in range(start_epoch, args.num_epochs): optimizer.zero_grad() - output = net(f) - loss = criterion(output[selected], targets) + X, A_hat = get_X_A_hat(G, corrupt=False) + X_c, _ = get_X_A_hat(G, corrupt=True) + + if cuda: + X, A_hat, X_c = X.cuda(), A_hat.cuda(), X_c.cuda() + + pos_D, neg_D = net(X, A_hat, X_c) + loss = criterion(pos_D, neg_D) losses_per_epoch.append(loss.item()) loss.backward() optimizer.step() - if e % 50 == 0: - #print(output[selected]); print(targets) - ### Evaluate other untrained nodes and check accuracy of labelling - net.eval() - with torch.no_grad(): - pred_labels = net(f) - trained_accuracy = evaluate(pred_labels[selected], labels_selected); untrained_accuracy = evaluate(pred_labels[test_idxs], labels_not_selected) - evaluation_trained.append((e, trained_accuracy)); evaluation_untrained.append((e, untrained_accuracy)) - print("[Epoch %d]: Evaluation accuracy of trained nodes: %.7f" % (e, trained_accuracy)) - print("[Epoch %d]: Evaluation accuracy of test nodes: %.7f" % (e, untrained_accuracy)) - print("[Epoch %d]: Loss: %.7f" % (e, losses_per_epoch[-1])) - print("Labels of trained nodes: \n", output[selected].max(1)[1]) - net.train() - if trained_accuracy > best_pred: - best_pred = trained_accuracy - torch.save({ - 'epoch': e + 1,\ - 'state_dict': net.state_dict(),\ - 'best_acc': trained_accuracy,\ - 'optimizer' : optimizer.state_dict(),\ - 'scheduler' : scheduler.state_dict(),\ - }, os.path.join("./data/" ,\ - "test_model_best_%d.pth.tar" % args.model_no)) - if (e % 250) == 0: + + if (e % 50) == 0: + print('[Epoch: %d] total loss: %.3f' % + (e + 1, losses_per_epoch[-1])) save_as_pickle("test_losses_per_epoch_%d.pkl" % args.model_no, losses_per_epoch) - save_as_pickle("test_accuracy_per_epoch_%d.pkl" % args.model_no, evaluation_untrained) - save_as_pickle("train_accuracy_per_epoch_%d.pkl" % args.model_no, evaluation_trained) torch.save({ 'epoch': e + 1,\ 'state_dict': net.state_dict(),\ - 'best_acc': trained_accuracy,\ + 'best_acc': losses_per_epoch[-1],\ 'optimizer' : optimizer.state_dict(),\ 'scheduler' : scheduler.state_dict(),\ }, os.path.join("./data/",\ @@ -86,10 +68,7 @@ def train_and_fit(args): scheduler.step() logger.info("Finished training!") - evaluation_trained = np.array(evaluation_trained); evaluation_untrained = np.array(evaluation_untrained) - save_as_pickle("test_losses_per_epoch_%d_final.pkl" % args.model_no, losses_per_epoch) - save_as_pickle("train_accuracy_per_epoch_%d_final.pkl" % args.model_no, evaluation_trained) - save_as_pickle("test_accuracy_per_epoch_%d_final.pkl" % args.model_no, evaluation_untrained) + save_as_pickle("train_losses_per_epoch_%d_final.pkl" % args.model_no, losses_per_epoch) fig = plt.figure(figsize=(13,13)) ax = fig.add_subplot(111) @@ -98,34 +77,5 @@ def train_and_fit(args): ax.set_ylabel("Loss", fontsize=15) ax.set_title("Loss vs Epoch", fontsize=20) plt.savefig(os.path.join("./data/", "loss_vs_epoch_%d.png" % args.model_no)) - - fig = plt.figure(figsize=(13,13)) - ax = fig.add_subplot(111) - ax.scatter(evaluation_trained[:,0], evaluation_trained[:,1]) - ax.set_xlabel("Epoch", fontsize=15) - ax.set_ylabel("Accuracy on trained nodes", fontsize=15) - ax.set_title("Accuracy (trained nodes) vs Epoch", fontsize=20) - plt.savefig(os.path.join("./data/", 
"trained_accuracy_vs_epoch_%d.png" % args.model_no)) - - if len(labels_not_selected) > 0: - fig = plt.figure(figsize=(13,13)) - ax = fig.add_subplot(111) - ax.scatter(evaluation_untrained[:,0], evaluation_untrained[:,1]) - ax.set_xlabel("Epoch", fontsize=15) - ax.set_ylabel("Accuracy on untrained nodes", fontsize=15) - ax.set_title("Accuracy (untrained nodes) vs Epoch", fontsize=20) - plt.savefig(os.path.join("./data/", "untrained_accuracy_vs_epoch_%d.png" % args.model_no)) - - fig = plt.figure(figsize=(13,13)) - ax = fig.add_subplot(111) - ax.scatter(evaluation_trained[:,0], evaluation_trained[:,1], c="red", marker="v", \ - label="Trained Nodes") - ax.scatter(evaluation_untrained[:,0], evaluation_untrained[:,1], c="blue", marker="o",\ - label="Untrained Nodes") - ax.set_xlabel("Epoch", fontsize=15) - ax.set_ylabel("Accuracy", fontsize=15) - ax.set_title("Accuracy vs Epoch", fontsize=20) - ax.legend(fontsize=20) - plt.savefig(os.path.join("./data/", "combined_plot_accuracy_vs_epoch_%d.png" % args.model_no)) - - infer(f, test_idxs, net) \ No newline at end of file + return net + \ No newline at end of file