From 66a82269420406cb1abfb92f6ec716fd815cea85 Mon Sep 17 00:00:00 2001 From: plkmo Date: Wed, 11 Mar 2020 15:46:17 +0800 Subject: [PATCH] updated DGI for clustering --- classify.py | 3 +- cluster.py | 39 ++++++++ nlptoolkit/clustering/models/DGI/DGI.py | 39 +++++--- nlptoolkit/clustering/models/DGI/__init__.py | 2 +- .../models/DGI/preprocessing_funcs.py | 4 +- .../clustering/models/DGI/train_funcs.py | 70 ++++++++----- nlptoolkit/clustering/models/DGI/trainer.py | 98 +++++-------------- 7 files changed, 138 insertions(+), 117 deletions(-) create mode 100644 cluster.py diff --git a/classify.py b/classify.py index 80d0ca2..8e725e8 100644 --- a/classify.py +++ b/classify.py @@ -55,7 +55,8 @@ \n2: XLNet, \n3: Graph Attention Network (GAT)) \n4: ALBERT - \n5: XLMRoBERTa''') + \n5: XLMRoBERTa + \n6: GIN''') parser.add_argument("--train", type=int, default=1, help="Train model on dataset") parser.add_argument("--infer", type=int, default=1, help="Infer input sentence labels from trained model") diff --git a/cluster.py b/cluster.py new file mode 100644 index 0000000..a57fbd2 --- /dev/null +++ b/cluster.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Mar 11 09:48:49 2020 + +@author: weetee +""" +from nlptoolkit.utils.misc import save_as_pickle +import logging +from argparse import ArgumentParser + +logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \ + datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) +logger = logging.getLogger('__file__') + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--train_data", type=str, default="./data/train.csv", \ + help="training data csv file path") + parser.add_argument("--max_vocab_len", type=int, default=7000, help="GCN encoder: Max vocab size to consider based on top frequency tokens") + parser.add_argument("--hidden_size_1", type=int, default=330, help="Size of first GCN encoder hidden weights") + parser.add_argument("--batch_size", type=int, default=32, help="Training batch size") + parser.add_argument("--gradient_acc_steps", type=int, default=2, help="No. 
of steps of gradient accumulation") + parser.add_argument("--max_norm", type=float, default=1.0, help="Clipped gradient norm") + parser.add_argument("--num_epochs", type=int, default=7000, help="No of epochs") + parser.add_argument("--lr", type=float, default=0.001, help="learning rate") + parser.add_argument("--model_no", type=int, default=0, help='''Model ID: (0: Deep Graph Infomax (DGI)), + ''') + + parser.add_argument("--train", type=int, default=1, help="Train model on dataset") + parser.add_argument("--infer", type=int, default=1, help="Infer input sentence labels from trained model") + args = parser.parse_args() + save_as_pickle("args.pkl", args) + + if args.train: + if args.model_no == 0: + from nlptoolkit.clustering.models.DGI.trainer import train_and_fit + + output = train_and_fit(args) \ No newline at end of file diff --git a/nlptoolkit/clustering/models/DGI/DGI.py b/nlptoolkit/clustering/models/DGI/DGI.py index 4bd8964..3790262 100644 --- a/nlptoolkit/clustering/models/DGI/DGI.py +++ b/nlptoolkit/clustering/models/DGI/DGI.py @@ -9,11 +9,8 @@ import torch.nn.functional as F class GCN(nn.Module): - def __init__(self, X_size, A_hat, cuda, args, bias=True): # X_size = num features + def __init__(self, X_size, args, bias=True): # X_size = num features super(GCN, self).__init__() - self.A_hat = torch.tensor(A_hat, requires_grad=False).float() - if cuda: - self.A_hat = self.A_hat.cuda() self.weight = nn.parameter.Parameter(torch.zeros(size=(X_size, args.hidden_size_1))) var = 2./(self.weight.size(1) + self.weight.size(0)) self.weight.data.normal_(0, var) @@ -24,19 +21,39 @@ def __init__(self, X_size, A_hat, cuda, args, bias=True): # X_size = num feature else: self.register_parameter("bias", None) - def forward(self, X): ### 1-layer GCN architecture + def forward(self, X, A_hat): ### 1-layer GCN architecture X = torch.mm(X, self.weight) if self.bias is not None: X = (X + self.bias) - X = F.relu(torch.mm(self.A_hat, X)) + X = F.relu(torch.mm(A_hat, X)) return X class DGI(nn.Module): - def __init__(self, X_size, A_hat, cuda, args, bias=True): + def __init__(self, X_size, args, bias=True): super(DGI, self).__init__() - self.encoder = GCN(X_size, A_hat, cuda, args, bias=bias) - self.D_weight = nn.parameter.Parameter(torch.zeros(size=(X_size, X_size))) # nodes X features + self.encoder = GCN(X_size, args, bias=bias) + self.D_weight = nn.parameter.Parameter(torch.zeros(size=(args.hidden_size_1,\ + args.hidden_size_1))) # features X features def summarize_patch(self, X): - X = torch.sigmoid(X.mean(dim=1)) - return X \ No newline at end of file + X = torch.sigmoid(X.mean(dim=0)) + return X + + def forward(self, X, A_hat, X_c): + X = self.encoder(X, A_hat) # nodes X features + X_c = self.encoder(X_c, A_hat) # nodes X features + s = self.summarize_patch(X) # s = features + fac = torch.mm(self.D_weight, s.unsqueeze(-1)) # fac = features X 1 + + pos_D = [] + for i in range(X.shape[0]): + pos_d_i = torch.sigmoid(torch.mm(X[i, :].unsqueeze(0), fac)) + pos_D.append(pos_d_i) + pos_D = torch.tensor(pos_D, requires_grad=True) + + neg_D = [] + for i in range(X_c.shape[0]): + neg_d_i = torch.sigmoid(torch.mm(X_c[i, :].unsqueeze(0), fac)) + neg_D.append(neg_d_i) + neg_D = torch.tensor(neg_D, requires_grad=True) + return pos_D, neg_D \ No newline at end of file diff --git a/nlptoolkit/clustering/models/DGI/__init__.py b/nlptoolkit/clustering/models/DGI/__init__.py index bd4380d..a6220c1 100644 --- a/nlptoolkit/clustering/models/DGI/__init__.py +++ b/nlptoolkit/clustering/models/DGI/__init__.py @@ -1,4 +1,4 
@@ from . import preprocessing_funcs from . import trainer from . import train_funcs -from . import GCN +from . import DGI diff --git a/nlptoolkit/clustering/models/DGI/preprocessing_funcs.py b/nlptoolkit/clustering/models/DGI/preprocessing_funcs.py index 5151ae7..62caaee 100644 --- a/nlptoolkit/clustering/models/DGI/preprocessing_funcs.py +++ b/nlptoolkit/clustering/models/DGI/preprocessing_funcs.py @@ -60,12 +60,10 @@ def word_word_edges(p_ij): word_word.append((w1,w2,{"weight":p_ij.loc[w1,w2]})) return word_word -def generate_text_graph(train_data, infer_data, max_vocab_len, window=10): +def generate_text_graph(train_data, max_vocab_len, window=10): """ generates graph based on text corpus (columns = (text, label)); window = sliding window size to calculate point-wise mutual information between words """ logger.info("Preparing data...") df = pd.read_csv(train_data) - #infer_idx_start = len(df) - #df = pd.concat((df, pd.read_csv(infer_data)), ignore_index=True) df.dropna(inplace=True) stopwords = list(set(nltk.corpus.stopwords.words("english"))) diff --git a/nlptoolkit/clustering/models/DGI/train_funcs.py b/nlptoolkit/clustering/models/DGI/train_funcs.py index fe3986d..dc940ac 100644 --- a/nlptoolkit/clustering/models/DGI/train_funcs.py +++ b/nlptoolkit/clustering/models/DGI/train_funcs.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd import torch +import torch.nn as nn from .preprocessing_funcs import load_pickle, save_as_pickle, generate_text_graph import logging @@ -16,8 +17,42 @@ datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) logger = logging.getLogger(__file__) +def get_X_A_hat(G, corrupt=False): + A = nx.to_numpy_matrix(G, weight="weight") + A = A + np.eye(G.number_of_nodes()) + X = np.eye(G.number_of_nodes()) # Features are just identity matrix + + if corrupt: + np.random.shuffle(X) + + degrees = [] + for d in G.degree(weight=None): + if d == 0: + degrees.append(0) + else: + degrees.append(d[1]**(-0.5)) + degrees = np.diag(degrees) + A_hat = degrees@A@degrees + + X = torch.from_numpy(X).float() + A_hat = torch.tensor(A_hat).float() + return X, A_hat + +class JSdiv_Loss(nn.Module): + def __init__(self): + super(JSdiv_Loss, self).__init__() + self.BCE_pos = nn.BCELoss(reduction='mean') + self.BCE_neg = nn.BCELoss(reduction='mean') + + def forward(self, D_pos, D_neg): + label_pos = torch.ones(D_pos.shape[0]) + label_neg = torch.zeros(D_neg.shape[0]) + pos_loss = self.BCE_pos(D_pos, label_pos) + neg_loss = self.BCE_neg(D_neg, label_neg) + total_loss = 0.5*(pos_loss + neg_loss) + return total_loss -def load_datasets(args, train_test_split=0): +def load_datasets(args): """Loads dataset and graph if exists, else create and process them from raw data Returns ---> f: torch tensor input of GCN (Identity matrix) @@ -33,28 +68,13 @@ def load_datasets(args, train_test_split=0): graph_path = "./data/text_graph.pkl" if not os.path.isfile(df_data_path) or not os.path.isfile(graph_path): logger.info("Building datasets and graph from raw data... 
Note this will take quite a while...") - generate_text_graph(args.train_data, args.infer_data, args.max_vocab_len) + generate_text_graph(args.train_data, args.max_vocab_len) G_dict = load_pickle("text_graph.pkl") G = G_dict["graph"] del G_dict - - logger.info("Building adjacency and degree matrices...") - A = nx.to_numpy_matrix(G, weight="weight"); A = A + np.eye(G.number_of_nodes()) - degrees = [] - for d in G.degree(weight=None): - if d == 0: - degrees.append(0) - else: - degrees.append(d[1]**(-0.5)) - degrees = np.diag(degrees) - X = np.eye(G.number_of_nodes()) # Features are just identity matrix - A_hat = degrees@A@degrees - f = X # (n X n) X (n X n) x (n X n) X (n X n) input of net - - f = torch.from_numpy(f).float() - - return f, X, A_hat + + return G def load_state(net, optimizer, scheduler, model_no=0, load_best=False): """ Loads saved model and optimizer states if exists """ @@ -81,16 +101,12 @@ def load_state(net, optimizer, scheduler, model_no=0, load_best=False): def load_results(model_no=0): """ Loads saved results if exists """ losses_path = "./data/test_losses_per_epoch_%d.pkl" % model_no - accuracy_path = "./data/test_accuracy_per_epoch_%d.pkl" % model_no - train_accuracy_path = "./data/train_accuracy_per_epoch_%d.pkl" % model_no - if os.path.isfile(losses_path) and os.path.isfile(accuracy_path) and os.path.isfile(train_accuracy_path): - losses_per_epoch = load_pickle("test_losses_per_epoch_%d.pkl" % model_no) - accuracy_per_epoch = load_pickle("test_accuracy_per_epoch_%d.pkl" % model_no) - train_accuracy_per_epoch = load_pickle("train_accuracy_per_epoch_%d.pkl" % model_no) + if os.path.isfile(losses_path): + losses_per_epoch = load_pickle("train_losses_per_epoch_%d.pkl" % model_no) logger.info("Loaded results buffer") else: - losses_per_epoch, train_accuracy_per_epoch, accuracy_per_epoch = [], [], [] - return losses_per_epoch, train_accuracy_per_epoch, accuracy_per_epoch + losses_per_epoch = [] + return losses_per_epoch def evaluate(output, labels_e): if len(labels_e) == 0: diff --git a/nlptoolkit/clustering/models/DGI/trainer.py b/nlptoolkit/clustering/models/DGI/trainer.py index bfacc6e..58f2eb5 100644 --- a/nlptoolkit/clustering/models/DGI/trainer.py +++ b/nlptoolkit/clustering/models/DGI/trainer.py @@ -7,10 +7,10 @@ import os import numpy as np import torch -import torch.nn as nn import torch.optim as optim -from .train_funcs import load_datasets, load_state, load_results, evaluate, infer -from .GCN import gcn +from .train_funcs import load_datasets, get_X_A_hat, JSdiv_Loss,\ + load_state, load_results, infer +from .DGI import DGI from .preprocessing_funcs import load_pickle, save_as_pickle import matplotlib.pyplot as plt import logging @@ -22,63 +22,45 @@ def train_and_fit(args): cuda = torch.cuda.is_available() - f, X, A_hat, selected, labels_selected, labels_not_selected, test_idxs = load_datasets(args, train_test_split=args.train_test_split) - targets = torch.tensor(labels_selected).long() + G = load_datasets(args) + X, A_hat = get_X_A_hat(G, corrupt=False) #print(labels_selected, labels_not_selected) - net = gcn(X.shape[1], A_hat, cuda, args) - criterion = nn.CrossEntropyLoss() + net = DGI(X.shape[1], args) + criterion = JSdiv_Loss() optimizer = optim.Adam(net.parameters(), lr=args.lr) scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1000,2000,3000,4000,5000,6000], gamma=0.77) start_epoch, best_pred = load_state(net, optimizer, scheduler, model_no=args.model_no, load_best=False) - losses_per_epoch, evaluation_trained, evaluation_untrained = 
load_results(model_no=args.model_no) + losses_per_epoch = load_results(model_no=args.model_no) if cuda: net.cuda() optimizer = optim.Adam(net.parameters(), lr=args.lr) - f = f.cuda() - targets = targets.cuda() logger.info("Starting training process...") net.train() for e in range(start_epoch, args.num_epochs): optimizer.zero_grad() - output = net(f) - loss = criterion(output[selected], targets) + X, A_hat = get_X_A_hat(G, corrupt=False) + X_c, _ = get_X_A_hat(G, corrupt=True) + + if cuda: + X, A_hat, X_c = X.cuda(), A_hat.cuda(), X_c.cuda() + + pos_D, neg_D = net(X, A_hat, X_c) + loss = criterion(pos_D, neg_D) losses_per_epoch.append(loss.item()) loss.backward() optimizer.step() - if e % 50 == 0: - #print(output[selected]); print(targets) - ### Evaluate other untrained nodes and check accuracy of labelling - net.eval() - with torch.no_grad(): - pred_labels = net(f) - trained_accuracy = evaluate(pred_labels[selected], labels_selected); untrained_accuracy = evaluate(pred_labels[test_idxs], labels_not_selected) - evaluation_trained.append((e, trained_accuracy)); evaluation_untrained.append((e, untrained_accuracy)) - print("[Epoch %d]: Evaluation accuracy of trained nodes: %.7f" % (e, trained_accuracy)) - print("[Epoch %d]: Evaluation accuracy of test nodes: %.7f" % (e, untrained_accuracy)) - print("[Epoch %d]: Loss: %.7f" % (e, losses_per_epoch[-1])) - print("Labels of trained nodes: \n", output[selected].max(1)[1]) - net.train() - if trained_accuracy > best_pred: - best_pred = trained_accuracy - torch.save({ - 'epoch': e + 1,\ - 'state_dict': net.state_dict(),\ - 'best_acc': trained_accuracy,\ - 'optimizer' : optimizer.state_dict(),\ - 'scheduler' : scheduler.state_dict(),\ - }, os.path.join("./data/" ,\ - "test_model_best_%d.pth.tar" % args.model_no)) - if (e % 250) == 0: + + if (e % 50) == 0: + print('[Epoch: %d] total loss: %.3f' % + (e + 1, losses_per_epoch[-1])) save_as_pickle("test_losses_per_epoch_%d.pkl" % args.model_no, losses_per_epoch) - save_as_pickle("test_accuracy_per_epoch_%d.pkl" % args.model_no, evaluation_untrained) - save_as_pickle("train_accuracy_per_epoch_%d.pkl" % args.model_no, evaluation_trained) torch.save({ 'epoch': e + 1,\ 'state_dict': net.state_dict(),\ - 'best_acc': trained_accuracy,\ + 'best_acc': losses_per_epoch[-1],\ 'optimizer' : optimizer.state_dict(),\ 'scheduler' : scheduler.state_dict(),\ }, os.path.join("./data/",\ @@ -86,10 +68,7 @@ def train_and_fit(args): scheduler.step() logger.info("Finished training!") - evaluation_trained = np.array(evaluation_trained); evaluation_untrained = np.array(evaluation_untrained) - save_as_pickle("test_losses_per_epoch_%d_final.pkl" % args.model_no, losses_per_epoch) - save_as_pickle("train_accuracy_per_epoch_%d_final.pkl" % args.model_no, evaluation_trained) - save_as_pickle("test_accuracy_per_epoch_%d_final.pkl" % args.model_no, evaluation_untrained) + save_as_pickle("train_losses_per_epoch_%d_final.pkl" % args.model_no, losses_per_epoch) fig = plt.figure(figsize=(13,13)) ax = fig.add_subplot(111) @@ -98,34 +77,5 @@ def train_and_fit(args): ax.set_ylabel("Loss", fontsize=15) ax.set_title("Loss vs Epoch", fontsize=20) plt.savefig(os.path.join("./data/", "loss_vs_epoch_%d.png" % args.model_no)) - - fig = plt.figure(figsize=(13,13)) - ax = fig.add_subplot(111) - ax.scatter(evaluation_trained[:,0], evaluation_trained[:,1]) - ax.set_xlabel("Epoch", fontsize=15) - ax.set_ylabel("Accuracy on trained nodes", fontsize=15) - ax.set_title("Accuracy (trained nodes) vs Epoch", fontsize=20) - plt.savefig(os.path.join("./data/", 
"trained_accuracy_vs_epoch_%d.png" % args.model_no)) - - if len(labels_not_selected) > 0: - fig = plt.figure(figsize=(13,13)) - ax = fig.add_subplot(111) - ax.scatter(evaluation_untrained[:,0], evaluation_untrained[:,1]) - ax.set_xlabel("Epoch", fontsize=15) - ax.set_ylabel("Accuracy on untrained nodes", fontsize=15) - ax.set_title("Accuracy (untrained nodes) vs Epoch", fontsize=20) - plt.savefig(os.path.join("./data/", "untrained_accuracy_vs_epoch_%d.png" % args.model_no)) - - fig = plt.figure(figsize=(13,13)) - ax = fig.add_subplot(111) - ax.scatter(evaluation_trained[:,0], evaluation_trained[:,1], c="red", marker="v", \ - label="Trained Nodes") - ax.scatter(evaluation_untrained[:,0], evaluation_untrained[:,1], c="blue", marker="o",\ - label="Untrained Nodes") - ax.set_xlabel("Epoch", fontsize=15) - ax.set_ylabel("Accuracy", fontsize=15) - ax.set_title("Accuracy vs Epoch", fontsize=20) - ax.legend(fontsize=20) - plt.savefig(os.path.join("./data/", "combined_plot_accuracy_vs_epoch_%d.png" % args.model_no)) - - infer(f, test_idxs, net) \ No newline at end of file + return net + \ No newline at end of file