updated DGI for clustering
plkmo committed Mar 11, 2020
1 parent a9fccb6 commit 66a8226
Showing 7 changed files with 138 additions and 117 deletions.
3 changes: 2 additions & 1 deletion classify.py
@@ -55,7 +55,8 @@
\n2: XLNet,
\n3: Graph Attention Network (GAT))
\n4: ALBERT
\n5: XLMRoBERTa''')
\n5: XLMRoBERTa
\n6: GIN''')

parser.add_argument("--train", type=int, default=1, help="Train model on dataset")
parser.add_argument("--infer", type=int, default=1, help="Infer input sentence labels from trained model")
39 changes: 39 additions & 0 deletions cluster.py
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 11 09:48:49 2020
@author: weetee
"""
from nlptoolkit.utils.misc import save_as_pickle
import logging
from argparse import ArgumentParser

logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \
datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
logger = logging.getLogger('__file__')

if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--train_data", type=str, default="./data/train.csv", \
help="training data csv file path")
parser.add_argument("--max_vocab_len", type=int, default=7000, help="GCN encoder: Max vocab size to consider based on top frequency tokens")
parser.add_argument("--hidden_size_1", type=int, default=330, help="Size of first GCN encoder hidden weights")
parser.add_argument("--batch_size", type=int, default=32, help="Training batch size")
parser.add_argument("--gradient_acc_steps", type=int, default=2, help="No. of steps of gradient accumulation")
parser.add_argument("--max_norm", type=float, default=1.0, help="Clipped gradient norm")
parser.add_argument("--num_epochs", type=int, default=7000, help="No of epochs")
parser.add_argument("--lr", type=float, default=0.001, help="learning rate")
parser.add_argument("--model_no", type=int, default=0, help='''Model ID: (0: Deep Graph Infomax (DGI)),
''')

parser.add_argument("--train", type=int, default=1, help="Train model on dataset")
parser.add_argument("--infer", type=int, default=1, help="Infer input sentence labels from trained model")
args = parser.parse_args()
save_as_pickle("args.pkl", args)

if args.train:
if args.model_no == 0:
from nlptoolkit.clustering.models.DGI.trainer import train_and_fit

output = train_and_fit(args)
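
For reference, the new entry point can also be driven programmatically instead of via the command line. A minimal sketch (not part of this commit; field names and defaults are taken from the parser above, and the imports mirror cluster.py):

from argparse import Namespace
from nlptoolkit.utils.misc import save_as_pickle
from nlptoolkit.clustering.models.DGI.trainer import train_and_fit

# Mirror the defaults of cluster.py's ArgumentParser
args = Namespace(train_data="./data/train.csv", max_vocab_len=7000,
                 hidden_size_1=330, batch_size=32, gradient_acc_steps=2,
                 max_norm=1.0, num_epochs=7000, lr=0.001, model_no=0,
                 train=1, infer=1)
save_as_pickle("args.pkl", args)   # cluster.py saves the args before training
net = train_and_fit(args)          # returns the trained DGI network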
39 changes: 28 additions & 11 deletions nlptoolkit/clustering/models/DGI/DGI.py
@@ -9,11 +9,8 @@
import torch.nn.functional as F

class GCN(nn.Module):
def __init__(self, X_size, A_hat, cuda, args, bias=True): # X_size = num features
def __init__(self, X_size, args, bias=True): # X_size = num features
super(GCN, self).__init__()
self.A_hat = torch.tensor(A_hat, requires_grad=False).float()
if cuda:
self.A_hat = self.A_hat.cuda()
self.weight = nn.parameter.Parameter(torch.zeros(size=(X_size, args.hidden_size_1)))
var = 2./(self.weight.size(1) + self.weight.size(0))
self.weight.data.normal_(0, var)
@@ -24,19 +21,39 @@ def __init__(self, X_size, A_hat, cuda, args, bias=True): # X_size = num feature
else:
self.register_parameter("bias", None)

def forward(self, X): ### 1-layer GCN architecture
def forward(self, X, A_hat): ### 1-layer GCN architecture
X = torch.mm(X, self.weight)
if self.bias is not None:
X = (X + self.bias)
X = F.relu(torch.mm(self.A_hat, X))
X = F.relu(torch.mm(A_hat, X))
return X

class DGI(nn.Module):
def __init__(self, X_size, A_hat, cuda, args, bias=True):
def __init__(self, X_size, args, bias=True):
super(DGI, self).__init__()
self.encoder = GCN(X_size, A_hat, cuda, args, bias=bias)
self.D_weight = nn.parameter.Parameter(torch.zeros(size=(X_size, X_size))) # nodes X features
self.encoder = GCN(X_size, args, bias=bias)
self.D_weight = nn.parameter.Parameter(torch.zeros(size=(args.hidden_size_1,\
args.hidden_size_1))) # features X features

def summarize_patch(self, X):
X = torch.sigmoid(X.mean(dim=1))
return X
X = torch.sigmoid(X.mean(dim=0))
return X

def forward(self, X, A_hat, X_c):
X = self.encoder(X, A_hat) # nodes X features
X_c = self.encoder(X_c, A_hat) # nodes X features
s = self.summarize_patch(X) # s = features
fac = torch.mm(self.D_weight, s.unsqueeze(-1)) # fac = features X 1

pos_D = []
for i in range(X.shape[0]):
pos_d_i = torch.sigmoid(torch.mm(X[i, :].unsqueeze(0), fac))
pos_D.append(pos_d_i)
pos_D = torch.tensor(pos_D, requires_grad=True)

neg_D = []
for i in range(X_c.shape[0]):
neg_d_i = torch.sigmoid(torch.mm(X_c[i, :].unsqueeze(0), fac))
neg_D.append(neg_d_i)
neg_D = torch.tensor(neg_D, requires_grad=True)
return pos_D, neg_D
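
The per-node loops above implement the bilinear discriminator from Deep Graph Infomax, D(h_i, s) = sigmoid(h_i^T W s), scoring each node embedding against the graph summary s. Note that rebuilding the scores with torch.tensor(list, requires_grad=True) detaches them from the encoder's computation graph, so a vectorized form is a common alternative. A sketch only, not part of this commit:

import torch

def discriminate(H, W, s):
    # H: nodes x features (encoder output), W: features x features, s: features (summary vector)
    fac = torch.mm(W, s.unsqueeze(-1))                   # features x 1
    return torch.sigmoid(torch.mm(H, fac)).squeeze(-1)   # one score per node, gradients intact

# pos_D = discriminate(X, self.D_weight, s); neg_D = discriminate(X_c, self.D_weight, s)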
2 changes: 1 addition & 1 deletion nlptoolkit/clustering/models/DGI/__init__.py
@@ -1,4 +1,4 @@
from . import preprocessing_funcs
from . import trainer
from . import train_funcs
from . import GCN
from . import DGI
4 changes: 1 addition & 3 deletions nlptoolkit/clustering/models/DGI/preprocessing_funcs.py
@@ -60,12 +60,10 @@ def word_word_edges(p_ij):
word_word.append((w1,w2,{"weight":p_ij.loc[w1,w2]}))
return word_word

def generate_text_graph(train_data, infer_data, max_vocab_len, window=10):
def generate_text_graph(train_data, max_vocab_len, window=10):
""" generates graph based on text corpus (columns = (text, label)); window = sliding window size to calculate point-wise mutual information between words """
logger.info("Preparing data...")
df = pd.read_csv(train_data)
#infer_idx_start = len(df)
#df = pd.concat((df, pd.read_csv(infer_data)), ignore_index=True)
df.dropna(inplace=True)

stopwords = list(set(nltk.corpus.stopwords.words("english")))
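
For context, the word-word edge weights p_ij referred to above are point-wise mutual information scores computed from sliding-window co-occurrence counts. Roughly, as a sketch (assuming p_ij and p_i denote window-level co-occurrence and occurrence frequencies, as the docstring suggests):

import numpy as np

def pmi(p_ij, p_i, p_j):
    # PMI(w1, w2) = log( p(w1, w2) / (p(w1) * p(w2)) )
    return np.log(p_ij / (p_i * p_j))

# Word pairs with non-positive PMI are typically dropped, so only positively associated words share an edge.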
70 changes: 43 additions & 27 deletions nlptoolkit/clustering/models/DGI/train_funcs.py
@@ -9,15 +9,50 @@
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from .preprocessing_funcs import load_pickle, save_as_pickle, generate_text_graph
import logging

logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \
datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
logger = logging.getLogger(__file__)

def get_X_A_hat(G, corrupt=False):
A = nx.to_numpy_matrix(G, weight="weight")
A = A + np.eye(G.number_of_nodes())
X = np.eye(G.number_of_nodes()) # Features are just identity matrix

if corrupt:
np.random.shuffle(X)

degrees = []
for d in G.degree(weight=None):
if d == 0:
degrees.append(0)
else:
degrees.append(d[1]**(-0.5))
degrees = np.diag(degrees)
A_hat = degrees@A@degrees

X = torch.from_numpy(X).float()
A_hat = torch.tensor(A_hat).float()
return X, A_hat

class JSdiv_Loss(nn.Module):
def __init__(self):
super(JSdiv_Loss, self).__init__()
self.BCE_pos = nn.BCELoss(reduction='mean')
self.BCE_neg = nn.BCELoss(reduction='mean')

def forward(self, D_pos, D_neg):
label_pos = torch.ones(D_pos.shape[0])
label_neg = torch.zeros(D_neg.shape[0])
pos_loss = self.BCE_pos(D_pos, label_pos)
neg_loss = self.BCE_neg(D_neg, label_neg)
total_loss = 0.5*(pos_loss + neg_loss)
return total_loss

def load_datasets(args, train_test_split=0):
def load_datasets(args):
"""Loads dataset and graph if exists, else create and process them from raw data
Returns --->
f: torch tensor input of GCN (Identity matrix)
@@ -33,28 +68,13 @@ def load_state(net, optimizer, scheduler, model_no=0, load_best=False):
graph_path = "./data/text_graph.pkl"
if not os.path.isfile(df_data_path) or not os.path.isfile(graph_path):
logger.info("Building datasets and graph from raw data... Note this will take quite a while...")
generate_text_graph(args.train_data, args.infer_data, args.max_vocab_len)
generate_text_graph(args.train_data, args.max_vocab_len)

G_dict = load_pickle("text_graph.pkl")
G = G_dict["graph"]
del G_dict

logger.info("Building adjacency and degree matrices...")
A = nx.to_numpy_matrix(G, weight="weight"); A = A + np.eye(G.number_of_nodes())
degrees = []
for d in G.degree(weight=None):
if d == 0:
degrees.append(0)
else:
degrees.append(d[1]**(-0.5))
degrees = np.diag(degrees)
X = np.eye(G.number_of_nodes()) # Features are just identity matrix
A_hat = degrees@A@degrees
f = X # (n X n) X (n X n) x (n X n) X (n X n) input of net

f = torch.from_numpy(f).float()

return f, X, A_hat

return G

def load_state(net, optimizer, scheduler, model_no=0, load_best=False):
""" Loads saved model and optimizer states if exists """
@@ -81,16 +101,12 @@ def load_state(net, optimizer, scheduler, model_no=0, load_best=False):
def load_results(model_no=0):
""" Loads saved results if exists """
losses_path = "./data/test_losses_per_epoch_%d.pkl" % model_no
accuracy_path = "./data/test_accuracy_per_epoch_%d.pkl" % model_no
train_accuracy_path = "./data/train_accuracy_per_epoch_%d.pkl" % model_no
if os.path.isfile(losses_path) and os.path.isfile(accuracy_path) and os.path.isfile(train_accuracy_path):
losses_per_epoch = load_pickle("test_losses_per_epoch_%d.pkl" % model_no)
accuracy_per_epoch = load_pickle("test_accuracy_per_epoch_%d.pkl" % model_no)
train_accuracy_per_epoch = load_pickle("train_accuracy_per_epoch_%d.pkl" % model_no)
if os.path.isfile(losses_path):
losses_per_epoch = load_pickle("train_losses_per_epoch_%d.pkl" % model_no)
logger.info("Loaded results buffer")
else:
losses_per_epoch, train_accuracy_per_epoch, accuracy_per_epoch = [], [], []
return losses_per_epoch, train_accuracy_per_epoch, accuracy_per_epoch
losses_per_epoch = []
return losses_per_epoch

def evaluate(output, labels_e):
if len(labels_e) == 0:
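
get_X_A_hat above follows the usual GCN preprocessing: identity node features, self-loops added to the adjacency, and symmetric normalization A_hat = D^(-1/2) (A + I) D^(-1/2), with the corrupted view obtained by row-shuffling the features. A toy check on a three-node path graph (a sketch, not part of this commit):

import networkx as nx
import numpy as np

G = nx.path_graph(3)                                     # nodes 0 - 1 - 2
A = nx.to_numpy_array(G) + np.eye(3)                     # adjacency with self-loops
degrees = np.diag([d ** -0.5 for _, d in G.degree()])    # D^(-1/2) from raw degrees, as in get_X_A_hat
A_hat = degrees @ A @ degrees                            # normalized adjacency fed to the GCN
X = np.eye(3)                                            # identity features, one row per node
X_c = X.copy(); np.random.shuffle(X_c)                   # corrupted view: row-shuffled features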
98 changes: 24 additions & 74 deletions nlptoolkit/clustering/models/DGI/trainer.py
@@ -7,10 +7,10 @@
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from .train_funcs import load_datasets, load_state, load_results, evaluate, infer
from .GCN import gcn
from .train_funcs import load_datasets, get_X_A_hat, JSdiv_Loss,\
load_state, load_results, infer
from .DGI import DGI
from .preprocessing_funcs import load_pickle, save_as_pickle
import matplotlib.pyplot as plt
import logging
@@ -22,74 +22,53 @@
def train_and_fit(args):
cuda = torch.cuda.is_available()

f, X, A_hat, selected, labels_selected, labels_not_selected, test_idxs = load_datasets(args, train_test_split=args.train_test_split)
targets = torch.tensor(labels_selected).long()
G = load_datasets(args)
X, A_hat = get_X_A_hat(G, corrupt=False)
#print(labels_selected, labels_not_selected)
net = gcn(X.shape[1], A_hat, cuda, args)
criterion = nn.CrossEntropyLoss()
net = DGI(X.shape[1], args)
criterion = JSdiv_Loss()
optimizer = optim.Adam(net.parameters(), lr=args.lr)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1000,2000,3000,4000,5000,6000], gamma=0.77)

start_epoch, best_pred = load_state(net, optimizer, scheduler, model_no=args.model_no, load_best=False)
losses_per_epoch, evaluation_trained, evaluation_untrained = load_results(model_no=args.model_no)
losses_per_epoch = load_results(model_no=args.model_no)

if cuda:
net.cuda()
optimizer = optim.Adam(net.parameters(), lr=args.lr)
f = f.cuda()
targets = targets.cuda()

logger.info("Starting training process...")
net.train()
for e in range(start_epoch, args.num_epochs):
optimizer.zero_grad()
output = net(f)
loss = criterion(output[selected], targets)
X, A_hat = get_X_A_hat(G, corrupt=False)
X_c, _ = get_X_A_hat(G, corrupt=True)

if cuda:
X, A_hat, X_c = X.cuda(), A_hat.cuda(), X_c.cuda()

pos_D, neg_D = net(X, A_hat, X_c)
loss = criterion(pos_D, neg_D)
losses_per_epoch.append(loss.item())
loss.backward()
optimizer.step()
if e % 50 == 0:
#print(output[selected]); print(targets)
### Evaluate other untrained nodes and check accuracy of labelling
net.eval()
with torch.no_grad():
pred_labels = net(f)
trained_accuracy = evaluate(pred_labels[selected], labels_selected); untrained_accuracy = evaluate(pred_labels[test_idxs], labels_not_selected)
evaluation_trained.append((e, trained_accuracy)); evaluation_untrained.append((e, untrained_accuracy))
print("[Epoch %d]: Evaluation accuracy of trained nodes: %.7f" % (e, trained_accuracy))
print("[Epoch %d]: Evaluation accuracy of test nodes: %.7f" % (e, untrained_accuracy))
print("[Epoch %d]: Loss: %.7f" % (e, losses_per_epoch[-1]))
print("Labels of trained nodes: \n", output[selected].max(1)[1])
net.train()
if trained_accuracy > best_pred:
best_pred = trained_accuracy
torch.save({
'epoch': e + 1,\
'state_dict': net.state_dict(),\
'best_acc': trained_accuracy,\
'optimizer' : optimizer.state_dict(),\
'scheduler' : scheduler.state_dict(),\
}, os.path.join("./data/" ,\
"test_model_best_%d.pth.tar" % args.model_no))
if (e % 250) == 0:

if (e % 50) == 0:
print('[Epoch: %d] total loss: %.3f' %
(e + 1, losses_per_epoch[-1]))
save_as_pickle("test_losses_per_epoch_%d.pkl" % args.model_no, losses_per_epoch)
save_as_pickle("test_accuracy_per_epoch_%d.pkl" % args.model_no, evaluation_untrained)
save_as_pickle("train_accuracy_per_epoch_%d.pkl" % args.model_no, evaluation_trained)
torch.save({
'epoch': e + 1,\
'state_dict': net.state_dict(),\
'best_acc': trained_accuracy,\
'best_acc': losses_per_epoch[-1],\
'optimizer' : optimizer.state_dict(),\
'scheduler' : scheduler.state_dict(),\
}, os.path.join("./data/",\
"test_checkpoint_%d.pth.tar" % args.model_no))
scheduler.step()

logger.info("Finished training!")
evaluation_trained = np.array(evaluation_trained); evaluation_untrained = np.array(evaluation_untrained)
save_as_pickle("test_losses_per_epoch_%d_final.pkl" % args.model_no, losses_per_epoch)
save_as_pickle("train_accuracy_per_epoch_%d_final.pkl" % args.model_no, evaluation_trained)
save_as_pickle("test_accuracy_per_epoch_%d_final.pkl" % args.model_no, evaluation_untrained)
save_as_pickle("train_losses_per_epoch_%d_final.pkl" % args.model_no, losses_per_epoch)

fig = plt.figure(figsize=(13,13))
ax = fig.add_subplot(111)
@@ -98,34 +77,5 @@ def train_and_fit(args):
ax.set_ylabel("Loss", fontsize=15)
ax.set_title("Loss vs Epoch", fontsize=20)
plt.savefig(os.path.join("./data/", "loss_vs_epoch_%d.png" % args.model_no))

fig = plt.figure(figsize=(13,13))
ax = fig.add_subplot(111)
ax.scatter(evaluation_trained[:,0], evaluation_trained[:,1])
ax.set_xlabel("Epoch", fontsize=15)
ax.set_ylabel("Accuracy on trained nodes", fontsize=15)
ax.set_title("Accuracy (trained nodes) vs Epoch", fontsize=20)
plt.savefig(os.path.join("./data/", "trained_accuracy_vs_epoch_%d.png" % args.model_no))

if len(labels_not_selected) > 0:
fig = plt.figure(figsize=(13,13))
ax = fig.add_subplot(111)
ax.scatter(evaluation_untrained[:,0], evaluation_untrained[:,1])
ax.set_xlabel("Epoch", fontsize=15)
ax.set_ylabel("Accuracy on untrained nodes", fontsize=15)
ax.set_title("Accuracy (untrained nodes) vs Epoch", fontsize=20)
plt.savefig(os.path.join("./data/", "untrained_accuracy_vs_epoch_%d.png" % args.model_no))

fig = plt.figure(figsize=(13,13))
ax = fig.add_subplot(111)
ax.scatter(evaluation_trained[:,0], evaluation_trained[:,1], c="red", marker="v", \
label="Trained Nodes")
ax.scatter(evaluation_untrained[:,0], evaluation_untrained[:,1], c="blue", marker="o",\
label="Untrained Nodes")
ax.set_xlabel("Epoch", fontsize=15)
ax.set_ylabel("Accuracy", fontsize=15)
ax.set_title("Accuracy vs Epoch", fontsize=20)
ax.legend(fontsize=20)
plt.savefig(os.path.join("./data/", "combined_plot_accuracy_vs_epoch_%d.png" % args.model_no))

infer(f, test_idxs, net)
return net
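
Since train_and_fit now returns the trained network, clustering (the point of this commit, per its message) can be run on the encoder outputs afterwards. One possible sketch, assuming scikit-learn is available and reusing the helpers above; net and args come from the cluster.py example earlier, and the number of clusters is arbitrary:

import torch
from sklearn.cluster import KMeans
from nlptoolkit.clustering.models.DGI.train_funcs import load_datasets, get_X_A_hat

G = load_datasets(args)
X, A_hat = get_X_A_hat(G, corrupt=False)
net = net.cpu().eval()                                  # keep everything on CPU for this sketch
with torch.no_grad():
    emb = net.encoder(X, A_hat).numpy()                 # nodes x hidden_size_1 DGI embeddings
clusters = KMeans(n_clusters=10).fit_predict(emb)       # assign each graph node to a cluster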
