
Commit

Merge github.com:plkmo/NLP_Toolkit
plkmo committed Jan 19, 2020
2 parents 353ef1f + 5f380f5 commit 28a2f8c
Showing 7 changed files with 37 additions and 20 deletions.
2 changes: 1 addition & 1 deletion nlptoolkit/classification/models/BERT/train_funcs.py
@@ -37,7 +37,7 @@ def load_dataloaders(args):
     test_set = sentiments(df_test, tokens_length=args.tokens_length, labels=False)
     test_loader = DataLoader(test_set, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=False)
     del df_train, df_test
-    return train_loader, test_loader, len(train_set)
+    return train_loader, test_loader, len(train_set), len(test_set)

 class sentiments(Dataset):
     def __init__(self, df, tokens_length=300, labels=True):
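For orientation, len(train_set) and len(test_set) in this hunk come from the sentiments Dataset defined just below it, whose __len__ reports the number of rows. A minimal sketch of that pattern follows; the class name, column names, and the assumption that rows already hold token-id lists are illustrative only, not the repo's implementation.

import torch
from torch.utils.data import Dataset

class SentimentsSketch(Dataset):
    # Minimal sketch of the sentiments Dataset pattern; the real class also handles
    # tokenization and padding. Column names here are assumptions.
    def __init__(self, df, tokens_length=300, labels=True):
        self.X = df["input_ids"].tolist()            # assumed column of token-id lists
        self.y = df["label"].tolist() if labels else None
        self.tokens_length = tokens_length

    def __len__(self):
        # len(train_set) / len(test_set) in the hunk above resolve to this.
        return len(self.X)

    def __getitem__(self, idx):
        x = torch.tensor(self.X[idx][:self.tokens_length], dtype=torch.long)
        if self.y is None:
            return x
        return x, torch.tensor(self.y[idx], dtype=torch.long)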
4 changes: 3 additions & 1 deletion nlptoolkit/classification/models/BERT/trainer.py
@@ -22,7 +22,9 @@
 def train_and_fit(args):
     cuda = torch.cuda.is_available()

-    train_loader, test_loader, train_len = load_dataloaders(args)
+    train_loader, test_loader, train_len, test_len = load_dataloaders(args)
+    logger.info("Training data points: %d" % train_len)
+    logger.info("Test data points: %d" % test_len)

     net = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=args.num_classes)
     if cuda:
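The added logger.info calls assume a module-level logger configured elsewhere in the file. A minimal setup that would emit these messages might look like the following; the format string and logger name are assumptions, not taken from the repo.

import logging

logging.basicConfig(
    format="%(asctime)s [%(levelname)s]: %(message)s",
    datefmt="%d/%m/%Y %I:%M:%S %p",
    level=logging.INFO,
)
logger = logging.getLogger(__file__)

# With the extended load_dataloaders return value, dataset sizes are logged up front:
# logger.info("Training data points: %d" % train_len)
# logger.info("Test data points: %d" % test_len)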
2 changes: 1 addition & 1 deletion nlptoolkit/classification/models/XLNet/train_funcs.py
@@ -37,7 +37,7 @@ def load_dataloaders(args):
     test_set = sentiments(df_test, tokens_length=args.tokens_length, labels=False)
     test_loader = DataLoader(test_set, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=False)
     del df_train, df_test
-    return train_loader, test_loader, len(train_set)
+    return train_loader, test_loader, len(train_set), len(test_set)

 class sentiments(Dataset):
     def __init__(self, df, tokens_length=300, labels=True):
4 changes: 3 additions & 1 deletion nlptoolkit/classification/models/XLNet/trainer.py
@@ -22,7 +22,9 @@
 def train_and_fit(args):
     cuda = torch.cuda.is_available()

-    train_loader, test_loader, train_len = load_dataloaders(args)
+    train_loader, test_loader, train_len, test_len = load_dataloaders(args)
+    logger.info("Training data points: %d" % train_len)
+    logger.info("Test data points: %d" % test_len)

     net = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=args.num_classes)
     if cuda:
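For context on the num_labels argument in the hunk above, here is a hedged inference sketch using the same checkpoint name. The transformers import path, the tokenizer class, and num_labels=2 (standing in for args.num_classes) are assumptions; the repo may load these classes from a different package version.

import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification

# Hypothetical inference pass, not the repo's evaluation code.
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
net = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
net.eval()

input_ids = tokenizer.encode("The plot was engaging from start to finish.", return_tensors="pt")
with torch.no_grad():
    logits = net(input_ids)[0]            # first element of the output is the class logits
probs = torch.softmax(logits, dim=-1)
print(probs)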
14 changes: 12 additions & 2 deletions nlptoolkit/summarization/preprocessing_funcs.py
@@ -194,10 +194,20 @@ def load_dataloaders(args):
     elif os.path.isfile(train_path):
         df = load_pickle("df_encoded.pkl")

-    trainset = text_dataset(df, args)
+    # Train-Test split
+    msk = np.random.rand(len(df)) < args.train_test_ratio
+    trainset = df[msk]
+    testset = df[~msk]
+
+    trainset = text_dataset(trainset, args)
     max_features_length = trainset.max_x_len
     max_seq_len = trainset.max_y_len
     train_length = len(trainset)
     train_loader = DataLoader(trainset, batch_size=args.batch_size, shuffle=True,\
                               num_workers=0, collate_fn=Pad_Sequence(), pin_memory=False)
-    return train_loader, train_length, max_features_length, max_seq_len
+
+    testset = text_dataset(testset, args)
+    test_length = len(testset)
+    test_loader = DataLoader(testset, batch_size=args.batch_size, shuffle=True,\
+                             num_workers=0, collate_fn=Pad_Sequence(), pin_memory=False)
+    return train_loader, train_length, max_features_length, max_seq_len, test_loader, test_length
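The split is a per-row random mask rather than a fixed cut, so the resulting sizes are only approximately train_test_ratio : (1 - train_test_ratio) and vary between runs unless a seed is fixed. A standalone sketch of the same idea on a toy dataframe (column names are placeholders):

import numpy as np
import pandas as pd

# Toy dataframe standing in for the encoded summarization dataframe.
df = pd.DataFrame({"body": list("abcdefghij"), "highlight": list("ABCDEFGHIJ")})
train_test_ratio = 0.9

# Each row goes to the training set independently with probability train_test_ratio;
# fix np.random.seed(0) beforehand for a reproducible split.
msk = np.random.rand(len(df)) < train_test_ratio
trainset, testset = df[msk], df[~msk]
print(len(trainset), len(testset))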
16 changes: 9 additions & 7 deletions nlptoolkit/summarization/trainer.py
@@ -23,7 +23,7 @@ def train_and_fit(args):

     cuda = torch.cuda.is_available()

-    train_loader, train_length, max_features_length, max_seq_len = load_dataloaders(args)
+    train_loader, train_length, max_features_length, max_seq_len, test_loader, test_length = load_dataloaders(args)

     if (args.level == "word") or (args.level == "char"):
         vocab = load_pickle("vocab.pkl")
@@ -34,6 +34,8 @@

logger.info("Max features length = %d %ss" % (max_features_length, args.level))
logger.info("Vocabulary size: %d" % vocab_size)
logger.info("Training data points: %d" % train_length)
logger.info("Test data points: %d" % test_length)

logger.info("Loading model and optimizers...")

@@ -98,9 +100,9 @@ def train_and_fit(args):
                                       (e, (i + 1)*args.batch_size, train_length, losses_per_batch[-1]))
                 total_loss = 0.0
         losses_per_epoch.append(sum(losses_per_batch)/len(losses_per_batch))
-        accuracy_per_epoch.append(evaluate_results(net, train_loader, cuda, None, None, args))
-        print("Losses at Epoch %d: %.7f" % (e, losses_per_epoch[-1]))
-        print("Accuracy at Epoch %d: %.7f" % (e, accuracy_per_epoch[-1]))
+        accuracy_per_epoch.append(evaluate_results(net, test_loader, cuda, None, None, args))
+        print("Training Losses at Epoch %d: %.7f" % (e, losses_per_epoch[-1]))
+        print("Test Accuracy at Epoch %d: %.7f" % (e, accuracy_per_epoch[-1]))

         if (args.level == "word") or (args.level == "char"):
             decode_outputs(outputs, labels, vocab.convert_idx2w, args)
@@ -126,15 +128,15 @@
     ax.scatter([i for i in range(len(losses_per_epoch))], losses_per_epoch)
     ax.set_xlabel("Epoch", fontsize=15)
     ax.set_ylabel("Loss", fontsize=15)
-    ax.set_title("Loss vs Epoch", fontsize=20)
+    ax.set_title("Training Loss vs Epoch", fontsize=20)
     plt.savefig(os.path.join("./data/",\
                              "test_loss_vs_epoch_%d.png" % args.model_no))

     fig = plt.figure(figsize=(13,13))
     ax = fig.add_subplot(111)
     ax.scatter([i for i in range(len(accuracy_per_epoch))], accuracy_per_epoch)
     ax.set_xlabel("Epoch", fontsize=15)
-    ax.set_ylabel("Accuracy", fontsize=15)
-    ax.set_title("Accuracy vs Epoch", fontsize=20)
+    ax.set_ylabel("Test Accuracy", fontsize=15)
+    ax.set_title("Test Accuracy vs Epoch", fontsize=20)
     plt.savefig(os.path.join("./data/",\
                              "test_Accuracy_vs_epoch_%d.png" % args.model_no))
15 changes: 8 additions & 7 deletions summarize.py
@@ -22,21 +22,22 @@
help="Full path to dailymail dataset (leave as empty string if none)")
parser.add_argument("--level", type=str, default="bpe", help="Level of tokenization (word, char or bpe)")
parser.add_argument("--bpe_word_ratio", type=float, default=0.7, help="Ratio of BPE to word vocab")
parser.add_argument("--bpe_vocab_size", type=int, default=7000, help="Size of bpe vocab if bpe is used")
parser.add_argument("--bpe_vocab_size", type=int, default=9000, help="Size of bpe vocab if bpe is used")
parser.add_argument("--max_features_length", type=int, default=1000, help="Max length of features (word, char or bpe level)")
parser.add_argument("--d_model", type=int, default=128, help="Transformer model dimension")
parser.add_argument("--ff_dim", type=int, default=128, help="Transformer Feed forward layer dimension")
parser.add_argument("--d_model", type=int, default=256, help="Transformer model dimension")
parser.add_argument("--ff_dim", type=int, default=256, help="Transformer Feed forward layer dimension")
parser.add_argument("--num", type=int, default=6, help="Transformer number of layers per block")
parser.add_argument("--n_heads", type=int, default=4, help="Transformer number of attention heads")
parser.add_argument("--LAS_embed_dim", type=int, default=256, help="LAS Embedding dimension")
parser.add_argument("--LAS_hidden_size", type=int, default=256, help="LAS listener hidden_size")
parser.add_argument("--LAS_embed_dim", type=int, default=512, help="LAS Embedding dimension")
parser.add_argument("--LAS_hidden_size", type=int, default=512, help="LAS listener hidden_size")
parser.add_argument("--batch_size", type=int, default=12, help="Batch size")
parser.add_argument("--fp16", type=int, default=0, help="1: use mixed precision ; 0: use floating point 32")
parser.add_argument("--train_test_ratio", type=float, default=0.9, help='Ratio for train-test split')
parser.add_argument("--num_epochs", type=int, default=8000, help="No of epochs")
parser.add_argument("--lr", type=float, default=0.0001, help="learning rate")
parser.add_argument("--gradient_acc_steps", type=int, default=8, help="Number of steps of gradient accumulation")
parser.add_argument("--gradient_acc_steps", type=int, default=5, help="Number of steps of gradient accumulation")
parser.add_argument("--max_norm", type=float, default=1.0, help="Clipped gradient norm")
parser.add_argument("--T_max", type=int, default=5000, help="number of iterations before LR restart")
parser.add_argument("--T_max", type=int, default=7000, help="number of iterations before LR restart")
parser.add_argument("--model_no", type=int, default=1, help="Model ID: 0 = Transformer, 1 = LAS")

parser.add_argument("--train", type=int, default=1, help="Train model on dataset")
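With the updated defaults, a typical run only needs to override the flags of interest; the dataset-path arguments truncated above this hunk are left at their defaults here. For example:

python summarize.py --level bpe --bpe_vocab_size 9000 --d_model 256 --ff_dim 256 \
    --batch_size 12 --train_test_ratio 0.9 --model_no 1 --train 1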

