diff --git a/README.md b/README.md
index 3e98549..935d52c 100644
--- a/README.md
+++ b/README.md
@@ -150,6 +150,7 @@ speech.py [-h]
 	[--num NUM (default: 6)]
 	[--n_heads N_HEADS(default: 4)]
 	[--batch_size BATCH_SIZE (default: 30)]
+	[--fp16 FP16 (default: 1)]
 	[--num_epochs NUM_EPOCHS (default: 8000)]
 	[--lr LR default=0.003]
 	[--gradient_acc_steps GRADIENT_ACC_STEPS (default: 4)]
@@ -187,7 +188,8 @@ summarize.py [-h]
 	[--n_heads N_HEADS(default: 4)]
 	[--LAS_embed_dim LAS_EMBED_DIM (default: 128)]
 	[--LAS_hidden_size LAS_HIDDEN_SIZE (default: 128)]
-	[--batch_size BATCH_SIZE (default: 30)]
+	[--batch_size BATCH_SIZE (default: 32)]
+	[--fp16 FP16 (default: 1)]
 	[--num_epochs NUM_EPOCHS (default: 8000)]
 	[--lr LR default=0.003]
 	[--gradient_acc_steps GRADIENT_ACC_STEPS (default: 4)]
@@ -199,6 +201,23 @@ summarize.py [-h]
 ```
 
+Or if used as a package:
+```python
+from nlptoolkit.utils.config import Config
+from nlptoolkit.summarization.trainer import train_and_fit
+from nlptoolkit.summarization.infer import infer_from_trained
+
+config = Config(task='summarization') # loads the default argument parameters listed above
+config.data_path = "./data/cnn_stories/cnn/stories/"
+config.batch_size = 32
+config.lr = 0.0001 # change learning rate
+config.model_no = 0 # set model as Transformer
+train_and_fit(config) # starts training with the configured parameters
+inferer = infer_from_trained(config) # initialize the inference object, which loads the trained model
+inferer.infer_from_input() # infer from user console input
+inferer.infer_from_file(in_file="./data/input.txt", out_file="./data/output.txt")
+```
+
 ---
 
 ## 4) Machine Translation
diff --git a/nlptoolkit/ASR/evaluate.py b/nlptoolkit/ASR/infer.py
similarity index 86%
rename from nlptoolkit/ASR/evaluate.py
rename to nlptoolkit/ASR/infer.py
index 0cbec42..852cda7 100644
--- a/nlptoolkit/ASR/evaluate.py
+++ b/nlptoolkit/ASR/infer.py
@@ -24,6 +24,33 @@
                     datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
 logger = logging.getLogger('__file__')
 
+class infer_from_trained(object):
+    def __init__(self, args=None):
+        if args is None:
+            self.args = load_pickle("args.pkl")
+        else:
+            self.args = args
+        self.cuda = torch.cuda.is_available()
+        self.args.batch_size = 1
+
+    def infer_sentence(self, sent):
+        return
+
+    def infer_from_input(self):
+        self.net.eval()
+        while True:
+            user_input = input("Type input sentence (Type \'exit' or \'quit' to quit):\n")
+            if user_input in ["exit", "quit"]:
+                break
+            predicted = self.infer_sentence(user_input)
+        return predicted
+
+    def infer_from_file(self, in_file="./data/input.txt", out_file="./data/output.txt"):
+        df = pd.read_csv(in_file, header=None, names=["sents"])
+        df['labels'] = df.progress_apply(lambda x: self.infer_sentence(x['sents']), axis=1)
+        df.to_csv(out_file, index=False)
+        logger.info("Done and saved as %s!" % out_file)
+        return
 
 def infer(file_path=None, speaker=None, pyTransformer=False):
     if pyTransformer:
diff --git a/nlptoolkit/ASR/models/LAS/LAS_model.py b/nlptoolkit/ASR/models/LAS/LAS_model.py
index 7e8d9bf..8a80d81 100644
--- a/nlptoolkit/ASR/models/LAS/LAS_model.py
+++ b/nlptoolkit/ASR/models/LAS/LAS_model.py
@@ -7,6 +7,7 @@
 
 import torch
 import torch.nn as nn
+import torch.optim as optim
 import torch.nn.functional as F
 import numpy as np
 
@@ -168,7 +169,7 @@ def forward(self, x, trg_input, infer=False):
         return x
 
     @classmethod
-    def load_model(cls, path):
+    def load_model(cls, path, args, cuda=True, amp=None):
         checkpoint = torch.load(path)
         model = cls(listener_input_size=checkpoint['listener_input_size'], \
                     listener_hidden_size=checkpoint['listener_hidden_size'], \
@@ -176,10 +177,22 @@ def load_model(cls, path):
                     max_label_len = 100)
         model.listener.flatten_parameters()
         #model.speller.flatten_parameters()
+        if cuda:
+            model.cuda()
+
+        if amp is not None:
+            optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
+            model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
+            amp.load_state_dict(checkpoint['amp'])
+            #optimizer.load_state_dict(checkpoint['optimizer']) # dynamic loss scaling spikes if we load this! waiting for fix from nvidia apex
+            print("Loaded amp state dict!")
+        else:
+            optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
+            optimizer.load_state_dict(checkpoint['optimizer'])
         model.load_state_dict(checkpoint['state_dict'])
-        return model
+        return model, optimizer
 
-    def save_state(self, epoch, optimizer, scheduler, best_acc, path):
+    def save_state(self, epoch, optimizer, scheduler, best_acc, path, amp=None):
         state = {
                 'epoch': epoch + 1,\
                 'state_dict': self.state_dict(),\
@@ -188,7 +201,8 @@ def save_state(self, epoch, optimizer, scheduler, best_acc, path):
                 'scheduler' : scheduler.state_dict(),\
                 'listener_input_size' : self.listener_input_size,\
                 'listener_hidden_size': self.listener_hidden_size,\
-                'output_class_dim': self.output_class_dim
+                'output_class_dim': self.output_class_dim,\
+                'amp': amp.state_dict() if amp is not None else None
                 }
         torch.save(state, path)
\ No newline at end of file
diff --git a/nlptoolkit/ASR/models/Transformer/transformer_model.py b/nlptoolkit/ASR/models/Transformer/transformer_model.py
index 3a87127..8e3b9a7 100644
--- a/nlptoolkit/ASR/models/Transformer/transformer_model.py
+++ b/nlptoolkit/ASR/models/Transformer/transformer_model.py
@@ -7,6 +7,7 @@
 
 import torch
 import torch.nn as nn
+import torch.optim as optim
 from torch.autograd import Variable
 import numpy as np
 import math
@@ -351,7 +352,7 @@ def forward(self, src, trg, src_mask, trg_mask=None, g_mask1=None, g_mask2=None,
         #return x
 
     @classmethod
-    def load_model(cls, path):
+    def load_model(cls, path, args, cuda=True, amp=None):
         checkpoint = torch.load(path)
         model = cls(src_vocab=checkpoint["src_vocab"], \
                     trg_vocab=checkpoint["trg_vocab"], \
@@ -362,10 +363,22 @@ def load_model(cls, path):
                     max_encoder_len=checkpoint["max_encoder_len"], \
                     max_decoder_len=checkpoint["max_decoder_len"], \
                     use_conv=True)
+        if cuda:
+            model.cuda()
+
+        if amp is not None:
+            optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
+            model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
+            amp.load_state_dict(checkpoint['amp'])
+            #optimizer.load_state_dict(checkpoint['optimizer']) # dynamic loss scaling spikes if we load this! waiting for fix from nvidia apex
+            print("Loaded amp state dict!")
+        else:
+            optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
+            optimizer.load_state_dict(checkpoint['optimizer'])
         model.load_state_dict(checkpoint['state_dict'])
-        return model
+        return model, optimizer
 
-    def save_state(self, epoch, optimizer, scheduler, best_acc, path):
+    def save_state(self, epoch, optimizer, scheduler, best_acc, path, amp=None):
         state = {
                 'epoch': epoch + 1,\
                 'state_dict': self.state_dict(),\
@@ -379,6 +392,7 @@ def save_state(self, epoch, optimizer, scheduler, best_acc, path):
                 'num': self.num,\
                 'n_heads': self.n_heads,\
                 'max_encoder_len': self.max_encoder_len,\
-                'max_decoder_len': self.max_decoder_len,
+                'max_decoder_len': self.max_decoder_len,\
+                'amp': amp.state_dict() if amp is not None else None
                 }
         torch.save(state, path)
\ No newline at end of file
diff --git a/nlptoolkit/ASR/train_funcs.py b/nlptoolkit/ASR/train_funcs.py
index d1e118a..3f8c03b 100644
--- a/nlptoolkit/ASR/train_funcs.py
+++ b/nlptoolkit/ASR/train_funcs.py
@@ -17,7 +17,7 @@
                     datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
 logger = logging.getLogger(__file__)
 
-def load_model_and_optimizer(args, vocab, max_features_length, max_seq_length, cuda, pyTransformer=False):
+def load_model_and_optimizer(args, vocab, max_features_length, max_seq_length, cuda, amp=None, pyTransformer=False):
 
     if pyTransformer:
         from .models.Transformer.py_Transformer import pyTransformer as SpeechTransformer, \
@@ -45,16 +45,23 @@ def load_model_and_optimizer(args, vocab, max_features_length, max_seq_length, c
             nn.init.xavier_uniform_(p)
 
     criterion = nn.CrossEntropyLoss(ignore_index=1) # ignore padding tokens
-    optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
+
+    net, optimizer, start_epoch, acc, loaded_opt = load_state(net, cuda, args, load_best=False, \
+                                                              amp=amp)
+
+    if cuda and (not loaded_opt):
+        net.cuda()
+
+    if (not loaded_opt):
+        optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
     scheduler = CosineWithRestarts(optimizer, T_max=args.T_max)
-    model = SpeechTransformer if (args.model_no == 0) else LAS
-    model, loaded_optimizer, loaded_scheduler, start_epoch, acc = load_state(model, args, load_best=False, load_scheduler=False)
+    if (args.fp16) and (not loaded_opt) and (amp is not None):
+        logger.info("Using fp16...")
+        net, optimizer = amp.initialize(net, optimizer, opt_level='O2')
+        scheduler = CosineWithRestarts(optimizer, T_max=args.T_max)
 
-    if start_epoch != 0:
-        net = model; optimizer = loaded_optimizer; scheduler = loaded_scheduler
-    if cuda:
-        net.cuda()
+    logger.info("Done setting up model, optimizer and scheduler.")
 
     if args.model_no == 0:
         '''
@@ -79,13 +86,13 @@ def load_model_and_optimizer(args, vocab, max_features_length, max_seq_length, c
     return net, criterion, optimizer, scheduler, start_epoch, acc, g_mask1, g_mask2
 
-def load_state(net, args, load_best=False, load_scheduler=False):
+def load_state(net, cuda, args, load_best=False, amp=None):
     """ Loads saved model and optimizer states if exists """
+    loaded_opt = False
     base_path = "./data/"
     checkpoint_path = os.path.join(base_path,"test_checkpoint_%d.pth.tar" % args.model_no)
     best_path = os.path.join(base_path,"test_model_best_%d.pth.tar" % args.model_no)
     start_epoch, best_pred, checkpoint = 0, 0, None
-    optimizer, scheduler = None, None
     if (load_best == True) and os.path.isfile(best_path):
         checkpoint = torch.load(best_path)
         logger.info("Loaded best model.")
@@ -96,16 +103,15 @@ def load_state(net, args, load_best=False, load_scheduler=False):
         start_epoch = checkpoint['epoch']
         best_pred = checkpoint['best_acc']
         if load_best:
-            net = net.load_model(best_path)
+            net, optimizer = net.load_model(best_path, args, cuda, amp)
         else:
-            net = net.load_model(checkpoint_path)
-        optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
-        scheduler = CosineWithRestarts(optimizer, T_max=args.T_max)
-        if load_scheduler:
-            optimizer.load_state_dict(checkpoint['optimizer'])
-            scheduler.load_state_dict(checkpoint['scheduler'])
-        logger.info("Loaded model and optimizer.")
-    return net, optimizer, scheduler, start_epoch, best_pred
+            net, optimizer = net.load_model(checkpoint_path, args, cuda, amp)
+
+        logger.info("Loaded model and optimizer.")
+        loaded_opt = True
+    else:
+        optimizer = None
+    return net, optimizer, start_epoch, best_pred, loaded_opt
 
 def load_results(model_no=0):
     """ Loads saved results if exists """
diff --git a/nlptoolkit/ASR/trainer.py b/nlptoolkit/ASR/trainer.py
index e89679f..e3753b4 100644
--- a/nlptoolkit/ASR/trainer.py
+++ b/nlptoolkit/ASR/trainer.py
@@ -31,6 +31,11 @@ def train_and_fit(args, pyTransformer=False):
     print("Max sequence length: %d" % max_seq_length)
     vocab = load_pickle("vocab.pkl")
 
+    if args.fp16:
+        from apex import amp
+    else:
+        amp = None
+
     logger.info("Loading model and optimizers...")
     cuda = torch.cuda.is_available()
     net, criterion, optimizer, scheduler, start_epoch, acc, g_mask1, g_mask2 = load_model_and_optimizer(args, vocab, \
@@ -38,7 +43,9 @@ def train_and_fit(args, pyTransformer=False):
                                                                                                         max_seq_length, cuda,\
                                                                                                         pyTransformer)
     losses_per_epoch, accuracy_per_epoch = load_results(model_no=args.model_no)
-    batch_update_steps = 2
+    batch_update_steps = int(train_length/(args.batch_size*10))
+
+    optimizer.zero_grad()
     logger.info("Starting training process...")
     for e in range(start_epoch, args.num_epochs):
         #l_rate = lrate(e + 1, d_model=32, k=10, warmup_n=25000)
@@ -65,9 +72,18 @@ def train_and_fit(args, pyTransformer=False):
                 outputs = outputs.view(-1, outputs.size(-1))
                 loss = criterion(outputs, labels); loss = loss/args.gradient_acc_steps
-                loss.backward()
-                if pyTransformer:
-                    clip_grad_norm_(net.parameters(), args.max_norm)
+
+                if args.fp16:
+                    with amp.scale_loss(loss, optimizer) as scaled_loss:
+                        scaled_loss.backward()
+                else:
+                    loss.backward()
+
+                if args.fp16:
+                    grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
+                else:
+                    grad_norm = clip_grad_norm_(net.parameters(), args.max_norm)
+
                 if (i % args.gradient_acc_steps) == 0:
                     optimizer.step()
                     optimizer.zero_grad()
diff --git a/nlptoolkit/summarization/evaluate.py b/nlptoolkit/summarization/evaluate.py
deleted file mode 100644
index 60ad682..0000000
--- a/nlptoolkit/summarization/evaluate.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Aug 13 09:17:59 2019
-
-@author: WT
-"""
-import torch
-from torch.autograd import Variable
-from .preprocessing_funcs import load_dataloaders
-from .models.InputConv_Transformer import create_masks
-from .train_funcs import load_model_and_optimizer
-from .utils.bpe_vocab import Encoder
-from .utils.misc_utils import load_pickle
-import time
-import logging
-
-logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \
-                    datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
-logger = logging.getLogger('__file__')
-
-def infer(args, from_data=False):
-    args.batch_size = 1
-    cuda = torch.cuda.is_available()
-    train_loader, train_length, max_features_length, max_seq_len = load_dataloaders(args)
-
-    if (args.level == "word") or (args.level == "char"):
-        vocab = load_pickle("vocab.pkl")
-        vocab_size = len(vocab.w2idx)
-        trg_init = vocab.w2idx["<sos>"]
-    elif args.level == "bpe":
-        vocab = Encoder.load("./data/vocab.pkl")
-        vocab_size = vocab.vocab_size
-        trg_init = vocab.word_vocab["__sos"]
-    trg_init = Variable(torch.LongTensor([trg_init])).unsqueeze(0)
-
-    logger.info("Max features length = %d %ss" % (max_features_length, args.level))
-    logger.info("Vocabulary size: %d" % vocab_size)
-
-    logger.info("Loading model and optimizers...")
-    net, criterion, optimizer, scheduler, start_epoch, acc = load_model_and_optimizer(args, vocab_size, max_features_length,\
-                                                                                      max_seq_len, cuda)
-
-
-    if from_data:
-        with torch.no_grad():
-            for i, data in enumerate(train_loader):
-
-                if args.model_no == 0:
-                    src_input, trg_input = data[0], data[1][:, :-1]
-                    labels = data[1][:,1:].contiguous().view(-1)
-                    src_mask, trg_mask = create_masks(src_input, trg_input)
-                    if cuda:
-                        src_input = src_input.cuda().long(); trg_input = trg_input.cuda().long(); labels = labels.cuda().long()
-                        src_mask = src_mask.cuda(); trg_mask = trg_mask.cuda()
-                    outputs = net(src_input, trg_input[:,0].unsqueeze(0), src_mask, trg_mask, infer=True)
-
-                elif args.model_no == 1:
-                    src_input, trg_input = data[0], data[1][:, :-1]
-                    labels = data[1][:,1:].contiguous().view(-1)
-                    if cuda:
-                        src_input = src_input.cuda().long(); trg_input = trg_input.cuda().long(); labels = labels.cuda().long()
-                    outputs = net(src_input, trg_input[:,0].unsqueeze(0), infer=True)
-                #outputs = outputs.view(-1, outputs.size(-1))
-                #print(outputs.shape)
-
-                if (args.level == "word") or (args.level == "char"):
-                    vocab_decoder = vocab.convert_idx2w
-                elif args.level == "bpe":
-                    vocab_decoder = vocab.inverse_transform
-
-                if cuda:
-                    l = list(labels.cpu().numpy())
-                    #o = list(torch.softmax(outputs, dim=1).max(1)[1].cpu().numpy())
-                    o = outputs[0].cpu().numpy().tolist()
-                else:
-                    l = list(labels.numpy())
-                    #o = list(torch.softmax(outputs, dim=1).max(1)[1].numpy())
-                    o = outputs[0].numpy().tolist()
-
-                if args.level == "bpe":
-                    l = [l]
-                    o = [o]
-                #print(o)
-                print("Sample Output: ", " ".join(vocab_decoder(o)))
-                print("Sample Label: ", " ".join(vocab_decoder(l)))
-                print("")
-                time.sleep(7)
-    else:
-        pass
\ No newline at end of file
diff --git a/nlptoolkit/summarization/infer.py b/nlptoolkit/summarization/infer.py
new file mode 100644
index 0000000..9e3c933
--- /dev/null
+++ b/nlptoolkit/summarization/infer.py
@@ -0,0 +1,215 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug 13 09:17:59 2019
+
+@author: WT
+"""
+import pandas as pd
+import torch
+from torch.autograd import Variable
+from .preprocessing_funcs import load_dataloaders
+from .models.InputConv_Transformer import create_masks
+from .train_funcs import load_model_and_optimizer
+from .utils.bpe_vocab import Encoder
+from .utils.misc_utils import load_pickle
+from tqdm import tqdm
+import time
+import logging
+
+tqdm.pandas(desc="prog_bar")
+logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \
+                    datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
+logger = logging.getLogger('__file__')
+
+class infer_from_trained(object):
+    def __init__(self, args=None):
+        if args is None:
+            self.args = load_pickle("args.pkl")
+        else:
+            self.args = args
+        self.cuda = torch.cuda.is_available()
+        self.args.batch_size = 1
+
+        logger.info("Loading tokenizer and model...")
+        try:
+            train_loader, train_length, max_features_length, max_seq_len = load_dataloaders(self.args)
+            self.train_loader = train_loader
+        except Exception as e:
+            print(e)
+            print("Data loading error!")
+            max_features_length = self.args.max_features_length
+            max_seq_len = self.args.max_features_length
+            self.train_loader = None
+
+        self.max_features_length = max_features_length
+        self.max_seq_len = max_seq_len
+
+        if (self.args.level == "word") or (self.args.level == "char"):
+            vocab = load_pickle("vocab.pkl")
+            vocab_size = len(vocab.w2idx)
+            trg_init = vocab.w2idx["<sos>"]
+        elif self.args.level == "bpe":
+            vocab = Encoder.load("./data/vocab.pkl")
+            vocab_size = vocab.vocab_size
+            trg_init = vocab.word_vocab["__sos"]
+
+        self.trg_init = Variable(torch.LongTensor([trg_init])).unsqueeze(0)
+
+        self.vocab = vocab
+        self.vocab_size = vocab_size
+
+        logger.info("Max features length = %d %ss" % (max_features_length, self.args.level))
+        logger.info("Vocabulary size: %d" % vocab_size)
+
+        logger.info("Loading model and optimizers...")
+
+        if self.args.fp16:
+            from apex import amp
+        else:
+            amp = None
+
+        net, criterion, optimizer, scheduler, start_epoch, acc = load_model_and_optimizer(self.args, self.vocab_size, self.max_features_length,\
+                                                                                          self.max_seq_len, self.cuda, amp)
+        self.net = net
+        self.net.eval()
+
+    def infer_from_data(self):
+        if self.train_loader is not None:
+            with torch.no_grad():
+                for i, data in enumerate(self.train_loader):
+
+                    if self.args.model_no == 0:
+                        src_input, trg_input = data[0], data[1][:, :-1]
+                        labels = data[1][:,1:].contiguous().view(-1)
+                        src_mask, trg_mask = create_masks(src_input, trg_input)
+                        if self.cuda:
+                            src_input = src_input.cuda().long(); trg_input = trg_input.cuda().long(); labels = labels.cuda().long()
+                            src_mask = src_mask.cuda(); trg_mask = trg_mask.cuda()
+                        outputs = self.net(src_input, trg_input[:,0].unsqueeze(0), src_mask, trg_mask, infer=True)
+
+                    elif self.args.model_no == 1:
+                        src_input, trg_input = data[0], data[1][:, :-1]
+                        labels = data[1][:,1:].contiguous().view(-1)
+                        if self.cuda:
+                            src_input = src_input.cuda().long(); trg_input = trg_input.cuda().long(); labels = labels.cuda().long()
+                        outputs = self.net(src_input, trg_input[:,0].unsqueeze(0), infer=True)
+                    #outputs = outputs.view(-1, outputs.size(-1))
+                    #print(outputs.shape)
+
+                    if (self.args.level == "word") or (self.args.level == "char"):
+                        vocab_decoder = self.vocab.convert_idx2w
+                    elif self.args.level == "bpe":
+                        vocab_decoder = self.vocab.inverse_transform
+
+                    if self.cuda:
+                        l = list(labels.cpu().numpy())
+                        #o = list(torch.softmax(outputs, dim=1).max(1)[1].cpu().numpy())
+                        o = outputs[0].cpu().numpy().tolist()
+                    else:
+                        l = list(labels.numpy())
+                        #o = list(torch.softmax(outputs, dim=1).max(1)[1].numpy())
+                        o = outputs[0].numpy().tolist()
+
+                    if self.args.level == "bpe":
+                        l = [l]
+                        o = [o]
+                    #print(o)
+                    print("Sample Output: ", " ".join(vocab_decoder(o)))
+                    print("Sample Label: ", " ".join(vocab_decoder(l)))
+                    print("")
+                    time.sleep(7)
+        else:
+            print("No data to infer!")
+
+    def infer_sentence(self, sent):
+        return
+
+    def infer_from_input(self):
+        self.net.eval()
+        while True:
+            user_input = input("Type input sentence (Type \'exit' or \'quit' to quit):\n")
+            if user_input in ["exit", "quit"]:
+                break
+            predicted = self.infer_sentence(user_input)
+        return predicted
+
+    def infer_from_file(self, in_file="./data/input.txt", out_file="./data/output.txt"):
+        df = pd.read_csv(in_file, header=None, names=["sents"])
+        df['labels'] = df.progress_apply(lambda x: self.infer_sentence(x['sents']), axis=1)
+        df.to_csv(out_file, index=False)
+        logger.info("Done and saved as %s!" % out_file)
+        return
+
+def infer(args, from_data=False):
+    args.batch_size = 1
+    cuda = torch.cuda.is_available()
+    train_loader, train_length, max_features_length, max_seq_len = load_dataloaders(args)
+
+    if (args.level == "word") or (args.level == "char"):
+        vocab = load_pickle("vocab.pkl")
+        vocab_size = len(vocab.w2idx)
+        trg_init = vocab.w2idx["<sos>"]
+    elif args.level == "bpe":
+        vocab = Encoder.load("./data/vocab.pkl")
+        vocab_size = vocab.vocab_size
+        trg_init = vocab.word_vocab["__sos"]
+    trg_init = Variable(torch.LongTensor([trg_init])).unsqueeze(0)
+
+    logger.info("Max features length = %d %ss" % (max_features_length, args.level))
+    logger.info("Vocabulary size: %d" % vocab_size)
+
+    logger.info("Loading model and optimizers...")
+    if args.fp16:
+        from apex import amp
+    else:
+        amp = None
+    net, criterion, optimizer, scheduler, start_epoch, acc = load_model_and_optimizer(args, vocab_size, max_features_length,\
+                                                                                      max_seq_len, cuda, amp)
+
+
+    if from_data:
+        with torch.no_grad():
+            for i, data in enumerate(train_loader):
+
+                if args.model_no == 0:
+                    src_input, trg_input = data[0], data[1][:, :-1]
+                    labels = data[1][:,1:].contiguous().view(-1)
+                    src_mask, trg_mask = create_masks(src_input, trg_input)
+                    if cuda:
+                        src_input = src_input.cuda().long(); trg_input = trg_input.cuda().long(); labels = labels.cuda().long()
+                        src_mask = src_mask.cuda(); trg_mask = trg_mask.cuda()
+                    outputs = net(src_input, trg_input[:,0].unsqueeze(0), src_mask, trg_mask, infer=True)
+
+                elif args.model_no == 1:
+                    src_input, trg_input = data[0], data[1][:, :-1]
+                    labels = data[1][:,1:].contiguous().view(-1)
+                    if cuda:
+                        src_input = src_input.cuda().long(); trg_input = trg_input.cuda().long(); labels = labels.cuda().long()
+                    outputs = net(src_input, trg_input[:,0].unsqueeze(0), infer=True)
+                #outputs = outputs.view(-1, outputs.size(-1))
+                #print(outputs.shape)
+
+                if (args.level == "word") or (args.level == "char"):
+                    vocab_decoder = vocab.convert_idx2w
+                elif args.level == "bpe":
+                    vocab_decoder = vocab.inverse_transform
+
+                if cuda:
+                    l = list(labels.cpu().numpy())
+                    #o = list(torch.softmax(outputs, dim=1).max(1)[1].cpu().numpy())
+                    o = outputs[0].cpu().numpy().tolist()
+                else:
+                    l = list(labels.numpy())
+                    #o = list(torch.softmax(outputs, dim=1).max(1)[1].numpy())
+                    o = outputs[0].numpy().tolist()
+
+                if args.level == "bpe":
+                    l = [l]
+                    o = [o]
+                #print(o)
+                print("Sample Output: ", " ".join(vocab_decoder(o)))
+                print("Sample Label: ", " ".join(vocab_decoder(l)))
+                print("")
+                time.sleep(7)
+    else:
+        pass
\ No newline at end of file
diff --git a/nlptoolkit/summarization/models/InputConv_Transformer.py b/nlptoolkit/summarization/models/InputConv_Transformer.py
index b3d8e30..4493745 100644
--- a/nlptoolkit/summarization/models/InputConv_Transformer.py
+++ b/nlptoolkit/summarization/models/InputConv_Transformer.py
@@ -7,6 +7,7 @@
 
 import torch
 import torch.nn as nn
+import torch.optim as optim
 from torch.autograd import Variable
 import numpy as np
 import math
@@ -321,7 +322,7 @@ def forward(self, src, trg, src_mask, trg_mask=None, g_mask1=None, g_mask2=None,
         #return o
 
     @classmethod
-    def load_model(cls, path):
+    def load_model(cls, path, args, cuda=True, amp=None):
         checkpoint = torch.load(path)
         model = cls(src_vocab=checkpoint["src_vocab"], \
                     trg_vocab=checkpoint["trg_vocab"], \
@@ -332,10 +333,22 @@ def load_model(cls, path):
                     max_encoder_len=checkpoint["max_encoder_len"], \
                     max_decoder_len=checkpoint["max_decoder_len"], \
                     use_conv=True)
+        if cuda:
+            model.cuda()
+
+        if amp is not None:
+            optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
+            model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
+            amp.load_state_dict(checkpoint['amp'])
+            #optimizer.load_state_dict(checkpoint['optimizer']) # dynamic loss scaling spikes if we load this! waiting for fix from nvidia apex
+            print("Loaded amp state dict!")
+        else:
+            optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
+            optimizer.load_state_dict(checkpoint['optimizer'])
         model.load_state_dict(checkpoint['state_dict'])
-        return model
+        return model, optimizer
 
-    def save_state(self, epoch, optimizer, scheduler, best_acc, path):
+    def save_state(self, epoch, optimizer, scheduler, best_acc, path, amp=None):
         state = {
                 'epoch': epoch + 1,\
                 'state_dict': self.state_dict(),\
@@ -349,6 +362,7 @@ def save_state(self, epoch, optimizer, scheduler, best_acc, path):
                 'num': self.num,\
                 'n_heads': self.n_heads,\
                 'max_encoder_len': self.max_encoder_len,\
-                'max_decoder_len': self.max_decoder_len,
+                'max_decoder_len': self.max_decoder_len,\
+                'amp': amp.state_dict() if amp is not None else None
                 }
         torch.save(state, path)
\ No newline at end of file
diff --git a/nlptoolkit/summarization/models/LSTM_attention_model.py b/nlptoolkit/summarization/models/LSTM_attention_model.py
index a5bf9a7..9e6df23 100644
--- a/nlptoolkit/summarization/models/LSTM_attention_model.py
+++ b/nlptoolkit/summarization/models/LSTM_attention_model.py
@@ -7,6 +7,7 @@
 
 import torch
 import torch.nn as nn
+import torch.optim as optim
 import torch.nn.functional as F
 import numpy as np
 
@@ -168,7 +169,7 @@ def forward(self, x, trg_input, infer=False):
         return x
 
    @classmethod
-    def load_model(cls, path):
+    def load_model(cls, path, args, cuda=True, amp=None):
         checkpoint = torch.load(path)
         model = cls(vocab_size=checkpoint['vocab_size'],\
                     listener_embed_size=checkpoint['listener_embed_size'], \
@@ -177,10 +178,22 @@ def load_model(cls, path):
                     max_label_len=checkpoint['max_label_len'])
         model.listener.flatten_parameters()
         #model.speller.flatten_parameters()
+        if cuda:
+            model.cuda()
+
+        if amp is not None:
+            optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
+            model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
+            amp.load_state_dict(checkpoint['amp'])
+            #optimizer.load_state_dict(checkpoint['optimizer']) # dynamic loss scaling spikes if we load this! waiting for fix from nvidia apex
+            print("Loaded amp state dict!")
+        else:
+            optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
+            optimizer.load_state_dict(checkpoint['optimizer'])
         model.load_state_dict(checkpoint['state_dict'])
-        return model
+        return model, optimizer
 
-    def save_state(self, epoch, optimizer, scheduler, best_acc, path):
+    def save_state(self, epoch, optimizer, scheduler, best_acc, path, amp=None):
         state = {
                 'epoch': epoch + 1,\
                 'state_dict': self.state_dict(),\
@@ -191,7 +204,8 @@ def save_state(self, epoch, optimizer, scheduler, best_acc, path):
                 'listener_embed_size' : self.listener_embed_size,\
                 'listener_hidden_size': self.listener_hidden_size,\
                 'output_class_dim': self.output_class_dim,\
-                'max_label_len': self.max_label_len
+                'max_label_len': self.max_label_len,\
+                'amp': amp.state_dict() if amp is not None else None
                 }
         torch.save(state, path)
\ No newline at end of file
diff --git a/nlptoolkit/summarization/train_funcs.py b/nlptoolkit/summarization/train_funcs.py
index fcbf67d..ba63aa2 100644
--- a/nlptoolkit/summarization/train_funcs.py
+++ b/nlptoolkit/summarization/train_funcs.py
@@ -19,7 +19,7 @@
                     datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
 logger = logging.getLogger('__file__')
 
-def load_model_and_optimizer(args, vocab_size, max_features_length, max_seq_length, cuda):
+def load_model_and_optimizer(args, vocab_size, max_features_length, max_seq_length, cuda, amp=None):
     '''Loads the model (Transformer or encoder-decoder) based on provided arguments and parameters'''
 
     if args.model_no == 0:
@@ -38,45 +38,51 @@ def load_model_and_optimizer(args, vocab_size, max_features_length, max_seq_leng
     criterion = nn.CrossEntropyLoss(ignore_index=1) # ignore padding tokens
 
-    #model = SummaryTransformer if (args.model_no == 0) else LAS
-    net, optimizer, scheduler, start_epoch, acc = load_state(net, args, load_best=False, load_scheduler=False)
-
-    if cuda:
+    net, optimizer, start_epoch, acc, loaded_opt = load_state(net, cuda, args, load_best=False, \
+                                                              amp=amp)
+
+    if cuda and (not loaded_opt):
         net.cuda()
+
+    if (not loaded_opt):
+        optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
+    scheduler = CosineWithRestarts(optimizer, T_max=args.T_max)
+
+    if (args.fp16) and (not loaded_opt) and (amp is not None):
+        logger.info("Using fp16...")
+        net, optimizer = amp.initialize(net, optimizer, opt_level='O2')
+        scheduler = CosineWithRestarts(optimizer, T_max=args.T_max)
+
+    logger.info("Done setting up model, optimizer and scheduler.")
+
     return net, criterion, optimizer, scheduler, start_epoch, acc
 
-def load_state(net, args, load_best=False, load_scheduler=False):
+def load_state(net, cuda, args, load_best=False, amp=None):
     """ Loads saved model and optimizer states if exists """
+    loaded_opt = False
     base_path = "./data/"
     checkpoint_path = os.path.join(base_path,"test_checkpoint_%d.pth.tar" % args.model_no)
     best_path = os.path.join(base_path,"test_model_best_%d.pth.tar" % args.model_no)
     start_epoch, best_pred, checkpoint = 0, 0, None
-
     if (load_best == True) and os.path.isfile(best_path):
         checkpoint = torch.load(best_path)
         logger.info("Loaded best model.")
     elif os.path.isfile(checkpoint_path):
         checkpoint = torch.load(checkpoint_path)
         logger.info("Loaded checkpoint model.")
-
     if checkpoint != None:
         start_epoch = checkpoint['epoch']
         best_pred = checkpoint['best_acc']
         if load_best:
-            net = net.load_model(best_path)
+            net, optimizer = net.load_model(best_path, args, cuda, amp)
         else:
-            net = net.load_model(checkpoint_path)
-        optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
-        scheduler = CosineWithRestarts(optimizer, T_max=args.T_max)
-        if load_scheduler:
-            optimizer.load_state_dict(checkpoint['optimizer'])
-            scheduler.load_state_dict(checkpoint['scheduler'])
-        logger.info("Loaded model and optimizer.")
+            net, optimizer = net.load_model(checkpoint_path, args, cuda, amp)
+
+        logger.info("Loaded model and optimizer.")
+        loaded_opt = True
     else:
-        optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
-        scheduler = CosineWithRestarts(optimizer, T_max=args.T_max)
-    return net, optimizer, scheduler, start_epoch, best_pred
+        optimizer = None
+    return net, optimizer, start_epoch, best_pred, loaded_opt
 
 def load_results(model_no=0):
     """ Loads saved results if exists """
diff --git a/nlptoolkit/summarization/trainer.py b/nlptoolkit/summarization/trainer.py
index 50924c8..25854ea 100644
--- a/nlptoolkit/summarization/trainer.py
+++ b/nlptoolkit/summarization/trainer.py
@@ -36,12 +36,19 @@ def train_and_fit(args):
     logger.info("Vocabulary size: %d" % vocab_size)
 
     logger.info("Loading model and optimizers...")
+
+    if args.fp16:
+        from apex import amp
+    else:
+        amp = None
+
     net, criterion, optimizer, scheduler, start_epoch, acc = load_model_and_optimizer(args, vocab_size, max_features_length,\
-                                                                                      max_seq_len, cuda)
+                                                                                      max_seq_len, cuda, amp)
     losses_per_epoch, accuracy_per_epoch = load_results(model_no=args.model_no)
-    batch_update_steps = 3
+    batch_update_steps = int(train_length/(args.batch_size*10))
 
     logger.info("Starting training process...")
+    optimizer.zero_grad()
     for e in range(start_epoch, args.num_epochs):
         #l_rate = lrate(e + 1, d_model=32, k=10, warmup_n=25000)
         net.train()
@@ -67,8 +74,17 @@ def train_and_fit(args):
                 outputs = outputs.view(-1, outputs.size(-1))
                 loss = criterion(outputs, labels); loss = loss/args.gradient_acc_steps
-                loss.backward();
-                #clip_grad_norm_(net.parameters(), args.max_norm)
+                if args.fp16:
+                    with amp.scale_loss(loss, optimizer) as scaled_loss:
+                        scaled_loss.backward()
+                else:
+                    loss.backward()
+
+                if args.fp16:
+                    grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
+                else:
+                    grad_norm = clip_grad_norm_(net.parameters(), args.max_norm)
+
                 if (i % args.gradient_acc_steps) == 0:
                     optimizer.step()
                     optimizer.zero_grad()
@@ -93,14 +109,14 @@ def train_and_fit(args):
             acc = accuracy_per_epoch[-1]
             net.save_state(epoch=(e+1), optimizer=optimizer, scheduler=scheduler, best_acc=acc,\
                            path=os.path.join("./data/" ,\
-                    "test_model_best_%d.pth.tar" % args.model_no))
+                    "test_model_best_%d.pth.tar" % args.model_no), amp=amp)
 
         if (e % 1) == 0:
             save_as_pickle("test_losses_per_epoch_%d.pkl" % args.model_no, losses_per_epoch)
             save_as_pickle("test_accuracy_per_epoch_%d.pkl" % args.model_no, accuracy_per_epoch)
             net.save_state(epoch=(e+1), optimizer=optimizer, scheduler=scheduler, best_acc=acc,\
                            path=os.path.join("./data/" ,\
-                    "test_checkpoint_%d.pth.tar" % args.model_no))
+                    "test_checkpoint_%d.pth.tar" % args.model_no), amp=amp)
 
     logger.info("Finished training")
     fig = plt.figure(figsize=(13,13))
diff --git a/nlptoolkit/translation/infer.py b/nlptoolkit/translation/infer.py
index 7270474..68ef6ff 100644
--- a/nlptoolkit/translation/infer.py
+++ b/nlptoolkit/translation/infer.py
@@ -17,6 +17,7 @@
 import time
 import logging
 
+tqdm.pandas(desc="prog_bar")
 logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \
                     datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
 logger = logging.getLogger('__file__')
diff --git a/nlptoolkit/translation/train_funcs.py b/nlptoolkit/translation/train_funcs.py
index 1ba82ac..aa14153 100644
--- a/nlptoolkit/translation/train_funcs.py
+++ b/nlptoolkit/translation/train_funcs.py
@@ -48,7 +48,7 @@ def load_model_and_optimizer(args, src_vocab, trg_vocab, cuda, amp=None, pytrans
         optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
     scheduler = CosineWithRestarts(optimizer, T_max=args.T_max)
 
-    if (args.fp16) and (not loaded_opt):
+    if (args.fp16) and (not loaded_opt) and (amp is not None):
         logger.info("Using fp16...")
         net, optimizer = amp.initialize(net, optimizer, opt_level='O2')
         scheduler = CosineWithRestarts(optimizer, T_max=args.T_max)
diff --git a/nlptoolkit/utils/config.py b/nlptoolkit/utils/config.py
index e867a3e..0dfb382 100644
--- a/nlptoolkit/utils/config.py
+++ b/nlptoolkit/utils/config.py
@@ -88,6 +88,7 @@ def __init__(self, task):
             self.LAS_embed_dim = 512
             self.LAS_hidden_size = 512
             self.batch_size = 32
+            self.fp16 = 1
             self.num_epochs = 500
             self.lr = 0.0003
             self.gradient_acc_steps = 2
@@ -149,7 +150,8 @@ def __init__(self, task):
             self.ff_dim = 128
             self.num = 6
             self.n_heads = 4
-            self.batch_size = 30
+            self.batch_size = 32
+            self.fp16 = 1
             self.num_epochs = 9000
             self.lr = 0.0003
             self.gradient_acc_steps = 4
diff --git a/speech.py b/speech.py
index f5122d1..8a2bbdd 100644
--- a/speech.py
+++ b/speech.py
@@ -7,7 +7,7 @@
 from nlptoolkit.utils.misc import save_as_pickle
 from nlptoolkit.ASR.trainer import train_and_fit
-from nlptoolkit.ASR.evaluate import infer
+from nlptoolkit.ASR.infer import infer
 import logging
 from argparse import ArgumentParser
 
@@ -31,6 +31,7 @@
     parser.add_argument("--num", type=int, default=6, help="Number of layers")
     parser.add_argument("--n_heads", type=int, default=4, help="Number of attention heads")
     parser.add_argument("--batch_size", type=int, default=30, help="Batch size")
+    parser.add_argument("--fp16", type=int, default=1, help="1: use mixed precision ; 0: use floating point 32")
     parser.add_argument("--num_epochs", type=int, default=9000, help="No of epochs")
     parser.add_argument("--lr", type=float, default=0.0003, help="learning rate")
     parser.add_argument("--gradient_acc_steps", type=int, default=4, help="Number of steps of gradient accumulation")
diff --git a/summarize.py b/summarize.py
index d7ac648..d6707cb 100644
--- a/summarize.py
+++ b/summarize.py
@@ -6,7 +6,7 @@
 """
 from nlptoolkit.utils.misc import save_as_pickle
 from nlptoolkit.summarization.trainer import train_and_fit
-from nlptoolkit.summarization.evaluate import infer
+from nlptoolkit.summarization.infer import infer_from_trained
 from argparse import ArgumentParser
 import logging
 
@@ -29,6 +29,7 @@
     parser.add_argument("--LAS_embed_dim", type=int, default=128, help="LAS Embedding dimension")
     parser.add_argument("--LAS_hidden_size", type=int, default=128, help="LAS listener hidden_size")
     parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
+    parser.add_argument("--fp16", type=int, default=1, help="1: use mixed precision ; 0: use floating point 32")
    parser.add_argument("--num_epochs", type=int, default=8000, help="No of epochs")
     parser.add_argument("--lr", type=float, default=0.0003, help="learning rate")
     parser.add_argument("--gradient_acc_steps", type=int, default=2, help="Number of steps of gradient accumulation")
@@ -44,4 +45,5 @@
     if args.train:
         train_and_fit(args)
     if args.infer:
-        infer(args, from_data=True)
\ No newline at end of file
+        inferer = infer_from_trained(args)
+        inferer.infer_from_data()
\ No newline at end of file