
Commit ec71006
Delete some changes that were implemented for Multi GPU workflow.
jmisilo committed Nov 15, 2022
1 parent 92f9623 commit ec71006
Showing 7 changed files with 33 additions and 147 deletions.
17 changes: 0 additions & 17 deletions Dockerfile

This file was deleted.

3 changes: 1 addition & 2 deletions requirements.txt
@@ -3,8 +3,7 @@ matplotlib==3.6.0
numpy==1.23.3
pandas==1.5.0
Pillow==9.3.0
torch==1.12.1
# torch==1.12.1+cu116
torch==1.12.1+cu116
tqdm==4.64.1
transformers==4.22.1
wandb==0.13.4
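
requirements.txt now pins the CUDA 11.6 wheel directly instead of keeping it commented out next to the default build. A quick sanity check (generic PyTorch, not project-specific) confirms the installed wheel and the driver line up before training:

import torch

print(torch.__version__)           # expected to report '1.12.1+cu116'
print(torch.cuda.is_available())   # True only if a compatible NVIDIA driver is present
print(torch.cuda.device_count())   # the single-GPU workflow only needs this to be >= 1
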
8 changes: 4 additions & 4 deletions src/data/dataset.py
@@ -25,6 +25,7 @@ def __init__(self, path):

# create data directory and in it create processed directory
os.makedirs(os.path.dirname(path), exist_ok=True)

# download dataset
download_dataset(path)

@@ -51,8 +52,8 @@ def cl_fn(batch, tokenizer):

return img_emb, input_ids, attention_mask

def get_loader(dataset, bs_exp=5, shuffle=True, num_workers=0, pin_memory=False, sampler=None):
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
def get_loader(dataset, bs_exp=5, shuffle=True, num_workers=0, pin_memory=False):
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token

return DataLoader(
@@ -61,6 +62,5 @@ def get_loader(dataset, bs_exp=5, shuffle=True, num_workers=0, pin_memory=False,
collate_fn=lambda b: cl_fn(b, tokenizer),
shuffle=shuffle,
num_workers=num_workers,
pin_memory=pin_memory,
sampler=sampler
pin_memory=pin_memory
)
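
With the sampler parameter gone, get_loader builds an ordinary shuffled DataLoader. A minimal usage sketch against the new signature shown above; the 2**bs_exp batch size and the worker count are assumptions, since part of the DataLoader call is collapsed in this view:

import os

from data import MiniFlickrDataset, get_loader

dataset = MiniFlickrDataset(os.path.join('data', 'processed', 'dataset.pkl'))
loader = get_loader(
    dataset,
    bs_exp=5,        # batch-size exponent; presumably batches of 2**bs_exp samples
    shuffle=True,    # plain shuffling stands in for the removed DistributedSampler
    num_workers=2,   # illustrative value
    pin_memory=True,
)
img_emb, input_ids, attention_mask = next(iter(loader))   # one batch, as returned by cl_fn
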
18 changes: 1 addition & 17 deletions src/model/loops.py
@@ -13,7 +13,7 @@
from tqdm import tqdm

class Trainer:
def __init__(self, model, optimizer, scaler, scheduler, train_loader, valid_loader, test_dataset, test_path, ckp_path, device, multi_gpu=False):
def __init__(self, model, optimizer, scaler, scheduler, train_loader, valid_loader, test_dataset, test_path, ckp_path, device):
self.model = model
self.optimizer = optimizer
self.scaler = scaler
@@ -25,8 +25,6 @@ def __init__(self, model, optimizer, scaler, scheduler, train_loader, valid_load
self.ckp_path = ckp_path
self.device = device

self.multi_gpu = multi_gpu

# load checkpoint
if os.path.isfile(ckp_path):
self._load_ckp(ckp_path, optimizer, scheduler, scaler, device=device)
@@ -72,8 +70,6 @@ def train_epoch(self):
self.train_loss.append(total_loss / (batch_idx + 1))

self.scheduler.step()

return True

def valid_epoch(self):
self.model.eval()
@@ -98,8 +94,6 @@ def valid_epoch(self):

self.valid_loss.append(total_loss / (batch_idx + 1))

return True

def test_step(self, num_examples=4):
assert num_examples % 2 == 0, 'num_examples must be even'

@@ -129,8 +123,6 @@ def test_step(self, num_examples=4):

self.test_result = Image.open(buf)

return True

def get_training_data(self):
return {
'train_loss': self.train_loss,
@@ -153,14 +145,6 @@ def save_ckp(self, ckp_path):
ckp_path
)

return True

def set_samplers_epoch(self, epoch):
self.train_loader.sampler.set_epoch(epoch)
self.valid_loader.sampler.set_epoch(epoch)

return True

def _load_ckp(
self,
checkpoint_fpath,
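
The Trainer now takes a plain torch device instead of a rank plus a multi_gpu flag. Constructing it after this commit looks roughly like the sketch below; the surrounding objects (model, optimizer, loaders, paths) are assumed to exist already, and the keywords simply mirror the new __init__ signature:

import torch

from model import Trainer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    scaler=scaler,            # presumably a torch.cuda.amp.GradScaler
    scheduler=scheduler,
    train_loader=train_loader,
    valid_loader=valid_loader,
    test_dataset=test_dataset,
    test_path=test_path,
    ckp_path=ckp_path,        # if this file exists, _load_ckp resumes from it
    device=device,
)
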
81 changes: 13 additions & 68 deletions src/model/model.py
@@ -1,30 +1,10 @@
'''
Module contains final Model and all pieces of it.
'''
import os

import torch
import torch.nn as nn
from torch.distributed import init_process_group, destroy_process_group
from transformers import CLIPModel, CLIPProcessor, GPT2LMHeadModel, GPT2Tokenizer

def ddp_setup(rank, world_size):
'''
Setup distributed training.
'''

os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12355'

init_process_group('nccl', rank=rank, world_size=world_size)

def ddp_cleanup():
'''
Cleanup distributed training.
'''

destroy_process_group()

class ImageEncoder(nn.Module):
'''
Encodes an image and returns its embedding.
@@ -54,8 +34,7 @@ def __init__(
self,
ep_len,
num_layers,
embed_size_inp,
embed_size_out,
embed_size,
n_heads,
forward_expansion,
dropout,
@@ -64,56 +43,35 @@
super(Mapping, self).__init__()

self.ep_len = ep_len
self.embed_size_inp = embed_size_inp
self.embed_size_out = embed_size_out
self.embed_size = embed_size

self.device = device

num_layers_inp = num_layers // 2
num_layers_out = num_layers - num_layers_inp

self.transformer_encoder = nn.TransformerEncoder(
nn.TransformerEncoderLayer(
d_model=self.embed_size_inp,
d_model=embed_size,
nhead=n_heads,
dim_feedforward=self.embed_size_inp*forward_expansion,
dim_feedforward=embed_size*forward_expansion,
dropout=dropout,
batch_first=True,
device=device
),
num_layers=num_layers_inp
num_layers=num_layers
).to(self.device)

self.translator = nn.Linear(self.embed_size_inp, self.embed_size_out).to(self.device)

self.transformer_decoder = nn.TransformerEncoder(
nn.TransformerEncoderLayer(
d_model=self.embed_size_out,
nhead=n_heads,
dim_feedforward=self.embed_size_out*forward_expansion,
dropout=dropout,
batch_first=True,
device=device
),
num_layers=num_layers_out
).to(self.device)

self.mapper = nn.Linear(self.embed_size_out, ep_len * self.embed_size_out).to(self.device)
self.mapper = nn.Linear(embed_size, ep_len * embed_size).to(self.device)

self.init_weights()

def forward(self, img_embedded, train_mode=False):
x = self.transformer_encoder(img_embedded)
x = self.translator(x)

x = self.transformer_decoder(x)
x = self.mapper(x)

x = x.view(
*(
[-1, self.ep_len, self.embed_size_out]
[-1, self.ep_len, self.embed_size]
if train_mode else
[self.ep_len, self.embed_size_out]
[self.ep_len, self.embed_size]
)
) # for batched input

@@ -139,10 +97,10 @@ def __init__(self, device='cpu'):

self.device = device

self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
self.tokenizer.pad_token = self.tokenizer.eos_token

self.model = GPT2LMHeadModel.from_pretrained('gpt2-xl').to(self.device)
self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(self.device)
self.vocab_size = self.model.config.vocab_size

def forward(self, embedding, attention_mask=None):
@@ -158,7 +116,6 @@ class Net(nn.Module):
def __init__(self, ep_len, num_layers, n_heads, forward_expansion, dropout, max_len, device='cpu'):
'''
Model constructor.
Args:
num_layers: number of layers in the TransformerEncoder
n_heads: number of heads in the MultiHeadAttention
@@ -168,25 +125,15 @@ def __init__(self, ep_len, num_layers, n_heads, forward_expansion, dropout, max_
'''
super(Net, self).__init__()

assert num_layers >= 2, 'Number of layers must be at least 2.'

self.device = device
self.ep_len = ep_len

self.ie = ImageEncoder(device=device)
self.mp = Mapping(ep_len=self.ep_len, num_layers=num_layers, embed_size=self.ie.model.config.hidden_size, n_heads=n_heads, forward_expansion=forward_expansion, dropout=dropout, device=device)
self.td = TextDecoder(device=device)

self.mp = Mapping(
ep_len=self.ep_len,
num_layers=num_layers,
embed_size_inp=self.ie.model.config.hidden_size,
embed_size_out=self.td.model.config.hidden_size,
n_heads=n_heads,
forward_expansion=forward_expansion,
dropout=dropout,
device=device
)

assert self.ie.model.config.hidden_size == self.td.model.config.n_embd, "Embedding size of models mismatch"

self.max_len = max_len

self.criterion = nn.CrossEntropyLoss(ignore_index=self.td.tokenizer.pad_token_id)
@@ -200,10 +147,8 @@ def freeze_layers(self):
def forward(self, img, temperature=1.0):
'''
Caption generation for a single image.
Args:
img: image to generate caption for [PIL.Image]
Returns:
caption: generated caption [str]
tokens: generated tokens [torch.Tensor]
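
The Mapping block loses its split input/output widths, the translator Linear, and the second encoder stack; a single TransformerEncoder plus a mapper now expands one image embedding into ep_len prefix tokens of a shared embed_size, which is why Net gains the assert that the CLIP hidden size equals GPT-2's n_embd. A standalone shape sketch of that step, with illustrative sizes not tied to any particular checkpoint:

import torch
import torch.nn as nn

embed_size, ep_len, n_heads = 768, 4, 8     # illustrative values only
encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(
        d_model=embed_size,
        nhead=n_heads,
        dim_feedforward=embed_size * 4,
        batch_first=True,
    ),
    num_layers=6,
)
mapper = nn.Linear(embed_size, ep_len * embed_size)

img_emb = torch.randn(3, 1, embed_size)     # a batch of 3 image embeddings
x = mapper(encoder(img_emb))                # -> (3, 1, ep_len * embed_size)
x = x.view(-1, ep_len, embed_size)          # -> (3, ep_len, embed_size): prefix tokens for the text decoder
print(x.shape)
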
49 changes: 12 additions & 37 deletions src/training.py
@@ -8,15 +8,12 @@

import numpy as np
import torch
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.optim as optim
from torch.utils.data import random_split
from torch.utils.data.distributed import DistributedSampler

import wandb
from data import MiniFlickrDataset, get_loader
from model import ddp_cleanup, ddp_setup, Net, Trainer
from model import Net, Trainer
from utils import Config, LRWarmup

config = Config()
@@ -39,18 +36,10 @@
torch.cuda.manual_seed(config.seed)
torch.backends.cudnn.deterministic = True

def main(rank, world_size, config, ckp_name=''):
# more than 1 GPU
def train(config, ckp_name=''):
is_cuda = torch.cuda.is_available()
MULTIGPU = world_size > 1

if MULTIGPU:
ddp_setup(rank, world_size)
device = rank
device = torch.device('cuda' if is_cuda else 'cpu')

else:
device = torch.device('cuda' if is_cuda else 'cpu')

model = Net(
ep_len=config.ep_len,
num_layers=config.num_layers,
@@ -61,9 +50,6 @@ def main(rank, world_size, config, ckp_name=''):
device=device
)

if MULTIGPU:
model = DDP(model, device_ids=[device])

dataset = MiniFlickrDataset(os.path.join('data', 'processed', 'dataset.pkl'))

config.train_size = int(config.train_size * len(dataset))
@@ -74,20 +60,18 @@ def main(rank, world_size, config, ckp_name=''):

train_loader = get_loader(
train_dataset,
bs_exp=config.batch_size_exp,
shuffle=not MULTIGPU,
bs_exp=config.batch_size_exp if is_cuda else 2,
shuffle=True,
num_workers=config.num_workers if is_cuda else 0,
pin_memory=is_cuda,
sampler=DistributedSampler(train_dataset) if MULTIGPU else None
pin_memory=is_cuda
)

valid_loader = get_loader(
val_dataset,
bs_exp=config.batch_size_exp,
bs_exp=config.batch_size_exp if is_cuda else 2,
shuffle=False,
num_workers=config.num_workers if is_cuda else 0,
pin_memory=is_cuda,
sampler=DistributedSampler(val_dataset) if MULTIGPU else None
pin_memory=is_cuda
)

optimizer = optim.Adam(model.parameters(), lr=config.lr)
@@ -116,11 +100,9 @@ def main(rank, world_size, config, ckp_name=''):
wandb.init(project='clipXgpt2 captioner', config=config.__dict__)
wandb.watch(trainer.model, log='all')
for epoch in range(trainer.epoch, config.epochs):
if MULTIGPU:
trainer.set_samplers_epoch(epoch)

trainer.train_epoch()
trainer.valid_epoch()
trainer.test_step()

metadata = trainer.get_training_data()

@@ -129,21 +111,14 @@ def main(rank, world_size, config, ckp_name=''):
'train_loss': metadata['train_loss'],
'valid_loss': metadata['valid_loss'],
'lr': metadata['lr'],
'examples': wandb.Image(metadata['examples']),
})

if not os.path.exists(config.weights_dir):
os.makedirs(config.weights_dir)

if (epoch + 1) % 50 == 0 and rank == 0:
if (epoch + 1) % 50 == 0:
trainer.save_ckp(os.path.join(config.weights_dir, f'epoch_{epoch + 1}.pt'))

ddp_cleanup()


if __name__ == '__main__':
# check if there is no GPU - use CPU -> world_size = 1
world_size = torch.cuda.device_count() if torch.cuda.is_available() else 1

print(f'Number of GPUs: {world_size}')

mp.spawn(main, args=(world_size, config, ''), nprocs=world_size)
train(config, args.checkpoint_name)
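
The rewritten entry point calls train(config, args.checkpoint_name), but the argument parser itself sits in the collapsed part of the diff. It presumably looks something like the sketch below; the flag name and default are guesses for context, not the repository's actual code:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '-C', '--checkpoint-name',
    type=str,
    default='',
    help='checkpoint file in the weights directory to resume training from',
)
args = parser.parse_args()   # argparse exposes --checkpoint-name as args.checkpoint_name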