Merge pull request #62 from jmisilo/58-double-gpu-training

58 double gpu training

jmisilo committed Nov 15, 2022
2 parents e561112 + bf28e67 commit 89bb23a
Showing 10 changed files with 238 additions and 185 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -6,3 +6,4 @@ Pillow==9.3.0
 torch==1.12.1+cu116
 tqdm==4.64.1
 transformers==4.22.1
+wandb==0.13.4
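
The `wandb` pin lands here because the reworked trainer below accumulates per-epoch metrics in `get_training_data()` instead of returning them from each loop. A minimal sketch of how that dict could be logged to Weights & Biases; the project name, epoch count, and `trainer` object are placeholders, not part of this commit:

```python
import wandb

wandb.init(project='image-captioning')  # hypothetical project name

for _ in range(30):  # epoch count is a placeholder
    trainer.train_epoch()
    trainer.valid_epoch()
    trainer.test_step()

    data = trainer.get_training_data()
    wandb.log({
        'train_loss': data['train_loss'][-1],  # latest epoch's mean loss
        'valid_loss': data['valid_loss'][-1],
        'lr': data['lr'],
        'examples': wandb.Image(data['examples'])  # PIL grid from test_step
    })
```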
15 changes: 14 additions & 1 deletion src/data/dataset.py
@@ -6,6 +6,7 @@
 * get_loader returns DataLoader object.
 '''

+import os
 import pickle

 import numpy as np
@@ -14,8 +15,20 @@
 from torch.utils.data import Dataset, DataLoader
 from transformers import GPT2Tokenizer

+from utils import download_dataset
+
 class MiniFlickrDataset(Dataset):
     def __init__(self, path):
+        # download the dataset file if it is not present locally
+        if not os.path.isfile(path):
+            print('Dataset file not found. Downloading...')
+
+            # create the data directory, including the processed subdirectory
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+
+            # download dataset
+            download_dataset(path)
+
         with open(path, 'rb') as f:
             self.data = pickle.load(f)

@@ -40,7 +53,7 @@ def cl_fn(batch, tokenizer):
     return img_emb, input_ids, attention_mask

 def get_loader(dataset, bs_exp=5, shuffle=True, num_workers=0, pin_memory=False):
-    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
+    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
     tokenizer.pad_token = tokenizer.eos_token

     return DataLoader(
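
`download_dataset` is imported from `utils`, but its body is outside this diff. A plausible minimal implementation, assuming the processed pickle is served from a single URL and fetched with `requests` (both are assumptions, not from this commit):

```python
import requests

def download_dataset(path, url='https://example.com/dataset.pkl'):  # placeholder URL
    # stream to disk so the pickle never has to fit in memory twice
    response = requests.get(url, stream=True)
    response.raise_for_status()

    with open(path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1 << 20):
            f.write(chunk)
```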
4 changes: 2 additions & 2 deletions src/dataset_generation.py
@@ -28,8 +28,8 @@
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

 # Load CLIP model and processor
-preprocessor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch14')
-model = CLIPModel.from_pretrained('openai/clip-vit-base-patch14').vision_model.to(device)
+preprocessor = CLIPProcessor.from_pretrained('openai/clip-vit-large-patch14')
+model = CLIPModel.from_pretrained('openai/clip-vit-large-patch14').vision_model.to(device)

 # Load dataset
 df = pd.read_csv(os.path.join(DATA_PATH, 'raw', 'results.csv'), sep='|')
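
Swapping the base CLIP checkpoint for `clip-vit-large-patch14` widens the vision tower's hidden states from 768 to 1024 dimensions, which is why the test stub at the bottom of this diff changes `emb` from 768 to 1024. A sketch of the per-image embedding extraction this script performs (one PIL image at a time; whether the pooled CLS vector or the full patch sequence is stored is not visible in this excerpt):

```python
from PIL import Image

image = Image.open('example.jpg')  # placeholder path

# the processor resizes/normalizes; vision_model returns transformer features
pixel_values = preprocessor(images=image, return_tensors='pt').pixel_values.to(device)

with torch.no_grad():
    out = model(pixel_values=pixel_values)

img_emb = out.pooler_output  # (1, 1024) pooled CLS embedding
# out.last_hidden_state would give (1, 257, 1024) per-patch features instead
```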
241 changes: 152 additions & 89 deletions src/model/loops.py
@@ -12,123 +12,186 @@
 import torch
 from tqdm import tqdm

-def train_epoch(model, scaler, optimizer, loader, epoch, device='cpu'):
-    '''
-    Train model for one epoch.
-    Args:
-        model: model to train
-        scaler: scaler for mixed precision training
-        optimizer: optimizer to use
-        loader: DataLoader object
-        epoch: current epoch
-        device: device to use
-    '''
-
-    model.train()
-
-    total_loss = 0
-
-    loop = tqdm(loader, total=len(loader))
-    loop.set_description(f'Epoch: {epoch} | Loss: ---')
-    for batch_idx, (img_emb, cap, att_mask) in enumerate(loop):
-
-        img_emb, cap, att_mask = img_emb.to(device), cap.to(device), att_mask.to(device)
-
-        with torch.cuda.amp.autocast():
-            loss = model.train_forward(img_emb=img_emb, trg_cap=cap, att_mask=att_mask)
-
-        scaler.scale(loss).backward()
-        scaler.unscale_(optimizer)
-
-        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.3)
-
-        scaler.step(optimizer)
-        scaler.update()
-
-        optimizer.zero_grad()
-
-        total_loss += loss.item()
-
-        loop.set_description(f'Epoch: {epoch} | Loss: {total_loss / (batch_idx + 1):.3f}')
-        loop.refresh()
-
-    return {
-        'loss': total_loss / (batch_idx + 1)
-    }
-
-def valid_epoch(model, loader, device='cpu'):
-    '''
-    Validate model for one epoch.
-
-    Args:
-        model: model to validate
-        loader: DataLoader object
-        device: device to use
-    '''
-
-    model.eval()
-
-    total_loss = 0
-
-    loop = tqdm(loader, total=len(loader))
-    loop.set_description(f'Validation Loss: ---')
-    for batch_idx, (img_emb, cap, att_mask) in enumerate(loop):
-
-        img_emb, cap, att_mask = img_emb.to(device), cap.to(device), att_mask.to(device)
-
-        with torch.no_grad():
-            with torch.cuda.amp.autocast():
-
-                loss = model.train_forward(img_emb=img_emb, trg_cap=cap, att_mask=att_mask)
-
-        total_loss += loss.item()
-
-        loop.set_description(f'Validation Loss: {total_loss / (batch_idx + 1):.3f}')
-        loop.refresh()
-
-    return {
-        'loss': total_loss / (batch_idx + 1)
-    }
-
-def test_step(model, dataset, img_path, num_examples=4):
-    '''
-    Test model on dataset.
-    Args:
-        model: model to test
-        dataset: dataset to test on
-        img_path: path to images
-        num_examples: number of examples to show
-    '''
-
-    assert num_examples % 2 == 0, 'num_examples must be even'
-
-    model.eval()
-
-    fig, axs = plt.subplots(num_examples // 2, 2, figsize=(20, 12))
-
-    random_idx = np.random.randint(0, len(dataset), size=(num_examples,))
-    for idx, r in enumerate(random_idx):
-        img_name, _, _ = dataset[r]
-
-        img = Image.open(os.path.join(img_path, img_name))
-
-        with torch.no_grad():
-            caption, _ = model(img)
-
-        axs[idx // 2, idx % 2].imshow(img)
-        axs[idx // 2, idx % 2].set_title(caption)
-        axs[idx // 2, idx % 2].axis('off')
-
-    buf = io.BytesIO()
-    plt.savefig(buf, format='png')
-    buf.seek(0)
-
-    fig.clear()
-    plt.close(fig)
-
-    return Image.open(buf)
+class Trainer:
+    def __init__(
+        self,
+        model,
+        optimizer,
+        scaler,
+        scheduler,
+        train_loader,
+        valid_loader,
+        test_dataset,
+        test_path='',
+        ckp_path='',
+        device='cpu'
+    ):
+        self.model = model
+        self.optimizer = optimizer
+        self.scaler = scaler
+        self.scheduler = scheduler
+        self.train_loader = train_loader
+        self.valid_loader = valid_loader
+        self.test_dataset = test_dataset
+        self.test_path = test_path
+        self.ckp_path = ckp_path
+        self.device = device
+
+        # used by save_ckp/_load_ckp to unwrap DataParallel models
+        self.multi_gpu = isinstance(model, torch.nn.DataParallel)
+
+        # defaults; overwritten below if a checkpoint is found
+        self.cur_lr = self.optimizer.param_groups[0]['lr']
+        self.epoch = 0
+        self.train_loss = []
+        self.valid_loss = []
+        self.test_result = None
+
+        # resume from checkpoint if one exists
+        if os.path.isfile(ckp_path):
+            self._load_ckp(
+                ckp_path,
+                optimizer,
+                scheduler,
+                scaler,
+                epoch=True,
+                train_loss=True,
+                valid_loss=True,
+                device=device
+            )
+
+    def train_epoch(self):
+        self.model.train()
+        self.epoch += 1
+
+        total_loss = 0
+
+        loop = tqdm(self.train_loader, total=len(self.train_loader))
+        loop.set_description(f'Epoch: {self.epoch} | Loss: ---')
+        for batch_idx, (img_emb, cap, att_mask) in enumerate(loop):
+
+            img_emb, cap, att_mask = img_emb.to(self.device), cap.to(self.device), att_mask.to(self.device)
+
+            with torch.cuda.amp.autocast():
+                loss = self.model.train_forward(img_emb=img_emb, trg_cap=cap, att_mask=att_mask)
+
+            self.scaler.scale(loss).backward()
+            self.scaler.unscale_(self.optimizer)
+
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.3)
+
+            self.scaler.step(self.optimizer)
+            self.scaler.update()
+
+            self.optimizer.zero_grad()
+
+            total_loss += loss.item()
+
+            loop.set_description(f'Epoch: {self.epoch} | Loss: {total_loss / (batch_idx + 1):.3f}')
+            loop.refresh()
+
+        self.cur_lr = self.optimizer.param_groups[0]['lr']
+        self.train_loss.append(total_loss / (batch_idx + 1))
+
+        self.scheduler.step()
+
+    def valid_epoch(self):
+        self.model.eval()
+
+        total_loss = 0
+
+        loop = tqdm(self.valid_loader, total=len(self.valid_loader))
+        loop.set_description(f'Validation Loss: ---')
+        for batch_idx, (img_emb, cap, att_mask) in enumerate(loop):
+
+            img_emb, cap, att_mask = img_emb.to(self.device), cap.to(self.device), att_mask.to(self.device)
+
+            with torch.no_grad():
+                with torch.cuda.amp.autocast():
+
+                    loss = self.model.train_forward(img_emb=img_emb, trg_cap=cap, att_mask=att_mask)
+
+            total_loss += loss.item()
+            loop.set_description(f'Validation Loss: {total_loss / (batch_idx + 1):.3f}')
+            loop.refresh()
+
+        self.valid_loss.append(total_loss / (batch_idx + 1))
+
+    def test_step(self, num_examples=4):
+        assert num_examples % 2 == 0, 'num_examples must be even'
+
+        self.model.eval()
+
+        fig, axs = plt.subplots(num_examples // 2, 2, figsize=(20, 12))
+
+        random_idx = np.random.randint(0, len(self.test_dataset), size=(num_examples,))
+        for idx, r in enumerate(random_idx):
+            img_name, _, _ = self.test_dataset[r]
+
+            img = Image.open(os.path.join(self.test_path, img_name))
+
+            with torch.no_grad():
+                caption, _ = self.model(img)
+
+            axs[idx // 2, idx % 2].imshow(img)
+            axs[idx // 2, idx % 2].set_title(caption)
+            axs[idx // 2, idx % 2].axis('off')
+
+        buf = io.BytesIO()
+        plt.savefig(buf, format='png')
+        buf.seek(0)
+
+        fig.clear()
+        plt.close(fig)
+
+        self.test_result = Image.open(buf)
+
+    def get_training_data(self):
+        return {
+            'train_loss': self.train_loss,
+            'valid_loss': self.valid_loss,
+            'lr': self.cur_lr,
+            'examples': self.test_result
+        }
+
+    def save_ckp(self, ckp_path):
+        torch.save(
+            {
+                'epoch': self.epoch,
+                'model_state_dict': self.model.module.state_dict() if self.multi_gpu else self.model.state_dict(),
+                'optimizer_state_dict': self.optimizer.state_dict(),
+                'scheduler_state_dict': self.scheduler.state_dict(),
+                'scaler_state_dict': self.scaler.state_dict(),
+                'tloss': self.train_loss,
+                'vloss': self.valid_loss
+            },
+            ckp_path
+        )
+
+    def _load_ckp(
+        self,
+        checkpoint_fpath,
+        optimizer=None,
+        scheduler=None,
+        scaler=None,
+        epoch=None,
+        train_loss=None,
+        valid_loss=None,
+        device='cpu'
+    ):
+        '''
+        Loads entire checkpoint from file.
+        '''
+
+        checkpoint = torch.load(checkpoint_fpath, map_location=device)
+
+        # unwrap DataParallel so keys match the saved (unwrapped) state dict
+        model = self.model.module if self.multi_gpu else self.model
+        model.load_state_dict(checkpoint['model_state_dict'])
+
+        if optimizer is not None:
+            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+
+        if scheduler is not None:
+            self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+
+        if scaler is not None:
+            self.scaler.load_state_dict(checkpoint['scaler_state_dict'])
+
+        if epoch is not None:
+            self.epoch = checkpoint['epoch']
+
+        # keys match what save_ckp writes
+        if train_loss is not None:
+            self.train_loss = checkpoint['tloss']
+
+        if valid_loss is not None:
+            self.valid_loss = checkpoint['vloss']
+
+        self.cur_lr = self.optimizer.param_groups[0]['lr']

 def evaluate_dataset(model, dataset, img_path, save_path, temperature=1.0):
     '''
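
The PR title targets two-GPU training, and `save_ckp` unwraps `model.module`, so the `Trainer` is presumably meant to accept a `DataParallel`-wrapped model. A wiring sketch under that assumption, reusing the repo's `Net`, `MiniFlickrDataset`, `get_loader`, and `Trainer`; hyperparameters, paths, and the epoch count are illustrative, not values from this commit:

```python
import torch
from torch.cuda.amp import GradScaler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Net(ep_len=4, num_layers=6, n_heads=16, forward_expansion=4,
            dropout=0.1, max_len=40, device=device)  # placeholder hyperparameters

# Wrap for two GPUs. Caveat: nn.DataParallel only parallelizes forward(),
# so the trainer's self.model.train_forward(...) calls would need to be
# routed through forward() or self.model.module in a real multi-GPU run.
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
scaler = GradScaler()

dataset = MiniFlickrDataset('data/processed/dataset.pkl')
n_valid = len(dataset) // 10
train_set, valid_set = torch.utils.data.random_split(
    dataset, [len(dataset) - n_valid, n_valid])

trainer = Trainer(
    model, optimizer, scaler, scheduler,
    train_loader=get_loader(train_set),
    valid_loader=get_loader(valid_set, shuffle=False),
    test_dataset=valid_set,
    test_path='data/raw/images',  # placeholder image directory
    ckp_path='checkpoints/last.pt',
    device=device
)

for _ in range(30):  # epoch count is a placeholder
    trainer.train_epoch()
    trainer.valid_epoch()
    trainer.test_step()
    trainer.save_ckp(trainer.ckp_path)
```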
14 changes: 5 additions & 9 deletions src/model/model.py
@@ -1,7 +1,6 @@
 '''
 Module contains final Model and all pieces of it.
 '''
-
 import torch
 import torch.nn as nn
 from transformers import CLIPModel, CLIPProcessor, GPT2LMHeadModel, GPT2Tokenizer
@@ -16,8 +15,8 @@ def __init__(self, device='cpu'):

         self.device = device

-        self.preprocessor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch14')
-        self.model = CLIPModel.from_pretrained('openai/clip-vit-base-patch14').vision_model.to(self.device)
+        self.preprocessor = CLIPProcessor.from_pretrained('openai/clip-vit-large-patch14')
+        self.model = CLIPModel.from_pretrained('openai/clip-vit-large-patch14').vision_model.to(self.device)

     def forward(self, image):
         # only one image at a time
@@ -98,10 +97,10 @@ def __init__(self, device='cpu'):

         self.device = device

-        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
         self.tokenizer.pad_token = self.tokenizer.eos_token

-        self.model = GPT2LMHeadModel.from_pretrained('gpt2').to(self.device)
+        self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(self.device)
         self.vocab_size = self.model.config.vocab_size

     def forward(self, embedding, attention_mask=None):
@@ -117,7 +116,6 @@ class Net(nn.Module):
     def __init__(self, ep_len, num_layers, n_heads, forward_expansion, dropout, max_len, device='cpu'):
         '''
         Model constructor.
-
         Args:
             num_layers: number of layers in the TransformerEncoder
             n_heads: number of heads in the MultiHeadAttention
@@ -149,10 +147,8 @@ def freeze_layers(self):
     def forward(self, img, temperature=1.0):
         '''
         Caption generation for a single image.
-
         Args:
             img: image to generate caption for [PIL.Image]
-
         Returns:
             caption: generated caption [str]
             tokens: generated tokens [torch.Tensor]
@@ -251,7 +247,7 @@ def train_forward(self, img_emb, trg_cap, att_mask):

     m.train()
     N = 10
-    emb = 768
+    emb = 1024
     length = 20

     l = m.train_forward(
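
The `emb` change from 768 to 1024 in the test stub is the thread tying the swaps together: CLIP ViT-L/14's vision tower and GPT-2 medium both use 1024-dimensional hidden states, so the caption model's internal embedding size is 1024 end to end. A quick consistency check (fetches only the model configs, not the weights):

```python
from transformers import CLIPConfig, GPT2Config

clip_cfg = CLIPConfig.from_pretrained('openai/clip-vit-large-patch14')
gpt2_cfg = GPT2Config.from_pretrained('gpt2-medium')

assert clip_cfg.vision_config.hidden_size == 1024  # ViT-L/14 vision width
assert gpt2_cfg.n_embd == 1024                     # GPT-2 medium width
```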
(Diffs for the remaining 5 changed files did not load.)