Skip to content

Commit

Permalink
Implement workflow for training with multiple gpus
Browse files Browse the repository at this point in the history
  • Loading branch information
jmisilo committed Nov 15, 2022
1 parent a3a9519 commit 311dec3
Show file tree
Hide file tree
Showing 9 changed files with 106 additions and 37 deletions.
17 changes: 17 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# for tests on Linux machines
FROM python:3.9.13

# Set the working directory to /app
WORKDIR /app

# Create the virtual environment that will hold the project dependencies.
RUN python -m venv /app/venv

# Every RUN instruction starts a fresh shell, so sourcing venv/bin/activate in
# its own RUN step does not carry over to later steps or to CMD. Putting the
# venv's bin directory first on PATH is what makes `pip` and `python` below
# resolve to the virtual environment.
ENV VIRTUAL_ENV=/app/venv
ENV PATH="/app/venv/bin:$PATH"

# Copy the requirements first so the dependency layer is cached independently
# of source-code changes.
COPY requirements.txt /app/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt

COPY /src /app/src

# -u keeps stdout/stderr unbuffered so training logs show up immediately.
CMD ["python", "-u", "src/training.py"]
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ matplotlib==3.6.0
numpy==1.23.3
pandas==1.5.0
Pillow==9.3.0
torch==1.12.1+cu116
torch==1.12.1
# torch==1.12.1+cu116
tqdm==4.64.1
transformers==4.22.1
wandb==0.13.4
12 changes: 12 additions & 0 deletions src/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* get_loader returns DataLoader object.
'''

import os
import pickle

import numpy as np
Expand All @@ -14,8 +15,19 @@
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer

from utils import download_dataset

class MiniFlickrDataset(Dataset):
def __init__(self, path):
# check if file is file
if not os.path.isfile(path):
print('Dataset file not found. Downloading...')

# create data directory and in it create processed directory
os.makedirs(os.path.dirname(path), exist_ok=True)
# download dataset
download_dataset(path)

with open(path, 'rb') as f:
self.data = pickle.load(f)

Expand Down
4 changes: 2 additions & 2 deletions src/dataset_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load CLIP model and processor
preprocessor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch14')
model = CLIPModel.from_pretrained('openai/clip-vit-base-patch14').vision_model.to(device)
preprocessor = CLIPProcessor.from_pretrained('openai/clip-vit-large-patch14')
model = CLIPModel.from_pretrained('openai/clip-vit-large-patch14').vision_model.to(device)

# Load dataset
df = pd.read_csv(os.path.join(DATA_PATH, 'raw', 'results.csv'), sep='|')
Expand Down
64 changes: 48 additions & 16 deletions src/model/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def __init__(self, device='cpu'):

self.device = device

self.preprocessor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch14')
self.model = CLIPModel.from_pretrained('openai/clip-vit-base-patch14').vision_model.to(self.device)
self.preprocessor = CLIPProcessor.from_pretrained('openai/clip-vit-large-patch14')
self.model = CLIPModel.from_pretrained('openai/clip-vit-large-patch14').vision_model.to(self.device)

def forward(self, image):
# only one image at a time
Expand All @@ -54,7 +54,8 @@ def __init__(
self,
ep_len,
num_layers,
embed_size,
embed_size_inp,
embed_size_out,
n_heads,
forward_expansion,
dropout,
Expand All @@ -63,35 +64,56 @@ def __init__(
super(Mapping, self).__init__()

self.ep_len = ep_len
self.embed_size = embed_size
self.embed_size_inp = embed_size_inp
self.embed_size_out = embed_size_out

self.device = device

num_layers_inp = num_layers // 2
num_layers_out = num_layers - num_layers_inp

self.transformer_encoder = nn.TransformerEncoder(
nn.TransformerEncoderLayer(
d_model=embed_size,
d_model=self.embed_size_inp,
nhead=n_heads,
dim_feedforward=self.embed_size_inp*forward_expansion,
dropout=dropout,
batch_first=True,
device=device
),
num_layers=num_layers_inp
).to(self.device)

self.translator = nn.Linear(self.embed_size_inp, self.embed_size_out).to(self.device)

self.transformer_decoder = nn.TransformerEncoder(
nn.TransformerEncoderLayer(
d_model=self.embed_size_out,
nhead=n_heads,
dim_feedforward=embed_size*forward_expansion,
dim_feedforward=self.embed_size_out*forward_expansion,
dropout=dropout,
batch_first=True,
device=device
),
num_layers=num_layers
num_layers=num_layers_out
).to(self.device)

self.mapper = nn.Linear(embed_size, ep_len * embed_size).to(self.device)
self.mapper = nn.Linear(self.embed_size_out, ep_len * self.embed_size_out).to(self.device)

self.init_weights()

def forward(self, img_embedded, train_mode=False):
x = self.transformer_encoder(img_embedded)
x = self.translator(x)

x = self.transformer_decoder(x)
x = self.mapper(x)

x = x.view(
*(
[-1, self.ep_len, self.embed_size]
[-1, self.ep_len, self.embed_size_out]
if train_mode else
[self.ep_len, self.embed_size]
[self.ep_len, self.embed_size_out]
)
) # for batched input

Expand All @@ -117,10 +139,10 @@ def __init__(self, device='cpu'):

self.device = device

self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
self.tokenizer.pad_token = self.tokenizer.eos_token

self.model = GPT2LMHeadModel.from_pretrained('gpt2').to(self.device)
self.model = GPT2LMHeadModel.from_pretrained('gpt2-xl').to(self.device)
self.vocab_size = self.model.config.vocab_size

def forward(self, embedding, attention_mask=None):
Expand All @@ -146,15 +168,25 @@ def __init__(self, ep_len, num_layers, n_heads, forward_expansion, dropout, max_
'''
super(Net, self).__init__()

assert num_layers >= 2, 'Number of layers must be at least 2.'

self.device = device
self.ep_len = ep_len

self.ie = ImageEncoder(device=device)
self.mp = Mapping(ep_len=self.ep_len, num_layers=num_layers, embed_size=self.ie.model.config.hidden_size, n_heads=n_heads, forward_expansion=forward_expansion, dropout=dropout, device=device)
self.td = TextDecoder(device=device)

assert self.ie.model.config.hidden_size == self.td.model.config.n_embd, "Embedding size of models mismatch"

self.mp = Mapping(
ep_len=self.ep_len,
num_layers=num_layers,
embed_size_inp=self.ie.model.config.hidden_size,
embed_size_out=self.td.model.config.hidden_size,
n_heads=n_heads,
forward_expansion=forward_expansion,
dropout=dropout,
device=device
)

self.max_len = max_len

self.criterion = nn.CrossEntropyLoss(ignore_index=self.td.tokenizer.pad_token_id)
Expand Down Expand Up @@ -270,7 +302,7 @@ def train_forward(self, img_emb, trg_cap, att_mask):

m.train()
N = 10
emb = 768
emb = 1024
length = 20

l = m.train_forward(
Expand Down
9 changes: 4 additions & 5 deletions src/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ def main(rank, world_size, config, ckp_name=''):

trainer.train_epoch()
trainer.valid_epoch()
trainer.test_result()

metadata = trainer.get_training_data()

Expand All @@ -130,21 +129,21 @@ def main(rank, world_size, config, ckp_name=''):
'train_loss': metadata['train_loss'],
'valid_loss': metadata['valid_loss'],
'lr': metadata['lr'],
'examples': wandb.Image(metadata['examples'])
})

if not os.path.exists(config.weights_dir):
os.makedirs(config.weights_dir)

if (epoch + 1) % 10 == 0 and rank == 0:
if (epoch + 1) % 50 == 0 and rank == 0:
trainer.save_ckp(os.path.join(config.weights_dir, f'epoch_{epoch + 1}.pt'))

ddp_cleanup()


if __name__ == '__main__':
# check if there is no GPU - use CPU -> world_size = 1

# check if there is no GPU - use CPU -> world_size = 1
world_size = torch.cuda.device_count() if torch.cuda.is_available() else 1

print(f'Number of GPUs: {world_size}')

mp.spawn(main, args=(world_size, config, ''), nprocs=world_size)
2 changes: 1 addition & 1 deletion src/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from utils.config import *
from utils.download_weights import *
from utils.downloads import *
from utils.lr_warmup import *
12 changes: 0 additions & 12 deletions src/utils/download_weights.py

This file was deleted.

19 changes: 19 additions & 0 deletions src/utils/downloads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
'''
Utility functions for downloading model weights and the dataset.
'''

import gdown

def download_weights(checkpoint_fpath):
    '''
    Fetches the weights file from Google Drive.

    The request goes through `gdown.download` with `quiet=False`, and
    `checkpoint_fpath` is passed as the output location.
    '''

    weights_url = 'https://drive.google.com/uc?id=10ieSMMJzE9EeiPIF3CMzeT4timiQTjHV'
    gdown.download(url=weights_url, output=checkpoint_fpath, quiet=False)

def download_dataset(destination_path):
    '''
    Fetches the dataset file from Google Drive.

    The request goes through `gdown.download` with `quiet=False`, and
    `destination_path` is passed as the output location.
    '''

    dataset_url = 'https://drive.google.com/uc?id=1E7lKanGE2Gakgy3mvyUal_B43BxU3vHr'
    gdown.download(url=dataset_url, output=destination_path, quiet=False)

0 comments on commit 311dec3

Please sign in to comment.