Skip to content

Commit

Permalink
add comet logger
Browse files (browse the repository at this point in the history)
  • Loading branch information
ayushm-agrawal committed Apr 17, 2021
1 parent 8127b28 commit 6ca3dff
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 66 deletions.
4 changes: 2 additions & 2 deletions configs.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
"model_name": "resnet101",
"dataset": "ImageNet",
"lr": 0.001,
"batch_size": 256,
"batch_size": 512,
"target_val_acc": 94.0,
"arr_save_path": "/home/shared/results/",
"num_workers": 8,
"num_workers": 4,
"data_path": "/home/shared/imagenet"
}
1 change: 1 addition & 0 deletions multi_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def multi_train(configs_path="./configs.json"):
print(f"Training: {configs.exp_name}")

configs.seed = seed
configs.experiment = experiment
rmae_dict, train_acc_arr, test_acc_arr = run_experiment(
configs.epochs, configs.model_name, "untrained", configs)

Expand Down
136 changes: 72 additions & 64 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,83 +13,91 @@ def training(epochs, loaders, model, optimizer, criterion, prev_list,
min_test_loss = np.Inf

train_acc_arr, test_acc_arr = [], []

for epoch in range(1, epochs+1):

train_loss = 0.0
train_correct = 0.0
train_total = 0.0
test_correct = 0.0
test_total = 0.0
test_loss = 0.0

# train the model
model.train()

for data, labels in loaders['train']:
# move the data and labels to gpu
data, labels = data.cuda(), labels.cuda()

optimizer.zero_grad()
# get model outputs
output = model(data)
# calculate the loss
loss = criterion(output, labels)
# backprop
loss.backward()
# optimize the weights
optimizer.step()
# update the training loss for the batch
train_loss += loss.item()*data.size(0)
# get the predictions for each image in the batch
preds = torch.max(output, 1)[1]
# get the number of correct predictions in the batch
train_correct += np.sum(np.squeeze(
preds.eq(labels.data.view_as(preds))).cpu().numpy())

# accumulate total number of examples
train_total += data.size(0)

train_loss = round(train_loss/len(loaders['train'].dataset), 4)
train_acc = round(((train_correct/train_total) * 100.0), 4)

# compute layer deltas after epoch.
rmae_delta_dict, prev_list = compute_delta(
model, prev_list, rmae_delta_dict, epoch)

model.eval()
with torch.no_grad():
for data, labels in loaders['test']:


with configs.experiment.train():
for epoch in range(1, epochs+1):

train_loss = 0.0
train_correct = 0.0
train_total = 0.0
test_correct = 0.0
test_total = 0.0
test_loss = 0.0

# train the model
model.train()

for data, labels in loaders['train']:
# move the data and labels to gpu
data, labels = data.cuda(), labels.cuda()

optimizer.zero_grad()
# get model outputs
output = model(data)
# calculate the loss
loss = criterion(output, labels)

test_loss += loss.item()*data.size(0)

# backprop
loss.backward()
# optimize the weights
optimizer.step()
# update the training loss for the batch
train_loss += loss.item()*data.size(0)
# get the predictions for each image in the batch
preds = torch.max(output, 1)[1]
# get the number of correct predictions in the batch
test_correct += np.sum(np.squeeze(
train_correct += np.sum(np.squeeze(
preds.eq(labels.data.view_as(preds))).cpu().numpy())

# accumulate total number of examples
test_total += data.size(0)
train_total += data.size(0)

train_loss = round(train_loss/len(loaders['train'].dataset), 4)
train_acc = round(((train_correct/train_total) * 100.0), 4)

configs.experiment.log_metric("accuracy", train_acc, step=epoch)
configs.experiment.log_metric("loss", train_loss, step=epoch)

# compute layer deltas after epoch.
rmae_delta_dict, prev_list = compute_delta(
model, prev_list, rmae_delta_dict)

with configs.experiment.test():
model.eval()
with torch.no_grad():
for data, labels in loaders['test']:

data, labels = data.cuda(), labels.cuda()

output = model(data)
loss = criterion(output, labels)

test_loss += loss.item()*data.size(0)

# get the predictions for each image in the batch
preds = torch.max(output, 1)[1]
# get the number of correct predictions in the batch
test_correct += np.sum(np.squeeze(
preds.eq(labels.data.view_as(preds))).cpu().numpy())

# accumulate total number of examples
test_total += data.size(0)

if test_loss < min_test_loss:
print(f"Saving model at Epoch: {epoch}")
# torch.save(model.state_dict(), 'drive/My Drive/cifar10-resnet18-gradual-adam')

if test_loss < min_test_loss:
print(f"Saving model at Epoch: {epoch}")
# torch.save(model.state_dict(), 'drive/My Drive/cifar10-resnet18-gradual-adam')
test_loss = round(test_loss/len(loaders['test'].dataset), 4)
test_acc = round(((test_correct/test_total) * 100), 4)

test_loss = round(test_loss/len(loaders['test'].dataset), 4)
test_acc = round(((test_correct/test_total) * 100), 4)
configs.experiment.log_metric("accuracy", test_acc, step=epoch)
configs.experiment.log_metric("loss", test_loss, step=epoch)

train_acc_arr.append(train_acc)
test_acc_arr.append(test_acc)
train_acc_arr.append(train_acc)
test_acc_arr.append(test_acc)

print(
f"Epoch: {epoch} \tTrain Loss: {train_loss} \tTrain Acc: {train_acc}% \tTest Loss: {test_loss} \tTest Acc: {test_acc}%")
if float(test_acc) >= configs.target_val_acc:
break
print(
f"Epoch: {epoch} \tTrain Loss: {train_loss} \tTrain Acc: {train_acc}% \tTest Loss: {test_loss} \tTest Acc: {test_acc}%")
if float(test_acc) >= configs.target_val_acc:
break

return rmae_delta_dict, np.asarray(train_acc_arr), np.asarray(test_acc_arr)

0 comments on commit 6ca3dff

Please sign in to comment.