add comet logger

ManifoldRG · Apr 17, 2021 · 6ca3dff
1 parent 8127b28
commit 6ca3dff
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 66 deletions.
diff --git a/configs.json b/configs.json
@@ -7,9 +7,9 @@
  "model_name": "resnet101",
  "dataset": "ImageNet",
  "lr": 0.001,
- "batch_size": 256,
+ "batch_size": 512,
  "target_val_acc": 94.0,
  "arr_save_path": "/home/shared/results/",
- "num_workers": 8,
+ "num_workers": 4,
  "data_path": "/home/shared/imagenet"
 }
diff --git a/multi_train.py b/multi_train.py
@@ -34,6 +34,7 @@ def multi_train(configs_path="./configs.json"):
  print(f"Training: {configs.exp_name}")
 
  configs.seed = seed
+ configs.experiment = experiment
  rmae_dict, train_acc_arr, test_acc_arr = run_experiment(
  configs.epochs, configs.model_name, "untrained", configs)
 

diff --git a/train.py b/train.py
@@ -13,83 +13,91 @@ def training(epochs, loaders, model, optimizer, criterion, prev_list,
  min_test_loss = np.Inf
 
  train_acc_arr, test_acc_arr = [], []
-
- for epoch in range(1, epochs+1):
-
- train_loss = 0.0
- train_correct = 0.0
- train_total = 0.0
- test_correct = 0.0
- test_total = 0.0
- test_loss = 0.0
-
- # train the model
- model.train()
-
- for data, labels in loaders['train']:
- # move the data and labels to gpu
- data, labels = data.cuda(), labels.cuda()
-
- optimizer.zero_grad()
- # get model outputs
- output = model(data)
- # calculate the loss
- loss = criterion(output, labels)
- # backprop
- loss.backward()
- # optimize the weights
- optimizer.step()
- # update the training loss for the batch
- train_loss += loss.item()*data.size(0)
- # get the predictions for each image in the batch
- preds = torch.max(output, 1)[1]
- # get the number of correct predictions in the batch
- train_correct += np.sum(np.squeeze(
- preds.eq(labels.data.view_as(preds))).cpu().numpy())
-
- # accumulate total number of examples
- train_total += data.size(0)
-
- train_loss = round(train_loss/len(loaders['train'].dataset), 4)
- train_acc = round(((train_correct/train_total) * 100.0), 4)
-
- # compute layer deltas after epoch.
- rmae_delta_dict, prev_list = compute_delta(
- model, prev_list, rmae_delta_dict, epoch)
-
- model.eval()
- with torch.no_grad():
- for data, labels in loaders['test']:
-
+
+ with configs.experiment.train():
+ for epoch in range(1, epochs+1):
+
+ train_loss = 0.0
+ train_correct = 0.0
+ train_total = 0.0
+ test_correct = 0.0
+ test_total = 0.0
+ test_loss = 0.0
+
+ # train the model
+ model.train()
+
+ for data, labels in loaders['train']:
+ # move the data and labels to gpu
  data, labels = data.cuda(), labels.cuda()
 
+ optimizer.zero_grad()
+ # get model outputs
  output = model(data)
+ # calculate the loss
  loss = criterion(output, labels)
-
- test_loss += loss.item()*data.size(0)
-
+ # backprop
+ loss.backward()
+ # optimize the weights
+ optimizer.step()
+ # update the training loss for the batch
+ train_loss += loss.item()*data.size(0)
  # get the predictions for each image in the batch
  preds = torch.max(output, 1)[1]
  # get the number of correct predictions in the batch
- test_correct += np.sum(np.squeeze(
+ train_correct += np.sum(np.squeeze(
  preds.eq(labels.data.view_as(preds))).cpu().numpy())
 
  # accumulate total number of examples
- test_total += data.size(0)
+ train_total += data.size(0)
+
+ train_loss = round(train_loss/len(loaders['train'].dataset), 4)
+ train_acc = round(((train_correct/train_total) * 100.0), 4)
+
+ configs.experiment.log_metric("accuracy", train_acc, step=epoch)
+ configs.experiment.log_metric("loss", train_loss, step=epoch)
+
+ # compute layer deltas after epoch.
+ rmae_delta_dict, prev_list = compute_delta(
+ model, prev_list, rmae_delta_dict)
+
+ with configs.experiment.test():
+ model.eval()
+ with torch.no_grad():
+ for data, labels in loaders['test']:
+
+ data, labels = data.cuda(), labels.cuda()
+
+ output = model(data)
+ loss = criterion(output, labels)
+
+ test_loss += loss.item()*data.size(0)
+
+ # get the predictions for each image in the batch
+ preds = torch.max(output, 1)[1]
+ # get the number of correct predictions in the batch
+ test_correct += np.sum(np.squeeze(
+ preds.eq(labels.data.view_as(preds))).cpu().numpy())
+
+ # accumulate total number of examples
+ test_total += data.size(0)
+
+ if test_loss < min_test_loss:
+ print(f"Saving model at Epoch: {epoch}")
+ # torch.save(model.state_dict(), 'drive/My Drive/cifar10-resnet18-gradual-adam')
 
- if test_loss < min_test_loss:
- print(f"Saving model at Epoch: {epoch}")
- # torch.save(model.state_dict(), 'drive/My Drive/cifar10-resnet18-gradual-adam')
+ test_loss = round(test_loss/len(loaders['test'].dataset), 4)
+ test_acc = round(((test_correct/test_total) * 100), 4)
 
- test_loss = round(test_loss/len(loaders['test'].dataset), 4)
- test_acc = round(((test_correct/test_total) * 100), 4)
+  configs.experiment.log_metric("accuracy", test_acc, step=epoch)
+  configs.experiment.log_metric("loss", test_loss, step=epoch)
 
- train_acc_arr.append(train_acc)
- test_acc_arr.append(test_acc)
+  train_acc_arr.append(train_acc)
+  test_acc_arr.append(test_acc)
 
- print(
- f"Epoch: {epoch} \tTrain Loss: {train_loss} \tTrain Acc: {train_acc}% \tTest Loss: {test_loss} \tTest Acc: {test_acc}%")
- if float(test_acc) >= configs.target_val_acc:
- break
+  print(
+  f"Epoch: {epoch} \tTrain Loss: {train_loss} \tTrain Acc: {train_acc}% \tTest Loss: {test_loss} \tTest Acc: {test_acc}%")
+  if float(test_acc) >= configs.target_val_acc:
+  break
 
  return rmae_delta_dict, np.asarray(train_acc_arr), np.asarray(test_acc_arr)