Add hardware/OS specifications.

martin-wey · Aug 29, 2022 · 0c0bd26 · 0c0bd26
1 parent 9791c4c
commit 0c0bd26
Show file tree

Hide file tree

Showing 5 changed files with 41 additions and 18 deletions.
diff --git a/README.md b/README.md
@@ -57,7 +57,7 @@ python src/main.py \
  --do_train \
  --run_name <folder_run_name> \
  --pretrained_model_name_or_path <hugging_face_model> \
- --model_type <model_type> 
+ --model_type <model_type> \
  --lang <lang> \
  --layer <layer> \
  --rank <rank>
@@ -75,7 +75,8 @@ The script `main.py` is in charge of training the probe. The main arguments are
 As a result of this script, a folder `runs/folder_run_name` will be generated. This folder contains three files:
 * `info.log`: log file.
 * `pytorch_model.bin`: the probing model serialized *i.e.*, the basis of the syntactic subspace, the vectors C and U.
-* `metrics.log`: a serialized dictionary that contains the training losses, the validation losses, the precision, recall, and F1 score on the test set. You can use `python -m pickle runs/folder_run_name/metrics.log` to check it.
+* `metrics.log`: a serialized dictionary that contains the training losses, the validation losses, the precision, recall, and F1 score on the test set. You can use `python -m pickle runs/folder_run_name/metrics.log` to check the metrics for the run.
+
 
 Here is an example of the usage of this script:
 ```sh
@@ -84,19 +85,39 @@ python src/main.py \
  --run_name codebert_python_5_128 \
  --pretrained_model_name_or_path microsoft/codebert-base \
  --model_type roberta \
- --lang python \ 
+ --lang python \
  --layer 5 \
  --rank 128
 ```
 This command trains a 128-dimensional probe over the output embeddings of the 5th layer of CodeBERT using the Python dataset. After running this command, the folder `runs/codebert_python_5_128` is created.
 
+--- 
 ## Replicating the experiments of the paper
 To replicate the experiments included in the paper, we provide two scripts that run everything.
 - `run_experiments_rq123.py`: to replicate the results of RQ1, RQ2 and RQ3.
 - `run_experiments_rq4.py`: to replicate the results of RQ4.
 
 You may have to change a few things in these two scripts, such as `CUDA_VISIBLE_DEVICES`, *i.e.*, the GPU used by PyTorch. Besides, the scripts will generate the results of all experiments in separate folders, as described in the previous section of this readme.
 
+After running the experiments (i.e., replicating the RQs), it is possible to get plots similar to the ones included in the paper using the `plot_graphs.py` script by specifying the base path of the run directories:
+`python plot_graphs.py --run_dir ./runs`.
+
+--- 
+## Hardware specifications
+In principle, all the experiments of this paper can be reproduced by following the instructions previously mentioned.
+For completeness, we also provide the specifications of the hardware and OS we used to get the results included in the paper.
+
+```sh
+OS: Gentoo Linux
+OS release: Gentoo Base System 2.8
+Kernel: Linux 5.15.41-gentoo-x86_64
+GPU: NVIDIA GeForce RTX 3090
+CUDA version: 11.3
+```
+Unfortunately, our probe cannot be run without at least one GPU. We also cannot ensure that the scripts can be used
+with GPUs other than those of NVIDIA.
+
+---
 ### You can cite our work if you find this repository or the paper useful.
 ```sh
 @misc{hernandez-ast-probe-2022,

diff --git a/run_experiments_rq123.py b/run_experiments_rq123.py
@@ -19,7 +19,7 @@ def main():
  layers = list(range(1, 13))
  for layer in layers:
  run_name = '_'.join([folder, lang, str(layer), '128'])
- os.system(f"CUDA_VISIBLE_DEVICES=3 python src/main.py --do_train --run_name {run_name} "
+ os.system(f"CUDA_VISIBLE_DEVICES=0 python src/main.py --do_train --run_name {run_name} "
  f"--pretrained_model_name_or_path {model} "
  f"--model_type {model_type} --lang {lang} "
  f"--layer {layer} --rank 128")

diff --git a/run_experiments_rq4.py b/run_experiments_rq4.py
@@ -14,7 +14,7 @@ def main():
  for layer in layers:
  for rank in [8, 16, 32, 64, 128, 256, 512]:
  run_name = '_'.join([folder, lang, str(layer), str(rank), 'rq4'])
- os.system(f"CUDA_VISIBLE_DEVICES=3 python src/main.py --do_train --run_name {run_name} "
+ os.system(f"CUDA_VISIBLE_DEVICES=0 python src/main.py --do_train --run_name {run_name} "
  f"--pretrained_model_name_or_path {model} "
  f"--model_type {model_type} --lang {lang} "
  f"--layer {layer} --rank {rank}")

diff --git a/src/data/binary_tree.py b/src/data/binary_tree.py
@@ -1,4 +1,4 @@
-from src.data.code2ast import get_id, get_root_ast
+from .code2ast import get_id, get_root_ast
 import networkx as nx
 import numpy as np
 

diff --git a/src/data/data_loading.py b/src/data/data_loading.py
@@ -21,31 +21,33 @@
 
 LANGUAGES = (
  'python',
- 'java',
- 'ruby',
  'javascript',
  'go',
- 'php'
+ # 'php',
+ # 'java',
+ # 'ruby'
 )
 PY_LANGUAGE = Language('grammars/languages.so', 'python')
 JS_LANGUAGE = Language('grammars/languages.so', 'javascript')
 GO_LANGUAGE = Language('grammars/languages.so', 'go')
-PHP_LANGUAGE = Language('grammars/languages.so', 'php')
-JAVA_LANGUAGE = Language('grammars/languages.so', 'java')
-RUBY_LANGUAGE = Language('grammars/languages.so', 'ruby')
+
+# PHP_LANGUAGE = Language('grammars/languages.so', 'php')
+# JAVA_LANGUAGE = Language('grammars/languages.so', 'java')
+# RUBY_LANGUAGE = Language('grammars/languages.so', 'ruby')
 
 PY_PARSER = Parser()
 PY_PARSER.set_language(PY_LANGUAGE)
 JS_PARSER = Parser()
 JS_PARSER.set_language(JS_LANGUAGE)
 GO_PARSER = Parser()
 GO_PARSER.set_language(GO_LANGUAGE)
-PHP_PARSER = Parser()
-PHP_PARSER.set_language(PHP_LANGUAGE)
-JAVA_PARSER = Parser()
-JAVA_PARSER.set_language(JAVA_LANGUAGE)
-RUBY_PARSER = Parser()
-RUBY_PARSER.set_language(RUBY_LANGUAGE)
+
+# PHP_PARSER = Parser()
+# PHP_PARSER.set_language(PHP_LANGUAGE)
+# JAVA_PARSER = Parser()
+# JAVA_PARSER.set_language(JAVA_LANGUAGE)
+# RUBY_PARSER = Parser()
+# RUBY_PARSER.set_language(RUBY_LANGUAGE)
 
 
 def download_codesearchnet_dataset(dataset_dir):