
Commit: "RC"

guillaumegenthial committed Nov 8, 2017
1 parent 101eb84, commit dd27db3
Showing 10 changed files with 76 additions and 94 deletions.
60 changes: 30 additions & 30 deletions README.md
@@ -2,59 +2,59 @@

## Install

-Install ghostscript and magick (from source, depending on your Linux distribution) and pdflatex for evaluation
-
-https://www.imagemagick.org/script/install-source.php
+Install pdflatex (LaTeX to PDF) and ghostscript + [magick](https://www.imagemagick.org/script/install-source.php) (PDF to PNG) on Linux:

```
-sudo pip install -r requirements.txt
-sudo apt-get install texlive-latex-base
-sudo apt-get install texlive-latex-extra
+make install-linux
```

-(takes a while, installs from source)
-
-sudo apt-get install ghostscript
-sudo apt-get install libgs-dev
+On Mac, assuming you already have a LaTeX distribution installed, you should have pdflatex and ghostscript, so you just need to install magick. You can try

-wget https://www.imagemagick.org/download/ImageMagick.tar.gz
-tar -xvf ImageMagick.tar.gz
-cd ImageMagick-7.*
-./configure --with-gslib=yes
-make
-sudo make install
-sudo ldconfig /usr/local/lib
```
+make install-mac
```

-## Data
+## Data and Preprocessing

-We use the Harvard preprocessing scripts that can be found at https://lstm.seas.harvard.edu/latex/
+You can download the [prebuilt dataset from Harvard](https://zenodo.org/record/56198#.V2p0KTXT6eA) and use their preprocessing scripts found [here](https://github.com/harvardnlp/im2markup).

-First, crop and downsample the images and group them by similar shape:
+## Getting Started

+We provide a small dataset just to check the pipeline. If you haven't touched the files, run

```
-python scripts/preprocessing/preprocess_images.py --input-dir data/formula_images --output-dir data/images_processed
+make run
```

-Second, parse the formulas with the KaTeX parser:
+or perform the following steps:

+1. Build the images from the formulas, write the matching file and extract the vocabulary. __Run only once.__
```
-python scripts/preprocessing/preprocess_formulas.py --mode normalize --input-file data/im2latex_formulas.lst --output-file data/norm.formulas.lst
+python build.py
```

-Third, filter the formulas:
+2. Train on this small dataset:
```
+python train.py
```

+3. Evaluate the text metrics:
```
-python scripts/preprocessing/preprocess_filter.py --filter --image-dir data/images_processed --label-path data/norm.formulas.lst --data-path data/im2latex_train.lst --output-path data/train_filter.lst
-python scripts/preprocessing/preprocess_filter.py --filter --image-dir data/images_processed --label-path data/norm.formulas.lst --data-path data/im2latex_validate.lst --output-path data/val_filter.lst
-python scripts/preprocessing/preprocess_filter.py --filter --image-dir data/images_processed --label-path data/norm.formulas.lst --data-path data/im2latex_test.lst --output-path data/test_filter.lst
+python evaluate_txt.py
```

+4. Evaluate the image metrics:
+```
+python evaluate_img.py
+```

-## Train
+You should observe that the model starts to produce reasonable patterns of LaTeX.

-Edit the config file in configs/
+## Config

-```
-python main.py
-```
+Edit the config files in configs/ for your needs and change the names of the config files used in `build.py`, `train.py`, etc.
4 changes: 2 additions & 2 deletions build.py
@@ -5,7 +5,7 @@


if __name__ == "__main__":
-    data_config = Config("configs/small_data.json")
+    data_config = Config("configs/data_small.json")

    # datasets
    train_set = DataGenerator(
@@ -27,6 +27,6 @@
    val_set.build(buckets=data_config.buckets)

    # vocab
-    vocab_config = Config("configs/small_vocab.json")
+    vocab_config = Config("configs/vocab_small.json")
    vocab = build_vocab([train_set], min_count=vocab_config.min_count_tok)
    write_vocab(vocab, vocab_config.path_vocab)
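The renamed configs follow a `<kind>_<size>.json` convention, so pointing the pipeline at a bigger dataset is just a matter of swapping the file names passed to `Config`. A minimal sketch; the full-size file names below are hypothetical placeholders, not files this commit adds:

```
# Hypothetical sketch: run the same build step against your own configs.
# "data_full.json" and "vocab_full.json" are placeholder names.
data_config = Config("configs/data_full.json")
vocab_config = Config("configs/vocab_full.json")
```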
8 changes: 4 additions & 4 deletions configs/training_small.json
@@ -2,16 +2,16 @@
"export_name": "training.json",

"lr_method" : "Adam",
"n_epochs" : 2,
"n_epochs" : 5,
"batch_size" : 3,
"dropout" : 1,
"metric_val" : "perplexity",
"clip" : -1,

"lr_init" : 1e-4,
"lr_min" : 1e-4,
"lr_init" : 1e-3,
"lr_min" : 1e-3,
"start_decay" : 6,
"end_decay" : 13,
"lr_warm" : 1e-4,
"lr_warm" : 1e-3,
"end_warm" : 0
}
8 changes: 2 additions & 6 deletions evaluate_txt.py
@@ -10,7 +10,7 @@

if __name__ == "__main__":
    # restore config and model
-    dir_output = "results/google/under_50_vanilla_positional/"
+    dir_output = "results/small/"

    config_data = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
@@ -21,10 +21,6 @@
    model.build_pred()
    model.restore_session(dir_output + "model.weights/")

-    # custom
-    # config_data.max_iter = 20
-    # dir_output = "tmp/"
-
    # load dataset
    test_set = DataGenerator(path_formulas=config_data.path_formulas_test,
                             dir_images=config_data.dir_images_test, img_prepro=greyscale,
@@ -35,7 +31,7 @@

    # use model to write predictions in files
    config_eval = Config({"dir_answers": dir_output + "formulas_test/",
-        "batch_size": 20})
+                          "batch_size": 20})
    files, perplexity = model.write_prediction(config_eval, test_set)
    formula_ref, formula_hyp = files[0], files[1]

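The diff stops before the scoring itself; given the reference and hypothesis files returned by `write_prediction`, a text metric can be computed along these lines (a sketch assuming `nltk` and one tokenized formula per line; the paths are hypothetical and this is not the repo's own scoring code):

```
# Hedged sketch: corpus BLEU over the two prediction files.
from nltk.translate.bleu_score import corpus_bleu

formula_ref = "results/small/formulas_test/ref.txt"  # hypothetical path
formula_hyp = "results/small/formulas_test/hyp.txt"  # hypothetical path

with open(formula_ref) as f_ref, open(formula_hyp) as f_hyp:
    references = [[line.split()] for line in f_ref]  # one reference per formula
    hypotheses = [line.split() for line in f_hyp]

print("BLEU-4: {:.2f}".format(100 * corpus_bleu(references, hypotheses)))
```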
49 changes: 0 additions & 49 deletions hp_search.py

This file was deleted.

33 changes: 33 additions & 0 deletions makefile
@@ -0,0 +1,33 @@
install-linux:
	sudo pip install -r requirements.txt
	sudo apt-get install texlive-latex-base
	sudo apt-get install texlive-latex-extra

	sudo apt-get install ghostscript
	sudo apt-get install libgs-dev

	wget https://www.imagemagick.org/download/ImageMagick.tar.gz
	tar -xvf ImageMagick.tar.gz
	cd ImageMagick-7.*; \
	./configure --with-gslib=yes; \
	make; \
	sudo make install; \
	sudo ldconfig /usr/local/lib
	rm ImageMagick.tar.gz
	rm -r ImageMagick-7.*

install-mac:
	wget https://www.imagemagick.org/download/ImageMagick.tar.gz
	tar -xvf ImageMagick.tar.gz
	cd ImageMagick-7.*; \
	./configure --with-gslib=yes; \
	make; \
	sudo make install
	rm ImageMagick.tar.gz
	rm -r ImageMagick-7.*

run:
	python build.py
	python train.py
	python evaluate_txt.py
	python evaluate_img.py
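With these targets, a fresh Linux checkout only needs `make install-linux` followed by `make run`, which chains the four pipeline scripts. Since make runs each recipe line in its own shell, the `; \` continuations are what keep the `cd ImageMagick-7.*` in effect through the configure/make/install sequence; the standalone `rm` lines then run back in the top-level directory. The recipes assume `sudo`, `apt-get` (on Linux) and `wget` are available.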
5 changes: 3 additions & 2 deletions model/components/beam_search_decoder_cell.py
@@ -249,7 +249,8 @@ def body(time, outputs_ta, parents):
    res = tf.while_loop(
        condition,
        body,
-        loop_vars=[initial_time, initial_outputs_ta, initial_parents])
+        loop_vars=[initial_time, initial_outputs_ta, initial_parents],
+        back_prop=False)

    # unfold and stack the structure from the nested tas
    final_outputs = nest.map_structure(lambda ta: ta.stack(), res[1])
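In TensorFlow 1.x, `back_prop=False` tells `tf.while_loop` not to keep the per-iteration tensors that gradient computation would need, which saves memory in inference-only loops like this beam-search decoder; the same flag is added to the decoding loop in `dynamic_decode.py` below. A minimal self-contained sketch of the flag, not code from this repo:

```
# TF 1.x sketch: an inference-only counting loop; back_prop=False skips the
# bookkeeping that would otherwise be kept for computing gradients.
import tensorflow as tf

i = tf.constant(0)
r = tf.while_loop(lambda i: i < 10,
                  lambda i: i + 1,
                  loop_vars=[i],
                  back_prop=False)  # this loop is never differentiated through

with tf.Session() as sess:
    print(sess.run(r))  # 10
```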
@@ -284,7 +285,7 @@ def add_div_penalty(log_probs, div_gamma, div_prob, batch_size, beam_size,
    top_probs, top_inds = tf.nn.top_k(log_probs, k=vocab_size, sorted=True)
    # 2. inverse permutation to get rank of each entry
    top_inds = tf.reshape(top_inds, [-1, vocab_size])
-    index_rank = tf.map_fn(tf.invert_permutation, top_inds)
+    index_rank = tf.map_fn(tf.invert_permutation, top_inds, back_prop=False)
    index_rank = tf.reshape(index_rank, shape=[batch_size, beam_size,
                                               vocab_size])
    # 3. compute penalty
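The rank computation in this hunk deserves a worked example: `tf.nn.top_k` sorts each row in descending order and returns the original indices, and `tf.invert_permutation` then recovers each entry's rank at its original position (a TF 1.x sketch, not code from this repo):

```
# Worked example of the rank trick used by add_div_penalty above.
import tensorflow as tf

log_probs = tf.constant([[0.1, 0.7, 0.2]])
_, top_inds = tf.nn.top_k(log_probs, k=3, sorted=True)  # [[1, 2, 0]]
ranks = tf.map_fn(tf.invert_permutation, top_inds, back_prop=False)

with tf.Session() as sess:
    print(sess.run(ranks))  # [[2 0 1]]: entry 1 is the largest, so its rank is 0
```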
3 changes: 2 additions & 1 deletion model/components/dynamic_decode.py
@@ -59,7 +59,8 @@ def body(time, outputs_ta, state, inputs, finished):
        condition,
        body,
        loop_vars=[initial_time, initial_outputs_ta, initial_state,
-                   initial_inputs, initial_finished])
+                   initial_inputs, initial_finished],
+        back_prop=False)

    # get final outputs and states
    final_outputs_ta, final_state = res[1], res[2]
Binary file removed tmp/test.pdf
Binary file removed tmp/test.png
