#!/bin/bash
# this script defines configuration options
######################
## BASE DIRECTORIES ##
######################
# data directory (string)
DIR_DATA=${DIR_ROOT}/data
# tools directory (string)
DIR_TOOLS=${DIR_ROOT}/tools
# models directory (string)
DIR_MODELS=${DIR_ROOT}/models
# utils directory (string)
DIR_UTILS=${DIR_ROOT}/utils
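# note: this file only defines variables and is meant to be sourced by the
# project's run scripts; DIR_ROOT is assumed to be set by the caller first,
# e.g. (hypothetical driver snippet):
#   export DIR_ROOT=$(pwd)
#   source ./config.sh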
#############
## UPDATES ##
#############
# downloads and processes data from the PMB, overwriting existing data (boolean, default: 0)
GET_PMB=0
# re-processes the additional data, overwriting existing data (boolean, default: 0)
GET_EXTRA=0
# downloads and processes word embeddings, overwriting existing ones (boolean, default: 0)
GET_EMBS=0
# downloads and installs required tools, overwriting previous versions (boolean, default: 0)
GET_TOOLS=0
# trains a semantic tagger model, overwriting existing models (boolean, default: 0)
GET_MODEL=0
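# for example, to force a complete rebuild on the next run, all update flags
# can be flipped at once (sketch; each flag overwrites the corresponding data):
#   GET_PMB=1; GET_EXTRA=1; GET_EMBS=1; GET_TOOLS=1; GET_MODEL=1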
###########################
## PARALLEL MEANING BANK ##
###########################
# use semantically tagged data from the PMB (boolean, default: 1)
PMB_MAIN_DATA=1
# version of the PMB Universal Semantic Tags release to use (string)
# the only version currently available is "0.1.0"
PMB_VER="0.1.0"
# root directory in which to store the PMB (string)
PMB_ROOT=${DIR_DATA}/pmb/sem-${PMB_VER}
# directory in which to store data extracted from the PMB (string)
PMB_EXTDIR=${DIR_DATA}/pmb
# codes of PMB languages for which to extract tagged sentences (array)
# allowed values: "en", "de", "it", "nl"
PMB_LANGS=("en")
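# for example, to extract tagged sentences for both English and German:
#   PMB_LANGS=("en" "de")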
# proportion of PMB tagged sentences to include in the test set (float, default: 0.10)
# the remaining sentences are included in the training set
PMB_TEST_SIZE=0.10
# use additional semantically tagged data (boolean, default: 0)
# set this option to 0 if you do not have access to additional data
PMB_EXTRA_DATA=0
# directories with additional semantically tagged data (array)
# each listed directory is assumed to contain a number of files
# each file is assumed to hold one [TAG]\t[WORD] pair per line
# an empty line marks the end of a sentence
PMB_EXTRA_SRC=()
# languages corresponding to each directory of extra data (array)
# allowed values: "en", "de", "it", "nl"
PMB_EXTRA_LANGS=()
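# example with hypothetical directories (order must match between the two arrays):
#   PMB_EXTRA_SRC=("${DIR_DATA}/extra/silver_en" "${DIR_DATA}/extra/silver_de")
#   PMB_EXTRA_LANGS=("en" "de")
# a file in those directories would contain lines such as "PRO\tI" and "NOW\tam"
# (tags here are illustrative), with an empty line after each sentence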
################
## EMBEDDINGS ##
################
# whether or not to use word embeddings (boolean, default: 1)
EMB_USE_WORDS=1
# whether or not to use character embeddings (boolean, default: 1)
EMB_USE_CHARS=1
# pretrained word embeddings for each of the PMB languages (array)
# the files listed are assumed to be in the same order as PMB_LANGS
# default embeddings are used when a given string is empty or does not match a file
EMB_WORD_PRETRAINED=("")
# pretrained character embeddings for each of the PMB languages (array)
# the files listed are assumed to be in the same order as PMB_LANGS
# default embeddings are used when a given string is empty or does not match a file
EMB_CHAR_PRETRAINED=("")
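# example with hypothetical custom embedding files for English (entries must
# appear in the same order as PMB_LANGS):
#   EMB_WORD_PRETRAINED=("${DIR_DATA}/embeddings/my_en_words.txt")
#   EMB_CHAR_PRETRAINED=("${DIR_DATA}/embeddings/my_en_chars.txt")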
# root directory in which to store word embeddings for English (string)
# GloVe embeddings are only used if no pretrained embeddings are given
# languages other than English use Polyglot embeddings instead
EMB_ROOT=${DIR_DATA}/embeddings
# version of the GloVe word embeddings to use as the default for English (string)
# allowed values: "glove.6B.{50/100/200/300}d", "glove.42B.300d", "glove.840B.300d"
EMB_GLOVE_MODEL="glove.840B.300d"
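# a smaller model downloads considerably faster and may be enough for quick
# experiments, e.g.:
#   EMB_GLOVE_MODEL="glove.6B.100d"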
##########################
## TRAINING AND TESTING ##
##########################
# proportion of sentences in the training set to use for testing purposes (float, default: 0.00)
# these sentences are held out from the training set and evaluated after training
RUN_TEST_SIZE=0.00
# proportion of sentences in the training set to use for development purposes (float, default: 0.00)
# these sentences are held out from the training set and evaluated after each training epoch
RUN_DEV_SIZE=0.00
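# for example, holding out 10% of the training sentences for development and
# another 10% for internal testing:
#   RUN_TEST_SIZE=0.10
#   RUN_DEV_SIZE=0.10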
# run grid search for hyperparameter optimization (boolean, default: 0)
# grid search is time-consuming and may override the hyperparameters defined in this file
# otherwise, the hyperparameters defined here are shared by the models for all languages
RUN_GRID_SEARCH=0
# maximum sentence length allowed, as a percentile of the sentence-length distribution (float, default: 0.95)
# the number of word slots used for word-based features is derived from this value
RUN_SENT_LEN=0.95
# maximum word length allowed, as a percentile of the word-length distribution (float, default: 0.98)
# the number of character slots used for character-based features is derived from this value
RUN_WORD_LEN=0.98
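# illustration (based on the interpretation above): if 95% of the training
# sentences have at most 42 words, word features are padded or truncated to 42
# slots; the exact cut-off logic is defined by the training code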
# handle multi-word expressions (boolean, default: 1)
RUN_MWE=1
# depth of the residual network applied to character embedding features (int, default: 6)
# the residual network helps turn character embeddings into word-like representations
RUN_RESNET_DEPTH=6
######################
## MODEL PARAMETERS ##
######################
# type of neural model to use (string)
# allowed values: "lstm", "blstm", "gru", "bgru"
MODEL_TYPE="bgru"
# directory in which to store the trained model (string)
MODEL_ROOT=${DIR_MODELS}/bin
# number of training epochs (int, default: 10)
MODEL_EPOCHS=10
# number of units in the first layer of the neural model (int, default: 300)
MODEL_SIZE=300
# number of recurrent layers in the neural model (int, default: 1)
MODEL_LAYERS=1
# standard deviation of the normal distribution used for additive noise (float, default: 0.0)
MODEL_SIGMA=0.0
# activation function on hidden layers (string)
# allowed values: "sigmoid", "tanh", "relu"
MODEL_ACTIVATION_HIDDEN="relu"
# type of output layer (string)
# allowed values: "softmax", "crf"
MODEL_ACTIVATION_OUTPUT="crf"
# loss function (string)
# allowed values: "mean_squared_error", "mean_absolute_error", "categorical_hinge", "categorical_cross_entropy"
MODEL_LOSS="categorical_cross_entropy"
# optimizer (string)
# allowed values: "sgd", "rmsprop", "adam", "adamax", "nadam"
MODEL_OPTIMIZER="adam"
# dropout rate on each layer (float, default: 0.10)
MODEL_DROPOUT=0.10
# batch size (int, default: 150)
MODEL_BATCH_SIZE=150
# use batch normalization (boolean, default: 1)
MODEL_BATCH_NORMALIZATION=1
# Keras verbosity mode (int, default: 1)
MODEL_VERBOSE=1
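# example alternative setup (sketch, using only values documented as allowed
# above): a 2-layer bidirectional LSTM with a softmax output layer
#   MODEL_TYPE="blstm"
#   MODEL_LAYERS=2
#   MODEL_ACTIVATION_OUTPUT="softmax"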