#!/bin/bash
# this script defines configuration options
######################
## BASE DIRECTORIES ##
######################
# data directory (string)
DIR_DATA=${DIR_ROOT}/data
# tools directory (string)
DIR_TOOLS=${DIR_ROOT}/tools
# models directory (string)
DIR_MODELS=${DIR_ROOT}/models
# utils directory (string)
DIR_UTILS=${DIR_ROOT}/utils
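# note: this file only defines variables and is meant to be sourced by the
# project's run scripts; DIR_ROOT is assumed to be set by the caller first,
# e.g. (hypothetical driver snippet):
#   export DIR_ROOT=$(pwd)
#   source ./config.sh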
#############
## UPDATES ##
#############
# downloads and processes data from the PMB, overwriting existing data (boolean, default: 0)
GET_PMB=0
# re-processes the additional data, overwriting existing data (boolean, default: 0)
GET_EXTRA=0
# downloads and processes word embeddings, overwriting existing ones (boolean, default: 0)
GET_EMBS=0
# downloads and installs required tools, overwriting previous versions (boolean, default: 0)
GET_TOOLS=0
# trains a semantic tagger model, overwriting existing models (boolean, default: 0)
GET_MODEL=0
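# for example, to force a complete rebuild on the next run, all update flags
# can be flipped at once (sketch; each flag overwrites the corresponding data):
#   GET_PMB=1; GET_EXTRA=1; GET_EMBS=1; GET_TOOLS=1; GET_MODEL=1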
###########################
## PARALLEL MEANING BANK ##
###########################
# use semantically tagged data from the PMB (boolean, default: 1)
PMB_MAIN_DATA=1
# version of the PMB Universal Semantic Tags release to use (string)
# the only version currently available is "0.1.0"
PMB_VER="0.1.0"
# root directory in which to store the PMB (string)
PMB_ROOT=${DIR_DATA}/pmb/sem-${PMB_VER}
# directory in which to store data extracted from the PMB (string)
PMB_EXTDIR=${DIR_DATA}/pmb
# codes of PMB languages for which to extract tagged sentences (array)
# allowed values: "en", "de", "it", "nl"
PMB_LANGS=("en")
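# for example, to extract tagged sentences for both English and German:
#   PMB_LANGS=("en" "de")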
# proportion of PMB tagged sentences to include in the test set (float, default: 0.10)
# the remaining sentences are included in the training set
PMB_TEST_SIZE=0.10
# use additional semantically tagged data (boolean, default: 0)
# set this option to 0 if you do not have access to additional data
PMB_EXTRA_DATA=0
# directories with additional semantically tagged data (array)
# each listed directory is assumed to contain a number of files
# each file is assumed to hold one [TAG]\t[WORD] pair per line
# an empty line marks the end of a sentence
PMB_EXTRA_SRC=()
# languages corresponding to each directory of extra data (array)
# allowed values: "en", "de", "it", "nl"
PMB_EXTRA_LANGS=()
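# example with hypothetical directories (order must match between the two arrays):
#   PMB_EXTRA_SRC=("${DIR_DATA}/extra/silver_en" "${DIR_DATA}/extra/silver_de")
#   PMB_EXTRA_LANGS=("en" "de")
# a file in those directories would contain lines such as "PRO\tI" and "NOW\tam"
# (tags here are illustrative), with an empty line after each sentence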
################
## EMBEDDINGS ##
################
# whether or not to use word embeddings (boolean, default: 1)
EMB_USE_WORDS=1
# whether or not to use character embeddings (boolean, default: 1)
EMB_USE_CHARS=1
# pretrained word embeddings for each of the PMB languages (array)
# the files listed are assumed to be in the same order as PMB_LANGS
# default embeddings are used when a given string is empty or does not match a file
EMB_WORD_PRETRAINED=("")
# pretrained character embeddings for each of the PMB languages (array)
# the files listed are assumed to be in the same order as PMB_LANGS
# default embeddings are used when a given string is empty or does not match a file
EMB_CHAR_PRETRAINED=("")
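# example with hypothetical custom embedding files for English (entries must
# appear in the same order as PMB_LANGS):
#   EMB_WORD_PRETRAINED=("${DIR_DATA}/embeddings/my_en_words.txt")
#   EMB_CHAR_PRETRAINED=("${DIR_DATA}/embeddings/my_en_chars.txt")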
# root directory in which to store word embeddings for English (string)
# GloVe embeddings are only used if no pretrained embeddings are given
# languages other than English use Polyglot embeddings instead
EMB_ROOT=${DIR_DATA}/embeddings
# version of the GloVe word embeddings to use as the default for English (string)
# allowed values: "glove.6B.{50/100/200/300}d", "glove.42B.300d", "glove.840B.300d"
EMB_GLOVE_MODEL="glove.840B.300d"
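# a smaller model downloads considerably faster and may be enough for quick
# experiments, e.g.:
#   EMB_GLOVE_MODEL="glove.6B.100d"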
##########################
## TRAINING AND TESTING ##
##########################
# proportion of sentences in the training set to use for testing purposes (float, default: 0.00)
# these sentences are held out from the training set and evaluated after training
RUN_TEST_SIZE=0.00
# proportion of sentences in the training set to use for development purposes (float, default: 0.00)
# these sentences are held out from the training set and evaluated after each training epoch
RUN_DEV_SIZE=0.00
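# for example, holding out 10% of the training sentences for development and
# another 10% for internal testing:
#   RUN_TEST_SIZE=0.10
#   RUN_DEV_SIZE=0.10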
# run grid search for hyperparameter optimization (boolean, default: 0)
# grid search is time-consuming and may override the hyperparameters defined in this file
# otherwise, the hyperparameters defined here are shared by the models for all languages
RUN_GRID_SEARCH=0
# maximum sentence length allowed, as a percentile of the sentence-length distribution (float, default: 0.95)
# the number of word slots used for word-based features is derived from this value
RUN_SENT_LEN=0.95
# maximum word length allowed, as a percentile of the word-length distribution (float, default: 0.98)
# the number of character slots used for character-based features is derived from this value
RUN_WORD_LEN=0.98
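# illustration (based on the interpretation above): if 95% of the training
# sentences have at most 42 words, word features are padded or truncated to 42
# slots; the exact cut-off logic is defined by the training code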
# handle multi-word expressions (boolean, default: 1)
RUN_MWE=1
# depth of the residual network applied to character embedding features (int, default: 6)
# the residual network helps turn character embeddings into word-like representations
RUN_RESNET_DEPTH=6
######################
## MODEL PARAMETERS ##
######################
# type of neural model to use (string)
# allowed values: "lstm", "blstm", "gru", "bgru"
MODEL_TYPE="bgru"
# directory in which to store the trained model (string)
MODEL_ROOT=${DIR_MODELS}/bin
# number of training epochs (int, default: 10)
MODEL_EPOCHS=10
# number of units in the first layer of the neural model (int, default: 300)
MODEL_SIZE=300
# number of recurrent layers in the neural model (int, default: 1)
MODEL_LAYERS=1
# standard deviation of the normal distribution used for additive noise (float, default: 0.0)
MODEL_SIGMA=0.0
# activation function on hidden layers (string)
# allowed values: "sigmoid", "tanh", "relu"
MODEL_ACTIVATION_HIDDEN="relu"
# type of output layer (string)
# allowed values: "softmax", "crf"
MODEL_ACTIVATION_OUTPUT="crf"
# loss function (string)
# allowed values: "mean_squared_error", "mean_absolute_error", "categorical_hinge", "categorical_cross_entropy"
MODEL_LOSS="categorical_cross_entropy"
# optimizer (string)
# allowed values: "sgd", "rmsprop", "adam", "adamax", "nadam"
MODEL_OPTIMIZER="adam"
# dropout rate on each layer (float, default: 0.10)
MODEL_DROPOUT=0.10
# batch size (int, default: 150)
MODEL_BATCH_SIZE=150
# use batch normalization (boolean, default: 1)
MODEL_BATCH_NORMALIZATION=1
# Keras verbosity mode (int, default: 1)
MODEL_VERBOSE=1
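# example alternative setup (sketch, using only values documented as allowed
# above): a 2-layer bidirectional LSTM with a softmax output layer
#   MODEL_TYPE="blstm"
#   MODEL_LAYERS=2
#   MODEL_ACTIVATION_OUTPUT="softmax"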