Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hyperparameter optimization for NN ensemble #569

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Initial rough version of hyperparameter optimization for NN ensemble. Fixes #435
  • Loading branch information
osma committed Feb 11, 2022
commit e7858353fce558e98dcb81fb18c0bef4a433a325
86 changes: 83 additions & 3 deletions annif/backend/nn_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from annif.suggestion import VectorSuggestionResult
from . import backend
from . import ensemble
from . import hyperopt


def idx_to_key(idx):
Expand Down Expand Up @@ -76,6 +77,82 @@ def __len__(self):
return int(np.ceil(self._counter / self._batch_size))


class NNEnsembleOptimizer(hyperopt.HyperparameterOptimizer):
    """Hyperparameter optimizer for the NN ensemble backend.

    Precomputes the source-project suggestion vectors once (in _prepare),
    then trains and evaluates a fresh NN ensemble model per Optuna trial
    (in _objective)."""

    def _prepare(self, n_jobs=1):
        """Collect suggestion score vectors and gold-standard subjects for
        every document in the corpus, using up to n_jobs parallel workers.

        Populates self._score_vectors and self._gold_subjects, which
        _objective later reuses for every trial so the source projects only
        need to be queried once."""
        # sources maps project_id -> weight, parsed from the backend config
        sources = dict(
            annif.util.parse_sources(self._backend.params['sources']))

        # initialize the source projects before forking, to save memory
        for project_id in sources.keys():
            project = self._backend.project.registry.get_project(project_id)
            project.initialize(parallel=True)

        # limit=None, threshold=0.0: keep all suggestions from each source
        psmap = annif.parallel.ProjectSuggestMap(
            self._backend.project.registry,
            list(sources.keys()),
            backend_params=None,
            limit=None,
            threshold=0.0)

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        self._score_vectors = []
        self._gold_subjects = []

        # imap_unordered: document order is not preserved, but each score
        # vector is appended together with its own gold subject set, so the
        # pairing stays consistent.
        with pool_class(jobs) as pool:
            for hits, uris, labels in pool.imap_unordered(
                    psmap.suggest, self._corpus.documents):
                doc_scores = []
                for project_id, p_hits in hits.items():
                    vector = p_hits.as_vector(self._backend.project.subjects)
                    # sqrt + weight scaling mirrors the input preprocessing
                    # used by the NN ensemble backend itself — TODO confirm
                    # this matches the backend's training-time transform
                    doc_scores.append(np.sqrt(vector)
                                      * sources[project_id]
                                      * len(sources))
                # transpose to shape (n_subjects, n_sources) — assumed to be
                # the per-document input shape the model expects
                score_vector = np.array(doc_scores,
                                        dtype=np.float32).transpose()
                subjects = annif.corpus.SubjectSet((uris, labels))
                self._score_vectors.append(score_vector)
                self._gold_subjects.append(subjects)

    def _objective(self, trial):
        """Objective function for one Optuna trial.

        Samples a hyperparameter combination, trains a new NN ensemble model
        on the stored LMDB training data, evaluates it on the precomputed
        score vectors and returns the value of self._metric."""
        sources = annif.util.parse_sources(self._backend.params['sources'])
        # search space: network size, regularization and training length;
        # the optimizer itself is fixed to adam
        params = {
            'nodes': trial.suggest_int('nodes', 50, 200),
            'dropout_rate': trial.suggest_float('dropout_rate', 0.0, 0.5),
            'epochs': trial.suggest_int('epochs', 5, 20),
            'optimizer': 'adam'
        }
        model = self._backend._create_model(sources, params)

        # train on the data previously written to the backend's LMDB store
        # (first argument True presumably means read-only — verify against
        # _open_lmdb's signature)
        env = self._backend._open_lmdb(True,
                                       self._backend.params['lmdb_map_size'])
        with env.begin(buffers=True) as txn:
            seq = LMDBSequence(txn, batch_size=32)
            model.fit(seq, verbose=0, epochs=params['epochs'])

        # evaluate the trial model on the vectors prepared in _prepare
        batch = annif.eval.EvaluationBatch(self._backend.project.subjects)
        for goldsubj, score_vector in zip(self._gold_subjects,
                                          self._score_vectors):

            # expand_dims adds a batch axis of size 1 for predict
            results = model.predict(
                np.expand_dims(score_vector, 0))
            output = VectorSuggestionResult(results[0])
            batch.evaluate(output, goldsubj)
        eval_results = batch.results(metrics=[self._metric])
        return eval_results[self._metric]

    def _postprocess(self, study):
        """Turn the best trial of the finished study into an HPRecommendation
        with config-file style lines for the tuned hyperparameters."""
        bp = study.best_params
        lines = [
            f"nodes={bp['nodes']}",
            f"dropout_rate={bp['dropout_rate']}",
            f"epochs={bp['epochs']}"
        ]
        return hyperopt.HPRecommendation(lines=lines, score=study.best_value)


class MeanLayer(Layer):
"""Custom Keras layer that calculates mean values along the 2nd axis."""
def call(self, inputs):
Expand All @@ -84,7 +161,8 @@ def call(self, inputs):

class NNEnsembleBackend(
backend.AnnifLearningBackend,
ensemble.BaseEnsembleBackend):
ensemble.BaseEnsembleBackend,
hyperopt.AnnifHyperoptBackend):
"""Neural network ensemble backend that combines results from multiple
projects"""

Expand All @@ -105,6 +183,9 @@ class NNEnsembleBackend(
# defaults for uninitialized instances
_model = None

def get_hp_optimizer(self, corpus, metric):
return NNEnsembleOptimizer(self, corpus, metric)

def default_params(self):
params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
params.update(self.DEFAULT_PARAMETERS)
Expand Down Expand Up @@ -138,8 +219,6 @@ def _merge_hits_from_sources(self, hits_from_sources, params):
return VectorSuggestionResult(results[0])

def _create_model(self, sources, params):
self.info("creating NN ensemble model")

inputs = Input(shape=(len(self.project.subjects), len(sources)))

flat_input = Flatten()(inputs)
Expand Down Expand Up @@ -171,6 +250,7 @@ def _create_model(self, sources, params):

def _train(self, corpus, params, jobs=0):
sources = annif.util.parse_sources(params['sources'])
self.info("creating NN ensemble model")
self._model = self._create_model(sources, params)
self._fit_model(
corpus,
Expand Down