title: "Finnish model" description: "Train spaCy model for Finnish on UD Finnish TDT" vars: vector_size: 50000 vector_dim: 300 max_texts: 4000000 texts_per_batch: 250000 minn: 4 maxn: 5 max_steps: 20000 pretrain_max_steps: 60 pretrain_max_texts: 100000 treebank: "UD_Finnish-TDT" corpus_ner: "turku-one" train_name: "fi_tdt-ud-train" dev_name: "fi_tdt-ud-dev" test_name: "fi_tdt-ud-test" n_threads: 4 gpu_id: -1 floret_epochs: 5 floret_min_count: 200 directories: ["assets", "corpus", "data", "metrics", "training"] assets: - dest: "assets/${vars.treebank}" git: repo: "https://github.com/UniversalDependencies/${vars.treebank}" branch: "r2.11" path: "" - dest: "assets/${vars.corpus_ner}" git: repo: "https://github.com/TurkuNLP/turku-one.git" branch: "main" path: "" workflows: floret-vectors: - download-mc4-fi - train-floret-mc4 pretrain: - download-mc4-fi - count-word-frequencies - init-lexdata - init-floret-vectors - convert-mc4-jsonl - pretrain-mc4 train-pipeline: - download-mc4-fi - count-word-frequencies - init-lexdata - init-floret-vectors - convert - convert-ner - train - train-ner - merge-parser-and-ner - functional-tests - evaluate commands: - name: "download-mc4-fi" help: "Download a subset of the MC4 corpus" script: - "rm -rf corpus/mc4/raw/" - "mkdir -p corpus/mc4/raw" - "python -m tools.download_huggingface mc4 fi ${vars.max_texts} ${vars.texts_per_batch} corpus/mc4/raw" outputs: - "corpus/mc4/raw" - name: "train-floret-mc4" help: "Train floret vectors on the MC4 corpus" script: - "rm -rf corpus/mc4/tokenized/" - "mkdir -p corpus/mc4/tokenized" - "python -m tools.tokenize_fi --threads ${vars.n_threads} corpus/mc4/raw/ corpus/mc4/tokenized" - "tools/merge_text_batches.sh corpus/mc4/tokenized/ corpus/mc4/tokenized/merged" - "floret cbow -mode floret -hashCount 2 -bucket ${vars.vector_size} -minn ${vars.minn} -maxn ${vars.maxn} -minCount ${vars.floret_min_count} -dim ${vars.vector_dim} -neg 10 -epoch ${vars.floret_epochs} -thread ${vars.n_threads} -input corpus/mc4/tokenized/merged -output vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}" - "gzip -f vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}.floret" - "rm -rf corpus/mc4/tokenized/" deps: - "corpus/mc4/raw" outputs: - "vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}.floret.gz" - name: "convert-mc4-jsonl" help: "Convert the MC4 corpus to JSONL format" script: - "tools/raw_text_to_jsonl.sh corpus/mc4/raw corpus/mc4/mc4_${vars.pretrain_max_texts}.jsonl ${vars.pretrain_max_texts}" deps: - "corpus/mc4/raw" outputs: - "corpus/mc4/mc4_${vars.pretrain_max_texts}.jsonl" - name: "pretrain-mc4" help: "Pretrain the tok2vec component" script: - "rm -rf training/pretrain" - "mkdir -p training/pretrain" - "python -m spacy pretrain configs/fi.cfg training/pretrain --code fi/fi.py --paths.pretrain corpus/mc4/mc4_${vars.pretrain_max_texts}.jsonl --paths.vectors data/vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}-floret --paths.vocab_lookups data/vocab/lookups --pretraining.max_epochs ${vars.pretrain_max_steps} --gpu-id ${vars.gpu_id}" - "cp training/pretrain/model${vars.pretrain_max_steps}.bin pretrain/weights.bin" deps: - "configs/fi.cfg" - "corpus/mc4/mc4_${vars.pretrain_max_texts}.jsonl" - "data/vocab/lookups/lexeme_prob.json" - "data/vocab/lookups/lexeme_settings.json" - "data/vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}-floret" outputs: - "pretrain/weights.bin" - 
name: "count-word-frequencies" script: - "mkdir -p data/word_frequencies" - "tools/frequencies.sh corpus/mc4/raw data/word_frequencies/finnish_vocab.txt.gz" deps: - "corpus/mc4/raw" outputs: - "data/word_frequencies/finnish_vocab.txt.gz" - name: "init-lexdata" script: - "mkdir -p data/vocab/lookups" - "python -m tools.create_lexdata data/word_frequencies/finnish_vocab.txt.gz data/vocab/lookups 500000" deps: - "data/word_frequencies/finnish_vocab.txt.gz" outputs: - "data/vocab/lookups/lexeme_prob.json" - "data/vocab/lookups/lexeme_settings.json" - name: "init-floret-vectors" help: "Create floret embeddings" script: - "mkdir -p data/vectors" - "python -m spacy init vectors fi vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}.floret.gz data/vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}-floret --mode floret --name fi_web_floret.vectors" deps: - "vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}.floret.gz" outputs: - "data/vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}-floret" - name: "convert" help: "Convert the data to spaCy's format" script: - "mkdir -p corpus/${vars.treebank}/preprocessed corpus/${vars.treebank}/spacy" - "python tools/preprocess_UD-TDT.py --trainset assets/${vars.treebank}/${vars.train_name}.conllu corpus/${vars.treebank}/preprocessed/${vars.train_name}.conllu" - "python -m spacy convert corpus/${vars.treebank}/preprocessed/${vars.train_name}.conllu corpus/${vars.treebank}/spacy --n-sents 6" - "mv corpus/${vars.treebank}/spacy/${vars.train_name}.spacy corpus/${vars.treebank}/spacy/train.spacy" - "python tools/preprocess_UD-TDT.py assets/${vars.treebank}/${vars.dev_name}.conllu corpus/${vars.treebank}/preprocessed/${vars.dev_name}.conllu" - "python -m spacy convert corpus/${vars.treebank}/preprocessed/${vars.dev_name}.conllu corpus/${vars.treebank}/spacy --n-sents 6" - "mv corpus/${vars.treebank}/spacy/${vars.dev_name}.spacy corpus/${vars.treebank}/spacy/dev.spacy" - "python tools/preprocess_UD-TDT.py assets/${vars.treebank}/${vars.test_name}.conllu corpus/${vars.treebank}/preprocessed/${vars.test_name}.conllu" - "python -m spacy convert corpus/${vars.treebank}/preprocessed/${vars.test_name}.conllu corpus/${vars.treebank}/spacy --n-sents 6" - "mv corpus/${vars.treebank}/spacy/${vars.test_name}.spacy corpus/${vars.treebank}/spacy/test.spacy" deps: - "assets/${vars.treebank}/" outputs: - "corpus/${vars.treebank}/spacy/train.spacy" - "corpus/${vars.treebank}/spacy/dev.spacy" - "corpus/${vars.treebank}/spacy/test.spacy" - name: "train" help: "Train the model" script: - "python -m spacy train configs/fi.cfg --output training/${vars.treebank}/ --paths.train corpus/${vars.treebank}/spacy/train.spacy --paths.dev corpus/${vars.treebank}/spacy/dev.spacy --paths.init_tok2vec pretrain/weights.bin --paths.vectors data/vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}-floret --paths.vocab_lookups data/vocab/lookups --code fi/fi.py --gpu-id ${vars.gpu_id} --training.max_steps ${vars.max_steps}" deps: - "configs/fi.cfg" - "corpus/${vars.treebank}/spacy/train.spacy" - "corpus/${vars.treebank}/spacy/dev.spacy" - "pretrain/weights.bin" - "data/vocab/lookups/lexeme_prob.json" - "data/vocab/lookups/lexeme_settings.json" - "data/vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}-floret" outputs: - "training/${vars.treebank}/model-best" - name: "convert-ner" help: "Convert the NER corpus to 
spaCy's format" script: - "mkdir -p corpus/${vars.corpus_ner}/spacy" - "python -m spacy convert assets/${vars.corpus_ner}/data/conll/train.tsv corpus/${vars.corpus_ner}/spacy --converter ner --n-sents 10" - "python -m spacy convert assets/${vars.corpus_ner}/data/conll/dev.tsv corpus/${vars.corpus_ner}/spacy --converter ner --n-sents 10" - "python -m spacy convert assets/${vars.corpus_ner}/data/conll/test.tsv corpus/${vars.corpus_ner}/spacy --converter ner --n-sents 10" deps: - "assets/${vars.corpus_ner}/" outputs: - "corpus/${vars.corpus_ner}/spacy/train.spacy" - "corpus/${vars.corpus_ner}/spacy/dev.spacy" - "corpus/${vars.corpus_ner}/spacy/test.spacy" - name: "train-ner" help: "Train the NER model" script: - "python -m spacy train configs/fi-ner.cfg --output training/${vars.corpus_ner}/ --paths.train corpus/${vars.corpus_ner}/spacy/train.spacy --paths.dev corpus/${vars.corpus_ner}/spacy/dev.spacy --paths.init_tok2vec pretrain/weights.bin --paths.vectors data/vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}-floret --paths.vocab_lookups data/vocab/lookups --code fi/fi.py --gpu-id ${vars.gpu_id} --training.max_steps ${vars.max_steps}" deps: - "configs/fi-ner.cfg" - "corpus/${vars.corpus_ner}/spacy/train.spacy" - "corpus/${vars.corpus_ner}/spacy/dev.spacy" - "pretrain/weights.bin" - "data/vocab/lookups/lexeme_prob.json" - "data/vocab/lookups/lexeme_settings.json" - "data/vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}-floret" outputs: - "training/${vars.corpus_ner}/model-best" - name: "merge-parser-and-ner" help: "Merge the parser and NER models into one model" script: - "spacy assemble configs/merged.cfg training/merged --paths.init_tok2vec pretrain/weights.bin --paths.vectors data/vectors/fi-${vars.vector_dim}-${vars.vector_size}-minn${vars.minn}-maxn${vars.maxn}-floret --paths.vocab_lookups data/vocab/lookups --code fi/fi.py" deps: - "training/${vars.treebank}/model-best" - "training/${vars.corpus_ner}/model-best" outputs: - "training/merged" - name: "functional-tests" help: "Run functional tests to check that all capabilities are include in the trained model" script: - "python -m pytest tests/functional" deps: - "training/merged" - name: "evaluate" help: "Evaluate the full model" script: - "mkdir -p metrics/${vars.treebank}" - "mkdir -p metrics/${vars.corpus_ner}" - "python -m spacy evaluate training/merged corpus/${vars.treebank}/spacy/dev.spacy --output metrics/${vars.treebank}/dev.json --code fi/fi.py --gpu-id ${vars.gpu_id}" - "python -m spacy evaluate training/merged corpus/${vars.treebank}/spacy/test.spacy --output metrics/${vars.treebank}/test.json --code fi/fi.py --gpu-id ${vars.gpu_id}" - "python -m spacy evaluate training/merged corpus/${vars.corpus_ner}/spacy/dev.spacy --output metrics/${vars.corpus_ner}/dev.json --code fi/fi.py --gpu-id ${vars.gpu_id}" - "python -m spacy evaluate training/merged corpus/${vars.corpus_ner}/spacy/test.spacy --output metrics/${vars.corpus_ner}/test.json --code fi/fi.py --gpu-id ${vars.gpu_id}" deps: - "corpus/${vars.treebank}/spacy/dev.spacy" - "corpus/${vars.treebank}/spacy/test.spacy" - "corpus/${vars.corpus_ner}/spacy/dev.spacy" - "corpus/${vars.corpus_ner}/spacy/test.spacy" - "training/merged/meta.json" outputs: - "metrics/${vars.corpus_ner}/dev.json" - "metrics/${vars.corpus_ner}/test.json"