forked from victorialslocum/frontpage
-
Notifications
You must be signed in to change notification settings - Fork 0
/
project.yml
130 lines (125 loc) · 5.25 KB
/
project.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Human-readable project metadata.
title: "FrontPage: A Prodigy project to make a personal front-page."
description: "Using Prodigy, spaCy, and friends ... this project allows you to make your own frontpage of the internet."
# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  # Stream file produced by "preprocess" and consumed by the annotate commands.
  annot_stream: 'raw/content.jsonl'
  # Name of the base spaCy pipeline (the same name is passed to the Prodigy
  # annotation recipes).
  model: 'en_core_web_sm'
  # Folder for the exported training data (train.spacy, dev.spacy, config.cfg).
  spacy_folder: './spacy-data'
  # Folder for the generated site files (frontpage.jsonl, index.html).
  site_folder: './docs'
  # Folder for the trained pipelines (model-best, model-last).
  model_folder: './training'
# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories:
- 'scripts'
- 'assets'
- 'data'
- 'training'
- 'raw'
# Assets that should be downloaded or available in the directory. You can replace
# this with your own input data.
assets:
# Placeholder entry: both fields are intentionally left as empty strings.
- dest: ''
  description: ''
# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
# Runs scripts/download_arxiv.py once per query; every run passes its own
# --query expression and a comma-separated --tag value.
- name: 'download'
  help: 'Download data from sources.'
  script:
  # Folded scalars (>-) join their lines with single spaces, so each entry
  # below is still exactly one command line.
  - >-
    python scripts/download_arxiv.py
    --query 'ti:dataset OR ti:corpus OR ti:database OR abs:"a new dataset"'
    --tag dataset
  - >-
    python scripts/download_arxiv.py
    --query 'machine AND learning AND security OR machine AND learning AND secure'
    --tag ml,security
  - >-
    python scripts/download_arxiv.py
    --query 'machine AND learning AND video AND games OR machine AND learning AND gaming'
    --tag ml,gaming
  - >-
    python scripts/download_arxiv.py
    --query 'abs:"chatgpt"'
    --tag ml,prompteng
  - >-
    python scripts/download_arxiv.py
    --query 'prompt AND engineering'
    --tag ml,prompteng
  - >-
    python scripts/download_arxiv.py
    --query 'data AND quality'
    --tag data-quality
  deps:
  - 'scripts/download_arxiv.py'
  - 'scripts/util.py'
# Calls scripts/preprocess.py with "assets/" as first argument and the
# annotation stream path as second; the stream file is the declared output.
- name: 'preprocess'
  help: 'Process data into Prodigy format.'
  script:
  - 'python scripts/preprocess.py assets/ ${vars.annot_stream}'
  deps:
  - 'assets'
  - 'scripts/preprocess.py'
  outputs:
  - '${vars.annot_stream}'
# Runs the custom textcat_topic recipe (loaded with -F from recipes/) on the
# annotation stream and stores the answers in the "dataset_papers" dataset.
- name: "annotate-dataset"
  help: "Annotate if content is about a new dataset."
  script:
  # The base pipeline is taken from vars.model instead of being hard-coded, so
  # changing the variable switches the model for every annotation command.
  - "prodigy textcat_topic dataset_papers ${vars.annot_stream} ${vars.model} patterns/dataset.jsonl dataset,arxiv dataset -F recipes/textcat_topic.py"
  deps:
  - "${vars.annot_stream}"
  # Files the command line above reads; listing them surfaces a missing file
  # before Prodigy is started.
  - "patterns/dataset.jsonl"
  - "recipes/textcat_topic.py"
# Runs the custom textcat_topic recipe (loaded with -F from recipes/) on the
# annotation stream and stores the answers in the "prompt_papers" dataset.
- name: "annotate-prompt-eng"
  help: "Annotate if content is about prompt engineering."
  script:
  # The base pipeline is taken from vars.model instead of being hard-coded, so
  # changing the variable switches the model for every annotation command.
  - "prodigy textcat_topic prompt_papers ${vars.annot_stream} ${vars.model} patterns/prompt.jsonl prompteng,arxiv prompteng -F recipes/textcat_topic.py"
  deps:
  - "${vars.annot_stream}"
  # Files the command line above reads; listing them surfaces a missing file
  # before Prodigy is started.
  - "patterns/prompt.jsonl"
  - "recipes/textcat_topic.py"
# Dumps the "dataset_papers" dataset; the target passed on the command line is
# the spacy_folder variable.
# NOTE(review): only dataset_papers is exported here, prompt_papers is not;
# confirm that this is intended.
- name: 'db-out'
  help: 'Export Prodigy annotations to a jsonl file.'
  script:
  - 'prodigy db_out dataset_papers ${vars.spacy_folder}'
# Exports both annotation datasets as multilabel textcat training data into
# spacy_folder, with an evaluation split of 0.2.
- name: 'data-to-spacy'
  help: 'Export Prodigy annotations to spacy format.'
  script:
  # Folded scalar: the lines below are joined with single spaces.
  - >-
    prodigy data-to-spacy ${vars.spacy_folder}
    --textcat-multilabel dataset_papers,prompt_papers
    --eval-split 0.2
  deps:
  - 'data'
  outputs:
  - '${vars.spacy_folder}/train.spacy'
  - '${vars.spacy_folder}/dev.spacy'
  - '${vars.spacy_folder}/config.cfg'
# Trains a spaCy pipeline from the exported config and corpora in spacy_folder
# and writes the result into model_folder.
- name: "spacy-train"
  help: "Train a spaCy textcat model with Prodigy annotations."
  script:
  # --output uses vars.model_folder so that it always matches the declared
  # outputs below (previously a hard-coded "training/").
  # Folded scalar: the lines below are joined with single spaces.
  - >-
    python -m spacy train ${vars.spacy_folder}/config.cfg
    --output ${vars.model_folder}
    --paths.train ${vars.spacy_folder}/train.spacy
    --paths.dev ${vars.spacy_folder}/dev.spacy
    --verbose
  deps:
  - "${vars.spacy_folder}/train.spacy"
  - "${vars.spacy_folder}/dev.spacy"
  - "${vars.spacy_folder}/config.cfg"
  outputs:
  - "${vars.model_folder}/model-best"
  - "${vars.model_folder}/model-last"
# Runs Prodigy's train-curve on both annotation datasets with 5 samples and
# shows the resulting plot.
- name: "prodigy-train-curve"
  # Help text describes train-curve itself rather than repeating the text of
  # the "prodigy-train" command.
  help: "Plot a training curve to check whether more annotations improve the textcat model."
  script:
  - "python -m prodigy train-curve --textcat-multilabel dataset_papers,prompt_papers --show-plot --n-samples 5"
# Runs Prodigy's train command on both annotation datasets as a multilabel
# textcat task, with --label-stats enabled.
- name: 'prodigy-train'
  help: 'Train a textcat model with Prodigy annotations.'
  script:
  - >-
    python -m prodigy train
    --textcat-multilabel dataset_papers,prompt_papers
    --label-stats
# Rebuilds frontpage.jsonl in site_folder: the old file is removed first, then
# scripts/attach_classes.py is called with the annotation stream, the
# model-best pipeline and the target file.
- name: "content"
  help: "Generates the content stream for the frontpage"
  script:
  - "rm -f ${vars.site_folder}/frontpage.jsonl"
  # The input stream is referenced through vars.annot_stream (same file that
  # "preprocess" writes) instead of repeating the literal path.
  - "python scripts/attach_classes.py ${vars.annot_stream} ${vars.model_folder}/model-best ${vars.site_folder}/frontpage.jsonl"
  deps:
  - "scripts/attach_classes.py"
  - "${vars.annot_stream}"
  - "${vars.model_folder}/model-best"
# Calls scripts/build_frontpage.py with, in order: the content stream, the
# frontpage config, the Jinja2 template and the index.html target path.
- name: 'build'
  help: 'Builds your frontpage.'
  script:
  # Folded scalar: the lines below are joined with single spaces.
  - >-
    python scripts/build_frontpage.py
    ${vars.site_folder}/frontpage.jsonl
    config/frontpage.yaml
    templates/frontpage.jinja2
    ${vars.site_folder}/index.html
  deps:
  - 'config/frontpage.yaml'
  - 'templates/frontpage.jinja2'
  - 'scripts/build_frontpage.py'
  - '${vars.site_folder}/frontpage.jsonl'
# Calls scripts/evaluate.py with the dataset name, the pattern file and a
# model path.
# NOTE(review): the model path "training/dataset/" is hard-coded and differs
# from the model-best/model-last outputs of "spacy-train"; confirm it exists.
- name: 'evaluate'
  help: 'evaluate a spaCy model and/or a matcher'
  script:
  - >-
    python scripts/evaluate.py
    dataset_papers
    patterns/dataset.jsonl
    training/dataset/
# Named workflows; each one lists command names from the "commands" section.
workflows:
  # Download the sources and convert them into the annotation stream.
  fetch:
  - 'download'
  - 'preprocess'
  # Full rebuild: export annotations, train, classify content, build the page.
  new-frontpage:
  - 'preprocess'
  - 'data-to-spacy'
  - 'spacy-train'
  - 'content'
  - 'build'
  # Same steps as new-frontpage but without the data-to-spacy export.
  new-frontpage-github:
  - 'preprocess'
  - 'spacy-train'
  - 'content'
  - 'build'