# project.yml

# Human-readable project metadata.
title: "FrontPage: A Prodigy project to make a personal front-page."
description: "Using Prodigy, spaCy, and friends ... this project allows you to make your own frontpage of the internet."

# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  # File written by the "preprocess" command and read by the annotate commands.
  annot_stream: "raw/content.jsonl"
  # spaCy pipeline package name (the annotate commands pass this same name to Prodigy).
  model: "en_core_web_sm"
  # Folder holding train.spacy, dev.spacy and config.cfg (outputs of "data-to-spacy").
  spacy_folder: "./spacy-data"
  # Folder receiving frontpage.jsonl and index.html ("content" and "build" commands).
  site_folder: "./docs"
  # Folder under which "spacy-train" declares model-best and model-last as outputs.
  model_folder: "./training"

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories:
  - "scripts"
  - "assets"
  - "data"
  - "training"
  - "raw"

# Assets that should be downloaded or available in the directory. You can replace
# this with your own input data.
assets:
  # Placeholder entry: both fields are empty, so no asset location is declared.
  - dest: ""
    description: ""

# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  # Runs scripts/download_arxiv.py once per topic. Each call pairs a --query
  # search expression with the --tag value(s) handed to the script for it.
  # Folded scalars (>-) join the wrapped lines with single spaces, so every
  # entry is still one single-line shell command.
  - name: "download"
    help: "Download data from sources."
    script:
      - >-
        python scripts/download_arxiv.py
        --query 'ti:dataset OR ti:corpus OR ti:database OR abs:"a new dataset"'
        --tag dataset
      - >-
        python scripts/download_arxiv.py
        --query 'machine AND learning AND security OR machine AND learning AND secure'
        --tag ml,security
      - >-
        python scripts/download_arxiv.py
        --query 'machine AND learning AND video AND games OR machine AND learning AND gaming'
        --tag ml,gaming
      - >-
        python scripts/download_arxiv.py
        --query 'abs:"chatgpt"'
        --tag ml,prompteng
      - >-
        python scripts/download_arxiv.py
        --query 'prompt AND engineering'
        --tag ml,prompteng
      - >-
        python scripts/download_arxiv.py
        --query 'data AND quality'
        --tag data-quality
    deps:
      - "scripts/download_arxiv.py"
      - "scripts/util.py"
  # Runs scripts/preprocess.py with the assets/ folder and the annotation
  # stream path as arguments; the stream file is the declared output.
  - name: "preprocess"
    help: "Process data into Prodigy format."
    script:
      - "python scripts/preprocess.py assets/ ${vars.annot_stream}"
    deps:
      - "assets"
      - "scripts/preprocess.py"
    outputs:
      - "${vars.annot_stream}"
  # Annotation session for the "dataset" topic, using the custom textcat_topic
  # recipe loaded via -F from recipes/textcat_topic.py. dataset_papers is the
  # same name later passed to data-to-spacy and the train commands.
  - name: "annotate-dataset"
    help: "Annotate if content is about a new dataset."
    script:
      # The pipeline name comes from vars.model so it is configured in one place.
      - "prodigy textcat_topic dataset_papers ${vars.annot_stream} ${vars.model} patterns/dataset.jsonl dataset,arxiv dataset -F recipes/textcat_topic.py"
    deps:
      - ${vars.annot_stream}
      # Files the command line reads; listing them surfaces a missing file
      # before Prodigy is launched.
      - "patterns/dataset.jsonl"
      - "recipes/textcat_topic.py"
  # Annotation session for the "prompteng" topic, using the custom
  # textcat_topic recipe loaded via -F from recipes/textcat_topic.py.
  # prompt_papers is the same name later passed to data-to-spacy and the train
  # commands.
  - name: "annotate-prompt-eng"
    help: "Annotate if content is about prompt engineering."
    script:
      # The pipeline name comes from vars.model so it is configured in one place.
      - "prodigy textcat_topic prompt_papers ${vars.annot_stream} ${vars.model} patterns/prompt.jsonl prompteng,arxiv prompteng -F recipes/textcat_topic.py"
    deps:
      - ${vars.annot_stream}
      # Files the command line reads; listing them surfaces a missing file
      # before Prodigy is launched.
      - "patterns/prompt.jsonl"
      - "recipes/textcat_topic.py"
  # Exports the dataset_papers annotations into the spaCy data folder.
  # Uses the hyphenated command name "db-out", in line with the other Prodigy
  # commands in this file (data-to-spacy, train-curve).
  - name: "db-out"
    help: "Export Prodigy annotations to a jsonl file."
    script:
      - "prodigy db-out dataset_papers ${vars.spacy_folder}"
  # Converts the dataset_papers and prompt_papers annotations into spaCy
  # training files, passing --eval-split 0.2. The folded scalar (>-) joins the
  # wrapped lines into one single-line command.
  - name: "data-to-spacy"
    help: "Export Prodigy annotations to spacy format."
    script:
      - >-
        prodigy data-to-spacy ${vars.spacy_folder}
        --textcat-multilabel dataset_papers,prompt_papers
        --eval-split 0.2
    deps:
      - "data"
    outputs:
      - "${vars.spacy_folder}/train.spacy"
      - "${vars.spacy_folder}/dev.spacy"
      - "${vars.spacy_folder}/config.cfg"
  # Trains from the config and train/dev files in the spaCy data folder.
  # --output uses vars.model_folder, the same variable as the declared outputs
  # below, so the two cannot drift apart if the folder is changed.
  - name: "spacy-train"
    help: "Train a spaCy textcat model with Prodigy annotations."
    script:
      - "python -m spacy train ${vars.spacy_folder}/config.cfg --output ${vars.model_folder} --paths.train ${vars.spacy_folder}/train.spacy --paths.dev ${vars.spacy_folder}/dev.spacy --verbose"
    deps:
      - "${vars.spacy_folder}/train.spacy"
      - "${vars.spacy_folder}/dev.spacy"
      - "${vars.spacy_folder}/config.cfg"
    outputs:
      - "${vars.model_folder}/model-best"
      - "${vars.model_folder}/model-last"
  # Runs Prodigy's train-curve on the dataset_papers and prompt_papers
  # annotations with 5 samples and shows the resulting plot. The help text is
  # distinct from "prodigy-train" so the two commands can be told apart.
  - name: "prodigy-train-curve"
    help: "Train a textcat model on growing portions of the Prodigy annotations and plot the training curve."
    script:
      - "python -m prodigy train-curve --textcat-multilabel dataset_papers,prompt_papers --show-plot --n-samples 5"
  # Runs Prodigy's train command on the dataset_papers and prompt_papers
  # annotations with --label-stats. No output directory is passed.
  - name: "prodigy-train"
    help: "Train a textcat model with Prodigy annotations."
    script:
      - >-
        python -m prodigy train
        --textcat-multilabel dataset_papers,prompt_papers
        --label-stats
  # Rebuilds the frontpage content file: removes any previous frontpage.jsonl,
  # then runs scripts/attach_classes.py with the annotation stream, the
  # model-best pipeline and the target file as arguments.
  # The stream path comes from vars.annot_stream, the same variable the
  # "preprocess" command writes to, instead of a repeated literal path.
  - name: "content"
    help: "Generates the content stream for the frontpage"
    script:
      - "rm -f ${vars.site_folder}/frontpage.jsonl"
      - "python scripts/attach_classes.py ${vars.annot_stream} ${vars.model_folder}/model-best ${vars.site_folder}/frontpage.jsonl"
    deps:
      - "scripts/attach_classes.py"
      - "${vars.annot_stream}"
      - "${vars.model_folder}/model-best"
  # Runs scripts/build_frontpage.py with, in order: the content file, the
  # frontpage config, the Jinja2 template and the HTML file to write.
  # The folded scalar (>-) joins the wrapped lines into one single-line command.
  - name: "build"
    help: "Builds your frontpage."
    script:
      - >-
        python scripts/build_frontpage.py
        ${vars.site_folder}/frontpage.jsonl
        config/frontpage.yaml
        templates/frontpage.jinja2
        ${vars.site_folder}/index.html
    deps:
      - "config/frontpage.yaml"
      - "templates/frontpage.jinja2"
      - "scripts/build_frontpage.py"
      - "${vars.site_folder}/frontpage.jsonl"
  # Calls scripts/evaluate.py with dataset_papers, patterns/dataset.jsonl and
  # training/dataset/ as arguments.
  # NOTE(review): the last argument points at training/dataset/, while the
  # "spacy-train" command declares its outputs as model-best and model-last
  # under vars.model_folder — confirm this is the path the script expects.
  - name: "evaluate"
    help: "Evaluate a spaCy model and/or a matcher"
    script:
      - "python scripts/evaluate.py dataset_papers patterns/dataset.jsonl training/dataset/"

# Named sequences of the commands defined above, listed in execution order.
workflows:
  # Download the sources, then convert them into the annotation stream.
  fetch:
    - "download"
    - "preprocess"
  # Full rebuild: preprocess, export annotations, train, classify, render.
  new-frontpage:
    - "preprocess"
    - "data-to-spacy"
    - "spacy-train"
    - "content"
    - "build"
  # Same sequence as new-frontpage but without the data-to-spacy step.
  new-frontpage-github:
    - "preprocess"
    - "spacy-train"
    - "content"
    - "build"