Add workflow to sync tutorials with deepset Cloud when they change (deepset-ai#150)

* Add script to generate txt files

* Add workflow to upload tutorial files to deepset Cloud
silvanocerza authored Mar 28, 2023
1 parent 48ed2c7 commit 8b7d238
Showing 2 changed files with 165 additions and 0 deletions.
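For orientation, the script added in this commit (scripts/generate_txt.py, shown below) can also be run locally to regenerate every tutorial text file. A minimal sketch, assuming nbconvert and tomli are installed and the command is issued from the repository root where index.toml and the tutorials/ folder live:

import subprocess
import sys

# Hypothetical local run: regenerate the .txt file for every numbered notebook
# using the script's defaults (--index index.toml, --output text).
subprocess.run(
    [sys.executable, "scripts/generate_txt.py", "--notebooks", "all"],
    check=True,
)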
90 changes: 90 additions & 0 deletions .github/workflows/dc_sync.yml
@@ -0,0 +1,90 @@
name: Sync tutorials with dC workspace

on:
  push:
    branches:
      - main
    paths:
      - "./tutorials/[0-9]*.ipynb"

jobs:
  get-tutorials:
    runs-on: ubuntu-latest
    outputs:
      modified-matrix: ${{ steps.create_matrix.outputs.modified-matrix }}
      deleted-matrix: ${{ steps.create_matrix.outputs.deleted-matrix }}
      any-changed: ${{ steps.files.outputs.any_changed }}
      any-deleted: ${{ steps.files.outputs.any_deleted }}

    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Get changed files
        id: files
        uses: tj-actions/changed-files@v34
        with:
          json: true
          files: |
            ./tutorials/[0-9]*.ipynb

      - name: Create matrices
        id: create_matrix
        run: |
          echo "modified-matrix={\"file\": ${{ steps.files.outputs.all_changed_files }}}" >> "$GITHUB_OUTPUT"
          echo "deleted-matrix={\"file\": ${{ steps.files.outputs.deleted_files }}}" >> "$GITHUB_OUTPUT"

  modified:
    needs: get-tutorials
    if: needs.get-tutorials.outputs.any-changed == 'true'
    runs-on: ubuntu-latest
    strategy:
      matrix: ${{ fromJSON(needs.get-tutorials.outputs.modified-matrix) }}

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Generate file to upload
        id: file-generator
        run: |
          FILE="$(basename ${{ matrix.file }} .ipynb).txt"
          echo "file=text/$FILE" >> "$GITHUB_OUTPUT"
          METADATA="$(python scripts/generate_txt.py --print-metadata --notebooks ${{ matrix.file }})"
          echo "metadata=$METADATA" >> "$GITHUB_OUTPUT"

      - name: Upload tutorial to deepset Cloud
        uses: silvanocerza/deepset-cloud-file-uploader@v1
        with:
          api-key: ${{ secrets.DEEPSET_CLOUD_API_KEY }}
          workspace-name: ${{ secrets.DEEPSET_CLOUD_WORKSPACE }}
          file: ${{ steps.file-generator.outputs.file }}
          write-mode: OVERWRITE
          metadata: ${{ steps.file-generator.outputs.metadata }}

  deleted:
    needs: get-tutorials
    if: needs.get-tutorials.outputs.any-deleted == 'true'
    runs-on: ubuntu-latest
    strategy:
      matrix: ${{ fromJSON(needs.get-tutorials.outputs.deleted-matrix) }}

    steps:
      # This step is important: when we remove a tutorial notebook we also
      # want to remove the corresponding file from deepset Cloud, but since
      # the remote file has a .txt extension we must first build the full
      # file name here.
      - name: Get file with correct extension
        id: extension-changer
        run: |
          FILE="$(basename ${{ matrix.file }} .ipynb).txt"
          echo "file=$FILE" >> "$GITHUB_OUTPUT"

      - name: Delete file from deepset Cloud
        uses: silvanocerza/deepset-cloud-file-deleter@v1
        with:
          api-key: ${{ secrets.DEEPSET_CLOUD_API_KEY }}
          workspace-name: ${{ secrets.DEEPSET_CLOUD_WORKSPACE }}
          file: ${{ steps.extension-changer.outputs.file }}
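For reference, the extension mapping performed above by the file-generator and extension-changer steps (an .ipynb notebook path to the .txt name stored in deepset Cloud) boils down to this Python sketch; the notebook name is illustrative:

from pathlib import Path

def remote_txt_name(notebook_path: str) -> str:
    # Same transformation as `basename <file> .ipynb` followed by appending ".txt".
    return Path(notebook_path).with_suffix(".txt").name

# Illustrative notebook name; the modified job additionally prefixes "text/" to this value.
assert remote_txt_name("tutorials/01_example_tutorial.ipynb") == "01_example_tutorial.txt"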
75 changes: 75 additions & 0 deletions scripts/generate_txt.py
@@ -0,0 +1,75 @@
import argparse
from pathlib import Path
from subprocess import check_output

import tomli
from nbconvert import MarkdownExporter
from nbconvert.filters.strings import get_lines


def read_index(path):
    with open(path, "rb") as f:
        return tomli.load(f)


def generate_metadata(config, tutorial):
    aliases = []
    if "aliases" in tutorial:
        for alias in tutorial["aliases"]:
            aliases.append(f"/tutorials/{alias}")

    last_commit_date = (
        check_output(f'git log -1 --pretty=format:"%cs" tutorials/{tutorial["notebook"]}'.split()).decode().strip()
    )

    return f"""layout: {config["layout"]}
featured: {tutorial.get("featured", False)}
colab: {tutorial.get("colab", f'{config["colab"]}{tutorial["notebook"]}')}
toc: {config["toc"]}
title: "{tutorial["title"]}"
lastmod: {last_commit_date}
level: "{tutorial["level"]}"
weight: {tutorial["weight"]}
description: {tutorial["description"]}
category: "QA"
aliases: {aliases}
download: "/downloads/{tutorial["notebook"]}"
completion_time: {tutorial.get("completion_time", False)}
created_at: {tutorial["created_at"]}"""


def generate_markdown_from_notebook(tutorial, output_path, notebook_path):
    # Convert the notebook to Markdown, dropping cell outputs and the first line
    # of the exported Markdown.
    md_exporter = MarkdownExporter(exclude_output=True)
    body, _ = md_exporter.from_filename(f"{notebook_path}")
    body = get_lines(body, start=1)
    # Use the slug if one is configured, otherwise the notebook name without ".ipynb".
    filename = tutorial.get("slug", tutorial["notebook"][:-6])
    Path(output_path).mkdir(exist_ok=True)
    with open(f"{output_path}/{filename}.txt", "w", encoding="utf-8") as f:
        f.write(body)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--index", dest="index", default="index.toml")
    parser.add_argument("--notebooks", dest="notebooks", nargs="+", default=[])
    parser.add_argument("--output", dest="output", default="text")
    parser.add_argument("--print-metadata", dest="metadata", action="store_true")
    args = parser.parse_args()
    index = read_index(args.index)

    notebooks = args.notebooks
    if args.notebooks == ["all"]:
        tutorials_path = Path(".", "tutorials")
        notebooks = tutorials_path.glob("[0-9]*.ipynb")

    notebooks_configs = {cfg["notebook"]: cfg for cfg in index["tutorial"]}

    for notebook in notebooks:
        # `notebook` is a string when passed on the command line and a Path when globbed.
        notebook_name = Path(notebook).name
        tutorial_config = notebooks_configs.get(notebook_name)
        if tutorial_config:
            generate_markdown_from_notebook(tutorial_config, args.output, notebook)

            if args.metadata:
                meta = generate_metadata(index["config"], tutorial_config)
                print(meta)
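The script expects index.toml to provide a [config] table plus one [[tutorial]] table per notebook. A minimal sketch of that shape, with field names inferred from the accesses above and purely illustrative values:

import tomli

# Hypothetical index.toml content; only the fields read by generate_metadata
# and generate_markdown_from_notebook are shown, with made-up values.
EXAMPLE_INDEX = """
[config]
layout = "tutorial"
toc = true
colab = "https://colab.research.google.com/github/<org>/<repo>/blob/main/tutorials/"

[[tutorial]]
title = "An Example Tutorial"
description = "Illustrative description."
level = "beginner"
weight = 10
notebook = "01_example_tutorial.ipynb"
aliases = ["example-tutorial"]
created_at = "2023-03-28"
"""

index = tomli.loads(EXAMPLE_INDEX)
assert index["tutorial"][0]["notebook"].endswith(".ipynb")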
