Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Impose a limit on the number of pages built #162

Merged
merged 2 commits into from
Sep 3, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Refactor build script, add limits to avoid timeout
  • Loading branch information
augusto-herrmann committed Sep 3, 2022
commit 1d7efb03bdf2d8f86f87aadc2b636d3bc1a756ce
2 changes: 1 addition & 1 deletion .github/workflows/deploy_site.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
path: "scripts/requirements.txt"
- name: Prepare build
working-directory: scripts/deploy
run: python3 ./prepare_build.py
run: python3 ./prepare_build.py --max_bodies_per_jurisdiction 1000
- name: Deploy
uses: peaceiris/actions-gh-pages@v3
if: ${{ github.ref == 'refs/heads/main' }}
Expand Down
147 changes: 108 additions & 39 deletions scripts/deploy/prepare_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,18 @@
- creates a json file for downloading machine processable data of
individual public body
"""
import os, shutil
from typing import List
import os
import shutil
import csv
import logging
import json
import argparse

from slugify import slugify

# directories
DATA_DIR = "../../data"
WEBSITE_DIR = "../../website"
MAX_NAME_SIZE = 250 # limit from Linux ext4 file system, minus extension
BODY_TEMPLATE = \
"""---
layout: body
Expand All @@ -24,39 +25,107 @@

"""

logging.getLogger().setLevel(logging.INFO)

# copy data files to website
data_files = [file for file in os.listdir(DATA_DIR) if file.endswith('.csv')]

for file in data_files:
logging.info('Copying data file "%s"...', file)
shutil.copy2(
os.path.join(DATA_DIR,file),
os.path.join(WEBSITE_DIR, "_data"))

# generate files for individual public bodies
for data_file in data_files:
csv_filename = os.path.join(WEBSITE_DIR, "_data", data_file)
with open(csv_filename, "r") as csv_file:
row_reader = csv.DictReader(csv_file)
for row in row_reader:
if not row["id"]:
continue # skip rows with empty ids
jurisdiction, body_id = row["id"].split("/", maxsplit=1)
body_id = slugify(body_id)[:MAX_NAME_SIZE]
jurisdiction_path = os.path.join(WEBSITE_DIR, jurisdiction)
try:
os.mkdir(jurisdiction_path)
logging.info('Created directory "%s"...', jurisdiction)
except FileExistsError:
pass
md_filename = f"{body_id}.md"
with open(os.path.join(jurisdiction_path, md_filename),
"w") as md_file:
md_file.write(
BODY_TEMPLATE.format(f"{jurisdiction}/{body_id}/"))
json_filename = f"{body_id}.json"
with open(os.path.join(jurisdiction_path, json_filename),
"w") as json_file:
json.dump(row, json_file)
def list_data_files(data_dir: str) -> List[str]:
    """Returns a list of data files to process.

    Args:
        data_dir (str): Path to the original data files.

    Returns:
        List[str]: Names (not full paths) of the CSV files in data_dir.
    """
    # Materialize a real list so the return value matches the annotated
    # List[str] type and can safely be iterated more than once. The
    # previous generator expression was single-use: exhausted after the
    # first pass, which silently yields nothing on a second iteration.
    return [file
        for file in os.listdir(data_dir) if file.endswith('.csv')]

def copy_data_files(data_dir: str, website_dir: str):
    """Copies the data files to the website _data subfolder for processing
    by Jekyll.

    Args:
        data_dir (str): Path to the original data files.
        website_dir (str): Path to the Jekyll source files.
    """
    # Jekyll reads site data from the _data subdirectory.
    destination = os.path.join(website_dir, "_data")
    for data_file in list_data_files(data_dir):
        logging.info('Copying data file "%s"...', data_file)
        # copy2 preserves file metadata (e.g. timestamps) as well as contents.
        shutil.copy2(os.path.join(data_dir, data_file), destination)

def generate_files_public_bodies(
    data_dir: str,
    website_dir: str,
    max_name_size: int,
    max_bodies_per_jurisdiction: int,
    body_template: str):
    """Generates the markdown and json files for each individual
    public body.

    Args:
        data_dir (str): Path to the original data files.
        website_dir (str): Path to the Jekyll source files.
        max_name_size (int): Maximum size for file names.
        max_bodies_per_jurisdiction (int): Maximum number of bodies to
            generate files to per jurisdiction. May be necessary for
            performance reasons.
        body_template (str): Template for the markdown files.
    """
    for data_file in list_data_files(data_dir):
        csv_filename = os.path.join(website_dir, "_data", data_file)
        # Read and write explicitly as UTF-8 so the build does not
        # depend on the locale of the machine running it.
        with open(csv_filename, "r", encoding="utf-8") as csv_file:
            row_reader = csv.DictReader(csv_file)
            # NOTE(review): the count is per data file; this matches the
            # jurisdiction limit only if each CSV holds exactly one
            # jurisdiction — TODO confirm against the data layout.
            generated_count = 0
            for row in row_reader:
                if not row["id"]:
                    continue  # skip rows with empty ids
                if "/" not in row["id"]:
                    # A malformed id without the "jurisdiction/body"
                    # separator would previously crash the whole build
                    # with an uncaught ValueError on unpacking.
                    logging.warning(
                        'Skipping row with malformed id "%s" in "%s".',
                        row["id"], data_file)
                    continue
                jurisdiction, body_id = row["id"].split("/", maxsplit=1)
                generated_count += 1
                if generated_count > max_bodies_per_jurisdiction:
                    logging.info('Jurisdiction "%s" exceeded the public body'
                        ' limit of %d. Skipping further file generation.',
                        jurisdiction, max_bodies_per_jurisdiction)
                    break  # do not generate files past the count limit
                # Slugify and truncate so the name is a safe file name
                # (max_name_size leaves room for the extension).
                body_id = slugify(body_id)[:max_name_size]
                jurisdiction_path = os.path.join(website_dir, jurisdiction)
                try:
                    os.mkdir(jurisdiction_path)
                    logging.info('Created directory "%s"...', jurisdiction)
                except FileExistsError:
                    pass  # directory already created by an earlier row
                # Markdown stub page rendered by Jekyll for this body.
                md_filename = f"{body_id}.md"
                with open(os.path.join(jurisdiction_path, md_filename),
                    "w", encoding="utf-8") as md_file:
                    md_file.write(
                        body_template.format(f"{jurisdiction}/{body_id}/"))
                # Machine-readable copy of the CSV row for download.
                json_filename = f"{body_id}.json"
                with open(os.path.join(jurisdiction_path, json_filename),
                    "w", encoding="utf-8") as json_file:
                    json.dump(row, json_file)

def parse_cli() -> int:
    """Parses the command line interface.

    Returns:
        int: The maximum number of bodies per jurisdiction.
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # type=int makes argparse validate and convert the value itself,
    # reporting a clean usage error instead of the previous uncaught
    # ValueError from a manual int() on a non-numeric value. Dropping
    # nargs='?' also closes the case where the flag was passed with no
    # value at all, which used to crash with int(None) -> TypeError.
    parser.add_argument('--max_bodies_per_jurisdiction',
        help='The maximum number of bodies per jurisdiction.',
        type=int,
        default=100,
        )

    args = parser.parse_args()

    return args.max_bodies_per_jurisdiction

if __name__ == '__main__':
    # Configure logging first so progress messages show up in CI output.
    logging.getLogger().setLevel(logging.INFO)
    # The CLI supplies the per-jurisdiction page limit (see parse_cli).
    body_limit = parse_cli()
    copy_data_files(DATA_DIR, WEBSITE_DIR)
    generate_files_public_bodies(
        data_dir=DATA_DIR,
        website_dir=WEBSITE_DIR,
        max_name_size=250,  # Linux ext4 file-name limit, minus extension room
        max_bodies_per_jurisdiction=body_limit,
        body_template=BODY_TEMPLATE,
    )
24 changes: 15 additions & 9 deletions website/_layouts/jurisdiction.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
layout: default
---
{% assign page_id=page.id | split: "/" | last %}
{% assign jurisdiction=site.jurisdictions | find: 'id', page.id %}
{% assign jurisdiction=site.jurisdictions | find: "id", page.id %}
{% assign jurisdiction_id=jurisdiction.id | split: "/" | last %}
{% assign data=site.data[jurisdiction_id] %}

Expand All @@ -14,12 +14,18 @@ <h1>{{ page.title }} ({{ jurisdiction_id | upcase }})</h1>
</div>
<div class="row">
<div class="span12">
<table class="table table-striped">
{% for row in data %}
<tr><td>
<a href="{{ row.id | split: '/' | last }}/">{{ row.name }}</a>
</td></tr>
{% endfor %}
</table>
</div>
<table class="table table-striped">
{% for row in data %}
{% assign json_path=row.id | prepend: "/" | append: ".json" %}
{% assign file_exists=site.static_files | where: "path", json_path %}
<tr><td>
{% if file_exists.size > 0 %}
<a href="{{ row.id | split: '/' | last }}/">{{ row.name }}</a>
{% else %}
{{ row.name }}
{% endif %}
</td></tr>
{% endfor %}
</table>
</div>
</div>