Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Impose a limit on the number of pages built #162

Merged
merged 2 commits into from
Sep 3, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Refactor build script, add limits to avoid timeout
  • Loading branch information
augusto-herrmann committed Sep 3, 2022
commit 1d7efb03bdf2d8f86f87aadc2b636d3bc1a756ce
2 changes: 1 addition & 1 deletion .github/workflows/deploy_site.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
path: "scripts/requirements.txt"
- name: Prepare build
working-directory: scripts/deploy
run: python3 ./prepare_build.py
run: python3 ./prepare_build.py --max_bodies_per_jurisdiction 1000
- name: Deploy
uses: peaceiris/actions-gh-pages@v3
if: ${{ github.ref == 'refs/heads/main' }}
Expand Down
147 changes: 108 additions & 39 deletions scripts/deploy/prepare_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,18 @@
- creates a json file for downloading machine processable data of
individual public body
"""
import os, shutil
from typing import List
import os
import shutil
import csv
import logging
import json
import argparse

from slugify import slugify

# directories
DATA_DIR = "../../data"
WEBSITE_DIR = "../../website"
MAX_NAME_SIZE = 250 # limit from Linux ext4 file system, minus extension
BODY_TEMPLATE = \
"""---
layout: body
Expand All @@ -24,39 +25,107 @@

"""

logging.getLogger().setLevel(logging.INFO)

# copy data files to website
data_files = [file for file in os.listdir(DATA_DIR) if file.endswith('.csv')]

for file in data_files:
logging.info('Copying data file "%s"...', file)
shutil.copy2(
os.path.join(DATA_DIR,file),
os.path.join(WEBSITE_DIR, "_data"))

# generate files for individual public bodies
for data_file in data_files:
csv_filename = os.path.join(WEBSITE_DIR, "_data", data_file)
with open(csv_filename, "r") as csv_file:
row_reader = csv.DictReader(csv_file)
for row in row_reader:
if not row["id"]:
continue # skip rows with empty ids
jurisdiction, body_id = row["id"].split("/", maxsplit=1)
body_id = slugify(body_id)[:MAX_NAME_SIZE]
jurisdiction_path = os.path.join(WEBSITE_DIR, jurisdiction)
try:
os.mkdir(jurisdiction_path)
logging.info('Created directory "%s"...', jurisdiction)
except FileExistsError:
pass
md_filename = f"{body_id}.md"
with open(os.path.join(jurisdiction_path, md_filename),
"w") as md_file:
md_file.write(
BODY_TEMPLATE.format(f"{jurisdiction}/{body_id}/"))
json_filename = f"{body_id}.json"
with open(os.path.join(jurisdiction_path, json_filename),
"w") as json_file:
json.dump(row, json_file)
def list_data_files(data_dir: str) -> List[str]:
    """Returns a list of data files to process.

    Args:
        data_dir (str): Path to the original data files.

    Returns:
        List[str]: Names (not full paths) of the CSV files in data_dir.
    """
    # Materialize a real list so the return value matches the annotated
    # List[str] type and can safely be iterated more than once. The
    # previous generator expression was single-use: exhausted after the
    # first pass, which silently yields nothing on a second iteration.
    return [file
        for file in os.listdir(data_dir) if file.endswith('.csv')]

def copy_data_files(data_dir: str, website_dir: str):
    """Copies the data files to the website _data subfolder for processing
    by Jekyll.

    Args:
        data_dir (str): Path to the original data files.
        website_dir (str): Path to the Jekyll source files.
    """
    # Jekyll reads site data from the _data subdirectory.
    destination = os.path.join(website_dir, "_data")
    for data_file in list_data_files(data_dir):
        logging.info('Copying data file "%s"...', data_file)
        # copy2 preserves file metadata (e.g. timestamps) as well as contents.
        shutil.copy2(os.path.join(data_dir, data_file), destination)

def generate_files_public_bodies(
    data_dir: str,
    website_dir: str,
    max_name_size: int,
    max_bodies_per_jurisdiction: int,
    body_template: str):
    """Generates the markdown and json files for each individual
    public body.

    Args:
        data_dir (str): Path to the original data files.
        website_dir (str): Path to the Jekyll source files.
        max_name_size (int): Maximum size for file names.
        max_bodies_per_jurisdiction (int): Maximum number of bodies to
            generate files to per jurisdiction. May be necessary for
            performance reasons.
        body_template (str): Template for the markdown files.
    """
    for data_file in list_data_files(data_dir):
        csv_filename = os.path.join(website_dir, "_data", data_file)
        # Read and write explicitly as UTF-8 so the build does not
        # depend on the locale of the machine running it.
        with open(csv_filename, "r", encoding="utf-8") as csv_file:
            row_reader = csv.DictReader(csv_file)
            # NOTE(review): the count is per data file; this matches the
            # jurisdiction limit only if each CSV holds exactly one
            # jurisdiction — TODO confirm against the data layout.
            generated_count = 0
            for row in row_reader:
                if not row["id"]:
                    continue  # skip rows with empty ids
                if "/" not in row["id"]:
                    # A malformed id without the "jurisdiction/body"
                    # separator would previously crash the whole build
                    # with an uncaught ValueError on unpacking.
                    logging.warning(
                        'Skipping row with malformed id "%s" in "%s".',
                        row["id"], data_file)
                    continue
                jurisdiction, body_id = row["id"].split("/", maxsplit=1)
                generated_count += 1
                if generated_count > max_bodies_per_jurisdiction:
                    logging.info('Jurisdiction "%s" exceeded the public body'
                        ' limit of %d. Skipping further file generation.',
                        jurisdiction, max_bodies_per_jurisdiction)
                    break  # do not generate files past the count limit
                # Slugify and truncate so the name is a safe file name
                # (max_name_size leaves room for the extension).
                body_id = slugify(body_id)[:max_name_size]
                jurisdiction_path = os.path.join(website_dir, jurisdiction)
                try:
                    os.mkdir(jurisdiction_path)
                    logging.info('Created directory "%s"...', jurisdiction)
                except FileExistsError:
                    pass  # directory already created by an earlier row
                # Markdown stub page rendered by Jekyll for this body.
                md_filename = f"{body_id}.md"
                with open(os.path.join(jurisdiction_path, md_filename),
                    "w", encoding="utf-8") as md_file:
                    md_file.write(
                        body_template.format(f"{jurisdiction}/{body_id}/"))
                # Machine-readable copy of the CSV row for download.
                json_filename = f"{body_id}.json"
                with open(os.path.join(jurisdiction_path, json_filename),
                    "w", encoding="utf-8") as json_file:
                    json.dump(row, json_file)

def parse_cli() -> int:
    """Parses the command line interface.

    Returns:
        int: The maximum number of bodies per jurisdiction.
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # type=int makes argparse validate and convert the value itself,
    # reporting a clean usage error instead of the previous uncaught
    # ValueError from a manual int() on a non-numeric value. Dropping
    # nargs='?' also closes the case where the flag was passed with no
    # value at all, which used to crash with int(None) -> TypeError.
    parser.add_argument('--max_bodies_per_jurisdiction',
        help='The maximum number of bodies per jurisdiction.',
        type=int,
        default=100,
        )

    args = parser.parse_args()

    return args.max_bodies_per_jurisdiction

if __name__ == '__main__':
    # Configure logging first so progress messages show up in CI output.
    logging.getLogger().setLevel(logging.INFO)
    # The CLI supplies the per-jurisdiction page limit (see parse_cli).
    body_limit = parse_cli()
    copy_data_files(DATA_DIR, WEBSITE_DIR)
    generate_files_public_bodies(
        data_dir=DATA_DIR,
        website_dir=WEBSITE_DIR,
        max_name_size=250,  # Linux ext4 file-name limit, minus extension room
        max_bodies_per_jurisdiction=body_limit,
        body_template=BODY_TEMPLATE,
    )
24 changes: 15 additions & 9 deletions website/_layouts/jurisdiction.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
layout: default
---
{% assign page_id=page.id | split: "/" | last %}
{% assign jurisdiction=site.jurisdictions | find: 'id', page.id %}
{% assign jurisdiction=site.jurisdictions | find: "id", page.id %}
{% assign jurisdiction_id=jurisdiction.id | split: "/" | last %}
{% assign data=site.data[jurisdiction_id] %}

Expand All @@ -14,12 +14,18 @@ <h1>{{ page.title }} ({{ jurisdiction_id | upcase }})</h1>
</div>
<div class="row">
<div class="span12">
<table class="table table-striped">
{% for row in data %}
<tr><td>
<a href="{{ row.id | split: '/' | last }}/">{{ row.name }}</a>
</td></tr>
{% endfor %}
</table>
</div>
<table class="table table-striped">
{% for row in data %}
{% assign json_path=row.id | prepend: "/" | append: ".json" %}
{% assign file_exists=site.static_files | where: "path", json_path %}
<tr><td>
{% if file_exists.size > 0 %}
<a href="{{ row.id | split: '/' | last }}/">{{ row.name }}</a>
{% else %}
{{ row.name }}
{% endif %}
</td></tr>
{% endfor %}
</table>
</div>
</div>