Commit

feature/tree-sources-staging (#24)
* feature/tree-sources-staging

Added a new merge step to the ETL process. The merge step takes all the
data from the geojson file and loads it into a staging table, initially
called `tree_sources_staging` (renamed to `treedata_staging` below). This
keeps the raw data from the sources in a database table.

This PR uses ogr2ogr to move data into the database. By default, data is
moved 20k rows at a time. Postgres folds identifiers to lowercase while the
JavaScript code uses camelCase, so we cannot easily use underscored names
without some bigger changes. However, that did not turn out to be an issue.
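As a sketch of the approach (the connection string, source name, and file
name here are placeholders; the full command, with all its flags, is in
src/stages/merge.js below):

import { spawn } from "child_process";

// Placeholder connection string; the real one is built from dbConfig.
const pg = "PG:postgresql:https://user:password@localhost:5432/treedata";
spawn("ogr2ogr", [
  "-f", "PostgreSQL", pg,
  // The SQLite dialect lets the SQL alias camelCase GeoJSON properties
  // to snake_case column names on the way in.
  "-dialect", "SQLite",
  "-sql", 'select idSourceName as id_source_name, * from "someSource"',
  "-nln", "treedata_staging",
  "someSource.geojson",
], { stdio: "inherit" });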

* feature/tree-sources-staging
- removed an unused new library

* feature/tree-sources-staging
- added a new PG_USE_COPY argument; this improves performance by about
2-3x.
- cleaned up command to have shorter strings
- moved PG config to a separate variable

* feature/tree-sources-staging
- update the ogr2ogr command to transform the SQL, converting camelCase
to snake_case
- rename the staging table to treedata_staging
- use spawn instead of exec
- truncate the staging table as the first step

Added a newline for a little cleanup

* feature/tree-sources-staging
- updated idName to idSourceName as per the rest of the code
tzinckgraf committed Mar 29, 2023
1 parent 314325f commit 7830a73
Showing 8 changed files with 156 additions and 2 deletions.
2 changes: 2 additions & 0 deletions bin/merge.js
@@ -0,0 +1,2 @@
import { runMerge } from "../src/index.js";
await runMerge();
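This thin wrapper lets the merge stage run on its own via `npm run merge`
(wired up in package.json below), in addition to running as part of
`runAll` in src/index.js.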
1 change: 1 addition & 0 deletions package.json
@@ -12,6 +12,7 @@
"convert": "node bin/convert.js",
"normalize": "node bin/normalize.js",
"concatenate": "node bin/concatenate.js",
"merge": "node bin/merge.js",
"save": "node bin/save.js",
"tile": "node bin/tile.js",
"upload": "node bin/upload.js",
14 changes: 14 additions & 0 deletions src/db/db-config.js
@@ -0,0 +1,14 @@
/**
* https://github.com/vitaly-t/pg-promise/wiki/Connection-Syntax
*/
export const dbConfig = {
  host: process.env.POSTGRES_HOST,
  port: process.env.POSTGRES_PORT,
  database: process.env.POSTGRES_DB,
  user: process.env.POSTGRES_USER,
  password: process.env.POSTGRES_PASSWORD,
};
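Every value here comes straight from the environment, so a missing
variable only surfaces later as a confusing connection error. A fail-fast
guard like this (illustrative, not part of the commit) can help:

// Illustrative guard, not in the commit: fail fast on missing settings.
for (const key of [
  "POSTGRES_HOST",
  "POSTGRES_PORT",
  "POSTGRES_DB",
  "POSTGRES_USER",
  "POSTGRES_PASSWORD",
]) {
  if (!process.env[key]) {
    throw new Error(`Missing required environment variable: ${key}`);
  }
}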

7 changes: 7 additions & 0 deletions src/db/index.js
@@ -0,0 +1,7 @@
import pgp from 'pg-promise';
import { dbConfig } from './db-config.js';
import { pgPromiseConfig } from './pg-promise-config.js';

export const pgPromise = pgp(pgPromiseConfig);
export const db = pgPromise(dbConfig);
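Illustrative usage (a hypothetical one-off script, not part of the
commit): modules import the shared `db` instance, and the `receive` hook
configured in pg-promise-config.js below hands back camelized columns.

import { db } from "./src/db/index.js";

// The snake_case column id_source_name comes back as idSourceName.
const rows = await db.any("SELECT id_source_name FROM treedata_staging LIMIT 5");
for (const row of rows) {
  console.log(row.idSourceName);
}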

27 changes: 27 additions & 0 deletions src/db/pg-promise-config.js
@@ -0,0 +1,27 @@
/* eslint-disable guard-for-in */
/* eslint-disable no-restricted-syntax */
import pgp from 'pg-promise';

/**
* source:
* https://github.com/vitaly-t/pg-promise/issues/78#issuecomment-171951303
*/
function camelizeColumns(data) {
  // All rows in a result set share the same columns, so inspect row 0.
  const tmp = data[0];
  for (const prop in tmp) {
    const camel = pgp.utils.camelize(prop);
    if (!(camel in tmp)) {
      // Rename in place on every row: copy the value to the camelCase
      // key, then drop the original snake_case key.
      for (let i = 0; i < data.length; i++) {
        const d = data[i];
        d[camel] = d[prop];
        delete d[prop];
      }
    }
  }
}

export const pgPromiseConfig = {
  capSQL: true,
  receive: (data) => camelizeColumns(data),
};
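For reference, a small illustration (not part of the diff) of the mapping
the hook applies, using pg-promise's own utility:

import pgp from 'pg-promise';

// snake_case column names become camelCase keys on each returned row:
console.log(pgp.utils.camelize('id_source_name')); // idSourceName
// A result set arriving as [{ id_reference: 42 }] is rewritten in place
// to [{ idReference: 42 }] before the query promise resolves.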

7 changes: 7 additions & 0 deletions src/index.js
@@ -2,6 +2,7 @@ import path from "path";
import * as download from "./stages/download.js";
import * as convert from "./stages/convert.js";
import * as normalize from "./stages/normalize.js";
import * as merge from "./stages/merge.js";
import * as save from "./stages/save.js";
import * as concatenate from "./stages/concatenate.js";
import * as tile from "./stages/tile.js";
@@ -11,6 +12,7 @@ import * as utils from "./core/utils.js";
import * as config from "./config.js";
import sources from "./core/sources.js";


export const runDownload = async () => {
  await download.downloadSources(sources);
};
@@ -23,6 +25,10 @@ export const runNormalize = async () => {
  await normalize.normalizeSources(sources);
};

export const runMerge = async () => {
  await merge.mergeSources(sources);
};

export const runConcatenate = async () => {
  const filenames = await utils.asyncReadDir(config.NORMALIZED_DIRECTORY);
  const filepaths = filenames.map((f) =>
@@ -51,6 +57,7 @@ export const runAll = async () => {
  await runDownload();
  await runConvert();
  await runNormalize();
  await runMerge();
  await runConcatenate();
  await runTile();
  await runUpload();
94 changes: 94 additions & 0 deletions src/stages/merge.js
@@ -0,0 +1,94 @@
import { spawn } from "child_process";
import { inspect } from "util";
import pLimit from "p-limit";
import { dbConfig } from "../db/db-config.js";
import { db } from "../db/index.js";
import * as utils from "../core/utils.js";


export const mergeSource = async (source) => {
  console.log(`Starting merge for ${source.idSourceName}`);
  if (
    !source.destinations ||
    !source.destinations.geojson ||
    !source.destinations.normalized
  ) {
    throw new Error(`No destinations for source: "${source.idSourceName}"`);
  }

  const normalizedExists = await utils.asyncFileExists(
    source.destinations.normalized.path
  );
  if (!normalizedExists) {
    console.log(
      `The normalized file "${source.destinations.normalized.path}" does not exist. Skipping...`
    );
    return `NO FILE for ${source.idSourceName}`; // Early Return
  }

  console.log(`Running for ${source.destinations.normalized.path}`);
  /*
   * For debugging purposes, you can check the contents of the geojson file
   * with ogrinfo using the following command:
   * ogrinfo data/normalized/[sourceId].geojsons
   *   -dialect SQLite
   *   -sql 'select idReference as id_reference, idSourceName as id_source_name, * from "[sourceId]"'
   */
  const postgresConfig = `postgresql:https://${dbConfig.user}:${dbConfig.password}@${dbConfig.host}:${dbConfig.port}/${dbConfig.database}`;
  const command = "ogr2ogr";
  const commandArgs = [
    "-f", "PostgreSQL", `PG:${postgresConfig}`,
    // The SQLite dialect aliases the camelCase GeoJSON properties to
    // snake_case column names.
    "-dialect", "SQLite",
    "-sql",
    `select idReference as id_reference, idSourceName as id_source_name, * from ${source.idSourceName}`,
    "-nln", "treedata_staging",
    "-geomfield", "geom",
    source.destinations.normalized.path,
    "-append", "-update", "--config", "PG_USE_COPY", "YES", "-preserve_fid"
  ];

  return new Promise((resolve, reject) => {
    const child = spawn(command, commandArgs, {
      stdio: ["ignore", process.stdout, process.stderr],
    });
    child.on("error", reject); // e.g. ogr2ogr is not installed
    child.on("exit", resolve);
  });
};


const truncateStaging = async () => {
  return db.none("TRUNCATE TABLE treedata_staging");
};


export const mergeSources = async (list) => {
  console.log("Truncating the treedata_staging table...");
  try {
    await truncateStaging();
  } catch (error) {
    console.error(
      "\nError truncating the treedata_staging table...",
      inspect(error, true, 2, true)
    );
    throw error;
  }
  console.log("Done truncating the treedata_staging table...");

  console.log("Start uploading to the database...");
  const limit = pLimit(5);
  const promises = list.map((source) => limit(() => mergeSource(source)));
  const results = await Promise.allSettled(promises);
  console.log("Finished uploading to the database...");
  // Promise.allSettled never rejects; report each source's outcome.
  results.forEach((result) => {
    if (result.status === "fulfilled") {
      console.log(result.value);
    } else {
      console.error(result.reason);
    }
  });
};
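A quick way to verify a run (illustrative; assumes the same environment
variables as above):

import { db } from "../db/index.js";

// After `npm run merge`, count what landed in the staging table.
const { count } = await db.one("SELECT count(*) AS count FROM treedata_staging");
console.log(`treedata_staging rows: ${count}`);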
6 changes: 4 additions & 2 deletions src/stages/normalize.js
@@ -11,7 +11,7 @@ import * as constants from "../constants.js";

const RECSEP = RegExp(String.fromCharCode(30), "g");

const transform = (context, source, line) => {
const transform = (context, source, line, timestamp) => {
  if (!line || !line.length) {
    return null;
  }
@@ -108,6 +108,7 @@ const transform = (context, source, line) => {
    lat: data.geometry.coordinates[1],
    lng: data.geometry.coordinates[0],
    count: 0,
    created: timestamp
  };
  return data;
};
@@ -162,8 +163,9 @@ export const normalizeSource = async (source) => {
  }

  const groups = {};
  const timestamp = new Date();
  for await (const line of utils.asyncReadLines(reader)) {
    const transformed = transform(context, source, line);
    const transformed = transform(context, source, line, timestamp);
    if (!transformed) {
      continue; // Early Continuation
    }
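Note that the `created` timestamp is captured once per source file, before
the read loop, so every row normalized in a single run of a source shares
the same value.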
