From 8317fcb01524bed09dc27e7005e8d5967bb79036 Mon Sep 17 00:00:00 2001 From: rzmk <30333942+rzmk@users.noreply.github.com> Date: Fri, 14 Jun 2024 18:48:05 -0400 Subject: [PATCH 1/2] `jsonp`: add `jsonp` command allowing non-nested JSON to CSV conversion with Polars --- README.md | 1 + src/cmd/jsonp.rs | 123 ++++++++++++++++++++++++++++++++++++++++++++ src/cmd/mod.rs | 2 + src/main.rs | 8 +++ tests/test_jsonp.rs | 86 +++++++++++++++++++++++++++++++ tests/tests.rs | 2 + 6 files changed, 222 insertions(+) create mode 100644 src/cmd/jsonp.rs create mode 100644 tests/test_jsonp.rs diff --git a/README.md b/README.md index 6c81bc9e8..768bc6e6f 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ | [join](/src/cmd/join.rs#L2) | Inner, outer, right, cross, anti & semi joins. Automatically creates a simple, in-memory hash index to make it fast. | | [joinp](/src/cmd/joinp.rs#L2)
✨🚀🐻‍❄️ | Inner, outer, cross, anti, semi & asof joins using the [Pola.rs](https://www.pola.rs) engine. Unlike the `join` command, `joinp` can process files larger than RAM, is multithreaded, has join key validation, pre-join filtering, supports [asof joins](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html) (which is [particularly useful for time series data](https://github.com/jqnatividad/qsv/blob/30cc920d0812a854fcbfedc5db81788a0600c92b/tests/test_joinp.rs#L509-L983)) & its output doesn't have duplicate columns. However, `joinp` doesn't have an --ignore-case option & it doesn't support right outer joins. | | [jsonl](/src/cmd/jsonl.rs#L2)
🚀🔣 | Convert newline-delimited JSON ([JSONL](https://jsonlines.org/)/[NDJSON](http://ndjson.org/)) to CSV. See `tojsonl` command to convert CSV to JSONL. +| [jsonp](/src/cmd/jsonp.rs#L2)
| Convert non-nested JSON to CSV. Only available with the polars feature enabled. |
[luau](/src/cmd/luau.rs#L2) 👑
✨📇🌐🔣 ![CKAN](docs/images/ckan.png) | Create multiple new computed columns, filter rows, compute aggregations and build complex data pipelines by executing a [Luau](https://luau-lang.org) [0.625](https://github.com/Roblox/luau/releases/tag/0.625) expression/script for every row of a CSV file ([sequential mode](https://github.com/jqnatividad/qsv/blob/bb72c4ef369d192d85d8b7cc6e972c1b7df77635/tests/test_luau.rs#L254-L298)), or using [random access](https://www.webopedia.com/definitions/random-access/) with an index ([random access mode](https://github.com/jqnatividad/qsv/blob/bb72c4ef369d192d85d8b7cc6e972c1b7df77635/tests/test_luau.rs#L367-L415)).
Can process a single Luau expression or [full-fledged data-wrangling scripts using lookup tables](https://github.com/dathere/qsv-lookup-tables#example) with discrete BEGIN, MAIN and END sections.
It is not just another qsv command, it is qsv's [Domain-specific Language](https://en.wikipedia.org/wiki/Domain-specific_language) (DSL) with [numerous qsv-specific helper functions](https://github.com/jqnatividad/qsv/blob/113eee17b97882dc368b2e65fec52b86df09f78b/src/cmd/luau.rs#L1356-L2290) to build production data pipelines. | | [partition](/src/cmd/partition.rs#L2) | Partition a CSV based on a column value. | | [prompt](/src/cmd/prompt.rs#L2) | Open a file dialog to pick a file. | diff --git a/src/cmd/jsonp.rs b/src/cmd/jsonp.rs new file mode 100644 index 000000000..b3d9df38d --- /dev/null +++ b/src/cmd/jsonp.rs @@ -0,0 +1,123 @@ +static USAGE: &str = r#" +Convert non-nested JSON to CSV (polars feature only). + +You may provide JSON data either from stdin or a file path. +This command may not work with nested JSON data. + +As a basic example, say we have a file fruits.json with contents: + +[ + { + "fruit": "apple", + "price": 2.5 + }, + { + "fruit": "banana", + "price": 3.0 + } +] + +To convert it to CSV format, run: + +qsv jsonp fruits.json + +And the following is printed to the terminal: + +fruit,price +apple,2.5 +banana,3.0 + +If fruits.json was provided using stdin then either use - or do not provide a file path. For example: + +cat fruits.json | qsv jsonp - + +For more examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_jsonp.rs. + +Usage: + qsv jsonp [options] [<input>] + qsv jsonp --help + +jsonp options: + --datetime-format <fmt> The datetime format to use writing datetimes. + See https://docs.rs/chrono/latest/chrono/format/strftime/index.html + for the list of valid format specifiers. + --date-format <fmt> The date format to use writing dates. + --time-format <fmt> The time format to use writing times. + --float-precision <arg> The number of digits of precision to use when writing floats. + --wnull-value <arg> The string to use when WRITING null values. + +Common options: + -h, --help Display this message + -o, --output <file> Write output to <file> instead of stdout.
+"#; + +use std::io::{Cursor, Read, Seek, SeekFrom, Write}; + +use polars::prelude::*; +use serde::Deserialize; + +use crate::{util, CliResult}; + +#[derive(Deserialize)] +struct Args { + arg_input: Option<String>, + flag_datetime_format: Option<String>, + flag_date_format: Option<String>, + flag_time_format: Option<String>, + flag_float_precision: Option<usize>, + flag_wnull_value: Option<String>, + flag_output: Option<String>, +} + +pub fn run(argv: &[&str]) -> CliResult<()> { + let args: Args = util::get_args(USAGE, argv)?; + + fn df_from_stdin() -> PolarsResult<DataFrame> { + // Create a buffer in memory for stdin + let mut buffer: Vec<u8> = Vec::new(); + let stdin = std::io::stdin(); + stdin.lock().read_to_end(&mut buffer)?; + Ok(JsonReader::new(Box::new(std::io::Cursor::new(buffer))).finish()?) + } + + fn df_from_path(path: String) -> PolarsResult<DataFrame> { + Ok(JsonReader::new(std::fs::File::open(path)?).finish()?) + } + + let df = match args.arg_input.clone() { + Some(path) => { + if path == "-" { + df_from_stdin()? + } else { + df_from_path(path)? + } + }, + None => df_from_stdin()?, + }; + + fn df_to_csv<W: Write>(mut writer: W, mut df: DataFrame, args: &Args) -> PolarsResult<()> { + CsvWriter::new(&mut writer) + .with_datetime_format(args.flag_datetime_format.clone()) + .with_date_format(args.flag_date_format.clone()) + .with_time_format(args.flag_time_format.clone()) + .with_float_precision(args.flag_float_precision.clone()) + .with_null_value(args.flag_wnull_value.clone().unwrap_or("".to_string())) + .include_bom(util::get_envvar_flag("QSV_OUTPUT_BOM")) + .finish(&mut df)?; + Ok(()) + } + + if let Some(output_path) = args.flag_output.clone() { + let mut output = std::fs::File::create(output_path)?; + df_to_csv(&mut output, df, &args)?; + } else { + let mut res = Cursor::new(Vec::new()); + df_to_csv(&mut res, df, &args)?; + res.seek(SeekFrom::Start(0))?; + let mut out = String::new(); + res.read_to_string(&mut out)?; + println!("{out}"); + } + + Ok(()) +} diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs index 844d26fe6..db3a7ba57 100644 --- 
a/src/cmd/mod.rs +++ b/src/cmd/mod.rs @@ -46,6 +46,8 @@ pub mod join; pub mod joinp; #[cfg(any(feature = "feature_capable", feature = "lite"))] pub mod jsonl; +#[cfg(feature = "polars")] +pub mod jsonp; #[cfg(feature = "luau")] pub mod luau; #[cfg(any(feature = "feature_capable", feature = "lite"))] diff --git a/src/main.rs b/src/main.rs index 70385103e..d3816f0e3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -144,6 +144,10 @@ fn main() -> QsvExitCode { enabled_commands.push_str(" jsonl Convert newline-delimited JSON files to CSV\n"); + #[cfg(all(feature = "polars", feature = "feature_capable"))] + enabled_commands + .push_str(" jsonp Convert non-nested JSON to CSV (polars feature only)\n"); + #[cfg(all(feature = "luau", feature = "feature_capable"))] enabled_commands.push_str(" luau Execute Luau script on CSV data\n"); @@ -356,6 +360,8 @@ enum Command { #[cfg(all(feature = "polars", feature = "feature_capable"))] JoinP, Jsonl, + #[cfg(all(feature = "polars", feature = "feature_capable"))] + JsonP, #[cfg(all(feature = "luau", feature = "feature_capable"))] Luau, Partition, @@ -445,6 +451,8 @@ impl Command { #[cfg(all(feature = "polars", feature = "feature_capable"))] Command::JoinP => cmd::joinp::run(argv), Command::Jsonl => cmd::jsonl::run(argv), + #[cfg(all(feature = "polars", feature = "feature_capable"))] + Command::JsonP => cmd::jsonp::run(argv), #[cfg(all(feature = "luau", feature = "feature_capable"))] Command::Luau => cmd::luau::run(argv), Command::Partition => cmd::partition::run(argv), diff --git a/tests/test_jsonp.rs b/tests/test_jsonp.rs new file mode 100644 index 000000000..a6c66b383 --- /dev/null +++ b/tests/test_jsonp.rs @@ -0,0 +1,86 @@ +use crate::workdir::Workdir; + +#[test] +fn jsonp_simple() { + let wrk = Workdir::new("jsonp_simple"); + wrk.create_from_string( + "data.json", + r#"[{"id":1,"father":"Mark","mother":"Charlotte","oldest_child":"Tom","boy":true}, +{"id":2,"father":"John","mother":"Ann","oldest_child":"Jessika","boy":false}, 
+{"id":3,"father":"Bob","mother":"Monika","oldest_child":"Jerry","boy":true}]"#, + ); + let mut cmd = wrk.command("jsonp"); + cmd.arg("data.json"); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["id", "father", "mother", "oldest_child", "boy"], + svec!["1", "Mark", "Charlotte", "Tom", "true"], + svec!["2", "John", "Ann", "Jessika", "false"], + svec!["3", "Bob", "Monika", "Jerry", "true"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn jsonp_fruits_stats() { + let wrk = Workdir::new("jsonp_fruits_stats"); + wrk.create_from_string( + "data.json", + r#"[{"field":"fruit","type":"String","is_ascii":true,"sum":null,"min":"apple","max":"strawberry","range":null,"min_length":5,"max_length":10,"mean":null,"stddev":null,"variance":null,"nullcount":0,"max_precision":null,"sparsity":0},{"field":"price","type":"Float","is_ascii":null,"sum":7,"min":"1.5","max":"3.0","range":1.5,"min_length":4,"max_length":4,"mean":2.3333,"stddev":0.6236,"variance":0.3889,"nullcount":0,"max_precision":1,"sparsity":0}]"#, + ); + let mut cmd = wrk.command("jsonp"); + cmd.arg("data.json"); + + let got: String = wrk.stdout(&mut cmd); + let expected = r#"field,type,is_ascii,sum,min,max,range,min_length,max_length,mean,stddev,variance,nullcount,max_precision,sparsity +fruit,String,true,,apple,strawberry,,5,10,,,,0,,0 +price,Float,,7,1.5,3.0,1.5,4,4,2.3333,0.6236,0.3889,0,1,0"#.to_string(); + assert_eq!(got, expected); +} + +#[test] +fn jsonp_fruits_stats_fp_2() { + let wrk = Workdir::new("jsonp_fruits_stats_fp_2"); + wrk.create_from_string( + "data.json", + 
r#"[{"field":"fruit","type":"String","is_ascii":true,"sum":null,"min":"apple","max":"strawberry","range":null,"min_length":5,"max_length":10,"mean":null,"stddev":null,"variance":null,"nullcount":0,"max_precision":null,"sparsity":0},{"field":"price","type":"Float","is_ascii":null,"sum":7,"min":"1.5","max":"3.0","range":1.5,"min_length":4,"max_length":4,"mean":2.3333,"stddev":0.6236,"variance":0.3889,"nullcount":0,"max_precision":1,"sparsity":0}]"#, + ); + let mut cmd = wrk.command("jsonp"); + cmd.arg("data.json"); + cmd.args(&["--float-precision", "2"]); + + let got: String = wrk.stdout(&mut cmd); + let expected = r#"field,type,is_ascii,sum,min,max,range,min_length,max_length,mean,stddev,variance,nullcount,max_precision,sparsity +fruit,String,true,,apple,strawberry,,5,10,,,,0,,0 +price,Float,,7,1.5,3.0,1.50,4,4,2.33,0.62,0.39,0,1,0"#.to_string(); + assert_eq!(got, expected); +} + +#[test] +// Verify that qsv stats fruits.csv has the same content as +// qsv stats fruits.csv | qsv slice --json | qsv jsonp +fn jsonp_fruits_stats_slice_jsonp() { + let wrk = Workdir::new("jsonp_fruits_stats_slice_jsonp"); + let test_file = wrk.load_test_file("fruits.csv"); + + // qsv stats fruits.csv + let mut stats_cmd = wrk.command("stats"); + stats_cmd.arg(test_file); + let stats_output: String = wrk.stdout(&mut stats_cmd); + wrk.create_from_string("stats.csv", stats_output.as_str()); + + // qsv slice --json + let mut slice_cmd = wrk.command("slice"); + slice_cmd.arg("stats.csv"); + slice_cmd.arg("--json"); + let slice_output: String = wrk.stdout(&mut slice_cmd); + wrk.create_from_string("slice.json", slice_output.as_str()); + + // qsv jsonp + let mut jsonp_cmd = wrk.command("jsonp"); + jsonp_cmd.arg("slice.json"); + let jsonp_output: String = wrk.stdout(&mut jsonp_cmd); + + assert_eq!(stats_output, jsonp_output); +} diff --git a/tests/tests.rs b/tests/tests.rs index af644b1d1..62577f69d 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -79,6 +79,8 @@ mod test_join; mod test_joinp; 
#[cfg(any(feature = "feature_capable", feature = "lite"))] mod test_jsonl; +#[cfg(feature = "polars")] +mod test_jsonp; #[cfg(feature = "luau")] mod test_luau; #[cfg(any(feature = "feature_capable", feature = "lite"))] From b01ab5edaeeaf1936f36ae99e11d94582ebb8d5b Mon Sep 17 00:00:00 2001 From: rzmk <30333942+rzmk@users.noreply.github.com> Date: Fri, 14 Jun 2024 20:12:23 -0400 Subject: [PATCH 2/2] `jsonp`: fix clippy lints and disable for DP+ --- src/cmd/jsonp.rs | 6 +++--- tests/tests.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cmd/jsonp.rs b/src/cmd/jsonp.rs index b3d9df38d..3c3c83f7b 100644 --- a/src/cmd/jsonp.rs +++ b/src/cmd/jsonp.rs @@ -77,11 +77,11 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let mut buffer: Vec<u8> = Vec::new(); let stdin = std::io::stdin(); stdin.lock().read_to_end(&mut buffer)?; - Ok(JsonReader::new(Box::new(std::io::Cursor::new(buffer))).finish()?) + JsonReader::new(Box::new(std::io::Cursor::new(buffer))).finish() } fn df_from_path(path: String) -> PolarsResult<DataFrame> { - Ok(JsonReader::new(std::fs::File::open(path)?).finish()?) 
+ JsonReader::new(std::fs::File::open(path)?).finish() } let df = match args.arg_input.clone() { @@ -100,7 +100,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { .with_datetime_format(args.flag_datetime_format.clone()) .with_date_format(args.flag_date_format.clone()) .with_time_format(args.flag_time_format.clone()) - .with_float_precision(args.flag_float_precision.clone()) + .with_float_precision(args.flag_float_precision) .with_null_value(args.flag_wnull_value.clone().unwrap_or("".to_string())) .include_bom(util::get_envvar_flag("QSV_OUTPUT_BOM")) .finish(&mut df)?; diff --git a/tests/tests.rs b/tests/tests.rs index 62577f69d..9093aeca8 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -79,7 +79,7 @@ mod test_join; mod test_joinp; #[cfg(any(feature = "feature_capable", feature = "lite"))] mod test_jsonl; -#[cfg(feature = "polars")] +#[cfg(all(feature = "polars", not(feature = "datapusher_plus")))] mod test_jsonp; #[cfg(feature = "luau")] mod test_luau;