Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

validate: add --trim and --quiet options #1452

Merged
merged 2 commits on
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
validate: Add --trim option
  • Loading branch information
jqnatividad committed Dec 5, 2023
commit d590f7d1c9809027aa531b8dc6f6b1e7cad7f7db
10 changes: 6 additions & 4 deletions src/cmd/validate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ Validate arguments:
The file can be a local file or a URL.

Validate options:
--trim Trim leading and trailing whitespace from fields before validating.
--fail-fast Stops on first error.
--valid <suffix> Valid record output file suffix. [default: valid]
--invalid <suffix> Invalid record output file suffix. [default: invalid]
Expand Down Expand Up @@ -139,6 +140,7 @@ static TIMEOUT_SECS: AtomicU16 = AtomicU16::new(15);
#[derive(Deserialize)]
#[allow(dead_code)]
struct Args {
flag_trim: bool,
flag_fail_fast: bool,
flag_valid: Option<String>,
flag_invalid: Option<String>,
Expand Down Expand Up @@ -466,6 +468,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let mut validation_results = Vec::with_capacity(batch_size);
let mut valid_flags: Vec<bool> = Vec::with_capacity(batch_size);
let mut validation_error_messages: Vec<String> = Vec::with_capacity(50);
let flag_trim = args.flag_trim;

// set RAYON_NUM_THREADS
util::njobs(args.flag_jobs);
Expand All @@ -481,10 +484,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
if has_data {
row_number += 1;
record.push_field(buffer.format(row_number).as_bytes());

// non-allocating trimming in place is much faster on the record level
// with our csv fork than doing per field std::str::trim which is allocating
record.trim();
if flag_trim {
record.trim();
}
batch.push(record.clone());
} else {
// nothing else to add to batch
Expand Down
99 changes: 93 additions & 6 deletions tests/test_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@

#[test]
#[file_serial]
fn generate_schema_with_defaults_and_validate_with_no_errors() {
fn generate_schema_with_defaults_and_validate_trim_with_no_errors() {
// create workspace and invoke schema command with value constraints flag
let wrk =
Workdir::new("fn generate_schema_with_defaults_and_validate_with_no_errors").flexible(true);
let wrk = Workdir::new("fn generate_schema_with_defaults_and_validate_trim_with_no_errors")
.flexible(true);
wrk.clear_contents().unwrap();

// copy csv file to workdir
Expand Down Expand Up @@ -44,6 +44,7 @@
// invoke validate command from schema created above
let mut cmd2 = wrk.command("validate");
cmd2.arg("adur-public-toilets.csv");
cmd2.arg("--trim");
Dismissed Show dismissed Hide dismissed
cmd2.arg("adur-public-toilets.csv.schema.json");
wrk.output(&mut cmd2);

Expand All @@ -58,10 +59,10 @@

#[test]
#[file_serial]
fn generate_schema_with_optional_flags_and_validate_with_errors() {
fn generate_schema_with_optional_flags_notrim_and_validate_with_errors() {
// create workspace and invoke schema command with value constraints flag
let wrk =
Workdir::new("generate_schema_with_optional_flags_and_validate_with_errors").flexible(true);
let wrk = Workdir::new("generate_schema_with_optional_flags_notrim_and_validate_with_errors")
.flexible(true);
wrk.clear_contents().unwrap();

// copy csv file to workdir
Expand Down Expand Up @@ -101,6 +102,92 @@
cmd2.arg("adur-public-toilets.csv.schema.json");
wrk.output(&mut cmd2);

// validation report
let validation_errors_expected = r#"row_number field error
1 OpeningHours "S = 09:00 - 21:00 W = 09:00 - 17:00 " is not one of ["09.00 - 17.00","S = 08:00 - 21:00 W = 08:00 - 17:00","S = 09:00 - 15:00 W = 09:00 - 15:00","S = 09:00 - 21:00 W = 09:00 - 17:00",null]
2 ExtractDate "07/07/2014 00:00" is not a "date"
3 ExtractDate "2014-07-07 00:00" is not a "date"
4 ExtractDate "07/07/2014 00:00" is not a "date"
5 ExtractDate "07/07/2014 00:00" is not a "date"
6 ExtractDate "07/07/2014 00:00" is not a "date"
7 ExtractDate "07/07/2014 00:00" is not a "date"
8 ExtractDate "07/07/2014 00:00" is not a "date"
9 ExtractDate "07/07/2014 00:00" is not a "date"
10 ExtractDate "07/07/2014 00:00" is not a "date"
11 ExtractDate "07/07/2014 00:00" is not a "date"
12 ExtractDate "07/07/2014 00:00" is not a "date"
13 ExtractDate "07/07/2014 00:00" is not a "date"
14 ExtractDate "07/07/2014 00:00" is not a "date"
15 ExtractDate "07/07/2014 00:00" is not a "date"
"#;

// expecting invalid rows, so confirm there ARE output files generated
let validation_error_path = &wrk.path("adur-public-toilets.csv.validation-errors.tsv");
println!("expecting validation error file at: {validation_error_path:?}");

assert!(Path::new(validation_error_path).exists());
assert!(Path::new(&wrk.path("adur-public-toilets.csv.valid")).exists());
assert!(Path::new(&wrk.path("adur-public-toilets.csv.invalid")).exists());

// check validation error output
let validation_error_output: String =
wrk.from_str(&wrk.path("adur-public-toilets.csv.validation-errors.tsv"));

assert!(!validation_error_output.is_empty());

assert_eq!(
validation_errors_expected.to_string(),
validation_error_output
);
wrk.assert_err(&mut cmd2);
Dismissed Show dismissed Hide dismissed
}

#[test]
#[file_serial]
fn generate_schema_with_optional_flags_trim_and_validate_with_errors() {
// create workspace and invoke schema command with value constraints flag
let wrk = Workdir::new("generate_schema_with_optional_flags_trim_and_validate_with_errors")
.flexible(true);
wrk.clear_contents().unwrap();

// copy csv file to workdir
let csv = wrk.load_test_resource("adur-public-toilets.csv");
wrk.create_from_string("adur-public-toilets.csv", &csv);

// run schema command with value constraints option
let mut cmd = wrk.command("schema");
cmd.arg("adur-public-toilets.csv");
cmd.arg("--enum-threshold");
cmd.arg("13");
cmd.arg("--pattern-columns");
cmd.arg("ReportEmail,OpeningHours");
cmd.arg("--strict-dates");
wrk.output(&mut cmd);

// load output schema file
let output_schema_string: String =
wrk.from_str(&wrk.path("adur-public-toilets.csv.schema.json"));
let output_schema_json =
serde_json::from_str(&output_schema_string).expect("parse schema json");

// make sure it's a valid JSON Schema by compiling with jsonschema library
jsonschema::JSONSchema::options()
.compile(&output_schema_json)
.expect("valid JSON Schema");

// diff output json with expected json
let expected_schema: String =
wrk.load_test_resource("adur-public-toilets.csv.schema-strict.expected.json");
let expected_schema_json: Value = serde_json::from_str(&expected_schema).unwrap();
assert_json_eq!(expected_schema_json, output_schema_json);

// invoke validate command from schema created above
let mut cmd2 = wrk.command("validate");
Dismissed Show dismissed Hide dismissed
cmd2.arg("adur-public-toilets.csv");
Dismissed Show dismissed Hide dismissed
cmd2.arg("--trim");
Dismissed Show dismissed Hide dismissed
cmd2.arg("adur-public-toilets.csv.schema.json");
Dismissed Show dismissed Hide dismissed
wrk.output(&mut cmd2);

// validation report
let validation_errors_expected = r#"row_number field error
2 ExtractDate "07/07/2014 00:00" is not a "date"
Expand Down
Loading