Skip to content

Commit

Permalink
Merge pull request #1867 from jqnatividad/1864-select-reverse-sort
Browse files Browse the repository at this point in the history
`select`: add `--sort`, `--random` & `--seed` options; also add 9999 sentinel value to indicate last column
  • Loading branch information
jqnatividad committed Jun 9, 2024
2 parents 0fb9f25 + 9734b0b commit 6af3857
Show file tree
Hide file tree
Showing 3 changed files with 181 additions and 2 deletions.
90 changes: 89 additions & 1 deletion src/cmd/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,20 @@ selected using regular expressions.
Select the third column named 'Foo':
$ qsv select 'Foo[2]'
Select the first and last columns (9999 is a special index that always means the last column):
$ qsv select 1,9999
Reverse the order of columns:
$ qsv select 9999-1
Sort the columns lexicographically. A selector is still required but is ignored, so pass a dummy one (e.g. 1):
$ qsv select 1 --sort
Randomly shuffle the columns:
$ qsv select 1 --random
# with a seed
$ qsv select 1 --random --seed 42
Select columns with a regex, written as '/<regex>/':
$ qsv select /^a/
$ qsv select '/^.*\d.*$/'
Expand All @@ -39,6 +53,16 @@ Usage:
qsv select [options] [--] <selection> [<input>]
qsv select --help
select options:
These options only apply to the `select` command, not the `--select` flag in other commands.
A selector argument is still required when using them, but its value is ignored;
provide a dummy selector (e.g. '1') to avoid command-line parsing errors.
-R, --random Randomly reorder the columns.
--seed <number> Seed for the random number generator.
-S, --sort Sort the columns lexicographically, i.e. by their
byte values.
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
Expand All @@ -49,6 +73,7 @@ Common options:
Must be a single character. (default: ,)
"#;

use rand::{seq::SliceRandom, SeedableRng};
use serde::Deserialize;

use crate::{
Expand All @@ -61,13 +86,76 @@ use crate::{
/// Command-line arguments of the `select` command, populated from the usage
/// text via `util::get_args`.
struct Args {
/// Optional `<input>` argument; handed to `Config::new`.
/// NOTE(review): presumably `None` means "read from stdin" — confirm in `Config`.
arg_input: Option<String>,
/// The `<selection>` argument. It is overwritten with a generated selection
/// when `--random` or `--sort` is given, which is why a dummy value is accepted.
arg_selection: SelectColumns,
/// `-R, --random`: randomly reorder the columns. Mutually exclusive with `--sort`.
flag_random: bool,
/// `--seed <number>`: seed for the random number generator used by `--random`.
/// When absent, the generator is seeded from entropy.
flag_seed: Option<u64>,
/// `-S, --sort`: sort the columns lexicographically, i.e. by the byte values
/// of their header names. Mutually exclusive with `--random`.
flag_sort: bool,
/// `-o, --output <file>`: write output to `<file>` instead of stdout.
flag_output: Option<String>,
/// NOTE(review): the usage text for this flag is not visible here; presumably
/// it makes the first row be treated as data rather than as headers — confirm.
flag_no_headers: bool,
/// Field delimiter passed to `Config::delimiter`.
/// NOTE(review): the usage text suggests a single character defaulting to ',' — confirm.
flag_delimiter: Option<Delimiter>,
}

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
let mut args: Args = util::get_args(USAGE, argv)?;

if args.flag_sort && args.flag_random {
return fail_clierror!("Cannot use both --random and --sort flags.");
}

if args.flag_random {
// get the number of columns
let num_cols = Config::new(&args.arg_input)
.delimiter(args.flag_delimiter)
.no_headers(true)
.reader()?
.byte_headers()?
.len();

// make a vector of the column indices (1-indexed).
let mut original_selection: Vec<usize> = (1..=num_cols).collect();

// Use seed if it is provided.
let mut rng = if let Some(seed) = args.flag_seed {
rand::rngs::StdRng::seed_from_u64(seed) // DevSkim: ignore DS148264
} else {
rand::rngs::StdRng::from_entropy()
};

// Shuffle the vector of column indices.
original_selection.shuffle(&mut rng);

// Convert the shuffled indices into a comma-separated string.
let randomized_selection = original_selection
.into_iter()
.map(|i| i.to_string())
.collect::<Vec<String>>()
.join(",");

// Parse the shuffled string into a SelectColumns object.
args.arg_selection = SelectColumns::parse(&randomized_selection)?;
}

if args.flag_sort {
// get the headers
let headers = Config::new(&args.arg_input)
.delimiter(args.flag_delimiter)
.reader()?
.byte_headers()?
.clone();

// sort the headers lexicographically
let mut sorted_headers = Vec::with_capacity(headers.len());
sorted_headers.extend(headers.iter().map(<[u8]>::to_vec));
sorted_headers.sort_unstable();

// make a comma-separated string of the sorted, quoted headers
let sorted_selection = sorted_headers
.iter()
.map(|h| format!("\"{}\"", String::from_utf8_lossy(h)))
.collect::<Vec<String>>()
.join(",");

args.arg_selection = SelectColumns::parse(&sorted_selection)?;
}

let rconfig = Config::new(&args.arg_input)
.delimiter(args.flag_delimiter)
Expand Down
6 changes: 5 additions & 1 deletion src/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -318,10 +318,14 @@ impl OneSelector {
} else {
first_record.len() - 1
}),
OneSelector::Index(i) => {
OneSelector::Index(mut i) => {
if first_record.is_empty() {
return fail!("Input is empty.");
}
// 9999 is a sentinel value that means "last column".
if i == 9999 {
i = first_record.len();
}
if i < 1 || i > first_record.len() {
fail_format!(
"Selector index {i} is out of bounds. Index must be >= 1 and <= {}.",
Expand Down
87 changes: 87 additions & 0 deletions tests/test_select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ select_test!(
["a", "b", "d", "e"]
);

// Exercises the 9999 "last column" sentinel as the start of a descending range:
// `9999-1` is expected to yield all five columns in reverse order, the same
// result as the explicit range `5-1`.
// NOTE(review): the third argument ("5-1") looks like the equivalent selector
// used for a second variant of the test (e.g. one run without headers) —
// confirm against the `select_test!` macro definition.
select_test!(
select_reverse_sentinel,
r#"9999-1"#,
"5-1",
["h1", "h4", "h[]3", "h2", "h1"],
["e", "d", "c", "b", "a"]
);

// Error cases: a header name that does not exist, and column indices outside
// the accepted range (the selector rejects `i < 1` and `i > number of columns`).
// NOTE(review): "6" is assumed to be out of bounds because the shared test data
// appears to have five columns, and `select_test_err!` presumably asserts that
// the command fails — confirm against the macro and fixture definitions.
select_test_err!(select_err_unknown_header, "done");
select_test_err!(select_err_oob_low, "0");
select_test_err!(select_err_oob_high, "6");
Expand All @@ -193,3 +201,82 @@ select_test_err!(select_err_regex_nomatch, "/nomatch/");
// Malformed regex selectors: an invalid pattern, an empty pattern, and a stray
// third slash. Each is expected to make `select` fail.
// NOTE(review): presumably enforced by `select_test_err!` — the macro is not visible here.
select_test_err!(select_err_regex_invalid, "/?/");
select_test_err!(select_err_regex_empty, "//");
select_test_err!(select_err_regex_triple_slash, "///");

/// Builds the CSV fixture shared by the `--sort` and `--random` tests.
///
/// The table has ten columns and three data rows: ascending
/// `value1..value10`, the numbers `1..10`, and descending `value10..value1`.
/// When `headers` is `true`, a row of deliberately unordered, partly
/// non-ASCII names is placed first so that it acts as the header record.
fn unsorted_data(headers: bool) -> Vec<Vec<String>> {
    let mut table = Vec::with_capacity(4);

    // The optional header row always goes first.
    if headers {
        table.push(svec![
            "Günther", "Alice", "Çemil", "Đan", "Fátima", "Héctor", "İbrahim", "Bob", "Jürgen",
            "Élise"
        ]);
    }

    table.extend([
        svec![
            "value1", "value2", "value3", "value4", "value5", "value6", "value7", "value8",
            "value9", "value10"
        ],
        svec!["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
        svec![
            "value10", "value9", "value8", "value7", "value6", "value5", "value4", "value3",
            "value2", "value1"
        ],
    ]);

    table
}

/// `select --sort` must reorder the columns by the byte-wise order of their
/// header names, so the ASCII-initial names come first and the names starting
/// with multi-byte characters (Ç, É, Đ, İ) follow in byte order.
#[test]
fn test_select_sort() {
    let workdir = Workdir::new("test_select_sort");
    workdir.create("data.csv", unsorted_data(true));

    // The leading "1" is only a dummy selector; `--sort` replaces it.
    let mut select_cmd = workdir.command("select");
    select_cmd.args(["1", "--sort", "data.csv"]);

    let want = vec![
        svec![
            "Alice", "Bob", "Fátima", "Günther", "Héctor", "Jürgen", "Çemil", "Élise", "Đan",
            "İbrahim"
        ],
        svec![
            "value2", "value8", "value5", "value1", "value6", "value9", "value3", "value10",
            "value4", "value7"
        ],
        svec!["2", "8", "5", "1", "6", "9", "3", "10", "4", "7"],
        svec![
            "value9", "value3", "value6", "value10", "value5", "value2", "value8", "value1",
            "value7", "value4"
        ],
    ];

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut select_cmd);
    assert_eq!(actual, want);
}

/// `select --random --seed 42` must produce one fixed column permutation.
///
/// The expected order below is whatever the seeded RNG behind `--random`
/// produces for seed 42; it has to be regenerated if that RNG or the shuffle
/// implementation ever changes.
#[test]
fn test_select_random_seeded() {
    let workdir = Workdir::new("test_select_random_seeded");
    workdir.create("data.csv", unsorted_data(true));

    // The leading "1" is only a dummy selector; `--random` replaces it.
    let mut select_cmd = workdir.command("select");
    select_cmd.args(["1", "--random", "--seed", "42", "data.csv"]);

    let want = vec![
        svec![
            "Bob", "Đan", "Élise", "Héctor", "Günther", "Jürgen", "İbrahim", "Fátima", "Çemil",
            "Alice"
        ],
        svec![
            "value8", "value4", "value10", "value6", "value1", "value9", "value7", "value5",
            "value3", "value2"
        ],
        svec!["8", "4", "10", "6", "1", "9", "7", "5", "3", "2"],
        svec![
            "value3", "value7", "value1", "value5", "value10", "value2", "value4", "value6",
            "value8", "value9"
        ],
    ];

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut select_cmd);
    assert_eq!(actual, want);
}

0 comments on commit 6af3857

Please sign in to comment.