jqnatividad · jqnatividad · Mar 12, 2024 · Mar 12, 2024 · Mar 12, 2024
diff --git a/src/cmd/dedup.rs b/src/cmd/dedup.rs
@@ -10,11 +10,6 @@ than memory CSV files. This will make dedup run in streaming mode with constant
 
 Either way, the output will not only be deduplicated, it will also be sorted.
 
-Note that dedup's sorting will only be done alphabetically, not numerically. That is,
-10 will come before 2. If you need to sort numerically, use the sort command first with
-the --numeric option and pipe it to dedup with the --sorted option.
-(i.e. qsv sort --numeric in.csv | qsv dedup --sorted)
-
 A duplicate count will also be sent to <stderr>.
 
 For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_dedup.rs.
@@ -28,6 +23,7 @@ dedup options:
  Note that the outputs will remain at the full width
  of the CSV.
  See 'qsv select --help' for the format details.
+ -N, --numeric Compare according to string numerical value
  -i, --ignore-case Compare strings disregarding case.
  --sorted The input is already sorted. Do not load the CSV into
  memory to sort it first. Meant to be used in tandem and
@@ -63,7 +59,7 @@ use serde::Deserialize;
 use simdutf8::basic::from_utf8;
 
 use crate::{
- cmd::sort::iter_cmp,
+ cmd::sort::{iter_cmp, iter_cmp_num},
  config::{Config, Delimiter},
  select::SelectColumns,
  util, CliResult,
@@ -72,6 +68,7 @@ use crate::{
 struct Args {
  arg_input: Option<String>,
  flag_select: SelectColumns,
+ flag_numeric: bool,
  flag_ignore_case: bool,
  flag_sorted: bool,
  flag_dupes_output: Option<String>,
@@ -84,9 +81,23 @@ struct Args {
  flag_memcheck: bool,
 }
 
+enum ComparisonMode { // selects which comparator run() applies to the selected columns
+ Numeric, // chosen when --numeric is set (checked first, so it wins over --ignore-case); dispatches to iter_cmp_num
+ IgnoreCase, // chosen when --ignore-case is set and --numeric is not; dispatches to iter_cmp_ignore_case
+ Normal, // default when neither flag is set; dispatches to iter_cmp
+}
+
 pub fn run(argv: &[&str]) -> CliResult<()> {
  let args: Args = util::get_args(USAGE, argv)?;
- let ignore_case = args.flag_ignore_case;
+
+ let compare_mode = if args.flag_numeric {
+ ComparisonMode::Numeric
+ } else if args.flag_ignore_case {
+ ComparisonMode::IgnoreCase
+ } else {
+ ComparisonMode::Normal
+ };
+
  let rconfig = Config::new(&args.arg_input)
  .delimiter(args.flag_delimiter)
  .no_headers(args.flag_no_headers)
@@ -119,10 +130,10 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
  };
  let a = sel.select(&record);
  let b = sel.select(&next_record);
- let comparison = if ignore_case {
- iter_cmp_ignore_case(a, b)
- } else {
- iter_cmp(a, b)
+ let comparison = match compare_mode {
+ ComparisonMode::Normal => iter_cmp(a, b),
+  ComparisonMode::Numeric => iter_cmp_num(a, b),
+ ComparisonMode::IgnoreCase => iter_cmp_ignore_case(a, b),
  };
  match comparison {
  cmp::Ordering::Equal => {
@@ -152,40 +163,65 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
  util::njobs(args.flag_jobs);
 
  let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?;
- if ignore_case {
- all.par_sort_by(|r1, r2| {
- let a = sel.select(r1);
- let b = sel.select(r2);
- iter_cmp_ignore_case(a, b)
- });
- } else {
- all.par_sort_by(|r1, r2| {
- let a = sel.select(r1);
- let b = sel.select(r2);
- iter_cmp(a, b)
- });
+ match compare_mode {
+ ComparisonMode::Normal => {
+ all.par_sort_by(|r1, r2| {
+ let a = sel.select(r1);
+ let b = sel.select(r2);
+ iter_cmp(a, b)
+ });
+ },
+ ComparisonMode::Numeric => {
+ all.par_sort_by(|r1, r2| {
+ let a = sel.select(r1);
+ let b = sel.select(r2);
+ iter_cmp_num(a, b)
+ });
+ },
+ ComparisonMode::IgnoreCase => {
+ all.par_sort_by(|r1, r2| {
+ let a = sel.select(r1);
+ let b = sel.select(r2);
+ iter_cmp_ignore_case(a, b)
+ });
+ },
  }
 
  for (current, current_record) in all.iter().enumerate() {
  let a = sel.select(current_record);
  if let Some(next_record) = all.get(current + 1) {
  let b = sel.select(next_record);
- if ignore_case {
- if iter_cmp_ignore_case(a, b) == cmp::Ordering::Equal {
- dupe_count += 1;
- if dupes_output {
- dupewtr.write_byte_record(current_record)?;
+ match compare_mode {
+ ComparisonMode::Normal => {
+ if iter_cmp(a, b) == cmp::Ordering::Equal {
+ dupe_count += 1;
+ if dupes_output {
+ dupewtr.write_byte_record(current_record)?;
+ }
+ } else {
+ wtr.write_byte_record(current_record)?;
  }
- } else {
- wtr.write_byte_record(current_record)?;
- }
- } else if iter_cmp(a, b) == cmp::Ordering::Equal {
- dupe_count += 1;
- if dupes_output {
- dupewtr.write_byte_record(current_record)?;
- }
- } else {
- wtr.write_byte_record(current_record)?;
+ },
+ ComparisonMode::Numeric => {
+ if iter_cmp_num(a, b) == cmp::Ordering::Equal {
+ dupe_count += 1;
+ if dupes_output {
+ dupewtr.write_byte_record(current_record)?;
+ }
+ } else {
+ wtr.write_byte_record(current_record)?;
+ }
+ },
+ ComparisonMode::IgnoreCase => {
+ if iter_cmp_ignore_case(a, b) == cmp::Ordering::Equal {
+ dupe_count += 1;
+ if dupes_output {
+ dupewtr.write_byte_record(current_record)?;
+ }
+ } else {
+ wtr.write_byte_record(current_record)?;
+ }
+ },
  }
  } else {
  wtr.write_byte_record(current_record)?;

diff --git a/tests/test_dedup.rs b/tests/test_dedup.rs
@@ -74,6 +74,35 @@ fn dedup_issue_1381() {
  assert_eq!(got, expected);
 }
 
+#[test]
+fn dedup_issue_1665_numeric() { // exercises `dedup -N` (numeric comparison mode) on a single-column CSV
+ let wrk = Workdir::new("dedup_issue_1665_numeric");
+ wrk.create(
+ "in.csv",
+ vec![
+ svec!["data"], // presumably treated as the header row (no --no-headers flag is passed) - confirm
+ svec!["1"],
+ svec!["3"],
+ svec!["3"], // duplicate of the previous row; absent from `expected` below
+ svec!["5"],
+ svec!["10"],
+ ],
+ );
+
+ let mut cmd = wrk.command("dedup");
+ cmd.arg("-N").arg("in.csv");
+
+ let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+ let expected = vec![
+ svec!["data"],
+ svec!["1"],
+ svec!["3"], // only one "3" row remains after dedup
+ svec!["5"],
+ svec!["10"], // stays last: with -N, 10 sorts after 5 instead of alphabetically before it
+ ];
+ assert_eq!(got, expected);
+}
+
 #[test]
 fn dedup_select() {
  let wrk = Workdir::new("dedup_select");