Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats: add --cache-threshold autoindex creation/deletion logic #1809

Merged
merged 4 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
stats: --cache-threshold tweaks
- now supports `-c` shortcut
- added the ability to specify a negative arg. When doing so, if the input file size is greater than the absolute value of the negative arg in bytes, an index is automatically created (as it greatly accelerates stats) AND caching is forced
  • Loading branch information
jqnatividad committed May 9, 2024
commit 63ddca3a02bd333eb95902fef64ca20d5f215605
28 changes: 20 additions & 8 deletions src/cmd/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,14 @@ stats options:
by using this option BEFORE running the `schema` and `tojsonl`
commands and they will automatically load the binary encoded
stats file if it exists.
--cache-threshold <arg> The threshold in milliseconds to cache the stats results.
If a stats run takes longer than this threshold, the stats
results will be cached. Set to 0 to suppress caching. Set
to 1 to force caching.
-c, --cache-threshold <arg> When greater than 1, the threshold in milliseconds before caching
stats results. If a stats run takes longer than this threshold,
the stats results will be cached.
Set to 0 to suppress caching.
Set to 1 to force caching.
Set to a negative number to automatically create an index
when the input file size is greater than abs(arg) in bytes
AND to force caching.
[default: 5000]

Common options:
Expand Down Expand Up @@ -174,7 +178,7 @@ Its type inferences are also used by the `tojsonl` command to generate properly
JSONL files.

To safeguard against undefined behavior, `stats` is the most extensively tested command,
with >480 tests.
with ~500 tests.
*/

use std::{
Expand Down Expand Up @@ -224,7 +228,7 @@ pub struct Args {
pub flag_force: bool,
pub flag_jobs: Option<usize>,
pub flag_stats_binout: bool,
pub flag_cache_threshold: u64,
pub flag_cache_threshold: isize,
pub flag_output: Option<String>,
pub flag_no_headers: bool,
pub flag_delimiter: Option<Delimiter>,
Expand Down Expand Up @@ -478,6 +482,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
util::mem_file_check(&path, false, args.flag_memcheck)?;
}

// check if flag_cache_threshold is a negative number,
// if so, set autoindex_size to the absolute value of that number
if args.flag_cache_threshold < 0 {
fconfig.autoindex_size = args.flag_cache_threshold.unsigned_abs() as u64;
}

// we need to count the number of records in the file to calculate sparsity
let record_count = RECORD_COUNT.get_or_init(|| util::count_rows(&fconfig).unwrap());
log::info!("scanning {record_count} records...");
Expand Down Expand Up @@ -520,7 +530,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
// update the stats args json metadata
current_stats_args.compute_duration_ms = start_time.elapsed().as_millis() as u64;

if create_cache && current_stats_args.compute_duration_ms > args.flag_cache_threshold {
if create_cache
&& current_stats_args.compute_duration_ms > args.flag_cache_threshold as u64
{
// if the stats run took longer than the cache threshold and the threshold > 0,
// cache the stats so we don't have to recompute it next time
current_stats_args.canonical_input_path =
Expand All @@ -532,7 +544,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
}

// ensure create_cache is also true if the user specified --cache-threshold 1
create_cache = create_cache || args.flag_cache_threshold == 1;
create_cache = create_cache || args.flag_cache_threshold == 1 || args.flag_cache_threshold < 0;

wtr.flush()?;

Expand Down
40 changes: 20 additions & 20 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,26 +72,26 @@ impl<'de> Deserialize<'de> for Delimiter {

#[derive(Clone, Debug)]
pub struct Config {
pub path: Option<PathBuf>, // None implies <stdin>
idx_path: Option<PathBuf>,
select_columns: Option<SelectColumns>,
delimiter: u8,
pub no_headers: bool,
pub flexible: bool,
terminator: csv::Terminator,
pub quote: u8,
quote_style: csv::QuoteStyle,
double_quote: bool,
escape: Option<u8>,
quoting: bool,
pub preamble_rows: u64,
trim: csv::Trim,
autoindex_size: u64,
prefer_dmy: bool,
pub comment: Option<u8>,
snappy: bool, // flag to enable snappy compression/decompression
pub read_buffer: u32,
pub write_buffer: u32,
pub path: Option<PathBuf>, // None implies <stdin>
idx_path: Option<PathBuf>,
select_columns: Option<SelectColumns>,
delimiter: u8,
pub no_headers: bool,
pub flexible: bool,
terminator: csv::Terminator,
pub quote: u8,
quote_style: csv::QuoteStyle,
double_quote: bool,
escape: Option<u8>,
quoting: bool,
pub preamble_rows: u64,
trim: csv::Trim,
pub autoindex_size: u64,
prefer_dmy: bool,
pub comment: Option<u8>,
snappy: bool, // flag to enable snappy compression/decompression
pub read_buffer: u32,
pub write_buffer: u32,
}

// Empty trait as an alias for Seek and Read that avoids auto trait errors
Expand Down