Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats: add --cache-threshold autoindex creation/deletion logic #1809

Merged
merged 4 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
stats: --cache-threshold tweaks
- now supports `-c` shortcut
- added the ability to specify a negative arg. When doing so, if the input file size is greater than the absolute value of the negative arg in bytes, an index is automatically created (as it greatly accelerates stats) AND caching is forced
  • Loading branch information
jqnatividad committed May 9, 2024
commit 63ddca3a02bd333eb95902fef64ca20d5f215605
28 changes: 20 additions & 8 deletions src/cmd/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,14 @@ stats options:
by using this option BEFORE running the `schema` and `tojsonl`
commands and they will automatically load the binary encoded
stats file if it exists.
--cache-threshold <arg> The threshold in milliseconds to cache the stats results.
If a stats run takes longer than this threshold, the stats
results will be cached. Set to 0 to suppress caching. Set
to 1 to force caching.
-c, --cache-threshold <arg> When greater than 1, the threshold in milliseconds before caching
stats results. If a stats run takes longer than this threshold,
the stats results will be cached.
Set to 0 to suppress caching.
Set to 1 to force caching.
Set to a negative number to automatically create an index
when the input file size is greater than abs(arg) in bytes
AND to force caching.
[default: 5000]

Common options:
Expand Down Expand Up @@ -174,7 +178,7 @@ Its type inferences are also used by the `tojsonl` command to generate properly
JSONL files.

To safeguard against undefined behavior, `stats` is the most extensively tested command,
with >480 tests.
with ~500 tests.
*/

use std::{
Expand Down Expand Up @@ -224,7 +228,7 @@ pub struct Args {
pub flag_force: bool,
pub flag_jobs: Option<usize>,
pub flag_stats_binout: bool,
pub flag_cache_threshold: u64,
pub flag_cache_threshold: isize,
pub flag_output: Option<String>,
pub flag_no_headers: bool,
pub flag_delimiter: Option<Delimiter>,
Expand Down Expand Up @@ -478,6 +482,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
util::mem_file_check(&path, false, args.flag_memcheck)?;
}

// check if flag_cache_threshold is a negative number,
// if so, set autoindex_size to the absolute value of that number
if args.flag_cache_threshold < 0 {
fconfig.autoindex_size = args.flag_cache_threshold.unsigned_abs() as u64;
}

// we need to count the number of records in the file to calculate sparsity
let record_count = RECORD_COUNT.get_or_init(|| util::count_rows(&fconfig).unwrap());
log::info!("scanning {record_count} records...");
Expand Down Expand Up @@ -520,7 +530,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
// update the stats args json metadata
current_stats_args.compute_duration_ms = start_time.elapsed().as_millis() as u64;

if create_cache && current_stats_args.compute_duration_ms > args.flag_cache_threshold {
if create_cache
&& current_stats_args.compute_duration_ms > args.flag_cache_threshold as u64
{
// if the stats run took longer than the cache threshold and the threshold > 0,
// cache the stats so we don't have to recompute it next time
current_stats_args.canonical_input_path =
Expand All @@ -532,7 +544,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
}

// ensure create_cache is also true if the user specified --cache-threshold 1
create_cache = create_cache || args.flag_cache_threshold == 1;
create_cache = create_cache || args.flag_cache_threshold == 1 || args.flag_cache_threshold < 0;

wtr.flush()?;

Expand Down
40 changes: 20 additions & 20 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,26 +72,26 @@ impl<'de> Deserialize<'de> for Delimiter {

#[derive(Clone, Debug)]
pub struct Config {
pub path: Option<PathBuf>, // None implies <stdin>
idx_path: Option<PathBuf>,
select_columns: Option<SelectColumns>,
delimiter: u8,
pub no_headers: bool,
pub flexible: bool,
terminator: csv::Terminator,
pub quote: u8,
quote_style: csv::QuoteStyle,
double_quote: bool,
escape: Option<u8>,
quoting: bool,
pub preamble_rows: u64,
trim: csv::Trim,
autoindex_size: u64,
prefer_dmy: bool,
pub comment: Option<u8>,
snappy: bool, // flag to enable snappy compression/decompression
pub read_buffer: u32,
pub write_buffer: u32,
pub path: Option<PathBuf>, // None implies <stdin>
idx_path: Option<PathBuf>,
select_columns: Option<SelectColumns>,
delimiter: u8,
pub no_headers: bool,
pub flexible: bool,
terminator: csv::Terminator,
pub quote: u8,
quote_style: csv::QuoteStyle,
double_quote: bool,
escape: Option<u8>,
quoting: bool,
pub preamble_rows: u64,
trim: csv::Trim,
pub autoindex_size: u64,
prefer_dmy: bool,
pub comment: Option<u8>,
snappy: bool, // flag to enable snappy compression/decompression
pub read_buffer: u32,
pub write_buffer: u32,
}

// Empty trait as an alias for Seek and Read that avoids auto trait errors
Expand Down