Skip to content

Commit

Permalink
split: no need to convert to utf8, just count the number of bytes in the vec slice
Browse files Browse the repository at this point in the history

simplifies code, makes it work with non-UTF8 encoded CSVs, and makes it faster to boot
  • Loading branch information
jqnatividad committed Feb 23, 2024
1 parent be74040 commit 9fc5ad1
Showing 1 changed file with 12 additions and 34 deletions.
46 changes: 12 additions & 34 deletions src/cmd/split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ use crate::{
config::{Config, Delimiter},
index::Indexed,
util::{self, FilenameTemplate},
CliError, CliResult,
CliResult,
};

#[derive(Clone, Deserialize)]
Expand All @@ -125,8 +125,6 @@ struct Args {
flag_quiet: bool,
}

/// Message carried by `CliError::Encoding` when a UTF-8 conversion fails.
static UTF8_ERROR: &str = "UTF-8 Encoding error";

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
if args.flag_size == 0 {
Expand All @@ -151,66 +149,46 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
}
}

/// Converts a `simdutf8::basic::Utf8Error` into a `CliError`.
///
/// The incoming error value is ignored; every conversion produces
/// `CliError::Encoding` holding the fixed `UTF8_ERROR` message.
// NOTE(review): this page is a rendered diff with its +/- markers stripped;
// this impl appears to be among the lines the commit deletes — confirm
// against the actual diff.
impl From<simdutf8::basic::Utf8Error> for CliError {
fn from(_: simdutf8::basic::Utf8Error) -> Self {
CliError::Encoding(UTF8_ERROR.to_string())
}
}

impl Args {
fn split_by_kb_size(&self, chunk_size: usize) -> CliResult<()> {
let rconfig = self.rconfig();
let mut rdr = rconfig.reader()?;
let headers = rdr.byte_headers()?.clone();

let mut headerbuf_wtr = csv::WriterBuilder::new().from_writer(vec![]);

headerbuf_wtr.write_byte_record(&headers)?;
let header_byte_size = if self.flag_no_headers {
0
} else {
let mut headerbuf_wtr = csv::WriterBuilder::new().from_writer(vec![]);
headerbuf_wtr.write_byte_record(&headers)?;

// safety: we know the inner vec is valid
let header_string =
simdutf8::basic::from_utf8(&headerbuf_wtr.into_inner().unwrap())?.to_string();
header_string.len()
headerbuf_wtr.into_inner().unwrap().len()
};

let mut wtr = self.new_writer(&headers, 0, self.flag_pad)?;
let mut i = 0;
let mut num_chunks = 0;
let mut row = csv::ByteRecord::new();
let chunk_size_bytes = chunk_size * 1024;
let mut buf_curr_string = String::with_capacity(chunk_size_bytes);
let mut buf_next_string = String::with_capacity(chunk_size_bytes);
let mut chunk_size_bytes_left = chunk_size_bytes - header_byte_size;

let mut not_empty = rdr.read_byte_record(&mut row)?;
let mut curr_size_bytes = buf_curr_string.len();
chunk_size_bytes_left -= curr_size_bytes;
let mut curr_size_bytes;
let mut next_size_bytes;
wtr.write_byte_record(&row)?;

while not_empty {
let mut buf_curr_wtr = csv::WriterBuilder::new().from_writer(vec![]);
buf_curr_wtr.write_byte_record(&row)?;
buf_curr_string.clear();
buf_curr_string.push_str(simdutf8::basic::from_utf8(
&buf_curr_wtr
.into_inner()
.map_err(|_| CliError::Encoding(UTF8_ERROR.to_string()))?,
)?);
curr_size_bytes = buf_curr_string.len();

curr_size_bytes = buf_curr_wtr.into_inner().unwrap().len();

not_empty = rdr.read_byte_record(&mut row)?;
let next_size_bytes = if not_empty {
next_size_bytes = if not_empty {
let mut buf_next_wtr = csv::WriterBuilder::new().from_writer(vec![]);
buf_next_wtr.write_byte_record(&row)?;
buf_next_string.clear();
buf_next_string.push_str(simdutf8::basic::from_utf8(
&buf_next_wtr
.into_inner()
.map_err(|_| CliError::Encoding(UTF8_ERROR.to_string()))?,
)?);
buf_next_string.len()

buf_next_wtr.into_inner().unwrap().len()
} else {
0
};
Expand Down

0 comments on commit 9fc5ad1

Please sign in to comment.