Skip to content

Commit

Permalink
Increased performance ~20% of write_to_tsv() by removing clone().
Browse files Browse the repository at this point in the history
  - This introduced two new non-borrowing structs,
    `GenomicRangeRecordEmptyBorrowed` and `GenomicRangeRecordBorrowed`,
    which pass the sequence name by reference, avoiding a
    `clone()` which can be quite costly when serialzing
    many genomic ranges.

 - `seqname()` in `GenomicRangeRecord` also know returns a reference,
   which seems to also improve performance in some situations.

 - Slight change to `GRanges::push_range()` methods, for optimziation
   (simplified code a bit).
  • Loading branch information
vsbuffalo committed May 4, 2024
1 parent b8cf886 commit 9182a90
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 24 deletions.
24 changes: 10 additions & 14 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,11 @@ where
config: &TsvConfig,
) -> Result<(), GRangesError> {
let mut writer = build_tsv_writer_with_config(output, config)?;
let seqnames = &self.ranges.sorted_keys;

for range in self.iter_ranges() {
let record = range.to_record(
&self.seqnames(),
&seqnames,
self.data.as_ref().ok_or(GRangesError::NoDataContainer)?,
);
writer.serialize(record)?;
Expand All @@ -268,10 +269,11 @@ impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRanges
output: Option<impl Into<PathBuf>>,
config: &TsvConfig,
) -> Result<(), GRangesError> {
let seqnames = &self.0.ranges.sorted_keys;
let mut writer = build_tsv_writer_with_config(output, config)?;

for range in self.iter_ranges() {
let record = range.to_record_empty::<()>(&self.seqnames());
let record = range.to_record_empty::<()>(&seqnames);
writer.serialize(record)?;
}

Expand Down Expand Up @@ -738,11 +740,8 @@ impl<U> GRanges<VecRangesIndexed, Vec<U>> {
self.data = Some(Vec::new());
}
let data_ref = self.data.as_mut().ok_or(GRangesError::NoDataContainer)?;
// push data to the vec data container, getting the index
let index: usize = {
data_ref.push(data);
data_ref.len() - 1 // new data index
};
let index = data_ref.len();
data_ref.push(data);
// push an indexed range
let range = RangeIndexed::new(start, end, index);
let range_container = self
Expand Down Expand Up @@ -797,11 +796,8 @@ impl<'a, DL, DR> GRanges<VecRangesIndexed, JoinData<'a, DL, DR>> {
panic!("Internal error: JoinData not initialized.");
}
let data_ref = self.data.as_mut().ok_or(GRangesError::NoDataContainer)?;
// push data to the vec data container, getting the index
let index: usize = {
data_ref.push(data);
data_ref.len() - 1 // new data index
};
let index = data_ref.len();
data_ref.push(data);
// push an indexed range
let range = RangeIndexed::new(start, end, index);
let range_container = self
Expand Down Expand Up @@ -1872,7 +1868,7 @@ mod tests {
let seqnames = sl.keys().map(|x| x.to_string()).collect::<Vec<_>>();
let actual_ranges: Vec<(String, Position, Position)> = gr
.iter_ranges()
.map(|r| (r.seqname(&seqnames), r.start(), r.end()))
.map(|r| (r.seqname(&seqnames).to_string(), r.start(), r.end()))
.collect();
assert_eq!(actual_ranges, expected_ranges_chop);

Expand Down Expand Up @@ -1904,7 +1900,7 @@ mod tests {

let actual_ranges: Vec<(String, Position, Position)> = gr
.iter_ranges()
.map(|r| (r.seqname(&seqnames), r.start(), r.end()))
.map(|r| (r.seqname(&seqnames).to_string(), r.start(), r.end()))
.collect();

assert_eq!(actual_ranges, expected_ranges_no_chop);
Expand Down
41 changes: 31 additions & 10 deletions src/ranges/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,10 +163,30 @@ impl AdjustableGenericRange for RangeIndexed {
}
}

/// Represents a genomic range entry with some borrowed data.
/// This is used primarily as a temporary store for deserializing
/// a genomic range.
#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
pub struct GenomicRangeRecordEmptyBorrowed<'a> {
pub seqname: &'a str,
pub start: Position,
pub end: Position,
}

/// Represents a genomic range entry with some borrowed data.
#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
pub struct GenomicRangeRecordBorrowed<'a, U> {
pub seqname: &'a str,
pub start: Position,
pub end: Position,
pub data: U,
}

/// Represents a genomic range entry with some data.
///
/// This is used as a type for holding a range with associated data directly
/// from a parser.
/// from a parser. Thus it owns the memory it uses for the seqname and the
/// data.
#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
pub struct GenomicRangeRecord<U> {
pub seqname: String,
Expand Down Expand Up @@ -364,29 +384,30 @@ impl GenomicRangeIndexedRecord {
}
/// Using the *corresponding ordered* sequence names the `seqname_index` indices
/// correspond to, get the sequence name.
pub fn seqname(&self, seqnames: &[String]) -> String {
seqnames[self.seqname_index].clone()
pub fn seqname<'a>(&'a self, seqnames: &'a [String]) -> &String {
&seqnames[self.seqname_index]
}
pub fn to_record<'a, T>(
self,
seqnames: &[String],
seqnames: &'a[String],
data: &'a T,
) -> GenomicRangeRecord<<T as IndexedDataContainer>::Item<'a>>
) -> GenomicRangeRecordBorrowed<'a, <T as IndexedDataContainer>::Item<'a>>
where
T: IndexedDataContainer,
{
let data = data.get_value(self.index().unwrap());

GenomicRangeRecord {
seqname: seqnames[self.seqname_index].clone(),
GenomicRangeRecordBorrowed {
seqname: &seqnames[self.seqname_index],
start: self.start,
end: self.end,
data,
}
}
pub fn to_record_empty<T>(self, seqnames: &[String]) -> GenomicRangeRecordEmpty {
GenomicRangeRecordEmpty {
seqname: seqnames[self.seqname_index].clone(),
pub fn to_record_empty<'a, T>(self, seqnames: &'a[String]) ->
GenomicRangeRecordEmptyBorrowed<'a> {
GenomicRangeRecordEmptyBorrowed {
seqname: &seqnames[self.seqname_index],
start: self.start,
end: self.end,
}
Expand Down

0 comments on commit 9182a90

Please sign in to comment.