Fix chromosome naming issue in BAM file

VanLoo-lab · Jan 8, 2024 · 6b80eea · 6b80eea
1 parent 2ef6b81
commit 6b80eea
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 7 deletions.
diff --git a/R/allele_counts.R b/R/allele_counts.R
@@ -36,12 +36,6 @@ get_reads_in_segments <- function(bam_file, segments, min_mapq, paired_end = FAL
  asMates <- FALSE
  }
 
- # Set segment chromosome names based on sequence style in BAM
- # Required to read BAM correctly e.g. hg38 vs GRCh38 contigs
- is_ucsc <- startsWith(seqnames(seqinfo(BamFile(bam_file)))[[1]], "chr")
- bamstyle <- ifelse(is_ucsc, "UCSC", "NCBI")
- GenomeInfoDb::seqlevelsStyle(segments) <- bamstyle
-
  # Set parameters and read BamFile.
  # ScanBamParam "whats" options are available via scanBamWhat()
  param <- Rsamtools::ScanBamParam(which = segments, what = what, flag = flag, mapqFilter = min_mapq)
@@ -57,6 +51,7 @@ get_reads_in_segments <- function(bam_file, segments, min_mapq, paired_end = FAL
  )
 
  # Format BAM to ucsc format if not already
+ is_ucsc <- startsWith(seqnames(seqinfo(BamFile(bam_file)))[[1]], "chr")
  if (!is_ucsc & nrow(bam_dt) > 0) {
  bam_dt$rname <- paste0("chr", bam_dt$rname)
  }
@@ -818,6 +813,19 @@ filter_multi_snp_loci <- function(pileup_summary) {
  return(pileup_summary)
 }
 
+clean_and_limit_segments <- function(seg, bam_file) {
+ # Set segment chromosome names based on sequence style in BAM
+ # Required to read BAM correctly:
+ # e.g.: BAM having hg38 vs GRCh38 contigs 
+ # e.g.: Segments having chromosomes not currently in BAM
+ bchroms <- seqnames(seqinfo(BamFile(bam_file)))
+ is_ucsc <- startsWith(bchroms[[1]], "chr")
+ bamstyle <- ifelse(is_ucsc, "UCSC", "NCBI")
+ GenomeInfoDb::seqlevelsStyle(seg) <- bamstyle
+ GenomeInfoDb::seqlevels(seg, pruning.mode="coarse") <- bchroms 
+ return(seg)
+}
+
 # Wrapper ----
 
 #' @export
@@ -831,6 +839,12 @@ cwrap_get_allele_counts <- function(bam_file, seg, loci_dt = NA, paired_end, dro
  }
  }
 
+ # Limit segments to chromosomes covered in BAM file
+ seg <- clean_and_limit_segments(seg, bam_file)
+ if(length(seg) == 0) {
+ return(empty_count_alleles_result())
+ }
+
  # Read BAM and annotate SNP and CPG loci
  bam_dt <- get_reads_in_segments(bam_file, seg, min_mapq, paired_end = paired_end)
  if (nrow(bam_dt) == 0) {
@@ -883,3 +897,4 @@ cwrap_get_allele_counts <- function(bam_file, seg, loci_dt = NA, paired_end, dro
 
  return(result)
 }
+
diff --git a/R/asm_allele_counts.R b/R/asm_allele_counts.R
@@ -34,7 +34,8 @@ cwrap_asm_get_allele_counts <- function(
  bam_dt <- annotate_bam_with_loci_asm(bam_dt, loci_dt, drop_ccgg, paired_end)
 
  # Get qname to cpg mapping
- qname_hap_cg <- unique(bam_dt[, .(qname, hap_id, chrom, start, end)])
+ qname_hap_cg <- unique(bam_dt[, .(qname, hap_id, chrom=chrom, start=start, end=end)])
+ qname_hap_cg$chrom = gsub("chr", "", qname_hap_cg$chrom)
 
  # Split by ref and alt after filtering
  ref_bam <- bam_dt[hap_is_ref == T]