Skip to content

Commit

Permalink
Fix FAST5 base QC errors
Browse files Browse the repository at this point in the history
  • Loading branch information
jonperdomo committed Jul 31, 2023
1 parent 6a0e6d0 commit 1de24bd
Show file tree
Hide file tree
Showing 8 changed files with 301 additions and 72 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
- name: Set up conda environment
uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: lrst_py39
activate-environment: longreadsum
environment-file: environment.yml
python-version: 3.9
auto-activate-base: false
Expand Down
45 changes: 45 additions & 0 deletions .github/workflows/build-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# This workflow will build and test LongReadSum

name: build tests

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
build-and-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3

- name: Download assets
uses: dsaltares/[email protected]
with:
repo: 'WGLab/LongReadSum'
version: 'tags/v0.1.0'
file: 'SampleData.zip'
target: 'TestInputs/SampleData.zip'

- name: Unzip assets
shell: bash --login {0}
run: unzip TestInputs/SampleData.zip -d TestInputs

- name: Set up conda environment
uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: lrst_py39
environment-file: environment.yml
python-version: 3.9
auto-activate-base: false

- name: Build LongReadSum
shell: bash --login {0} # --login enables PATH variable access
run: make

- name: Run tests
shell: bash --login {0}
run: |
mkdir output # Output folder for tests
pytest
30 changes: 30 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# VSCode
.vscode
.idea

# Python
__pycache__

# SWIG generated files
lib/*.so
lib/lrst.py
src/lrst_wrap.cpp

# Build and dist
build
dist

# Conda
conda

# Outputs
output

# Sample data
SampleData

# Logs
*.log

# Testing scripts
tests/SCRIPTS.txt
1 change: 1 addition & 0 deletions include/output_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ class Output_FAST5 : public Output_FA
std::string getNthReadName(int read_index);
std::string getNthReadSequence(int read_index);
void addReadBaseSignals(Base_Signals values);
void addReadFastq(std::vector<std::string> fq, FILE *read_details_fp);
std::vector<std::vector<int>> getNthReadBaseSignals(int read_index);
std::vector<double> getNthReadBaseMeans(int read_index);
std::vector<double> getNthReadBaseStds(int read_index);
Expand Down
109 changes: 38 additions & 71 deletions src/fast5_module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -374,83 +374,50 @@ static int writeBaseQCDetails(const char *input_file, Output_FAST5 &output_data,
// Open the file
H5::H5File f5 = H5::H5File(input_file, H5F_ACC_RDONLY);

// Get the FASTQ dataset
std::vector<std::string> fq = getFastq(f5, basecall_group);
if (fq.empty()){
std::cerr << "No sequence data found. Signal QC may be generated using the 'f5s' option." << std::endl;
exit_code = 2;
// Check if it is a multi-read FAST5 file
std::string signal_group_str;
std::string read_name;
std::string basecall_group = "/Analyses/Basecall_1D_000";
bool single_read = false;
if (f5.nameExists("/Raw")) {
single_read = true;
std::cout << "Single read FAST5" << std::endl;

// Get the basecall group
basecall_group = "/Analyses/Basecall_1D_000";

// Get the sequence data
std::vector<std::string> fq = getFastq(f5, basecall_group);

// Append the basecalls to the output structure
output_data.addReadFastq(fq, read_details_fp);

} else {
// Access the read name
std::string header_str = fq[0];
std::istringstream iss_header( header_str );
std::string read_name_str;
std::getline( iss_header, read_name_str, ' ' );
read_name = read_name_str.c_str();

// Access the sequence data
std::string sequence_data_str = fq[1];

// Update the total number of bases
int base_count = sequence_data_str.length();
long_read_info.total_num_bases += base_count;

// Store the read length
long_read_info.read_lengths.push_back(base_count);

// Access base quality data
char value;
std::vector<int> base_quality_values;
std::string base_quality_str = fq[3];
std::istringstream iss( base_quality_str );
while (iss >> value) {
int base_quality_value = value - '!'; // '!' symbol represent 0-quality score
base_quality_values.push_back(base_quality_value);
}
std::cout << "Multi-read FAST5" << std::endl;

// Update the base quality and GC content information
int gc_count = 0;
double read_mean_base_qual = 0;
char current_base;
uint64_t base_quality_value;
for (int i = 0; i < base_count; i++)
{
current_base = sequence_data_str[i];
if (current_base == 'A' || current_base == 'a')
{
long_read_info.total_a_cnt += 1;
}
else if (current_base == 'G' || current_base == 'g')
{
long_read_info.total_g_cnt += 1;
gc_count += 1;
}
else if (current_base == 'C' || current_base == 'c')
{
long_read_info.total_c_cnt += 1;
gc_count += 1;
}
else if (current_base == 'T' || current_base == 't' || current_base == 'U' || current_base == 'u')
{
long_read_info.total_tu_cnt += 1;
}
// Get the base quality
base_quality_value = (uint64_t)base_quality_values[i];
seq_quality_info.base_quality_distribution[base_quality_value] += 1;
read_mean_base_qual += (double)base_quality_value;
}
// Loop through each read
std::cout << "Reading all reads" << std::endl;
H5::Group root_group = f5.openGroup("/");
size_t num_objs = root_group.getNumObjs();
for (size_t i=0; i < num_objs; i++) {
read_name = root_group.getObjnameByIdx(i);

// Calculate percent guanine & cytosine
gc_content_pct = 100.0 *( (double)gc_count / (double)base_count );
// First remove the prefix
std::string read_id = read_name.substr(5);
std::cout << "Processing read ID: " << read_id << std::endl;
//std::cout << "Read: " << read_name << std::endl;

// Look into this section
long_read_info.read_gc_content_count[(int)(gc_content_pct + 0.5)] += 1;
read_mean_base_qual /= (double) base_count;
seq_quality_info.read_average_base_quality_distribution[(uint)(read_mean_base_qual + 0.5)] += 1;
fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name, base_count, gc_content_pct, read_mean_base_qual);
// Get the basecall group
basecall_group = "/" + read_name + "/Analyses/Basecall_1D_000";

// Update the total number of reads (1 per FAST5 file)
output_data.long_read_info.total_num_reads += 1;
// Get the sequence data
std::vector<std::string> fq = getFastq(f5, basecall_group);

// Append the basecalls to the output structure
output_data.addReadFastq(fq, read_details_fp);
}
}

} catch (FileIException &error) {
// If the dataset is missing, continue and ignore this file
if (error.getFuncName() == "H5File::openDataSet") {
Expand Down
79 changes: 79 additions & 0 deletions src/output_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include<algorithm> // std::foreach
#include <math.h> // sqrt
#include <iostream>
#include <sstream>

#include "output_data.h"
#include "basic_statistics.h"
Expand Down Expand Up @@ -465,6 +466,84 @@ void Output_FAST5::addReadBaseSignals(Base_Signals values){
this->base_count += base_count; // Update base count
}

// Add read fastq data
void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_fp)
{
const char * read_name;
double gc_content_pct;

// Access the read name
std::string header_str = fq[0];
std::istringstream iss_header( header_str );
std::string read_name_str;
std::getline( iss_header, read_name_str, ' ' );
read_name = read_name_str.c_str();

// Access the sequence data
std::string sequence_data_str = fq[1];

// Update the total number of bases
int base_count = sequence_data_str.length();
long_read_info.total_num_bases += base_count;

// Store the read length
long_read_info.read_lengths.push_back(base_count);

// Access base quality data
char value;
std::vector<int> base_quality_values;
std::string base_quality_str = fq[3];
std::istringstream iss( base_quality_str );
while (iss >> value) {
int base_quality_value = value - '!'; // '!' symbol represent 0-quality score
base_quality_values.push_back(base_quality_value);
}

// Update the base quality and GC content information
int gc_count = 0;
double read_mean_base_qual = 0;
char current_base;
uint64_t base_quality_value;
for (int i = 0; i < base_count; i++)
{
current_base = sequence_data_str[i];
if (current_base == 'A' || current_base == 'a')
{
long_read_info.total_a_cnt += 1;
}
else if (current_base == 'G' || current_base == 'g')
{
long_read_info.total_g_cnt += 1;
gc_count += 1;
}
else if (current_base == 'C' || current_base == 'c')
{
long_read_info.total_c_cnt += 1;
gc_count += 1;
}
else if (current_base == 'T' || current_base == 't' || current_base == 'U' || current_base == 'u')
{
long_read_info.total_tu_cnt += 1;
}
// Get the base quality
base_quality_value = (uint64_t)base_quality_values[i];
seq_quality_info.base_quality_distribution[base_quality_value] += 1;
read_mean_base_qual += (double)base_quality_value;
}

// Calculate percent guanine & cytosine
gc_content_pct = 100.0 *( (double)gc_count / (double)base_count );

// Look into this section
long_read_info.read_gc_content_count[(int)(gc_content_pct + 0.5)] += 1;
read_mean_base_qual /= (double) base_count;
seq_quality_info.read_average_base_quality_distribution[(uint)(read_mean_base_qual + 0.5)] += 1;
fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name, base_count, gc_content_pct, read_mean_base_qual);

// Update the total number of reads
long_read_info.total_num_reads += 1;
}

// Get the read count
int Output_FAST5::getReadCount(){
return this->read_count;
Expand Down
Loading

0 comments on commit 1de24bd

Please sign in to comment.