Skip to content

Commit

Permalink
Make de novo assembly pipeline standalone
Browse files Browse the repository at this point in the history
Also factor out Python script in assembly pipe.
  • Loading branch information
veghp committed Aug 23, 2023
1 parent 725c9bb commit 89d14e0
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 72 deletions.
44 changes: 44 additions & 0 deletions bin/trim_assembly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python
# Copyright 2021 Edinburgh Genome Foundry, University of Edinburgh
#
# This file is part of Sequeduct.
#
# Sequeduct is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
#
# Sequeduct is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with Sequeduct. If not, see <https:www.gnu.org/licenses/>.

import sys

assembly_dir = sys.argv[1] # skip first filename
params_assembly_prefix = sys.argv[2]
params_canu_postfix = sys.argv[3]
trimmed_denovo = sys.argv[4]
barcode = sys.argv[5]

from Bio import SeqIO

canu_fasta = assembly_dir + '/' + params_assembly_prefix + params_canu_postfix
try:
contig = SeqIO.read(canu_fasta, format="fasta")
except:
print("The FASTA file contains more than 1 contigs. First contig used.")
contig = next(SeqIO.parse(canu_fasta, format="fasta"))

entries = contig.description.split(" ")
desc_dict = {"name": entries[0]} # first is the name
for entry in entries[1:]: # addressed the first one above
elements = entry.split("=")
desc_dict[elements[0]] = elements[1]

# canu assembly: 0-based, from-index inclusive, end-index exclusive
if desc_dict["suggestCircular"] == "yes": # as output by canu
start, end = desc_dict["trim"].split("-") # must contain 2 values
start = int(start)
end = int(end)
SeqIO.write(contig[start:end], trimmed_denovo, format="fasta")
else: # keep intact
SeqIO.write(contig, trimmed_denovo, format="fasta")

print("Trimmed:", barcode)
17 changes: 12 additions & 5 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ include { preview_workflow } from "$projectDir/nextflow/sequeduct_preview.nf"
include { analysis_workflow } from "$projectDir/nextflow/sequeduct_analysis.nf"
include { review_consensus } from "$projectDir/nextflow/sequeduct_review.nf"
include { review_denovo } from "$projectDir/nextflow/sequeduct_review.nf"
include { assemble_denovo } from "$projectDir/nextflow/sequeduct_review.nf"
include { assemble_denovo } from "$projectDir/nextflow/sequeduct_assembly.nf"

///////////////////////////////////////////////////////////////////////////////

workflow preview {

Expand All @@ -37,6 +38,8 @@ workflow preview {
preview_workflow(input_ch)
}

///////////////////////////////////////////////////////////////////////////////

workflow analysis {

Channel
Expand Down Expand Up @@ -64,6 +67,8 @@ workflow analysis {
analysis_workflow(entries_ch, genbank_ch)
}

///////////////////////////////////////////////////////////////////////////////

workflow review {

Channel
Expand Down Expand Up @@ -105,17 +110,19 @@ workflow review {
review_denovo(entries_de_novo_ch)
}

///////////////////////////////////////////////////////////////////////////////

workflow assembly {
Channel
.fromPath(params.results_csv)
.fromPath(params.assembly_sheet)
.splitCsv(header: true)

.map { row ->
def barcode = row['Barcode'] // a barcode is present only once -- no pooling
def barcode = row['Barcode_dir'] // a barcode is present only once -- no pooling
def length = row['Length'] // estimated length in kbp for canu genomeSize parameter
def fastq_path = file("${params.fastq_filtered_dir}/${barcode}.fastq")
return [barcode, fastq_path, length]
def barcode_path = file("${params.fastq_dir}/${barcode}")
def fastq_files = barcode_path.listFiles()
return [barcode, barcode_path, fastq_files, length]
}
.set { entries_assembly_ch }

Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
params {

sample_sheet = '' // CSV of the sample ~ barcode relations. Columns: Sample,Barcode_dir (may have other)
assembly_sheet = '' // CSV of the barcode ~ expected seq length relations. Columns: Barcode_dir,Length (in kbp. May have other columns)
fastq_dir = 'fastq' // The directory that contains the barcode directories of FASTQ files
barcode_prefix = 'barcode' // The prefix for the individual barcode directories as output by the sequencer

Expand Down
75 changes: 75 additions & 0 deletions nextflow/sequeduct_assembly.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env nextflow
// Copyright 2021 Edinburgh Genome Foundry, University of Edinburgh
//
// This file is part of Sequeduct.
//
// Sequeduct is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
//
// Sequeduct is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with Sequeduct. If not, see <https:www.gnu.org/licenses/>.

nextflow.enable.dsl=2

///////////////////////////////////////////////////////////////////////////////
// De novo assembly from reads

process runNanoFilt {
publishDir 'results/dir4_assembly/n1_fastq_filtered', mode: 'copy', pattern: '*.fastq' // need only the fastq

input:
tuple val(barcode), path(barcode_path), val(fastq_files), val(length)

output:
tuple val(barcode), path(barcode_path), path(fastq_file), val(length)

script:
fastq_file = barcode + '.fastq' // need for output

fastqFileString = fastq_files.join(' ') // need as one string for cat

"""
cat $fastqFileString | NanoFilt -l $params.min_length -q $params.quality_cutoff > $fastq_file
"""
}


process assembleOnly {
publishDir 'results/dir4_assembly/n2_de_novo_assembly', mode: 'copy'

input:
tuple val(barcode), path(barcode_path), path(fastq_file), val(length)
output:
tuple val(barcode), path(assembly_dir)
script:
assembly_dir = barcode + '_assembly'
genomsize_param = 'genomeSize=' + length + 'k'
"""
canu -p $params.assembly_prefix -d $assembly_dir $genomsize_param -nanopore $fastq_file
"""
}


process trimAssemblyOnly {
publishDir 'results/dir4_assembly/n3_assembly_trimmed', mode: 'copy', pattern: '*_denovo.fasta'

input:
tuple val(barcode), path(assembly_dir)
output:
tuple val(barcode), path(assembly_dir), path(trimmed_denovo)

script:
trimmed_denovo = barcode + '_denovo.fasta'
"""
trim_assembly.py "$assembly_dir" "$params.assembly_prefix" "$params.canu_postfix" "$trimmed_denovo" "$barcode"
"""
}


workflow assemble_denovo {
take: entries_assembly_ch
main:
runNanoFilt(entries_assembly_ch)
assembleOnly(runNanoFilt.out)
trimAssemblyOnly(assembleOnly.out)
}
68 changes: 1 addition & 67 deletions nextflow/sequeduct_review.nf
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,10 @@ workflow review_consensus {
writeCSV(alignParts.out.alignment_ch)
runReview(writeCSV.out.paf_file_ch.collect(), writeCSV.out.consensus_path_ch.collect(), writeCSV.out.samplesheet_csv_ch.collectFile())
}

///////////////////////////////////////////////////////////////////////////////
// De novo assembly sequence review


process convertGenbank_de_novo {
input:
tuple val(entry), val(barcode), val(sample), val(result), file(genbank_path), file(fastq_path)
Expand Down Expand Up @@ -220,64 +220,6 @@ process runReview_de_novo {
"""
}

// For assembly workflow only:

process assembleOnly {
publishDir 'results/dir4_assembly/n1_de_novo_assembly', mode: 'copy'

input:
tuple val(barcode), path(fastq_path), val(length)
output:
tuple val(barcode), path(assembly_dir)
script:
assembly_dir = barcode + '_assembly'
genomsize_param = 'genomeSize=' + length + 'k'
"""
canu -p $params.assembly_prefix -d $assembly_dir $genomsize_param -nanopore $fastq_path
"""
}


process trimAssemblyOnly {
publishDir 'results/dir4_assembly/n2_assembly_trimmed', mode: 'copy', pattern: '*_denovo.fasta'

input:
tuple val(barcode), path(assembly_dir)
output:
tuple val(barcode), path(assembly_dir), path(trimmed_denovo)

script:
trimmed_denovo = barcode + '_denovo.fasta'
"""
#!/usr/bin/env python
from Bio import SeqIO
canu_fasta = "$assembly_dir" + '/' + "$params.assembly_prefix" + "$params.canu_postfix"
try:
contig = SeqIO.read(canu_fasta, format="fasta")
except:
print("The FASTA file contains more than 1 contigs. First contig used.")
contig = next(SeqIO.parse(canu_fasta, format="fasta"))
entries = contig.description.split(" ")
desc_dict = {"name": entries[0]} # first is the name
for entry in entries[1:]: # addressed the first one above
elements = entry.split("=")
desc_dict[elements[0]] = elements[1]
# canu assembly: 0-based, from-index inclusive, end-index exclusive
if desc_dict["suggestCircular"] == "yes": # as output by canu
start, end = desc_dict["trim"].split("-") # must contain 2 values
start = int(start)
end = int(end)
SeqIO.write(contig[start:end], "$trimmed_denovo", format="fasta")
else: # keep intact
SeqIO.write(contig, "$trimmed_denovo", format="fasta")
print("Trimmed:", "$barcode")
"""
}


// Workflows:

Expand All @@ -298,11 +240,3 @@ workflow review_denovo {
writeCSV_de_novo(alignParts_de_novo.out)
runReview_de_novo(writeCSV_de_novo.out.paf_file_de_novo_ch.collect(), writeCSV_de_novo.out.genbank_path_ch.collect(), writeCSV_de_novo.out.trimmed_de_novo_fa_ch.collect(), writeCSV_de_novo.out.samplesheet_csv_de_novo_ch.collectFile())
}


workflow assemble_denovo {
take: entries_assembly_ch
main:
assembleOnly(entries_assembly_ch)
trimAssemblyOnly(assembleOnly.out)
}

0 comments on commit 89d14e0

Please sign in to comment.