-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Make de novo assembly pipeline standalone
Also factor out Python script in assembly pipe.
- Loading branch information
Showing
5 changed files
with
133 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#!/usr/bin/env python | ||
# Copyright 2021 Edinburgh Genome Foundry, University of Edinburgh | ||
# | ||
# This file is part of Sequeduct. | ||
# | ||
# Sequeduct is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. | ||
# | ||
# Sequeduct is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License along with Sequeduct. If not, see <https://www.gnu.org/licenses/>. | ||
|
||
"""Trim a canu de novo assembly contig to the coordinates canu suggests.

Command-line arguments (in order):
    1. assembly directory produced by canu
    2. assembly prefix (the value given to ``canu -p``)
    3. postfix of the canu contig FASTA file
    4. path of the trimmed FASTA file to write
    5. barcode, used only in the final log line

The first contig of the canu FASTA is read. If its header carries
``suggestCircular=yes``, the contig is cut to the ``trim=start-end``
interval; otherwise it is written out unchanged.
"""

import os
import sys

from Bio import SeqIO


def parse_description(description):
    """Return the fields of a canu FASTA header as a dict.

    The first space-separated token is stored under ``"name"``; every
    following ``key=value`` token is stored under its key. Tokens without
    an ``=`` are ignored instead of raising ``IndexError``.
    """
    tokens = description.split(" ")
    fields = {"name": tokens[0]}  # first is the name
    for token in tokens[1:]:
        key, separator, value = token.partition("=")
        if separator:
            fields[key] = value
    return fields


def load_first_contig(fasta_path):
    """Return the only record of *fasta_path*, or the first one of several.

    Exits with an error message if the file holds no record at all.
    Errors unrelated to the record count (e.g. a missing file) propagate.
    """
    try:
        return SeqIO.read(fasta_path, format="fasta")
    except ValueError:
        # SeqIO.read raises ValueError unless there is exactly one record,
        # so distinguish "none" from "several" before falling back.
        contig = next(SeqIO.parse(fasta_path, format="fasta"), None)
        if contig is None:
            sys.exit("No contig found in FASTA file: {}".format(fasta_path))
        print("The FASTA file contains more than 1 contigs. First contig used.")
        return contig


def main(argv):
    """Run the trimming step with *argv* laid out like ``sys.argv``."""
    if len(argv) < 6:
        sys.exit(
            "Usage: trim_assembly.py ASSEMBLY_DIR ASSEMBLY_PREFIX "
            "CANU_POSTFIX TRIMMED_FASTA BARCODE"
        )
    # argv[0] is the script name; extra trailing arguments are ignored
    assembly_dir, assembly_prefix, canu_postfix, trimmed_denovo, barcode = argv[1:6]

    canu_fasta = os.path.join(assembly_dir, assembly_prefix + canu_postfix)
    contig = load_first_contig(canu_fasta)
    fields = parse_description(contig.description)

    # canu assembly: 0-based, from-index inclusive, end-index exclusive,
    # which matches Python slice semantics.
    if fields.get("suggestCircular") == "yes":  # as output by canu
        start, end = (int(bound) for bound in fields["trim"].split("-"))
        SeqIO.write(contig[start:end], trimmed_denovo, format="fasta")
    else:  # keep intact
        SeqIO.write(contig, trimmed_denovo, format="fasta")

    print("Trimmed:", barcode)


if __name__ == "__main__":
    main(sys.argv)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
#!/usr/bin/env nextflow | ||
// Copyright 2021 Edinburgh Genome Foundry, University of Edinburgh | ||
// | ||
// This file is part of Sequeduct. | ||
// | ||
// Sequeduct is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. | ||
// | ||
// Sequeduct is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. | ||
// | ||
// You should have received a copy of the GNU General Public License along with Sequeduct. If not, see <https://www.gnu.org/licenses/>. | ||
|
||
nextflow.enable.dsl=2 | ||
|
||
/////////////////////////////////////////////////////////////////////////////// | ||
// De novo assembly from reads | ||
|
||
// Concatenate all FASTQ files of a barcode and filter the reads with NanoFilt
// using params.min_length and params.quality_cutoff.
process runNanoFilt {
    // only the filtered FASTQ is published
    publishDir 'results/dir4_assembly/n1_fastq_filtered', mode: 'copy', pattern: '*.fastq'

    input:
        tuple val(barcode), path(barcode_path), val(fastq_files), val(length)

    output:
        tuple val(barcode), path(barcode_path), path(fastq_file), val(length)

    script:
        // the output declaration above refers to this name
        fastq_file = "${barcode}.fastq"
        // cat needs the file list as a single space-separated string
        joined_fastq = fastq_files.join(' ')
        """
        cat $joined_fastq | NanoFilt -l $params.min_length -q $params.quality_cutoff > $fastq_file
        """
}
|
||
|
||
// Assemble the filtered reads of one barcode with canu; `length` is passed
// to canu as the genome size with a 'k' suffix.
process assembleOnly {
    publishDir 'results/dir4_assembly/n2_de_novo_assembly', mode: 'copy'

    input:
        tuple val(barcode), path(barcode_path), path(fastq_file), val(length)

    output:
        tuple val(barcode), path(assembly_dir)

    script:
        // canu writes everything into this directory (its -d option)
        assembly_dir = "${barcode}_assembly"
        """
        canu -p $params.assembly_prefix -d $assembly_dir genomeSize=${length}k -nanopore $fastq_file
        """
}
|
||
|
||
// Run trim_assembly.py on the canu output directory and publish the
// resulting <barcode>_denovo.fasta file.
process trimAssemblyOnly {
    publishDir 'results/dir4_assembly/n3_assembly_trimmed', mode: 'copy', pattern: '*_denovo.fasta'

    input:
        tuple val(barcode), path(assembly_dir)

    output:
        tuple val(barcode), path(assembly_dir), path(trimmed_denovo)

    script:
        // file name handed to the script as its output path
        trimmed_denovo = "${barcode}_denovo.fasta"
        """
        trim_assembly.py "$assembly_dir" "$params.assembly_prefix" "$params.canu_postfix" "$trimmed_denovo" "$barcode"
        """
}
|
||
|
||
// De novo assembly sub-workflow: filter reads, assemble them with canu,
// then trim the assembled contig.
workflow assemble_denovo {
    take:
        entries_assembly_ch

    main:
        // each process consumes the output channel of the previous one
        entries_assembly_ch | runNanoFilt | assembleOnly | trimAssemblyOnly
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters