Skip to content

Commit

Permalink
new file: .gitignore
Browse files Browse the repository at this point in the history
	new file:   README.md
	new file:   meth_phaser_parallel
	new file:   meth_phaser_post_processing
	new file:   methphaser.yaml
	new file:   methphasing
  • Loading branch information
Fu-Yilei committed May 3, 2023
0 parents commit 72a6bc1
Show file tree
Hide file tree
Showing 6 changed files with 2,495 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.ipynb_checkpoints/
.vscode/
test_data/
processing_folder/
.nfs*
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Meth Phaser

Requirement:
1. pysam>=0.20.0
2. tqdm
3. Need to put the executable `methphasing` into bashrc for `meth_phaser_parallel` to call it. exact command: export "PATH=/path/to/methphaser/:$PATH"
4. pandas
5. scipy
6. matplotlib

Known Issue:
1. Does not phase reads before the first phased block and after the last phased block
2. Multiprocessing threads should be smaller than the total block number on the chromosome. e.g., MethPhaser cannot use 3 threads to phase 3 blocks, since the number of the gaps among 3 blocks is 2.
3. MethPhaser sometimes ignore some very short reads (about 0.1% reads). Probably because of pysam.


## Example on HLA-E, HLA-C region:
Step 1: Run MethPhaser to get block relationship (will take about 5 minutes)

./meth_phaser_parallel -b test_data/HLA.R10.haplotagged.bam -r test_data/GCA_000001405.15_GRCh38_no_alt_analysis_set.chr6.fna -g test_data/LSK.filtered.gtf -vc test_data/HLA.R10.phased.vcf.gz -o test_data/work -k -1 -ml -2

Step 2: Run post processing script to get modified reads and vcf file (will take about 2 minutes on an SSD)

./meth_phaser_post_processing -ib test_data/HLA.R10.haplotagged.bam -if test_data/work/ -ov test_data/output.vcf -ob test_data/output -vc test_data/HLA.R10.phased.vcf.gz
282 changes: 282 additions & 0 deletions meth_phaser_parallel
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
#!/usr/bin/env python

import math
import subprocess
import pandas as pd
from multiprocessing import Pool
import time
import os
import argparse
import sys


def parse_arg(argv):
"""
Function for pasing arguments
"""

parser = argparse.ArgumentParser(
description="methphaser: phase reads based on methlytion informaiton"
)
required_args = parser.add_argument_group("Required arguments")
# input set
required_args.add_argument(
"-b",
"--bam_file",
type=str,
help="input methylation annotated bam file",
required=True,
metavar="",
)
required_args.add_argument(
"-r",
"--reference",
type=str,
help="reference genome",
required=True,
metavar="",
)
required_args.add_argument(
"-g",
"--gtf",
type=str,
help="gtf file from whatshap visualization",
required=True,
metavar="",
)
required_args.add_argument(
"-vc",
"--vcf_called",
type=str,
help="called vcf file from HapCUT2",
required=True,
metavar="",
)
parser.add_argument(
"-vt",
"--vcf_truth",
type=str,
help="Truth vcf file for benchmarking",
default=None,
metavar="",
)
parser.add_argument(
"-t", "--threads", type=int, help="threads", default=1, metavar=""
)
parser.add_argument(
"-ml",
"--max_len",
type=int,
help="maximum homozygous region length for phasing, default: -1 (ignore the largest homozygous region, centromere), input -2 for not skipping anything",
default=-1,
metavar="",
)
parser.add_argument(
"-c",
"--cut_off",
type=float,
help="the minimum percentage of vote to determine a read's haplotype",
default=0.65,
metavar="",
)
parser.add_argument(
"-a",
"--assignment_min",
type=int,
help="minimum assigned read number for ranksum test",
default=2,
metavar="",
)
parser.add_argument(
"-o",
"--output_dir",
type=str,
help="output_directory",
default="work/",
metavar="",
)
parser.add_argument(
"-k",
"--k_iterations",
type=int,
help="use at most k iterations, use -1 for unlimited iterations.",
default="10",
metavar="",
)

if len(argv) == 0:
parser.print_help(sys.stderr)
sys.exit(1)
args = parser.parse_args(argv)

return args


# def rename_process():


def run_project_parallel(
param_list, start_pos, end_pos, chromosome, skipping_pair
): # modify this
bam_file = param_list.bam_file
ref_seq_file = param_list.reference
phased_block_file = param_list.gtf
vcf_called = param_list.vcf_called
vcf_truth = param_list.vcf_truth
output_dir = param_list.output_dir
max_iteration = param_list.k_iterations
minimum_confidence_vote = param_list.assignment_min
output_relationship_assignment_dir = os.path.join(output_dir, chromosome, f"{start_pos}_{end_pos}.csv")
output_chromosome_read_assignment_dir = os.path.join(output_dir, f"{chromosome}_read_assignment/")
if vcf_truth != None:
cmd_list = [
"methphasing",
"-b",
bam_file,
"-r",
ref_seq_file,
"-p",
phased_block_file,
"-vc",
vcf_called,
"-vt",
vcf_truth,
"-n",
f"{start_pos},{end_pos}",
"-m",
chromosome,
"-o",
output_relationship_assignment_dir,
"-k",
str(max_iteration),
"-s",
skipping_pair,
"-a",
str(minimum_confidence_vote),
"-ra",
output_chromosome_read_assignment_dir
# f"/data/yf20/meth_phasing/LSK/whole_genome_6/{chromosome}_read_assignment/",
]
else:
cmd_list = [
"methphasing",
"-b",
bam_file,
"-r",
ref_seq_file,
"-p",
phased_block_file,
"-vc",
vcf_called,
"-n",
f"{start_pos},{end_pos}",
"-m",
chromosome,
"-o",
output_relationship_assignment_dir,
"-k",
str(max_iteration),
"-s",
skipping_pair,
"-a",
str(minimum_confidence_vote),
"-ra",
output_chromosome_read_assignment_dir,
]
os.makedirs(
os.path.join(output_dir, chromosome), exist_ok=True
)
os.makedirs(
output_chromosome_read_assignment_dir,
exist_ok=True,
)
# print(cmd_list)
subprocess.run(cmd_list)

def main(argv):
args = parse_arg(argv)
threads = args.threads # 32
vcf_truth = args.vcf_truth
ref_seq_file = args.reference
bam_file = args.bam_file
phased_block_file = args.gtf
vcf_called = args.vcf_called
phasing_max_len = args.max_len
cut_off = args.cut_off
assignment_min = args.assignment_min
output_dir = args.output_dir
k_iterations = args.k_iterations
ref_seq_index = pd.read_csv(f"{ref_seq_file}.fai", sep="\t", header=None)
chromosomes = list(ref_seq_index[0])
phased_block_df = pd.read_csv(
phased_block_file,
header=None,
sep="\t",
names=[
"chr",
"phasing",
"ex/intron",
"start",
"end",
"1",
"strand",
"2",
"info",
],
)
phased_df_chr = phased_block_df.groupby(["chr"])
for chromosome in chromosomes:
chr_block_num = len(phased_block_df[phased_block_df["chr"] == chromosome])
if chr_block_num == 0:
print(f"no reads aligned to chromosome {chromosome}, skipping...")
continue
phasd_df_chr_list = list(
phased_df_chr.get_group(chromosome).iterrows()
) # skipping pair calculation
phased_block_distance_dict = {}
for index, i in enumerate(phasd_df_chr_list[:-1]):
phased_block_distance_dict.update(
{index: phasd_df_chr_list[index + 1][1].start - i[1].end}
)
phased_block_distance_list_sorted = [
(k, v)
for k, v in sorted(
phased_block_distance_dict.items(),
key=lambda item: item[1],
reverse=True,
)
]
skipping_pair_start_list = [-1]
if phasing_max_len == -1:
skipping_pair_start_list.append(phased_block_distance_list_sorted[0][0])
elif phasing_max_len != -2:
for i in phased_block_distance_list_sorted:
if i[1] >= phasing_max_len:
skipping_pair_start_list.append(i[0])
# else:
# skipping_pair_start_list = [-1]
skipping_pair_start_list = str(skipping_pair_start_list)

total_block_n = chr_block_num
param_list = []
interval_list = []
block_size = math.ceil(total_block_n / threads)
for i in range(threads):
start_pos = i * block_size
end_pos = (i + 1) * block_size + 1
if end_pos >= total_block_n:
end_pos = total_block_n
interval_list.append(
(args, start_pos, end_pos, chromosome, skipping_pair_start_list)
)
break
interval_list.append(
(args, start_pos, end_pos, chromosome, skipping_pair_start_list)
)
# print(interval_list)
with Pool(threads) as pool:
L = pool.starmap(run_project_parallel, interval_list)


if __name__ == "__main__":
main(sys.argv[1:])
Loading

0 comments on commit 72a6bc1

Please sign in to comment.