-
Notifications
You must be signed in to change notification settings - Fork 1
/
x_matrix.py
215 lines (193 loc) · 9.41 KB
/
x_matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import os
import sys
import pysam
from subprocess import *
from multiprocessing import Pool
def unwrap_collect_barcode_for_regions(arg, **kwarg):
return XMatrix.collect_barcode_for_block(*arg, **kwarg)
def unwrap_cnt_barcode_for_chrm(arg, **kwarg):
return XMatrix.cnt_barcode_for_chrm(*arg, **kwarg)
####To generate the matrix, we cut the genomes to blocks, and each block is cut to bins (10K by default)
####Frist, for each block, get all the barcode of each bin. Each block will be saved as one file,
#### and each bin saved as one line;
####Second, compare each two bins, and output in bed format: chrm1, start1, end1, chrm2, start2, end2, cnt
class XMatrix():
##comparison is between bins, by default each bin is ~10K
def __init__(self, sf_bam, n_bins_block, bin_size, i_barcode_cutoff, sf_working_folder, i_min_shared_cnt):
self.sf_bam = sf_bam
self.bin_size = bin_size
self.block_size = n_bins_block * bin_size
self.n_bins_block = n_bins_block # # of bins per block
self.sf_working_folder = sf_working_folder
if self.sf_working_folder[-1] != "/":
self.sf_working_folder += "/"
self.barcode_cutoff = i_barcode_cutoff
self.i_min_shared_cnt = i_min_shared_cnt
def break_chrm_to_blocks(self, chrm, chrm_length):
# break a whole chrom to blocks
# then, break block to bins
n_blocks = chrm_length / self.block_size
if chrm_length % self.block_size != 0:
n_blocks += 1
l_blocks = []
i_start = 0
i_end = 0
while i_end < chrm_length:
i_start = i_end
i_end += self.block_size
if i_end > chrm_length:
i_end = chrm_length
l_blocks.append((chrm, i_start, i_end))
return l_blocks
def cnt_shared_barcodes(self, sf_chrm_size, n_jobs, sf_out):
samfile = pysam.AlignmentFile(self.sf_bam, "rb")
references = samfile.references
ref_lenths = samfile.lengths
for chrm, chrm_length in zip(references, ref_lenths):
l_chrm_blocks = self.break_chrm_to_blocks(chrm, chrm_length)
# for block in l_chrm_blocks:# for each block generate a file that contain the barcords of each bin (per line)
pool = Pool(n_jobs)
pool.map(unwrap_collect_barcode_for_regions, zip([self] * len(l_chrm_blocks), l_chrm_blocks), 1)
pool.close()
pool.join()
samfile.close()
m_chrms = {} ####the order of chrms should follow the order in sf_chrm_size file, not by ascii code
iorder = 0
with open(sf_chrm_size) as fin_chrm_size:
for line in fin_chrm_size:
fields = line.split()
m_chrms[fields[0]] = iorder
iorder += 1
for chrm1, chrm_length1 in zip(references, ref_lenths):
if chrm1 not in m_chrms:
continue
l_chrm1_blocks = self.break_chrm_to_blocks(chrm1, chrm_length1)
l_chrm1_with_all = []
for chrm2, chrm_length2 in zip(references, ref_lenths):
if chrm2 not in m_chrms:
continue
if m_chrms[chrm2] < m_chrms[chrm1]:
continue
l_chrm2_blocks = self.break_chrm_to_blocks(chrm2, chrm_length2)
l_chrm1_with_all.append((l_chrm1_blocks, l_chrm2_blocks, chrm1, chrm2))
pool = Pool(n_jobs)
pool.map(unwrap_cnt_barcode_for_chrm, zip([self] * len(l_chrm1_with_all), l_chrm1_with_all), 1)
pool.close()
pool.join()
# merge all the chr1_vs_chr2 files to one single file
with open(sf_out, "w") as fout:
for chrm1 in references:
if chrm1 not in m_chrms:
continue
for chrm2 in references:
if chrm2 not in m_chrms:
continue
if m_chrms[chrm2] < m_chrms[chrm1]: ###here order should follow the sf_chrm_size file, not ascii
continue
sf_chrm_vs_chrm_out = self.sf_working_folder + "{0}_{1}_shared_barcode_count.txt".format(chrm1,
chrm2)
if os.path.isfile(sf_chrm_vs_chrm_out) == False:
print "File {0} doesn't exist!".format(sf_chrm_vs_chrm_out)
with open(sf_chrm_vs_chrm_out) as fin_info:
if chrm1 != chrm2:
for line in fin_info:
fout.write(line)
else:##if intra-chrom case, then make sure start1 <= start2
for line in fin_info:
fields = line.split()
start1 = int(fields[1])
start2 = int(fields[4])
if start1 > start2:
continue
fout.write(line)
def cnt_barcode_for_chrm(self, record):
l_chrm1_blocks = record[0]
l_chrm2_blocks = record[1]
gchrm1 = record[2]
gchrm2 = record[3]
sf_chrm_vs_chrm_out = self.sf_working_folder + "{0}_{1}_shared_barcode_count.txt".format(gchrm1, gchrm2)
for chr1_block in l_chrm1_blocks:
chr1 = chr1_block[0]
chr1_start = chr1_block[1]
chr1_end = chr1_block[2]
sf_region1 = self.sf_working_folder + "{0}_{1}_{2}_block_barcodes.txt".format(chr1, chr1_start, chr1_end)
if os.path.isfile(sf_region1) == False:
print "File {0} doesn't exist!!!!".format(sf_region1)
continue
for chr2_block in l_chrm2_blocks:
chr2 = chr2_block[0]
chr2_start = chr2_block[1]
chr2_end = chr2_block[2]
sf_region2 = self.sf_working_folder + "{0}_{1}_{2}_block_barcodes.txt".format(chr2, chr2_start,
chr2_end)
if os.path.isfile(sf_region2) == False:
print "File {0} doesn't exist!!!!".format(sf_region2)
continue
self.cnt_shared_barcode_two_blocks(sf_region1, sf_region2, sf_chrm_vs_chrm_out)
def reload_bins_from_file(self, sf_block):
l_block_barcode = []
with open(sf_block) as fin_block:
for bin1 in fin_block:
fields = bin1.split()
chrm1 = fields[0]
istart1 = fields[1]
iend1 = fields[2]
barcode_set1 = set()
for barcode in fields[3:]:
barcode_set1.add(barcode)
l_block_barcode.append((chrm1, istart1, iend1, barcode_set1))
return l_block_barcode
def cnt_shared_barcode_two_blocks(self, sf_block1, sf_block2, sf_out):
with open(sf_out, "a") as fout_cnt:
l_block1_barcode = self.reload_bins_from_file(sf_block1)
l_block2_barcode = self.reload_bins_from_file(sf_block2)
for record1 in l_block1_barcode:
for record2 in l_block2_barcode:
bin1 = record1[3]
bin2 = record2[3]
cnt_shared = len(bin1 & bin2)
if cnt_shared <= self.i_min_shared_cnt:
continue
chrm1 = record1[0]
istart1 = record1[1]
iend1 = record1[2]
chrm2 = record2[0]
istart2 = record2[1]
iend2 = record2[2]
s_info = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(chrm1, istart1, iend1, chrm2, istart2, iend2,
cnt_shared)
fout_cnt.write(s_info)
# each line represents a bin
##collect all the barcodes of a region
def collect_barcode_for_block(self, record):
chrm = record[0]
istart = record[1] # block-start
iend = record[2] # block-end
l_barcode_sets = []
for i in range(self.n_bins_block):
set_barcode = set()
l_barcode_sets.append(set_barcode)
samfile = pysam.AlignmentFile(self.sf_bam, "rb")
for alignmt in samfile.fetch(chrm, istart, iend):
if alignmt.has_tag("BX") == False:
continue
pos = alignmt.pos
offset = pos - istart
i_bin = offset / self.bin_size
s_barcode = alignmt.get_tag("BX")
l_barcode_sets[i_bin].add(s_barcode)
samfile.close()
sf_region_out = self.sf_working_folder + "{0}_{1}_{2}_block_barcodes.txt".format(chrm, istart, iend)
with open(sf_region_out, "w") as fout_block_barcodes:
itemp_start = istart
itemp_end = istart
for barcode_set in l_barcode_sets:
itemp_start = itemp_end
itemp_end += self.bin_size
if len(barcode_set) > self.barcode_cutoff:
continue
s_region = "{0}\t{1}\t{2}\t".format(chrm, itemp_start, itemp_end)
fout_block_barcodes.write(s_region)
for barcode in barcode_set:
fout_block_barcodes.write(barcode + "\t")
fout_block_barcodes.write("\n")