Skip to content

Commit

Permalink
Update pychip.py
Browse files Browse the repository at this point in the history
  • Loading branch information
sbslee committed Mar 31, 2023
1 parent 491f66c commit ed5b832
Showing 1 changed file with 54 additions and 2 deletions.
56 changes: 54 additions & 2 deletions fuc/api/pychip.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import re
import pandas as pd

from . import common

class AxiomFrame:
"""
Class for storing Axiom annotation data.
Expand Down Expand Up @@ -190,23 +192,73 @@ def from_file(cls, fn):

return cls(df)

def to_vep(self):
def to_vep(self, fasta):
"""
Convert InfiniumFrame to the Ensembl VEP format.
Returns
-------
pandas.DataFrame
Variants in Ensembl VEP format.
Parameters
----------
region : str
Region ('chrom:start-end').
"""
nucleotides = ['A', 'C', 'G', 'T']
n = 15
df = self.df[(self.df.Chr != 'XY') & (self.df.Chr != '0')]
df.MapInfo = df.MapInfo.astype(int)
df = df.head(1000)
def one_row(r):
if r.Chr == 'MT':
chrom = 'chrM'
else:
chrom = f'chr{r.Chr}'
pos = r.MapInfo
matches = re.findall(r'\[([^\]]+)\]', r.SourceSeq)
if not matches:
raise ValueError(f'Something went wrong: {r}')
a1, a2 = matches[0].split('/')
data = pd.Series([r.Chr, r.MapInfo, a1, a2])
left_seq = r.SourceSeq.split('[')[0]
right_seq = r.SourceSeq.split(']')[1]
locus = f'{chrom}:{pos}-{pos}'
if a1 in nucleotides and a2 in nucleotides: # SNV
ref = common.extract_sequence(fasta, locus)
if ref == a1:
pass
elif ref == a2:
a1, a2 = a2, a1
else:
raise ValueError(f'Reference allele not found: {locus}')
elif a1 == '-' or a2 == '-':
affected = a1 if a2 == '-' else a2
c = len(affected)
if c == 1:
left_query = common.extract_sequence(fasta, f'{chrom}:{pos-n}-{pos-1}')
right_query = common.extract_sequence(fasta, f'{chrom}:{pos+1}-{pos+n}')
print(locus, left_seq[-n:], left_query, left_seq[-n:] == left_query, 'DEL I')
print(locus, right_seq[:n], right_query, right_seq[:n] == right_query, 'DEL I')
print()
else:
left_query = common.extract_sequence(fasta, f'{chrom}:{pos-n}-{pos}') # -1
right_query = common.extract_sequence(fasta, f'{chrom}:{pos+c}-{pos+c+n-1}')
if left_seq[-n:] == left_query[:-1] and right_seq[:n] == right_query: # DEL I
print(locus, left_seq[-n:], left_query[:-1], left_seq[-n:] == left_query[:-1], 'DEL II')
print(locus, right_seq[:n], right_query, right_seq[:n] == right_query, 'DEL II')
print()
else:
#left_query = common.extract_sequence(fasta, f'{chrom}:{pos-n}-{pos}')
#left_query = left_query[len(affected):] + affected
print(locus, left_seq[-n:], left_query, left_seq[-n:] == left_query, 'Other')
print(locus, right_seq[:n], right_query, right_seq[:n] == right_query, 'Other')
print()
else:
raise ValueError(f'Unsupported format: {locus}')
data = pd.Series([r.Chr, r.MapInfo, r.MapInfo, f'{a1}/{a2}', '+'])
return data
df = df.apply(one_row, axis=1)
df = df.sort_values(by=[0, 1])
df = df.drop_duplicates()
return df

0 comments on commit ed5b832

Please sign in to comment.