Add new method pyvcf.VcfFrame.filter_gsa

sbslee · Aug 7, 2022 · d042722 · d042722
1 parent ef3039a
commit d042722
Show file tree

Hide file tree

Showing 2 changed files with 82 additions and 0 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -5,6 +5,7 @@ Changelog
 -----------------------
 
 * Update ``pyvcf`` submodule to accept "sites-only" VCF.
+* Add new method :meth:`pyvcf.VcfFrame.filter_gsa`.
 
 0.35.0 (2022-07-12)
 -------------------

diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py
@@ -4368,6 +4368,87 @@ def f(r):
  return self.__class__(self.copy_meta(), self.copy_df())
  return self.__class__(self.copy_meta(), self.df[i])
 
+ def filter_gsa(self, opposite=False, as_index=False):
+ """
+ Filter rows specific to Illumina's GSA array.
+
+ This function will remove variants that are specific to Illumina's
+ Infinium Global Screening Array (GSA). More specifically, variants
+ are removed if either REF or ALT is exactly one of the characters
+ {'I', 'D', 'N', '.'}.
+
+ Parameters
+ ----------
+ opposite : bool, default: False
+ If True, return the GSA-specific rows instead of removing them.
+ as_index : bool, default: False
+ If True, return boolean index array instead of VcfFrame.
+
+ Returns
+ -------
+ VcfFrame or pandas.Series
+ Filtered VcfFrame or boolean index array.
+
+ Examples
+ --------
+ Assume we have the following data:
+
+ >>> from fuc import pyvcf
+ >>> data = {
+ ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'],
+ ... 'POS': [100, 101, 102, 103],
+ ... 'ID': ['.', '.', '.', '.'],
+ ... 'REF': ['D', 'N', 'A', 'C'],
+ ... 'ALT': ['I', '.', '.', 'A'],
+ ... 'QUAL': ['.', '.', '.', '.'],
+ ... 'FILTER': ['.', '.', '.', '.'],
+ ... 'INFO': ['.', '.', '.', '.'],
+ ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'],
+ ... 'Steven': ['0/1', '0/0', './.', '0/1'],
+ ... }
+ >>> vf = pyvcf.VcfFrame.from_dict([], data)
+ >>> vf.df
+ CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven
+ 0 chr1 100 . D I . . . GT 0/1
+ 1 chr1 101 . N . . . . GT 0/0
+ 2 chr1 102 . A . . . . GT ./.
+ 3 chr1 103 . C A . . . GT 0/1
+
+ We can remove rows that are GSA-specific:
+
+ >>> vf.filter_gsa().df
+ CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven
+ 0 chr1 103 . C A . . . GT 0/1
+
+ We can also select those rows:
+
+ >>> vf.filter_gsa(opposite=True).df
+ CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven
+ 0 chr1 100 . D I . . . GT 0/1
+ 1 chr1 101 . N . . . . GT 0/0
+ 2 chr1 102 . A . . . . GT ./.
+
+ Finally, we can return boolean index array from the filtering:
+
+ >>> vf.filter_gsa(as_index=True)
+ 0 False
+ 1 False
+ 2 False
+ 3 True
+ dtype: bool
+ """
+ def one_row(r):
+ alleles = ['I', 'D', '.', 'N']
+ return r.REF in alleles or r.ALT in alleles
+ i = ~self.df.apply(one_row, axis=1)
+ if opposite:
+ i = ~i
+ if as_index:
+ return i
+ if i.empty:
+ return self.__class__(self.copy_meta(), self.copy_df())
+ return self.__class__(self.copy_meta(), self.df[i])
+
  def filter_indel(self, opposite=False, as_index=False):
  """
  Filter rows with indel.