Merge pull request #66 from sbslee/0.36.0-dev

0.36.0 dev
sbslee · Aug 11, 2022 · 9f389ec · 9f389ec
2 parents 192fa56 + b1da2a1
commit 9f389ec
Show file tree

Hide file tree

Showing 8 changed files with 213 additions and 8 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,16 @@
 Changelog
 *********
 
+0.36.0 (2022-08-12)
+-------------------
+
+* ``fuc`` now has a citation! Please refer to the publication “`ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation <https://doi.org/10.1371/journal.pone.0272129>`__” by Lee et al., 2022 (Steven is the first author). Fore more details, see the Citation section in README.
+* Update ``pyvcf`` submodule to accept "sites-only" VCF.
+* Add new method :meth:`pyvcf.VcfFrame.filter_gsa`.
+* Add new method :meth:`pyvcf.VcfFrame.duplicated`.
+* Add new optional argument ``to_csv`` to :meth:`pymaf.MafFrame.plot_regplot_tmb` method.
+* Add new optional argument ``count`` to :meth:`pymaf.MafFrame.plot_mutated_matched` method.
+
 0.35.0 (2022-07-12)
 -------------------
 
@@ -33,7 +43,7 @@ Changelog
 0.32.0 (2022-04-02)
 -------------------
 
-* Add new optional argument ``filter_off`` for :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or tpm matrix.
+* Add new optional argument ``filter_off`` to :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or tpm matrix.
 * Add new optional argument ``--dir-path`` to :command:`vcf-call` command for storing intermediate files.
 * Add new optional argument ``--gap_frac`` to :command:`vcf-call` command so that users can control indel calling sensitivity.
 * Add new optional argument ``--group-samples`` to :command:`vcf-call` command so that users can group samples into populations and apply the HWE assumption within but not across the populations.

diff --git a/README.rst b/README.rst
@@ -57,6 +57,14 @@ Your contributions (e.g. feature ideas, pull requests) are most welcome.
 | Email: [email protected]
 | License: MIT License
 
+Citation
+========
+
+If you use fuc in a published analysis, please report the program version
+and cite the following article:
+
+Lee et al., 2022. `ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation <https://doi.org/10.1371/journal.pone.0272129>`__. PLOS ONE.
+
 Installation
 ============
 

diff --git a/data/vcf/3.vcf b/data/vcf/3.vcf
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.2 
+#CHROM POS ID REF ALT QUAL FILTER INFO
+chr1 100 . A "T,C" . . .
+chr1 101 . G T . . .
+chr2 1055 . T G . . .
+chr2 3345 . A C . . .
+chr2 5594 . T G . . .
diff --git a/docs/create.py b/docs/create.py
@@ -85,6 +85,14 @@
 | Email: [email protected]
 | License: MIT License
 
+Citation
+========
+
+If you use fuc in a published analysis, please report the program version
+and cite the following article:
+
+Lee et al., 2022. `ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation <https://doi.org/10.1371/journal.pone.0272129>`__. PLOS ONE.
+
 Installation
 ============
 

diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py
@@ -1400,7 +1400,7 @@ def plot_regplot_gene(
 
  def plot_regplot_tmb(
  self, af, subject_col, group_col, a, b, ax=None, figsize=None,
- **kwargs
+ to_csv=None, **kwargs
  ):
  """
  Create a scatter plot with a linear regression model fit visualizing
@@ -1419,6 +1419,8 @@ def plot_regplot_tmb(
  AnnFrame column containing sample group information.
  a, b : str
  Sample group names.
+ to_csv : str, optional
+ Write the plot's data to a CSV file.
  ax : matplotlib.axes.Axes, optional
  Pre-existing axes for the plot. Otherwise, crete a new one.
  figsize : tuple, optional
@@ -1483,6 +1485,10 @@ def one_row(r):
  print(f'R^2 = {results.rsquared:.2f}')
  print(f' P = {results.f_pvalue:.2e}')
 
+ # Write the DataFrame to a CSV file.
+ if to_csv is not None:
+ df.to_csv(to_csv)
+
  return ax
 
  def plot_interactions(
@@ -1805,8 +1811,8 @@ def plot_mutated(
  return ax
 
  def plot_mutated_matched(
- self, af, patient_col, group_col, group_order, ax=None, figsize=None,
- **kwargs
+ self, af, patient_col, group_col, group_order, count=10, ax=None,
+ figsize=None, **kwargs
  ):
  """
  Create a bar plot visualizing the mutation prevalence of top
@@ -1822,6 +1828,8 @@ def plot_mutated_matched(
  AnnFrame column containing sample group information.
  group_order : list
  List of sample group names.
+ count : int, defualt: 10
+ Number of top mutated genes to display.
  ax : matplotlib.axes.Axes, optional
  Pre-existing axes for the plot. Otherwise, crete a new one.
  figsize : tuple, optional
@@ -1835,7 +1843,7 @@ def plot_mutated_matched(
  matplotlib.axes.Axes
  The matplotlib axes containing the plot.
  """
- df = self.matrix_waterfall_matched(af, patient_col, group_col, group_order)
+ df = self.matrix_waterfall_matched(af, patient_col, group_col, group_order, count=count)
  df = df.applymap(lambda x: 0 if x == 'None' else 1)
  s = df.sum(axis=1) / len(df.columns) * 100
  s.name = 'Count'

diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py
@@ -86,7 +86,8 @@
 do not contain the FORMAT column or sample-specific information. These are
 called "sites-only" VCF files, and normally represent genetic variation that
 has been observed in a large population. Generally, information about the
-population of origin should be included in the header.
+population of origin should be included in the header. Note that the pyvcf
+submodule supports these sites-only VCF files as well.
 
 There are several reserved keywords in the INFO and FORMAT columns that are
 standards across the community. Popular keywords are listed below:
@@ -1577,6 +1578,8 @@ class VcfFrame:
  """
  Class for storing VCF data.
 
+ Sites-only VCF files are supported.
+
  Parameters
  ----------
  meta : list
@@ -1624,7 +1627,16 @@ class VcfFrame:
 
  def _check_df(self, df):
  df = df.reset_index(drop=True)
- df = df.astype(HEADERS)
+ headers = HEADERS.copy()
+ # Handle "sites-only" VCF.
+ if 'FORMAT' not in df.columns:
+ del headers['FORMAT']
+ if set(df.columns) != set(headers):
+ raise ValueError("The input appears to be a sites-only VCF "
+ "because it's missing the FORMAT column; "
+ "however, it contains one or more incorrect "
+ f"columns: {df.columns.to_list()}.")
+ df = df.astype(headers)
  return df
 
  def __init__(self, meta, df):
@@ -4356,6 +4368,87 @@ def f(r):
  return self.__class__(self.copy_meta(), self.copy_df())
  return self.__class__(self.copy_meta(), self.df[i])
 
+ def filter_gsa(self, opposite=False, as_index=False):
+ """
+ Filter rows specific to Illumina's GSA array.
+
+ This function will remove variants that are specific to Illimina's
+ Infinium Global Screening (GSA) array. More specifically, variants
+ are removed if they contain one of the characters {'I', 'D', 'N',
+ ','} as either REF or ALT.
+
+ Parameters
+ ----------
+ opposite : bool, default: False
+ If True, return rows that don't meet the said criteria.
+ as_index : bool, default: False
+ If True, return boolean index array instead of VcfFrame.
+
+ Returns
+ -------
+ VcfFrame or pandas.Series
+ Filtered VcfFrame or boolean index array.
+
+ Examples
+ --------
+ Assume we have the following data:
+
+ >>> from fuc import pyvcf
+ >>> data = {
+ ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'],
+ ... 'POS': [100, 101, 102, 103],
+ ... 'ID': ['.', '.', '.', '.'],
+ ... 'REF': ['D', 'N', 'A', 'C'],
+ ... 'ALT': ['I', '.', '.', 'A'],
+ ... 'QUAL': ['.', '.', '.', '.'],
+ ... 'FILTER': ['.', '.', '.', '.'],
+ ... 'INFO': ['.', '.', '.', '.'],
+ ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'],
+ ... 'Steven': ['0/1', '0/0', './.', '0/1'],
+ ... }
+ >>> vf = pyvcf.VcfFrame.from_dict([], data)
+ >>> vf.df
+ CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven
+ 0 chr1 100 . D I . . . GT 0/1
+ 1 chr1 101 . N . . . . GT 0/0
+ 2 chr1 102 . A . . . . GT ./.
+ 3 chr1 103 . C A . . . GT 0/1
+
+ We can remove rows that are GSA-specific:
+
+ >>> vf.filter_gsa().df
+ CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven
+ 0 chr1 103 . C A . . . GT 0/1
+
+ We can also select those rows:
+
+ >>> vf.filter_gsa(opposite=True).df
+ CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven
+ 0 chr1 100 . D I . . . GT 0/1
+ 1 chr1 101 . N . . . . GT 0/0
+ 2 chr1 102 . A . . . . GT ./.
+
+ Finally, we can return boolean index array from the filtering:
+
+ >>> vf.filter_gsa(as_index=True)
+ 0 False
+ 1 False
+ 2 False
+ 3 True
+ dtype: bool
+ """
+ def one_row(r):
+ alleles = ['I', 'D', '.', 'N']
+ return r.REF in alleles or r.ALT in alleles
+ i = ~self.df.apply(one_row, axis=1)
+ if opposite:
+ i = ~i
+ if as_index:
+ return i
+ if i.empty:
+ return self.__class__(self.copy_meta(), self.copy_df())
+ return self.__class__(self.copy_meta(), self.df[i])
+
  def filter_indel(self, opposite=False, as_index=False):
  """
  Filter rows with indel.
@@ -5852,6 +5945,73 @@ def rename(self, names, indicies=None):
  vf.df.columns = columns
  return vf
 
+ def duplicated(self, subset=None, keep='first'):
+ """
+ Return boolean Series denoting duplicate rows in VcfFrame.
+
+ This method essentially wraps the :meth:`pandas.DataFrame.duplicated`
+ method.
+
+ Considering certain columns is optional.
+
+ Parameters
+ ----------
+ subset : column label or sequence of labels, optional
+ Only consider certain columns for identifying duplicates, by
+ default use all of the columns.
+ keep : {'first', 'last', False}, default 'first'
+ Determines which duplicates (if any) to keep.
+
+ - ``first`` : Mark duplicates as ``True`` except for the first
+ occurrence.
+ - ``last`` : Mark duplicates as ``True`` except for the last
+ occurrence.
+ - False : Mark all duplicates as ``True``.
+
+ Returns
+ -------
+ Series
+ Boolean series for each duplicated rows.
+
+ Examples
+ --------
+
+ >>> from fuc import pyvcf
+ >>> data = {
+ ... 'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'],
+ ... 'POS': [100, 100, 200, 200],
+ ... 'ID': ['.', '.', '.', '.'],
+ ... 'REF': ['A', 'A', 'C', 'C'],
+ ... 'ALT': ['C', 'T', 'G', 'G,A'],
+ ... 'QUAL': ['.', '.', '.', '.'],
+ ... 'FILTER': ['.', '.', '.', '.'],
+ ... 'INFO': ['.', '.', '.', '.'],
+ ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'],
+ ... 'A': ['0/1', './.', '0/1', './.'],
+ ... 'B': ['./.', '0/1', './.', '1/2'],
+ ... }
+ >>> vf = pyvcf.VcfFrame.from_dict([], data)
+ >>> vf.df
+ CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B
+ 0 chr1 100 . A C . . . GT 0/1 ./.
+ 1 chr1 100 . A T . . . GT ./. 0/1
+ 2 chr2 200 . C G . . . GT 0/1 ./.
+ 3 chr2 200 . C G,A . . . GT ./. 1/2
+ >>> vf.duplicated(['CHROM', 'POS', 'REF'])
+ 0 False
+ 1 True
+ 2 False
+ 3 True
+ dtype: bool
+ >>> vf.duplicated(['CHROM', 'POS', 'REF'], keep='last')
+ 0 True
+ 1 False
+ 2 True
+ 3 False
+ dtype: bool
+ """
+ return self.df.duplicated(subset=subset, keep=keep)
+
  def drop_duplicates(self, subset=None, keep='first'):
  """
  Return VcfFrame with duplicate rows removed.

diff --git a/fuc/version.py b/fuc/version.py
@@ -1 +1 @@
-__version__ = '0.35.0'
+__version__ = '0.36.0'
diff --git a/test.py b/test.py
@@ -51,6 +51,10 @@ def test_subset(self):
  vf = vf.subset(['Sarah', 'John'])
  self.assertEqual(len(vf.samples), 2)
 
+ def test_sites_only(self):
+ vf = pyvcf.VcfFrame.from_file(vcf_file3)
+ self.assertEqual(vf.shape, (5, 0))
+
 class TestPybed(unittest.TestCase):
 
  def test_intersect(self):