Skip to content

Commit

Permalink
Merge pull request #66 from sbslee/0.36.0-dev
Browse files Browse the repository at this point in the history
0.36.0 dev
  • Loading branch information
sbslee committed Aug 11, 2022
2 parents 192fa56 + b1da2a1 commit 9f389ec
Show file tree
Hide file tree
Showing 8 changed files with 213 additions and 8 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
Changelog
*********

0.36.0 (2022-08-12)
-------------------

* ``fuc`` now has a citation! Please refer to the publication “`ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation <https://doi.org/10.1371/journal.pone.0272129>`__” by Lee et al., 2022 (Steven is the first author). For more details, see the Citation section in README.
* Update ``pyvcf`` submodule to accept "sites-only" VCF.
* Add new method :meth:`pyvcf.VcfFrame.filter_gsa`.
* Add new method :meth:`pyvcf.VcfFrame.duplicated`.
* Add new optional argument ``to_csv`` to :meth:`pymaf.MafFrame.plot_regplot_tmb` method.
* Add new optional argument ``count`` to :meth:`pymaf.MafFrame.plot_mutated_matched` method.

0.35.0 (2022-07-12)
-------------------

Expand Down Expand Up @@ -33,7 +43,7 @@ Changelog
0.32.0 (2022-04-02)
-------------------

* Add new optional argument ``filter_off`` for :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or tpm matrix.
* Add new optional argument ``filter_off`` to :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or tpm matrix.
* Add new optional argument ``--dir-path`` to :command:`vcf-call` command for storing intermediate files.
* Add new optional argument ``--gap_frac`` to :command:`vcf-call` command so that users can control indel calling sensitivity.
* Add new optional argument ``--group-samples`` to :command:`vcf-call` command so that users can group samples into populations and apply the HWE assumption within but not across the populations.
Expand Down
8 changes: 8 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ Your contributions (e.g. feature ideas, pull requests) are most welcome.
| Email: [email protected]
| License: MIT License
Citation
========

If you use fuc in a published analysis, please report the program version
and cite the following article:

Lee et al., 2022. `ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation <https://doi.org/10.1371/journal.pone.0272129>`__. PLOS ONE.

Installation
============

Expand Down
7 changes: 7 additions & 0 deletions data/vcf/3.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
##fileformat=VCFv4.2
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 100 . A "T,C" . . .
chr1 101 . G T . . .
chr2 1055 . T G . . .
chr2 3345 . A C . . .
chr2 5594 . T G . . .
8 changes: 8 additions & 0 deletions docs/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,14 @@
| Email: [email protected]
| License: MIT License
Citation
========
If you use fuc in a published analysis, please report the program version
and cite the following article:
Lee et al., 2022. `ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation <https://doi.org/10.1371/journal.pone.0272129>`__. PLOS ONE.
Installation
============
Expand Down
16 changes: 12 additions & 4 deletions fuc/api/pymaf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1400,7 +1400,7 @@ def plot_regplot_gene(

def plot_regplot_tmb(
self, af, subject_col, group_col, a, b, ax=None, figsize=None,
**kwargs
to_csv=None, **kwargs
):
"""
Create a scatter plot with a linear regression model fit visualizing
Expand All @@ -1419,6 +1419,8 @@ def plot_regplot_tmb(
AnnFrame column containing sample group information.
a, b : str
Sample group names.
to_csv : str, optional
Write the plot's data to a CSV file.
ax : matplotlib.axes.Axes, optional
Pre-existing axes for the plot. Otherwise, create a new one.
figsize : tuple, optional
Expand Down Expand Up @@ -1483,6 +1485,10 @@ def one_row(r):
print(f'R^2 = {results.rsquared:.2f}')
print(f' P = {results.f_pvalue:.2e}')

# Write the DataFrame to a CSV file.
if to_csv is not None:
df.to_csv(to_csv)

return ax

def plot_interactions(
Expand Down Expand Up @@ -1805,8 +1811,8 @@ def plot_mutated(
return ax

def plot_mutated_matched(
self, af, patient_col, group_col, group_order, ax=None, figsize=None,
**kwargs
self, af, patient_col, group_col, group_order, count=10, ax=None,
figsize=None, **kwargs
):
"""
Create a bar plot visualizing the mutation prevalence of top
Expand All @@ -1822,6 +1828,8 @@ def plot_mutated_matched(
AnnFrame column containing sample group information.
group_order : list
List of sample group names.
count : int, default: 10
Number of top mutated genes to display.
ax : matplotlib.axes.Axes, optional
Pre-existing axes for the plot. Otherwise, create a new one.
figsize : tuple, optional
Expand All @@ -1835,7 +1843,7 @@ def plot_mutated_matched(
matplotlib.axes.Axes
The matplotlib axes containing the plot.
"""
df = self.matrix_waterfall_matched(af, patient_col, group_col, group_order)
df = self.matrix_waterfall_matched(af, patient_col, group_col, group_order, count=count)
df = df.applymap(lambda x: 0 if x == 'None' else 1)
s = df.sum(axis=1) / len(df.columns) * 100
s.name = 'Count'
Expand Down
164 changes: 162 additions & 2 deletions fuc/api/pyvcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@
do not contain the FORMAT column or sample-specific information. These are
called "sites-only" VCF files, and normally represent genetic variation that
has been observed in a large population. Generally, information about the
population of origin should be included in the header.
population of origin should be included in the header. Note that the pyvcf
submodule supports these sites-only VCF files as well.
There are several reserved keywords in the INFO and FORMAT columns that are
standards across the community. Popular keywords are listed below:
Expand Down Expand Up @@ -1577,6 +1578,8 @@ class VcfFrame:
"""
Class for storing VCF data.
Sites-only VCF files are supported.
Parameters
----------
meta : list
Expand Down Expand Up @@ -1624,7 +1627,16 @@ class VcfFrame:

def _check_df(self, df):
    """
    Return a copy of ``df`` with a fresh index and VCF column dtypes.

    The index is reset and the columns are cast according to the
    ``HEADERS`` mapping. When the FORMAT column is absent, the input is
    treated as a "sites-only" VCF: FORMAT is dropped from the mapping and
    the remaining columns must match it exactly.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing VCF data.

    Returns
    -------
    pandas.DataFrame
        Re-indexed DataFrame with its header columns cast.

    Raises
    ------
    ValueError
        If FORMAT is missing and the columns differ from the remaining
        header columns.
    """
    df = df.reset_index(drop=True)
    # Work on a copy so the shared HEADERS mapping is never mutated.
    headers = HEADERS.copy()
    # Handle "sites-only" VCF. The cast must happen only after FORMAT has
    # been dropped from the mapping; casting with a key that is not a
    # column of ``df`` makes ``DataFrame.astype`` raise KeyError.
    if 'FORMAT' not in df.columns:
        del headers['FORMAT']
        if set(df.columns) != set(headers):
            raise ValueError("The input appears to be a sites-only VCF "
                             "because it's missing the FORMAT column; "
                             "however, it contains one or more incorrect "
                             f"columns: {df.columns.to_list()}.")
    df = df.astype(headers)
    return df

def __init__(self, meta, df):
Expand Down Expand Up @@ -4356,6 +4368,87 @@ def f(r):
return self.__class__(self.copy_meta(), self.copy_df())
return self.__class__(self.copy_meta(), self.df[i])

def filter_gsa(self, opposite=False, as_index=False):
    """
    Filter rows specific to Illumina's GSA array.

    This function will remove variants that are specific to Illumina's
    Infinium Global Screening Array (GSA). More specifically, a variant
    is removed if its REF or ALT value is exactly one of the placeholder
    alleles {'I', 'D', 'N', '.'}.

    Parameters
    ----------
    opposite : bool, default: False
        If True, return rows that don't meet the said criteria.
    as_index : bool, default: False
        If True, return boolean index array instead of VcfFrame.

    Returns
    -------
    VcfFrame or pandas.Series
        Filtered VcfFrame or boolean index array.

    Examples
    --------
    Assume we have the following data:

    >>> from fuc import pyvcf
    >>> data = {
    ...     'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'],
    ...     'POS': [100, 101, 102, 103],
    ...     'ID': ['.', '.', '.', '.'],
    ...     'REF': ['D', 'N', 'A', 'C'],
    ...     'ALT': ['I', '.', '.', 'A'],
    ...     'QUAL': ['.', '.', '.', '.'],
    ...     'FILTER': ['.', '.', '.', '.'],
    ...     'INFO': ['.', '.', '.', '.'],
    ...     'FORMAT': ['GT', 'GT', 'GT', 'GT'],
    ...     'Steven': ['0/1', '0/0', './.', '0/1'],
    ... }
    >>> vf = pyvcf.VcfFrame.from_dict([], data)
    >>> vf.df
      CHROM  POS ID REF ALT QUAL FILTER INFO FORMAT Steven
    0  chr1  100  .   D   I    .      .    .     GT    0/1
    1  chr1  101  .   N   .    .      .    .     GT    0/0
    2  chr1  102  .   A   .    .      .    .     GT    ./.
    3  chr1  103  .   C   A    .      .    .     GT    0/1

    We can remove rows that are GSA-specific:

    >>> vf.filter_gsa().df
      CHROM  POS ID REF ALT QUAL FILTER INFO FORMAT Steven
    0  chr1  103  .   C   A    .      .    .     GT    0/1

    We can also select those rows:

    >>> vf.filter_gsa(opposite=True).df
      CHROM  POS ID REF ALT QUAL FILTER INFO FORMAT Steven
    0  chr1  100  .   D   I    .      .    .     GT    0/1
    1  chr1  101  .   N   .    .      .    .     GT    0/0
    2  chr1  102  .   A   .    .      .    .     GT    ./.

    Finally, we can return boolean index array from the filtering:

    >>> vf.filter_gsa(as_index=True)
    0    False
    1    False
    2    False
    3     True
    dtype: bool
    """
    # Matching is on the whole field value, so a multi-allelic ALT such
    # as 'A,I' is not treated as GSA-specific.
    alleles = ['I', 'D', '.', 'N']
    df = self.df
    # Vectorized membership test; unlike a row-wise apply, this yields a
    # boolean Series even when the frame has no rows.
    is_gsa = df['REF'].isin(alleles) | df['ALT'].isin(alleles)
    i = is_gsa if opposite else ~is_gsa
    if as_index:
        return i
    if i.empty:
        return self.__class__(self.copy_meta(), self.copy_df())
    return self.__class__(self.copy_meta(), self.df[i])

def filter_indel(self, opposite=False, as_index=False):
"""
Filter rows with indel.
Expand Down Expand Up @@ -5852,6 +5945,73 @@ def rename(self, names, indicies=None):
vf.df.columns = columns
return vf

def duplicated(self, subset=None, keep='first'):
    """
    Return boolean Series denoting duplicate rows in VcfFrame.

    This method is a thin wrapper around
    :meth:`pandas.DataFrame.duplicated`, applied to the underlying
    DataFrame. Restricting the comparison to certain columns is optional.

    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns.
    keep : {'first', 'last', False}, default 'first'
        Determines which duplicates (if any) to keep.

        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    Series
        Boolean Series flagging the duplicated rows.

    Examples
    --------

    >>> from fuc import pyvcf
    >>> data = {
    ...     'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'],
    ...     'POS': [100, 100, 200, 200],
    ...     'ID': ['.', '.', '.', '.'],
    ...     'REF': ['A', 'A', 'C', 'C'],
    ...     'ALT': ['C', 'T', 'G', 'G,A'],
    ...     'QUAL': ['.', '.', '.', '.'],
    ...     'FILTER': ['.', '.', '.', '.'],
    ...     'INFO': ['.', '.', '.', '.'],
    ...     'FORMAT': ['GT', 'GT', 'GT', 'GT'],
    ...     'A': ['0/1', './.', '0/1', './.'],
    ...     'B': ['./.', '0/1', './.', '1/2'],
    ... }
    >>> vf = pyvcf.VcfFrame.from_dict([], data)
    >>> vf.df
      CHROM  POS ID REF  ALT QUAL FILTER INFO FORMAT    A    B
    0  chr1  100  .   A    C    .      .    .     GT  0/1  ./.
    1  chr1  100  .   A    T    .      .    .     GT  ./.  0/1
    2  chr2  200  .   C    G    .      .    .     GT  0/1  ./.
    3  chr2  200  .   C  G,A    .      .    .     GT  ./.  1/2
    >>> vf.duplicated(['CHROM', 'POS', 'REF'])
    0    False
    1     True
    2    False
    3     True
    dtype: bool
    >>> vf.duplicated(['CHROM', 'POS', 'REF'], keep='last')
    0     True
    1    False
    2     True
    3    False
    dtype: bool
    """
    frame = self.df
    flags = frame.duplicated(subset=subset, keep=keep)
    return flags

def drop_duplicates(self, subset=None, keep='first'):
"""
Return VcfFrame with duplicate rows removed.
Expand Down
2 changes: 1 addition & 1 deletion fuc/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.35.0'
__version__ = '0.36.0'
4 changes: 4 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ def test_subset(self):
vf = vf.subset(['Sarah', 'John'])
self.assertEqual(len(vf.samples), 2)

def test_sites_only(self):
    # Loading a VCF without FORMAT/sample columns should succeed.
    # NOTE(review): the expected shape is presumably (variants, samples),
    # i.e. five records and no samples -- confirm against the fixture.
    frame = pyvcf.VcfFrame.from_file(vcf_file3)
    expected_shape = (5, 0)
    self.assertEqual(frame.shape, expected_shape)

class TestPybed(unittest.TestCase):

def test_intersect(self):
Expand Down

0 comments on commit 9f389ec

Please sign in to comment.