diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e809a45..b444d25 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,16 @@ Changelog ********* +0.36.0 (2022-08-12) +------------------- + +* ``fuc`` now has a citation! Please refer to the publication “`ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation `__” by Lee et al., 2022 (Steven is the first author). Fore more details, see the Citation section in README. +* Update ``pyvcf`` submodule to accept "sites-only" VCF. +* Add new method :meth:`pyvcf.VcfFrame.filter_gsa`. +* Add new method :meth:`pyvcf.VcfFrame.duplicated`. +* Add new optional argument ``to_csv`` to :meth:`pymaf.MafFrame.plot_regplot_tmb` method. +* Add new optional argument ``count`` to :meth:`pymaf.MafFrame.plot_mutated_matched` method. + 0.35.0 (2022-07-12) ------------------- @@ -33,7 +43,7 @@ Changelog 0.32.0 (2022-04-02) ------------------- -* Add new optional argument ``filter_off`` for :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or tpm matrix. +* Add new optional argument ``filter_off`` to :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or tpm matrix. * Add new optional argument ``--dir-path`` to :command:`vcf-call` command for storing intermediate files. * Add new optional argument ``--gap_frac`` to :command:`vcf-call` command so that users can control indel calling sensitivity. * Add new optional argument ``--group-samples`` to :command:`vcf-call` command so that users can group samples into populations and apply the HWE assumption within but not across the populations. diff --git a/README.rst b/README.rst index d7f087e..812b0fa 100644 --- a/README.rst +++ b/README.rst @@ -57,6 +57,14 @@ Your contributions (e.g. feature ideas, pull requests) are most welcome. | Email: sbstevenlee@gmail.com | License: MIT License +Citation +======== + +If you use fuc in a published analysis, please report the program version +and cite the following article: + +Lee et al., 2022. `ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation `__. PLOS ONE. + Installation ============ diff --git a/data/vcf/3.vcf b/data/vcf/3.vcf new file mode 100644 index 0000000..711330f --- /dev/null +++ b/data/vcf/3.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 100 . A "T,C" . . . +chr1 101 . G T . . . +chr2 1055 . T G . . . +chr2 3345 . A C . . . +chr2 5594 . T G . . . \ No newline at end of file diff --git a/docs/create.py b/docs/create.py index 27d9cf9..a215862 100644 --- a/docs/create.py +++ b/docs/create.py @@ -85,6 +85,14 @@ | Email: sbstevenlee@gmail.com | License: MIT License +Citation +======== + +If you use fuc in a published analysis, please report the program version +and cite the following article: + +Lee et al., 2022. `ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation `__. PLOS ONE. + Installation ============ diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index cefbaa3..aaffa10 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -1400,7 +1400,7 @@ def plot_regplot_gene( def plot_regplot_tmb( self, af, subject_col, group_col, a, b, ax=None, figsize=None, - **kwargs + to_csv=None, **kwargs ): """ Create a scatter plot with a linear regression model fit visualizing @@ -1419,6 +1419,8 @@ def plot_regplot_tmb( AnnFrame column containing sample group information. a, b : str Sample group names. + to_csv : str, optional + Write the plot's data to a CSV file. ax : matplotlib.axes.Axes, optional Pre-existing axes for the plot. Otherwise, crete a new one. figsize : tuple, optional @@ -1483,6 +1485,10 @@ def one_row(r): print(f'R^2 = {results.rsquared:.2f}') print(f' P = {results.f_pvalue:.2e}') + # Write the DataFrame to a CSV file. + if to_csv is not None: + df.to_csv(to_csv) + return ax def plot_interactions( @@ -1805,8 +1811,8 @@ def plot_mutated( return ax def plot_mutated_matched( - self, af, patient_col, group_col, group_order, ax=None, figsize=None, - **kwargs + self, af, patient_col, group_col, group_order, count=10, ax=None, + figsize=None, **kwargs ): """ Create a bar plot visualizing the mutation prevalence of top @@ -1822,6 +1828,8 @@ def plot_mutated_matched( AnnFrame column containing sample group information. group_order : list List of sample group names. + count : int, defualt: 10 + Number of top mutated genes to display. ax : matplotlib.axes.Axes, optional Pre-existing axes for the plot. Otherwise, crete a new one. figsize : tuple, optional @@ -1835,7 +1843,7 @@ def plot_mutated_matched( matplotlib.axes.Axes The matplotlib axes containing the plot. """ - df = self.matrix_waterfall_matched(af, patient_col, group_col, group_order) + df = self.matrix_waterfall_matched(af, patient_col, group_col, group_order, count=count) df = df.applymap(lambda x: 0 if x == 'None' else 1) s = df.sum(axis=1) / len(df.columns) * 100 s.name = 'Count' diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 4efdb28..8211b5b 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -86,7 +86,8 @@ do not contain the FORMAT column or sample-specific information. These are called "sites-only" VCF files, and normally represent genetic variation that has been observed in a large population. Generally, information about the -population of origin should be included in the header. +population of origin should be included in the header. Note that the pyvcf +submodule supports these sites-only VCF files as well. There are several reserved keywords in the INFO and FORMAT columns that are standards across the community. Popular keywords are listed below: @@ -1577,6 +1578,8 @@ class VcfFrame: """ Class for storing VCF data. + Sites-only VCF files are supported. + Parameters ---------- meta : list @@ -1624,7 +1627,16 @@ class VcfFrame: def _check_df(self, df): df = df.reset_index(drop=True) - df = df.astype(HEADERS) + headers = HEADERS.copy() + # Handle "sites-only" VCF. + if 'FORMAT' not in df.columns: + del headers['FORMAT'] + if set(df.columns) != set(headers): + raise ValueError("The input appears to be a sites-only VCF " + "because it's missing the FORMAT column; " + "however, it contains one or more incorrect " + f"columns: {df.columns.to_list()}.") + df = df.astype(headers) return df def __init__(self, meta, df): @@ -4356,6 +4368,87 @@ def f(r): return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) + def filter_gsa(self, opposite=False, as_index=False): + """ + Filter rows specific to Illumina's GSA array. + + This function will remove variants that are specific to Illimina's + Infinium Global Screening (GSA) array. More specifically, variants + are removed if they contain one of the characters {'I', 'D', 'N', + ','} as either REF or ALT. + + Parameters + ---------- + opposite : bool, default: False + If True, return rows that don't meet the said criteria. + as_index : bool, default: False + If True, return boolean index array instead of VcfFrame. + + Returns + ------- + VcfFrame or pandas.Series + Filtered VcfFrame or boolean index array. + + Examples + -------- + Assume we have the following data: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['D', 'N', 'A', 'C'], + ... 'ALT': ['I', '.', '.', 'A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], + ... 'Steven': ['0/1', '0/0', './.', '0/1'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . D I . . . GT 0/1 + 1 chr1 101 . N . . . . GT 0/0 + 2 chr1 102 . A . . . . GT ./. + 3 chr1 103 . C A . . . GT 0/1 + + We can remove rows that are GSA-specific: + + >>> vf.filter_gsa().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 103 . C A . . . GT 0/1 + + We can also select those rows: + + >>> vf.filter_gsa(opposite=True).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . D I . . . GT 0/1 + 1 chr1 101 . N . . . . GT 0/0 + 2 chr1 102 . A . . . . GT ./. + + Finally, we can return boolean index array from the filtering: + + >>> vf.filter_gsa(as_index=True) + 0 False + 1 False + 2 False + 3 True + dtype: bool + """ + def one_row(r): + alleles = ['I', 'D', '.', 'N'] + return r.REF in alleles or r.ALT in alleles + i = ~self.df.apply(one_row, axis=1) + if opposite: + i = ~i + if as_index: + return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) + return self.__class__(self.copy_meta(), self.df[i]) + def filter_indel(self, opposite=False, as_index=False): """ Filter rows with indel. @@ -5852,6 +5945,73 @@ def rename(self, names, indicies=None): vf.df.columns = columns return vf + def duplicated(self, subset=None, keep='first'): + """ + Return boolean Series denoting duplicate rows in VcfFrame. + + This method essentially wraps the :meth:`pandas.DataFrame.duplicated` + method. + + Considering certain columns is optional. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to keep. + + - ``first`` : Mark duplicates as ``True`` except for the first + occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last + occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + Series + Boolean series for each duplicated rows. + + Examples + -------- + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'], + ... 'POS': [100, 100, 200, 200], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['A', 'A', 'C', 'C'], + ... 'ALT': ['C', 'T', 'G', 'G,A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], + ... 'A': ['0/1', './.', '0/1', './.'], + ... 'B': ['./.', '0/1', './.', '1/2'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . A C . . . GT 0/1 ./. + 1 chr1 100 . A T . . . GT ./. 0/1 + 2 chr2 200 . C G . . . GT 0/1 ./. + 3 chr2 200 . C G,A . . . GT ./. 1/2 + >>> vf.duplicated(['CHROM', 'POS', 'REF']) + 0 False + 1 True + 2 False + 3 True + dtype: bool + >>> vf.duplicated(['CHROM', 'POS', 'REF'], keep='last') + 0 True + 1 False + 2 True + 3 False + dtype: bool + """ + return self.df.duplicated(subset=subset, keep=keep) + def drop_duplicates(self, subset=None, keep='first'): """ Return VcfFrame with duplicate rows removed. diff --git a/fuc/version.py b/fuc/version.py index 2670d05..aae5aca 100644 --- a/fuc/version.py +++ b/fuc/version.py @@ -1 +1 @@ -__version__ = '0.35.0' +__version__ = '0.36.0' diff --git a/test.py b/test.py index f55be54..0054d78 100644 --- a/test.py +++ b/test.py @@ -51,6 +51,10 @@ def test_subset(self): vf = vf.subset(['Sarah', 'John']) self.assertEqual(len(vf.samples), 2) + def test_sites_only(self): + vf = pyvcf.VcfFrame.from_file(vcf_file3) + self.assertEqual(vf.shape, (5, 0)) + class TestPybed(unittest.TestCase): def test_intersect(self):