Skip to content

Commit

Permalink
BUG fix replacement counts when there are un-replaceable samples (owk…
Browse files Browse the repository at this point in the history
…in#90)

* fix: correect bug in value replacement when some samples are not replaceable

* test: add pytest for unreplaceable samples
  • Loading branch information
BorisMuzellec authored Mar 2, 2023
1 parent 7e549a0 commit 5f671a8
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 6 deletions.
13 changes: 7 additions & 6 deletions pydeseq2/dds.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,21 +692,22 @@ def _replace_outliers(self) -> None:
),
index=self.counts_to_refit.var_names,
)

replacement_counts = (
pd.DataFrame(
trim_base_mean.values * self.obsm["size_factors"],
index=self.counts_to_refit.var_names,
columns=self.obs_names,
columns=self.counts_to_refit.obs_names,
)
.astype(int)
.T
)

if sum(self.varm["replaced"] > 0):
self.counts_to_refit.X[
idx[self.obsm["replaceable"]][:, self.varm["replaced"]]
self.obsm["replaceable"][:, None] & idx[:, self.varm["replaced"]]
] = replacement_counts.values[
idx[self.obsm["replaceable"]][:, self.varm["replaced"]]
self.obsm["replaceable"][:, None] & idx[:, self.varm["replaced"]]
]

def _refit_without_outliers(
Expand Down Expand Up @@ -748,7 +749,7 @@ def _refit_without_outliers(
)

# Use the same size factors
sub_dds.obsm["size_factors"] = self.obsm["size_factors"]
sub_dds.obsm["size_factors"] = self.counts_to_refit.obsm["size_factors"]

# Estimate gene-wise dispersions.
sub_dds.fit_genewise_dispersions()
Expand All @@ -757,7 +758,7 @@ def _refit_without_outliers(
# Note: the trend curve is not refitted.
sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"]
sub_dds.varm["_normed_means"] = (
self.counts_to_refit.X / self.obsm["size_factors"][:, None]
self.counts_to_refit.X / self.counts_to_refit.obsm["size_factors"][:, None]
).mean(0)
sub_dds.varm["fitted_dispersions"] = dispersion_trend(
sub_dds.varm["_normed_means"],
Expand All @@ -784,7 +785,7 @@ def _refit_without_outliers(
self.varm["dispersions"][to_replace] = sub_dds.varm["dispersions"]

replace_cooks = pd.DataFrame(self.layers["cooks"].copy())
replace_cooks.loc[self.obsm["replaceable"], to_replace] = 0
replace_cooks.loc[self.obsm["replaceable"], to_replace] = 0.0

self.layers["replace_cooks"] = replace_cooks
# Take into account new all-zero genes
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ profile = "black"
filterwarnings = [
"error",
"ignore::statsmodels.tools.sm_exceptions.DomainWarning", # due to fitting the dipersion curve with the identity link
"ignore::FutureWarning" # Ignore AnnData FutureWarning about implicit conversion
]
50 changes: 50 additions & 0 deletions tests/test_edge_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,53 @@ def test_few_samples():

# Check that no gene was refit, as there are not enough samples.
assert dds.varm["replaced"].sum() == 0


def test_few_samples_and_outlier():
"""Test that PyDESeq2 runs bug-free with outliers and a cohort with less than 3
samples.
TODO: refine this test to check the correctness of the output?
"""

counts_df = load_example_data(
modality="raw_counts",
dataset="synthetic",
debug=False,
)

clinical_df = load_example_data(
modality="clinical",
dataset="synthetic",
debug=False,
)

# Subsample two samples for each condition
samples_to_keep = [
"sample1",
"sample2",
"sample92",
"sample93",
"sample94",
"sample95",
"sample96",
"sample97",
"sample98",
"sample99",
"sample100",
]

counts_df = counts_df.loc[samples_to_keep]
clinical_df = clinical_df.loc[samples_to_keep]

# Introduce outliers
counts_df.iloc[0, 0] = 1000
counts_df.iloc[-1, -1] = 1000

# Run analysis. Should not throw an error.
dds = DeseqDataSet(
counts_df, clinical_df, refit_cooks=True, design_factors="condition"
)
dds.deseq2()

res = DeseqStats(dds)
res.summary()

0 comments on commit 5f671a8

Please sign in to comment.