BUG fix replacement counts when there are un-replaceable samples (owk…

…in#90) * fix: correect bug in value replacement when some samples are not replaceable * test: add pytest for unreplaceable samples
mikelove · Mar 2, 2023 · 5f671a8 · 5f671a8
1 parent 7e549a0
commit 5f671a8
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 6 deletions.
diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py
@@ -692,21 +692,22 @@ def _replace_outliers(self) -> None:
  ),
  index=self.counts_to_refit.var_names,
  )
+
  replacement_counts = (
  pd.DataFrame(
  trim_base_mean.values * self.obsm["size_factors"],
  index=self.counts_to_refit.var_names,
- columns=self.obs_names,
+ columns=self.counts_to_refit.obs_names,
  )
  .astype(int)
  .T
  )
 
  if sum(self.varm["replaced"] > 0):
  self.counts_to_refit.X[
- idx[self.obsm["replaceable"]][:, self.varm["replaced"]]
+ self.obsm["replaceable"][:, None] & idx[:, self.varm["replaced"]]
  ] = replacement_counts.values[
- idx[self.obsm["replaceable"]][:, self.varm["replaced"]]
+ self.obsm["replaceable"][:, None] & idx[:, self.varm["replaced"]]
  ]
 
  def _refit_without_outliers(
@@ -748,7 +749,7 @@ def _refit_without_outliers(
  )
 
  # Use the same size factors
- sub_dds.obsm["size_factors"] = self.obsm["size_factors"]
+ sub_dds.obsm["size_factors"] = self.counts_to_refit.obsm["size_factors"]
 
  # Estimate gene-wise dispersions.
  sub_dds.fit_genewise_dispersions()
@@ -757,7 +758,7 @@ def _refit_without_outliers(
  # Note: the trend curve is not refitted.
  sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"]
  sub_dds.varm["_normed_means"] = (
- self.counts_to_refit.X / self.obsm["size_factors"][:, None]
+ self.counts_to_refit.X / self.counts_to_refit.obsm["size_factors"][:, None]
  ).mean(0)
  sub_dds.varm["fitted_dispersions"] = dispersion_trend(
  sub_dds.varm["_normed_means"],
@@ -784,7 +785,7 @@ def _refit_without_outliers(
  self.varm["dispersions"][to_replace] = sub_dds.varm["dispersions"]
 
  replace_cooks = pd.DataFrame(self.layers["cooks"].copy())
- replace_cooks.loc[self.obsm["replaceable"], to_replace] = 0
+ replace_cooks.loc[self.obsm["replaceable"], to_replace] = 0.0
 
  self.layers["replace_cooks"] = replace_cooks
  # Take into account new all-zero genes

diff --git a/pyproject.toml b/pyproject.toml
@@ -12,4 +12,5 @@ profile = "black"
 filterwarnings = [
  "error",
  "ignore::statsmodels.tools.sm_exceptions.DomainWarning", # due to fitting the dipersion curve with the identity link
+ "ignore::FutureWarning" # Ignore AnnData FutureWarning about implicit conversion
 ]
diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py
@@ -240,3 +240,53 @@ def test_few_samples():
 
  # Check that no gene was refit, as there are not enough samples.
  assert dds.varm["replaced"].sum() == 0
+
+
+def test_few_samples_and_outlier():
+ """Test that PyDESeq2 runs bug-free with outliers and a cohort with less than 3
+ samples.
+ TODO: refine this test to check the correctness of the output?
+ """
+
+ counts_df = load_example_data(
+ modality="raw_counts",
+ dataset="synthetic",
+ debug=False,
+ )
+
+ clinical_df = load_example_data(
+ modality="clinical",
+ dataset="synthetic",
+ debug=False,
+ )
+
+ # Subsample two samples for each condition
+ samples_to_keep = [
+ "sample1",
+ "sample2",
+ "sample92",
+ "sample93",
+ "sample94",
+ "sample95",
+ "sample96",
+ "sample97",
+ "sample98",
+ "sample99",
+ "sample100",
+ ]
+
+ counts_df = counts_df.loc[samples_to_keep]
+ clinical_df = clinical_df.loc[samples_to_keep]
+
+ # Introduce outliers
+ counts_df.iloc[0, 0] = 1000
+ counts_df.iloc[-1, -1] = 1000
+
+ # Run analysis. Should not throw an error.
+ dds = DeseqDataSet(
+ counts_df, clinical_df, refit_cooks=True, design_factors="condition"
+ )
+ dds.deseq2()
+
+ res = DeseqStats(dds)
+ res.summary()