debugged unit tests

theislab · AnnaChristina · Nov 7, 2021 · Oct 12, 2021 · Oct 12, 2021 · Oct 13, 2021
commit 162ba4e6f254c0bd6a81a6e35c35591f23113667
diff --git a/ncem/api/train/__init__.py b/ncem/api/train/__init__.py
@@ -1,6 +1,6 @@
 """Initializes a train object in api."""
 from ncem.estimators import (Estimator, EstimatorCVAE, EstimatorCVAEncem,
- EstimatorED, EstimatorEDncem, EstimatorGraph,
+ EstimatorED, EstimatorEDncem, EstimatorEdNcemNeighborhood, EstimatorGraph,
  EstimatorInteractions, EstimatorLinear,
  EstimatorNoGraph)
 from ncem.models import BetaScheduler
diff --git a/ncem/data.py b/ncem/data.py
@@ -1,6 +1,7 @@
 import abc
 import warnings
 from collections import OrderedDict
+import os
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 
 import matplotlib.colors as colors
@@ -1726,6 +1727,10 @@ def size_factors(self):
  global_mean_per_node = self.celldata.X.sum(axis=1).mean(axis=0)
  return {i: global_mean_per_node / np.sum(adata.X, axis=1) for i, adata in self.img_celldata.items()}
 
+ @property
+ def var_names(self):
+ return self.celldata.var_names
+
 
 class DataLoaderZhang(DataLoader):
  """DataLoaderZhang class. Inherits all functions from DataLoader."""
@@ -1770,7 +1775,7 @@ def _register_celldata(self):
  "patient_col": "mouse",
  }
 
- celldata = read_h5ad(self.data_path + metadata["fn"]).copy()
+ celldata = read_h5ad(os.path.join(self.data_path, metadata["fn"])).copy()
  celldata.uns["metadata"] = metadata
  celldata.uns["img_keys"] = list(np.unique(celldata.obs[metadata["image_col"]]))
 
@@ -1875,7 +1880,7 @@ def _register_celldata(self):
  "patient_col": None,
  }
 
- celldata = read_h5ad(self.data_path + metadata["fn"])
+ celldata = read_h5ad(os.path.join(self.data_path, metadata["fn"]))
  celldata = celldata[celldata.obs[metadata["image_col"]] != "Dirt"].copy()
  celldata.uns["metadata"] = metadata
  img_keys = list(np.unique(celldata.obs[metadata["image_col"]]))
@@ -1973,7 +1978,7 @@ def _register_celldata(self):
  "cluster_col_preprocessed": "Cluster_preprocessed",
  "patient_col": "donor",
  }
- celldata_df = read_csv(self.data_path + metadata["fn"][0])
+ celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"][0]))
  celldata_df["point"] = [f"scMEP_point_{str(x)}" for x in celldata_df["point"]]
  celldata_df = celldata_df.fillna(0)
  # celldata_df = celldata_df.dropna(inplace=False).reset_index()
@@ -2097,7 +2102,7 @@ def _register_graph_features(self, label_selection):
  label_cols_toread = list(label_selection.intersection(set(list(label_cols.keys()))))
  usecols = label_cols_toread + [patient_col]
 
- tissue_meta_data = read_excel(self.data_path + "scMEP_sample_description.xlsx", usecols=usecols)
+ tissue_meta_data = read_excel(os.path.join(self.data_path, "scMEP_sample_description.xlsx"), usecols=usecols)
  # BUILD LABEL VECTORS FROM LABEL COLUMNS
  # The columns contain unprocessed numeric and categorical entries that are now processed to prediction-ready
  # numeric tensors. Here we first generate a dictionary of tensors for each label (label_tensors). We then
@@ -2212,8 +2217,8 @@ def _register_celldata(self):
  "cluster_col_preprocessed": "cell_class_preprocessed",
  "patient_col": None,
  }
- nuclei_df = read_excel(self.data_path + metadata["fn"][0])
- membranes_df = read_excel(self.data_path + metadata["fn"][1])
+ nuclei_df = read_excel(os.path.join(self.data_path, metadata["fn"][0]))
+ membranes_df = read_excel(os.path.join(self.data_path, metadata["fn"][1]))
 
  celldata_df = nuclei_df.join(membranes_df.set_index("ObjectNumber"), on="ObjectNumber")
 
@@ -2385,7 +2390,7 @@ def _register_celldata(self):
  "cluster_col_preprocessed": "ClusterName_preprocessed",
  "patient_col": "patients",
  }
- celldata_df = read_csv(self.data_path + metadata["fn"])
+ celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"]))
 
  feature_cols = [
  "CD44 - stroma:Cyc_2_ch_2",
@@ -2544,7 +2549,7 @@ def _register_graph_features(self, label_selection):
 
  usecols = label_cols_toread_csv + [patient_col]
  tissue_meta_data = read_csv(
- self.data_path + "CRC_TMAs_patient_annotations.csv",
+ os.path.join(self.data_path, "CRC_TMAs_patient_annotations.csv"),
  # sep='\t',
  usecols=usecols,
  )[usecols]
@@ -2737,7 +2742,7 @@ def _register_celldata(self):
  "patient_col": "embryo",
  }
 
- celldata = read_h5ad(self.data_path + metadata["fn"]).copy()
+ celldata = read_h5ad(os.path.join(self.data_path, metadata["fn"])).copy()
  celldata.uns["metadata"] = metadata
  celldata.uns["img_keys"] = list(np.unique(celldata.obs[metadata["image_col"]]))
 
@@ -2835,7 +2840,7 @@ def _register_celldata(self):
  "cluster_col": "CellTypeID_new",
  "cluster_col_preprocessed": "CellTypeID_new_preprocessed",
  }
- celldata_df = read_csv(self.data_path + metadata["fn"])
+ celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"]))
 
  feature_cols = [
  "Abcb4",
@@ -2994,10 +2999,10 @@ def _register_celldata(self):
 
  # add clean cluster column which removes regular expression from cluster_col
  celldata.obs[metadata["cluster_col_preprocessed"]] = list(
- pd.Series(list(celldata.obs[metadata["cluster_col"]]), dtype="category").map(self.cell_type_merge_dict)
+ pd.Series(list(celldata.obs[metadata["cluster_col"]]), dtype="str").map(self.cell_type_merge_dict)
  )
  celldata.obs[metadata["cluster_col_preprocessed"]] = celldata.obs[metadata["cluster_col_preprocessed"]].astype(
- "category"
+ "str"
  )
  # register node type names
  node_type_names = list(np.unique(celldata.obs[metadata["cluster_col_preprocessed"]]))
@@ -3076,7 +3081,7 @@ def _register_celldata(self):
  "cluster_col": "CellTypeID_new",
  "cluster_col_preprocessed": "CellTypeID_new_preprocessed",
  }
- celldata_df = read_csv(self.data_path + metadata["fn"])
+ celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"]))
 
  feature_cols = [
  "Abcb4",

diff --git a/ncem/estimators/base_estimator_neighbors.py b/ncem/estimators/base_estimator_neighbors.py
@@ -10,11 +10,15 @@ class EstimatorNeighborhood(Estimator):
  """EstimatorGraph class for spatial models of the neighborhood only (not full graph)."""
 
  n_features_in: int
- _n_neighbors_padded: int
+ _n_neighbors_padded: Union[int, None]
  h0_in: bool
  idx_target_features: np.ndarray
  idx_neighbor_features: np.ndarray
 
+ def __init__(self):
+ super(EstimatorNeighborhood, self).__init__()
+ self._n_neighbors_padded = None
+
  def set_input_features(self, h0_in=True, target_feature_names=None, neighbor_feature_names=None):
  """
  Need to run this before compiling model.
@@ -27,18 +31,19 @@ def set_input_features(self, h0_in=True, target_feature_names=None, neighbor_fea
  assert neighbor_feature_names is None
  self.n_features_in = self.n_features_0
  else:
- self.idx_target_features = None # TODO match names to feature names in h1 here, as np index array
- self.idx_neighbor_features = None # TODO match names to feature names in h1 here, as np index array
- assert len(self.idx_target_features.tolist()) == len(self.idx_neighbor_features.tolist())
+ features = self.data.var_names.tolist()
+ self.idx_target_features = np.array([features.index(x) for x in target_feature_names])
+ self.idx_neighbor_features = np.array([features.index(x) for x in neighbor_feature_names])
+ assert len(self.idx_target_features) == len(self.idx_neighbor_features)
  assert len(set(self.idx_target_features.tolist()).intersection(set(self.idx_neighbor_features.tolist()))) == 0
  self.n_features_in = len(self.idx_target_features)
 
  @property
  def n_neighbors_padded(self):
  if self._n_neighbors_padded is None:
- self._n_neighbors_padded = np.max(np.asarray([
+ self._n_neighbors_padded = int(np.max(np.asarray([
  np.max(np.asarray(np.sum(a, axis=1)).flatten()) for a in self.a.values()
- ]))
+ ])))
  return self._n_neighbors_padded
 
  def _get_output_signature(self, resampled: bool = False):
@@ -53,24 +58,25 @@ def _get_output_signature(self, resampled: bool = False):
  -------
  output_signature
  """
+ # target node features
  h_targets = tf.TensorSpec(
  shape=(self.n_eval_nodes_per_graph, self.n_features_in), dtype=tf.float32
- ) # target node features
+ )
+ # neighbor node features
  h_neighbors = tf.TensorSpec(
- shape=(self.n_neighbors_padded, self.n_features_in), dtype=tf.float32
- ) # neighbor node features
+ shape=(self.n_eval_nodes_per_graph, self.n_neighbors_padded, self.n_features_in), dtype=tf.float32
+ )
  sf = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, 1), dtype=tf.float32) # input node size factors
- node_covar = tf.TensorSpec(
- shape=(self.n_eval_nodes_per_graph, self.n_node_covariates), dtype=tf.float32
- ) # node-level covariates
- a = tf.TensorSpec(
- shape=(self.n_eval_nodes_per_graph, self.n_neighbors_padded), dtype=tf.float32
- ) # adjacency matrix
- domain = tf.TensorSpec(shape=(self.n_domains,), dtype=tf.int32) # domain
- reconstruction = tf.TensorSpec(
- shape=(self.n_eval_nodes_per_graph, self.n_features_1), dtype=tf.float32
- ) # node features to reconstruct
- kl_dummy = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph,), dtype=tf.float32) # dummy for kl loss
+ # node-level covariates
+ node_covar = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, self.n_node_covariates), dtype=tf.float32)
+ # adjacency matrix
+ a = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, self.n_neighbors_padded), dtype=tf.float32)
+ # domain
+ domain = tf.TensorSpec(shape=(self.n_domains,), dtype=tf.int32)
+ # node features to reconstruct
+ reconstruction = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, self.n_features_1), dtype=tf.float32)
+ # dummy for kl loss
+ kl_dummy = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph,), dtype=tf.float32)
 
  if self.vi_model:
  if resampled:
@@ -94,9 +100,10 @@ def _get_output_signature(self, resampled: bool = False):
  else:
  output_signature = ((h_targets, h_neighbors, sf, a, node_covar, domain),
  reconstruction)
+ print(output_signature)
  return output_signature
 
- def _get_dataset_base(
+ def _get_dataset(
  self,
  image_keys: List[str],
  nodes_idx: Dict[str, np.ndarray],
@@ -172,7 +179,7 @@ def generator():
  if self.h0_in:
  h_targets = self.h_0[key][idx_nodes[indices], :]
  else:
- h_targets = self.h_1[key][idx_nodes[indices], self.idx_target_features]
+ h_targets = self.h_1[key][idx_nodes[indices], :][:, self.idx_target_features]
  h_neighbors = []
  a_neighborhood = np.zeros((self.n_eval_nodes_per_graph, self.n_neighbors_padded), "float32")
  for i, j in enumerate(idx_nodes[indices]):
@@ -181,24 +188,21 @@ def generator():
  if self.h0_in:
  h_neighbors_j = self.h_0[key][idx_neighbors, :]
  else:
- h_neighbors_j = self.h_1[key][idx_neighbors, self.idx_neighbor_features]
+ h_neighbors_j = self.h_1[key][idx_neighbors, :][:, self.idx_neighbor_features]
  h_neighbors_j = np.expand_dims(h_neighbors_j, axis=0)
  # Pad neighborhoods:
  diff = self.n_neighbors_padded - h_neighbors_j.shape[1]
  zeros = np.zeros((1, diff, h_neighbors_j.shape[2]), dtype="float32")
  h_neighbors_j = np.concatenate([h_neighbors_j, zeros], axis=1)
  h_neighbors.append(h_neighbors_j)
  a_neighborhood[i, :len(idx_neighbors)] = a_j[idx_neighbors]
- h_neighbors = np.concatenate([h_neighbors], axis=0)
+ h_neighbors = np.concatenate(h_neighbors, axis=0)
  if self.log_transform:
  h_targets = np.log(h_targets + 1.0)
  h_neighbors = np.log(h_neighbors + 1.0)
 
- node_covar = self.node_covar[key][idx_nodes]
- node_covar = node_covar[indices, :]
-
- sf = np.expand_dims(self.size_factors[key][idx_nodes], axis=1)
- sf = sf[indices, :]
+ node_covar = self.node_covar[key][idx_nodes][indices, :]
+ sf = np.expand_dims(self.size_factors[key][idx_nodes][indices], axis=1)
 
  g = np.zeros((self.n_domains,), dtype="int32")
  g[self.domains[key]] = 1

diff --git a/ncem/estimators/estimator_ed_ncem.py b/ncem/estimators/estimator_ed_ncem.py
@@ -170,7 +170,7 @@ class EstimatorEdNcemNeighborhood(EstimatorNeighborhood):
  def __init__(
  self,
  cond_type: str,
- use_type_cond: bool,
+ use_type_cond: bool = True,
  log_transform: bool = False,
  ):
  """Initialize an EstimatorEdNcemNeighborhood object.
@@ -191,7 +191,7 @@ def __init__(
  """
  super(EstimatorEdNcemNeighborhood, self).__init__()
  self.model_type = "ed_ncem"
- if cond_type in ["gat", "lr_gat"]:
+ if cond_type in ["gat", "lr_gat", "max", "gcn"]:
  self.adj_type = "full"
  else:
  raise ValueError("cond_type %s not recognized" % cond_type)
@@ -209,6 +209,7 @@ def init_model(
  dropout_rate: float,
  l2_coef: float,
  l1_coef: float,
+ cond_type: str,
  n_eval_nodes_per_graph: int,
  use_domain: bool,
  scale_node_size: bool,
@@ -239,6 +240,7 @@ def init_model(
  use_type_cond=self.use_type_cond,
  scale_node_size=scale_node_size,
  output_layer=output_layer,
+ cond_type=cond_type,
  dec_intermediate_dim=dec_intermediate_dim,
  dec_n_hidden=dec_n_hidden,
  dec_dropout_rate=dec_dropout_rate,

diff --git a/ncem/models/layers/single_gnn_layers.py b/ncem/models/layers/single_gnn_layers.py
@@ -57,13 +57,11 @@ def call(self, inputs, **kwargs):
 
 class SingleLrGatLayer(tf.keras.layers.Layer):
 
- def __init__(self, lr_dim, out_dim, dropout_rate, l2_reg, **kwargs):
+ def __init__(self, lr_dim, dropout_rate, l2_reg, **kwargs):
  """Initialize SingleLrGatLayer.
 
  Parameters
  ----------
- out_dim
- Output dimension.
  dropout_rate
  Dropout rate.
  activation
@@ -75,7 +73,6 @@ def __init__(self, lr_dim, out_dim, dropout_rate, l2_reg, **kwargs):
  """
  super().__init__(**kwargs)
  self.lr_dim = lr_dim
- self.out_dim = out_dim
  self.dropout_rate = dropout_rate
  self.l2_reg = l2_reg
 
@@ -93,7 +90,7 @@ def __init__(self, lr_dim, out_dim, dropout_rate, l2_reg, **kwargs):
  initializer=tf.keras.initializers.glorot_uniform(),
  regularizer=tf.keras.regularizers.l2(self.l2_reg),
  )
- self.bias_r = self.add_weight(name="bias_r", shape=(1, 1, 1, self.lr_dim,))
+ self.bias_r = self.add_weight(name="bias_r", shape=(1, 1, self.lr_dim,))
 
  def call(self, inputs, **kwargs):
  targets_receptor = inputs[0] # (batch, target nodes, lr)

diff --git a/ncem/models/model_ed_single_ncem.py b/ncem/models/model_ed_single_ncem.py
@@ -54,14 +54,13 @@ def __init__(
  categ_condition_dim = input_shapes[4]
  domain_dim = input_shapes[5]
 
- # node features - reconstruction: Input Tensor - shape=(None, targets, F-out)
- input_x_reconstruct = tf.keras.Input(shape=(num_targets_dim, out_feature_dim), name="node_features_reconstruct")
- # node size - reconstruction: Input Tensor - shape=(None, targets, 1)
- input_node_size = tf.keras.Input(shape=(num_targets_dim, 1), name="node_size_reconstruct")
  # node features - node representation of targets: Input Tensor - shape=(None, targets, F-in)
- input_x_targets = tf.keras.Input(shape=(neighbors_dim, in_lr_feature_dim), name="node_features_targets")
+ input_x_targets = tf.keras.Input(shape=(num_targets_dim, in_lr_feature_dim), name="node_features_targets")
  # node features - node representation of neighbors: Input Tensor - shape=(None, targets, neighbors, F-in)
- input_x_neighbors = tf.keras.Input(shape=(neighbors_dim, in_lr_feature_dim), name="node_features_neighbors")
+ input_x_neighbors = tf.keras.Input(shape=(num_targets_dim, neighbors_dim, in_lr_feature_dim),
+ name="node_features_neighbors")
+ # node size - reconstruction: Input Tensor - shape=(None, targets, 1)
+ input_node_size = tf.keras.Input(shape=(num_targets_dim, 1), name="node_size_reconstruct")
  # adj_matrices - A: Input Tensor - shape=(None, targets, neighbors)
  input_a = tf.keras.Input(shape=(num_targets_dim, neighbors_dim), name="adjacency_matrix")
  # Categorical predictors: Input Tensor - shape=(None, targets, P)
@@ -84,7 +83,6 @@ def __init__(
  if cond_type == "lr_gat":
  x_encoder = SingleLrGatLayer(
  lr_dim=in_lr_feature_dim,
- latent_dim=latent_dim,
  dropout_rate=dropout_rate,
  l2_reg=l2_coef,
  name=f"lr_gat_layer",
@@ -100,7 +98,7 @@ def __init__(
  elif cond_type == "max":
  x_encoder = SingleMaxLayer(
  name=f"max_layer"
- )([input_x_neighbors,])
+ )([input_x_neighbors, ])
  elif cond_type == "gcn":
  x_encoder = SingleGcnLayer(
  name=f"max_layer"
@@ -128,9 +126,9 @@ def __init__(
 
  self.encoder = tf.keras.Model(
  inputs=[
- input_x_reconstruct,
  input_x_targets,
  input_x_neighbors,
+ input_node_size,
  input_a,
  input_categ_condition,
  input_g,
@@ -140,9 +138,9 @@ def __init__(
  )
  self.training_model = tf.keras.Model(
  inputs=[
- input_x_reconstruct,
  input_x_targets,
  input_x_neighbors,
+ input_node_size,
  input_a,
  input_categ_condition,
  input_g,