Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

skeleton for LR encoding ED model with extension of estimator #44

Merged
merged 22 commits into from
Nov 7, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
f99ff95
skeleton for LR encoding ED model with extension of estimator
davidsebfischer Oct 12, 2021
f84f8a7
enabled extraction of scaled adj matrix
davidsebfischer Oct 12, 2021
162ba4e
debugged unit tests
davidsebfischer Oct 13, 2021
37ec983
added disclaimer
davidsebfischer Oct 13, 2021
64ae03e
Merge branch 'feature/lr_encoder' of github.com:theislab/ncem into fe…
AnnaChristina Oct 14, 2021
3dc14a3
simplify output layers with get_out in remaining models
AnnaChristina Oct 14, 2021
427ffb5
simplify output layers with get_out in remaining models
AnnaChristina Oct 14, 2021
833ec07
add disclaimer to cond layers
AnnaChristina Oct 14, 2021
21f54c9
fix max and gcn layer for single gnn
AnnaChristina Oct 19, 2021
998c8cc
Bump version from 0.3.2 to 0.4.0
AnnaChristina Oct 25, 2021
01438aa
added node embedding and output weight saving in EDncem models
davidsebfischer Oct 28, 2021
7405baa
Merge pull request #53 from theislab/development
AnnaChristina Nov 3, 2021
e26132a
fix conflicts
AnnaChristina Nov 3, 2021
9209a78
Merge branch 'release' of github.com:theislab/ncem into release
AnnaChristina Nov 3, 2021
e61d30c
Merge pull request #54 from theislab/release
AnnaChristina Nov 3, 2021
0c2839e
Bump version from 0.4.6 to 0.4.7
AnnaChristina Nov 3, 2021
0e2711f
add n_top_genes to dataloader
AnnaChristina Nov 4, 2021
2f03c77
Bump version from 0.4.7 to 0.4.8
AnnaChristina Nov 4, 2021
be89c0b
add hgnc names for schuerch
AnnaChristina Nov 7, 2021
803063d
Merge branch 'feature/embedding_saving' of github.com:theislab/ncem i…
AnnaChristina Nov 7, 2021
8a4bcfa
add saving of LR names to model class
AnnaChristina Nov 7, 2021
bd2f900
Merge pull request #51 from theislab/feature/embedding_saving
AnnaChristina Nov 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
debugged unit tests
  • Loading branch information
davidsebfischer committed Oct 13, 2021
commit 162ba4e6f254c0bd6a81a6e35c35591f23113667
2 changes: 1 addition & 1 deletion ncem/api/train/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Initializes a train object in api."""
from ncem.estimators import (Estimator, EstimatorCVAE, EstimatorCVAEncem,
EstimatorED, EstimatorEDncem, EstimatorGraph,
EstimatorED, EstimatorEDncem, EstimatorEdNcemNeighborhood, EstimatorGraph,
EstimatorInteractions, EstimatorLinear,
EstimatorNoGraph)
from ncem.models import BetaScheduler
31 changes: 18 additions & 13 deletions ncem/data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import abc
import warnings
from collections import OrderedDict
import os
from typing import Dict, List, Optional, Sequence, Tuple, Union

import matplotlib.colors as colors
Expand Down Expand Up @@ -1726,6 +1727,10 @@ def size_factors(self):
global_mean_per_node = self.celldata.X.sum(axis=1).mean(axis=0)
return {i: global_mean_per_node / np.sum(adata.X, axis=1) for i, adata in self.img_celldata.items()}

@property
def var_names(self):
return self.celldata.var_names


class DataLoaderZhang(DataLoader):
"""DataLoaderZhang class. Inherits all functions from DataLoader."""
Expand Down Expand Up @@ -1770,7 +1775,7 @@ def _register_celldata(self):
"patient_col": "mouse",
}

celldata = read_h5ad(self.data_path + metadata["fn"]).copy()
celldata = read_h5ad(os.path.join(self.data_path, metadata["fn"])).copy()
celldata.uns["metadata"] = metadata
celldata.uns["img_keys"] = list(np.unique(celldata.obs[metadata["image_col"]]))

Expand Down Expand Up @@ -1875,7 +1880,7 @@ def _register_celldata(self):
"patient_col": None,
}

celldata = read_h5ad(self.data_path + metadata["fn"])
celldata = read_h5ad(os.path.join(self.data_path, metadata["fn"]))
celldata = celldata[celldata.obs[metadata["image_col"]] != "Dirt"].copy()
celldata.uns["metadata"] = metadata
img_keys = list(np.unique(celldata.obs[metadata["image_col"]]))
Expand Down Expand Up @@ -1973,7 +1978,7 @@ def _register_celldata(self):
"cluster_col_preprocessed": "Cluster_preprocessed",
"patient_col": "donor",
}
celldata_df = read_csv(self.data_path + metadata["fn"][0])
celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"][0]))
celldata_df["point"] = [f"scMEP_point_{str(x)}" for x in celldata_df["point"]]
celldata_df = celldata_df.fillna(0)
# celldata_df = celldata_df.dropna(inplace=False).reset_index()
Expand Down Expand Up @@ -2097,7 +2102,7 @@ def _register_graph_features(self, label_selection):
label_cols_toread = list(label_selection.intersection(set(list(label_cols.keys()))))
usecols = label_cols_toread + [patient_col]

tissue_meta_data = read_excel(self.data_path + "scMEP_sample_description.xlsx", usecols=usecols)
tissue_meta_data = read_excel(os.path.join(self.data_path, "scMEP_sample_description.xlsx"), usecols=usecols)
# BUILD LABEL VECTORS FROM LABEL COLUMNS
# The columns contain unprocessed numeric and categorical entries that are now processed to prediction-ready
# numeric tensors. Here we first generate a dictionary of tensors for each label (label_tensors). We then
Expand Down Expand Up @@ -2212,8 +2217,8 @@ def _register_celldata(self):
"cluster_col_preprocessed": "cell_class_preprocessed",
"patient_col": None,
}
nuclei_df = read_excel(self.data_path + metadata["fn"][0])
membranes_df = read_excel(self.data_path + metadata["fn"][1])
nuclei_df = read_excel(os.path.join(self.data_path, metadata["fn"][0]))
membranes_df = read_excel(os.path.join(self.data_path, metadata["fn"][1]))

celldata_df = nuclei_df.join(membranes_df.set_index("ObjectNumber"), on="ObjectNumber")

Expand Down Expand Up @@ -2385,7 +2390,7 @@ def _register_celldata(self):
"cluster_col_preprocessed": "ClusterName_preprocessed",
"patient_col": "patients",
}
celldata_df = read_csv(self.data_path + metadata["fn"])
celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"]))

feature_cols = [
"CD44 - stroma:Cyc_2_ch_2",
Expand Down Expand Up @@ -2544,7 +2549,7 @@ def _register_graph_features(self, label_selection):

usecols = label_cols_toread_csv + [patient_col]
tissue_meta_data = read_csv(
self.data_path + "CRC_TMAs_patient_annotations.csv",
os.path.join(self.data_path, "CRC_TMAs_patient_annotations.csv"),
# sep='\t',
usecols=usecols,
)[usecols]
Expand Down Expand Up @@ -2737,7 +2742,7 @@ def _register_celldata(self):
"patient_col": "embryo",
}

celldata = read_h5ad(self.data_path + metadata["fn"]).copy()
celldata = read_h5ad(os.path.join(self.data_path, metadata["fn"])).copy()
celldata.uns["metadata"] = metadata
celldata.uns["img_keys"] = list(np.unique(celldata.obs[metadata["image_col"]]))

Expand Down Expand Up @@ -2835,7 +2840,7 @@ def _register_celldata(self):
"cluster_col": "CellTypeID_new",
"cluster_col_preprocessed": "CellTypeID_new_preprocessed",
}
celldata_df = read_csv(self.data_path + metadata["fn"])
celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"]))

feature_cols = [
"Abcb4",
Expand Down Expand Up @@ -2994,10 +2999,10 @@ def _register_celldata(self):

# add clean cluster column which removes regular expression from cluster_col
celldata.obs[metadata["cluster_col_preprocessed"]] = list(
pd.Series(list(celldata.obs[metadata["cluster_col"]]), dtype="category").map(self.cell_type_merge_dict)
pd.Series(list(celldata.obs[metadata["cluster_col"]]), dtype="str").map(self.cell_type_merge_dict)
)
celldata.obs[metadata["cluster_col_preprocessed"]] = celldata.obs[metadata["cluster_col_preprocessed"]].astype(
"category"
"str"
)
# register node type names
node_type_names = list(np.unique(celldata.obs[metadata["cluster_col_preprocessed"]]))
Expand Down Expand Up @@ -3076,7 +3081,7 @@ def _register_celldata(self):
"cluster_col": "CellTypeID_new",
"cluster_col_preprocessed": "CellTypeID_new_preprocessed",
}
celldata_df = read_csv(self.data_path + metadata["fn"])
celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"]))

feature_cols = [
"Abcb4",
Expand Down
62 changes: 33 additions & 29 deletions ncem/estimators/base_estimator_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,15 @@ class EstimatorNeighborhood(Estimator):
"""EstimatorNeighborhood class for spatial models of the neighborhood only (not full graph)."""

n_features_in: int
_n_neighbors_padded: int
_n_neighbors_padded: Union[int, None]
h0_in: bool
idx_target_features: np.ndarray
idx_neighbor_features: np.ndarray

def __init__(self):
super(EstimatorNeighborhood, self).__init__()
self._n_neighbors_padded = None

def set_input_features(self, h0_in=True, target_feature_names=None, neighbor_feature_names=None):
"""
Need to run this before compiling model.
Expand All @@ -27,18 +31,19 @@ def set_input_features(self, h0_in=True, target_feature_names=None, neighbor_fea
assert neighbor_feature_names is None
self.n_features_in = self.n_features_0
else:
self.idx_target_features = None # TODO match names to feature names in h1 here, as np index array
self.idx_neighbor_features = None # TODO match names to feature names in h1 here, as np index array
assert len(self.idx_target_features.tolist()) == len(self.idx_neighbor_features.tolist())
features = self.data.var_names.tolist()
self.idx_target_features = np.array([features.index(x) for x in target_feature_names])
self.idx_neighbor_features = np.array([features.index(x) for x in neighbor_feature_names])
assert len(self.idx_target_features) == len(self.idx_neighbor_features)
assert len(set(self.idx_target_features.tolist()).intersection(set(self.idx_neighbor_features.tolist()))) == 0
self.n_features_in = len(self.idx_target_features)

@property
def n_neighbors_padded(self):
if self._n_neighbors_padded is None:
self._n_neighbors_padded = np.max(np.asarray([
self._n_neighbors_padded = int(np.max(np.asarray([
np.max(np.asarray(np.sum(a, axis=1)).flatten()) for a in self.a.values()
]))
])))
return self._n_neighbors_padded

def _get_output_signature(self, resampled: bool = False):
Expand All @@ -53,24 +58,25 @@ def _get_output_signature(self, resampled: bool = False):
-------
output_signature
"""
# target node features
h_targets = tf.TensorSpec(
shape=(self.n_eval_nodes_per_graph, self.n_features_in), dtype=tf.float32
) # target node features
)
# neighbor node features
h_neighbors = tf.TensorSpec(
shape=(self.n_neighbors_padded, self.n_features_in), dtype=tf.float32
) # neighbor node features
shape=(self.n_eval_nodes_per_graph, self.n_neighbors_padded, self.n_features_in), dtype=tf.float32
)
sf = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, 1), dtype=tf.float32) # input node size factors
node_covar = tf.TensorSpec(
shape=(self.n_eval_nodes_per_graph, self.n_node_covariates), dtype=tf.float32
) # node-level covariates
a = tf.TensorSpec(
shape=(self.n_eval_nodes_per_graph, self.n_neighbors_padded), dtype=tf.float32
) # adjacency matrix
domain = tf.TensorSpec(shape=(self.n_domains,), dtype=tf.int32) # domain
reconstruction = tf.TensorSpec(
shape=(self.n_eval_nodes_per_graph, self.n_features_1), dtype=tf.float32
) # node features to reconstruct
kl_dummy = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph,), dtype=tf.float32) # dummy for kl loss
# node-level covariates
node_covar = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, self.n_node_covariates), dtype=tf.float32)
# adjacency matrix
a = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, self.n_neighbors_padded), dtype=tf.float32)
# domain
domain = tf.TensorSpec(shape=(self.n_domains,), dtype=tf.int32)
# node features to reconstruct
reconstruction = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, self.n_features_1), dtype=tf.float32)
# dummy for kl loss
kl_dummy = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph,), dtype=tf.float32)

if self.vi_model:
if resampled:
Expand All @@ -94,9 +100,10 @@ def _get_output_signature(self, resampled: bool = False):
else:
output_signature = ((h_targets, h_neighbors, sf, a, node_covar, domain),
reconstruction)
print(output_signature)
return output_signature

def _get_dataset_base(
def _get_dataset(
self,
image_keys: List[str],
nodes_idx: Dict[str, np.ndarray],
Expand Down Expand Up @@ -172,7 +179,7 @@ def generator():
if self.h0_in:
h_targets = self.h_0[key][idx_nodes[indices], :]
else:
h_targets = self.h_1[key][idx_nodes[indices], self.idx_target_features]
h_targets = self.h_1[key][idx_nodes[indices], :][:, self.idx_target_features]
h_neighbors = []
a_neighborhood = np.zeros((self.n_eval_nodes_per_graph, self.n_neighbors_padded), "float32")
for i, j in enumerate(idx_nodes[indices]):
Expand All @@ -181,24 +188,21 @@ def generator():
if self.h0_in:
h_neighbors_j = self.h_0[key][idx_neighbors, :]
else:
h_neighbors_j = self.h_1[key][idx_neighbors, self.idx_neighbor_features]
h_neighbors_j = self.h_1[key][idx_neighbors, :][:, self.idx_neighbor_features]
h_neighbors_j = np.expand_dims(h_neighbors_j, axis=0)
# Pad neighborhoods:
diff = self.n_neighbors_padded - h_neighbors_j.shape[1]
zeros = np.zeros((1, diff, h_neighbors_j.shape[2]), dtype="float32")
h_neighbors_j = np.concatenate([h_neighbors_j, zeros], axis=1)
h_neighbors.append(h_neighbors_j)
a_neighborhood[i, :len(idx_neighbors)] = a_j[idx_neighbors]
h_neighbors = np.concatenate([h_neighbors], axis=0)
h_neighbors = np.concatenate(h_neighbors, axis=0)
if self.log_transform:
h_targets = np.log(h_targets + 1.0)
h_neighbors = np.log(h_neighbors + 1.0)

node_covar = self.node_covar[key][idx_nodes]
node_covar = node_covar[indices, :]

sf = np.expand_dims(self.size_factors[key][idx_nodes], axis=1)
sf = sf[indices, :]
node_covar = self.node_covar[key][idx_nodes][indices, :]
sf = np.expand_dims(self.size_factors[key][idx_nodes][indices], axis=1)

g = np.zeros((self.n_domains,), dtype="int32")
g[self.domains[key]] = 1
Expand Down
6 changes: 4 additions & 2 deletions ncem/estimators/estimator_ed_ncem.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ class EstimatorEdNcemNeighborhood(EstimatorNeighborhood):
def __init__(
self,
cond_type: str,
use_type_cond: bool,
use_type_cond: bool = True,
log_transform: bool = False,
):
"""Initialize an EstimatorEdNcemNeighborhood object.
Expand All @@ -191,7 +191,7 @@ def __init__(
"""
super(EstimatorEdNcemNeighborhood, self).__init__()
self.model_type = "ed_ncem"
if cond_type in ["gat", "lr_gat"]:
if cond_type in ["gat", "lr_gat", "max", "gcn"]:
self.adj_type = "full"
else:
raise ValueError("cond_type %s not recognized" % cond_type)
Expand All @@ -209,6 +209,7 @@ def init_model(
dropout_rate: float,
l2_coef: float,
l1_coef: float,
cond_type: str,
n_eval_nodes_per_graph: int,
use_domain: bool,
scale_node_size: bool,
Expand Down Expand Up @@ -239,6 +240,7 @@ def init_model(
use_type_cond=self.use_type_cond,
scale_node_size=scale_node_size,
output_layer=output_layer,
cond_type=cond_type,
dec_intermediate_dim=dec_intermediate_dim,
dec_n_hidden=dec_n_hidden,
dec_dropout_rate=dec_dropout_rate,
Expand Down
7 changes: 2 additions & 5 deletions ncem/models/layers/single_gnn_layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,11 @@ def call(self, inputs, **kwargs):

class SingleLrGatLayer(tf.keras.layers.Layer):

def __init__(self, lr_dim, out_dim, dropout_rate, l2_reg, **kwargs):
def __init__(self, lr_dim, dropout_rate, l2_reg, **kwargs):
"""Initialize SingleLrGatLayer.

Parameters
----------
out_dim
Output dimension.
dropout_rate
Dropout rate.
activation
Expand All @@ -75,7 +73,6 @@ def __init__(self, lr_dim, out_dim, dropout_rate, l2_reg, **kwargs):
"""
super().__init__(**kwargs)
self.lr_dim = lr_dim
self.out_dim = out_dim
self.dropout_rate = dropout_rate
self.l2_reg = l2_reg

Expand All @@ -93,7 +90,7 @@ def __init__(self, lr_dim, out_dim, dropout_rate, l2_reg, **kwargs):
initializer=tf.keras.initializers.glorot_uniform(),
regularizer=tf.keras.regularizers.l2(self.l2_reg),
)
self.bias_r = self.add_weight(name="bias_r", shape=(1, 1, 1, self.lr_dim,))
self.bias_r = self.add_weight(name="bias_r", shape=(1, 1, self.lr_dim,))

def call(self, inputs, **kwargs):
targets_receptor = inputs[0] # (batch, target nodes, lr)
Expand Down
18 changes: 8 additions & 10 deletions ncem/models/model_ed_single_ncem.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,13 @@ def __init__(
categ_condition_dim = input_shapes[4]
domain_dim = input_shapes[5]

# node features - reconstruction: Input Tensor - shape=(None, targets, F-out)
input_x_reconstruct = tf.keras.Input(shape=(num_targets_dim, out_feature_dim), name="node_features_reconstruct")
# node size - reconstruction: Input Tensor - shape=(None, targets, 1)
input_node_size = tf.keras.Input(shape=(num_targets_dim, 1), name="node_size_reconstruct")
# node features - node representation of targets: Input Tensor - shape=(None, targets, F-in)
input_x_targets = tf.keras.Input(shape=(neighbors_dim, in_lr_feature_dim), name="node_features_targets")
input_x_targets = tf.keras.Input(shape=(num_targets_dim, in_lr_feature_dim), name="node_features_targets")
# node features - node representation of neighbors: Input Tensor - shape=(None, targets, neighbors, F-in)
input_x_neighbors = tf.keras.Input(shape=(neighbors_dim, in_lr_feature_dim), name="node_features_neighbors")
input_x_neighbors = tf.keras.Input(shape=(num_targets_dim, neighbors_dim, in_lr_feature_dim),
name="node_features_neighbors")
# node size - reconstruction: Input Tensor - shape=(None, targets, 1)
input_node_size = tf.keras.Input(shape=(num_targets_dim, 1), name="node_size_reconstruct")
# adj_matrices - A: Input Tensor - shape=(None, targets, neighbors)
input_a = tf.keras.Input(shape=(num_targets_dim, neighbors_dim), name="adjacency_matrix")
# Categorical predictors: Input Tensor - shape=(None, targets, P)
Expand All @@ -84,7 +83,6 @@ def __init__(
if cond_type == "lr_gat":
x_encoder = SingleLrGatLayer(
lr_dim=in_lr_feature_dim,
latent_dim=latent_dim,
dropout_rate=dropout_rate,
l2_reg=l2_coef,
name=f"lr_gat_layer",
Expand All @@ -100,7 +98,7 @@ def __init__(
elif cond_type == "max":
x_encoder = SingleMaxLayer(
name=f"max_layer"
)([input_x_neighbors,])
)([input_x_neighbors, ])
elif cond_type == "gcn":
x_encoder = SingleGcnLayer(
name=f"max_layer"
Expand Down Expand Up @@ -128,9 +126,9 @@ def __init__(

self.encoder = tf.keras.Model(
inputs=[
input_x_reconstruct,
input_x_targets,
input_x_neighbors,
input_node_size,
input_a,
input_categ_condition,
input_g,
Expand All @@ -140,9 +138,9 @@ def __init__(
)
self.training_model = tf.keras.Model(
inputs=[
input_x_reconstruct,
input_x_targets,
input_x_neighbors,
input_node_size,
input_a,
input_categ_condition,
input_g,
Expand Down
Loading