diff --git a/.cookietemple.yml b/.cookietemple.yml
index 6268509e..218098e4 100644
--- a/.cookietemple.yml
+++ b/.cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Anna Schaar
 email: anna.schaar@helmholtz-muenchen.de
 project_name: ncem
 project_short_description: ncem. Learning cell communication from spatial graphs of cells.
-version: 0.4.6
+version: 0.4.8
 license: BSD-3-Clause
diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
index 06c4b89b..3b58fa28 100644
--- a/.github/release-drafter.yml
+++ b/.github/release-drafter.yml
@@ -1,5 +1,5 @@
-name-template: "0.4.6 🌈" # <>
-tag-template: 0.4.6 # <>
+name-template: "0.4.8 🌈" # <>
+tag-template: 0.4.8 # <>
 categories:
   - title: "🚀 Features"
     labels:
diff --git a/.gitignore b/.gitignore
index 3b2f137e..ba0287a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+**.DS_Store
+ncem/unit_test/temp
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/cookietemple.cfg b/cookietemple.cfg
index 6a8e5939..cf15e347 100644
--- a/cookietemple.cfg
+++ b/cookietemple.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.4.6
+current_version = 0.4.8

 [bumpversion_files_whitelisted]
 init_file = ncem/__init__.py
diff --git a/docs/conf.py b/docs/conf.py
index 7471b66e..403d4b8c 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -73,9 +73,9 @@
 # the built documents.
 #
 # The short X.Y version.
-version = "0.4.6"
+version = "0.4.8"
 # The full version, including alpha/beta/rc tags.
-release = "0.4.6"
+release = "0.4.8"

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/ncem/__init__.py b/ncem/__init__.py
index 67e82e88..2308f073 100644
--- a/ncem/__init__.py
+++ b/ncem/__init__.py
@@ -8,4 +8,4 @@
 __maintainer__ = ", ".join(["Anna C. Schaar", "David S. Fischer"])
 __author__ = ", ".join(["Anna C. Schaar", "David S. Fischer"])
 __email__ = ", ".join(["anna.schaar@helmholtz-muenchen.de", "david.fischer@helmholtz-muenchen.de"])
-__version__ = "0.4.6"
+__version__ = "0.4.8"
diff --git a/ncem/api/train/__init__.py b/ncem/api/train/__init__.py
index e28b4ced..9d9d506d 100644
--- a/ncem/api/train/__init__.py
+++ b/ncem/api/train/__init__.py
@@ -1,6 +1,6 @@
 """Initializes a train object in api."""
 from ncem.estimators import (Estimator, EstimatorCVAE, EstimatorCVAEncem,
-                             EstimatorED, EstimatorEDncem, EstimatorGraph,
+                             EstimatorED, EstimatorEDncem, EstimatorEdNcemNeighborhood, EstimatorGraph,
                              EstimatorInteractions, EstimatorLinear,
                              EstimatorNoGraph)
 from ncem.models import BetaScheduler
diff --git a/ncem/data.py b/ncem/data.py
index 4f68145f..38c6d46b 100644
--- a/ncem/data.py
+++ b/ncem/data.py
@@ -1,6 +1,7 @@
 import abc
 import warnings
 from collections import OrderedDict
+import os
 from typing import Dict, List, Optional, Sequence, Tuple, Union

 import matplotlib.colors as colors
@@ -15,7 +16,7 @@
 from matplotlib.ticker import FormatStrFormatter
 from matplotlib.tri import Triangulation
 from omnipath.interactions import import_intercell_network
-from pandas import read_csv, read_excel
+from pandas import read_csv, read_excel, DataFrame
 from scipy import sparse, stats
 from tqdm import tqdm

@@ -1640,6 +1641,7 @@ def __init__(
         coord_type: str = 'generic',
         n_rings: int = 1,
         label_selection: Optional[List[str]] = None,
+        n_top_genes: Optional[int] = None
     ):
         """Initialize DataLoader.
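+
+        Note on the new ``n_top_genes`` argument (editor's sketch, hedged): when
+        set, it is passed through to ``register_celldata`` and, in the loaders
+        that support it, subsets ``celldata`` to the highly variable genes;
+        ``n_top_genes=500`` reproduces the previously hard-coded top-500
+        selection of the Visium and Lu-imputed loaders.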
@@ -1655,7 +1657,7 @@ def __init__( self.data_path = data_path print("Loading data from raw files") - self.register_celldata() + self.register_celldata(n_top_genes=n_top_genes) self.register_img_celldata() self.register_graph_features(label_selection=label_selection) self.compute_adjacency_matrices(radius=radius, coord_type=coord_type, n_rings=n_rings) @@ -1683,10 +1685,10 @@ def patients(self): """ return np.unique(np.asarray(list(self.celldata.uns["img_to_patient_dict"].values()))) - def register_celldata(self): + def register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" print("registering celldata") - self._register_celldata() + self._register_celldata(n_top_genes=n_top_genes) assert self.celldata is not None, "celldata was not loaded" def register_img_celldata(self): @@ -1707,7 +1709,7 @@ def register_graph_features(self, label_selection): self._register_graph_features(label_selection=label_selection) @abc.abstractmethod - def _register_celldata(self): + def _register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" pass @@ -1742,6 +1744,10 @@ def size_factors(self): global_mean_per_node = self.celldata.X.sum(axis=1).mean(axis=0) return {i: global_mean_per_node / np.sum(adata.X, axis=1) for i, adata in self.img_celldata.items()} + @property + def var_names(self): + return self.celldata.var_names + class DataLoaderZhang(DataLoader): """DataLoaderZhang class. Inherits all functions from DataLoader.""" @@ -1774,7 +1780,7 @@ class DataLoaderZhang(DataLoader): "other": "other", } - def _register_celldata(self): + def _register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" metadata = { "lateral_resolution": 0.109, @@ -1786,7 +1792,7 @@ def _register_celldata(self): "patient_col": "mouse", } - celldata = read_h5ad(self.data_path + metadata["fn"]).copy() + celldata = read_h5ad(os.path.join(self.data_path, metadata["fn"])).copy() celldata.uns["metadata"] = metadata celldata.uns["img_keys"] = list(np.unique(celldata.obs[metadata["image_col"]])) @@ -1880,7 +1886,7 @@ class DataLoaderJarosch(DataLoader): "other Lymphocytes": "other Lymphocytes", } - def _register_celldata(self): + def _register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" metadata = { "lateral_resolution": 0.5, @@ -1892,7 +1898,7 @@ def _register_celldata(self): "patient_col": None, } - celldata = read_h5ad(self.data_path + metadata["fn"]) + celldata = read_h5ad(os.path.join(self.data_path, metadata["fn"])) celldata = celldata[celldata.obs[metadata["image_col"]] != "Dirt"].copy() celldata.uns["metadata"] = metadata img_keys = list(np.unique(celldata.obs[metadata["image_col"]])) @@ -1979,7 +1985,7 @@ class DataLoaderHartmann(DataLoader): "Myeloid_CD11c": "CD11c Myeloid", } - def _register_celldata(self): + def _register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" metadata = { "lateral_resolution": 400 / 1024, @@ -1990,7 +1996,7 @@ def _register_celldata(self): "cluster_col_preprocessed": "Cluster_preprocessed", "patient_col": "donor", } - celldata_df = read_csv(self.data_path + metadata["fn"][0]) + celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"][0])) celldata_df["point"] = [f"scMEP_point_{str(x)}" for x in celldata_df["point"]] celldata_df = celldata_df.fillna(0) # celldata_df = celldata_df.dropna(inplace=False).reset_index() @@ -2114,7 +2120,7 @@ def 
_register_graph_features(self, label_selection): label_cols_toread = list(label_selection.intersection(set(list(label_cols.keys())))) usecols = label_cols_toread + [patient_col] - tissue_meta_data = read_excel(self.data_path + "scMEP_sample_description.xlsx", usecols=usecols) + tissue_meta_data = read_excel(os.path.join(self.data_path, "scMEP_sample_description.xlsx"), usecols=usecols) # BUILD LABEL VECTORS FROM LABEL COLUMNS # The columns contain unprocessed numeric and categorical entries that are now processed to prediction-ready # numeric tensors. Here we first generate a dictionary of tensors for each label (label_tensors). We then @@ -2218,7 +2224,7 @@ class DataLoaderPascualReguant(DataLoader): "other": "other", } - def _register_celldata(self): + def _register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" metadata = { "lateral_resolution": 0.325, @@ -2229,8 +2235,8 @@ def _register_celldata(self): "cluster_col_preprocessed": "cell_class_preprocessed", "patient_col": None, } - nuclei_df = read_excel(self.data_path + metadata["fn"][0]) - membranes_df = read_excel(self.data_path + metadata["fn"][1]) + nuclei_df = read_excel(os.path.join(self.data_path, metadata["fn"][0])) + membranes_df = read_excel(os.path.join(self.data_path, metadata["fn"][1])) celldata_df = nuclei_df.join(membranes_df.set_index("ObjectNumber"), on="ObjectNumber") @@ -2391,7 +2397,7 @@ class DataLoaderSchuerch(DataLoader): "vasculature": "vasculature", } - def _register_celldata(self): + def _register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" metadata = { "lateral_resolution": 0.377442, @@ -2402,7 +2408,7 @@ def _register_celldata(self): "cluster_col_preprocessed": "ClusterName_preprocessed", "patient_col": "patients", } - celldata_df = read_csv(self.data_path + metadata["fn"]) + celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"])) feature_cols = [ "CD44 - stroma:Cyc_2_ch_2", @@ -2463,8 +2469,67 @@ def _register_celldata(self): "CD57 - NK cells:Cyc_17_ch_4", "MMP12 - matrix metalloproteinase:Cyc_21_ch_4", ] - - celldata = AnnData(X=celldata_df[feature_cols], obs=celldata_df[["File Name", "patients", "ClusterName"]]) + feature_cols_hgnc_names = [ + 'CD44', + 'FOXP3', + 'CD8A', + 'TP53', + 'GATA3', + 'PTPRC', + 'TBX21', + 'CTNNB1', + 'HLA-DR', + 'CD274', + 'MKI67', + 'PTPRC', + 'CD4', + 'CR2', + 'MUC1', + 'TNFRSF8', + 'CD2', + 'VIM', + 'MS4A1', + 'LAG3', + 'ATP1A1', + 'CD5', + 'IDO1', + 'KRT1', + 'ITGAM', + 'NCAM1', + 'ACTA1', + 'BCL2', + 'IL2RA', + 'ITGAX', + 'PDCD1', + 'GZMB', + 'EGFR', + 'VISTA', + 'FUT4', + 'ICOS', + 'SYP', + 'GFAP', + 'CD7', + 'CD247', + 'CHGA', + 'CD163', + 'PTPRC', + 'CD68', + 'PECAM1', + 'PDPN', + 'CD34', + 'CD38', + 'SDC1', + 'HOECHST1:Cyc_1_ch_1', ## + 'CDX2', + 'COL6A1', + 'CCR4', + 'MMP9', + 'TFRC', + 'B3GAT1', + 'MMP12' + ] + X = DataFrame(np.array(celldata_df[feature_cols]), columns=feature_cols_hgnc_names) + celldata = AnnData(X=X, obs=celldata_df[["File Name", "patients", "ClusterName"]]) celldata.uns["metadata"] = metadata img_keys = list(np.unique(celldata_df[metadata["image_col"]])) @@ -2561,7 +2626,7 @@ def _register_graph_features(self, label_selection): usecols = label_cols_toread_csv + [patient_col] tissue_meta_data = read_csv( - self.data_path + "CRC_TMAs_patient_annotations.csv", + os.path.join(self.data_path, "CRC_TMAs_patient_annotations.csv"), # sep='\t', usecols=usecols, )[usecols] @@ -2742,7 +2807,7 @@ class DataLoaderLohoff(DataLoader): 'Surface 
ectoderm': 'Surface ectoderm' } - def _register_celldata(self): + def _register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" metadata = { "lateral_resolution": 1., @@ -2754,7 +2819,7 @@ def _register_celldata(self): "patient_col": "embryo", } - celldata = read_h5ad(self.data_path + metadata["fn"]).copy() + celldata = read_h5ad(os.path.join(self.data_path, metadata["fn"])).copy() celldata.uns["metadata"] = metadata celldata.uns["img_keys"] = list(np.unique(celldata.obs[metadata["image_col"]])) @@ -2831,18 +2896,18 @@ class DataLoaderLuWT(DataLoader): """DataLoaderLuWT class. Inherits all functions from DataLoader.""" cell_type_merge_dict = { - 1: "AEC", - 2: "SEC", - 3: "MK", - 4: "Hepatocyte", - 5: "Macrophage", - 6: "Myeloid", - 7: "Erythroid progenitor", - 8: "Erythroid cell", - 9: "Unknown", + "1": "AEC", + "2": "SEC", + "3": "MK", + "4": "Hepatocyte", + "5": "Macrophage", + "6": "Myeloid", + "7": "Erythroid progenitor", + "8": "Erythroid cell", + "9": "Unknown", } - def _register_celldata(self): + def _register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" metadata = { "lateral_resolution": 0.1079, @@ -2852,7 +2917,7 @@ def _register_celldata(self): "cluster_col": "CellTypeID_new", "cluster_col_preprocessed": "CellTypeID_new_preprocessed", } - celldata_df = read_csv(self.data_path + metadata["fn"]) + celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"])) feature_cols = [ "Abcb4", @@ -3011,10 +3076,10 @@ def _register_celldata(self): # add clean cluster column which removes regular expression from cluster_col celldata.obs[metadata["cluster_col_preprocessed"]] = list( - pd.Series(list(celldata.obs[metadata["cluster_col"]]), dtype="category").map(self.cell_type_merge_dict) + pd.Series(list(celldata.obs[metadata["cluster_col"]]), dtype="str").map(self.cell_type_merge_dict) ) celldata.obs[metadata["cluster_col_preprocessed"]] = celldata.obs[metadata["cluster_col_preprocessed"]].astype( - "category" + "str" ) celldata = celldata[celldata.obs[metadata["cluster_col_preprocessed"]] != 'Unknown'] @@ -3085,7 +3150,7 @@ class DataLoaderLuWTimputed(DataLoader): 8: "Erythroid cell", } - def _register_celldata(self): + def _register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" metadata = { "lateral_resolution": 0.1079, @@ -3094,12 +3159,13 @@ def _register_celldata(self): "pos_cols": ["Center_x", "Center_y"], "cluster_col": "CellTypeID_new", "cluster_col_preprocessed": "CellTypeID_new_preprocessed", + "n_top_genes": n_top_genes } celldata = read_h5ad(self.data_path + metadata["fn"]) celldata.uns["metadata"] = metadata - # only loading top 500 genes - sc.pp.highly_variable_genes(celldata, n_top_genes=500) - celldata = celldata[:, celldata.var.highly_variable].copy() + if n_top_genes: + sc.pp.highly_variable_genes(celldata, n_top_genes=n_top_genes) + celldata = celldata[:, celldata.var.highly_variable].copy() self.celldata = celldata def _register_img_celldata(self): @@ -3144,18 +3210,18 @@ class DataLoaderLuTET2(DataLoader): """DataLoaderLuTET2 class. 
Inherits all functions from DataLoader.""" cell_type_merge_dict = { - 1: "AEC", - 2: "SEC", - 3: "MK", - 4: "Hepatocyte", - 5: "Macrophage", - 6: "Myeloid", - 7: "Erythroid progenitor", - 8: "Erythroid cell", - 9: "Unknown", + "1": "AEC", + "2": "SEC", + "3": "MK", + "4": "Hepatocyte", + "5": "Macrophage", + "6": "Myeloid", + "7": "Erythroid progenitor", + "8": "Erythroid cell", + "9": "Unknown", } - def _register_celldata(self): + def _register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" metadata = { "lateral_resolution": 0.1079, @@ -3165,7 +3231,7 @@ def _register_celldata(self): "cluster_col": "CellTypeID_new", "cluster_col_preprocessed": "CellTypeID_new_preprocessed", } - celldata_df = read_csv(self.data_path + metadata["fn"]) + celldata_df = read_csv(os.path.join(self.data_path, metadata["fn"])) feature_cols = [ "Abcb4", @@ -3403,7 +3469,7 @@ class DataLoader10xVisiumMouseBrain(DataLoader): 'Thalamus_2': 'Thalamus 2' } - def _register_celldata(self): + def _register_celldata(self, n_top_genes: Optional[int] = None): """Load AnnData object of complete dataset.""" metadata = { "lateral_resolution": 1., @@ -3412,13 +3478,13 @@ def _register_celldata(self): "cluster_col": "cluster", "cluster_col_preprocessed": "cluster_preprocessed", "patient_col": "in_tissue", - "n_top_genes": 500 + "n_top_genes": n_top_genes } celldata = read_h5ad(self.data_path + metadata["fn"]).copy() - # only loading top 500 genes - sc.pp.highly_variable_genes(celldata, n_top_genes=500) - celldata = celldata[:, celldata.var.highly_variable].copy() + if n_top_genes: + sc.pp.highly_variable_genes(celldata, n_top_genes=n_top_genes) + celldata = celldata[:, celldata.var.highly_variable].copy() celldata.X = celldata.X.toarray() celldata.uns["metadata"] = metadata diff --git a/ncem/estimators/__init__.py b/ncem/estimators/__init__.py index 916e2d05..dc05e957 100644 --- a/ncem/estimators/__init__.py +++ b/ncem/estimators/__init__.py @@ -1,9 +1,10 @@ """Importing estimator classes.""" from ncem.estimators.base_estimator import (Estimator, EstimatorGraph, EstimatorNoGraph) +from ncem.estimators.base_estimator_neighbors import EstimatorNeighborhood from ncem.estimators.estimator_cvae import EstimatorCVAE from ncem.estimators.estimator_cvae_ncem import EstimatorCVAEncem from ncem.estimators.estimator_ed import EstimatorED -from ncem.estimators.estimator_ed_ncem import EstimatorEDncem +from ncem.estimators.estimator_ed_ncem import EstimatorEDncem, EstimatorEdNcemNeighborhood from ncem.estimators.estimator_interactions import EstimatorInteractions from ncem.estimators.estimator_linear import EstimatorLinear diff --git a/ncem/estimators/base_estimator.py b/ncem/estimators/base_estimator.py index ad675af2..46a4f222 100644 --- a/ncem/estimators/base_estimator.py +++ b/ncem/estimators/base_estimator.py @@ -15,6 +15,29 @@ r_squared_linreg) +def transfer_layers(model1, model2): + """ + Transfer layer weights from model 1 to model 2. + + :param model1: Input model. + :param model2: Output model. 
+ """ + layer_names_model1 = [x.name for x in model1.layers] + layer_names_model2 = [x.name for x in model2.layers] + layers_updated = [] + layer_not_updated = set(layer_names_model2) + for x in layer_names_model1: + w = model1.get_layer(name=x).get_weights() + if x in layer_names_model2: + # Only update layers with parameters: + if len(w) > 0: + model2.get_layer(x).set_weights(w) + layers_updated.append(x) + layer_not_updated = layer_not_updated.difference({x}) + print(f"updated layers: {layers_updated}") + print(f"did not update layers: {layer_not_updated}") + + class Estimator: """Estimator class for models. @@ -91,6 +114,7 @@ def _load_data( radius: Optional[int] = None, n_rings: int = 1, label_selection: Optional[List[str]] = None, + n_top_genes: Optional[int] = None ): """Initialize a DataLoader object. @@ -104,6 +128,8 @@ def _load_data( Radius. label_selection : list, optional Label selection. + n_top_genes: int, optional + N top genes for highly variable gene selection. Raises ------ @@ -165,7 +191,8 @@ def _load_data( raise ValueError(f"data_origin {data_origin} not recognized") self.data = DataLoader( - data_path, radius=radius, coord_type=coord_type, n_rings=n_rings, label_selection=label_selection + data_path, radius=radius, coord_type=coord_type, n_rings=n_rings, label_selection=label_selection, + n_top_genes=n_top_genes ) def get_data( @@ -182,7 +209,8 @@ def get_data( use_covar_graph_covar: bool = False, domain_type: str = "image", robustness: Optional[float] = None, - robustness_seed: int = 1 + robustness_seed: int = 1, + n_top_genes: Optional[int] = None ): """Get data used in estimator classes. @@ -214,6 +242,8 @@ def get_data( Optional fraction of images for robustness test. robustness_seed: int Seed for robustness analysis + n_top_genes: int, optional + N top genes for highly variable gene selection. Raises ------ ValueError @@ -231,6 +261,7 @@ def get_data( radius=radius, n_rings=n_rings, label_selection=labels_to_load, + n_top_genes=n_top_genes ) if robustness: np.random.seed(robustness_seed) diff --git a/ncem/estimators/base_estimator_neighbors.py b/ncem/estimators/base_estimator_neighbors.py new file mode 100644 index 00000000..a8d461f4 --- /dev/null +++ b/ncem/estimators/base_estimator_neighbors.py @@ -0,0 +1,231 @@ +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ncem.estimators.base_estimator import Estimator + + +class EstimatorNeighborhood(Estimator): + """EstimatorGraph class for spatial models of the nieghborhood only (not full graph).""" + + n_features_in: int + _n_neighbors_padded: Union[int, None] + h0_in: bool + features: list + target_feature_names: list + neighbor_feature_names: list + idx_target_features: np.ndarray + idx_neighbor_features: np.ndarray + + def __init__(self): + super(EstimatorNeighborhood, self).__init__() + self._n_neighbors_padded = None + + def set_input_features(self, h0_in=True, target_feature_names=None, neighbor_feature_names=None): + """ + Need to run this before compiling model. 
+
+
 class Estimator:
     """Estimator class for models.
@@ -91,6 +114,7 @@ def _load_data(
         radius: Optional[int] = None,
         n_rings: int = 1,
         label_selection: Optional[List[str]] = None,
+        n_top_genes: Optional[int] = None
     ):
         """Initialize a DataLoader object.
@@ -104,6 +128,8 @@
             Radius.
         label_selection : list, optional
             Label selection.
+        n_top_genes: int, optional
+            N top genes for highly variable gene selection.

         Raises
         ------
@@ -165,7 +191,8 @@
             raise ValueError(f"data_origin {data_origin} not recognized")

         self.data = DataLoader(
-            data_path, radius=radius, coord_type=coord_type, n_rings=n_rings, label_selection=label_selection
+            data_path, radius=radius, coord_type=coord_type, n_rings=n_rings, label_selection=label_selection,
+            n_top_genes=n_top_genes
         )

     def get_data(
@@ -182,7 +209,8 @@
         use_covar_graph_covar: bool = False,
         domain_type: str = "image",
         robustness: Optional[float] = None,
-        robustness_seed: int = 1
+        robustness_seed: int = 1,
+        n_top_genes: Optional[int] = None
     ):
         """Get data used in estimator classes.
@@ -214,6 +242,8 @@
             Optional fraction of images for robustness test.
         robustness_seed: int
             Seed for robustness analysis
+        n_top_genes: int, optional
+            N top genes for highly variable gene selection.

         Raises
         ------
         ValueError
@@ -231,6 +261,7 @@
             radius=radius,
             n_rings=n_rings,
             label_selection=labels_to_load,
+            n_top_genes=n_top_genes
         )
         if robustness:
             np.random.seed(robustness_seed)
diff --git a/ncem/estimators/base_estimator_neighbors.py b/ncem/estimators/base_estimator_neighbors.py
new file mode 100644
index 00000000..a8d461f4
--- /dev/null
+++ b/ncem/estimators/base_estimator_neighbors.py
@@ -0,0 +1,231 @@
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+
+from ncem.estimators.base_estimator import Estimator
+
+
+class EstimatorNeighborhood(Estimator):
+    """EstimatorNeighborhood class for spatial models of the neighborhood only (not the full graph)."""
+
+    n_features_in: int
+    _n_neighbors_padded: Union[int, None]
+    h0_in: bool
+    features: list
+    target_feature_names: list
+    neighbor_feature_names: list
+    idx_target_features: np.ndarray
+    idx_neighbor_features: np.ndarray
+
+    def __init__(self):
+        super(EstimatorNeighborhood, self).__init__()
+        self._n_neighbors_padded = None
+
+    def set_input_features(self, h0_in=True, target_feature_names=None, neighbor_feature_names=None):
+        """Set the input feature configuration; this needs to run before compiling the model."""
+        self.h0_in = h0_in
+        if self.h0_in:
+            assert target_feature_names is None
+            assert neighbor_feature_names is None
+            self.n_features_in = self.n_features_0
+        else:
+            features = self.data.var_names.tolist()
+            self.features = features
+            self.target_feature_names = target_feature_names
+            self.neighbor_feature_names = neighbor_feature_names
+            self.idx_target_features = np.array([features.index(x) for x in target_feature_names])
+            self.idx_neighbor_features = np.array([features.index(x) for x in neighbor_feature_names])
+            assert len(self.idx_target_features) == len(self.idx_neighbor_features)
+            assert len(set(self.idx_target_features.tolist()).intersection(set(self.idx_neighbor_features.tolist()))) == 0
+            self.n_features_in = len(self.idx_target_features)
+
+    @property
+    def n_neighbors_padded(self):
+        if self._n_neighbors_padded is None:
+            self._n_neighbors_padded = int(np.max(np.asarray([
+                np.max(np.asarray(np.sum(a, axis=1)).flatten()) for a in self.a.values()
+            ])))
+        return self._n_neighbors_padded
+
+    def _get_output_signature(self, resampled: bool = False):
+        """Get output signatures.
+
+        Parameters
+        ----------
+        resampled : bool
+            Whether dataset is resampled or not.
+
+        Returns
+        -------
+        output_signature
+        """
+        # target node features
+        h_targets = tf.TensorSpec(
+            shape=(self.n_eval_nodes_per_graph, self.n_features_in), dtype=tf.float32
+        )
+        # neighbor node features
+        h_neighbors = tf.TensorSpec(
+            shape=(self.n_eval_nodes_per_graph, self.n_neighbors_padded, self.n_features_in), dtype=tf.float32
+        )
+        sf = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, 1), dtype=tf.float32)  # input node size factors
+        # node-level covariates
+        node_covar = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, self.n_node_covariates), dtype=tf.float32)
+        # adjacency matrix
+        a = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, self.n_neighbors_padded), dtype=tf.float32)
+        # domain
+        domain = tf.TensorSpec(shape=(self.n_domains,), dtype=tf.int32)
+        # node features to reconstruct
+        reconstruction = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph, self.n_features_1), dtype=tf.float32)
+        # dummy for kl loss
+        kl_dummy = tf.TensorSpec(shape=(self.n_eval_nodes_per_graph,), dtype=tf.float32)
+
+        if self.vi_model:
+            if resampled:
+                output_signature = (
+                    (h_targets, h_neighbors, sf, a, node_covar, domain),
+                    (reconstruction, kl_dummy),
+                    (h_targets, h_neighbors, sf, a, node_covar, domain),
+                    (reconstruction, kl_dummy),
+                )
+            else:
+                output_signature = ((h_targets, h_neighbors, sf, a, node_covar, domain),
+                                    (reconstruction, kl_dummy))
+        else:
+            if resampled:
+                output_signature = (
+                    (h_targets, h_neighbors, sf, a, node_covar, domain),
+                    reconstruction,
+                    (h_targets, h_neighbors, sf, a, node_covar, domain),
+                    reconstruction,
+                )
+            else:
+                output_signature = ((h_targets, h_neighbors, sf, a, node_covar, domain),
+                                    reconstruction)
+        # print(output_signature)
+        return output_signature
+
+    def _get_dataset(
+        self,
+        image_keys: List[str],
+        nodes_idx: Dict[str, np.ndarray],
+        batch_size: int,
+        shuffle_buffer_size: Optional[int],
+        train: bool = True,
+        seed: Optional[int] = None,
+        prefetch: int = 100,
+        reinit_n_eval: Optional[int] = None,
+    ):
+        """Prepare a dataset.
+
+        Parameters
+        ----------
+        image_keys : np.array
+            Image keys in partition.
+        nodes_idx : dict, str
+            Dictionary of nodes per image in partition.
+        batch_size : int
+            Batch size.
+        shuffle_buffer_size : int, optional
+            Shuffle buffer size.
+ train : bool + Whether dataset is used for training or not (influences shuffling of nodes). + seed : int, optional + Random seed. + prefetch: int + Prefetch of dataset. + reinit_n_eval : int, optional + Used if model is reinitialized to different number of nodes per graph. + + Returns + ------- + A tensorflow dataset. + """ + np.random.seed(seed) + if reinit_n_eval is not None and reinit_n_eval != self.n_eval_nodes_per_graph: + print( + "ATTENTION: specifying reinit_n_eval will change class argument n_eval_nodes_per_graph " + "from %i to %i" % (self.n_eval_nodes_per_graph, reinit_n_eval) + ) + self.n_eval_nodes_per_graph = reinit_n_eval + + def generator(): + for key in image_keys: + if nodes_idx[key].size == 0: # needed for images where no nodes are selected + continue + idx_nodes = np.arange(0, self.a[key].shape[0]) + + if train: + index_list = [ + np.asarray( + np.random.choice( + a=nodes_idx[key], + size=self.n_eval_nodes_per_graph, + replace=True, + ), + dtype=np.int32, + ) + ] + else: + # dropping + index_list = [ + np.asarray( + nodes_idx[key][self.n_eval_nodes_per_graph * i: self.n_eval_nodes_per_graph * (i + 1)], + dtype=np.int32, + ) + for i in range(len(nodes_idx[key]) // self.n_eval_nodes_per_graph) + ] + + for indices in index_list: + h_out = self.h_1[key][idx_nodes[indices], :] + if self.h0_in: + h_targets = self.h_0[key][idx_nodes[indices], :] + else: + h_targets = self.h_1[key][idx_nodes[indices], :][:, self.idx_target_features] + h_neighbors = [] + a_neighborhood = np.zeros((self.n_eval_nodes_per_graph, self.n_neighbors_padded), "float32") + for i, j in enumerate(idx_nodes[indices]): + a_j = np.asarray(self.a[key][j, :].todense()).flatten() + idx_neighbors = np.where(a_j > 0.)[0] + if self.h0_in: + h_neighbors_j = self.h_0[key][idx_neighbors, :] + else: + h_neighbors_j = self.h_1[key][idx_neighbors, :][:, self.idx_neighbor_features] + h_neighbors_j = np.expand_dims(h_neighbors_j, axis=0) + # Pad neighborhoods: + diff = self.n_neighbors_padded - h_neighbors_j.shape[1] + zeros = np.zeros((1, diff, h_neighbors_j.shape[2]), dtype="float32") + h_neighbors_j = np.concatenate([h_neighbors_j, zeros], axis=1) + h_neighbors.append(h_neighbors_j) + a_neighborhood[i, :len(idx_neighbors)] = a_j[idx_neighbors] + h_neighbors = np.concatenate(h_neighbors, axis=0) + if self.log_transform: + h_targets = np.log(h_targets + 1.0) + h_neighbors = np.log(h_neighbors + 1.0) + + node_covar = self.node_covar[key][idx_nodes][indices, :] + sf = np.expand_dims(self.size_factors[key][idx_nodes][indices], axis=1) + + g = np.zeros((self.n_domains,), dtype="int32") + g[self.domains[key]] = 1 + + if self.vi_model: + kl_dummy = np.zeros((self.n_eval_nodes_per_graph,), dtype="float32") + yield (h_targets, h_neighbors, sf, a_neighborhood, node_covar, g), (h_out, kl_dummy) + else: + yield (h_targets, h_neighbors, sf, a_neighborhood, node_covar, g), h_out + + output_signature = self._get_output_signature(resampled=False) + + dataset = tf.data.Dataset.from_generator(generator=generator, output_signature=output_signature) + if train: + if shuffle_buffer_size is not None: + dataset = dataset.shuffle(buffer_size=shuffle_buffer_size, seed=None, reshuffle_each_iteration=True) + dataset = dataset.repeat() + dataset = dataset.batch(batch_size) + dataset = dataset.prefetch(prefetch) + return dataset diff --git a/ncem/estimators/estimator_ed_ncem.py b/ncem/estimators/estimator_ed_ncem.py index f2ce6456..d6addb94 100644 --- a/ncem/estimators/estimator_ed_ncem.py +++ b/ncem/estimators/estimator_ed_ncem.py @@ -1,10 +1,52 
@@
+import abc
 import tensorflow as tf
+from typing import Tuple, Union

-from ncem.estimators import EstimatorGraph
-from ncem.models import ModelEDncem
+from ncem.estimators import Estimator, EstimatorGraph, EstimatorNeighborhood
+from ncem.estimators.base_estimator import transfer_layers
+from ncem.models import ModelEDncem, ModelEd2Ncem
+from ncem.models.layers.output_layers import IDENTIFIER_OUTPUT_LAYER


-class EstimatorEDncem(EstimatorGraph):
+class EstimatorEDncemBase(Estimator, abc.ABC):
+    model: Union[ModelEDncem, ModelEd2Ncem]
+
+    def predict_embedding_any(self, img_keys, node_idx, batch_size: int = 1):
+        """Predict embedding on any given data set.
+
+        Parameters
+        ----------
+        img_keys
+            Image keys.
+        node_idx
+            Node indices.
+        batch_size : int
+            Batch size; defaults to 1.
+
+        Returns
+        -------
+        eval_dict
+        """
+        ds = self._get_dataset(
+            image_keys=img_keys,
+            nodes_idx=node_idx,
+            batch_size=batch_size,
+            shuffle_buffer_size=1,
+            train=False,
+            seed=None,
+            reinit_n_eval=None,
+        )
+        transfer_layers(model1=self.model.training_model, model2=self.model.encoder)
+        return self.model.encoder.predict(ds)
+
+    def get_decoding_weights(self):
+        """Return the weights of the output layer of the training model."""
+        layer_name_out = [x.name for x in self.model.training_model.layers if IDENTIFIER_OUTPUT_LAYER in x.name]
+        assert len(layer_name_out) == 1, "there should only be one output layer for this operation"
+        w = self.model.training_model.get_layer(name=layer_name_out[0]).get_weights()
+        return w
+
+
+class EstimatorEDncem(EstimatorGraph, EstimatorEDncemBase):
     """Estimator class for encoder-decoder NCEM models. Subclass of EstimatorGraph."""

     def __init__(
@@ -161,3 +203,92 @@ def init_model(
         self.max_beta = max_beta
         self.pre_warm_up = pre_warm_up
         self._compile_model(optimizer=optimizer, output_layer=output_layer)
+
+
+class EstimatorEdNcemNeighborhood(EstimatorNeighborhood, EstimatorEDncemBase):
+    """Estimator class for encoder-decoder NCEM models with a single graph layer. Subclass of EstimatorNeighborhood."""
+
+    def __init__(
+        self,
+        cond_type: str,
+        use_type_cond: bool = True,
+        log_transform: bool = False,
+    ):
+        """Initialize an EstimatorEdNcemNeighborhood object.
+
+        Parameters
+        ----------
+        cond_type : str
+            Graph layer used in the conditional: gat, lr_gat, max or gcn.
+        use_type_cond : bool
+            Whether to use the categorical cell type label in conditional.
+        log_transform : bool
+            Whether to log transform h_1.
+
+        Raises
+        ------
+        ValueError
+            If `cond_type` is not recognized.
+ """ + super(EstimatorEdNcemNeighborhood, self).__init__() + self.model_type = "ed_ncem" + if cond_type in ["gat", "lr_gat", "max", "gcn"]: + self.adj_type = "full" + else: + raise ValueError("cond_type %s not recognized" % cond_type) + self.cond_type = cond_type + self.use_type_cond = use_type_cond + self.log_transform = log_transform + self.metrics = {"np": [], "tf": []} + self.n_eval_nodes_per_graph = None + + def init_model( + self, + optimizer: str, + learning_rate: float, + latent_dim: Tuple[int], + dropout_rate: float, + l2_coef: float, + l1_coef: float, + cond_type: str, + n_eval_nodes_per_graph: int, + use_domain: bool, + scale_node_size: bool, + output_layer: str, + dec_intermediate_dim: int, + dec_n_hidden: int, + dec_dropout_rate: float, + dec_l1_coef: float, + dec_l2_coef: float, + dec_use_batch_norm: bool, + **kwargs + ): + self.n_eval_nodes_per_graph = n_eval_nodes_per_graph + self.model = ModelEd2Ncem( + input_shapes=( + self.n_features_in, + self.n_features_1, + self.n_eval_nodes_per_graph, + self.n_neighbors_padded, + self.n_node_covariates, + self.n_domains, + ), + latent_dim=latent_dim, + dropout_rate=dropout_rate, + l2_coef=l2_coef, + l1_coef=l1_coef, + use_domain=use_domain, + use_type_cond=self.use_type_cond, + scale_node_size=scale_node_size, + output_layer=output_layer, + cond_type=cond_type, + dec_intermediate_dim=dec_intermediate_dim, + dec_n_hidden=dec_n_hidden, + dec_dropout_rate=dec_dropout_rate, + dec_l1_coef=dec_l1_coef, + dec_l2_coef=dec_l2_coef, + dec_use_batch_norm=dec_use_batch_norm, + ) + optimizer = tf.keras.optimizers.get(optimizer) + tf.keras.backend.set_value(optimizer.lr, learning_rate) + self._compile_model(optimizer=optimizer, output_layer=output_layer) diff --git a/ncem/models/__init__.py b/ncem/models/__init__.py index f8dbe2a4..45f8c778 100644 --- a/ncem/models/__init__.py +++ b/ncem/models/__init__.py @@ -4,5 +4,6 @@ from ncem.models.model_cvae_ncem import ModelCVAEncem from ncem.models.model_ed import ModelED from ncem.models.model_ed_ncem import ModelEDncem +from ncem.models.model_ed_single_ncem import ModelEd2Ncem from ncem.models.model_interactions import ModelInteractions from ncem.models.model_linear import ModelLinear diff --git a/ncem/models/layers/__init__.py b/ncem/models/layers/__init__.py index c47dbc0f..7ecc010a 100644 --- a/ncem/models/layers/__init__.py +++ b/ncem/models/layers/__init__.py @@ -9,5 +9,7 @@ LinearOutput, NegBinConstDispOutput, NegBinOutput, - NegBinSharedDispOutput) + NegBinSharedDispOutput, + get_out) from ncem.models.layers.preproc_input import DenseInteractions, PreprocInput +from ncem.models.layers.single_gnn_layers import SingleLrGatLayer, SingleGatLayer diff --git a/ncem/models/layers/output_layers.py b/ncem/models/layers/output_layers.py index 2b1057e2..f17ef4b9 100644 --- a/ncem/models/layers/output_layers.py +++ b/ncem/models/layers/output_layers.py @@ -1,5 +1,37 @@ import tensorflow as tf +IDENTIFIER_OUTPUT_LAYER = "Output" + + +def get_out(output_layer: str, out_feature_dim, scale_node_size, name: str = 'decoder'): + if output_layer == "gaussian": + output_decoder_layer = GaussianOutput( + original_dim=out_feature_dim, + use_node_scale=scale_node_size, + name=f"Gaussian{IDENTIFIER_OUTPUT_LAYER}_{name}", + ) + elif output_layer == "nb": + output_decoder_layer = NegBinOutput( + original_dim=out_feature_dim, + use_node_scale=scale_node_size, + name=f"NegBin{IDENTIFIER_OUTPUT_LAYER}_{name}", + ) + elif output_layer == "nb_shared_disp": + output_decoder_layer = NegBinSharedDispOutput( + 
original_dim=out_feature_dim,
+            use_node_scale=scale_node_size,
+            name=f"NegBinSharedDisp{IDENTIFIER_OUTPUT_LAYER}_{name}",
+        )
+    elif output_layer == "nb_const_disp":
+        output_decoder_layer = NegBinConstDispOutput(
+            original_dim=out_feature_dim,
+            use_node_scale=scale_node_size,
+            name=f"NegBinConstDisp{IDENTIFIER_OUTPUT_LAYER}_{name}",
+        )
+    else:
+        raise ValueError("tried to access a non-supported output layer %s" % output_layer)
+    return output_decoder_layer
+

 class LinearOutput(tf.keras.layers.Layer):
     """Linear output layer."""
diff --git a/ncem/models/layers/single_gnn_layers.py b/ncem/models/layers/single_gnn_layers.py
new file mode 100644
index 00000000..e17fe164
--- /dev/null
+++ b/ncem/models/layers/single_gnn_layers.py
@@ -0,0 +1,187 @@
+import tensorflow as tf
+
+
+class SingleMaxLayer(tf.keras.layers.Layer):
+    """
+    TODO MAX implementation here is not complete yet.
+    """
+    def call(self, inputs, **kwargs):
+        x = inputs[0]
+        a = inputs[1]
+
+        a = tf.expand_dims(a, axis=-1)  # (batch, target nodes, padded neighbor nodes, 1)
+        t = x * a
+        t = tf.reduce_sum(t, axis=2)
+
+        y = tf.where(t > 0.5, x=tf.divide(t, t), y=tf.multiply(t, tf.constant(0.0, dtype=tf.float32)))
+        return y
+
+
+class SingleGcnLayer(tf.keras.layers.Layer):
+    """
+    TODO GCN implementation here is not complete yet.
+    """
+    def __init__(self, latent_dim, dropout_rate, activation, l2_reg, use_bias: bool = False, **kwargs):
+        super().__init__(**kwargs)
+        self.latent_dim = latent_dim
+        self.dropout_rate = dropout_rate
+        self.activation = tf.keras.activations.get(activation)
+        self.l2_reg = l2_reg
+        self.use_bias = use_bias
+
+        self.kernel = None
+        self.bias = None
+
+    def build(self, input_shapes):
+        input_shape = input_shapes[0]
+        # Layer kernel
+        self.kernel = self.add_weight(
+            name="kernel",
+            shape=(int(input_shape[2]), self.latent_dim),
+            initializer=tf.keras.initializers.glorot_uniform(),
+            regularizer=tf.keras.regularizers.l2(self.l2_reg),
+        )
+        # Layer bias
+        if self.use_bias:
+            self.bias = self.add_weight(name="bias", shape=(self.latent_dim,))
+
+    def call(self, inputs, **kwargs):
+        targets = inputs[0]  # (batch, target nodes, features)
+        neighbors = inputs[1]  # (batch, target nodes, padded neighbor nodes, features)
+        neighborhood = tf.concat([tf.expand_dims(targets, axis=-2), neighbors], axis=-2)
+        a = inputs[2]  # (batch, target nodes, padded neighbor nodes)
+
+        # (batch, target nodes, padded neighbor nodes, latent)
+        y = tf.matmul(neighborhood, self.kernel)
+        y = tf.reduce_sum(y, axis=-2)  # (batch, target nodes, latent)
+        # Normalise neighborhood size in embedding:
+        factor = tf.reduce_sum(a, axis=-1, keepdims=True)  # (batch, target nodes, 1)
+        factor = tf.where(factor > 0.5, x=factor, y=tf.ones_like(factor))
+        y = y / factor
+        y = tf.keras.layers.Dropout(self.dropout_rate)(y)
+
+        if self.use_bias:
+            y = tf.add(y, self.bias)
+        if self.activation is not None:
+            y = self.activation(y)
+
+        return y
+
+
+class SingleLrGatLayer(tf.keras.layers.Layer):
+    """Single ligand-receptor GAT-style layer."""
+
+    def __init__(self, lr_dim, dropout_rate, l2_reg, **kwargs):
+        """Initialize SingleLrGatLayer.
+
+        Parameters
+        ----------
+        lr_dim
+            Number of ligand-receptor features.
+        dropout_rate
+            Dropout rate.
+        l2_reg
+            l2 regularization coefficient.
+        kwargs
+            Arbitrary keyword arguments.
+ """ + super().__init__(**kwargs) + self.lr_dim = lr_dim + self.dropout_rate = dropout_rate + self.l2_reg = l2_reg + + # Layer kernel + self.kernel_l = self.add_weight( + name="kernel_l", + shape=(1, 1, self.lr_dim), + initializer=tf.keras.initializers.glorot_uniform(), + regularizer=tf.keras.regularizers.l2(self.l2_reg), + ) + self.bias_l = self.add_weight(name="bias_l", shape=(1, 1, self.lr_dim,)) + self.kernel_r = self.add_weight( + name="kernel_r", + shape=(1, 1, self.lr_dim), + initializer=tf.keras.initializers.glorot_uniform(), + regularizer=tf.keras.regularizers.l2(self.l2_reg), + ) + self.bias_r = self.add_weight(name="bias_r", shape=(1, 1, self.lr_dim,)) + + def call(self, inputs, **kwargs): + targets_receptor = inputs[0] # (batch, target nodes, lr) + # print('targets_receptor', targets_receptor.shape) + neighbors_ligand = inputs[1] # (batch, target nodes, padded neighbor nodes, lr) + a = inputs[2] # (batch, target nodes, padded neighbor nodes) + + targets_receptor = targets_receptor * self.kernel_r + self.bias_r # (batch, target nodes, lr) + neighbors_ligand = neighbors_ligand * self.kernel_l + self.bias_l # (batch, target nodes, padded neighbor nodes, lr) + targets_receptor = tf.expand_dims(targets_receptor, axis=-2) # (batch, target nodes, 1, lr) + weights = targets_receptor * neighbors_ligand + # print('weights', weights.shape) + # Mask embeddings to neighbors + a = tf.expand_dims(a, axis=-1) # (batch, target nodes, padded neighbor nodes, 1) + weights = weights * a + weights = tf.reduce_sum(weights, axis=2) # (batch, target nodes, lr) + y = tf.math.sigmoid(weights) + + return y + + +class SingleGatLayer(tf.keras.layers.Layer): + + """ + TODO GAT implementation here is not complete yet. + """ + + def __init__(self, in_dim, out_dim, dropout_rate, l2_reg, **kwargs): + """Initialize GatLayer. + + Parameters + ---------- + out_dim + Output dimension. + dropout_rate + Dropout rate. + activation + Activation. + l2_reg + l2 regularization coefficient. + kwargs + Arbitrary keyword arguments. 
+ """ + super().__init__(**kwargs) + self.in_dim = in_dim + self.out_dim = out_dim + self.dropout_rate = dropout_rate + self.l2_reg = l2_reg + + # Layer kernel + self.kernel_target = self.add_weight( + name="kernel_l", + shape=(1, 1, self.in_dim), + initializer=tf.keras.initializers.glorot_uniform(), + regularizer=tf.keras.regularizers.l2(self.l2_reg), + ) + self.bias_target = self.add_weight(name="bias_l", shape=(1, 1, self.in_dim,)) + self.kernel_neighbor = self.add_weight( + name="kernel_r", + shape=(1, 1, self.in_dim), + initializer=tf.keras.initializers.glorot_uniform(), + regularizer=tf.keras.regularizers.l2(self.l2_reg), + ) + self.bias_neighbor = self.add_weight(name="bias_r", shape=(1, 1, 1, self.in_dim,)) + + def call(self, inputs, **kwargs): + targets = inputs[0] # (batch, target nodes, features) + neighbors = inputs[1] # (batch, target nodes, padded neighbor nodes, features) + a = inputs[2] # (batch, target nodes, padded neighbor nodes) + + targets = targets * self.kernel_neighbor + self.bias_neighbor # (batch, target nodes, lr) + neighbors = neighbors * self.kernel_target + self.bias_target # (batch, target nodes, padded neighbor nodes, lr) + targets = tf.expand_dims(targets, axis=-2) # (batch, target nodes, 1, lr) + weights = targets * neighbors + # Mask embeddings to neighbors + a = tf.expand_dims(a, axis=-1) # (batch, target nodes, padded neighbor nodes, 1) + weights = weights * a + weights = tf.reduce_sum(weights, axis=2) # (batch, target nodes, lr) + y = tf.math.sigmoid(weights) + + return y diff --git a/ncem/models/model_cvae.py b/ncem/models/model_cvae.py index 88a35e77..0e28313e 100644 --- a/ncem/models/model_cvae.py +++ b/ncem/models/model_cvae.py @@ -1,10 +1,7 @@ import numpy as np import tensorflow as tf -from ncem.models.layers import (Decoder, Encoder, GaussianConstDispOutput, - GaussianOutput, NegBinConstDispOutput, - NegBinOutput, NegBinSharedDispOutput, - PreprocInput, SamplingPrior) +from ncem.models.layers import (Decoder, Encoder, get_out, PreprocInput, SamplingPrior) class ModelCVAE: @@ -160,88 +157,17 @@ def __init__( sampling_decoder1 = self.decoder_model((latent_sampling_reshaped1, categ_condition)) sampling_decoder2 = self.decoder_model((latent_sampling_reshaped2, categ_condition)) - if output_layer == "gaussian": - output_decoder_layer = GaussianOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder1 = GaussianOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianOutput_sampling", - )((sampling_decoder1, input_node_size)) - output_sampling_decoder2 = GaussianOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianOutput_sampling", - )((sampling_decoder2, input_node_size)) - elif output_layer == "gaussian_const_disp": - output_decoder_layer = GaussianConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianConstDispOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder1 = GaussianConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianConstDispOutput_sampling", - )((sampling_decoder1, input_node_size)) - output_sampling_decoder2 = GaussianConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianConstDispOutput_sampling", - )((sampling_decoder2, input_node_size)) - elif output_layer == 
"nb": - output_decoder_layer = NegBinOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder1 = NegBinOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinOutput_sampling", - )((sampling_decoder1, input_node_size)) - output_sampling_decoder2 = NegBinOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinOutput_sampling", - )((sampling_decoder2, input_node_size)) - elif output_layer == "nb_shared_disp": - output_decoder_layer = NegBinSharedDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinSharedDispOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder1 = NegBinSharedDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinSharedDispOutput_sampling", - )((sampling_decoder1, input_node_size)) - output_sampling_decoder2 = NegBinSharedDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinSharedDispOutput_sampling", - )((sampling_decoder2, input_node_size)) - elif output_layer == "nb_const_disp": - output_decoder_layer = NegBinConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinConstDispOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder1 = NegBinConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinConstDispOutput_decoder", - )((sampling_decoder1, input_node_size)) - output_sampling_decoder2 = NegBinConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinConstDispOutput_decoder", - )((sampling_decoder2, input_node_size)) - else: - raise ValueError("tried to access a non-supported output layer %s" % output_layer) + output_decoder_layer = get_out( + output_layer=output_layer, out_feature_dim=out_node_feature_dim, scale_node_size=scale_node_size + )((output_decoder, input_node_size)) + output_sampling_decoder1 = get_out( + output_layer=output_layer, out_feature_dim=out_node_feature_dim, scale_node_size=scale_node_size, + name='sampling1' + )((sampling_decoder1, input_node_size)) + output_sampling_decoder2 = get_out( + output_layer=output_layer, out_feature_dim=out_node_feature_dim, scale_node_size=scale_node_size, + name='sampling2' + )((sampling_decoder2, input_node_size)) output_decoder_concat = tf.keras.layers.Concatenate(axis=2, name="reconstruction")(output_decoder_layer) output_sampling_concat1 = tf.keras.layers.Concatenate(axis=2, name="reconstruction")(output_sampling_decoder1) diff --git a/ncem/models/model_cvae_ncem.py b/ncem/models/model_cvae_ncem.py index 65111f55..7213198e 100644 --- a/ncem/models/model_cvae_ncem.py +++ b/ncem/models/model_cvae_ncem.py @@ -3,11 +3,7 @@ import numpy as np import tensorflow as tf -from ncem.models.layers import (CondDecoder, CondEncoder, - GaussianConstDispOutput, GaussianOutput, - GCNLayer, MaxLayer, NegBinConstDispOutput, - NegBinOutput, NegBinSharedDispOutput, - PreprocInput, SamplingPrior) +from ncem.models.layers import (CondDecoder, CondEncoder, GCNLayer, MaxLayer, get_out, PreprocInput, SamplingPrior) class ModelCVAEncem: @@ -243,88 +239,17 @@ def __init__( sampling_decoder1 = self.decoder_model((latent_sampling_reshaped1, x_neighbour_embedding, categ_condition)) sampling_decoder2 = 
self.decoder_model((latent_sampling_reshaped2, x_neighbour_embedding, categ_condition)) - if output_layer == "gaussian": - output_decoder_layer = GaussianOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder1 = GaussianOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianOutput_sampling", - )((sampling_decoder1, input_node_size)) - output_sampling_decoder2 = GaussianOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianOutput_sampling", - )((sampling_decoder2, input_node_size)) - elif output_layer == "gaussian_const_disp": - output_decoder_layer = GaussianConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianConstDispOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder1 = GaussianConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianConstDispOutput_sampling", - )((sampling_decoder1, input_node_size)) - output_sampling_decoder2 = GaussianConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianConstDispOutput_sampling", - )((sampling_decoder2, input_node_size)) - elif output_layer == "nb": - output_decoder_layer = NegBinOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder1 = NegBinOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinOutput_sampling", - )((sampling_decoder1, input_node_size)) - output_sampling_decoder2 = NegBinOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinOutput_sampling", - )((sampling_decoder2, input_node_size)) - elif output_layer == "nb_shared_disp": - output_decoder_layer = NegBinSharedDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinSharedDispOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder1 = NegBinSharedDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinSharedDispOutput_sampling", - )((sampling_decoder1, input_node_size)) - output_sampling_decoder2 = NegBinSharedDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinSharedDispOutput_sampling", - )((sampling_decoder2, input_node_size)) - elif output_layer == "nb_const_disp": - output_decoder_layer = NegBinConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinConstDispOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder1 = NegBinConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinConstDispOutput_decoder", - )((sampling_decoder1, input_node_size)) - output_sampling_decoder2 = NegBinConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinConstDispOutput_decoder", - )((sampling_decoder2, input_node_size)) - else: - raise ValueError("tried to access a non-supported output layer %s" % output_layer) + output_decoder_layer = get_out( + output_layer=output_layer, out_feature_dim=out_node_feature_dim, scale_node_size=scale_node_size + )((output_decoder, input_node_size)) + output_sampling_decoder1 = get_out( + 
output_layer=output_layer, out_feature_dim=out_node_feature_dim, scale_node_size=scale_node_size, + name='sampling1' + )((sampling_decoder1, input_node_size)) + output_sampling_decoder2 = get_out( + output_layer=output_layer, out_feature_dim=out_node_feature_dim, scale_node_size=scale_node_size, + name='sampling2' + )((sampling_decoder2, input_node_size)) output_decoder_concat = tf.keras.layers.Concatenate(axis=2, name="reconstruction")(output_decoder_layer) output_sampling_concat1 = tf.keras.layers.Concatenate(axis=2, name="reconstruction")(output_sampling_decoder1) diff --git a/ncem/models/model_ed.py b/ncem/models/model_ed.py index 66d95b31..419adbab 100644 --- a/ncem/models/model_ed.py +++ b/ncem/models/model_ed.py @@ -1,8 +1,6 @@ import tensorflow as tf -from ncem.models.layers import (Decoder, Encoder, GaussianOutput, - NegBinConstDispOutput, NegBinOutput, - NegBinSharedDispOutput) +from ncem.models.layers import (Decoder, Encoder, get_out) class ModelED: @@ -133,52 +131,13 @@ def __init__( output_decoder = self.decoder_model((z, categ_condition)) sampling_decoder = self.decoder_model((latent_sampling_reshaped, categ_condition)) - if output_layer == "gaussian": - output_decoder_layer = GaussianOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder = GaussianOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianOutput_sampling", - )((sampling_decoder, input_node_size)) - elif output_layer == "nb": - output_decoder_layer = NegBinOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder = NegBinOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinOutput_sampling", - )((sampling_decoder, input_node_size)) - elif output_layer == "nb_shared_disp": - output_decoder_layer = NegBinSharedDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinSharedDispOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder = NegBinSharedDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinSharedDispOutput_sampling", - )((sampling_decoder, input_node_size)) - elif output_layer == "nb_const_disp": - output_decoder_layer = NegBinConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinConstDispOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder = NegBinConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinConstDispOutput_sampling", - )((sampling_decoder, input_node_size)) - else: - raise ValueError("tried to access a non-supported output layer %s" % output_layer) + output_decoder_layer = get_out( + output_layer=output_layer, out_feature_dim=out_node_feature_dim, scale_node_size=scale_node_size + )((output_decoder, input_node_size)) + output_sampling_decoder = get_out( + output_layer=output_layer, out_feature_dim=out_node_feature_dim, scale_node_size=scale_node_size, + name='sampling' + )((sampling_decoder, input_node_size)) output_decoder_concat = tf.keras.layers.Concatenate(axis=2, name="reconstruction")(output_decoder_layer) output_sampling_concat = tf.keras.layers.Concatenate(axis=2, name="reconstruction")(output_sampling_decoder) diff --git 
a/ncem/models/model_ed_ncem.py b/ncem/models/model_ed_ncem.py index 8ee5313f..12081f95 100644 --- a/ncem/models/model_ed_ncem.py +++ b/ncem/models/model_ed_ncem.py @@ -2,9 +2,7 @@ import tensorflow as tf -from ncem.models.layers import (Decoder, Encoder, GaussianOutput, GCNLayer, - MaxLayer, NegBinConstDispOutput, NegBinOutput, - NegBinSharedDispOutput) +from ncem.models.layers import (Decoder, Encoder, GCNLayer, MaxLayer, get_out) class ModelEDncem: @@ -213,52 +211,13 @@ def __init__( output_decoder = self.decoder_model((z, categ_condition)) sampling_decoder = self.decoder_model((latent_sampling_reshaped, categ_condition)) - if output_layer == "gaussian": - output_decoder_layer = GaussianOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder = GaussianOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="GaussianOutput_sampling", - )((sampling_decoder, input_node_size)) - elif output_layer == "nb": - output_decoder_layer = NegBinOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder = NegBinOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinOutput_sampling", - )((sampling_decoder, input_node_size)) - elif output_layer == "nb_shared_disp": - output_decoder_layer = NegBinSharedDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinSharedDispOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder = NegBinSharedDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinSharedDispOutput_sampling", - )((sampling_decoder, input_node_size)) - elif output_layer == "nb_const_disp": - output_decoder_layer = NegBinConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinConstDispOutput_decoder", - )((output_decoder, input_node_size)) - output_sampling_decoder = NegBinConstDispOutput( - original_dim=out_node_feature_dim, - use_node_scale=scale_node_size, - name="NegBinConstDispOutput_sampling", - )((sampling_decoder, input_node_size)) - else: - raise ValueError("tried to access a non-supported output layer %s" % output_layer) + output_decoder_layer = get_out( + output_layer=output_layer, out_feature_dim=out_node_feature_dim, scale_node_size=scale_node_size + )((output_decoder, input_node_size)) + output_sampling_decoder = get_out( + output_layer=output_layer, out_feature_dim=out_node_feature_dim, scale_node_size=scale_node_size, + name='sampling' + )((sampling_decoder, input_node_size)) output_decoder_concat = tf.keras.layers.Concatenate(axis=2, name="reconstruction")(output_decoder_layer) output_sampling_concat = tf.keras.layers.Concatenate(axis=2, name="reconstruction")(output_sampling_decoder) diff --git a/ncem/models/model_ed_single_ncem.py b/ncem/models/model_ed_single_ncem.py new file mode 100644 index 00000000..d295ec43 --- /dev/null +++ b/ncem/models/model_ed_single_ncem.py @@ -0,0 +1,158 @@ +import tensorflow as tf + +from ncem.models.layers import get_out, Decoder +from ncem.models.layers.single_gnn_layers import SingleMaxLayer, SingleGcnLayer, SingleGatLayer, SingleLrGatLayer + + +class ModelEd2Ncem: + """Model class for NCEM encoder-decoder with graph layer IND (MAX) or GCN.""" + + def __init__( + self, + input_shapes, + 
latent_dim: tuple,
+        dropout_rate: float,
+        l2_coef: float,
+        l1_coef: float,
+        cond_type: str,
+        use_type_cond: bool,
+        use_domain: bool,
+        scale_node_size: bool,
+        output_layer: str,
+        dec_intermediate_dim: int,
+        dec_n_hidden: int,
+        dec_dropout_rate: float,
+        dec_l1_coef: float,
+        dec_l2_coef: float,
+        dec_use_batch_norm: bool,
+        **kwargs,
+    ):
+        super().__init__()
+        self.args = {
+            "input_shapes": input_shapes,
+            "latent_dim": latent_dim,
+            "dropout_rate": dropout_rate,
+            "l2_coef": l2_coef,
+            "l1_coef": l1_coef,
+            "cond_type": cond_type,
+            "use_domain": use_domain,
+            "use_type_cond": use_type_cond,
+            "scale_node_size": scale_node_size,
+            "output_layer": output_layer,
+            "dec_intermediate_dim": dec_intermediate_dim,
+            "dec_n_hidden": dec_n_hidden,
+            "dec_dropout_rate": dec_dropout_rate,
+            "dec_l1_coef": dec_l1_coef,
+            "dec_l2_coef": dec_l2_coef,
+            "dec_use_batch_norm": dec_use_batch_norm,
+        }
+        in_lr_feature_dim = input_shapes[0]
+        out_feature_dim = input_shapes[1]
+        num_targets_dim = input_shapes[2]
+        neighbors_dim = input_shapes[3]
+        categ_condition_dim = input_shapes[4]
+        domain_dim = input_shapes[5]
+
+        # node features - node representation of targets: Input Tensor - shape=(None, targets, F-in)
+        input_x_targets = tf.keras.Input(shape=(num_targets_dim, in_lr_feature_dim), name="node_features_targets")
+        # node features - node representation of neighbors: Input Tensor - shape=(None, targets, neighbors, F-in)
+        input_x_neighbors = tf.keras.Input(shape=(num_targets_dim, neighbors_dim, in_lr_feature_dim),
+                                           name="node_features_neighbors")
+        # node size - reconstruction: Input Tensor - shape=(None, targets, 1)
+        input_node_size = tf.keras.Input(shape=(num_targets_dim, 1), name="node_size_reconstruct")
+        # adj_matrices - A: Input Tensor - shape=(None, targets, neighbors)
+        input_a = tf.keras.Input(shape=(num_targets_dim, neighbors_dim), name="adjacency_matrix")
+        # Categorical predictors: Input Tensor - shape=(None, targets, P)
+        input_categ_condition = tf.keras.Input(shape=(num_targets_dim, categ_condition_dim),
+                                               name="categorical_predictor")
+        # domain information of graph - shape=(None, 1)
+        input_g = tf.keras.layers.Input(shape=(domain_dim,), name="input_da_group", dtype="int32")
+
+        if use_domain:
+            categ_condition = tf.concat(
+                [
+                    input_categ_condition,
+                    tf.tile(tf.expand_dims(tf.cast(input_g, dtype="float32"), axis=-2), [1, num_targets_dim, 1]),
+                ],
+                axis=-1,
+            )
+        else:
+            categ_condition = input_categ_condition
+
+        if cond_type == "lr_gat":
+            x_encoder = SingleLrGatLayer(
+                lr_dim=in_lr_feature_dim,
+                dropout_rate=dropout_rate,
+                l2_reg=l2_coef,
+                name="lr_gat_layer",
+            )([input_x_targets, input_x_neighbors, input_a])
+        elif cond_type == "gat":
+            x_encoder = SingleGatLayer(
+                in_dim=in_lr_feature_dim,
+                out_dim=latent_dim,
+                dropout_rate=dropout_rate,
+                l2_reg=l2_coef,
+                name="gat_layer",
+            )([input_x_targets, input_x_neighbors, input_a])
+        elif cond_type == "max":
+            x_encoder = SingleMaxLayer(
+                name="max_layer"
+            )([input_x_neighbors, input_a])
+        elif cond_type == "gcn":
+            x_encoder = SingleGcnLayer(
+                latent_dim=latent_dim,
+                dropout_rate=dropout_rate,
+                activation="relu",
+                l2_reg=l2_coef,
+                use_bias=True,
+                name="gcn_layer"
+            )([input_x_targets, input_x_neighbors, input_a])
+        else:
+            raise ValueError("tried to access a non-supported conditional layer %s" % cond_type)
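+
+        # Editor's note (sketch): whichever branch is taken, x_encoder is the
+        # per-target neighborhood embedding, e.g. (batch, targets, lr_dim) for
+        # lr_gat and max or (batch, targets, latent_dim) for gcn; it is also
+        # exposed as the output of self.encoder below.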
diff --git a/ncem/train/train_model.py b/ncem/train/train_model.py
index 16a195b1..ff923765 100644
--- a/ncem/train/train_model.py
+++ b/ncem/train/train_model.py
@@ -3,7 +3,7 @@
 from typing import Union
 
 from ncem.estimators import (EstimatorCVAE, EstimatorCVAEncem, EstimatorED,
-                             EstimatorEDncem, EstimatorInteractions,
+                             EstimatorEDncem, EstimatorEdNcemNeighborhood, EstimatorInteractions,
                              EstimatorLinear)
 
 
@@ -127,7 +127,29 @@ def _save_specific(self, fn):
         self._save_evaluation_per_node_type(fn=fn)
 
 
-class TrainModelEDncem(TrainModel):
+class TrainModelEDncemBase:
+
+    estimator: Union[EstimatorEDncem, EstimatorEdNcemNeighborhood]
+
+    def _save_embedding(self, fn):
+        splits = {
+            "eval": {"img_keys": self.estimator.img_keys_eval, "node_idx": self.estimator.nodes_idx_eval},
+            "test": {"img_keys": self.estimator.img_keys_test, "node_idx": self.estimator.nodes_idx_test},
+            "train": {"img_keys": self.estimator.img_keys_train, "node_idx": self.estimator.nodes_idx_train},
+        }
+        embeddings = {}
+        for k, v in splits.items():
+            embeddings[k] = self.estimator.predict_embedding_any(**v)
+        with open(fn + "_embeddings.pickle", "wb") as f:
+            pickle.dump(obj=embeddings, file=f)
+
+    def _save_output_weights(self, fn):
+        w = self.estimator.get_decoding_weights()
+        with open(fn + "_output_weights.pickle", "wb") as f:
+            pickle.dump(obj=w, file=f)
+
+
+class TrainModelEDncem(TrainModel, TrainModelEDncemBase):
     estimator: EstimatorEDncem
 
     def init_estim(self, **kwargs):
@@ -137,6 +159,25 @@ def _save_specific(self, fn):
         self._save_evaluation_per_node_type(fn=fn)
 
 
+class TrainModelEdSingleNcem(TrainModel, TrainModelEDncemBase):
+    estimator: EstimatorEdNcemNeighborhood
+
+    def init_estim(self, **kwargs):
+        self.estimator = EstimatorEdNcemNeighborhood(**kwargs)
+
+    def _save_lr_feature_names(self, fn):
+        feature_names = {
+            "features": self.estimator.features,
+            "target_feature_names": self.estimator.target_feature_names,
+            "neighbor_feature_names": self.estimator.neighbor_feature_names,
+        }
+        self._try_save(fn + "_lr_features.pickle", feature_names)
+
+    def _save_specific(self, fn):
+        self._save_evaluation_per_node_type(fn=fn)
+        self._save_lr_feature_names(fn=fn)
+
+
 class TrainModelCVAEBase(TrainModel):
 
     estimator: Union[EstimatorCVAE, EstimatorCVAEncem]
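The two pickle helpers in `TrainModelEDncemBase` write one artifact per run prefix. Assuming a hypothetical prefix `fn` used during training, the embedding file can be read back as a plain dict keyed by split:

    import pickle

    fn = "path/to/run_prefix"  # hypothetical prefix passed to _save_embedding
    with open(fn + "_embeddings.pickle", "rb") as f:
        embeddings = pickle.load(f)
    # one entry per split, each the return value of predict_embedding_any
    for split in ("train", "eval", "test"):
        print(split, type(embeddings[split]))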
+""" + +import os + +DIR_TEMP = os.path.join(os.path.dirname(__file__), "temp") +DIR_TEMP_DATA = os.path.join(DIR_TEMP, "data") + +DATA_PATH_ZHANG = os.path.join(DIR_TEMP_DATA, "zhang") +DATA_PATH_JAROSCH = os.path.join(DIR_TEMP_DATA, "busch") +DATA_PATH_HARTMANN = os.path.join(DIR_TEMP_DATA, "hartmann") +DATA_PATH_SCHUERCH = os.path.join(DIR_TEMP_DATA, "schuerch") +DATA_PATH_LU = os.path.join(DIR_TEMP_DATA, "lu") diff --git a/ncem/unit_test/test_dataloader_local.py b/ncem/unit_test/test_dataloader_local.py index abc997d2..f1e95f51 100644 --- a/ncem/unit_test/test_dataloader_local.py +++ b/ncem/unit_test/test_dataloader_local.py @@ -1,84 +1,57 @@ -import unittest +import pytest +from ncem.unit_test.directories import DATA_PATH_ZHANG, DATA_PATH_JAROSCH, DATA_PATH_HARTMANN, DATA_PATH_SCHUERCH, \ + DATA_PATH_LU -class TestDataLoader(unittest.TestCase): + +class HelperTestDataLoader: data_origin: str - data_path: str - def get_dataloader(self): - if self.data_origin == "zhang": + def get_dataloader(self, data_origin: str): + if data_origin == "zhang": from ncem.data import DataLoaderZhang as DataLoader radius = 100 label_selection = [] - elif self.data_origin == "jarosch": + data_path = DATA_PATH_ZHANG + elif data_origin == "jarosch": from ncem.data import DataLoaderJarosch as DataLoader radius = 100 label_selection = [] - elif self.data_origin == "hartmann": + data_path = DATA_PATH_JAROSCH + elif data_origin == "hartmann": from ncem.data import DataLoaderHartmann as DataLoader radius = 100 label_selection = ["Diagnosis", "Age", "Sex"] - elif self.data_origin == "pascualreguant": + data_path = DATA_PATH_HARTMANN + elif data_origin == "pascualreguant": from ncem.data import DataLoaderPascualReguant as DataLoader radius = 100 label_selection = [] - elif self.data_origin == "schuerch": + data_path = None # TODO + elif data_origin == "schuerch": from ncem.data import DataLoaderSchuerch as DataLoader radius = 100 label_selection = ["Group"] - else: - from ncem.data import DataLoaderZhang as DataLoader + data_path = DATA_PATH_SCHUERCH + elif data_origin == "luwt": + from ncem.data import DataLoaderSchuerch as DataLoader radius = 100 - label_selection = [] - - self.data = DataLoader(data_path=self.data_path, radius=radius, label_selection=label_selection) - - -class TestDataLoaderZang(TestDataLoader, unittest.TestCase): - data_path = "/Users/anna.schaar/phd/datasets/zhang/" - data_origin = "zhang" - - def test_data_types(self): - self.get_dataloader() - - -class TestDataLoaderJarosch(TestDataLoader, unittest.TestCase): - data_path = "/Users/anna.schaar/phd/datasets/busch/" - data_origin = "jarosch" - - def test_data_types(self): - self.get_dataloader() - - -class TestDataLoaderHartmann(TestDataLoader, unittest.TestCase): - data_path = "/Users/anna.schaar/phd/datasets/hartmann/" - data_origin = "hartmann" - - def test_data_types(self): - self.get_dataloader() - - -class TestDataLoaderPascualReguant(TestDataLoader, unittest.TestCase): - data_path = "/Users/anna.schaar/phd/datasets/pascualreguant/" - data_origin = "pascualreguant" - - def test_data_types(self): - self.get_dataloader() - - -class TestDataLoaderSchuerch(TestDataLoader, unittest.TestCase): - data_path = "/Users/anna.schaar/phd/datasets/schuerch/" - data_origin = "schuerch" + label_selection = ["Group"] + data_path = DATA_PATH_LU + else: + assert False - def test_data_types(self): - self.get_dataloader() + self.data = DataLoader(data_path=data_path, radius=radius, label_selection=label_selection) + self.data_origin = data_origin -if 
diff --git a/ncem/unit_test/test_dataloader_local.py b/ncem/unit_test/test_dataloader_local.py
index abc997d2..f1e95f51 100644
--- a/ncem/unit_test/test_dataloader_local.py
+++ b/ncem/unit_test/test_dataloader_local.py
@@ -1,84 +1,57 @@
-import unittest
+import pytest
+
+from ncem.unit_test.directories import DATA_PATH_ZHANG, DATA_PATH_JAROSCH, DATA_PATH_HARTMANN, DATA_PATH_SCHUERCH, \
+    DATA_PATH_LU
 
 
-class TestDataLoader(unittest.TestCase):
+class HelperTestDataLoader:
     data_origin: str
-    data_path: str
 
-    def get_dataloader(self):
-        if self.data_origin == "zhang":
+    def get_dataloader(self, data_origin: str):
+        if data_origin == "zhang":
             from ncem.data import DataLoaderZhang as DataLoader
 
             radius = 100
             label_selection = []
-        elif self.data_origin == "jarosch":
+            data_path = DATA_PATH_ZHANG
+        elif data_origin == "jarosch":
             from ncem.data import DataLoaderJarosch as DataLoader
 
             radius = 100
             label_selection = []
-        elif self.data_origin == "hartmann":
+            data_path = DATA_PATH_JAROSCH
+        elif data_origin == "hartmann":
             from ncem.data import DataLoaderHartmann as DataLoader
 
             radius = 100
             label_selection = ["Diagnosis", "Age", "Sex"]
-        elif self.data_origin == "pascualreguant":
+            data_path = DATA_PATH_HARTMANN
+        elif data_origin == "pascualreguant":
             from ncem.data import DataLoaderPascualReguant as DataLoader
 
             radius = 100
             label_selection = []
-        elif self.data_origin == "schuerch":
+            data_path = None  # TODO
+        elif data_origin == "schuerch":
             from ncem.data import DataLoaderSchuerch as DataLoader
 
             radius = 100
             label_selection = ["Group"]
-        else:
-            from ncem.data import DataLoaderZhang as DataLoader
+            data_path = DATA_PATH_SCHUERCH
+        elif data_origin == "luwt":
+            from ncem.data import DataLoaderSchuerch as DataLoader
 
             radius = 100
-            label_selection = []
-
-        self.data = DataLoader(data_path=self.data_path, radius=radius, label_selection=label_selection)
-
-
-class TestDataLoaderZang(TestDataLoader, unittest.TestCase):
-    data_path = "/Users/anna.schaar/phd/datasets/zhang/"
-    data_origin = "zhang"
-
-    def test_data_types(self):
-        self.get_dataloader()
-
-
-class TestDataLoaderJarosch(TestDataLoader, unittest.TestCase):
-    data_path = "/Users/anna.schaar/phd/datasets/busch/"
-    data_origin = "jarosch"
-
-    def test_data_types(self):
-        self.get_dataloader()
-
-
-class TestDataLoaderHartmann(TestDataLoader, unittest.TestCase):
-    data_path = "/Users/anna.schaar/phd/datasets/hartmann/"
-    data_origin = "hartmann"
-
-    def test_data_types(self):
-        self.get_dataloader()
-
-
-class TestDataLoaderPascualReguant(TestDataLoader, unittest.TestCase):
-    data_path = "/Users/anna.schaar/phd/datasets/pascualreguant/"
-    data_origin = "pascualreguant"
-
-    def test_data_types(self):
-        self.get_dataloader()
-
-
-class TestDataLoaderSchuerch(TestDataLoader, unittest.TestCase):
-    data_path = "/Users/anna.schaar/phd/datasets/schuerch/"
-    data_origin = "schuerch"
+            label_selection = ["Group"]
+            data_path = DATA_PATH_LU
+        else:
+            assert False
 
-    def test_data_types(self):
-        self.get_dataloader()
+        self.data = DataLoader(data_path=data_path, radius=radius, label_selection=label_selection)
+        self.data_origin = data_origin
 
-if __name__ == "__main__":
-    unittest.main()
+
+@pytest.mark.parametrize("dataset", ["luwt", "zhang", "jarosch", "hartmann"])
+def test_data_types(dataset):
+    loader = HelperTestDataLoader()
+    loader.get_dataloader(data_origin=dataset)
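With the suite parametrized over dataset names, individual loaders can be exercised selectively once the corresponding raw files are in place, e.g. `pytest ncem/unit_test/test_dataloader_local.py -k hartmann`.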
self._test_train(model="linear", data_origin="hartmann") - # self._test_train(model='interactions_baseline', data_origin='hartmann') - self._test_train(model="interactions", data_origin="hartmann") - - def test_ed(self): - # self._test_train(model="ed", data_origin="hartmann") - self._test_train(model="ed_ncem_max", data_origin="hartmann") - # self._test_train(model="ed_ncem_gcn", data_origin="hartmann") - - def test_cvae(self): - # self._test_train(model="cvae", data_origin="hartmann") - self._test_train(model="cvae_ncem_max", data_origin="hartmann") - # self._test_train(model="cvae_ncem_gcn", data_origin="hartmann") + +class HelperTestEstimatorEd(HelperTestEstimator): + + est: Union[ncem.train.EstimatorED, ncem.train.EstimatorEDncem, ncem.train.EstimatorEdNcemNeighborhood] + + def test_embedding(self): + _ = self.est.predict_embedding_any(img_keys=self.est.img_keys_test, node_idx=self.est.nodes_idx_test) + + def test_decoding_weights(self): + _ = self.est.get_decoding_weights() + + +@pytest.mark.parametrize("dataset", ["luwt"]) +@pytest.mark.parametrize("model", ["interactions"]) +def test_linear(dataset: str, model: str): + estim = HelperTestEstimator() + estim.test_train(model=model, data_origin=dataset) + + +@pytest.mark.parametrize("dataset", ["luwt"]) +@pytest.mark.parametrize("model", ["ed_ncem_max"]) +def test_ed(dataset: str, model: str): + estim = HelperTestEstimator() + estim.test_train(model=model, data_origin=dataset) + + +@pytest.mark.parametrize("dataset", ["luwt"]) +@pytest.mark.parametrize("model", ["cvae_ncem_max"]) +def test_cvae(dataset: str, model: str): + estim = HelperTestEstimator() + estim.test_train(model=model, data_origin=dataset) + + +@pytest.mark.parametrize("dataset", ["luwt"]) +#@pytest.mark.parametrize("model", ["ed_ncem2_max", "ed_ncem2_gcn", "ed_ncem2_lr_gat", "ed_ncem2_gat"]) +@pytest.mark.parametrize("model", ["ed_ncem2_lr_gat"]) +def test_ed2(dataset: str, model: str): + estim = HelperTestEstimatorEd() + estim.test_train(model=model, data_origin=dataset) + estim.test_embedding() + estim.test_decoding_weights() diff --git a/pyproject.toml b/pyproject.toml index 932f059a..28ed8310 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,8 @@ [tool.poetry] name = "ncem" -version = "0.4.6" # <> +version = "0.4.8" # <> description = "ncem. Learning cell communication from spatial graphs of cells." -authors = ["David S. Fischer ", "Anna C. Schaar "] +authors = ["Anna C. Schaar "] license = "BSD" readme = "README.rst" homepage = "https://github.com/theislab/ncem"