Merge branch 'new-version'

# Conflicts: # leaderboard/leaderboard_gen.py
sunlabuiuc · Nov 10, 2022 · d3bf727 · d3bf727
2 parents 98c0d9e + 4141d5e
commit d3bf727
Show file tree

Hide file tree

Showing 75 changed files with 6,383 additions and 4,955 deletions.
diff --git a/README.rst b/README.rst
@@ -123,80 +123,58 @@ An ML Pipeline Example
  mimic3dataset = MIMIC3Dataset(
  root="https://storage.googleapis.com/pyhealth/mimiciii/1.4/", 
  tables=["DIAGNOSES_ICD", "PROCEDURES_ICD", "PRESCRIPTIONS"],
- code_mapping={"NDC": "ATC"}, # map all NDC codes to ATC codes in these tables
+ # map all NDC codes to ATC 3rd-level codes in these tables
+ code_mapping={"NDC": ("ATC", {"target_kwargs": {"level": 3}})},
  )
 
 * **STEP 2: <pyhealth.tasks>** inputs the ``<pyhealth.datasets>`` object and defines how to process each patient's data into a set of samples for the tasks. In the package, we provide several task examples, such as ``drug recommendation`` and ``length of stay prediction``.
 
 .. code-block:: python
 
  from pyhealth.tasks import drug_recommendation_mimic3_fn
- from pyhealth.datasets.splitter import split_by_patient
- from torch.utils.data import DataLoader
- from pyhealth.utils import collate_fn_dict
+ from pyhealth.datasets import split_by_patient, get_dataloader
 
- mimic3dataset.set_task(task_fn=drug_recommendation_mimic3_fn) # use default drugrec task
+ mimic3dataset.set_task(task_fn=drug_recommendation_mimic3_fn) # use default task
  train_ds, val_ds, test_ds = split_by_patient(mimic3dataset, [0.8, 0.1, 0.1])
 
  # create dataloaders
- train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn_dict)
- val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, collate_fn=collate_fn_dict)
- test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, collate_fn=collate_fn_dict)
+ train_loader = get_dataloader(train_ds, batch_size=32, shuffle=True)
+ val_loader = get_dataloader(val_ds, batch_size=32, shuffle=False)
+ test_loader = get_dataloader(test_ds, batch_size=32, shuffle=False)
 
-* **STEP 3: <pyhealth.models>** provides the healthcare ML models using ``<pyhealth.datasets>``. This module also provides model layers, such as ``pyhealth.models.RETAINLayer`` for building customized ML architectures. Our model layers can used as easily as ``torch.nn.Linear``.
+* **STEP 3: <pyhealth.models>** provides the healthcare ML models using ``<pyhealth.models>``. This module also provides model layers, such as ``pyhealth.models.RETAINLayer`` for building customized ML architectures. Our model layers can be used as easily as ``torch.nn.Linear``.
 
 .. code-block:: python
- 
+
  from pyhealth.models import Transformer
 
- device = "cuda:0"
  model = Transformer(
- dataset=mimic3dataset,
- tables=["conditions", "procedures"],
+ dataset=mimic3dataset,
+ feature_keys=["conditions", "procedures"],
+ label_key="drugs",
  mode="multilabel",
+ operation_level="visit",
  )
- model.to(device)
 
 * **STEP 4: <pyhealth.trainer>** is the training manager with ``train_loader``, the ``val_loader``, ``val_metric``, and specify other arguments, such as epochs, optimizer, learning rate, etc. The trainer will automatically save the best model and output the path in the end.
 
 .. code-block:: python
  
  from pyhealth.trainer import Trainer
- from pyhealth.metrics import pr_auc_multilabel
- import torch
-
- trainer = Trainer(enable_logging=True, output_path="../output", device=device)
- trainer.fit(model,
- train_loader=train_loader,
- epochs=10,
- optimizer_class=torch.optim.Adam,
- optimizer_params={"lr": 1e-3, "weight_decay": 1e-5},
- val_loader=val_loader,
- val_metric=pr_auc_multilabel,
+
+ trainer = Trainer(model=model)
+ trainer.train(
+ train_dataloader=train_loader,
+ val_dataloader=val_loader,
+ epochs=50,
+ monitor="pr_auc_samples",
  )
- # Best model saved to: ../output/221004-015401/best.ckpt
 
 * **STEP 5: <pyhealth.metrics>** provides: (i) **common evaluation metrics** and the usage is the same as ``<pyhealth.metrics>``; (ii) **metrics (weighted by patients)** for patient-level tasks; (iii) **special metrics** in healthcare, such as drug-drug interaction (DDI) rate.
 
 .. code-block:: python
  
- from pyhealth.evaluator import evaluate
- from pyhealth.metrics import accuracy_multilabel, jaccard_multilabel, f1_multilabel
-
- # load best model and do inference
- model = trainer.load_best_model(model)
- y_gt, y_prob, y_pred = evaluate(model, test_loader, device)
-
- jaccard = jaccard_multilabel(y_gt, y_pred)
- accuracy = accuracy_multilabel(y_gt, y_pred)
- f1 = f1_multilabel(y_gt, y_pred)
- prauc = pr_auc_multilabel(y_gt, y_prob)
-
- print("jaccard: ", jaccard)
- print("accuracy: ", accuracy)
- print("f1: ", f1)
- print("prauc: ", prauc)
-
+ trainer.evaluate(test_loader)
 
 Medical Code Map
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -208,20 +186,22 @@ Medical Code Map
 .. code-block:: python
 
  from pyhealth.medcode import CrossMap
- codemap = CrossMap("ICD9CM", "CCSCM")
+
+ codemap = CrossMap.load("ICD9CM", "CCSCM")
  codemap.map("82101") # use it like a dict
 
- codemap = CrossMap("NDC", "ATC", level=3)
+ codemap = CrossMap.load("NDC", "ATC")
  codemap.map("00527051210")
 
 * For code ontology lookup within one system
 
 .. code-block:: python
 
  from pyhealth.medcode import InnerMap
- ICD9CM = InnerMap("ICD9CM")
- ICD9CM.lookup("428.0") # get detailed info
- ICD9CM.get_ancestors("428.0") # get parents
+
+ icd9cm = InnerMap.load("ICD9CM")
+ icd9cm.lookup("428.0") # get detailed info
+ icd9cm.get_ancestors("428.0") # get parents
 
 Medical Code Tokenizer
 ^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/docs/api/data/pyhealth.data.Patient.rst b/docs/api/data/pyhealth.data.Patient.rst
@@ -1,7 +1,9 @@
 pyhealth.data.Patient
 =========================
 
-Another basic data structure in the package. A Patient is a collection of Visit for the current patients. It contains all necessary attributes of a patient, such as ethnicity, mortality status, gender, etc. It can support various healthcare tasks.
+Another basic data structure in the package. A Patient is a collection of Visits for
+the current patient. It contains all necessary attributes of a patient, such as
+ethnicity, mortality status, gender, etc. It can support various healthcare tasks.
 
 .. autoclass:: pyhealth.data.Patient
  :members:

diff --git a/docs/api/data/pyhealth.data.Visit.rst b/docs/api/data/pyhealth.data.Visit.rst
@@ -1,7 +1,10 @@
 pyhealth.data.Visit
 =========================
 
-Another basic data structure in the package. A Visit is a single encounter in hispital. It is a container a sequence of Event for each information aspect, such as diagnosis or medications. It also contains other necessary attributes for supporting healthcare tasks, such as the date of the visit.
+Another basic data structure in the package. A Visit is a single encounter in
+hospital. It is a container for a sequence of Events for each information aspect,
+such as diagnosis or medications. It also contains other necessary attributes
+for supporting healthcare tasks, such as the date of the visit.
 
 .. autoclass:: pyhealth.data.Visit
  :members:

diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst
@@ -10,4 +10,5 @@ Datasets
  datasets/pyhealth.datasets.eICUDataset
  datasets/pyhealth.datasets.OMOPDataset
  datasets/pyhealth.datasets.splitter
+ datasets/pyhealth.datasets.utils
 
diff --git a/docs/api/datasets/pyhealth.datasets.utils.rst b/docs/api/datasets/pyhealth.datasets.utils.rst
@@ -0,0 +1,13 @@
+pyhealth.datasets.utils
+===================================
+
+Several utility functions.
+
+.. automodule:: pyhealth.datasets.utils
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+
+
diff --git a/docs/api/evaluator.rst b/docs/api/evaluator.rst
diff --git a/docs/api/metrics.rst b/docs/api/metrics.rst
@@ -10,4 +10,4 @@ tasks, such as drug drug interaction (DDI) rate.
 
  metrics/pyhealth.metrics.multiclass
  metrics/pyhealth.metrics.multilabel
- metrics/pyhealth.metrics.multiclass_avg_patient
+ metrics/pyhealth.metrics.binary
diff --git a/docs/api/metrics/pyhealth.metrics.binary.rst b/docs/api/metrics/pyhealth.metrics.binary.rst
@@ -0,0 +1,6 @@
+pyhealth.metrics.binary
+===================================
+
+.. currentmodule:: pyhealth.metrics.binary
+
+.. autofunction:: binary_metrics_fn
diff --git a/docs/api/metrics/pyhealth.metrics.multiclass.rst b/docs/api/metrics/pyhealth.metrics.multiclass.rst
@@ -3,13 +3,5 @@
 
 .. currentmodule:: pyhealth.metrics.multiclass
 
-.. autofunction:: accuracy_score
-.. autofunction:: precision_score
-.. autofunction:: recall_score
-.. autofunction:: f1_score
-.. autofunction:: roc_auc_score
-.. autofunction:: average_precision_score
-.. autofunction:: jaccard_score
-.. autofunction:: cohen_kappa_score
-.. autofunction:: r2_score
-.. autofunction:: confusion_matrix
+.. autofunction:: multiclass_metrics_fn
+
diff --git a/docs/api/metrics/pyhealth.metrics.multiclass_avg_patient.rst b/docs/api/metrics/pyhealth.metrics.multiclass_avg_patient.rst
diff --git a/docs/api/metrics/pyhealth.metrics.multilabel.rst b/docs/api/metrics/pyhealth.metrics.multilabel.rst
@@ -3,13 +3,4 @@
 
 .. currentmodule:: pyhealth.metrics.multilabel
 
-.. autofunction:: accuracy_multilabel
-.. autofunction:: precision_multilabel
-.. autofunction:: recall_multilabel
-.. autofunction:: f1_multilabel
-.. autofunction:: roc_auc_multilabel
-.. autofunction:: pr_auc_multilabel
-.. autofunction:: jaccard_multilabel
-.. autofunction:: cohen_kappa_multilabel
-.. autofunction:: r2_score_multilabel
-.. autofunction:: ddi_rate_score
+.. autofunction:: multilabel_metrics_fn
diff --git a/docs/index.rst b/docs/index.rst
@@ -100,77 +100,55 @@ An ML Pipeline Example
  mimic3dataset = MIMIC3Dataset(
  root="https://storage.googleapis.com/pyhealth/mimiciii/1.4/", 
  tables=["DIAGNOSES_ICD", "PROCEDURES_ICD", "PRESCRIPTIONS"],
- code_mapping={"NDC": "ATC"}, # map all NDC codes to ATC codes in these tables
+ # map all NDC codes to ATC 3rd-level codes in these tables
+ code_mapping={"NDC": ("ATC", {"target_kwargs": {"level": 3}})},
  )
 
 * **STEP 2: <pyhealth.tasks>** inputs the ``<pyhealth.datasets>`` object and defines how to process each patient's data into a set of samples for the tasks. In the package, we provide several task examples, such as ``drug recommendation`` and ``length of stay prediction``.
 .. code-block:: python
 
  from pyhealth.tasks import drug_recommendation_mimic3_fn
- from pyhealth.datasets.splitter import split_by_patient
- from torch.utils.data import DataLoader
- from pyhealth.utils import collate_fn_dict
+ from pyhealth.datasets import split_by_patient, get_dataloader
 
- mimic3dataset.set_task(task_fn=drug_recommendation_mimic3_fn) # use default drugrec task
+ mimic3dataset.set_task(task_fn=drug_recommendation_mimic3_fn) # use default task
  train_ds, val_ds, test_ds = split_by_patient(mimic3dataset, [0.8, 0.1, 0.1])
 
  # create dataloaders
- train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn_dict)
- val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, collate_fn=collate_fn_dict)
- test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, collate_fn=collate_fn_dict)
+ train_loader = get_dataloader(train_ds, batch_size=32, shuffle=True)
+ val_loader = get_dataloader(val_ds, batch_size=32, shuffle=False)
+ test_loader = get_dataloader(test_ds, batch_size=32, shuffle=False)
 
-* **STEP 3: <pyhealth.models>** provides the healthcare ML models using ``<pyhealth.datasets>``. This module also provides model layers, such as ``pyhealth.models.RETAINLayer`` for building customized ML architectures. Our model layers can used as easily as ``torch.nn.Linear``.
+* **STEP 3: <pyhealth.models>** provides the healthcare ML models using ``<pyhealth.models>``. This module also provides model layers, such as ``pyhealth.models.RETAINLayer`` for building customized ML architectures. Our model layers can be used as easily as ``torch.nn.Linear``.
 .. code-block:: python
  
  from pyhealth.models import Transformer
 
- device = "cuda:0"
  model = Transformer(
- dataset=mimic3dataset,
- tables=["conditions", "procedures"],
+ dataset=mimic3dataset,
+ feature_keys=["conditions", "procedures"],
+ label_key="drugs",
  mode="multilabel",
+ operation_level="visit",
  )
- model.to(device)
 
 * **STEP 4: <pyhealth.trainer>** is the training manager with ``train_loader``, the ``val_loader``, ``val_metric``, and specify other arguments, such as epochs, optimizer, learning rate, etc. The trainer will automatically save the best model and output the path in the end.
 .. code-block:: python
  
  from pyhealth.trainer import Trainer
- from pyhealth.metrics import pr_auc_multilabel
- import torch
-
- trainer = Trainer(enable_logging=True, output_path="../output", device=device)
- trainer.fit(model,
- train_loader=train_loader,
- epochs=10,
- optimizer_class=torch.optim.Adam,
- optimizer_params={"lr": 1e-3, "weight_decay": 1e-5},
- val_loader=val_loader,
- val_metric=pr_auc_multilabel,
+
+ trainer = Trainer(model=model)
+ trainer.train(
+ train_dataloader=train_loader,
+ val_dataloader=val_loader,
+ epochs=50,
+ monitor="pr_auc_samples",
  )
- # Best model saved to: ../output/221004-015401/best.ckpt
 
 * **STEP 5: <pyhealth.metrics>** provides: (i) **common evaluation metrics** and the usage is the same as ``<pyhealth.metrics>``; (ii) **metrics (weighted by patients)** for patient-level tasks; (iii) **special metrics** in healthcare, such as drug-drug interaction (DDI) rate.
 
 .. code-block:: python
  
- from pyhealth.evaluator import evaluate
- from pyhealth.metrics import accuracy_multilabel, jaccard_multilabel, f1_multilabel
-
- # load best model and do inference
- model = trainer.load_best_model(model)
- y_gt, y_prob, y_pred = evaluate(model, test_loader, device)
-
- jaccard = jaccard_multilabel(y_gt, y_pred)
- accuracy = accuracy_multilabel(y_gt, y_pred)
- f1 = f1_multilabel(y_gt, y_pred)
- prauc = pr_auc_multilabel(y_gt, y_prob)
-
- print("jaccard: ", jaccard)
- print("accuracy: ", accuracy)
- print("f1: ", f1)
- print("prauc: ", prauc)
-
+ trainer.evaluate(test_loader)
 
 Medical Code Map
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -181,19 +159,21 @@ Medical Code Map
 .. code-block:: python
 
  from pyhealth.medcode import CrossMap
- codemap = CrossMap("ICD9CM", "CCSCM")
+
+ codemap = CrossMap.load("ICD9CM", "CCSCM")
  codemap.map("82101") # use it like a dict
 
- codemap = CrossMap("NDC", "ATC", level=3)
+ codemap = CrossMap.load("NDC", "ATC")
  codemap.map("00527051210")
 
 * For code ontology lookup within one system
 .. code-block:: python
 
  from pyhealth.medcode import InnerMap
- ICD9CM = InnerMap("ICD9CM")
- ICD9CM.lookup("428.0") # get detailed info
- ICD9CM.get_ancestors("428.0") # get parents
+
+ icd9cm = InnerMap.load("ICD9CM")
+ icd9cm.lookup("428.0") # get detailed info
+ icd9cm.get_ancestors("428.0") # get parents
 
 Medical Code Tokenizer
 ^^^^^^^^^^^^^^^^^^^^^^^^^^