WenjieDu · WenjieDu · Apr 28, 2023 · Apr 22, 2023 · Apr 23, 2023 · Apr 23, 2023
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -0,0 +1,17 @@
+// Some IDEs or editors like PyCharm may flag comments in JSON files, but devcontainer.json is parsed as
+// JSON with Comments (JSONC), so they are allowed here. For configuring GitHub Codespaces, please refer to
+// https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/adding-a-dev-container-configuration/setting-up-your-python-project-for-codespaces
+
+{
+ "name": "PyPOTS developing environment",
+
+ "image": "mcr.microsoft.com/devcontainers/universal:2",
+
+ "features": {
+ "ghcr.io/devcontainers/features/conda:1": {}
+ },
+
+ // Please select the machine type with 4GB memory, otherwise the conda command below will exit with code 137,
+ // which is out of memory.
+ "postCreateCommand": "conda env create -f environment-dev.yml"
+}
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
@@ -2,5 +2,5 @@ blank_issues_enabled: true
 version: 2.1
 contact_links:
  - name: PyPOTS Community on Slack
- url: pypots-dev.slack.com
+ url: https://pypots-dev.slack.com
  about: General usage questions, community discussions, and the development team are here.
diff --git a/.github/stale.yml b/.github/stale.yml
@@ -0,0 +1,22 @@
+# Number of days of inactivity before an issue becomes stale
+daysUntilStale: 7
+
+# Number of days of inactivity before a stale issue is closed
+daysUntilClose: 3
+
+# Issues with these labels will never be considered stale
+exemptLabels:
+ - pinned
+ - keep
+
+# Label to use when marking an issue as stale
+staleLabel: stale
+
+# Comment to post when marking an issue as stale. Set to `false` to disable
+markComment: >
+ This issue has been automatically marked as stale because it has not had
+ recent activity. It will be closed if no further activity occurs. Thank you
+ for your contributions.
+
+# Comment to post when closing a stale issue. Set to `false` to disable
+closeComment: false
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -41,14 +41,15 @@ jobs:
 
  - name: Test with pytest
  run: |
- # run tests separately here due to Segmentation Fault in test_clustering when run all in 
+ # run tests separately here due to Segmentation Fault in test_clustering when run all in
  # one command with `pytest` on MacOS. Bugs not caught, so this is a trade-off to avoid SF.
- python -m pytest -rA pypots/tests/test_classification.py -n auto --cov=pypots --dist=loadgroup 
+ python -m pytest -rA pypots/tests/test_classification.py -n auto --cov=pypots --dist=loadgroup
  python -m pytest -rA pypots/tests/test_imputation.py -n auto --cov=pypots --cov-append --dist=loadgroup
  python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append --dist=loadgroup
  python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append --dist=loadgroup
  python -m pytest -rA pypots/tests/test_data.py -n auto --cov=pypots --cov-append --dist=loadgroup
  python -m pytest -rA pypots/tests/test_utils.py -n auto --cov=pypots --cov-append --dist=loadgroup
+ python -m pytest -rA pypots/tests/test_cli.py -n auto --cov=pypots --cov-append --dist=loadgroup
 
  - name: Generate the LCOV report
  run: |
@@ -58,4 +59,4 @@ jobs:
  uses: coverallsapp/github-action@master
  with:
  github-token: ${{ secrets.GITHUB_TOKEN }}
- path-to-lcov: 'coverage.lcov'
+ path-to-lcov: 'coverage.lcov'
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -7,16 +7,6 @@ repos:
  - id: end-of-file-fixer
  - id: check-yaml
 
- # hooks for optimizing imports
- - repo: https://github.com/PyCQA/autoflake
- rev: v2.1.1
- hooks:
- - id: autoflake
- args: [
- --check,
- --remove-all-unused-imports,
- ]
-
  # hooks for linting code
  - repo: https://github.com/psf/black
  rev: 22.10.0

diff --git a/README.md b/README.md
@@ -4,16 +4,16 @@
 **<p align="center">A Python Toolbox for Data Mining on Partially-Observed Time Series</p>**
 
 <p align="center">
- <img alt="Python version" src="https://img.shields.io/badge/Python->v3.6-yellow?color=88ada6">
- <img alt="powered by Pytorch" src="https://img.shields.io/static/v1?label=PyTorch&message=%E2%9D%A4%EF%B8%8F&color=bbcdc5&logo=pytorch">
+ <img alt="Python version" src="https://img.shields.io/badge/Python-v3.7--3.10-88ada6?logo=python&logoColor=white">
+ <img alt="powered by Pytorch" src="https://img.shields.io/badge/PyTorch-%E2%9D%A4%EF%B8%8F-bbcdc5?logo=pytorch&logoColor=white">
  <a href="https://pypi.org/project/">
  <img alt="the latest release version" src="https://img.shields.io/github/v/release/wenjiedu/pypots?color=e0eee8&include_prereleases&label=Release">
  </a>
  <a href="https://github.com/WenjieDu/PyPOTS/blob/main/LICENSE">
  <img alt="GPL3 license" src="https://img.shields.io/badge/License-GPL--v3-c0ebd7">
  </a>
  <a href="https://join.slack.com/t/pypots-dev/shared_invite/zt-1gq6ufwsi-p0OZdW~e9UW_IA4_f1OfxA"> 
- <img alt="Slack Workspace" src="https://img.shields.io/badge/Slack-PyPOTS-grey?logo=slack&color=7bcfa6">
+ <img alt="Slack Workspace" src="https://img.shields.io/badge/Slack-PyPOTS-7bcfa6?logo=slack">
  </a>
  <a href="https://github.com/sponsors/WenjieDu">
  <img alt="GitHub Sponsors" src="https://img.shields.io/github/sponsors/wenjiedu?label=Sponsors&color=7fecad&logo=githubsponsors">
@@ -30,18 +30,19 @@
  <a href="https://coveralls.io/github/WenjieDu/PyPOTS"> 
  <img alt="Coveralls coverage" src="https://img.shields.io/coverallsCoverage/github/WenjieDu/PyPOTS?branch=main&logo=coveralls&color=00e09e&label=Coverage">
  </a>
- <a href="https://anaconda.org/conda-forge/pypots">
- <img alt="Conda downloads" src="https://img.shields.io/conda/dn/conda-forge/pypots?label=Conda%20Downloads&color=48c0a3">
- </a>
- <a href="https://pypi.org/project/pypots">
- <img alt="PyPI downloads" src="https://static.pepy.tech/personalized-badge/pypots?period=total&units=international_system&left_color=grey&right_color=teal&left_text=PyPI%20Downloads">
- </a>
  <a href="https://github.com/WenjieDu/PyPOTS/actions/workflows/testing.yml"> 
- <img alt="GitHub Testing" src="https://github.com/WenjieDu/PyPOTS/actions/workflows/testing.yml/badge.svg">
+ <img alt="GitHub Testing" src="https://img.shields.io/github/actions/workflow/status/wenjiedu/pypots/testing.yml?logo=github&color=48c0a3&label=CI">
  </a>
  <a href="https://doi.org/10.5281/zenodo.6823221">
- <img alt="Zenodo DOI" src="https://zenodo.org/badge/DOI/10.5281/zenodo.6823221.svg">
+ <img alt="Zenodo DOI" src="https://img.shields.io/badge/DOI-10.5281/zenodo.6823221-21a675">
+ </a>
+ <a href="https://anaconda.org/conda-forge/pypots">
+ <img alt="Conda downloads" src="https://img.shields.io/conda/dn/conda-forge/pypots?label=Conda%20Downloads&color=057748&logo=anaconda&logoColor=white">
+ </a>
+ <a href="https://pypi.org/project/pypots">
+ <img alt="PyPI downloads" src="https://static.pepy.tech/personalized-badge/pypots?period=total&units=international_system&left_color=grey&right_color=teal&left_text=PyPI%20Downloads&logo=github">
  </a>
+
 </p>
 
 ⦿ `Motivation`: Due to all kinds of reasons like failure of collection sensors, communication error, and unexpected malfunction, missing values are common to see in time series from the real-world environment. This makes partially-observed time series (POTS) a pervasive problem in open-world modeling and prevents advanced data analysis. Although this problem is important, the area of data mining on POTS still lacks a dedicated toolkit. PyPOTS is created to fill in this blank.
@@ -54,6 +55,7 @@ To make various open-source time-series datasets readily available to our users,
 Visit [TSDB](https://github.com/WenjieDu/TSDB) right now to know more about this handy tool 🛠! It now supports a total of 119 open-source datasets.
 <br clear="left">
 
+
 ## ❖ Installation
 PyPOTS now is available on <a href="https://anaconda.org/conda-forge/pypots"><img alt="on Anaconda" align="center" 
 src="https://img.shields.io/badge/Anaconda--lightgreen?style=social&logo=anaconda"></a>❗️ 
@@ -66,8 +68,17 @@ Install the latest release from PyPI:
 or install from the source code with the latest features not officially released in a version:
 > pip install https://github.com/WenjieDu/PyPOTS/archive/main.zip
 
-<details open>
-<summary><b>Below is an example applying SAITS in PyPOTS to impute missing values in the dataset PhysioNet2012:</b></summary>
+
+## ❖ Usage
+PyPOTS tutorials have been released. You can find them [here](https://github.com/WenjieDu/PyPOTS/tree/main/tutorials).
+If you have further questions, please refer to PyPOTS documentation [📑 https://pypots.readthedocs.io](https://pypots.readthedocs.io).
+Besides, you can also 
+[raise an issue](https://github.com/WenjieDu/PyPOTS/issues) or
+[ask in our community](https://join.slack.com/t/pypots-dev/shared_invite/zt-1gq6ufwsi-p0OZdW~e9UW_IA4_f1OfxA).
+Below is a usage example of imputing missing values in time series with PyPOTS.
+
+<details>
+<summary><b>Click here to see an example applying SAITS on PhysioNet2012 for imputation:</b></summary>
 
 ``` python
 import numpy as np
@@ -93,6 +104,7 @@ mae = cal_mae(imputation, X_intact, indicating_mask) # calculate mean absolute
 ```
 </details>
 
+
 ## ❖ Available Algorithms
 PyPOTS supports imputation, classification, clustering, and forecasting tasks on multivariate time series with missing values. The currently available algorithms of four tasks are cataloged in the following table with four partitions. The paper references are all listed at the bottom of this readme file. Please refer to them if you want more details.
 
@@ -160,8 +172,6 @@ Your star is your recognition to PyPOTS, and it matters!
 
 
 ## ❖ Attention 👀
-The documentation and tutorials are under construction. 
-
 ‼️ PyPOTS is currently under developing. If you like it and look forward to its growth, <ins>please give PyPOTS a star 
 and watch it to keep you posted on its progress and to let me know that its development is meaningful</ins>. If you have 
 any feedback, or want to contribute ideas/suggestions or share time-series related algorithms/papers, please join PyPOTS 

diff --git a/pypots/base.py b/pypots/base.py
@@ -27,9 +27,20 @@ class BaseModel(ABC):
  then CPUs, considering CUDA and CPU are so far the main devices for people to train ML models.
  Other devices like Google TPU and Apple Silicon accelerator MPS may be added in the future.
 
- tb_file_saving_path : str, default = None,
- The path to save the training logs (i.e. loss values recorded during training) into a tensorboard file.
- Will not save if not given.
+ saving_path : str, default = None,
+ The path for automatically saving the trained model and the training logs (i.e. loss values recorded
+ during training, written into a tensorboard file). Will not save if not given.
+
+ auto_save_model : bool, default = True,
+ Whether to automatically save the trained model if `saving_path` is given and not None.
+ Default as True, i.e. the trained model will be automatically saved to `self.saving_path`
+ and users don't have to explicitly invoke function `self.save_model()`.
+
+ saving_strategy : str, "best" or "better", default = "best",
+ The strategy to save the trained model. It has to be "best" or "better".
+ The "best" strategy will only automatically save the best model after the training has finished.
+ The "better" strategy will automatically save the model during training whenever the model performs
+ better than in previous epochs.
 
  Attributes
  ----------
@@ -47,14 +58,29 @@ class BaseModel(ABC):
 
  """
 
+ # leverage typing to show type hints in IDEs
+ # SAVING_STRATEGY = Literal["best", "better"]
+ SAVING_STRATEGY = ["best", "better"]
+
  def __init__(
  self,
  device: Optional[Union[str, torch.device]] = None,
- tb_file_saving_path: str = None,
+ saving_path: str = None,
+ auto_save_model: bool = True,
+ saving_strategy: str = "best",
  ):
+
+ # validate against the class-level list so the check and its error message cannot drift apart
+ assert (
+ saving_strategy in self.SAVING_STRATEGY
+ ), f"saving_strategy must be one of {self.SAVING_STRATEGY}, but got {saving_strategy}."
+
+ self.device = None
+ self.saving_path = saving_path
+ self.auto_save_model = auto_save_model
+ self.saving_strategy = saving_strategy
  self.model = None
  self.summary_writer = None
- self.device = None
 
  # set up the device for model running below
  if device is None:
@@ -75,24 +101,29 @@ def __init__(
  f"device should be str or torch.device, but got {type(device)}"
  )
 
- # set up the summary writer for training log saving below
- # initialize self.summary_writer if tb_file_saving_path is given and not None, otherwise don't save the log
- self.tb_file_saving_path = None
- if isinstance(tb_file_saving_path, str):
-
+ # set up saving_path to save the trained model and training logs
+ if isinstance(saving_path, str):
  from datetime import datetime
 
- # get the current time to append to the dir name,
- # so you can use the same tb_file_saving_path for multiple running
+ # get the current time to append to saving_path,
+ # so you can use the same saving_path to run multiple times
+ # and also be aware of when they were run
  time_now = datetime.now().__format__("%Y%m%d_T%H%M%S")
- # the actual directory name to save the tensorboard file
- actual_tb_saving_dir_name = "tensorboard_" + time_now
- self.tb_file_saving_path = os.path.join(
-  tb_file_saving_path, actual_tb_saving_dir_name
- )
- # os.makedirs(actual_tb_file_saving_path) # create the dir for file saving
+ # the actual saving_path for saving both the best model and the tensorboard file
+ self.saving_path = os.path.join(saving_path, time_now)
+
+ # initialize self.summary_writer only if saving_path is given and not None
+ # otherwise self.summary_writer will be None and the training log won't be saved
+ tb_saving_path = os.path.join(self.saving_path, "tensorboard")
  self.summary_writer = SummaryWriter(
- self.tb_file_saving_path, filename_suffix=".pypots"
+ tb_saving_path,
+ filename_suffix=".pypots",
+ )
+
+ logger.info(
+ f"saving_path is set as {saving_path}, "
+ f"the trained model will be saved to {self.saving_path}, "
+ f"the tensorboard file will be saved to {tb_saving_path}"
  )
 
  def save_log_into_tb_file(self, step: int, stage: str, loss_dict: dict) -> None:
@@ -164,6 +195,34 @@ def save_model(
  f'Failed to save the model to "{saving_path}" because of the below error! \n{e}'
  )
 
+ def auto_save_model_if_necessary(
+ self,
+ training_finished: bool = True,
+ saving_name: str = None,
+ ):
+ """Automatically save the current model into a file if in need.
+
+ Parameters
+ ----------
+ training_finished : bool, default = True,
+ Whether the training is already finished when this function is invoked.
+ The saving_strategy "better" only works when training_finished is False.
+ The saving_strategy "best" only works when training_finished is True.
+
+ saving_name : str, default = None,
+ The file name of the saved model. The class name is used if not given.
+
+ """
+ if self.saving_path is not None and self.auto_save_model:
+ name = self.__class__.__name__ if saving_name is None else saving_name
+ if not training_finished and self.saving_strategy == "better":
+ self.save_model(self.saving_path, name)
+ elif training_finished and self.saving_strategy == "best":
+ self.save_model(self.saving_path, name)
+
+ else:  # auto-saving is disabled or no saving_path was given
+ return
+
  def load_model(self, model_path: str) -> None:
  """Load the saved model from a disk file.
 
@@ -203,6 +262,7 @@ class BaseNNModel(BaseModel):
  patience : int,
  Number of epochs the training procedure will keep if loss doesn't decrease.
  Once exceeding the number, the training will stop.
+ Must be smaller than or equal to the value of `epochs`.
 
  learning_rate : float,
  The learning rate of the optimizer.
@@ -219,7 +279,7 @@ class BaseNNModel(BaseModel):
  If not given, will try to use CUDA devices first, then CPUs. CUDA and CPU are so far the main devices for people
  to train ML models. Other devices like Google TPU and Apple Silicon accelerator MPS may be added in the future.
 
- tb_file_saving_path : str, default = None,
+ saving_path : str, default = None,
  The path to save the tensorboard file, which contains the loss values recorded during training.
 
 
@@ -248,9 +308,16 @@ def __init__(
  weight_decay: float,
  num_workers: int = 0,
  device: Optional[Union[str, torch.device]] = None,
- tb_file_saving_path: str = None,
+ saving_path: str = None,
  ):
- super().__init__(device, tb_file_saving_path)
+ super().__init__(device, saving_path)
+
+ if patience is None:
+ patience = -1 # early stopping on patience won't work if it is set as < 0
+ else:
+ assert (
+ patience <= epochs
+ ), f"patience must be smaller than or equal to epochs which is {epochs}, but got patience={patience}"
 
  # training hype-parameters
  self.batch_size = batch_size