From ee2cdb64c437e7b71bd15d9851d79d24bfa7e4bb Mon Sep 17 00:00:00 2001 From: Sungjin Lee Date: Fri, 31 May 2019 21:46:36 -0700 Subject: [PATCH] update slm-lab part --- convlab/agent/__init__.py | 500 ++++---- convlab/agent/algorithm/__init__.py | 1 - convlab/agent/algorithm/actor_critic.py | 311 +++-- convlab/agent/algorithm/base.py | 3 +- convlab/agent/algorithm/dqn.py | 192 ++-- convlab/agent/algorithm/hydra_dqn.py | 126 --- convlab/agent/algorithm/policy_util.py | 1002 +++++++++++------ convlab/agent/algorithm/ppo.py | 120 +- convlab/agent/algorithm/random.py | 19 +- convlab/agent/algorithm/reinforce.py | 126 +-- convlab/agent/algorithm/sarsa.py | 83 +- convlab/agent/algorithm/sil.py | 100 +- convlab/agent/memory/base.py | 59 +- convlab/agent/memory/onpolicy.py | 252 +---- convlab/agent/memory/prioritized.py | 55 +- convlab/agent/memory/replay.py | 283 +---- convlab/agent/net/base.py | 27 +- convlab/agent/net/conv.py | 127 +-- convlab/agent/net/mlp.py | 147 +-- convlab/agent/net/net_util.py | 309 +++-- convlab/agent/net/recurrent.py | 79 +- convlab/env/__init__.py | 9 +- convlab/env/base.py | 131 +-- convlab/env/multiwoz.py | 84 +- convlab/experiment/__init__.py | 9 +- convlab/experiment/analysis.py | 785 ++++--------- convlab/experiment/control.py | 436 +++---- convlab/experiment/monitor.py | 494 -------- convlab/experiment/retro_analysis.py | 273 +---- convlab/experiment/search.py | 327 ++---- convlab/lib/__init__.py | 7 - convlab/lib/decorator.py | 5 +- convlab/lib/distribution.py | 87 ++ convlab/lib/logger.py | 83 +- convlab/lib/math_util.py | 204 ++-- convlab/lib/optimizer.py | 102 ++ convlab/lib/util.py | 433 +++---- convlab/lib/viz.py | 353 +++--- .../multiwoz/rule_based_multiwoz_bot.py | 2 +- .../user/multiwoz/policy_agenda_multiwoz.py | 4 + convlab/spec/demo.json | 218 +++- convlab/spec/random_baseline.py | 133 +++ convlab/spec/spec_util.py | 196 ++-- 43 files changed, 3448 insertions(+), 4848 deletions(-) delete mode 100644 convlab/agent/algorithm/hydra_dqn.py delete mode 100644 convlab/experiment/monitor.py create mode 100644 convlab/lib/distribution.py create mode 100644 convlab/lib/optimizer.py create mode 100644 convlab/spec/random_baseline.py diff --git a/convlab/agent/__init__.py b/convlab/agent/__init__.py index 9c05a79..5c43d2b 100644 --- a/convlab/agent/__init__.py +++ b/convlab/agent/__init__.py @@ -1,96 +1,67 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - -''' -The agent module -Contains graduated components from experiments for building agents and be taught, tested, evaluated on curriculum. -To be designed by human and evolution module, based on the experiment aim (trait) and fitness metrics. -Main SLM components (refer to SLM doc for more): -- primary survival objective -- control policies -- sensors (input) for embodiment -- motors (output) for embodiment -- neural architecture -- memory (with time) -- prioritization mechanism and "emotions" -- strange loop must be created -- social aspect -- high level properties of thinking, e.g. creativity, planning. 
- -Agent components: -- algorithm (with net, policy) -- memory (per body) -''' +# The agent module +import numpy as np +import pandas as pd +import pydash as ps +import torch +from copy import deepcopy + from convlab.agent import algorithm, memory +from convlab.agent.algorithm import policy_util +from convlab.agent.net import net_util from convlab.lib import logger, util from convlab.lib.decorator import lab_api from convlab.modules import nlu, dst, nlg, state_encoder, action_decoder -import numpy as np -import pydash as ps -from copy import deepcopy -AGENT_DATA_NAMES = ['action', 'loss', 'explore_var'] + logger = logger.get_logger(__name__) class Agent: ''' - Class for all Agents. - Standardizes the Agent design to work in Lab. - Access Envs properties by: Agents - AgentSpace - AEBSpace - EnvSpace - Envs + Agent abstraction; implements the API to interface with Env in SLM Lab + Contains algorithm, memory, body ''' - def __init__(self, spec, info_space, body, a=None, agent_space=None, global_nets=None): + def __init__(self, spec, body, a=None, global_nets=None): self.spec = spec - self.info_space = info_space - self.a = a or 0 # for compatibility with agent_space + self.a = a or 0 # for multi-agent self.agent_spec = spec['agent'][self.a] self.name = self.agent_spec['name'] assert not ps.is_list(global_nets), f'single agent global_nets must be a dict, got {global_nets}' - if agent_space is None: # singleton mode - self.body = body - body.agent = self - MemoryClass = getattr(memory, ps.get(self.agent_spec, 'memory.name')) - self.body.memory = MemoryClass(self.agent_spec['memory'], self.body) - AlgorithmClass = getattr(algorithm, ps.get(self.agent_spec, 'algorithm.name')) - self.algorithm = AlgorithmClass(self, global_nets) - else: - self.space_init(agent_space, body, global_nets) + # set components + self.body = body + body.agent = self + MemoryClass = getattr(memory, ps.get(self.agent_spec, 'memory.name')) + self.body.memory = MemoryClass(self.agent_spec['memory'], self.body) + AlgorithmClass = getattr(algorithm, ps.get(self.agent_spec, 'algorithm.name')) + self.algorithm = AlgorithmClass(self, global_nets) logger.info(util.self_desc(self)) @lab_api - def reset(self, observation): - '''Do agent reset per session, such as memory pointer''' - logger.debug(f'Agent {self.a} reset') - self.body.memory.epi_reset(observation) - - @lab_api - def act(self, observation): + def act(self, state): '''Standard act method from algorithm.''' - action = self.algorithm.act(observation) - logger.debug(f'Agent {self.a} act: {action}') + with torch.no_grad(): # for efficiency, only calc grad in algorithm.train + action = self.algorithm.act(state) return action - + @lab_api - def update(self, action, reward, observation, done): + def update(self, state, action, reward, next_state, done): '''Update per timestep after env transitions, e.g. 
memory, algorithm, update agent params, train net''' - self.body.action_pd_update() - self.body.memory.update(action, reward, observation, done) + self.body.update(state, action, reward, next_state, done) + if util.in_eval_lab_modes(): # eval does not update agent for training + return + self.body.memory.update(state, action, reward, next_state, done) loss = self.algorithm.train() if not np.isnan(loss): # set for log_summary() self.body.loss = loss explore_var = self.algorithm.update() - logger.debug(f'Agent {self.a} loss: {loss}, explore_var {explore_var}') - if done: - self.body.epi_update() return loss, explore_var @lab_api def save(self, ckpt=None): '''Save agent''' - if util.in_eval_lab_modes(): - # eval does not save new models + if util.in_eval_lab_modes(): # eval does not save new models return self.algorithm.save(ckpt=ckpt) @@ -99,62 +70,6 @@ def close(self): '''Close and cleanup agent at the end of a session, e.g. save model''' self.save() - @lab_api - def space_init(self, agent_space, body_a, global_nets): - '''Post init override for space env. Note that aeb is already correct from __init__''' - self.agent_space = agent_space - self.body_a = body_a - self.aeb_space = agent_space.aeb_space - self.nanflat_body_a = util.nanflatten(self.body_a) - for idx, body in enumerate(self.nanflat_body_a): - if idx == 0: # NOTE set default body - self.body = body - body.agent = self - body.nanflat_a_idx = idx - MemoryClass = getattr(memory, ps.get(self.agent_spec, 'memory.name')) - body.memory = MemoryClass(self.agent_spec['memory'], body) - self.body_num = len(self.nanflat_body_a) - AlgorithmClass = getattr(algorithm, ps.get(self.agent_spec, 'algorithm.name')) - self.algorithm = AlgorithmClass(self, global_nets) - # after algo init, transfer any missing variables from default body - for idx, body in enumerate(self.nanflat_body_a): - for k, v in vars(self.body).items(): - if util.gen_isnan(getattr(body, k, None)): - setattr(body, k, v) - - @lab_api - def space_reset(self, observation_a): - '''Do agent reset per session, such as memory pointer''' - logger.debug(f'Agent {self.a} reset') - for eb, body in util.ndenumerate_nonan(self.body_a): - body.memory.epi_reset(observation_a[eb]) - - @lab_api - def space_act(self, observation_a): - '''Standard act method from algorithm.''' - action_a = self.algorithm.space_act(observation_a) - logger.debug(f'Agent {self.a} act: {action_a}') - return action_a - - @lab_api - def space_update(self, action_a, reward_a, observation_a, done_a): - '''Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net''' - for eb, body in util.ndenumerate_nonan(self.body_a): - body.action_pd_update() - body.memory.update(action_a[eb], reward_a[eb], observation_a[eb], done_a[eb]) - loss_a = self.algorithm.space_train() - loss_a = util.guard_data_a(self, loss_a, 'loss') - for eb, body in util.ndenumerate_nonan(self.body_a): - if not np.isnan(loss_a[eb]): # set for log_summary() - body.loss = loss_a[eb] - explore_var_a = self.algorithm.space_update() - explore_var_a = util.guard_data_a(self, explore_var_a, 'explore_var') - logger.debug(f'Agent {self.a} loss: {loss_a}, explore_var_a {explore_var_a}') - for eb, body in util.ndenumerate_nonan(self.body_a): - if body.env.done: - body.epi_update() - return loss_a, explore_var_a - class DialogAgent(Agent): ''' @@ -162,9 +77,8 @@ class DialogAgent(Agent): Standardizes the Agent design to work in Lab. 
Access Envs properties by: Agents - AgentSpace - AEBSpace - EnvSpace - Envs ''' - def __init__(self, spec, info_space, body, a=None, agent_space=None, global_nets=None): + def __init__(self, spec, body, a=None, global_nets=None): self.spec = spec - self.info_space = info_space self.a = a or 0 # for compatibility with agent_space self.agent_spec = spec['agent'][self.a] self.name = self.agent_spec['name'] @@ -195,29 +109,26 @@ def __init__(self, spec, info_space, body, a=None, agent_space=None, global_nets params = deepcopy(ps.get(self.agent_spec, 'nlg')) NlgClass = getattr(nlg, params.pop('name')) self.nlg = NlgClass(**params) - if agent_space is None: # singleton mode - self.body = body - body.agent = self - MemoryClass = getattr(memory, ps.get(self.agent_spec, 'memory.name')) - self.body.memory = MemoryClass(self.agent_spec['memory'], self.body) - AlgorithmClass = getattr(algorithm, ps.get(self.agent_spec, 'algorithm.name')) - self.algorithm = AlgorithmClass(self, global_nets) - else: - self.space_init(agent_space, body, global_nets) + self.body = body + body.agent = self + MemoryClass = getattr(memory, ps.get(self.agent_spec, 'memory.name')) + self.body.memory = MemoryClass(self.agent_spec['memory'], self.body) + AlgorithmClass = getattr(algorithm, ps.get(self.agent_spec, 'algorithm.name')) + self.algorithm = AlgorithmClass(self, global_nets) self.body.state, self.body.encoded_state, self.body.action = None, None, None logger.info(util.self_desc(self)) @lab_api - def reset(self, observation): + def reset(self, obs): '''Do agent reset per session, such as memory pointer''' logger.debug(f'Agent {self.a} reset') if self.dst: self.dst.init_session() - if hasattr(self.algorithm, "reset"): + if hasattr(self.algorithm, "reset"): # This is mainly for external policies that may need to reset its state. self.algorithm.reset() - input_act, state, encoded_state = self.state_update(observation, "null") # "null" action to be compatible with MDBT + input_act, state, encoded_state = self.state_update(obs, "null") # "null" action to be compatible with MDBT + self.state = state self.body.state, self.body.encoded_state = state, encoded_state - self.body.memory.epi_reset(encoded_state) @lab_api def act(self, observation): @@ -225,6 +136,7 @@ def act(self, observation): action = self.algorithm.act(self.body.encoded_state) decoded_action = self.action_decode(action, self.body.state) self.body.action = action + # logger.info(f'Agent {self.a} system utterance: {decoded_action}') logger.nl(f'Agent {self.a} system utterance: {decoded_action}') logger.act(f'Agent {self.a} system action: {action}') return decoded_action @@ -239,6 +151,7 @@ def state_update(self, observation, action): self.dst.state['user_action'] = input_act elif self.dst and not isinstance(self.dst, dst.MDBTTracker): # for act-in act-out agent self.dst.state['user_action'] = observation + # logger.info(f'Agent {self.a} user utterance: {observation}') logger.nl(f'Agent {self.a} user utterance: {observation}') logger.act(f'Agent {self.a} user action: {input_act}') logger.state(f'Agent {self.a} dialog state: {state}') @@ -250,23 +163,35 @@ def action_decode(self, action, state): return decoded_action @lab_api - def update(self, action, reward, observation, done): + def update(self, obs, action, reward, next_obs, done): '''Update per timestep after env transitions, e.g. 
memory, algorithm, update agent params, train net''' - input_act, state, encoded_state = self.state_update(observation, action) - self.body.state, self.body.encoded_state = state, encoded_state - if self.algorithm.__class__.__name__ == 'ExternalPolicy': - loss, explore_var = 0, 0 - self.body.memory.update(0, reward, 0, done) - else: - self.body.action_pd_update() - self.body.memory.update(self.body.action, reward, encoded_state, done) - loss = self.algorithm.train() - if not np.isnan(loss): # set for log_summary() - self.body.loss = loss - explore_var = self.algorithm.update() - logger.debug(f'Agent {self.a} loss: {loss}, explore_var {explore_var}') - if done: - self.body.epi_update() + input_act, next_state, encoded_state = self.state_update(next_obs, action) + self.body.update(self.body.state, action, reward, next_state, done) + if util.in_eval_lab_modes() or self.algorithm.__class__.__name__ == 'ExternalPolicy': # eval does not update agent for training + self.body.state, self.body.encoded_state = next_state, encoded_state + return + self.body.memory.update(self.body.encoded_state, self.body.action, reward, encoded_state, done) + self.body.state, self.body.encoded_state = next_state, encoded_state + loss = self.algorithm.train() + if not np.isnan(loss): # set for log_summary() + self.body.loss = loss + explore_var = self.algorithm.update() + return loss, explore_var + + # self.body.state, self.body.encoded_state = state, encoded_state + # if self.algorithm.__class__.__name__ == 'ExternalPolicy': + # loss, explore_var = 0, 0 + # self.body.memory.update(0, reward, 0, done) + # else: + # self.body.action_pd_update() + # self.body.memory.update(self.body.action, reward, encoded_state, done) + # loss = self.algorithm.train() + # if not np.isnan(loss): # set for log_summary() + # self.body.loss = loss + # explore_var = self.algorithm.update() + # logger.debug(f'Agent {self.a} loss: {loss}, explore_var {explore_var}') + # if done: + # self.body.epi_update() return loss, explore_var @lab_api @@ -284,135 +209,160 @@ def close(self): '''Close and cleanup agent at the end of a session, e.g. save model''' self.save() - @lab_api - def space_init(self, agent_space, body_a, global_nets): - '''Post init override for space env. 
Note that aeb is already correct from __init__''' - self.agent_space = agent_space - self.body_a = body_a - self.aeb_space = agent_space.aeb_space - self.nanflat_body_a = util.nanflatten(self.body_a) - for idx, body in enumerate(self.nanflat_body_a): - if idx == 0: # NOTE set default body - self.body = body - body.agent = self - body.nanflat_a_idx = idx - MemoryClass = getattr(memory, ps.get(self.agent_spec, 'memory.name')) - body.memory = MemoryClass(self.agent_spec['memory'], body) - self.body_num = len(self.nanflat_body_a) - AlgorithmClass = getattr(algorithm, ps.get(self.agent_spec, 'algorithm.name')) - self.algorithm = AlgorithmClass(self, global_nets) - # after algo init, transfer any missing variables from default body - for idx, body in enumerate(self.nanflat_body_a): - for k, v in vars(self.body).items(): - if util.gen_isnan(getattr(body, k, None)): - setattr(body, k, v) - - @lab_api - def space_reset(self, observation_a): - '''Do agent reset per session, such as memory pointer''' - logger.debug(f'Agent {self.a} reset') - for eb, body in util.ndenumerate_nonan(self.body_a): - body.memory.epi_reset(observation_a[eb]) - @lab_api - def space_act(self, observation_a): - '''Standard act method from algorithm.''' - action_a = self.algorithm.space_act(observation_a) - logger.debug(f'Agent {self.a} act: {action_a}') - return action_a - - @lab_api - def space_update(self, action_a, reward_a, observation_a, done_a): - '''Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net''' - for eb, body in util.ndenumerate_nonan(self.body_a): - body.action_pd_update() - body.memory.update(action_a[eb], reward_a[eb], observation_a[eb], done_a[eb]) - loss_a = self.algorithm.space_train() - loss_a = util.guard_data_a(self, loss_a, 'loss') - for eb, body in util.ndenumerate_nonan(self.body_a): - if not np.isnan(loss_a[eb]): # set for log_summary() - body.loss = loss_a[eb] - explore_var_a = self.algorithm.space_update() - explore_var_a = util.guard_data_a(self, explore_var_a, 'explore_var') - logger.debug(f'Agent {self.a} loss: {loss_a}, explore_var_a {explore_var_a}') - for eb, body in util.ndenumerate_nonan(self.body_a): - if body.env.done: - body.epi_update() - return loss_a, explore_var_a - - -class AgentSpace: +class Body: ''' - Subspace of AEBSpace, collection of all agents, with interface to Session logic; same methods as singleton agents. 
- Access EnvSpace properties by: AgentSpace - AEBSpace - EnvSpace - Envs + Body of an agent inside an environment, it: + - enables the automatic dimension inference for constructing network input/output + - acts as reference bridge between agent and environment (useful for multi-agent, multi-env) + - acts as non-gradient variable storage for monitoring and analysis ''' - def __init__(self, spec, aeb_space, global_nets=None): - self.spec = spec - self.aeb_space = aeb_space - aeb_space.agent_space = self - self.info_space = aeb_space.info_space - self.aeb_shape = aeb_space.aeb_shape - assert not ps.is_dict(global_nets), f'multi agent global_nets must be a list of dicts, got {global_nets}' - assert ps.is_list(self.spec['agent']) - self.agents = [] - for a in range(len(self.spec['agent'])): - body_a = self.aeb_space.body_space.get(a=a) - if global_nets is not None: - agent_global_nets = global_nets[a] - else: - agent_global_nets = None - agent = Agent(self.spec, self.info_space, body=body_a, a=a, agent_space=self, global_nets=agent_global_nets) - self.agents.append(agent) - logger.info(util.self_desc(self)) - - def get(self, a): - return self.agents[a] - - @lab_api - def reset(self, observation_space): - logger.debug3('AgentSpace.reset') - _action_v, _loss_v, _explore_var_v = self.aeb_space.init_data_v(AGENT_DATA_NAMES) - for agent in self.agents: - observation_a = observation_space.get(a=agent.a) - agent.space_reset(observation_a) - _action_space, _loss_space, _explore_var_space = self.aeb_space.add(AGENT_DATA_NAMES, (_action_v, _loss_v, _explore_var_v)) - logger.debug3(f'action_space: {_action_space}') - return _action_space - - @lab_api - def act(self, observation_space): - data_names = ('action',) - action_v, = self.aeb_space.init_data_v(data_names) - for agent in self.agents: - a = agent.a - observation_a = observation_space.get(a=a) - action_a = agent.space_act(observation_a) - action_v[a, 0:len(action_a)] = action_a - action_space, = self.aeb_space.add(data_names, (action_v,)) - logger.debug3(f'\naction_space: {action_space}') - return action_space - - @lab_api - def update(self, action_space, reward_space, observation_space, done_space): - data_names = ('loss', 'explore_var') - loss_v, explore_var_v = self.aeb_space.init_data_v(data_names) - for agent in self.agents: - a = agent.a - action_a = action_space.get(a=a) - reward_a = reward_space.get(a=a) - observation_a = observation_space.get(a=a) - done_a = done_space.get(a=a) - loss_a, explore_var_a = agent.space_update(action_a, reward_a, observation_a, done_a) - loss_v[a, 0:len(loss_a)] = loss_a - explore_var_v[a, 0:len(explore_var_a)] = explore_var_a - loss_space, explore_var_space = self.aeb_space.add(data_names, (loss_v, explore_var_v)) - logger.debug3(f'\nloss_space: {loss_space}\nexplore_var_space: {explore_var_space}') - return loss_space, explore_var_space - - @lab_api - def close(self): - logger.info('AgentSpace.close') - for agent in self.agents: - agent.close() - + def __init__(self, env, agent_spec, aeb=(0, 0, 0)): + # essential reference variables + self.agent = None # set later + self.env = env + self.aeb = aeb + self.a, self.e, self.b = aeb + + # variables set during init_algorithm_params + self.explore_var = np.nan # action exploration: epsilon or tau + self.entropy_coef = np.nan # entropy for exploration + + # debugging/logging variables, set in train or loss function + self.loss = np.nan + self.mean_entropy = np.nan + self.mean_grad_norm = np.nan + + self.ckpt_total_reward = np.nan + self.total_reward = 0 # init to 0, 
but dont ckpt before end of an epi + self.total_reward_ma = np.nan + self.ma_window = 100 + # store current and best reward_ma for model checkpointing and early termination if all the environments are solved + self.best_reward_ma = -np.inf + self.eval_reward_ma = np.nan + + # dataframes to track data for analysis.analyze_session + # track training data per episode + self.train_df = pd.DataFrame(columns=[ + 'epi', 't', 'wall_t', 'opt_step', 'frame', 'fps', 'total_reward', 'total_reward_ma', 'loss', 'lr', + 'explore_var', 'entropy_coef', 'entropy', 'grad_norm']) + # track eval data within run_eval. the same as train_df except for reward + self.eval_df = self.train_df.copy() + + # the specific agent-env interface variables for a body + self.observation_space = self.env.observation_space + self.action_space = self.env.action_space + self.observable_dim = self.env.observable_dim + self.state_dim = self.observable_dim['state'] + self.action_dim = self.env.action_dim + self.is_discrete = self.env.is_discrete + # set the ActionPD class for sampling action + self.action_type = policy_util.get_action_type(self.action_space) + self.action_pdtype = agent_spec[self.a]['algorithm'].get('action_pdtype') + if self.action_pdtype in (None, 'default'): + self.action_pdtype = policy_util.ACTION_PDS[self.action_type][0] + self.ActionPD = policy_util.get_action_pd_cls(self.action_pdtype, self.action_type) + + def update(self, state, action, reward, next_state, done): + '''Interface update method for body at agent.update()''' + if hasattr(self.env.u_env, 'raw_reward'): # use raw_reward if reward is preprocessed + reward = self.env.u_env.raw_reward + if self.ckpt_total_reward is np.nan: # init + self.ckpt_total_reward = reward + else: # reset on epi_start, else keep adding. 
generalized for vec env + self.ckpt_total_reward = self.ckpt_total_reward * (1 - self.epi_start) + reward + self.total_reward = done * self.ckpt_total_reward + (1 - done) * self.total_reward + self.epi_start = done + + def __str__(self): + return f'body: {util.to_json(util.get_class_attr(self))}' + + def calc_df_row(self, env): + '''Calculate a row for updating train_df or eval_df.''' + frame = self.env.clock.get('frame') + wall_t = env.clock.get_elapsed_wall_t() + fps = 0 if wall_t == 0 else frame / wall_t + + # update debugging variables + if net_util.to_check_train_step(): + grad_norms = net_util.get_grad_norms(self.agent.algorithm) + self.mean_grad_norm = np.nan if ps.is_empty(grad_norms) else np.mean(grad_norms) + + row = pd.Series({ + # epi and frame are always measured from training env + 'epi': self.env.clock.get('epi'), + # t and reward are measured from a given env or eval_env + 't': env.clock.get('t'), + 'wall_t': wall_t, + 'opt_step': self.env.clock.get('opt_step'), + 'frame': frame, + 'fps': fps, + 'total_reward': np.nanmean(self.total_reward), # guard for vec env + 'total_reward_ma': np.nan, # update outside + 'loss': self.loss, + 'lr': self.get_mean_lr(), + 'explore_var': self.explore_var, + 'entropy_coef': self.entropy_coef if hasattr(self, 'entropy_coef') else np.nan, + 'entropy': self.mean_entropy, + 'grad_norm': self.mean_grad_norm, + }, dtype=np.float32) + assert all(col in self.train_df.columns for col in row.index), f'Mismatched row keys: {row.index} vs df columns {self.train_df.columns}' + return row + + def train_ckpt(self): + '''Checkpoint to update body.train_df data''' + row = self.calc_df_row(self.env) + # append efficiently to df + self.train_df.loc[len(self.train_df)] = row + # update current reward_ma + self.total_reward_ma = self.train_df[-self.ma_window:]['total_reward'].mean() + self.train_df.iloc[-1]['total_reward_ma'] = self.total_reward_ma + + def eval_ckpt(self, eval_env, total_reward): + '''Checkpoint to update body.eval_df data''' + row = self.calc_df_row(eval_env) + row['total_reward'] = total_reward + # append efficiently to df + self.eval_df.loc[len(self.eval_df)] = row + # update current reward_ma + self.eval_reward_ma = self.eval_df[-self.ma_window:]['total_reward'].mean() + self.eval_df.iloc[-1]['total_reward_ma'] = self.eval_reward_ma + + def get_mean_lr(self): + '''Gets the average current learning rate of the algorithm's nets.''' + if not hasattr(self.agent.algorithm, 'net_names'): + return np.nan + lrs = [] + for attr, obj in self.agent.algorithm.__dict__.items(): + if attr.endswith('lr_scheduler'): + lrs.append(obj.get_lr()) + return np.mean(lrs) + + def get_log_prefix(self): + '''Get the prefix for logging''' + spec = self.agent.spec + spec_name = spec['name'] + trial_index = spec['meta']['trial'] + session_index = spec['meta']['session'] + prefix = f'Trial {trial_index} session {session_index} {spec_name}_t{trial_index}_s{session_index}' + return prefix + + def log_metrics(self, metrics, df_mode): + '''Log session metrics''' + prefix = self.get_log_prefix() + row_str = ' '.join([f'{k}: {v:g}' for k, v in metrics.items()]) + msg = f'{prefix} [{df_mode}_df metrics] {row_str}' + logger.info(msg) + + def log_summary(self, df_mode): + ''' + Log the summary for this body when its environment is done + @param str:df_mode 'train' or 'eval' + ''' + prefix = self.get_log_prefix() + df = getattr(self, f'{df_mode}_df') + last_row = df.iloc[-1] + row_str = ' '.join([f'{k}: {v:g}' for k, v in last_row.items()]) + msg = f'{prefix} [{df_mode}_df] 
{row_str}' + logger.info(msg) diff --git a/convlab/agent/algorithm/__init__.py b/convlab/agent/algorithm/__init__.py index 8147106..44fae15 100644 --- a/convlab/agent/algorithm/__init__.py +++ b/convlab/agent/algorithm/__init__.py @@ -10,7 +10,6 @@ # expose all the classes from .actor_critic import * from .dqn import * -from .hydra_dqn import * from .ppo import * from .random import * from .reinforce import * diff --git a/convlab/agent/algorithm/actor_critic.py b/convlab/agent/algorithm/actor_critic.py index 565eb94..f72d58f 100644 --- a/convlab/agent/algorithm/actor_critic.py +++ b/convlab/agent/algorithm/actor_critic.py @@ -1,6 +1,3 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from convlab.agent import net from convlab.agent.algorithm import policy_util from convlab.agent.algorithm.reinforce import Reinforce @@ -21,8 +18,8 @@ class ActorCritic(Reinforce): https://arxiv.org/abs/1602.01783 Algorithm specific spec param: memory.name: batch (through OnPolicyBatchReplay memory class) or episodic through (OnPolicyReplay memory class) - lam: if not null, used as the lambda value of generalized advantage estimation (GAE) introduced in "High-Dimensional Continuous Control Using Generalized Advantage Estimation https://arxiv.org/abs/1506.02438. The algorithm becomes A2C. This lambda controls the bias variance tradeoff for GAE. Floating point value between 0 and 1. Lower values correspond to more bias, less variance. Higher values to more variance, less bias. - num_step_returns: if lam is null and this is not null, specifies the number of steps for N-step returns from "Asynchronous Methods for Deep Reinforcement Learning". The algorithm becomes A2C. + lam: if not null, used as the lambda value of generalized advantage estimation (GAE) introduced in "High-Dimensional Continuous Control Using Generalized Advantage Estimation https://arxiv.org/abs/1506.02438. This lambda controls the bias variance tradeoff for GAE. Floating point value between 0 and 1. Lower values correspond to more bias, less variance. Higher values to more variance, less bias. Algorithm becomes A2C(GAE). + num_step_returns: if lam is null and this is not null, specifies the number of steps for N-step returns from "Asynchronous Methods for Deep Reinforcement Learning". The algorithm becomes A2C(Nstep). If both lam and num_step_returns are null, use the default TD error. Then the algorithm stays as AC. net.type: whether the actor and critic should share params (e.g. through 'MLPNetShared') or have separate params (e.g. through 'MLPNetSeparate'). If param sharing is used then there is also the option to control the weight given to the policy and value components of the loss function through 'policy_loss_coef' and 'val_loss_coef' Algorithm - separate actor and critic: @@ -64,8 +61,6 @@ class ActorCritic(Reinforce): "policy_loss_coef": 1.0, "val_loss_coef": 0.01, "training_frequency": 1, - "training_epoch": 8, - "normalize_state": true } e.g. 
special net_spec param "shared" to share/separate Actor/Critic @@ -99,8 +94,6 @@ def init_algorithm_params(self): 'policy_loss_coef', 'val_loss_coef', 'training_frequency', - 'training_epoch', - 'normalize_state', ]) self.to_train = 0 self.action_policy = getattr(policy_util, self.action_policy) @@ -109,13 +102,13 @@ def init_algorithm_params(self): if self.entropy_coef_spec is not None: self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec) self.body.entropy_coef = self.entropy_coef_scheduler.start_val - # Select appropriate methods to calculate adv_targets and v_targets for training + # Select appropriate methods to calculate advs and v_targets for training if self.lam is not None: self.calc_advs_v_targets = self.calc_gae_advs_v_targets elif self.num_step_returns is not None: self.calc_advs_v_targets = self.calc_nstep_advs_v_targets else: - self.calc_advs_v_targets = self.calc_td_advs_v_targets + self.calc_advs_v_targets = self.calc_ret_advs_v_targets @lab_api def init_nets(self, global_nets=None): @@ -131,7 +124,7 @@ def init_nets(self, global_nets=None): - Discrete action spaces: The return list contains 2 element. The first element is a tensor containing the logits for a categorical probability distribution over the actions. The second element contains the state-value estimated by the network. 3. If the network type is feedforward, convolutional, or recurrent - Feedforward and convolutional networks take a single state as input and require an OnPolicyReplay or OnPolicyBatchReplay memory - - Recurrent networks take n states as input and require an OnPolicySeqReplay or OnPolicySeqBatchReplay memory + - Recurrent networks take n states as input and require env spec "frame_op": "concat", "frame_op_len": seq_len ''' assert 'shared' in self.net_spec, 'Specify "shared" for ActorCritic network in net_spec' self.shared = self.net_spec['shared'] @@ -149,201 +142,163 @@ def init_nets(self, global_nets=None): if critic_net_spec['use_same_optim']: critic_net_spec = actor_net_spec - if global_nets is None: - in_dim = self.body.state_dim - out_dim = net_util.get_out_dim(self.body, add_critic=self.shared) - # main actor network, may contain out_dim self.shared == True - NetClass = getattr(net, actor_net_spec['type']) - self.net = NetClass(actor_net_spec, in_dim, out_dim) - self.net_names = ['net'] - if not self.shared: # add separate network for critic - critic_out_dim = 1 - CriticNetClass = getattr(net, critic_net_spec['type']) - self.critic = CriticNetClass(critic_net_spec, in_dim, critic_out_dim) - self.net_names.append('critic') - else: - util.set_attr(self, global_nets) - self.net_names = list(global_nets.keys()) + in_dim = self.body.state_dim + out_dim = net_util.get_out_dim(self.body, add_critic=self.shared) + # main actor network, may contain out_dim self.shared == True + NetClass = getattr(net, actor_net_spec['type']) + self.net = NetClass(actor_net_spec, in_dim, out_dim) + self.net_names = ['net'] + if not self.shared: # add separate network for critic + critic_out_dim = 1 + CriticNetClass = getattr(net, critic_net_spec['type']) + self.critic_net = CriticNetClass(critic_net_spec, in_dim, critic_out_dim) + self.net_names.append('critic_net') + # init net optimizer and its lr scheduler + self.optim = net_util.get_optim(self.net, self.net.optim_spec) + self.lr_scheduler = net_util.get_lr_scheduler(self.optim, self.net.lr_scheduler_spec) + if not self.shared: + self.critic_optim = net_util.get_optim(self.critic_net, self.critic_net.optim_spec) + self.critic_lr_scheduler 
= net_util.get_lr_scheduler(self.critic_optim, self.critic_net.lr_scheduler_spec) + net_util.set_global_nets(self, global_nets) self.post_init_nets() @lab_api - def calc_pdparam(self, x, evaluate=True, net=None): + def calc_pdparam(self, x, net=None): ''' The pdparam will be the logits for discrete prob. dist., or the mean and std for continuous prob. dist. ''' - pdparam = super(ActorCritic, self).calc_pdparam(x, evaluate=evaluate, net=net) - if self.shared: # output: policy, value - if len(pdparam) == 2: # single policy outputs, value - pdparam = pdparam[0] - else: # multiple policy outputs, value - pdparam = pdparam[:-1] - logger.debug(f'pdparam: {pdparam}') + out = super().calc_pdparam(x, net=net) + if self.shared: + assert ps.is_list(out), f'Shared output should be a list [pdparam, v]' + if len(out) == 2: # single policy + pdparam = out[0] + else: # multiple-task policies, still assumes 1 value + pdparam = out[:-1] + self.v_pred = out[-1].view(-1) # cache for loss calc to prevent double-pass + else: # out is pdparam + pdparam = out return pdparam - def calc_v(self, x, evaluate=True, net=None): + def calc_v(self, x, net=None, use_cache=True): ''' - Forward-pass to calculate the predicted state-value from critic. + Forward-pass to calculate the predicted state-value from critic_net. ''' - net = self.net if net is None else net if self.shared: # output: policy, value - if evaluate: - out = net.wrap_eval(x) + if use_cache: # uses cache from calc_pdparam to prevent double-pass + v_pred = self.v_pred else: - net.train() - out = net(x) - v = out[-1].squeeze_(dim=1) # get value only + net = self.net if net is None else net + v_pred = net(x)[-1].view(-1) else: - if evaluate: - out = self.critic.wrap_eval(x) - else: - self.critic.train() - out = self.critic(x) - v = out.squeeze_(dim=1) - logger.debug(f'v: {v}') - return v + net = self.critic_net if net is None else net + v_pred = net(x).view(-1) + return v_pred - @lab_api - def train(self): - '''Trains the algorithm''' - if util.in_eval_lab_modes(): - self.body.flush() - return np.nan - if self.shared: - return self.train_shared() - else: - return self.train_separate() + def calc_pdparam_v(self, batch): + '''Efficiently forward to get pdparam and v by batch for loss computation''' + states = batch['states'] + if self.body.env.is_venv: + states = math_util.venv_unpack(states) + pdparam = self.calc_pdparam(states) + v_pred = self.calc_v(states) # uses self.v_pred from calc_pdparam if self.shared + return pdparam, v_pred - def train_shared(self): + def calc_ret_advs_v_targets(self, batch, v_preds): + '''Calculate plain returns, and advs = rets - v_preds, v_targets = rets''' + v_preds = v_preds.detach() # adv does not accumulate grad + if self.body.env.is_venv: + v_preds = math_util.venv_pack(v_preds, self.body.env.num_envs) + rets = math_util.calc_returns(batch['rewards'], batch['dones'], self.gamma) + advs = rets - v_preds + v_targets = rets + if self.body.env.is_venv: + advs = math_util.venv_unpack(advs) + v_targets = math_util.venv_unpack(v_targets) + logger.debug(f'advs: {advs}\nv_targets: {v_targets}') + return advs, v_targets + + def calc_nstep_advs_v_targets(self, batch, v_preds): ''' - Trains the network when the actor and critic share parameters - loss = self.policy_loss_coef * policy_loss + self.val_loss_coef * val_loss + Calculate N-step returns, and advs = nstep_rets - v_preds, v_targets = nstep_rets + See n-step advantage under http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/lecture_5_actor_critic_pdf.pdf ''' - clock = 
self.body.env.clock - if self.to_train == 1: - batch = self.sample() - with torch.no_grad(): - advs, v_targets = self.calc_advs_v_targets(batch) - policy_loss = self.calc_policy_loss(batch, advs) # from actor - val_loss = self.calc_val_loss(batch, v_targets) # from critic - loss = policy_loss + val_loss - self.net.training_step(loss=loss, lr_clock=clock) - # reset - self.to_train = 0 - self.body.flush() - logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}') - return loss.item() - else: - return np.nan + next_states = batch['next_states'][-1] + if not self.body.env.is_venv: + next_states = next_states.unsqueeze(dim=0) + with torch.no_grad(): + next_v_pred = self.calc_v(next_states, use_cache=False) + v_preds = v_preds.detach() # adv does not accumulate grad + if self.body.env.is_venv: + v_preds = math_util.venv_pack(v_preds, self.body.env.num_envs) + nstep_rets = math_util.calc_nstep_returns(batch['rewards'], batch['dones'], next_v_pred, self.gamma, self.num_step_returns) + advs = nstep_rets - v_preds + v_targets = nstep_rets + if self.body.env.is_venv: + advs = math_util.venv_unpack(advs) + v_targets = math_util.venv_unpack(v_targets) + logger.debug(f'advs: {advs}\nv_targets: {v_targets}') + return advs, v_targets - def train_separate(self): + def calc_gae_advs_v_targets(self, batch, v_preds): ''' - Trains the network when the actor and critic are separate networks - loss = val_loss + abs(policy_loss) + Calculate GAE, and advs = GAE, v_targets = advs + v_preds + See GAE from Schulman et al. https://arxiv.org/pdf/1506.02438.pdf ''' - if self.to_train == 1: - batch = self.sample() - policy_loss = self.train_actor(batch) - val_loss = self.train_critic(batch) - loss = val_loss + abs(policy_loss) - # reset - self.to_train = 0 - self.body.flush() - logger.debug(f'Trained {self.name}, loss: {loss:g}') - return loss.item() - else: - return np.nan - - def train_actor(self, batch): - '''Trains the actor when the actor and critic are separate networks''' + next_states = batch['next_states'][-1] + if not self.body.env.is_venv: + next_states = next_states.unsqueeze(dim=0) with torch.no_grad(): - advs, _v_targets = self.calc_advs_v_targets(batch) - policy_loss = self.calc_policy_loss(batch, advs) - self.net.training_step(loss=policy_loss, lr_clock=self.body.env.clock) - return policy_loss + next_v_pred = self.calc_v(next_states, use_cache=False) + v_preds = v_preds.detach() # adv does not accumulate grad + if self.body.env.is_venv: + v_preds = math_util.venv_pack(v_preds, self.body.env.num_envs) + next_v_pred = next_v_pred.unsqueeze(dim=0) + v_preds_all = torch.cat((v_preds, next_v_pred), dim=0) + advs = math_util.calc_gaes(batch['rewards'], batch['dones'], v_preds_all, self.gamma, self.lam) + v_targets = advs + v_preds + advs = math_util.standardize(advs) # standardize only for advs, not v_targets + if self.body.env.is_venv: + advs = math_util.venv_unpack(advs) + v_targets = math_util.venv_unpack(v_targets) + logger.debug(f'advs: {advs}\nv_targets: {v_targets}') + return advs, v_targets - def train_critic(self, batch): - '''Trains the critic when the actor and critic are separate networks''' - total_val_loss = torch.tensor(0.0, device=self.net.device) - # training iters only applicable to separate critic network - for _ in range(self.training_epoch): - with torch.no_grad(): - _advs, v_targets = self.calc_advs_v_targets(batch) - val_loss = self.calc_val_loss(batch, v_targets) - 
self.critic.training_step(loss=val_loss, lr_clock=self.body.env.clock) - total_val_loss += val_loss - val_loss = total_val_loss / self.training_epoch - return val_loss - - def calc_policy_loss(self, batch, advs): + def calc_policy_loss(self, batch, pdparams, advs): '''Calculate the actor's policy loss''' - assert len(self.body.log_probs) == len(advs), f'batch_size of log_probs {len(self.body.log_probs)} vs advs: {len(advs)}' - log_probs = torch.stack(self.body.log_probs) - policy_loss = - self.policy_loss_coef * log_probs * advs - if self.entropy_coef_spec is not None: - entropies = torch.stack(self.body.entropies) - policy_loss += (-self.body.entropy_coef * entropies) - policy_loss = torch.mean(policy_loss) - logger.debug(f'Actor policy loss: {policy_loss:g}') - return policy_loss + return super().calc_policy_loss(batch, pdparams, advs) - def calc_val_loss(self, batch, v_targets): + def calc_val_loss(self, v_preds, v_targets): '''Calculate the critic's value loss''' - v_targets = v_targets.unsqueeze(dim=-1) - v_preds = self.calc_v(batch['states'], evaluate=False).unsqueeze_(dim=-1) - assert v_preds.shape == v_targets.shape + assert v_preds.shape == v_targets.shape, f'{v_preds.shape} != {v_targets.shape}' val_loss = self.val_loss_coef * self.net.loss_fn(v_preds, v_targets) logger.debug(f'Critic value loss: {val_loss:g}') return val_loss - def calc_gae_advs_v_targets(self, batch): - ''' - Calculate the GAE advantages and value targets for training actor and critic respectively - adv_targets = GAE (see math_util method) - v_targets = adv_targets + v_preds - before output, adv_targets is standardized (so v_targets used the unstandardized version) - Used for training with GAE - ''' - v_preds = self.calc_v(batch['states']) - # calc next_state boundary value and concat with above for efficiency - next_v_pred_tail = self.calc_v(batch['next_states'][-1:]) - next_v_preds = torch.cat([v_preds[1:], next_v_pred_tail], dim=0) - # v targets = r_t + gamma * V(s_(t+1)) - v_targets = math_util.calc_nstep_returns(batch, self.gamma, 1, next_v_preds) - # ensure val for next_state is 0 at done - next_v_preds = next_v_preds * (1 - batch['dones']) - adv_targets = math_util.calc_gaes(batch['rewards'], v_preds, next_v_preds, self.gamma, self.lam) - adv_targets = math_util.standardize(adv_targets) - logger.debug(f'adv_targets: {adv_targets}\nv_targets: {v_targets}') - return adv_targets, v_targets - - def calc_nstep_advs_v_targets(self, batch): - ''' - Calculate N-step returns advantage = nstep_returns - v_pred - See n-step advantage under http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/lecture_5_actor_critic_pdf.pdf - Used for training with N-step (not GAE) - Returns 2-tuple for API-consistency with GAE - ''' - next_v_preds = self.calc_v(batch['next_states']) - v_preds = self.calc_v(batch['states']) - # v targets = r_t + gamma * V(s_(t+1)) - v_targets = math_util.calc_nstep_returns(batch, self.gamma, 1, next_v_preds) - nstep_returns = math_util.calc_nstep_returns(batch, self.gamma, self.num_step_returns, next_v_preds) - nstep_advs = nstep_returns - v_preds - adv_targets = nstep_advs - logger.debug(f'adv_targets: {adv_targets}\nv_targets: {v_targets}') - return adv_targets, v_targets - - def calc_td_advs_v_targets(self, batch): - ''' - Estimate Q(s_t, a_t) with r_t + gamma * V(s_t+1 ) for simplest AC algorithm - ''' - next_v_preds = self.calc_v(batch['next_states']) - # Equivalent to 1-step return - # v targets = r_t + gamma * V(s_(t+1)) - v_targets = math_util.calc_nstep_returns(batch, self.gamma, 1, 
next_v_preds) - adv_targets = v_targets # Plain Q estimate, called adv for API consistency - logger.debug(f'adv_targets: {adv_targets}\nv_targets: {v_targets}') - return adv_targets, v_targets + def train(self): + '''Train actor critic by computing the loss in batch efficiently''' + if util.in_eval_lab_modes(): + return np.nan + clock = self.body.env.clock + if self.to_train == 1: + batch = self.sample() + clock.set_batch_size(len(batch)) + pdparams, v_preds = self.calc_pdparam_v(batch) + advs, v_targets = self.calc_advs_v_targets(batch, v_preds) + policy_loss = self.calc_policy_loss(batch, pdparams, advs) # from actor + val_loss = self.calc_val_loss(v_preds, v_targets) # from critic + if self.shared: # shared network + loss = policy_loss + val_loss + self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) + else: + self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) + self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net) + loss = policy_loss + val_loss + # reset + self.to_train = 0 + logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}') + return loss.item() + else: + return np.nan @lab_api def update(self): diff --git a/convlab/agent/algorithm/base.py b/convlab/agent/algorithm/base.py index d138c18..46c44dc 100644 --- a/convlab/agent/algorithm/base.py +++ b/convlab/agent/algorithm/base.py @@ -120,7 +120,8 @@ def load(self): for k, v in vars(self).items(): if k.endswith('_scheduler'): var_name = k.replace('_scheduler', '') - setattr(self.body, var_name, v.end_val) + if hasattr(v, 'end_val'): + setattr(self.body, var_name, v.end_val) # NOTE optional extension for multi-agent-env diff --git a/convlab/agent/algorithm/dqn.py b/convlab/agent/algorithm/dqn.py index d99bd03..ba91d2b 100644 --- a/convlab/agent/algorithm/dqn.py +++ b/convlab/agent/algorithm/dqn.py @@ -1,11 +1,8 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from convlab.agent import net from convlab.agent.algorithm import policy_util from convlab.agent.algorithm.sarsa import SARSA from convlab.agent.net import net_util -from convlab.lib import logger, util +from convlab.lib import logger, math_util, util from convlab.lib.decorator import lab_api import numpy as np import pydash as ps @@ -46,11 +43,10 @@ class VanillaDQN(SARSA): "end_step": 1000, }, "gamma": 0.99, - "training_batch_epoch": 8, - "training_epoch": 4, + "training_batch_iter": 8, + "training_iter": 4, "training_frequency": 10, "training_start_step": 10, - "normalize_state": true } ''' @@ -66,63 +62,63 @@ def init_algorithm_params(self): 'action_pdtype', 'action_policy', 'rule_guide_max_epi', - "rule_guide_frequency", + 'rule_guide_frequency', # explore_var is epsilon, tau or etc. 
depending on the action policy # these control the trade off between exploration and exploitaton 'explore_var_spec', 'gamma', # the discount factor - 'training_batch_epoch', # how many gradient updates per batch - 'training_epoch', # how many batches to train each time + 'training_batch_iter', # how many gradient updates per batch + 'training_iter', # how many batches to train each time 'training_frequency', # how often to train (once a few timesteps) 'training_start_step', # how long before starting training - 'normalize_state', ]) - super(VanillaDQN, self).init_algorithm_params() + super().init_algorithm_params() @lab_api def init_nets(self, global_nets=None): '''Initialize the neural network used to learn the Q function from the spec''' if self.algorithm_spec['name'] == 'VanillaDQN': assert all(k not in self.net_spec for k in ['update_type', 'update_frequency', 'polyak_coef']), 'Network update not available for VanillaDQN; use DQN.' - if global_nets is None: - in_dim = self.body.state_dim - out_dim = net_util.get_out_dim(self.body) - NetClass = getattr(net, self.net_spec['type']) - self.net = NetClass(self.net_spec, in_dim, out_dim) - self.net_names = ['net'] - else: - util.set_attr(self, global_nets) - self.net_names = list(global_nets.keys()) + in_dim = self.body.state_dim + out_dim = net_util.get_out_dim(self.body) + NetClass = getattr(net, self.net_spec['type']) + self.net = NetClass(self.net_spec, in_dim, out_dim) + self.net_names = ['net'] + # init net optimizer and its lr scheduler + self.optim = net_util.get_optim(self.net, self.net.optim_spec) + self.lr_scheduler = net_util.get_lr_scheduler(self.optim, self.net.lr_scheduler_spec) + net_util.set_global_nets(self, global_nets) self.post_init_nets() def calc_q_loss(self, batch): '''Compute the Q value loss using predicted and target Q values from the appropriate networks''' - q_preds = self.net.wrap_eval(batch['states']) + states = batch['states'] + next_states = batch['next_states'] + q_preds = self.net(states) + with torch.no_grad(): + next_q_preds = self.net(next_states) act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1) - next_q_preds = self.net.wrap_eval(batch['next_states']) # Bellman equation: compute max_q_targets using reward and max estimated Q values (0 if no next_state) max_next_q_preds, _ = next_q_preds.max(dim=-1, keepdim=True) max_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * max_next_q_preds - max_q_targets = max_q_targets.detach() + logger.debug(f'act_q_preds: {act_q_preds}\nmax_q_targets: {max_q_targets}') q_loss = self.net.loss_fn(act_q_preds, max_q_targets) # TODO use the same loss_fn but do not reduce yet if 'Prioritized' in util.get_class_name(self.body.memory): # PER - errors = torch.abs(max_q_targets - act_q_preds.detach()) + errors = (max_q_targets - act_q_preds.detach()).abs().cpu().numpy() self.body.memory.update_priorities(errors) return q_loss @lab_api def act(self, state): '''Selects and returns a discrete action for body using the action policy''' - return super(VanillaDQN, self).act(state) + return super().act(state) @lab_api def sample(self): '''Samples a batch from memory of size self.memory_spec['batch_size']''' batch = self.body.memory.sample() - if self.normalize_state: - batch = policy_util.normalize_states_and_next_states(self.body, batch) batch = util.to_torch_batch(batch, self.net.device, self.body.memory.is_episodic) return batch @@ -136,24 +132,26 @@ def train(self): Otherwise this function does nothing. 
''' if util.in_eval_lab_modes(): - self.body.flush() return np.nan clock = self.body.env.clock - tick = clock.get(clock.max_tick_unit) - self.to_train = (tick > self.training_start_step and tick % self.training_frequency == 0) if self.to_train == 1: - total_loss = torch.tensor(0.0, device=self.net.device) - for _ in range(self.training_epoch): - batch = self.sample() - for _ in range(self.training_batch_epoch): + total_loss = torch.tensor(0.0) + # for _ in range(self.training_iter): + # batch = self.sample() + # clock.set_batch_size(len(batch)) + # for _ in range(self.training_batch_iter): + num_batches = int(self.body.memory.size / self.body.memory.batch_size) + for _ in range(self.training_iter): + # clock.set_batch_size(len(batch)) + for _ in range(min(self.training_batch_iter, num_batches)): + batch = self.sample() loss = self.calc_q_loss(batch) - self.net.training_step(loss=loss, lr_clock=clock) + self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) total_loss += loss - loss = total_loss / (self.training_epoch * self.training_batch_epoch) + loss = total_loss / (self.training_iter * self.training_batch_iter) # reset self.to_train = 0 - self.body.flush() - logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}') + logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}') return loss.item() else: return np.nan @@ -161,7 +159,7 @@ def train(self): @lab_api def update(self): '''Update the agent after training''' - return super(VanillaDQN, self).update() + return super().update() class DQNBase(VanillaDQN): @@ -185,92 +183,49 @@ def init_nets(self, global_nets=None): '''Initialize networks''' if self.algorithm_spec['name'] == 'DQNBase': assert all(k not in self.net_spec for k in ['update_type', 'update_frequency', 'polyak_coef']), 'Network update not available for DQNBase; use DQN.' 
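Note: the rewritten calc_q_loss above builds the Bellman target from a torch.no_grad() forward pass instead of the old wrap_eval helper. A minimal standalone sketch of that target computation, using hypothetical tensors rather than code from this patch:

    import torch

    def dqn_q_targets(rewards, dones, next_q_preds, gamma=0.99):
        # target = r + gamma * max_a' Q(s', a'), with the bootstrap zeroed at terminal states
        max_next_q, _ = next_q_preds.max(dim=-1)
        return rewards + gamma * (1 - dones) * max_next_q

    rewards = torch.tensor([1.0, 0.0])
    dones = torch.tensor([0.0, 1.0])
    next_q_preds = torch.tensor([[0.2, 0.5], [0.1, 0.3]])
    print(dqn_q_targets(rewards, dones, next_q_preds))  # tensor([1.4950, 0.0000])

The loss is then net.loss_fn(act_q_preds, max_q_targets) on the Q-values gathered for the actions actually taken, as in the hunk above.
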
- if global_nets is None: - in_dim = self.body.state_dim - out_dim = net_util.get_out_dim(self.body) - NetClass = getattr(net, self.net_spec['type']) - self.net = NetClass(self.net_spec, in_dim, out_dim) - self.target_net = NetClass(self.net_spec, in_dim, out_dim) - self.net_names = ['net', 'target_net'] - else: - util.set_attr(self, global_nets) - self.net_names = list(global_nets.keys()) + in_dim = self.body.state_dim + out_dim = net_util.get_out_dim(self.body) + NetClass = getattr(net, self.net_spec['type']) + self.net = NetClass(self.net_spec, in_dim, out_dim) + self.target_net = NetClass(self.net_spec, in_dim, out_dim) + self.net_names = ['net', 'target_net'] + # init net optimizer and its lr scheduler + self.optim = net_util.get_optim(self.net, self.net.optim_spec) + self.lr_scheduler = net_util.get_lr_scheduler(self.optim, self.net.lr_scheduler_spec) + net_util.set_global_nets(self, global_nets) self.post_init_nets() self.online_net = self.target_net self.eval_net = self.target_net def calc_q_loss(self, batch): '''Compute the Q value loss using predicted and target Q values from the appropriate networks''' - q_preds = self.net.wrap_eval(batch['states']) - # q_preds = self.net(batch['states']) + states = batch['states'] + next_states = batch['next_states'] + q_preds = self.net(states) + with torch.no_grad(): + # Use online_net to select actions in next state + online_next_q_preds = self.online_net(next_states) + # Use eval_net to calculate next_q_preds for actions chosen by online_net + next_q_preds = self.eval_net(next_states) act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1) - # Use online_net to select actions in next state - online_next_q_preds = self.online_net.wrap_eval(batch['next_states']) - # Use eval_net to calculate next_q_preds for actions chosen by online_net - next_q_preds = self.eval_net.wrap_eval(batch['next_states']) - max_next_q_preds = next_q_preds.gather(-1, online_next_q_preds.argmax(dim=-1, keepdim=True)).squeeze(-1) + online_actions = online_next_q_preds.argmax(dim=-1, keepdim=True) + max_next_q_preds = next_q_preds.gather(-1, online_actions).squeeze(-1) max_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * max_next_q_preds - max_q_targets = max_q_targets.detach() - - # print(action_list[int(batch['actions'][0].item())]) - # print(batch['actions'][0].item()) - # print('{} vs {}'.format(act_q_preds.item(), max_q_targets.item())) - + logger.debug(f'act_q_preds: {act_q_preds}\nmax_q_targets: {max_q_targets}') q_loss = self.net.loss_fn(act_q_preds, max_q_targets) - # TODO use the same loss_fn but do not reduce yet if 'Prioritized' in util.get_class_name(self.body.memory): # PER - errors = torch.abs(max_q_targets - act_q_preds.detach()) + errors = (max_q_targets - act_q_preds.detach()).abs().cpu().numpy() self.body.memory.update_priorities(errors) return q_loss - @lab_api - def train(self): - ''' - Completes one training step for the agent if it is time to train. - i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency. - Each training step consists of sampling n batches from the agent's memory. - For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times - Otherwise this function does nothing. 
- ''' - if util.in_eval_lab_modes(): - self.body.flush() - return np.nan - clock = self.body.env.clock - tick = clock.get(clock.max_tick_unit) - self.to_train = (tick > self.training_start_step and tick % self.training_frequency == 0) - if self.to_train == 1: - total_loss = torch.tensor(0.0, device=self.net.device) - for epoch in range(self.training_epoch): - num_batches = int(self.body.memory.true_size / self.body.memory.batch_size) - for _ in range(num_batches): - batch = self.sample() - loss = self.calc_q_loss(batch) - self.net.training_step(loss=loss, lr_clock=clock) - total_loss += loss - loss = total_loss / (self.training_epoch * num_batches) - # reset - self.to_train = 0 - self.body.flush() - logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}') - return loss.item() - else: - return np.nan - def update_nets(self): - total_t = self.body.env.clock.total_t - if total_t % self.net.update_frequency == 0: + if util.frame_mod(self.body.env.clock.frame, self.net.update_frequency, self.body.env.num_envs): if self.net.update_type == 'replace': - logger.debug('Updating target_net by replacing') net_util.copy(self.net, self.target_net) - self.online_net = self.target_net - self.eval_net = self.target_net elif self.net.update_type == 'polyak': - logger.debug('Updating net by averaging') net_util.polyak_update(self.net, self.target_net, self.net.polyak_coef) - self.online_net = self.target_net - self.eval_net = self.target_net else: raise ValueError('Unknown net.update_type. Should be "replace" or "polyak". Exiting.') @@ -278,7 +233,7 @@ def update_nets(self): def update(self): '''Updates self.target_net and the explore variables''' self.update_nets() - return super(DQNBase, self).update() + return super().update() class DQN(DQNBase): @@ -298,15 +253,15 @@ class DQN(DQNBase): "end_step": 1000, }, "gamma": 0.99, - "training_batch_epoch": 8, - "training_epoch": 4, + "training_batch_iter": 8, + "training_iter": 4, "training_frequency": 10, "training_start_step": 10 } ''' @lab_api def init_nets(self, global_nets=None): - super(DQN, self).init_nets(global_nets) + super().init_nets(global_nets) class DoubleDQN(DQN): @@ -326,25 +281,14 @@ class DoubleDQN(DQN): "end_step": 1000, }, "gamma": 0.99, - "training_batch_epoch": 8, - "training_epoch": 4, + "training_batch_iter": 8, + "training_iter": 4, "training_frequency": 10, "training_start_step": 10 } ''' @lab_api def init_nets(self, global_nets=None): - super(DoubleDQN, self).init_nets(global_nets) + super().init_nets(global_nets) self.online_net = self.net self.eval_net = self.target_net - - def update_nets(self): - res = super(DoubleDQN, self).update_nets() - total_t = self.body.env.clock.total_t - if self.net.update_type == 'replace': - if total_t % self.net.update_frequency == 0: - self.online_net = self.net - self.eval_net = self.target_net - elif self.net.update_type == 'polyak': - self.online_net = self.net - self.eval_net = self.target_net diff --git a/convlab/agent/algorithm/hydra_dqn.py b/convlab/agent/algorithm/hydra_dqn.py deleted file mode 100644 index 031ce4d..0000000 --- a/convlab/agent/algorithm/hydra_dqn.py +++ /dev/null @@ -1,126 +0,0 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. 
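Note: in the DQNBase/DoubleDQN hunks above, the online net now only selects the greedy next action while the eval (target) net scores it, i.e. the Double DQN decoupling. A standalone sketch with hypothetical tensors, not code from this patch:

    import torch

    def double_dqn_targets(rewards, dones, online_next_q, eval_next_q, gamma=0.99):
        # online net picks argmax a'; eval/target net supplies Q(s', a') for the bootstrap
        online_actions = online_next_q.argmax(dim=-1, keepdim=True)
        max_next_q = eval_next_q.gather(-1, online_actions).squeeze(-1)
        return rewards + gamma * (1 - dones) * max_next_q

    rewards = torch.tensor([0.0, 1.0])
    dones = torch.tensor([0.0, 0.0])
    online_next_q = torch.tensor([[0.9, 0.1], [0.2, 0.8]])
    eval_next_q = torch.tensor([[0.5, 0.4], [0.3, 0.6]])
    print(double_dqn_targets(rewards, dones, online_next_q, eval_next_q))  # tensor([0.4950, 1.5940])

With DoubleDQN.init_nets setting online_net = net and eval_net = target_net, this reduces to the plain DQN target when both point at target_net (the DQNBase default).
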
- -from convlab.agent import net -from convlab.agent.algorithm import policy_util -from convlab.agent.algorithm.sarsa import SARSA -from convlab.agent.algorithm.dqn import DQN -from convlab.lib import logger, util -from convlab.lib.decorator import lab_api -import numpy as np -import torch - -logger = logger.get_logger(__name__) - - -class HydraDQN(DQN): - '''Multi-task DQN with separate state and action processors per environment''' - - @lab_api - def init_nets(self, global_nets=None): - '''Initialize nets with multi-task dimensions, and set net params''' - # NOTE: Separate init from MultitaskDQN despite similarities so that this implementation can support arbitrary sized state and action heads (e.g. multiple layers) - self.state_dims = in_dims = [body.state_dim for body in self.agent.nanflat_body_a] - self.action_dims = out_dims = [body.action_dim for body in self.agent.nanflat_body_a] - if global_nets is None: - NetClass = getattr(net, self.net_spec['type']) - self.net = NetClass(self.net_spec, in_dims, out_dims) - self.target_net = NetClass(self.net_spec, in_dims, out_dims) - self.net_names = ['net', 'target_net'] - else: - util.set_attr(self, global_nets) - self.net_names = list(global_nets.keys()) - self.post_init_nets() - self.online_net = self.target_net - self.eval_net = self.target_net - - @lab_api - def calc_pdparam(self, xs, evaluate=True, net=None): - ''' - Calculate pdparams for multi-action by chunking the network logits output - ''' - pdparam = SARSA.calc_pdparam(self, xs, evaluate=evaluate, net=net) - return pdparam - - @lab_api - def space_act(self, state_a): - '''Non-atomizable act to override agent.act(), do a single pass on the entire state_a instead of composing act() via iteration''' - # gather and flatten - states = [] - for eb, body in util.ndenumerate_nonan(self.agent.body_a): - state = state_a[eb] - if self.normalize_state: - state = policy_util.update_online_stats_and_normalize_state(body, state) - states.append(state) - xs = [torch.from_numpy(state).float() for state in states] - pdparam = self.calc_pdparam(xs, evaluate=False) - # use multi-policy. 
note arg change - action_a, action_pd_a = self.action_policy(states, self, self.agent.nanflat_body_a, pdparam) - for idx, body in enumerate(self.agent.nanflat_body_a): - body.action_tensor, body.action_pd = action_a[idx], action_pd_a[idx] # used for body.action_pd_update later - return action_a.cpu().numpy() - - @lab_api - def space_sample(self): - '''Samples a batch per body, which may experience different environment''' - batch = {k: [] for k in self.body.memory.data_keys} - for body in self.agent.nanflat_body_a: - body_batch = body.memory.sample() - if self.normalize_state: - body_batch = policy_util.normalize_states_and_next_states(body, body_batch) - body_batch = util.to_torch_batch(body_batch, self.net.device, body.memory.is_episodic) - for k, arr in batch.items(): - arr.append(body_batch[k]) - return batch - - def calc_q_loss(self, batch): - '''Compute the Q value loss for Hydra network by apply the singleton logic on generalized aggregate.''' - q_preds = torch.stack(self.net.wrap_eval(batch['states'])) - act_q_preds = q_preds.gather(-1, torch.stack(batch['actions']).long().unsqueeze(-1)).squeeze(-1) - # Use online_net to select actions in next state - online_next_q_preds = torch.stack(self.online_net.wrap_eval(batch['next_states'])) - # Use eval_net to calculate next_q_preds for actions chosen by online_net - next_q_preds = torch.stack(self.eval_net.wrap_eval(batch['next_states'])) - max_next_q_preds = online_next_q_preds.gather(-1, next_q_preds.argmax(dim=-1, keepdim=True)).squeeze(-1) - max_q_targets = torch.stack(batch['rewards']) + self.gamma * (1 - torch.stack(batch['dones'])) * max_next_q_preds - q_loss = self.net.loss_fn(act_q_preds, max_q_targets) - - # TODO use the same loss_fn but do not reduce yet - for body in self.agent.nanflat_body_a: - if 'Prioritized' in util.get_class_name(body.memory): # PER - errors = torch.abs(max_q_targets - act_q_preds) - body.memory.update_priorities(errors) - return q_loss - - @lab_api - def space_train(self): - ''' - Completes one training step for the agent if it is time to train. - i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency. - Each training step consists of sampling n batches from the agent's memory. - For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times - Otherwise this function does nothing. 
- ''' - if util.in_eval_lab_modes(): - self.body.flush() - return np.nan - clock = self.body.env.clock # main clock - tick = util.s_get(self, 'aeb_space.clock').get(clock.max_tick_unit) - self.to_train = (tick > self.training_start_step and tick % self.training_frequency == 0) - if self.to_train == 1: - total_loss = torch.tensor(0.0, device=self.net.device) - for _ in range(self.training_epoch): - batch = self.space_sample() - for _ in range(self.training_batch_epoch): - loss = self.calc_q_loss(batch) - self.net.training_step(loss=loss, lr_clock=clock) - total_loss += loss - loss = total_loss / (self.training_epoch * self.training_batch_epoch) - # reset - self.to_train = 0 - for body in self.agent.nanflat_body_a: - body.flush() - logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}') - return loss.item() - else: - return np.nan diff --git a/convlab/agent/algorithm/policy_util.py b/convlab/agent/algorithm/policy_util.py index f23dbf5..8886d3f 100644 --- a/convlab/agent/algorithm/policy_util.py +++ b/convlab/agent/algorithm/policy_util.py @@ -1,206 +1,153 @@ # Modified by Microsoft Corporation. # Licensed under the MIT license. - -''' -Action policy methods to sampling actions -Algorithm provides a `calc_pdparam` which takes a state and do a forward pass through its net, -and the pdparam is used to construct an action probability distribution as appropriate per the action type as indicated by the body -Then the prob. dist. is used to sample action. - -The default form looks like: -``` -ActionPD, pdparam, body = init_action_pd(state, algorithm, body) -action, action_pd = sample_action_pd(ActionPD, pdparam, body) -``` - -We can also augment pdparam before sampling - as in the case of Boltzmann sampling, -or do epsilon-greedy to use pdparam-sampling or random sampling. -''' +# Action policy module +# Constructs action probability distribution used by agent to sample action and calculate log_prob, entropy, etc. +from gym import spaces # from convlab.env.wrapper import LazyFrames -from convlab.lib import logger, math_util, util +from convlab.lib import distribution, logger, math_util, util from torch import distributions import numpy as np import pydash as ps import torch -from pprint import pprint +import torch.nn.functional as F logger = logger.get_logger(__name__) - +# register custom distributions +setattr(distributions, 'Argmax', distribution.Argmax) +setattr(distributions, 'GumbelCategorical', distribution.GumbelCategorical) +setattr(distributions, 'MultiCategorical', distribution.MultiCategorical) # probability distributions constraints for different action types; the first in the list is the default ACTION_PDS = { 'continuous': ['Normal', 'Beta', 'Gumbel', 'LogNormal'], 'multi_continuous': ['MultivariateNormal'], - 'discrete': ['Categorical', 'Argmax'], + 'discrete': ['Categorical', 'Argmax', 'GumbelCategorical'], 'multi_discrete': ['MultiCategorical'], 'multi_binary': ['Bernoulli'], } -class Argmax(distributions.Categorical): - ''' - Special distribution class for argmax sampling, where probability is always 1 for the argmax. - NOTE although argmax is not a sampling distribution, this implementation is for API consistency. 
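The registration above makes the custom distributions attributes of torch.distributions, so a pdtype string from the spec can be resolved with getattr against the ACTION_PDS table, which is how the helpers below use it. A small hedged sketch using only the built-in Categorical (Argmax, GumbelCategorical and MultiCategorical live in convlab.lib.distribution and are omitted here):

import torch
from gym import spaces
from torch import distributions

action_space = spaces.Discrete(4)
# Discrete space -> 'discrete' action type -> allowed pdtypes; the first entry is the default
pdtypes = ['Categorical', 'Argmax', 'GumbelCategorical']
action_pdtype = pdtypes[0]
ActionPD = getattr(distributions, action_pdtype)           # torch.distributions.Categorical

action_pd = ActionPD(logits=torch.zeros(action_space.n))   # uniform over the 4 actions
action = action_pd.sample()
log_prob = action_pd.log_prob(action)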
- ''' - - def __init__(self, probs=None, logits=None, validate_args=None): - if probs is not None: - new_probs = torch.zeros_like(probs, dtype=torch.float) - new_probs[torch.argmax(probs, dim=0)] = 1.0 - probs = new_probs - elif logits is not None: - new_logits = torch.full_like(logits, -1e8, dtype=torch.float) - max_idx = torch.argmax(logits, dim=0) - new_logits[max_idx] = logits[max_idx] - logits = new_logits - - super(Argmax, self).__init__(probs=probs, logits=logits, validate_args=validate_args) - - -class MultiCategorical(distributions.Categorical): - '''MultiCategorical as collection of Categoricals''' - - def __init__(self, probs=None, logits=None, validate_args=None): - self.categoricals = [] - if probs is None: - probs = [None] * len(logits) - elif logits is None: - logits = [None] * len(probs) +def get_action_type(action_space): + '''Method to get the action type to choose prob. dist. to sample actions from NN logits output''' + if isinstance(action_space, spaces.Box): + shape = action_space.shape + assert len(shape) == 1 + if shape[0] == 1: + return 'continuous' else: - raise ValueError('Either probs or logits must be None') - - for sub_probs, sub_logits in zip(probs, logits): - categorical = distributions.Categorical(probs=sub_probs, logits=sub_logits, validate_args=validate_args) - self.categoricals.append(categorical) - - @property - def logits(self): - return [cat.logits for cat in self.categoricals] - - @property - def probs(self): - return [cat.probs for cat in self.categoricals] - - @property - def param_shape(self): - return [cat.param_shape for cat in self.categoricals] - - @property - def mean(self): - return torch.stack([cat.mean for cat in self.categoricals]) - - @property - def variance(self): - return torch.stack([cat.variance for cat in self.categoricals]) - - def sample(self, sample_shape=torch.Size()): - return torch.stack([cat.sample(sample_shape=sample_shape) for cat in self.categoricals]) - - def log_prob(self, value): - return torch.stack([cat.log_prob(value[idx]) for idx, cat in enumerate(self.categoricals)]) - - def entropy(self): - return torch.stack([cat.entropy() for cat in self.categoricals]) - - def enumerate_support(self): - return [cat.enumerate_support() for cat in self.categoricals] + return 'multi_continuous' + elif isinstance(action_space, spaces.Discrete): + return 'discrete' + elif isinstance(action_space, spaces.MultiDiscrete): + return 'multi_discrete' + elif isinstance(action_space, spaces.MultiBinary): + return 'multi_binary' + else: + raise NotImplementedError -setattr(distributions, 'Argmax', Argmax) -setattr(distributions, 'MultiCategorical', MultiCategorical) +# action_policy base methods +def get_action_pd_cls(action_pdtype, action_type): + ''' + Verify and get the action prob. distribution class for construction + Called by body at init to set its own ActionPD + ''' + pdtypes = ACTION_PDS[action_type] + assert action_pdtype in pdtypes, f'Pdtype {action_pdtype} is not compatible/supported with action_type {action_type}. 
Options are: {pdtypes}' + ActionPD = getattr(distributions, action_pdtype) + return ActionPD -# base methods -def try_preprocess(state, algorithm, body, append=True): - '''Try calling preprocess as implemented in body's memory to use for net input''' +def guard_tensor(state, body): + '''Guard-cast tensor before being input to network''' # if isinstance(state, LazyFrames): - # state = state.__array__() # from global env preprocessor - if hasattr(body.memory, 'preprocess_state'): - state = body.memory.preprocess_state(state, append=append) - # as float, and always as minibatch for net input - state = torch.from_numpy(state).float().unsqueeze_(dim=0) + # state = state.__array__() # realize data + state = torch.from_numpy(state.astype(np.float32)) + if not body.env.is_venv or util.in_eval_lab_modes(): + # singleton state, unsqueeze as minibatch for net input + state = state.unsqueeze(dim=0) return state -def cond_squeeze(out): - '''Helper to squeeze output depending if it is tensor (discrete pdparam) or list of tensors (continuous pdparam of loc and scale)''' - if isinstance(out, list): - for out_t in out: - out_t.squeeze_(dim=0) - else: - out.squeeze_(dim=0) - return out +def calc_pdparam(state, algorithm, body): + ''' + Prepare the state and run algorithm.calc_pdparam to get pdparam for action_pd + @param tensor:state For pdparam = net(state) + @param algorithm The algorithm containing self.net + @param body Body which links algorithm to the env which the action is for + @returns tensor:pdparam + @example + + pdparam = calc_pdparam(state, algorithm, body) + action_pd = ActionPD(logits=pdparam) # e.g. ActionPD is Categorical + action = action_pd.sample() + ''' + if not torch.is_tensor(state): # dont need to cast from numpy + state = guard_tensor(state, body) + state = state.to(algorithm.net.device) + pdparam = algorithm.calc_pdparam(state) + return pdparam -def init_action_pd(state, algorithm, body, append=True): +def init_action_pd(ActionPD, pdparam): ''' - Build the proper action prob. dist. to use for action sampling. - state is passed through algorithm's net via calc_pdparam, which the algorithm must implement using its proper net. - This will return body, ActionPD and pdparam to allow augmentation, e.g. applying temperature tau to pdparam for boltzmann. - Then, output must be called with sample_action_pd(body, ActionPD, pdparam) to sample action. - @returns {cls, tensor, *} ActionPD, pdparam, body + Initialize the action_pd for discrete or continuous actions: + - discrete: action_pd = ActionPD(logits) + - continuous: action_pd = ActionPD(loc, scale) ''' - pdtypes = ACTION_PDS[body.action_type] - assert body.action_pdtype in pdtypes, f'Pdtype {body.action_pdtype} is not compatible/supported with action_type {body.action_type}. 
Options are: {ACTION_PDS[body.action_type]}' - ActionPD = getattr(distributions, body.action_pdtype) - - state = try_preprocess(state, algorithm, body, append=append) - state = state.to(algorithm.net.device) - pdparam = algorithm.calc_pdparam(state, evaluate=False) - return ActionPD, pdparam, body + if 'logits' in ActionPD.arg_constraints: # discrete + action_pd = ActionPD(logits=pdparam) + else: # continuous, args = loc and scale + if isinstance(pdparam, list): # split output + loc, scale = pdparam + else: + loc, scale = pdparam.transpose(0, 1) + # scale (stdev) must be > 0, use softplus with positive + scale = F.softplus(scale) + 1e-8 + if isinstance(pdparam, list): # split output + # construct covars from a batched scale tensor + covars = torch.diag_embed(scale) + action_pd = ActionPD(loc=loc, covariance_matrix=covars) + else: + action_pd = ActionPD(loc=loc, scale=scale) + return action_pd -def sample_action_pd(ActionPD, pdparam, body): +def sample_action(ActionPD, pdparam): ''' - This uses the outputs from init_action_pd and an optionally augmented pdparam to construct a action_pd for sampling action - @returns {tensor, distribution} action, action_pd A sampled action, and the prob. dist. used for sampling to enable calculations like kl, entropy, etc. later. + Convenience method to sample action(s) from action_pd = ActionPD(pdparam) + Works with batched pdparam too + @returns tensor:action Sampled action(s) + @example + + # policy contains: + pdparam = calc_pdparam(state, algorithm, body) + action = sample_action(body.ActionPD, pdparam) ''' - pdparam = cond_squeeze(pdparam) - if body.is_discrete: - action_pd = ActionPD(logits=pdparam) - else: # continuous outputs a list, loc and scale - assert len(pdparam) == 2, pdparam - # scale (stdev) must be >0, use softplus - if pdparam[1] < 5: - pdparam[1] = torch.log(1 + torch.exp(pdparam[1])) + 1e-8 - action_pd = ActionPD(*pdparam) + action_pd = init_action_pd(ActionPD, pdparam) action = action_pd.sample() - return action, action_pd + return action -# interface action sampling methods +# action_policy used by agent def default(state, algorithm, body): - '''Plain policy by direct sampling using outputs of net as logits and constructing ActionPD as appropriate''' - ActionPD, pdparam, body = init_action_pd(state, algorithm, body) - action, action_pd = sample_action_pd(ActionPD, pdparam, body) - return action, action_pd + '''Plain policy by direct sampling from a default action probability defined by body.ActionPD''' + pdparam = calc_pdparam(state, algorithm, body) + action = sample_action(body.ActionPD, pdparam) + return action def random(state, algorithm, body): - '''Random action sampling that returns the same data format as default(), but without forward pass. 
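A minimal sketch of the two branches of init_action_pd above with toy pdparams (the tensors are illustrative; only the transpose and softplus handling mirrors the code):

import torch
import torch.nn.functional as F
from torch import distributions

# discrete: pdparam is a batch of logits
logits = torch.randn(2, 4)                         # batch of 2 states, 4 actions
disc_pd = distributions.Categorical(logits=logits)
disc_action = disc_pd.sample()                     # shape (2,)

# continuous (single output head): columns are (loc, scale); scale must be positive
pdparam = torch.randn(2, 2)
loc, scale = pdparam.transpose(0, 1)
scale = F.softplus(scale) + 1e-8                   # same positivity guard as above
cont_pd = distributions.Normal(loc=loc, scale=scale)
cont_action = cont_pd.sample()                     # shape (2,)
log_prob = cont_pd.log_prob(cont_action)           # reused later for policy-gradient losses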
Uses gym.space.sample()''' - state = try_preprocess(state, algorithm, body, append=True) # for consistency with init_action_pd inner logic - if body.action_type == 'discrete': - action_pd = distributions.Categorical(logits=torch.ones(body.action_space.high, device=algorithm.net.device)) - elif body.action_type == 'continuous': - # Possibly this should this have a 'device' set - action_pd = distributions.Uniform( - low=torch.tensor(body.action_space.low).float(), - high=torch.tensor(body.action_space.high).float()) - elif body.action_type == 'multi_discrete': - action_pd = distributions.Categorical( - logits=torch.ones(body.action_space.high.size, body.action_space.high[0], device=algorithm.net.device)) - elif body.action_type == 'multi_continuous': - raise NotImplementedError - elif body.action_type == 'multi_binary': - raise NotImplementedError + '''Random action using gym.action_space.sample(), with the same format as default()''' + if body.env.is_venv and not util.in_eval_lab_modes(): + _action = [body.action_space.sample() for _ in range(body.env.num_envs)] else: - raise NotImplementedError - sample = body.action_space.sample() - action = torch.tensor(sample, device=algorithm.net.device) - return action, action_pd + _action = body.action_space.sample() + action = torch.tensor([_action]) + return action def epsilon_greedy(state, algorithm, body): @@ -217,26 +164,26 @@ def boltzmann(state, algorithm, body): Boltzmann policy: adjust pdparam with temperature tau; the higher the more randomness/noise in action. ''' tau = body.explore_var - ActionPD, pdparam, body = init_action_pd(state, algorithm, body) + pdparam = calc_pdparam(state, algorithm, body) pdparam /= tau - action, action_pd = sample_action_pd(ActionPD, pdparam, body) - return action, action_pd + action = sample_action(body.ActionPD, pdparam) + return action def rule_guide_epsilon_greedy(state, algorithm, body): epsilon = body.explore_var if epsilon > np.random.rand(): - action, action_pd = random(state, algorithm, body) + action = random(state, algorithm, body) else: - action, action_pd = default(state, algorithm, body) + action = default(state, algorithm, body) if body.env.clock.epi < algorithm.rule_guide_max_epi and \ body.env.clock.epi % algorithm.rule_guide_frequency == 0: if hasattr(body, 'state'): - action, rp = rule_guide(body.state, algorithm, body) + action = rule_guide(body.state, algorithm, body) else: - action, rp = rule_guide(state, algorithm, body) - return action, action_pd + action = rule_guide(state, algorithm, body) + return action def rule_guide_default(state, algorithm, body): @@ -245,10 +192,10 @@ def rule_guide_default(state, algorithm, body): if body.env.clock.epi < algorithm.rule_guide_max_epi and \ body.env.clock.epi % algorithm.rule_guide_frequency == 0: if hasattr(body, 'state'): - action, rp = rule_guide(body.state, algorithm, body) + action = rule_guide(body.state, algorithm, body) else: - action, rp = rule_guide(state, algorithm, body) - return action, action_pd + action = rule_guide(state, algorithm, body) + return action def rule_guide(state, algorithm, body): @@ -256,12 +203,11 @@ def rule_guide(state, algorithm, body): action = env.rule_policy(state, algorithm, body) probs = torch.zeros(body.action_space.high, device=algorithm.net.device) probs[action] = 1 - action_pd = distributions.Categorical(probs=probs) action = torch.tensor(action, device=algorithm.net.device) - return action, action_pd + return action - -# multi-body policy with a single forward pass to calc pdparam +# 
multi-body/multi-env action_policy used by agent +# TODO rework def multi_default(states, algorithm, body_list, pdparam): ''' @@ -269,71 +215,61 @@ def multi_default(states, algorithm, body_list, pdparam): Note, for efficiency, do a single forward pass to calculate pdparam, then call this policy like: @example - pdparam = self.calc_pdparam(state, evaluate=False) - action_a, action_pd_a = self.action_policy(pdparam, self, body_list) + pdparam = self.calc_pdparam(state) + action_a = self.action_policy(pdparam, self, body_list) ''' - pdparam.squeeze_(dim=0) # assert pdparam has been chunked assert len(pdparam.shape) > 1 and len(pdparam) == len(body_list), f'pdparam shape: {pdparam.shape}, bodies: {len(body_list)}' - action_list, action_pd_a = [], [] + action_list = [] for idx, sub_pdparam in enumerate(pdparam): body = body_list[idx] - try_preprocess(states[idx], algorithm, body, append=True) # for consistency with init_action_pd inner logic - ActionPD = getattr(distributions, body.action_pdtype) - action, action_pd = sample_action_pd(ActionPD, sub_pdparam, body) + guard_tensor(states[idx], body) # for consistency with singleton inner logic + action = sample_action(body.ActionPD, sub_pdparam) action_list.append(action) - action_pd_a.append(action_pd) - action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze_(dim=1) - return action_a, action_pd_a + action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze(dim=1) + return action_a def multi_random(states, algorithm, body_list, pdparam): '''Apply random policy body-wise.''' - pdparam.squeeze_(dim=0) - action_list, action_pd_a = [], [] + action_list = [] for idx, body in body_list: - action, action_pd = random(states[idx], algorithm, body) + action = random(states[idx], algorithm, body) action_list.append(action) - action_pd_a.append(action_pd) - action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze_(dim=1) - return action_a, action_pd_a + action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze(dim=1) + return action_a def multi_epsilon_greedy(states, algorithm, body_list, pdparam): '''Apply epsilon-greedy policy body-wise''' assert len(pdparam) > 1 and len(pdparam) == len(body_list), f'pdparam shape: {pdparam.shape}, bodies: {len(body_list)}' - action_list, action_pd_a = [], [] + action_list = [] for idx, sub_pdparam in enumerate(pdparam): body = body_list[idx] epsilon = body.explore_var if epsilon > np.random.rand(): - action, action_pd = random(states[idx], algorithm, body) + action = random(states[idx], algorithm, body) else: - try_preprocess(states[idx], algorithm, body, append=True) # for consistency with init_action_pd inner logic - ActionPD = getattr(distributions, body.action_pdtype) - action, action_pd = sample_action_pd(ActionPD, sub_pdparam, body) + guard_tensor(states[idx], body) # for consistency with singleton inner logic + action = sample_action(body.ActionPD, sub_pdparam) action_list.append(action) - action_pd_a.append(action_pd) - action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze_(dim=1) - return action_a, action_pd_a + action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze(dim=1) + return action_a def multi_boltzmann(states, algorithm, body_list, pdparam): '''Apply Boltzmann policy body-wise''' - # pdparam.squeeze_(dim=0) assert len(pdparam) > 1 and len(pdparam) == len(body_list), f'pdparam shape: {pdparam.shape}, bodies: {len(body_list)}' - action_list, action_pd_a = [], [] + action_list = [] for idx, 
sub_pdparam in enumerate(pdparam): body = body_list[idx] - try_preprocess(states[idx], algorithm, body, append=True) # for consistency with init_action_pd inner logic + guard_tensor(states[idx], body) # for consistency with singleton inner logic tau = body.explore_var sub_pdparam /= tau - ActionPD = getattr(distributions, body.action_pdtype) - action, action_pd = sample_action_pd(ActionPD, sub_pdparam, body) + action = sample_action(body.ActionPD, sub_pdparam) action_list.append(action) - action_pd_a.append(action_pd) - action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze_(dim=1) - return action_a, action_pd_a + action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze(dim=1) + return action_a # action policy update methods @@ -369,164 +305,538 @@ def __init__(self, var_decay_spec=None): def update(self, algorithm, clock): '''Get an updated value for var''' - if (util.get_lab_mode() in ('enjoy', 'eval')) or self._updater_name == 'no_decay': + if (util.in_eval_lab_modes()) or self._updater_name == 'no_decay': return self.end_val - step = clock.get(clock.max_tick_unit) + step = clock.get() val = self._updater(self.start_val, self.end_val, self.start_step, self.end_step, step) return val +# ''' +# Action policy methods to sampling actions +# Algorithm provides a `calc_pdparam` which takes a state and do a forward pass through its net, +# and the pdparam is used to construct an action probability distribution as appropriate per the action type as indicated by the body +# Then the prob. dist. is used to sample action. + +# The default form looks like: +# ``` +# ActionPD, pdparam, body = init_action_pd(state, algorithm, body) +# action, action_pd = sample_action_pd(ActionPD, pdparam, body) +# ``` + +# We can also augment pdparam before sampling - as in the case of Boltzmann sampling, +# or do epsilon-greedy to use pdparam-sampling or random sampling. +# ''' +# # from convlab.env.wrapper import LazyFrames +# from convlab.lib import logger, math_util, util +# from torch import distributions +# import numpy as np +# import pydash as ps +# import torch +# from pprint import pprint + +# logger = logger.get_logger(__name__) + + +# # probability distributions constraints for different action types; the first in the list is the default +# ACTION_PDS = { +# 'continuous': ['Normal', 'Beta', 'Gumbel', 'LogNormal'], +# 'multi_continuous': ['MultivariateNormal'], +# 'discrete': ['Categorical', 'Argmax'], +# 'multi_discrete': ['MultiCategorical'], +# 'multi_binary': ['Bernoulli'], +# } + + +# class Argmax(distributions.Categorical): +# ''' +# Special distribution class for argmax sampling, where probability is always 1 for the argmax. +# NOTE although argmax is not a sampling distribution, this implementation is for API consistency. 
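VarScheduler.update above only looks up the decay function by name in math_util; as an illustration (not the actual math_util.linear_decay implementation), a linear decay with that call signature could look like:

import numpy as np

def linear_decay(start_val, end_val, start_step, end_step, step):
    # interpolate from start_val to end_val over [start_step, end_step], then hold end_val
    if step < start_step:
        return start_val
    frac = np.clip((step - start_step) / (end_step - start_step), 0.0, 1.0)
    return start_val + frac * (end_val - start_val)

# e.g. epsilon decaying from 1.0 to 0.1 over steps 0..800
print(linear_decay(1.0, 0.1, 0, 800, 400))    # ~0.55, halfway through the decay
print(linear_decay(1.0, 0.1, 0, 800, 2000))   # 0.1, held after end_step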
+# ''' -# misc calc methods +# def __init__(self, probs=None, logits=None, validate_args=None): +# if probs is not None: +# new_probs = torch.zeros_like(probs, dtype=torch.float) +# new_probs[torch.argmax(probs, dim=0)] = 1.0 +# probs = new_probs +# elif logits is not None: +# new_logits = torch.full_like(logits, -1e8, dtype=torch.float) +# max_idx = torch.argmax(logits, dim=0) +# new_logits[max_idx] = logits[max_idx] +# logits = new_logits -def guard_multi_pdparams(pdparams, body): - '''Guard pdparams for multi action''' - action_dim = body.action_dim - is_multi_action = ps.is_iterable(action_dim) - if is_multi_action: - assert ps.is_list(pdparams) - pdparams = [t.clone() for t in pdparams] # clone for grad safety - assert len(pdparams) == len(action_dim), pdparams - # transpose into (batch_size, [action_dims]) - pdparams = [list(torch.split(t, action_dim, dim=0)) for t in torch.cat(pdparams, dim=1)] - return pdparams +# super(Argmax, self).__init__(probs=probs, logits=logits, validate_args=validate_args) -def calc_log_probs(algorithm, net, body, batch): - ''' - Method to calculate log_probs fresh from batch data - Body already stores log_prob from self.net. This is used for PPO where log_probs needs to be recalculated. - ''' - states, actions = batch['states'], batch['actions'] - action_dim = body.action_dim - is_multi_action = ps.is_iterable(action_dim) - # construct log_probs for each state-action - pdparams = algorithm.calc_pdparam(states, net=net) - pdparams = guard_multi_pdparams(pdparams, body) - assert len(pdparams) == len(states), f'batch_size of pdparams: {len(pdparams)} vs states: {len(states)}' - - pdtypes = ACTION_PDS[body.action_type] - ActionPD = getattr(distributions, body.action_pdtype) - - log_probs = [] - for idx, pdparam in enumerate(pdparams): - if not is_multi_action: # already cloned for multi_action above - pdparam = pdparam.clone() # clone for grad safety - _action, action_pd = sample_action_pd(ActionPD, pdparam, body) - log_probs.append(action_pd.log_prob(actions[idx].float()).sum(dim=0)) - log_probs = torch.stack(log_probs) - assert not torch.isnan(log_probs).any(), f'log_probs: {log_probs}, \npdparams: {pdparams} \nactions: {actions}' - logger.debug(f'log_probs: {log_probs}') - return log_probs - - -def update_online_stats(body, state): - ''' - Method to calculate the running mean and standard deviation of the state space. 
- See https://www.johndcook.com/blog/standard_deviation/ for more details - for n >= 1 - M_n = M_n-1 + (state - M_n-1) / n - S_n = S_n-1 + (state - M_n-1) * (state - M_n) - variance = S_n / (n - 1) - std_dev = sqrt(variance) - ''' - logger.debug(f'mean: {body.state_mean}, std: {body.state_std_dev}, num examples: {body.state_n}') - # Assumes only one state is given - if ("Atari" in body.memory.__class__.__name__): - assert state.ndim == 3 - elif getattr(body.memory, 'raw_state_dim', False): - assert state.size == body.memory.raw_state_dim - else: - assert state.size == body.state_dim or state.shape == body.state_dim - mean = body.state_mean - body.state_n += 1 - if np.isnan(mean).any(): - assert np.isnan(body.state_std_dev_int) - assert np.isnan(body.state_std_dev) - body.state_mean = state - body.state_std_dev_int = 0 - body.state_std_dev = 0 - else: - assert body.state_n > 1 - body.state_mean = mean + (state - mean) / body.state_n - body.state_std_dev_int = body.state_std_dev_int + (state - mean) * (state - body.state_mean) - body.state_std_dev = np.sqrt(body.state_std_dev_int / (body.state_n - 1)) - # Guard against very small std devs - if (body.state_std_dev < 1e-8).any(): - body.state_std_dev[np.where(body.state_std_dev < 1e-8)] += 1e-8 - logger.debug(f'new mean: {body.state_mean}, new std: {body.state_std_dev}, num examples: {body.state_n}') +# class MultiCategorical(distributions.Categorical): +# '''MultiCategorical as collection of Categoricals''' +# def __init__(self, probs=None, logits=None, validate_args=None): +# self.categoricals = [] +# if probs is None: +# probs = [None] * len(logits) +# elif logits is None: +# logits = [None] * len(probs) +# else: +# raise ValueError('Either probs or logits must be None') -def normalize_state(body, state): - ''' - Normalizes one or more states using a running mean and standard deviation - Details of the normalization from Deep RL Bootcamp, L6 - https://www.youtube.com/watch?v=8EcdaCk9KaQ&feature=youtu.be - ''' - same_shape = False if type(state) == list else state.shape == body.state_mean.shape - has_preprocess = getattr(body.memory, 'preprocess_state', False) - if ('Atari' in body.memory.__class__.__name__): - # never normalize atari, it has its own normalization step - logger.debug('skipping normalizing for Atari, already handled by preprocess') - return state - elif ('Replay' in body.memory.__class__.__name__) and has_preprocess: - # normalization handled by preprocess_state function in the memory - logger.debug('skipping normalizing, already handled by preprocess') - return state - elif same_shape: - # if not atari, always normalize the state the first time we see it during act - # if the shape is not transformed in some way - if np.sum(body.state_std_dev) == 0: - return np.clip(state - body.state_mean, -10, 10) - else: - return np.clip((state - body.state_mean) / body.state_std_dev, -10, 10) - else: - # broadcastable sample from an un-normalized memory so we should normalize - logger.debug('normalizing sample from memory') - if np.sum(body.state_std_dev) == 0: - return np.clip(state - body.state_mean, -10, 10) - else: - return np.clip((state - body.state_mean) / body.state_std_dev, -10, 10) +# for sub_probs, sub_logits in zip(probs, logits): +# categorical = distributions.Categorical(probs=sub_probs, logits=sub_logits, validate_args=validate_args) +# self.categoricals.append(categorical) + +# @property +# def logits(self): +# return [cat.logits for cat in self.categoricals] + +# @property +# def probs(self): +# return [cat.probs for cat 
in self.categoricals] + +# @property +# def param_shape(self): +# return [cat.param_shape for cat in self.categoricals] + +# @property +# def mean(self): +# return torch.stack([cat.mean for cat in self.categoricals]) + +# @property +# def variance(self): +# return torch.stack([cat.variance for cat in self.categoricals]) + +# def sample(self, sample_shape=torch.Size()): +# return torch.stack([cat.sample(sample_shape=sample_shape) for cat in self.categoricals]) + +# def log_prob(self, value): +# return torch.stack([cat.log_prob(value[idx]) for idx, cat in enumerate(self.categoricals)]) + +# def entropy(self): +# return torch.stack([cat.entropy() for cat in self.categoricals]) + +# def enumerate_support(self): +# return [cat.enumerate_support() for cat in self.categoricals] -# TODO Not currently used, this will crash for more exotic memory structures -# def unnormalize_state(body, state): +# setattr(distributions, 'Argmax', Argmax) +# setattr(distributions, 'MultiCategorical', MultiCategorical) + + +# base methods + +# def try_preprocess(state, algorithm, body, append=True): +# '''Try calling preprocess as implemented in body's memory to use for net input''' +# # if isinstance(state, LazyFrames): +# # state = state.__array__() # from global env preprocessor +# if hasattr(body.memory, 'preprocess_state'): +# state = body.memory.preprocess_state(state, append=append) +# # as float, and always as minibatch for net input +# state = torch.from_numpy(state).float().unsqueeze_(dim=0) +# return state + + +# def cond_squeeze(out): +# '''Helper to squeeze output depending if it is tensor (discrete pdparam) or list of tensors (continuous pdparam of loc and scale)''' +# if isinstance(out, list): +# for out_t in out: +# out_t.squeeze_(dim=0) +# else: +# out.squeeze_(dim=0) +# return out + + +# def init_action_pd(state, algorithm, body, append=True): # ''' -# Un-normalizes one or more states using a running mean and new_std_dev +# Build the proper action prob. dist. to use for action sampling. +# state is passed through algorithm's net via calc_pdparam, which the algorithm must implement using its proper net. +# This will return body, ActionPD and pdparam to allow augmentation, e.g. applying temperature tau to pdparam for boltzmann. +# Then, output must be called with sample_action_pd(body, ActionPD, pdparam) to sample action. +# @returns {cls, tensor, *} ActionPD, pdparam, body # ''' -# return state * body.state_mean + body.state_std_dev +# pdtypes = ACTION_PDS[body.action_type] +# assert body.action_pdtype in pdtypes, f'Pdtype {body.action_pdtype} is not compatible/supported with action_type {body.action_type}. Options are: {ACTION_PDS[body.action_type]}' +# ActionPD = getattr(distributions, body.action_pdtype) +# state = try_preprocess(state, algorithm, body, append=append) +# state = state.to(algorithm.net.device) +# pdparam = algorithm.calc_pdparam(state, evaluate=False) +# return ActionPD, pdparam, body -def update_online_stats_and_normalize_state(body, state): - ''' - Convenience combination function for updating running state mean and std_dev and normalizing the state in one go. 
- ''' - logger.debug(f'state: {state}') - update_online_stats(body, state) - state = normalize_state(body, state) - logger.debug(f'normalized state: {state}') - return state +# def sample_action_pd(ActionPD, pdparam, body): +# ''' +# This uses the outputs from init_action_pd and an optionally augmented pdparam to construct a action_pd for sampling action +# @returns {tensor, distribution} action, action_pd A sampled action, and the prob. dist. used for sampling to enable calculations like kl, entropy, etc. later. +# ''' +# pdparam = cond_squeeze(pdparam) +# if body.is_discrete: +# action_pd = ActionPD(logits=pdparam) +# else: # continuous outputs a list, loc and scale +# assert len(pdparam) == 2, pdparam +# # scale (stdev) must be >0, use softplus +# if pdparam[1] < 5: +# pdparam[1] = torch.log(1 + torch.exp(pdparam[1])) + 1e-8 +# action_pd = ActionPD(*pdparam) +# action = action_pd.sample() +# return action, action_pd + + +# # interface action sampling methods + + +# def default(state, algorithm, body): +# '''Plain policy by direct sampling using outputs of net as logits and constructing ActionPD as appropriate''' +# ActionPD, pdparam, body = init_action_pd(state, algorithm, body) +# action, action_pd = sample_action_pd(ActionPD, pdparam, body) +# return action, action_pd + + +# def random(state, algorithm, body): +# '''Random action sampling that returns the same data format as default(), but without forward pass. Uses gym.space.sample()''' +# state = try_preprocess(state, algorithm, body, append=True) # for consistency with init_action_pd inner logic +# if body.action_type == 'discrete': +# action_pd = distributions.Categorical(logits=torch.ones(body.action_space.high, device=algorithm.net.device)) +# elif body.action_type == 'continuous': +# # Possibly this should this have a 'device' set +# action_pd = distributions.Uniform( +# low=torch.tensor(body.action_space.low).float(), +# high=torch.tensor(body.action_space.high).float()) +# elif body.action_type == 'multi_discrete': +# action_pd = distributions.Categorical( +# logits=torch.ones(body.action_space.high.size, body.action_space.high[0], device=algorithm.net.device)) +# elif body.action_type == 'multi_continuous': +# raise NotImplementedError +# elif body.action_type == 'multi_binary': +# raise NotImplementedError +# else: +# raise NotImplementedError +# sample = body.action_space.sample() +# action = torch.tensor(sample, device=algorithm.net.device) +# return action, action_pd + + +# def epsilon_greedy(state, algorithm, body): +# '''Epsilon-greedy policy: with probability epsilon, do random action, otherwise do default sampling.''' +# epsilon = body.explore_var +# if epsilon > np.random.rand(): +# return random(state, algorithm, body) +# else: +# return default(state, algorithm, body) + + +# def boltzmann(state, algorithm, body): +# ''' +# Boltzmann policy: adjust pdparam with temperature tau; the higher the more randomness/noise in action. 
+# ''' +# tau = body.explore_var +# ActionPD, pdparam, body = init_action_pd(state, algorithm, body) +# pdparam /= tau +# action, action_pd = sample_action_pd(ActionPD, pdparam, body) +# return action, action_pd + + +# def rule_guide_epsilon_greedy(state, algorithm, body): +# epsilon = body.explore_var +# if epsilon > np.random.rand(): +# action, action_pd = random(state, algorithm, body) +# else: +# action, action_pd = default(state, algorithm, body) + +# if body.env.clock.epi < algorithm.rule_guide_max_epi and \ +# body.env.clock.epi % algorithm.rule_guide_frequency == 0: +# if hasattr(body, 'state'): +# action, rp = rule_guide(body.state, algorithm, body) +# else: +# action, rp = rule_guide(state, algorithm, body) +# return action, action_pd + + +# def rule_guide_default(state, algorithm, body): +# action, action_pd = default(state, algorithm, body) + +# if body.env.clock.epi < algorithm.rule_guide_max_epi and \ +# body.env.clock.epi % algorithm.rule_guide_frequency == 0: +# if hasattr(body, 'state'): +# action, rp = rule_guide(body.state, algorithm, body) +# else: +# action, rp = rule_guide(state, algorithm, body) +# return action, action_pd + + +# def rule_guide(state, algorithm, body): +# env = body.env.u_env +# action = env.rule_policy(state, algorithm, body) +# probs = torch.zeros(body.action_space.high, device=algorithm.net.device) +# probs[action] = 1 +# action_pd = distributions.Categorical(probs=probs) +# action = torch.tensor(action, device=algorithm.net.device) +# return action, action_pd + + +# # multi-body policy with a single forward pass to calc pdparam + +# def multi_default(states, algorithm, body_list, pdparam): +# ''' +# Apply default policy body-wise +# Note, for efficiency, do a single forward pass to calculate pdparam, then call this policy like: +# @example -def normalize_states_and_next_states(body, batch, episodic_flag=None): - ''' - Convenience function for normalizing the states and next states in a batch of data - ''' - logger.debug(f'states: {batch["states"]}') - logger.debug(f'next states: {batch["next_states"]}') - episodic = episodic_flag if episodic_flag is not None else body.memory.is_episodic - logger.debug(f'Episodic: {episodic}, episodic_flag: {episodic_flag}, body.memory: {body.memory.is_episodic}') - if episodic: - normalized = [] - for epi in batch['states']: - normalized.append(normalize_state(body, epi)) - batch['states'] = normalized - normalized = [] - for epi in batch['next_states']: - normalized.append(normalize_state(body, epi)) - batch['next_states'] = normalized - else: - batch['states'] = normalize_state(body, batch['states']) - batch['next_states'] = normalize_state(body, batch['next_states']) - logger.debug(f'normalized states: {batch["states"]}') - logger.debug(f'normalized next states: {batch["next_states"]}') - return batch +# pdparam = self.calc_pdparam(state, evaluate=False) +# action_a, action_pd_a = self.action_policy(pdparam, self, body_list) +# ''' +# pdparam.squeeze_(dim=0) +# # assert pdparam has been chunked +# assert len(pdparam.shape) > 1 and len(pdparam) == len(body_list), f'pdparam shape: {pdparam.shape}, bodies: {len(body_list)}' +# action_list, action_pd_a = [], [] +# for idx, sub_pdparam in enumerate(pdparam): +# body = body_list[idx] +# try_preprocess(states[idx], algorithm, body, append=True) # for consistency with init_action_pd inner logic +# ActionPD = getattr(distributions, body.action_pdtype) +# action, action_pd = sample_action_pd(ActionPD, sub_pdparam, body) +# action_list.append(action) +# 
action_pd_a.append(action_pd) +# action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze_(dim=1) +# return action_a, action_pd_a + + +# def multi_random(states, algorithm, body_list, pdparam): +# '''Apply random policy body-wise.''' +# pdparam.squeeze_(dim=0) +# action_list, action_pd_a = [], [] +# for idx, body in body_list: +# action, action_pd = random(states[idx], algorithm, body) +# action_list.append(action) +# action_pd_a.append(action_pd) +# action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze_(dim=1) +# return action_a, action_pd_a + + +# def multi_epsilon_greedy(states, algorithm, body_list, pdparam): +# '''Apply epsilon-greedy policy body-wise''' +# assert len(pdparam) > 1 and len(pdparam) == len(body_list), f'pdparam shape: {pdparam.shape}, bodies: {len(body_list)}' +# action_list, action_pd_a = [], [] +# for idx, sub_pdparam in enumerate(pdparam): +# body = body_list[idx] +# epsilon = body.explore_var +# if epsilon > np.random.rand(): +# action, action_pd = random(states[idx], algorithm, body) +# else: +# try_preprocess(states[idx], algorithm, body, append=True) # for consistency with init_action_pd inner logic +# ActionPD = getattr(distributions, body.action_pdtype) +# action, action_pd = sample_action_pd(ActionPD, sub_pdparam, body) +# action_list.append(action) +# action_pd_a.append(action_pd) +# action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze_(dim=1) +# return action_a, action_pd_a + + +# def multi_boltzmann(states, algorithm, body_list, pdparam): +# '''Apply Boltzmann policy body-wise''' +# # pdparam.squeeze_(dim=0) +# assert len(pdparam) > 1 and len(pdparam) == len(body_list), f'pdparam shape: {pdparam.shape}, bodies: {len(body_list)}' +# action_list, action_pd_a = [], [] +# for idx, sub_pdparam in enumerate(pdparam): +# body = body_list[idx] +# try_preprocess(states[idx], algorithm, body, append=True) # for consistency with init_action_pd inner logic +# tau = body.explore_var +# sub_pdparam /= tau +# ActionPD = getattr(distributions, body.action_pdtype) +# action, action_pd = sample_action_pd(ActionPD, sub_pdparam, body) +# action_list.append(action) +# action_pd_a.append(action_pd) +# action_a = torch.tensor(action_list, device=algorithm.net.device).unsqueeze_(dim=1) +# return action_a, action_pd_a + + +# # action policy update methods + +# class VarScheduler: +# ''' +# Variable scheduler for decaying variables such as explore_var (epsilon, tau) and entropy + +# e.g. 
spec +# "explore_var_spec": { +# "name": "linear_decay", +# "start_val": 1.0, +# "end_val": 0.1, +# "start_step": 0, +# "end_step": 800, +# }, +# ''' + +# def __init__(self, var_decay_spec=None): +# self._updater_name = 'no_decay' if var_decay_spec is None else var_decay_spec['name'] +# self._updater = getattr(math_util, self._updater_name) +# util.set_attr(self, dict( +# start_val=np.nan, +# )) +# util.set_attr(self, var_decay_spec, [ +# 'start_val', +# 'end_val', +# 'start_step', +# 'end_step', +# ]) +# if not getattr(self, 'end_val', None): +# self.end_val = self.start_val + +# def update(self, algorithm, clock): +# '''Get an updated value for var''' +# if (util.get_lab_mode() in ('enjoy', 'eval')) or self._updater_name == 'no_decay': +# return self.end_val +# step = clock.get(clock.max_tick_unit) +# val = self._updater(self.start_val, self.end_val, self.start_step, self.end_step, step) +# return val + + +# # misc calc methods + +# def guard_multi_pdparams(pdparams, body): +# '''Guard pdparams for multi action''' +# action_dim = body.action_dim +# is_multi_action = ps.is_iterable(action_dim) +# if is_multi_action: +# assert ps.is_list(pdparams) +# pdparams = [t.clone() for t in pdparams] # clone for grad safety +# assert len(pdparams) == len(action_dim), pdparams +# # transpose into (batch_size, [action_dims]) +# pdparams = [list(torch.split(t, action_dim, dim=0)) for t in torch.cat(pdparams, dim=1)] +# return pdparams + + +# def calc_log_probs(algorithm, net, body, batch): +# ''' +# Method to calculate log_probs fresh from batch data +# Body already stores log_prob from self.net. This is used for PPO where log_probs needs to be recalculated. +# ''' +# states, actions = batch['states'], batch['actions'] +# action_dim = body.action_dim +# is_multi_action = ps.is_iterable(action_dim) +# # construct log_probs for each state-action +# pdparams = algorithm.calc_pdparam(states, net=net) +# pdparams = guard_multi_pdparams(pdparams, body) +# assert len(pdparams) == len(states), f'batch_size of pdparams: {len(pdparams)} vs states: {len(states)}' + +# pdtypes = ACTION_PDS[body.action_type] +# ActionPD = getattr(distributions, body.action_pdtype) + +# log_probs = [] +# for idx, pdparam in enumerate(pdparams): +# if not is_multi_action: # already cloned for multi_action above +# pdparam = pdparam.clone() # clone for grad safety +# _action, action_pd = sample_action_pd(ActionPD, pdparam, body) +# log_probs.append(action_pd.log_prob(actions[idx].float()).sum(dim=0)) +# log_probs = torch.stack(log_probs) +# assert not torch.isnan(log_probs).any(), f'log_probs: {log_probs}, \npdparams: {pdparams} \nactions: {actions}' +# logger.debug(f'log_probs: {log_probs}') +# return log_probs + + +# def update_online_stats(body, state): +# ''' +# Method to calculate the running mean and standard deviation of the state space. 
+# See https://www.johndcook.com/blog/standard_deviation/ for more details +# for n >= 1 +# M_n = M_n-1 + (state - M_n-1) / n +# S_n = S_n-1 + (state - M_n-1) * (state - M_n) +# variance = S_n / (n - 1) +# std_dev = sqrt(variance) +# ''' +# logger.debug(f'mean: {body.state_mean}, std: {body.state_std_dev}, num examples: {body.state_n}') +# # Assumes only one state is given +# if ("Atari" in body.memory.__class__.__name__): +# assert state.ndim == 3 +# elif getattr(body.memory, 'raw_state_dim', False): +# assert state.size == body.memory.raw_state_dim +# else: +# assert state.size == body.state_dim or state.shape == body.state_dim +# mean = body.state_mean +# body.state_n += 1 +# if np.isnan(mean).any(): +# assert np.isnan(body.state_std_dev_int) +# assert np.isnan(body.state_std_dev) +# body.state_mean = state +# body.state_std_dev_int = 0 +# body.state_std_dev = 0 +# else: +# assert body.state_n > 1 +# body.state_mean = mean + (state - mean) / body.state_n +# body.state_std_dev_int = body.state_std_dev_int + (state - mean) * (state - body.state_mean) +# body.state_std_dev = np.sqrt(body.state_std_dev_int / (body.state_n - 1)) +# # Guard against very small std devs +# if (body.state_std_dev < 1e-8).any(): +# body.state_std_dev[np.where(body.state_std_dev < 1e-8)] += 1e-8 +# logger.debug(f'new mean: {body.state_mean}, new std: {body.state_std_dev}, num examples: {body.state_n}') + + +# def normalize_state(body, state): +# ''' +# Normalizes one or more states using a running mean and standard deviation +# Details of the normalization from Deep RL Bootcamp, L6 +# https://www.youtube.com/watch?v=8EcdaCk9KaQ&feature=youtu.be +# ''' +# same_shape = False if type(state) == list else state.shape == body.state_mean.shape +# has_preprocess = getattr(body.memory, 'preprocess_state', False) +# if ('Atari' in body.memory.__class__.__name__): +# # never normalize atari, it has its own normalization step +# logger.debug('skipping normalizing for Atari, already handled by preprocess') +# return state +# elif ('Replay' in body.memory.__class__.__name__) and has_preprocess: +# # normalization handled by preprocess_state function in the memory +# logger.debug('skipping normalizing, already handled by preprocess') +# return state +# elif same_shape: +# # if not atari, always normalize the state the first time we see it during act +# # if the shape is not transformed in some way +# if np.sum(body.state_std_dev) == 0: +# return np.clip(state - body.state_mean, -10, 10) +# else: +# return np.clip((state - body.state_mean) / body.state_std_dev, -10, 10) +# else: +# # broadcastable sample from an un-normalized memory so we should normalize +# logger.debug('normalizing sample from memory') +# if np.sum(body.state_std_dev) == 0: +# return np.clip(state - body.state_mean, -10, 10) +# else: +# return np.clip((state - body.state_mean) / body.state_std_dev, -10, 10) + + +# # TODO Not currently used, this will crash for more exotic memory structures +# # def unnormalize_state(body, state): +# # ''' +# # Un-normalizes one or more states using a running mean and new_std_dev +# # ''' +# # return state * body.state_mean + body.state_std_dev + + +# def update_online_stats_and_normalize_state(body, state): +# ''' +# Convenience combination function for updating running state mean and std_dev and normalizing the state in one go. 
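The retired running-normalization helpers kept above as comments use Welford's online algorithm (the M_n and S_n recurrences in the docstring). A short standalone check that the recurrences reproduce a direct computation:

import numpy as np

samples = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
mean, s, n = 0.0, 0.0, 0
for x in samples:
    n += 1
    old_mean = mean
    mean = old_mean + (x - old_mean) / n        # M_n
    s = s + (x - old_mean) * (x - mean)         # S_n
std = np.sqrt(s / (n - 1))                      # variance = S_n / (n - 1)

print(mean, std)                                  # 5.0, ~2.138
print(np.mean(samples), np.std(samples, ddof=1))  # matches the direct computation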
+# ''' +# logger.debug(f'state: {state}') +# update_online_stats(body, state) +# state = normalize_state(body, state) +# logger.debug(f'normalized state: {state}') +# return state + + +# def normalize_states_and_next_states(body, batch, episodic_flag=None): +# ''' +# Convenience function for normalizing the states and next states in a batch of data +# ''' +# logger.debug(f'states: {batch["states"]}') +# logger.debug(f'next states: {batch["next_states"]}') +# episodic = episodic_flag if episodic_flag is not None else body.memory.is_episodic +# logger.debug(f'Episodic: {episodic}, episodic_flag: {episodic_flag}, body.memory: {body.memory.is_episodic}') +# if episodic: +# normalized = [] +# for epi in batch['states']: +# normalized.append(normalize_state(body, epi)) +# batch['states'] = normalized +# normalized = [] +# for epi in batch['next_states']: +# normalized.append(normalize_state(body, epi)) +# batch['next_states'] = normalized +# else: +# batch['states'] = normalize_state(body, batch['states']) +# batch['next_states'] = normalize_state(body, batch['next_states']) +# logger.debug(f'normalized states: {batch["states"]}') +# logger.debug(f'normalized next states: {batch["next_states"]}') +# return batch diff --git a/convlab/agent/algorithm/ppo.py b/convlab/agent/algorithm/ppo.py index 753d43a..6ff7336 100644 --- a/convlab/agent/algorithm/ppo.py +++ b/convlab/agent/algorithm/ppo.py @@ -1,6 +1,3 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from copy import deepcopy from convlab.agent import net from convlab.agent.algorithm import policy_util @@ -54,9 +51,9 @@ class PPO(ActorCritic): "start_step": 100, "end_step": 5000, }, + "minibatch_size": 256, "training_frequency": 1, "training_epoch": 8, - "normalize_state": true } e.g. special net_spec param "shared" to share/separate Actor/Critic @@ -75,13 +72,12 @@ def init_algorithm_params(self): action_policy='default', explore_var_spec=None, entropy_coef_spec=None, + minibatch_size=4, val_loss_coef=1.0, )) util.set_attr(self, self.algorithm_spec, [ 'action_pdtype', 'action_policy', - "rule_guide_max_epi", - "rule_guide_frequency", # theoretically, PPO does not have policy update; but in this implementation we have such option 'explore_var_spec', 'gamma', @@ -89,9 +85,9 @@ def init_algorithm_params(self): 'clip_eps_spec', 'entropy_coef_spec', 'val_loss_coef', + 'minibatch_size', 'training_frequency', # horizon 'training_epoch', - 'normalize_state', ]) self.to_train = 0 self.action_policy = getattr(policy_util, self.action_policy) @@ -109,12 +105,12 @@ def init_algorithm_params(self): @lab_api def init_nets(self, global_nets=None): '''PPO uses old and new to calculate ratio for loss''' - super(PPO, self).init_nets(global_nets) + super().init_nets(global_nets) # create old net to calculate ratio self.old_net = deepcopy(self.net) assert id(self.old_net) != id(self.net) - def calc_policy_loss(self, batch, advs): + def calc_policy_loss(self, batch, pdparams, advs): ''' The PPO loss function (subscript t is omitted) L^{CLIP+VF+S} = E[ L^CLIP - c1 * L^VF + c2 * S[pi](s) ] @@ -128,92 +124,84 @@ def calc_policy_loss(self, batch, advs): 3. 
S = E[ entropy ] ''' clip_eps = self.body.clip_eps + action_pd = policy_util.init_action_pd(self.body.ActionPD, pdparams) + states = batch['states'] + actions = batch['actions'] + if self.body.env.is_venv: + states = math_util.venv_unpack(states) + actions = math_util.venv_unpack(actions) # L^CLIP - log_probs = policy_util.calc_log_probs(self, self.net, self.body, batch) - old_log_probs = policy_util.calc_log_probs(self, self.old_net, self.body, batch).detach() + log_probs = action_pd.log_prob(actions) + with torch.no_grad(): + old_pdparams = self.calc_pdparam(states, net=self.old_net) + old_action_pd = policy_util.init_action_pd(self.body.ActionPD, old_pdparams) + old_log_probs = old_action_pd.log_prob(actions) assert log_probs.shape == old_log_probs.shape - assert advs.shape[0] == log_probs.shape[0] # batch size ratios = torch.exp(log_probs - old_log_probs) # clip to prevent overflow logger.debug(f'ratios: {ratios}') sur_1 = ratios * advs sur_2 = torch.clamp(ratios, 1.0 - clip_eps, 1.0 + clip_eps) * advs # flip sign because need to maximize - clip_loss = -torch.mean(torch.min(sur_1, sur_2)) + clip_loss = -torch.min(sur_1, sur_2).mean() logger.debug(f'clip_loss: {clip_loss}') # L^VF (inherit from ActorCritic) # S entropy bonus - entropies = torch.stack(self.body.entropies) - ent_penalty = torch.mean(-self.body.entropy_coef * entropies) + entropy = action_pd.entropy().mean() + self.body.mean_entropy = entropy # update logging variable + ent_penalty = -self.body.entropy_coef * entropy logger.debug(f'ent_penalty: {ent_penalty}') policy_loss = clip_loss + ent_penalty logger.debug(f'PPO Actor policy loss: {policy_loss:g}') return policy_loss - def train_shared(self): - ''' - Trains the network when the actor and critic share parameters - ''' - clock = self.body.env.clock - if self.to_train == 1: - # update old net - torch.cuda.empty_cache() - net_util.copy(self.net, self.old_net) - batch = self.sample() - total_loss = torch.tensor(0.0, device=self.net.device) - for _ in range(self.training_epoch): - with torch.no_grad(): - advs, v_targets = self.calc_advs_v_targets(batch) - policy_loss = self.calc_policy_loss(batch, advs) # from actor - val_loss = self.calc_val_loss(batch, v_targets) # from critic - loss = policy_loss + val_loss - # retain for entropies etc. 
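As a quick numeric illustration of the clipped surrogate computed in calc_policy_loss above (toy log-probs and advantages, clip_eps = 0.2, entropy bonus omitted):

import torch

clip_eps = 0.2
log_probs = torch.tensor([-0.5, -1.0, -2.0])
old_log_probs = torch.tensor([-0.7, -1.0, -1.5])
advs = torch.tensor([1.0, -0.5, 2.0])

ratios = torch.exp(log_probs - old_log_probs)                        # [1.221, 1.000, 0.607]
sur_1 = ratios * advs
sur_2 = torch.clamp(ratios, 1.0 - clip_eps, 1.0 + clip_eps) * advs   # ratios clipped to [0.8, 1.2]
clip_loss = -torch.min(sur_1, sur_2).mean()                          # negate to maximize the objective
print(clip_loss)                                                     # ~-0.638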
- self.net.training_step(loss=loss, lr_clock=clock, retain_graph=True) - total_loss += loss - loss = total_loss / self.training_epoch - # reset - self.to_train = 0 - self.body.flush() - logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}') - return loss.item() - else: + def train(self): + if util.in_eval_lab_modes(): return np.nan - - def train_separate(self): - ''' - Trains the network when the actor and critic share parameters - ''' clock = self.body.env.clock if self.to_train == 1: - torch.cuda.empty_cache() - net_util.copy(self.net, self.old_net) + net_util.copy(self.net, self.old_net) # update old net batch = self.sample() - policy_loss = self.train_actor(batch) - val_loss = self.train_critic(batch) - loss = val_loss + policy_loss + clock.set_batch_size(len(batch)) + _pdparams, v_preds = self.calc_pdparam_v(batch) + advs, v_targets = self.calc_advs_v_targets(batch, v_preds) + # piggy back on batch, but remember to not pack or unpack + batch['advs'], batch['v_targets'] = advs, v_targets + if self.body.env.is_venv: # unpack if venv for minibatch sampling + for k, v in batch.items(): + if k not in ('advs', 'v_targets'): + batch[k] = math_util.venv_unpack(v) + total_loss = torch.tensor(0.0) + for _ in range(self.training_epoch): + minibatches = util.split_minibatch(batch, self.minibatch_size) + for minibatch in minibatches: + if self.body.env.is_venv: # re-pack to restore proper shape + for k, v in minibatch.items(): + if k not in ('advs', 'v_targets'): + minibatch[k] = math_util.venv_pack(v, self.body.env.num_envs) + advs, v_targets = minibatch['advs'], minibatch['v_targets'] + pdparams, v_preds = self.calc_pdparam_v(minibatch) + policy_loss = self.calc_policy_loss(minibatch, pdparams, advs) # from actor + val_loss = self.calc_val_loss(v_preds, v_targets) # from critic + if self.shared: # shared network + loss = policy_loss + val_loss + self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) + else: + self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) + self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net) + loss = policy_loss + val_loss + total_loss += loss + loss = total_loss / self.training_epoch / len(minibatches) # reset self.to_train = 0 - self.body.flush() - logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}') + logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}') return loss.item() else: return np.nan - def train_actor(self, batch): - '''Trains the actor when the actor and critic are separate networks''' - total_policy_loss = torch.tensor(0.0, device=self.net.device) - for _ in range(self.training_epoch): - with torch.no_grad(): - advs, _v_targets = self.calc_advs_v_targets(batch) - policy_loss = self.calc_policy_loss(batch, advs) - # retain for entropies etc. 
- self.net.training_step(loss=policy_loss, lr_clock=self.body.env.clock, retain_graph=True) - val_loss = total_policy_loss / self.training_epoch - return policy_loss - @lab_api def update(self): self.body.explore_var = self.explore_var_scheduler.update(self, self.body.env.clock) diff --git a/convlab/agent/algorithm/random.py b/convlab/agent/algorithm/random.py index 67e41ed..db00bb6 100644 --- a/convlab/agent/algorithm/random.py +++ b/convlab/agent/algorithm/random.py @@ -1,10 +1,5 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - -''' -The random agent algorithm -For basic dev purpose. -''' +# The random agent algorithm +# For basic dev purpose from convlab.agent.algorithm.base import Algorithm from convlab.lib import logger from convlab.lib.decorator import lab_api @@ -23,16 +18,21 @@ def init_algorithm_params(self): '''Initialize other algorithm parameters''' self.to_train = 0 self.training_frequency = 1 + self.training_start_step = 0 @lab_api def init_nets(self, global_nets=None): '''Initialize the neural network from the spec''' - pass + self.net_names = [] @lab_api def act(self, state): '''Random action''' - action = self.body.action_space.sample() + body = self.body + if body.env.is_venv and not util.in_eval_lab_modes(): + action = np.array([body.action_space.sample() for _ in range(body.env.num_envs)]) + else: + action = body.action_space.sample() return action @lab_api @@ -44,6 +44,7 @@ def sample(self): @lab_api def train(self): self.sample() + self.body.env.clock.tick('opt_step') # to simulate metrics calc loss = np.nan return loss diff --git a/convlab/agent/algorithm/reinforce.py b/convlab/agent/algorithm/reinforce.py index 1a5177f..7dca884 100644 --- a/convlab/agent/algorithm/reinforce.py +++ b/convlab/agent/algorithm/reinforce.py @@ -1,6 +1,3 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from convlab.agent import net from convlab.agent.algorithm import policy_util from convlab.agent.algorithm.base import Algorithm @@ -8,11 +5,6 @@ from convlab.lib import logger, math_util, util from convlab.lib.decorator import lab_api import numpy as np -import pydash as ps -from copy import deepcopy -import torch -from torch import distributions -import torch.nn.functional as F logger = logger.get_logger(__name__) @@ -45,7 +37,6 @@ class Reinforce(Algorithm): "end_step": 5000, }, "training_frequency": 1, - "normalize_state": true } ''' @@ -58,18 +49,17 @@ def init_algorithm_params(self): action_policy='default', explore_var_spec=None, entropy_coef_spec=None, + policy_loss_coef=1.0, )) util.set_attr(self, self.algorithm_spec, [ 'action_pdtype', 'action_policy', - "rule_guide_max_epi", - "rule_guide_frequency", # theoretically, REINFORCE does not have policy update; but in this implementation we have such option 'explore_var_spec', 'gamma', # the discount factor 'entropy_coef_spec', + 'policy_loss_coef', 'training_frequency', - 'normalize_state', ]) self.to_train = 0 self.action_policy = getattr(policy_util, self.action_policy) @@ -87,100 +77,90 @@ def init_nets(self, global_nets=None): Networks for continuous action spaces have two heads and return two values, the first is a tensor containing the mean of the action policy, the second is a tensor containing the std deviation of the action policy. The distribution is assumed to be a Gaussian (Normal) distribution. 
Networks for discrete action spaces have a single head and return the logits for a categorical probability distribution over the discrete actions ''' - if global_nets is None: - in_dim = self.body.state_dim - out_dim = net_util.get_out_dim(self.body) - NetClass = getattr(net, self.net_spec['type']) - self.net = NetClass(self.net_spec, in_dim, out_dim) - self.net_names = ['net'] - else: - util.set_attr(self, global_nets) - self.net_names = list(global_nets.keys()) + in_dim = self.body.state_dim + out_dim = net_util.get_out_dim(self.body) + NetClass = getattr(net, self.net_spec['type']) + self.net = NetClass(self.net_spec, in_dim, out_dim) + self.net_names = ['net'] + # init net optimizer and its lr scheduler + self.optim = net_util.get_optim(self.net, self.net.optim_spec) + self.lr_scheduler = net_util.get_lr_scheduler(self.optim, self.net.lr_scheduler_spec) + net_util.set_global_nets(self, global_nets) self.post_init_nets() @lab_api - def calc_pdparam(self, x, evaluate=True, net=None): + def calc_pdparam(self, x, net=None): ''' The pdparam will be the logits for discrete prob. dist., or the mean and std for continuous prob. dist. ''' net = self.net if net is None else net - if evaluate: - pdparam = net.wrap_eval(x) - else: - # print(torch.nonzero(x)) - net.train() - pdparam = net(x) - logger.debug(f'pdparam: {pdparam}') + pdparam = net(x) return pdparam @lab_api def act(self, state): body = self.body - if self.normalize_state: - state = policy_util.update_online_stats_and_normalize_state(body, state) - action, action_pd = self.action_policy(state, self, body) - body.action_tensor, body.action_pd = action, action_pd # used for body.action_pd_update later - if len(action.shape) == 0: # scalar - return action.cpu().numpy().astype(body.action_space.dtype).item() - else: - return action.cpu().numpy() + action = self.action_policy(state, self, body) + return action.cpu().squeeze().numpy() # squeeze to handle scalar @lab_api def sample(self): '''Samples a batch from memory''' batch = self.body.memory.sample() - if self.normalize_state: - batch = policy_util.normalize_states_and_next_states(self.body, batch) batch = util.to_torch_batch(batch, self.net.device, self.body.memory.is_episodic) return batch + def calc_pdparam_batch(self, batch): + '''Efficiently forward to get pdparam and by batch for loss computation''' + states = batch['states'] + if self.body.env.is_venv: + states = math_util.venv_unpack(states) + pdparam = self.calc_pdparam(states) + return pdparam + + def calc_ret_advs(self, batch): + '''Calculate plain returns; which is generalized to advantage in ActorCritic''' + rets = math_util.calc_returns(batch['rewards'], batch['dones'], self.gamma) + advs = rets + if self.body.env.is_venv: + advs = math_util.venv_unpack(advs) + logger.debug(f'advs: {advs}') + return advs + + def calc_policy_loss(self, batch, pdparams, advs): + '''Calculate the actor's policy loss''' + action_pd = policy_util.init_action_pd(self.body.ActionPD, pdparams) + actions = batch['actions'] + if self.body.env.is_venv: + actions = math_util.venv_unpack(actions) + log_probs = action_pd.log_prob(actions) + policy_loss = - self.policy_loss_coef * (log_probs * advs).mean() + if self.entropy_coef_spec: + entropy = action_pd.entropy().mean() + self.body.mean_entropy = entropy # update logging variable + policy_loss += (-self.body.entropy_coef * entropy) + logger.debug(f'Actor policy loss: {policy_loss:g}') + return policy_loss + @lab_api def train(self): if util.in_eval_lab_modes(): - self.body.flush() return np.nan clock = 
self.body.env.clock if self.to_train == 1: batch = self.sample() - loss = self.calc_policy_loss(batch) - self.net.training_step(loss=loss, lr_clock=clock) + clock.set_batch_size(len(batch)) + pdparams = self.calc_pdparam_batch(batch) + advs = self.calc_ret_advs(batch) + loss = self.calc_policy_loss(batch, pdparams, advs) + self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) # reset self.to_train = 0 - self.body.flush() - logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}') + logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}') return loss.item() else: return np.nan - def calc_policy_loss(self, batch): - '''Calculate the policy loss for a batch of data.''' - # use simple returns as advs - # advs = math_util.calc_returns(batch, self.gamma)[1:] - advs = math_util.calc_returns(batch, self.gamma) - # advs = math_util.standardize(advs) - logger.debug(f'advs: {advs}') - assert len(self.body.log_probs) == len(advs), f'batch_size of log_probs {len(self.body.log_probs)} vs advs: {len(advs)}' - - # log_probs = torch.stack(self.body.log_probs)[1:] - log_probs = torch.stack(self.body.log_probs) - policy_loss = - log_probs * advs - - # pdparam = self.net(batch['states'][1:]) - # policy_loss = - torch.index_select(F.log_softmax(pdparam), 1, batch['actions'][1:].long()) * advs - # policy_loss = F.cross_entropy(pdparam, batch['actions'].long()) - - # action_pd = distributions.Categorical(logits=pdparam) - # log_probs = action_pd.log_prob(batch['actions'][1:].long()) - # print(log_probs) - # policy_loss = - log_probs * advs - - if self.entropy_coef_spec is not None: - entropies = torch.stack(self.body.entropies) - policy_loss += (-self.body.entropy_coef * entropies) - policy_loss = torch.sum(policy_loss) - logger.debug(f'Actor policy loss: {policy_loss:g}') - return policy_loss - @lab_api def update(self): self.body.explore_var = self.explore_var_scheduler.update(self, self.body.env.clock) diff --git a/convlab/agent/algorithm/sarsa.py b/convlab/agent/algorithm/sarsa.py index 7f5eb27..91c019f 100644 --- a/convlab/agent/algorithm/sarsa.py +++ b/convlab/agent/algorithm/sarsa.py @@ -1,11 +1,8 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. 
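Note: the REINFORCE update above (calc_ret_advs feeding calc_policy_loss) is the plain policy-gradient estimator -E[log pi(a|s) * G_t]. A minimal standalone sketch follows, with a hand-rolled discounted-return loop standing in for math_util.calc_returns (assumed here to reset the running return at episode boundaries):

    # REINFORCE sketch: discounted returns used directly as advantages
    import torch
    from torch import distributions

    gamma = 0.99
    rewards = torch.tensor([0.0, 0.0, 1.0])
    dones = torch.tensor([0.0, 0.0, 1.0])
    logits = torch.randn(3, 4, requires_grad=True)   # pdparams from the policy net (discrete logits)
    actions = torch.randint(0, 4, (3,))

    rets = torch.zeros_like(rewards)                 # G_t, computed backwards, reset at done
    running = torch.tensor(0.0)
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * (1 - dones[t])
        rets[t] = running

    action_pd = distributions.Categorical(logits=logits)
    policy_loss = -(action_pd.log_prob(actions) * rets).mean()
    policy_loss.backward()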
- from convlab.agent import net from convlab.agent.algorithm import policy_util from convlab.agent.algorithm.base import Algorithm from convlab.agent.net import net_util -from convlab.lib import logger, util +from convlab.lib import logger, math_util, util from convlab.lib.decorator import lab_api import numpy as np import pydash as ps @@ -42,7 +39,6 @@ class SARSA(Algorithm): }, "gamma": 0.99, "training_frequency": 10, - "normalize_state": true } ''' @@ -63,7 +59,6 @@ def init_algorithm_params(self): 'explore_var_spec', 'gamma', # the discount factor 'training_frequency', # how often to train for batch training (once each training_frequency time steps) - 'normalize_state', ]) self.to_train = 0 self.action_policy = getattr(policy_util, self.action_policy) @@ -75,54 +70,33 @@ def init_nets(self, global_nets=None): '''Initialize the neural network used to learn the Q function from the spec''' if 'Recurrent' in self.net_spec['type']: self.net_spec.update(seq_len=self.net_spec['seq_len']) - if global_nets is None: - in_dim = self.body.state_dim - out_dim = net_util.get_out_dim(self.body) - NetClass = getattr(net, self.net_spec['type']) - self.net = NetClass(self.net_spec, in_dim, out_dim) - self.net_names = ['net'] - else: - util.set_attr(self, global_nets) - self.net_names = list(global_nets.keys()) + in_dim = self.body.state_dim + out_dim = net_util.get_out_dim(self.body) + NetClass = getattr(net, self.net_spec['type']) + self.net = NetClass(self.net_spec, in_dim, out_dim) + self.net_names = ['net'] + # init net optimizer and its lr scheduler + self.optim = net_util.get_optim(self.net, self.net.optim_spec) + self.lr_scheduler = net_util.get_lr_scheduler(self.optim, self.net.lr_scheduler_spec) + net_util.set_global_nets(self, global_nets) self.post_init_nets() @lab_api - def calc_pdparam(self, x, evaluate=True, net=None): + def calc_pdparam(self, x, net=None): ''' To get the pdparam for action policy sampling, do a forward pass of the appropriate net, and pick the correct outputs. The pdparam will be the logits for discrete prob. dist., or the mean and std for continuous prob. dist. 
''' net = self.net if net is None else net - if evaluate: - pdparam = net.wrap_eval(x) - else: - net.train() - pdparam = net(x) - logger.debug(f'pdparam: {pdparam}') + pdparam = net(x) return pdparam @lab_api def act(self, state): '''Note, SARSA is discrete-only''' body = self.body - if self.normalize_state: - state = policy_util.update_online_stats_and_normalize_state(body, state) - action, action_pd = self.action_policy(state, self, body) - body.action_tensor, body.action_pd = action, action_pd # used for body.action_pd_update later - if len(action.shape) == 0: # scalar - return action.cpu().numpy().astype(body.action_space.dtype).item() - else: - return action.cpu().numpy() - - def calc_q_loss(self, batch): - '''Compute the Q value loss using predicted and target Q values from the appropriate networks''' - q_preds = self.net.wrap_eval(batch['states']) - act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1) - next_q_preds = self.net.wrap_eval(batch['next_states']) - act_next_q_preds = q_preds.gather(-1, batch['next_actions'].long().unsqueeze(-1)).squeeze(-1) - act_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * act_next_q_preds - q_loss = self.net.loss_fn(act_q_preds, act_q_targets) - return q_loss + action = self.action_policy(state, self, body) + return action.cpu().squeeze().numpy() # squeeze to handle scalar @lab_api def sample(self): @@ -131,11 +105,29 @@ def sample(self): # this is safe for next_action at done since the calculated act_next_q_preds will be multiplied by (1 - batch['dones']) batch['next_actions'] = np.zeros_like(batch['actions']) batch['next_actions'][:-1] = batch['actions'][1:] - if self.normalize_state: - batch = policy_util.normalize_states_and_next_states(self.body, batch) batch = util.to_torch_batch(batch, self.net.device, self.body.memory.is_episodic) return batch + def calc_q_loss(self, batch): + '''Compute the Q value loss using predicted and target Q values from the appropriate networks''' + states = batch['states'] + next_states = batch['next_states'] + if self.body.env.is_venv: + states = math_util.venv_unpack(states) + next_states = math_util.venv_unpack(next_states) + q_preds = self.net(states) + with torch.no_grad(): + next_q_preds = self.net(next_states) + if self.body.env.is_venv: + q_preds = math_util.venv_pack(q_preds, self.body.env.num_envs) + next_q_preds = math_util.venv_pack(next_q_preds, self.body.env.num_envs) + act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1) + act_next_q_preds = next_q_preds.gather(-1, batch['next_actions'].long().unsqueeze(-1)).squeeze(-1) + act_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * act_next_q_preds + logger.debug(f'act_q_preds: {act_q_preds}\nact_q_targets: {act_q_targets}') + q_loss = self.net.loss_fn(act_q_preds, act_q_targets) + return q_loss + @lab_api def train(self): ''' @@ -143,17 +135,16 @@ def train(self): Otherwise this function does nothing. 
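Note: the new calc_q_loss above implements the on-policy SARSA target Q(s,a) <- r + gamma * (1 - done) * Q(s', a'), where a' is the action actually taken at the next step (shifted into batch['next_actions'] in sample()). A minimal standalone sketch, with a throwaway linear Q-network in place of the lab's net classes:

    # SARSA TD(0) loss sketch
    import torch
    import torch.nn as nn

    gamma = 0.99
    q_net = nn.Linear(4, 3)                          # toy Q-network: state_dim=4, 3 discrete actions
    states, next_states = torch.randn(5, 4), torch.randn(5, 4)
    actions = torch.randint(0, 3, (5,))
    next_actions = torch.randint(0, 3, (5,))         # the actions actually taken at the next step
    rewards, dones = torch.randn(5), torch.zeros(5)

    q_preds = q_net(states)
    with torch.no_grad():                            # do not differentiate through the target
        next_q_preds = q_net(next_states)
    act_q_preds = q_preds.gather(-1, actions.unsqueeze(-1)).squeeze(-1)
    act_next_q_preds = next_q_preds.gather(-1, next_actions.unsqueeze(-1)).squeeze(-1)
    act_q_targets = rewards + gamma * (1 - dones) * act_next_q_preds
    q_loss = nn.functional.mse_loss(act_q_preds, act_q_targets)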
''' if util.in_eval_lab_modes(): - self.body.flush() return np.nan clock = self.body.env.clock if self.to_train == 1: batch = self.sample() + clock.set_batch_size(len(batch)) loss = self.calc_q_loss(batch) - self.net.training_step(loss=loss, lr_clock=clock) + self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) # reset self.to_train = 0 - self.body.flush() - logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}') + logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}') return loss.item() else: return np.nan diff --git a/convlab/agent/algorithm/sil.py b/convlab/agent/algorithm/sil.py index 332c400..3a5922e 100644 --- a/convlab/agent/algorithm/sil.py +++ b/convlab/agent/algorithm/sil.py @@ -1,6 +1,3 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from convlab.agent import net, memory from convlab.agent.algorithm import policy_util from convlab.agent.algorithm.actor_critic import ActorCritic @@ -39,16 +36,15 @@ class SIL(ActorCritic): "val_loss_coef": 0.01, "sil_policy_loss_coef": 1.0, "sil_val_loss_coef": 0.01, - "training_batch_epoch": 8, + "training_batch_iter": 8, "training_frequency": 1, - "training_epoch": 8, - "normalize_state": true + "training_iter": 8, } e.g. special memory_spec "memory": { "name": "OnPolicyReplay", - "sil_replay_name": "SILReplay", + "sil_replay_name": "Replay", "batch_size": 32, "max_size": 10000, "use_cer": true @@ -56,7 +52,7 @@ class SIL(ActorCritic): ''' def __init__(self, agent, global_nets=None): - super(SIL, self).__init__(agent, global_nets) + super().__init__(agent, global_nets) # create the extra replay memory for SIL MemoryClass = getattr(memory, self.memory_spec['sil_replay_name']) self.body.replay_memory = MemoryClass(self.memory_spec, self.body) @@ -87,98 +83,68 @@ def init_algorithm_params(self): 'sil_policy_loss_coef', 'sil_val_loss_coef', 'training_frequency', - 'training_batch_epoch', - 'training_epoch', - 'normalize_state' + 'training_batch_iter', + 'training_iter', ]) - super(SIL, self).init_algorithm_params() + super().init_algorithm_params() def sample(self): '''Modify the onpolicy sample to also append to replay''' batch = self.body.memory.sample() batch = {k: np.concatenate(v) for k, v in batch.items()} # concat episodic memory - batch['rets'] = math_util.calc_returns(batch, self.gamma) for idx in range(len(batch['dones'])): tuples = [batch[k][idx] for k in self.body.replay_memory.data_keys] self.body.replay_memory.add_experience(*tuples) - if self.normalize_state: - batch = policy_util.normalize_states_and_next_states(self.body, batch) batch = util.to_torch_batch(batch, self.net.device, self.body.replay_memory.is_episodic) return batch def replay_sample(self): '''Samples a batch from memory''' batch = self.body.replay_memory.sample() - if self.normalize_state: - batch = policy_util.normalize_states_and_next_states( - self.body, batch, episodic_flag=self.body.replay_memory.is_episodic) batch = util.to_torch_batch(batch, self.net.device, self.body.replay_memory.is_episodic) - assert not torch.isnan(batch['states']).any(), batch['states'] return batch - def calc_sil_policy_val_loss(self, batch): + def calc_sil_policy_val_loss(self, batch, pdparams): ''' Calculate the SIL policy losses for actor and critic sil_policy_loss = -log_prob * max(R - v_pred, 0) 
sil_val_loss = (max(R - v_pred, 0)^2) / 2 This is called on a randomly-sample batch from experience replay ''' - returns = batch['rets'] - v_preds = self.calc_v(batch['states'], evaluate=False) - clipped_advs = torch.clamp(returns - v_preds, min=0.0) - log_probs = policy_util.calc_log_probs(self, self.net, self.body, batch) - - sil_policy_loss = self.sil_policy_loss_coef * torch.mean(- log_probs * clipped_advs) - sil_val_loss = self.sil_val_loss_coef * torch.pow(clipped_advs, 2) / 2 - sil_val_loss = torch.mean(sil_val_loss) + v_preds = self.calc_v(batch['states'], use_cache=False) + rets = math_util.calc_returns(batch['rewards'], batch['dones'], self.gamma) + clipped_advs = torch.clamp(rets - v_preds, min=0.0) + + action_pd = policy_util.init_action_pd(self.body.ActionPD, pdparams) + actions = batch['actions'] + if self.body.env.is_venv: + actions = math_util.venv_unpack(actions) + log_probs = action_pd.log_prob(actions) + + sil_policy_loss = - self.sil_policy_loss_coef * (log_probs * clipped_advs).mean() + sil_val_loss = self.sil_val_loss_coef * clipped_advs.pow(2).mean() / 2 logger.debug(f'SIL actor policy loss: {sil_policy_loss:g}') logger.debug(f'SIL critic value loss: {sil_val_loss:g}') return sil_policy_loss, sil_val_loss - def train_shared(self): - ''' - Trains the network when the actor and critic share parameters - ''' + def train(self): clock = self.body.env.clock if self.to_train == 1: # onpolicy update - super_loss = super(SIL, self).train_shared() + super_loss = super().train() # offpolicy sil update with random minibatch - total_sil_loss = torch.tensor(0.0, device=self.net.device) - for _ in range(self.training_epoch): + total_sil_loss = torch.tensor(0.0) + for _ in range(self.training_iter): batch = self.replay_sample() - for _ in range(self.training_batch_epoch): - sil_policy_loss, sil_val_loss = self.calc_sil_policy_val_loss(batch) + for _ in range(self.training_batch_iter): + pdparams, _v_preds = self.calc_pdparam_v(batch) + sil_policy_loss, sil_val_loss = self.calc_sil_policy_val_loss(batch, pdparams) sil_loss = sil_policy_loss + sil_val_loss - self.net.training_step(loss=sil_loss, lr_clock=clock) + self.net.train_step(sil_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) total_sil_loss += sil_loss - sil_loss = total_sil_loss / self.training_epoch - loss = super_loss + sil_loss - logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}') - return loss.item() - else: - return np.nan - - def train_separate(self): - ''' - Trains the network when the actor and critic are separate networks - ''' - clock = self.body.env.clock - if self.to_train == 1: - # onpolicy update - super_loss = super(SIL, self).train_separate() - # offpolicy sil update with random minibatch - total_sil_loss = torch.tensor(0.0, device=self.net.device) - for _ in range(self.training_epoch): - batch = self.replay_sample() - for _ in range(self.training_batch_epoch): - sil_policy_loss, sil_val_loss = self.calc_sil_policy_val_loss(batch) - self.net.training_step(loss=sil_policy_loss, lr_clock=clock, retain_graph=True) - self.critic.training_step(loss=sil_val_loss, lr_clock=clock) - total_sil_loss += sil_policy_loss + sil_val_loss - sil_loss = total_sil_loss / self.training_epoch + sil_loss = total_sil_loss / self.training_iter loss = super_loss + sil_loss - logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: 
{self.body.memory.total_reward}, loss: {loss:g}') + logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}') return loss.item() else: return np.nan @@ -213,15 +179,15 @@ class PPOSIL(SIL, PPO): "sil_policy_loss_coef": 1.0, "sil_val_loss_coef": 0.01, "training_frequency": 1, - "training_batch_epoch": 8, + "training_batch_iter": 8, + "training_iter": 8, "training_epoch": 8, - "normalize_state": true } e.g. special memory_spec "memory": { "name": "OnPolicyReplay", - "sil_replay_name": "SILReplay", + "sil_replay_name": "Replay", "batch_size": 32, "max_size": 10000, "use_cer": true diff --git a/convlab/agent/memory/base.py b/convlab/agent/memory/base.py index d307cbd..b09e3a3 100644 --- a/convlab/agent/memory/base.py +++ b/convlab/agent/memory/base.py @@ -1,6 +1,3 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from abc import ABC, abstractmethod from collections import deque from convlab.lib import logger, util @@ -11,12 +8,7 @@ class Memory(ABC): - ''' - Abstract class ancestor to all Memories, - specifies the necessary design blueprint for agent body to work in Lab. - Mostly, implement just the abstract methods and properties. - Memory is singleton to each body for modularity, and there is no gains to do multi-body memory now. Shall be constructed when body_space is built. - ''' + '''Abstract Memory class to define the API methods''' def __init__(self, memory_spec, body): ''' @@ -24,65 +16,20 @@ def __init__(self, memory_spec, body): ''' self.memory_spec = memory_spec self.body = body - # declare what data keys to store self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones', 'priorities'] - # the basic variables for every memory - self.last_state = None - # method to log size warning only once to prevent spamming log - self.warn_size_once = ps.once(lambda msg: logger.warn(msg)) - # for API consistency, reset to some max_len in your specific memory class - self.state_buffer = deque(maxlen=0) - # total_reward and its history over episodes - self.total_reward = 0 @abstractmethod def reset(self): '''Method to fully reset the memory storage and related variables''' raise NotImplementedError - def epi_reset(self, state): - '''Method to reset at new episode''' - self.last_state = state - self.body.epi_reset() - self.total_reward = 0 - self.state_buffer.clear() - for _ in range(self.state_buffer.maxlen): - self.state_buffer.append(np.zeros(self.body.state_dim)) - - def base_update(self, action, reward, state, done): - '''Method to do base memory update, like stats''' - from convlab.experiment import analysis - if np.isnan(reward): # the start of episode - self.epi_reset(state) - return - - self.total_reward += reward - return - @abstractmethod - def update(self, action, reward, state, done): - '''Implement memory update given the full info from the latest timestep. Hint: use self.last_state to construct SARS. NOTE: guard for np.nan reward and done when individual env resets.''' - self.base_update(action, reward, state, done) + def update(self, state, action, reward, next_state, done): + '''Implement memory update given the full info from the latest timestep. 
NOTE: guard for np.nan reward and done when individual env resets.''' raise NotImplementedError @abstractmethod def sample(self): '''Implement memory sampling mechanism''' raise NotImplementedError - - def preprocess_append(self, state, append=True): - '''Method to conditionally append to state buffer''' - if append: - assert id(state) != id(self.state_buffer[-1]), 'Do not append to buffer other than during action' - self.state_buffer.append(state) - - def preprocess_state(self, state, append=True): - '''Transforms the raw state into format that is fed into the network''' - return state - - def print_memory_info(self): - '''Prints size of all of the memory arrays''' - for k in self.data_keys: - d = getattr(self, k) - logger.info(f'Memory for body {self.body.aeb}: {k} :shape: {d.shape}, dtype: {d.dtype}, size: {util.sizeof(d)}MB') diff --git a/convlab/agent/memory/onpolicy.py b/convlab/agent/memory/onpolicy.py index 6bddb9d..da5033a 100644 --- a/convlab/agent/memory/onpolicy.py +++ b/convlab/agent/memory/onpolicy.py @@ -1,6 +1,3 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from collections import deque from copy import deepcopy from convlab.agent.memory.base import Memory @@ -39,14 +36,13 @@ class OnPolicyReplay(Memory): ''' def __init__(self, memory_spec, body): - super(OnPolicyReplay, self).__init__(memory_spec, body) + super().__init__(memory_spec, body) # NOTE for OnPolicy replay, frequency = episode; for other classes below frequency = frames util.set_attr(self, self.body.agent.agent_spec['algorithm'], ['training_frequency']) - self.state_buffer = deque(maxlen=0) # for API consistency # Don't want total experiences reset when memory is self.is_episodic = True - self.true_size = 0 # to number of experiences stored - self.seen_size = 0 # the number of experiences seen, including those stored and discarded + self.size = 0 # total experiences stored + self.seen_size = 0 # total experiences seen cumulatively # declare what data keys to store self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones'] self.reset() @@ -57,27 +53,21 @@ def reset(self): for k in self.data_keys: setattr(self, k, []) self.cur_epi_data = {k: [] for k in self.data_keys} - self.most_recent = [None] * len(self.data_keys) - self.true_size = 0 # Size of the current memory - self.state_buffer.clear() - for _ in range(self.state_buffer.maxlen): - self.state_buffer.append(np.zeros(self.body.state_dim)) + self.most_recent = (None,) * len(self.data_keys) + self.size = 0 @lab_api - def update(self, action, reward, state, done): + def update(self, state, action, reward, next_state, done): '''Interface method to update memory''' - self.base_update(action, reward, state, done) - if not np.isnan(reward): # not the start of episode - self.add_experience(self.last_state, action, reward, state, done) - self.last_state = state + self.add_experience(state, action, reward, next_state, done) def add_experience(self, state, action, reward, next_state, done): '''Interface helper method for update() to add experience to memory''' - self.most_recent = [state, action, reward, next_state, done] + self.most_recent = (state, action, reward, next_state, done) for idx, k in enumerate(self.data_keys): self.cur_epi_data[k].append(self.most_recent[idx]) # If episode ended, add to memory and clear cur_epi_data - if done: + if util.epi_done(done): for k in self.data_keys: getattr(self, k).append(self.cur_epi_data[k]) self.cur_epi_data = {k: [] for k in self.data_keys} @@ -86,9 +76,7 @@ def 
add_experience(self, state, action, reward, next_state, done): if len(self.states) == self.body.agent.algorithm.training_frequency: self.body.agent.algorithm.to_train = 1 # Track memory size and num experiences - self.true_size += 1 - if self.true_size > 1000: - self.warn_size_once('Large memory size: {}'.format(self.true_size)) + self.size += 1 self.seen_size += 1 def get_most_recent_experience(self): @@ -112,78 +100,6 @@ def sample(self): return batch -class OnPolicySeqReplay(OnPolicyReplay): - ''' - Same as OnPolicyReplay Memory but returns the last `seq_len` states and next_states for input to a recurrent network. - Experiences with less than `seq_len` previous examples are padded with a 0 valued state and action vector. - - e.g. memory_spec - "memory": { - "name": "OnPolicySeqReplay" - } - * seq_len provided by net_spec - ''' - - def __init__(self, memory_spec, body): - super(OnPolicySeqReplay, self).__init__(memory_spec, body) - self.seq_len = self.body.agent.agent_spec['net']['seq_len'] - self.state_buffer = deque(maxlen=self.seq_len) - self.reset() - - def preprocess_state(self, state, append=True): - ''' - Transforms the raw state into format that is fed into the network - NOTE for onpolicy memory this method only gets called in policy util, not here. - ''' - self.preprocess_append(state, append) - return np.stack(self.state_buffer) - - def sample(self): - ''' - Returns all the examples from memory in a single batch. Batch is stored as a dict. - Keys are the names of the different elements of an experience. Values are nested lists of the corresponding sampled elements. Elements are nested into episodes - states and next_states have are further nested into sequences containing the previous `seq_len` - 1 relevant states - e.g. - let s_seq_0 be [0, ..., s0] (zero-padded), s_seq_k be [s_{k-seq_len}, ..., s_k], so the states are nested for passing into RNN. - batch = { - 'states' : [ - [s_seq_0, s_seq_1, ..., s_seq_k]_epi_1, - [s_seq_0, s_seq_1, ..., s_seq_k]_epi_2, - ...] - 'actions' : [[a_epi1], [a_epi2], ...], - 'rewards' : [[r_epi1], [r_epi2], ...], - 'next_states: [ - [ns_seq_0, ns_seq_1, ..., ns_seq_k]_epi_1, - [ns_seq_0, ns_seq_1, ..., ns_seq_k]_epi_2, - ...] - 'dones' : [[d_epi1], [d_epi2], ...]} - ''' - batch = {} - batch['states'] = self.build_seqs(self.states) - batch['actions'] = self.actions - batch['rewards'] = self.rewards - batch['next_states'] = self.build_seqs(self.next_states) - batch['dones'] = self.dones - self.reset() - return batch - - def build_seqs(self, data): - '''Construct the epi-nested-seq data for sampling''' - all_epi_data_seq = [] - for epi_data in data: - data_seq = [] - # make [0, ..., *epi_data] - padded_epi_data = deepcopy(epi_data) - padding = np.zeros_like(epi_data[0]) - for i in range(self.seq_len - 1): - padded_epi_data.insert(0, padding) - # slide seqs and build for one epi - for i in range(len(epi_data)): - data_seq.append(padded_epi_data[i:i + self.seq_len]) - all_epi_data_seq.append(data_seq) - return all_epi_data_seq - - class OnPolicyBatchReplay(OnPolicyReplay): ''' Same as OnPolicyReplay Memory with the following difference. 
@@ -200,7 +116,7 @@ class OnPolicyBatchReplay(OnPolicyReplay): ''' def __init__(self, memory_spec, body): - super(OnPolicyBatchReplay, self).__init__(memory_spec, body) + super().__init__(memory_spec, body) self.is_episodic = False def add_experience(self, state, action, reward, next_state, done): @@ -209,12 +125,10 @@ def add_experience(self, state, action, reward, next_state, done): for idx, k in enumerate(self.data_keys): getattr(self, k).append(self.most_recent[idx]) # Track memory size and num experiences - self.true_size += 1 - if self.true_size > 1000: - self.warn_size_once('Large memory size: {}'.format(self.true_size)) + self.size += 1 self.seen_size += 1 # Decide if agent is to train - if done or len(self.states) == self.body.agent.algorithm.training_frequency: + if len(self.states) == self.body.agent.algorithm.training_frequency: self.body.agent.algorithm.to_train = 1 def sample(self): @@ -229,142 +143,4 @@ def sample(self): 'next_states': next_states, 'dones' : dones} ''' - return super(OnPolicyBatchReplay, self).sample() - - -class OnPolicySeqBatchReplay(OnPolicyBatchReplay): - ''' - Same as OnPolicyBatchReplay Memory but returns the last `seq_len` states and next_states for input to a recurrent network. - Experiences with less than `seq_len` previous examples are padded with a 0 valued state and action vector. - - e.g. memory_spec - "memory": { - "name": "OnPolicySeqBatchReplay" - } - * seq_len provided by net_spec - * batch_size is training_frequency provided by algorithm_spec - ''' - - def __init__(self, memory_spec, body): - super(OnPolicySeqBatchReplay, self).__init__(memory_spec, body) - self.is_episodic = False - self.seq_len = self.body.agent.agent_spec['net']['seq_len'] - self.state_buffer = deque(maxlen=self.seq_len) - self.reset() - - def preprocess_state(self, state, append=True): - # delegate to OnPolicySeqReplay sequential method - return OnPolicySeqReplay.preprocess_state(self, state, append) - - def sample(self): - ''' - Batched version of OnPolicySeqBatchReplay.sample() - e.g. - let s_seq_0 be [0, ..., s0] (zero-padded), s_seq_k be [s_{k-seq_len}, ..., s_k], so the states are nested for passing into RNN. - batch = { - 'states' : [[s_seq_0, s_seq_1, ..., s_seq_k]], - 'actions' : actions, - 'rewards' : rewards, - 'next_states': [[ns_seq_0, ns_seq_1, ..., ns_seq_k]], - 'dones' : dones} - ''' - # delegate method - return OnPolicySeqReplay.sample(self) - - def build_seqs(self, data): - '''Construct the seq data for sampling''' - data_seq = [] - # make [0, ..., *data] - padded_data = deepcopy(data) - padding = np.zeros_like(data[0]) - for i in range(self.seq_len - 1): - padded_data.insert(0, padding) - # slide seqs and build for one epi - for i in range(len(data)): - data_seq.append(padded_data[i:i + self.seq_len]) - return data_seq - - -class OnPolicyConcatReplay(OnPolicyReplay): - ''' - Preprocesses a state to be the concatenation of the last n states. Otherwise the same as Replay memory - - e.g. 
memory_spec - "memory": { - "name": "OnPolicyConcatReplay", - "concat_len": 4 - } - ''' - - def __init__(self, memory_spec, body): - util.set_attr(self, memory_spec, [ - 'concat_len', # number of stack states - ]) - self.raw_state_dim = deepcopy(body.state_dim) # used for state_buffer - body.state_dim = body.state_dim * self.concat_len # modify to use for net init for concat input - super(OnPolicyConcatReplay, self).__init__(memory_spec, body) - self.state_buffer = deque(maxlen=self.concat_len) - self.reset() - - def reset(self): - '''Initializes the memory arrays, size and head pointer''' - super(OnPolicyConcatReplay, self).reset() - self.state_buffer.clear() - for _ in range(self.state_buffer.maxlen): - self.state_buffer.append(np.zeros(self.raw_state_dim)) - - def epi_reset(self, state): - '''Method to reset at new episode''' - state = self.preprocess_state(state, append=False) # prevent conflict with preprocess in epi_reset - super(OnPolicyConcatReplay, self).epi_reset(state) - # reappend buffer with custom shape - self.state_buffer.clear() - for _ in range(self.state_buffer.maxlen): - self.state_buffer.append(np.zeros(self.raw_state_dim)) - - def preprocess_state(self, state, append=True): - '''Transforms the raw state into format that is fed into the network''' - # append when state is first seen when acting in policy_util, don't append elsewhere in memory - self.preprocess_append(state, append) - return np.concatenate(self.state_buffer) - - @lab_api - def update(self, action, reward, state, done): - '''Interface method to update memory''' - self.base_update(action, reward, state, done) - state = self.preprocess_state(state, append=False) # prevent conflict with preprocess in epi_reset - if not np.isnan(reward): # not the start of episode - self.add_experience(self.last_state, action, reward, state, done) - self.last_state = state - - -class OnPolicyAtariReplay(OnPolicyReplay): - ''' - Preprocesses an state to be the concatenation of the last four states, after converting the 210 x 160 x 3 image to 84 x 84 x 1 grayscale image, and clips all rewards to [-10, 10] as per "Playing Atari with Deep Reinforcement Learning", Mnih et al, 2013 - Note: Playing Atari with Deep RL clips the rewards to + / - 1 - Otherwise the same as OnPolicyReplay memory - ''' - - def __init__(self, memory_spec, body): - util.set_attr(self, memory_spec, [ - 'stack_len', # number of stack states - ]) - OnPolicyReplay.__init__(self, memory_spec, body) - - def add_experience(self, state, action, reward, next_state, done): - # clip reward, done here to minimize change to only training data data - super(OnPolicyAtariReplay, self).add_experience(state, action, np.sign(reward), next_state, done) - - -class OnPolicyImageReplay(OnPolicyReplay): - ''' - An on policy replay buffer that normalizes (preprocesses) images through - division by 255 and subtraction of 0.5. - ''' - - def __init__(self, memory_spec, body): - super(OnPolicyImageReplay, self).__init__(memory_spec, body) - - def preprocess_state(self, state, append=True): - state = util.normalize_image(state) - 0.5 - return state + return super().sample() diff --git a/convlab/agent/memory/prioritized.py b/convlab/agent/memory/prioritized.py index 3a58842..c1cbb23 100644 --- a/convlab/agent/memory/prioritized.py +++ b/convlab/agent/memory/prioritized.py @@ -1,12 +1,8 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. 
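Note: the PrioritizedReplay changes in this file implement proportional prioritization, p_i = (|delta_i| + epsilon)^alpha, with transitions sampled roughly in proportion to p_i via the SumTree. A minimal numpy sketch of that sampling rule, without the tree and assuming errors are passed in as non-negative TD-error magnitudes:

    # Proportional prioritized sampling, flat-array sketch
    import numpy as np

    epsilon, alpha = 0.01, 0.6
    errors = np.array([0.05, 0.5, 2.0, 0.1])         # assumed |TD error| per stored transition
    priorities = np.power(errors + epsilon, alpha)   # p_i = (|delta_i| + eps)^alpha
    probs = priorities / priorities.sum()
    batch_idxs = np.random.choice(len(errors), size=2, p=probs)  # high-error transitions drawn more often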
- -from convlab.agent.memory.replay import Replay, AtariReplay +from convlab.agent.memory.replay import Replay from convlab.lib import util from convlab.lib.decorator import lab_api import numpy as np import random -import torch class SumTree: @@ -116,16 +112,16 @@ def __init__(self, memory_spec, body): 'max_size', 'use_cer', ]) - super(PrioritizedReplay, self).__init__(memory_spec, body) + super().__init__(memory_spec, body) - self.epsilon = torch.full((1,), self.epsilon) - self.alpha = torch.full((1,), self.alpha) + self.epsilon = np.full((1,), self.epsilon) + self.alpha = np.full((1,), self.alpha) # adds a 'priorities' scalar to the data_keys and call reset again self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones', 'priorities'] self.reset() def reset(self): - super(PrioritizedReplay, self).reset() + super().reset() self.tree = SumTree(self.max_size) def add_experience(self, state, action, reward, next_state, done, error=100000): @@ -133,16 +129,14 @@ def add_experience(self, state, action, reward, next_state, done, error=100000): Implementation for update() to add experience to memory, expanding the memory size if necessary. All experiences are added with a high priority to increase the likelihood that they are sampled at least once. ''' - super(PrioritizedReplay, self).add_experience(state, action, reward, next_state, done) - error = torch.zeros(1).fill_(error) + super().add_experience(state, action, reward, next_state, done) priority = self.get_priority(error) self.priorities[self.head] = priority self.tree.add(priority, self.head) def get_priority(self, error): '''Takes in the error of one or more examples and returns the proportional priority''' - p = torch.pow(error.cpu().detach() + self.epsilon, self.alpha) - return p.squeeze_().detach().numpy() + return np.power(error + self.epsilon, self.alpha).squeeze() def sample_idxs(self, batch_size): '''Samples batch_size indices from memory in proportional to their priority.''' @@ -161,43 +155,14 @@ def sample_idxs(self, batch_size): batch_idxs[-1] = self.head return batch_idxs - def get_body_errors(self, errors): - '''Get the slice of errors belonging to a body in network output''' - body_idx = self.body.nanflat_a_idx - start_idx = body_idx * self.batch_size - end_idx = start_idx + self.batch_size - body_errors = errors[start_idx:end_idx] - return body_errors - def update_priorities(self, errors): ''' Updates the priorities from the most recent batch Assumes the relevant batch indices are stored in self.batch_idxs ''' - body_errors = self.get_body_errors(errors) - priorities = self.get_priority(body_errors) + priorities = self.get_priority(errors) assert len(priorities) == self.batch_idxs.size - self.priorities[self.batch_idxs] = priorities + for idx, p in zip(self.batch_idxs, priorities): + self.priorities[idx] = p for p, i in zip(priorities, self.tree_idxs): self.tree.update(i, p) - - -class AtariPrioritizedReplay(PrioritizedReplay, AtariReplay): - '''Make a Prioritized AtariReplay via nice multi-inheritance (python magic)''' - - def __init__(self, memory_spec, body): - util.set_attr(self, memory_spec, [ - 'alpha', - 'epsilon', - 'batch_size', - 'max_size', - 'use_cer', - ]) - AtariReplay.__init__(self, memory_spec, body) - self.epsilon = torch.full((1,), self.epsilon) - self.alpha = torch.full((1,), self.alpha) - # adds a 'priorities' scalar to the data_keys and call reset again - self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones', 'priorities'] - self.reset() - self.states_shape = 
self.scalar_shape - self.states = [None] * self.max_size diff --git a/convlab/agent/memory/replay.py b/convlab/agent/memory/replay.py index 56457a5..2a543e8 100644 --- a/convlab/agent/memory/replay.py +++ b/convlab/agent/memory/replay.py @@ -1,6 +1,3 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from collections import deque from copy import deepcopy from convlab.agent.memory.base import Memory @@ -12,6 +9,35 @@ logger = logger.get_logger(__name__) +def sample_next_states(head, max_size, ns_idx_offset, batch_idxs, states, ns_buffer): + '''Method to sample next_states from states, with proper guard for next_state idx being out of bound''' + # idxs for next state is state idxs with offset, modded + ns_batch_idxs = (batch_idxs + ns_idx_offset) % max_size + # if head < ns_idx <= head + ns_idx_offset, ns is stored in ns_buffer + ns_batch_idxs = ns_batch_idxs % max_size + buffer_ns_locs = np.argwhere( + (head < ns_batch_idxs) & (ns_batch_idxs <= head + ns_idx_offset)).flatten() + # find if there is any idxs to get from buffer + to_replace = buffer_ns_locs.size != 0 + if to_replace: + # extract the buffer_idxs first for replacement later + # given head < ns_idx <= head + offset, and valid buffer idx is [0, offset) + # get 0 < ns_idx - head <= offset, or equiv. + # get -1 < ns_idx - head - 1 <= offset - 1, i.e. + # get 0 <= ns_idx - head - 1 < offset, hence: + buffer_idxs = ns_batch_idxs[buffer_ns_locs] - head - 1 + # set them to 0 first to allow sampling, then replace later with buffer + ns_batch_idxs[buffer_ns_locs] = 0 + # guard all against overrun idxs from offset + ns_batch_idxs = ns_batch_idxs % max_size + next_states = util.batch_get(states, ns_batch_idxs) + if to_replace: + # now replace using buffer_idxs and ns_buffer + buffer_ns = util.batch_get(ns_buffer, buffer_idxs) + next_states[buffer_ns_locs] = buffer_ns + return next_states + + class Replay(Memory): ''' Stores agent experiences and samples from them for agent training @@ -42,70 +68,60 @@ class Replay(Memory): ''' def __init__(self, memory_spec, body): - super(Replay, self).__init__(memory_spec, body) + super().__init__(memory_spec, body) util.set_attr(self, self.memory_spec, [ 'batch_size', 'max_size', 'use_cer', ]) - self.state_buffer = deque(maxlen=0) # for API consistency self.is_episodic = False self.batch_idxs = None - self.true_size = 0 # to number of experiences stored - self.seen_size = 0 # the number of experiences seen, including those stored and discarded + self.size = 0 # total experiences stored + self.seen_size = 0 # total experiences seen cumulatively self.head = -1 # index of most recent experience + # generic next_state buffer to store last next_states (allow for multiple for venv) + self.ns_idx_offset = self.body.env.num_envs if body.env.is_venv else 1 + self.ns_buffer = deque(maxlen=self.ns_idx_offset) # declare what data keys to store self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones'] - self.scalar_shape = (self.max_size,) - self.states_shape = self.scalar_shape + tuple(np.reshape(self.body.state_dim, -1)) - self.actions_shape = self.scalar_shape + self.body.action_space.shape self.reset() def reset(self): '''Initializes the memory arrays, size and head pointer''' - # set data keys as self.{data_keys} + # set self.states, self.actions, ... 
for k in self.data_keys: - if k == 'states': - setattr(self, k, np.zeros(self.states_shape, dtype=np.float16)) - elif k == 'next_states': - # don't store next_states, but create a place holder to track it for sampling - self.latest_next_state = None - elif k == 'actions': - setattr(self, k, np.zeros(self.actions_shape, dtype=self.body.action_space.dtype)) - else: - setattr(self, k, np.zeros(self.scalar_shape)) - self.true_size = 0 + if k != 'next_states': # reuse self.states + # list add/sample is over 10x faster than np, also simpler to handle + setattr(self, k, [None] * self.max_size) + self.size = 0 self.head = -1 - self.state_buffer.clear() - for _ in range(self.state_buffer.maxlen): - self.state_buffer.append(np.zeros(self.body.state_dim)) - - def epi_reset(self, state): - '''Method to reset at new episode''' - super(Replay, self).epi_reset(self.preprocess_state(state, append=False)) + self.ns_buffer.clear() @lab_api - def update(self, action, reward, state, done): + def update(self, state, action, reward, next_state, done): '''Interface method to update memory''' - self.base_update(action, reward, state, done) - state = self.preprocess_state(state, append=False) # prevent conflict with preprocess in epi_reset - if not np.isnan(reward): # not the start of episode - self.add_experience(self.last_state, action, reward, state, done) - self.last_state = state + if self.body.env.is_venv: + for sarsd in zip(state, action, reward, next_state, done): + self.add_experience(*sarsd) + else: + self.add_experience(state, action, reward, next_state, done) def add_experience(self, state, action, reward, next_state, done): '''Implementation for update() to add experience to memory, expanding the memory size if necessary''' # Move head pointer. Wrap around if necessary self.head = (self.head + 1) % self.max_size - self.states[self.head] = state + self.states[self.head] = state.astype(np.float16) self.actions[self.head] = action self.rewards[self.head] = reward - self.latest_next_state = next_state + self.ns_buffer.append(next_state.astype(np.float16)) self.dones[self.head] = done # Actually occupied size of memory - if self.true_size < self.max_size: - self.true_size += 1 + if self.size < self.max_size: + self.size += 1 self.seen_size += 1 + # set to_train using memory counters head, seen_size instead of tick since clock will step by num_envs when on venv; to_train will be set to 0 after training step + algorithm = self.body.agent.algorithm + algorithm.to_train = algorithm.to_train or (self.seen_size > algorithm.training_start_step and self.head % algorithm.training_frequency == 0) @lab_api def sample(self): @@ -124,199 +140,14 @@ def sample(self): batch = {} for k in self.data_keys: if k == 'next_states': - batch[k] = self._sample_next_states(self.batch_idxs) + batch[k] = sample_next_states(self.head, self.max_size, self.ns_idx_offset, self.batch_idxs, self.states, self.ns_buffer) else: - batch[k] = util.cond_multiget(getattr(self, k), self.batch_idxs) + batch[k] = util.batch_get(getattr(self, k), self.batch_idxs) return batch - def _sample_next_states(self, batch_idxs): - '''Method to sample next_states from states, with proper guard for last idx (out of bound)''' - # idxs for next state is state idxs + 1 - ns_batch_idxs = batch_idxs + 1 - # find the locations to be replaced with latest_next_state - latest_ns_locs = np.argwhere(ns_batch_idxs == self.true_size).flatten() - to_replace = latest_ns_locs.size != 0 - # set to 0, a safe sentinel for ns_batch_idxs due to the +1 above - # then sample safely 
from self.states, and replace at locs with latest_next_state - if to_replace: - ns_batch_idxs[latest_ns_locs] = 0 - next_states = util.cond_multiget(self.states, ns_batch_idxs) - if to_replace: - next_states[latest_ns_locs] = self.latest_next_state - return next_states - def sample_idxs(self, batch_size): '''Batch indices a sampled random uniformly''' - batch_idxs = np.random.randint(self.true_size, size=batch_size) + batch_idxs = np.random.randint(self.size, size=batch_size) if self.use_cer: # add the latest sample batch_idxs[-1] = self.head return batch_idxs - - -class SeqReplay(Replay): - ''' - Preprocesses a state to be the stacked sequence of the last n states. Otherwise the same as Replay memory - - e.g. memory_spec - "memory": { - "name": "SeqReplay", - "batch_size": 32, - "max_size": 10000, - "use_cer": true - } - * seq_len provided by net_spec - ''' - - def __init__(self, memory_spec, body): - super(SeqReplay, self).__init__(memory_spec, body) - self.seq_len = self.body.agent.agent_spec['net']['seq_len'] - self.state_buffer = deque(maxlen=self.seq_len) - # update states_shape and call reset again - self.states_shape = self.scalar_shape + tuple(np.reshape([self.seq_len, self.body.state_dim], -1)) - self.reset() - - def preprocess_state(self, state, append=True): - '''Transforms the raw state into format that is fed into the network''' - # append when state is first seen when acting in policy_util, don't append elsewhere in memory - self.preprocess_append(state, append) - return np.stack(self.state_buffer) - - -class SILReplay(Replay): - ''' - Special Replay for SIL, which adds the returns calculated from its OnPolicyReplay - - e.g. memory_spec - "memory": { - "name": "SILReplay", - "batch_size": 32, - "max_size": 10000, - "use_cer": true - } - ''' - - def __init__(self, memory_spec, body): - super(SILReplay, self).__init__(memory_spec, body) - # adds a 'rets' scalar to the data_keys and call reset again - self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones', 'rets'] - self.reset() - - @lab_api - def update(self, action, reward, state, done): - '''Interface method to update memory''' - raise AssertionError('Do not call SIL memory in main API control loop') - - def add_experience(self, state, action, reward, next_state, done, ret): - '''Used to add memory from onpolicy memory''' - super(SILReplay, self).add_experience(state, action, reward, next_state, done) - self.rets[self.head] = ret - - -class SILSeqReplay(SILReplay, SeqReplay): - ''' - Preprocesses a state to be the stacked sequence of the last n states. Otherwise the same as SILReplay memory - - e.g. memory_spec - "memory": { - "name": "SILSeqReplay", - "batch_size": 32, - "max_size": 10000, - "use_cer": true - } - * seq_len provided by net_spec - ''' - pass - - -class ConcatReplay(Replay): - ''' - Preprocesses a state to be the concatenation of the last n states. Otherwise the same as Replay memory - - e.g. 
memory_spec - "memory": { - "name": "ConcatReplay", - "batch_size": 32, - "max_size": 10000, - "concat_len": 4, - "use_cer": true - } - ''' - - def __init__(self, memory_spec, body): - util.set_attr(self, memory_spec, [ - 'batch_size', - 'max_size', - 'concat_len', # number of stack states - 'use_cer', - ]) - self.raw_state_dim = deepcopy(body.state_dim) # used for state_buffer - body.state_dim = body.state_dim * self.concat_len # modify to use for net init for concat input - super(ConcatReplay, self).__init__(memory_spec, body) - self.state_buffer = deque(maxlen=self.concat_len) - self.reset() - - def reset(self): - '''Initializes the memory arrays, size and head pointer''' - super(ConcatReplay, self).reset() - self.state_buffer.clear() - for _ in range(self.state_buffer.maxlen): - self.state_buffer.append(np.zeros(self.raw_state_dim)) - - def epi_reset(self, state): - '''Method to reset at new episode''' - super(ConcatReplay, self).epi_reset(state) - # reappend buffer with custom shape - self.state_buffer.clear() - for _ in range(self.state_buffer.maxlen): - self.state_buffer.append(np.zeros(self.raw_state_dim)) - - def preprocess_state(self, state, append=True): - '''Transforms the raw state into format that is fed into the network''' - # append when state is first seen when acting in policy_util, don't append elsewhere in memory - self.preprocess_append(state, append) - return np.concatenate(self.state_buffer) - - -class AtariReplay(Replay): - ''' - Preprocesses an state to be the concatenation of the last four states, after converting the 210 x 160 x 3 image to 84 x 84 x 1 grayscale image, and clips all rewards to [-10, 10] as per "Playing Atari with Deep Reinforcement Learning", Mnih et al, 2013 - Note: Playing Atari with Deep RL clips the rewards to + / - 1 - - e.g. memory_spec - "memory": { - "name": "AtariReplay", - "batch_size": 32, - "max_size": 250000, - "stack_len": 4, - "use_cer": true - } - ''' - - def __init__(self, memory_spec, body): - util.set_attr(self, memory_spec, [ - 'batch_size', - 'max_size', - 'stack_len', # number of stack states - 'use_cer', - ]) - Replay.__init__(self, memory_spec, body) - self.states_shape = self.scalar_shape - self.states = [None] * self.max_size - - def add_experience(self, state, action, reward, next_state, done): - # clip reward, done here to minimize change to only training data data - super(AtariReplay, self).add_experience(state, action, np.sign(reward), next_state, done) - - -class ImageReplay(Replay): - ''' - An off policy replay buffer that normalizes (preprocesses) images through - division by 255 and subtraction of 0.5. - ''' - - def __init__(self, memory_spec, body): - super(ImageReplay, self).__init__(memory_spec, body) - - def preprocess_state(self, state, append=True): - state = util.normalize_image(state) - 0.5 - return state diff --git a/convlab/agent/net/base.py b/convlab/agent/net/base.py index b959a19..719cbc5 100644 --- a/convlab/agent/net/base.py +++ b/convlab/agent/net/base.py @@ -1,16 +1,12 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from abc import ABC, abstractmethod +from convlab.agent.net import net_util +import pydash as ps import torch +import torch.nn as nn class Net(ABC): - ''' - Abstract class ancestor to all Nets, - specifies the necessary design blueprint for algorithm to work in Lab. - Mostly, implement just the abstract methods and properties. 
- ''' + '''Abstract Net class to define the API methods''' def __init__(self, net_spec, in_dim, out_dim): ''' @@ -30,6 +26,21 @@ def __init__(self, net_spec, in_dim, out_dim): else: self.device = 'cpu' + @net_util.dev_check_train_step + def train_step(self, loss, optim, lr_scheduler, clock=None, global_net=None): + lr_scheduler.step(epoch=ps.get(clock, 'frame')) + optim.zero_grad() + loss.backward() + if self.clip_grad_val is not None: + nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) + if global_net is not None: + net_util.push_global_grads(self, global_net) + optim.step() + if global_net is not None: + net_util.copy(global_net, self) + clock.tick('opt_step') + return loss + def store_grad_norms(self): '''Stores the gradient norms for debugging.''' norms = [param.grad.norm().item() for param in self.parameters()] diff --git a/convlab/agent/net/conv.py b/convlab/agent/net/conv.py index 1bcd121..93e1a09 100644 --- a/convlab/agent/net/conv.py +++ b/convlab/agent/net/conv.py @@ -1,16 +1,10 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from convlab.agent.net import net_util from convlab.agent.net.base import Net -from convlab.lib import logger, math_util, util -import numpy as np +from convlab.lib import math_util, util import pydash as ps import torch import torch.nn as nn -logger = logger.get_logger(__name__) - class ConvNet(Net, nn.Module): ''' @@ -34,7 +28,9 @@ class ConvNet(Net, nn.Module): ], "fc_hid_layers": [512], "hid_layers_activation": "relu", + "out_layer_activation": "tanh", "init_fn": null, + "normalize": false, "batch_norm": false, "clip_grad_val": 1.0, "loss_spec": { @@ -59,12 +55,15 @@ class ConvNet(Net, nn.Module): def __init__(self, net_spec, in_dim, out_dim): ''' net_spec: - conv_hid_layers: list containing dimensions of the convolutional hidden layers. Asssumed to all come before the flat layers. + conv_hid_layers: list containing dimensions of the convolutional hidden layers, each is a list representing hid_layer = out_d, kernel, stride, padding, dilation. + Asssumed to all come before the flat layers. Note: a convolutional layer should specify the in_channel, out_channels, kernel_size, stride (of kernel steps), padding, and dilation (spacing between kernel points) E.g. [3, 16, (5, 5), 1, 0, (2, 2)] For more details, see http://pytorch.org/docs/master/nn.html#conv2d and https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md fc_hid_layers: list of fc layers following the convolutional layers hid_layers_activation: activation function for the hidden layers + out_layer_activation: activation function for the output layer, same shape as out_dim init_fn: weight initialization function + normalize: whether to divide by 255.0 to normalize image input batch_norm: whether to add batch normalization after each convolutional layer, excluding the input layer. 
clip_grad_val: clip gradient norm if value is not None loss_spec: measure of error between model predictions and correct outputs @@ -77,10 +76,12 @@ def __init__(self, net_spec, in_dim, out_dim): ''' assert len(in_dim) == 3 # image shape (c,w,h) nn.Module.__init__(self) - super(ConvNet, self).__init__(net_spec, in_dim, out_dim) + super().__init__(net_spec, in_dim, out_dim) # set default util.set_attr(self, dict( + out_layer_activation=None, init_fn=None, + normalize=False, batch_norm=True, clip_grad_val=None, loss_spec={'name': 'MSELoss'}, @@ -95,7 +96,9 @@ def __init__(self, net_spec, in_dim, out_dim): 'conv_hid_layers', 'fc_hid_layers', 'hid_layers_activation', + 'out_layer_activation', 'init_fn', + 'normalize', 'batch_norm', 'clip_grad_val', 'loss_spec', @@ -107,33 +110,35 @@ def __init__(self, net_spec, in_dim, out_dim): 'gpu', ]) - # conv layer + # conv body self.conv_model = self.build_conv_layers(self.conv_hid_layers) self.conv_out_dim = self.get_conv_output_size() - # fc layer - if not ps.is_empty(self.fc_hid_layers): - # fc layer from flattened conv - self.fc_model = self.build_fc_layers(self.fc_hid_layers) - tail_in_dim = self.fc_hid_layers[-1] - else: + # fc body + if ps.is_empty(self.fc_hid_layers): tail_in_dim = self.conv_out_dim + else: + # fc body from flattened conv + self.fc_model = net_util.build_fc_model([self.conv_out_dim] + self.fc_hid_layers, self.hid_layers_activation) + tail_in_dim = self.fc_hid_layers[-1] # tails. avoid list for single-tail for compute speed if ps.is_integer(self.out_dim): - self.model_tail = nn.Linear(tail_in_dim, self.out_dim) + self.model_tail = net_util.build_fc_model([tail_in_dim, self.out_dim], self.out_layer_activation) else: - self.model_tails = nn.ModuleList([nn.Linear(tail_in_dim, out_d) for out_d in self.out_dim]) + if not ps.is_list(self.out_layer_activation): + self.out_layer_activation = [self.out_layer_activation] * len(out_dim) + assert len(self.out_layer_activation) == len(self.out_dim) + tails = [] + for out_d, out_activ in zip(self.out_dim, self.out_layer_activation): + tail = net_util.build_fc_model([tail_in_dim, out_d], out_activ) + tails.append(tail) + self.model_tails = nn.ModuleList(tails) net_util.init_layers(self, self.init_fn) - for module in self.modules(): - module.to(self.device) self.loss_fn = net_util.get_loss_fn(self, self.loss_spec) - self.optim = net_util.get_optim(self, self.optim_spec) - self.lr_scheduler = net_util.get_lr_scheduler(self, self.lr_scheduler_spec) - - def __str__(self): - return super(ConvNet, self).__str__() + f'\noptim: {self.optim}' + self.to(self.device) + self.train() def get_conv_output_size(self): '''Helper function to calculate the size of the flattened features after the final convolutional layer''' @@ -152,7 +157,8 @@ def build_conv_layers(self, conv_hid_layers): hid_layer = [tuple(e) if ps.is_list(e) else e for e in hid_layer] # guard list-to-tuple # hid_layer = out_d, kernel, stride, padding, dilation conv_layers.append(nn.Conv2d(in_d, *hid_layer)) - conv_layers.append(net_util.get_activation_fn(self.hid_layers_activation)) + if self.hid_layers_activation is not None: + conv_layers.append(net_util.get_activation_fn(self.hid_layers_activation)) # Don't include batch norm in the first layer if self.batch_norm and i != 0: conv_layers.append(nn.BatchNorm2d(in_d)) @@ -160,20 +166,13 @@ def build_conv_layers(self, conv_hid_layers): conv_model = nn.Sequential(*conv_layers) return conv_model - def build_fc_layers(self, fc_hid_layers): - ''' - Builds all of the fc layers in the network and 
store in a Sequential model - ''' - assert not ps.is_empty(fc_hid_layers) - dims = [self.conv_out_dim] + fc_hid_layers - fc_model = net_util.build_sequential(dims, self.hid_layers_activation) - return fc_model - def forward(self, x): ''' The feedforward step - Note that PyTorch takes (c,w,h) but gym provides (w,h,c), so preprocessing must be done before passing to network + Note that PyTorch takes (c,h,w) but gym provides (h,w,c), so preprocessing must be done before passing to network ''' + if self.normalize: + x = x / 255.0 x = self.conv_model(x) x = x.view(x.size(0), -1) # to (batch_size, -1) if hasattr(self, 'fc_model'): @@ -187,36 +186,6 @@ def forward(self, x): else: return self.model_tail(x) - def training_step(self, x=None, y=None, loss=None, retain_graph=False, lr_clock=None): - '''Takes a single training step: one forward and one backwards pass''' - if hasattr(self, 'model_tails') and x is not None: - raise ValueError('Loss computation from x,y not supported for multitails') - self.lr_scheduler.step(epoch=ps.get(lr_clock, 'total_t')) - self.train() - self.optim.zero_grad() - if loss is None: - out = self(x) - loss = self.loss_fn(out, y) - assert not torch.isnan(loss).any(), loss - if net_util.to_assert_trained(): - assert_trained = net_util.gen_assert_trained(self) - loss.backward(retain_graph=retain_graph) - if self.clip_grad_val is not None: - nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) - self.optim.step() - if net_util.to_assert_trained(): - assert_trained(self, loss) - self.store_grad_norms() - logger.debug(f'Net training_step loss: {loss}') - return loss - - def wrap_eval(self, x): - ''' - Completes one feedforward step, ensuring net is set to evaluation model returns: network output given input x - ''' - self.eval() - return self(x) - class DuelingConvNet(ConvNet): ''' @@ -242,6 +211,7 @@ class DuelingConvNet(ConvNet): "fc_hid_layers": [512], "hid_layers_activation": "relu", "init_fn": "xavier_uniform_", + "normalize": false, "batch_norm": false, "clip_grad_val": 1.0, "loss_spec": { @@ -270,6 +240,7 @@ def __init__(self, net_spec, in_dim, out_dim): # set default util.set_attr(self, dict( init_fn=None, + normalize=False, batch_norm=False, clip_grad_val=None, loss_spec={'name': 'MSELoss'}, @@ -285,6 +256,7 @@ def __init__(self, net_spec, in_dim, out_dim): 'fc_hid_layers', 'hid_layers_activation', 'init_fn', + 'normalize', 'batch_norm', 'clip_grad_val', 'loss_spec', @@ -299,31 +271,32 @@ def __init__(self, net_spec, in_dim, out_dim): # Guard against inappropriate algorithms and environments assert isinstance(out_dim, int) - # conv layer + # conv body self.conv_model = self.build_conv_layers(self.conv_hid_layers) self.conv_out_dim = self.get_conv_output_size() - # fc layer - if not ps.is_empty(self.fc_hid_layers): + # fc body + if ps.is_empty(self.fc_hid_layers): + tail_in_dim = self.conv_out_dim + else: # fc layer from flattened conv - self.fc_model = self.build_fc_layers(self.fc_hid_layers) + self.fc_model = net_util.build_fc_model([self.conv_out_dim] + self.fc_hid_layers, self.hid_layers_activation) tail_in_dim = self.fc_hid_layers[-1] - else: - tail_in_dim = self.conv_out_dim # tails. 
avoid list for single-tail for compute speed self.v = nn.Linear(tail_in_dim, 1) # state value - self.adv = nn.Linear(tail_in_dim, out_dim[0]) # action dependent raw advantage + self.adv = nn.Linear(tail_in_dim, out_dim) # action dependent raw advantage + self.model_tails = nn.ModuleList(self.v, self.adv) net_util.init_layers(self, self.init_fn) - for module in self.modules(): - module.to(self.device) self.loss_fn = net_util.get_loss_fn(self, self.loss_spec) - self.optim = net_util.get_optim(self, self.optim_spec) - self.lr_scheduler = net_util.get_lr_scheduler(self, self.lr_scheduler_spec) + self.to(self.device) + self.train() def forward(self, x): '''The feedforward step''' + if self.normalize: + x = x / 255.0 x = self.conv_model(x) x = x.view(x.size(0), -1) # to (batch_size, -1) if hasattr(self, 'fc_model'): diff --git a/convlab/agent/net/mlp.py b/convlab/agent/net/mlp.py index 1a47e1c..fff7026 100644 --- a/convlab/agent/net/mlp.py +++ b/convlab/agent/net/mlp.py @@ -1,16 +1,11 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from convlab.agent.net import net_util from convlab.agent.net.base import Net -from convlab.lib import logger, math_util, util +from convlab.lib import math_util, util import numpy as np import pydash as ps import torch import torch.nn as nn -logger = logger.get_logger(__name__) - class MLPNet(Net, nn.Module): ''' @@ -23,6 +18,7 @@ class MLPNet(Net, nn.Module): "shared": true, "hid_layers": [32], "hid_layers_activation": "relu", + "out_layer_activation": null, "init_fn": "xavier_uniform_", "clip_grad_val": 1.0, "loss_spec": { @@ -49,6 +45,7 @@ def __init__(self, net_spec, in_dim, out_dim): net_spec: hid_layers: list containing dimensions of the hidden layers hid_layers_activation: activation function for the hidden layers + out_layer_activation: activation function for the output layer, same shape as out_dim init_fn: weight initialization function clip_grad_val: clip gradient norm if value is not None loss_spec: measure of error between model predictions and correct outputs @@ -60,9 +57,10 @@ def __init__(self, net_spec, in_dim, out_dim): gpu: whether to train using a GPU. Note this will only work if a GPU is available, othewise setting gpu=True does nothing ''' nn.Module.__init__(self) - super(MLPNet, self).__init__(net_spec, in_dim, out_dim) + super().__init__(net_spec, in_dim, out_dim) # set default util.set_attr(self, dict( + out_layer_activation=None, init_fn=None, clip_grad_val=None, loss_spec={'name': 'MSELoss'}, @@ -77,6 +75,7 @@ def __init__(self, net_spec, in_dim, out_dim): 'shared', 'hid_layers', 'hid_layers_activation', + 'out_layer_activation', 'init_fn', 'clip_grad_val', 'loss_spec', @@ -89,23 +88,25 @@ def __init__(self, net_spec, in_dim, out_dim): ]) dims = [self.in_dim] + self.hid_layers - self.model = net_util.build_sequential(dims, self.hid_layers_activation) + self.model = net_util.build_fc_model(dims, self.hid_layers_activation) # add last layer with no activation # tails. 
avoid list for single-tail for compute speed if ps.is_integer(self.out_dim): - self.model_tail = nn.Linear(dims[-1], self.out_dim) + self.model_tail = net_util.build_fc_model([dims[-1], self.out_dim], self.out_layer_activation) else: - self.model_tails = nn.ModuleList([nn.Linear(dims[-1], out_d) for out_d in self.out_dim]) + if not ps.is_list(self.out_layer_activation): + self.out_layer_activation = [self.out_layer_activation] * len(out_dim) + assert len(self.out_layer_activation) == len(self.out_dim) + tails = [] + for out_d, out_activ in zip(self.out_dim, self.out_layer_activation): + tail = net_util.build_fc_model([dims[-1], out_d], out_activ) + tails.append(tail) + self.model_tails = nn.ModuleList(tails) net_util.init_layers(self, self.init_fn) - for module in self.modules(): - module.to(self.device) self.loss_fn = net_util.get_loss_fn(self, self.loss_spec) - self.optim = net_util.get_optim(self, self.optim_spec) - self.lr_scheduler = net_util.get_lr_scheduler(self, self.lr_scheduler_spec) - - def __str__(self): - return super(MLPNet, self).__str__() + f'\noptim: {self.optim}' + self.to(self.device) + self.train() def forward(self, x): '''The feedforward step''' @@ -118,40 +119,6 @@ def forward(self, x): else: return self.model_tail(x) - def training_step(self, x=None, y=None, loss=None, retain_graph=False, lr_clock=None): - ''' - Takes a single training step: one forward and one backwards pass - More most RL usage, we have custom, often complicated, loss functions. Compute its value and put it in a pytorch tensor then pass it in as loss - ''' - if hasattr(self, 'model_tails') and x is not None: - raise ValueError('Loss computation from x,y not supported for multitails') - self.lr_scheduler.step(epoch=ps.get(lr_clock, 'total_t')) - self.train() - self.optim.zero_grad() - if loss is None: - out = self(x) - loss = self.loss_fn(out, y) - assert not torch.isnan(loss).any(), loss - if net_util.to_assert_trained(): - assert_trained = net_util.gen_assert_trained(self) - loss.backward(retain_graph=retain_graph) - if self.clip_grad_val is not None: - nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) - self.optim.step() - if net_util.to_assert_trained(): - assert_trained(self, loss) - self.store_grad_norms() - logger.debug(f'Net training_step loss: {loss}') - return loss - - def wrap_eval(self, x): - ''' - Completes one feedforward step, ensuring net is set to evaluation model - returns: network output given input x - ''' - self.eval() - return self(x) - class HydraMLPNet(Net, nn.Module): ''' @@ -167,6 +134,7 @@ class HydraMLPNet(Net, nn.Module): [] # tail, no hidden layers ], "hid_layers_activation": "relu", + "out_layer_activation": null, "init_fn": "xavier_uniform_", "clip_grad_val": 1.0, "loss_spec": { @@ -213,9 +181,10 @@ def __init__(self, net_spec, in_dim, out_dim): env 1 action env 2 action ''' nn.Module.__init__(self) - super(HydraMLPNet, self).__init__(net_spec, in_dim, out_dim) + super().__init__(net_spec, in_dim, out_dim) # set default util.set_attr(self, dict( + out_layer_activation=None, init_fn=None, clip_grad_val=None, loss_spec={'name': 'MSELoss'}, @@ -229,6 +198,7 @@ def __init__(self, net_spec, in_dim, out_dim): util.set_attr(self, self.net_spec, [ 'hid_layers', 'hid_layers_activation', + 'out_layer_activation', 'init_fn', 'clip_grad_val', 'loss_spec', @@ -253,18 +223,13 @@ def __init__(self, net_spec, in_dim, out_dim): self.model_heads = self.build_model_heads(in_dim) heads_out_dim = np.sum([head_hid_layers[-1] for head_hid_layers in self.head_hid_layers]) dims 
= [heads_out_dim] + self.body_hid_layers - self.model_body = net_util.build_sequential(dims, self.hid_layers_activation) - self.model_tails = self.build_model_tails(out_dim) + self.model_body = net_util.build_fc_model(dims, self.hid_layers_activation) + self.model_tails = self.build_model_tails(self.out_dim, self.out_layer_activation) net_util.init_layers(self, self.init_fn) - for module in self.modules(): - module.to(self.device) self.loss_fn = net_util.get_loss_fn(self, self.loss_spec) - self.optim = net_util.get_optim(self, self.optim_spec) - self.lr_scheduler = net_util.get_lr_scheduler(self, self.lr_scheduler_spec) - - def __str__(self): - return super(HydraMLPNet, self).__str__() + f'\noptim: {self.optim}' + self.to(self.device) + self.train() def build_model_heads(self, in_dim): '''Build each model_head. These are stored as Sequential models in model_heads''' @@ -272,22 +237,26 @@ def build_model_heads(self, in_dim): model_heads = nn.ModuleList() for in_d, hid_layers in zip(in_dim, self.head_hid_layers): dims = [in_d] + hid_layers - model_head = net_util.build_sequential(dims, self.hid_layers_activation) + model_head = net_util.build_fc_model(dims, self.hid_layers_activation) model_heads.append(model_head) return model_heads - def build_model_tails(self, out_dim): + def build_model_tails(self, out_dim, out_layer_activation): '''Build each model_tail. These are stored as Sequential models in model_tails''' + if not ps.is_list(out_layer_activation): + out_layer_activation = [out_layer_activation] * len(out_dim) model_tails = nn.ModuleList() if ps.is_empty(self.tail_hid_layers): - for out_d in out_dim: - model_tails.append(nn.Linear(self.body_hid_layers[-1], out_d)) + for out_d, out_activ in zip(out_dim, out_layer_activation): + tail = net_util.build_fc_model([self.body_hid_layers[-1], out_d], out_activ) + model_tails.append(tail) else: assert len(self.tail_hid_layers) == len(out_dim), 'Hydra tail hid_params inconsistent with number out dims' - for out_d, hid_layers in zip(out_dim, self.tail_hid_layers): + for out_d, out_activ, hid_layers in zip(out_dim, out_layer_activation, self.tail_hid_layers): dims = hid_layers - model_tail = net_util.build_sequential(dims, self.hid_layers_activation) - model_tail.add_module(str(len(model_tail)), nn.Linear(dims[-1], out_d)) + model_tail = net_util.build_fc_model(dims, self.hid_layers_activation) + tail_out = net_util.build_fc_model([dims[-1], out_d], out_activ) + model_tail.add_module(str(len(model_tail)), tail_out) model_tails.append(model_tail) return model_tails @@ -303,41 +272,6 @@ def forward(self, xs): outs.append(model_tail(body_x)) return outs - def training_step(self, xs=None, ys=None, loss=None, retain_graph=False, lr_clock=None): - ''' - Takes a single training step: one forward and one backwards pass. 
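The nets in this patch share the same multi-tail pattern: one small fc model per output head, each with an optional output-layer activation, collected in an nn.ModuleList and applied in forward to produce a list of outputs. A self-contained sketch of that pattern; build_fc below is a simplified stand-in for net_util.build_fc_model, not the lab's implementation:

import torch
import torch.nn as nn

def build_fc(dims, activation=None):
    # Linear layers, optionally interleaved with an activation (simplified stand-in)
    layers = []
    for in_d, out_d in zip(dims[:-1], dims[1:]):
        layers.append(nn.Linear(in_d, out_d))
        if activation is not None:
            layers.append(getattr(nn, activation)())
    return nn.Sequential(*layers)

tail_in_dim, out_dim, out_layer_activation = 32, [4, 6], [None, 'Tanh']
model_tails = nn.ModuleList([
    build_fc([tail_in_dim, out_d], out_activ)
    for out_d, out_activ in zip(out_dim, out_layer_activation)
])
x = torch.randn(2, tail_in_dim)
outs = [tail(x) for tail in model_tails]  # [(2, 4) raw logits, (2, 6) tanh outputs]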
Both x and y are lists of the same length, one x and y per environment - ''' - self.lr_scheduler.step(epoch=ps.get(lr_clock, 'total_t')) - self.train() - self.optim.zero_grad() - if loss is None: - outs = self(xs) - total_loss = torch.tensor(0.0, device=self.device) - for out, y in zip(outs, ys): - loss = self.loss_fn(out, y) - total_loss += loss - loss = total_loss - assert not torch.isnan(loss).any(), loss - if net_util.to_assert_trained(): - assert_trained = net_util.gen_assert_trained(self) - loss.backward(retain_graph=retain_graph) - if self.clip_grad_val is not None: - nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) - self.optim.step() - if net_util.to_assert_trained(): - assert_trained(self, loss) - self.store_grad_norms() - logger.debug(f'Net training_step loss: {loss}') - return loss - - def wrap_eval(self, x): - ''' - Completes one feedforward step, ensuring net is set to evaluation model - returns: network output given input x - ''' - self.eval() - return self(x) - class DuelingMLPNet(MLPNet): ''' @@ -404,16 +338,13 @@ def __init__(self, net_spec, in_dim, out_dim): # Guard against inappropriate algorithms and environments # Build model body dims = [self.in_dim] + self.hid_layers - self.model_body = net_util.build_sequential(dims, self.hid_layers_activation) + self.model_body = net_util.build_fc_model(dims, self.hid_layers_activation) # output layers self.v = nn.Linear(dims[-1], 1) # state value self.adv = nn.Linear(dims[-1], out_dim) # action dependent raw advantage net_util.init_layers(self, self.init_fn) - for module in self.modules(): - module.to(self.device) self.loss_fn = net_util.get_loss_fn(self, self.loss_spec) - self.optim = net_util.get_optim(self, self.optim_spec) - self.lr_scheduler = net_util.get_lr_scheduler(self, self.lr_scheduler_spec) + self.to(self.device) def forward(self, x): '''The feedforward step''' diff --git a/convlab/agent/net/net_util.py b/convlab/agent/net/net_util.py index 3f6cb70..753ee88 100644 --- a/convlab/agent/net/net_util.py +++ b/convlab/agent/net/net_util.py @@ -1,18 +1,15 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. 
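The dueling nets keep two heads, a state value self.v and raw advantages self.adv; combining them into Q-values happens downstream of forward, and the exact helper is not shown in this hunk. A common aggregation, shown here only as an illustration:

import torch

def dueling_q(v, adv):
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
    return v + adv - adv.mean(dim=-1, keepdim=True)

v = torch.randn(2, 1)    # state value per batch item
adv = torch.randn(2, 6)  # raw advantage per action
q = dueling_q(v, adv)    # shape (2, 6)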
- -from functools import partial -from convlab import ROOT_DIR -from convlab.lib import logger, util +from functools import partial, wraps +from convlab.lib import logger, optimizer, util import os import pydash as ps import torch import torch.nn as nn - -NN_LOWCASE_LOOKUP = {nn_name.lower(): nn_name for nn_name in nn.__dict__} logger = logger.get_logger(__name__) +# register custom torch.optim +setattr(torch.optim, 'GlobalAdam', optimizer.GlobalAdam) + class NoOpLRScheduler: '''Symbolic LRScheduler class for API consistency''' @@ -24,55 +21,71 @@ def step(self, epoch=None): pass def get_lr(self): - return self.optim.defaults['lr'] + if hasattr(self.optim, 'defaults'): + return self.optim.defaults['lr'] + else: # TODO retrieve lr more generally + return self.optim.param_groups[0]['lr'] -def build_sequential(dims, activation): - '''Build the Sequential model by interleaving nn.Linear and activation_fn''' +def build_fc_model(dims, activation=None): + '''Build a full-connected model by interleaving nn.Linear and activation_fn''' assert len(dims) >= 2, 'dims need to at least contain input, output' + # shift dims and make pairs of (in, out) dims per layer dim_pairs = list(zip(dims[:-1], dims[1:])) layers = [] for in_d, out_d in dim_pairs: layers.append(nn.Linear(in_d, out_d)) - layers.append(get_activation_fn(activation)) + if activation is not None: + layers.append(get_activation_fn(activation)) model = nn.Sequential(*layers) return model +def get_nn_name(uncased_name): + '''Helper to get the proper name in PyTorch nn given a case-insensitive name''' + for nn_name in nn.__dict__: + if uncased_name.lower() == nn_name.lower(): + return nn_name + raise ValueError(f'Name {uncased_name} not found in {nn.__dict__}') + + def get_activation_fn(activation): '''Helper to generate activation function layers for net''' - nn_name = NN_LOWCASE_LOOKUP.get(activation) or NN_LOWCASE_LOOKUP['relu'] - ActivationClass = getattr(nn, nn_name) + ActivationClass = getattr(nn, get_nn_name(activation)) return ActivationClass() def get_loss_fn(cls, loss_spec): '''Helper to parse loss param and construct loss_fn for net''' - LossClass = getattr(nn, loss_spec['name']) + LossClass = getattr(nn, get_nn_name(loss_spec['name'])) loss_spec = ps.omit(loss_spec, 'name') loss_fn = LossClass(**loss_spec) return loss_fn -def get_optim(cls, optim_spec): - '''Helper to parse optim param and construct optim for net''' - OptimClass = getattr(torch.optim, optim_spec['name']) - optim_spec = ps.omit(optim_spec, 'name') - optim = OptimClass(cls.parameters(), **optim_spec) - return optim - - -def get_lr_scheduler(cls, lr_scheduler_spec): +def get_lr_scheduler(optim, lr_scheduler_spec): '''Helper to parse lr_scheduler param and construct Pytorch optim.lr_scheduler''' if ps.is_empty(lr_scheduler_spec): - lr_scheduler = NoOpLRScheduler(cls.optim) + lr_scheduler = NoOpLRScheduler(optim) + elif lr_scheduler_spec['name'] == 'LinearToZero': + LRSchedulerClass = getattr(torch.optim.lr_scheduler, 'LambdaLR') + frame = float(lr_scheduler_spec['frame']) + lr_scheduler = LRSchedulerClass(optim, lr_lambda=lambda x: 1 - x / frame) else: LRSchedulerClass = getattr(torch.optim.lr_scheduler, lr_scheduler_spec['name']) lr_scheduler_spec = ps.omit(lr_scheduler_spec, 'name') - lr_scheduler = LRSchedulerClass(cls.optim, **lr_scheduler_spec) + lr_scheduler = LRSchedulerClass(optim, **lr_scheduler_spec) return lr_scheduler +def get_optim(net, optim_spec): + '''Helper to parse optim param and construct optim for net''' + OptimClass = getattr(torch.optim, 
optim_spec['name']) + optim_spec = ps.omit(optim_spec, 'name') + optim = OptimClass(net.parameters(), **optim_spec) + return optim + + def get_policy_out_dim(body): '''Helper method to construct the policy network out_dim for a body according to is_discrete, action_type''' action_dim = body.action_dim @@ -84,15 +97,11 @@ def get_policy_out_dim(body): assert ps.is_integer(action_dim), action_dim policy_out_dim = action_dim else: - if body.action_type == 'multi_continuous': - assert ps.is_list(action_dim), action_dim - raise NotImplementedError('multi_continuous not supported yet') - else: - assert ps.is_integer(action_dim), action_dim - if action_dim == 1: - policy_out_dim = 2 # singleton stay as int - else: - policy_out_dim = action_dim * [2] + assert ps.is_integer(action_dim), action_dim + if action_dim == 1: # single action, use [loc, scale] + policy_out_dim = 2 + else: # multi-action, use [locs], [scales] + policy_out_dim = [action_dim, action_dim] return policy_out_dim @@ -109,33 +118,38 @@ def get_out_dim(body, add_critic=False): return out_dim -def init_layers(net, init_fn): - if init_fn is None: +def init_layers(net, init_fn_name): + '''Primary method to initialize the weights of the layers of a network''' + if init_fn_name is None: return - if init_fn == 'xavier_uniform_': - try: - gain = nn.init.calculate_gain(net.hid_layers_activation) - except ValueError: - gain = 1 - init_fn = partial(nn.init.xavier_uniform_, gain=gain) - elif 'kaiming' in init_fn: - assert net.hid_layers_activation in ['relu', 'leaky_relu'], f'Kaiming initialization not supported for {net.hid_layers_activation}' - init_fn = nn.init.__dict__[init_fn] - init_fn = partial(init_fn, nonlinearity=net.hid_layers_activation) + + # get nonlinearity + nonlinearity = get_nn_name(net.hid_layers_activation).lower() + if nonlinearity == 'leakyrelu': + nonlinearity = 'leaky_relu' # guard name + + # get init_fn and add arguments depending on nonlinearity + init_fn = getattr(nn.init, init_fn_name) + if 'kaiming' in init_fn_name: # has 'nonlinearity' as arg + assert nonlinearity in ['relu', 'leaky_relu'], f'Kaiming initialization not supported for {nonlinearity}' + init_fn = partial(init_fn, nonlinearity=nonlinearity) + elif 'orthogonal' in init_fn_name or 'xavier' in init_fn_name: # has 'gain' as arg + gain = nn.init.calculate_gain(nonlinearity) + init_fn = partial(init_fn, gain=gain) else: - init_fn = nn.init.__dict__[init_fn] - net.apply(partial(init_parameters, init_fn=init_fn)) + pass + # finally, apply init_params to each layer in its modules + net.apply(partial(init_params, init_fn=init_fn)) -def init_parameters(module, init_fn): - ''' - Initializes module's weights using init_fn, which is the name of function from from nn.init - Initializes module's biases to either 0.01 or 0.0, depending on module - The only exception is BatchNorm layers, for which we use uniform initialization - ''' - bias_init = 0.01 - classname = module.__class__.__name__ - if 'BatchNorm' in classname: + +def init_params(module, init_fn): + '''Initialize module's weights using init_fn, and biases to 0.0''' + bias_init = 0.0 + classname = util.get_class_name(module) + if 'Net' in classname: # skip if it's a net, not pytorch layer + pass + elif any(k in classname for k in ('BatchNorm', 'Conv', 'Linear')): init_fn(module.weight) nn.init.constant_(module.bias, bias_init) elif 'GRU' in classname: @@ -143,10 +157,9 @@ def init_parameters(module, init_fn): if 'weight' in name: init_fn(param) elif 'bias' in name: - nn.init.constant_(param, 0.0) - elif 
'Linear' in classname or ('Conv' in classname and 'Net' not in classname): - init_fn(module.weight) - nn.init.constant_(module.bias, bias_init) + nn.init.constant_(param, bias_init) + else: + pass # params methods @@ -155,30 +168,31 @@ def init_parameters(module, init_fn): def save(net, model_path): '''Save model weights to path''' torch.save(net.state_dict(), util.smart_path(model_path)) - logger.info(f'Saved model to {model_path}') def save_algorithm(algorithm, ckpt=None): '''Save all the nets for an algorithm''' agent = algorithm.agent net_names = algorithm.net_names - prepath = util.get_prepath(agent.spec, agent.info_space, unit='session') + model_prepath = agent.spec['meta']['model_prepath'] if ckpt is not None: - prepath = f'{prepath}_ckpt-{ckpt}' - logger.info(f'Saving algorithm {util.get_class_name(algorithm)} nets {net_names}') + model_prepath = f'{model_prepath}_ckpt-{ckpt}' for net_name in net_names: net = getattr(algorithm, net_name) - model_path = f'{prepath}_{net_name}_model.pth' + model_path = f'{model_prepath}_{net_name}_model.pt' save(net, model_path) - optim_path = f'{prepath}_{net_name}_optim.pth' - save(net.optim, optim_path) + optim_name = net_name.replace('net', 'optim') + optim = getattr(algorithm, optim_name, None) + if optim is not None: # only trainable net has optim + optim_path = f'{model_prepath}_{net_name}_optim.pt' + save(optim, optim_path) + logger.debug(f'Saved algorithm {util.get_class_name(algorithm)} nets {net_names} to {model_prepath}_*.pt') def load(net, model_path): '''Save model weights from a path into a net module''' device = None if torch.cuda.is_available() else 'cpu' net.load_state_dict(torch.load(util.smart_path(model_path), map_location=device)) - logger.info(f'Loaded model from {model_path}') def load_algorithm(algorithm): @@ -187,16 +201,19 @@ def load_algorithm(algorithm): net_names = algorithm.net_names if util.in_eval_lab_modes(): # load specific model in eval mode - prepath = agent.info_space.eval_model_prepath + model_prepath = agent.spec['meta']['eval_model_prepath'] else: - prepath = util.get_prepath(agent.spec, agent.info_space, unit='session') - logger.info(f'Loading algorithm {util.get_class_name(algorithm)} nets {net_names}') + model_prepath = agent.spec['meta']['model_prepath'] + logger.info(f'Loading algorithm {util.get_class_name(algorithm)} nets {net_names} from {model_prepath}_*.pt') for net_name in net_names: net = getattr(algorithm, net_name) - model_path = f'{prepath}_{net_name}_model.pth' + model_path = f'{model_prepath}_{net_name}_model.pt' load(net, model_path) - optim_path = f'{prepath}_{net_name}_optim.pth' - load(net.optim, optim_path) + optim_name = net_name.replace('net', 'optim') + optim = getattr(algorithm, optim_name, None) + if optim is not None: # only trainable net has optim + optim_path = f'{model_prepath}_{net_name}_optim.pt' + load(optim, optim_path) def copy(src_net, tar_net): @@ -204,54 +221,75 @@ def copy(src_net, tar_net): tar_net.load_state_dict(src_net.state_dict()) -def polyak_update(src_net, tar_net, beta=0.5): - '''Polyak weight update to update a target tar_net''' - tar_params = tar_net.named_parameters() - src_params = src_net.named_parameters() - src_dict_params = dict(src_params) - - for name, tar_param in tar_params: - if name in src_dict_params: - src_dict_params[name].data.copy_(beta * tar_param.data + (1 - beta) * src_dict_params[name].data) - - tar_net.load_state_dict(src_dict_params) +def polyak_update(src_net, tar_net, old_ratio=0.5): + ''' + Polyak weight update to update a target 
tar_net, retain old weights by its ratio, i.e. + target <- old_ratio * source + (1 - old_ratio) * target + ''' + for src_param, tar_param in zip(src_net.parameters(), tar_net.parameters()): + tar_param.data.copy_(old_ratio * src_param.data + (1.0 - old_ratio) * tar_param.data) -def to_assert_trained(): +def to_check_train_step(): '''Condition for running assert_trained''' return os.environ.get('PY_ENV') == 'test' or util.get_lab_mode() == 'dev' -def gen_assert_trained(pre_model): +def dev_check_train_step(fn): ''' - Generate assert_trained function used to check weight updates + Decorator to check if net.train_step actually updates the network weights properly + Triggers only if to_check_train_step is True (dev/test mode) @example - assert_trained = gen_assert_trained(model) - # ... - loss.backward() - optim.step() - assert_trained(model, loss) + @net_util.dev_check_train_step + def train_step(self, ...): + ... ''' - pre_weights = [param.clone() for param in pre_model.parameters()] - - def assert_trained(post_model, loss): - post_weights = [param.clone() for param in post_model.parameters()] - if loss == 0: - # TODO if without momentum, weights should not change too - for p_name, param in post_model.named_parameters(): + @wraps(fn) + def check_fn(*args, **kwargs): + if not to_check_train_step(): + return fn(*args, **kwargs) + + net = args[0] # first arg self + # get pre-update parameters to compare + pre_params = [param.clone() for param in net.parameters()] + + # run train_step, get loss + loss = fn(*args, **kwargs) + assert not torch.isnan(loss).any(), loss + + # get post-update parameters to compare + post_params = [param.clone() for param in net.parameters()] + if loss == 0.0: + # if loss is 0, there should be no updates + # TODO if without momentum, parameters should not change too + for p_name, param in net.named_parameters(): assert param.grad.norm() == 0 else: - assert not all(torch.equal(w1, w2) for w1, w2 in zip(pre_weights, post_weights)), f'Model parameter is not updated in training_step(), check if your tensor is detached from graph. loss: {loss}' - min_norm = 0 - max_norm = 1e5 - for p_name, param in post_model.named_parameters(): + # check parameter updates + try: + assert not all(torch.equal(w1, w2) for w1, w2 in zip(pre_params, post_params)), f'Model parameter is not updated in train_step(), check if your tensor is detached from graph. Loss: {loss:g}' + logger.info(f'Model parameter is updated in train_step(). Loss: {loss: g}') + except Exception as e: + logger.error(e) + if os.environ.get('PY_ENV') == 'test': + # raise error if in unit test + raise(e) + + # check grad norms + min_norm, max_norm = 0.0, 1e5 + for p_name, param in net.named_parameters(): try: - assert min_norm < param.grad.norm() < max_norm, f'Gradient norm fails the extreme value check {min_norm} < {p_name}:{param.grad.norm()} < {max_norm}, which is bad. Loss: {loss}. Check your network and loss computation. Consider using the "clip_grad_val" net parameter.' + grad_norm = param.grad.norm() + assert min_norm < grad_norm < max_norm, f'Gradient norm for {p_name} is {grad_norm:g}, fails the extreme value check {min_norm} < grad_norm < {max_norm}. Loss: {loss:g}. Check your network and loss computation.' 
except Exception as e: - logger.warn(e) - logger.debug('Passed network weight update assertation in dev lab_mode.') - return assert_trained + logger.warning(e) + logger.info(f'Gradient norms passed value check.') + logger.debug('Passed network parameter update check.') + # store grad norms for debugging + net.store_grad_norms() + return loss + return check_fn def get_grad_norms(algorithm): @@ -262,3 +300,54 @@ def get_grad_norms(algorithm): if net.grad_norms is not None: grad_norms.extend(net.grad_norms) return grad_norms + + +def init_global_nets(algorithm): + ''' + Initialize global_nets for Hogwild using an identical instance of an algorithm from an isolated Session + in spec.meta.distributed, specify either: + - 'shared': global network parameter is shared all the time. In this mode, algorithm local network will be replaced directly by global_net via overriding by identify attribute name + - 'synced': global network parameter is periodically synced to local network after each gradient push. In this mode, algorithm will keep a separate reference to `global_{net}` for each of its network + ''' + dist_mode = algorithm.agent.spec['meta']['distributed'] + assert dist_mode in ('shared', 'synced'), f'Unrecognized distributed mode' + global_nets = {} + for net_name in algorithm.net_names: + optim_name = net_name.replace('net', 'optim') + if not hasattr(algorithm, optim_name): # only for trainable network, i.e. has an optim + continue + g_net = getattr(algorithm, net_name) + g_net.share_memory() # make net global + if dist_mode == 'shared': # use the same name to override the local net + global_nets[net_name] = g_net + else: # keep a separate reference for syncing + global_nets[f'global_{net_name}'] = g_net + # if optim is Global, set to override the local optim and its scheduler + optim = getattr(algorithm, optim_name) + if 'Global' in util.get_class_name(optim): + optim.share_memory() # make optim global + global_nets[optim_name] = optim + lr_scheduler_name = net_name.replace('net', 'lr_scheduler') + lr_scheduler = getattr(algorithm, lr_scheduler_name) + global_nets[lr_scheduler_name] = lr_scheduler + logger.info(f'Initialized global_nets attr {list(global_nets.keys())} for Hogwild') + return global_nets + + +def set_global_nets(algorithm, global_nets): + '''For Hogwild, set attr built in init_global_nets above. Use in algorithm init.''' + # set attr first so algorithm always has self.global_{net} to pass into train_step + for net_name in algorithm.net_names: + setattr(algorithm, f'global_{net_name}', None) + # set attr created in init_global_nets + if global_nets is not None: + util.set_attr(algorithm, global_nets) + logger.info(f'Set global_nets attr {list(global_nets.keys())} for Hogwild') + + +def push_global_grads(net, global_net): + '''Push gradients to global_net, call inside train_step between loss.backward() and optim.step()''' + for param, global_param in zip(net.parameters(), global_net.parameters()): + if global_param.grad is not None: + return # quick skip + global_param._grad = param.grad diff --git a/convlab/agent/net/recurrent.py b/convlab/agent/net/recurrent.py index 0d9cf50..3423b84 100644 --- a/convlab/agent/net/recurrent.py +++ b/convlab/agent/net/recurrent.py @@ -1,16 +1,9 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. 
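init_global_nets, set_global_nets and push_global_grads above support Hogwild training: shared-memory global parameters, worker processes that compute gradients locally and push them to the global net, and a global optimizer stepping on the shared parameters. A minimal sketch of that flow, using a toy linear net and plain SGD instead of the lab's GlobalAdam; the lab's actual session orchestration may differ:

import torch
import torch.nn as nn
import torch.multiprocessing as mp

def worker(global_net):
    local_net = nn.Linear(4, 2)
    local_net.load_state_dict(global_net.state_dict())        # sync local from global
    optim = torch.optim.SGD(global_net.parameters(), lr=0.1)  # optimizer over shared params
    loss = local_net(torch.randn(8, 4)).pow(2).mean()
    loss.backward()
    for p, gp in zip(local_net.parameters(), global_net.parameters()):
        gp._grad = p.grad                                      # push grads, as push_global_grads does
    optim.step()                                               # update shared parameters in place

if __name__ == '__main__':
    global_net = nn.Linear(4, 2)
    global_net.share_memory()                                  # make params visible across processes
    procs = [mp.Process(target=worker, args=(global_net,)) for _ in range(2)]
    [p.start() for p in procs]
    [p.join() for p in procs]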
- from convlab.agent.net import net_util from convlab.agent.net.base import Net -from convlab.lib import logger, util -import numpy as np +from convlab.lib import util import pydash as ps -import torch import torch.nn as nn -logger = logger.get_logger(__name__) - class RecurrentNet(Net, nn.Module): ''' @@ -30,6 +23,7 @@ class RecurrentNet(Net, nn.Module): "cell_type": "GRU", "fc_hid_layers": [], "hid_layers_activation": "relu", + "out_layer_activation": null, "rnn_hidden_size": 32, "rnn_num_layers": 1, "bidirectional": False, @@ -61,6 +55,7 @@ def __init__(self, net_spec, in_dim, out_dim): cell_type: any of RNN, LSTM, GRU fc_hid_layers: list of fc layers preceeding the RNN layers hid_layers_activation: activation function for the fc hidden layers + out_layer_activation: activation function for the output layer, same shape as out_dim rnn_hidden_size: rnn hidden_size rnn_num_layers: number of recurrent layers bidirectional: if RNN should be bidirectional @@ -76,9 +71,10 @@ def __init__(self, net_spec, in_dim, out_dim): gpu: whether to train using a GPU. Note this will only work if a GPU is available, othewise setting gpu=True does nothing ''' nn.Module.__init__(self) - super(RecurrentNet, self).__init__(net_spec, in_dim, out_dim) + super().__init__(net_spec, in_dim, out_dim) # set default util.set_attr(self, dict( + out_layer_activation=None, cell_type='GRU', rnn_num_layers=1, bidirectional=False, @@ -96,6 +92,7 @@ def __init__(self, net_spec, in_dim, out_dim): 'cell_type', 'fc_hid_layers', 'hid_layers_activation', + 'out_layer_activation', 'rnn_hidden_size', 'rnn_num_layers', 'bidirectional', @@ -110,16 +107,18 @@ def __init__(self, net_spec, in_dim, out_dim): 'polyak_coef', 'gpu', ]) - # fc layer: state processing model - if not ps.is_empty(self.fc_hid_layers): + # restore proper in_dim from env stacked state_dim (stack_len, *raw_state_dim) + self.in_dim = in_dim[1:] if len(in_dim) > 2 else in_dim[1] + # fc body: state processing model + if ps.is_empty(self.fc_hid_layers): + self.rnn_input_dim = self.in_dim + else: fc_dims = [self.in_dim] + self.fc_hid_layers - self.fc_model = net_util.build_sequential(fc_dims, self.hid_layers_activation) + self.fc_model = net_util.build_fc_model(fc_dims, self.hid_layers_activation) self.rnn_input_dim = fc_dims[-1] - else: - self.rnn_input_dim = self.in_dim # RNN model - self.rnn_model = getattr(nn, self.cell_type)( + self.rnn_model = getattr(nn, net_util.get_nn_name(self.cell_type))( input_size=self.rnn_input_dim, hidden_size=self.rnn_hidden_size, num_layers=self.rnn_num_layers, @@ -127,19 +126,21 @@ def __init__(self, net_spec, in_dim, out_dim): # tails. 
avoid list for single-tail for compute speed if ps.is_integer(self.out_dim): - self.model_tail = nn.Linear(self.rnn_hidden_size, self.out_dim) + self.model_tail = net_util.build_fc_model([self.rnn_hidden_size, self.out_dim], self.out_layer_activation) else: - self.model_tails = nn.ModuleList([nn.Linear(self.rnn_hidden_size, out_d) for out_d in self.out_dim]) + if not ps.is_list(self.out_layer_activation): + self.out_layer_activation = [self.out_layer_activation] * len(out_dim) + assert len(self.out_layer_activation) == len(self.out_dim) + tails = [] + for out_d, out_activ in zip(self.out_dim, self.out_layer_activation): + tail = net_util.build_fc_model([self.rnn_hidden_size, out_d], out_activ) + tails.append(tail) + self.model_tails = nn.ModuleList(tails) net_util.init_layers(self, self.init_fn) - for module in self.modules(): - module.to(self.device) self.loss_fn = net_util.get_loss_fn(self, self.loss_spec) - self.optim = net_util.get_optim(self, self.optim_spec) - self.lr_scheduler = net_util.get_lr_scheduler(self, self.lr_scheduler_spec) - - def __str__(self): - return super(RecurrentNet, self).__str__() + f'\noptim: {self.optim}' + self.to(self.device) + self.train() def forward(self, x): '''The feedforward step. Input is batch_size x seq_len x state_dim''' @@ -163,33 +164,3 @@ def forward(self, x): return outs else: return self.model_tail(hid_x) - - def training_step(self, x=None, y=None, loss=None, retain_graph=False, lr_clock=None): - '''Takes a single training step: one forward and one backwards pass''' - if hasattr(self, 'model_tails') and x is not None: - raise ValueError('Loss computation from x,y not supported for multitails') - self.lr_scheduler.step(epoch=ps.get(lr_clock, 'total_t')) - self.train() - self.optim.zero_grad() - if loss is None: - out = self(x) - loss = self.loss_fn(out, y) - assert not torch.isnan(loss).any(), loss - if net_util.to_assert_trained(): - assert_trained = net_util.gen_assert_trained(self) - loss.backward(retain_graph=retain_graph) - if self.clip_grad_val is not None: - nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) - self.optim.step() - if net_util.to_assert_trained(): - assert_trained(self, loss) - self.store_grad_norms() - logger.debug(f'Net training_step loss: {loss}') - return loss - - def wrap_eval(self, x): - ''' - Completes one feedforward step, ensuring net is set to evaluation model returns: network output given input x - ''' - self.eval() - return self(x) diff --git a/convlab/env/__init__.py b/convlab/env/__init__.py index 2a74364..1fecfeb 100644 --- a/convlab/env/__init__.py +++ b/convlab/env/__init__.py @@ -7,7 +7,8 @@ Provides the rich experience for agent embodiment, reflects the curriculum and allows teaching (possibly allows teacher to enter). To be designed by human and evolution module, based on the curriculum and fitness metrics. 
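RecurrentNet's forward above expects stacked states of shape (batch_size, seq_len, state_dim), optionally passes each timestep through the fc body, runs the RNN, and feeds the last hidden state to the output tail(s). A shape-level sketch of that contract with a bare nn.GRU and toy dimensions:

import torch
import torch.nn as nn

batch_size, seq_len, state_dim, hidden_size = 5, 4, 10, 32
rnn = nn.GRU(input_size=state_dim, hidden_size=hidden_size, num_layers=1, batch_first=True)

x = torch.randn(batch_size, seq_len, state_dim)  # stacked states, e.g. frame_op='stack' with seq_len
out, h_n = rnn(x)
last_hidden = h_n[-1]                            # (batch_size, hidden_size); fed to the output tail(s)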
''' -from convlab.env.base import Clock, ENV_DATA_NAMES +# from convlab.env.base import Clock, ENV_DATA_NAMES +from convlab.env.base import Clock from convlab.lib import logger, util from convlab.lib.decorator import lab_api import pydash as ps @@ -16,13 +17,13 @@ logger = logger.get_logger(__name__) -def make_env(spec, e=None, env_space=None): +def make_env(spec, e=None): if spec['env'][0]['name'] == 'movie': from convlab.env.movie import MovieEnv - env = MovieEnv(spec, e, env_space) + env = MovieEnv(spec, e) elif spec['env'][0]['name'] == 'multiwoz': from convlab.env.multiwoz import MultiWozEnv - env = MultiWozEnv(spec, e, env_space) + env = MultiWozEnv(spec, e) return env diff --git a/convlab/env/base.py b/convlab/env/base.py index df34567..5c476a7 100644 --- a/convlab/env/base.py +++ b/convlab/env/base.py @@ -1,15 +1,11 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from abc import ABC, abstractmethod from gym import spaces from convlab.lib import logger, util from convlab.lib.decorator import lab_api import numpy as np +import pydash as ps import time -ENV_DATA_NAMES = ['reward', 'state', 'done'] -NUM_EVAL_EPI = 100 # set the number of episodes to eval a model ckpt logger = logger.get_logger(__name__) @@ -36,39 +32,41 @@ def set_gym_space_attr(gym_space): class Clock: '''Clock class for each env and space to keep track of relative time. Ticking and control loop is such that reset is at t=0 and epi=0''' - def __init__(self, clock_speed=1): + def __init__(self, max_frame=int(1e7), clock_speed=1): + self.max_frame = max_frame self.clock_speed = int(clock_speed) - self.ticks = 0 # multiple ticks make a timestep; used for clock speed + self.reset() + + def reset(self): self.t = 0 - self.total_t = 0 - self.epi = -1 # offset so epi is 0 when it gets ticked at start + self.frame = 0 # i.e. total_t + self.epi = 0 self.start_wall_t = time.time() + self.batch_size = 1 # multiplier to accurately count opt steps + self.opt_step = 0 # count the number of optimizer updates - def get(self, unit='t'): + def get(self, unit='frame'): return getattr(self, unit) def get_elapsed_wall_t(self): '''Calculate the elapsed wall time (int seconds) since self.start_wall_t''' return int(time.time() - self.start_wall_t) + def set_batch_size(self, batch_size): + self.batch_size = batch_size + def tick(self, unit='t'): if unit == 't': # timestep - if self.to_step(): - self.t += 1 - self.total_t += 1 - else: - pass - self.ticks += 1 + self.t += self.clock_speed + self.frame += self.clock_speed elif unit == 'epi': # episode, reset timestep self.epi += 1 self.t = 0 + elif unit == 'opt_step': + self.opt_step += self.batch_size else: raise KeyError - def to_step(self): - '''Step signal from clock_speed. Step only if the base unit of time in this clock has moved. Used to control if env of different clock_speed should step()''' - return self.ticks % self.clock_speed == 0 - class BaseEnv(ABC): ''' @@ -76,46 +74,59 @@ class BaseEnv(ABC): e.g. 
env_spec "env": [{ - "name": "CartPole-v0", - "max_t": null, - "max_tick": 150, - }], - - # or using total_t - "env": [{ - "name": "CartPole-v0", - "max_t": null, - "max_tick": 10000, + "name": "PongNoFrameskip-v4", + "frame_op": "concat", + "frame_op_len": 4, + "normalize_state": false, + "reward_scale": "sign", + "num_envs": 8, + "max_t": null, + "max_frame": 1e7 }], ''' - def __init__(self, spec, e=None, env_space=None): - self.e = e or 0 # for compatibility with env_space - self.clock_speed = 1 - self.clock = Clock(self.clock_speed) + def __init__(self, spec, e=None): + self.e = e or 0 # for multi-env self.done = False self.env_spec = spec['env'][self.e] + # set default util.set_attr(self, dict( - reward_scale=1.0, + log_frequency=None, # default to log at epi done + frame_op=None, + frame_op_len=None, + normalize_state=False, + reward_scale=None, + num_envs=None, )) util.set_attr(self, spec['meta'], [ + 'log_frequency', 'eval_frequency', - 'max_tick_unit', ]) util.set_attr(self, self.env_spec, [ 'name', - 'max_t', - 'max_tick', + 'frame_op', + 'frame_op_len', + 'normalize_state', 'reward_scale', + 'num_envs', + 'max_t', + 'max_frame', ]) - if util.get_lab_mode() == 'eval': - # override for eval, offset so epi is 0 - (num_eval_epi - 1) - logger.info(f'Override max_tick for eval mode to {NUM_EVAL_EPI} epi') - self.max_tick = NUM_EVAL_EPI - 1 - self.max_tick_unit = 'epi' - # set max_tick info to clock - self.clock.max_tick = self.max_tick - self.clock.max_tick_unit = self.max_tick_unit + seq_len = ps.get(spec, 'agent.0.net.seq_len') + if seq_len is not None: # infer if using RNN + self.frame_op = 'stack' + self.frame_op_len = seq_len + if util.in_eval_lab_modes(): # use singleton for eval + self.num_envs = 1 + self.log_frequency = None + if spec['meta']['distributed'] != False: # divide max_frame for distributed + self.max_frame = int(self.max_frame / spec['meta']['max_session']) + self.is_venv = (self.num_envs is not None and self.num_envs > 1) + if self.is_venv: + assert self.log_frequency is not None, f'Specify log_frequency when using venv' + self.clock_speed = 1 * (self.num_envs or 1) # tick with a multiple of num_envs to properly count frames + self.clock = Clock(self.max_frame, self.clock_speed) + self.to_render = util.to_render() def _set_attr_from_u_env(self, u_env): '''Set the observation, action dimensions and action type from u_env''' @@ -159,13 +170,13 @@ def _is_discrete(self, action_space): @abstractmethod @lab_api def reset(self): - '''Reset method, return _reward, state, done''' + '''Reset method, return state''' raise NotImplementedError @abstractmethod @lab_api def step(self, action): - '''Step method, return reward, state, done''' + '''Step method, return state, reward, done, info''' raise NotImplementedError @abstractmethod @@ -173,27 +184,3 @@ def step(self, action): def close(self): '''Method to close and cleanup env''' raise NotImplementedError - - @lab_api - def set_body_e(self, body_e): - '''Method called by body_space.init_body_space to complete the necessary backward reference needed for EnvSpace to work''' - self.body_e = body_e - self.nanflat_body_e = util.nanflatten(self.body_e) - for idx, body in enumerate(self.nanflat_body_e): - body.nanflat_e_idx = idx - self.body_num = len(self.nanflat_body_e) - - @lab_api - def space_init(self, env_space): - '''Post init override for space env. 
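The reworked Clock above counts frames rather than raw env steps: clock_speed is set to a multiple of num_envs so one vector-env step advances frame by num_envs, and opt_step advances by the registered batch size. A small usage check built directly from the class as defined above:

from convlab.env.base import Clock

clock = Clock(max_frame=1000, clock_speed=4)  # clock_speed = num_envs for vector envs
clock.set_batch_size(32)

clock.tick('t')         # one env step: t += 4, frame += 4
clock.tick('opt_step')  # one optimizer update: opt_step += batch_size = 32
clock.tick('epi')       # episode boundary: epi += 1, t resets to 0

assert (clock.get('frame'), clock.get('opt_step'), clock.get('epi')) == (4, 32, 1)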
Note that aeb is already correct from __init__''' - raise NotImplementedError - - @lab_api - def space_reset(self): - '''Space (multi-env) reset method, return _reward_e, state_e, done_e''' - raise NotImplementedError - - @lab_api - def space_step(self, action_e): - '''Space (multi-env) step method, return reward_e, state_e, done_e''' - raise NotImplementedError diff --git a/convlab/env/multiwoz.py b/convlab/env/multiwoz.py index 78bfc63..a8c665d 100644 --- a/convlab/env/multiwoz.py +++ b/convlab/env/multiwoz.py @@ -9,7 +9,7 @@ import pydash as ps from gym import spaces -from convlab.env.base import BaseEnv, ENV_DATA_NAMES, set_gym_space_attr +from convlab.env.base import BaseEnv, set_gym_space_attr # from convlab.env.registration import get_env_path from convlab.lib import logger, util from convlab.lib.decorator import lab_api @@ -91,11 +91,11 @@ def step(self, action): str_sys_response = '{}'.format(action) str_user_response = '{}'.format(user_response) self.history.extend([str_sys_response, str_user_response]) - if session_over: - dialog_status = self.simulator.policy.goal.task_complete() - if dialog_status: - self.stat['success'] += 1 - else: self.stat['fail'] += 1 + # if session_over: + # dialog_status = self.simulator.policy.goal.task_complete() + # if dialog_status: + # self.stat['success'] += 1 + # else: self.stat['fail'] += 1 self.env_info = [State(user_response, reward, session_over)] return self.env_info @@ -150,8 +150,9 @@ def _score(a1, a2): return action def close(self): - print('\nstatistics: %s' % (self.stat)) - print('\nsuccess rate: %s' % (self.stat['success']/(self.stat['success']+self.stat['fail']))) + pass + # print('\nstatistics: %s' % (self.stat)) + # print('\nsuccess rate: %s' % (self.stat['success']/(self.stat['success']+self.stat['fail']))) class MultiWozEnv(BaseEnv): @@ -171,8 +172,8 @@ class MultiWozEnv(BaseEnv): }], ''' - def __init__(self, spec, e=None, env_space=None): - super(MultiWozEnv, self).__init__(spec, e, env_space) + def __init__(self, spec, e=None): + super(MultiWozEnv, self).__init__(spec, e) self.action_dim = self.observation_dim = 0 util.set_attr(self, self.env_spec, [ 'observation_dim', @@ -182,11 +183,6 @@ def __init__(self, spec, e=None, env_space=None): self.u_env = MultiWozEnvironment(self.env_spec, worker_id, self.action_dim) self.patch_gym_spaces(self.u_env) self._set_attr_from_u_env(self.u_env) - # assert self.max_t is not None - if env_space is None: # singleton mode - pass - else: - self.space_init(env_space) logger.info(util.self_desc(self)) @@ -210,69 +206,33 @@ def _get_env_info(self, env_info_dict, a): @lab_api def reset(self): - _reward = np.nan + # _reward = np.nan env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('multiwoz')) a, b = 0, 0 # default singleton aeb env_info_a = self._get_env_info(env_info_dict, a) state = env_info_a.states[b] - self.done = done = False - logger.debug(f'Env {self.e} reset reward: {_reward}, state: {state}, done: {done}') - return _reward, state, done + # self.done = done = False + # logger.debug(f'Env {self.e} reset reward: {_reward}, state: {state}, done: {done}') + # return _reward, state, done + self.done = False + logger.debug(f'Env {self.e} reset state: {state}') + return state @lab_api def step(self, action): env_info_dict = self.u_env.step(action) a, b = 0, 0 # default singleton aeb env_info_a = self._get_env_info(env_info_dict, a) - reward = env_info_a.rewards[b] * self.reward_scale + reward = env_info_a.rewards[b] # * self.reward_scale state 
= env_info_a.states[b] done = env_info_a.local_done[b] self.done = done = done or self.clock.t > self.max_t logger.debug(f'Env {self.e} step reward: {reward}, state: {state}, done: {done}') - return reward, state, done + return state, reward, done, env_info_a @lab_api def close(self): self.u_env.close() - # NOTE optional extension for multi-agent-env - - @lab_api - def space_init(self, env_space): - '''Post init override for space env. Note that aeb is already correct from __init__''' - self.env_space = env_space - self.aeb_space = env_space.aeb_space - self.observation_spaces = [self.observation_space] - self.action_spaces = [self.action_space] - - @lab_api - def space_reset(self): - self._check_u_brain_to_agent() - self.done = False - env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('multiwoz')) - _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e) - for (a, b), body in util.ndenumerate_nonan(self.body_e): - env_info_a = self._get_env_info(env_info_dict, a) - self._check_u_agent_to_body(env_info_a, a) - state = env_info_a.states[b] - state_e[(a, b)] = state - done_e[(a, b)] = self.done - logger.debug(f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}') - return _reward_e, state_e, done_e - - @lab_api - def space_step(self, action_e): - # TODO implement clock_speed: step only if self.clock.to_step() - if self.done: - return self.space_reset() - action_e = util.nanflatten(action_e) - env_info_dict = self.u_env.step(action_e) - reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e) - for (a, b), body in util.ndenumerate_nonan(self.body_e): - env_info_a = self._get_env_info(env_info_dict, a) - reward_e[(a, b)] = env_info_a.rewards[b] * self.reward_scale - state_e[(a, b)] = env_info_a.states[b] - done_e[(a, b)] = env_info_a.local_done[b] - self.done = (util.nonan_all(done_e) or self.clock.t > self.max_t) - logger.debug(f'Env {self.e} step reward_e: {reward_e}, state_e: {state_e}, done_e: {done_e}') - return reward_e, state_e, done_e + def get_task_success(self): + return self.u_env.simulator.policy.goal.task_complete() \ No newline at end of file diff --git a/convlab/experiment/__init__.py b/convlab/experiment/__init__.py index a46b534..a9d8b91 100644 --- a/convlab/experiment/__init__.py +++ b/convlab/experiment/__init__.py @@ -1,7 +1,2 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - -''' -The experiment module -Handles experimentation logic: control, design, monitoring, analysis, evolution -''' +# The experiment module +# Handles experimentation logic: control, analysis diff --git a/convlab/experiment/analysis.py b/convlab/experiment/analysis.py index aebf891..89c02e1 100644 --- a/convlab/experiment/analysis.py +++ b/convlab/experiment/analysis.py @@ -1,569 +1,284 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - -''' -The analysis module -Handles the analyses of the info and data space for experiment evaluation and design. 
-''' -from convlab.agent import AGENT_DATA_NAMES -from convlab.env import ENV_DATA_NAMES -from convlab.lib import logger, math_util, util, viz -from convlab.spec import spec_util +from convlab.lib import logger, util, viz +from convlab.spec import random_baseline import numpy as np import os import pandas as pd import pydash as ps -import regex as re import shutil +import torch -FITNESS_COLS = ['strength', 'speed', 'stability', 'consistency'] -# TODO improve to make it work with any reward mean -FITNESS_STD = util.read('convlab/spec/_fitness_std.json') -NOISE_WINDOW = 0.05 -NORM_ORDER = 1 # use L1 norm in fitness vector norm -MA_WINDOW = 100 -logger = logger.get_logger(__name__) - -''' -Fitness analysis -''' +NUM_EVAL = 4 +METRICS_COLS = [ + 'strength', 'max_strength', 'final_strength', + 'sample_efficiency', 'training_efficiency', + 'stability', 'consistency', +] -def calc_strength_sr(aeb_df, rand_reward, std_reward): - ''' - Calculate strength for each reward as - strength = (reward - rand_reward) / (std_reward - rand_reward) - ''' - return (aeb_df['reward'] - rand_reward) / (std_reward - rand_reward) +logger = logger.get_logger(__name__) -def calc_strength(aeb_df): +# methods to generate returns (total rewards) + +def gen_return(agent, env): + '''Generate return for an agent and an env in eval mode''' + obs = env.reset() + agent.reset(obs) + done = False + total_reward = 0 + while not done: + action = agent.act(obs) + next_obs, reward, done, info = env.step(action) + agent.update(obs, action, reward, next_obs, done) + obs = next_obs + total_reward += reward + return total_reward + + +def gen_avg_return(agent, env, num_eval=NUM_EVAL): + '''Generate average return for agent and an env''' + with util.ctx_lab_mode('eval'): # enter eval context + agent.algorithm.update() # set explore_var etc. to end_val under ctx + with torch.no_grad(): + returns = [gen_return(agent, env) for i in range(num_eval)] + # exit eval context, restore variables simply by updating + agent.algorithm.update() + return np.mean(returns) + + +def gen_result(agent, env): + '''Generate average return for agent and an env''' + with util.ctx_lab_mode('eval'): # enter eval context + agent.algorithm.update() # set explore_var etc. to end_val under ctx + with torch.no_grad(): + _return = gen_return(agent, env) + # exit eval context, restore variables simply by updating + agent.algorithm.update() + if hasattr(env, 'get_task_success'): + return _return, env.get_task_success() + return _return + + +# metrics calculation methods + +def calc_strength(mean_returns, mean_rand_returns): ''' - Strength of an agent in fitness is its maximum strength_ma. Moving average is used to denoise signal. - For an agent total reward at a time, calculate strength by normalizing it with a given baseline rand_reward and solution std_reward, i.e. - strength = (reward - rand_reward) / (std_reward - rand_reward) - - **Properties:** - - random agent has strength 0, standard agent has strength 1. - - strength is standardized to be independent of the actual sign and scale of raw reward - - scales relative to std_reward: if an agent achieve x2 std_reward, the strength is x2, and so on. - This allows for standard comparison between agents on the same problem using an intuitive measurement of strength. With proper scaling by a difficulty factor, we can compare across problems of different difficulties. 
+ Calculate strength for metric + str &= \frac{1}{N} \sum_{i=0}^N \overline{R}_i - \overline{R}_{rand} + @param Series:mean_returns A series of mean returns from each checkpoint + @param float:mean_rand_returns The random baseline + @returns float:str, Series:local_strs ''' - return aeb_df['strength_ma'].max() + local_strs = mean_returns - mean_rand_returns + str_ = local_strs.mean() + return str_, local_strs -def calc_speed(aeb_df, std_timestep): - ''' - Find the maximum strength_ma, and the time to first reach it. Then the strength/time divided by the standard std_strength/std_timestep is speed, i.e. - speed = (max_strength_ma / timestep_to_first_reach) / (std_strength / std_timestep) - **Properties:** - - random agent has speed 0, standard agent has speed 1. - - if both agents reach the same max strength_ma, and one reaches it in half the timesteps, it is twice as fast. - - speed is standardized regardless of the scaling of absolute timesteps, or even the max strength attained - This allows an intuitive measurement of learning speed and the standard comparison between agents on the same problem. - ''' - if aeb_df['strength_ma'].count() > 0: - first_max_idx = aeb_df['strength_ma'].idxmax() # this returns the first max - max_row = aeb_df.loc[first_max_idx] - std_strength = 1. - if max_row['total_t'] == 0: # especially for random agent - speed = 0. - else: - speed = (max_row['strength_ma'] / max_row['total_t']) / (std_strength / std_timestep) - else: - speed = 0. - return speed - - -def calc_stability(aeb_df): +def calc_efficiency(local_strs, ts): ''' - Stability = fraction of monotonically increasing elements in the denoised series of strength_ma, or 0 if strength_ma is all <= 0. - **Properties:** - - stable agent has value 1, unstable agent < 1, and non-solution = 0. - - uses strength_ma to be more robust to noise - - sharp gain in strength is considered stable - - monotonically increasing implies strength can keep growing and as long as it does not fall much, it is considered stable + Calculate efficiency for metric + e &= \frac{\sum_{i=0}^N \frac{1}{t_i} str_i}{\sum_{i=0}^N \frac{1}{t_i}} + @param Series:local_strs A series of local strengths + @param Series:ts A series of times units (frame or opt_steps) + @returns float:eff, Series:local_effs ''' - if (aeb_df['strength_ma'].values <= 0.).all(): - stability = 0. - else: - mono_inc_sr = np.diff(aeb_df['strength_ma']) >= 0. - stability = mono_inc_sr.sum() / mono_inc_sr.size - return stability + eff = (local_strs / ts).sum() / local_strs.sum() + local_effs = (local_strs / ts).cumsum() / local_strs.cumsum() + return eff, local_effs -def calc_consistency(aeb_fitness_df): +def calc_stability(local_strs): ''' - Calculate the consistency of trial by the fitness_vectors of its sessions: - consistency = ratio of non-outlier vectors - **Properties:** - - outliers are calculated using MAD modified z-score - - if all the fitness vectors are zero or all strength are zero, consistency = 0 - - works for all sorts of session fitness vectors, with the standard scale - When an agent fails to achieve standard strength, it is meaningless to measure consistency or give false interpolation, so consistency is 0. 
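A toy numeric check of calc_strength and calc_efficiency as defined above (illustrative only; the returns and frame counts are made up):

import pandas as pd
from convlab.experiment.analysis import calc_efficiency, calc_strength

mean_returns = pd.Series([1.0, 2.0, 4.0])   # checkpoint-level average returns
frames = pd.Series([1000, 2000, 3000])      # frames elapsed at each checkpoint

str_, local_strs = calc_strength(mean_returns, mean_rand_returns=1.0)
# local_strs == [0.0, 1.0, 3.0]; str_ == 4/3

sample_eff, local_effs = calc_efficiency(local_strs, frames)
# sample_eff == (0/1000 + 1/2000 + 3/3000) / (0 + 1 + 3) == 0.000375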
+ Calculate stability for metric + sta &= 1 - \left| \frac{\sum_{i=0}^{N-1} \min(str_{i+1} - str_i, 0)}{\sum_{i=0}^{N-1} str_i} \right| + @param Series:local_strs A series of local strengths + @returns float:sta, Series:local_stas ''' - fitness_vecs = aeb_fitness_df.values - if ~np.any(fitness_vecs) or ~np.any(aeb_fitness_df['strength']): - # no consistency if vectors all 0 - consistency = 0. - elif len(fitness_vecs) == 2: - # if only has 2 vectors, check norm_diff - diff_norm = np.linalg.norm(np.diff(fitness_vecs, axis=0), NORM_ORDER) / np.linalg.norm(np.ones(len(fitness_vecs[0])), NORM_ORDER) - consistency = diff_norm <= NOISE_WINDOW - else: - is_outlier_arr = math_util.is_outlier(fitness_vecs) - consistency = (~is_outlier_arr).sum() / len(is_outlier_arr) - return consistency - - -def calc_epi_reward_ma(aeb_df, ckpt=None): - '''Calculates the episode reward moving average with the MA_WINDOW''' - rewards = aeb_df['reward'] - if ckpt == 'eval': - # online eval mode reward is reward_ma from avg - aeb_df['reward_ma'] = rewards - else: - aeb_df['reward_ma'] = rewards.rolling(window=MA_WINDOW, min_periods=0, center=False).mean() - return aeb_df - - -def calc_fitness(fitness_vec): - ''' - Takes a vector of qualifying standardized dimensions of fitness and compute the normalized length as fitness - use L1 norm for simplicity and intuititveness of linearity - ''' - if isinstance(fitness_vec, pd.Series): - fitness_vec = fitness_vec.values - elif isinstance(fitness_vec, pd.DataFrame): - fitness_vec = fitness_vec.iloc[0].values - std_fitness_vector = np.ones(len(fitness_vec)) - fitness = np.linalg.norm(fitness_vec, NORM_ORDER) / np.linalg.norm(std_fitness_vector, NORM_ORDER) - return fitness - - -def calc_aeb_fitness_sr(aeb_df, env_name): - '''Top level method to calculate fitness vector for AEB level data (strength, speed, stability)''' - std = FITNESS_STD.get(env_name) - if std is None: - std = FITNESS_STD.get('template') - logger.warn(f'The fitness standard for env {env_name} is not built yet. Contact author. 
Using a template standard for now.') - - # calculate the strength sr and the moving-average (to denoise) first before calculating fitness - aeb_df['strength'] = calc_strength_sr(aeb_df, std['rand_epi_reward'], std['std_epi_reward']) - aeb_df['strength_ma'] = aeb_df['strength'].rolling(MA_WINDOW, min_periods=0, center=False).mean() - - strength = calc_strength(aeb_df) - speed = calc_speed(aeb_df, std['std_timestep']) - stability = calc_stability(aeb_df) - aeb_fitness_sr = pd.Series({ - 'strength': strength, 'speed': speed, 'stability': stability}) - return aeb_fitness_sr - - -''' -Checkpoint and early termination analysis -''' - - -def get_reward_mas(agent, name='eval_reward_ma'): - '''Return array of the named reward_ma for all of an agent's bodies.''' - bodies = getattr(agent, 'nanflat_body_a', [agent.body]) - return np.array([getattr(body, name) for body in bodies], dtype=np.float16) - - -def get_std_epi_rewards(agent): - '''Return array of std_epi_reward for each of the environments.''' - bodies = getattr(agent, 'nanflat_body_a', [agent.body]) - return np.array([ps.get(FITNESS_STD, f'{body.env.name}.std_epi_reward') for body in bodies], dtype=np.float16) - - -def new_best(agent): - '''Check if algorithm is now the new best result, then update the new best''' - best_reward_mas = get_reward_mas(agent, 'best_reward_ma') - eval_reward_mas = get_reward_mas(agent, 'eval_reward_ma') - best = (eval_reward_mas >= best_reward_mas).all() - if best: - bodies = getattr(agent, 'nanflat_body_a', [agent.body]) - for body in bodies: - body.best_reward_ma = body.eval_reward_ma - return best - - -def all_solved(agent): - '''Check if envs have all been solved using std from convlab/spec/_fitness_std.json''' - eval_reward_mas = get_reward_mas(agent, 'eval_reward_ma') - std_epi_rewards = get_std_epi_rewards(agent) - solved = ( - not np.isnan(std_epi_rewards).any() and - (eval_reward_mas >= std_epi_rewards).all() - ) - return solved - - -def is_unfit(fitness_df, session): - '''Check if a fitness_df is unfit. Used to determine of trial should stop running more sessions''' - if FITNESS_STD.get(session.spec['env'][0]['name']) is None: - return False # fitness not known - mean_fitness_df = calc_mean_fitness(fitness_df) - return mean_fitness_df['strength'].iloc[0] <= NOISE_WINDOW - - -''' -Analysis interface methods -''' - + # shift to keep indices for division + drops = local_strs.diff().shift(-1).iloc[:-1].clip(upper=0.0) + denoms = local_strs.iloc[:-1] + local_stas = 1 - (drops / denoms).abs() + sum_drops = drops.sum() + sum_denom = denoms.sum() + sta = 1 - np.abs(sum_drops / sum_denom) + return sta, local_stas -def save_spec(spec, info_space, unit='experiment'): - '''Save spec to proper path. 
Called at Experiment or Trial init.''' - prepath = util.get_prepath(spec, info_space, unit) - util.write(spec, f'{prepath}_spec.json') - -def calc_mean_fitness(fitness_df): - '''Method to calculated mean over all bodies for a fitness_df''' - return fitness_df.mean(axis=1, level=3) - - -def get_session_data(session, body_df_kind='eval', tmp_space_session_sub=False): - ''' - Gather data from session from all the bodies - Depending on body_df_kind, will use eval_df or train_df - ''' - session_data = {} - for aeb, body in util.ndenumerate_nonan(session.aeb_space.body_space.data): - aeb_df = body.eval_df if body_df_kind == 'eval' else body.train_df - # TODO tmp substitution since SpaceSession does not have run_eval_episode yet - if tmp_space_session_sub: - aeb_df = body.train_df - session_data[aeb] = aeb_df.copy() - return session_data - - -def calc_session_fitness_df(session, session_data): - '''Calculate the session fitness df''' - session_fitness_data = {} - for aeb in session_data: - aeb_df = session_data[aeb] - aeb_df = calc_epi_reward_ma(aeb_df, ps.get(session.info_space, 'ckpt')) - util.downcast_float32(aeb_df) - body = session.aeb_space.body_space.data[aeb] - aeb_fitness_sr = calc_aeb_fitness_sr(aeb_df, body.env.name) - aeb_fitness_df = pd.DataFrame([aeb_fitness_sr], index=[session.index]) - aeb_fitness_df = aeb_fitness_df.reindex(FITNESS_COLS[:3], axis=1) - session_fitness_data[aeb] = aeb_fitness_df - # form multi_index df, then take mean across all bodies - session_fitness_df = pd.concat(session_fitness_data, axis=1) - mean_fitness_df = calc_mean_fitness(session_fitness_df) - session_fitness = calc_fitness(mean_fitness_df) - logger.info(f'Session mean fitness: {session_fitness}\n{mean_fitness_df}') - return session_fitness_df - - -def calc_trial_fitness_df(trial): - ''' - Calculate the trial fitness df by aggregating from the collected session_data_dict (session_fitness_df's). - Adds a consistency dimension to fitness vector. - ''' - trial_fitness_data = {} - try: - all_session_fitness_df = pd.concat(list(trial.session_data_dict.values())) - except ValueError as e: - logger.exception('Sessions failed, no data to analyze. Check stack trace above') - for aeb in util.get_df_aeb_list(all_session_fitness_df): - aeb_fitness_df = all_session_fitness_df.loc[:, aeb] - aeb_fitness_sr = aeb_fitness_df.mean() - consistency = calc_consistency(aeb_fitness_df) - aeb_fitness_sr = aeb_fitness_sr.append(pd.Series({'consistency': consistency})) - aeb_fitness_df = pd.DataFrame([aeb_fitness_sr], index=[trial.index]) - aeb_fitness_df = aeb_fitness_df.reindex(FITNESS_COLS, axis=1) - trial_fitness_data[aeb] = aeb_fitness_df - # form multi_index df, then take mean across all bodies - trial_fitness_df = pd.concat(trial_fitness_data, axis=1) - mean_fitness_df = calc_mean_fitness(trial_fitness_df) - trial_fitness_df = mean_fitness_df - trial_fitness = calc_fitness(mean_fitness_df) - logger.info(f'Trial mean fitness: {trial_fitness}\n{mean_fitness_df}') - return trial_fitness_df - - -def plot_session(session_spec, info_space, session_data): - '''Plot the session graph, 2 panes: reward, loss & explore_var. 
Each aeb_df gets its own color''' - max_tick_unit = ps.get(session_spec, 'meta.max_tick_unit') - aeb_count = len(session_data) - palette = viz.get_palette(aeb_count) - fig = viz.tools.make_subplots(rows=3, cols=1, shared_xaxes=True, print_grid=False) - for idx, (a, e, b) in enumerate(session_data): - aeb_str = f'{a}{e}{b}' - aeb_df = session_data[(a, e, b)] - aeb_df.fillna(0, inplace=True) # for saving plot, cant have nan - fig_1 = viz.plot_line(aeb_df, 'reward_ma', max_tick_unit, legend_name=aeb_str, draw=False, trace_kwargs={'legendgroup': aeb_str, 'line': {'color': palette[idx]}}) - fig.append_trace(fig_1.data[0], 1, 1) - - fig_2 = viz.plot_line(aeb_df, ['loss'], max_tick_unit, y2_col=['explore_var'], trace_kwargs={'legendgroup': aeb_str, 'showlegend': False, 'line': {'color': palette[idx]}}, draw=False) - fig.append_trace(fig_2.data[0], 2, 1) - fig.append_trace(fig_2.data[1], 3, 1) - - fig.layout['xaxis1'].update(title=max_tick_unit, zerolinewidth=1) - fig.layout['yaxis1'].update(fig_1.layout['yaxis']) - fig.layout['yaxis1'].update(domain=[0.55, 1]) - fig.layout['yaxis2'].update(fig_2.layout['yaxis']) - fig.layout['yaxis2'].update(showgrid=False, domain=[0, 0.45]) - fig.layout['yaxis3'].update(fig_2.layout['yaxis2']) - fig.layout['yaxis3'].update(overlaying='y2', anchor='x2') - fig.layout.update(ps.pick(fig_1.layout, ['legend'])) - fig.layout.update(title=f'session graph: {session_spec["name"]} t{info_space.get("trial")} s{info_space.get("session")}', width=500, height=600) - viz.plot(fig) - return fig - - -def gather_aeb_rewards_df(aeb, session_datas, max_tick_unit): - '''Gather rewards from each session for a body into a df''' - aeb_session_rewards = {} - for s, session_data in session_datas.items(): - aeb_df = session_data[aeb] - aeb_reward_sr = aeb_df['reward_ma'] - aeb_reward_sr.index = aeb_df[max_tick_unit] - # guard for duplicate eval result - aeb_reward_sr = aeb_reward_sr[~aeb_reward_sr.index.duplicated()] - if util.in_eval_lab_modes(): - # guard for eval appending possibly not ordered - aeb_reward_sr.sort_index(inplace=True) - aeb_session_rewards[s] = aeb_reward_sr - aeb_rewards_df = pd.DataFrame(aeb_session_rewards) - return aeb_rewards_df - - -def build_aeb_reward_fig(aeb_rewards_df, aeb_str, color, max_tick_unit): - '''Build the aeb_reward envelope figure''' - mean_sr = aeb_rewards_df.mean(axis=1) - std_sr = aeb_rewards_df.std(axis=1).fillna(0) - max_sr = mean_sr + std_sr - min_sr = mean_sr - std_sr - x = aeb_rewards_df.index.tolist() - max_y = max_sr.tolist() - min_y = min_sr.tolist() - - envelope_trace = viz.go.Scatter( - x=x + x[::-1], - y=max_y + min_y[::-1], - fill='tozerox', - fillcolor=viz.lower_opacity(color, 0.2), - line=dict(color='rgba(0, 0, 0, 0)'), - showlegend=False, - legendgroup=aeb_str, - ) - df = pd.DataFrame({max_tick_unit: x, 'mean_reward': mean_sr}) - fig = viz.plot_line( - df, ['mean_reward'], [max_tick_unit], legend_name=aeb_str, draw=False, trace_kwargs={'legendgroup': aeb_str, 'line': {'color': color}} - ) - fig.add_traces([envelope_trace]) - return fig - - -def calc_trial_df(trial_spec, info_space): - '''Calculate trial_df as mean of all session_df''' - from convlab.experiment import retro_analysis - prepath = util.get_prepath(trial_spec, info_space) - predir, _, _, _, _, _ = util.prepath_split(prepath) - session_datas = retro_analysis.session_datas_from_file(predir, trial_spec, info_space.get('trial'), ps.get(info_space, 'ckpt')) - aeb_transpose = {aeb: [] for aeb in session_datas[list(session_datas.keys())[0]]} - max_tick_unit = 
ps.get(trial_spec, 'meta.max_tick_unit') - for s, session_data in session_datas.items(): - for aeb, aeb_df in session_data.items(): - aeb_transpose[aeb].append(aeb_df.sort_values(by=[max_tick_unit]).set_index(max_tick_unit, drop=False)) - - trial_data = {} - for aeb, df_list in aeb_transpose.items(): - trial_data[aeb] = pd.concat(df_list).groupby(level=0).mean().reset_index(drop=True) - - trial_df = pd.concat(trial_data, axis=1) - return trial_df - - -def plot_trial(trial_spec, info_space): - '''Plot the trial graph, 1 pane: mean and error envelope of reward graphs from all sessions. Each aeb_df gets its own color''' - from convlab.experiment import retro_analysis - prepath = util.get_prepath(trial_spec, info_space) - predir, _, _, _, _, _ = util.prepath_split(prepath) - session_datas = retro_analysis.session_datas_from_file(predir, trial_spec, info_space.get('trial'), ps.get(info_space, 'ckpt')) - rand_session_data = session_datas[list(session_datas.keys())[0]] - max_tick_unit = ps.get(trial_spec, 'meta.max_tick_unit') - aeb_count = len(rand_session_data) - palette = viz.get_palette(aeb_count) - fig = None - for idx, (a, e, b) in enumerate(rand_session_data): - aeb = (a, e, b) - aeb_str = f'{a}{e}{b}' - color = palette[idx] - aeb_rewards_df = gather_aeb_rewards_df(aeb, session_datas, max_tick_unit) - aeb_fig = build_aeb_reward_fig(aeb_rewards_df, aeb_str, color, max_tick_unit) - if fig is None: - fig = aeb_fig - else: - fig.add_traces(aeb_fig.data) - fig.layout.update(title=f'trial graph: {trial_spec["name"]} t{info_space.get("trial")}, {len(session_datas)} sessions', width=500, height=600) - viz.plot(fig) - return fig - - -def plot_experiment(experiment_spec, experiment_df): +def calc_consistency(local_strs_list): ''' - Plot the variable specs vs fitness vector of an experiment, where each point is a trial. - ref colors: https://plot.ly/python/heatmaps-contours-and-2dhistograms-tutorial/#plotlys-predefined-color-scales + Calculate consistency for metric + con &= 1 - \frac{\sum_{i=0}^N 2 stdev_j(str_{i,j})}{\sum_{i=0}^N avg_j(str_{i,j})} + @param Series:local_strs_list A list of multiple series of local strengths from different sessions + @returns float:con, Series:local_cons ''' - y_cols = ['fitness'] + FITNESS_COLS - x_cols = ps.difference(experiment_df.columns.tolist(), y_cols) - - fig = viz.tools.make_subplots(rows=len(y_cols), cols=len(x_cols), shared_xaxes=True, shared_yaxes=True, print_grid=False) - fitness_sr = experiment_df['fitness'] - min_fitness = fitness_sr.values.min() - max_fitness = fitness_sr.values.max() - for row_idx, y in enumerate(y_cols): - for col_idx, x in enumerate(x_cols): - x_sr = experiment_df[x] - guard_cat_x = x_sr.astype(str) if x_sr.dtype == 'object' else x_sr - trace = viz.go.Scatter( - y=experiment_df[y], yaxis=f'y{row_idx+1}', - x=guard_cat_x, xaxis=f'x{col_idx+1}', - showlegend=False, mode='markers', - marker={ - 'symbol': 'circle-open-dot', 'color': experiment_df['fitness'], 'opacity': 0.5, - # dump first quarter of colorscale that is too bright - 'cmin': min_fitness - 0.50 * (max_fitness - min_fitness), 'cmax': max_fitness, - 'colorscale': 'YlGnBu', 'reversescale': True - }, - ) - fig.append_trace(trace, row_idx + 1, col_idx + 1) - fig.layout[f'xaxis{col_idx+1}'].update(title='
'.join(ps.chunk(x, 20)), zerolinewidth=1, categoryarray=sorted(guard_cat_x.unique())) - fig.layout[f'yaxis{row_idx+1}'].update(title=y, rangemode='tozero') - fig.layout.update(title=f'experiment graph: {experiment_spec["name"]}', width=max(600, len(x_cols) * 300), height=700) - viz.plot(fig) - return fig - - -def save_session_df(session_data, filepath, info_space): - '''Save session_df, and if is in eval mode, modify it and save with append''' - if util.in_eval_lab_modes(): - ckpt = util.find_ckpt(info_space.eval_model_prepath) - epi = 0 #int(re.search('epi(\d+)', ckpt)[1]) - totalt = 0 #int(re.search('totalt(\d+)', ckpt)[1]) - session_df = pd.concat(session_data, axis=1) - mean_sr = session_df.mean() - mean_sr.name = totalt # set index to prevent all being the same - eval_session_df = pd.DataFrame(data=[mean_sr]) - # set sr name too, to total_t - for aeb in util.get_df_aeb_list(eval_session_df): - eval_session_df.loc[:, aeb + ('epi',)] = epi - eval_session_df.loc[:, aeb + ('total_t',)] = totalt - # if eval, save with append mode - header = not os.path.exists(filepath) - with open(filepath, 'a') as f: - eval_session_df.to_csv(f, header=header) - else: - session_df = pd.concat(session_data, axis=1) - util.write(session_df, filepath) - - -def save_session_data(spec, info_space, session_data, session_fitness_df, session_fig, body_df_kind='eval'): - ''' - Save the session data: session_df, session_fitness_df, session_graph. - session_data is saved as session_df; multi-indexed with (a,e,b), 3 extra levels - to read, use: - session_df = util.read(filepath, header=[0, 1, 2, 3], index_col=0) - session_data = util.session_df_to_data(session_df) - ''' - prepath = util.get_prepath(spec, info_space, unit='session') - logger.info(f'Saving session data to {prepath}') - prefix = 'train' if body_df_kind == 'train' else '' - if 'retro_analyze' not in os.environ['PREPATH']: - save_session_df(session_data, f'{prepath}_{prefix}session_df.csv', info_space) - util.write(session_fitness_df, f'{prepath}_{prefix}session_fitness_df.csv') - viz.save_image(session_fig, f'{prepath}_{prefix}session_graph.png') - - -def save_trial_data(spec, info_space, trial_df, trial_fitness_df, trial_fig, zip=True): - '''Save the trial data: spec, trial_fitness_df.''' - prepath = util.get_prepath(spec, info_space, unit='trial') - logger.info(f'Saving trial data to {prepath}') - util.write(trial_df, f'{prepath}_trial_df.csv') - util.write(trial_fitness_df, f'{prepath}_trial_fitness_df.csv') - viz.save_image(trial_fig, f'{prepath}_trial_graph.png') - if util.get_lab_mode() == 'train' and zip: - predir, _, _, _, _, _ = util.prepath_split(prepath) - shutil.make_archive(predir, 'zip', predir) - logger.info(f'All trial data zipped to {predir}.zip') - - -def save_experiment_data(spec, info_space, experiment_df, experiment_fig): - '''Save the experiment data: best_spec, experiment_df, experiment_graph.''' - prepath = util.get_prepath(spec, info_space, unit='experiment') - logger.info(f'Saving experiment data to {prepath}') - util.write(experiment_df, f'{prepath}_experiment_df.csv') - viz.save_image(experiment_fig, f'{prepath}_experiment_graph.png') - # zip for ease of upload - predir, _, _, _, _, _ = util.prepath_split(prepath) - shutil.make_archive(predir, 'zip', predir) - logger.info(f'All experiment data zipped to {predir}.zip') - + mean_local_strs, std_local_strs = util.calc_srs_mean_std(local_strs_list) + local_cons = 1 - 2 * std_local_strs / mean_local_strs + con = 1 - 2 * std_local_strs.sum() / mean_local_strs.sum() + return con, 
local_cons -def _analyze_session(session, session_data, body_df_kind='eval'): - '''Helper method for analyze_session to run using eval_df and train_df''' - session_fitness_df = calc_session_fitness_df(session, session_data) - session_fig = plot_session(session.spec, session.info_space, session_data) - save_session_data(session.spec, session.info_space, session_data, session_fitness_df, session_fig, body_df_kind) - return session_fitness_df - -def analyze_session(session, eager_analyze_trial=False, tmp_space_session_sub=False): - ''' - Gather session data, plot, and return fitness df for high level agg. - @returns {DataFrame} session_fitness_df Single-row df of session fitness vector (avg over aeb), indexed with session index. +def calc_session_metrics(session_df, env_name, info_prepath=None, df_mode=None): ''' - logger.info('Analyzing session') - session_data = get_session_data(session, body_df_kind='train') - session_fitness_df = _analyze_session(session, session_data, body_df_kind='train') - session_data = get_session_data(session, body_df_kind='eval', tmp_space_session_sub=tmp_space_session_sub) - session_fitness_df = _analyze_session(session, session_data, body_df_kind='eval') - if eager_analyze_trial: - # for live trial graph, analyze trial after analyzing session, this only takes a second - from convlab.experiment import retro_analysis - prepath = util.get_prepath(session.spec, session.info_space, unit='session') - # use new ones to prevent side effects - spec, info_space = util.prepath_to_spec_info_space(prepath) - predir, _, _, _, _, _ = util.prepath_split(prepath) - retro_analysis.analyze_eval_trial(spec, info_space, predir) - return session_fitness_df - - -def analyze_trial(trial, zip=True): + Calculate the session metrics: strength, efficiency, stability + @param DataFrame:session_df Dataframe containing reward, frame, opt_step + @param str:env_name Name of the environment to get its random baseline + @param str:info_prepath Optional info_prepath to auto-save the output to + @param str:df_mode Optional df_mode to save with info_prepath + @returns dict:metrics Consists of scalar metrics and series local metrics ''' - Gather trial data, plot, and return trial df for high level agg. - @returns {DataFrame} trial_fitness_df Single-row df of trial fitness vector (avg over aeb, sessions), indexed with trial index. 
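Continuing the toy check for calc_stability and calc_consistency above (illustrative numbers only):

import pandas as pd
from convlab.experiment.analysis import calc_consistency, calc_stability

local_strs = pd.Series([1.0, 3.0, 2.0, 4.0])
sta, local_stas = calc_stability(local_strs)
# a single drop of 1.0 over a denominator of 1 + 3 + 2 = 6, so sta == 1 - 1/6

# consistency takes one local-strength series per session
other_session = pd.Series([1.0, 4.0, 2.0, 4.0])
con, local_cons = calc_consistency([local_strs, other_session])
# con == 1.0 only when every session traces an identical strength curve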
- ''' - logger.info('Analyzing trial') - trial_df = calc_trial_df(trial.spec, trial.info_space) - trial_fitness_df = calc_trial_fitness_df(trial) - trial_fig = plot_trial(trial.spec, trial.info_space) - save_trial_data(trial.spec, trial.info_space, trial_df, trial_fitness_df, trial_fig, zip) - return trial_fitness_df - - -def analyze_experiment(experiment): + # rand_bl = random_baseline.get_random_baseline(env_name) + # mean_rand_returns = rand_bl['mean'] + mean_returns = session_df['total_reward'] + frames = session_df['frame'] + opt_steps = session_df['opt_step'] + + # str_, local_strs = calc_strength(mean_returns, mean_rand_returns) + # max_str, final_str = local_strs.max(), local_strs.iloc[-1] + # sample_eff, local_sample_effs = calc_efficiency(local_strs, frames) + # train_eff, local_train_effs = calc_efficiency(local_strs, opt_steps) + # sta, local_stas = calc_stability(local_strs) + + # all the scalar session metrics + # scalar = { + # 'strength': str_, + # 'max_strength': max_str, + # 'final_strength': final_str, + # 'sample_efficiency': sample_eff, + # 'training_efficiency': train_eff, + # 'stability': sta, + # } + # all the session local metrics + local = { + # 'strengths': local_strs, + # 'sample_efficiencies': local_sample_effs, + # 'training_efficiencies': local_train_effs, + # 'stabilities': local_stas, + 'mean_returns': mean_returns, + 'frames': frames, + 'opt_steps': opt_steps, + } + metrics = { + # 'scalar': scalar, + 'local': local, + } + if info_prepath is not None: # auto-save if info_prepath is given + util.write(metrics, f'{info_prepath}_session_metrics_{df_mode}.pkl') + # util.write(scalar, f'{info_prepath}_session_metrics_scalar_{df_mode}.json') + # save important metrics in info_prepath directly + # util.write(scalar, f'{info_prepath.replace("info/", "")}_session_metrics_scalar_{df_mode}.json') + return metrics + + +def calc_trial_metrics(session_metrics_list, info_prepath=None): ''' - Gather experiment trial_data_dict as experiment_df, plot. - Search module must return best_spec and experiment_data with format {trial_index: exp_trial_data}, - where trial_data = {**var_spec, **fitness_vec, fitness}. - This is then made into experiment_df. - @returns {DataFrame} experiment_df Of var_specs, fitness_vec, fitness for all trials. 
+ Calculate the trial metrics: mean(strength), mean(efficiency), mean(stability), consistency + @param list:session_metrics_list The metrics collected from each session; format: {session_index: {'scalar': {...}, 'local': {...}}} + @param str:info_prepath Optional info_prepath to auto-save the output to + @returns dict:metrics Consists of scalar metrics and series local metrics ''' - logger.info('Analyzing experiment') - experiment_df = pd.DataFrame(experiment.trial_data_dict).transpose() - cols = FITNESS_COLS + ['fitness'] + # calculate mean of session metrics + # scalar_list = [sm['scalar'] for sm in session_metrics_list] + # mean_scalar = pd.DataFrame(scalar_list).mean().to_dict() + + # local_strs_list = [sm['local']['strengths'] for sm in session_metrics_list] + # local_se_list = [sm['local']['sample_efficiencies'] for sm in session_metrics_list] + # local_te_list = [sm['local']['training_efficiencies'] for sm in session_metrics_list] + # local_sta_list = [sm['local']['stabilities'] for sm in session_metrics_list] + mean_returns_list = [sm['local']['mean_returns'] for sm in session_metrics_list] + frames = session_metrics_list[0]['local']['frames'] + opt_steps = session_metrics_list[0]['local']['opt_steps'] + # calculate consistency + # con, local_cons = calc_consistency(local_strs_list) + + # all the scalar trial metrics + # scalar = { + # 'strength': mean_scalar['strength'], + # 'max_strength': mean_scalar['max_strength'], + # 'final_strength': mean_scalar['final_strength'], + # 'sample_efficiency': mean_scalar['sample_efficiency'], + # 'training_efficiency': mean_scalar['training_efficiency'], + # 'stability': mean_scalar['stability'], + # 'consistency': con, + # } + # assert set(scalar.keys()) == set(METRICS_COLS) + # for plotting: gather all local series of sessions + local = { + # 'strengths': local_strs_list, + # 'sample_efficiencies': local_se_list, + # 'training_efficiencies': local_te_list, + # 'stabilities': local_sta_list, + # 'consistencies': local_cons, # this is a list + 'mean_returns': mean_returns_list, + 'frames': frames, + 'opt_steps': opt_steps, + } + metrics = { + # 'scalar': scalar, + 'local': local, + } + if info_prepath is not None: # auto-save if info_prepath is given + util.write(metrics, f'{info_prepath}_trial_metrics.pkl') + # util.write(scalar, f'{info_prepath}_trial_metrics_scalar.json') + # save important metrics in info_prepath directly + # util.write(scalar, f'{info_prepath.replace("info/", "")}_trial_metrics_scalar.json') + return metrics + + +def calc_experiment_df(trial_data_dict, info_prepath=None): + '''Collect all trial data (metrics and config) from trials into a dataframe''' + experiment_df = pd.DataFrame(trial_data_dict).transpose() + cols = METRICS_COLS config_cols = sorted(ps.difference(experiment_df.columns.tolist(), cols)) sorted_cols = config_cols + cols experiment_df = experiment_df.reindex(sorted_cols, axis=1) - experiment_df.sort_values(by=['fitness'], ascending=False, inplace=True) - logger.info(f'Experiment data:\n{experiment_df}') - experiment_fig = plot_experiment(experiment.spec, experiment_df) - save_experiment_data(experiment.spec, experiment.info_space, experiment_df, experiment_fig) + experiment_df.sort_values(by=['strength'], ascending=False, inplace=True) + if info_prepath is not None: + util.write(experiment_df, f'{info_prepath}_experiment_df.csv') + # save important metrics in info_prepath directly + util.write(experiment_df, f'{info_prepath.replace("info/", "")}_experiment_df.csv') + return experiment_df + + +# interface 
analyze methods + +def analyze_session(session_spec, session_df, df_mode): + '''Analyze session and save data, then return metrics. Note there are 2 types of session_df: body.eval_df and body.train_df''' + info_prepath = session_spec['meta']['info_prepath'] + session_df = session_df.copy() + assert len(session_df) > 1, f'Need more than 1 datapoint to calculate metrics' + util.write(session_df, f'{info_prepath}_session_df_{df_mode}.csv') + # calculate metrics + session_metrics = calc_session_metrics(session_df, ps.get(session_spec, 'env.0.name'), info_prepath, df_mode) + # plot graph + viz.plot_session(session_spec, session_metrics, session_df, df_mode) + return session_metrics + + +def analyze_trial(trial_spec, session_metrics_list): + '''Analyze trial and save data, then return metrics''' + info_prepath = trial_spec['meta']['info_prepath'] + # calculate metrics + trial_metrics = calc_trial_metrics(session_metrics_list, info_prepath) + # plot graphs + viz.plot_trial(trial_spec, trial_metrics) + # zip files + if util.get_lab_mode() == 'train': + predir, _, _, _, _, _ = util.prepath_split(info_prepath) + shutil.make_archive(predir, 'zip', predir) + logger.info(f'All trial data zipped to {predir}.zip') + return trial_metrics + + +def analyze_experiment(spec, trial_data_dict): + '''Analyze experiment and save data''' + info_prepath = spec['meta']['info_prepath'] + util.write(trial_data_dict, f'{info_prepath}_trial_data_dict.json') + # calculate experiment df + experiment_df = calc_experiment_df(trial_data_dict, info_prepath) + # plot graph + viz.plot_experiment(spec, experiment_df, METRICS_COLS) + # zip files + predir, _, _, _, _, _ = util.prepath_split(info_prepath) + shutil.make_archive(predir, 'zip', predir) + logger.info(f'All experiment data zipped to {predir}.zip') return experiment_df diff --git a/convlab/experiment/control.py b/convlab/experiment/control.py index 10b2493..dff4579 100644 --- a/convlab/experiment/control.py +++ b/convlab/experiment/control.py @@ -1,331 +1,231 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. 
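Before the control-module changes below, an illustrative sketch of how the analysis interface above chains together (hypothetical glue code; the dataframes are assumed to be body.eval_df-style frames with total_reward, frame and opt_step columns and more than one row):

from convlab.experiment import analysis

def summarize_trial(session_spec, trial_spec, session_dfs):
    '''Sketch: one eval_df per session -> session metrics -> trial metrics.'''
    session_metrics_list = [
        analysis.analyze_session(session_spec, df, df_mode='eval') for df in session_dfs
    ]
    # in this patch most scalar metrics are commented out, so the returned dicts
    # mainly carry the 'local' mean_returns / frames / opt_steps series
    return analysis.analyze_trial(trial_spec, session_metrics_list)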
- -''' -The control module -Creates and controls the units of SLM lab: Experiment, Trial, Session -''' +# The control module +# Creates and runs control loops at levels: Experiment, Trial, Session from copy import deepcopy -from importlib import reload -from convlab import agent -from convlab.agent import AgentSpace, Agent -from convlab.env import EnvSpace, make_env -from convlab.experiment import analysis, retro_analysis, search -from convlab.experiment.monitor import AEBSpace, Body, enable_aeb_space -from convlab.lib import logger, util -from convlab.spec import spec_util -import os import pydash as ps import torch.multiprocessing as mp +from convlab import agent as agent_module +from convlab.agent import Agent, Body +from convlab.agent.net import net_util +from convlab.env import make_env +from convlab.experiment import analysis, search +from convlab.lib import logger, util +from convlab.spec import spec_util + + +def make_agent_env(spec, global_nets=None): + '''Helper to create agent and env given spec''' + env = make_env(spec) + body = Body(env, spec['agent']) + # agent = Agent(spec, body=body, global_nets=global_nets) + AgentClass = getattr(agent_module, ps.get(spec['agent'][0], 'name')) + agent = AgentClass(spec, body=body, global_nets=global_nets) + return agent, env + + +def mp_run_session(spec, global_nets, mp_dict): + '''Wrap for multiprocessing with shared variable''' + session = Session(spec, global_nets) + metrics = session.run() + mp_dict[session.index] = metrics + class Session: ''' - The base unit of instantiated RL system. - Given a spec, - session creates agent(s) and environment(s), - run the RL system and collect data, e.g. fitness metrics, till it ends, - then return the session data. + The base lab unit to run a RL session for a spec. + Given a spec, it creates the agent and env, runs the RL loop, + then gather data and analyze it to produce session data. 
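An illustrative smoke test for make_agent_env above (spec is assumed to be a fully resolved spec dict, e.g. one loaded from convlab/spec/demo.json, with the usual meta/agent/env sections):

def smoke_test(spec):
    '''Sketch: build the agent/env pair and take a single step with the
    (state, reward, done, info) contract used throughout this patch.'''
    agent, env = make_agent_env(spec)
    obs = env.reset()
    agent.reset(obs)
    action = agent.act(obs)
    next_obs, reward, done, info = env.step(action)
    agent.update(obs, action, reward, next_obs, done)
    agent.close()
    env.close()
    return reward, done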
''' - def __init__(self, spec, info_space, global_nets=None): + def __init__(self, spec, global_nets=None): self.spec = spec - self.info_space = info_space - self.index = self.info_space.get('session') - util.set_logger(self.spec, self.info_space, logger, 'session') - self.data = None + self.index = self.spec['meta']['session'] + util.set_random_seed(self.spec) + util.set_cuda_id(self.spec) + util.set_logger(self.spec, logger, 'session') + spec_util.save(spec, unit='session') - # init singleton agent and env - self.env = make_env(self.spec) - util.set_rand_seed(self.info_space.get_random_seed(), self.env) + self.agent, self.env = make_agent_env(self.spec, global_nets) with util.ctx_lab_mode('eval'): # env for eval self.eval_env = make_env(self.spec) - util.set_rand_seed(self.info_space.get_random_seed(), self.eval_env) - util.try_set_cuda_id(self.spec, self.info_space) - body = Body(self.env, self.spec['agent']) - - AgentClass = getattr(agent, ps.get(self.spec['agent'][0], 'name')) - self.agent = AgentClass(self.spec, self.info_space, body=body, global_nets=global_nets) - - enable_aeb_space(self) # to use lab's data analysis framework logger.info(util.self_desc(self)) - logger.info(f'Initialized session {self.index}') - def try_ckpt(self, agent, env): - '''Try to checkpoint agent at the start, save_freq, and the end''' - tick = env.clock.get(env.max_tick_unit) - to_ckpt = False - if not util.in_eval_lab_modes() and tick <= env.max_tick: - to_ckpt = (tick % env.eval_frequency == 0) or tick == env.max_tick - if env.max_tick_unit == 'epi': # extra condition for epi - to_ckpt = to_ckpt and env.done + def to_ckpt(self, env, mode='eval'): + '''Check with clock whether to run log/eval ckpt: at the start, save_freq, and the end''' + if mode == 'eval' and util.in_eval_lab_modes(): # avoid double-eval: eval-ckpt in eval mode + return False + clock = env.clock + frame = clock.get() + frequency = env.eval_frequency if mode == 'eval' else env.log_frequency + if frame == 0 or clock.get('opt_step') == 0: # avoid ckpt at init + to_ckpt = False + elif frequency is None: # default episodic + to_ckpt = env.done + else: # normal ckpt condition by mod remainder (general for venv) + to_ckpt = util.frame_mod(frame, frequency, env.num_envs) or frame == clock.max_frame + return to_ckpt - if to_ckpt: - if self.spec['meta'].get('parallel_eval'): - retro_analysis.run_parallel_eval(self, agent, env) - else: - self.run_eval_episode() - if analysis.new_best(agent): + def try_ckpt(self, agent, env): + '''Check then run checkpoint log/eval''' + body = agent.body + if self.to_ckpt(env, 'log'): + body.train_ckpt() + body.log_summary('train') + + if self.to_ckpt(env, 'eval'): + avg_return = analysis.gen_avg_return(agent, self.eval_env) + body.eval_ckpt(self.eval_env, avg_return) + body.log_summary('eval') + if body.eval_reward_ma >= body.best_reward_ma: + body.best_reward_ma = body.eval_reward_ma agent.save(ckpt='best') - if tick > 0: # nothing to analyze at start - analysis.analyze_session(self, eager_analyze_trial=True) - - def run_eval_episode(self): - with util.ctx_lab_mode('eval'): # enter eval context - self.agent.algorithm.update() # set explore_var etc. 
to end_val under ctx - self.eval_env.clock.tick('epi') - logger.info(f'Running eval episode for trial {self.info_space.get("trial")} session {self.index}') - total_reward = 0 - reward, observation, done = self.eval_env.reset() - while not done: - self.eval_env.clock.tick('t') - action = self.agent.act(observation) - reward, observation, done = self.eval_env.step(action) - total_reward += reward - # exit eval context, restore variables simply by updating - self.agent.algorithm.update() - # update body.eval_df - self.agent.body.eval_update(self.eval_env, total_reward) - self.agent.body.log_summary(body_df_kind='eval') - - def run_episode(self): - self.env.clock.tick('epi') - logger.info(f'Running trial {self.info_space.get("trial")} session {self.index} episode {self.env.clock.epi}') - reward, observation, done = self.env.reset() - self.agent.reset(observation) - while not done: + if len(body.train_df) > 1: # need > 1 row to calculate stability + metrics = analysis.analyze_session(self.spec, body.train_df, 'train') + # body.log_metrics(metrics['scalar'], 'train') + if len(body.eval_df) > 1: # need > 1 row to calculate stability + metrics = analysis.analyze_session(self.spec, body.eval_df, 'eval') + # body.log_metrics(metrics['scalar'], 'eval') + + def run_eval(self): + returns = [] + success = fail = 0 + num_eval = self.agent.spec['meta']['num_eval'] + for _ in range(num_eval): + _return, task_success = analysis.gen_result(self.agent, self.eval_env) + returns.append(_return) + if hasattr(self.eval_env, 'get_task_success'): + if task_success: + success += 1 + else: + fail += 1 + if hasattr(self.eval_env, 'get_task_success'): + logger.info('{} episodes, {} average return, {}% success rate'.format(num_eval, sum(returns)/num_eval, success/(success+fail)*100)) + else: + logger.info('{} episodes, {} average return'.format(num_eval, sum(returns)/num_eval)) + + def run_rl(self): + '''Run the main RL loop until clock.max_frame''' + logger.info(f'Running RL loop for trial {self.spec["meta"]["trial"]} session {self.index}') + clock = self.env.clock + obs = self.env.reset() + self.agent.reset(obs) + done = False + while True: + if util.epi_done(done): # before starting another episode + # logger.info(f'A dialog session is done') + logger.nl(f'A dialog session is done') + self.try_ckpt(self.agent, self.env) + t = clock.get() + if clock.get() < clock.max_frame: # reset and continue + clock.tick('epi') + obs = self.env.reset() + self.agent.reset(obs) + done = False self.try_ckpt(self.agent, self.env) - self.env.clock.tick('t') - action = self.agent.act(observation) - reward, observation, done = self.env.step(action) - self.agent.update(action, reward, observation, done) - self.try_ckpt(self.agent, self.env) # final timestep ckpt - self.agent.body.log_summary(body_df_kind='train') + if clock.get() >= clock.max_frame: # finish + break + clock.tick('t') + action = self.agent.act(obs) + next_obs, reward, done, info = self.env.step(action) + self.agent.update(obs, action, reward, next_obs, done) + obs = next_obs def close(self): - ''' - Close session and clean up. - Save agent, close env. - ''' + '''Close session and clean up. 
Save agent, close env.''' self.agent.close() self.env.close() self.eval_env.close() - logger.info('Session done and closed.') - - def run(self): - while self.env.clock.get(self.env.max_tick_unit) < self.env.max_tick: - self.run_episode() - retro_analysis.try_wait_parallel_eval(self) - self.data = analysis.analyze_session(self) # session fitness - self.close() - return self.data - - -class SpaceSession(Session): - '''Session for multi-agent/env setting''' - - def __init__(self, spec, info_space, global_nets=None): - self.spec = spec - self.info_space = info_space - self.index = self.info_space.get('session') - util.set_logger(self.spec, self.info_space, logger, 'session') - self.data = None - - self.aeb_space = AEBSpace(self.spec, self.info_space) - self.env_space = EnvSpace(self.spec, self.aeb_space) - self.aeb_space.init_body_space() - util.set_rand_seed(self.info_space.get_random_seed(), self.env_space) - util.try_set_cuda_id(self.spec, self.info_space) - self.agent_space = AgentSpace(self.spec, self.aeb_space, global_nets) - - logger.info(util.self_desc(self)) - logger.info(f'Initialized session {self.index}') - - def try_ckpt(self, agent_space, env_space): - '''Try to checkpoint agent at the start, save_freq, and the end''' - # TODO ckpt and eval not implemented for SpaceSession - pass - # for agent in agent_space.agents: - # for body in agent.nanflat_body_a: - # env = body.env - # super(SpaceSession, self).try_ckpt(agent, env) - - def run_all_episodes(self): - ''' - Continually run all episodes, where each env can step and reset at its own clock_speed and timeline. - Will terminate when all envs done are done. - ''' - all_done = self.aeb_space.tick('epi') - reward_space, state_space, done_space = self.env_space.reset() - self.agent_space.reset(state_space) - while not all_done: - self.try_ckpt(self.agent_space, self.env_space) - all_done = self.aeb_space.tick() - action_space = self.agent_space.act(state_space) - reward_space, state_space, done_space = self.env_space.step(action_space) - self.agent_space.update(action_space, reward_space, state_space, done_space) - self.try_ckpt(self.agent_space, self.env_space) - retro_analysis.try_wait_parallel_eval(self) - - def close(self): - ''' - Close session and clean up. - Save agent, close env. - ''' - self.agent_space.close() - self.env_space.close() - logger.info('Session done and closed.') + logger.info(f'Session {self.index} done') def run(self): - self.run_all_episodes() - self.data = analysis.analyze_session(self, tmp_space_session_sub=True) # session fitness + if util.in_eval_lab_modes(): + self.run_eval() + metrics = None + else: + self.run_rl() + metrics = analysis.analyze_session(self.spec, self.agent.body.eval_df, 'eval') + # self.agent.body.log_metrics(metrics['scalar'], 'eval') self.close() - return self.data - - -def init_run_session(*args): - '''Runner for multiprocessing''' - session = Session(*args) - return session.run() - - -def init_run_space_session(*args): - '''Runner for multiprocessing''' - session = SpaceSession(*args) - return session.run() + return metrics class Trial: ''' - The base unit of an experiment. - Given a spec and number s, - trial creates and runs s sessions, - gather and aggregate data from sessions as trial data, - then return the trial data. + The lab unit which runs repeated sessions for a same spec, i.e. a trial + Given a spec and number s, trial creates and runs s sessions, + then gathers session data and analyze it to produce trial data. 
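A hypothetical driver for the two units above, assuming spec['meta'] already carries trial/session indices (e.g. set via spec_util.tick) and an info_prepath for saving:

def run_single(spec):
    '''Sketch: eval lab modes run a lone Session; otherwise run a full Trial.'''
    if util.in_eval_lab_modes():
        return Session(spec).run()   # run_eval path; returns None
    return Trial(spec).run()         # repeated sessions, returns trial metrics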
''' - def __init__(self, spec, info_space): + def __init__(self, spec): self.spec = spec - self.info_space = info_space - self.index = self.info_space.get('trial') - info_space.set('session', None) # Session starts anew for new trial - util.set_logger(self.spec, self.info_space, logger, 'trial') - self.session_data_dict = {} - self.data = None - - analysis.save_spec(spec, info_space, unit='trial') - self.is_singleton = spec_util.is_singleton(spec) # singleton mode as opposed to multi-agent-env space - self.SessionClass = Session if self.is_singleton else SpaceSession - self.mp_runner = init_run_session if self.is_singleton else init_run_space_session - logger.info(f'Initialized trial {self.index}') + self.index = self.spec['meta']['trial'] + util.set_logger(self.spec, logger, 'trial') + spec_util.save(spec, unit='trial') def parallelize_sessions(self, global_nets=None): - workers = [] - for _s in range(self.spec['meta']['max_session']): - self.info_space.tick('session') - w = mp.Process(target=self.mp_runner, args=(deepcopy(self.spec), deepcopy(self.info_space), global_nets)) - w.start() - workers.append(w) - for w in workers: - w.join() - session_datas = retro_analysis.session_data_dict_for_dist(self.spec, self.info_space) - return session_datas + mp_dict = mp.Manager().dict() + spec_util.tick(self.spec, 'session') + mp_run_session(deepcopy(self.spec), global_nets, mp_dict) + # workers = [] + # for _s in range(self.spec['meta']['max_session']): + # spec_util.tick(self.spec, 'session') + # w = mp.Process(target=mp_run_session, args=(deepcopy(self.spec), global_nets, mp_dict)) + # w.start() + # workers.append(w) + # for w in workers: + # w.join() + session_metrics_list = [mp_dict[idx] for idx in sorted(mp_dict.keys())] + return session_metrics_list def run_sessions(self): logger.info('Running sessions') - if util.get_lab_mode() in ('train', 'eval') and self.spec['meta']['max_session'] > 1: - # when training a single spec over multiple sessions - session_datas = self.parallelize_sessions() - else: - session_datas = [] - for _s in range(self.spec['meta']['max_session']): - self.info_space.tick('session') - session = self.SessionClass(deepcopy(self.spec), deepcopy(self.info_space)) - session_data = session.run() - session_datas.append(session_data) - if analysis.is_unfit(session_data, session): - break - return session_datas - - def make_global_nets(self, agent): - global_nets = {} - for net_name in agent.algorithm.net_names: - g_net = getattr(agent.algorithm, net_name) - g_net.share_memory() # make net global - # TODO also create shared optimizer here - global_nets[net_name] = g_net - return global_nets + session_metrics_list = self.parallelize_sessions() + return session_metrics_list def init_global_nets(self): - session = self.SessionClass(deepcopy(self.spec), deepcopy(self.info_space)) - if self.is_singleton: - session.env.close() # safety - global_nets = self.make_global_nets(session.agent) - else: - session.env_space.close() # safety - global_nets = [self.make_global_nets(agent) for agent in session.agent_space.agents] + session = Session(deepcopy(self.spec)) + session.env.close() # safety + global_nets = net_util.init_global_nets(session.agent.algorithm) return global_nets def run_distributed_sessions(self): logger.info('Running distributed sessions') global_nets = self.init_global_nets() - session_datas = self.parallelize_sessions(global_nets) - return session_datas + session_metrics_list = self.parallelize_sessions(global_nets) + return session_metrics_list def close(self): - 
logger.info('Trial done and closed.') + logger.info(f'Trial {self.index} done') def run(self): - if self.spec['meta'].get('distributed'): - session_datas = self.run_distributed_sessions() + if self.spec['meta'].get('distributed') == False: + session_metrics_list = self.run_sessions() else: - session_datas = self.run_sessions() - self.session_data_dict = {data.index[0]: data for data in session_datas} - self.data = analysis.analyze_trial(self) + session_metrics_list = self.run_distributed_sessions() + metrics = analysis.analyze_trial(self.spec, session_metrics_list) self.close() - return self.data + # return metrics['scalar'] + return metrics class Experiment: ''' - The core high level unit of Lab. - Given a spec-space/generator of cardinality t, - a number s, - a hyper-optimization algorithm hopt(spec, fitness-metric) -> spec_next/null - experiment creates and runs up to t trials of s sessions each to optimize (maximize) the fitness metric, - gather the trial data, - then return the experiment data for analysis and use in evolution graph. - Experiment data will include the trial data, notes on design, hypothesis, conclusion, analysis data, e.g. fitness metric, evolution link of ancestors to potential descendants. - An experiment then forms a node containing its data in the evolution graph with the evolution link and suggestion at the adjacent possible new experiments - On the evolution graph level, an experiment and its neighbors could be seen as test/development of traits. + The lab unit to run experiments. + It generates a list of specs to search over, then run each as a trial with s repeated session, + then gathers trial data and analyze it to produce experiment data. ''' - def __init__(self, spec, info_space): + def __init__(self, spec): self.spec = spec - self.info_space = info_space - self.index = self.info_space.get('experiment') - util.set_logger(self.spec, self.info_space, logger, 'trial') - self.trial_data_dict = {} - self.data = None - analysis.save_spec(spec, info_space, unit='experiment') - SearchClass = getattr(search, spec['meta'].get('search')) - self.search = SearchClass(self) - logger.info(f'Initialized experiment {self.index}') - - def init_trial_and_run(self, spec, info_space): - ''' - Method to run trial with the properly updated info_space (trial_index) from experiment.search.lab_trial. - ''' - trial = Trial(spec, info_space) - trial_data = trial.run() - return trial_data + self.index = self.spec['meta']['experiment'] + util.set_logger(self.spec, logger, 'trial') + spec_util.save(spec, unit='experiment') def close(self): - reload(search) # fixes ray consecutive run crashing due to bad cleanup - logger.info('Experiment done and closed.') + logger.info('Experiment done') def run(self): - self.trial_data_dict = self.search.run() - self.data = analysis.analyze_experiment(self) + trial_data_dict = search.run_ray_search(self.spec) + experiment_df = analysis.analyze_experiment(self.spec, trial_data_dict) self.close() - return self.data + return experiment_df diff --git a/convlab/experiment/monitor.py b/convlab/experiment/monitor.py deleted file mode 100644 index 846a2ac..0000000 --- a/convlab/experiment/monitor.py +++ /dev/null @@ -1,494 +0,0 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - -''' -The monitor module with data_space -Monitors agents, environments, sessions, trials, experiments, evolutions, and handles all the data produced by the Lab components. 
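An illustrative top-level call for the Experiment unit above, assuming spec['meta'] already carries an experiment index and a search definition consumed by search.run_ray_search:

def run_experiment(spec):
    '''Sketch: Ray search spawns the trials, then the results are aggregated.'''
    experiment_df = Experiment(spec).run()
    # calc_experiment_df sorts the rows by 'strength' in descending order
    return experiment_df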
-InfoSpace handles the unified hyperdimensional data for SLM Lab, used for analysis and experiment planning. Sources data from monitor. -Each dataframe resolves from the coarsest dimension to the finest, with data coordinates coor in the form: (evolution,experiment,trial,session,agent,env,body) -The resolution after session is the AEB space, hence it is a subspace. -AEB space is not necessarily tabular, and hence the data is NoSQL. - -The data_space is congruent to the coor, with proper resolution. -E.g. (evolution,experiment,trial,session) specifies the session_data of a session, ran over multiple episodes on the AEB space. - -Space ordering: -InfoSpace: the general space for complete information -AEBSpace: subspace of InfoSpace for a specific session -AgentSpace: space agent instances, subspace of AEBSpace -EnvSpace: space of env instances, subspace of AEBSpace -DataSpace: a data space storing an AEB data projected to a-axis, and its dual projected to e-axis. This is so that a-proj data like action_space from agent_space can be used by env_space, which requires e-proj data, and vice versa. - -Object reference (for agent to access env properties, vice versa): -Agents - AgentSpace - AEBSpace - EnvSpace - Envs -''' -from gym import spaces -from convlab.agent import AGENT_DATA_NAMES -from convlab.agent.algorithm import policy_util -from convlab.agent.net import net_util -from convlab.env import ENV_DATA_NAMES -from convlab.experiment import analysis -from convlab.lib import logger, util -from convlab.spec import spec_util -import numpy as np -import pandas as pd -import pydash as ps -import time -import torch - -# These correspond to the control unit classes, lower cased -COOR_AXES = [ - 'evolution', - 'experiment', - 'trial', - 'session', -] -COOR_AXES_ORDER = { - axis: idx for idx, axis in enumerate(COOR_AXES) -} -COOR_DIM = len(COOR_AXES) -logger = logger.get_logger(__name__) - - -def enable_aeb_space(session): - '''Enable aeb_space to session use Lab's data-monitor and analysis modules''' - session.aeb_space = AEBSpace(session.spec, session.info_space) - # make compatible with the generic multiagent setup - session.aeb_space.body_space = DataSpace('body', session.aeb_space) - body_v = np.full(session.aeb_space.aeb_shape, np.nan, dtype=object) - body_v[0, 0, 0] = session.agent.body - session.aeb_space.body_space.add(body_v) - session.agent.aeb_space = session.aeb_space - session.env.aeb_space = session.aeb_space - - -def get_action_type(action_space): - '''Method to get the action type to choose prob. dist. to sample actions from NN logits output''' - if isinstance(action_space, spaces.Box): - shape = action_space.shape - assert len(shape) == 1 - if shape[0] == 1: - return 'continuous' - else: - return 'multi_continuous' - elif isinstance(action_space, spaces.Discrete): - return 'discrete' - elif isinstance(action_space, spaces.MultiDiscrete): - return 'multi_discrete' - elif isinstance(action_space, spaces.MultiBinary): - return 'multi_binary' - else: - raise NotImplementedError - - -class Body: - ''' - Body of an agent inside an environment. This acts as the main variable storage and bridge between agent and environment to pair them up properly in the generalized multi-agent-env setting. 
- ''' - - def __init__(self, env, agent_spec, aeb=(0, 0, 0), aeb_space=None): - # essential reference variables - self.agent = None # set later - self.env = env - self.aeb = aeb - self.a, self.e, self.b = aeb - self.nanflat_a_idx, self.nanflat_e_idx = self.a, self.e - - # for action policy exploration, so be set in algo during init_algorithm_params() - self.explore_var = np.nan - - # body stats variables - self.loss = np.nan # training losses - # diagnostics variables/stats from action_policy prob. dist. - self.action_tensor = None - self.action_pd = None # for the latest action, to compute entropy and log prob - self.entropies = [] # action entropies for exploration - self.log_probs = [] # action log probs - # mean values for debugging - self.mean_entropy = np.nan - self.mean_log_prob = np.nan - self.mean_grad_norm = np.nan - - # stores running mean and std dev of states - self.state_mean = np.nan - self.state_std_dev_int = np.nan - self.state_std_dev = np.nan - self.state_n = 0 - - # store current and best reward_ma for model checkpointing and early termination if all the environments are solved - self.best_reward_ma = -np.inf - self.eval_reward_ma = np.nan - - # dataframes to track data for analysis.analyze_session - # track training data within run_episode - self.train_df = pd.DataFrame(columns=[ - 'epi', 'total_t', 't', 'wall_t', 'reward', 'loss', 'lr', - 'explore_var', 'entropy_coef', 'entropy', 'log_prob', 'grad_norm']) - # track eval data within run_eval_episode. the same as train_df except for reward - self.eval_df = self.train_df.copy() - - if aeb_space is None: # singleton mode - # the specific agent-env interface variables for a body - self.observation_space = self.env.observation_space - self.action_space = self.env.action_space - self.observable_dim = self.env.observable_dim - self.state_dim = self.observable_dim['state'] - self.action_dim = self.env.action_dim - self.is_discrete = self.env.is_discrete - else: - self.space_init(aeb_space) - - self.action_type = get_action_type(self.action_space) - self.action_pdtype = agent_spec[self.a]['algorithm'].get('action_pdtype') - if self.action_pdtype in (None, 'default'): - self.action_pdtype = policy_util.ACTION_PDS[self.action_type][0] - - def action_pd_update(self): - '''Calculate and update action entropy and log_prob using self.action_pd. 
Call this in agent.update()''' - if self.action_pd is None: # skip if None - return - # mean for single and multi-action - entropy = self.action_pd.entropy().mean(dim=0) - self.entropies.append(entropy) - log_prob = self.action_pd.log_prob(self.action_tensor).mean(dim=0) - self.log_probs.append(log_prob) - assert not torch.isnan(log_prob) - - def calc_df_row(self, env, total_reward): - '''Calculate a row for updating train_df or eval_df, given a total_reward.''' - row = pd.Series({ - # epi and total_t are always measured from training env - 'epi': self.env.clock.get('epi'), - 'total_t': self.env.clock.get('total_t'), - # t and reward are measured from a given env or eval_env - 't': env.clock.get('t'), - 'wall_t': env.clock.get_elapsed_wall_t(), - 'reward': total_reward, - 'loss': self.loss, - 'lr': self.get_mean_lr(), - 'explore_var': self.explore_var, - 'entropy_coef': self.entropy_coef if hasattr(self, 'entropy_coef') else np.nan, - 'entropy': self.mean_entropy, - 'log_prob': self.mean_log_prob, - 'grad_norm': self.mean_grad_norm, - }, dtype=np.float32) - assert all(col in self.train_df.columns for col in row.index), f'Mismatched row keys: {row.index} vs df columns {self.train_df.columns}' - return row - - def epi_reset(self): - ''' - Handles any body attribute reset at the start of an episode. - This method is called automatically at base memory.epi_reset(). - ''' - t = self.env.clock.t - assert t == 0, f'aeb: {self.aeb}, t: {t}' - if hasattr(self, 'aeb_space'): - self.space_fix_stats() - - def epi_update(self): - '''Update to append data at the end of an episode (when env.done is true)''' - assert self.env.done - row = self.calc_df_row(self.env, self.memory.total_reward) - # append efficiently to df - self.train_df.loc[len(self.train_df)] = row - - def eval_update(self, eval_env, total_reward): - '''Update to append data at eval checkpoint''' - row = self.calc_df_row(eval_env, total_reward) - # append efficiently to df - self.eval_df.loc[len(self.eval_df)] = row - # update current reward_ma - self.eval_reward_ma = self.eval_df[-analysis.MA_WINDOW:]['reward'].mean() - - def flush(self): - '''Update and flush gradient-related variables after training step similar.''' - # update - self.mean_entropy = torch.tensor(self.entropies).mean().item() - self.mean_log_prob = torch.tensor(self.log_probs).mean().item() - # net.grad_norms is only available in dev mode for efficiency - grad_norms = net_util.get_grad_norms(self.agent.algorithm) - self.mean_grad_norm = np.nan if ps.is_empty(grad_norms) else np.mean(grad_norms) - - # flush - self.action_tensor = None - self.action_pd = None - self.entropies = [] - self.log_probs = [] - - def __str__(self): - return 'body: ' + util.to_json(util.get_class_attr(self)) - - def get_mean_lr(self): - '''Gets the average current learning rate of the algorithm's nets.''' - if not hasattr(self.agent.algorithm, 'net_names'): - return np.nan - lrs = [] - for net_name in self.agent.algorithm.net_names: - # we are only interested in directly trainable network, so exclude target net - if net_name is 'target_net': - continue - net = getattr(self.agent.algorithm, net_name) - lrs.append(net.lr_scheduler.get_lr()) - return np.mean(lrs) - - def get_log_prefix(self): - '''Get the prefix for logging''' - spec = self.agent.spec - info_space = self.agent.info_space - clock = self.env.clock - prefix = f'{spec["name"]}_t{info_space.get("trial")}_s{info_space.get("session")}, aeb{self.aeb}' - return prefix - - def log_summary(self, body_df_kind='eval'): - '''Log the summary for 
this body when its environment is done''' - prefix = self.get_log_prefix() - df = self.eval_df if body_df_kind == 'eval' else self.train_df - last_row = df.iloc[-1] - row_str = ', '.join([f'{k}: {v:g}' for k, v in last_row.items()]) - reward_ma = df[-analysis.MA_WINDOW:]['reward'].mean() - reward_ma_str = f'last-{analysis.MA_WINDOW}-epi avg: {reward_ma:g}' - msg = f'{prefix} [{body_df_kind}_df] {row_str}, {reward_ma_str}' - logger.info(msg) - - def space_init(self, aeb_space): - '''Post init override for space body. Note that aeb is already correct from __init__''' - self.aeb_space = aeb_space - # to be reset properly later - self.nanflat_a_idx, self.nanflat_e_idx = None, None - - self.observation_space = self.env.observation_spaces[self.a] - self.action_space = self.env.action_spaces[self.a] - self.observable_dim = self.env._get_observable_dim(self.observation_space) - self.state_dim = self.observable_dim['state'] - self.action_dim = self.env._get_action_dim(self.action_space) - self.is_discrete = self.env._is_discrete(self.action_space) - - def space_fix_stats(self): - '''the space control loop will make agent append stat at done, so to offset for that, pop it at reset''' - for action_stat in [self.entropies, self.log_probs]: - if len(action_stat) > 0: - action_stat.pop() - - -class DataSpace: - ''' - AEB data space. Store all data from RL system in standard aeb-shaped tensors. - ''' - - def __init__(self, data_name, aeb_space): - self.data_name = data_name - self.aeb_space = aeb_space - self.aeb_shape = aeb_space.aeb_shape - - # data from env have shape (eab), need to swap - self.to_swap = self.data_name in ENV_DATA_NAMES - self.swap_aeb_shape = self.aeb_shape[1], self.aeb_shape[0], self.aeb_shape[2] - - self.data_shape = self.swap_aeb_shape if self.to_swap else self.aeb_shape - self.data_type = object if self.data_name in ['state', 'action'] else np.float32 - self.data = None # standard data in aeb_shape - self.swap_data = None - - def __str__(self): - if self.data is None: - return '' - s = '[' - for a, a_arr in enumerate(self.data): - s += f'\n a:{a} [' - for e, e_arr in enumerate(a_arr): - s += f'\n e:{e} [' - for b, val in enumerate(e_arr): - s += f'\n b:{b} {val}' - s += ']' - s += ']' - s += '\n]' - return s - - def __bool__(self): - return util.nonan_all(self.data) - - def init_data_v(self): - '''Method to init a data volume filled with np.nan''' - data_v = np.full(self.data_shape, np.nan, dtype=self.data_type) - return data_v - - def init_data_s(self, a=None, e=None): - '''Method to init a data surface (subset of data volume) filled with np.nan.''' - body_s = self.aeb_space.body_space.get(a=a, e=e) - data_s = np.full(body_s.shape, np.nan, dtype=self.data_type) - return data_s - - def add(self, data_v): - ''' - Take raw data from RL system and construct numpy object self.data. - If data is from env, auto-swap the data to aeb standard shape. - @param {[x: [y: [body_v]]} data_v As collected in RL sytem. - @returns {array} data Tensor in standard aeb shape. - ''' - new_data = np.array(data_v) # no type restriction, auto-infer - if self.to_swap: # data from env has shape eab - self.swap_data = new_data - self.data = new_data.swapaxes(0, 1) - else: - self.data = new_data - self.swap_data = new_data.swapaxes(0, 1) - return self.data - - def get(self, a=None, e=None): - ''' - Get the data projected on a or e axes for use by agent_space, env_space. 
- @param {int} a The index a of an agent in agent_space - @param {int} e The index e of an env in env_space - @returns {array} data_x Where x is a or e. - ''' - if e is None: - return self.data[a] - elif a is None: - return self.swap_data[e] - else: - return self.data[a][e] - - -class AEBSpace: - - def __init__(self, spec, info_space): - self.info_space = info_space - self.spec = spec - self.clock = None # the finest common refinement as space clock - self.agent_space = None - self.env_space = None - self.body_space = None - (self.aeb_list, self.aeb_shape, self.aeb_sig) = self.get_aeb_info(self.spec) - self.data_spaces = self.init_data_spaces() - - def get_aeb_info(cls, spec): - ''' - Get from spec the aeb_list, aeb_shape and aeb_sig, which are used to resolve agent_space and env_space. - @returns {list, (a,e,b), array([a, e, b])} aeb_list, aeb_shape, aeb_sig - ''' - aeb_list = spec_util.resolve_aeb(spec) - aeb_shape = util.get_aeb_shape(aeb_list) - aeb_sig = np.full(aeb_shape, np.nan) - for aeb in aeb_list: - aeb_sig.itemset(aeb, 1) - return aeb_list, aeb_shape, aeb_sig - - def init_data_spaces(self): - self.data_spaces = { - data_name: DataSpace(data_name, self) - for data_name in AGENT_DATA_NAMES + ENV_DATA_NAMES - } - return self.data_spaces - - def init_data_s(self, data_names, a=None, e=None): - '''Shortcut to init data_s_1, data_s_2, ...''' - return tuple(self.data_spaces[data_name].init_data_s(a=a, e=e) for data_name in data_names) - - def init_data_v(self, data_names): - '''Shortcut to init data_v_1, data_v_2, ...''' - return tuple(self.data_spaces[data_name].init_data_v() for data_name in data_names) - - def init_body_space(self): - '''Initialize the body_space (same class as data_space) used for AEB body resolution, and set reference in agents and envs''' - self.body_space = DataSpace('body', self) - body_v = np.full(self.aeb_shape, np.nan, dtype=object) - for (a, e, b), sig in np.ndenumerate(self.aeb_sig): - if sig == 1: - env = self.env_space.get(e) - body = Body(env, self.spec['agent'], aeb=(a, e, b), aeb_space=self) - body_v[(a, e, b)] = body - self.body_space.add(body_v) - # complete the backward reference to env_space - for env in self.env_space.envs: - body_e = self.body_space.get(e=env.e) - env.set_body_e(body_e) - self.clock = self.env_space.get_base_clock() - logger.info(util.self_desc(self)) - return self.body_space - - def add(self, data_name, data_v): - ''' - Add a data to a data space, e.g. data actions collected per body, per agent, from agent_space, with AEB shape projected on a-axis, added to action_space. - Could also be a shortcut to do batch add data_v_1, data_v_2, ... - @param {str|[str]} data_name - @param {[x: [yb_idx:[body_v]]} data_v, where x, y could be a, e interchangeably. 
- @returns {DataSpace} data_space (aeb is implied) - ''' - if ps.is_string(data_name): - data_space = self.data_spaces[data_name] - data_space.add(data_v) - return data_space - else: - return tuple(self.add(d_name, d_v) for d_name, d_v in zip(data_name, data_v)) - - def tick(self, unit=None): - '''Tick all the clocks in env_space, and tell if all envs are done''' - end_sessions = [] - for env in self.env_space.envs: - if env.done: - for body in env.nanflat_body_e: - body.log_summary(body_df_kind='train') - env.clock.tick(unit or ('epi' if env.done else 't')) - end_session = not (env.clock.get(env.max_tick_unit) < env.max_tick) - end_sessions.append(end_session) - return all(end_sessions) - - -class InfoSpace: - def __init__(self, last_coor=None): - ''' - Initialize the coor, the global point in info space that will advance according to experiment progress. - The coor starts with null first since the coor may not start at the origin. - ''' - self.coor = last_coor or {k: None for k in COOR_AXES} - self.covered_space = [] - # used to id experiment sharing the same spec name - self.experiment_ts = util.get_ts() - # ckpt gets appened to extend prepath using util.get_prepath for saving models, e.g. ckpt_str = ckpt-epi10-totalt1000 - # ckpt = 'eval' is special for eval mode, so data files will save with `ckpt-eval`; no models will be saved, but to load models with normal ckpt it will find them using eval_model_prepath - # e.g. 'epi24-totalt1000', 'eval', 'best' - self.ckpt = None - # e.g. 'data/dqn_cartpole_2018_12_19_085843/dqn_cartpole_t0_s0_ckpt-epi24-totalt1000' - self.eval_model_prepath = None - - def reset_lower_axes(cls, coor, axis): - '''Reset the axes lower than the given axis in coor''' - axis_idx = COOR_AXES_ORDER[axis] - for post_idx in range(axis_idx + 1, COOR_DIM): - post_axis = COOR_AXES[post_idx] - coor[post_axis] = None - return coor - - def tick(self, axis): - ''' - Advance the coor to the next point in axis (control unit class). - If the axis value has been reset, update to 0, else increment. For all axes lower than the specified axis, reset to None. - Note this will not skip coor in space, even though the covered space may not be rectangular. - @example - - info_space.tick('session') - session = Session(spec, info_space) - ''' - assert axis in self.coor - if axis == 'experiment': - self.experiment_ts = util.get_ts() - new_coor = self.coor.copy() - if new_coor[axis] is None: - new_coor[axis] = 0 - else: - new_coor[axis] += 1 - new_coor = self.reset_lower_axes(new_coor, axis) - self.covered_space.append(self.coor) - self.coor = new_coor - return self.coor - - def get(self, axis): - return self.coor[axis] - - def set(self, axis, val): - self.coor[axis] = val - return self.coor[axis] - - def get_random_seed(self): - '''Standard method to get random seed for a session''' - return int(1e5 * (self.get('trial') or 0) + 1e3 * (self.get('session') or 0) + time.time()) diff --git a/convlab/experiment/retro_analysis.py b/convlab/experiment/retro_analysis.py index 6f3286c..270f402 100644 --- a/convlab/experiment/retro_analysis.py +++ b/convlab/experiment/retro_analysis.py @@ -1,252 +1,75 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - -''' -The retro analysis module -Runs analysis after a lab run using existing data files -e.g. 
yarn retro_analyze data/reinforce_cartpole_2018_01_22_211751 -''' +# The retro analysis module +# Runs analysis post-hoc using existing data files +# example: yarn retro_analyze data/reinforce_cartpole_2018_01_22_211751/ +from glob import glob from convlab.experiment import analysis from convlab.lib import logger, util -from convlab.spec import spec_util -import numpy as np import os import pydash as ps -import regex as re logger = logger.get_logger(__name__) -def session_data_from_file(predir, trial_index, session_index, ckpt=None, prefix=''): - '''Build session.session_data from file''' - ckpt_str = '' if ckpt is None else f'_ckpt-{ckpt}' - for filename in os.listdir(predir): - if filename.endswith(f'_t{trial_index}_s{session_index}{ckpt_str}_{prefix}session_df.csv'): - filepath = f'{predir}/{filename}' - session_df = util.read(filepath, header=[0, 1, 2, 3], index_col=0) - session_data = util.session_df_to_data(session_df) - return session_data - - -def session_datas_from_file(predir, trial_spec, trial_index, ckpt=None): - '''Return a dict of {session_index: session_data} for a trial''' - session_datas = {} - for s in range(trial_spec['meta']['max_session']): - session_data = session_data_from_file(predir, trial_index, s, ckpt) - if session_data is not None: - session_datas[s] = session_data - return session_datas - - -def session_data_dict_from_file(predir, trial_index, ckpt=None): - '''Build trial.session_data_dict from file''' - ckpt_str = '' if ckpt is None else f'_ckpt-{ckpt}' - session_data_dict = {} - for filename in os.listdir(predir): - if f'_t{trial_index}_' in filename and filename.endswith(f'{ckpt_str}_session_fitness_df.csv'): - filepath = f'{predir}/{filename}' - fitness_df = util.read(filepath, header=[0, 1, 2, 3], index_col=0, dtype=np.float32) - util.fix_multi_index_dtype(fitness_df) - session_index = fitness_df.index[0] - session_data_dict[session_index] = fitness_df - return session_data_dict - - -def session_data_dict_for_dist(spec, info_space): - '''Method to retrieve session_datas (fitness df, so the same as session_data_dict above) when a trial with distributed sessions is done, to avoid messy multiprocessing data communication''' - prepath = util.get_prepath(spec, info_space) - predir, _, _, _, _, _ = util.prepath_split(prepath) - session_datas = session_data_dict_from_file(predir, info_space.get('trial'), ps.get(info_space, 'ckpt')) - session_datas = [session_datas[k] for k in sorted(session_datas.keys())] - return session_datas - - -def trial_data_dict_from_file(predir): - '''Build experiment.trial_data_dict from file''' - trial_data_dict = {} - for filename in os.listdir(predir): - if filename.endswith('_trial_data.json'): - filepath = f'{predir}/{filename}' - exp_trial_data = util.read(filepath) - trial_index = exp_trial_data.pop('trial_index') - trial_data_dict[trial_index] = exp_trial_data - return trial_data_dict - - -''' -Interface retro methods -''' - - -def analyze_eval_trial(spec, info_space, predir): - '''Create a trial and run analysis to get the trial graph and other trial data''' - from convlab.experiment.control import Trial - trial = Trial(spec, info_space) - trial.session_data_dict = session_data_dict_from_file(predir, trial.index, ps.get(info_space, 'ckpt')) - # don't zip for eval analysis, slow otherwise - analysis.analyze_trial(trial, zip=False) - - -def parallel_eval(spec, info_space, ckpt): - ''' - Calls a subprocess to run lab in eval mode with the constructed ckpt prepath, same as how one would manually run the bash cmd - @example - - 
python run_lab.py data/dqn_cartpole_2018_12_19_224811/dqn_cartpole_t0_spec.json dqn_cartpole eval@dqn_cartpole_t0_s1_ckpt-epi10-totalt1000 - ''' - prepath_t = util.get_prepath(spec, info_space, unit='trial') - prepath_s = util.get_prepath(spec, info_space, unit='session') - predir, _, prename, spec_name, _, _ = util.prepath_split(prepath_s) - cmd = f'python run_lab.py {prepath_t}_spec.json {spec_name} eval@{prename}_ckpt-{ckpt}' - logger.info(f'Running parallel eval for ckpt-{ckpt}') - return util.run_cmd(cmd) - - -def run_parallel_eval(session, agent, env): - '''Plugin to session to run parallel eval for train mode''' - if util.get_lab_mode() == 'train': - ckpt = f'epi{env.clock.epi}-totalt{env.clock.total_t}' - agent.save(ckpt=ckpt) - # set reference to eval process for handling - session.eval_proc = parallel_eval(session.spec, session.info_space, ckpt) - - -def try_wait_parallel_eval(session): - '''Plugin to wait for session's final parallel eval if any''' - if hasattr(session, 'eval_proc') and session.eval_proc is not None: # wait for final eval before closing - util.run_cmd_wait(session.eval_proc) - session_retro_eval(session) # rerun failed eval - - -def run_parallel_eval_from_prepath(prepath): - '''Used by retro_eval''' - spec, info_space = util.prepath_to_spec_info_space(prepath) - ckpt = util.find_ckpt(prepath) - return parallel_eval(spec, info_space, ckpt) - - -def run_wait_eval(prepath): - '''Used by retro_eval''' - eval_proc = run_parallel_eval_from_prepath(prepath) - util.run_cmd_wait(eval_proc) - - def retro_analyze_sessions(predir): - '''Retro-analyze all session level datas.''' - logger.info('Retro-analyzing sessions from file') - from convlab.experiment.control import Session, SpaceSession - for filename in os.listdir(predir): - # to account for both types of session_df - if filename.endswith('_session_df.csv'): - body_df_kind = 'eval' # from body.eval_df - prefix = '' - is_session_df = True - elif filename.endswith('_trainsession_df.csv'): - body_df_kind = 'train' # from body.train_df - prefix = 'train' - is_session_df = True - else: - is_session_df = False + '''Retro analyze all sessions''' + logger.info('Running retro_analyze_sessions') + session_spec_paths = glob(f'{predir}/*_s*_spec.json') + util.parallelize(_retro_analyze_session, [(p,) for p in session_spec_paths], num_cpus=util.NUM_CPUS) + - if is_session_df: - prepath = f'{predir}/{filename}'.replace(f'_{prefix}session_df.csv', '') - spec, info_space = util.prepath_to_spec_info_space(prepath) - trial_index, session_index = util.prepath_to_idxs(prepath) - SessionClass = Session if spec_util.is_singleton(spec) else SpaceSession - session = SessionClass(spec, info_space) - session_data = session_data_from_file(predir, trial_index, session_index, ps.get(info_space, 'ckpt'), prefix) - analysis._analyze_session(session, session_data, body_df_kind) +def _retro_analyze_session(session_spec_path): + '''Method to retro analyze a single session given only a path to its spec''' + session_spec = util.read(session_spec_path) + info_prepath = session_spec['meta']['info_prepath'] + for df_mode in ('eval', 'train'): + session_df = util.read(f'{info_prepath}_session_df_{df_mode}.csv') + analysis.analyze_session(session_spec, session_df, df_mode) def retro_analyze_trials(predir): - '''Retro-analyze all trial level datas.''' - logger.info('Retro-analyzing trials from file') - from convlab.experiment.control import Trial - filenames = ps.filter_(os.listdir(predir), lambda filename: filename.endswith('_trial_df.csv')) - for idx, 
filename in enumerate(filenames):
-        filepath = f'{predir}/{filename}'
-        prepath = filepath.replace('_trial_df.csv', '')
-        spec, info_space = util.prepath_to_spec_info_space(prepath)
-        trial_index, _ = util.prepath_to_idxs(prepath)
-        trial = Trial(spec, info_space)
-        trial.session_data_dict = session_data_dict_from_file(predir, trial_index, ps.get(info_space, 'ckpt'))
-        # zip only at the last
-        zip = (idx == len(filenames) - 1)
-        trial_fitness_df = analysis.analyze_trial(trial, zip)
+    '''Retro analyze all trials'''
+    logger.info('Running retro_analyze_trials')
+    session_spec_paths = glob(f'{predir}/*_s*_spec.json')
+    # remove session spec paths
+    trial_spec_paths = ps.difference(glob(f'{predir}/*_t*_spec.json'), session_spec_paths)
+    util.parallelize(_retro_analyze_trial, [(p,) for p in trial_spec_paths], num_cpus=util.NUM_CPUS)

-    # write trial_data that was written from ray search
-    trial_data_filepath = filepath.replace('_trial_df.csv', '_trial_data.json')
-    if os.path.exists(trial_data_filepath):
-        fitness_vec = trial_fitness_df.iloc[0].to_dict()
-        fitness = analysis.calc_fitness(trial_fitness_df)
-        trial_data = util.read(trial_data_filepath)
-        trial_data.update({
-            **fitness_vec, 'fitness': fitness, 'trial_index': trial_index,
-        })
-        util.write(trial_data, trial_data_filepath)
+
+def _retro_analyze_trial(trial_spec_path):
+    '''Method to retro analyze a single trial given only a path to its spec'''
+    trial_spec = util.read(trial_spec_path)
+    meta_spec = trial_spec['meta']
+    info_prepath = meta_spec['info_prepath']
+    session_metrics_list = [util.read(f'{info_prepath}_s{s}_session_metrics_eval.pkl') for s in range(meta_spec['max_session'])]
+    analysis.analyze_trial(trial_spec, session_metrics_list)


 def retro_analyze_experiment(predir):
-    '''Retro-analyze all experiment level datas.'''
-    logger.info('Retro-analyzing experiment from file')
-    from convlab.experiment.control import Experiment
-    _, _, _, spec_name, _, _ = util.prepath_split(predir)
-    prepath = f'{predir}/{spec_name}'
-    spec, info_space = util.prepath_to_spec_info_space(prepath)
-    if 'search' not in spec:
-        return
-    experiment = Experiment(spec, info_space)
-    experiment.trial_data_dict = trial_data_dict_from_file(predir)
-    if not ps.is_empty(experiment.trial_data_dict):
-        return analysis.analyze_experiment(experiment)
+    '''Retro analyze an experiment'''
+    logger.info('Running retro_analyze_experiment')
+    trial_spec_paths = glob(f'{predir}/*_t*_spec.json')
+    # remove trial and session spec paths
+    experiment_spec_paths = ps.difference(glob(f'{predir}/*_spec.json'), trial_spec_paths)
+    experiment_spec_path = experiment_spec_paths[0]
+    spec = util.read(experiment_spec_path)
+    info_prepath = spec['meta']['info_prepath']
+    if not os.path.exists(f'{info_prepath}_trial_data_dict.json'):
+        return  # only run analysis if the experiment data exists
+    trial_data_dict = util.read(f'{info_prepath}_trial_data_dict.json')
+    analysis.analyze_experiment(spec, trial_data_dict)


 def retro_analyze(predir):
     '''
-    Method to analyze experiment from file after it ran.
-    Read from files, constructs lab units, run retro analyses on all lab units.
-    This method has no side-effects, i.e. doesn't overwrite data it should not.
+    Method to analyze experiment/trial from files after it ran.
@example - yarn retro_analyze data/reinforce_cartpole_2018_01_22_211751 + yarn retro_analyze data/reinforce_cartpole_2018_01_22_211751/ ''' - os.environ['PREPATH'] = f'{predir}/retro_analyze' # to prevent overwriting log file - logger.info(f'Retro-analyzing {predir}') + predir = predir.strip('/') # sanitary + os.environ['LOG_PREPATH'] = f'{predir}/log/retro_analyze' # to prevent overwriting log file + logger.info(f'Running retro-analysis on {predir}') retro_analyze_sessions(predir) retro_analyze_trials(predir) retro_analyze_experiment(predir) - - -def retro_eval(predir, session_index=None): - ''' - Method to run eval sessions by scanning a predir for ckpt files. Used to rerun failed eval sessions. - @example - - yarn retro_eval data/reinforce_cartpole_2018_01_22_211751 - ''' - logger.info(f'Retro-evaluate sessions from predir {predir}') - # collect all unique prepaths first - prepaths = [] - s_filter = '' if session_index is None else f'_s{session_index}_' - for filename in os.listdir(predir): - if filename.endswith('model.pth') and s_filter in filename: - res = re.search('.+epi(\d+)-totalt(\d+)', filename) - if res is not None: - prepath = f'{predir}/{res[0]}' - if prepath not in prepaths: - prepaths.append(prepath) - if ps.is_empty(prepaths): - return - - logger.info(f'Starting retro eval') - np.random.shuffle(prepaths) # so that CUDA_ID by trial/session index is spread out - rand_spec = util.prepath_to_spec(prepaths[0]) # get any prepath, read its max session - max_session = rand_spec['meta']['max_session'] - util.parallelize_fn(run_wait_eval, prepaths, num_cpus=max_session) - - -def session_retro_eval(session): - '''retro_eval but for session at the end to rerun failed evals''' - prepath = util.get_prepath(session.spec, session.info_space, unit='session') - predir, _, _, _, _, _ = util.prepath_split(prepath) - retro_eval(predir, session.index) + logger.info('Finished retro-analysis') diff --git a/convlab/experiment/search.py b/convlab/experiment/search.py index 5f741fe..03f6745 100644 --- a/convlab/experiment/search.py +++ b/convlab/experiment/search.py @@ -1,37 +1,16 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - -from abc import ABC, abstractmethod from copy import deepcopy -from deap import creator, base, tools, algorithms -from ray.tune import grid_search -from ray.tune.suggest import variant_generator -from convlab.experiment import analysis from convlab.lib import logger, util -from convlab.lib.decorator import lab_api -import json import numpy as np -import os import pydash as ps import random import ray +import ray.tune as tune import torch logger = logger.get_logger(__name__) -def register_ray_serializer(): - '''Helper to register so objects can be serialized in Ray''' - from convlab.experiment.control import Experiment - from convlab.experiment.monitor import InfoSpace - import pandas as pd - ray.register_custom_serializer(Experiment, use_pickle=True) - ray.register_custom_serializer(InfoSpace, use_pickle=True) - ray.register_custom_serializer(pd.DataFrame, use_pickle=True) - ray.register_custom_serializer(pd.Series, use_pickle=True) - - -def build_config_space(experiment): +def build_config_space(spec): ''' Build ray config space from flattened spec.search Specify a config space in spec using `"{key}__{space_type}": {v}`. 
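(Illustrative aside, not part of the patch.) A hypothetical spec['search'] block showing the `"{key}__{space_type}": {v}` syntax that build_config_space parses below; the hyperparameter names and values here are made up:

example_spec_search = {
    'agent': [{'algorithm': {
        'gamma__uniform': [0.9, 0.999],         # -> tune.sample_from over np.random.uniform(0.9, 0.999)
        'lam__grid_search': [0.9, 0.95, 0.99],  # -> tune.grid_search([0.9, 0.95, 0.99])
    }}],
}
# build_config_space({'name': 'example', 'search': example_spec_search, ...}) flattens the block and
# returns a ray config space keyed by the dotted path minus the __space_type suffix,
# e.g. 'agent.0.algorithm.gamma' and 'agent.0.algorithm.lam'.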
@@ -50,249 +29,97 @@ def build_config_space(experiment): ''' space_types = ('grid_search', 'choice', 'randint', 'uniform', 'normal') config_space = {} - for k, v in util.flatten_dict(experiment.spec['search']).items(): + for k, v in util.flatten_dict(spec['search']).items(): key, space_type = k.split('__') assert space_type in space_types, f'Please specify your search variable as {key}__ in one of {space_types}' if space_type == 'grid_search': - config_space[key] = grid_search(v) + config_space[key] = tune.grid_search(v) elif space_type == 'choice': - config_space[key] = lambda spec, v=v: random.choice(v) + config_space[key] = tune.sample_from(lambda spec, v=v: random.choice(v)) else: np_fn = getattr(np.random, space_type) - config_space[key] = lambda spec, v=v: np_fn(*v) + config_space[key] = tune.sample_from(lambda spec, v=v: np_fn(*v)) return config_space -def calc_population_size(experiment): - '''Calculate the population size for RandomSearch or EvolutionarySearch''' - pop_size = 2 # x2 for more search coverage - for k, v in util.flatten_dict(experiment.spec['search']).items(): - if '__' in k: - key, space_type = k.split('__') - else: - key, space_type = k, 'grid_search' - if space_type in ('grid_search', 'choice'): - pop_size *= len(v) - else: - pop_size *= 3 - return pop_size +def infer_trial_resources(spec): + '''Infer the resources_per_trial for ray from spec''' + meta_spec = spec['meta'] + num_cpus = min(util.NUM_CPUS, meta_spec['max_session']) + use_gpu = any(agent_spec['net'].get('gpu') for agent_spec in spec['agent']) + requested_gpu = meta_spec['max_session'] if use_gpu else 0 + gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0 + num_gpus = min(gpu_count, requested_gpu) + resources_per_trial = {'cpu': num_cpus, 'gpu': num_gpus} + return resources_per_trial -def spec_from_config(experiment, config): - '''Helper to create spec from config - variables in spec.''' - spec = deepcopy(experiment.spec) + +def inject_config(spec, config): + '''Inject flattened config into SLM Lab spec.''' + spec = deepcopy(spec) spec.pop('search', None) for k, v in config.items(): ps.set_(spec, k, v) return spec -def create_remote_fn(experiment): - ray_gpu = int(bool(ps.get(experiment.spec, 'agent.0.net.gpu') and torch.cuda.device_count())) - # TODO fractional ray_gpu is broken - - @ray.remote(num_gpus=ray_gpu) # hack around bad Ray design of hard-coding - def run_trial(experiment, config): - trial_index = config.pop('trial_index') - spec = spec_from_config(experiment, config) - info_space = deepcopy(experiment.info_space) - info_space.set('trial', trial_index) - trial_fitness_df = experiment.init_trial_and_run(spec, info_space) - fitness_vec = trial_fitness_df.iloc[0].to_dict() - fitness = analysis.calc_fitness(trial_fitness_df) - trial_data = {**config, **fitness_vec, 'fitness': fitness, 'trial_index': trial_index} - prepath = util.get_prepath(spec, info_space, unit='trial') - util.write(trial_data, f'{prepath}_trial_data.json') - return trial_data - return run_trial - - -def get_ray_results(pending_ids, ray_id_to_config): - '''Helper to wait and get ray results into a new trial_data_dict, or handle ray error''' - trial_data_dict = {} - for _t in range(len(pending_ids)): - ready_ids, pending_ids = ray.wait(pending_ids, num_returns=1) - ready_id = ready_ids[0] - try: - trial_data = ray.get(ready_id) - trial_index = trial_data.pop('trial_index') - trial_data_dict[trial_index] = trial_data - except: - logger.exception(f'Trial failed: {ray_id_to_config[ready_id]}') - return 
trial_data_dict - - -class RaySearch(ABC): +def ray_trainable(config, reporter): ''' - RaySearch module for Experiment - Ray API integration with Lab - Abstract class ancestor to all RaySearch (using Ray). - specifies the necessary design blueprint for agent to work in Lab. - Mostly, implement just the abstract methods and properties. + Create an instance of a trainable function for ray: https://ray.readthedocs.io/en/latest/tune-usage.html#training-api + Lab needs a spec and a trial_index to be carried through config, pass them with config in ray.run() like so: + config = { + 'spec': spec, + 'trial_index': tune.sample_from(lambda spec: gen_trial_index()), + ... # normal ray config with sample, grid search etc. + } ''' - - def __init__(self, experiment): - self.experiment = experiment - self.config_space = build_config_space(experiment) - logger.info(f'Running {util.get_class_name(self)}, with meta spec:\n{self.experiment.spec["meta"]}') - - @abstractmethod - def generate_config(self): - ''' - Generate the next config given config_space, may update belief first. - Remember to update trial_index in config here, since run_trial() on ray.remote is not thread-safe. - ''' - # use self.config_space to build config - config['trial_index'] = self.experiment.info_space.tick('trial')['trial'] - raise NotImplementedError - return config - - @abstractmethod - @lab_api - def run(self): - ''' - Implement the main run_trial loop. - Remember to call ray init and cleanup before and after loop. - ''' - ray.init() - register_ray_serializer() - # loop for max_trial: generate_config(); run_trial.remote(config) - ray.shutdown() - raise NotImplementedError - return trial_data_dict - - -class RandomSearch(RaySearch): - - def generate_config(self): - configs = [] # to accommodate for grid_search - for resolved_vars, config in variant_generator._generate_variants(self.config_space): - config['trial_index'] = self.experiment.info_space.tick('trial')['trial'] - configs.append(config) - return configs - - @lab_api - def run(self): - run_trial = create_remote_fn(self.experiment) - meta_spec = self.experiment.spec['meta'] - ray.init(**meta_spec.get('resources', {})) - register_ray_serializer() - max_trial = meta_spec['max_trial'] - trial_data_dict = {} - ray_id_to_config = {} - pending_ids = [] - - for _t in range(max_trial): - configs = self.generate_config() - for config in configs: - ray_id = run_trial.remote(self.experiment, config) - ray_id_to_config[ray_id] = config - pending_ids.append(ray_id) - - trial_data_dict.update(get_ray_results(pending_ids, ray_id_to_config)) - ray.shutdown() - return trial_data_dict - - -class EvolutionarySearch(RaySearch): - - def generate_config(self): - for resolved_vars, config in variant_generator._generate_variants(self.config_space): - # trial_index is set at population level - return config - - def mutate(self, individual, indpb): - ''' - Deap implementation for dict individual (config), - mutate an attribute with some probability - resample using the generate_config method and ensuring the new value is different. - @param {dict} individual Individual to be mutated. - @param {float} indpb Independent probability for each attribute to be mutated. - @returns A tuple of one individual. 
- ''' - for k, v in individual.items(): - if random.random() < indpb: - while True: - new_ind = self.generate_config() - if new_ind[k] != v: - individual[k] = new_ind[k] - break - return individual, - - def cx_uniform(cls, ind1, ind2, indpb): - ''' - Deap implementation for dict individual (config), - do a uniform crossover that modify in place the two individuals. The attributes are swapped with probability indpd. - @param {dict} ind1 The first individual participating in the crossover. - @param {dict} ind2 The second individual participating in the crossover. - @param {float} indpb Independent probabily for each attribute to be exchanged. - @returns A tuple of two individuals. - ''' - for k in ind1: - if random.random() < indpb: - ind1[k], ind2[k] = ind2[k], ind1[k] - return ind1, ind2 - - def init_deap(self): - creator.create('FitnessMax', base.Fitness, weights=(1.0,)) - creator.create('Individual', dict, fitness=creator.FitnessMax) - toolbox = base.Toolbox() - toolbox.register('attr', self.generate_config) - toolbox.register('individual', tools.initIterate, - creator.Individual, toolbox.attr) - toolbox.register('population', tools.initRepeat, - list, toolbox.individual) - - toolbox.register('mate', self.cx_uniform, indpb=0.5) - toolbox.register('mutate', self.mutate, indpb=1 / - len(toolbox.individual())) - toolbox.register('select', tools.selTournament, tournsize=3) - return toolbox - - @lab_api - def run(self): - run_trial = create_remote_fn(self.experiment) - meta_spec = self.experiment.spec['meta'] - ray.init(**meta_spec.get('resources', {})) - register_ray_serializer() - max_generation = meta_spec['max_generation'] - pop_size = meta_spec['max_trial'] or calc_population_size(self.experiment) - logger.info(f'EvolutionarySearch max_generation: {max_generation}, population size: {pop_size}') - trial_data_dict = {} - config_hash = {} # config hash_str to trial_index - - toolbox = self.init_deap() - population = toolbox.population(n=pop_size) - for gen in range(1, max_generation + 1): - logger.info(f'Running generation: {gen}/{max_generation}') - ray_id_to_config = {} - pending_ids = [] - for individual in population: - config = dict(individual.items()) - hash_str = util.to_json(config, indent=0) - if hash_str not in config_hash: - trial_index = self.experiment.info_space.tick('trial')['trial'] - config_hash[hash_str] = config['trial_index'] = trial_index - ray_id = run_trial.remote(self.experiment, config) - ray_id_to_config[ray_id] = config - pending_ids.append(ray_id) - individual['trial_index'] = config_hash[hash_str] - - trial_data_dict.update(get_ray_results(pending_ids, ray_id_to_config)) - - for individual in population: - trial_index = individual.pop('trial_index') - trial_data = trial_data_dict.get(trial_index, {'fitness': 0}) # if trial errored - individual.fitness.values = trial_data['fitness'], - - preview = 'Fittest of population preview:' - for individual in tools.selBest(population, k=min(10, pop_size)): - preview += f'\nfitness: {individual.fitness.values[0]}, {individual}' - logger.info(preview) - - # prepare offspring for next generation - if gen < max_generation: - population = toolbox.select(population, len(population)) - # Vary the pool of individuals - population = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.5) - - ray.shutdown() - return trial_data_dict + from convlab.experiment.control import Trial + # restore data carried from ray.run() config + spec = config.pop('spec') + trial_index = config.pop('trial_index') + spec['meta']['trial'] = 
trial_index + spec = inject_config(spec, config) + # run SLM Lab trial + metrics = Trial(spec).run() + metrics.update(config) # carry config for analysis too + # ray report to carry data in ray trial.last_result + reporter(trial_data={trial_index: metrics}) + + +def run_ray_search(spec): + ''' + Method to run ray search from experiment. Uses RandomSearch now. + TODO support for other ray search algorithms: https://ray.readthedocs.io/en/latest/tune-searchalg.html + ''' + logger.info(f'Running ray search for spec {spec["name"]}') + # generate trial index to pass into Lab Trial + global trial_index # make gen_trial_index passable into ray.run + trial_index = -1 + + def gen_trial_index(): + global trial_index + trial_index += 1 + return trial_index + + ray.init() + + ray_trials = tune.run( + ray_trainable, + name=spec['name'], + config={ + "spec": spec, + "trial_index": tune.sample_from(lambda spec: gen_trial_index()), + **build_config_space(spec) + }, + resources_per_trial=infer_trial_resources(spec), + num_samples=spec['meta']['max_trial'], + queue_trials=True, + ) + trial_data_dict = {} # data for Lab Experiment to analyze + for ray_trial in ray_trials: + ray_trial_data = ray_trial.last_result['trial_data'] + trial_data_dict.update(ray_trial_data) + + ray.shutdown() + return trial_data_dict diff --git a/convlab/lib/__init__.py b/convlab/lib/__init__.py index 1f3c799..e69de29 100644 --- a/convlab/lib/__init__.py +++ b/convlab/lib/__init__.py @@ -1,7 +0,0 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - -''' -The generic lib module -Contains generic library methods for the Lab -''' diff --git a/convlab/lib/decorator.py b/convlab/lib/decorator.py index 83dfc2e..aa563a8 100644 --- a/convlab/lib/decorator.py +++ b/convlab/lib/decorator.py @@ -1,6 +1,3 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - from functools import wraps from convlab.lib import logger import time @@ -41,6 +38,6 @@ def time_fn(*args, **kwargs): start = time.time() output = fn(*args, **kwargs) end = time.time() - logger.debug3(f'Timed: {fn.__name__} {round((end - start) * 1000, 4)}ms') + logger.debug(f'Timed: {fn.__name__} {round((end - start) * 1000, 4)}ms') return output return time_fn diff --git a/convlab/lib/distribution.py b/convlab/lib/distribution.py new file mode 100644 index 0000000..6fed228 --- /dev/null +++ b/convlab/lib/distribution.py @@ -0,0 +1,87 @@ +# Custom PyTorch distribution classes to be registered in policy_util.py +# Mainly used by policy_util action distribution +from torch import distributions +import torch + + +class Argmax(distributions.Categorical): + ''' + Special distribution class for argmax sampling, where probability is always 1 for the argmax. + NOTE although argmax is not a sampling distribution, this implementation is for API consistency. + ''' + + def __init__(self, probs=None, logits=None, validate_args=None): + if probs is not None: + new_probs = torch.zeros_like(probs, dtype=torch.float) + new_probs[probs == probs.max(dim=-1, keepdim=True)[0]] = 1.0 + probs = new_probs + elif logits is not None: + new_logits = torch.full_like(logits, -1e8, dtype=torch.float) + new_logits[logits == logits.max(dim=-1, keepdim=True)[0]] = 1.0 + logits = new_logits + + super().__init__(probs=probs, logits=logits, validate_args=validate_args) + + +class GumbelCategorical(distributions.Categorical): + ''' + Special Categorical using Gumbel distribution to simulate softmax categorical for discrete action. 
+ Similar to OpenAI's https://github.com/openai/baselines/blob/98257ef8c9bd23a24a330731ae54ed086d9ce4a7/baselines/a2c/utils.py#L8-L10 + Explanation http://amid.fish/assets/gumbel.html + ''' + + def sample(self, sample_shape=torch.Size()): + '''Gumbel softmax sampling''' + u = torch.empty(self.logits.size(), device=self.logits.device, dtype=self.logits.dtype).uniform_(0, 1) + noisy_logits = self.logits - torch.log(-torch.log(u)) + return torch.argmax(noisy_logits, dim=0) + + +class MultiCategorical(distributions.Categorical): + '''MultiCategorical as collection of Categoricals''' + + def __init__(self, probs=None, logits=None, validate_args=None): + self.categoricals = [] + if probs is None: + probs = [None] * len(logits) + elif logits is None: + logits = [None] * len(probs) + else: + raise ValueError('Either probs or logits must be None') + + for sub_probs, sub_logits in zip(probs, logits): + categorical = distributions.Categorical(probs=sub_probs, logits=sub_logits, validate_args=validate_args) + self.categoricals.append(categorical) + + @property + def logits(self): + return [cat.logits for cat in self.categoricals] + + @property + def probs(self): + return [cat.probs for cat in self.categoricals] + + @property + def param_shape(self): + return [cat.param_shape for cat in self.categoricals] + + @property + def mean(self): + return torch.stack([cat.mean for cat in self.categoricals]) + + @property + def variance(self): + return torch.stack([cat.variance for cat in self.categoricals]) + + def sample(self, sample_shape=torch.Size()): + return torch.stack([cat.sample(sample_shape=sample_shape) for cat in self.categoricals]) + + def log_prob(self, value): + value_t = value.transpose(0, 1) + return torch.stack([cat.log_prob(value_t[idx]) for idx, cat in enumerate(self.categoricals)]) + + def entropy(self): + return torch.stack([cat.entropy() for cat in self.categoricals]) + + def enumerate_support(self): + return [cat.enumerate_support() for cat in self.categoricals] diff --git a/convlab/lib/logger.py b/convlab/lib/logger.py index 43d58ec..ee98a0d 100644 --- a/convlab/lib/logger.py +++ b/convlab/lib/logger.py @@ -1,6 +1,3 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. 
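(Illustrative aside, not part of the patch.) A minimal usage sketch for the Argmax distribution added in convlab/lib/distribution.py above; the logits are arbitrary:

import torch
from convlab.lib.distribution import Argmax

pdparam = torch.tensor([1.0, 3.0, 2.0])    # hypothetical network output
action_pd = Argmax(logits=pdparam)
action = action_pd.sample()                # deterministic: index of the max logit, tensor(1)
log_prob = action_pd.log_prob(action)      # ~0, since the argmax position gets probability ~1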
- from convlab.lib import util import colorlog import logging @@ -16,14 +13,13 @@ class FixedList(list): def append(self, e): pass - -# extra debugging level deeper than the default debug -NEW_LVLS = {'DEBUG2': 9, 'DEBUG3': 8, 'NL': 17, 'ACT': 14, 'STATE': 13} +NEW_LVLS = {'NL': 17, 'ACT': 14, 'STATE': 13} for name, val in NEW_LVLS.items(): logging.addLevelName(val, name) setattr(logging, name, val) -LOG_FORMAT = '[%(asctime)s %(levelname)s %(filename)s %(funcName)s] %(message)s' -color_formatter = colorlog.ColoredFormatter('%(log_color)s[%(asctime)s %(levelname)s %(filename)s %(funcName)s]%(reset)s %(message)s', + +LOG_FORMAT = '[%(asctime)s PID:%(process)d %(levelname)s %(filename)s %(funcName)s] %(message)s' +color_formatter = colorlog.ColoredFormatter('%(log_color)s[%(asctime)s PID:%(process)d %(levelname)s %(filename)s %(funcName)s]%(reset)s %(message)s', log_colors={ 'DEBUG': 'cyan', 'NL': 'cyan', @@ -35,92 +31,74 @@ def append(self, e): 'CRITICAL': 'red,bg_white'}) sh = logging.StreamHandler(sys.stdout) sh.setFormatter(color_formatter) -convlab_logger = logging.getLogger() -convlab_logger.handlers = FixedList([sh]) +lab_logger = logging.getLogger() +lab_logger.handlers = FixedList([sh]) +logging.getLogger('ray').propagate = False # hack to mute poorly designed ray TF warning log # this will trigger from Experiment init on reload(logger) -if os.environ.get('PREPATH') is not None: +if os.environ.get('LOG_PREPATH') is not None: warnings.filterwarnings('ignore', category=pd.io.pytables.PerformanceWarning) - log_filepath = os.environ['PREPATH'] + '.log' + log_filepath = os.environ['LOG_PREPATH'] + '.log' os.makedirs(os.path.dirname(log_filepath), exist_ok=True) # create file handler formatter = logging.Formatter(LOG_FORMAT) fh = logging.FileHandler(log_filepath) fh.setFormatter(formatter) # add stream and file handler - convlab_logger.handlers = FixedList([sh, fh]) + lab_logger.handlers = FixedList([sh, fh]) if os.environ.get('LOG_LEVEL'): - convlab_logger.setLevel(os.environ['LOG_LEVEL']) + lab_logger.setLevel(os.environ['LOG_LEVEL']) else: - convlab_logger.setLevel('INFO') - - -def to_init(spec, info_space): - ''' - Whether the lab's logger had been initialized: - - prepath present in env - - importlib.reload(logger) had been called - ''' - return os.environ.get('PREPATH') is None + lab_logger.setLevel('INFO') def set_level(lvl): - convlab_logger.setLevel(lvl) + lab_logger.setLevel(lvl) os.environ['LOG_LEVEL'] = lvl def critical(msg, *args, **kwargs): - return convlab_logger.critical(msg, *args, **kwargs) + return lab_logger.critical(msg, *args, **kwargs) def debug(msg, *args, **kwargs): - return convlab_logger.debug(msg, *args, **kwargs) + return lab_logger.debug(msg, *args, **kwargs) -def debug2(msg, *args, **kwargs): - return convlab_logger.log(NEW_LVLS['DEBUG2'], msg, *args, **kwargs) +def error(msg, *args, **kwargs): + return lab_logger.error(msg, *args, **kwargs) -def debug3(msg, *args, **kwargs): - return convlab_logger.log(NEW_LVLS['DEBUG3'], msg, *args, **kwargs) +def exception(msg, *args, **kwargs): + return lab_logger.exception(msg, *args, **kwargs) -def nl(msg, *args, **kwargs): - return convlab_logger.log(NEW_LVLS['NL'], msg, *args, **kwargs) -def act(msg, *args, **kwargs): - return convlab_logger.log(NEW_LVLS['ACT'], msg, *args, **kwargs) +def info(msg, *args, **kwargs): + return lab_logger.info(msg, *args, **kwargs) -def state(msg, *args, **kwargs): - return convlab_logger.log(NEW_LVLS['STATE'], msg, *args, **kwargs) -def error(msg, *args, **kwargs): - return 
convlab_logger.error(msg, *args, **kwargs) +def warning(msg, *args, **kwargs): + return lab_logger.warning(msg, *args, **kwargs) -def exception(msg, *args, **kwargs): - return convlab_logger.exception(msg, *args, **kwargs) +def nl(msg, *args, **kwargs): + return lab_logger.log(NEW_LVLS['NL'], msg, *args, **kwargs) -def info(msg, *args, **kwargs): - return convlab_logger.info(msg, *args, **kwargs) +def act(msg, *args, **kwargs): + return lab_logger.log(NEW_LVLS['ACT'], msg, *args, **kwargs) -def warn(msg, *args, **kwargs): - return convlab_logger.warn(msg, *args, **kwargs) +def state(msg, *args, **kwargs): + return lab_logger.log(NEW_LVLS['STATE'], msg, *args, **kwargs) def get_logger(__name__): '''Create a child logger specific to a module''' module_logger = logging.getLogger(__name__) - def debug2(msg, *args, **kwargs): - return module_logger.log(NEW_LVLS['DEBUG2'], msg, *args, **kwargs) - - def debug3(msg, *args, **kwargs): - return module_logger.log(NEW_LVLS['DEBUG3'], msg, *args, **kwargs) - def nl(msg, *args, **kwargs): return module_logger.log(NEW_LVLS['NL'], msg, *args, **kwargs) @@ -130,11 +108,10 @@ def act(msg, *args, **kwargs): def state(msg, *args, **kwargs): return module_logger.log(NEW_LVLS['STATE'], msg, *args, **kwargs) - setattr(module_logger, 'debug2', debug2) - setattr(module_logger, 'debug3', debug3) setattr(module_logger, 'nl', nl) setattr(module_logger, 'act', act) setattr(module_logger, 'state', state) + return module_logger diff --git a/convlab/lib/math_util.py b/convlab/lib/math_util.py index dd80ecd..ee6da6a 100644 --- a/convlab/lib/math_util.py +++ b/convlab/lib/math_util.py @@ -1,122 +1,112 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - -''' -Calculations used by algorithms -All calculations for training shall have a standard API that takes in `batch` from algorithm.sample() method and return np array for calculation. -`batch` is a dict containing keys to any data type you wish, e.g. {rewards: np.array([...])} -''' -from convlab.lib import logger +# Various math calculations used by algorithms import numpy as np import torch -logger = logger.get_logger(__name__) + +# general math methods + +def normalize(v): + '''Method to normalize a rank-1 np array''' + v_min = v.min() + v_max = v.max() + v_range = v_max - v_min + v_range += 1e-08 # division guard + v_norm = (v - v_min) / v_range + return v_norm + + +def standardize(v): + '''Method to standardize a rank-1 np array''' + assert len(v) > 1, 'Cannot standardize vector of size 1' + v_std = (v - v.mean()) / (v.std() + 1e-08) + return v_std + + +def to_one_hot(data, max_val): + '''Convert an int list of data into one-hot vectors''' + return np.eye(max_val)[np.array(data)] + + +def venv_pack(batch_tensor, num_envs): + '''Apply the reverse of venv_unpack to pack a batch tensor from (b*num_envs, *shape) to (b, num_envs, *shape)''' + shape = list(batch_tensor.shape) + if len(shape) < 2: # scalar data (b, num_envs,) + return batch_tensor.view(-1, num_envs) + else: # non-scalar data (b, num_envs, *shape) + pack_shape = [-1, num_envs] + shape[1:] + return batch_tensor.view(pack_shape) + + +def venv_unpack(batch_tensor): + ''' + Unpack a sampled vec env batch tensor + e.g. for a state with original shape (4, ), vec env should return vec state with shape (num_envs, 4) to store in memory + When sampled with batch_size b, we should get shape (b, num_envs, 4). But we need to unpack the num_envs dimension to get (b * num_envs, 4) for passing to a network. This method does that. 
+ ''' + shape = list(batch_tensor.shape) + if len(shape) < 3: # scalar data (b, num_envs,) + return batch_tensor.view(-1) + else: # non-scalar data (b, num_envs, *shape) + unpack_shape = [-1] + shape[2:] + return batch_tensor.view(unpack_shape) # Policy Gradient calc # advantage functions -def calc_returns(batch, gamma): +def calc_returns(rewards, dones, gamma): ''' - Calculate the simple returns (full rollout) for advantage - i.e. sum discounted rewards up till termination + Calculate the simple returns (full rollout) i.e. sum discounted rewards up till termination ''' - rewards = batch['rewards'] - is_tensor = torch.is_tensor(rewards) - if is_tensor: - assert not torch.isnan(rewards).any() - else: - assert not np.any(np.isnan(rewards)) - # handle epi-end, to not sum past current episode - not_dones = 1 - batch['dones'] T = len(rewards) - if is_tensor: - rets = torch.empty(T, dtype=torch.float32, device=rewards.device) - else: - rets = np.empty(T, dtype='float32') - future_ret = 0.0 + rets = torch.zeros_like(rewards) + future_ret = torch.tensor(0.0, dtype=rewards.dtype) + not_dones = 1 - dones for t in reversed(range(T)): - future_ret = rewards[t] + gamma * future_ret * not_dones[t] - rets[t] = future_ret + rets[t] = future_ret = rewards[t] + gamma * future_ret * not_dones[t] return rets -def calc_gammas(batch, gamma): - '''Calculate the gammas to the right power for multiplication with rewards''' - dones = batch['dones'] - news = torch.cat([torch.ones((1,), device=dones.device), dones[:-1]]) - gammas = torch.empty_like(news) - cur_gamma = 1.0 - for t, new in enumerate(news): - cur_gamma = new * 1.0 + (1 - new) * cur_gamma * gamma - gammas[t] = cur_gamma - return gammas - - -def calc_nstep_returns(batch, gamma, n, next_v_preds): +def calc_nstep_returns(rewards, dones, next_v_pred, gamma, n): ''' - Calculate the n-step returns for advantage - see n-step return in: http://www-anw.cs.umass.edu/~barto/courses/cs687/Chapter%207.pdf - i.e. for each timestep t: - sum discounted rewards up till step n (0 to n-1 that is), - then add v_pred for n as final term + Calculate the n-step returns for advantage. Ref: http://www-anw.cs.umass.edu/~barto/courses/cs687/Chapter%207.pdf + Also see Algorithm S3 from A3C paper https://arxiv.org/pdf/1602.01783.pdf for the calculation used below + R^(n)_t = r_{t} + gamma r_{t+1} + ... + gamma^(n-1) r_{t+n-1} + gamma^(n) V(s_{t+n}) ''' - rets = batch['rewards'].clone() # prevent mutation - next_v_preds = next_v_preds.clone() # prevent mutation - nstep_rets = torch.zeros_like(rets) + rets - cur_gamma = gamma - for i in range(1, n): - # Shift returns by one and zero last element of each episode - rets[:-1] = rets[1:] - rets *= (1 - batch['dones']) - # Also shift V(s_t+1) so final terms use V(s_t+n) - next_v_preds[:-1] = next_v_preds[1:] - next_v_preds *= (1 - batch['dones']) - # Accumulate return - nstep_rets += cur_gamma * rets - # Update current gamma - cur_gamma *= cur_gamma - # Add final terms. Note no next state if epi is done - final_terms = cur_gamma * next_v_preds * (1 - batch['dones']) - nstep_rets += final_terms - return nstep_rets - - -def calc_gaes(rewards, v_preds, next_v_preds, gamma, lam): + rets = torch.zeros_like(rewards) + future_ret = next_v_pred + not_dones = 1 - dones + for t in reversed(range(n)): + rets[t] = future_ret = rewards[t] + gamma * future_ret * not_dones[t] + return rets + + +def calc_gaes(rewards, dones, v_preds, gamma, lam): ''' - Calculate GAE - See http://www.breloff.com/DeepRL-OnlineGAE/ for clear example. 
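# (Illustrative aside, not part of the patch.) A tiny worked call of the new calc_gaes signature,
# with made-up numbers:
#   rewards = torch.tensor([1., 1.]); dones = torch.tensor([0., 1.])
#   v_preds = torch.tensor([0.5, 0.5, 0.0])   # T + 1 values; the last is V of the final next_state
#   with gamma=0.99, lam=0.95:
#     t=1: delta = 1 + 0.99*0.0*0 - 0.5 = 0.5      -> gae_1 = 0.5
#     t=0: delta = 1 + 0.99*0.5*1 - 0.5 = 0.995    -> gae_0 = 0.995 + 0.99*0.95*1*0.5 = 1.46525
#   calc_gaes(rewards, dones, v_preds, gamma=0.99, lam=0.95) -> tensor([1.4653, 0.5000])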
- v_preds are values predicted for current states - next_v_preds are values predicted for next states - NOTE for standardization trick, do it out of here + Calculate GAE from Schulman et al. https://arxiv.org/pdf/1506.02438.pdf + v_preds are values predicted for current states, with one last element as the final next_state + delta is defined as r + gamma * V(s') - V(s) in eqn 10 + GAE is defined in eqn 16 + This method computes in torch tensor to prevent unnecessary moves between devices (e.g. GPU tensor to CPU numpy) + NOTE any standardization is done outside of this method ''' T = len(rewards) - assert not torch.isnan(rewards).any() - assert T == len(v_preds) - gaes = torch.empty(T, dtype=torch.float32, device=v_preds.device) - future_gae = 0.0 + assert T + 1 == len(v_preds) # v_preds includes states and 1 last next_state + gaes = torch.zeros_like(rewards) + future_gae = torch.tensor(0.0, dtype=rewards.dtype) + # to multiply with not_dones to handle episode boundary (last state has no V(s')) + not_dones = 1 - dones for t in reversed(range(T)): - delta = rewards[t] + gamma * next_v_preds[t] - v_preds[t] - gaes[t] = future_gae = delta + gamma * lam * future_gae - assert not torch.isnan(gaes).any(), f'GAE has nan: {gaes}' + delta = rewards[t] + gamma * v_preds[t + 1] * not_dones[t] - v_preds[t] + gaes[t] = future_gae = delta + gamma * lam * not_dones[t] * future_gae return gaes def calc_q_value_logits(state_value, raw_advantages): - mean_adv = raw_advantages.mean(dim=-1).unsqueeze_(dim=-1) + mean_adv = raw_advantages.mean(dim=-1).unsqueeze(dim=-1) return state_value + raw_advantages - mean_adv -def standardize(v): - '''Method to standardize a rank-1 tensor''' - v_std = v.std() - # guard nan std by setting to 0 and add small const - v_std[v_std != v_std] = 0 # nan guard - v_std += 1e-08 # division guard - v = (v - v.mean()) / v_std - return v - - # generic variable decay methods def no_decay(start_val, end_val, start_step, end_step, step): @@ -163,35 +153,3 @@ def periodic_decay(start_val, end_val, start_step, end_step, step, frequency=60. val = end_val * 0.5 * unit * (1 + np.cos(x) * (1 - x / x_freq)) val = max(val, end_val) return val - - -# misc math methods - -def is_outlier(points, thres=3.5): - ''' - Detects outliers using MAD modified_z_score method, generalized to work on points. - From https://stackoverflow.com/a/22357811/3865298 - @example - - is_outlier([1, 1, 1]) - # => array([False, False, False], dtype=bool) - is_outlier([1, 1, 2]) - # => array([False, False, True], dtype=bool) - is_outlier([[1, 1], [1, 1], [1, 2]]) - # => array([False, False, True], dtype=bool) - ''' - points = np.array(points) - if len(points.shape) == 1: - points = points[:, None] - median = np.median(points, axis=0) - diff = np.sum((points - median)**2, axis=-1) - diff = np.sqrt(diff) - med_abs_deviation = np.median(diff) - with np.errstate(divide='ignore', invalid='ignore'): - modified_z_score = 0.6745 * diff / med_abs_deviation - return modified_z_score > thres - - -def to_one_hot(data, max_val): - '''Convert an int list of data into one-hot vectors''' - return np.eye(max_val)[np.array(data)] diff --git a/convlab/lib/optimizer.py b/convlab/lib/optimizer.py new file mode 100644 index 0000000..fecb379 --- /dev/null +++ b/convlab/lib/optimizer.py @@ -0,0 +1,102 @@ +# Custom PyTorch optimizer classes, to be registered in net_util.py +import math +import torch + + +class GlobalAdam(torch.optim.Adam): + ''' + Global Adam algorithm with shared states for Hogwild. 
+ Adapted from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py (MIT) + ''' + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + super().__init__(params, lr, betas, eps, weight_decay) + + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + state['step'] = torch.zeros(1) + state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() + state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() + + def share_memory(self): + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + state['step'].share_memory_() + state['exp_avg'].share_memory_() + state['exp_avg_sq'].share_memory_() + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + state = self.state[p] + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + state['step'] += 1 + if group['weight_decay'] != 0: + grad = grad.add(group['weight_decay'], p.data) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + denom = exp_avg_sq.sqrt().add_(group['eps']) + bias_correction1 = 1 - beta1 ** state['step'].item() + bias_correction2 = 1 - beta2 ** state['step'].item() + step_size = group['lr'] * math.sqrt( + bias_correction2) / bias_correction1 + p.data.addcdiv_(-step_size, exp_avg, denom) + return loss + + +class GlobalRMSprop(torch.optim.RMSprop): + ''' + Global RMSprop algorithm with shared states for Hogwild. + Adapted from https://github.com/jingweiz/pytorch-rl/blob/master/optims/sharedRMSprop.py (MIT) + ''' + + def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0): + super().__init__(params, lr=lr, alpha=alpha, eps=eps, weight_decay=weight_decay, momentum=0, centered=False) + + # State initialisation (must be done before step, else will not be shared between threads) + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + state['step'] = p.data.new().resize_(1).zero_() + state['square_avg'] = p.data.new().resize_as_(p.data).zero_() + + def share_memory(self): + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + state['step'].share_memory_() + state['square_avg'].share_memory_() + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + state = self.state[p] + square_avg = state['square_avg'] + alpha = group['alpha'] + state['step'] += 1 + if group['weight_decay'] != 0: + grad = grad.add(group['weight_decay'], p.data) + + square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad) + avg = square_avg.sqrt().add_(group['eps']) + p.data.addcdiv_(-group['lr'], grad, avg) + return loss diff --git a/convlab/lib/util.py b/convlab/lib/util.py index b8b9a41..56095a4 100644 --- a/convlab/lib/util.py +++ b/convlab/lib/util.py @@ -1,37 +1,37 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. 
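(Illustrative aside, not part of the patch.) A minimal Hogwild-style setup using the shared optimizer added in convlab/lib/optimizer.py above; `net` here is a hypothetical model:

import torch.nn as nn
from convlab.lib.optimizer import GlobalAdam

net = nn.Linear(4, 2)                              # hypothetical model shared across workers
net.share_memory()                                 # share the weights between forked processes
optimizer = GlobalAdam(net.parameters(), lr=1e-3)
optimizer.share_memory()                           # share step/exp_avg/exp_avg_sq states as well
# each worker then computes gradients locally and calls optimizer.step() against the shared state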
- +from collections import deque from contextlib import contextmanager from datetime import datetime from importlib import reload +from pprint import pformat # from convlab import ROOT_DIR, EVAL_MODES -# import cv2 +import cv2 import json import numpy as np import operator import os import pandas as pd -from pathlib import Path +import pickle import pydash as ps import regex as re import subprocess import sys +import time import torch import torch.multiprocessing as mp import ujson import yaml -# import allennlp.common.file_utils.cached_path as allennlp_cached_path +from pathlib import Path from allennlp.common.file_utils import cached_path as allennlp_cached_path -ROOT_DIR = os.path.normpath(os.path.join(os.path.dirname(os.path.dirname(__file__)), '..')) +ROOT_DIR = os.path.normpath(os.path.join(os.path.dirname(__file__), '..')) + EVAL_MODES = ('enjoy', 'eval') TRAIN_MODES = ('search', 'train', 'dev') NUM_CPUS = mp.cpu_count() FILE_TS_FORMAT = '%Y_%m_%d_%H%M%S' RE_FILE_TS = re.compile(r'(\d{4}_\d{2}_\d{2}_\d{6})') -SPACE_PATH = ['agent', 'agent_space', 'aeb_space', 'env_space', 'env'] class LabJsonEncoder(json.JSONEncoder): @@ -46,6 +46,22 @@ def default(self, obj): return str(obj) +def batch_get(arr, idxs): + '''Get multi-idxs from an array depending if it's a python list or np.array''' + if isinstance(arr, (list, deque)): + return np.array(operator.itemgetter(*idxs)(arr)) + else: + return arr[idxs] + + +def calc_srs_mean_std(sr_list): + '''Given a list of series, calculate their mean and std''' + cat_df = pd.DataFrame(dict(enumerate(sr_list))) + mean_sr = cat_df.mean(axis=1) + std_sr = cat_df.std(axis=1) + return mean_sr, std_sr + + def calc_ts_diff(ts2, ts1): ''' Calculate the time from tss ts1 to ts2 @@ -104,21 +120,6 @@ def concat_batches(batches): return concat_batch -def cond_multiget(arr, idxs): - '''Get multi-idxs from an array depending if it's a python list or np.array''' - if isinstance(arr, list): - return np.array(operator.itemgetter(*idxs)(arr)) - else: - return arr[idxs] - - -def count_nonan(arr): - try: - return np.count_nonzero(~np.isnan(arr)) - except Exception: - return len(filter_nonan(arr)) - - def downcast_float32(df): '''Downcast any float64 col to float32 to allow safer pandas comparison''' for col in df.columns: @@ -127,6 +128,14 @@ def downcast_float32(df): return df +def epi_done(done): + ''' + General method to check if episode is done for both single and vectorized env + Only return True for singleton done since vectorized env does not have a natural episode boundary + ''' + return np.isscalar(done) and done + + def find_ckpt(prepath): '''Find the ckpt-lorem-ipsum in a string and return lorem-ipsum''' if 'ckpt' in prepath: @@ -137,6 +146,14 @@ def find_ckpt(prepath): return ckpt +def frame_mod(frame, frequency, num_envs): + ''' + Generic mod for (frame % frequency == 0) for when num_envs is 1 or more, + since frame will increase multiple ticks for vector env, use the remainder''' + remainder = num_envs or 1 + return (frame % frequency < remainder) + + def flatten_dict(obj, delim='.'): '''Missing pydash method to flatten dict''' nobj = {} @@ -155,48 +172,6 @@ def flatten_dict(obj, delim='.'): return nobj -def filter_nonan(arr): - '''Filter to np array with no nan''' - try: - return arr[~np.isnan(arr)] - except Exception: - mixed_type = [] - for v in arr: - if not gen_isnan(v): - mixed_type.append(v) - return np.array(mixed_type, dtype=arr.dtype) - - -def fix_multi_index_dtype(df): - '''Restore aeb multi_index dtype from string to int, when read from file''' - 
df.columns = pd.MultiIndex.from_tuples([(int(x[0]), int(x[1]), int(x[2]), x[3]) for x in df.columns]) - return df - - -def nanflatten(arr): - '''Flatten np array while ignoring nan, like np.nansum etc.''' - flat_arr = arr.reshape(-1) - return filter_nonan(flat_arr) - - -def gen_isnan(v): - '''Check isnan for general type (np.isnan is only operable on np type)''' - try: - return np.isnan(v).all() - except Exception: - return v is None - - -def get_df_aeb_list(session_df): - '''Get the aeb list for session_df for iterating.''' - aeb_list = sorted(ps.uniq([(a, e, b) for a, e, b, col in session_df.columns.tolist()])) - return aeb_list - - -def get_aeb_shape(aeb_list): - return np.amax(aeb_list, axis=0) + 1 - - def get_class_name(obj, lower=False): '''Get the class name of an object''' class_name = obj.__class__.__name__ @@ -239,19 +214,20 @@ def get_lab_mode(): return os.environ.get('lab_mode') -def get_prepath(spec, info_space, unit='experiment'): +def get_prepath(spec, unit='experiment'): spec_name = spec['name'] - predir = f'output/{spec_name}_{info_space.experiment_ts}' + meta_spec = spec['meta'] + predir = f'output/{spec_name}_{meta_spec["experiment_ts"]}' prename = f'{spec_name}' - trial_index = info_space.get('trial') - session_index = info_space.get('session') + trial_index = meta_spec['trial'] + session_index = meta_spec['session'] t_str = '' if trial_index is None else f'_t{trial_index}' s_str = '' if session_index is None else f'_s{session_index}' if unit == 'trial': prename += t_str elif unit == 'session': prename += f'{t_str}{s_str}' - ckpt = ps.get(info_space, 'ckpt') + ckpt = meta_spec['ckpt'] if ckpt is not None: prename += f'_ckpt-{ckpt}' prepath = f'{predir}/{prename}' @@ -274,14 +250,12 @@ def get_ts(pattern=FILE_TS_FORMAT): return ts -def guard_data_a(cls, data_a, data_name): - '''Guard data_a in case if it scalar, create a data_a and fill.''' - if np.isscalar(data_a): - new_data_a, = s_get(cls, 'aeb_space').init_data_s([data_name], a=cls.a) - for eb, body in ndenumerate_nonan(cls.body_a): - new_data_a[eb] = data_a - data_a = new_data_a - return data_a +def insert_folder(prepath, folder): + '''Insert a folder into prepath''' + split_path = prepath.split('/') + prename = split_path.pop() + split_path += [folder, prename] + return '/'.join(split_path) def in_eval_lab_modes(): @@ -305,7 +279,11 @@ def ctx_lab_mode(lab_mode): Creates context to run method with a specific lab_mode @example with util.ctx_lab_mode('eval'): - run_eval() + foo() + + @util.ctx_lab_mode('eval') + def foo(): + ... ''' prev_lab_mode = os.environ.get('lab_mode') os.environ['lab_mode'] = lab_mode @@ -323,24 +301,14 @@ def monkey_patch(base_cls, extend_cls): setattr(base_cls, fn, getattr(extend_cls, fn)) -def ndenumerate_nonan(arr): - '''Generic ndenumerate for np.ndenumerate with only not gen_isnan values''' - return (idx_v for idx_v in np.ndenumerate(arr) if not gen_isnan(idx_v[1])) - - -def nonan_all(v): - '''Generic np.all that also returns false if array is all np.nan''' - return bool(np.all(v) and ~np.all(np.isnan(v))) - - -def parallelize_fn(fn, args, num_cpus=NUM_CPUS): +def parallelize(fn, args, num_cpus=NUM_CPUS): ''' Parallelize a method fn, args and return results with order preserved per args. - fn should take only a single arg. + args should be a list of tuples. @returns {list} results Order preserved output from fn. 
''' pool = mp.Pool(num_cpus, maxtasksperchild=1) - results = pool.map(fn, args) + results = pool.starmap(fn, args) pool.close() pool.join() return results @@ -363,7 +331,7 @@ def prepath_split(prepath): if ckpt is not None: # separate ckpt tail = tail.replace(f'_ckpt-{ckpt}', '') if '/' in tail: # tail = prefolder/prename - prefolder, prename = tail.split('/') + prefolder, prename = tail.split('/', 1) else: prefolder, prename = tail, None predir = f'output/{prefolder}' @@ -392,43 +360,28 @@ def prepath_to_idxs(prepath): def prepath_to_spec(prepath): - '''Create spec from prepath such that it returns the same prepath with info_space''' - predir, _, prename, _, _, _ = prepath_split(prepath) + ''' + Given a prepath, read the correct spec recover the meta_spec that will return the same prepath for eval lab modes + example: output/a2c_cartpole_2018_06_13_220436/a2c_cartpole_t0_s0 + ''' + predir, _, prename, _, experiment_ts, ckpt = prepath_split(prepath) sidx_res = re.search('_s\d+', prename) if sidx_res: # replace the _s0 if any prename = prename.replace(sidx_res[0], '') spec_path = f'{predir}/{prename}_spec.json' # read the spec of prepath spec = read(spec_path) - return spec - - -def prepath_to_info_space(prepath): - '''Create info_space from prepath such that it returns the same prepath with spec''' - from convlab.experiment.monitor import InfoSpace - _, _, _, _, experiment_ts, ckpt = prepath_split(prepath) + # recover meta_spec trial_index, session_index = prepath_to_idxs(prepath) - # create info_space for prepath - info_space = InfoSpace() - info_space.experiment_ts = experiment_ts - info_space.ckpt = ckpt - info_space.set('experiment', 0) - info_space.set('trial', trial_index) - info_space.set('session', session_index) - return info_space - - -def prepath_to_spec_info_space(prepath): - ''' - Given a prepath, read the correct spec and craete the info_space that will return the same prepath - This is used for lab_mode: enjoy - example: output/a2c_cartpole_2018_06_13_220436/a2c_cartpole_t0_s0 - ''' - spec = prepath_to_spec(prepath) - info_space = prepath_to_info_space(prepath) - check_prepath = get_prepath(spec, info_space, unit='session') + meta_spec = spec['meta'] + meta_spec['experiment_ts'] = experiment_ts + meta_spec['ckpt'] = ckpt + meta_spec['experiment'] = 0 + meta_spec['trial'] = trial_index + meta_spec['session'] = session_index + check_prepath = get_prepath(spec, unit='session') assert check_prepath in prepath, f'{check_prepath}, {prepath}' - return spec, info_space + return spec def read(data_path, **kwargs): @@ -463,6 +416,8 @@ def read(data_path, **kwargs): ext = get_file_ext(data_path) if ext == '.csv': data = read_as_df(data_path, **kwargs) + elif ext == '.pkl': + data = read_as_pickle(data_path, **kwargs) else: data = read_as_plain(data_path, **kwargs) return data @@ -475,6 +430,13 @@ def read_as_df(data_path, **kwargs): return data +def read_as_pickle(data_path, **kwargs): + '''Submethod to read data as pickle''' + with open(data_path, 'rb') as f: + data = pickle.load(f) + return data + + def read_as_plain(data_path, **kwargs): '''Submethod to read data as plain type''' open_file = open(data_path, 'r') @@ -482,7 +444,7 @@ def read_as_plain(data_path, **kwargs): if ext == '.json': data = ujson.load(open_file, **kwargs) elif ext == '.yml': - data = yaml.safe_load(open_file, **kwargs) + data = yaml.load(open_file, **kwargs) else: data = open_file.read() open_file.close() @@ -492,7 +454,7 @@ def read_as_plain(data_path, **kwargs): def run_cmd(cmd): '''Run shell command''' 
print(f'+ {cmd}') - proc = subprocess.Popen(cmd, cwd=ROOT_DIR, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) + proc = subprocess.Popen(cmd, cwd=ROOT_DIR, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) return proc @@ -507,32 +469,6 @@ def run_cmd_wait(proc): return output -def s_get(cls, attr_path): - ''' - Method to get attribute across space via inferring agent <-> env paths. - @example - self.agent.agent_space.aeb_space.clock - # equivalently - util.s_get(self, 'aeb_space.clock') - ''' - from_class_name = get_class_name(cls, lower=True) - from_idx = ps.find_index(SPACE_PATH, lambda s: from_class_name in (s, s.replace('_', ''))) - from_idx = max(from_idx, 0) - attr_path = attr_path.split('.') - to_idx = SPACE_PATH.index(attr_path[0]) - assert -1 not in (from_idx, to_idx) - if from_idx < to_idx: - path_link = SPACE_PATH[from_idx: to_idx] - else: - path_link = ps.reverse(SPACE_PATH[to_idx: from_idx]) - - res = cls - for attr in path_link + attr_path: - if not (get_class_name(res, lower=True) in (attr, attr.replace('_', ''))): - res = getattr(res, attr) - return res - - def self_desc(cls): '''Method to get self description, used at init.''' desc_list = [f'{get_class_name(cls)}:'] @@ -540,7 +476,7 @@ def self_desc(cls): if k == 'spec': desc_v = v['name'] elif ps.is_dict(v) or ps.is_dict(ps.head(v)): - desc_v = to_json(v) + desc_v = pformat(v) else: desc_v = v desc_list.append(f'- {k} = {desc_v}') @@ -548,24 +484,6 @@ def self_desc(cls): return desc -def session_df_to_data(session_df): - ''' - Convert a multi_index session_df (df) with column levels (a,e,b,col) to session_data[aeb] = aeb_df - @example - - session_df = util.read(filepath, header=[0, 1, 2, 3]) - session_data = util.session_df_to_data(session_df) - ''' - session_data = {} - fix_multi_index_dtype(session_df) - aeb_list = get_df_aeb_list(session_df) - for aeb in aeb_list: - aeb_df = session_df.loc[:, aeb] - aeb_df.reset_index(inplace=True, drop=True) # guard for eval append-row - session_data[aeb] = aeb_df - return session_data - - def set_attr(obj, attr_dict, keys=None): '''Set attribute of an object from a dict''' if keys is not None: @@ -575,25 +493,46 @@ def set_attr(obj, attr_dict, keys=None): return obj -def set_rand_seed(random_seed, env_space): - '''Set all the module random seeds''' - torch.cuda.manual_seed_all(random_seed) - torch.manual_seed(random_seed) - np.random.seed(random_seed) - envs = env_space.envs if hasattr(env_space, 'envs') else [env_space] - for env in envs: - try: - env.u_env.seed(random_seed) - except Exception as e: - pass +def set_cuda_id(spec): + '''Use trial and session id to hash and modulo cuda device count for a cuda_id to maximize device usage. Sets the net_spec for the base Net class to pick up.''' + # Don't trigger any cuda call if not using GPU. Otherwise will break multiprocessing on machines with CUDA. 
+ # see issues https://github.com/pytorch/pytorch/issues/334 https://github.com/pytorch/pytorch/issues/3491 https://github.com/pytorch/pytorch/issues/9996 + for agent_spec in spec['agent']: + if 'net' not in agent_spec or not agent_spec['net'].get('gpu'): + return + meta_spec = spec['meta'] + trial_idx = meta_spec['trial'] or 0 + session_idx = meta_spec['session'] or 0 + if meta_spec['distributed'] == 'shared': # shared hogwild uses only global networks, offset them to idx 0 + session_idx = 0 + job_idx = trial_idx * meta_spec['max_session'] + session_idx + job_idx += meta_spec['cuda_offset'] + device_count = torch.cuda.device_count() + cuda_id = None if not device_count else job_idx % device_count + + for agent_spec in spec['agent']: + agent_spec['net']['cuda_id'] = cuda_id -def set_logger(spec, info_space, logger, unit=None): - '''Set the logger for a lab unit give its spec and info_space''' - os.environ['PREPATH'] = get_prepath(spec, info_space, unit=unit) +def set_logger(spec, logger, unit=None): + '''Set the logger for a lab unit give its spec''' + os.environ['LOG_PREPATH'] = insert_folder(get_prepath(spec, unit=unit), 'log') reload(logger) # to set session-specific logger +def set_random_seed(spec): + '''Generate and set random seed for relevant modules, and record it in spec.meta.random_seed''' + torch.set_num_threads(1) # prevent multithread slowdown, set again for hogwild + trial = spec['meta']['trial'] + session = spec['meta']['session'] + random_seed = int(1e5 * (trial or 0) + 1e3 * (session or 0) + time.time()) + torch.cuda.manual_seed_all(random_seed) + torch.manual_seed(random_seed) + np.random.seed(random_seed) + spec['meta']['random_seed'] = random_seed + return random_seed + + def _sizeof(obj, seen=None): '''Recursively finds size of objects''' size = sys.getsizeof(obj) @@ -645,6 +584,21 @@ def smart_path(data_path, as_dir=False): return os.path.normpath(data_path) +def split_minibatch(batch, mb_size): + '''Split a batch into minibatches of mb_size or smaller, without replacement''' + size = len(batch['rewards']) + assert mb_size < size, f'Minibatch size {mb_size} must be < batch size {size}' + idxs = np.arange(size) + np.random.shuffle(idxs) + chunks = int(size / mb_size) + nested_idxs = np.array_split(idxs, chunks) + mini_batches = [] + for minibatch_idxs in nested_idxs: + minibatch = {k: v[minibatch_idxs] for k, v in batch.items()} + mini_batches.append(minibatch) + return mini_batches + + def to_json(d, indent=2): '''Shorthand method for stringify JSON with indent''' return json.dumps(d, indent=indent, cls=LabJsonEncoder) @@ -661,31 +615,10 @@ def to_torch_batch(batch, device, is_episodic): batch[k] = np.concatenate(batch[k]) elif ps.is_list(batch[k]): batch[k] = np.array(batch[k]) - batch[k] = torch.from_numpy(batch[k].astype('float32')).to(device) + batch[k] = torch.from_numpy(batch[k].astype(np.float32)).to(device) return batch -def try_set_cuda_id(spec, info_space): - '''Use trial and session id to hash and modulo cuda device count for a cuda_id to maximize device usage. Sets the net_spec for the base Net class to pick up.''' - # Don't trigger any cuda call if not using GPU. Otherwise will break multiprocessing on machines with CUDA. 
- # see issues https://github.com/pytorch/pytorch/issues/334 https://github.com/pytorch/pytorch/issues/3491 https://github.com/pytorch/pytorch/issues/9996 - for agent_spec in spec['agent']: - if not agent_spec.get('net') or not agent_spec['net'].get('gpu'): - return - trial_idx = info_space.get('trial') or 0 - session_idx = info_space.get('session') or 0 - job_idx = trial_idx * spec['meta']['max_session'] + session_idx - job_idx += int(os.environ.get('CUDA_ID_OFFSET', 0)) - device_count = torch.cuda.device_count() - if device_count == 0: - cuda_id = None - else: - cuda_id = job_idx % device_count - - for agent_spec in spec['agent']: - agent_spec['net']['cuda_id'] = cuda_id - - def write(data, data_path): ''' Universal data writing method with smart data parsing @@ -713,6 +646,8 @@ def write(data, data_path): ext = get_file_ext(data_path) if ext == '.csv': write_as_df(data, data_path) + elif ext == '.pkl': + write_as_pickle(data, data_path) else: write_as_plain(data, data_path) return data_path @@ -722,7 +657,14 @@ def write_as_df(data, data_path): '''Submethod to write data as DataFrame''' df = cast_df(data) ext = get_file_ext(data_path) - df.to_csv(data_path) + df.to_csv(data_path, index=False) + return data_path + + +def write_as_pickle(data, data_path): + '''Submethod to write data as pickle''' + with open(data_path, 'wb') as f: + pickle.dump(data, f) return data_path @@ -740,7 +682,26 @@ def write_as_plain(data, data_path): return data_path -# Atari image transformation +# Atari image preprocessing + + +def to_opencv_image(im): + '''Convert to OpenCV image shape h,w,c''' + shape = im.shape + if len(shape) == 3 and shape[0] < shape[-1]: + return im.transpose(1, 2, 0) + else: + return im + + +def to_pytorch_image(im): + '''Convert to PyTorch image shape c,h,w''' + shape = im.shape + if len(shape) == 3 and shape[-1] < shape[0]: + return im.transpose(2, 0, 1) + else: + return im + def grayscale_image(im): return cv2.cvtColor(im, cv2.COLOR_RGB2GRAY) @@ -750,64 +711,42 @@ def resize_image(im, w_h): return cv2.resize(im, w_h, interpolation=cv2.INTER_AREA) -def crop_image(im): - '''Crop away the unused top-bottom game borders of Atari''' - return im[18:102, :] - - def normalize_image(im): '''Normalizing image by dividing max value 255''' # NOTE: beware in its application, may cause loss to be 255 times lower due to smaller input values return np.divide(im, 255.0) -def nature_transform_image(im): - ''' - Image preprocessing from the paper "Playing Atari with Deep Reinforcement Learning, 2013, Mnih et al" - Takes an RGB image and converts it to grayscale, downsizes to 110 x 84 and crops to square 84 x 84 without the game border - ''' - im = grayscale_image(im) - im = resize_image(im, (84, 110)) - im = crop_image(im) - return im - - -def openai_transform_image(im): +def preprocess_image(im): ''' - Image transformation using OpenAI's baselines method: greyscale, resize - Instead of cropping as done in nature_transform_image(), this resizes and stretches the image. 
+ Image preprocessing using OpenAI Baselines method: grayscale, resize + This resize uses stretching instead of cropping ''' + im = to_opencv_image(im) im = grayscale_image(im) im = resize_image(im, (84, 84)) + im = np.expand_dims(im, 0) return im -def transform_image(im, method='openai'): - '''Apply image transformation using nature or openai method''' - if method == 'nature': - return nature_transform_image(im) - elif method == 'openai': - return openai_transform_image(im) - else: - raise ValueError('method must be one of: nature, openai') - - def debug_image(im): - '''Use this method to render image the agent sees; waits for a key press before continuing''' - cv2.imshow('image', im) + ''' + Renders an image for debugging; pauses process until key press + Handles tensor/numpy and conventions among libraries + ''' + if torch.is_tensor(im): # if PyTorch tensor, get numpy + im = im.cpu().numpy() + im = to_opencv_image(im) + im = im.astype(np.uint8) # typecast guard + if im.shape[0] == 3: # RGB image + # accommodate from RGB (numpy) to BGR (cv2) + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + cv2.imshow('debug image', im) cv2.waitKey(0) -def mpl_debug_image(im): - '''Uses matplotlib to plot image with bigger size, axes, and false color on greyscaled images''' - import matplotlib.pyplot as plt - plt.figure() - plt.imshow(im) - plt.show() - - def cached_path(file_path, cached_dir=None): if not cached_dir: cached_dir = str(Path(Path.home() / '.convlab') / "cache") - return allennlp_cached_path(file_path, cached_dir) + return allennlp_cached_path(file_path, cached_dir) \ No newline at end of file diff --git a/convlab/lib/viz.py b/convlab/lib/viz.py index 4874591..815c6f8 100644 --- a/convlab/lib/viz.py +++ b/convlab/lib/viz.py @@ -1,34 +1,21 @@ -# Modified by Microsoft Corporation. -# Licensed under the MIT license. - -''' -The data visualization module -TODO pie, swarm, box plots -''' -from plotly import ( - graph_objs as go, - offline as py, - tools, -) +# The data visualization module +# Defines plotting methods for analysis +from plotly import graph_objs as go, io as pio, tools +from plotly.offline import init_notebook_mode, iplot from convlab.lib import logger, util import colorlover as cl import os -import plotly -import plotly.io as pio import pydash as ps -import sys +logger = logger.get_logger(__name__) -PLOT_FILEDIR = util.smart_path('data') -os.makedirs(PLOT_FILEDIR, exist_ok=True) +# warn orca failure only once +orca_warn_once = ps.once(lambda e: logger.warning(f'Failed to generate graph. 
Run retro-analysis to generate graphs later.')) if util.is_jupyter(): - py.init_notebook_mode(connected=True) -logger = logger.get_logger(__name__) + init_notebook_mode(connected=True) -def create_label( - y_col, x_col, - title=None, y_title=None, x_title=None, legend_name=None): +def create_label(y_col, x_col, title=None, y_title=None, x_title=None, legend_name=None): '''Create label dict for go.Layout with smart resolution''' legend_name = legend_name or y_col y_col_list, x_col_list, legend_name_list = ps.map_( @@ -48,9 +35,7 @@ def create_label( return label -def create_layout( - title, y_title, x_title, x_type=None, - width=500, height=350, layout_kwargs=None): +def create_layout(title, y_title, x_title, x_type=None, width=500, height=500, layout_kwargs=None): '''simplified method to generate Layout''' layout = go.Layout( title=title, @@ -64,12 +49,12 @@ def create_layout( return layout -def get_palette(aeb_count): - '''Get the suitable palette to plot for some number of aeb graphs, where each aeb is a color.''' - if aeb_count <= 8: - palette = cl.scales[str(max(3, aeb_count))]['qual']['Set2'] +def get_palette(size): + '''Get the suitable palette of a certain size''' + if size <= 8: + palette = cl.scales[str(max(3, size))]['qual']['Set2'] else: - palette = cl.interp(cl.scales['8']['qual']['Set2'], aeb_count) + palette = cl.interp(cl.scales['8']['qual']['Set2'], size) return palette @@ -79,162 +64,168 @@ def lower_opacity(rgb, opacity): def plot(*args, **kwargs): if util.is_jupyter(): - return py.iplot(*args, **kwargs) - else: - kwargs.update({'auto_open': ps.get(kwargs, 'auto_open', False)}) - return py.plot(*args, **kwargs) + return iplot(*args, **kwargs) -def plot_go( - df, y_col=None, x_col='index', y2_col=None, - title=None, y_title=None, x_title=None, x_type=None, - legend_name=None, width=500, height=350, draw=True, - save=False, filename=None, - trace_class='Scatter', trace_kwargs=None, layout_kwargs=None): - ''' - Quickly plot from df using trace_class, e.g. go.Scatter - 1. create_label() to auto-resolve labels - 2. create_layout() with go.Layout() and update(layout_kwargs) - 3. spread and create go.() and update(trace_kwargs) - 4. 
Create the figure and plot accordingly - @returns figure - ''' - df = df.copy() - if x_col == 'index': - df['index'] = df.index.tolist() - - label = create_label(y_col, x_col, title, y_title, x_title, legend_name) - layout = create_layout( - x_type=x_type, width=width, height=height, layout_kwargs=layout_kwargs, - **ps.pick(label, ['title', 'y_title', 'x_title'])) - y_col_list, x_col_list = label['y_col_list'], label['x_col_list'] - - if y2_col is not None: - label2 = create_label(y2_col, x_col, title, y_title, x_title, legend_name) - layout.update(dict(yaxis2=dict( - rangemode='tozero', title=label2['y_title'], - side='right', overlaying='y1', anchor='x1', - ))) - y2_col_list, x_col_list = label2['y_col_list'], label2['x_col_list'] - label2_legend_name_list = label2['legend_name_list'] - else: - y2_col_list = [] - label2_legend_name_list = [] - - combo_y_col_list = y_col_list + y2_col_list - combo_legend_name_list = label['legend_name_list'] + label2_legend_name_list - y_col_num, x_col_num = len(combo_y_col_list), len(x_col_list) - trace_num = max(y_col_num, x_col_num) - data = [] - for idx in range(trace_num): - y_c = ps.get(combo_y_col_list, idx % y_col_num) - x_c = ps.get(x_col_list, idx % x_col_num) - df_y, df_x = ps.get(df, y_c), ps.get(df, x_c) - trace = ps.get(go, trace_class)(y=df_y, x=df_x, name=combo_legend_name_list[idx]) - trace.update(trace_kwargs) - if idx >= len(y_col_list): - trace.update(dict(yaxis='y2', xaxis='x1')) - data.append(trace) - - figure = go.Figure(data=data, layout=layout) - if draw: - plot(figure) - if save: - save_image(figure, filename=filename) - return figure - - -def plot_area( - *args, fill='tonexty', stack=False, - trace_kwargs=None, layout_kwargs=None, - **kwargs): - '''Plot area from df''' - if stack: - df, y_col = args[:2] - stack_df = stack_cumsum(df, y_col) - args = (stack_df,) + args[1:] - trace_kwargs = ps.merge(dict(fill=fill, mode='lines', line=dict(width=1)), trace_kwargs) - layout_kwargs = ps.merge(dict(), layout_kwargs) - return plot_go( - *args, trace_class='Scatter', - trace_kwargs=trace_kwargs, layout_kwargs=layout_kwargs, - **kwargs) - - -def plot_bar( - *args, barmode='stack', orientation='v', - trace_kwargs=None, layout_kwargs=None, - **kwargs): - '''Plot bar chart from df''' - trace_kwargs = ps.merge(dict(orientation=orientation), trace_kwargs) - layout_kwargs = ps.merge(dict(barmode=barmode), layout_kwargs) - return plot_go( - *args, trace_class='Bar', - trace_kwargs=trace_kwargs, layout_kwargs=layout_kwargs, - **kwargs) - - -def plot_line( - *args, - trace_kwargs=None, layout_kwargs=None, - **kwargs): - '''Plot line from df''' - trace_kwargs = ps.merge(dict(mode='lines', line=dict(width=1)), trace_kwargs) - layout_kwargs = ps.merge(dict(), layout_kwargs) - return plot_go( - *args, trace_class='Scatter', - trace_kwargs=trace_kwargs, layout_kwargs=layout_kwargs, - **kwargs) - - -def plot_scatter( - *args, - trace_kwargs=None, layout_kwargs=None, - **kwargs): - '''Plot scatter from df''' - trace_kwargs = ps.merge(dict(mode='markers'), trace_kwargs) - layout_kwargs = ps.merge(dict(), layout_kwargs) - return plot_go( - *args, trace_class='Scatter', - trace_kwargs=trace_kwargs, layout_kwargs=layout_kwargs, - **kwargs) - - -def plot_histogram( - *args, barmode='overlay', xbins=None, histnorm='count', orientation='v', - trace_kwargs=None, layout_kwargs=None, - **kwargs): - '''Plot histogram from df''' - trace_kwargs = ps.merge(dict(orientation=orientation, xbins={}, histnorm=histnorm), trace_kwargs) - layout_kwargs = 
ps.merge(dict(barmode=barmode), layout_kwargs) - return plot_go( - *args, trace_class='Histogram', - trace_kwargs=trace_kwargs, layout_kwargs=layout_kwargs, - **kwargs) - - -def save_image(figure, filepath=None): +def plot_sr(sr, time_sr, title, y_title, x_title): + '''Plot a series''' + x = time_sr.tolist() + color = get_palette(1)[0] + main_trace = go.Scatter( + x=x, y=sr, mode='lines', showlegend=False, + line={'color': color, 'width': 1}, + ) + data = [main_trace] + layout = create_layout(title=title, y_title=y_title, x_title=x_title) + fig = go.Figure(data, layout) + plot(fig) + return fig + + +def plot_mean_sr(sr_list, time_sr, title, y_title, x_title): + '''Plot a list of series using its mean, with error bar using std''' + mean_sr, std_sr = util.calc_srs_mean_std(sr_list) + max_sr = mean_sr + std_sr + min_sr = mean_sr - std_sr + max_y = max_sr.tolist() + min_y = min_sr.tolist() + x = time_sr.tolist() + color = get_palette(1)[0] + main_trace = go.Scatter( + x=x, y=mean_sr, mode='lines', showlegend=False, + line={'color': color, 'width': 1}, + ) + envelope_trace = go.Scatter( + x=x + x[::-1], y=max_y + min_y[::-1], showlegend=False, + line={'color': 'rgba(0, 0, 0, 0)'}, + fill='tozerox', fillcolor=lower_opacity(color, 0.2), + ) + data = [main_trace, envelope_trace] + layout = create_layout(title=title, y_title=y_title, x_title=x_title) + fig = go.Figure(data, layout) + return fig + + +def save_image(figure, filepath): if os.environ['PY_ENV'] == 'test': return - if filepath is None: - filepath = f'{PLOT_FILEDIR}/{ps.get(figure, "layout.title")}.png' filepath = util.smart_path(filepath) try: pio.write_image(figure, filepath) - logger.info(f'Graph saved to {filepath}') except Exception as e: - logger.warn( - f'{e}\nFailed to generate graph. Fix the issue and run retro-analysis to generate graphs.') - - -def stack_cumsum(df, y_col): - '''Submethod to cumsum over y columns for stacked area plot''' - y_col_list = util.cast_list(y_col) - stack_df = df.copy() - for idx in range(len(y_col_list)): - col = y_col_list[idx] - presum_idx = idx - 1 - if presum_idx > -1: - presum_col = y_col_list[presum_idx] - stack_df[col] += stack_df[presum_col] - return stack_df + orca_warn_once(e) + + +# analysis plot methods + +def plot_session(session_spec, session_metrics, session_df, df_mode='eval'): + ''' + Plot the session graphs: + - mean_returns, strengths, sample_efficiencies, training_efficiencies, stabilities (with error bar) + - additional plots from session_df: losses, exploration variable, entropy + ''' + meta_spec = session_spec['meta'] + prepath = meta_spec['prepath'] + graph_prepath = meta_spec['graph_prepath'] + title = f'session graph: {session_spec["name"]} t{meta_spec["trial"]} s{meta_spec["session"]}' + + local_metrics = session_metrics['local'] + name_time_pairs = [ + ('mean_returns', 'frames'), + # ('strengths', 'frames'), + # ('sample_efficiencies', 'frames'), + # ('training_efficiencies', 'opt_steps'), + # ('stabilities', 'frames') + ] + for name, time in name_time_pairs: + fig = plot_sr( + local_metrics[name], local_metrics[time], title, name, time) + save_image(fig, f'{graph_prepath}_session_graph_{df_mode}_{name}_vs_{time}.png') + if name in ('mean_returns',): # save important graphs in prepath directly + save_image(fig, f'{prepath}_session_graph_{df_mode}_{name}_vs_{time}.png') + + if df_mode == 'eval': + return + # training plots from session_df + name_time_pairs = [ + ('loss', 'frame'), + ('explore_var', 'frame'), + ('entropy', 'frame'), + ] + for name, time in name_time_pairs: + 
fig = plot_sr( + session_df[name], session_df[time], title, name, time) + save_image(fig, f'{graph_prepath}_session_graph_{df_mode}_{name}_vs_{time}.png') + + +def plot_trial(trial_spec, trial_metrics): + ''' + Plot the trial graphs: + - mean_returns, strengths, sample_efficiencies, training_efficiencies, stabilities (with error bar) + - consistencies (no error bar) + ''' + meta_spec = trial_spec['meta'] + prepath = meta_spec['prepath'] + graph_prepath = meta_spec['graph_prepath'] + title = f'trial graph: {trial_spec["name"]} t{meta_spec["trial"]} {meta_spec["max_session"]} sessions' + + local_metrics = trial_metrics['local'] + name_time_pairs = [ + ('mean_returns', 'frames'), + # ('strengths', 'frames'), + # ('sample_efficiencies', 'frames'), + # ('training_efficiencies', 'opt_steps'), + # ('stabilities', 'frames'), + # ('consistencies', 'frames'), + ] + for name, time in name_time_pairs: + if name == 'consistencies': + fig = plot_sr( + local_metrics[name], local_metrics[time], title, name, time) + else: + fig = plot_mean_sr( + local_metrics[name], local_metrics[time], title, name, time) + save_image(fig, f'{graph_prepath}_trial_graph_{name}_vs_{time}.png') + if name in ('mean_returns',): # save important graphs in prepath directly + save_image(fig, f'{prepath}_trial_graph_{name}_vs_{time}.png') + + +def plot_experiment(experiment_spec, experiment_df, metrics_cols): + ''' + Plot the metrics vs. specs parameters of an experiment, where each point is a trial. + ref colors: https://plot.ly/python/heatmaps-contours-and-2dhistograms-tutorial/#plotlys-predefined-color-scales + ''' + y_cols = metrics_cols + x_cols = ps.difference(experiment_df.columns.tolist(), y_cols) + fig = tools.make_subplots(rows=len(y_cols), cols=len(x_cols), shared_xaxes=True, shared_yaxes=True, print_grid=False) + strength_sr = experiment_df['strength'] + min_strength = strength_sr.values.min() + max_strength = strength_sr.values.max() + for row_idx, y in enumerate(y_cols): + for col_idx, x in enumerate(x_cols): + x_sr = experiment_df[x] + guard_cat_x = x_sr.astype(str) if x_sr.dtype == 'object' else x_sr + trace = go.Scatter( + y=experiment_df[y], yaxis=f'y{row_idx+1}', + x=guard_cat_x, xaxis=f'x{col_idx+1}', + showlegend=False, mode='markers', + marker={ + 'symbol': 'circle-open-dot', 'color': experiment_df['strength'], 'opacity': 0.5, + # dump first quarter of colorscale that is too bright + 'cmin': min_strength - 0.50 * (max_strength - min_strength), 'cmax': max_strength, + 'colorscale': 'YlGnBu', 'reversescale': True + }, + ) + fig.add_trace(trace, row_idx + 1, col_idx + 1) + fig.layout[f'xaxis{col_idx+1}'].update(title='
'.join(ps.chunk(x, 20)), zerolinewidth=1, categoryarray=sorted(guard_cat_x.unique())) + fig.layout[f'yaxis{row_idx+1}'].update(title=y, rangemode='tozero') + fig.layout.update( + title=f'experiment graph: {experiment_spec["name"]}', + width=100 + 300 * len(x_cols), height=200 + 300 * len(y_cols)) + plot(fig) + graph_prepath = experiment_spec['meta']['graph_prepath'] + save_image(fig, f'{graph_prepath}_experiment_graph.png') + # save important graphs in prepath directly + prepath = experiment_spec['meta']['prepath'] + save_image(fig, f'{prepath}_experiment_graph.png') + return fig diff --git a/convlab/modules/policy/system/multiwoz/rule_based_multiwoz_bot.py b/convlab/modules/policy/system/multiwoz/rule_based_multiwoz_bot.py index ead0179..878920d 100644 --- a/convlab/modules/policy/system/multiwoz/rule_based_multiwoz_bot.py +++ b/convlab/modules/policy/system/multiwoz/rule_based_multiwoz_bot.py @@ -64,7 +64,7 @@ def predict(self, state): DA = {} - if(len(state['user_action']) > 0): + if 'user_action' in state and (len(state['user_action']) > 0): user_action = state['user_action'] else: user_action = check_diff(self.last_state, state) diff --git a/convlab/modules/policy/user/multiwoz/policy_agenda_multiwoz.py b/convlab/modules/policy/user/multiwoz/policy_agenda_multiwoz.py index a269b73..bdce9ce 100644 --- a/convlab/modules/policy/user/multiwoz/policy_agenda_multiwoz.py +++ b/convlab/modules/policy/user/multiwoz/policy_agenda_multiwoz.py @@ -84,6 +84,10 @@ def predict(self, state, sys_action): """ self.__turn += 2 + # At the beginning of a dialog when there is no NLU. + if sys_action == "null": + sys_action = {} + if self.__turn > self.max_turn: self.agenda.close_session() else: diff --git a/convlab/spec/demo.json b/convlab/spec/demo.json index 8a5d59e..4352a0b 100644 --- a/convlab/spec/demo.json +++ b/convlab/spec/demo.json @@ -43,7 +43,7 @@ "is_user": true }, "max_t": 40, - "max_tick": 1000 + "max_frame": 1000 }], "body": { "product": "outer", @@ -52,6 +52,70 @@ "meta": { "distributed": false, "eval_frequency": 1000, + "num_eval": 100, + "max_tick_unit": "total_t", + "max_trial": 1, + "max_session": 1, + "resources": { + "num_cpus": 1, + "num_gpus": 0 + } + } + }, + "milu_rule_rule_template": { + "agent": [{ + "name": "DialogAgent", + "nlu": { + "name": "MILU", + "model_file": "https://convlab.blob.core.windows.net/models/milu.tar.gz" + }, + "dst": { + "name": "RuleDST" + }, + "nlg": { + "name": "MultiwozTemplateNLG", + "is_user": false + }, + "algorithm": { + "name": "ExternalPolicy", + "policy": { + "name": "RuleBasedMultiwozBot" + }, + "action_pdtype": "Argmax", + "action_policy": "epsilon_greedy" + }, + "memory": { + "name": "Replay", + "max_size": 1 + } + }], + "env": [{ + "name": "multiwoz", + "user_policy": { + "name": "UserPolicyAgendaMultiWoz" + }, + "sys_policy": { + "name": "RuleBasedMultiwozBot" + }, + "nlu": { + "name": "MILU", + "model_file": "https://convlab.blob.core.windows.net/models/milu.tar.gz" + }, + "nlg": { + "name": "MultiwozTemplateNLG", + "is_user": true + }, + "max_t": 40, + "max_frame": 1000 + }], + "body": { + "product": "outer", + "num": 1 + }, + "meta": { + "distributed": false, + "eval_frequency": 1000, + "num_eval": 100, "max_tick_unit": "total_t", "max_trial": 1, "max_session": 1, @@ -85,7 +149,7 @@ "name": "DQN", "action_pdtype": "Argmax", "action_policy": "rule_guide_epsilon_greedy", - "rule_guide_max_epi": 1000, + "rule_guide_max_epi": 300, "rule_guide_frequency": 3, "explore_var_spec": { "name": "linear_decay", @@ -95,9 +159,9 @@ "end_step": 800, }, 
"gamma": 0.9, - "training_batch_epoch": 10, - "training_epoch": 3, - "training_frequency": 50, + "training_batch_iter": 1000, + "training_iter": 1, + "training_frequency": 100, "training_start_step": 32, "normalize_state": false }, @@ -125,7 +189,7 @@ "gamma": 0.999, }, "update_type": "replace", - "update_frequency": 500, + "update_frequency": 300, "polyak_coef": 0, "gpu": false } @@ -135,7 +199,7 @@ "action_dim": 300, "observation_dim": 393, "max_t": 40, - "max_tick": 20000, + "max_frame": 5000, "nlu": { "name": "OneNetLU", "model_file": "https://convlab.blob.core.windows.net/models/onenet.tar.gz" @@ -157,6 +221,7 @@ }, "meta": { "distributed": false, + "num_eval": 100, "eval_frequency": 1000, "max_tick_unit": "total_t", "max_trial": 1, @@ -166,5 +231,144 @@ "num_gpus": 0 } } + }, + "rule_rule": { + "agent": [{ + "name": "DialogAgent", + "dst": { + "name": "RuleDST" + }, + "algorithm": { + "name": "ExternalPolicy", + "policy": { + "name": "RuleBasedMultiwozBot" + }, + "action_pdtype": "Argmax", + "action_policy": "epsilon_greedy" + }, + "memory": { + "name": "Replay", + "max_size": 1 + } + }], + "env": [{ + "name": "multiwoz", + "action_dim": 300, + "observation_dim": 393, + "max_t": 40, + "max_frame": 100, + "user_policy": { + "name": "UserPolicyAgendaMultiWoz" + }, + "sys_policy": { + "name": "RuleBasedMultiwozBot" + } + }], + "body": { + "product": "outer", + "num": 1 + }, + "meta": { + "distributed": false, + "eval_frequency": 10, + "num_eval": 10, + "max_tick_unit": "total_t", + "max_trial": 1, + "max_session": 1, + "resources": { + "num_cpus": 1, + "num_gpus": 0 + } + } + }, + "rule_dqn": { + "agent": [{ + "name": "DialogAgent", + "dst": { + "name": "RuleDST" + }, + "state_encoder": { + "name": "MultiWozStateEncoder" + }, + "action_decoder": { + "name": "MultiWozVocabActionDecoder" + }, + "algorithm": { + "name": "DQN", + "action_pdtype": "Argmax", + "action_policy": "rule_guide_epsilon_greedy", + "rule_guide_max_epi": 100, + "rule_guide_frequency": 3, + "explore_var_spec": { + "name": "linear_decay", + "start_val": 0.0, + "end_val": 0.0, + "start_step": 0, + "end_step": 800, + }, + "gamma": 0.9, + "training_batch_iter": 100, + "training_iter": 3, + "training_frequency": 50, + "training_start_step": 32 + }, + "memory": { + "name": "Replay", + "batch_size": 16, + "max_size": 10000, + "use_cer": false + }, + "net": { + "type": "MLPNet", + "hid_layers": [100], + "hid_layers_activation": "relu", + "clip_grad_val": null, + "loss_spec": { + "name": "MSELoss" + }, + "optim_spec": { + "name": "Adam", + "lr": 0.001 + }, + "lr_scheduler_spec": { + "name": "StepLR", + "step_size": 1000, + "gamma": 0.999, + }, + "update_type": "replace", + "update_frequency": 50, + "polyak_coef": 0, + "gpu": false + } + }], + "env": [{ + "name": "multiwoz", + "action_dim": 300, + "observation_dim": 393, + "max_t": 40, + "max_frame": 500, + "user_policy": { + "name": "UserPolicyAgendaMultiWoz" + }, + "sys_policy": { + "name": "RuleBasedMultiwozBot" + } + }], + "body": { + "product": "outer", + "num": 1 + }, + "meta": { + "distributed": false, + "num_eval": 10, + "eval_frequency": 100, + "max_tick_unit": "total_t", + "max_trial": 1, + "max_session": 1, + "resources": { + "num_cpus": 1, + "num_gpus": 0 + } + } } } diff --git a/convlab/spec/random_baseline.py b/convlab/spec/random_baseline.py new file mode 100644 index 0000000..58334ac --- /dev/null +++ b/convlab/spec/random_baseline.py @@ -0,0 +1,133 @@ +# Module to generate random baselines +# Run as: python convlab/spec/random_baseline.py +from convlab.lib 
import logger, util +import gym +import numpy as np +import pydash as ps + + +FILEPATH = 'convlab/spec/_random_baseline.json' +NUM_EVAL = 100 +# extra envs to include +INCLUDE_ENVS = [ + 'vizdoom-v0', +] +EXCLUDE_ENVS = [ + 'CarRacing-v0', # window bug + 'Reacher-v2', # exclude mujoco + 'Pusher-v2', + 'Thrower-v2', + 'Striker-v2', + 'InvertedPendulum-v2', + 'InvertedDoublePendulum-v2', + 'HalfCheetah-v3', + 'Hopper-v3', + 'Swimmer-v3', + 'Walker2d-v3', + 'Ant-v3', + 'Humanoid-v3', + 'HumanoidStandup-v2', + 'FetchSlide-v1', + 'FetchPickAndPlace-v1', + 'FetchReach-v1', + 'FetchPush-v1', + 'HandReach-v0', + 'HandManipulateBlockRotateZ-v0', + 'HandManipulateBlockRotateParallel-v0', + 'HandManipulateBlockRotateXYZ-v0', + 'HandManipulateBlockFull-v0', + 'HandManipulateBlock-v0', + 'HandManipulateBlockTouchSensors-v0', + 'HandManipulateEggRotate-v0', + 'HandManipulateEggFull-v0', + 'HandManipulateEgg-v0', + 'HandManipulateEggTouchSensors-v0', + 'HandManipulatePenRotate-v0', + 'HandManipulatePenFull-v0', + 'HandManipulatePen-v0', + 'HandManipulatePenTouchSensors-v0', + 'FetchSlideDense-v1', + 'FetchPickAndPlaceDense-v1', + 'FetchReachDense-v1', + 'FetchPushDense-v1', + 'HandReachDense-v0', + 'HandManipulateBlockRotateZDense-v0', + 'HandManipulateBlockRotateParallelDense-v0', + 'HandManipulateBlockRotateXYZDense-v0', + 'HandManipulateBlockFullDense-v0', + 'HandManipulateBlockDense-v0', + 'HandManipulateBlockTouchSensorsDense-v0', + 'HandManipulateEggRotateDense-v0', + 'HandManipulateEggFullDense-v0', + 'HandManipulateEggDense-v0', + 'HandManipulateEggTouchSensorsDense-v0', + 'HandManipulatePenRotateDense-v0', + 'HandManipulatePenFullDense-v0', + 'HandManipulatePenDense-v0', + 'HandManipulatePenTouchSensorsDense-v0', +] + + +def enum_envs(): + '''Enumerate all the env names of the latest version''' + envs = [es.id for es in gym.envs.registration.registry.all()] + envs += INCLUDE_ENVS + envs = ps.difference(envs, EXCLUDE_ENVS) + return envs + + +def gen_random_return(env_name, seed): + '''Generate a single-episode random policy return for an environment''' + # TODO generalize for unity too once it has a gym wrapper + env = gym.make(env_name) + env.seed(seed) + env.reset() + done = False + total_reward = 0 + while not done: + _, reward, done, _ = env.step(env.action_space.sample()) + total_reward += reward + return total_reward + + +def gen_random_baseline(env_name, num_eval=NUM_EVAL): + '''Generate the random baseline for an environment by averaging over num_eval episodes''' + returns = util.parallelize(gen_random_return, [(env_name, i) for i in range(num_eval)]) + mean_rand_ret = np.mean(returns) + std_rand_ret = np.std(returns) + return {'mean': mean_rand_ret, 'std': std_rand_ret} + + +def get_random_baseline(env_name): + '''Get a single random baseline for env; if does not exist in file, generate live and update the file''' + random_baseline = util.read(FILEPATH) + if env_name in random_baseline: + baseline = random_baseline[env_name] + else: + try: + logger.info(f'Generating random baseline for {env_name}') + baseline = gen_random_baseline(env_name, NUM_EVAL) + except Exception as e: + logger.warning(f'Cannot start env: {env_name}, skipping random baseline generation') + baseline = None + # update immediately + logger.info(f'Updating new random baseline in {FILEPATH}') + random_baseline[env_name] = baseline + util.write(random_baseline, FILEPATH) + return baseline + + +def main(): + ''' + Main method to generate all random baselines and write to file. 
+ Run as: python convlab/spec/random_baseline.py + ''' + envs = enum_envs() + for idx, env_name in enumerate(envs): + logger.info(f'Generating random baseline for {env_name}: {idx + 1}/{len(envs)}') + get_random_baseline(env_name) + logger.info(f'Done, random baseline updated in {FILEPATH}') + + +if __name__ == '__main__': + main() diff --git a/convlab/spec/spec_util.py b/convlab/spec/spec_util.py index 7c6450b..3228dbe 100644 --- a/convlab/spec/spec_util.py +++ b/convlab/spec/spec_util.py @@ -1,15 +1,14 @@ -''' -The spec util -Handles the Lab experiment spec: reading, writing(evolution), validation and default setting -Expands the spec and params into consumable inputs in info space for lab units. -''' +# The spec module +# Manages specification to run things in lab from convlab.lib import logger, util +from string import Template import itertools import json import numpy as np import os import pydash as ps + SPEC_DIR = 'convlab/spec' ''' All spec values are already param, inferred automatically. @@ -21,26 +20,23 @@ SPEC_FORMAT = { "agent": [{ "name": str, - # "algorithm": dict, + "algorithm": dict, # "memory": dict, # "net": dict, }], "env": [{ "name": str, - "max_t": (type(None), int), - "max_tick": int, + "max_t": (type(None), int, float), + # "max_frame": (int, float), }], "body": { "product": ["outer", "inner", "custom"], "num": (int, list), }, "meta": { - "distributed": bool, - "eval_frequency": int, - "max_tick_unit": str, + "eval_frequency": (int, float), "max_session": int, "max_trial": (type(None), int), - # "search": str, }, "name": str, } @@ -57,10 +53,13 @@ def check_comp_spec(comp_spec, comp_spec_format): else: v_type = spec_format_v assert isinstance(comp_spec_v, v_type), f'Component spec {ps.pick(comp_spec, spec_k)} needs to be of type: {v_type}' + if isinstance(v_type, tuple) and int in v_type and isinstance(comp_spec_v, float): + # cast if it can be int + comp_spec[spec_k] = int(comp_spec_v) def check_body_spec(spec): - '''Base method to check body spec for AEB space resolution''' + '''Base method to check body spec for multi-agent multi-env''' ae_product = ps.get(spec, 'body.product') body_num = ps.get(spec, 'body.num') if ae_product == 'outer': @@ -69,10 +68,17 @@ def check_body_spec(spec): agent_num = len(spec['agent']) env_num = len(spec['env']) assert agent_num == env_num, 'Agent and Env spec length must be equal for body `inner` product. Given {agent_num}, {env_num}' - else: # custom AEB + else: # custom assert ps.is_list(body_num) +def check_compatibility(spec): + '''Check compatibility among spec setups''' + # TODO expand to be more comprehensive + if spec['meta'].get('distributed') == 'synced': + assert ps.get(spec, 'agent.0.net.gpu') == False, f'Distributed mode "synced" works with CPU only. Set gpu: false.' 
+ + def check(spec): '''Check a single spec for validity''' try: @@ -84,7 +90,8 @@ def check(spec): check_comp_spec(env_spec, SPEC_FORMAT['env'][0]) check_comp_spec(spec['body'], SPEC_FORMAT['body']) check_comp_spec(spec['meta'], SPEC_FORMAT['meta']) - check_body_spec(spec) + # check_body_spec(spec) + check_compatibility(spec) except Exception as e: logger.exception(f'spec {spec_name} fails spec check') raise e @@ -97,9 +104,10 @@ def check_all(): for spec_file in spec_files: spec_dict = util.read(f'{SPEC_DIR}/{spec_file}') for spec_name, spec in spec_dict.items(): + # fill-in info at runtime + spec['name'] = spec_name + spec = extend_meta_spec(spec) try: - spec['name'] = spec_name - spec['git_SHA'] = util.get_git_sha() check(spec) except Exception as e: logger.exception(f'spec_file {spec_file} fails spec check') @@ -108,6 +116,26 @@ def check_all(): return True +def extend_meta_spec(spec): + '''Extend meta spec with information for lab functions''' + extended_meta_spec = { + # reset lab indices to -1 so that they tick to 0 + 'experiment': -1, + 'trial': -1, + 'session': -1, + 'cuda_offset': int(os.environ.get('CUDA_OFFSET', 0)), + 'experiment_ts': util.get_ts(), + 'prepath': None, + # ckpt extends prepath, e.g. ckpt_str = ckpt-epi10-totalt1000 + 'ckpt': None, + 'git_sha': util.get_git_sha(), + 'random_seed': None, + 'eval_model_prepath': None, + } + spec['meta'].update(extended_meta_spec) + return spec + + def get(spec_file, spec_name): ''' Get an experiment spec from spec_file, spec_name. @@ -116,6 +144,7 @@ def get(spec_file, spec_name): spec = spec_util.get('base.json', 'base_case_openai') ''' + spec_file = spec_file.replace(SPEC_DIR, '') # cleanup if 'data/' in spec_file: assert spec_name in spec_file, 'spec_file in data/ must be lab-generated and contains spec_name' spec = util.read(spec_file) @@ -124,30 +153,42 @@ def get(spec_file, spec_name): spec_dict = util.read(spec_file) assert spec_name in spec_dict, f'spec_name {spec_name} is not in spec_file {spec_file}. Choose from:\n {ps.join(spec_dict.keys(), ",")}' spec = spec_dict[spec_name] + # fill-in info at runtime spec['name'] = spec_name - spec['git_SHA'] = util.get_git_sha() + spec = extend_meta_spec(spec) check(spec) return spec -def is_aeb_compact(aeb_list): - ''' - Check if aeb space (aeb_list) is compact; uniq count must equal shape in each of a,e axes. 
For b, per unique a,e hash, uniq must equal shape.''' - aeb_shape = util.get_aeb_shape(aeb_list) - aeb_uniq = [len(np.unique(col)) for col in np.transpose(aeb_list)] - ae_compact = np.array_equal(aeb_shape, aeb_uniq) - b_compact = True - for ae, ae_b_list in ps.group_by(aeb_list, lambda aeb: f'{aeb[0]}{aeb[1]}').items(): - b_shape = util.get_aeb_shape(ae_b_list)[2] - b_uniq = [len(np.unique(col)) for col in np.transpose(ae_b_list)][2] - b_compact = b_compact and np.array_equal(b_shape, b_uniq) - aeb_compact = ae_compact and b_compact - return aeb_compact +def get_eval_spec(spec_file, prename=None): + '''Get spec for eval mode''' + if prename: + predir, _, _, _, _, _ = util.prepath_split(spec_file) + prepath = f'{predir}/{prename}' + spec = util.prepath_to_spec(prepath) + else: + spec = util.get_spec(spec_file) + spec['meta']['ckpt'] = 'eval' + spec['meta']['eval_model_prepath'] = util.insert_folder(prepath, 'model') + return spec -def is_singleton(spec): - '''Check if spec uses a singleton Session''' - return len(spec['agent']) == 1 and len(spec['env']) == 1 and spec['body']['num'] == 1 +def get_param_specs(spec): + '''Return a list of specs with substituted spec_params''' + assert 'spec_params' in spec, 'Parametrized spec needs a spec_params key' + spec_params = spec.pop('spec_params') + spec_template = Template(json.dumps(spec)) + keys = spec_params.keys() + specs = [] + for idx, vals in enumerate(itertools.product(*spec_params.values())): + spec_str = spec_template.substitute(dict(zip(keys, vals))) + spec = json.loads(spec_str) + spec['name'] += f'_{"_".join(vals)}' + # offset to prevent parallel-run GPU competition, to mod in util.set_cuda_id + cuda_id_gap = int(spec['meta']['max_session'] / spec['meta']['param_spec_process']) + spec['meta']['cuda_offset'] += idx * cuda_id_gap + specs.append(spec) + return specs def override_dev_spec(spec): @@ -162,58 +203,65 @@ def override_enjoy_spec(spec): def override_eval_spec(spec): - for agent_spec in spec['agent']: - if 'max_size' in agent_spec['memory']: - agent_spec['memory']['max_size'] = 100 + spec['meta']['max_session'] = 1 # evaluate by episode is set in env clock init in env/base.py return spec def override_test_spec(spec): for agent_spec in spec['agent']: - # covers episodic and timestep - agent_spec['algorithm']['training_frequency'] = 1 + # onpolicy freq is episodic + freq = 1 if agent_spec['memory']['name'] == 'OnPolicyReplay' else 8 + agent_spec['algorithm']['training_frequency'] = freq agent_spec['algorithm']['training_start_step'] = 1 - agent_spec['algorithm']['training_epoch'] = 1 - agent_spec['algorithm']['training_batch_epoch'] = 1 + agent_spec['algorithm']['training_iter'] = 1 + agent_spec['algorithm']['training_batch_iter'] = 1 for env_spec in spec['env']: - env_spec['max_t'] = 20 - env_spec['max_tick'] = 3 - spec['meta']['eval_frequency'] = 1000 - spec['meta']['max_tick_unit'] = 'epi' + env_spec['max_frame'] = 40 + env_spec['max_t'] = 12 + spec['meta']['log_frequency'] = 10 + spec['meta']['eval_frequency'] = 10 spec['meta']['max_session'] = 1 spec['meta']['max_trial'] = 2 return spec -def resolve_aeb(spec): - ''' - Resolve an experiment spec into the full list of points (coordinates) in AEB space. - @param {dict} spec An experiment spec. - @returns {list} aeb_list Resolved array of points in AEB space. - @example +def save(spec, unit='experiment'): + '''Save spec to proper path. 
Called at Experiment or Trial init.''' + prepath = util.get_prepath(spec, unit) + util.write(spec, f'{prepath}_spec.json') - spec = spec_util.get('base.json', 'general_inner') - aeb_list = spec_util.resolve_aeb(spec) - # => [(0, 0, 0), (0, 0, 1), (1, 1, 0), (1, 1, 1)] - ''' - agent_num = len(spec['agent']) if ps.is_list(spec['agent']) else 1 - env_num = len(spec['env']) if ps.is_list(spec['env']) else 1 - ae_product = ps.get(spec, 'body.product') - body_num = ps.get(spec, 'body.num') - body_num_list = body_num if ps.is_list(body_num) else [body_num] * env_num - aeb_list = [] - if ae_product == 'outer': - for e in range(env_num): - sub_aeb_list = list(itertools.product(range(agent_num), [e], range(body_num_list[e]))) - aeb_list.extend(sub_aeb_list) - elif ae_product == 'inner': - for a, e in zip(range(agent_num), range(env_num)): - sub_aeb_list = list(itertools.product([a], [e], range(body_num_list[e]))) - aeb_list.extend(sub_aeb_list) - else: # custom AEB, body_num is a aeb_list - aeb_list = [tuple(aeb) for aeb in body_num] - aeb_list.sort() - assert is_aeb_compact(aeb_list), 'Failed check: for a, e, uniq count == len (shape), and for each a,e hash, b uniq count == b len (shape)' - return aeb_list +def tick(spec, unit): + ''' + Method to tick lab unit (experiment, trial, session) in meta spec to advance their indices + Reset lower lab indices to -1 so that they tick to 0 + spec_util.tick(spec, 'session') + session = Session(spec) + ''' + meta_spec = spec['meta'] + if unit == 'experiment': + meta_spec['experiment_ts'] = util.get_ts() + meta_spec['experiment'] += 1 + meta_spec['trial'] = -1 + meta_spec['session'] = -1 + elif unit == 'trial': + if meta_spec['experiment'] == -1: + meta_spec['experiment'] += 1 + meta_spec['trial'] += 1 + meta_spec['session'] = -1 + elif unit == 'session': + if meta_spec['experiment'] == -1: + meta_spec['experiment'] += 1 + if meta_spec['trial'] == -1: + meta_spec['trial'] += 1 + meta_spec['session'] += 1 + else: + raise ValueError(f'Unrecognized lab unit to tick: {unit}') + # set prepath since it is determined at this point + meta_spec['prepath'] = prepath = util.get_prepath(spec, unit) + for folder in ('graph', 'info', 'log', 'model'): + folder_prepath = util.insert_folder(prepath, folder) + os.makedirs(os.path.dirname(util.smart_path(folder_prepath)), exist_ok=True) + meta_spec[f'{folder}_prepath'] = folder_prepath + return spec
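
The new util.frame_mod helper above replaces exact (frame % frequency == 0) checks because a vectorized env advances the frame counter by num_envs per tick, so exact multiples of frequency can be skipped. A minimal sketch of the arithmetic; the num_envs and frequency values are made up for illustration:

    # frame advances in steps of num_envs, so test the remainder instead of equality
    num_envs, frequency = 4, 10
    frames = list(range(0, 40, num_envs))  # 0, 4, 8, ..., 36
    hits = [f for f in frames if f % frequency < (num_envs or 1)]
    # hits == [0, 12, 20, 32]: the first tick at or after each multiple of 10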
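
util.set_cuda_id (which replaces try_set_cuda_id) now reads everything it needs from spec['meta'] instead of info_space. A standalone sketch of its device assignment arithmetic; the trial/session indices, max_session, cuda_offset and device count below are hypothetical:

    trial_idx, session_idx = 1, 2
    max_session, cuda_offset = 4, 0
    device_count = 2  # what torch.cuda.device_count() returns on the assumed machine

    # same formula as in set_cuda_id; 'shared' distributed mode would force session_idx = 0
    job_idx = trial_idx * max_session + session_idx + cuda_offset  # 6
    cuda_id = None if not device_count else job_idx % device_count  # 6 % 2 == 0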
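
With InfoSpace removed, output paths are derived purely from spec['meta'], which spec_util.tick fills in at runtime. A sketch using a hypothetical, minimal spec dict containing only the keys get_prepath reads; the timestamp is illustrative:

    from convlab.lib import util

    spec = {
        'name': 'rule_dqn',
        'meta': {'experiment_ts': '2019_05_31_214636', 'trial': 0, 'session': 0, 'ckpt': None},
    }
    prepath = util.get_prepath(spec, unit='session')
    # 'output/rule_dqn_2019_05_31_214636/rule_dqn_t0_s0'
    log_prepath = util.insert_folder(prepath, 'log')
    # 'output/rule_dqn_2019_05_31_214636/log/rule_dqn_t0_s0', as used by set_logger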
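
util.split_minibatch shuffles the indices once and then partitions them with np.array_split, so the minibatches are drawn without replacement. A usage sketch; the batch dict below is a made-up stand-in for a sampled training batch, assuming equal-length numpy arrays:

    import numpy as np
    from convlab.lib import util

    batch = {
        'states': np.random.rand(9, 4),
        'actions': np.random.randint(0, 2, size=9),
        'rewards': np.random.rand(9),
    }
    minibatches = util.split_minibatch(batch, mb_size=3)
    # int(9 / 3) == 3 chunks, each a dict with the same keys and 3 rows
    assert len(minibatches) == 3
    assert all(len(mb['rewards']) == 3 for mb in minibatches)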
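
The new util.calc_srs_mean_std pairs with viz.plot_mean_sr: the mean series is plotted as the main line and mean ± std forms the shaded envelope trace. A small sketch with two made-up session return series (pandas std uses ddof=1):

    import pandas as pd
    from convlab.lib import util

    sr_list = [pd.Series([1.0, 2.0, 3.0]), pd.Series([3.0, 2.0, 1.0])]
    mean_sr, std_sr = util.calc_srs_mean_std(sr_list)
    # mean_sr: [2.0, 2.0, 2.0]; std_sr: [~1.414, 0.0, ~1.414]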