fix engine crash in shutdown phase (#14382)

* fix engine crash in shutdown phase * fix lint * Revert "Bypass ThreadedEngine in test_operator_gpu.py:test_convolution_multiple_streams. (#14338)" This reverts commit d6eafca.
apache · Mar 11, 2019 · 4f5cba5 · 4f5cba5
1 parent 35098b8
commit 4f5cba5
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 11 deletions.
diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h
@@ -30,6 +30,7 @@
 #include <dmlc/base.h>
 #include <dmlc/logging.h>
 #include <dmlc/omp.h>
+#include <mxnet/storage.h>
 #include <vector>
 #include <functional>
 #include <condition_variable>
@@ -306,6 +307,8 @@ class ThreadedEngine : public Engine {
  objpool_varblk_ref_ = common::ObjectPool<VersionedVarBlock>::_GetSharedRef();
  objpool_var_ref_ = common::ObjectPool<ThreadedVar>::_GetSharedRef();
 
+ storage_ref_ = Storage::_GetSharedRef();
+
  // Get a ref to the profiler so that it doesn't get killed before us
  profiler::Profiler::Get(&profiler_);
  }
@@ -549,6 +552,12 @@ class ThreadedEngine : public Engine {
  std::shared_ptr<common::ObjectPool<VersionedVarBlock> > objpool_varblk_ref_;
  std::shared_ptr<common::ObjectPool<ThreadedVar> > objpool_var_ref_;
 
+ /*!
+ * \brief Async destruction of some objects is relied on storage,
+ * prevent it from being destructed too early
+ */
+ std::shared_ptr<Storage> storage_ref_;
+
 #if MXNET_USE_CUDA
  /*! \brief Number of GPU devices available */
  std::atomic<int> device_count_{-1};

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
@@ -547,18 +547,8 @@ def _conv_with_num_streams(seed):
 
 @with_seed()
 def test_convolution_multiple_streams():
- engines = ['NaiveEngine', 'ThreadedEngine', 'ThreadedEnginePerDevice']
-
- if os.getenv('MXNET_ENGINE_TYPE') is not None:
- engines = [os.getenv('MXNET_ENGINE_TYPE'),]
- print("Only running against '%s'" % engines[0], file=sys.stderr, end='')
- # Remove this else clause when the ThreadedEngine can handle this test
- else:
- engines.remove('ThreadedEngine')
- print("SKIP: 'ThreadedEngine', only running against %s" % engines, file=sys.stderr, end='')
-
  for num_streams in [1, 2]:
- for engine in engines:
+ for engine in ['NaiveEngine', 'ThreadedEngine', 'ThreadedEnginePerDevice']:
  print("Starting engine %s with %d streams." % (engine, num_streams), file=sys.stderr)
  run_in_spawned_process(_conv_with_num_streams,
  {'MXNET_GPU_WORKER_NSTREAMS' : num_streams, 'MXNET_ENGINE_TYPE' : engine})