Skip to content

Commit

Permalink
Don't assert when attempting to create more than one virtual cluster …
Browse files Browse the repository at this point in the history
…at a time.

PiperOrigin-RevId: 159498430
  • Loading branch information
benoitsteiner authored and tensorflower-gardener committed Jun 19, 2017
1 parent f9a6477 commit 4689fe7
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 12 deletions.
12 changes: 0 additions & 12 deletions tensorflow/core/grappler/clusters/cluster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,15 @@ limitations under the License.
==============================================================================*/

#include "tensorflow/core/grappler/clusters/cluster.h"
#include <atomic>

namespace tensorflow {
namespace grappler {

// File-local flag recording whether a Cluster is currently alive: the Cluster
// constructor sets it and the destructor clears it.
static std::atomic<bool> already_created{false};

// Records `timeout_s` in `timeout_s_`, claims the process-wide "a cluster
// exists" flag (aborting via CHECK if it was already claimed), and calls
// DisableDetailedStats(false).
Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) {
  // This is really ugly: to avoid leaking variables, we need to reset the tf
  // session every time we're done processing a grappler item. However,
  // variables are global, and therefore we can't have more than 1 session alive
  // at a time. This check detects when more than one cluster is created.
  //
  // The flag is tested and set with a single atomic exchange: a separate
  // load followed by a store would let two threads constructing clusters
  // concurrently both observe `false` and both pass the check.
  const bool was_already_created = already_created.exchange(true);
  CHECK(!was_already_created);

  DisableDetailedStats(false);
}

// Releases the process-wide "a cluster exists" flag, aborting via CHECK if the
// flag was not set when the destructor ran.
Cluster::~Cluster() {
  // Clear and inspect the flag in one atomic exchange so the test and the
  // reset cannot be interleaved with another thread's access to the flag.
  const bool was_created = already_created.exchange(false);
  CHECK(was_created);
}

void Cluster::AllowSoftPlacement(bool soft_placement_state) {
Expand Down
13 changes: 13 additions & 0 deletions tensorflow/core/grappler/clusters/single_machine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ limitations under the License.

#include "tensorflow/core/grappler/clusters/single_machine.h"

#include <atomic>
#include <memory>

#include "tensorflow/cc/training/queue_runner.h"
Expand All @@ -31,11 +32,20 @@ limitations under the License.
namespace tensorflow {
namespace grappler {

// File-local flag recording whether a SingleMachine is currently alive: the
// SingleMachine constructor sets it and the destructor clears it.
static std::atomic<bool> already_created{false};

SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus)
: Cluster(timeout_s),
num_gpus_(num_gpus),
expected_init_time_s_(0),
closing_(false) {
// This is really ugly: to avoid leaking variables, we need to reset the tf
// session every time we're done processing a grappler item. However,
// variables are global, and therefore we can't have more than 1 session alive
// at a time. This check detects when more than one cluster is created.
CHECK(!already_created);
already_created = true;

VLOG(1) << "Number of CPU cores: " << num_cpu_cores
<< " Number of GPUs: " << num_gpus;
thread_pool_.reset(new thread::ThreadPool(
Expand Down Expand Up @@ -64,6 +74,9 @@ SingleMachine::~SingleMachine() {
thread_pool_.reset();

Reset(options_, {}).IgnoreError();

CHECK(already_created);
already_created = false;
}

Status SingleMachine::Provision() {
Expand Down

0 comments on commit 4689fe7

Please sign in to comment.