Improve lifetime management of CASESession async work helper. (#27659)

* Improve lifetime management of CASESession async work helper. As things stood, there were some codepaths (currently not really reached; it would require scheduling of SendSigma3c to fail and then for someone to InvokeBackgroundWorkWatchdog on the CASESession) where we could destroy the object while we were working with references to its members. Fixes #27541 * Address review comment.
project-chip · Feb 21, 2024 · 4374169 · 4374169
1 parent 2811f0a
commit 4374169
Showing 1 changed file with 41 additions and 8 deletions.
diff --git a/src/protocols/secure_channel/CASESession.cpp b/src/protocols/secure_channel/CASESession.cpp
@@ -144,7 +144,12 @@ class CASESession::WorkHelper
 
  // After work callback, processed in the main Matter task via `PlatformManager::ScheduleWork`.
  // This is a member function to be called on the associated session after the work callback.
- // The `status` value is the result of the work callback (called beforehand).
+ // The `status` value is the result of the work callback (called beforehand), or the status of
+ // queueing the after work callback back to the Matter thread, if the work callback succeeds
+ // but queueing fails.
+ //
+ // When this callback is called asynchronously (i.e. via ScheduleWork), the helper guarantees
+ // that it will keep itself (and hence `data`) alive until the callback completes.
  typedef CHIP_ERROR (CASESession::*AfterWorkCallback)(DATA & data, CHIP_ERROR status);
 
 public:
@@ -169,6 +174,9 @@ class CASESession::WorkHelper
 
  // Do the work immediately.
  // No scheduling, no outstanding work, no shared lifetime management.
+ //
+ // The caller must guarantee that it keeps the helper alive across this call, most likely by
+ // holding a reference to it on the stack.
  CHIP_ERROR DoWork()
  {
  // Ensure that this function is being called from main Matter thread
@@ -195,7 +203,7 @@ class CASESession::WorkHelper
  auto status = DeviceLayer::PlatformMgr().ScheduleBackgroundWork(WorkHandler, reinterpret_cast<intptr_t>(this));
  if (status != CHIP_NO_ERROR)
  {
- // Release strong ptr since scheduling failed
+ // Release strong ptr since scheduling failed.
  mStrongPtr.reset();
  }
  return status;
@@ -235,19 +243,32 @@ class CASESession::WorkHelper
  // Execute callback in background thread; data must be OK with this
  helper->mStatus = helper->mWorkCallback(helper->mData, cancel);
  VerifyOrReturn(!cancel && !helper->IsCancelled());
- // Hold strong ptr while work is outstanding
+ // Hold strong ptr to ourselves while work is outstanding
  helper->mStrongPtr.swap(strongPtr);
  auto status = DeviceLayer::PlatformMgr().ScheduleWork(AfterWorkHandler, reinterpret_cast<intptr_t>(helper));
  if (status != CHIP_NO_ERROR)
  {
+ ChipLogError(SecureChannel, "Failed to Schedule the AfterWorkCallback on foreground thread: %" CHIP_ERROR_FORMAT,
+ status.Format());
+
  // We failed to schedule after work callback, so setting mScheduleAfterWorkFailed flag to true
  // This can be checked from foreground thread and after work callback can be retried
  helper->mStatus = status;
- ChipLogError(SecureChannel, "Failed to Schedule the AfterWorkCallback on foreground thread");
- helper->mScheduleAfterWorkFailed.store(true);
 
- // Release strong ptr since scheduling failed
- helper->mStrongPtr.reset();
+ // Release strong ptr to self since scheduling failed, because nothing guarantees
+ // that AfterWorkHandler will get called at this point to release the reference,
+ // and we don't want to leak. That said, we want to ensure that "helper" stays
+ // alive through the end of this function (so we can set mScheduleAfterWorkFailed
+ // on it), but also want to avoid racing on the single SharedPtr instance in
+ // helper->mStrongPtr. That means we need to not touch helper->mStrongPtr after
+ // writing to mScheduleAfterWorkFailed.
+ //
+ // The simplest way to do this is to move the reference in helper->mStrongPtr to
+ // our stack, where it outlives all our accesses to "helper".
+ strongPtr.swap(helper->mStrongPtr);
+
+ // helper and any of its state should not be touched after storing mScheduleAfterWorkFailed.
+ helper->mScheduleAfterWorkFailed.store(true);
  }
  }
 
@@ -258,8 +279,17 @@ class CASESession::WorkHelper
  assertChipStackLockedByCurrentThread();
 
  auto * helper = reinterpret_cast<WorkHelper *>(arg);
- // Hold strong ptr while work is handled
+ // Hold strong ptr while work is handled, and ensure that helper->mStrongPtr does not keep
+ // holding a reference.
  auto strongPtr(std::move(helper->mStrongPtr));
+ if (!strongPtr)
+ {
+ // This can happen if scheduling AfterWorkHandler failed. Just grab a strong ref
+ // to handler directly, to fulfill our API contract of holding a strong reference
+ // across the after-work callback. At this point, we are guaranteed that the
+ // background thread is not touching the helper anymore.
+ strongPtr = helper->mWeakPtr.lock();
+ }
  if (auto * session = helper->mSession.load())
  {
  // Execute callback in Matter thread; session should be OK with this
@@ -288,6 +318,9 @@ class CASESession::WorkHelper
 
  // If background thread fails to schedule AfterWorkCallback then this flag is set to true
  // and CASEServer then can check this one and run the AfterWorkCallback for us.
+ //
+ // When this happens, the write to this boolean _must_ be the last code that touches this
+ // object on the background thread. After that, the Matter thread owns the object.
  std::atomic<bool> mScheduleAfterWorkFailed{ false };
 
 public: