From b3a6a58f17a5cd5dbfee23d4bd067dbf76b9b9d4 Mon Sep 17 00:00:00 2001
From: Clark Zinzow <clarkzinzow@gmail.com>
Date: Thu, 19 Jan 2023 12:35:38 -0800
Subject: [PATCH] [Datasets] Fix Torch tensor memory leak test. (#31748)

The Torch tensor memory leak test suffered from a noisy-neighbor issue and was low-signal because the base tensor allocation was not performed inside the leak-tested code. This PR fixes both problems by triggering explicit garbage collection before and after each run of the leak-tested code, and by moving the base tensor allocation into the leak-tested code.
---
 python/ray/train/tests/test_torch_utils.py | 15 +++++++++------
 python/ray/util/debug.py                   |  8 +++++++-
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/python/ray/train/tests/test_torch_utils.py b/python/ray/train/tests/test_torch_utils.py
index b076b83e261e3..fc3ba4dee40ea 100644
--- a/python/ray/train/tests/test_torch_utils.py
+++ b/python/ray/train/tests/test_torch_utils.py
@@ -81,15 +81,18 @@ def test_tensor_column_no_memory_leak(self):
         # column (e.g. post-casting from extension type) doesn't leak memory. Casting
         # these tensors directly with torch.as_tensor() currently leaks memory; see
         # https://github.com/ray-project/ray/issues/30629#issuecomment-1330954556
-        col = np.empty(1000, dtype=object)
-        col[:] = [np.ones((100, 100)) for _ in range(1000)]
-        df = pd.DataFrame({"a": col})
+        def code():
+            col = np.empty(1000, dtype=object)
+            col[:] = [np.ones((100, 100)) for _ in range(1000)]
+            df = pd.DataFrame({"a": col})
+            convert_pandas_to_torch_tensor(
+                df, columns=[["a"]], column_dtypes=[torch.int]
+            )
+
         suspicious_stats = _test_some_code_for_memory_leaks(
             desc="Testing convert_pandas_to_torch_tensor for memory leaks.",
             init=None,
-            code=lambda: convert_pandas_to_torch_tensor(
-                df, columns=[["a"]], column_dtypes=[torch.int]
-            ),
+            code=code,
             repeats=10,
         )
         assert not suspicious_stats
diff --git a/python/ray/util/debug.py b/python/ray/util/debug.py
index 5d93f9a6fbc29..d7f5a88bc463c 100644
--- a/python/ray/util/debug.py
+++ b/python/ray/util/debug.py
@@ -1,11 +1,13 @@
 from collections import defaultdict, namedtuple
-import numpy as np
+import gc
 import os
 import re
 import time
 import tracemalloc
 from typing import Callable, List, Optional
 
+import numpy as np
+
 from ray.util.annotations import DeveloperAPI
 
 _logged = set()
@@ -137,7 +139,11 @@ def _i_print(i):
         # Run `code` n times, each time taking a memory snapshot.
         for i in range(actual_repeats):
             _i_print(i)
+            # Manually trigger garbage collection before and after code runs in order to
+            # make tracemalloc snapshots as accurate as possible.
+            gc.collect()
             code()
+            gc.collect()
             _take_snapshot(table, suspicious)
         print("\n")