[Datasets] Fix Torch tensor memory leak test. (ray-project#31748)

The Torch tensor memory leak test was suffering from a noisy-neighbor issue, and it was low-signal because the base tensor allocation was not performed inside the leak-tested code. This PR fixes both problems by running explicit garbage collection before and after each run of the leak-tested code, and by moving the base tensor allocation into the leak-tested code.
charleslai2000 · Jan 19, 2023 · b3a6a58 · b3a6a58
1 parent 14b78b3
commit b3a6a58
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 7 deletions.
diff --git a/python/ray/train/tests/test_torch_utils.py b/python/ray/train/tests/test_torch_utils.py
@@ -81,15 +81,18 @@ def test_tensor_column_no_memory_leak(self):
  # column (e.g. post-casting from extension type) doesn't leak memory. Casting
  # these tensors directly with torch.as_tensor() currently leaks memory; see
  # https://github.com/ray-project/ray/issues/30629#issuecomment-1330954556
- col = np.empty(1000, dtype=object)
- col[:] = [np.ones((100, 100)) for _ in range(1000)]
- df = pd.DataFrame({"a": col})
+ def code():
+ col = np.empty(1000, dtype=object)
+ col[:] = [np.ones((100, 100)) for _ in range(1000)]
+ df = pd.DataFrame({"a": col})
+ convert_pandas_to_torch_tensor(
+ df, columns=[["a"]], column_dtypes=[torch.int]
+ )
+
  suspicious_stats = _test_some_code_for_memory_leaks(
  desc="Testing convert_pandas_to_torch_tensor for memory leaks.",
  init=None,
- code=lambda: convert_pandas_to_torch_tensor(
- df, columns=[["a"]], column_dtypes=[torch.int]
- ),
+ code=code,
  repeats=10,
  )
  assert not suspicious_stats

diff --git a/python/ray/util/debug.py b/python/ray/util/debug.py
@@ -1,11 +1,13 @@
 from collections import defaultdict, namedtuple
-import numpy as np
+import gc
 import os
 import re
 import time
 import tracemalloc
 from typing import Callable, List, Optional
 
+import numpy as np
+
 from ray.util.annotations import DeveloperAPI
 
 _logged = set()
@@ -137,7 +139,11 @@ def _i_print(i):
  # Run `code` n times, each time taking a memory snapshot.
  for i in range(actual_repeats):
  _i_print(i)
+ # Manually trigger garbage collection before and after code runs in order to
+ # make tracemalloc snapshots as accurate as possible.
+ gc.collect()
  code()
+ gc.collect()
  _take_snapshot(table, suspicious)
  print("\n")