From b3a6a58f17a5cd5dbfee23d4bd067dbf76b9b9d4 Mon Sep 17 00:00:00 2001 From: Clark Zinzow Date: Thu, 19 Jan 2023 12:35:38 -0800 Subject: [PATCH] [Datasets] Fix Torch tensor memory leak test. (#31748) Torch tensor memory leak test was suffering from a noisy neighbor issue, and was low signal due to not doing the base tensor allocation in the leak-tested code. This PR fixes this by doing explicit garbage collection before and after each leak-code run, and moving the base tensor allocation into the leak-code. --- python/ray/train/tests/test_torch_utils.py | 15 +++++++++------ python/ray/util/debug.py | 8 +++++++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/python/ray/train/tests/test_torch_utils.py b/python/ray/train/tests/test_torch_utils.py index b076b83e261e3..fc3ba4dee40ea 100644 --- a/python/ray/train/tests/test_torch_utils.py +++ b/python/ray/train/tests/test_torch_utils.py @@ -81,15 +81,18 @@ def test_tensor_column_no_memory_leak(self): # column (e.g. post-casting from extension type) doesn't leak memory. Casting # these tensors directly with torch.as_tensor() currently leaks memory; see # https://github.com/ray-project/ray/issues/30629#issuecomment-1330954556 - col = np.empty(1000, dtype=object) - col[:] = [np.ones((100, 100)) for _ in range(1000)] - df = pd.DataFrame({"a": col}) + def code(): + col = np.empty(1000, dtype=object) + col[:] = [np.ones((100, 100)) for _ in range(1000)] + df = pd.DataFrame({"a": col}) + convert_pandas_to_torch_tensor( + df, columns=[["a"]], column_dtypes=[torch.int] + ) + suspicious_stats = _test_some_code_for_memory_leaks( desc="Testing convert_pandas_to_torch_tensor for memory leaks.", init=None, - code=lambda: convert_pandas_to_torch_tensor( - df, columns=[["a"]], column_dtypes=[torch.int] - ), + code=code, repeats=10, ) assert not suspicious_stats diff --git a/python/ray/util/debug.py b/python/ray/util/debug.py index 5d93f9a6fbc29..d7f5a88bc463c 100644 --- a/python/ray/util/debug.py +++ b/python/ray/util/debug.py @@ -1,11 +1,13 @@ from collections import defaultdict, namedtuple -import numpy as np +import gc import os import re import time import tracemalloc from typing import Callable, List, Optional +import numpy as np + from ray.util.annotations import DeveloperAPI _logged = set() @@ -137,7 +139,11 @@ def _i_print(i): # Run `code` n times, each time taking a memory snapshot. for i in range(actual_repeats): _i_print(i) + # Manually trigger garbage collection before and after code runs in order to + # make tracemalloc snapshots as accurate as possible. + gc.collect() code() + gc.collect() _take_snapshot(table, suspicious) print("\n")