Skip to content

Commit

Permalink
[Datasets] Fix Torch tensor memory leak test. (ray-project#31748)
Browse files Browse the repository at this point in the history
The Torch tensor memory leak test was suffering from a noisy-neighbor issue, and was low-signal because the base tensor allocation was not performed inside the leak-tested code. This PR fixes both problems by triggering explicit garbage collection before and after each run of the leak-tested code, and by moving the base tensor allocation into the leak-tested code.
  • Loading branch information
clarkzinzow committed Jan 19, 2023
1 parent 14b78b3 commit b3a6a58
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 7 deletions.
15 changes: 9 additions & 6 deletions python/ray/train/tests/test_torch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,18 @@ def test_tensor_column_no_memory_leak(self):
# column (e.g. post-casting from extension type) doesn't leak memory. Casting
# these tensors directly with torch.as_tensor() currently leaks memory; see
# https://github.com/ray-project/ray/issues/30629#issuecomment-1330954556
col = np.empty(1000, dtype=object)
col[:] = [np.ones((100, 100)) for _ in range(1000)]
df = pd.DataFrame({"a": col})
def code():
col = np.empty(1000, dtype=object)
col[:] = [np.ones((100, 100)) for _ in range(1000)]
df = pd.DataFrame({"a": col})
convert_pandas_to_torch_tensor(
df, columns=[["a"]], column_dtypes=[torch.int]
)

suspicious_stats = _test_some_code_for_memory_leaks(
desc="Testing convert_pandas_to_torch_tensor for memory leaks.",
init=None,
code=lambda: convert_pandas_to_torch_tensor(
df, columns=[["a"]], column_dtypes=[torch.int]
),
code=code,
repeats=10,
)
assert not suspicious_stats
Expand Down
8 changes: 7 additions & 1 deletion python/ray/util/debug.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from collections import defaultdict, namedtuple
import numpy as np
import gc
import os
import re
import time
import tracemalloc
from typing import Callable, List, Optional

import numpy as np

from ray.util.annotations import DeveloperAPI

_logged = set()
Expand Down Expand Up @@ -137,7 +139,11 @@ def _i_print(i):
# Run `code` n times, each time taking a memory snapshot.
for i in range(actual_repeats):
_i_print(i)
# Manually trigger garbage collection before and after code runs in order to
# make tracemalloc snapshots as accurate as possible.
gc.collect()
code()
gc.collect()
_take_snapshot(table, suspicious)
print("\n")

Expand Down

0 comments on commit b3a6a58

Please sign in to comment.