Skip to content

Commit

Permalink
Prototype fix of large-bs dataloader
Browse files Browse the repository at this point in the history
  • Loading branch information
Quentin-Anthony committed Mar 14, 2023
1 parent dd5a53d commit 7d682df
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 4 deletions.
2 changes: 1 addition & 1 deletion megatron/data/gpt2_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):

# Total number of samples. For -1 see comments in `_num_epochs`.
num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32)
sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int64)

# Index into sample_idx.
sample_index = 0
Expand Down
6 changes: 3 additions & 3 deletions megatron/data/helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ py::array build_sample_idx(const py::array_t<int32_t>& sizes_,

// Mapping and its length (1D).
int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length;
int32_t* sample_idx = new int32_t[2 * (num_samples + 1)];
int64_t* sample_idx = new int64_t[2 * (num_samples + 1)];

cout << " using:" << endl << std::flush;
cout << " number of documents: " << doc_idx_.shape(0) / num_epochs << endl
Expand Down Expand Up @@ -161,12 +161,12 @@ py::array build_sample_idx(const py::array_t<int32_t>& sizes_,

// Method to deallocate memory.
py::capsule free_when_done(sample_idx, [](void* mem_) {
int32_t* mem = reinterpret_cast<int32_t*>(mem_);
int64_t* mem = reinterpret_cast<int64_t*>(mem_);
delete[] mem;
});

// Return the numpy array.
const auto byte_size = sizeof(int32_t);
const auto byte_size = sizeof(int64_t);
return py::array(std::vector<int64_t>{num_samples + 1, 2}, // shape
{2 * byte_size, byte_size}, // C-style contiguous strides
sample_idx, // the data pointer
Expand Down

0 comments on commit 7d682df

Please sign in to comment.