Allow the dataset builder to choose int32 or int64 at runtime
Quentin-Anthony committed Mar 15, 2023
1 parent 7d682df commit 2d2eecd
Showing 2 changed files with 104 additions and 12 deletions.
17 changes: 11 additions & 6 deletions megatron/data/gpt2_dataset.py
@@ -157,7 +157,7 @@ def _build_index_mappings(
     doc_idx = _build_doc_idx(documents, num_epochs, np_rng)
     np.save(doc_idx_filename, doc_idx, allow_pickle=True)
     print_rank_0(
-        " > elasped time to build and save doc-idx mapping "
+        " > elapsed time to build and save doc-idx mapping "
         "(seconds): {:4f}".format(time.time() - start_time)
     )
     # sample-idx.
@@ -167,11 +167,16 @@ def _build_index_mappings(
 
     assert doc_idx.dtype == np.int32
     assert sizes.dtype == np.int32
-    sample_idx = helpers.build_sample_idx(
-        sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch
-    )
-    # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
-    #                                num_epochs, tokens_per_epoch)
+
+    num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
+    if 2 * (num_samples + 1) < np.iinfo(np.int32).max:
+        sample_idx = helpers.build_sample_idx_int32(
+            sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch
+        )
+    else:
+        sample_idx = helpers.build_sample_idx_int64(
+            sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch
+        )
     np.save(sample_idx_filename, sample_idx, allow_pickle=True)
     print_rank_0(
         " > elapsed time to build and save sample-idx mapping "
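The new guard is plain arithmetic: the flat index buffer holds 2 * (num_samples + 1) entries, so the int32 builder is only safe while that count stays below np.iinfo(np.int32).max (about 2.1e9; at seq_length = 2048 the crossover sits near 2.2e12 training tokens). A minimal sketch of the same decision, with pick_sample_idx_dtype as an illustrative name rather than anything in the codebase:

import numpy as np

def pick_sample_idx_dtype(num_epochs, tokens_per_epoch, seq_length):
    # Mirror of the guard above: every slot of the (num_samples + 1, 2)
    # index must be countable, and its values representable, in int32.
    num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
    if 2 * (num_samples + 1) < np.iinfo(np.int32).max:
        return np.int32
    return np.int64

print(pick_sample_idx_dtype(1, 10**9, 2048))   # int32: ~4.9e5 samples
print(pick_sample_idx_dtype(3, 10**12, 2048))  # int64: ~1.5e9 samples

Choosing int32 when it fits also halves the on-disk size of the cached sample-idx file compared to always building int64.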
99 changes: 93 additions & 6 deletions megatron/data/helpers.cpp
@@ -88,11 +88,97 @@ void build_blending_indices(py::array_t<uint8_t>& dataset_index,
     }
 }
 
-py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
-                           const py::array_t<int32_t>& doc_idx_,
-                           const int32_t seq_length,
-                           const int32_t num_epochs,
-                           const int64_t tokens_per_epoch)
+py::array build_sample_idx_int32(const py::array_t<int32_t>& sizes_,
+                                 const py::array_t<int32_t>& doc_idx_,
+                                 const int32_t seq_length,
+                                 const int32_t num_epochs,
+                                 const int64_t tokens_per_epoch)
+{
+    /* Sample index (sample_idx) is used for gpt2-like datasets for which
+       the documents are flattened and the samples are built based on this
+       1-D flattened array. It is a 2D array with sizes
+       [number-of-samples + 1, 2] where [..., 0] contains the index into
+       `doc_idx` and [..., 1] is the starting offset in that document. */
+
+    // Consistency checks.
+    assert(seq_length > 1);
+    assert(num_epochs > 0);
+    assert(tokens_per_epoch > 1);
+
+    // Remove bounds checks.
+    auto sizes = sizes_.unchecked<1>();
+    auto doc_idx = doc_idx_.unchecked<1>();
+
+    // Mapping and its length (1D).
+    int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length;
+    int32_t* sample_idx = new int32_t[2 * (num_samples + 1)];
+
+    cout << " using:" << endl << std::flush;
+    cout << " number of documents: " << doc_idx_.shape(0) / num_epochs << endl << std::flush;
+    cout << " number of epochs: " << num_epochs << endl << std::flush;
+    cout << " sequence length: " << seq_length << endl << std::flush;
+    cout << " total number of samples: " << num_samples << endl << std::flush;
+
+    // Index into sample_idx.
+    int64_t sample_index = 0;
+    // Index into doc_idx.
+    int64_t doc_idx_index = 0;
+    // Beginning offset for each document.
+    int32_t doc_offset = 0;
+    // Start with first document and no offset.
+    sample_idx[2 * sample_index] = doc_idx_index;
+    sample_idx[2 * sample_index + 1] = doc_offset;
+    ++sample_index;
+
+    while (sample_index <= num_samples) {
+        // Start with a fresh sequence.
+        int32_t remaining_seq_length = seq_length + 1;
+        while (remaining_seq_length != 0) {
+            // Get the document length.
+            auto doc_id = doc_idx[doc_idx_index];
+            auto doc_length = sizes[doc_id] - doc_offset;
+            // And add it to the current sequence.
+            remaining_seq_length -= doc_length;
+            // If we have more than a full sequence, adjust offset and set
+            // remaining length to zero so we return from the while loop.
+            // Note that -1 here is for the same reason we have -1 in
+            // `_num_epochs` calculations.
+            if (remaining_seq_length <= 0) {
+                doc_offset += (remaining_seq_length + doc_length - 1);
+                remaining_seq_length = 0;
+            } else {
+                // Otherwise, start from the beginning of the next document.
+                ++doc_idx_index;
+                doc_offset = 0;
+            }
+        }
+        // Record the sequence.
+        sample_idx[2 * sample_index] = doc_idx_index;
+        sample_idx[2 * sample_index + 1] = doc_offset;
+        ++sample_index;
+    }
+
+    // Capsule that frees the buffer once the returned numpy array
+    // is garbage collected.
+    py::capsule free_when_done(sample_idx, [](void* mem_) {
+        int32_t* mem = reinterpret_cast<int32_t*>(mem_);
+        delete[] mem;
+    });
+
+    // Return a zero-copy numpy array over the buffer; the capsule
+    // keeps the memory alive for the array's lifetime.
+    const auto byte_size = sizeof(int32_t);
+    return py::array(std::vector<int64_t>{num_samples + 1, 2},  // shape
+                     {2 * byte_size, byte_size},  // C-style contiguous strides
+                     sample_idx,                  // the data pointer
+                     free_when_done);             // capsule owning the data
+}
+
+
+py::array build_sample_idx_int64(const py::array_t<int32_t>& sizes_,
+                                 const py::array_t<int32_t>& doc_idx_,
+                                 const int32_t seq_length,
+                                 const int32_t num_epochs,
+                                 const int64_t tokens_per_epoch)
 {
     /* Sample index (sample_idx) is used for gpt2-like datasets for which
        the documents are flattened and the samples are built based on this
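For readers who prefer Python, here is a rough pure-Python port of the packing loop above, in the spirit of the commented-out _build_sample_idx fallback that the Python diff removed. The function name and dtype parameter are illustrative; the C++ builders remain what the dataset actually calls:

import numpy as np

def build_sample_idx_py(sizes, doc_idx, seq_length, num_epochs,
                        tokens_per_epoch, dtype=np.int64):
    # Rows i and i + 1 bracket sample i as (doc_idx position, token offset).
    num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
    sample_idx = np.zeros((num_samples + 1, 2), dtype=dtype)

    sample_index, doc_idx_index, doc_offset = 0, 0, 0
    sample_idx[sample_index] = (doc_idx_index, doc_offset)
    sample_index += 1

    while sample_index <= num_samples:
        # Each sample needs seq_length + 1 tokens (inputs plus shifted targets).
        remaining_seq_length = seq_length + 1
        while remaining_seq_length != 0:
            doc_length = sizes[doc_idx[doc_idx_index]] - doc_offset
            remaining_seq_length -= doc_length
            if remaining_seq_length <= 0:
                # The sequence ends inside this document; record the offset.
                doc_offset += remaining_seq_length + doc_length - 1
                remaining_seq_length = 0
            else:
                # Document exhausted; continue from the start of the next one.
                doc_idx_index += 1
                doc_offset = 0
        sample_idx[sample_index] = (doc_idx_index, doc_offset)
        sample_index += 1
    return sample_idx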
@@ -665,6 +751,7 @@ PYBIND11_MODULE(helpers, m)
 {
     m.def("build_mapping", &build_mapping);
     m.def("build_blocks_mapping", &build_blocks_mapping);
-    m.def("build_sample_idx", &build_sample_idx);
+    m.def("build_sample_idx_int32", &build_sample_idx_int32);
+    m.def("build_sample_idx_int64", &build_sample_idx_int64);
     m.def("build_blending_indices", &build_blending_indices);
 }
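A hypothetical smoke test for the new bindings, assuming the compiled helpers extension is importable and using made-up document sizes:

import numpy as np
from megatron.data import helpers  # compiled from helpers.cpp

sizes = np.array([5, 3, 8], dtype=np.int32)  # tokens per document
doc_idx = np.arange(3, dtype=np.int32)       # one epoch, documents in order
seq_length, num_epochs = 4, 1
tokens_per_epoch = int(sizes.sum())

sample_idx = helpers.build_sample_idx_int32(
    sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch
)
assert sample_idx.dtype == np.int32
num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
assert sample_idx.shape == (num_samples + 1, 2)

The int64 variant takes the same arguments and differs only in the element type of the returned array.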
