Handle substreams for fine tuning data prep
Signed-off-by: Alex-Brooks <[email protected]>
alex-jw-brooks committed Sep 12, 2023
1 parent fca1afa commit 4bc9a16
Showing 1 changed file with 17 additions and 6 deletions.
caikit_nlp/modules/text_generation/text_generation_local.py (23 changes: 17 additions & 6 deletions)
@@ -514,8 +514,6 @@ def _preprocess_function(
         use_iterable_dataset: bool,
     ):
         """Pre-process each example to get it prepared for training."""
-        if base_model.REQUIRES_TOKEN_UNWRAPPING:
-            raise NotImplementedError("Token unwrapping not implemented for fine tuning data prep")
         if use_iterable_dataset:
             # Generator based
             log.debug("Loading data as an iterable dataset")
@@ -525,11 +523,18 @@ def _preprocess_function(
         else:
             # Convert the train stream to an normal dataset in memory
             log.debug("Loading data as a normal dataset")
+            # TODO: Optimize and clean this up!
             inputs = []
             outputs = []
-            for datum in train_stream:
-                inputs.append(datum.input)
-                outputs.append(datum.output)
+            if base_model.REQUIRES_TOKEN_UNWRAPPING:
+                for substream in train_stream:
+                    for data in substream:
+                        inputs.append(data.input)
+                        outputs.append(data.output)
+            else:
+                for data in train_stream:
+                    inputs.append(data.input)
+                    outputs.append(data.output)
             dataset = Dataset.from_dict({"input": inputs, "output": outputs})
             # Map our HF datasets; with our tokenizer functions
             mapped_dataset = dataset.map(
@@ -566,4 +571,10 @@ def _launch_training(

         def get(train_stream):
             for data in train_stream:
-                yield {"input": data.input, "output": data.output}
+                # Handle token unwrapping for causal language modeling
+                if isinstance(data, DataStream):
+                    for datum in data:
+                        yield {"input": datum.input, "output": datum.output}
+                # Otherwise assume we directly yield dictionaries
+                else:
+                    yield {"input": data.input, "output": data.output}

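For context, a minimal sketch of the pattern this commit introduces: when the base model requires token unwrapping, the training stream yields substreams of records rather than individual records, and data prep either flattens them into parallel input/output lists (the in-memory Dataset path) or yields one dict per record on the fly (the generator path). The sketch below uses plain Python lists and a namedtuple as stand-ins for caikit's DataStream and the train-record data model, so the names Record, StreamItem, flatten_examples, and iter_examples are illustrative assumptions, not the caikit_nlp API.

# A minimal sketch (not part of the commit) of the substream-handling pattern above.
from collections import namedtuple
from typing import Iterable, List, Tuple, Union

# Stand-ins for caikit's train record and DataStream; illustrative only.
Record = namedtuple("Record", ["input", "output"])
StreamItem = Union[Record, List[Record]]


def flatten_examples(
    train_stream: Iterable[StreamItem], requires_token_unwrapping: bool
) -> Tuple[list, list]:
    """Eager path: build parallel input/output lists, unwrapping substreams if needed."""
    inputs, outputs = [], []
    if requires_token_unwrapping:
        # Each element of the stream is itself a substream of records
        for substream in train_stream:
            for record in substream:
                inputs.append(record.input)
                outputs.append(record.output)
    else:
        for record in train_stream:
            inputs.append(record.input)
            outputs.append(record.output)
    return inputs, outputs


def iter_examples(train_stream: Iterable[StreamItem]):
    """Generator path: yield one dict per record, unwrapping nested substreams on the fly."""
    for item in train_stream:
        # A plain list stands in for the commit's isinstance(data, DataStream) check
        if isinstance(item, list):
            for record in item:
                yield {"input": record.input, "output": record.output}
        else:
            yield {"input": item.input, "output": item.output}


if __name__ == "__main__":
    nested = [[Record("a", "b"), Record("c", "d")], [Record("e", "f")]]
    flat = [Record("x", "y"), Record("p", "q")]
    print(flatten_examples(nested, requires_token_unwrapping=True))
    print(flatten_examples(flat, requires_token_unwrapping=False))
    print(list(iter_examples(nested + flat)))

Either way, each yielded or collected record ends up as a flat {"input": ..., "output": ...} pair, which is what the downstream tokenizer mapping expects.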