support LLM augmentation ops and support vllm #338

Merged · 22 commits · Aug 29, 2024
Changes from 1 commit
fix unittest bug
Cathy0908 committed Aug 29, 2024
commit 14d27cfb06a525e272974b4d6c4d0205b62179a7
8 changes: 6 additions & 2 deletions — data_juicer/format/empty_formatter.py

@@ -28,18 +28,22 @@ def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):

     @property
     def null_value(self):
-        return {}
+        return None

     def load_dataset(self, *args, **kwargs):
         data_dict = {}
         features = Features()

         for key in self.feature_keys:
             features.update({key: Value('string')})
-            data_dict.update({key: [self.null_value] * self.length})
+            data_dict.update(
+                {key: [self.null_value for _ in range(self.length)]})

         empty_dataset = Dataset.from_dict(data_dict, features=features)

+        from data_juicer.core.data import NestedDataset
+        empty_dataset = NestedDataset(empty_dataset)
+
         return empty_dataset
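The substance of this fix is the `null_value` change: with columns typed as `Value('string')`, HF `datasets` accepts `None` as a missing value, but a dict placeholder like `{}` cannot be cast to a string column. A minimal sketch against the plain `datasets` API (outside data-juicer; the column name is assumed for illustration):

```python
from datasets import Dataset, Features, Value

features = Features({'text': Value('string')})

# None is a valid null for a string column.
ok = Dataset.from_dict({'text': [None] * 3}, features=features)
print(ok[0])  # {'text': None}

# A dict placeholder should fail the cast to Value('string').
try:
    Dataset.from_dict({'text': [{}] * 3}, features=features)
except Exception as err:
    print(f'rejected as expected: {type(err).__name__}')
```

Wrapping the result in `NestedDataset` also means downstream ops receive the same dataset type as the regular formatters produce.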
5 changes: 4 additions & 1 deletion — data_juicer/ops/mapper/extract_qa_mapper.py

@@ -54,6 +54,7 @@ def __init__(self,
         """
         Initialization method.
         :param hf_model: Hugginface model id.
+        :param trust_remote_code: passed to transformers
         :param pattern: regular expression pattern to search for within text.
         :param qa_format: Output format of question and answer pair.
         :param enable_vllm: Whether to use vllm for inference acceleration.
@@ -106,14 +107,16 @@ def __init__(self,
             self.model_key = prepare_model(
                 model_type='vllm',
                 pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code,
                 tensor_parallel_size=tensor_parallel_size,
                 max_model_len=max_model_len,
                 max_num_seqs=max_num_seqs)
             self.sampling_params = SamplingParams(**sampling_params)
         else:
             self.model_key = prepare_model(
                 model_type='huggingface',
-                pretrained_model_name_or_path=hf_model)
+                pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code)
             self.sampling_params = sampling_params

     def _extract_qa(self, output):
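For context on the new flag: `trust_remote_code` is what allows Hugging Face repos that ship custom modeling code to be loaded at all, and vLLM exposes the same switch on its engine constructor. A sketch of the two loading paths the argument feeds into — the wrapper functions here are illustrative, not data-juicer's actual `prepare_model`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_huggingface(model_id: str, trust_remote_code: bool = False):
    # Repos with custom modeling code raise unless the flag is forwarded.
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=trust_remote_code)
    return model, tok


def load_vllm(model_id: str, trust_remote_code: bool = False, **engine_kwargs):
    from vllm import LLM  # local import: vllm is an optional dependency
    # vllm.LLM takes the same flag, alongside tensor_parallel_size,
    # max_model_len, max_num_seqs, etc.
    return LLM(model=model_id, trust_remote_code=trust_remote_code, **engine_kwargs)
```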
10 changes: 7 additions & 3 deletions — data_juicer/ops/mapper/generate_instruction_mapper.py

@@ -41,8 +41,9 @@ class GenerateInstructionMapper(Mapper):
     You should configure an empty dataset in your yaml config file:
     ```
     generated_dataset_config:
-      type: 'EmptyFormatter'
+      type: 'EmptyFormatter'  # use `RayEmptyFormatter` when enable ray
       length: ${The number of generated samples}
+      feature_keys: ${text key}
     ```
     The number of samples generated is determined by
     the length of the empty dataset.
@@ -53,6 +54,7 @@ def __init__(self,
                  hf_model,
                  seed_file,
                  instruct_num,
+                 trust_remote_code: bool = False,
                  similarity_threshold: float = 0.7,
                  prompt_template: str = None,
                  qa_pair_template: str = None,
@@ -73,7 +75,7 @@
         :param instruct_num: The number of instruction samples.
             Randomly select N samples from "seed_file" and
             put them into prompt as instruction samples.
-        :param generate_num: The number of generated samples.
+        :param trust_remote_code: passed to transformers
         :param similarity_threshold: The similarity score threshold
             between the generated samples and the seed samples.
             Range from 0 to 1. Samples with similarity score less than
@@ -140,14 +142,16 @@ def __init__(self,
             self.model_key = prepare_model(
                 model_type='vllm',
                 pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code,
                 tensor_parallel_size=tensor_parallel_size,
                 max_model_len=max_model_len,
                 max_num_seqs=max_num_seqs)
             self.sampling_params = SamplingParams(**sampling_params)
         else:
             self.model_key = prepare_model(
                 model_type='huggingface',
-                pretrained_model_name_or_path=hf_model)
+                pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code)
             self.sampling_params = sampling_params

         self.seed_qa_samples = self.load_seed_qa_samples(seed_file)
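Putting the updated docstring together with the new `feature_keys` field, a full config exercising this op might look like the following sketch. All concrete values (model id, seed path, counts) are placeholders for illustration, not defaults from the repo:

```yaml
# illustrative config, not taken verbatim from the repo
generated_dataset_config:
  type: 'EmptyFormatter'    # 'RayEmptyFormatter' when running on ray
  length: 100               # how many samples to generate
  feature_keys: ['text']    # columns of the empty dataset

process:
  - generate_instruction_mapper:
      hf_model: 'Qwen/Qwen2-7B-Instruct'  # placeholder model id
      seed_file: 'demo/seed_qa.jsonl'     # placeholder path
      instruct_num: 3
      trust_remote_code: false
```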
6 changes: 5 additions & 1 deletion — data_juicer/ops/mapper/optimize_instruction_mapper.py

@@ -33,6 +33,7 @@ class OptimizeInstructionMapper(Mapper):

     def __init__(self,
                  hf_model: str = 'alibaba-pai/Qwen2-7B-Instruct-Refine',
+                 trust_remote_code: bool = False,
                  system_prompt: str = None,
                  enable_vllm: bool = True,
                  tensor_parallel_size: int = None,
@@ -44,6 +45,7 @@
         """
         Initialization method.
         :param hf_model: Hugginface model id.
+        :param trust_remote_code: passed to transformers
         :param system_prompt: System prompt for optimize samples.
         :param enable_vllm: Whether to use vllm for inference acceleration.
         :param tensor_parallel_size: It is only valid when enable_vllm is True.
@@ -78,14 +80,16 @@ def __init__(self,
             self.model_key = prepare_model(
                 model_type='vllm',
                 pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code,
                 tensor_parallel_size=tensor_parallel_size,
                 max_model_len=max_model_len,
                 max_num_seqs=max_num_seqs)
             self.sampling_params = SamplingParams(**sampling_params)
         else:
             self.model_key = prepare_model(
                 model_type='huggingface',
-                pretrained_model_name_or_path=hf_model)
+                pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code)
             self.sampling_params = sampling_params

     def process(self, sample=None, rank=None):
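A quick way to exercise the new parameter end to end is to construct the op directly. A minimal sketch, assuming the default refine model; `enable_vllm=False` keeps it on the plain transformers path so both branches of `__init__` can be smoke-tested:

```python
from data_juicer.ops.mapper.optimize_instruction_mapper import \
    OptimizeInstructionMapper

# trust_remote_code is now forwarded on both the vllm and the
# huggingface branch of __init__.
op = OptimizeInstructionMapper(
    hf_model='alibaba-pai/Qwen2-7B-Instruct-Refine',
    trust_remote_code=True,
    enable_vllm=False,
)
```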