support LLM augmentation ops and support vllm #338

Merged · 22 commits · Aug 29, 2024
Changes from 1 commit
fix unittest bug
Cathy0908 committed Aug 29, 2024
commit 14d27cfb06a525e272974b4d6c4d0205b62179a7
8 changes: 6 additions & 2 deletions — data_juicer/format/empty_formatter.py

@@ -28,18 +28,22 @@ def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):

     @property
     def null_value(self):
-        return {}
+        return None

     def load_dataset(self, *args, **kwargs):
         data_dict = {}
         features = Features()

         for key in self.feature_keys:
             features.update({key: Value('string')})
-            data_dict.update({key: [self.null_value] * self.length})
+            data_dict.update(
+                {key: [self.null_value for _ in range(self.length)]})

         empty_dataset = Dataset.from_dict(data_dict, features=features)

+        from data_juicer.core.data import NestedDataset
+        empty_dataset = NestedDataset(empty_dataset)
+
         return empty_dataset
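The substance of this fix is the `null_value` change: with columns typed as `Value('string')`, HF `datasets` accepts `None` as a missing value, but a dict placeholder like `{}` cannot be cast to a string column. A minimal sketch against the plain `datasets` API (outside data-juicer; the column name is assumed for illustration):

```python
from datasets import Dataset, Features, Value

features = Features({'text': Value('string')})

# None is a valid null for a string column.
ok = Dataset.from_dict({'text': [None] * 3}, features=features)
print(ok[0])  # {'text': None}

# A dict placeholder should fail the cast to Value('string').
try:
    Dataset.from_dict({'text': [{}] * 3}, features=features)
except Exception as err:
    print(f'rejected as expected: {type(err).__name__}')
```

Wrapping the result in `NestedDataset` also means downstream ops receive the same dataset type as the regular formatters produce.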
5 changes: 4 additions & 1 deletion — data_juicer/ops/mapper/extract_qa_mapper.py

@@ -54,6 +54,7 @@ def __init__(self,
         """
         Initialization method.
         :param hf_model: Hugginface model id.
+        :param trust_remote_code: passed to transformers
         :param pattern: regular expression pattern to search for within text.
         :param qa_format: Output format of question and answer pair.
         :param enable_vllm: Whether to use vllm for inference acceleration.
@@ -106,14 +107,16 @@ def __init__(self,
             self.model_key = prepare_model(
                 model_type='vllm',
                 pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code,
                 tensor_parallel_size=tensor_parallel_size,
                 max_model_len=max_model_len,
                 max_num_seqs=max_num_seqs)
             self.sampling_params = SamplingParams(**sampling_params)
         else:
             self.model_key = prepare_model(
                 model_type='huggingface',
-                pretrained_model_name_or_path=hf_model)
+                pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code)
             self.sampling_params = sampling_params

     def _extract_qa(self, output):
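For context on the new flag: `trust_remote_code` is what allows Hugging Face repos that ship custom modeling code to be loaded at all, and vLLM exposes the same switch on its engine constructor. A sketch of the two loading paths the argument feeds into — the wrapper functions here are illustrative, not data-juicer's actual `prepare_model`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_huggingface(model_id: str, trust_remote_code: bool = False):
    # Repos with custom modeling code raise unless the flag is forwarded.
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=trust_remote_code)
    return model, tok


def load_vllm(model_id: str, trust_remote_code: bool = False, **engine_kwargs):
    from vllm import LLM  # local import: vllm is an optional dependency
    # vllm.LLM takes the same flag, alongside tensor_parallel_size,
    # max_model_len, max_num_seqs, etc.
    return LLM(model=model_id, trust_remote_code=trust_remote_code, **engine_kwargs)
```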
10 changes: 7 additions & 3 deletions — data_juicer/ops/mapper/generate_instruction_mapper.py

@@ -41,8 +41,9 @@ class GenerateInstructionMapper(Mapper):
     You should configure an empty dataset in your yaml config file:
     ```
     generated_dataset_config:
-      type: 'EmptyFormatter'
+      type: 'EmptyFormatter'  # use `RayEmptyFormatter` when enable ray
       length: ${The number of generated samples}
+      feature_keys: ${text key}
     ```
     The number of samples generated is determined by
     the length of the empty dataset.
@@ -53,6 +54,7 @@ def __init__(self,
                  hf_model,
                  seed_file,
                  instruct_num,
+                 trust_remote_code: bool = False,
                  similarity_threshold: float = 0.7,
                  prompt_template: str = None,
                  qa_pair_template: str = None,
@@ -73,7 +75,7 @@
         :param instruct_num: The number of instruction samples.
             Randomly select N samples from "seed_file" and
             put them into prompt as instruction samples.
-        :param generate_num: The number of generated samples.
+        :param trust_remote_code: passed to transformers
         :param similarity_threshold: The similarity score threshold
             between the generated samples and the seed samples.
             Range from 0 to 1. Samples with similarity score less than
@@ -140,14 +142,16 @@ def __init__(self,
             self.model_key = prepare_model(
                 model_type='vllm',
                 pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code,
                 tensor_parallel_size=tensor_parallel_size,
                 max_model_len=max_model_len,
                 max_num_seqs=max_num_seqs)
             self.sampling_params = SamplingParams(**sampling_params)
         else:
             self.model_key = prepare_model(
                 model_type='huggingface',
-                pretrained_model_name_or_path=hf_model)
+                pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code)
             self.sampling_params = sampling_params

         self.seed_qa_samples = self.load_seed_qa_samples(seed_file)
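Putting the updated docstring together with the new `feature_keys` field, a full config exercising this op might look like the following sketch. All concrete values (model id, seed path, counts) are placeholders for illustration, not defaults from the repo:

```yaml
# illustrative config, not taken verbatim from the repo
generated_dataset_config:
  type: 'EmptyFormatter'    # 'RayEmptyFormatter' when running on ray
  length: 100               # how many samples to generate
  feature_keys: ['text']    # columns of the empty dataset

process:
  - generate_instruction_mapper:
      hf_model: 'Qwen/Qwen2-7B-Instruct'  # placeholder model id
      seed_file: 'demo/seed_qa.jsonl'     # placeholder path
      instruct_num: 3
      trust_remote_code: false
```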
6 changes: 5 additions & 1 deletion — data_juicer/ops/mapper/optimize_instruction_mapper.py

@@ -33,6 +33,7 @@ class OptimizeInstructionMapper(Mapper):

     def __init__(self,
                  hf_model: str = 'alibaba-pai/Qwen2-7B-Instruct-Refine',
+                 trust_remote_code: bool = False,
                  system_prompt: str = None,
                  enable_vllm: bool = True,
                  tensor_parallel_size: int = None,
@@ -44,6 +45,7 @@
         """
         Initialization method.
         :param hf_model: Hugginface model id.
+        :param trust_remote_code: passed to transformers
         :param system_prompt: System prompt for optimize samples.
         :param enable_vllm: Whether to use vllm for inference acceleration.
         :param tensor_parallel_size: It is only valid when enable_vllm is True.
@@ -78,14 +80,16 @@ def __init__(self,
             self.model_key = prepare_model(
                 model_type='vllm',
                 pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code,
                 tensor_parallel_size=tensor_parallel_size,
                 max_model_len=max_model_len,
                 max_num_seqs=max_num_seqs)
             self.sampling_params = SamplingParams(**sampling_params)
         else:
             self.model_key = prepare_model(
                 model_type='huggingface',
-                pretrained_model_name_or_path=hf_model)
+                pretrained_model_name_or_path=hf_model,
+                trust_remote_code=trust_remote_code)
             self.sampling_params = sampling_params

     def process(self, sample=None, rank=None):
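A quick way to exercise the new parameter end to end is to construct the op directly. A minimal sketch, assuming the default refine model; `enable_vllm=False` keeps it on the plain transformers path so both branches of `__init__` can be smoke-tested:

```python
from data_juicer.ops.mapper.optimize_instruction_mapper import \
    OptimizeInstructionMapper

# trust_remote_code is now forwarded on both the vllm and the
# huggingface branch of __init__.
op = OptimizeInstructionMapper(
    hf_model='alibaba-pai/Qwen2-7B-Instruct-Refine',
    trust_remote_code=True,
    enable_vllm=False,
)
```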