EleutherAI · haileyschoelkopf · Jun 3, 2024 · May 8, 2024 · May 8, 2024 · May 8, 2024
@@ -44,6 +44,12 @@ This mode supports a number of command-line arguments, the details of which can
 
 - `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing `lm-eval` compatible task configurations will be added to the task registry as available tasks. Used for when one is writing config files for their own task in a folder other than `lm_eval/tasks/`.
 
+- `--system_instruction`: Specifies a system instruction string to prepend to the prompt.
+
+- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer; if the tokenizer does not have a chat template, a default one will be applied. For other models, a generic chat template is used.
+
+- `--fewshot_as_multiturn` : If this flag is on, the few-shot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be greater than 0 and `--apply_chat_template` to be on.
+
 - `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.
 
 * `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`. E.g., `--seed 42` sets all three seeds to 42.

@@ -162,6 +162,24 @@ def setup_parser() -> argparse.ArgumentParser:
  default=False,
  help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
  )
+ parser.add_argument(
+ "--system_instruction",
+ type=str,
+ default=None,
+ help="System instruction to be used in the prompt",
+ )
+ parser.add_argument(
+ "--apply_chat_template",
+ action="store_true",
+ default=False,
+ help="If True, applies the chat template to the prompt",
+ )
+ parser.add_argument(
+ "--fewshot_as_multiturn",
+ action="store_true",
+ default=False,
+ help="If True, uses the fewshot as a multi-turn conversation",
+ )
  parser.add_argument(
  "--show_config",
  action="store_true",
@@ -270,6 +288,16 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
  "Specify --output_path if providing --log_samples or --predict_only"
  )
 
+ # --fewshot_as_multiturn only makes sense on top of a chat template and with
+ # at least one few-shot example, so both prerequisites are validated only
+ # when the flag is actually set.
+ if args.fewshot_as_multiturn:
+     if args.apply_chat_template is False:
+         raise ValueError(
+             "If fewshot_as_multiturn is set, apply_chat_template must be set to True."
+         )
+     # Nesting under the flag matters: the flat form `A or B and C` parses as
+     # `A or (B and C)`, which would reject every run that leaves
+     # --num_fewshot unset, whether or not the flag was passed.
+     if args.num_fewshot is None or args.num_fewshot == 0:
+         raise ValueError(
+             "If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
+         )
+
  if args.include_path is not None:
  eval_logger.info(f"Including path: {args.include_path}")
  task_manager = TaskManager(args.verbosity, include_path=args.include_path)
@@ -357,6 +385,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
  check_integrity=args.check_integrity,
  write_out=args.write_out,
  log_samples=args.log_samples,
+ system_instruction=args.system_instruction,
+ apply_chat_template=args.apply_chat_template,
+ fewshot_as_multiturn=args.fewshot_as_multiturn,
  gen_kwargs=args.gen_kwargs,
  task_manager=task_manager,
  verbosity=args.verbosity,

@@ -3,7 +3,7 @@
 import json
 import logging
 import os
-from typing import List, Optional, Tuple, Type, TypeVar
+from typing import Dict, List, Optional, Tuple, Type, TypeVar
 
 import transformers
 from sqlitedict import SqliteDict
@@ -114,6 +114,20 @@ def generate_until(self, requests) -> List[str]:
  """
  pass
 
+ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+     """
+     Render a chat history as a single prompt string that can be fed to the LM.
+
+     This base implementation is a placeholder that always raises; a model class that
+     wants to be usable with chat templates has to provide its own implementation.
+
+     :param chat_history: list[dict[str, str]]
+         Messages in order. Each dictionary has the keys 'role' and 'content',
+         holding the role name and the message text as strings.
+     :return: str
+         The chat history formatted as LM input.
+     :raises NotImplementedError:
+         Always, in this base implementation.
+     """
+     message = "To use this model with chat templates, please implement the 'apply_chat_template' method."
+     raise NotImplementedError(message)
+
  @classmethod
  def create_from_arg_string(
  cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
@@ -169,6 +183,12 @@ def world_size(self):
  # not support multi-device parallelism nor expect it.
  return self._world_size
 
+ @property
+ def tokenizer_name(self) -> str:
+     """
+     Name identifying the tokenizer used by this model.
+
+     This base implementation always raises; a model class that wants to be usable
+     with chat templates has to override this property.
+
+     :return: str
+         The tokenizer name.
+     :raises NotImplementedError:
+         Always, in this base implementation.
+     """
+     # The message must name the property as it is actually declared
+     # (`tokenizer_name`), otherwise implementers are sent looking for a
+     # `get_tokenizer_name` attribute that nothing ever reads.
+     raise NotImplementedError(
+         "To use this model with chat templates, please implement the 'tokenizer_name' property."
+     )
+
  def set_cache_hook(self, cache_hook) -> None:
  self.cache_hook = cache_hook
 

@@ -35,37 +35,79 @@ def get_context(self, doc, num_fewshot):
  # TODO: should we just stop people from using fewshot from same split as evaluating?
  selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
 
- labeled_examples = (
- self.fewshot_delimiter.join(
- [
- # TODO: is separating doc_to_text and doc_to_target by one space always desired?
- (
- self.doc_to_text(doc)
- if (
- self.config.doc_to_choice is None
- or isinstance(self.doc_to_text(doc), str)
- )
- else self.doc_to_choice(doc)[self.doc_to_text(doc)]
- )
- + self.target_delimiter
- + (
- str(self.doc_to_target(doc)[0])
- if isinstance(self.doc_to_target(doc), list)
- else self.doc_to_target(doc)
- if (
- self.config.doc_to_choice is None
- or isinstance(self.doc_to_target(doc), str)
- )
- else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
- )
- for doc in selected_docs
- ]
+ labeled_examples = ""
+ for doc in selected_docs:
+ doc_content = self.doc_to_text(doc)
+ doc_target = self.doc_to_target(doc)
+ labeled_examples += (
+ doc_content
+ if self.config.doc_to_choice is None or isinstance(doc_content, str)
+ else self.doc_to_choice(doc)[doc_content]
  )
- + self.fewshot_delimiter
- )
+ labeled_examples += self.target_delimiter
+ labeled_examples += (
+ str(doc_target[0])
+ if isinstance(doc_target, list)
+ else doc_target
+ if self.config.doc_to_choice is None or isinstance(doc_target, str)
+ else str(self.doc_to_choice(doc)[doc_target])
+ )
+ labeled_examples += self.fewshot_delimiter
 
  return labeled_examples
 
+ def get_chat_context(
+     self,
+     doc,
+     num_fewshot,
+     fewshot_as_multiturn: bool = False,
+ ):
+     """
+     Build the few-shot part of a prompt as a list of chat messages.
+
+     :param doc:
+         The document being evaluated; it is filtered out of the sampled few-shot docs.
+     :param num_fewshot: int
+         Maximum number of few-shot examples to keep.
+     :param fewshot_as_multiturn: bool
+         If True, every example becomes a 'user' message (the question) followed by an
+         'assistant' message (the answer). If False, the string returned by
+         `get_context` is wrapped in a single 'user' message.
+     :return: list[dict[str, str]]
+         Messages with the keys 'role' and 'content'.
+     """
+     # Draw one extra sample when few-shot and test splits are the same, so that
+     # dropping `doc` below can still leave `num_fewshot` examples.
+     same_split = self.config.fewshot_split == self.config.test_split
+     n_samples = num_fewshot + 1 if same_split else num_fewshot
+
+     # NOTE: the draw happens before branching; only the multi-turn path uses
+     # `selected_docs`, the single-turn path delegates to `get_context`.
+     fewshotex = self.sample(n_samples)
+
+     # TODO: should we just stop people from using fewshot from same split as evaluating?
+     selected_docs = [cand for cand in fewshotex if cand != doc][:num_fewshot]
+
+     if not fewshot_as_multiturn:
+         # whole few-shot context as one user turn
+         return [{"role": "user", "content": self.get_context(doc, num_fewshot)}]
+
+     chat_history = []
+     for shot in selected_docs:
+         question = self.doc_to_text(shot)
+         answer = self.doc_to_target(shot)
+
+         # With choices configured, a non-string question is used as an index
+         # into the doc's choices.
+         if self.config.doc_to_choice is None or isinstance(question, str):
+             user_content = question
+         else:
+             user_content = self.doc_to_choice(shot)[question]
+         chat_history.append({"role": "user", "content": user_content})
+
+         # A list target contributes its first element; otherwise the target is
+         # used directly, or looked up in the doc's choices when it is not a string.
+         if isinstance(answer, list):
+             assistant_content = str(answer[0])
+         elif self.config.doc_to_choice is None or isinstance(answer, str):
+             assistant_content = answer
+         else:
+             assistant_content = str(self.doc_to_choice(shot)[answer])
+         chat_history.append({"role": "assistant", "content": assistant_content})
+
+     return chat_history
+
  def sample(self, n):
  """
  Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.

@@ -373,13 +373,25 @@ def build_all_requests(
  world_size=None,
  cache_requests=False,
  rewrite_requests_cache=False,
+ system_instruction=None,
+ apply_chat_template=False,
+ fewshot_as_multiturn=False,
+ lm=None,
  ) -> None:
  """Build a set of Instances for a task, and store them in task.instances"""
 
  # used with caching
  og_limit = limit
 
  cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
+ cache_key += "-chat_template" if apply_chat_template else ""
+ cache_key += "-fewshot_as_multiturn" if fewshot_as_multiturn else ""
+ cache_key += (
+ f"-system_prompt_hash{utils.hash_string(system_instruction)}"
+ if system_instruction is not None
+ else ""
+ )
+ cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else ""
 
  cached_instances = load_from_cache(file_name=cache_key)
 
@@ -421,6 +433,10 @@ def build_all_requests(
  fewshot_ctx = self.fewshot_context(
  doc,
  0 if self.config.num_fewshot is None else self.config.num_fewshot,
+ system_instruction,
+ apply_chat_template,
+ fewshot_as_multiturn,
+ lm,
  )
 
  # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
@@ -957,31 +973,128 @@ def fewshot_docs(self):
  )
  return super().fewshot_docs()
 
+ @staticmethod
+ def append_target_question(
+     labeled_examples: List[Dict[str, str]],
+     question: str,
+     fewshot_as_multiturn: bool = False,
+ ) -> None:
+     """Add the target question to the chat messages in `labeled_examples`, in place.
+
+     The question becomes a new 'user' message when `fewshot_as_multiturn` is True,
+     when `labeled_examples` is empty, or when the last message has the role 'system'.
+     In every other case it is concatenated onto the content of the last message, so
+     that no second user message directly follows the previous one.
+
+     :param labeled_examples: list[dict[str, str]]
+         Chat messages with the keys 'role' and 'content'; modified in place.
+     :param question: str
+         Text of the target question.
+     :param fewshot_as_multiturn: bool
+         Whether the few-shot examples were added as separate user/assistant turns.
+     """
+     opens_new_turn = (
+         fewshot_as_multiturn
+         or len(labeled_examples) == 0
+         or labeled_examples[-1]["role"] == "system"
+     )
+     if opens_new_turn:
+         labeled_examples.append({"role": "user", "content": question})
+         return
+     # extend the last message instead of adding another one
+     labeled_examples[-1]["content"] += question
+
  @utils.positional_deprecated
- def fewshot_context(self, doc: str, num_fewshot: int) -> str:
+ def fewshot_context(
+ self,
+ doc: str,
+ num_fewshot: int,
+ system_instruction: Optional[str] = None,
+ apply_chat_template: bool = False,
+ fewshot_as_multiturn: bool = False,
+ lm=None,
+ ) -> str:
  """Returns a fewshot context string that is made up of a prepended description
  (if provided), the `num_fewshot` number of examples, and an appended prompt example.
 
  :param doc: str
  The document as returned from training_docs, validation_docs, or test_docs.
  :param num_fewshot: int
  The number of fewshot examples to provide in the returned context string.
+ :param system_instruction: str
+ System instruction to be applied to the prompt.
+ :param apply_chat_template: bool
+ Whether to apply the chat template to the fewshot context.
+ :param fewshot_as_multiturn: bool
+ Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
+ :param lm:
+ Language model with definition of the tokenizer/function to use for applying the chat template.
  :returns: str
  The fewshot context.
  """
+
+ if apply_chat_template:
+ labeled_examples = []
+ else:
+ labeled_examples = ""
+
+ # get task description
  if description := self.config.description:
  description = utils.apply_template(self.config.description, doc)
 
- if num_fewshot == 0:
- # always prepend the (possibly empty) task description
- labeled_examples = description
+ # create system prompt based on the provided system instruction and description
+ if system_instruction is not None and description:
+ system_prompt = (
+ f"{system_instruction}{self.sampler.fewshot_delimiter}{description}"
+ )
+ elif system_instruction is not None:
+ system_prompt = system_instruction
+ elif description:
+ system_prompt = description
  else:
- labeled_examples = description + self.sampler.get_context(doc, num_fewshot)
+ system_prompt = ""
+
+ # add system prompt if specified
+ if system_prompt:
+ if apply_chat_template:
+ labeled_examples.append({"role": "system", "content": system_prompt})
+ else:
+ labeled_examples = system_prompt
+
+ # if few-shot - append examples after the system prompt
+ if num_fewshot > 0:
+ if apply_chat_template:
+ labeled_examples.extend(
+ self.sampler.get_chat_context(
+ doc, num_fewshot, fewshot_as_multiturn
+ )
+ )
+ else:
+ labeled_examples += self.sampler.get_context(doc, num_fewshot)
 
  example = self.doc_to_text(doc)
- if self.multiple_input:
- return labeled_examples
+ if apply_chat_template:
+ if self.multiple_input:
+ return lm.apply_chat_template(labeled_examples)
+ if isinstance(example, str):
+ self.append_target_question(
+ labeled_examples, example, fewshot_as_multiturn
+ )
+ # for loglikelihood create a list of questions with appended choices
+ elif isinstance(example, list):
+ labeled_examples_list = []
+ # copy chat history for each example and append the answer
+ for ex in example:
+ chat = deepcopy(labeled_examples)
+ self.append_target_question(chat, ex, fewshot_as_multiturn)
+ labeled_examples_list.append(lm.apply_chat_template(chat))
+ return labeled_examples_list
+ # if example is an integer, append the choice or convert to string
+ elif isinstance(example, int):
+ if self.config.doc_to_choice is not None:
+ choices = self.doc_to_choice(doc)
+ self.append_target_question(
+ labeled_examples, choices[example], fewshot_as_multiturn
+ )
+ else:
+ self.append_target_question(
+ labeled_examples, str(example), fewshot_as_multiturn
+ )
+ # return lm.apply_chat_template(labeled_examples)
+ return lm.apply_chat_template(labeled_examples)
  else:
+ if self.multiple_input:
+ return labeled_examples
  if isinstance(example, str):
  return labeled_examples + example
  elif isinstance(example, list):