# templates/new_prompt_source_task.py

# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.

TODO: Write a Short Description of the task.

Homepage: TODO: Add the URL to the task's Homepage here.
"""
from lm_eval.api.task import PromptSourceTask


# TODO: Add the BibTeX citation for the task.
# Placeholder: until the citation is filled in, this string holds only a
# single newline. Paste the BibTeX entry between the triple quotes.
_CITATION = """
"""


# TODO: Replace `NewTask` with the name of your Task.
class NewTask(PromptSourceTask):
    """Skeleton `PromptSourceTask` subclass; resolve every TODO below to define a task."""

    # TODO: Set `DATASET_PATH` to the name of the `Task` dataset as it is
    # denoted in HuggingFace `datasets`.
    DATASET_PATH = ""
    # TODO: Set `DATASET_NAME` to the name of a subset within `DATASET_PATH`.
    # Keep it as `None` if there is no specific subset you need.
    DATASET_NAME = None

    def has_training_docs(self):
        """Report whether the task provides training data."""
        # TODO: Return `True` if the Task has training data; else `False`.
        return False

    def has_validation_docs(self):
        """Report whether the task provides validation data."""
        # TODO: Return `True` if the Task has validation data; else `False`.
        return False

    def has_test_docs(self):
        """Report whether the task provides test data."""
        # TODO: Return `True` if the Task has test data; else `False`.
        return False

    def training_docs(self):
        """Return the training documents, or `None` when the task has none."""
        if not self.has_training_docs():
            return None
        # TODO: Return the training document generator from `self.dataset`.
        # When the documents need processing, `map` the custom processing
        # function `self._process_doc` over them, e.g.
        # `self.dataset["train"].map(self._process_doc)`
        # In most cases this can stay as is, unless the dataset split is named
        # differently than the default `"train"`.
        return self.dataset["train"]

    def validation_docs(self):
        """Return the validation documents, or `None` when the task has none."""
        if not self.has_validation_docs():
            return None
        # TODO: Return the validation document generator from `self.dataset`.
        # When the documents need processing, `map` the custom processing
        # function `self._process_doc` over them, e.g.
        # `self.dataset["validation"].map(self._process_doc)`
        # In most cases this can stay as is, unless the dataset split is named
        # differently than the default `"validation"`.
        return self.dataset["validation"]

    def test_docs(self):
        """Return the test documents, or `None` when the task has none."""
        if not self.has_test_docs():
            return None
        # TODO: Return the test document generator from `self.dataset`.
        # When the documents need processing, `map` the custom processing
        # function `self._process_doc` over them, e.g.
        # `self.dataset["test"].map(self._process_doc)`
        # In most cases this can stay as is, unless the dataset split is named
        # differently than the default `"test"`.
        return self.dataset["test"]

    def max_generation_length(self):
        """Return the maximum generation length, or `None` for the model default.

        Define this method when you want to control the length of few-shot
        generations on specific tokens. `None` gets mapped to a model's default
        max generation token length, e.g. see
        `lm_eval/models/gpt2.py:max_tokens()`.
        """
        # NOTE: You may delete this function if the task does not require generation.
        return None

    def construct_requests(self, doc: dict, ctx: str, args: dict):
        """Build the Requests (via RequestFactory) that will be sent to the LM.

        Args:
            doc (dict):
                A document, as returned from training_docs, validation_docs,
                or test_docs.
            ctx (str):
                The context string generated by fewshot_context: the natural
                language description, the few shot examples, and the question
                part of the document for `doc`.
            args (dict):
                The specifics of the context, including number of few shots.

        Returns:
            An iterable of `Request` objects.
        """
        # TODO: Construct your language model requests with the request
        # factory, `rf`, and return them as an iterable.
        requests = []
        return requests

    def process_results(self, doc, results):
        """Evaluate the LM results for a single document.

        Args:
            doc (dict):
                A document, as returned from training_docs, validation_docs,
                or test_docs.
            results (list):
                The results of the requests created in construct_requests.

        Returns:
            A dict of metric results: keys are the names of sub-metrics and
            values are the values of the metric for that one document.
        """
        # TODO: For each (sub)metric in the task evaluation, add a key-value
        # pair with the metric name as key and the corresponding metric result
        # for the current `doc` as value.
        metric_values = {}
        return metric_values

    def aggregation(self):
        """Map each sub-metric name to its aggregation function.

        Returns:
            A dictionary where keys are the names of sub-metrics and values are
            functions that aggregate a list of metric scores.
            {str: [metric_score] -> float}
        """
        # TODO: For each (sub)metric in the task evaluation, add a key-value
        # pair with the metric name as key and, as value, an aggregation
        # function which determines how to combine results from each document
        # in the dataset.
        # Check `lm_eval.metric` to find built-in aggregation functions.
        aggregators = {}
        return aggregators

    def higher_is_better(self):
        """Map each sub-metric name to whether higher values are better.

        Returns:
            A dictionary where keys are the names of sub-metrics and values are
            `bool`s.
        """
        # TODO: For each (sub)metric in the task evaluation, add a key-value
        # pair with the metric name as key and a `bool` value determining
        # whether or not higher values of that metric are deemed better.
        directions = {}
        return directions