From acbb67fc8044d496062127261ad69def63635c2a Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Wed, 5 Jul 2023 15:28:20 +0000 Subject: [PATCH 01/10] Load LLMs in FP16 for faster inference --- inference.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/inference.py b/inference.py index 4701229..5942dab 100644 --- a/inference.py +++ b/inference.py @@ -53,7 +53,7 @@ def load_model(split_name): isDeduped = split_name.startswith("deduped") model = split_name.split("duped.")[-1] corresponding_model = f"EleutherAI/pythia-{model}{'-deduped' if isDeduped else ''}" - return GPTNeoXForCausalLM.from_pretrained(corresponding_model, device_map="auto") + return GPTNeoXForCausalLM.from_pretrained(corresponding_model, device_map="auto", torch_dtype=torch.float16) def calculate_perplexity(logits: torch.Tensor, labels: torch.Tensor) -> torch.float64: @@ -293,7 +293,6 @@ def parse_cli_args(): "--models", type=str, help=models_arg_help, - choices=models_args_default, default=models_args_default, ) From 85b45e8d00375eb136eae266d8af145281e22fcb Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Sat, 15 Jul 2023 03:24:55 +0000 Subject: [PATCH 02/10] Add error handling for when there is an exception in the pple code --- inference.py | 101 ++++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 49 deletions(-) diff --git a/inference.py b/inference.py index 5942dab..d05fcff 100644 --- a/inference.py +++ b/inference.py @@ -69,46 +69,49 @@ def calculate_perplexity(logits: torch.Tensor, labels: torch.Tensor) -> torch.fl """ # Store the probabilities for each token. These will be summed later, but having the # individual probabilities is helpful for debugging. - token_probs = [] - - # Don't include the final token logits. There are no labels for - # these since the sequence has ended. - num_special_tokens = len(labels[labels == 0]) - num_normal_tokens = len(labels) - num_special_tokens - - for token_index in range(num_normal_tokens - 1): - # Map the logits to probabilities. - predicted_probs = torch.softmax(logits[token_index], dim=0, dtype=torch.float16) - # Get the probability of the correct label. - label_prob = predicted_probs[labels[token_index + 1]] - - # Check if the label probability is 0. This is likely due a rounding error. Recalculate - # the probability using double precision. - if label_prob == 0: - predicted_probs = torch.softmax(logits[token_index], dim=0, dtype=torch.float64) + try: + token_probs = [] + + # Don't include the final token logits. There are no labels for + # these since the sequence has ended. + num_special_tokens = len(labels[labels == 0]) + num_normal_tokens = len(labels) - num_special_tokens + + for token_index in range(num_normal_tokens - 1): + # Map the logits to probabilities. + predicted_probs = torch.softmax(logits[token_index], dim=0, dtype=torch.float16) + # Get the probability of the correct label. label_prob = predicted_probs[labels[token_index + 1]] - # Store the probability for this token. - token_probs.append(label_prob.detach()) + # Check if the label probability is 0. This is likely due a rounding error. Recalculate + # the probability using double precision. 
+ if label_prob == 0: + predicted_probs = torch.softmax(logits[token_index], dim=0, dtype=torch.float64) + label_prob = predicted_probs[labels[token_index + 1]] - mid_index = len(token_probs) // 2 - prompt_ppl = None - log_likelihood = torch.log(torch.stack(token_probs[:mid_index])).sum() - cross_entropy = -log_likelihood / len(token_probs) - prompt_ppl = torch.exp(cross_entropy).item() + # Store the probability for this token. + token_probs.append(label_prob.detach()) - generation_ppl = None - log_likelihood = torch.log(torch.stack(token_probs[mid_index:])).sum() - cross_entropy = -log_likelihood / len(token_probs) - generation_ppl = torch.exp(cross_entropy).item() + mid_index = len(token_probs) // 2 + prompt_ppl = None + log_likelihood = torch.log(torch.stack(token_probs[:mid_index])).sum() + cross_entropy = -log_likelihood / len(token_probs) + prompt_ppl = torch.exp(cross_entropy).item() - sequence_ppl = None - log_likelihood = torch.log(torch.stack(token_probs)).sum() - cross_entropy = -log_likelihood / len(token_probs) - sequence_ppl = torch.exp(cross_entropy).item() + generation_ppl = None + log_likelihood = torch.log(torch.stack(token_probs[mid_index:])).sum() + cross_entropy = -log_likelihood / len(token_probs) + generation_ppl = torch.exp(cross_entropy).item() - # assert perplexity != float("inf"), "Perplexity is infinite. This is probably due to a token that has a probability of 0." - return prompt_ppl, generation_ppl, sequence_ppl + sequence_ppl = None + log_likelihood = torch.log(torch.stack(token_probs)).sum() + cross_entropy = -log_likelihood / len(token_probs) + sequence_ppl = torch.exp(cross_entropy).item() + + return prompt_ppl, generation_ppl, sequence_ppl + except Exception as e: + print(f"Failed to calulcate perplexity: {e}") + return -1, -1, -1 def get_batch_size(model_name: str) -> int: @@ -178,7 +181,7 @@ def run_model_inferences(split_name: str, run_id: str, dataset: str, features: l pile_dataset = PileDataset(pile_sequences, tokenizer) batch_size = get_batch_size(split_name) data_loader = DataLoader(pile_dataset, batch_size=batch_size) - + with torch.no_grad(): desc = f"Collecting {dataset} inference responses for {split_name}" for batch in tqdm(data_loader, desc=desc): @@ -208,11 +211,11 @@ def gini(array): # from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm array = array.flatten() if np.amin(array) < 0: - array -= np.amin(array) - array = np.sort(array) - index = np.arange(1,array.shape[0]+1) - n = array.shape[0] - return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array))) + array -= np.amin(array) + array = np.sort(array) + index = np.arange(1,array.shape[0]+1) + n = array.shape[0] + return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array))) def accumilate_inference_log( @@ -231,7 +234,7 @@ def accumilate_inference_log( perplexities = [calculate_perplexity(logits[i], labels[i]) for i in range(len(logits))] if "ppl" in features else None inference_logs = [] e=1e-8 - + for index, id_tensor in enumerate(batch_sequence_ids): total_entropy = [] total_gini = [] @@ -243,21 +246,21 @@ def accumilate_inference_log( inference_log["generation_perplexity"] = perplexities[index][1] inference_log["sequence_perplexity"] = perplexities[index][2] if "attn" in features: - for layer_index, attention_layer in enumerate(outputs.attentions): - sequence_attention = attention_layer[index].detach() + for layer_index, attention_layer in enumerate(outputs.attentions): + sequence_attention = attention_layer[index].detach() head_e = [] 
gini_head = [] - for head_index, head in enumerate(sequence_attention): - attention_head = head.detach().cpu().numpy() + for head_index, head in enumerate(sequence_attention): + attention_head = head.detach().cpu().numpy() attention_head += e #adding 'e' to attention weights that are 0 to avoid log zero error while calculating entropy. Entropy = - ∑(w * log(w)) gini_coefficient = gini(attention_head) gini_head.append(gini_coefficient) - head_entropy = -np.sum(attention_head * np.log(attention_head)) + head_entropy = -np.sum(attention_head * np.log(attention_head)) head_e.append(head_entropy) inference_log[f"gini_head{head_index+1}_layer{layer_index+1}"] = gini_coefficient inference_log[f"entropy_head{head_index+1}_layer{layer_index+1}"] = head_entropy - + avg_head = np.mean(head_e) avg_head_gini = np.mean(gini_head) total_entropy.append(avg_head) @@ -266,8 +269,8 @@ def accumilate_inference_log( average_entropy = np.mean(total_entropy) average_gini = np.mean(total_gini) inference_log[f"avg entropy"] = average_entropy - inference_log[f"avg gini"] = average_gini - + inference_log[f"avg gini"] = average_gini + inference_logs.append(inference_log) return inference_logs From 5630341be29c86d7f030f135791e34e16e30c340 Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Mon, 7 Aug 2023 22:19:59 +0000 Subject: [PATCH 03/10] Begin making code parralel --- inference.py | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/inference.py b/inference.py index d05fcff..9ce252f 100644 --- a/inference.py +++ b/inference.py @@ -10,6 +10,7 @@ from transformers import AutoTokenizer, GPTNeoXForCausalLM from torch.utils.data import Dataset, DataLoader from datasets import load_dataset, ReadInstruction +from multiprocessing import Process from argparse import ArgumentParser from tqdm import tqdm from datetime import datetime @@ -246,25 +247,10 @@ def accumilate_inference_log( inference_log["generation_perplexity"] = perplexities[index][1] inference_log["sequence_perplexity"] = perplexities[index][2] if "attn" in features: + process_args = [layer_index, attention_layer for layer_index, attention_layer in enumerate(outputs.attentions)] + p = Process(target=get_layer_entropy, args=(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer)) for layer_index, attention_layer in enumerate(outputs.attentions): - sequence_attention = attention_layer[index].detach() - head_e = [] - gini_head = [] - - for head_index, head in enumerate(sequence_attention): - attention_head = head.detach().cpu().numpy() - attention_head += e #adding 'e' to attention weights that are 0 to avoid log zero error while calculating entropy. 
Entropy = - ∑(w * log(w)) - gini_coefficient = gini(attention_head) - gini_head.append(gini_coefficient) - head_entropy = -np.sum(attention_head * np.log(attention_head)) - head_e.append(head_entropy) - inference_log[f"gini_head{head_index+1}_layer{layer_index+1}"] = gini_coefficient - inference_log[f"entropy_head{head_index+1}_layer{layer_index+1}"] = head_entropy - - avg_head = np.mean(head_e) - avg_head_gini = np.mean(gini_head) - total_entropy.append(avg_head) - total_gini.append(avg_head_gini) + get_layer_entropy(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer) average_entropy = np.mean(total_entropy) average_gini = np.mean(total_gini) @@ -275,6 +261,28 @@ def accumilate_inference_log( return inference_logs + +def get_layer_entropy(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer): + sequence_attention = attention_layer[index].detach() + head_e = [] + gini_head = [] + + for head_index, head in enumerate(sequence_attention): + attention_head = head.detach().cpu().numpy() + attention_head += e #adding 'e' to attention weights that are 0 to avoid log zero error while calculating entropy. Entropy = - ∑(w * log(w)) + gini_coefficient = gini(attention_head) + gini_head.append(gini_coefficient) + head_entropy = -np.sum(attention_head * np.log(attention_head)) + head_e.append(head_entropy) + inference_log[f"gini_head{head_index+1}_layer{layer_index+1}"] = gini_coefficient + inference_log[f"entropy_head{head_index+1}_layer{layer_index+1}"] = head_entropy + + avg_head = np.mean(head_e) + avg_head_gini = np.mean(gini_head) + total_entropy.append(avg_head) + total_gini.append(avg_head_gini) + + def save_inference_log(split_name: str, run_id: str, dataset: pd.DataFrame, inference_logs: list): """Saves the accumilated inference log in a pandas dataframe From f4b02c06f8597a7d4fe79bc6d89cc6dad3ac516f Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Wed, 9 Aug 2023 22:27:32 +0000 Subject: [PATCH 04/10] Checkpoint --- inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference.py b/inference.py index 9ce252f..01224f6 100644 --- a/inference.py +++ b/inference.py @@ -247,8 +247,8 @@ def accumilate_inference_log( inference_log["generation_perplexity"] = perplexities[index][1] inference_log["sequence_perplexity"] = perplexities[index][2] if "attn" in features: - process_args = [layer_index, attention_layer for layer_index, attention_layer in enumerate(outputs.attentions)] - p = Process(target=get_layer_entropy, args=(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer)) + # process_args = [layer_index, attention_layer for layer_index, attention_layer in enumerate(outputs.attentions)] + # p = Process(target=get_layer_entropy, args=(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer)) for layer_index, attention_layer in enumerate(outputs.attentions): get_layer_entropy(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer) From 83f2c2a7885e1c822fede0a0f15bbb4e1f3ab79b Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Wed, 9 Aug 2023 22:54:39 +0000 Subject: [PATCH 05/10] Created configureble batch sizes --- inference.py | 8 +- working_dirs/kyle/upload-ppl/upload.ipynb | 108 ++++++++++++++++++++++ 2 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 working_dirs/kyle/upload-ppl/upload.ipynb diff --git a/inference.py b/inference.py index 01224f6..76dbc37 100644 --- a/inference.py +++ b/inference.py @@ -165,7 +165,7 @@ def 
get_dataset(dataset_name: str, split_name: str, sample: int = None) -> pd.Da return dataset if sample is None else dataset.sample(sample).reset_index(drop=True) -def run_model_inferences(split_name: str, run_id: str, dataset: str, features: list, sample_size: int = None): +def run_model_inferences(split_name: str, run_id: str, dataset: str, features: list, batch_size: int, sample_size: int = None): """ Run inference for the given model and dataset. Save the results to a CSV file. @@ -180,7 +180,6 @@ def run_model_inferences(split_name: str, run_id: str, dataset: str, features: l pythia_model = load_model(split_name) pile_sequences = get_dataset(dataset, split_name, sample=sample_size) pile_dataset = PileDataset(pile_sequences, tokenizer) - batch_size = get_batch_size(split_name) data_loader = DataLoader(pile_dataset, batch_size=batch_size) with torch.no_grad(): @@ -345,6 +344,8 @@ def parse_cli_args(): default=None, ) + parser.add_argument("--batch_size", type=int, default=None, help="Batch size for inference") + return parser.parse_args() @@ -369,7 +370,8 @@ def main(): for dataset in args.datasets if isinstance(args.datasets, list) else args.datasets.split(","): split_name = f"{data_scheme}.{model_size}" print(f"Collecting inferences for {split_name} on {dataset} dataset") - run_model_inferences(split_name, experiment_timestamp, dataset, args.features, args.sample_size) + batch_size = args.batch_size if args.batch_size is not None else get_batch_size(model_size) + run_model_inferences(split_name, experiment_timestamp, dataset, args.features, batch_size, args.sample_size) if __name__ == "__main__": diff --git a/working_dirs/kyle/upload-ppl/upload.ipynb b/working_dirs/kyle/upload-ppl/upload.ipynb new file mode 100644 index 0000000..a2c34bf --- /dev/null +++ b/working_dirs/kyle/upload-ppl/upload.ipynb @@ -0,0 +1,108 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "from huggingface_hub import HfApi\n", + "from datasets import load_dataset, Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RepoUrl('https://huggingface.co/datasets/Kyle1668/pythia-semantic-memorization-perplexities', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Kyle1668/pythia-semantic-memorization-perplexities')" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "api = HfApi()\n", + "api.create_repo(\"pythia-semantic-memorization-perplexities\", repo_type=\"dataset\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00 Date: Fri, 18 Aug 2023 15:44:22 +0000 Subject: [PATCH 06/10] Fix batch size issue --- inference.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/inference.py b/inference.py index 76dbc37..1b3945e 100644 --- a/inference.py +++ b/inference.py @@ -137,8 +137,7 @@ def get_batch_size(model_name: str) -> int: "6.9b": 64, "12b": 64, } - model_size = ".".join(model_name.split(".")[1:]) - return size_batch_map[model_size] + return size_batch_map[model_name] def get_dataset(dataset_name: str, split_name: str, sample: int = None) -> pd.DataFrame: From a22fa4c6d263ac06e7dd4efd00b3e4c72437f022 Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Thu, 14 Sep 2023 02:15:44 
+0000 Subject: [PATCH 07/10] Updated files to handle new schema --- inference.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/inference.py b/inference.py index 1b3945e..08f47f2 100644 --- a/inference.py +++ b/inference.py @@ -31,12 +31,12 @@ def __init__(self, memories, tokenizer): self.memories = memories def __getitem__(self, index): - tokens = self.memories.iloc[index]["tokens"][:64] + tokens = self.memories.iloc[index]["Tokens"][:64] decoded_text = self.tokenizer.decode(tokens) - return self.memories.iloc[index]["index"], decoded_text + return self.memories.iloc[index]["Index"], decoded_text def __len__(self): - return len(self.memories["index"]) + return len(self.memories["Index"]) def load_tokenizer(split_name: str) -> AutoTokenizer: @@ -157,7 +157,7 @@ def get_dataset(dataset_name: str, split_name: str, sample: int = None) -> pd.Da if dataset_name.split("-")[0] == "pile": scheme = split_name.split(".")[0] pile_path = f"EleutherAI/pile-{scheme}-pythia-random-sampled" - dataset = load_dataset(pile_path, split="train").to_pandas()[["index", "tokens"]] + dataset = load_dataset(pile_path, split="train").to_pandas()[["Index", "Tokens"]] else: dataset = load_dataset("EleutherAI/pythia-memorized-evals")[split_name].to_pandas() @@ -174,7 +174,6 @@ def run_model_inferences(split_name: str, run_id: str, dataset: str, features: l dataset (str): The dataset to run inference on sample_size (int, optional): The maximum number of random samples run inference on. Defaults to None. """ - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") tokenizer = load_tokenizer(split_name) pythia_model = load_model(split_name) pile_sequences = get_dataset(dataset, split_name, sample=sample_size) @@ -192,7 +191,7 @@ def run_model_inferences(split_name: str, run_id: str, dataset: str, features: l truncation=True, padding=True, ) - tokenized_batch.to(device) + tokenized_batch.to(pythia_model.device) labels = tokenized_batch["input_ids"] outputs = pythia_model( From f56cf7acef8a053373b0588b370102cecd27378e Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Fri, 15 Sep 2023 02:22:26 +0000 Subject: [PATCH 08/10] Introduce parallel --- inference.py | 133 +++++++++++++++++++++++++++++---------------------- 1 file changed, 76 insertions(+), 57 deletions(-) diff --git a/inference.py b/inference.py index 08f47f2..a1277b8 100644 --- a/inference.py +++ b/inference.py @@ -16,6 +16,7 @@ from datetime import datetime import pandas as pd import numpy as np +import multiprocessing import torch import os @@ -28,7 +29,7 @@ class PileDataset(Dataset): def __init__(self, memories, tokenizer): self.tokenizer = tokenizer - self.memories = memories + self.memories = memories.rename(columns={"index": "Index", "tokens": "Tokens"}) if "index" in memories.columns else memories def __getitem__(self, index): tokens = self.memories.iloc[index]["Tokens"][:64] @@ -54,7 +55,7 @@ def load_model(split_name): isDeduped = split_name.startswith("deduped") model = split_name.split("duped.")[-1] corresponding_model = f"EleutherAI/pythia-{model}{'-deduped' if isDeduped else ''}" - return GPTNeoXForCausalLM.from_pretrained(corresponding_model, device_map="auto", torch_dtype=torch.float16) + return GPTNeoXForCausalLM.from_pretrained(corresponding_model, device_map="auto") def calculate_perplexity(logits: torch.Tensor, labels: torch.Tensor) -> torch.float64: @@ -80,15 +81,16 @@ def calculate_perplexity(logits: torch.Tensor, labels: torch.Tensor) -> torch.fl for token_index in range(num_normal_tokens - 1): # Map 
the logits to probabilities. - predicted_probs = torch.softmax(logits[token_index], dim=0, dtype=torch.float16) + # predicted_probs = torch.softmax(logits[token_index].view(torch.float64), dim=0, dtype=torch.float16) + predicted_probs = torch.softmax(logits[token_index], dim=0, dtype=torch.float64) # Get the probability of the correct label. label_prob = predicted_probs[labels[token_index + 1]] # Check if the label probability is 0. This is likely due a rounding error. Recalculate # the probability using double precision. - if label_prob == 0: - predicted_probs = torch.softmax(logits[token_index], dim=0, dtype=torch.float64) - label_prob = predicted_probs[labels[token_index + 1]] + # if label_prob == 0: + # predicted_probs = torch.softmax(logits[token_index], dim=0, dtype=torch.float64) + # label_prob = predicted_probs[labels[token_index + 1]] # Store the probability for this token. token_probs.append(label_prob.detach()) @@ -129,13 +131,13 @@ def get_batch_size(model_name: str) -> int: """ size_batch_map = { "70m": 512, - "160m": 512, - "410m": 512, - "1b": 256, - "1.4b": 256, - "2.8b": 128, + "160m": 256, + "410m": 256, + "1b": 128, + "1.4b": 128, + "2.8b": 64, "6.9b": 64, - "12b": 64, + "12b": 16, } return size_batch_map[model_name] @@ -180,27 +182,36 @@ def run_model_inferences(split_name: str, run_id: str, dataset: str, features: l pile_dataset = PileDataset(pile_sequences, tokenizer) data_loader = DataLoader(pile_dataset, batch_size=batch_size) - with torch.no_grad(): - desc = f"Collecting {dataset} inference responses for {split_name}" - for batch in tqdm(data_loader, desc=desc): - batch_sequences = batch[1] - tokenized_batch = tokenizer( - batch_sequences, - return_tensors="pt", - max_length=512, - truncation=True, - padding=True, - ) - tokenized_batch.to(pythia_model.device) - labels = tokenized_batch["input_ids"] - - outputs = pythia_model( - **tokenized_batch, - labels=tokenized_batch["input_ids"], - output_attentions=True, - ) - inference_logs = accumilate_inference_log(batch[0], labels, outputs, features) - save_inference_log(split_name, run_id, dataset, inference_logs) + num_processes = multiprocessing.cpu_count() + # num_processes = 1 + with multiprocessing.Pool(num_processes) as pool: + with torch.no_grad(): + desc = f"Collecting {dataset} inference responses for {split_name}" + for batch in tqdm(data_loader, desc=desc): + batch_sequences = batch[1] + tokenized_batch = tokenizer( + batch_sequences, + return_tensors="pt", + max_length=256, + truncation=True, + padding=True, + ) + tokenized_batch.to(pythia_model.device) + labels = tokenized_batch["input_ids"] + + outputs = pythia_model( + **tokenized_batch, + labels=tokenized_batch["input_ids"], + output_attentions=True, + ) + logits = outputs.logits.detach().cpu() + labels = labels.detach().cpu() + loss = outputs.loss.detach().cpu() + attentions = [attn_tensor.detach().cpu() for attn_tensor in outputs.attentions] + + inference_logs = accumilate_inference_log(batch[0], labels, logits, loss, attentions, features, pool) + save_inference_log(split_name, run_id, dataset, inference_logs) + torch.cuda.empty_cache() def gini(array): @@ -217,7 +228,7 @@ def gini(array): def accumilate_inference_log( - batch_sequence_ids: list, labels: torch.Tensor, outputs: CausalLMOutputWithPast, features: list + batch_sequence_ids: list, labels: torch.Tensor, logits: torch.Tensor, loss: torch.Tensor, attentions: list[torch.Tensor], features: list, pool: multiprocessing.Pool ): """ Extract the desired data from the model response and save it to a 
CSV file. @@ -228,36 +239,43 @@ def accumilate_inference_log( outputs (CausalLMOutputWithPast): The response from the Pythia model features (list): The list of features to calculate. A subset of [loss, ppl, attn] """ - logits = outputs.logits.detach() - perplexities = [calculate_perplexity(logits[i], labels[i]) for i in range(len(logits))] if "ppl" in features else None inference_logs = [] + perplexities = [calculate_perplexity(logits[i], labels[i]) for i in range(len(logits))] if "ppl" in features else None + # perplexities = pool.starmap(calculate_perplexity, zip(logits, labels)) e=1e-8 + method_args = [] for index, id_tensor in enumerate(batch_sequence_ids): - total_entropy = [] - total_gini = [] - inference_log = {"index": id_tensor.detach().item()} - if "loss" in features: - inference_log["loss"] = outputs.loss.detach().item() / len(labels[index]) - if "ppl" in features: - inference_log["prompt_perplexity"] = perplexities[index][0] - inference_log["generation_perplexity"] = perplexities[index][1] - inference_log["sequence_perplexity"] = perplexities[index][2] - if "attn" in features: - # process_args = [layer_index, attention_layer for layer_index, attention_layer in enumerate(outputs.attentions)] - # p = Process(target=get_layer_entropy, args=(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer)) - for layer_index, attention_layer in enumerate(outputs.attentions): - get_layer_entropy(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer) - - average_entropy = np.mean(total_entropy) - average_gini = np.mean(total_gini) - inference_log[f"avg entropy"] = average_entropy - inference_log[f"avg gini"] = average_gini - - inference_logs.append(inference_log) + method_args.append((labels, loss, attentions, features, perplexities, e, index, id_tensor)) + # inference_log = get_inference_log(labels, outputs, features, perplexities, e, index, id_tensor) + # inference_logs.append(inference_log) + inference_logs = pool.starmap(get_inference_log, method_args) + torch.cuda.empty_cache() + del method_args return inference_logs +def get_inference_log(labels, loss, attentions, features, perplexities, e, index, id_tensor): + total_entropy = [] + total_gini = [] + inference_log = {"index": id_tensor.detach().item()} + if "loss" in features: + inference_log["loss"] = loss.detach().item() / len(labels[index]) + if "attn" in features: + for layer_index, attention_layer in enumerate(attentions): + get_layer_entropy(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer) + + average_entropy = np.mean(total_entropy) + average_gini = np.mean(total_gini) + inference_log[f"avg entropy"] = average_entropy + inference_log[f"avg gini"] = average_gini + if "ppl" in features: + inference_log["prompt_perplexity"] = perplexities[index][0] + inference_log["generation_perplexity"] = perplexities[index][1] + inference_log["sequence_perplexity"] = perplexities[index][2] + + return inference_log + def get_layer_entropy(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer): sequence_attention = attention_layer[index].detach() @@ -373,4 +391,5 @@ def main(): if __name__ == "__main__": + multiprocessing.set_start_method("spawn") main() From feda73f69c34f6d355bb93b9a86d753a0b398f9f Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Fri, 15 Sep 2023 19:03:49 +0000 Subject: [PATCH 09/10] Begin focusing just on ppl --- inference.py | 4 +- inference_outputs.py | 257 +++++++++++++++++++++++++++++ inference_sync.py | 384 
+++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 643 insertions(+), 2 deletions(-) create mode 100644 inference_outputs.py create mode 100644 inference_sync.py diff --git a/inference.py b/inference.py index a1277b8..dcbf138 100644 --- a/inference.py +++ b/inference.py @@ -182,8 +182,8 @@ def run_model_inferences(split_name: str, run_id: str, dataset: str, features: l pile_dataset = PileDataset(pile_sequences, tokenizer) data_loader = DataLoader(pile_dataset, batch_size=batch_size) - num_processes = multiprocessing.cpu_count() - # num_processes = 1 + num_processes = multiprocessing.cpu_count() // 2 + # num_processes = 6 with multiprocessing.Pool(num_processes) as pool: with torch.no_grad(): desc = f"Collecting {dataset} inference responses for {split_name}" diff --git a/inference_outputs.py b/inference_outputs.py new file mode 100644 index 0000000..5fa5407 --- /dev/null +++ b/inference_outputs.py @@ -0,0 +1,257 @@ +""" +This file contains the code for running batch inference on the Pythia models. We can save results from +the inferences to a CSV file for later analysis. This is useful for calculating perplexity, entropy, +and other metrics. + +Example Usage: python inference.py --models=410m,1b,12b --schemes=duped --datasets=memories,pile --sample-size=100000 +""" + +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers import AutoTokenizer, GPTNeoXForCausalLM +from torch.utils.data import Dataset, DataLoader +from datasets import load_dataset, ReadInstruction +from argparse import ArgumentParser +from tqdm import tqdm +from datetime import datetime +import pandas as pd +import numpy as np +import multiprocessing +import torch +import os + + +class PileDataset(Dataset): + """ + The wrapped around the Pile-derived pandas dataframe. This allows us to use the + PyTorch DataLoader to load the data in batches. + """ + + def __init__(self, memories, tokenizer): + self.tokenizer = tokenizer + self.memories = memories.rename(columns={"index": "Index", "tokens": "Tokens"}) if "index" in memories.columns else memories + + def __getitem__(self, index): + tokens = self.memories.iloc[index]["Tokens"][:64] + decoded_text = self.tokenizer.decode(tokens) + return self.memories.iloc[index]["Index"], decoded_text + + def __len__(self): + return len(self.memories["Index"]) + + +def load_tokenizer(split_name: str) -> AutoTokenizer: + """Get the HuggingFace tokenizer for the current model""" + isDeduped = split_name.startswith("deduped") + model = split_name.split("duped.")[-1] + corresponding_model = f"EleutherAI/pythia-{model}{'-deduped' if isDeduped else ''}" + tokenizer = AutoTokenizer.from_pretrained(corresponding_model) + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + +def load_model(split_name): + """Get the HuggingFace model for the current model""" + isDeduped = split_name.startswith("deduped") + model = split_name.split("duped.")[-1] + corresponding_model = f"EleutherAI/pythia-{model}{'-deduped' if isDeduped else ''}" + return GPTNeoXForCausalLM.from_pretrained(corresponding_model, device_map="auto") + + +def get_batch_size(model_name: str) -> int: + """ + Get the optimal batch size for the current model. This is based on the model's size + where the batch size is the largest that can fit on our GPUs. Yiu may need to adjust + this if you have a different GPU. 
+ + Args: + model_name (str): The model name + + Returns: + int: The batch size to use for inference + """ + size_batch_map = { + "70m": 512, + "160m": 256, + "410m": 256, + "1b": 128, + "1.4b": 128, + "2.8b": 64, + "6.9b": 64, + "12b": 16, + } + return size_batch_map[model_name] + + +def get_dataset(dataset_name: str, split_name: str, sample: int = None) -> pd.DataFrame: + """ + Read the given dataframe from HuggingFace, process, and return the sequences as + a pandas dataframe. + + Args: + dataset_name (str): The dataset path + split_name (str): The split within the dataset we're interested in + sample (int, optional): The number of samples to take from the dataset. Defaults to None. + + Returns: + pd.DataFrame: The pandas dataframe storing the dataset + """ + dataset = None + if dataset_name.split("-")[0] == "pile": + scheme = split_name.split(".")[0] + pile_path = f"EleutherAI/pile-{scheme}-pythia-random-sampled" + dataset = load_dataset(pile_path, split="train").to_pandas()[["Index", "Tokens"]] + else: + dataset = load_dataset("EleutherAI/pythia-memorized-evals")[split_name].to_pandas() + + return dataset if sample is None else dataset.sample(sample).reset_index(drop=True) + + +def run_model_inferences(split_name: str, run_id: str, dataset: str, batch_size: int, sample_size: int = None): + """ + Run inference for the given model and dataset. Save the results to a CSV file. + + Args: + split_name (str): The model+scheme used to determine the tokenizer and model + run_id (str): The timestamp for this run + dataset (str): The dataset to run inference on + sample_size (int, optional): The maximum number of random samples run inference on. Defaults to None. + """ + tokenizer = load_tokenizer(split_name) + pythia_model = load_model(split_name) + pile_sequences = get_dataset(dataset, split_name, sample=sample_size) + pile_dataset = PileDataset(pile_sequences, tokenizer) + data_loader = DataLoader(pile_dataset, batch_size=batch_size) + + with torch.multiprocessing.Pool(processes=4) as p: + + with torch.no_grad(): + desc = f"Collecting {dataset} inference responses for {split_name}" + for batch in tqdm(data_loader, desc=desc): + batch_sequences = batch[1] + tokenized_batch = tokenizer( + batch_sequences, + return_tensors="pt", + max_length=256, + truncation=True, + padding=True, + ) + tokenized_batch.to(pythia_model.device) + labels = tokenized_batch["input_ids"] + + outputs = pythia_model( + **tokenized_batch, + labels=tokenized_batch["input_ids"], + output_attentions=True, + ) + + + + results = p.map(parse_attn, [t.detach().cpu() for t in outputs.attentions]) + print(results) + + # inference_logs = pd.DataFrame({ + # "Loss": outputs.loss.detach().cpu().tolist(), + # "Logits": outputs.logits.detach().cpu().tolist(), + # "Attentions": [attn_tensor.detach().cpu().tolist() for attn_tensor in outputs.attentions], + # }) + # save_inference_log(split_name, run_id, dataset, inference_logs) + # torch.cuda.empty_cache() + + +def save_inference_log(split_name: str, run_id: str, dataset: str, inference_logs_df: pd.DataFrame): + """Saves the accumilated inference log in a pandas dataframe + + Args: + split_name (str): The model+scheme used to determine the tokenizer and model + run_id (str): The timestamp for this run + dataset (str): The dataset to run inference on + inference_logs (list): Accumilated inference logs + """ + file_name = split_name.replace(".", "_") + inference_logs_df.to_csv(f"datasets/{run_id}/{dataset}_{file_name}.csv", index=False, mode="a") + + +def parse_attn(attn_t): + return 
attn_t.tolist() + +def parse_cli_args(): + parser = ArgumentParser() + models_arg_help = "The Pythia model to get the perplexities for. Valid options are: 70m, 160m, 410m, 1b, 1.4b, 2.8b, 6.9b, 12b" + models_args_default = ["70m", "160m", "410m", "1b", "1.4b", "2.8b", "6.9b", "12b"] + parser.add_argument( + "--models", + type=str, + help=models_arg_help, + default=models_args_default, + ) + + schemes_args_help = "The data scheme to get the perplexities for. Valid options are: deduped, duped" + schemes_args_default = ["deduped", "duped"] + parser.add_argument( + "--schemes", + type=str, + help=schemes_args_help, + choices=schemes_args_default, + default=schemes_args_default, + ) + + dataset_arg_help = "The dataset in which to get inference responses for. Valid options are: memories, pile." + datasets_args_default = ["pile", "memories"] + parser.add_argument( + "--datasets", + type=str, + help=dataset_arg_help, + choices=datasets_args_default, + default=datasets_args_default, + ) + + features_arg_help = "The features to extract from the model response. Valid options are: attn, loss, perplexity" + features_arg_default = ["attn", "loss", "ppl"] + parser.add_argument( + "--features", + type=str, + help=features_arg_help, + choices=features_arg_default, + default=features_arg_default, + ) + + sample_size_arg_help = "The number of samples to take from the dataset. Defaults to None." + parser.add_argument( + "--sample_size", + type=int, + help=sample_size_arg_help, + default=None, + ) + + parser.add_argument("--batch_size", type=int, default=None, help="Batch size for inference") + + return parser.parse_args() + + +def main(): + args = parse_cli_args() + experiment_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + os.makedirs(f"./datasets/{experiment_timestamp}", exist_ok=True) + + print("---------------------------------------------------------------------------") + print("Starting inference run with the following parameters:") + print(f"Timestamp: {experiment_timestamp}") + print(f"Models: {args.models}") + print(f"Schemes: {args.schemes}") + print(f"Datasets: {args.datasets}") + if args.sample_size is not None: + print(f"Sample size: {args.sample_size}") + print("---------------------------------------------------------------------------") + + for model_size in args.models if isinstance(args.models, list) else args.models.split(","): + for data_scheme in args.schemes if isinstance(args.schemes, list) else args.schemes.split(","): + for dataset in args.datasets if isinstance(args.datasets, list) else args.datasets.split(","): + split_name = f"{data_scheme}.{model_size}" + print(f"Collecting inferences for {split_name} on {dataset} dataset") + batch_size = args.batch_size if args.batch_size is not None else get_batch_size(model_size) + run_model_inferences(split_name, experiment_timestamp, dataset, batch_size, args.sample_size) + + +if __name__ == "__main__": + multiprocessing.set_start_method("spawn") + main() diff --git a/inference_sync.py b/inference_sync.py new file mode 100644 index 0000000..7c55d00 --- /dev/null +++ b/inference_sync.py @@ -0,0 +1,384 @@ +""" +This file contains the code for running batch inference on the Pythia models. We can save results from +the inferences to a CSV file for later analysis. This is useful for calculating perplexity, entropy, +and other metrics. 
+ +Example Usage: python inference.py --models=410m,1b,12b --schemes=duped --datasets=memories,pile --sample-size=100000 +""" + +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers import AutoTokenizer, GPTNeoXForCausalLM +from torch.utils.data import Dataset, DataLoader +from datasets import load_dataset, ReadInstruction +from multiprocessing import Process +from argparse import ArgumentParser +from tqdm import tqdm +from datetime import datetime +import pandas as pd +import numpy as np +import torch +import os + + +class PileDataset(Dataset): + """ + The wrapped around the Pile-derived pandas dataframe. This allows us to use the + PyTorch DataLoader to load the data in batches. + """ + + def __init__(self, memories, tokenizer): + self.tokenizer = tokenizer + self.memories = memories.rename(columns={"index": "Index", "tokens": "Tokens"}) if "index" in memories.columns else memories + + def __getitem__(self, index): + tokens = self.memories.iloc[index]["Tokens"][:64] + decoded_text = self.tokenizer.decode(tokens) + return self.memories.iloc[index]["Index"], decoded_text + + def __len__(self): + return len(self.memories["Index"]) + + +def load_tokenizer(split_name: str) -> AutoTokenizer: + """Get the HuggingFace tokenizer for the current model""" + isDeduped = split_name.startswith("deduped") + model = split_name.split("duped.")[-1] + corresponding_model = f"EleutherAI/pythia-{model}{'-deduped' if isDeduped else ''}" + tokenizer = AutoTokenizer.from_pretrained(corresponding_model) + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + +def load_model(split_name): + """Get the HuggingFace model for the current model""" + isDeduped = split_name.startswith("deduped") + model = split_name.split("duped.")[-1] + corresponding_model = f"EleutherAI/pythia-{model}{'-deduped' if isDeduped else ''}" + return GPTNeoXForCausalLM.from_pretrained(corresponding_model, device_map="auto", torch_dtype=torch.float16) + + +def calculate_perplexity(logits: torch.Tensor, labels: torch.Tensor) -> torch.float64: + """ + Clauclate the perplexity of a sequence given the logits and labels + + Args: + logits (torch.Tensor): The logits for the model's generation + labels (torch.Tensor): The true tokens for the sequence + + Returns: + torch.float64: The model's perplexity for the given sequence + """ + # Store the probabilities for each token. These will be summed later, but having the + # individual probabilities is helpful for debugging. + try: + token_probs = [] + + # Don't include the final token logits. There are no labels for + # these since the sequence has ended. + num_special_tokens = len(labels[labels == 0]) + num_normal_tokens = len(labels) - num_special_tokens + + for token_index in range(num_normal_tokens - 1): + # Map the logits to probabilities. + predicted_probs = torch.softmax(logits[token_index], dim=0, dtype=torch.float16) + # Get the probability of the correct label. + label_prob = predicted_probs[labels[token_index + 1]] + + # Check if the label probability is 0. This is likely due a rounding error. Recalculate + # the probability using double precision. + if label_prob == 0: + predicted_probs = torch.softmax(logits[token_index], dim=0, dtype=torch.float64) + label_prob = predicted_probs[labels[token_index + 1]] + + # Store the probability for this token. 
+ token_probs.append(label_prob.detach()) + + mid_index = len(token_probs) // 2 + prompt_ppl = None + log_likelihood = torch.log(torch.stack(token_probs[:mid_index])).sum() + cross_entropy = -log_likelihood / len(token_probs) + prompt_ppl = torch.exp(cross_entropy).item() + + generation_ppl = None + log_likelihood = torch.log(torch.stack(token_probs[mid_index:])).sum() + cross_entropy = -log_likelihood / len(token_probs) + generation_ppl = torch.exp(cross_entropy).item() + + sequence_ppl = None + log_likelihood = torch.log(torch.stack(token_probs)).sum() + cross_entropy = -log_likelihood / len(token_probs) + sequence_ppl = torch.exp(cross_entropy).item() + + return prompt_ppl, generation_ppl, sequence_ppl + except Exception as e: + print(f"Failed to calulcate perplexity: {e}") + return -1, -1, -1 + + +def get_batch_size(model_name: str) -> int: + """ + Get the optimal batch size for the current model. This is based on the model's size + where the batch size is the largest that can fit on our GPUs. Yiu may need to adjust + this if you have a different GPU. + + Args: + model_name (str): The model name + + Returns: + int: The batch size to use for inference + """ + size_batch_map = { + # Small + "70m": 512, + "160m": 512, + "410m": 512, + # Medium + "1b": 256, + "1.4b": 256, + "2.8b": 128, + # Large + "6.9b": 64, + "12b": 64, + } + model_size = ".".join(model_name.split(".")[1:]) + return size_batch_map[model_size] + + +def get_dataset(dataset_name: str, split_name: str, sample: int = None) -> pd.DataFrame: + """ + Read the given dataframe from HuggingFace, process, and return the sequences as + a pandas dataframe. + + Args: + dataset_name (str): The dataset path + split_name (str): The split within the dataset we're interested in + sample (int, optional): The number of samples to take from the dataset. Defaults to None. + + Returns: + pd.DataFrame: The pandas dataframe storing the dataset + """ + dataset = None + if dataset_name.split("-")[0] == "pile": + scheme = split_name.split(".")[0] + pile_path = f"EleutherAI/pile-{scheme}-pythia-random-sampled" + dataset = load_dataset(pile_path, split="train").to_pandas()[["Index", "Tokens"]] + else: + dataset = load_dataset("EleutherAI/pythia-memorized-evals")[split_name].to_pandas() + + return dataset if sample is None else dataset.sample(sample).reset_index(drop=True) + + +def run_model_inferences(split_name: str, run_id: str, dataset: str, features: list, sample_size: int = None): + """ + Run inference for the given model and dataset. Save the results to a CSV file. + + Args: + split_name (str): The model+scheme used to determine the tokenizer and model + run_id (str): The timestamp for this run + dataset (str): The dataset to run inference on + sample_size (int, optional): The maximum number of random samples run inference on. Defaults to None. 
+ """ + tokenizer = load_tokenizer(split_name) + pythia_model = load_model(split_name) + pile_sequences = get_dataset(dataset, split_name, sample=sample_size) + pile_dataset = PileDataset(pile_sequences, tokenizer) + batch_size = get_batch_size(split_name) + data_loader = DataLoader(pile_dataset, batch_size=batch_size) + + with torch.no_grad(): + desc = f"Collecting {dataset} inference responses for {split_name}" + for batch in tqdm(data_loader, desc=desc): + batch_sequences = batch[1] + tokenized_batch = tokenizer( + batch_sequences, + return_tensors="pt", + max_length=512, + truncation=True, + padding=True, + ) + tokenized_batch.to(pythia_model.device) + labels = tokenized_batch["input_ids"] + + outputs = pythia_model( + **tokenized_batch, + labels=tokenized_batch["input_ids"], + output_attentions=True, + ) + inference_logs = accumilate_inference_log(batch[0], labels, outputs, features) + save_inference_log(split_name, run_id, dataset, inference_logs) + + +def gini(array): + """Calculate the Gini coefficient of a numpy array. Ref: https://github.com/oliviaguest/gini""" + # based on bottom eq: http://www.statsdirect.com/help/content/image/stat0206_wmf.gif + # from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm + array = array.flatten() + if np.amin(array) < 0: + array -= np.amin(array) + array = np.sort(array) + index = np.arange(1,array.shape[0]+1) + n = array.shape[0] + return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array))) + + +def accumilate_inference_log( + batch_sequence_ids: list, labels: torch.Tensor, outputs: CausalLMOutputWithPast, features: list +): + """ + Extract the desired data from the model response and save it to a CSV file. + + Args: + batch_sequence_ids (list): The list containing the sequence ids + labels (torch.Tensor): The labels for the batch. Used to calculate perplexity + outputs (CausalLMOutputWithPast): The response from the Pythia model + features (list): The list of features to calculate. 
A subset of [loss, ppl, attn] + """ + logits = outputs.logits.detach() + perplexities = [calculate_perplexity(logits[i], labels[i]) for i in range(len(logits))] if "ppl" in features else None + inference_logs = [] + e=1e-8 + + for index, id_tensor in enumerate(batch_sequence_ids): + total_entropy = [] + total_gini = [] + inference_log = {"Index": id_tensor.detach().item()} + if "loss" in features: + inference_log["loss"] = outputs.loss.detach().item() / len(labels[index]) + if "ppl" in features: + inference_log["prompt_perplexity"] = perplexities[index][0] + inference_log["generation_perplexity"] = perplexities[index][1] + inference_log["sequence_perplexity"] = perplexities[index][2] + if "attn" in features: + # process_args = [layer_index, attention_layer for layer_index, attention_layer in enumerate(outputs.attentions)] + # p = Process(target=get_layer_entropy, args=(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer)) + for layer_index, attention_layer in enumerate(outputs.attentions): + get_layer_entropy(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer) + + average_entropy = np.mean(total_entropy) + average_gini = np.mean(total_gini) + inference_log[f"avg entropy"] = average_entropy + inference_log[f"avg gini"] = average_gini + + inference_logs.append(inference_log) + + return inference_logs + + +def get_layer_entropy(e, index, total_entropy, total_gini, inference_log, layer_index, attention_layer): + sequence_attention = attention_layer[index].detach() + head_e = [] + gini_head = [] + + for head_index, head in enumerate(sequence_attention): + attention_head = head.detach().cpu().numpy() + attention_head += e #adding 'e' to attention weights that are 0 to avoid log zero error while calculating entropy. Entropy = - ∑(w * log(w)) + gini_coefficient = gini(attention_head) + gini_head.append(gini_coefficient) + head_entropy = -np.sum(attention_head * np.log(attention_head)) + head_e.append(head_entropy) + inference_log[f"gini_head{head_index+1}_layer{layer_index+1}"] = gini_coefficient + inference_log[f"entropy_head{head_index+1}_layer{layer_index+1}"] = head_entropy + + avg_head = np.mean(head_e) + avg_head_gini = np.mean(gini_head) + total_entropy.append(avg_head) + total_gini.append(avg_head_gini) + + +def save_inference_log(split_name: str, run_id: str, dataset: pd.DataFrame, inference_logs: list): + """Saves the accumilated inference log in a pandas dataframe + + Args: + split_name (str): The model+scheme used to determine the tokenizer and model + run_id (str): The timestamp for this run + dataset (str): The dataset to run inference on + inference_logs (list): Accumilated inference logs + """ + file_name = split_name.replace(".", "_") + inference_logs_df = pd.DataFrame(inference_logs) + inference_logs_df.to_csv(f"datasets/{run_id}/{dataset}_{file_name}.csv", index=False, mode="a") + +def parse_cli_args(): + parser = ArgumentParser() + models_arg_help = "The Pythia model to get the perplexities for. Valid options are: 70m, 160m, 410m, 1b, 1.4b, 2.8b, 6.9b, 12b" + models_args_default = ["70m", "160m", "410m", "1b", "1.4b", "2.8b", "6.9b", "12b"] + parser.add_argument( + "--models", + type=str, + help=models_arg_help, + default=models_args_default, + ) + + schemes_args_help = "The data scheme to get the perplexities for. 
Valid options are: deduped, duped" + schemes_args_default = ["deduped", "duped"] + parser.add_argument( + "--schemes", + type=str, + help=schemes_args_help, + choices=schemes_args_default, + default=schemes_args_default, + ) + + dataset_arg_help = "The dataset in which to get inference responses for. Valid options are: memories, pile." + datasets_args_default = ["pile", "memories"] + parser.add_argument( + "--datasets", + type=str, + help=dataset_arg_help, + choices=datasets_args_default, + default=datasets_args_default, + ) + + features_arg_help = "The features to extract from the model response. Valid options are: attn, loss, perplexity" + features_arg_default = ["attn", "loss", "ppl"] + parser.add_argument( + "--features", + type=str, + help=features_arg_help, + choices=features_arg_default, + default=features_arg_default, + ) + + sample_size_arg_help = "The number of samples to take from the dataset. Defaults to None." + parser.add_argument( + "--sample_size", + type=int, + help=sample_size_arg_help, + default=None, + ) + + parsed_args = parser.parse_args() + for arg_name in parsed_args.__dict__: + arg_value = parsed_args.__dict__[arg_name] + if isinstance(arg_value, str): + parsed_args.__dict__[arg_name] = arg_value.split(",") + + return parsed_args + + +def main(): + args = parse_cli_args() + experiment_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + os.makedirs(f"./datasets/{experiment_timestamp}", exist_ok=True) + + print("---------------------------------------------------------------------------") + print("Starting inference run with the following parameters:") + print(f"Timestamp: {experiment_timestamp}") + print(f"Models: {args.models}") + print(f"Schemes: {args.schemes}") + print(f"Datasets: {args.datasets}") + print(f"Features: {args.features}") + if args.sample_size is not None: + print(f"Sample size: {args.sample_size}") + print("---------------------------------------------------------------------------") + + for model_size in args.models if isinstance(args.models, list) else args.models.split(","): + for data_scheme in args.schemes if isinstance(args.schemes, list) else args.schemes.split(","): + for dataset in args.datasets if isinstance(args.datasets, list) else args.datasets.split(","): + split_name = f"{data_scheme}.{model_size}" + print(f"Collecting inferences for {split_name} on {dataset} dataset") + run_model_inferences(split_name, experiment_timestamp, dataset, args.features, args.sample_size) + + +if __name__ == "__main__": + main() \ No newline at end of file From f6da96b9d71bd63bb59b9fbc9d9d8546e33bd535 Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Fri, 15 Sep 2023 20:38:06 +0000 Subject: [PATCH 10/10] Have multiple options --- .gitignore | 2 +- inference_outputs.py | 12 ++++++++++-- inference_sync.py | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 86026f4..ca64718 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ !datasets/eval/Pythia_70m_Deduped_Low_Perplexity_Labeling_Formatted.csv - +*.pt *.zip .vscode ### Data ### diff --git a/inference_outputs.py b/inference_outputs.py index 5fa5407..4d9ac69 100644 --- a/inference_outputs.py +++ b/inference_outputs.py @@ -146,8 +146,16 @@ def run_model_inferences(split_name: str, run_id: str, dataset: str, batch_size: - results = p.map(parse_attn, [t.detach().cpu() for t in outputs.attentions]) - print(results) + # results = p.map(parse_attn, [t.detach().cpu() for t in outputs.attentions]) + # print(results) + + # attentions_table = {} + for i in 
tqdm(range(len(batch[0]))): + current_example_id = batch[0][i] + current_example_attentions = torch.stack(outputs.attentions)[:, i, :] + # attentions_table[current_example_id] = current_example_attentions + torch.save(current_example_attentions, f"datasets/{run_id}/{dataset}_attentions_{current_example_id}.pt") + # print(current_example_attentions.shape) # inference_logs = pd.DataFrame({ # "Loss": outputs.loss.detach().cpu().tolist(), diff --git a/inference_sync.py b/inference_sync.py index 7c55d00..7ecd9a0 100644 --- a/inference_sync.py +++ b/inference_sync.py @@ -138,7 +138,7 @@ def get_batch_size(model_name: str) -> int: "2.8b": 128, # Large "6.9b": 64, - "12b": 64, + "12b": 32, } model_size = ".".join(model_name.split(".")[1:]) return size_batch_map[model_size]
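
For reference, the perplexity logic these patches keep reworking splits each sequence's token probabilities in half and reports prompt, generation, and whole-sequence perplexity. The following is a minimal, illustrative sketch of that split, not code from the repository: it assumes, as the patches do, that label id 0 marks padding and that the prompt is the first half of the tokens; it works in log space via log_softmax so the float16 underflow that the patches guard against with a float64 fallback and a try/except does not arise; and it normalizes each half by its own length, whereas calculate_perplexity in the patches divides every partial log-likelihood by the full token count. All names here are hypothetical.

import torch

def split_perplexities(logits: torch.Tensor, labels: torch.Tensor, pad_id: int = 0):
    # logits: [seq_len, vocab], labels: [seq_len].
    # Drop padding positions and the final position, which has no next-token label.
    valid = labels != pad_id
    logits, labels = logits[valid], labels[valid]
    log_probs = torch.log_softmax(logits[:-1].float(), dim=-1)
    # Log-probability assigned to each true next token.
    token_log_probs = log_probs.gather(1, labels[1:].unsqueeze(1)).squeeze(1)

    mid = len(token_log_probs) // 2

    def ppl(lp: torch.Tensor) -> float:
        # Perplexity = exp of the mean negative log-likelihood over the segment.
        return torch.exp(-lp.mean()).item()

    prompt_ppl = ppl(token_log_probs[:mid])
    generation_ppl = ppl(token_log_probs[mid:])
    sequence_ppl = ppl(token_log_probs)
    return prompt_ppl, generation_ppl, sequence_ppl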