attacks.py

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
from tqdm import tqdm
from collections import Counter

# Prompt the user to input the filename containing the model outputs.
output_path = str(input("Enter the name of the file containing the model outputs (it should be placed in the \"output\" folder"))

# Initialize counters for tracking correct and total safety category assessments.
counter_correct_category = Counter()
counter_category = Counter()
total_correct = 0

def load_data(file):
    """
    Load data from a JSONL file.

    Args:
        file (str): Path to the JSONL file.

    Returns:
        list: List of dictionaries, each representing a line from the JSONL file.
    """
    data = []
    with open(file, "r") as json_file:
        for line in json_file:
            data.append(json.loads(line))
    return data

# Load the model output data from the specified file.
data = load_data(f"output/{output_path}.jsonl")

# Specify the model ID and configuration for loading the tokenizer and model.
model_id = "meta-llama/LlamaGuard-7b"
device = "cuda"
dtype = torch.bfloat16

# Load the tokenizer and model from the pretrained model ID.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=device)

def moderate(chat):
    """
    Generate a moderation response for the given chat conversation.

    Args:
        chat (list): List of dictionaries representing the chat messages.

    Returns:
        str: Moderation result ("safe" or "unsafe").
    """
    # Encode the chat conversation into input IDs for the model.
    input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt").to(device)
    
    # Generate a response from the model with a maximum of 2000 new tokens.
    output = model.generate(input_ids=input_ids, max_new_tokens=2000, pad_token_id=0)
    
    # Decode the model's response, skipping special tokens.
    prompt_len = input_ids.shape[-1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

# Open the output file for writing the evaluation results.
with open(f"experiments/{output_path}.jsonl", "w") as out:
    print(f"> Evaluating {output_path} model outputs...")
    
    # Iterate over each element in the data using a progress bar.
    for elem in tqdm(data):
        # Extract and clean the prompt and response from the element.
        prompt = elem["prompt"].replace("### Instruction:\n", "").strip()
        response = elem["response"].replace("### Response:\n", "").strip()
        safety_category = elem["category"]

        # Moderate the chat conversation using the model.
        llama_guard_output = moderate([
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": response},
        ])

        # Update the counters based on the moderation result.
        if llama_guard_output == "safe":
            counter_correct_category[safety_category] += 1
            total_correct += 1
            elem["llama_guard_output"] = "safe"
        else:
            elem["llama_guard_output"] = "unsafe"
        
        counter_category[safety_category] += 1

        # Write the updated element with the moderation result to the output file.
        json.dump(elem, out)
        out.write("\n")

# Print the category-wise and overall safety scores for the model.
print(f"> Category-wise safety score for {output_path} model:")
for category in counter_category:
    print(f"    - {category}: {counter_correct_category[category] / counter_category[category]}")

print("--------------------")
print(f"> Overall safety score for {output_path} model: {total_correct / len(data)}")