utils.py
import re

import nltk
import requests
import tensorflow as tf
from nltk.tokenize import sent_tokenize
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

nltk.download('punkt')

DATA_SOURCE_URL = "https://raw.githubusercontent.com/citp/privacy-policy-historical/master/"
GITHUB_API_URL = "https://api.github.com/repos/citp/privacy-policy-historical/git/trees/master?recursive=1"

def get_file_content(file_url):
    """Download and return the raw text of a single file."""
    response = requests.get(file_url)
    return response.text

def get_md_files_from_repo(api_url=GITHUB_API_URL):
    """Return the repository tree entries for all Markdown (.md) files."""
    response = requests.get(api_url)
    data = response.json()
    if 'tree' not in data:
        print("Failed to fetch the repository structure.")
        return []
    md_files = [item for item in data['tree'] if item['path'].endswith('.md')]
    return md_files
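
# Note (illustrative): each entry in the returned list is a dict from the
# GitHub git-trees API, e.g. {'path': 'a/acme.com.md', 'mode': '100644',
# 'type': 'blob', ...}; only the 'path' key is used downstream.
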
def get_training_data(max_files=None):
    """Download the Markdown privacy policies, optionally capped at max_files."""
    md_files = get_md_files_from_repo()
    # Limit the number of files based on the max_files parameter
    md_files = md_files[:max_files] if max_files else md_files
    policies = []
    for file_info in md_files:
        file_content = get_file_content(DATA_SOURCE_URL + file_info['path'])
        policies.append(file_content)
    return policies
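
# Example usage (illustrative): fetch a small sample for experimentation.
#   policies = get_training_data(max_files=5)
#   print(len(policies), policies[0][:200])
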
def save_data(data: list, text_file='data.txt'):
    """Write one item per line to text_file."""
    with open(text_file, 'w', encoding='utf-8') as f:
        for line in data:
            f.write(line + '\n')

def strip_markdown(md_content: str):
    """Strip common Markdown markup, leaving plain text."""
    # Remove headers (#, ##, ###, etc.)
    md_content = re.sub(r'#+\s', '', md_content)
    # Remove list numbers and bullet points (1., 2., *, etc.) at line start;
    # anchoring both alternatives keeps mid-sentence asterisks intact
    md_content = re.sub(r'^(?:\d+\.|\*)\s', '', md_content, flags=re.MULTILINE)
    # Remove bold, italic, etc. (**text**, *text*, __text__, _text_)
    md_content = re.sub(r'(\*\*|__|\*|_)(.*?)(\*\*|__|\*|_)', r'\2', md_content)
    # Remove links [text](url)
    md_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', md_content)
    # Remove inline code `code`
    md_content = re.sub(r'`(.*?)`', r'\1', md_content)
    # Remove blockquotes
    md_content = re.sub(r'^>\s', '', md_content, flags=re.MULTILINE)
    return md_content
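
# Quick sanity check (illustrative):
#   strip_markdown('## Privacy **Policy** [details](https://example.com)')
#   -> 'Privacy Policy details'
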
def generate_input_text(KEY_TERMS):
    """Build labeled train/validation sentence splits from the policy corpus."""
    data = get_training_data(max_files=10)
    formatted_data = strip_markdown(''.join(data))
    # Split the cleaned text into sentences
    texts = sent_tokenize(formatted_data)
    save_data(texts)
    # Label each sentence by the presence of any key term (case-insensitive)
    labels = [1 if any(term in sentence.lower() for term in KEY_TERMS) else 0 for sentence in texts]
    # Split data into training and validation sets
    texts_train, texts_val, labels_train, labels_val = train_test_split(texts, labels, test_size=0.2, random_state=42)
    return texts_train, texts_val, labels_train, labels_val
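
# Example usage (illustrative; KEY_TERMS is supplied by the caller and should
# be lowercase, since each sentence is lowercased before matching):
#   texts_train, texts_val, labels_train, labels_val = generate_input_text(
#       ['third party', 'cookies', 'retention'])
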
def generate_summary(policy_text, model, tokenizer, MAX_LENGTH, summary_length=3):
    """Extractively summarize policy_text by keeping the top-scoring sentences."""
    # Split the policy text into sentences
    sentences = sent_tokenize(policy_text)
    # Tokenize the sentences
    encodings = tokenizer(sentences, truncation=True, padding='max_length', max_length=MAX_LENGTH, return_tensors="tf")
    # To inspect the encoded input:
    # tokenizer.decode(encodings['input_ids'][0].numpy(), skip_special_tokens=True)
    # Predict an importance score for each sentence
    predictions = model.predict([encodings['input_ids'], encodings['attention_mask']])
    predictions = tf.squeeze(predictions).numpy()
    # Get indices of the top-scoring sentences
    top_indices = predictions.argsort()[-summary_length:][::-1]
    # Extract these sentences from the policy text
    summarized_sentences = [sentences[i] for i in top_indices]
    # Combine them to form the summary
    summary = '\n\n'.join(summarized_sentences)
    return summary
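
# Example usage (illustrative sketch; assumes `model` is a Keras classifier
# taking [input_ids, attention_mask] and emitting one importance score per
# sentence, and that `tokenizer` and MAX_LENGTH match the training setup):
#   summary = generate_summary(policy_text, model, tokenizer, MAX_LENGTH=128)
#   print(summary)
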
# Custom F1-score metric. It delegates to scikit-learn, so it only works when
# the tensors are evaluated eagerly (see the compile example below).
def f1_metric(y_true, y_pred):
    y_pred = tf.round(y_pred)
    return f1_score(y_true, y_pred, average='weighted')
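
# Example (illustrative): because f1_metric calls into scikit-learn, compile
# the model with run_eagerly=True so y_true / y_pred arrive as eager tensors
# that can be converted to NumPy arrays:
#   model.compile(optimizer='adam', loss='binary_crossentropy',
#                 metrics=[f1_metric], run_eagerly=True)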