Skip to content

Commit

Permalink
handle merged words, incorrect spaces
Browse files Browse the repository at this point in the history
  • Loading branch information
Mirtia committed Apr 4, 2023
1 parent 33964d7 commit 8cd87ca
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 14 deletions.
8 changes: 6 additions & 2 deletions src/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pypdf
import re


class PDFToTextConverter:
"""
A class that converts .pdf files to .txt files.
Expand All @@ -26,9 +27,12 @@ def _validate_file(self, filename: str) -> str:
def _read_file(self, filename: str) -> str:
    """Return the text of all pages of a PDF as one space-joined string.

    The path is checked with ``_validate_file`` first. The document is
    then cloned into a ``pypdf.PdfWriter`` so that annotations can be
    removed before the page text is extracted.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The extracted text of every page, joined with single spaces, with
        every ``-`` character removed.
    """
    self._validate_file(filename)
    with open(filename, mode="rb") as f:
        # Build the writer from a reader (not from the raw stream) so the
        # whole document is cloned before annotations are stripped.
        reader = pypdf.PdfReader(f)
        writer = pypdf.PdfWriter(clone_from=reader)
        writer.remove_annotations(subtypes=None)

        # NOTE(review): this drops every "-", not only end-of-line
        # hyphenation, so genuinely hyphenated words are merged as well.
        return " ".join(page.extract_text().replace("-", "")
                        for page in writer.pages)

def export(self, filename: str) -> None:
with open(filename, mode="w", encoding="utf-8") as f:
Expand Down
32 changes: 24 additions & 8 deletions src/nltk_summarizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import re
from concurrent.futures import ThreadPoolExecutor
from itertools import islice
import wordninja
import enchant

import nltk
from nltk import FreqDist
Expand Down Expand Up @@ -33,14 +35,10 @@ def __init__(self, filename: str) -> None:
self._download()
self.stop_words = set(stopwords.words("english"))
self.text = self._sanitize(self.text)
self.custom_stop_words = self._import_stop_words()
self.chunks = PDFSummarizer.split_text(self.text, self.CHUNK_SIZE)
self.summarizer = pipeline(task="summarization",
model="sshleifer/distilbart-cnn-12-6")

def _import_stop_words(self, filename=None):
pass

def _sanitize(self, text):
return re.sub(r"\[\d+\]", "",
re.sub(r"http\S+", "", text, flags=re.MULTILINE))
Expand Down Expand Up @@ -72,7 +70,6 @@ def process_concurrently(self, tokens: list, num_threads: int,

@staticmethod
def tokenize_sentences(chunks: list) -> list:
# Remove sentences with Fig or Figure, Table, Tab
return [token for chunk in chunks for token in sent_tokenize(chunk)]

@staticmethod
Expand All @@ -81,7 +78,10 @@ def tokenize_words(chunks: list) -> list:

@staticmethod
def filter_sentences(chunks: list) -> list:
return [line for line in chunks if len(line) > 1]
return [
token for token in chunks if len(token) > 1 and not any(
word in token for word in {"Figure", "Fig", "Tab", "Table"})
]

def filter_words(self, chunks: list) -> list:
return [
Expand Down Expand Up @@ -126,23 +126,39 @@ def summarize(self, quiet=False) -> None:
scores[sentence] = sentence_score

self.CHUNK_SIZE = 1024

raw_summary_chunks = PDFSummarizer.split_text(
" ".join(
"".join(
sorted(scores, key=scores.get,
reverse=True)[:self.NUM_SENTENCES]), self.CHUNK_SIZE)

with ThreadPoolExecutor() as executor:
futures = list(
executor.map(self._summarize_chunk, raw_summary_chunks))
self.summary = "".join(futures)
self._correct_summary()

def _summarize_chunk(self, chunk) -> str:
return self.summarizer(chunk,
max_length=self.MAX_LENGTH,
min_length=self.MIN_LENGTH,
do_sample=True)[0]["summary_text"]

def _correct_summary(self) -> None:
eng_dict = enchant.Dict("en_US")
words = word_tokenize(self.summary)
clean_summary = []
for word in words:
split_words = wordninja.split(word)
if len(word) > 5 and not eng_dict.check(word) and all(
eng_dict.check(w) for w in split_words):
clean_summary.extend(split_words)
else:
clean_summary.append(word)
self.summary = re.sub("\s+", " ", " ".join(clean_summary)).strip()

def export(self, filename: str) -> None:
    """Write the current summary to ``filename`` as UTF-8 text.

    The file is opened in write mode, so existing content is replaced.

    Args:
        filename: Path of the output text file.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.write(self.summary)

def extract_keypoints(self) -> None:
    """Placeholder for key-point extraction; not implemented yet."""
    return None
8 changes: 4 additions & 4 deletions src/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ def main():
summarizer_NTLK = nltk_summarizer.PDFSummarizer(args.file)
summarizer_NTLK.summarize()
summarizer_NTLK.export(args.output)
elif args.mode == "pegasus":
summarizer_pegasus = transformers_summarizer.PDFSummarizer(args.file)
summarizer_pegasus.summarize()
summarizer_pegasus.export(args.output)
# elif args.mode == "pegasus":
# summarizer_pegasus = transformers_summarizer.PDFSummarizer(args.file)
# summarizer_pegasus.summarize()
# summarizer_pegasus.export(args.output)


if __name__ == "__main__":
Expand Down

0 comments on commit 8cd87ca

Please sign in to comment.