handle merged words, incorrect spaces

Mirtia · Apr 4, 2023 · 8cd87ca
1 parent 33964d7
commit 8cd87ca
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 14 deletions.
diff --git a/src/converter.py b/src/converter.py
@@ -2,6 +2,7 @@
 import pypdf
 import re
 
+
 class PDFToTextConverter:
  """
  A class that converts .pdf files to .txt files.
@@ -26,9 +27,12 @@ def _validate_file(self, filename: str) -> str:
  def _read_file(self, filename: str) -> str:
  self._validate_file(filename)
  with open(filename, mode="rb") as f:
- writer = pypdf.PdfWriter(clone_from=f)
+ reader = pypdf.PdfReader(f)
+ writer = pypdf.PdfWriter(clone_from=reader)
+ writer.remove_annotations(subtypes=None)
+
  return " ".join(page.extract_text().replace("-", "")
- for page in writer.pages)
+  for page in writer.pages)
 
  def export(self, filename: str) -> None:
  with open(filename, mode="w", encoding="utf-8") as f:

diff --git a/src/nltk_summarizer.py b/src/nltk_summarizer.py
@@ -1,6 +1,8 @@
 import re
 from concurrent.futures import ThreadPoolExecutor
 from itertools import islice
+import wordninja
+import enchant
 
 import nltk
 from nltk import FreqDist
@@ -33,14 +35,10 @@ def __init__(self, filename: str) -> None:
  self._download()
  self.stop_words = set(stopwords.words("english"))
  self.text = self._sanitize(self.text)
- self.custom_stop_words = self._import_stop_words()
  self.chunks = PDFSummarizer.split_text(self.text, self.CHUNK_SIZE)
  self.summarizer = pipeline(task="summarization",
  model="sshleifer/distilbart-cnn-12-6")
 
- def _import_stop_words(self, filename=None):
- pass
-
  def _sanitize(self, text):
  return re.sub(r"\[\d+\]", "",
  re.sub(r"http\S+", "", text, flags=re.MULTILINE))
@@ -72,7 +70,6 @@ def process_concurrently(self, tokens: list, num_threads: int,
 
  @staticmethod
  def tokenize_sentences(chunks: list) -> list:
- # Remove sentences with Fig or Figure, Table, Tab
  return [token for chunk in chunks for token in sent_tokenize(chunk)]
 
  @staticmethod
@@ -81,7 +78,10 @@ def tokenize_words(chunks: list) -> list:
 
  @staticmethod
  def filter_sentences(chunks: list) -> list:
- return [line for line in chunks if len(line) > 1]
+ return [
+ token for token in chunks if len(token) > 1 and not any(
+ word in token for word in {"Figure", "Fig", "Tab", "Table"})
+ ]
 
  def filter_words(self, chunks: list) -> list:
  return [
@@ -126,23 +126,39 @@ def summarize(self, quiet=False) -> None:
  scores[sentence] = sentence_score
 
  self.CHUNK_SIZE = 1024
-
  raw_summary_chunks = PDFSummarizer.split_text(
- " ".join(
+ "".join(
  sorted(scores, key=scores.get,
  reverse=True)[:self.NUM_SENTENCES]), self.CHUNK_SIZE)
 
  with ThreadPoolExecutor() as executor:
  futures = list(
  executor.map(self._summarize_chunk, raw_summary_chunks))
  self.summary = "".join(futures)
+ self._correct_summary()
 
  def _summarize_chunk(self, chunk) -> str:
  return self.summarizer(chunk,
  max_length=self.MAX_LENGTH,
  min_length=self.MIN_LENGTH,
  do_sample=True)[0]["summary_text"]
 
+ def _correct_summary(self) -> None:
+ eng_dict = enchant.Dict("en_US")
+ words = word_tokenize(self.summary)
+ clean_summary = []
+ for word in words:
+ split_words = wordninja.split(word)
+ if len(word) > 5 and not eng_dict.check(word) and all(
+ eng_dict.check(w) for w in split_words):
+ clean_summary.extend(split_words)
+ else:
+ clean_summary.append(word)
+ self.summary = re.sub("\s+", " ", " ".join(clean_summary)).strip()
+
  def export(self, filename: str) -> None:
  with open(filename, mode="w", encoding="utf-8") as f:
  f.write(self.summary)
+
+ def extract_keypoints(self) -> None:
+ pass
diff --git a/src/test.py b/src/test.py
@@ -28,10 +28,10 @@ def main():
  summarizer_NTLK = nltk_summarizer.PDFSummarizer(args.file)
  summarizer_NTLK.summarize()
  summarizer_NTLK.export(args.output)
- elif args.mode == "pegasus":
- summarizer_pegasus = transformers_summarizer.PDFSummarizer(args.file)
- summarizer_pegasus.summarize()
- summarizer_pegasus.export(args.output)
+ # elif args.mode == "pegasus":
+ #  summarizer_pegasus = transformers_summarizer.PDFSummarizer(args.file)
+ #  summarizer_pegasus.summarize()
+ #  summarizer_pegasus.export(args.output)
 
 
 if __name__ == "__main__":