Skip to content

Commit

Permalink
add sumy option
Browse files Browse the repository at this point in the history
  • Loading branch information
Mirtia committed Apr 4, 2023
1 parent 8cd87ca commit 9c45271
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 12 deletions.
7 changes: 4 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
nltk
pypdf
nltk==3.6.7
pyenchant==3.2.2
pypdf==3.7.0
transformers
torch
wordninja==2.0.0
3 changes: 0 additions & 3 deletions src/nltk_summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,3 @@ def _correct_summary(self) -> None:
def export(self, filename: str) -> None:
    """Write the current summary to ``filename`` as UTF-8 text.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.write(self.summary)

def extract_keypoints(self) -> None:
pass
30 changes: 30 additions & 0 deletions src/sumy_summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

from converter import PDFToTextConverter


class PDFSummarizer(PDFToTextConverter):
    """Extractive summarizer for PDF text based on sumy's LSA summarizer.

    The text to summarize is read from ``self.text``, which is presumably
    populated by ``PDFToTextConverter`` -- confirm against ``converter``.
    The result is stored in ``self.summary`` and can be written to disk
    with :meth:`export`.
    """

    # Language name handed to sumy's stemmer, tokenizer and stop-word list.
    LANGUAGE = "english"
    # Sentence count passed to the summarizer when building the summary.
    NUM_SENTENCES = 20

    def __init__(self, filename) -> None:
        """Initialise the converter for ``filename`` with an empty summary."""
        super().__init__(filename)
        self.summary = ""

    def summarize(self) -> None:
        """Build the summary from ``self.text`` and store it in ``self.summary``.

        The summary is rebuilt from scratch on every call, so calling this
        method repeatedly does not append duplicate content.
        """
        stemmer = Stemmer(self.LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        parser = PlaintextParser.from_string(self.text,
                                             Tokenizer(self.LANGUAGE))
        sentences = summarizer(parser.document, self.NUM_SENTENCES)
        # Join with a space so consecutive sentences do not run together,
        # and use str() rather than the private ``_text`` attribute.
        self.summary = " ".join(str(sentence) for sentence in sentences)

    def export(self, filename: str) -> None:
        """Write the current summary to ``filename`` as UTF-8 text.

        The file is opened in write mode, so existing content is replaced.
        """
        with open(filename, mode="w", encoding="utf-8") as f:
            f.write(self.summary)
15 changes: 9 additions & 6 deletions src/test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import argparse
import nltk_summarizer
import transformers_summarizer
import converter
import sumy_summarizer

def main():
parser = argparse.ArgumentParser(description='Get input .pdf')
Expand All @@ -28,11 +28,14 @@ def main():
summarizer_NTLK = nltk_summarizer.PDFSummarizer(args.file)
summarizer_NTLK.summarize()
summarizer_NTLK.export(args.output)
# elif args.mode == "pegasus":
# summarizer_pegasus = transformers_summarizer.PDFSummarizer(args.file)
# summarizer_pegasus.summarize()
# summarizer_pegasus.export(args.output)

elif args.mode == "pegasus":
summarizer_pegasus = transformers_summarizer.PDFSummarizer(args.file)
summarizer_pegasus.summarize()
summarizer_pegasus.export(args.output)
elif args.mode == "sumy":
summarizer_sumy = sumy_summarizer.PDFSummarizer(args.file)
summarizer_sumy.summarize()
summarizer_sumy.export(args.output)

if __name__ == "__main__":
main()

0 comments on commit 9c45271

Please sign in to comment.