mrsk1883 committed on
Commit
d9986c3
1 Parent(s): 67d721c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -25
app.py CHANGED
@@ -10,65 +10,60 @@ model_name = "pszemraj/led-base-book-summary"
10
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
11
  tokenizer = AutoTokenizer.from_pretrained(model_name)
12
 
 
def extract_first_sentence(text):
    """Return the first sentence of ``text``.

    A sentence boundary is a whitespace character that directly follows
    ``.`` or ``?``. Two negative lookbehinds keep the split from firing
    after dotted abbreviations such as ``e.g.`` and after a capitalised
    two-letter abbreviation such as ``Dr.``.

    Args:
        text: The text to inspect. ``None`` or an empty string is accepted.

    Returns:
        The first sentence with surrounding whitespace removed, or an empty
        string when ``text`` is ``None`` or empty. Text without any sentence
        boundary is returned whole (stripped).
    """
    if not text:
        return ""
    # re.split always returns at least one element, so indexing [0] is safe;
    # maxsplit=1 stops scanning once the first boundary has been found.
    pieces = re.split(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s',
        text.strip(),
        maxsplit=1,
    )
    return pieces[0]
19
 
 
def extract_abstract_and_summarize(pdf_file):
    """Summarize a PDF's abstract in one sentence and synthesize speech for it.

    The abstract is taken from the first page whose text contains the word
    "Abstract" (case-insensitive): everything after that word up to the next
    "Introduction", "Methodology" or "Conclusion" heading on the same page,
    or to the end of the page when none of those follows.

    Args:
        pdf_file: Path of the PDF file to open.

    Returns:
        A tuple ``(summary_sentence, audio_bytes, abstract_text)`` holding the
        first sentence of the model summary, the bytes gTTS produced for that
        sentence, and the stripped abstract text.

    Raises:
        Exception: If anything fails, including when no abstract is found.
            The message is the message of the underlying error, which is
            kept as ``__cause__``.
    """
    try:
        abstract_text = ''
        with open(pdf_file, 'rb') as file:
            pdf_reader = PdfReader(file)
            for page in pdf_reader.pages:
                # Guard against pages that yield no text at all.
                text = page.extract_text() or ''
                abstract_match = re.search(r'\bAbstract\b', text, re.IGNORECASE)
                if not abstract_match:
                    continue

                start_index = abstract_match.end()
                # Check for the next heading or section marker
                next_section_match = re.search(
                    r'\b(?:Introduction|Methodology|Conclusion)\b',
                    text[start_index:],
                )
                if next_section_match:
                    end_index = start_index + next_section_match.start()
                    abstract_text = text[start_index:end_index]
                else:
                    abstract_text = text[start_index:]
                break  # Exit loop once abstract is found

        # Summarizing an empty string only produces noise; fail loudly instead.
        if not abstract_text.strip():
            raise ValueError("No abstract found in the uploaded PDF.")

        # Summarize the extracted abstract
        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # Drop special tokens so they are neither displayed nor read aloud.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)

        # Generate audio
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()

    except Exception as e:
        # Same exception type and message as before, now with the cause chained.
        raise Exception(str(e)) from e
64
 
 
# Gradio front end: one PDF upload in; summary sentence, audio and abstract out.
# extract_abstract_and_summarize returns three values, so three output
# components are declared to receive them.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence, and generates an audio of it. Only upload PDFs with abstracts.
Please read the README.MD for information about the app and sample PDFs.""",
)

interface.launch(share=True)
 
10
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
11
  tokenizer = AutoTokenizer.from_pretrained(model_name)
12
 
13
+
def extract_first_sentence(text):
    """Extract the first sentence from a given text.

    A sentence boundary is a run of whitespace that directly follows ``.``
    or ``?``. Two negative lookbehinds keep the split from firing after
    dotted abbreviations such as ``e.g.`` and after a capitalised two-letter
    abbreviation such as ``Dr.``.

    Args:
        text: The text to inspect. ``None`` or an empty string is accepted.

    Returns:
        The first sentence with surrounding whitespace removed, or an empty
        string when ``text`` is ``None`` or empty. Text without any sentence
        boundary is returned whole (stripped).
    """
    if not text:
        return ""
    # Only the first boundary matters, so stop splitting after one match.
    # re.split never returns an empty list, which makes [0] always valid.
    return re.split(
        r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s+",
        text.strip(),
        maxsplit=1,
    )[0]
23
 
24
+
def extract_abstract_and_summarize(pdf_file):
    """Extract the abstract, summarize it in one sentence and generate audio.

    The abstract is taken from the first page whose text contains the word
    "Abstract" (case-insensitive): everything after that word up to the next
    "Introduction" (case-insensitive) on the same page, or to the end of the
    page when no "Introduction" follows.

    Args:
        pdf_file: Path of the PDF file to open.

    Returns:
        A tuple ``(summary_sentence, audio_bytes, abstract_text)`` holding the
        first sentence of the model summary, the bytes gTTS produced for that
        sentence, and the stripped abstract text.

    Raises:
        Exception: If anything fails, including when no abstract is found.
            The message is the message of the underlying error, which is
            kept as ``__cause__``.
    """
    try:
        abstract_text = ""
        with open(pdf_file, "rb") as file:
            pdf_reader = PdfReader(file)
            for page in pdf_reader.pages:
                # Guard against pages that yield no text at all.
                text = page.extract_text() or ""
                abstract_match = re.search(r"\bAbstract\b", text, re.IGNORECASE)
                if abstract_match is None:
                    continue

                start_index = abstract_match.end()
                introduction_match = re.search(
                    r"\bIntroduction\b", text[start_index:], re.IGNORECASE
                )
                if introduction_match:
                    end_index = start_index + introduction_match.start()
                else:
                    # A None slice bound means "to the end of the page".
                    end_index = None
                abstract_text = text[start_index:end_index]
                break  # Exit loop once abstract is found

        # Summarizing an empty string only produces noise; fail loudly instead.
        if not abstract_text.strip():
            raise ValueError("No abstract found in the uploaded PDF.")

        # Summarize the extracted abstract
        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # Drop special tokens so they are neither displayed nor read aloud.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)

        # Generate audio
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()

    except Exception as e:
        # Same exception type and message as before, now with the cause chained.
        raise Exception(str(e)) from e
60
 
61
+
# Gradio front end: one PDF upload in; summary sentence, audio and abstract out.
# extract_abstract_and_summarize returns three values, so three output
# components are declared to receive them.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence with information till "Introduction", and generates an audio of it. Only upload PDFs with abstracts. Please read the README.MD for information about the app and sample PDFs.""",
)

interface.launch(share=True)