html-viz / app.py
anton-l's picture
anton-l HF staff
titles
618b24a
raw
history blame contribute delete
No virus
1.57 kB
import re
import gradio as gr
import requests
from inscriptis import get_text
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.config import ParserConfig
from readability import Document
# Shared inscriptis parser configuration, built from the "strict" CSS profile;
# passed to `get_text` when converting cleaned HTML to plaintext.
INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])
# Upper bound (in seconds) on the HTTP request, so that an unresponsive host
# cannot block a call forever (requests applies no timeout by default).
_REQUEST_TIMEOUT_SECONDS = 10


def extract_text(url: str):
    """Download a web page and extract its title and main article text.

    Args:
        url: Address of the page to download.

    Returns:
        A 4-tuple ``(title, text, clean_html, html)`` of strings:

        * ``title`` -- short title reported by readability-lxml.
        * ``text`` -- plaintext rendering of ``clean_html`` produced by
          inscriptis, with runs of blank lines collapsed; ``""`` when the
          rendering contains no word characters.
        * ``clean_html`` -- article body HTML produced by readability-lxml.
        * ``html`` -- the raw response body decoded as UTF-8.

        All four values are ``""`` when ``url`` is blank or the response
        body is empty or whitespace-only.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            exceeds the timeout.
    """
    # Nothing to fetch: mirror the "empty page" result instead of letting
    # requests fail on a missing URL.
    if not url or not url.strip():
        return "", "", "", ""

    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    # Not every page is valid UTF-8; substitute undecodable bytes rather than
    # aborting the whole extraction with a UnicodeDecodeError.
    html = response.content.decode("utf-8", errors="replace")
    if not html.strip():
        return "", "", "", ""

    parsed_doc = Document(html)
    # get the body of the article with readability-lxml
    title = parsed_doc.short_title()
    clean_html = parsed_doc.summary(html_partial=True)
    # The parsed document is no longer needed once title and body are extracted.
    del parsed_doc

    # get the formatted plaintext with inscriptis
    text = get_text(clean_html, INSCRIPTIS_CONFIG).strip()
    if not re.search(r"\w+", text):
        # no words found, only whitespace and punctuation
        return title, "", clean_html, html

    # remove excessive empty lines
    text = re.sub(r"\n\s*\n", "\n\n", text)
    return title, text, clean_html, html
# Output widgets: one per value returned by `extract_text`, in the same order.
title = gr.Textbox(label="Title")
text = gr.Textbox(lines=10, label="Text (`inscriptis` output)")
clean_html = gr.Textbox(lines=10, label="Clean HTML (`readability-lxml` output)")
html = gr.Textbox(lines=10, label="Raw HTML response")

# A single URL textbox feeds `extract_text`; its four results fill the
# textboxes declared above. The example rows each supply one URL.
demo = gr.Interface(
    fn=extract_text,
    inputs=gr.Textbox(label="URL", placeholder="https://hf.co/"),
    outputs=[title, text, clean_html, html],
    examples=[
        ["https://huggingface.co/blog/peft"],
        [
            "https://www.nytimes.com/2023/03/08/technology/chatbots-disrupt-internet-industry.html"
        ],
    ],
)

# Launch the interface.
demo.launch()