"""Gradio demo: fetch a web page and extract its main article text.

The pipeline is: download the page with `requests`, isolate the article body
with `readability-lxml`, then render that HTML fragment to plain text with
`inscriptis`.
"""

import re
from typing import Tuple

import gradio as gr
import requests
from inscriptis import get_text
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.config import ParserConfig
from readability import Document

INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])

# Upper bound (in seconds) on how long a single page download may block.
# Without a timeout, `requests` waits indefinitely on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 30


def extract_text(url: str) -> Tuple[str, str, str, str]:
    """Download *url* and extract the article title and text from it.

    Args:
        url: Address of the page to fetch.

    Returns:
        A 4-tuple ``(title, text, clean_html, html)``:

        * ``title`` -- short title reported by readability-lxml.
        * ``text`` -- plain-text rendering of the article body produced by
          inscriptis, or ``""`` when the body contains no word characters.
        * ``clean_html`` -- article body as an HTML fragment
          (readability-lxml summary).
        * ``html`` -- the decoded raw response body.

        All four values are ``""`` when the response body is empty or
        whitespace-only.

    Raises:
        requests.RequestException: If the download fails or exceeds
            ``REQUEST_TIMEOUT_SECONDS``.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

    # Most pages are UTF-8, so try that first. Pages in another encoding used
    # to crash here with UnicodeDecodeError; fall back to `response.text`,
    # which decodes with the charset declared in the HTTP headers or, failing
    # that, an encoding detected from the body.
    try:
        html = response.content.decode("utf-8")
    except UnicodeDecodeError:
        html = response.text

    if not html.strip():
        return "", "", "", ""

    # get the body of the article with readability-lxml
    parsed_doc = Document(html)
    title = parsed_doc.short_title()
    clean_html = parsed_doc.summary(html_partial=True)
    # The parsed document is no longer needed; drop the reference early.
    del parsed_doc

    # get the formatted plaintext with inscriptis
    text = get_text(clean_html, INSCRIPTIS_CONFIG).strip()

    if not re.search(r"\w+", text):
        # no words found, only whitespace and punctuation
        return title, "", clean_html, html

    # remove excessive empty lines
    text = re.sub(r"\n\s*\n", "\n\n", text)

    return title, text, clean_html, html


# Output components, in the same order as the tuple returned by extract_text.
title = gr.Textbox(label="Title")
text = gr.Textbox(label="Text (`inscriptis` output)", lines=10)
clean_html = gr.Textbox(label="Clean HTML (`readability-lxml` output)", lines=10)
html = gr.Textbox(label="Raw HTML response", lines=10)

demo = gr.Interface(
    extract_text,
    gr.Textbox(placeholder="https://hf.co/", label="URL"),
    [title, text, clean_html, html],
    examples=[
        ["https://huggingface.co/blog/peft"],
        [
            "https://www.nytimes.com/2023/03/08/technology/chatbots-disrupt-internet-industry.html"
        ],
    ],
)

demo.launch()