"""Fetch web pages and turn their main text content into a list of lines.

The module downloads a page with trafilatura, extracts its text, and offers a
helper that re-chunks over-long lines into sentence groups using spaCy.
"""

# NOTE(review): `config` is not used anywhere in this file, and `distutils`
# was removed from the standard library in Python 3.12, where this import
# fails. It is kept because other modules may rely on it -- confirm and drop.
from distutils.command.config import config
import os
import sys
from time import sleep

import requests
import spacy
import trafilatura
from trafilatura.meta import reset_caches
from trafilatura.settings import DEFAULT_CONFIG

# The spaCy model is (re)installed every time this module is imported so that
# the `spacy.load` call below can find it.
os.system("python -m spacy download en_core_web_sm")
nlp = spacy.load('en_core_web_sm')

# NOTE(review): DEFAULT_CONFIG is presumably a ConfigParser; if so, setting an
# attribute on it does not change the MAX_FILE_SIZE option trafilatura reads
# (that would be DEFAULT_CONFIG.set("DEFAULT", "MAX_FILE_SIZE", "50000")).
# Left untouched because making it effective would change which pages are
# accepted -- confirm the intended limit before fixing.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000


def get_page(url):
    """Download ``url`` with trafilatura, retrying on failure.

    Up to three attempts are made, waiting three seconds between attempts.
    A progress message is written to stderr on success.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``trafilatura.fetch_url`` returned on the first successful
        attempt, or ``None`` if every attempt failed.
    """
    max_attempts = 3
    page = None
    for attempt in range(max_attempts):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
        except Exception:
            # Best-effort fetch: any download error counts as a failed
            # attempt. KeyboardInterrupt/SystemExit are deliberately not
            # caught so the process can still be stopped.
            page = None

        # Explicit check instead of `assert`, which is stripped under -O.
        if page is not None:
            print("Fetched "+url, file=sys.stderr)
            break

        # No point in waiting after the final attempt.
        if attempt < max_attempts - 1:
            sleep(3)
    return page


def url2lines(url):
    """Download ``url`` and return its extracted text as a list of lines.

    Args:
        url: Address of the page to download.

    Returns:
        The lines produced by ``html2lines`` for the downloaded page, or an
        empty list if the page could not be fetched.
    """
    page = get_page(url)
    if page is None:
        return []
    return html2lines(page)


def line_correction(lines, max_size=100):
    """Drop very short lines and re-chunk over-long ones by sentence.

    Lines shorter than 4 characters are discarded. Lines of at most
    ``max_size`` characters are kept unchanged. Longer lines are split into
    sentences with spaCy, and consecutive sentences are joined with single
    spaces until the accumulated chunk exceeds ``max_size`` characters, at
    which point the chunk is emitted and a new one is started.

    Args:
        lines: Iterable of text lines.
        max_size: Length threshold (in characters) above which a line is
            re-chunked, and above which an accumulated chunk is emitted.

    Returns:
        A new list of lines.
    """
    out_lines = []
    for line in lines:
        if len(line) < 4:
            continue

        if len(line) <= max_size:
            out_lines.append(line)
            continue

        # We split lines into sentences, but for performance we take only
        # the first 5k characters per line; anything beyond that is dropped.
        doc = nlp(line[:5000])

        chunk = ""
        for sent in doc.sents:
            if len(chunk) > 0:
                chunk += " "
            chunk += str(sent).strip()

            if len(chunk) > max_size:
                out_lines.append(chunk)
                chunk = ""

        # Emit whatever is left over after the last sentence.
        if len(chunk) > 0:
            out_lines.append(chunk)

    return out_lines


def html2lines(page):
    """Extract the main text of an HTML page and split it into lines.

    Args:
        page: HTML source of the page, or ``None``.

    Returns:
        The extracted text split on newlines, or an empty list if ``page`` is
        ``None``, blank, or trafilatura could not extract any text.
    """
    # Test for None *before* calling .strip(); the reverse order raises
    # AttributeError on a None page instead of returning an empty list.
    if page is None or len(page.strip()) == 0:
        return []

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    reset_caches()

    if text is None:
        return []

    # We just spit out the entire page, so need to reformat later.
    return text.split("\n")