import requests
from bs4 import BeautifulSoup
import html2text
import re
import os
from modules import app_constants, file_utils, app_logger
import json
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
import spacy
from duckduckgo_search import DDGS
nlp = spacy.load("en_core_web_sm")
# Use the shared application logger
app_logger = app_logger.app_logger
TMP_DIRECTORY = os.path.join(app_constants.WORKSPACE_DIRECTORY, 'tmp')
DEFAULT_SEARCH_COUNT = app_constants.SEARCH_COUNT

def download_and_clean(url):
    """Download a web page and return its visible text with markup stripped."""
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0"}
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Drop elements that carry no useful prose
        for element in soup(["script", "style", "img", "a"]):
            element.extract()
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        h.ignore_emphasis = True
        h.ignore_tables = True
        # html2text operates on HTML, so hand it the pruned markup
        clean_text = h.handle(str(soup))
        clean_text = re.sub(r'[^\w\s\n<>/\.]+', '', clean_text)  # keep '.' among the allowed characters
        clean_text = re.sub(r'\s+', ' ', clean_text).strip()
        return clean_text
    except requests.exceptions.RequestException as e:
        app_logger.error(f"Error while downloading and cleaning URL {url}: {str(e)}")
        return None
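
# A minimal usage sketch for download_and_clean (the URL is illustrative):
#
#     text = download_and_clean("https://example.com/article")
#     if text:
#         print(text[:200])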

def save_notes_to_file(topic, note, source_url):
    """Split `note` into sentence-aligned blocks and append them as JSONL records."""
    # Process the text with spaCy to get sentence boundaries
    doc = nlp(note)
    # Ensure the temp directory exists
    if not os.path.exists(TMP_DIRECTORY):
        os.makedirs(TMP_DIRECTORY)
    # Sanitize the filename and create the full path
    sanitized_filename = file_utils.sanitize_filename(topic) + '.jsonl'
    full_path = os.path.join(TMP_DIRECTORY, sanitized_filename)
    # Accumulate sentences into blocks of roughly 120-240 words
    text_block = ""
    word_count = 0
    with open(full_path, 'a') as file:
        for sent in doc.sents:
            sentence_word_count = len(sent.text.split())
            if word_count + sentence_word_count > 240:  # adding this sentence would exceed the max block size
                # Write the current text block to the file
                if word_count >= 120:  # only keep blocks that meet the minimum word count
                    data = {
                        "note": text_block,
                        "source_url": source_url
                    }
                    file.write(json.dumps(data) + '\n')
                # Start a new block with the current sentence
                text_block = sent.text
                word_count = sentence_word_count
            else:
                # Add the sentence to the current block
                text_block += ' ' + sent.text if text_block else sent.text
                word_count += sentence_word_count
        # Write any remaining block if it meets the same minimum word count
        if word_count >= 120:
            data = {
                "note": text_block,
                "source_url": source_url
            }
            file.write(json.dumps(data) + '\n')
    app_logger.info(f"Notes saved to file {full_path}")
    return full_path
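
# Each line of the resulting .jsonl file is one self-contained record,
# e.g. (values illustrative):
#
#     {"note": "First sentence. Second sentence. ...", "source_url": "https://example.com"}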

def url_list_downloader(url_list, topic):
    notes_file = None
    for url in url_list:
        try:
            text = download_and_clean(url)
            if text:
                notes_file = save_notes_to_file(topic, text, url)
        except Exception as e:
            app_logger.error(f"Error during processing for URL {url}: {e}")
    return notes_file

def search_term_ddg(topic, count=DEFAULT_SEARCH_COUNT):
    """Ask the LLM for related keywords, then collect result URLs from DuckDuckGo."""
    try:
        llm = ChatOpenAI(
            model_name=app_constants.MODEL_NAME,
            openai_api_key=app_constants.openai_api_key,
            base_url=app_constants.local_model_uri,
            streaming=True
        )
        prompt = [
            SystemMessage(content="Generate 5 plain keywords, comma separated, based on the user input. For example: cat, bat, monkey, donkey, eagle"),
            HumanMessage(content=topic),
        ]
        response = llm.invoke(prompt)
        # Extract the string content from the response object
        if hasattr(response, 'content'):
            search_keywords = response.content
        else:
            raise ValueError("Invalid response format")
        # Split and trim the keywords, keeping at most 8
        search_keywords = [keyword.strip() for keyword in search_keywords.split(',')]
        search_keywords = search_keywords[:8]
        urls = []
        # Initialize DDGS with a timeout
        with DDGS(timeout=3) as ddgs:
            for term in search_keywords:
                # Fetch results for each search term
                results = ddgs.text(f"{topic} {term}", max_results=count)
                for result in results:
                    url = result['href']
                    # Skip document links that the HTML cleaner cannot handle
                    if not url.endswith(('.pdf', '.ppt', '.pptx', '.doc', '.docx')):
                        urls.append(url)
        return sorted(set(urls))
    except Exception as e:
        app_logger.error(f"An error occurred while searching for topic {topic}: {e}")
        return []

def explore_url_on_internet(topic, count=DEFAULT_SEARCH_COUNT):
    """Research `topic` on the internet and return the path of the notes file."""
    app_logger.info(f"Starting research on topic {topic}")
    # Sanitize the filename and create the full path
    sanitized_filename = file_utils.sanitize_filename(topic) + '.jsonl'
    full_path = os.path.join(TMP_DIRECTORY, sanitized_filename)
    # Reuse an existing notes file instead of downloading again
    if os.path.exists(full_path):
        app_logger.info(f"File already exists, skipping download: {full_path}")
        note_file = full_path
    else:
        url_list = search_term_ddg(topic, count)
        note_file = url_list_downloader(url_list, topic)
    app_logger.info(f"Research on Internet completed for {topic}, file: {note_file}")
    return note_file
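
# Minimal manual smoke test; assumes app_constants is fully configured
# (MODEL_NAME, openai_api_key, local_model_uri). The topic string is illustrative.
if __name__ == "__main__":
    notes = explore_url_on_internet("solar panel efficiency", count=3)
    print(f"Notes written to: {notes}")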