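"""Web research utilities: generate search keywords with an LLM, collect URLs
from DuckDuckGo, download and clean the pages, and save sentence-chunked
notes as JSONL for downstream use."""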
import requests
from bs4 import BeautifulSoup
import html2text
import re
import os
from modules import app_constants, file_utils, app_logger
import json
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
import spacy
from duckduckgo_search import DDGS
nlp = spacy.load("en_core_web_sm")

# Use the shared application logger
app_logger = app_logger.app_logger

TMP_DIRECTORY = os.path.join(app_constants.WORKSPACE_DIRECTORY, 'tmp')
DEFAULT_SEARCH_COUNT = app_constants.SEARCH_COUNT

def download_and_clean(url):
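    """Fetch a URL and return its visible text with markup, links, and
    images stripped. Returns None if the request fails."""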
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0"}
        response = requests.get(url, headers=headers, timeout=10)  # timeout so a slow host cannot hang the crawl
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        # Drop elements that carry no useful prose before conversion
        for tag in soup(["script", "style", "img", "a"]):
            tag.extract()

        # Convert the remaining HTML to plain text; passing the markup
        # (rather than soup.get_text()) lets the ignore_* options take effect
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        h.ignore_emphasis = True
        h.ignore_tables = True
        clean_text = h.handle(str(soup))
        clean_text = re.sub(r'[^\w\s\n<>/\.]+', '', clean_text)  # keep word chars, whitespace, <>/ and '.'
        clean_text = re.sub(r'\s+', ' ', clean_text).strip()
        return clean_text

    except requests.exceptions.RequestException as e:
        app_logger.error(f"Error while downloading and cleaning URL {url}: {str(e)}")
        return None

def save_notes_to_file(topic, note, source_url):
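    """Chunk a note into roughly 120-240 word blocks at sentence boundaries
    and append each block as a JSON line ({"note": ..., "source_url": ...})
    to a per-topic .jsonl file. Returns the path of the file written."""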
    # Process the text
    doc = nlp(note)

    # Ensure the temp directory exists
    os.makedirs(TMP_DIRECTORY, exist_ok=True)

    # Sanitize the filename and create the full path
    sanitized_filename = file_utils.sanitize_filename(topic)+'.jsonl'
    full_path = os.path.join(TMP_DIRECTORY, sanitized_filename)

    # Initialize variables for accumulating sentences
    text_block = ""
    word_count = 0

    # Append each sentence to form a text block and write it to the file
    with open(full_path, 'a') as file:
        for sent in doc.sents:
            sentence_word_count = len(sent.text.split())
            if word_count + sentence_word_count > 240:  # If adding the sentence exceeds the max limit
                # Write the current text block to the file
                if word_count >= 120:  # Ensure the text block meets the minimum word count
                    data = {
                        "note": text_block,
                        "source_url": source_url
                    }
                    file.write(json.dumps(data) + '\n')
                # Reset text block and word count
                text_block = sent.text
                word_count = sentence_word_count
            else:
                # Append the sentence to the current text block
                text_block = f"{text_block} {sent.text}" if text_block else sent.text
                word_count += sentence_word_count

        # Write any remaining text block if it meets the same minimum word count
        if word_count >= 120:
            data = {
                "note": text_block,
                "source_url": source_url
            }
            file.write(json.dumps(data) + '\n')

    app_logger.info(f"Notes saved to file {full_path}")
    return full_path


def url_list_downloader(url_list, topic):
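    """Download and clean every URL in url_list, appending the extracted
    text to the topic's notes file. Returns the notes file path, or None
    if no URL yielded text."""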
    notes_file = None
    for url in url_list:
        try:
            text = download_and_clean(url)
            if text:
                notes_file = save_notes_to_file(topic, text, url)
        except Exception as e:
            app_logger.error(f"Error during processing for URL {url}: {e}")
    return notes_file

def search_term_ddg(topic, count=DEFAULT_SEARCH_COUNT):
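    """Ask the LLM for keywords related to the topic, then query DuckDuckGo
    with each "<topic> <keyword>" pair. Returns a sorted, de-duplicated
    list of result URLs (document files excluded), or [] on error."""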
    try:
        llm = ChatOpenAI(
            model_name=app_constants.MODEL_NAME,
            openai_api_key=app_constants.openai_api_key,
            base_url=app_constants.local_model_uri,
            streaming=True
        )
        prompt = [
            SystemMessage(content="Generate 5 plain keywords, comma separated, based on the user input. For example: cat, bat, monkey, donkey, eagle"),
            HumanMessage(content=topic),
        ]
        response = llm.invoke(prompt)
        # Extract string content from the response object
        if hasattr(response, 'content'):
            search_keywords = response.content
        else:
            raise ValueError("Invalid response format")
        
        # Split and trim the keywords
        search_keywords = [keyword.strip() for keyword in search_keywords.split(',')]
        # Limit keywords to a maximum of 8
        search_keywords = search_keywords[:8]

        urls = []
        # Initialize DDGS with a timeout
        with DDGS(timeout=3) as ddgs:
            for term in search_keywords:
                # Fetch results for each search term
                results = ddgs.text(f"{topic} {term}", max_results=count)
                for result in results:
                    url = result['href']
                    # Skip document downloads; compare case-insensitively
                    if not url.lower().endswith(('.pdf', '.ppt', '.pptx', '.doc', '.docx')):
                        urls.append(url)
        return sorted(set(urls))

    except Exception as e:
        app_logger.error(f"An error occurred while searching for topic {topic}: {e}")
        return []

def explore_url_on_internet(topic, count=DEFAULT_SEARCH_COUNT):
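    """Entry point: reuse an existing notes file for the topic if one is
    present, otherwise search the web and build the notes file."""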
    app_logger.info(f"Starting research on topic {topic}")
    # Sanitize the filename and create the full path
    sanitized_filename = file_utils.sanitize_filename(topic)+'.jsonl'
    full_path = os.path.join(TMP_DIRECTORY, sanitized_filename)

    # Check if the file already exists
    if os.path.exists(full_path):
        app_logger.info(f"File already exists skipping download: ",full_path)
        note_file = full_path
    else:
        url_list = search_term_ddg(topic, count)
        note_file = url_list_downloader(url_list, topic)
    app_logger.info(f"Research on Internet completed for {topic}, file: {note_file}")
    return note_file
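

# Minimal usage sketch. Assumptions: app_constants supplies WORKSPACE_DIRECTORY,
# SEARCH_COUNT, MODEL_NAME, openai_api_key, and local_model_uri; the topic
# string and count below are purely illustrative.
if __name__ == "__main__":
    notes_path = explore_url_on_internet("solid state batteries", count=3)
    print(f"Notes file: {notes_path}")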