BeTaLabs committed on
Commit 93b22d2
1 Parent(s): 6bdbc55

Update main.py

Files changed (1)
  1. main.py +49 -26
main.py CHANGED
@@ -1,12 +1,21 @@
 # Import necessary libraries and modules
-import json
-import numpy as np
-from wiki import search as search_wikipedia
-from concurrent.futures import ThreadPoolExecutor
-from llm_handler import send_to_llm, agent, settings
-from params import OUTPUT_FILE_PATH, NUM_WORKERS
-from system_messages import SYSTEM_MESSAGES_VODALUS
-from topics import TOPICS
+import json  # Used for encoding and decoding JSON data
+import numpy as np  # Provides support for large, multi-dimensional arrays and matrices
+from wiki import search as search_wikipedia  # Import the search function from the wiki module and rename it
+from concurrent.futures import ThreadPoolExecutor  # Import ThreadPoolExecutor for concurrent execution
+from llm_handler import send_to_llm  # Import the send_to_llm function from the llm_handler module
+from params import OUTPUT_FILE_PATH, NUM_WORKERS, PROVIDER  # Import constants from the params module
+from datasets import load_dataset
+from huggingface_hub import list_datasets
+
+# Set the provider for the language model to "local-model"
+PROVIDER = "local-model"
+
+# Import system messages from the system_messages module
+from system_messages import (
+    SYSTEM_MESSAGES_VODALUS,
+)
+from topics import TOPICS  # Import topics from the topics module
 
 # Set the system messages to those specified in SYSTEM_MESSAGES_VODALUS
 SYSTEM_MESSAGES = SYSTEM_MESSAGES_VODALUS
@@ -21,27 +30,27 @@ You are an AI assistant with a dynamic learning and reasoning capability. Begin
 - Throughout, adopt a conversational yet insightful tone. Weave in philosophical questioning, humorous insights, and practical examples to elucidate complex ideas in an engaging manner. Your goal is to provide clear, detailed explanations that make your reasoning process evident, fostering an understanding that resonates on both intellectual and emotional levels.
 - Finally, remember that each task is a unique learning opportunity that contributes to your holistic knowledge framework. Synthesize insights from individual tasks into your broader understanding, continually refining your cognitive processes and response strategies. Your ultimate aim is to evolve continuously, optimizing how you integrate and apply new information for comprehensive, adaptive problem-solving.
 - Never end the conversation with a 'send-off' or 'last-line' that offers nothing of real value to the user.
-####DO NOT EVER MENTION THE DATASET AND THE ACT OF CRAFTING QUESTIONS OR RESPONSES WHILE GENERATING, YOU ARE NOT ALLOWED TO BREAK THE 4TH-WALL AND CONTAMINATE THE DATASET. DO NOT EVER SAY ANY PHRASES SUCH AS AND/OR SIMILAR TO: 'Here's a question that covers a very narrow topic in the SUBJECT_AREA'####"""
+####DO NOT EVER MENTION THE DATASET AND THE ACT OF CRAFTING QUESTIONS OR RESPONSES WHILE GENERATING, YOU ARE NOT ALLOWED TO BREAK THE 4TH-WALL AND CONTAMINATE THE DATASET. DO NOT EVER SAY ANY PHRASES SUCH AS AND/OR SIMILAR TO: 'Here's a question that covers a very narrow topic in the SUBJECT_AREA'####
+"""
 
 
 # Define a dictionary to hold context information for message generation
 msg_context = {"role": "system", "content": str(PROMPT_1)}
 
-# Define a function to generate data based on a given topic and system messages
+# Modify the generate_data function to accept a dataset parameter
 async def generate_data(
     topic_selected,
     system_message_generation,
     system_message_selected,
-    output_file_path
+    output_file_path,
+    llm_provider,
+    dataset
 ):
-    # Fetch Wikipedia content for the selected topic
-    wikipedia_info = search_wikipedia(topic_selected)
-
-    # Format Wikipedia search results into a readable string
-    wikipedia_summary = "\n".join([f"Title: {info['title']}, Abstract: {info['abstract']}" for info in wikipedia_info])
+    # Use the provided dataset instead of Wikipedia
+    dataset_info = f"Dataset: {dataset.info.description}\n"
+    dataset_summary = "\n".join([f"{k}: {v}" for k, v in dataset[0].items()])
 
-    # Append Wikipedia information to the system message generation prompt for LLM context
-    full_prompt_for_llm = f"{system_message_generation}\n\n---\nWikipedia Information to use in your response generation:\n{wikipedia_summary}"
+    full_prompt_for_llm = f"{system_message_generation}\n\n---\nGround Truth Information to use in your response generation:\n{dataset_info}\nSample entry:\n{dataset_summary}"
 
     # Create msg_context for LLM with Wikipedia info
     msg_context = {"role": "system", "content": full_prompt_for_llm}
@@ -50,7 +59,7 @@ async def generate_data(
     msg_list = [msg_context, {"role": "user", "content": f"Generate a question based on the SUBJECT_AREA: {topic_selected}"}]
 
     # Send to LLM for question generation
-    question, _ = send_to_llm(agent, msg_list)
+    question, _ = send_to_llm(llm_provider, msg_list)
 
     # Prepare message list for LLM to generate the answer
     msg_list_answer = [
@@ -59,7 +68,7 @@ async def generate_data(
     ]
 
     # Send to LLM for answer generation
-    answer, _ = send_to_llm(agent, msg_list_answer)
+    answer, _ = send_to_llm(llm_provider, msg_list_answer)
 
     # Prepare data for output (excluding usage information)
     data = {
@@ -74,12 +83,21 @@ async def generate_data(
 
     return data
 
+def load_huggingface_dataset(dataset_name, split="train"):
+    print(f"Loading dataset: {dataset_name}")
+    dataset = load_dataset(dataset_name, split=split)
+    print("Dataset loaded!")
+    return dataset
+
+def search_huggingface_datasets(query, limit=10):
+    datasets = list_datasets(filter=query, limit=limit)
+    return [dataset.id for dataset in datasets]
+
 # Define the main function to orchestrate the data generation process
-def main():
-    nn = 0  # Counter for successful generations
-    failed = 0  # Counter for failed generations
+def main(dataset):
+    nn = 0
+    failed = 0
     with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
-        # Create a list of futures, one for each topic
         futures = []
         for _ in range(NUM_WORKERS):
             topic_number = np.random.randint(0, len(TOPICS))
@@ -93,7 +111,9 @@ def main():
                     topic_selected,
                     system_message_generation,
                     system_message_selected,
-                    OUTPUT_FILE_PATH
+                    OUTPUT_FILE_PATH,
+                    PROVIDER,
+                    dataset
                 )
             )
 
@@ -110,5 +130,8 @@ def main():
             failed += 1
     print("=" * 132)
 
+
 if __name__ == "__main__":
-    main()
+    # Load a default dataset (e.g., Wikipedia) if no dataset is provided
+    default_dataset = load_huggingface_dataset("wikipedia", split="20220301.en")
+    main(default_dataset)
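A few review notes on this change. First, the new PROVIDER override: rebinding a name imported with "from params import PROVIDER" only changes the name inside main.py; the attribute params.PROVIDER is untouched, so any other module that reads it will still see the original value. A minimal sketch of that behavior (the "openai" default is illustrative, not from this repository):

    # params.py (illustrative)
    PROVIDER = "openai"

    # main.py
    from params import PROVIDER
    PROVIDER = "local-model"  # rebinds only main.py's copy of the name

    import params
    print(params.PROVIDER)  # still "openai"; the module attribute is unchanged

If the override is meant to apply globally, assigning params.PROVIDER = "local-model" (or editing params.py itself) would be the more reliable route.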
 
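Second, the new prompt assembly: for a Hugging Face datasets.Dataset, dataset[0] returns the first example as a plain dict, so dataset[0].items() iterates its column/value pairs. A rough sketch of what generate_data now builds, using a hypothetical two-column dataset:

    from datasets import Dataset

    # Hypothetical stand-in for the dataset passed to generate_data
    dataset = Dataset.from_dict({
        "title": ["Alan Turing"],
        "text": ["English mathematician and computer scientist."],
    })

    sample = dataset[0]  # first example, returned as a dict
    dataset_summary = "\n".join(f"{k}: {v}" for k, v in sample.items())
    print(dataset_summary)
    # title: Alan Turing
    # text: English mathematician and computer scientist.

One caveat: dataset.info.description is an empty string for many Hub datasets, in which case the "Dataset:" line added to the prompt carries no information.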
 
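Third, the new Hub helpers wrap datasets.load_dataset and huggingface_hub.list_datasets. A hedged usage sketch (the query string and printed ids are examples, not output from this repository):

    # Find candidate datasets on the Hub, then load the first match
    candidates = search_huggingface_datasets("squad", limit=5)
    print(candidates)  # e.g. ['rajpurkar/squad', ...]

    dataset = load_huggingface_dataset(candidates[0], split="train")
    print(len(dataset))  # number of examples in the chosen split

One detail worth checking: in current huggingface_hub releases, the filter argument of list_datasets matches repository tags, while free-text matching uses the search argument, so search=query may be closer to the intent here.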
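Fourth, the dispatch path, since the executor.submit call itself sits outside these hunks: generate_data is still declared async def, and calling an async function only creates a coroutine object; nothing executes unless something drives it. If the futures list is built by submitting the coroutine directly, the work would silently never run. A hedged sketch of one pattern that would run it on the thread pool (the asyncio.run wrapper is my assumption; the argument names match the surrounding code):

    import asyncio

    futures.append(
        executor.submit(
            asyncio.run,  # runs the coroutine to completion in the worker thread
            generate_data(
                topic_selected,
                system_message_generation,
                system_message_selected,
                OUTPUT_FILE_PATH,
                PROVIDER,
                dataset,
            ),
        )
    )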
 
 
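Finally, a caveat about the new entry point: for the "wikipedia" dataset on the Hub, "20220301.en" is a configuration name, not a split, so load_dataset("wikipedia", split="20220301.en") will most likely raise an error. Assuming that snapshot config is what is wanted, it should be passed positionally with an explicit split, roughly:

    # Likely intended form: config "20220301.en", split "train"
    default_dataset = load_dataset("wikipedia", "20220301.en", split="train")
    main(default_dataset)

As written, load_huggingface_dataset would need an extra parameter to forward the configuration name, since it currently passes only dataset_name and split through to load_dataset.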