BeTaLabs committed on
Commit
6bdbc55
1 Parent(s): 78d99a8

Update app.py

Files changed (1): app.py (+187, -3)
app.py CHANGED
@@ -14,12 +14,17 @@ import random
 from params import load_params, save_params
 import pandas as pd
 import csv
+from datasets import load_dataset
+from huggingface_hub import list_datasets, HfApi, hf_hub_download
+
 
 
 
 ANNOTATION_CONFIG_FILE = "annotation_config.json"
 OUTPUT_FILE_PATH = "dataset.jsonl"
 
+
+
 def load_llm_config():
     params = load_params()
     return (
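Note: list_datasets is imported here alongside HfApi, but the added code later in this diff only calls it through an HfApi instance, so the direct import appears unused. huggingface_hub also exposes the same search as a module-level function; a minimal sketch (not part of the commit):

    # Module-level equivalent of HfApi().list_datasets
    from huggingface_hub import list_datasets
    dataset_ids = [d.id for d in list_datasets(search="imdb", limit=5)]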
@@ -34,6 +39,8 @@ def load_llm_config():
         params.get('presence_penalty', 0.0)
     )
 
+
+
 def save_llm_config(provider, base_url, workspace, api_key, max_tokens, temperature, top_p, frequency_penalty, presence_penalty):
     save_params({
         'PROVIDER': provider,
@@ -49,6 +56,8 @@ def save_llm_config(provider, base_url, workspace, api_key, max_tokens, temperat
     return "LLM configuration saved successfully"
 
 
+
+
 def load_annotation_config():
     try:
         with open(ANNOTATION_CONFIG_FILE, 'r') as f:
@@ -92,6 +101,8 @@ def load_annotation_config():
     }
 
 
+
+
 def load_csv_dataset(file_path):
     data = []
     with open(file_path, 'r') as f:
@@ -100,20 +111,28 @@ def load_csv_dataset(file_path):
             data.append(row)
     return data
 
+
+
 def load_txt_dataset(file_path):
     with open(file_path, 'r') as f:
         return [{"content": line.strip()} for line in f if line.strip()]
 
+
+
 def save_annotation_config(config):
     with open(ANNOTATION_CONFIG_FILE, 'w') as f:
         json.dump(config, f, indent=2)
 
+
+
 def load_jsonl_dataset(file_path):
     if not os.path.exists(file_path):
         return []
     with open(file_path, 'r') as f:
         return [json.loads(line.strip()) for line in f if line.strip()]
 
+
+
 def load_dataset(file):
     if file is None:
         return "", 0, 0, "No file uploaded", "3", [], [], [], ""
@@ -136,6 +155,8 @@ def load_dataset(file):
         first_row = json.dumps(data[0], indent=2)
     return first_row, 0, len(data), f"Row: 1/{len(data)}", "3", [], [], [], ""
 
+
+
 def save_row(file_path, index, row_data):
     file_extension = file_path.split('.')[-1].lower()
 
@@ -150,6 +171,8 @@ def save_row(file_path, index, row_data):
 
     return f"Row {index} saved successfully"
 
+
+
 def save_jsonl_row(file_path, index, row_data):
     with open(file_path, 'r') as f:
         lines = f.readlines()
@@ -159,6 +182,8 @@ def save_jsonl_row(file_path, index, row_data):
     with open(file_path, 'w') as f:
         f.writelines(lines)
 
+
+
 def save_csv_row(file_path, index, row_data):
     df = pd.read_csv(file_path)
     row_dict = json.loads(row_data)
@@ -166,6 +191,8 @@ def save_csv_row(file_path, index, row_data):
         df.at[index, col] = value
     df.to_csv(file_path, index=False)
 
+
+
 def save_txt_row(file_path, index, row_data):
     with open(file_path, 'r') as f:
         lines = f.readlines()
@@ -176,6 +203,8 @@ def save_txt_row(file_path, index, row_data):
     with open(file_path, 'w') as f:
         f.writelines(lines)
 
+
+
 def get_row(file_path, index):
     data = load_jsonl_dataset(file_path)
     if not data:
@@ -184,6 +213,8 @@ def get_row(file_path, index):
         return json.dumps(data[index], indent=2), len(data)
     return "", len(data)
 
+
+
 def json_to_markdown(json_str):
     try:
         data = json.loads(json_str)
@@ -192,6 +223,8 @@ def json_to_markdown(json_str):
     except json.JSONDecodeError:
         return "Error: Invalid JSON format"
 
+
+
 def markdown_to_json(markdown_str):
     sections = re.split(r'#\s+(System|Instruction|Response)\s*\n', markdown_str)
     if len(sections) != 7:  # Should be: ['', 'System', content, 'Instruction', content, 'Response', content]
@@ -204,10 +237,14 @@ def markdown_to_json(markdown_str):
     }
     return json.dumps(json_data, indent=2)
 
+
+
 def navigate_rows(file_path: str, current_index: int, direction: Literal["prev", "next"], metadata_config):
     new_index = max(0, current_index + (-1 if direction == "prev" else 1))
     return load_and_show_row(file_path, new_index, metadata_config)
 
+
+
 def load_and_show_row(file_path, index, metadata_config):
     row_data, total = get_row(file_path, index)
     if not row_data:
@@ -229,6 +266,8 @@ def load_and_show_row(file_path, index, metadata_config):
     return (row_data, index, total, f"Row: {index + 1}/{total}", quality,
             high_quality_tags, low_quality_tags, toxic_tags, other)
 
+
+
 def save_row_with_metadata(file_path, index, row_data, config, quality, high_quality_tags, low_quality_tags, toxic_tags, other):
     data = json.loads(row_data)
     metadata = {
@@ -248,6 +287,8 @@ def save_row_with_metadata(file_path, index, row_data, config, quality, high_qua
     data["metadata"] = metadata
     return save_row(file_path, index, json.dumps(data))
 
+
+
 def update_annotation_ui(config):
     quality_choices = [(item["value"], item["label"]) for item in config["quality_scale"]["scale"]]
     quality_label = gr.Radio(
@@ -271,6 +312,8 @@ def update_annotation_ui(config):
 
     return quality_label, *tag_components, other_description
 
+
+
 def load_config_to_ui(config):
     return (
         config["quality_scale"]["name"],
@@ -280,6 +323,8 @@ def load_config_to_ui(config):
         [[field["name"], field["description"]] for field in config["free_text_fields"]]
     )
 
+
+
 def save_config_from_ui(name, description, scale, categories, fields, topics, all_topics_text):
     if all_topics_text.visible:
         topics_list = [topic.strip() for topic in all_topics_text.split("\n") if topic.strip()]
@@ -299,6 +344,8 @@ def save_config_from_ui(name, description, scale, categories, fields, topics, al
     save_annotation_config(new_config)
     return "Configuration saved successfully", new_config
 
+
+
 # Add this new function to generate the preview
 def generate_preview(row_data, quality, high_quality_tags, low_quality_tags, toxic_tags, other):
     try:
@@ -321,6 +368,8 @@ def generate_preview(row_data, quality, high_quality_tags, low_quality_tags, tox
     except json.JSONDecodeError:
         return "Error: Invalid JSON in the current row data"
 
+
+
 def load_dataset_config():
     params = load_params()
     with open("system_messages.py", "r") as f:
@@ -347,6 +396,8 @@ def load_dataset_config():
         params.get('presence_penalty', 0.0)
     )
 
+
+
 def edit_all_topics_func(topics):
     topics_list = [topic[0] for topic in topics]
     jsonl_rows = "\n".join([json.dumps({"topic": topic}) for topic in topics_list])
@@ -356,6 +407,8 @@ def edit_all_topics_func(topics):
         gr.update(visible=True)
     )
 
+
+
 def update_topics_from_text(text):
     try:
         # Try parsing as JSONL
@@ -366,6 +419,8 @@ def update_topics_from_text(text):
 
     return gr.Dataframe.update(value=[[topic] for topic in topics_list], visible=True), gr.TextArea.update(visible=False)
 
+
+
 def save_dataset_config(system_messages, prompt_1, topics, max_tokens, temperature, top_p, frequency_penalty, presence_penalty):
     # Save VODALUS_SYSTEM_MESSAGE to system_messages.py
     with open("system_messages.py", "w") as f:
@@ -426,6 +481,7 @@ def chat_with_llm(message, history):
         print(f"Error in chat_with_llm: {str(e)}")
         return history + [[message, f"Error: {str(e)}"]]
 
+
 def update_chat_context(row_data, index, total, quality, high_quality_tags, low_quality_tags, toxic_tags, other):
     context = f"""Current app state:
     Row: {index + 1}/{total}
@@ -440,12 +496,16 @@ def update_chat_context(row_data, index, total, quality, high_quality_tags, low_
     return [[None, context]]
 
 
-async def run_generate_dataset(num_workers, num_generations, output_file_path):
+
+async def run_generate_dataset(num_workers, num_generations, output_file_path, loaded_dataset):
+    if loaded_dataset is None:
+        return "Error: No dataset loaded. Please load a dataset before generating.", ""
+
     generated_data = []
     for _ in range(num_generations):
         topic_selected = random.choice(TOPICS)
         system_message_selected = random.choice(SYSTEM_MESSAGES_VODALUS)
-        data = await generate_data(topic_selected, PROMPT_1, system_message_selected, output_file_path)
+        data = await generate_data(topic_selected, PROMPT_1, system_message_selected, output_file_path, loaded_dataset)
         if data:
             generated_data.append(json.dumps(data))
 
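The reworked coroutine now guards against a missing dataset and threads loaded_dataset through to generate_data (whose signature lives in another module). A minimal sketch of driving it outside Gradio; my_dataset is hypothetical, e.g. the object returned by load_huggingface_dataset below:

    # Sketch: run the coroutine directly with asyncio
    import asyncio

    status, preview = asyncio.run(
        run_generate_dataset(
            num_workers=1,
            num_generations=3,
            output_file_path="dataset.jsonl",
            loaded_dataset=my_dataset,  # hypothetical dataset object
        )
    )
    print(status)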
@@ -456,15 +516,21 @@ async def run_generate_dataset(num_workers, num_generations, output_file_path):
 
     return f"Generated {num_generations} entries and saved to {output_file_path}", "\n".join(generated_data[:5]) + "\n..."
 
+
+
 def add_topic_row(data):
     if isinstance(data, pd.DataFrame):
         return pd.concat([data, pd.DataFrame({"Topic": ["New Topic"]})], ignore_index=True)
     else:
         return data + [["New Topic"]]
 
+
+
 def remove_last_topic_row(data):
     return data[:-1] if len(data) > 1 else data
 
+
+
 def edit_all_topics_func(topics):
     topics_list = [topic[0] for topic in topics]
     jsonl_rows = "\n".join([json.dumps({"topic": topic}) for topic in topics_list])
@@ -474,6 +540,8 @@ def edit_all_topics_func(topics):
         gr.update(visible=True)
     )
 
+
+
 def update_topics_from_text(text):
     try:
         # Try parsing as JSONL
@@ -484,6 +552,8 @@ def update_topics_from_text(text):
 
     return gr.Dataframe.update(value=[[topic] for topic in topics_list], visible=True), gr.TextArea.update(visible=False)
 
+
+
 def update_topics_from_text(text):
     try:
         # Try parsing as JSONL
@@ -494,6 +564,82 @@ def update_topics_from_text(text):
 
     return gr.Dataframe.update(value=[[topic] for topic in topics_list], visible=True), gr.TextArea.update(visible=False)
 
+
+
+def search_huggingface_datasets(query):
+    try:
+        api = HfApi()
+        datasets = api.list_datasets(search=query, limit=20)
+        dataset_ids = [dataset.id for dataset in datasets]
+        return gr.update(choices=dataset_ids, visible=True), ""
+    except Exception as e:
+        print(f"Error searching datasets: {str(e)}")
+        return gr.update(choices=["Error: Could not search datasets"], visible=True), ""
+
+
+
+def load_huggingface_dataset(dataset_name, split="train"):
+    try:
+        print(f"Attempting to load dataset: {dataset_name}")
+
+        # Check if dataset_name is a string
+        if not isinstance(dataset_name, str):
+            raise ValueError(f"Expected dataset_name to be a string, but got {type(dataset_name)}")
+
+        # Try loading the dataset without specifying a config
+        full_dataset = load_dataset(dataset_name)
+
+        print(f"Dataset loaded. Available splits: {list(full_dataset.keys())}")
+
+        # Select the appropriate split
+        if split in full_dataset:
+            dataset = full_dataset[split]
+            print(f"Using specified split: {split}")
+        else:
+            available_splits = list(full_dataset.keys())
+            if available_splits:
+                dataset = full_dataset[available_splits[0]]
+                split = available_splits[0]
+                print(f"Specified split not found. Using first available split: {split}")
+            else:
+                raise ValueError("No valid splits found in the dataset")
+
+        return dataset, f"Dataset '{dataset_name}' (split: {split}) loaded successfully."
+    except Exception as e:
+        error_msg = f"Error loading dataset: {str(e)}"
+        print(f"Error details: {error_msg}")
+
+        # If loading fails, try to get the dataset card
+        try:
+            dataset_card = hf_hub_download(repo_id=dataset_name, filename="README.md")
+            with open(dataset_card, 'r') as f:
+                card_content = f.read()
+            return None, f"Dataset couldn't be loaded, but here's the dataset card:\n\n{card_content[:500]}..."
+        except:
+            return None, error_msg
+
+# Wrapper function to handle the Gradio interface
+def load_dataset_wrapper(dataset_name, split):
+    if not dataset_name:
+        return None, "Please enter a dataset name."
+    dataset, message = load_huggingface_dataset(dataset_name, split)
+    return dataset, message
+
+
+def get_popular_datasets():
+    return [
+        "wikipedia",
+        "squad",
+        "glue",
+        "imdb",
+        "wmt16",
+        "common_voice",
+        "cnn_dailymail",
+        "amazon_reviews_multi",
+        "yelp_review_full",
+        "ag_news"
+    ]
+
 css = """
 body, #root {
     margin: 0;
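The helpers added in this hunk wrap Hub search and dataset loading with Gradio-friendly return values: search_huggingface_datasets feeds a Radio component, and load_huggingface_dataset returns (dataset, status_message), falling back to the dataset card on failure. A minimal usage sketch outside the UI, assuming the datasets and huggingface_hub packages are installed and a public dataset name:

    # Sketch: load a split and peek at the first record
    dataset, status = load_huggingface_dataset("imdb", split="train")
    print(status)
    if dataset is not None:
        print(dataset[0])  # a datasets.Dataset row as a dict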
@@ -740,6 +886,20 @@ with demo:
         with gr.Row():
             save_dataset_config_btn = gr.Button("Save Dataset Configuration", variant="primary")
         dataset_config_status = gr.Textbox(label="Status")
+
+        # gr.Markdown("### Hugging Face Dataset")
+        # with gr.Row():
+        #     dataset_search = gr.Textbox(label="Search Datasets")
+        #     search_button = gr.Button("Search")
+        # dataset_input = gr.Textbox(label="Dataset Name", info="Enter a dataset name or select from search results")
+        # dataset_results = gr.Radio(label="Search Results", choices=[], visible=False)
+        # dataset_split = gr.Textbox(label="Dataset Split (optional)", value="train")
+        # load_dataset_button = gr.Button("Load Selected Dataset")
+        # dataset_status = gr.Textbox(label="Dataset Status")
+
+        # Add a state to store the loaded dataset
+        # loaded_dataset = gr.State(None)
+
 
 
     with gr.Tab("Dataset Generation"):
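Note that the entire Hugging Face Dataset panel in this hunk arrives commented out, including the loaded_dataset = gr.State(None) holder, while later hunks bind event handlers to these components; as committed, those names would be undefined at runtime. A minimal sketch of the definitions the handlers expect (essentially the block above, uncommented):

    # Sketch: components referenced by the .click()/.change() handlers below
    dataset_search = gr.Textbox(label="Search Datasets")
    search_button = gr.Button("Search")
    dataset_input = gr.Textbox(label="Dataset Name")
    dataset_results = gr.Radio(label="Search Results", choices=[], visible=False)
    dataset_split = gr.Textbox(label="Dataset Split (optional)", value="train")
    load_dataset_button = gr.Button("Load Selected Dataset")
    dataset_status = gr.Textbox(label="Dataset Status")
    loaded_dataset = gr.State(None)  # holds the loaded dataset object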
@@ -889,7 +1049,7 @@ with demo:
 
     start_generation_btn.click(
         run_generate_dataset,
-        inputs=[num_workers, num_generations, output_file_path],
+        inputs=[num_workers, num_generations, output_file_path, loaded_dataset],
         outputs=[generation_status, generation_output]
     )
 
@@ -915,6 +1075,30 @@ with demo:
         outputs=[chatbot]
     )
 
+    search_button.click(
+        search_huggingface_datasets,
+        inputs=[dataset_search],
+        outputs=[dataset_results, dataset_input]
+    )
+
+    dataset_results.change(
+        lambda choice: choice,
+        inputs=[dataset_results],
+        outputs=[dataset_input]
+    )
+
+    load_dataset_button.click(
+        load_dataset_wrapper,
+        inputs=[dataset_input, dataset_split],
+        outputs=[loaded_dataset, dataset_status]
+    )
+
+    # Modify the start_generation_btn.click to include the loaded dataset
+    start_generation_btn.click(
+        run_generate_dataset,
+        inputs=[num_workers, num_generations, output_file_path, loaded_dataset],
+        outputs=[generation_status, generation_output]
+    )
 
 demo.load(
     lambda: (
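This hunk leaves run_generate_dataset bound to start_generation_btn twice: once in the earlier modified binding and again here. Gradio fires every listener registered on an event, so a single click would likely start generation twice; one consolidated binding is enough:

    # Sketch: keep one binding instead of two
    start_generation_btn.click(
        run_generate_dataset,
        inputs=[num_workers, num_generations, output_file_path, loaded_dataset],
        outputs=[generation_status, generation_output]
    )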