"""Gradio demo that turns a natural-language question into DuckDB SQL.

The user picks a dataset on the Hugging Face Hub, the app fetches the
dataset's feature schema from the datasets-server API, asks an LLM to write
a SQL query for the user's question, and shows the result in the embedded
dataset viewer.
"""

import html
import json
import os
import urllib.parse

import gradio as gr
import requests
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import InferenceClient

example = HuggingfaceHubSearch().example_value()

# Requires the HF_TOKEN environment variable; a missing token fails fast at
# import time with a KeyError.
client = InferenceClient(
    "meta-llama/Meta-Llama-3.1-70B-Instruct",
    token=os.environ["HF_TOKEN"],
)

DATASETS_SERVER_INFO_URL = "https://datasets-server.huggingface.co/info"
# Upper bound so a stalled datasets-server request cannot hang a UI worker.
REQUEST_TIMEOUT_SECONDS = 30


def get_iframe(hub_repo_id, sql_query=None):
    """Return the HTML for an embedded dataset viewer.

    Args:
        hub_repo_id: Dataset repository id on the Hub (e.g. ``user/name``).
        sql_query: Optional SQL text. When given, the viewer is opened with
            its SQL console enabled and pre-filled with this query.

    Returns:
        An ``<iframe>`` snippet pointing at the dataset's embed viewer.
    """
    if sql_query:
        sql_query = urllib.parse.quote(sql_query)
        url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer?sql_console=true&sql={sql_query}"
    else:
        url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
    # The URL lands inside an HTML attribute: escape quotes and ampersands so
    # neither the repo id nor the query string can break out of ``src``.
    src = html.escape(url, quote=True)
    return f"""
<iframe
  src="{src}"
  frameborder="0"
  width="100%"
  height="800px"
></iframe>
"""


def get_column_info(hub_repo_id):
    """Fetch the feature schema of a dataset's first configuration.

    Args:
        hub_repo_id: Dataset repository id on the Hub.

    Returns:
        The ``features`` entry of the first configuration listed under
        ``dataset_info``, serialized as a JSON string.

    Raises:
        gr.Error: If the request fails or the response does not have the
            expected ``dataset_info -> <config> -> features`` structure.
    """
    try:
        response = requests.get(
            DATASETS_SERVER_INFO_URL,
            params={"dataset": hub_repo_id},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        dataset_info = response.json()["dataset_info"]
        first_config = next(iter(dataset_info))
        return json.dumps(dataset_info[first_config]["features"])
    except (
        requests.RequestException,
        ValueError,
        LookupError,
        TypeError,
        StopIteration,
    ) as e:
        # gr.Error only reaches the user when it is raised, not merely built.
        raise gr.Error(f"Error getting column info: {e}") from e


def query_dataset(hub_repo_id, features, query):
    """Ask the LLM for a DuckDB SQL query and open it in the viewer.

    Args:
        hub_repo_id: Dataset repository id on the Hub.
        features: JSON string describing the dataset features, as produced
            by ``get_column_info``.
        query: The user's natural-language question.

    Returns:
        A ``(sql, iframe_html)`` tuple: the model's reply and the embedded
        viewer HTML pre-filled with that reply.
    """
    user_prompt = (
        "table train\n"
        "# Features\n"
        f"{features}\n"
        "\n"
        "# Query\n"
        f"{query}\n"
    )
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that returns a DuckDB SQL query based on the user's query and dataset features. Only return the SQL query, no other text.",
        },
        {
            "role": "user",
            "content": user_prompt,
        },
    ]
    response = client.chat_completion(
        messages=messages,
        max_tokens=1000,
        stream=False,
    )
    sql = response.choices[0].message.content
    return sql, get_iframe(hub_repo_id, sql)


with gr.Blocks() as demo:
    gr.Markdown("""# 🐥 🦙 🤗 Text To SQL Hub Datasets 🐥 🦙 🤗

    This is a basic text to SQL tool that allows you to query datasets on Huggingface Hub. It is built with [DuckDB](https://duckdb.org/), [Huggingface's Inference API](https://huggingface.co/docs/api-inference/index), and [LLama 3.1 70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct).
    Also, it uses the [dataset-server API](https://redocly.github.io/redoc/?url=https://datasets-server.huggingface.co/openapi.json#operation/isValidDataset).
    """)
    with gr.Row():
        with gr.Column():
            search_in = HuggingfaceHubSearch(
                label="Search Huggingface Hub",
                # The component searches datasets (search_type below), so the
                # hint must not talk about models.
                placeholder="Search for datasets on Huggingface",
                search_type="dataset",
            )
            btn = gr.Button("Show Dataset")
    with gr.Row():
        search_out = gr.HTML(label="Search Results")
    with gr.Row():
        # Hidden holder for the schema JSON passed on to query_dataset.
        features = gr.Code(label="Features", language="json", visible=False)
    with gr.Row():
        query = gr.Textbox(label="Query", placeholder="Enter a query to generate SQL")
    with gr.Row():
        sql_out = gr.Code(label="SQL Query")
    with gr.Row():
        btn2 = gr.Button("Query Dataset")

    # Show the viewer first, then load the schema for the selected dataset.
    gr.on(
        [btn.click, search_in.submit],
        fn=get_iframe,
        inputs=[search_in],
        outputs=[search_out],
    ).then(
        fn=get_column_info,
        inputs=[search_in],
        outputs=[features],
    )

    btn2.click(
        fn=query_dataset,
        inputs=[search_in, features, query],
        outputs=[sql_out, search_out],
    )


if __name__ == "__main__":
    demo.launch()