Spaces:

towardsai-tutors
/

buster

Running

App Files Files Community

jerpint committed on Sep 15, 2023

Commit

69a190d

•

1 Parent(s): 2785052

update chunk extraction to handle links properly (#7)

Browse files

Files changed (3) hide show

gradio_app.py +3 -1
markdown_parser.py +22 -2
title_link_langchaincourse.json +1 -0

gradio_app.py CHANGED Viewed

@@ -36,7 +36,9 @@ def format_sources(matched_documents: pd.DataFrame) -> str:
     )
     # drop duplicates, keep highest ranking ones
-    matched_documents = matched_documents.sort_values("similarity_to_answer", ascending=False).drop_duplicates("title", keep="first")
     documents = "\n".join(
         [

     )
     # drop duplicates, keep highest ranking ones
+    matched_documents = matched_documents.sort_values(
+        "similarity_to_answer", ascending=False
+    ).drop_duplicates("title", keep="first")
     documents = "\n".join(
         [

markdown_parser.py CHANGED Viewed

@@ -29,6 +29,8 @@ def find_md_files(folder_path):
             if file.endswith(".md"):
                 file_path = os.path.join(root, file)
                 title = os.path.splitext(file)[0]
                 with open(file_path, "r", encoding="utf-8") as md_file:
                     content = md_file.read()
                 md_files.append({"title": title, "content": content})
@@ -41,6 +43,16 @@ def split_string_by_max_words(input_string, max_words):
     return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
 if __name__ == "__main__":
     folder_path = "/path/to/folder/with/md_content/"
     folder_path = "/Users/jeremypinto/Downloads/d22d1e98-345f-490d-870e-3b082938741c_Export-0a33c13f-6d42-4a94-8f23-7459e7b2c024"
@@ -59,12 +71,20 @@ if __name__ == "__main__":
     chunks = []
     from tqdm import tqdm
     for md_file in tqdm(md_files):
         md_title = md_file["title"]
         md_raw_content = md_file["content"]
         md_header_splits = markdown_splitter.split_text(md_raw_content)
         for split in md_header_splits:
             # add the headers back to the content
             headers = "\n".join(
@@ -77,10 +97,10 @@ if __name__ == "__main__":
             substrings = split_string_by_max_words(split.page_content, max_words=600)
             for substring in substrings:
                 chunk = {
-                    "title": md_title,
                     "content": headers + "\n" + substring,
                     "source": "TAI Course",
-                    "url": "https://learn.activeloop.ai/courses/langchain/",
                 }
                 chunks.append(chunk)

             if file.endswith(".md"):
                 file_path = os.path.join(root, file)
                 title = os.path.splitext(file)[0]
+                # Remove the trailing junk (the last word is some kind of hash)
+                title = " ".join(title.split()[:-1])
                 with open(file_path, "r", encoding="utf-8") as md_file:
                     content = md_file.read()
                 md_files.append({"title": title, "content": content})
     return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
def get_title_link_from_md_title(md_title: str, title_link_data: list) -> tuple:
    """Resolve a markdown file title to its course title and lesson link.

    Each entry's "title" is normalized by removing ":" and "? " and turning
    "." into a space, then compared against ``md_title``. An exact match on
    the normalized title wins; otherwise the first entry whose normalized
    title contains ``md_title`` is used.

    Args:
        md_title: Title derived from the markdown file name.
        title_link_data: Iterable of dicts, each with "title" and "link" keys
            (the parsed content of the title/link JSON file).

    Returns:
        A ``(title, link)`` tuple. On a match, both values come from the
        matching entry. Otherwise ``md_title`` is returned together with the
        generic course URL, and a "Not found" message is printed.
    """
    fallback = (md_title, "https://learn.activeloop.ai/courses/langchain/")

    # "" is a substring of every string, so without this guard a blank title
    # would silently be attributed to the first entry in the data.
    if not md_title.strip():
        print("\nNot found: ", md_title)
        return fallback

    def _normalize(title):
        # NOTE(review): presumably mirrors the characters missing from the
        # exported file names -- confirm against the export.
        return title.replace(":", "").replace(".", " ").replace("? ", "")

    candidates = [(data, _normalize(data["title"])) for data in title_link_data]

    # Prefer an exact match so a short title is not captured by an earlier,
    # longer title that merely contains it.
    for data, normalized in candidates:
        if md_title == normalized:
            return data["title"], data["link"]

    # Fall back to the original substring behavior, in data order.
    for data, normalized in candidates:
        if md_title in normalized:
            return data["title"], data["link"]

    # default back to course link if not found...
    print("\nNot found: ", md_title)
    return fallback
 if __name__ == "__main__":
     folder_path = "/path/to/folder/with/md_content/"
     folder_path = "/Users/jeremypinto/Downloads/d22d1e98-345f-490d-870e-3b082938741c_Export-0a33c13f-6d42-4a94-8f23-7459e7b2c024"
     chunks = []
     from tqdm import tqdm
+    import json
+    with open("title_link_langchaincourse.json", "r") as f:
+        title_link_data = json.load(f)
     for md_file in tqdm(md_files):
         md_title = md_file["title"]
         md_raw_content = md_file["content"]
         md_header_splits = markdown_splitter.split_text(md_raw_content)
+        title, link = get_title_link_from_md_title(
+            md_title, title_link_data=title_link_data
+        )
         for split in md_header_splits:
             # add the headers back to the content
             headers = "\n".join(
             substrings = split_string_by_max_words(split.page_content, max_words=600)
             for substring in substrings:
                 chunk = {
+                    "title": title,
                     "content": headers + "\n" + substring,
                     "source": "TAI Course",
+                    "url": link,
                 }
                 chunks.append(chunk)

title_link_langchaincourse.json ADDED Viewed

	@@ -0,0 +1 @@

+ [{"title": "Course Intro", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317639-course-introduction-things-you-should-know-before-you-start"}, {"title": "LangChain 101: from Zero to Hero", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317643-langchain-101-from-zero-to-hero"}, {"title": "Intro to LLMs and LangChain module", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317652-introduction-to-llms-and-langchain"}, {"title": "Quick Intro to Large Language Models", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317654-quick-intro-to-large-language-models"}, {"title": "Understanding Tokens", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46866912-understanding-tokens"}, {"title": "Building Applications Powered by LLMs with LangChain", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317658-building-applications-powered-by-llms-with-langchain"}, {"title": "Exploring the World of Language Models: LLMs vs Chat Models", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317661-exploring-the-world-of-language-models-llms-vs-chat-models"}, {"title": "Exploring Conversational Capabilities with GPT-4 and ChatGPT", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317663-exploring-conversational-capabilities-with-gpt-4-and-chatgpt"}, {"title": "Build a News Articles Summarizer", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317667-build-a-news-articles-summarizer"}, {"title": "Using the Open-Source GPT4All Model Locally", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317672-using-the-open-source-gpt4all-model-locally"}, {"title": "What other models can we use? 
Popular LLM models compared", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317676-what-other-models-can-we-use-popular-llm-models-compared"}, {"title": "Intro to Prompting module", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317682-intro-to-prompting-module"}, {"title": "Intro to Prompt Engineering: Tips and Tricks", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317727-intro-to-prompt-engineering-tips-and-tricks"}, {"title": "Using Prompt Templates", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317867-using-prompt-templates"}, {"title": "Getting the Best of Few Shot Prompts and Example Selectors", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317950-getting-the-best-of-few-shot-prompts-and-example-selectors"}, {"title": "Managing Outputs with Output Parsers", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317956-managing-outputs-with-output-parses"}, {"title": "Improving Our News Articles Summarizer", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317962-improving-our-news-articles-summarizer"}, {"title": "Creating Knowledge Graphs from Textual Data: Unveiling Hidden Connections", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317967-creating-knowledge-graphs-from-textual-data-unveiling-hidden-connections"}, {"title": "Intro to Keeping Knowledge Organized with Indexes", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317975-intro-to-indexes-and-retrievers"}, {"title": "Exploring The Role of LangChain's Indexes and Retrievers", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317979-exploring-the-role-of-langchain-s-indexes-and-retrievers"}, {"title": "Streamlined Data Ingestion: Text, PyPDF, Selenium URL Loaders and Google Drive Sync", "link": 
"https://learn.activeloop.ai/courses/take/langchain/multimedia/46317984-streamlined-data-ingestion-text-pypdf-selenium-url-loaders-and-google-drive-sync"}, {"title": "What are Text Splitters and Why They are Useful", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317993-what-are-text-splitters-and-why-they-are-useful"}, {"title": "Exploring the World of Embeddings", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318003-exploring-the-world-of-embeddings"}, {"title": "Build a Customer Support Question Answering Chatbot", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318012-build-a-customer-support-question-answering-chatbot"}, {"title": "Conversation Intelligence: Gong.io Open-Source Alternative AI Sales Assistant", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318017-conversation-intelligence-gong-io-open-source-alternative-ai-sales-assistant"}, {"title": "FableForge: Creating Picture Books with OpenAI, Replicate and Deep Lake", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318021-fableforge-creating-picture-books-with-openai-and-deep-lake"}, {"title": "Introduction to Chains", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318028-introduction-to-chains"}, {"title": "Chains and Why They Are Used", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318038-chains-and-why-they-are-used"}, {"title": "Create a YouTube Video Summarizer Using Whisper and LangChain", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318091-create-a-youtube-video-summarizer-using-whisper-and-langchain"}, {"title": "Creating a Voice Assistant for your Knowledge Base", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318140-creating-a-voice-assistant-for-your-knowledge-base"}, {"title": "LangChain & GPT-4 for Code Understanding: Twitter Algorithm", "link": 
"https://learn.activeloop.ai/courses/take/langchain/multimedia/46318143-langchain-gpt-4-for-code-understanding-twitter-algorithm"}, {"title": "3 ways to build a recommendation engine for songs with LangChain", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318173-3-ways-to-build-a-recommendation-engine-for-songs-with-langchain"}, {"title": "Guarding Against Undesirable Outputs with the Self-Critique Chain", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318181-guarding-against-undesirable-outputs-with-the-self-critique-chain"}, {"title": "Introduction to LLM Memory", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318189-introduction-to-llm-memory"}, {"title": "Optimizing Your Communication: The Importance of Monitoring Message History", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318198-optimizing-your-communication-the-importance-of-monitoring-message-history"}, {"title": "Mastering Memory Types in LangChain: A Comprehensive Guide with Practical Examples", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318209-mastering-memory-types-in-langchain-a-comprehensive-guide-with-practical-examples"}, {"title": "Chat with a GitHub Repository", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318213-chat-with-a-github-repository"}, {"title": "Build a Question Answering Chatbot over Documents with Sources", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318222-build-a-question-answering-chatbot-over-documents-with-sources"}, {"title": "Build ChatGPT to Answer Questions on Your Financial Data", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318274-build-chatgpt-to-answer-questions-on-your-financial-data"}, {"title": "DataChad: an AI App with LangChain & Deep Lake to Chat with Any Data", "link": 
"https://learn.activeloop.ai/courses/take/langchain/multimedia/46318278-datachad-an-ai-app-with-langchain-deep-lake-to-chat-with-any-data"}, {"title": "Introduction to Tools", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318283-introduction-to-tools"}, {"title": "LangChain's Tool Utilization: Examining Diverse Applications through Illustrative Scenarios", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318291-langchain-s-tool-utilization-examing-diverse-applications-through-illustrative-scenarios"}, {"title": "Supercharge Your Blog Posts Automatically with LangChain and Google Search", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318297-supercharge-your-blog-posts-automatically-with-langchain-and-google-search"}, {"title": "Recreating the Bing Chatbot", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318300-recreating-the-bing-chatbot"}, {"title": "Integrating Multiple Tools for Web-Based Question-Answering", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318347-integrating-multiple-tools-for-web-based-question-answering"}, {"title": "Building a Custom Document Retrieval Tool with Deep Lake and LangChain: A Step-by-Step Workflow", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318355-building-a-custom-document-retrieval-tool-with-deep-lake-and-langchain-a-step-by-step-workflow"}, {"title": "Introduction to Agents", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318358-introduction-to-agents"}, {"title": "What are Agents: Agents as Content Generators and Reasoning Engines", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318365-what-are-agents-agents-as-content-generators-and-reasoning-engines"}, {"title": "Exploring the Fascinating World of Autonomous Agents: A Closer Look at AutoGPT and BabyAGI", "link": 
"https://learn.activeloop.ai/courses/take/langchain/multimedia/46318374-exploring-the-fascinating-world-of-autonomous-agents-a-closer-look-at-autogpt-and-babyagi"}, {"title": "Using AutoGPT with LangChain", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318383-using-autogpt-with-langchain"}, {"title": "Agent Simulation Projects: CAMEL and Generative Agents", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318387-agent-simulation-projects-camel-and-generative-agents"}, {"title": "Building Autonomous Agents to Create Analysis Reports", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318392-building-autonomous-agents-to-create-analysis-reports"}, {"title": "Current Insights and Trends of Agents", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318395-current-insights-and-trends-of-agents"}]