jerpint committed on
Commit
69a190d
1 Parent(s): 2785052

update chunk extraction to handle links properly (#7)

Browse files
gradio_app.py CHANGED
@@ -36,7 +36,9 @@ def format_sources(matched_documents: pd.DataFrame) -> str:
36
  )
37
 
38
  # drop duplicates, keep highest ranking ones
39
- matched_documents = matched_documents.sort_values("similarity_to_answer", ascending=False).drop_duplicates("title", keep="first")
 
 
40
 
41
  documents = "\n".join(
42
  [
 
36
  )
37
 
38
  # drop duplicates, keep highest ranking ones
39
+ matched_documents = matched_documents.sort_values(
40
+ "similarity_to_answer", ascending=False
41
+ ).drop_duplicates("title", keep="first")
42
 
43
  documents = "\n".join(
44
  [
markdown_parser.py CHANGED
@@ -29,6 +29,8 @@ def find_md_files(folder_path):
29
  if file.endswith(".md"):
30
  file_path = os.path.join(root, file)
31
  title = os.path.splitext(file)[0]
 
 
32
  with open(file_path, "r", encoding="utf-8") as md_file:
33
  content = md_file.read()
34
  md_files.append({"title": title, "content": content})
@@ -41,6 +43,16 @@ def split_string_by_max_words(input_string, max_words):
41
  return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
42
 
43
 
 
 
 
 
 
 
 
 
 
 
44
  if __name__ == "__main__":
45
  folder_path = "/path/to/folder/with/md_content/"
46
  folder_path = "/Users/jeremypinto/Downloads/d22d1e98-345f-490d-870e-3b082938741c_Export-0a33c13f-6d42-4a94-8f23-7459e7b2c024"
@@ -59,12 +71,20 @@ if __name__ == "__main__":
59
  chunks = []
60
 
61
  from tqdm import tqdm
 
 
 
 
62
 
63
  for md_file in tqdm(md_files):
64
  md_title = md_file["title"]
65
  md_raw_content = md_file["content"]
66
  md_header_splits = markdown_splitter.split_text(md_raw_content)
67
 
 
 
 
 
68
  for split in md_header_splits:
69
  # add the headers back to the content
70
  headers = "\n".join(
@@ -77,10 +97,10 @@ if __name__ == "__main__":
77
  substrings = split_string_by_max_words(split.page_content, max_words=600)
78
  for substring in substrings:
79
  chunk = {
80
- "title": md_title,
81
  "content": headers + "\n" + substring,
82
  "source": "TAI Course",
83
- "url": "https://learn.activeloop.ai/courses/langchain/",
84
  }
85
  chunks.append(chunk)
86
 
 
29
  if file.endswith(".md"):
30
  file_path = os.path.join(root, file)
31
  title = os.path.splitext(file)[0]
32
+ # Remove the trailing junk (the last word is some kind of hash)
33
+ title = " ".join(title.split()[:-1])
34
  with open(file_path, "r", encoding="utf-8") as md_file:
35
  content = md_file.read()
36
  md_files.append({"title": title, "content": content})
 
43
  return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
44
 
45
 
46
+ def get_title_link_from_md_title(md_title: str, title_link_data: dict):
47
+ for data in title_link_data:
48
+ title = data["title"]
49
+ if md_title in title.replace(":", "").replace(".", " ").replace("? ", ""):
50
+ return data["title"], data["link"]
51
+ # default back to course link if not found...
52
+ print("\nNot found: ", md_title)
53
+ return md_title, "https://learn.activeloop.ai/courses/langchain/"
54
+
55
+
56
  if __name__ == "__main__":
57
  folder_path = "/path/to/folder/with/md_content/"
58
  folder_path = "/Users/jeremypinto/Downloads/d22d1e98-345f-490d-870e-3b082938741c_Export-0a33c13f-6d42-4a94-8f23-7459e7b2c024"
 
71
  chunks = []
72
 
73
  from tqdm import tqdm
74
+ import json
75
+
76
+ with open("title_link_langchaincourse.json", "r") as f:
77
+ title_link_data = json.load(f)
78
 
79
  for md_file in tqdm(md_files):
80
  md_title = md_file["title"]
81
  md_raw_content = md_file["content"]
82
  md_header_splits = markdown_splitter.split_text(md_raw_content)
83
 
84
+ title, link = get_title_link_from_md_title(
85
+ md_title, title_link_data=title_link_data
86
+ )
87
+
88
  for split in md_header_splits:
89
  # add the headers back to the content
90
  headers = "\n".join(
 
97
  substrings = split_string_by_max_words(split.page_content, max_words=600)
98
  for substring in substrings:
99
  chunk = {
100
+ "title": title,
101
  "content": headers + "\n" + substring,
102
  "source": "TAI Course",
103
+ "url": link,
104
  }
105
  chunks.append(chunk)
106
 
title_link_langchaincourse.json ADDED
@@ -0,0 +1 @@
 
 
1
+ [{"title": "Course Intro", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317639-course-introduction-things-you-should-know-before-you-start"}, {"title": "LangChain 101: from Zero to Hero", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317643-langchain-101-from-zero-to-hero"}, {"title": "Intro to LLMs and LangChain module", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317652-introduction-to-llms-and-langchain"}, {"title": "Quick Intro to Large Language Models", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317654-quick-intro-to-large-language-models"}, {"title": "Understanding Tokens", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46866912-understanding-tokens"}, {"title": "Building Applications Powered by LLMs with LangChain", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317658-building-applications-powered-by-llms-with-langchain"}, {"title": "Exploring the World of Language Models: LLMs vs Chat Models", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317661-exploring-the-world-of-language-models-llms-vs-chat-models"}, {"title": "Exploring Conversational Capabilities with GPT-4 and ChatGPT", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317663-exploring-conversational-capabilities-with-gpt-4-and-chatgpt"}, {"title": "Build a News Articles Summarizer", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317667-build-a-news-articles-summarizer"}, {"title": "Using the Open-Source GPT4All Model Locally", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317672-using-the-open-source-gpt4all-model-locally"}, {"title": "What other models can we use? 
Popular LLM models compared", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317676-what-other-models-can-we-use-popular-llm-models-compared"}, {"title": "Intro to Prompting module", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317682-intro-to-prompting-module"}, {"title": "Intro to Prompt Engineering: Tips and Tricks", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317727-intro-to-prompt-engineering-tips-and-tricks"}, {"title": "Using Prompt Templates", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317867-using-prompt-templates"}, {"title": "Getting the Best of Few Shot Prompts and Example Selectors", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317950-getting-the-best-of-few-shot-prompts-and-example-selectors"}, {"title": "Managing Outputs with Output Parsers", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317956-managing-outputs-with-output-parses"}, {"title": "Improving Our News Articles Summarizer", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317962-improving-our-news-articles-summarizer"}, {"title": "Creating Knowledge Graphs from Textual Data: Unveiling Hidden Connections", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317967-creating-knowledge-graphs-from-textual-data-unveiling-hidden-connections"}, {"title": "Intro to Keeping Knowledge Organized with Indexes", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317975-intro-to-indexes-and-retrievers"}, {"title": "Exploring The Role of LangChain's Indexes and Retrievers", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317979-exploring-the-role-of-langchain-s-indexes-and-retrievers"}, {"title": "Streamlined Data Ingestion: Text, PyPDF, Selenium URL Loaders and Google Drive Sync", "link": 
"https://learn.activeloop.ai/courses/take/langchain/multimedia/46317984-streamlined-data-ingestion-text-pypdf-selenium-url-loaders-and-google-drive-sync"}, {"title": "What are Text Splitters and Why They are Useful", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46317993-what-are-text-splitters-and-why-they-are-useful"}, {"title": "Exploring the World of Embeddings", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318003-exploring-the-world-of-embeddings"}, {"title": "Build a Customer Support Question Answering Chatbot", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318012-build-a-customer-support-question-answering-chatbot"}, {"title": "Conversation Intelligence: Gong.io Open-Source Alternative AI Sales Assistant", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318017-conversation-intelligence-gong-io-open-source-alternative-ai-sales-assistant"}, {"title": "FableForge: Creating Picture Books with OpenAI, Replicate and Deep Lake", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318021-fableforge-creating-picture-books-with-openai-and-deep-lake"}, {"title": "Introduction to Chains", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318028-introduction-to-chains"}, {"title": "Chains and Why They Are Used", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318038-chains-and-why-they-are-used"}, {"title": "Create a YouTube Video Summarizer Using Whisper and LangChain", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318091-create-a-youtube-video-summarizer-using-whisper-and-langchain"}, {"title": "Creating a Voice Assistant for your Knowledge Base", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318140-creating-a-voice-assistant-for-your-knowledge-base"}, {"title": "LangChain & GPT-4 for Code Understanding: Twitter Algorithm", "link": 
"https://learn.activeloop.ai/courses/take/langchain/multimedia/46318143-langchain-gpt-4-for-code-understanding-twitter-algorithm"}, {"title": "3 ways to build a recommendation engine for songs with LangChain", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318173-3-ways-to-build-a-recommendation-engine-for-songs-with-langchain"}, {"title": "Guarding Against Undesirable Outputs with the Self-Critique Chain", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318181-guarding-against-undesirable-outputs-with-the-self-critique-chain"}, {"title": "Introduction to LLM Memory", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318189-introduction-to-llm-memory"}, {"title": "Optimizing Your Communication: The Importance of Monitoring Message History", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318198-optimizing-your-communication-the-importance-of-monitoring-message-history"}, {"title": "Mastering Memory Types in LangChain: A Comprehensive Guide with Practical Examples", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318209-mastering-memory-types-in-langchain-a-comprehensive-guide-with-practical-examples"}, {"title": "Chat with a GitHub Repository", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318213-chat-with-a-github-repository"}, {"title": "Build a Question Answering Chatbot over Documents with Sources", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318222-build-a-question-answering-chatbot-over-documents-with-sources"}, {"title": "Build ChatGPT to Answer Questions on Your Financial Data", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318274-build-chatgpt-to-answer-questions-on-your-financial-data"}, {"title": "DataChad: an AI App with LangChain & Deep Lake to Chat with Any Data", "link": 
"https://learn.activeloop.ai/courses/take/langchain/multimedia/46318278-datachad-an-ai-app-with-langchain-deep-lake-to-chat-with-any-data"}, {"title": "Introduction to Tools", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318283-introduction-to-tools"}, {"title": "LangChain's Tool Utilization: Examining Diverse Applications through Illustrative Scenarios", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318291-langchain-s-tool-utilization-examing-diverse-applications-through-illustrative-scenarios"}, {"title": "Supercharge Your Blog Posts Automatically with LangChain and Google Search", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318297-supercharge-your-blog-posts-automatically-with-langchain-and-google-search"}, {"title": "Recreating the Bing Chatbot", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318300-recreating-the-bing-chatbot"}, {"title": "Integrating Multiple Tools for Web-Based Question-Answering", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318347-integrating-multiple-tools-for-web-based-question-answering"}, {"title": "Building a Custom Document Retrieval Tool with Deep Lake and LangChain: A Step-by-Step Workflow", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318355-building-a-custom-document-retrieval-tool-with-deep-lake-and-langchain-a-step-by-step-workflow"}, {"title": "Introduction to Agents", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318358-introduction-to-agents"}, {"title": "What are Agents: Agents as Content Generators and Reasoning Engines", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318365-what-are-agents-agents-as-content-generators-and-reasoning-engines"}, {"title": "Exploring the Fascinating World of Autonomous Agents: A Closer Look at AutoGPT and BabyAGI", "link": 
"https://learn.activeloop.ai/courses/take/langchain/multimedia/46318374-exploring-the-fascinating-world-of-autonomous-agents-a-closer-look-at-autogpt-and-babyagi"}, {"title": "Using AutoGPT with LangChain", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318383-using-autogpt-with-langchain"}, {"title": "Agent Simulation Projects: CAMEL and Generative Agents", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318387-agent-simulation-projects-camel-and-generative-agents"}, {"title": "Building Autonomous Agents to Create Analysis Reports", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318392-building-autonomous-agents-to-create-analysis-reports"}, {"title": "Current Insights and Trends of Agents", "link": "https://learn.activeloop.ai/courses/take/langchain/multimedia/46318395-current-insights-and-trends-of-agents"}]