Spaces:

towardsai-tutors
/

buster

Running

App Files Community

jerpint committed on Oct 3, 2023

Commit

7710388

•

1 Parent(s): 69a190d

update buster to pypi version (#8)

Browse files

Files changed (4) hide show

cfg.py +3 -5
embed_documents.py +1 -1
gradio_app.py +2 -2
requirements.txt +1 -1

cfg.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 from buster.busterbot import Buster, BusterConfig
 from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
-from buster.formatters.documents import DocumentsFormatter
 from buster.formatters.prompts import PromptFormatter
 from buster.retriever import DeepLakeRetriever, Retriever
 from buster.tokenizers import GPTTokenizer
@@ -92,7 +92,7 @@ A user will now submit a question. Respond 'true' if it is valid, respond 'false
     },
     documents_formatter_cfg={
         "max_tokens": 3500,
-        "formatter": "{content}",
     },
     prompt_formatter_cfg={
         "max_tokens": 3500,
@@ -103,10 +103,8 @@ A user will now submit a question. Respond 'true' if it is valid, respond 'false
             "If the answer is in the documentation, summarize it in a helpful way to the user. "
             "If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
             "Here is the information you can use: "
-            "<DOCUMENTS> "
         ),
         "text_after_docs": (
-            "<\DOCUMENTS>\n"
             "REMEMBER:\n"
             "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI)."
             "You are provided information found in the <DOCUMENTS> tag. "
@@ -134,7 +132,7 @@ def setup_buster(buster_cfg):
     tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
     document_answerer: DocumentAnswerer = DocumentAnswerer(
         completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
-        documents_formatter=DocumentsFormatter(
             tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
         ),
         prompt_formatter=PromptFormatter(

 from buster.busterbot import Buster, BusterConfig
 from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
+from buster.formatters.documents import DocumentsFormatterJSON
 from buster.formatters.prompts import PromptFormatter
 from buster.retriever import DeepLakeRetriever, Retriever
 from buster.tokenizers import GPTTokenizer
     },
     documents_formatter_cfg={
         "max_tokens": 3500,
+        "columns": ["content", "source", "title"],
     },
     prompt_formatter_cfg={
         "max_tokens": 3500,
             "If the answer is in the documentation, summarize it in a helpful way to the user. "
             "If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
             "Here is the information you can use: "
         ),
         "text_after_docs": (
             "REMEMBER:\n"
             "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI)."
             "You are provided information found in the <DOCUMENTS> tag. "
     tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
     document_answerer: DocumentAnswerer = DocumentAnswerer(
         completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
+        documents_formatter=DocumentsFormatterJSON(
             tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
         ),
         prompt_formatter=PromptFormatter(

embed_documents.py CHANGED Viewed

@@ -3,7 +3,7 @@ from buster.documents_manager import DeepLakeDocumentsManager
 if __name__ == "__main__":
     vector_store_path = "deeplake_store"
-    chunk_file = "langchain_course.csv"
     overwrite = True
     df = pd.read_csv(chunk_file)

 if __name__ == "__main__":
     vector_store_path = "deeplake_store"
+    chunk_file = "data/wiki_and_tai.csv"
     overwrite = True
     df = pd.read_csv(chunk_file)

gradio_app.py CHANGED Viewed

@@ -40,7 +40,7 @@ def format_sources(matched_documents: pd.DataFrame) -> str:
         "similarity_to_answer", ascending=False
     ).drop_duplicates("title", keep="first")
-    documents = "\n".join(
         [
             document_template.format(document=document)
             for _, document in matched_documents.iterrows()
@@ -115,4 +115,4 @@ with block:
 block.queue(concurrency_count=16)
-block.launch(debug=True, share=False, auth=check_auth)

         "similarity_to_answer", ascending=False
     ).drop_duplicates("title", keep="first")
+    documents = "\n\n".join(
         [
             document_template.format(document=document)
             for _, document in matched_documents.iterrows()
 block.queue(concurrency_count=16)
+block.launch(debug=True, share=False)

requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
-git+https://github.com/jerpint/buster@main
 gradio
 deeplake

+buster-doctalk==1.0.19
 gradio
 deeplake