jonas committed on
Commit
3da458d
1 Parent(s): 325624a

Upload app.py


Tried to add live streaming of answers
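In short: the HuggingFaceEndpoint is now created with streaming=True and a StreamingStdOutCallbackHandler, and the blocking chat_model.invoke() loop is replaced by an async generator over chat_model.astream(messages) that yields each partial answer back to the Gradio chatbot as tokens arrive. A distilled sketch of this pattern follows the diff below.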

Files changed (1)
  1. app.py +58 -23
app.py CHANGED
@@ -1,6 +1,7 @@
 import gradio as gr
 import pandas as pd
 import logging
+import asyncio
 import os
 import re
 import json
@@ -14,6 +15,7 @@ from langchain.schema import (
     HumanMessage,
     SystemMessage,
 )
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain_community.llms import HuggingFaceEndpoint
 from auditqa.process_chunks import load_chunks, getconfig
 from langchain_community.chat_models.huggingface import ChatHuggingFace
@@ -215,36 +217,69 @@ async def chat(query,history,sources,reports,subtype,year):
 
     ##-----------------------getting inference endpoints------------------------------
 
-    #callbacks = [StreamingStdOutCallbackHandler()]
+    callback = StreamingStdOutCallbackHandler()
+
     llm_qa = HuggingFaceEndpoint(
-        endpoint_url= model_config.get('reader','ENDPOINT'),
+        endpoint_url=model_config.get('reader', 'ENDPOINT'),
         max_new_tokens=512,
         repetition_penalty=1.03,
         timeout=70,
-        huggingfacehub_api_token=HF_token,)
+        huggingfacehub_api_token=HF_token,
+        streaming=True,
+        callbacks=[callback]
+    )
 
-    # create RAG
     chat_model = ChatHuggingFace(llm=llm_qa)
-
-    ##-------------------------- get answers ---------------------------------------
-    answer_lst = []
-    for question, context in zip(question_lst , context_retrieved_lst):
-        answer = chat_model.invoke(messages)
-        answer_lst.append(answer.content)
+
     docs_html = []
     for i, d in enumerate(context_retrieved, 1):
         docs_html.append(make_html_source(d, i))
     docs_html = "".join(docs_html)
 
-    previous_answer = history[-1][1]
-    previous_answer = previous_answer if previous_answer is not None else ""
-    answer_yet = previous_answer + answer_lst[0]
-    answer_yet = parse_output_llm_with_sources(answer_yet)
-    history[-1] = (query,answer_yet)
-
-    history = [tuple(x) for x in history]
-
-    yield history,docs_html
+    answer_yet = ""
+
+    async def process_stream():
+        nonlocal answer_yet
+        async for chunk in chat_model.astream(messages):
+            token = chunk.content
+            answer_yet += token
+            parsed_answer = parse_output_llm_with_sources(answer_yet)
+            history[-1] = (query, parsed_answer)
+            yield [tuple(x) for x in history], docs_html
+
+    async for update in process_stream():
+        yield update
+
+    # #callbacks = [StreamingStdOutCallbackHandler()]
+    # llm_qa = HuggingFaceEndpoint(
+    #     endpoint_url= model_config.get('reader','ENDPOINT'),
+    #     max_new_tokens=512,
+    #     repetition_penalty=1.03,
+    #     timeout=70,
+    #     huggingfacehub_api_token=HF_token,)
+
+    # # create RAG
+    # chat_model = ChatHuggingFace(llm=llm_qa)
+
+    # ##-------------------------- get answers ---------------------------------------
+    # answer_lst = []
+    # for question, context in zip(question_lst , context_retrieved_lst):
+    #     answer = chat_model.invoke(messages)
+    #     answer_lst.append(answer.content)
+    # docs_html = []
+    # for i, d in enumerate(context_retrieved, 1):
+    #     docs_html.append(make_html_source(d, i))
+    # docs_html = "".join(docs_html)
+
+    # previous_answer = history[-1][1]
+    # previous_answer = previous_answer if previous_answer is not None else ""
+    # answer_yet = previous_answer + answer_lst[0]
+    # answer_yet = parse_output_llm_with_sources(answer_yet)
+    # history[-1] = (query,answer_yet)
+
+    # history = [tuple(x) for x in history]
+
+    # yield history,docs_html
 
     # logging the event
     try:
@@ -472,14 +507,14 @@ with gr.Blocks(title="Audit Q&A", css= "style.css", theme=theme,elem_id = "main-
     # using event listeners for 1. query box 2. click on example question
     # https://www.gradio.app/docs/gradio/textbox#event-listeners-arguments
     (textbox
-        .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
-        .then(chat, [textbox,chatbot, dropdown_sources,dropdown_reports,dropdown_category,dropdown_year], [chatbot,sources_textbox],concurrency_limit = 8,api_name = "chat_textbox")
-        .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox"))
+        .submit(start_chat, [textbox, chatbot], [textbox, tabs, chatbot], queue=False, api_name="start_chat_textbox")
+        .then(chat, [textbox, chatbot, dropdown_sources, dropdown_reports, dropdown_category, dropdown_year], [chatbot, sources_textbox], queue=True, concurrency_limit=8, api_name="chat_textbox")
+        .then(finish_chat, None, [textbox], api_name="finish_chat_textbox"))
 
     (examples_hidden
-        .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
-        .then(chat, [examples_hidden,chatbot, dropdown_sources,dropdown_reports,dropdown_category,dropdown_year], [chatbot,sources_textbox],concurrency_limit = 8,api_name = "chat_examples")
-        .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
+        .change(start_chat, [examples_hidden, chatbot], [textbox, tabs, chatbot], queue=False, api_name="start_chat_examples")
+        .then(chat, [examples_hidden, chatbot, dropdown_sources, dropdown_reports, dropdown_category, dropdown_year], [chatbot, sources_textbox], concurrency_limit=8, api_name="chat_examples")
+        .then(finish_chat, None, [textbox], api_name="finish_chat_examples")
     )
 
     demo.queue()
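
For context, the streaming pattern this commit introduces can be reduced to a minimal, self-contained sketch, assuming a recent Gradio release that accepts async-generator event handlers. fake_llm_stream below is a hypothetical stand-in for chat_model.astream(messages); the surrounding names are illustrative, not taken from app.py:

import asyncio
import gradio as gr

async def fake_llm_stream(prompt):
    # Hypothetical stand-in for an LLM token stream such as
    # chat_model.astream(messages) in app.py.
    for token in ("Echoing: " + prompt).split():
        await asyncio.sleep(0.05)  # simulate per-token network latency
        yield token + " "

async def chat(query, history):
    history = (history or []) + [(query, "")]
    answer = ""
    # Every yield re-renders the Chatbot with the partial answer,
    # which is what makes the reply appear to stream in the UI.
    async for token in fake_llm_stream(query):
        answer += token
        history[-1] = (query, answer)
        yield history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    textbox = gr.Textbox()
    textbox.submit(chat, [textbox, chatbot], [chatbot])

demo.queue().launch()

The key difference from the pre-commit code is that nothing waits for the full completion: instead of one invoke() call followed by a single yield, the handler yields a growing history list, and the Gradio queue pushes each partial state to the client as it is produced.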