import os
import streamlit as st
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import Chroma
# from langchain.llms.huggingface_pipeline import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
import google.generativeai as genai
import git  # pip install gitpython

# Requires the GOOGLE_API_KEY environment variable (e.g. a Space secret)
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

model_kwargs = {"device": "cpu"}
embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-xl", model_kwargs=model_kwargs)
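# Note (assumption, not from the original code): LangChain usually pairs
# instructor checkpoints with HuggingFaceInstructEmbeddings, which prepends an
# embedding instruction to each text; plain HuggingFaceEmbeddings loads
# hkunlp/instructor-xl as an ordinary sentence-transformers model. The
# commented MiniLM line below is a much lighter alternative for CPU-only runs.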
# embeddings = SentenceTransformer(model_name_or_path="All-MiniLM-L6-v2")
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", device_map="auto", quantization_config=quantization_config)
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1000)
# llm = HuggingFacePipeline(pipeline=pipe)
# def clone_repo(repo):
#     if os.path.exists("githubCode") and os.path.isdir("githubCode"):
#         print("Repo already cloned!!")
#     else:
#         print("Cloning repo!!")
#         git.Repo.clone_from(repo, "githubCode")
# git.Repo.clone_from("https://github.com/Divyansh3021/Github_code_assistant.git", "githubCode")
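
# Hedged sketch (not active in the original): clone the target repo on first
# run so "githubCode" exists before indexing; mirrors the commented
# clone_repo logic above. Swap in another URL to index a different project.
if not os.path.isdir("githubCode"):
    print("Cloning repo!!")
    git.Repo.clone_from("https://github.com/Divyansh3021/Github_code_assistant.git", "githubCode")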
llm = genai.GenerativeModel("gemini-pro")
def get_folder_paths(directory="githubCode"):
    # Include the repo root itself so top-level files get indexed too
    folder_paths = [directory]
    for root, dirs, files in os.walk(directory):
        if ".git" in dirs:
            # Skip the .git directory
            dirs.remove(".git")
        for dir_name in dirs:
            folder_paths.append(os.path.join(root, dir_name))
    return folder_paths

directory_paths = get_folder_paths()
directory_paths.append("Code")
print("directory_paths: ", directory_paths)
with open("Code.txt", "w", encoding='utf-8') as output: | |
for directory_path in directory_paths: | |
for filename in os.listdir(directory_path): | |
if filename.endswith((".py",".ipynb",".js", ".ts")): | |
filepath = os.path.join(directory_path, filename) | |
with open(filepath, "r", encoding='utf-8') as file: | |
code = file.read() | |
output.write(f"Filepath: {filepath}:\n\n") | |
output.write(code + "\n\n") | |
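
# Optional sanity check (not in the original): confirm code was aggregated
print("Code.txt size (bytes):", os.path.getsize("Code.txt"))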

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
loader = TextLoader("Code.txt", encoding="utf-8")
pages = loader.load_and_split()
# Split data into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=20,
    add_start_index=True,
)
chunks = text_splitter.split_documents(pages)
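# Optional (not in the original): report what the splitter produced
print(f"Split {len(pages)} page(s) into {len(chunks)} chunks")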
# Store data into database
db = Chroma.from_documents(chunks, embedding=embeddings, persist_directory="test_index")
db.persist()

# Load the database
vectordb = Chroma(persist_directory="test_index", embedding_function=embeddings)

# Load the retriever
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
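
# Quick retrieval smoke test (hypothetical query, kept commented out in the
# same style as the other disabled snippets in this file):
# docs = retriever.get_relevant_documents("Where is the vector store created?")
# print(len(docs), "chunks retrieved")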

# Generate the assistant's response by retrieving relevant chunks and
# passing them to Gemini along with the question
def generate_assistant_response(question):
    context = retriever.get_relevant_documents(question)
    qna_prompt_template = f"""### [INST] Instruction: You will be provided with questions and context. Your task is to find the answers to the questions using the given data.
Context: ```
{context}
```
### Question: {question} [/INST]"""
    print("Context: ", context)
    answer = llm.generate_content(qna_prompt_template).text
    return answer

# print(generate_assistant_response("Tell me about the instructor_embeddings function."))
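
# Minimal chat UI sketch: streamlit is imported above but never used, so the
# wiring below is an assumption about how generate_assistant_response would be
# exposed (st.chat_input/st.chat_message need Streamlit >= 1.24), not the
# original app's UI.
st.title("GitHub Code Assistant")

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation so far
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if question := st.chat_input("Ask a question about the codebase"):
    st.session_state.messages.append({"role": "user", "content": question})
    with st.chat_message("user"):
        st.markdown(question)
    answer = generate_assistant_response(question)
    st.session_state.messages.append({"role": "assistant", "content": answer})
    with st.chat_message("assistant"):
        st.markdown(answer)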