import os
import streamlit as st
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import Chroma
# from langchain.llms.huggingface_pipeline import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
# import os
import google.generativeai as genai
import git # pip install gitpython
# Authenticate the Gemini client from the environment; a missing
# GOOGLE_API_KEY variable raises KeyError at import time.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Disabled: 4-bit quantization settings for the local Mistral pipeline below.
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# Embedding model used for the vector store, loaded on the CPU.
model_kwargs = dict(device="cpu")
embeddings = HuggingFaceEmbeddings(
    model_name="hkunlp/instructor-xl",
    model_kwargs=model_kwargs,
)

# Disabled: alternative embedding model.
# embeddings = SentenceTransformer(model_name_or_path="All-MiniLM-L6-v2")

# Disabled: local Mistral-7B text-generation pipeline as the LLM.
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", device_map='auto', quantization_config = quantization_config)
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens = 1000)
# llm = HuggingFacePipeline(pipeline=pipe)

# Disabled: helper that clones a repository into "githubCode" unless that
# folder already exists.
# def clone_repo(repo):
#     if os.path.exists("githubCode") and os.path.isdir("githubCode"):
#         print("File already exists!!")
#         pass
#     else:
#         print("Cloning repo!!")
#         git.Repo.clone_from(repo,"githubCode")
# git.Repo.clone_from("", "githubCode")

# Gemini model that produces the final answers.
llm = genai.GenerativeModel("gemini-pro")
def get_folder_paths(directory="githubCode"):
    """Return the paths of every sub-folder below ``directory``, skipping ``.git``.

    Args:
        directory: Root folder to walk. Defaults to ``"githubCode"``.

    Returns:
        A list with one path per nested sub-folder, each built by joining the
        folder name onto its parent as reported by ``os.walk``. ``directory``
        itself is not part of the result, and neither is a ``.git`` folder or
        anything beneath one. If ``directory`` does not exist the list is
        empty, because ``os.walk`` ignores listing errors by default.
    """
    folder_paths = []
    for root, dirs, _files in os.walk(directory):
        # Prune in place: os.walk (top-down) only descends into the names
        # still present in ``dirs``, so .git is neither reported nor visited.
        dirs[:] = [dir_name for dir_name in dirs if dir_name != ".git"]
        folder_paths.extend(os.path.join(root, dir_name) for dir_name in dirs)
    return folder_paths
# Collect the folders to scan, using get_folder_paths' default root.
# NOTE(review): get_folder_paths only returns sub-folders, so files that sit
# directly in the root folder are never read below — confirm this is intended.
directory_paths = get_folder_paths()
print("directory_paths: ", directory_paths)

# Concatenate every matching source file into Code.txt. Each file's content is
# preceded by a "Filepath: ..." header so its origin survives in the dump.
with open("Code.txt", "w", encoding='utf-8') as output:
    for directory_path in directory_paths:
        for filename in os.listdir(directory_path):
            if not filename.endswith((".py", ".ipynb", ".js", ".ts")):
                continue
            filepath = os.path.join(directory_path, filename)
            # A folder can carry a matching suffix too; only real files are read.
            if not os.path.isfile(filepath):
                continue
            with open(filepath, "r", encoding='utf-8') as file:
                code = file.read()
            output.write(f"Filepath: {filepath}:\n\n")
            output.write(code + "\n\n")
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
# Disabled earlier variant that looped over the .txt files of a directory:
# for filename in os.listdir(directory_path):
#     if filename.endswith(".txt"):  # Only process PD files
#         file_path = os.path.join(directory_path, filename)

# Load the concatenated source dump written above.
loader = TextLoader("Code.txt", encoding="utf-8")
pages = loader.load_and_split()

# Split data into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=20,
    add_start_index=True,
)
chunks = text_splitter.split_documents(pages)

# Store data into database
# NOTE(review): nothing here writes `chunks` into the vector store; the store
# is only opened from "test_index" below. Confirm the index is built elsewhere.

# Load the database
vectordb = Chroma(persist_directory="test_index", embedding_function=embeddings)

# Load the retriever (top 3 matches per query)
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
def generate_assistant_response(question):
    """Answer ``question`` with the Gemini model, using retrieved code as context.

    The module-level ``retriever`` fetches the documents matching the question,
    their text is placed in the prompt inside a fenced block, and the prompt is
    sent to the module-level ``llm``.

    Args:
        question: The user's question as plain text.

    Returns:
        The ``text`` of the model's response.
    """
    context = retriever.get_relevant_documents(question)
    # Put only the retrieved text into the prompt, not the Document objects'
    # repr, so the model sees the code itself.
    context_text = "\n\n".join(doc.page_content for doc in context)
    qna_prompt_template = f"""### [INST] Instruction: You will be provided with questions and context. Your task is to find the answers to the questions using the given data.'
Context: ```
{context_text}
```
### Question: {question} [/INST]"""
    print("Context: ", context)
    answer = llm.generate_content(qna_prompt_template).text
    return answer
# print(generate_assistant_response("Tell me about the instructor_embeddings function."))