from google.colab import drive
import os

drive.mount('/content/gdrive')

!ls
%cd /content/gdrive/MyDrive/rajat.bans/RAG/
!pip install -r requirements.txt from rag import VARIABLE_MANAGER
vm = VARIABLE_MANAGER()
rag = vm.getRag()
# data = vm.QnAAdsSampleGenerationPreProcessing()
tot_cost = 0
rag.getRagResponse("Hola") import pandas as pd
# Resumable batch run: for every row of `data` that has no saved response yet,
# query the RAG pipeline, flatten its reply into one record and checkpoint the
# accumulated records to a TSV file.
#
# NOTE(review): `data` is not assigned anywhere in this cell; it must already
# exist in the session as a DataFrame holding the columns read below.
responses_file_name = './data/147_results_webmd_healthline_12Jun-18Jun_1000each_145BIGQSPRCR_QuestionSystemPromptImprovedClusteringAdded_.tsv'

# Rate applied to the token counts for the running cost estimate.
USD_PER_MILLION_TOKENS = 0.6
# Rows are flushed to disk whenever the row index is a multiple of this value.
CHECKPOINT_EVERY = 10
# Columns copied verbatim from the input row into the output record.
PASSTHROUGH_COLUMNS = (
    'domain_name',
    'url',
    'kwd_imp',
    'kwd_click',
    'ad_click',
    'revenue',
    'rank',
    'url_title',
    'url_content',
)


def _format_options(options):
    """Render each option, then its ads one per line, then a blank line."""
    parts = []
    for option in options:
        parts.append(option + "\n")
        parts.extend(ad + "\n" for ad in options[option])
        parts.append("\n")
    return "".join(parts)


def _format_clustered_ads(clustered_docs):
    """Render the clustered documents as a readable text dump.

    Each cluster gets a numbered banner. Each entry of a cluster is indexed as
    ``entry[0]`` (an object with ``page_content`` and ``metadata``) and
    ``entry[1]`` (written out as "Value").
    """
    parts = []
    for number, cluster in enumerate(clustered_docs, start=1):
        parts.append(f"*************** Cluster-:{number} **************\n")
        for doc in cluster:
            ad = doc[0]
            meta = ad.metadata
            parts.append(ad.page_content + "\n")
            parts.append(" | ".join([
                "publisher_url: " + meta['publisher_url'],
                "keyword_term: " + meta['keyword_term'],
                "ad_display_url: " + meta['ad_display_url'],
                "revenue: " + str(meta['revenue']),
                "ad_click_count: " + str(meta['ad_click_count']),
                "RPC: " + str(meta['RPC']),
                "Type: " + meta['Type'],
            ]) + "\n")
            parts.append("Value: " + str(doc[1]) + "\n")
            parts.append("\n")
        parts.append("\n")
    return "".join(parts)


def _save_checkpoint(saved, pending):
    """Append `pending` records to `saved`, write the TSV, return the result."""
    if not pending:
        # Nothing new to add, so there is nothing to write.
        return saved
    combined = pd.concat([saved, pd.DataFrame(pending)], ignore_index=True)
    combined.to_csv(responses_file_name, sep='\t', index=False)
    return combined


try:
    responses = pd.read_csv(responses_file_name, sep='\t')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable checkpoint (missing or empty file): start from scratch.
    responses = pd.DataFrame()

new_rows = []
# Resume after the last saved row: output row k corresponds to data row k, so
# exactly one record is appended per input row, even when the RAG call fails.
for i in range(len(responses), len(data)):
    print(i, end=',')
    row = data.iloc[i, :]

    # Built outside the `try` so the record appended below always belongs to
    # the current row. Previously a failure before this assignment re-appended
    # the previous row's dict, or raised NameError on the first iteration.
    answer = {column: row[column] for column in PASSTHROUGH_COLUMNS}
    # The model input is the page's core content (an earlier variant derived
    # it from the path segments of `stripped_url`).
    answer['input'] = row['core_content']

    try:
        reply, clustered_docs = rag.getRagResponse(row['core_content'])

        relation = reply['relation_answer']
        answer["relation_reasoning"] = relation['reasoning']
        answer["relation_classification"] = relation['classification']
        answer["relation_tokens_used"] = reply['tokens_used_relation']

        question = reply['question_answer']
        answer["reasoning"] = question['reasoning']
        answer["question"] = question['question']
        options = question['options']
        answer["options"] = _format_options(options)
        answer["options_count"] = str(len(options))
        answer["question_tokens_used"] = reply['tokens_used_question']

        answer["ads_data"] = _format_clustered_ads(clustered_docs)

        tokens_used = answer["relation_tokens_used"] + answer["question_tokens_used"]
        cost = tokens_used * USD_PER_MILLION_TOKENS / 1000000
        tot_cost += cost
        answer['cost'] = float(cost)
    except Exception as e:
        # Best effort: keep the partially filled record so the row alignment
        # used for resuming is preserved, and report which row failed.
        print(f"\nRow {i} failed: {e!r}")
    new_rows.append(answer)

    if i % CHECKPOINT_EVERY == 0:
        print(" Total cost is up to now is", tot_cost)
        responses = _save_checkpoint(responses, new_rows)
        new_rows = []

# Flush whatever accumulated since the last checkpoint.
responses = _save_checkpoint(responses, new_rows)
responses