{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "SCq5lAKuZxYx",
    "outputId": "6c44cd5b-efe4-4364-d19c-4650be91f9c6"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mounted at /content/gdrive\n"
     ]
    }
   ],
   "source": [
    "# Colab-only setup: mount Google Drive, switch to the project folder and install its requirements.\n",
    "from google.colab import drive\n",
    "import os\n",
    "\n",
    "drive.mount('/content/gdrive')\n",
    "\n",
    "!ls\n",
    "%cd /content/gdrive/MyDrive/rajat.bans/RAG/\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -r requirements.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/lazyghost/VirtualEnvironments/langchain-rag-venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n",
      "/Users/lazyghost/VirtualEnvironments/langchain-rag-venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      " warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "({'relation_answer': {'reasoning': \"No relevant ads found for the user's input 'Hola'.\",\n",
       " 'classification': 0},\n",
       " 'tokens_used_relation': 460,\n",
       " 'question_answer': {'reasoning': '', 'question': '', 'options': []},\n",
       " 'tokens_used_question': 0},\n",
       " [])"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from rag import VARIABLE_MANAGER\n",
    "vm = VARIABLE_MANAGER()\n",
    "rag = vm.getRag()\n",
    "# `data` is read by the response-generation loop further down (len(data), data.iloc),\n",
    "# so it has to be created here for the notebook to run on a fresh kernel.\n",
    "data = vm.QnAAdsSampleGenerationPreProcessing()\n",
    "# Running cost accumulated by the generation loop during this kernel session.\n",
    "tot_cost = 0\n",
    "# Smoke test: one call to confirm the pipeline responds before the long loop is started.\n",
    "rag.getRagResponse(\"Hola\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Tab-separated results file; it doubles as the resume checkpoint for the generation loop.\n",
    "responses_file_name = './data/147_results_webmd_healthline_12Jun-18Jun_1000each_145BIGQSPRCR_QuestionSystemPromptImprovedClusteringAdded_.tsv'\n",
    "# Rate applied to the token counts for the running cost estimate (per 1,000,000 tokens).\n",
    "COST_PER_MILLION_TOKENS = 0.6\n",
    "# The results file is rewritten whenever the row index is a multiple of this value.\n",
    "CHECKPOINT_EVERY = 10\n",
    "\n",
    "\n",
    "def format_options(options):\n",
    "    \"\"\"Render the question options as text.\n",
    "\n",
    "    Each option is written on its own line, followed by one line per entry of\n",
    "    ``options[option]`` and a blank line. An empty container yields ``''``.\n",
    "    \"\"\"\n",
    "    parts = []\n",
    "    for option in options:\n",
    "        parts.append(option + \"\\n\")\n",
    "        for ad in options[option]:\n",
    "            parts.append(ad + \"\\n\")\n",
    "        parts.append(\"\\n\")\n",
    "    return \"\".join(parts)\n",
    "\n",
    "\n",
    "def format_clustered_docs(clustered_docs):\n",
    "    \"\"\"Render the clustered ads returned by ``rag.getRagResponse`` as text.\n",
    "\n",
    "    Every cluster gets a numbered banner line. Each entry of a cluster is read as\n",
    "    ``entry[0]`` (an object with ``page_content`` and ``metadata``) and ``entry[1]``\n",
    "    (written out on the ``Value:`` line).\n",
    "    \"\"\"\n",
    "    parts = []\n",
    "    for ind, cluster in enumerate(clustered_docs):\n",
    "        parts.append(f\"*************** Cluster-:{ind+1} **************\\n\")\n",
    "        for doc in cluster:\n",
    "            ad, value = doc[0], doc[1]\n",
    "            meta = ad.metadata\n",
    "            parts.append(ad.page_content + \"\\n\")\n",
    "            # f-strings so that non-string metadata values cannot break the concatenation.\n",
    "            parts.append(\n",
    "                f\"publisher_url: {meta['publisher_url']} | \"\n",
    "                f\"keyword_term: {meta['keyword_term']} | \"\n",
    "                f\"ad_display_url: {meta['ad_display_url']} | \"\n",
    "                f\"revenue: {meta['revenue']} | \"\n",
    "                f\"ad_click_count: {meta['ad_click_count']} | \"\n",
    "                f\"RPC: {meta['RPC']} | \"\n",
    "                f\"Type: {meta['Type']}\\n\"\n",
    "            )\n",
    "            parts.append(f\"Value: {value}\\n\")\n",
    "            parts.append(\"\\n\")\n",
    "        parts.append(\"\\n\")\n",
    "    return \"\".join(parts)\n",
    "\n",
    "\n",
    "def save_checkpoint(responses, new_rows, file_name):\n",
    "    \"\"\"Append ``new_rows`` to ``responses``, rewrite the TSV and return the combined frame.\n",
    "\n",
    "    Nothing is concatenated or written when ``new_rows`` is empty, so an empty\n",
    "    results file is never created.\n",
    "    \"\"\"\n",
    "    if not new_rows:\n",
    "        return responses\n",
    "    combined = pd.concat([responses, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "    combined.to_csv(file_name, sep='\\t', index=False)\n",
    "    return combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "zzJNW1fCcP3v",
    "outputId": "6bcb20f5-6596-4e42-ffb7-d3a70566d2e8"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20,"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
      " warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Total cost is up to now is 0.014402400000000001\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    responses = pd.read_csv(responses_file_name, sep='\\t')\n",
    "except (FileNotFoundError, pd.errors.EmptyDataError):\n",
    "    # No usable results file yet (missing or zero-byte): start from scratch.\n",
    "    responses = pd.DataFrame()\n",
    "\n",
    "new_rows = []\n",
    "# Resume after the rows already saved. This relies on exactly one saved row per row of\n",
    "# `data`, which is why failed rows are appended as well.\n",
    "for i in range(len(responses), len(data)):\n",
    "    print(i, end = ',')\n",
    "    row = data.iloc[i, :]\n",
    "    # Fresh dict per row, so a failure can never re-append the previous row's values.\n",
    "    answer = {}\n",
    "    try:\n",
    "        answer.update({\n",
    "            'domain_name': row['domain_name'],\n",
    "            'url': row['url'],\n",
    "            # 'input': '. '.join(row['stripped_url'].split('/')[3:]),\n",
    "            'kwd_imp': row['kwd_imp'],\n",
    "            'kwd_click': row['kwd_click'],\n",
    "            'ad_click': row['ad_click'],\n",
    "            'revenue': row['revenue'],\n",
    "            'rank': row['rank'],\n",
    "            'url_title': row['url_title'],\n",
    "            'url_content': row['url_content'],\n",
    "            'input': row['core_content'],\n",
    "        })\n",
    "\n",
    "        reply, clustered_docs = rag.getRagResponse(row['core_content'])\n",
    "        answer[\"relation_reasoning\"] = reply['relation_answer']['reasoning']\n",
    "        answer[\"relation_classification\"] = reply['relation_answer']['classification']\n",
    "        answer[\"relation_tokens_used\"] = reply['tokens_used_relation']\n",
    "\n",
    "        answer[\"reasoning\"] = reply['question_answer']['reasoning']\n",
    "        answer[\"question\"] = reply['question_answer']['question']\n",
    "        options = reply['question_answer']['options']\n",
    "        answer[\"options\"] = format_options(options)\n",
    "        answer[\"options_count\"] = str(len(options))\n",
    "        answer[\"question_tokens_used\"] = reply['tokens_used_question']\n",
    "\n",
    "        answer[\"ads_data\"] = format_clustered_docs(clustered_docs)\n",
    "\n",
    "        cost = (answer[\"relation_tokens_used\"] + answer[\"question_tokens_used\"]) * COST_PER_MILLION_TOKENS / 1000000\n",
    "        tot_cost += cost\n",
    "        answer['cost'] = float(cost)\n",
    "    except Exception as e:\n",
    "        # Best effort: keep whatever was collected for this row and record why it failed.\n",
    "        print(f\"\\nRow {i} failed: {e!r}\")\n",
    "        answer['error'] = repr(e)\n",
    "    new_rows.append(answer)\n",
    "\n",
    "    if i % CHECKPOINT_EVERY == 0:\n",
    "        print(\" Total cost is up to now is\", tot_cost)\n",
    "        responses = save_checkpoint(responses, new_rows, responses_file_name)\n",
    "        new_rows = []\n",
    "\n",
    "# Flush the rows collected since the last checkpoint.\n",
    "responses = save_checkpoint(responses, new_rows, responses_file_name)\n",
    "responses\n"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [
    "5gRHp_nCJHlf",
    "ScYo9Q38IbGr",
    "Yd1qWPjlxCTd",
    "P2soYnTaxE5c",
    "DOLcMgW6IWX8"
   ],
   "gpuType": "T4",
   "machine_shape": "hm",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {}
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}