{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "SCq5lAKuZxYx",
        "outputId": "6c44cd5b-efe4-4364-d19c-4650be91f9c6"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Mounted at /content/gdrive\n"
          ]
        }
      ],
      "source": [
        "from google.colab import drive\n",
        "import os\n",
        "\n",
        "drive.mount('/content/gdrive')\n",
        "\n",
        "!ls\n",
        "%cd /content/gdrive/MyDrive/rajat.bans/RAG/\n",
        "!pip install -r requirements.txt"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/Users/lazyghost/VirtualEnvironments/langchain-rag-venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
            "  from .autonotebook import tqdm as notebook_tqdm\n",
            "/Users/lazyghost/VirtualEnvironments/langchain-rag-venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
            "  warnings.warn(\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "({'relation_answer': {'reasoning': \"No relevant ads found for the user's input 'Hola'.\",\n",
              "   'classification': 0},\n",
              "  'tokens_used_relation': 460,\n",
              "  'question_answer': {'reasoning': '', 'question': '', 'options': []},\n",
              "  'tokens_used_question': 0},\n",
              " [])"
            ]
          },
          "execution_count": 1,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "from rag import VARIABLE_MANAGER\n",
        "vm = VARIABLE_MANAGER()\n",
        "rag = vm.getRag()\n",
        "# data = vm.QnAAdsSampleGenerationPreProcessing()\n",
        "tot_cost = 0\n",
        "rag.getRagResponse(\"Hola\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 59,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "zzJNW1fCcP3v",
        "outputId": "6bcb20f5-6596-4e42-ffb7-d3a70566d2e8"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "20,"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
            "  warnings.warn(\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " Total cost is up to now is 0.014402400000000001\n"
          ]
        }
      ],
      "source": [
        "import pandas as pd\n",
        "responses_file_name = './data/147_results_webmd_healthline_12Jun-18Jun_1000each_145BIGQSPRCR_QuestionSystemPromptImprovedClusteringAdded_.tsv'\n",
        "try:\n",
        "    responses = pd.read_csv(responses_file_name, sep='\\t')\n",
        "except FileNotFoundError:\n",
        "    responses = pd.DataFrame()\n",
        "\n",
        "new_rows = []\n",
        "for i in range(len(responses), len(data)):\n",
        "    print(i, end = ',')\n",
        "    row = data.iloc[i, :]\n",
        "    try:\n",
        "        answer = {\n",
        "            'domain_name': row['domain_name'],\n",
        "            'url': row['url'],\n",
        "            #   'input': '. '.join(row['stripped_url'].split('/')[3:]),\n",
        "            'kwd_imp': row['kwd_imp'],\n",
        "            'kwd_click': row['kwd_click'],\n",
        "            'ad_click': row['ad_click'],\n",
        "            'revenue': row['revenue'],\n",
        "            'rank': row['rank'],\n",
        "            'url_title': row['url_title'],\n",
        "            'url_content': row['url_content'],\n",
        "            'input': row['core_content'],\n",
        "        }\n",
        "\n",
        "        reply, clustered_docs = rag.getRagResponse(row['core_content'])\n",
        "        answer[\"relation_reasoning\"] = reply['relation_answer']['reasoning']\n",
        "        answer[\"relation_classification\"] = reply['relation_answer']['classification']\n",
        "        answer[\"relation_tokens_used\"] = reply['tokens_used_relation']\n",
        "\n",
        "        answer[\"reasoning\"] = reply['question_answer']['reasoning']\n",
        "        answer[\"question\"] = reply['question_answer']['question']\n",
        "        options = reply['question_answer']['options']\n",
        "        options_res = \"\"\n",
        "        for option in options:\n",
        "            options_res += option + \"\\n\"\n",
        "            for ad in options[option]:\n",
        "                options_res += ad + \"\\n\"\n",
        "            options_res += \"\\n\"\n",
        "        answer[\"options\"] = options_res\n",
        "        answer[\"options_count\"] = str(len(options))\n",
        "        answer[\"question_tokens_used\"] = reply['tokens_used_question']\n",
        "\n",
        "        ads_data = \"\"\n",
        "        for ind, cluster in enumerate(clustered_docs):\n",
        "            ads_data += f\"*************** Cluster-:{ind+1} **************\\n\"\n",
        "            for doc in cluster:\n",
        "                ad = doc[0]\n",
        "                ads_data += ad.page_content + \"\\n\"\n",
        "                ads_data += \"publisher_url: \" + ad.metadata['publisher_url'] + \" | \"\n",
        "                ads_data += \"keyword_term: \" + ad.metadata['keyword_term'] + \" | \"\n",
        "                ads_data += \"ad_display_url: \" + ad.metadata['ad_display_url'] + \" | \"\n",
        "                ads_data += \"revenue: \" + str(ad.metadata['revenue']) + \" | \"\n",
        "                ads_data += \"ad_click_count: \" + str(ad.metadata['ad_click_count']) + \" | \"\n",
        "                ads_data += \"RPC: \" + str(ad.metadata['RPC']) + \" | \"\n",
        "                ads_data += \"Type: \" + ad.metadata['Type'] + \"\\n\"\n",
        "                ads_data += \"Value: \" + str(doc[1]) + \"\\n\"\n",
        "                ads_data += \"\\n\"\n",
        "            ads_data += \"\\n\"\n",
        "        answer[\"ads_data\"] = ads_data\n",
        "\n",
        "        cost = (answer[\"relation_tokens_used\"] + answer[\"question_tokens_used\"]) * 0.6/1000000\n",
        "        tot_cost += cost\n",
        "        answer['cost'] = float(cost)\n",
        "    except Exception as e:\n",
        "        print(e)\n",
        "    new_rows.append(answer)\n",
        "\n",
        "    if i % 10 == 0:\n",
        "        print(\" Total cost is up to now is\", tot_cost)\n",
        "        responses = pd.concat([responses, pd.DataFrame(new_rows)], ignore_index=True)\n",
        "        responses.to_csv(responses_file_name, sep='\\t', index=False)\n",
        "        new_rows = []\n",
        "\n",
        "responses = pd.concat([responses, pd.DataFrame(new_rows)], ignore_index=True)\n",
        "responses.to_csv(responses_file_name, sep='\\t', index=False)\n",
        "responses\n"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [
        "5gRHp_nCJHlf",
        "ScYo9Q38IbGr",
        "Yd1qWPjlxCTd",
        "P2soYnTaxE5c",
        "DOLcMgW6IWX8"
      ],
      "gpuType": "T4",
      "machine_shape": "hm",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.4"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {}
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}