{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/lazyghost/VirtualEnvironments/langchain-rag-venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "/Users/lazyghost/VirtualEnvironments/langchain-rag-venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "from langchain_community.vectorstores import FAISS\n", "from langchain_community.embeddings import HuggingFaceEmbeddings\n", "import numpy as np\n", "import os\n", "\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", "\n", "DB_FAISS_PATH = \"./vectorstore/db_faiss_ads_20May_20Jun_webmd_healthline_Health_dupRemoved0.8\"\n", "embedding_model_hf = \"BAAI/bge-m3\"\n", "embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)\n", "db = FAISS.load_local(\n", " DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True\n", ")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "api_data = \"\"\"\n", "Page Title -: Shingles Symptoms, Causes, and Treatments\n", "Page Content -: Slideshow: A Visual Guide to Shingles. Medically Reviewed by Jabeen Begum, MD on March 05, 2024. What is Shingles? 1 /18. If you've ever had the chickenpox -- and almost all adults have or have at least been exposed to it-- there's a good chance the virus is still at large in your body. The varicella zoster virus can lie dormant for decades without causing any symptoms. In some people, the virus wakes up and travels along nerve fibers to the skin. The result is a distinctive, painful rash called shingles\n", "\"\"\"\n", "api_data = \"\"\"\n", "Page Title -: Best and Worst Snacks\n", "Page Content -: 5 Snacks to Enjoy (and 5 to Avoid). Written by Amy Capetta. We all snack. But some snacks are better than others, especially if you’re managing type 2 diabetes or obesity. An ideal snack gives you protein or fiber -- or both -- to help you feel full, says Gillian Culbertson, RD, certified diabetes educator at the Cleveland Clinic. It should give you plenty of energy without too many calories. Aim for between 100 and 150 calories for women, and about 200 calories for men, with 15 to 20 grams of protein\n", "\"\"\"\n", "api_data = \"\"\"\n", "Page Title -: End-Stage COPD (Stage IV)\n", "Page Content -: End-Stage COPD (Stage IV). Medically Reviewed by Zilpah Sheikh, MD on November 13, 2023. Written by Alyson Powell Key , William Moore. What Is End-Stage COPD? End-stage, or stage IV, COPD is the final stage of chronic obstructive pulmonary disease. Most people reach it after years of living with the disease and the lung damage it causes. As a result, your quality of life is low. You’ll have exacerbations, or flares, often – one of which could be fatal\n", "\"\"\"\n", "api_data = \"\"\"\n", "Page Title -: How to Lower Your A1c Level\n", "Page Content -: English. How to Lower Your A1c Level. Medically Reviewed by Brunilda Nazario, MD on January 18, 2024. Written by Elizabeth Svoboda. When you have diabetes , you probably know you should check your blood sugar regularly. Your doctor will also recommend that you take an A1c blood test a few times a year, with a goal of lowering the results to help protect your health. And there’s a lot you can do to move toward meeting that goal\n", "\"\"\"\n", "ads = db.similarity_search_with_score(api_data, k = 100)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# KMEANS CLUSTERING" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "8 =========> HOW TO LOWER A1C LEVELS INSTANTLY - Find HOW TO LOWER A1C LEVELS INSTANTLY 0.5105679\n", "68 =========> WHAT FOODS LOWER A1C QUICKLY - Find WHAT FOODS LOWER A1C QUICKLY 0.65786356\n", "27 =========> lower a1c levels naturally - Browse and Discover 0.5767728\n", "37 =========> Best Snack to Help Lower A1C Levels - See Results For best snack to help lower a1c levels 0.60074186\n", "\n", "1\n", "16 =========> Lower Blood Sugar Naturally - High Blood Glucose Symptoms 0.55070686\n", "35 =========> How To Lower Blood Sugar Immediately - Type 2 Diabetes Diet 0.59327304\n", "11 =========> Blood Sugar & Type 2 Diabetes - Get Helpful Tips & Resources 0.52739334\n", "36 =========> Bring Down Blood Glucose Fast - Bring down Blood Sugar 0.5967244\n", "17 =========> Reduce Your Blood Sugar Levels - Lower Blood Sugar At Home 0.55285\n", "\n", "2\n", "6 =========> Low, Normal, High A1c Ranges - Blood Glucose Chart 0.47047788\n", "1 =========> Learn More About Managing A1C - Help Reduce Your Risk 0.3698194\n", "0 =========> A1C Levels - Help Reduce Your Risk 0.36521938\n", "2 =========> A1C-Related Questions? - Learn More About Managing A1C 0.37610498\n", "10 =========> Blood Glucose Chart - What is Normal A1C? 0.52313244\n", "\n" ] } ], "source": [ "from sklearn.cluster import KMeans\n", "from scipy.spatial.distance import euclidean\n", "import re\n", "import numpy as np\n", "\n", "\n", "def remove_html_tags(text):\n", " clean = re.compile('<.*?>')\n", " return re.sub(clean, '', text)\n", "\n", "no_of_clusters = 3\n", "no_of_points = 5\n", "kmeans = KMeans(n_clusters=no_of_clusters, random_state=42)\n", "embeddings = np.array(embeddings_hf.embed_documents([remove_html_tags(doc[0].page_content) for doc in ads]))\n", "kmeans.fit(embeddings)\n", "cluster_centers = kmeans.cluster_centers_\n", "labels = kmeans.labels_\n", "closest_indices = [[] for _ in range(no_of_clusters)]\n", "for i, embedding in enumerate(embeddings):\n", " cluster_idx = labels[i]\n", " center = cluster_centers[cluster_idx]\n", " dist = euclidean(embedding, center)\n", " closest_indices[cluster_idx].append((i, dist))\n", "for i in range(no_of_clusters):\n", " closest_indices[i].sort(key=lambda x: x[1])\n", "selected_indices = [closest_indices[i][:no_of_points] for i in range(no_of_clusters)]\n", "\n", "for ind, cluster in enumerate(selected_indices):\n", " print(f\"{ind}\")\n", " for cluster_point in cluster:\n", " doc_ind = cluster_point[0]\n", " print(f\"{doc_ind} =========> \", remove_html_tags(ads[doc_ind][0].page_content.split(\".\")[0]), ads[doc_ind][1])\n", " print()\n", "\n", "# starting_indexes = [[] for _ in range(no_of_clusters)]\n", "# for i, label in enumerate(labels):\n", "# if len(starting_indexes[label]) < no_of_points:\n", "# starting_indexes[label].append(i)\n", "# if all(len(cluster) == no_of_points for cluster in starting_indexes):\n", "# break\n", "\n", "# for i, cluster in enumerate(starting_indexes):\n", "# print(i)\n", "# for id in cluster:\n", "# print(f\"{id} ====> {remove_html_tags(ads[id][0].page_content.split(\".\")[0])}, {ads[id][1]}\")\n", "# print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# SPECTRAL CLUSTERING" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "4 ====> Lower Blood Sugar Naturally - Get Blood Glucose in Order, 0.4132922291755676\n", "5 ====> Lower Blood Sugar Naturally - 5 Tips to Control Blood Sugar, 0.4696233868598938\n", "9 ====> Lower Blood Sugar Naturally - Cinnamon Tackles Diabetes, 0.5150120854377747\n", "12 ====> What Type II's Must Avoid - Lower Blood Sugar Naturally, 0.5334694385528564\n", "16 ====> Lower Blood Sugar Naturally - High Blood Glucose Symptoms, 0.5507068634033203\n", "\n", "1\n", "1 ====> Learn More About Managing A1C - Help Reduce Your Risk, 0.36981940269470215\n", "3 ====> Learn More About Managing A1C - A Once-Daily Treatment Option, 0.3764190971851349\n", "11 ====> Blood Sugar & Type 2 Diabetes - Get Helpful Tips & Resources, 0.5273933410644531\n", "14 ====> Type 2 Diabetes Treatment - Official Patient Site, 0.5485913753509521\n", "15 ====> What Foods Lower A1c Quickly - Keep Blood Sugar in Check, 0.550499677658081\n", "\n", "2\n", "0 ====> A1C Levels - Help Reduce Your Risk, 0.365219384431839\n", "2 ====> A1C-Related Questions? - Learn More About Managing A1C, 0.3761049807071686\n", "6 ====> Low, Normal, High A1c Ranges - Blood Glucose Chart, 0.4704778790473938\n", "7 ====> HbA1c Blood Test | Check Your Glucose Levels, 0.5101255774497986\n", "8 ====> HOW TO LOWER A1C LEVELS INSTANTLY - Find HOW TO LOWER A1C LEVELS INSTANTLY, 0.5105679035186768\n", "\n" ] } ], "source": [ "from sklearn.cluster import SpectralClustering\n", "no_of_clusters = 3\n", "no_of_points = 5\n", "\n", "spectral_clustering = SpectralClustering(n_clusters=no_of_clusters, affinity='nearest_neighbors', random_state=42)\n", "labels = spectral_clustering.fit_predict(embeddings)\n", "\n", "starting_indexes = [[] for _ in range(no_of_clusters)]\n", "for i, label in enumerate(labels):\n", " if len(starting_indexes[label]) < no_of_points:\n", " starting_indexes[label].append(i)\n", " if all(len(cluster) == no_of_points for cluster in starting_indexes):\n", " break\n", "\n", "starting_indexes\n", "for i, cluster in enumerate(starting_indexes):\n", " print(i)\n", " for id in cluster:\n", " print(f\"{id} ====> {remove_html_tags(ads[id][0].page_content.split(\".\")[0])}, {ads[id][1]}\")\n", " print()\n", "# for i in range(len(embeddings)):\n", "# cluster_idx = labels[i]\n", "# dist = distances[i, :].sum() / distances.shape[0]\n", "# closest_indices[cluster_idx].append((i, dist))\n", "\n", "# closest_indices\n", "# # # Sort closest indices based on distance to cluster center\n", "# for i in range(no_of_clusters):\n", "# closest_indices[i].sort(key=lambda x: x[0])\n" ] } ], "metadata": { "kernelspec": { "display_name": "langchain-rag-venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 2 }