In [1]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np
import os

# Set before any embedding model is instantiated below.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# fork/parallelism warning -- confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Relative path of the persisted FAISS index; resolved against the kernel's
# working directory.
DB_FAISS_PATH = "./vectorstore/db_faiss_ads_20May_20Jun_webmd_healthline_Health_dupRemoved0.8"
# Hugging Face model id used both to load the index and, in later cells, to
# re-embed the retrieved ad texts.
# NOTE(review): this must be the model the index was built with -- confirm.
embedding_model_hf = "BAAI/bge-m3"
embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)
# SECURITY NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized (pickle-based, per the LangChain docs) data from DB_FAISS_PATH.
# Only load index files that were produced locally / come from a trusted source.
db = FAISS.load_local(
 DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
)

 from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample page payloads that can be used as the retrieval query.
# Previously `api_data` was assigned four times in a row, so the first three
# values were silently discarded; they are kept here under explicit keys and
# the active one is chosen via SELECTED_PAGE.
SAMPLE_PAGES = {
    "shingles": """
Page Title -: Shingles Symptoms, Causes, and Treatments
Page Content -: Slideshow: A Visual Guide to Shingles. Medically Reviewed by Jabeen Begum, MD on March 05, 2024. What is Shingles? 1 /18. If you've ever had the chickenpox -- and almost all adults have or have at least been exposed to it-- there's a good chance the virus is still at large in your body. The varicella zoster virus can lie dormant for decades without causing any symptoms. In some people, the virus wakes up and travels along nerve fibers to the skin. The result is a distinctive, painful rash called shingles
""",
    "snacks": """
Page Title -: Best and Worst Snacks
Page Content -: 5 Snacks to Enjoy (and 5 to Avoid). Written by Amy Capetta. We all snack. But some snacks are better than others, especially if you’re managing type 2 diabetes or obesity. An ideal snack gives you protein or fiber -- or both -- to help you feel full, says Gillian Culbertson, RD, certified diabetes educator at the Cleveland Clinic. It should give you plenty of energy without too many calories. Aim for between 100 and 150 calories for women, and about 200 calories for men, with 15 to 20 grams of protein
""",
    "copd": """
Page Title -: End-Stage COPD (Stage IV)
Page Content -: End-Stage COPD (Stage IV). Medically Reviewed by Zilpah Sheikh, MD on November 13, 2023. Written by Alyson Powell Key , William Moore. What Is End-Stage COPD? End-stage, or stage IV, COPD is the final stage of chronic obstructive pulmonary disease. Most people reach it after years of living with the disease and the lung damage it causes. As a result, your quality of life is low. You’ll have exacerbations, or flares, often – one of which could be fatal
""",
    "a1c": """
Page Title -: How to Lower Your A1c Level
Page Content -: English. How to Lower Your A1c Level. Medically Reviewed by Brunilda Nazario, MD on January 18, 2024. Written by Elizabeth Svoboda. When you have diabetes , you probably know you should check your blood sugar regularly. Your doctor will also recommend that you take an A1c blood test a few times a year, with a goal of lowering the results to help protect your health. And there’s a lot you can do to move toward meeting that goal
""",
}

# "a1c" was the last assignment in the original cell, i.e. the page that was
# actually queried; change this key to try another sample.
SELECTED_PAGE = "a1c"
api_data = SAMPLE_PAGES[SELECTED_PAGE]

# Retrieve the 100 nearest ads as (document, score) pairs for the selected page.
ads = db.similarity_search_with_score(api_data, k=100)


# KMEANS CLUSTERING

In [12]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import re
import numpy as np


def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span stripped out.

    The match is non-greedy and ``.`` does not cross newlines, so a tag that
    is split over several lines is left in place.
    """
    return re.compile('<.*?>').sub('', text)

# --- K-means over the retrieved ads: list the ads nearest each centroid ---
# Uses `ads` and `embeddings_hf` from the earlier cells.
no_of_clusters = 3
no_of_points = 5

# Re-embed the tag-stripped text of every retrieved ad; `embeddings` is also
# consumed by the spectral-clustering cell further down.
ad_texts = [remove_html_tags(doc[0].page_content) for doc in ads]
embeddings = np.array(embeddings_hf.embed_documents(ad_texts))

kmeans = KMeans(n_clusters=no_of_clusters, random_state=42)
kmeans.fit(embeddings)
cluster_centers = kmeans.cluster_centers_
labels = kmeans.labels_

# For each cluster, collect (row index, Euclidean distance to its centroid).
closest_indices = [[] for _ in range(no_of_clusters)]
for row, (vector, cluster_idx) in enumerate(zip(embeddings, labels)):
    centroid = cluster_centers[cluster_idx]
    closest_indices[cluster_idx].append((row, euclidean(vector, centroid)))

# Nearest-first within every cluster, then keep the top `no_of_points`.
for members in closest_indices:
    members.sort(key=lambda pair: pair[1])
selected_indices = [members[:no_of_points] for members in closest_indices]

# Print, per cluster: row index, first sentence of the ad, retrieval score.
for ind, cluster in enumerate(selected_indices):
    print(f"{ind}")
    for doc_ind, _centroid_dist in cluster:
        headline = remove_html_tags(ads[doc_ind][0].page_content.split(".")[0])
        print(f"{doc_ind} =========> ", headline, ads[doc_ind][1])
    print()

# starting_indexes = [[] for _ in range(no_of_clusters)]
# for i, label in enumerate(labels):
# if len(starting_indexes[label]) < no_of_points:
# starting_indexes[label].append(i)
# if all(len(cluster) == no_of_points for cluster in starting_indexes):
# break

# for i, cluster in enumerate(starting_indexes):
# print(i)
# for id in cluster:
# print(f"{id} ====> {remove_html_tags(ads[id][0].page_content.split(".")[0])}, {ads[id][1]}")
# print()

0

1

2



# SPECTRAL CLUSTERING

In [11]:
from sklearn.cluster import SpectralClustering
# --- Spectral clustering over the same ad embeddings ---
# Relies on `embeddings`, `ads` and `remove_html_tags` defined by the cells
# above, so those cells must have been run first.
no_of_clusters = 3
no_of_points = 5

spectral_clustering = SpectralClustering(
    n_clusters=no_of_clusters, affinity='nearest_neighbors', random_state=42
)
labels = spectral_clustering.fit_predict(embeddings)

# Per cluster, keep the first `no_of_points` row indices in the order `ads`
# was returned; stop scanning as soon as every cluster is full.
starting_indexes = [[] for _ in range(no_of_clusters)]
for i, label in enumerate(labels):
    if len(starting_indexes[label]) < no_of_points:
        starting_indexes[label].append(i)
    if all(len(cluster) == no_of_points for cluster in starting_indexes):
        break

# Print, per cluster: row index, first sentence of the ad, retrieval score.
for i, cluster in enumerate(starting_indexes):
    print(i)
    for doc_ind in cluster:  # renamed from `id`, which shadowed the builtin
        # Built outside the f-string: reusing '"' inside a '"'-delimited
        # f-string is a SyntaxError on Python versions before 3.12.
        headline = remove_html_tags(ads[doc_ind][0].page_content.split(".")[0])
        print(f"{doc_ind} ====> {headline}, {ads[doc_ind][1]}")
    print()
# for i in range(len(embeddings)):
# cluster_idx = labels[i]
# dist = distances[i, :].sum() / distances.shape[0]
# closest_indices[cluster_idx].append((i, dist))

# closest_indices
# # # Sort closest indices based on distance to cluster center
# for i in range(no_of_clusters):
# closest_indices[i].sort(key=lambda x: x[0])


0
4 ====> Lower Blood Sugar Naturally - Get Blood Glucose in Order, 0.4132922291755676
5 ====> Lower Blood Sugar Naturally - 5 Tips to Control Blood Sugar, 0.4696233868598938
9 ====> Lower Blood Sugar Naturally - Cinnamon Tackles Diabetes, 0.5150120854377747
12 ====> What Type II's Must Avoid - Lower Blood Sugar Naturally, 0.5334694385528564
16 ====> Lower Blood Sugar Naturally - High Blood Glucose Symptoms, 0.5507068634033203

1
1 ====> Learn More About Managing A1C - Help Reduce Your Risk, 0.36981940269470215
3 ====> Learn More About Managing A1C - A Once-Daily Treatment Option, 0.3764190971851349
11 ====> Blood Sugar & Type 2 Diabetes - Get Helpful Tips & Resources, 0.5273933410644531
14 ====> Type 2 Diabetes Treatment - Official Patient Site, 0.5485913753509521
15 ====> What Foods Lower A1c Quickly - Keep Blood Sugar in Check, 0.550499677658081

2
0 ====> A1C Levels - Help Reduce Your Risk, 0.365219384431839
2 ====> A1C-Related Questions? - Learn More About Managing A1C, 0.37610498