Spaces:

tagirshin
/

VQGAE

Sleeping

App Files Files Community

tagirshin committed on Sep 26, 2023

Commit

1adc128

•

1 Parent(s): ca8cff5

first version of app

Browse files

Files changed (1) hide show

app.py +213 -131

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ import pandas as pd
 import pickle
 import pygad
-from tqdm.auto import tqdm
 from VQGAE.models import VQGAE, OrderingNetwork
 from CGRtools.containers import QueryContainer
 from VQGAE.utils import frag_counts_to_inds, restore_order, decode_molecules
@@ -55,44 +54,6 @@ def tanimoto_kernel(x, y):
     return result
-def rescoring(vqgae_latents):
-    frag_counts = np.array(vqgae_latents)
-    rf_scores = rf_model.predict_proba(frag_counts)[:, 1]
-    similarity_scores = tanimoto_kernel(frag_counts, X).max(-1)
-    frag_inds = frag_counts_to_inds(frag_counts, max_atoms=51)
-    _, ordering_scores = restore_order(frag_inds, ordering_model)
-    return rf_scores.tolist(), similarity_scores.tolist(), ordering_scores
-def fitness_func_batch(ga_instance, solutions, solutions_indices):
-    frag_counts = np.array(solutions)
-    # prediction of activity by random forest
-    rf_score = rf_model.predict_proba(frag_counts)[:, 1]
-    # size penalty if molecule too small
-    mol_size = frag_counts.sum(-1).astype(np.int64)
-    size_penalty = np.where(mol_size < 18, -1.0, 0.)
-    # adding dissimilarity so it generates different solutions
-    dissimilarity_score = 1 - tanimoto_kernel(frag_counts, X).max(-1)
-    dissimilarity_score += np.where(dissimilarity_score == 0, -5, 0)
-    # prediction of ordering score
-    frag_inds = frag_counts_to_inds(frag_counts, max_atoms=51)
-    _, ordering_scores = restore_order(frag_inds, ordering_model)
-    ordering_scores = np.array(ordering_scores)
-    # full fitness function
-    fitness = 0.5 * rf_score + 0.3 * dissimilarity_score + size_penalty + 0.2 * ordering_scores
-    return fitness.tolist()
-def on_generation_progress(ga):
-    pbar.update(1)
 @st.cache_data
 def load_data(batch_size):
     X = np.load("saved_model/tubulin_qsar_class_train_data_vqgae.npz")["x"]
@@ -122,97 +83,218 @@ st.title('Inverse QSAR of Tubulin inhibitors in colchicine site with VQGAE')
 data_load_state = st.text('Loading data...')
 batch_size = 500
 X, Y, rf_model, vqgae_model, ordering_model = load_data(batch_size)
 data_load_state.text("Done! (using st.cache_data)")
-# initial_pop = X
-#
-# num_parents_mating = int(initial_pop.shape[0] * 0.33 // 10 * 10)
-# keep_parents = int(num_parents_mating * 0.66 // 10 * 10)
-# print(num_parents_mating, keep_parents)
-#
-# num_generations = 30
-# with tqdm(total=num_generations) as pbar:
-#     ga_instance = pygad.GA(
-#         fitness_func=fitness_func_batch,
-#         on_generation=on_generation_progress,
-#         initial_population=initial_pop,
-#         num_genes=initial_pop.shape[-1],
-#         fitness_batch_size=batch_size,
-#         num_generations=num_generations,
-#         num_parents_mating=num_parents_mating,
-#         parent_selection_type="rws",
-#         crossover_type="single_point",
-#         mutation_type="adaptive",
-#         mutation_percent_genes=[10, 5],
-#         # https://pygad.readthedocs.io/en/latest/pygad.html#use-adaptive-mutation-in-pygad
-#         save_best_solutions=False,
-#         save_solutions=True,
-#         keep_elitism=0,  # turn it off to make keep_parents work
-#         keep_parents=keep_parents,  # 2/3 of num_parents_mating
-#         # parallel_processing=['process', 5],
-#         suppress_warnings=True,
-#         random_seed=42,
-#         gene_type=int
-#     )
-#     ga_instance.run()
-#
-# solutions = ga_instance.solutions
-# solutions = list(set(tuple(s) for s in solutions))
-# print(len(solutions))
-#
-# scores = {"rf_score": [], "similarity_score": [], "ordering_score": []}
-# for i in tqdm(range(len(solutions) // 100 + 1)):
-#     solution = solutions[i * 100: (i + 1) * 100]
-#     rf_score, similarity_score, ordering_score = rescoring(solution)
-#     scores["rf_score"].extend(rf_score)
-#     scores["similarity_score"].extend(similarity_score)
-#     scores["ordering_score"].extend(ordering_score)
-#
-# sc_df = pd.DataFrame(scores)
-#
-# chosen_gen = sc_df[(sc_df["similarity_score"] < 0.95) & (sc_df["rf_score"] > 0.5) & (sc_df["ordering_score"] > 0.7)]
-#
-# chosen_ids = chosen_gen.index.to_list()
-# chosen_solutions = np.array([solutions[ind] for ind in chosen_ids])
-# gen_frag_inds = frag_counts_to_inds(chosen_solutions, max_atoms=51)
-#
-# gen_molecules = []
-# results = {"score": [], "valid": []}
-# for i in tqdm(range(gen_frag_inds.shape[0] // batch_size + 1)):
-#     inputs = gen_frag_inds[i * batch_size: (i + 1) * batch_size]
-#     canon_order_inds, scores = restore_order(
-#         frag_inds=inputs,
-#         ordering_model=ordering_model,
-#     )
-#     molecules, validity = decode_molecules(
-#         ordered_frag_inds=canon_order_inds,
-#         vqgae_model=vqgae_model
-#     )
-#     gen_molecules.extend(molecules)
-#     results["score"].extend(scores)
-#     results["valid"].extend([1 if i else 0 for i in validity])
-#
-# gen_stats = pd.DataFrame(results)
-# full_stats = pd.concat([chosen_gen.reset_index(), gen_stats], axis=1, ignore_index=False)
-# valid_gen_stats = full_stats[full_stats.valid == 1]
-# valid_gen_mols = []
-# for i, record in zip(list(valid_gen_stats.index), valid_gen_stats.to_dict("records")):
-#     mol = gen_molecules[i]
-#     mol.meta.update({
-#         "rf_score": record["rf_score"],
-#         "similarity_score": record["similarity_score"],
-#         "ordering_score": record["ordering_score"],
-#     })
-#     valid_gen_mols.append(mol)
-#
-# filtered_gen_mols = []
-# for mol in valid_gen_mols:
-#     is_frag = allene < mol or peroxide_charge < mol or peroxide < mol
-#     is_macro = False
-#     for ring in mol.sssr:
-#         if len(ring) > 8 or len(ring) < 4:
-#             is_macro = True
-#             break
-#     if not is_frag and not is_macro:
-#         filtered_gen_mols.append(mol)

 import pickle
 import pygad
 from VQGAE.models import VQGAE, OrderingNetwork
 from CGRtools.containers import QueryContainer
 from VQGAE.utils import frag_counts_to_inds, restore_order, decode_molecules
     return result
 @st.cache_data
 def load_data(batch_size):
     X = np.load("saved_model/tubulin_qsar_class_train_data_vqgae.npz")["x"]
 data_load_state = st.text('Loading data...')
 batch_size = 500
 X, Y, rf_model, vqgae_model, ordering_model = load_data(batch_size)
 data_load_state.text("Done! (using st.cache_data)")
+# How many generations the genetic algorithm runs for (user-adjustable).
+num_generations = st.slider(
+    label='Number of generations for GA',
+    min_value=3,
+    max_value=40,
+    value=5
+)
+# Human-readable parent-selection names mapped to the short codes later
+# handed to pygad.GA(parent_selection_type=...).
+parent_selection_translator = {
+    "Steady-state selection": "sss",
+    "Roulette wheel selection": "rws",
+    "Stochastic universal selection": "sus",
+    "Rank selection": "rank",
+    "Random selection": "random",
+    "Tournament selection": "tournament",
+}
+# The select box offers the dictionary keys in declaration order; index=1
+# preselects "Roulette wheel selection". The chosen label is translated
+# straight into its short code.
+parent_selection_type = parent_selection_translator[
+    st.selectbox(
+        label='Parent selection type',
+        options=tuple(parent_selection_translator),
+        index=1
+    )
+]
+# Same pattern for the crossover operator: labels shown to the user, codes
+# later handed to pygad.GA(crossover_type=...).
+crossover_translator = {
+    "Single point": "single_point",
+    "Two points": "two_points",
+}
+crossover_type = crossover_translator[
+    st.selectbox(
+        label='Crossover type',
+        options=tuple(crossover_translator),
+        index=0
+    )
+]
+# Number of solutions selected as parents in each generation. The default is
+# about a third of the initial population X, rounded down to a multiple of 10.
+# (The label previously duplicated the "Number of generations for GA" text of
+# the slider above, which made the sliders indistinguishable in the UI.)
+num_parents_mating = st.slider(
+    'Number of parents mating',
+    min_value=1,
+    max_value=X.shape[0],
+    value=int(X.shape[0] * 0.33 // 10 * 10)
+)
+# Number of parents carried over into the next population; it cannot exceed
+# the number of mating parents chosen above.
+keep_parents = st.slider(
+    'Number of parents to keep',
+    min_value=1,
+    max_value=num_parents_mating,
+    value=int(num_parents_mating * 0.66 // 10 * 10) # 2/3 of num_parents_mating
+)
+# When enabled, the ordering-model score contributes to the fitness function
+# and is used as an additional filter on the rescored solutions.
+use_ordering_score = st.toggle('Use ordering score', value=True)
+# Seed forwarded to pygad.GA(random_seed=...); cast to int after input.
+random_seed = int(st.number_input("Random seed", value=42, placeholder="Type a number..."))
+def fitness_func_batch(ga_instance, solutions, solutions_indices):
+    """Score a batch of GA solutions (batch fitness callback for pygad).
+
+    The fitness is a weighted sum of:
+      * 0.5 * random-forest probability of the positive class,
+      * 0.3 * dissimilarity to the training set X (1 - max Tanimoto), with an
+        extra -5 for solutions whose dissimilarity is exactly 0,
+      * a -1.0 penalty when the fragment counts sum to fewer than 18,
+      * 0.2 * ordering score, only when ``use_ordering_score`` is enabled.
+
+    ``ga_instance`` and ``solutions_indices`` are part of the callback
+    signature but are not used here.
+
+    Returns a plain list with one fitness value per solution.
+    """
+    counts = np.array(solutions)
+
+    # Activity predicted by the random forest (probability of class 1).
+    activity = rf_model.predict_proba(counts)[:, 1]
+
+    # Penalise molecules that are too small.
+    too_small = counts.sum(-1).astype(np.int64) < 18
+    size_penalty = np.where(too_small, -1.0, 0.)
+
+    # Reward novelty with respect to the training data so the GA explores
+    # different solutions; exact copies (dissimilarity == 0) are punished.
+    novelty = 1 - tanimoto_kernel(counts, X).max(-1)
+    novelty += np.where(novelty == 0, -5, 0)
+
+    fitness = 0.5 * activity + 0.3 * novelty + size_penalty
+
+    if not use_ordering_score:
+        return fitness.tolist()
+
+    # Optional ordering-model term.
+    inds = frag_counts_to_inds(counts, max_atoms=51)
+    _, order_scores = restore_order(inds, ordering_model)
+    fitness += 0.2 * np.array(order_scores)
+    return fitness.tolist()
+def on_generation_progress(ga):
+    """Advance the GA progress bar by one generation (pygad on_generation callback).
+
+    Increments the module-level ``ga_progress`` counter and redraws ``ga_bar``
+    with the completed percentage. ``ga`` is the running GA instance supplied
+    by pygad; it is not used here.
+    """
+    global ga_progress
+    ga_progress = ga_progress + 1
+    # Multiply before the floor division: ``ga_progress // num_generations * 100``
+    # is 0 for every generation but the last, so the bar never moved until the
+    # run was finished. Clamp to 100, the upper bound accepted by st.progress.
+    percent = min(100, ga_progress * 100 // num_generations)
+    ga_bar.progress(percent, text=ga_progress_text)
+if st.button("Start optimisation"):
+    # --- 1. Genetic optimisation -------------------------------------------
+    # The training latents X serve as the initial population.
+    ga_instance = pygad.GA(
+        fitness_func=fitness_func_batch,
+        on_generation=on_generation_progress,
+        initial_population=X,
+        num_genes=X.shape[-1],
+        fitness_batch_size=batch_size,
+        num_generations=num_generations,
+        num_parents_mating=num_parents_mating,
+        parent_selection_type=parent_selection_type,
+        crossover_type=crossover_type,
+        mutation_type="adaptive",
+        mutation_percent_genes=[10, 5],
+        # https://pygad.readthedocs.io/en/latest/pygad.html#use-adaptive-mutation-in-pygad
+        save_best_solutions=False,
+        save_solutions=True,
+        keep_elitism=0,  # turn it off to make keep_parents work
+        keep_parents=keep_parents,
+        suppress_warnings=True,
+        random_seed=random_seed,
+        gene_type=int
+    )
+    # State read and updated by on_generation_progress while the GA runs.
+    ga_progress = 0
+    ga_progress_text = "Genetic optimisation in progress. Please wait."
+    ga_bar = st.progress(0, text=ga_progress_text)
+    ga_instance.run()
+
+    # --- 2. Deduplicate every solution visited by the GA --------------------
+    with st.spinner('Getting unique solutions'):
+        unique_solutions = list(set(tuple(s) for s in ga_instance.solutions))
+    st.success(f'{len(unique_solutions)} solutions were obtained')
+
+    # --- 3. Rescore the unique solutions in batches -------------------------
+    scores = {
+        "rf_score": [],
+        "similarity_score": []
+    }
+    if use_ordering_score:
+        scores["ordering_score"] = []
+    rescoring_progress_text = "Rescoring obtained solutions"
+    rescoring_bar = st.progress(0, text=rescoring_progress_text)
+    # Ceiling division: ``len // batch_size + 1`` produced an empty trailing
+    # batch whenever the length was an exact multiple of batch_size, and an
+    # empty array cannot be scored by the models.
+    total_rescoring_steps = (len(unique_solutions) + batch_size - 1) // batch_size
+    for i in range(total_rescoring_steps):
+        vqgae_latents = unique_solutions[i * batch_size: (i + 1) * batch_size]
+        frag_counts = np.array(vqgae_latents)
+        rf_scores = rf_model.predict_proba(frag_counts)[:, 1]
+        similarity_scores = tanimoto_kernel(frag_counts, X).max(-1)
+        scores["rf_score"].extend(rf_scores.tolist())
+        scores["similarity_score"].extend(similarity_scores.tolist())
+        if use_ordering_score:
+            frag_inds = frag_counts_to_inds(frag_counts, max_atoms=51)
+            _, ordering_scores = restore_order(frag_inds, ordering_model)
+            scores["ordering_score"].extend(ordering_scores)
+        # Multiply before dividing; ``i // total * 100`` is always 0.
+        rescoring_bar.progress(
+            (i + 1) * 100 // total_rescoring_steps,
+            text=rescoring_progress_text
+        )
+    sc_df = pd.DataFrame(scores)
+
+    # --- 4. Keep novel, predicted-active (and well-ordered) solutions -------
+    selection_mask = (sc_df["similarity_score"] < 0.95) & (sc_df["rf_score"] > 0.5)
+    if use_ordering_score:
+        selection_mask = selection_mask & (sc_df["ordering_score"] > 0.7)
+    chosen_gen = sc_df[selection_mask]
+    chosen_ids = chosen_gen.index.to_list()
+    chosen_solutions = np.array([unique_solutions[ind] for ind in chosen_ids])
+    gen_frag_inds = frag_counts_to_inds(chosen_solutions, max_atoms=51)
+    st.info(f'The number of chosen solutions is {gen_frag_inds.shape[0]}', icon="ℹ️")
+
+    # --- 5. Decode the chosen solutions into molecules ----------------------
+    gen_molecules = []
+    results = {"smiles": [], "ordering_score": [], "validity": []}
+    decoding_progress_text = "Decoding chosen solutions"
+    decoding_bar = st.progress(0, text=decoding_progress_text)
+    # Ceiling division again, so no empty batch is sent to the models.
+    total_decoding_steps = (gen_frag_inds.shape[0] + batch_size - 1) // batch_size
+    for i in range(total_decoding_steps):
+        inputs = gen_frag_inds[i * batch_size: (i + 1) * batch_size]
+        # Named batch_ordering_scores so the ``scores`` dict above is not shadowed.
+        canon_order_inds, batch_ordering_scores = restore_order(
+            frag_inds=inputs,
+            ordering_model=ordering_model,
+        )
+        molecules, validity = decode_molecules(
+            ordered_frag_inds=canon_order_inds,
+            vqgae_model=vqgae_model
+        )
+        gen_molecules.extend(molecules)
+        results["smiles"].extend([str(molecule) for molecule in molecules])
+        results["ordering_score"].extend(batch_ordering_scores)
+        results["validity"].extend([1 if is_valid else 0 for is_valid in validity])
+        # Use the decoding caption here (the rescoring caption was shown before).
+        decoding_bar.progress(
+            (i + 1) * 100 // total_decoding_steps,
+            text=decoding_progress_text
+        )
+
+    # --- 6. Show decoding results next to the rescoring statistics ----------
+    gen_stats = pd.DataFrame(results)
+    full_stats = pd.concat([gen_stats, chosen_gen[["similarity_score", "rf_score"]].reset_index(), ], axis=1, ignore_index=False)
+    st.dataframe(full_stats)
+    # valid_gen_stats = full_stats[full_stats.valid == 1]
+    #
+    # valid_gen_mols = []
+    # for i, record in zip(list(valid_gen_stats.index), valid_gen_stats.to_dict("records")):
+    #     mol = gen_molecules[i]
+    #     valid_gen_mols.append(mol)
+    #
+    # filtered_gen_mols = []
+    # for mol in valid_gen_mols:
+    #     is_frag = allene < mol or peroxide_charge < mol or peroxide < mol
+    #     is_macro = False
+    #     for ring in mol.sssr:
+    #         if len(ring) > 8 or len(ring) < 4:
+    #             is_macro = True
+    #             break
+    #     if not is_frag and not is_macro:
+    #         filtered_gen_mols.append(mol)