Spaces:

tagirshin
/

VQGAE

Sleeping

App Files Community

tagirshin committed on Sep 26, 2023

Commit

3ae9d07

•

1 Parent(s): 39706b3

added form to keep results

Browse files

Files changed (1) hide show

app.py +178 -176

app.py CHANGED Viewed

@@ -56,7 +56,8 @@ def tanimoto_kernel(x, y):
 def fitness_func_batch(ga_instance, solutions, solutions_indices):
     frag_counts = np.array(solutions)
-    st.write(frag_counts.shape)
     # prediction of activity by random forest
     rf_score = rf_model.predict_proba(frag_counts)[:, 1]
@@ -84,6 +85,7 @@ def fitness_func_batch(ga_instance, solutions, solutions_indices):
 def on_generation_progress(ga):
     global ga_progress
     ga_progress = ga_progress + 1
     ga_bar.progress(ga_progress // num_generations * 100, text=ga_progress_text)
@@ -119,184 +121,184 @@ X, Y, rf_model, vqgae_model, ordering_model = load_data(batch_size)
 assert X.shape == (603, 4096)
 with st.sidebar:
-    num_generations = st.slider(
-        'Number of generations for GA',
-        min_value=3,
-        max_value=40,
-        value=5
-    )
-    parent_selection_type = st.selectbox(
-        label='Parent selection type',
-        options=(
-            'Steady-state selection',
-            'Roulette wheel selection',
-            'Stochastic universal selection',
-            'Rank selection',
-            'Random selection',
-            'Tournament selection'
-        ),
-        index=1
-    )
-    parent_selection_translator = {
-        "Steady-state selection": "sss",
-        "Roulette wheel selection": "rws",
-        "Stochastic universal selection": "sus",
-        "Rank selection": "rank",
-        "Random selection": "random",
-        "Tournament selection": "tournament",
-    }
-    parent_selection_type = parent_selection_translator[parent_selection_type]
-    crossover_type = st.selectbox(
-        label='Crossover type',
-        options=(
-            'Single point',
-            'Two points',
-        ),
-        index=0
-    )
-    crossover_translator = {
-        "Single point": "single_point",
-        "Two points": "two_points",
-    }
-    crossover_type = crossover_translator[crossover_type]
-    num_parents_mating = st.slider(
-        'Number of parents mating',
-        min_value=1,
-        max_value=X.shape[0],
-        value=int(X.shape[0] * 0.33 // 10 * 10)
-    )
-    keep_parents = st.slider(
-        'Number of parents kept',
-        min_value=1,
-        max_value=num_parents_mating,
-        value=int(num_parents_mating * 0.66 // 10 * 10) # 2/3 of num_parents_mating
     )
-    use_ordering_score = st.toggle('Use ordering score', value=True)
-    random_seed = int(st.number_input("Random seed", value=42, placeholder="Type a number..."))
-    start_optimisation = st.button("Start optimisation")
-if start_optimisation:
-    ga_instance = pygad.GA(
-        fitness_func=fitness_func_batch,
-        on_generation=on_generation_progress,
-        initial_population=X,
-        num_genes=X.shape[-1],
-        fitness_batch_size=batch_size,
-        num_generations=num_generations,
-        num_parents_mating=num_parents_mating,
-        parent_selection_type=parent_selection_type,
-        crossover_type=crossover_type,
-        mutation_type="adaptive",
-        mutation_percent_genes=[10, 5],
-        # https://pygad.readthedocs.io/en/latest/pygad.html#use-adaptive-mutation-in-pygad
-        save_best_solutions=False,
-        save_solutions=True,
-        keep_elitism=0,  # turn it off to make keep_parents work
-        keep_parents=keep_parents,
-        suppress_warnings=True,
-        random_seed=random_seed,
-        gene_type=int
     )
-    ga_progress = 0
-    ga_progress_text = "Genetic optimisation in progress. Please wait."
-    ga_bar = st.progress(ga_progress // num_generations * 100, text=ga_progress_text)
-    ga_instance.run()
-    with st.spinner('Getting unique solutions'):
-        unique_solutions = list(set(tuple(s) for s in ga_instance.solutions))
-    st.success(f'{len(unique_solutions)} solutions were obtained')
-    scores = {
-        "rf_score": [],
-        "similarity_score": []
-    }
-    if use_ordering_score:
-        scores["ordering_score"] = []
-    rescoring_progress = 0
-    rescoring_progress_text = "Rescoring obtained solutions"
-    rescoring_bar = st.progress(0, text=rescoring_progress_text)
-    total_rescoring_steps = len(unique_solutions) // batch_size + 1
-    for i in range(total_rescoring_steps):
-        vqgae_latents = unique_solutions[i * batch_size: (i + 1) * batch_size]
-        frag_counts = np.array(vqgae_latents)
-        rf_scores = rf_model.predict_proba(frag_counts)[:, 1]
-        similarity_scores = tanimoto_kernel(frag_counts, X).max(-1)
-        scores["rf_score"].extend(rf_scores.tolist())
-        scores["similarity_score"].extend(similarity_scores.tolist())
-        if use_ordering_score:
-            frag_inds = frag_counts_to_inds(frag_counts, max_atoms=51)
-            _, ordering_scores = restore_order(frag_inds, ordering_model)
-            scores["ordering_score"].extend(ordering_scores)
-        rescoring_bar.progress(i // total_rescoring_steps * 100, text=rescoring_progress_text)
-    sc_df = pd.DataFrame(scores)
-    if use_ordering_score:
-        chosen_gen = sc_df[(sc_df["similarity_score"] < 0.95) & (sc_df["rf_score"] > 0.5) & (sc_df["ordering_score"] > 0.7)]
-    else:
-        chosen_gen = sc_df[
-            (sc_df["similarity_score"] < 0.95) & (sc_df["rf_score"] > 0.5)]
-    chosen_ids = chosen_gen.index.to_list()
-    chosen_solutions = np.array([unique_solutions[ind] for ind in chosen_ids])
-    gen_frag_inds = frag_counts_to_inds(chosen_solutions, max_atoms=51)
-    st.info(f'The number of chosen solutions is {gen_frag_inds.shape[0]}', icon="ℹ️")
-    gen_molecules = []
-    results = {"smiles": [], "ordering_score": [], "validity": []}
-    decoding_progress = 0
-    decoding_progress_text = "Decoding chosen solutions"
-    decoding_bar = st.progress(0, text=decoding_progress_text)
-    total_decoding_steps = gen_frag_inds.shape[0] // batch_size + 1
-    for i in range(total_decoding_steps):
-        inputs = gen_frag_inds[i * batch_size: (i + 1) * batch_size]
-        canon_order_inds, scores = restore_order(
-            frag_inds=inputs,
-            ordering_model=ordering_model,
-        )
-        molecules, validity = decode_molecules(
-            ordered_frag_inds=canon_order_inds,
-            vqgae_model=vqgae_model
-        )
-        gen_molecules.extend(molecules)
-        results["smiles"].extend([str(molecule) for molecule in molecules])
-        results["ordering_score"].extend(scores)
-        results["validity"].extend([1 if i else 0 for i in validity])
-        decoding_bar.progress(i // total_decoding_steps * 100, text=rescoring_progress_text)
-    gen_stats = pd.DataFrame(results)
-    full_stats = pd.concat([gen_stats, chosen_gen[["similarity_score", "rf_score"]].reset_index(), ], axis=1, ignore_index=False)
-    st.dataframe(full_stats)
-    # valid_gen_stats = full_stats[full_stats.valid == 1]
-    #
-    # valid_gen_mols = []
-    # for i, record in zip(list(valid_gen_stats.index), valid_gen_stats.to_dict("records")):
-    #     mol = gen_molecules[i]
-    #     valid_gen_mols.append(mol)
-    #
-    # filtered_gen_mols = []
-    # for mol in valid_gen_mols:
-    #     is_frag = allene < mol or peroxide_charge < mol or peroxide < mol
-    #     is_macro = False
-    #     for ring in mol.sssr:
-    #         if len(ring) > 8 or len(ring) < 4:
-    #             is_macro = True
-    #             break
-    #     if not is_frag and not is_macro:
-    #         filtered_gen_mols.append(mol)

 def fitness_func_batch(ga_instance, solutions, solutions_indices):
     frag_counts = np.array(solutions)
+    if len(frag_counts.shape) == 1:
+        frag_counts = frag_counts[np.newaxis, :]
     # prediction of activity by random forest
     rf_score = rf_model.predict_proba(frag_counts)[:, 1]
 def on_generation_progress(ga):
     global ga_progress
+    global ga_bar
     ga_progress = ga_progress + 1
     ga_bar.progress(ga_progress // num_generations * 100, text=ga_progress_text)
 assert X.shape == (603, 4096)
 with st.sidebar:
+    with st.form("my_form"):
+        num_generations = st.slider(
+            'Number of generations for GA',
+            min_value=3,
+            max_value=40,
+            value=5
+        )
+        parent_selection_type = st.selectbox(
+            label='Parent selection type',
+            options=(
+                'Steady-state selection',
+                'Roulette wheel selection',
+                'Stochastic universal selection',
+                'Rank selection',
+                'Random selection',
+                'Tournament selection'
+            ),
+            index=1
+        )
+        parent_selection_translator = {
+            "Steady-state selection": "sss",
+            "Roulette wheel selection": "rws",
+            "Stochastic universal selection": "sus",
+            "Rank selection": "rank",
+            "Random selection": "random",
+            "Tournament selection": "tournament",
+        }
+        parent_selection_type = parent_selection_translator[parent_selection_type]
+        crossover_type = st.selectbox(
+            label='Crossover type',
+            options=(
+                'Single point',
+                'Two points',
+            ),
+            index=0
+        )
+        crossover_translator = {
+            "Single point": "single_point",
+            "Two points": "two_points",
+        }
+        crossover_type = crossover_translator[crossover_type]
+        num_parents_mating = st.slider(
+            'Pecentage of parents mating taken from initial population',
+            min_value=0,
+            max_value=X.shape[0],
+            step=0.01,
+            value=0.33,
+        ) * X.shape[0] * 10 // 10
+        keep_parents = st.slider(
+            'Percentage of parents kept taken from number of parents mating',
+            min_value=1,
+            max_value=num_parents_mating,
+            value=int(num_parents_mating * 0.66 // 10 * 10) # 2/3 of num_parents_mating
+        )
+        use_ordering_score = st.toggle('Use ordering score', value=True)
+        random_seed = int(st.number_input("Random seed", value=42, placeholder="Type a number..."))
+        st.form_submit_button('Start optimisation')
+ga_instance = pygad.GA(
+    fitness_func=fitness_func_batch,
+    on_generation=on_generation_progress,
+    initial_population=X,
+    num_genes=X.shape[-1],
+    fitness_batch_size=batch_size,
+    num_generations=num_generations,
+    num_parents_mating=num_parents_mating,
+    parent_selection_type=parent_selection_type,
+    crossover_type=crossover_type,
+    mutation_type="adaptive",
+    mutation_percent_genes=[10, 5],
+    # https://pygad.readthedocs.io/en/latest/pygad.html#use-adaptive-mutation-in-pygad
+    save_best_solutions=False,
+    save_solutions=True,
+    keep_elitism=0,  # turn it off to make keep_parents work
+    keep_parents=keep_parents,
+    suppress_warnings=True,
+    random_seed=random_seed,
+    gene_type=int
+)
+ga_progress = 0
+ga_progress_text = "Genetic optimisation in progress. Please wait."
+ga_bar = st.progress(ga_progress // num_generations * 100, text=ga_progress_text)
+ga_instance.run()
+with st.spinner('Getting unique solutions'):
+    unique_solutions = list(set(tuple(s) for s in ga_instance.solutions))
+st.success(f'{len(unique_solutions)} solutions were obtained')
+scores = {
+    "rf_score": [],
+    "similarity_score": []
+}
+if use_ordering_score:
+    scores["ordering_score"] = []
+rescoring_progress = 0
+rescoring_progress_text = "Rescoring obtained solutions"
+rescoring_bar = st.progress(0, text=rescoring_progress_text)
+total_rescoring_steps = len(unique_solutions) // batch_size + 1
+for i in range(total_rescoring_steps):
+    vqgae_latents = unique_solutions[i * batch_size: (i + 1) * batch_size]
+    frag_counts = np.array(vqgae_latents)
+    rf_scores = rf_model.predict_proba(frag_counts)[:, 1]
+    similarity_scores = tanimoto_kernel(frag_counts, X).max(-1)
+    scores["rf_score"].extend(rf_scores.tolist())
+    scores["similarity_score"].extend(similarity_scores.tolist())
+    if use_ordering_score:
+        frag_inds = frag_counts_to_inds(frag_counts, max_atoms=51)
+        _, ordering_scores = restore_order(frag_inds, ordering_model)
+        scores["ordering_score"].extend(ordering_scores)
+    rescoring_bar.progress(i // total_rescoring_steps * 100, text=rescoring_progress_text)
+sc_df = pd.DataFrame(scores)
+if use_ordering_score:
+    chosen_gen = sc_df[(sc_df["similarity_score"] < 0.95) & (sc_df["rf_score"] > 0.5) & (sc_df["ordering_score"] > 0.7)]
+else:
+    chosen_gen = sc_df[
+        (sc_df["similarity_score"] < 0.95) & (sc_df["rf_score"] > 0.5)]
+chosen_ids = chosen_gen.index.to_list()
+chosen_solutions = np.array([unique_solutions[ind] for ind in chosen_ids])
+gen_frag_inds = frag_counts_to_inds(chosen_solutions, max_atoms=51)
+st.info(f'The number of chosen solutions is {gen_frag_inds.shape[0]}', icon="ℹ️")
+gen_molecules = []
+results = {"smiles": [], "ordering_score": [], "validity": []}
+decoding_progress = 0
+decoding_progress_text = "Decoding chosen solutions"
+decoding_bar = st.progress(0, text=decoding_progress_text)
+total_decoding_steps = gen_frag_inds.shape[0] // batch_size + 1
+for i in range(total_decoding_steps):
+    inputs = gen_frag_inds[i * batch_size: (i + 1) * batch_size]
+    canon_order_inds, scores = restore_order(
+        frag_inds=inputs,
+        ordering_model=ordering_model,
     )
+    molecules, validity = decode_molecules(
+        ordered_frag_inds=canon_order_inds,
+        vqgae_model=vqgae_model
     )
+    gen_molecules.extend(molecules)
+    results["smiles"].extend([str(molecule) for molecule in molecules])
+    results["ordering_score"].extend(scores)
+    results["validity"].extend([1 if i else 0 for i in validity])
+    decoding_bar.progress(i // total_decoding_steps * 100, text=rescoring_progress_text)
+gen_stats = pd.DataFrame(results)
+full_stats = pd.concat([gen_stats, chosen_gen[["similarity_score", "rf_score"]].reset_index(), ], axis=1, ignore_index=False)
+st.dataframe(full_stats)
+# valid_gen_stats = full_stats[full_stats.valid == 1]
+#
+# valid_gen_mols = []
+# for i, record in zip(list(valid_gen_stats.index), valid_gen_stats.to_dict("records")):
+#     mol = gen_molecules[i]
+#     valid_gen_mols.append(mol)
+#
+# filtered_gen_mols = []
+# for mol in valid_gen_mols:
+#     is_frag = allene < mol or peroxide_charge < mol or peroxide < mol
+#     is_macro = False
+#     for ring in mol.sssr:
+#         if len(ring) > 8 or len(ring) < 4:
+#             is_macro = True
+#             break
+#     if not is_frag and not is_macro:
+#         filtered_gen_mols.append(mol)