File size: 6,251 Bytes
20a9f7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e500c79
20a9f7a
 
 
 
 
 
 
 
 
 
e500c79
20a9f7a
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import time
import numpy as np
import gradio as gr
import pandas as pd
import torch
from pathlib import Path
from Bio import SeqIO
from tqdm.auto import tqdm
from uuid import uuid4
from tempfile import TemporaryDirectory
from torch.utils.data import DataLoader
from pathvalidate import sanitize_filename


from conplex_dti.featurizer import MorganFeaturizer, ProtBertFeaturizer
from conplex_dti.model.architectures import SimpleCoembeddingNoSigmoid

# Theme name handed to gr.Interface further down in this file.
theme = "Default"

# Page heading and introductory blurb for the demo. The gr.Interface call in
# this file currently has its title/description arguments commented out.
title = "ConPLex: Predicting Drug-Target Interactions"
description = (
    "\n"
    "If you use this interface to make predictions, please let us know (by emailing samsl@mit.edu)!\n"
    "We want to keep this web version free to use with GPU support, and to do that we need to demonstrate to\n"
    "our funders that it is being used. Thank you! \n"
)

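# NOTE: the commented-out article below describes D-SCRIPT / Topsy-Turvy rather than ConPLex; it is disabled.
# article = """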

# <hr>

# <img style="margin-left:auto; margin-right:auto" src="https://raw.githubusercontent.com/samsledje/D-SCRIPT/main/docs/source/img/dscript_architecture.png" alt="D-SCRIPT architecture" width="70%"/>

# <hr>

# D-SCRIPT is a deep learning method for predicting a physical interaction between two proteins given just their sequences.
# It generalizes well to new species and is robust to limitations in training data size. Its design reflects the intuition that for two proteins to physically interact, 
# a subset of amino acids from each protein should be in contact with the other. The intermediate stages of D-SCRIPT directly implement this intuition, with the penultimate stage 
# in D-SCRIPT being a rough estimate of the inter-protein contact map of the protein dimer. This structurally-motivated design enhances the interpretability of the results and, 
# since structure is more conserved evolutionarily than sequence, improves generalizability across species.

# <hr>

# Computational methods to predict protein-protein interaction (PPI) typically segregate into sequence-based "bottom-up" methods that infer properties from the characteristics of the 
# individual protein sequences, or global "top-down" methods that infer properties from the pattern of already known PPIs in the species of interest. However, a way to incorporate 
# top-down insights into sequence-based bottom-up PPI prediction methods has been elusive. Topsy-Turvy builds upon D-SCRIPT by synthesizing both views in a sequence-based, 
# multi-scale, deep-learning model for PPI prediction. While Topsy-Turvy makes predictions using only sequence data, during the training phase it takes a transfer-learning approach by 
# incorporating patterns from both global and molecular-level views of protein interaction. In a cross-species context, we show it achieves state-of-the-art performance, offering the 
# ability to perform genome-scale, interpretable PPI prediction for non-model organisms with no existing experimental PPI data.


# """

# Footer text explaining the expected layout of the uploaded pairs file.
# The \t escapes are real tab characters in the resulting string.
article = (
    "\n"
    "The pairs file should be a tab-separated values file where each row is a candidate pair, "
    "formatted as `[protein ID]\t[molecule ID]\t[protein Sequence]\t[molecule SMILES]`\n"
)

def predict(run_name, model_name, csv_file, progress = gr.Progress()):
    """Score every protein / small-molecule pair in an uploaded TSV file.

    Args:
        run_name: Free-text label for the run; it is sanitised and embedded
            in the name of the output file.
        model_name: Key into ``MODEL_MAP`` selecting the checkpoint to load.
        csv_file: Uploaded file object whose ``.name`` attribute is read as a
            path. The file is parsed as header-less, tab-separated data with
            the columns protein ID, molecule ID, protein sequence and
            molecule SMILES (in that order).
        progress: Gradio progress tracker used to report per-batch progress.

    Returns:
        Path of the tab-separated results file written under ``/tmp``, with
        the columns ``Protein``, ``Small Molecule`` and
        ``Predicted Interaction``.

    Raises:
        gr.Error: On any failure (missing upload, unknown model, empty input,
            or an error during featurisation / inference), so that the
            message is surfaced to the user instead of being dropped.
    """

    MODEL_MAP = {
        "ConPLex_V1_BindingDB": "./models/conplex_v1_bindingdb.pt",
    }

    try:
        # Fail early with readable messages for the obvious input mistakes.
        if csv_file is None:
            raise gr.Error("Please upload a pairs (.tsv) file.")
        if model_name not in MODEL_MAP:
            raise gr.Error(f"Unknown model: {model_name}")

        # tmpdir is handed to the featurizers as their save_dir; it is removed
        # when the with-block exits, which is why the results file is written
        # to /tmp rather than inside it.
        with TemporaryDirectory() as tmpdir:
            run_id = uuid4()
            run_name = sanitize_filename(run_name)

            device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

            gr.Info("Loading data...")
            query_df = pd.read_csv(
                csv_file.name,
                sep="\t",
                names=["proteinID", "moleculeID", "proteinSequence", "moleculeSmiles"],
            )
            if query_df.empty:
                # Without this guard np.concatenate below fails on an empty list.
                raise gr.Error("The pairs file contains no rows.")

            # Loading model
            gr.Info("Loading model -- this may take a while, as the ProtBert language model must be downloaded...")
            target_featurizer = ProtBertFeaturizer(
                save_dir=tmpdir, per_tok=False
            ).to(device)
            drug_featurizer = MorganFeaturizer(save_dir=tmpdir).to(device)

            # Featurise each distinct molecule / sequence once up front.
            gr.Info("Preloading embeddings...")
            drug_featurizer.preload(query_df["moleculeSmiles"].unique())
            target_featurizer.preload(query_df["proteinSequence"].unique())

            model = SimpleCoembeddingNoSigmoid(
                drug_featurizer.shape, target_featurizer.shape, 1024
            )

            model.load_state_dict(torch.load(MODEL_MAP[model_name], map_location=device))
            model = model.eval()
            model = model.to(device)

            dt_feature_pairs = [
                (drug_featurizer(r["moleculeSmiles"]), target_featurizer(r["proteinSequence"]))
                for _, r in query_df.iterrows()
            ]
            # shuffle=False keeps predictions aligned with the rows of query_df.
            dloader = DataLoader(dt_feature_pairs, batch_size=1024, shuffle=False)

            progress(0, desc="Starting...")
            preds = []
            # Inference only: skip autograd bookkeeping.
            with torch.no_grad():
                for b in progress.tqdm(dloader):
                    batch_scores = model(b[0], b[1]).detach().cpu().numpy()
                    # NOTE(review): guards against a 0-d output for a batch of
                    # one pair (if the model squeezes its result), which
                    # np.concatenate would reject -- confirm against the model.
                    preds.append(np.atleast_1d(batch_scores))

            preds = np.concatenate(preds)

            # Column order must match the labels assigned below:
            # proteinID -> 'Protein', moleculeID -> 'Small Molecule'.
            results = query_df[["proteinID", "moleculeID"]].copy()
            results["Prediction"] = preds
            results.columns = ['Protein', 'Small Molecule', 'Predicted Interaction']

            file_path = f"/tmp/conplex_{run_name}_{run_id}.tsv"
            results.to_csv(file_path, sep="\t", index=False, header=True)

            return file_path

    except gr.Error:
        # Already a user-facing error; let it through unchanged.
        raise
    except Exception as e:
        # gr.Error only reaches the UI when it is raised, not merely constructed.
        raise gr.Error(str(e)) from e

# Web form wired to `predict`: run name, model choice and pairs file in,
# a downloadable results file out.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Run Name", placeholder="predictions", type="text"),
        gr.Dropdown(
            label="Model",
            choices=["ConPLex_V1_BindingDB"],
            value="ConPLex_V1_BindingDB",
        ),
        gr.File(label="Pairs (.tsv)", file_types=[".tsv"]),
    ],
    outputs=[
        # Tabular preview is disabled; only the file download is exposed.
        # gr.DataFrame(
        #     label='Results',
        #     headers=['Protein', 'Small Molecule', 'Predicted Interaction'],
        #     height = 200,
        #     row_count = 20
        #     ),
        gr.File(label="Download results", type="filepath"),
    ],
    # title = title,
    # description = description,
    article=article,
    theme=theme,
)

if __name__ == "__main__":
    # Enable request queuing (max_size=20) before starting the server.
    queued_app = demo.queue(max_size=20)
    queued_app.launch()