File size: 1,642 Bytes
f60ce93
0059ef7
 
bcf8eca
bf91270
7944a63
0059ef7
9b79169
 
0059ef7
4d92e12
4b60c06
 
 
 
0059ef7
4b60c06
 
 
 
5c158f1
4b60c06
 
 
 
 
 
 
 
 
 
 
 
 
 
0059ef7
9b7ebe5
f60ce93
a044018
f60ce93
45b475d
 
 
 
 
 
 
 
659d788
f60ce93
 
 
 
bb50521
a044018
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import streamlit as st
import numpy as np
import pandas as pd
import sklearn
import xgboost
seed=42

data = pd.read_csv("annotations_dataset.csv")
data = data.set_index("Gene")

training_data = pd.read_csv("./selected_features_training_data.csv", header=0)
training_data.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in training_data.columns.values
]

training_data["BPlabel_encoded"] = training_data["BPlabel"].map(
    {"most likely": 1, "probable": 0.75, "least likely": 0.1}
)
Y = training_data["BPlabel_encoded"]
X = training_data.drop(columns=["BPlabel_encoded","BPlabel"])
xgb = xgboost.XGBRegressor(
    n_estimators=40,
    learning_rate=0.2,
    max_depth=4,
    reg_alpha=1,
    reg_lambda=1,
    random_state=seed,
    objective="reg:squarederror",
)


xgb.fit(X, Y)

predictions = list(xgb.predict(data))

output = pd.Series(data=predictions, index=data.index, name="XGB_Score")
df_total = pd.concat([data, output], axis=1)
df_total['XGB_Score'] = round(df_total['XGB_Score'], 2)

df_total = df_total[['XGB_Score', 'ExomiserScore',
 'SDI', 'Liver_GTExTPM',  'pLI_ExAC',
 'HIPred',
 'Cells - EBV-transformed lymphocytes_GTExTPM',
 'Pituitary_GTExTPM',
 'IPA_BP_annotation']]


st.title('Blood Pressure Gene Prioritisation Post-GWAS')
st.markdown("""
A machine learning pipeline for predicting disease-causing genes post-genome-wide association study in blood pressure.
""")

gene_input = st.text_input('Input HGNC Gene:') 
df = df_total[df_total.index == gene_input] 
st.dataframe(df)


st.markdown("""
Total Gene Prioritisation Results:
""")

st.dataframe(df_total)