BP-GWAS-Prioritise

Sleeping

File size: 1,309 Bytes

f60ce93
0059ef7
 
bcf8eca
bf91270
0059ef7
9b79169
 
0059ef7
4b60c06
 
 
 
 
0059ef7
4b60c06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0059ef7
9b7ebe5
f60ce93

import streamlit as st
import numpy as np
import pandas as pd
import sklearn
import xgboost

# Annotation table for the genes to be scored, indexed by the "Gene" column
# so that predictions can later be matched back to each gene.
data = pd.read_csv("annotations_dataset.csv")
data = data.set_index("Gene")

# Labelled training set; the first row of the CSV holds the column names.
training_data = pd.read_csv("selected_features_training_data.csv", header=0)

# XGBoost refuses feature names containing "[", "]" or "<", so each of those
# characters is replaced by "_". A translation table does this in one pass and
# avoids the previously undefined `regex` object (which raised NameError).
_xgb_unsafe_chars = str.maketrans({"[": "_", "]": "_", "<": "_"})
training_data.columns = [
    col.translate(_xgb_unsafe_chars) if isinstance(col, str) else col
    for col in training_data.columns
]

# Turn the categorical "BPlabel" into a numeric regression target.
# Labels not present in the mapping become NaN.
training_data["BPlabel_encoded"] = training_data["BPlabel"].map(
    {"most likely": 1, "probable": 0.75, "least likely": 0.1}
)
Y = training_data["BPlabel_encoded"]

# Features are every column except the raw label and its encoded form.
# `columns=` is required: passing the second label positionally would be
# interpreted as the `axis` argument, and `errors` must be the string "ignore".
X = training_data.drop(columns=["BPlabel_encoded", "BPlabel"], errors="ignore")

# Fixed seed so repeated runs train the same model. The name was previously
# referenced without ever being defined (NameError); the value is arbitrary.
seed = 42

xgb = xgboost.XGBRegressor(
    n_estimators=40,
    learning_rate=0.2,
    max_depth=4,
    reg_alpha=1,
    reg_lambda=1,
    random_state=seed,
    objective="reg:squarederror",
)


# Train the regressor on the labelled feature matrix.
xgb.fit(X, Y)

# Score every row of the annotation table.
# NOTE(review): assumes `data` carries the same feature columns as `X` --
# confirm against the two CSV schemas.
predictions = list(xgb.predict(data))

# Attach the scores to the annotation table as a new "XGB_Score" column,
# aligned on the gene index.
output = pd.Series(predictions, index=data.index, name="XGB_Score")
df_total = pd.concat((data, output), axis="columns")


# Page heading and a short description of the app.
st.title("Blood Pressure Gene Prioritisation Post-Genome-wide Association Study")
st.markdown(
    "\nA machine learning pipeline for predicting disease-causing genes"
    " post-genome-wide association study in blood pressure.\n"
)

# Sidebar heading for the gene query; the text box itself is created with
# `st.text_input`, not `st.sidebar.text_input`.
st.sidebar.header("Input Gene")
# NOTE(review): the variable name looks like a leftover from a template; it is
# kept unchanged because code further down the file may reference it.
sepal_length = st.text_input(label="HGNC Gene Name")