"""Streamlit app: blood-pressure gene prioritisation with an XGBoost regressor.

The script loads a gene annotation table and a labelled training table,
fits an ``XGBRegressor`` on the training features, scores every gene in the
annotation table, and renders the page title, description and a gene-name
text input.
"""

import re

import streamlit as st
import numpy as np
import pandas as pd
import sklearn
import xgboost

# Fixed seed so the fitted model (and therefore the scores) is reproducible
# between reruns of the app.
# NOTE(review): the original referenced an undefined ``seed``; 0 is an
# assumed value - confirm against the training pipeline.
seed = 0

# XGBoost rejects feature names containing "[", "]" or "<", so those
# characters are replaced with "_" in the training column names.
_FORBIDDEN_CHARS = ("[", "]", "<")
regex = re.compile(r"[\[\]<]")

# Annotation table to be scored, indexed by gene.
data = pd.read_csv("annotations_dataset.csv")
data = data.set_index("Gene")

training_data = pd.read_csv("selected_features_training_data.csv", header=0)
training_data.columns = [
    regex.sub("_", col)
    if any(char in str(col) for char in _FORBIDDEN_CHARS)
    else col
    for col in training_data.columns.values
]

# Encode the categorical label as a numeric regression target. Labels that
# are not in the mapping become NaN.
training_data["BPlabel_encoded"] = training_data["BPlabel"].map(
    {"most likely": 1, "probable": 0.75, "least likely": 0.1}
)

Y = training_data["BPlabel_encoded"]
# Drop both label columns from the feature matrix. ``columns=`` is required:
# a second positional argument to ``drop`` is interpreted as ``axis``.
X = training_data.drop(columns=["BPlabel_encoded", "BPlabel"], errors="ignore")

xgb = xgboost.XGBRegressor(
    n_estimators=40,
    learning_rate=0.2,
    max_depth=4,
    reg_alpha=1,
    reg_lambda=1,
    random_state=seed,
    objective="reg:squarederror",
)
xgb.fit(X, Y)

# Score every gene in the annotation table and attach the score as a column.
# NOTE(review): presumably ``data`` has the same feature columns as ``X`` -
# confirm, otherwise ``predict`` will fail on a feature mismatch.
predictions = xgb.predict(data)
output = pd.Series(data=predictions, index=data.index, name="XGB_Score")
df_total = pd.concat([data, output], axis=1)

st.title('Blood Pressure Gene Prioritisation Post-Genome-wide Association Study')
st.markdown(
    " A machine learning pipeline for predicting disease-causing genes "
    "post-genome-wide association study in blood pressure. "
)

st.sidebar.header('Input Gene')
# NOTE(review): the variable name is unrelated to its content (it holds the
# gene name typed by the user); kept unchanged in case code outside this
# view refers to it.
sepal_length = st.text_input(label='HGNC Gene Name')