Braddy committed on
Commit
a619426
1 Parent(s): ae714e8

update loader

Browse files
app.py CHANGED
@@ -2,11 +2,10 @@ import gradio as gr
2
  import os
3
  import time
4
 
5
- from langchain.document_loaders import OnlinePDFLoader, PyPDFLoader
6
 
7
  from langchain.text_splitter import CharacterTextSplitter
8
 
9
-
10
  from langchain.llms import OpenAI
11
 
12
  from langchain.embeddings import OpenAIEmbeddings
@@ -27,30 +26,25 @@ Follow Up Input: {question}
27
 
28
  CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(_template)
29
 
30
- def loading_pdf():
31
- return "Loading..."
32
-
33
- def pdf_changes():
34
-
35
- loader = PyPDFLoader("He Yingxu_2806.pdf")
36
- documents = loader.load()
37
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
38
- texts = text_splitter.split_documents(documents)
39
- embeddings = OpenAIEmbeddings()
40
- db = Chroma.from_documents(texts, embeddings)
41
- retriever = db.as_retriever()
42
- global qa
43
- qa = ConversationalRetrievalChain.from_llm(
44
- llm=OpenAI(temperature=0.5),
45
- retriever=retriever,
46
- condense_question_prompt=CUSTOM_QUESTION_PROMPT,
47
- return_source_documents=False)
48
- return "Ready"
49
 
50
  def add_text(history, text):
51
  history = history + [(text, None)]
52
  return history, ""
53
 
 
54
  def bot(history):
55
  print(history)
56
  response = infer(history[-1][0], history)
@@ -76,7 +70,8 @@ def infer(question, history):
76
  #print(result)
77
  return result["answer"]
78
 
79
- css="""
 
80
  #col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
81
  """
82
 
@@ -93,23 +88,15 @@ title = """
93
  with gr.Blocks(css=css) as demo:
94
  with gr.Column(elem_id="col-container"):
95
  gr.HTML(title)
96
-
97
- with gr.Column():
98
- # openai_key = gr.Textbox(label="You OpenAI API key", type="password")
99
- # pdf_doc = gr.File(label="Load a pdf", file_types=['.pdf'], type="file")
100
- with gr.Row():
101
- langchain_status = gr.Textbox(label="Status", placeholder="", interactive=False)
102
- load_pdf = gr.Button("Load pdf to langchain")
103
-
104
  chatbot = gr.Chatbot([], elem_id="chatbot").style(height=350)
105
  question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter ")
106
  submit_btn = gr.Button("Send Message")
107
- load_pdf.click(loading_pdf, None, langchain_status, queue=False)
108
- load_pdf.click(pdf_changes, inputs=[], outputs=[langchain_status], queue=False)
109
  question.submit(add_text, [chatbot, question], [chatbot, question]).then(
110
  bot, chatbot, chatbot
111
  )
112
  submit_btn.click(add_text, [chatbot, question], [chatbot, question]).then(
113
  bot, chatbot, chatbot)
114
 
115
- demo.launch()
 
2
  import os
3
  import time
4
 
5
+ from langchain.document_loaders import UnstructuredMarkdownLoader
6
 
7
  from langchain.text_splitter import CharacterTextSplitter
8
 
 
9
  from langchain.llms import OpenAI
10
 
11
  from langchain.embeddings import OpenAIEmbeddings
 
26
 
27
  CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(_template)
28
 
29
+ loader = UnstructuredMarkdownLoader('docs/resume.md')
30
+ documents = loader.load()
31
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
32
+ texts = text_splitter.split_documents(documents)
33
+ embeddings = OpenAIEmbeddings()
34
+ db = Chroma.from_documents(texts, embeddings)
35
+ retriever = db.as_retriever()
36
+ qa = ConversationalRetrievalChain.from_llm(
37
+ llm=OpenAI(temperature=0.3),
38
+ retriever=retriever,
39
+ condense_question_prompt=CUSTOM_QUESTION_PROMPT,
40
+ return_source_documents=False)
41
+
 
 
 
 
 
 
42
 
43
  def add_text(history, text):
44
  history = history + [(text, None)]
45
  return history, ""
46
 
47
+
48
  def bot(history):
49
  print(history)
50
  response = infer(history[-1][0], history)
 
70
  #print(result)
71
  return result["answer"]
72
 
73
+
74
+ css = """
75
  #col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
76
  """
77
 
 
88
  with gr.Blocks(css=css) as demo:
89
  with gr.Column(elem_id="col-container"):
90
  gr.HTML(title)
91
+
 
 
 
 
 
 
 
92
  chatbot = gr.Chatbot([], elem_id="chatbot").style(height=350)
93
  question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter ")
94
  submit_btn = gr.Button("Send Message")
95
+
 
96
  question.submit(add_text, [chatbot, question], [chatbot, question]).then(
97
  bot, chatbot, chatbot
98
  )
99
  submit_btn.click(add_text, [chatbot, question], [chatbot, question]).then(
100
  bot, chatbot, chatbot)
101
 
102
+ demo.launch()
He Yingxu_2806.pdf → docs/He Yingxu_2806.pdf RENAMED
File without changes
docs/resume.md ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # personal information
2
+ ## identification
3
+ Singapore Permanent Resident|Chinese citizen
4
+
5
+ ## address
6
+ 17 Jalan Masjid, Singapore
7
+
8
+ ## contact
9
+ yingxu.he1998@gmail.com|+65 91752741|+86 15063250971
10
+
11
+ # Working Experience
12
+ ## Machine Learning Engineer at Huawei Ltd.
13
+ • from Dec 2022 to present
14
+
15
+ • Built a pipeline to automatically visualize data tables using LSTM network trained on ChatGPT-generated
16
+ data with pairwise loss method, achieving 80% recall@5 on 100+ internal test cases.
17
+
18
+ • Designed and implemented a novel SISR method that enhanced WIFI-signal simulations for office buildings
19
+ by achieving 10x speedup compared to physics-based simulation with negligible loss in accuracy (1% MAE)
20
+ on over 80 large-scale office layouts.
21
+
22
+ ## Machine Learning Research Engineer at Dyson Ltd.
23
+ • from Sept 2021 to Dec 2022
24
+
25
+ • Implemented an object localization model in a few-shot context by semi-supervised training. The model
26
+ achieved comparable results to professional software with improved adaptability and robustness.
27
+
28
+ • Designed and implemented an air quality estimation model, using LGBM, Bayesian Regression, etc., with
29
+ geographical and meteorological features. Demonstrated its advantages over spatial interpolated methods
30
+ and deployed the pipeline with Metaflow framework on AWS services.
31
+
32
+ ## ML Research Assistant at NUS-Singtel Cyber Security Lab
33
+ • from Sept 2020 to July 2021
34
+
35
+ • Identified anomalies from system logs leveraging DBSCAN and hierarchical clustering for model training.
36
+
37
+ • Developed an information retrieval method for web-attack strategy identification from system and firewall
38
+ logs. The recall@3 rate achieved 80% on 100+ hand-labelled samples.
39
+
40
+ ## Data Analyst Intern at GIC Pte. Ltd.
41
+ • from Dec 2018 to July 2019
42
+
43
+ • Deployed an R application that forecasts the mid-term returns of portfolio with visualization using R shiny.
44
+
45
+ • Optimized the coefficients of a mean reversion forecasting model using the Genetic Algorithm.
46
+
47
+ ## Data Analyst Intern at PropertyGuru
48
+ • from May 2018 to Aug 2018
49
+
50
+ • Developed dashboards in Tableau to analyze the user behaviors and listings’ performance to better match
51
+ user demand to agents’ recommendations.
52
+
53
+ • Implemented a POC to calculate and geographically visualize the liveability score for properties.
54
+
55
+ # Education
56
+ ## Master of Computing in Artificial Intelligence at National University of Singapore
57
+ • from Aug 2020 to Sept 2021
58
+ • School of Computing: CAP 4.42/5.0
59
+ • Teaching Assistant: Advanced Analytics and Machine Learning (from Jan 2021 to May 2021)
60
+
61
+ ## Bachelor of Science (Hons) in Business Analytics at National University of Singapore
62
+ • from Aug 2016 to June 2020
63
+ • School of Computing: CAP 4.15/5.0, Dean’s List in Semester 3 AY 2018/2019
64
+ • Distinction: Analytics Techniques Knowledge Area (awarded in Dec 2020)
65
+ • Teaching Assistant: Programming Methodology in Python (from Aug 2017 to June 2018)
66
+
67
+ # Relevant Projects
68
+ ## Distilling ChatGPT for finetuning image captioning models
69
+ • from Jan 2023 to Present
70
+ • Employed Chain-of-Thought with verification prompting technique on ChatGPT to create 10k+ accurate
71
+ captions from the xView annotations. Fine-tuned a GIT image captioning model and significantly improved
72
+ the CIDEr score from 11.59 to 85.93 over 2k RSICD samples.
73
+ ## Dialogue Response Generation (Master Thesis) at NUS NExT++ Lab
74
+ • from Nov 2020 to Aug 2021
75
+ • Built an enriched task-oriented response generation by implementing copy-mechanism on GPT-2 using
76
+ Pytorch. The proposed model is capable of naturally incorporating external tips/user reviews about venues
77
+ into responses. The generated response outperforms many state-of-the-art models on user satisfaction.
78
+ ## Property Resale Price Prediction
79
+ • from Jan 2021 to May 2021
80
+ • Fitted CatBoost, LGBM, XGBoost on 43k pieces of property sales data. Selected features by correlation and
81
+ information gain. Engineered new features describing properties’ livability. Reduced data dimensionality
82
+ with WOE encoding. The final ensemble methods’ accuracy achieved 5th/64 place.
83
+
84
+ # Skills
85
+ • Python (Pytorch, Tensorflow), R: Machine
86
+ Learning, Deep Learning, Data processing
87
+ • SQL, Spark: Data query and big data
88
+ • Tableau, PowerBI: Visualization development
89
+ • Java, Git, Scala, JavaScript, HTML, CSS: Software
90
+ Development