hardiksharma6555 committed on
Commit
34aedae
1 Parent(s): 4279f17

Upload 2 files

Browse files
Files changed (2) hide show
  1. README.md +6 -0
  2. app.py +108 -0
README.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ ---
2
+ title: OCR_IITR
3
+ app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 4.44.0
6
+ ---
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_client import Client, handle_file
3
+ import re
4
+ from thefuzz import fuzz
5
+
6
# Hugging Face clients for both OCR options; ocr_extraction picks one of them
# according to the model selected in the UI.
surya_ocr_client = Client("artificialguybr/Surya-OCR")
got_ocr_client = Client("stepfun-ai/GOT_official_online_demo")

# Global variable to store the extracted OCR text: written by ocr_extraction,
# read by search_keyword.
# NOTE(review): this is module-level state, so it is presumably shared by every
# user session served by this process — confirm whether per-session state is
# needed.
extracted_text = ""
12
+
13
# Matches a ``text=`` field whose value is a single- or double-quoted string,
# allowing backslash-escaped characters (including escaped quotes) inside it.
_SURYA_TEXT_FIELD = re.compile(
    r'''text=('(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*")''',
    re.DOTALL,
)


def _surya_text_lines(raw):
    """Return the value of every ``text=<quoted string>`` field found in *raw*."""
    import ast  # local import keeps this block self-contained

    lines = []
    for literal in _SURYA_TEXT_FIELD.findall(raw):
        try:
            # Assumes the fields are Python string reprs (which switch to double
            # quotes when the text contains an apostrophe) — TODO confirm
            # against the Surya Space's actual output format.
            lines.append(ast.literal_eval(literal))
        except (ValueError, SyntaxError):
            # Not a valid Python literal: fall back to the raw quoted content.
            lines.append(literal[1:-1])
    return lines


def ocr_extraction(image, ocr_model):
    """Run the selected OCR back end on *image* and cache the recognised text.

    Args:
        image: Path of the uploaded image, or ``None`` when nothing was uploaded.
        ocr_model: ``"Surya OCR"`` or ``"GOT OCR"``.

    Returns:
        The extracted text on success, otherwise a human-readable message
        (missing image, unknown model, or the text of the raised exception).

    Side effects:
        Updates the module-level ``extracted_text``. It is cleared at the start
        of every call, so a rejected or failed request never leaves
        ``search_keyword`` operating on text from an earlier image.
    """
    global extracted_text
    extracted_text = ""

    if image is None:
        return "Please upload an image first."

    try:
        if ocr_model == "Surya OCR":
            result = surya_ocr_client.predict(
                image=handle_file(image),
                langs="en",
                api_name="/ocr_workflow",
            )
            # The result is stringified and scanned for text=... fields; both
            # quote styles are accepted so lines containing apostrophes are not
            # silently dropped.
            text = "\n".join(_surya_text_lines(str(result)))
        elif ocr_model == "GOT OCR":
            result = got_ocr_client.predict(
                image=handle_file(image),
                got_mode="plain texts OCR",
                fine_grained_mode="box",
                ocr_color="red",
                ocr_box="Hello!!",
                api_name="/run_GOT",
            )
            text = result[0]
        else:
            return "Invalid OCR model selected."
    except Exception as e:
        # UI boundary: report any back-end failure as text instead of raising.
        return f"An error occurred: {str(e)}"

    extracted_text = text
    return extracted_text
45
+
46
def search_keyword(keyword, search_type):
    """Highlight *keyword* in the cached OCR text and return it as HTML.

    The result is rendered by an HTML component, so every piece of OCR text is
    HTML-escaped before being returned; only the highlighting ``<span>`` tags
    are emitted as markup.

    Args:
        keyword: Text to look for. When empty, the (escaped) OCR text is
            returned without highlighting.
        search_type: ``"Direct Search"`` highlights case-insensitive substring
            matches. Any other value performs the fuzzy "Nearest Search",
            highlighting whitespace-separated words whose ``fuzz.ratio``
            against the keyword is at least 80.

    Returns:
        An HTML string, or a plain message when no OCR text is cached in the
        module-level ``extracted_text``.
    """
    import html  # local import keeps this block self-contained

    if not extracted_text:
        return "No OCR text found. Please extract text from an image first."
    if not keyword:
        return html.escape(extracted_text)

    def mark(fragment):
        return f'<span style="background-color: yellow;">{html.escape(fragment)}</span>'

    if search_type == "Direct Search":
        # Splitting on a capturing group yields alternating non-match / match
        # pieces, so each piece can be escaped on its own and only the matches
        # get wrapped.
        pieces = re.split(
            f"({re.escape(keyword)})", extracted_text, flags=re.IGNORECASE
        )
        return "".join(
            mark(piece) if index % 2 else html.escape(piece)
            for index, piece in enumerate(pieces)
        )

    # Nearest Search: compare word by word; words are re-joined with single
    # spaces, as before.
    target = keyword.lower()
    return " ".join(
        mark(word)
        if fuzz.ratio(word.lower(), target) >= 80  # Adjust threshold as needed
        else html.escape(word)
        for word in extracted_text.split()
    )
66
+
67
# Page layout: OCR controls and raw text in the first row, keyword search and
# highlighted result in the second, followed by the button wiring.
with gr.Blocks(theme=gr.themes.Soft()) as gr_interface:
    gr.Markdown("# 📷 OCR Text Extraction and Advanced Keyword Search 🔍")

    # Row 1 — image upload and model choice (left), extracted text (right).
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(label="Upload Image", type="filepath")
            ocr_model_dropdown = gr.Dropdown(
                label="Select OCR Model",
                choices=["Surya OCR", "GOT OCR"],
                value="Surya OCR",
            )
            ocr_button = gr.Button("Extract Text", variant="primary")

        with gr.Column(scale=2):
            extracted_text_output = gr.Textbox(
                label="Extracted Text",
                lines=10,
                placeholder="Text extracted from the image will appear here.",
            )

    # Row 2 — keyword and search mode (left), highlighted HTML (right).
    with gr.Row():
        with gr.Column(scale=1):
            keyword_input = gr.Textbox(label="Enter keyword to search")
            search_type = gr.Radio(
                choices=["Direct Search", "Nearest Search"],
                value="Direct Search",
                label="Search Type",
            )
            search_button = gr.Button("Search Keyword", variant="secondary")

        with gr.Column(scale=2):
            highlighted_output = gr.HTML(label="Highlighted Text")

    # "Extract Text" feeds the image and model choice to ocr_extraction and
    # shows its return value in the text box.
    ocr_button.click(
        ocr_extraction,
        [image_input, ocr_model_dropdown],
        extracted_text_output,
    )

    # "Search Keyword" feeds the keyword and mode to search_keyword and renders
    # the returned markup.
    search_button.click(
        search_keyword,
        [keyword_input, search_type],
        highlighted_output,
    )

gr_interface.launch(share=True)