Yassmen committed
Commit 526fa39
1 Parent(s): d0ea87a

Upload 2 files

Files changed (2)
  1. app.py +136 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,136 @@
+ import gradio as gr
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+ from PIL import Image
+ import requests
+ from byaldi import RAGMultiModalModel
+ from io import BytesIO
+ import torch
+ import re
+ import base64
+
+ # Load the ColPali retriever and the Qwen2-VL model once at startup.
+ RAG = RAGMultiModalModel.from_pretrained("vidore/colpali", verbose=10)
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     "Qwen/Qwen2-VL-2B-Instruct",
+     torch_dtype=torch.float16,
+     device_map="auto",
+ )
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+
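+ # Index the uploaded image with ColPali so it can be retrieved by a text query.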
+ def create_rag_index(image_path):
+     RAG.index(
+         input_path=image_path,
+         index_name="image_index",
+         store_collection_with_index=True,
+         overwrite=True,
+     )
+
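+ # Post-process the raw Qwen2-VL output, keeping only lines that carry actual text.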
+ def extract_relevant_text(qwen_output):
+     # Extract the main content from the Qwen2-VL output (a list of decoded strings).
+     qwen_text = qwen_output[0]
+
+     # Split the text into individual lines so each can be checked on its own.
+     lines = qwen_text.split('\n')
+
+     # Initialize a list to hold relevant text lines.
+     relevant_text = []
+
+     # Loop through each line to identify relevant text.
+     for line in lines:
+         # Keep lines that start with a letter or digit (lines that carry words or
+         # numbers); blank lines and lines that are only punctuation are skipped.
+         if re.match(r'[A-Za-z0-9]', line):
+             relevant_text.append(line.strip())
+
+     # Join the relevant lines into a single output (the format can be customized).
+     return "\n".join(relevant_text)
+
+
+ # Full OCR pipeline: optionally retrieve the best-matching page with ColPali,
+ # then run Qwen2-VL on the image and post-process its output.
+ def ocr_image(image_path, text_query):
+     if text_query:
+         # Index the image and fetch the top match for the query as base64.
+         create_rag_index(image_path)
+         results = RAG.search(text_query, k=1, return_base64_results=True)
+
+         image_data = base64.b64decode(results[0].base64)
+         image = Image.open(BytesIO(image_data))
+     else:
+         image = Image.open(image_path)
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {
+                     "type": "image",
+                     "image": image,
+                 },
+                 {
+                     "type": "text",
+                     "text": "Explain all the text found in the image."
+                 }
+             ]
+         }
+     ]
+
+     text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+     inputs = processor(
+         text=[text_prompt],
+         images=[image],
+         padding=True,
+         return_tensors="pt"
+     )
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     inputs = inputs.to(device)
+
+     output_ids = model.generate(**inputs, max_new_tokens=1024)
+
+     # Strip the prompt tokens so only the newly generated tokens are decoded.
+     generated_ids = [
+         out_ids[len(in_ids):]
+         for in_ids, out_ids in zip(inputs.input_ids, output_ids)
+     ]
+
+     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+
+     # Extract relevant text from the Qwen2-VL output.
+     relevant_text = extract_relevant_text(output_text)
+
+     return relevant_text
+
+
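+ # Wrap every occurrence of the query words in a yellow <span> so matches stand out in the HTML output.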
+ def highlight_text(text, query):
+     highlighted_text = text
+     for word in query.split():
+         pattern = re.compile(re.escape(word), re.IGNORECASE)
+         highlighted_text = pattern.sub(lambda m: f'<span style="background-color: yellow;">{m.group()}</span>', highlighted_text)
+     return highlighted_text
+
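+ # Gradio callback: run OCR on the uploaded image, then highlight the keyword in the extracted text.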
+ def ocr_and_search(image, keyword):
+     extracted_text = ocr_image(image, keyword)
+     if keyword == '':
+         return extracted_text, 'Please enter a keyword.'
+     else:
+         highlighted_text = highlight_text(extracted_text, keyword)
+         return extracted_text, highlighted_text
+
+ # Create the Gradio interface.
+ interface = gr.Interface(
+     fn=ocr_and_search,
+     inputs=[
+         gr.Image(type="filepath", label="Upload Image"),
+         gr.Textbox(label="Enter Keyword")
+     ],
+     outputs=[
+         gr.Textbox(label="Extracted Text"),
+         gr.HTML(label="Search Result"),
+     ],
+     title="OCR and Document Search Web Application",
+     description="Upload an image to extract text in Hindi and English and search for keywords."
+ )
+
+ if __name__ == "__main__":
+     interface.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ git+https://github.com/huggingface/transformers
+ byaldi
+ qwen_vl_utils
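
For reference, the ocr_and_search function in app.py can also be exercised outside the Gradio UI. A minimal sketch, assuming the packages above are installed along with gradio, torch, and Pillow (imported by app.py but not pinned in requirements.txt), and that a test image exists at the hypothetical path sample_doc.jpg; "invoice" is an example keyword, not part of the app:

    # Hypothetical smoke test: importing app loads ColPali and Qwen2-VL at module level.
    from app import ocr_and_search

    extracted, highlighted = ocr_and_search("sample_doc.jpg", "invoice")
    print(extracted)    # plain text pulled from the image by Qwen2-VL
    print(highlighted)  # same text with "invoice" wrapped in yellow <span> highlights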