Spaces:
Running
on
Zero
Running
on
Zero
jeremyarancio
committed on
Commit
•
17306ce
1
Parent(s):
32de7fa
Change app.py
Browse files
app.py
CHANGED
@@ -1,9 +1,19 @@
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
3 |
import torch
|
4 |
import spaces
|
5 |
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
# Example images and texts
|
8 |
EXAMPLES = [
|
9 |
["images/ingredients_1.jpg", "24.36% chocolat noir 63% origine non UE (cacao, sucre, beurre de cacao, émulsifiant léci - thine de colza, vanille bourbon gousse), œuf, farine de blé, beurre, sucre, miel, sucre perlé, levure chimique, zeste de citron."],
|
@@ -12,18 +22,47 @@ EXAMPLES = [
|
|
12 |
["images/ingredients_4.jpg", "Eau de noix de coco 93.9%, Arôme natutel de fruit"],
|
13 |
["images/ingredients_5.jpg", "Sucre, pâte de cacao, beurre de cacao, émulsifiant: léci - thines (soja). Peut contenir des traces de lait. Chocolat noir: cacao: 50% minimum. À conserver à l'abri de la chaleur et de l'humidité. Élaboré en France."],
|
14 |
]
|
|
|
15 |
MODEL_ID = "openfoodfacts/spellcheck-mistral-7b"
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
# CPU/GPU device
|
19 |
zero = torch.Tensor([0]).cuda()
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
# Tokenizer
|
|
|
22 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
23 |
tokenizer.pad_token = tokenizer.eos_token
|
24 |
tokenizer.pad_token_id = tokenizer.eos_token_id
|
25 |
|
26 |
# Model
|
|
|
27 |
model = AutoModelForCausalLM.from_pretrained(
|
28 |
MODEL_ID,
|
29 |
device_map="auto",
|
@@ -31,6 +70,10 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
31 |
# torch_dtype=torch.bfloat16,
|
32 |
)
|
33 |
|
|
|
|
|
|
|
|
|
34 |
@spaces.GPU
|
35 |
def process(text: str) -> str:
|
36 |
"""Take the text, the tokenizer and the causal model and generate the correction."""
|
@@ -50,6 +93,7 @@ def process(text: str) -> str:
|
|
50 |
|
51 |
def prepare_instruction(text: str) -> str:
|
52 |
"""Prepare instruction prompt for fine-tuning and inference.
|
|
|
53 |
|
54 |
Args:
|
55 |
text (str): List of ingredients
|
@@ -68,20 +112,17 @@ def prepare_instruction(text: str) -> str:
|
|
68 |
##########################
|
69 |
# GRADIO SETUP
|
70 |
##########################
|
71 |
-
|
72 |
-
# Creating the Gradio interface
|
73 |
with gr.Blocks() as demo:
|
74 |
|
75 |
-
gr.Markdown(
|
76 |
-
gr.Markdown("")
|
77 |
|
78 |
with gr.Row():
|
79 |
with gr.Column():
|
80 |
-
image = gr.Image(type="pil", label="image_input")
|
81 |
-
spellcheck_button = gr.Button(value='Spellcheck')
|
82 |
|
83 |
with gr.Column():
|
84 |
ingredients = gr.Textbox(label="List of ingredients")
|
|
|
85 |
correction = gr.Textbox(label="Correction", interactive=False)
|
86 |
|
87 |
with gr.Row():
|
@@ -92,10 +133,7 @@ with gr.Blocks() as demo:
|
|
92 |
image,
|
93 |
ingredients,
|
94 |
],
|
95 |
-
outputs=[correction],
|
96 |
-
run_on_click=False,
|
97 |
)
|
98 |
-
|
99 |
spellcheck_button.click(
|
100 |
fn=process,
|
101 |
inputs=[ingredients],
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
import gradio as gr
|
4 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
|
5 |
import torch
|
6 |
import spaces
|
7 |
|
8 |
|
9 |
+
##########################
|
10 |
+
# CONFIGURATION
|
11 |
+
##########################
|
12 |
+
logging.basicConfig(
|
13 |
+
level=logging.getLevelName("INFO"),
|
14 |
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
15 |
+
)
|
16 |
+
|
17 |
# Example images and texts
|
18 |
EXAMPLES = [
|
19 |
["images/ingredients_1.jpg", "24.36% chocolat noir 63% origine non UE (cacao, sucre, beurre de cacao, émulsifiant léci - thine de colza, vanille bourbon gousse), œuf, farine de blé, beurre, sucre, miel, sucre perlé, levure chimique, zeste de citron."],
|
|
|
22 |
["images/ingredients_4.jpg", "Eau de noix de coco 93.9%, Arôme natutel de fruit"],
|
23 |
["images/ingredients_5.jpg", "Sucre, pâte de cacao, beurre de cacao, émulsifiant: léci - thines (soja). Peut contenir des traces de lait. Chocolat noir: cacao: 50% minimum. À conserver à l'abri de la chaleur et de l'humidité. Élaboré en France."],
|
24 |
]
|
25 |
+
|
26 |
MODEL_ID = "openfoodfacts/spellcheck-mistral-7b"
|
27 |
|
28 |
+
PRESENTATION = """# 🍊 Ingredients Spellcheck - Open Food Facts
|
29 |
+
|
30 |
+
Open Food Facts is a non-profit organization building the largest open food database in the world. 🌎
|
31 |
+
|
32 |
+
When a product is added to the database, all its details, such as allergens, additives, or nutritional values, are either written down by the contributor,
|
33 |
+
or automatically extracted from the product pictures using OCR.
|
34 |
+
|
35 |
+
However, it often happens that the information extracted by OCR contains typos and errors due to bad-quality pictures: low definition, curved product, light reflection, etc.
|
36 |
+
|
37 |
+
To solve this problem, we developed an 🍊 **Ingredient Spellcheck** 🍊, a model capable of correcting typos in a list of ingredients following a defined guideline.
|
38 |
+
The model, based on Mistral-7B-v0.3, was fine-tuned on thousands of corrected lists of ingredients extracted from the database. More information in the model card.
|
39 |
+
|
40 |
+
## 👇 Links
|
41 |
+
|
42 |
+
* Open Food Facts website: https://world.openfoodfacts.org/discover
|
43 |
+
* Open Food Facts Github: https://github.com/openfoodfacts
|
44 |
+
* Spellcheck project: https://github.com/openfoodfacts/openfoodfacts-ai/tree/develop/spellcheck
|
45 |
+
* Model card: https://huggingface.co/openfoodfacts/spellcheck-mistral-7b
|
46 |
+
"""
|
47 |
|
48 |
# CPU/GPU device
|
49 |
zero = torch.Tensor([0]).cuda()
|
50 |
|
51 |
+
# Transformers seed to orient generation to be reproducible (as possible since it doesn't ensure 100% reproducibility)
|
52 |
+
set_seed(42)
|
53 |
+
|
54 |
+
|
55 |
+
##########################
|
56 |
+
# LOADING
|
57 |
+
##########################
|
58 |
# Tokenizer
|
59 |
+
logging.info(f"Load tokenizer from {MODEL_ID}.")
|
60 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
61 |
tokenizer.pad_token = tokenizer.eos_token
|
62 |
tokenizer.pad_token_id = tokenizer.eos_token_id
|
63 |
|
64 |
# Model
|
65 |
+
logging.info(f"Load model from {MODEL_ID}.")
|
66 |
model = AutoModelForCausalLM.from_pretrained(
|
67 |
MODEL_ID,
|
68 |
device_map="auto",
|
|
|
70 |
# torch_dtype=torch.bfloat16,
|
71 |
)
|
72 |
|
73 |
+
|
74 |
+
##########################
|
75 |
+
# FUNCTIONS
|
76 |
+
##########################
|
77 |
@spaces.GPU
|
78 |
def process(text: str) -> str:
|
79 |
"""Take the text, the tokenizer and the causal model and generate the correction."""
|
|
|
93 |
|
94 |
def prepare_instruction(text: str) -> str:
|
95 |
"""Prepare instruction prompt for fine-tuning and inference.
|
96 |
+
Identical to instruction during training.
|
97 |
|
98 |
Args:
|
99 |
text (str): List of ingredients
|
|
|
112 |
##########################
|
113 |
# GRADIO SETUP
|
114 |
##########################
|
|
|
|
|
115 |
with gr.Blocks() as demo:
|
116 |
|
117 |
+
gr.Markdown(PRESENTATION)
|
|
|
118 |
|
119 |
with gr.Row():
|
120 |
with gr.Column():
|
121 |
+
image = gr.Image(type="pil", label="image_input", interactive=False)
|
|
|
122 |
|
123 |
with gr.Column():
|
124 |
ingredients = gr.Textbox(label="List of ingredients")
|
125 |
+
spellcheck_button = gr.Button(value='Run spellcheck')
|
126 |
correction = gr.Textbox(label="Correction", interactive=False)
|
127 |
|
128 |
with gr.Row():
|
|
|
133 |
image,
|
134 |
ingredients,
|
135 |
],
|
|
|
|
|
136 |
)
|
|
|
137 |
spellcheck_button.click(
|
138 |
fn=process,
|
139 |
inputs=[ingredients],
|