Spaces:
Running
on
Zero
Running
on
Zero
jeremyarancio
committed on
Commit
•
17306ce
1
Parent(s):
32de7fa
Change app.py
Browse files
app.py
CHANGED
@@ -1,9 +1,19 @@
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
3 |
import torch
|
4 |
import spaces
|
5 |
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
# Example images and texts
|
8 |
EXAMPLES = [
|
9 |
["images/ingredients_1.jpg", "24.36% chocolat noir 63% origine non UE (cacao, sucre, beurre de cacao, émulsifiant léci - thine de colza, vanille bourbon gousse), œuf, farine de blé, beurre, sucre, miel, sucre perlé, levure chimique, zeste de citron."],
|
@@ -12,18 +22,47 @@ EXAMPLES = [
|
|
12 |
["images/ingredients_4.jpg", "Eau de noix de coco 93.9%, Arôme natutel de fruit"],
|
13 |
["images/ingredients_5.jpg", "Sucre, pâte de cacao, beurre de cacao, émulsifiant: léci - thines (soja). Peut contenir des traces de lait. Chocolat noir: cacao: 50% minimum. À conserver à l'abri de la chaleur et de l'humidité. Élaboré en France."],
|
14 |
]
|
|
|
15 |
MODEL_ID = "openfoodfacts/spellcheck-mistral-7b"
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
# CPU/GPU device
|
19 |
zero = torch.Tensor([0]).cuda()
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
# Tokenizer
|
|
|
22 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
23 |
tokenizer.pad_token = tokenizer.eos_token
|
24 |
tokenizer.pad_token_id = tokenizer.eos_token_id
|
25 |
|
26 |
# Model
|
|
|
27 |
model = AutoModelForCausalLM.from_pretrained(
|
28 |
MODEL_ID,
|
29 |
device_map="auto",
|
@@ -31,6 +70,10 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
31 |
# torch_dtype=torch.bfloat16,
|
32 |
)
|
33 |
|
|
|
|
|
|
|
|
|
34 |
@spaces.GPU
|
35 |
def process(text: str) -> str:
|
36 |
"""Take the text, the tokenizer and the causal model and generate the correction."""
|
@@ -50,6 +93,7 @@ def process(text: str) -> str:
|
|
50 |
|
51 |
def prepare_instruction(text: str) -> str:
|
52 |
"""Prepare instruction prompt for fine-tuning and inference.
|
|
|
53 |
|
54 |
Args:
|
55 |
text (str): List of ingredients
|
@@ -68,20 +112,17 @@ def prepare_instruction(text: str) -> str:
|
|
68 |
##########################
|
69 |
# GRADIO SETUP
|
70 |
##########################
|
71 |
-
|
72 |
-
# Creating the Gradio interface
|
73 |
with gr.Blocks() as demo:
|
74 |
|
75 |
-
gr.Markdown(
|
76 |
-
gr.Markdown("")
|
77 |
|
78 |
with gr.Row():
|
79 |
with gr.Column():
|
80 |
-
image = gr.Image(type="pil", label="image_input")
|
81 |
-
spellcheck_button = gr.Button(value='Spellcheck')
|
82 |
|
83 |
with gr.Column():
|
84 |
ingredients = gr.Textbox(label="List of ingredients")
|
|
|
85 |
correction = gr.Textbox(label="Correction", interactive=False)
|
86 |
|
87 |
with gr.Row():
|
@@ -92,10 +133,7 @@ with gr.Blocks() as demo:
|
|
92 |
image,
|
93 |
ingredients,
|
94 |
],
|
95 |
-
outputs=[correction],
|
96 |
-
run_on_click=False,
|
97 |
)
|
98 |
-
|
99 |
spellcheck_button.click(
|
100 |
fn=process,
|
101 |
inputs=[ingredients],
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
import gradio as gr
|
4 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
|
5 |
import torch
|
6 |
import spaces
|
7 |
|
8 |
|
9 |
+
##########################
|
10 |
+
# CONFIGURATION
|
11 |
+
##########################
|
12 |
+
logging.basicConfig(
|
13 |
+
level=logging.getLevelName("INFO"),
|
14 |
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
15 |
+
)
|
16 |
+
|
17 |
# Example images and texts
|
18 |
EXAMPLES = [
|
19 |
["images/ingredients_1.jpg", "24.36% chocolat noir 63% origine non UE (cacao, sucre, beurre de cacao, émulsifiant léci - thine de colza, vanille bourbon gousse), œuf, farine de blé, beurre, sucre, miel, sucre perlé, levure chimique, zeste de citron."],
|
|
|
22 |
["images/ingredients_4.jpg", "Eau de noix de coco 93.9%, Arôme natutel de fruit"],
|
23 |
["images/ingredients_5.jpg", "Sucre, pâte de cacao, beurre de cacao, émulsifiant: léci - thines (soja). Peut contenir des traces de lait. Chocolat noir: cacao: 50% minimum. À conserver à l'abri de la chaleur et de l'humidité. Élaboré en France."],
|
24 |
]
|
25 |
+
|
26 |
MODEL_ID = "openfoodfacts/spellcheck-mistral-7b"
|
27 |
|
28 |
+
PRESENTATION = """# 🍊 Ingredients Spellcheck - Open Food Facts
|
29 |
+
|
30 |
+
Open Food Facts is a non-profit organization building the largest open food database in the world. 🌎
|
31 |
+
|
32 |
+
When a product is added to the database, all its details, such as allergens, additives, or nutritional values, are either written down by the contributor,
|
33 |
+
or automatically extracted from the product pictures using OCR.
|
34 |
+
|
35 |
+
However, it often happens that the information extracted by OCR contains typos and errors due to bad-quality pictures: low definition, curved product, light reflection, etc.
|
36 |
+
|
37 |
+
To solve this problem, we developed an 🍊 **Ingredient Spellcheck** 🍊, a model capable of correcting typos in a list of ingredients following a defined guideline.
|
38 |
+
The model, based on Mistral-7B-v0.3, was fine-tuned on thousands of corrected lists of ingredients extracted from the database. More information in the model card.
|
39 |
+
|
40 |
+
## 👇 Links
|
41 |
+
|
42 |
+
* Open Food Facts website: https://world.openfoodfacts.org/discover
|
43 |
+
* Open Food Facts Github: https://github.com/openfoodfacts
|
44 |
+
* Spellcheck project: https://github.com/openfoodfacts/openfoodfacts-ai/tree/develop/spellcheck
|
45 |
+
* Model card: https://huggingface.co/openfoodfacts/spellcheck-mistral-7b
|
46 |
+
"""
|
47 |
|
48 |
# CPU/GPU device
|
49 |
zero = torch.Tensor([0]).cuda()
|
50 |
|
51 |
+
# Transformers seed to orient generation to be reproducible (as possible since it doesn't ensure 100% reproducibility)
|
52 |
+
set_seed(42)
|
53 |
+
|
54 |
+
|
55 |
+
##########################
|
56 |
+
# LOADING
|
57 |
+
##########################
|
58 |
# Tokenizer
|
59 |
+
logging.info(f"Load tokenizer from {MODEL_ID}.")
|
60 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
61 |
tokenizer.pad_token = tokenizer.eos_token
|
62 |
tokenizer.pad_token_id = tokenizer.eos_token_id
|
63 |
|
64 |
# Model
|
65 |
+
logging.info(f"Load model from {MODEL_ID}.")
|
66 |
model = AutoModelForCausalLM.from_pretrained(
|
67 |
MODEL_ID,
|
68 |
device_map="auto",
|
|
|
70 |
# torch_dtype=torch.bfloat16,
|
71 |
)
|
72 |
|
73 |
+
|
74 |
+
##########################
|
75 |
+
# FUNCTIONS
|
76 |
+
##########################
|
77 |
@spaces.GPU
|
78 |
def process(text: str) -> str:
|
79 |
"""Take the text, the tokenizer and the causal model and generate the correction."""
|
|
|
93 |
|
94 |
def prepare_instruction(text: str) -> str:
|
95 |
"""Prepare instruction prompt for fine-tuning and inference.
|
96 |
+
Identical to instruction during training.
|
97 |
|
98 |
Args:
|
99 |
text (str): List of ingredients
|
|
|
112 |
##########################
|
113 |
# GRADIO SETUP
|
114 |
##########################
|
|
|
|
|
115 |
with gr.Blocks() as demo:
|
116 |
|
117 |
+
gr.Markdown(PRESENTATION)
|
|
|
118 |
|
119 |
with gr.Row():
|
120 |
with gr.Column():
|
121 |
+
image = gr.Image(type="pil", label="image_input", interactive=False)
|
|
|
122 |
|
123 |
with gr.Column():
|
124 |
ingredients = gr.Textbox(label="List of ingredients")
|
125 |
+
spellcheck_button = gr.Button(value='Run spellcheck')
|
126 |
correction = gr.Textbox(label="Correction", interactive=False)
|
127 |
|
128 |
with gr.Row():
|
|
|
133 |
image,
|
134 |
ingredients,
|
135 |
],
|
|
|
|
|
136 |
)
|
|
|
137 |
spellcheck_button.click(
|
138 |
fn=process,
|
139 |
inputs=[ingredients],
|