from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import torch

USE_GPU = True
device = torch.device("cuda" if USE_GPU and torch.cuda.is_available() else "cpu")

# Load the Molmo processor (handles image preprocessing and tokenization).
processor = AutoProcessor.from_pretrained(
    'allenai/MolmoE-1B-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto' if USE_GPU else None,
    cache_dir="./models/molmo1"
)

# Load the model. Note: wrapping this call in accelerate's init_empty_weights()
# would leave the parameters on the meta device and break CPU inference, so the
# weights are loaded normally here; device_map='auto' handles GPU placement.
model = AutoModelForCausalLM.from_pretrained(
    'allenai/MolmoE-1B-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto' if USE_GPU else None,
    cache_dir="./models/molmo1",
    attn_implementation="eager"
)

if not USE_GPU:
    model.to(device)

image_path = "./public/image.jpg"  # Replace with your image file path
image = Image.open(image_path).convert("RGB")

# Preprocess the image and prompt into model inputs.
inputs = processor.process(
    images=[image],
    text="Extract text"
)
# Move inputs to the model's device and add a batch dimension.
inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

print('STARTED')
output = model.generate_from_batch(
    inputs,
    GenerationConfig(
        max_new_tokens=2000,
        # temperature=0.1,
        # top_p=top_p,
        stop_strings="<|endoftext|>"
    ),
    tokenizer=processor.tokenizer
)

# Keep only the newly generated tokens (everything after the prompt) and decode them.
generated_tokens = output[0, inputs['input_ids'].size(1):]
generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
print(generated_text)
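
# Optional: a minimal sketch (commented out) of reusing the loaded model and
# processor for several images. IMAGE_PATHS is a hypothetical list of local
# files; the process / generate_from_batch / decode steps are the same as the
# single-image flow above.
# IMAGE_PATHS = ["./public/image.jpg", "./public/other_image.jpg"]
# for path in IMAGE_PATHS:
#     img = Image.open(path).convert("RGB")
#     batch = processor.process(images=[img], text="Extract text")
#     batch = {k: v.to(model.device).unsqueeze(0) for k, v in batch.items()}
#     out = model.generate_from_batch(
#         batch,
#         GenerationConfig(max_new_tokens=2000, stop_strings="<|endoftext|>"),
#         tokenizer=processor.tokenizer
#     )
#     new_tokens = out[0, batch['input_ids'].size(1):]
#     print(path, processor.tokenizer.decode(new_tokens, skip_special_tokens=True))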