import torch
from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM
model_path = "Crystalcareai/GemMoE-Medium-v0.5"
model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map="auto",
low_cpu_mem_usage=True,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
prompt_template = "[INST] {prompt} [/INST]"
prompt = "You're standing on the surface of the Earth. "\
"You walk one mile south, one mile west and one mile north. "\
"You end up exactly where you started. Where are you?"
tokens = tokenizer(
prompt_template.format(prompt=prompt),
return_tensors='pt'
).input_ids.cuda()
generation_output = model.generate(
tokens,
streamer=streamer,
max_new_tokens=512
)