# vistune / app.py
# Hugging Face file-viewer residue, kept for provenance:
#   smejak's picture | commit "app" (cc288cc) | raw | history | blame | contribute | delete
#   No virus | 1.5 kB
import requests
from PIL import Image
import scipy
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoProcessor, MusicgenForConditionalGeneration
import streamlit as st
_CAPTION_CHECKPOINT = "Salesforce/blip-image-captioning-large"
_MUSIC_CHECKPOINT = "facebook/musicgen-small"


def _load_caption_models():
    """Load the BLIP processor and captioning model for ``_CAPTION_CHECKPOINT``."""
    processor = BlipProcessor.from_pretrained(_CAPTION_CHECKPOINT)
    model = BlipForConditionalGeneration.from_pretrained(_CAPTION_CHECKPOINT)
    return processor, model


def _load_music_models():
    """Load the MusicGen processor and generation model for ``_MUSIC_CHECKPOINT``."""
    processor = AutoProcessor.from_pretrained(_MUSIC_CHECKPOINT)
    model = MusicgenForConditionalGeneration.from_pretrained(_MUSIC_CHECKPOINT)
    return processor, model


def image_to_music(raw_image, output_path="music.wav", max_new_tokens=256):
    """Caption an image with BLIP, then generate music from that caption.

    The caption produced for ``raw_image`` is used as the text prompt for
    MusicGen, and the first generated waveform is written to ``output_path``
    as a WAV file.

    Args:
        raw_image: The image to describe. It is passed unchanged to the BLIP
            processor (the app hands in an RGB ``PIL.Image``).
        output_path: Destination of the WAV file. Defaults to ``"music.wav"``,
            the path this function has always written to.
        max_new_tokens: Forwarded to ``audio_model.generate`` as its
            ``max_new_tokens`` argument. Defaults to the previous fixed 256.

    Returns:
        ``output_path``, so callers can play or serve the file without
        repeating the literal.
    """
    # Import the submodule explicitly: a bare ``import scipy`` is not
    # guaranteed to make ``scipy.io.wavfile`` available as an attribute.
    from scipy.io import wavfile

    # Streamlit re-executes the script on every interaction, so an uncached
    # ``from_pretrained`` would rebuild both models on every button click.
    # ``st.cache_resource`` keeps the loaded objects and hands them back on
    # later calls; wrapping at call time keeps the loaders plain functions.
    img_processor, img_model = st.cache_resource(
        _load_caption_models, show_spinner=False
    )()
    audio_processor, audio_model = st.cache_resource(
        _load_music_models, show_spinner=False
    )()

    # Step 1: image -> caption text.
    caption_inputs = img_processor(raw_image, return_tensors="pt")
    caption_ids = img_model.generate(**caption_inputs)
    txt = img_processor.decode(caption_ids[0], skip_special_tokens=True)

    # Step 2: caption text -> audio tokens -> waveform.
    prompt_inputs = audio_processor(
        text=[txt],
        padding=True,
        return_tensors="pt",
    )
    audio_values = audio_model.generate(**prompt_inputs, max_new_tokens=max_new_tokens)

    # Step 3: save the first generated item's first channel at the rate the
    # model's audio encoder is configured for.
    sampling_rate = audio_model.config.audio_encoder.sampling_rate
    wavfile.write(output_path, rate=sampling_rate, data=audio_values[0, 0].numpy())
    return output_path
# Streamlit page: upload an image, then generate and play music for it.
st.header("VisTune: an AI Image-to-Music generator")
uploaded_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

if uploaded_image:
    st.image(uploaded_image, caption="Uploaded Image.", use_column_width=True)

# The button is always rendered, so a click can arrive before any upload.
# Tell the user what is missing instead of silently doing nothing.
if st.button("Generate Music"):
    if uploaded_image:
        # Captioning and music generation are slow; show progress meanwhile.
        with st.spinner("Generating music..."):
            # Convert to RGB before handing the image to the captioner.
            raw_image = Image.open(uploaded_image).convert("RGB")
            image_to_music(raw_image)
        # image_to_music writes its result to "music.wav" by default.
        st.audio("music.wav")
    else:
        st.warning("Please upload an image before generating music.")