-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimg2story.py
54 lines (38 loc) · 1.21 KB
/
img2story.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""
Turn image to text. Write a story based on that text. Read the story
"""
from dotenv import find_dotenv, load_dotenv
from transformers import pipeline
import os
import requests
IMG = "moscow.jpg"
load_dotenv(find_dotenv())
HF_TOKEN = os.getenv("HF_API_TOKEN")
# Image to text
def img2text(url):
image_to_text = pipeline(
"image-to-text", model="Salesforce/blip-image-captioning-large", device=1
)
text = image_to_text(url)[0]["generated_text"]
return text
# Text to story
def text2story(text):
text_to_story = pipeline("text-generation", model="gpt2", device=1)
story = text_to_story(text, max_length=200, do_sample=True)[0]["generated_text"]
return story
# Story to speech
# Use API this time
def story2speech(story):
API_URL = (
"https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
)
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
payloads = {"inputs": story}
response = requests.post(API_URL, headers=headers, json=payloads)
with open("audio.flac", "wb") as file:
file.write(response.content)
text = img2text(IMG)
story = text2story(text)
print("\nText:" + text)
print("\nStory:" + story)
story2speech(story)