Commit fc8d202

facebook-swing-tiny-coco image segmentation

manufy committed May 20, 2024
1 parent 0f38944 commit fc8d202
Showing 14 changed files with 309 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,5 +15,6 @@ old
*.flac
*.jpg
*photo*
*.png


5 changes: 5 additions & 0 deletions 3-experiments/4-text-to-speech/espnet-tacotron2-notokenbad.py
@@ -0,0 +1,5 @@
from espnet2.bin.tts_inference import Text2Speech

model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space_train.loss.best")

# Text2Speech returns a dict of outputs; the generated waveform is under "wav"
speech = model("text to generate speech from")["wav"]
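
A hedged follow-up sketch to write the waveform to disk, assuming the soundfile package and that the model's fs attribute holds the checkpoint's sample rate:

import soundfile as sf

# speech is a torch tensor; model.fs is the training sample rate (assumption)
sf.write("output.wav", speech.numpy(), model.fs)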
@@ -0,0 +1,26 @@
import requests

# NOTE: this URL form may be stale; Gradio spaces are usually reached at
# https://<owner>-<space>.hf.space/api/predict (assumption)
API_URL = "https://huggingface.co/spaces/coqui-ai/TTS/+/api/predict"
headers = {"Content-Type": "application/json"}

def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()  # raises if the status code signals an error
    return response.json()

# Payload format for the coqui-ai/TTS space
payload = {
    "data": ["The answer to the universe is 42"]
}

result = query(payload)

# The audio is in result['data'], which holds a list of outputs.
# Here we assume the first entry is the desired audio.
audio_url = result['data'][0]
audio_bytes = requests.get(audio_url).content

# Save the audio bytes to a .wav file
with open('output.wav', 'wb') as f:
    f.write(audio_bytes)
print("Audio saved to output.wav")
@@ -0,0 +1,26 @@
import requests

API_URL = "https://hf.space/embed/facebook/fastspeech2-en-ljspeech/+/api/predict"
headers = {"Content-Type": "application/json"}

def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()  # raises if the status code signals an error
    return response.json()

# Payload format for the fastspeech2 space
payload = {
    "data": ["The answer to the universe is 42"]
}

result = query(payload)

# The audio is in result['data'], which holds a list of outputs.
# Unlike the coqui script, each entry here is a file object whose 'name'
# field is the URL of the generated audio.
audio_url = result['data'][0]['name']
audio_bytes = requests.get(audio_url).content

# Save the audio bytes to a .wav file
with open('output.wav', 'wb') as f:
    f.write(audio_bytes)
print("Audio saved to output.wav")
@@ -0,0 +1,18 @@
import requests

API_URL = "https://api-inference.huggingface.co/models/espeak-ng"
headers = {"Content-Type": "application/json"}

def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()  # raises if the status code signals an error
    return response.content

audio_bytes = query({
    "inputs": "The answer to the universe is 42",
})

# Save the audio bytes to a .wav file
with open('output.wav', 'wb') as f:
    f.write(audio_bytes)
print("Audio saved to output.wav")
@@ -0,0 +1 @@
https://huggingface.co/microsoft/speecht5_tts
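
A minimal usage sketch for this model, following its model card (assumes the transformers SpeechT5 classes, the soundfile and datasets packages, and the CMU Arctic x-vectors dataset for the speaker embedding):

import torch
import soundfile as sf
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

inputs = processor(text="The answer to the universe is 42", return_tensors="pt")

# SpeechT5 needs a speaker embedding; this x-vector dataset is the one used
# in the model card
embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)

speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
sf.write("speech.wav", speech.numpy(), samplerate=16000)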
24 changes: 24 additions & 0 deletions 3-experiments/4-text-to-speech/transformers-tacotron2-nomodel.py
@@ -0,0 +1,24 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

cache_dir = ".cache"

# Name of the TTS model
model_name = "espnet/kan-bayashi_ljspeech_tts_train_tacotron2"

# Load the model and the tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_dir)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

# Create a text-to-speech pipeline
# NOTE: "text2speech" is not a registered transformers task id, and ESPnet
# checkpoints do not load through AutoModelForSeq2SeqLM -- hence the
# "nomodel" in this file's name; kept as a record of the failed attempt.
text_to_speech = pipeline("text2speech", model=model, tokenizer=tokenizer)

# Sample text
text = "Hello, how are you?"

# Generate the audio
outputs = text_to_speech(text)

# Save the audio to a file
with open("output.wav", "wb") as f:
    f.write(outputs["audio"])
print("Audio saved to output.wav")
@@ -0,0 +1,99 @@
import requests
from PIL import Image, ImageDraw, ImageOps, ImageFont
from dotenv import find_dotenv, load_dotenv
import os
import base64
import io
import random
import numpy as np

# Load environment variables
load_dotenv(find_dotenv())
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

API_URL = "https://api-inference.huggingface.co/models/facebook/mask2former-swin-tiny-coco-panoptic"
headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}

def segment_image(image_path):
    with open(image_path, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    return response.json()

def decode_mask(mask_str, size):
    mask_data = base64.b64decode(mask_str)
    mask_image = Image.open(io.BytesIO(mask_data))
    mask_image = mask_image.resize(size).convert("L")
    return mask_image

def overlay_masks_on_image(image_path, segments, transparency=0.4):
    original_image = Image.open(image_path).convert("RGBA")
    overlay = Image.new("RGBA", original_image.size, (255, 255, 255, 0))

    # Separate layer for the text labels
    text_layer = Image.new("RGBA", original_image.size, (255, 255, 255, 0))

    for segment in segments:
        print(segment['label'] + " " + str(segment['score']))
        mask_str = segment['mask']
        mask_image = decode_mask(mask_str, original_image.size)
        color = generate_random_color()

        color_mask = ImageOps.colorize(mask_image, black="black", white=color)
        color_mask.putalpha(mask_image)

        overlay = Image.alpha_composite(overlay, color_mask)

        # Compute the centroid of the mask
        # (np.where on a 2D array returns row indices first, i.e. y before x)
        ys, xs = np.where(np.array(mask_image) > 0)
        centroid_x = xs.mean()
        centroid_y = ys.mean()

        # Draw the label and score on the text layer
        font_size = 30
        draw = ImageDraw.Draw(text_layer)
        font_path = "/System/Library/Fonts/Arial.ttf"  # Path to Arial font on macOS
        font = ImageFont.truetype(font_path, font_size)
        label = segment['label']
        score = segment['score']
        text = f"{label}: {score}"

        # Crude hard-coded estimate of the rendered text size
        text_width = 500
        text_height = 100
        draw.text((centroid_x - text_width / 2, centroid_y - text_height / 2), text, fill=(255, 255, 255, 255), font=font)

    # Blend the overlay onto the original image with the given transparency
    overlay = Image.blend(original_image, overlay, transparency)

    # Composite the text layer on top of the blended image
    final_image = Image.alpha_composite(overlay, text_layer)

    return final_image

def generate_random_color():
    return (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))


def main():
    image_path = "cats.jpg"
    response = segment_image(image_path)

    if isinstance(response, list):
        overlayed_image = overlay_masks_on_image(image_path, response)
        overlayed_image.show()
        overlayed_image.save("overlayed_image.png")
    else:
        print("Error in segmentation:", response)

if __name__ == "__main__":
    main()
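
For reference, a sketch of the JSON shape this script consumes from the panoptic endpoint; labels and scores are illustrative, and each mask value is a base64-encoded PNG (truncated here):

[
  {"label": "cat",  "score": 0.99, "mask": "iVBORw0KGgoAAAANSUhEUg..."},
  {"label": "wall", "score": 0.95, "mask": "iVBORw0KGgoAAAANSUhEUg..."}
]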
@@ -0,0 +1,109 @@
import requests
from PIL import Image, ImageDraw, ImageOps
from dotenv import find_dotenv, load_dotenv
import os
import base64
import io

load_dotenv(find_dotenv())
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

API_URL = "https://api-inference.huggingface.co/models/facebook/mask2former-swin-tiny-coco-panoptic"
headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}

def segment_image(image_path):
    with open(image_path, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    return response.json()

# NOTE: unused draft; it assumes a response shape (segment['mask']['data'],
# segment['color']) that the inference API does not actually return.
def draw_segmented_image(image_path, segments):
    image = Image.open(image_path).convert("RGBA")
    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)

    for segment in segments:
        mask = Image.open(io.BytesIO(segment['mask']['data']))
        mask = mask.resize(image.size)
        color = tuple(segment['color'])
        mask = mask.convert("L").point(lambda p: p > 128 and 255)
        overlay = Image.composite(Image.new("RGBA", image.size, color + (128,)), overlay, mask)

    combined = Image.alpha_composite(image, overlay)
    return combined

def overlay_mask_on_image(original_image, mask_image, color=(255, 0, 0), alpha=0.5):
    # Create a color version of the mask
    color_mask = ImageOps.colorize(mask_image, black="black", white=color)
    # Convert the mask to have an alpha channel
    color_mask.putalpha(mask_image)
    # Resize the mask to match the original image size
    color_mask = color_mask.resize(original_image.size, resample=Image.BILINEAR)
    # Composite the mask with the original image
    overlay_image = Image.alpha_composite(original_image.convert("RGBA"), color_mask)
    return overlay_image

def decode_mask(mask_str, size):
    mask_data = base64.b64decode(mask_str)
    mask_image = Image.open(io.BytesIO(mask_data))
    mask_image = mask_image.resize(size).convert("L")
    return mask_image

def decode_and_display_mask(mask_str):
    mask_data = base64.b64decode(mask_str)
    mask_image = Image.open(io.BytesIO(mask_data)).convert("L")
    mask_image.show()  # Display the mask image

image_path = "cats.jpg"
response = segment_image(image_path)

if isinstance(response, list):  # Check if the result is a list of segments
    original_image = Image.open(image_path).convert("RGBA")
    for segment in response:
        label = segment['label']
        score = segment['score']
        mask = segment['mask']
        print(f"Label: {label}, Score: {score}")
        # decode_and_display_mask(mask)

        mask_image = decode_mask(mask, original_image.size)
        overlay_image = overlay_mask_on_image(original_image, mask_image)

        overlay_image.show()  # Display the image with overlay
        overlay_image.save(f"overlay_{label}.png")
else:
    print("Error in segmentation:", response)

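A quick self-test for decode_mask, handy when the endpoint is unavailable; it builds a tiny mask PNG, base64-encodes it, and checks the round trip (a hypothetical test that assumes the two-argument decode_mask defined above, not part of the API flow):

import base64, io
from PIL import Image

# Build a 4x4 grayscale mask with a single lit pixel and encode it as
# base64 PNG, mimicking the API's mask payload
img = Image.new("L", (4, 4), 0)
img.putpixel((1, 1), 255)
buf = io.BytesIO()
img.save(buf, format="PNG")
mask_str = base64.b64encode(buf.getvalue()).decode()

decoded = decode_mask(mask_str, (4, 4))
assert decoded.getpixel((1, 1)) == 255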