From fc8d20234dedcd22658c8ce7067a3cb843f02a47 Mon Sep 17 00:00:00 2001
From: Manu
Date: Mon, 20 May 2024 11:15:46 +0200
Subject: [PATCH] facebook-swing-tiny-coco image segmentation

---
 .gitignore                                    |   1 +
 .../espnet-tacotron2-notokenbad.py            |   5 +
 .../huggingface-inference-api/coquiai404.py   |  26 +++++
 ...ggingface-spaces-facebook-fastspeech404.py |  26 +++++
 .../huggingface-speak-ng404.py                |  18 +++
 .../huggingface-microsoft-tts5/README.md      |   1 +
 .../{native => native-osx}/pyttsx-native.py   |   0
 .../{native => native-osx}/test_objc.py       |   0
 .../{native => native-osx}/test_pyttsx3.py    |   0
 .../{native => native-osx}/tts_osx_native.py  |   0
 .../transformers-tacotron2-nomodel.py         |  24 ++++
 ...ransformers-wav2vec-nofeatureextractor.py} |   0
 ...-iapi-facebook-swig-tiny-coco-overlayed.py |  90 +++++++++++++++
 ...f-iapi-facebook-swig-tiny-coco-segments.py |  84 ++++++++++++++
 14 files changed, 275 insertions(+)
 create mode 100644 3-experiments/4-text-to-speech/espnet-tacotron2-notokenbad.py
 create mode 100644 3-experiments/4-text-to-speech/huggingface-inference-api/coquiai404.py
 create mode 100644 3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-spaces-facebook-fastspeech404.py
 create mode 100644 3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-speak-ng404.py
 create mode 100644 3-experiments/4-text-to-speech/huggingface-microsoft-tts5/README.md
 rename 3-experiments/4-text-to-speech/{native => native-osx}/pyttsx-native.py (100%)
 rename 3-experiments/4-text-to-speech/{native => native-osx}/test_objc.py (100%)
 rename 3-experiments/4-text-to-speech/{native => native-osx}/test_pyttsx3.py (100%)
 rename 3-experiments/4-text-to-speech/{native => native-osx}/tts_osx_native.py (100%)
 create mode 100644 3-experiments/4-text-to-speech/transformers-tacotron2-nomodel.py
 rename 3-experiments/4-text-to-speech/{huggingface-tts.py => transformers-wav2vec-nofeatureextractor.py} (100%)
 create mode 100644 3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-overlayed.py
 create mode 100644 3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-segments.py

diff --git a/.gitignore b/.gitignore
index 7893d9c..84aca07 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,5 +15,6 @@ old
 *.flac
 *.jpg
 *photo*
+*.png
diff --git a/3-experiments/4-text-to-speech/espnet-tacotron2-notokenbad.py b/3-experiments/4-text-to-speech/espnet-tacotron2-notokenbad.py
new file mode 100644
index 0000000..7eeeea1
--- /dev/null
+++ b/3-experiments/4-text-to-speech/espnet-tacotron2-notokenbad.py
@@ -0,0 +1,5 @@
+from espnet2.bin.tts_inference import Text2Speech
+
+model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space_train.loss.best")
+
+speech, *_ = model("text to generate speech from")
\ No newline at end of file
diff --git a/3-experiments/4-text-to-speech/huggingface-inference-api/coquiai404.py b/3-experiments/4-text-to-speech/huggingface-inference-api/coquiai404.py
new file mode 100644
index 0000000..8dbe137
--- /dev/null
+++ b/3-experiments/4-text-to-speech/huggingface-inference-api/coquiai404.py
@@ -0,0 +1,26 @@
+import requests
+
+API_URL = "https://huggingface.co/spaces/coqui-ai/TTS/+/api/predict"
+headers = {"Content-Type": "application/json"}
+
+def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    response.raise_for_status()  # Raises an exception if the status code is not 200
+    return response.json()
+
+# Payload format for the coqui-ai/TTS space
+payload = {
+    "data": ["The answer to the universe is 42"]
+}
+
+result = query(payload)
+
+# The audio is in result['data'], which contains a list of results.
+# We assume the first result is the desired audio.
+audio_url = result['data'][0]
+audio_bytes = requests.get(audio_url).content
+
+# Save the audio bytes to a .wav file
+with open('output.wav', 'wb') as f:
+    f.write(audio_bytes)
+print("Audio saved to output.wav")
diff --git a/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-spaces-facebook-fastspeech404.py b/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-spaces-facebook-fastspeech404.py
new file mode 100644
index 0000000..42c5f42
--- /dev/null
+++ b/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-spaces-facebook-fastspeech404.py
@@ -0,0 +1,26 @@
+import requests
+
+API_URL = "https://hf.space/embed/facebook/fastspeech2-en-ljspeech/+/api/predict"
+headers = {"Content-Type": "application/json"}
+
+def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    response.raise_for_status()  # Raises an exception if the status code is not 200
+    return response.json()
+
+# Payload format for the fastspeech2 space
+payload = {
+    "data": ["The answer to the universe is 42"]
+}
+
+result = query(payload)
+
+# The audio is in result['data'], which contains a list of results.
+# We assume the first result is the desired audio.
+audio_url = result['data'][0]['name']
+audio_bytes = requests.get(audio_url).content
+
+# Save the audio bytes to a .wav file
+with open('output.wav', 'wb') as f:
+    f.write(audio_bytes)
+print("Audio saved to output.wav")
diff --git a/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-speak-ng404.py b/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-speak-ng404.py
new file mode 100644
index 0000000..c486221
--- /dev/null
+++ b/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-speak-ng404.py
@@ -0,0 +1,18 @@
+import requests
+
+API_URL = "https://api-inference.huggingface.co/models/espeak-ng"
+headers = {"Content-Type": "application/json"}
+
+def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    response.raise_for_status()  # Raises an exception if the status code is not 200
+    return response.content
+
+audio_bytes = query({
+    "inputs": "The answer to the universe is 42",
+})
+
+# Save the audio bytes to a .wav file
+with open('output.wav', 'wb') as f:
+    f.write(audio_bytes)
+print("Audio saved to output.wav")
diff --git a/3-experiments/4-text-to-speech/huggingface-microsoft-tts5/README.md b/3-experiments/4-text-to-speech/huggingface-microsoft-tts5/README.md
new file mode 100644
index 0000000..017bcdc
--- /dev/null
+++ b/3-experiments/4-text-to-speech/huggingface-microsoft-tts5/README.md
@@ -0,0 +1 @@
+https://huggingface.co/microsoft/speecht5_tts
\ No newline at end of file
diff --git a/3-experiments/4-text-to-speech/native/pyttsx-native.py b/3-experiments/4-text-to-speech/native-osx/pyttsx-native.py
similarity index 100%
rename from 3-experiments/4-text-to-speech/native/pyttsx-native.py
rename to 3-experiments/4-text-to-speech/native-osx/pyttsx-native.py
diff --git a/3-experiments/4-text-to-speech/native/test_objc.py b/3-experiments/4-text-to-speech/native-osx/test_objc.py
similarity index 100%
rename from 3-experiments/4-text-to-speech/native/test_objc.py
rename to 3-experiments/4-text-to-speech/native-osx/test_objc.py
diff --git a/3-experiments/4-text-to-speech/native/test_pyttsx3.py b/3-experiments/4-text-to-speech/native-osx/test_pyttsx3.py
similarity index 100%
rename from 3-experiments/4-text-to-speech/native/test_pyttsx3.py
rename to 3-experiments/4-text-to-speech/native-osx/test_pyttsx3.py
diff --git a/3-experiments/4-text-to-speech/native/tts_osx_native.py b/3-experiments/4-text-to-speech/native-osx/tts_osx_native.py
similarity index 100%
rename from 3-experiments/4-text-to-speech/native/tts_osx_native.py
rename to 3-experiments/4-text-to-speech/native-osx/tts_osx_native.py
diff --git a/3-experiments/4-text-to-speech/transformers-tacotron2-nomodel.py b/3-experiments/4-text-to-speech/transformers-tacotron2-nomodel.py
new file mode 100644
index 0000000..db14b9a
--- /dev/null
+++ b/3-experiments/4-text-to-speech/transformers-tacotron2-nomodel.py
@@ -0,0 +1,24 @@
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+
+cache_dir = ".cache"
+
+# Name of the TTS model
+model_name = "espnet/kan-bayashi_ljspeech_tts_train_tacotron2"
+
+# Load the model and the tokenizer
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_dir)
+tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+
+# Create a text-to-speech pipeline
+text_to_speech = pipeline("text2speech", model=model, tokenizer=tokenizer)
+
+# Example text
+text = "Hello, how are you?"
+
+# Generate the audio
+outputs = text_to_speech(text)
+
+# Save the audio to a file
+with open("output.wav", "wb") as f:
+    f.write(outputs["audio"])
+print("Audio saved to output.wav")
diff --git a/3-experiments/4-text-to-speech/huggingface-tts.py b/3-experiments/4-text-to-speech/transformers-wav2vec-nofeatureextractor.py
similarity index 100%
rename from 3-experiments/4-text-to-speech/huggingface-tts.py
rename to 3-experiments/4-text-to-speech/transformers-wav2vec-nofeatureextractor.py
diff --git a/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-overlayed.py b/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-overlayed.py
new file mode 100644
index 0000000..96919f2
--- /dev/null
+++ b/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-overlayed.py
@@ -0,0 +1,90 @@
+import requests
+from PIL import Image, ImageDraw, ImageOps, ImageFont
+from dotenv import find_dotenv, load_dotenv
+import os
+import base64
+import io
+import random
+import numpy as np
+
+# Load environment variables
+load_dotenv(find_dotenv())
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+API_URL = "https://api-inference.huggingface.co/models/facebook/mask2former-swin-tiny-coco-panoptic"
+headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+def segment_image(image_path):
+    with open(image_path, "rb") as f:
+        data = f.read()
+    response = requests.post(API_URL, headers=headers, data=data)
+    return response.json()
+
+# The inference API returns each mask as a base64-encoded PNG string.
+def decode_mask(mask_str, size):
+    mask_data = base64.b64decode(mask_str)
+    mask_image = Image.open(io.BytesIO(mask_data))
+    mask_image = mask_image.resize(size).convert("L")
+    return mask_image
+
+def overlay_masks_on_image(image_path, segments, transparency=0.4):
+    original_image = Image.open(image_path).convert("RGBA")
+    overlay = Image.new("RGBA", original_image.size, (255, 255, 255, 0))
+
+    # New layer for the label text
+    text_layer = Image.new("RGBA", original_image.size, (255, 255, 255, 0))
+
+    for segment in segments:
+        print(segment['label'] + " " + str(segment['score']))
+        mask_str = segment['mask']
+        mask_image = decode_mask(mask_str, original_image.size)
+        color = generate_random_color()
+
+        color_mask = ImageOps.colorize(mask_image, black="black", white=color)
+        color_mask.putalpha(mask_image)
+
+        overlay = Image.alpha_composite(overlay, color_mask)
+
+        # Compute the centroid of the mask (np.where returns row indices, then column indices)
+        ys, xs = np.where(np.array(mask_image) > 0)
+        centroid_x = xs.mean()
+        centroid_y = ys.mean()
+
+        # Draw the label and the score on the text layer
+        font_size = 30
+        draw = ImageDraw.Draw(text_layer)
+        font_path = "/System/Library/Fonts/Arial.ttf"  # Path to Arial font on macOS
+        font = ImageFont.truetype(font_path, font_size)
+        label = segment['label']
+        score = segment['score']
+        text = f"{label}: {score}"
+
+        # Rough hard-coded estimate of the text size
+        text_width = 500
+        text_height = 100
+        draw.text((centroid_x - text_width / 2, centroid_y - text_height / 2), text, fill=(255, 255, 255, 255), font=font)
+
+    # Adjust the transparency of the overlay layer
+    overlay = Image.blend(original_image, overlay, transparency)
+
+    # Combine the overlay layer with the text layer
+    final_image = Image.alpha_composite(overlay, text_layer)
+
+    return final_image
+
+def generate_random_color():
+    return (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+
+def main():
+    image_path = "cats.jpg"
+    response = segment_image(image_path)
+
+    if isinstance(response, list):
+        overlayed_image = overlay_masks_on_image(image_path, response)
+        overlayed_image.show()
+        overlayed_image.save("overlayed_image.png")
+    else:
+        print("Error in segmentation:", response)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-segments.py b/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-segments.py
new file mode 100644
index 0000000..ea58770
--- /dev/null
+++ b/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-segments.py
@@ -0,0 +1,84 @@
+import requests
+from PIL import Image, ImageDraw, ImageOps
+from dotenv import find_dotenv, load_dotenv
+import os
+import base64
+import io
+
+# Load environment variables
+load_dotenv(find_dotenv())
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+API_URL = "https://api-inference.huggingface.co/models/facebook/mask2former-swin-tiny-coco-panoptic"
+headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+def segment_image(image_path):
+    with open(image_path, "rb") as f:
+        data = f.read()
+    response = requests.post(API_URL, headers=headers, data=data)
+    return response.json()
+
+# Unused first attempt: assumes a response format the API does not actually return.
+def draw_segmented_image(image_path, segments):
+    image = Image.open(image_path).convert("RGBA")
+    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
+    draw = ImageDraw.Draw(overlay)
+
+    for segment in segments:
+        mask = Image.open(io.BytesIO(segment['mask']['data']))
+        mask = mask.resize(image.size)
+        color = tuple(segment['color'])
+        mask = mask.convert("L").point(lambda p: p > 128 and 255)
+        overlay = Image.composite(Image.new("RGBA", image.size, color + (128,)), overlay, mask)
+
+    combined = Image.alpha_composite(image, overlay)
+    return combined
+
+def overlay_mask_on_image(original_image, mask_image, color=(255, 0, 0), alpha=0.5):
+    # Create a color version of the mask
+    color_mask = ImageOps.colorize(mask_image, black="black", white=color)
+    # Convert the mask to have an alpha channel
+    color_mask.putalpha(mask_image)
+    # Resize the mask to match the original image size
+    color_mask = color_mask.resize(original_image.size, resample=Image.BILINEAR)
+    # Composite the mask with the original image
+    overlay_image = Image.alpha_composite(original_image.convert("RGBA"), color_mask)
+    return overlay_image
+
+# The inference API returns each mask as a base64-encoded PNG string.
+def decode_mask(mask_str, size):
+    mask_data = base64.b64decode(mask_str)
+    mask_image = Image.open(io.BytesIO(mask_data))
+    mask_image = mask_image.resize(size).convert("L")
+    return mask_image
+
+def decode_and_display_mask(mask_str):
+    mask_data = base64.b64decode(mask_str)
+    mask_image = Image.open(io.BytesIO(mask_data)).convert("L")
+    mask_image.show()  # Display the mask image
+
+image_path = "cats.jpg"
+response = segment_image(image_path)
+
+if isinstance(response, list):  # Check if the result is a list of segments
+    original_image = Image.open(image_path).convert("RGBA")
+    for segment in response:
+        label = segment['label']
+        score = segment['score']
+        mask = segment['mask']
+        print(f"Label: {label}, Score: {score}")
+        # decode_and_display_mask(mask)
+
+        mask_image = decode_mask(mask, original_image.size)
+        overlay_image = overlay_mask_on_image(original_image, mask_image)
+
+        overlay_image.show()  # Display the image with overlay
+        overlay_image.save(f"overlay_{label}.png")
+else:
+    print("Error in segmentation:", response)
+# Earlier attempt, kept for reference:
+# if 'segments' in segments:
+#     segmented_image = draw_segmented_image(image_path, segments['segments'])
+#     segmented_image.show()
+# else:
+#     print("Error in segmentation:", segments)
\ No newline at end of file