From fc8d20234dedcd22658c8ce7067a3cb843f02a47 Mon Sep 17 00:00:00 2001
From: Manu
Date: Mon, 20 May 2024 11:15:46 +0200
Subject: [PATCH] facebook-swing-tiny-coco image segmentation

---
 .gitignore                                    |   1 +
 .../espnet-tacotron2-notokenbad.py            |   5 +
 .../huggingface-inference-api/coquiai404.py   |  26 +++++
 ...ggingface-spaces-facebook-fastspeech404.py |  26 +++++
 .../huggingface-speak-ng404.py                |  18 +++
 .../huggingface-microsoft-tts5/README.md      |   1 +
 .../{native => native-osx}/pyttsx-native.py   |   0
 .../{native => native-osx}/test_objc.py       |   0
 .../{native => native-osx}/test_pyttsx3.py    |   0
 .../{native => native-osx}/tts_osx_native.py  |   0
 .../transformers-tacotron2-nomodel.py         |  24 ++++
 ...ransformers-wav2vec-nofeatureextractor.py} |   0
 ...-iapi-facebook-swig-tiny-coco-overlayed.py |  90 +++++++++++++++
 ...f-iapi-facebook-swig-tiny-coco-segments.py |  84 ++++++++++++++
 14 files changed, 275 insertions(+)
 create mode 100644 3-experiments/4-text-to-speech/espnet-tacotron2-notokenbad.py
 create mode 100644 3-experiments/4-text-to-speech/huggingface-inference-api/coquiai404.py
 create mode 100644 3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-spaces-facebook-fastspeech404.py
 create mode 100644 3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-speak-ng404.py
 create mode 100644 3-experiments/4-text-to-speech/huggingface-microsoft-tts5/README.md
 rename 3-experiments/4-text-to-speech/{native => native-osx}/pyttsx-native.py (100%)
 rename 3-experiments/4-text-to-speech/{native => native-osx}/test_objc.py (100%)
 rename 3-experiments/4-text-to-speech/{native => native-osx}/test_pyttsx3.py (100%)
 rename 3-experiments/4-text-to-speech/{native => native-osx}/tts_osx_native.py (100%)
 create mode 100644 3-experiments/4-text-to-speech/transformers-tacotron2-nomodel.py
 rename 3-experiments/4-text-to-speech/{huggingface-tts.py => transformers-wav2vec-nofeatureextractor.py} (100%)
 create mode 100644 3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-overlayed.py
 create mode 100644 3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-segments.py

diff --git a/.gitignore b/.gitignore
index 7893d9c..84aca07 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,5 +15,6 @@ old
 *.flac
 *.jpg
 *photo*
+*.png
diff --git a/3-experiments/4-text-to-speech/espnet-tacotron2-notokenbad.py b/3-experiments/4-text-to-speech/espnet-tacotron2-notokenbad.py
new file mode 100644
index 0000000..7eeeea1
--- /dev/null
+++ b/3-experiments/4-text-to-speech/espnet-tacotron2-notokenbad.py
@@ -0,0 +1,5 @@
+from espnet2.bin.tts_inference import Text2Speech
+
+model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space_train.loss.best")
+
+speech, *_ = model("text to generate speech from")
\ No newline at end of file
diff --git a/3-experiments/4-text-to-speech/huggingface-inference-api/coquiai404.py b/3-experiments/4-text-to-speech/huggingface-inference-api/coquiai404.py
new file mode 100644
index 0000000..8dbe137
--- /dev/null
+++ b/3-experiments/4-text-to-speech/huggingface-inference-api/coquiai404.py
@@ -0,0 +1,26 @@
+import requests
+
+API_URL = "https://huggingface.co/spaces/coqui-ai/TTS/+/api/predict"
+headers = {"Content-Type": "application/json"}
+
+def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    response.raise_for_status()  # Raises an exception if the status code is not 200
+    return response.json()
+
+# Payload format for the coqui-ai/TTS space
+payload = {
+    "data": ["The answer to the universe is 42"]
+}
+
+result = query(payload)
+
+# The audio is in result['data'], which contains a list of results.
+# We assume the first result is the desired audio.
+audio_url = result['data'][0]
+audio_bytes = requests.get(audio_url).content
+
+# Save the audio bytes to a .wav file
+with open('output.wav', 'wb') as f:
+    f.write(audio_bytes)
+print("Audio saved to output.wav")
diff --git a/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-spaces-facebook-fastspeech404.py b/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-spaces-facebook-fastspeech404.py
new file mode 100644
index 0000000..42c5f42
--- /dev/null
+++ b/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-spaces-facebook-fastspeech404.py
@@ -0,0 +1,26 @@
+import requests
+
+API_URL = "https://hf.space/embed/facebook/fastspeech2-en-ljspeech/+/api/predict"
+headers = {"Content-Type": "application/json"}
+
+def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    response.raise_for_status()  # Raises an exception if the status code is not 200
+    return response.json()
+
+# Payload format for the fastspeech2 space
+payload = {
+    "data": ["The answer to the universe is 42"]
+}
+
+result = query(payload)
+
+# The audio is in result['data'], which contains a list of results.
+# We assume the first result is the desired audio.
+audio_url = result['data'][0]['name']
+audio_bytes = requests.get(audio_url).content
+
+# Save the audio bytes to a .wav file
+with open('output.wav', 'wb') as f:
+    f.write(audio_bytes)
+print("Audio saved to output.wav")
diff --git a/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-speak-ng404.py b/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-speak-ng404.py
new file mode 100644
index 0000000..c486221
--- /dev/null
+++ b/3-experiments/4-text-to-speech/huggingface-inference-api/huggingface-speak-ng404.py
@@ -0,0 +1,18 @@
+import requests
+
+API_URL = "https://api-inference.huggingface.co/models/espeak-ng"
+headers = {"Content-Type": "application/json"}
+
+def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    response.raise_for_status()  # Raises an exception if the status code is not 200
+    return response.content
+
+audio_bytes = query({
+    "inputs": "The answer to the universe is 42",
+})
+
+# Save the audio bytes to a .wav file
+with open('output.wav', 'wb') as f:
+    f.write(audio_bytes)
+print("Audio saved to output.wav")
diff --git a/3-experiments/4-text-to-speech/huggingface-microsoft-tts5/README.md b/3-experiments/4-text-to-speech/huggingface-microsoft-tts5/README.md
new file mode 100644
index 0000000..017bcdc
--- /dev/null
+++ b/3-experiments/4-text-to-speech/huggingface-microsoft-tts5/README.md
@@ -0,0 +1 @@
+https://huggingface.co/microsoft/speecht5_tts
\ No newline at end of file
diff --git a/3-experiments/4-text-to-speech/native/pyttsx-native.py b/3-experiments/4-text-to-speech/native-osx/pyttsx-native.py
similarity index 100%
rename from 3-experiments/4-text-to-speech/native/pyttsx-native.py
rename to 3-experiments/4-text-to-speech/native-osx/pyttsx-native.py
diff --git a/3-experiments/4-text-to-speech/native/test_objc.py b/3-experiments/4-text-to-speech/native-osx/test_objc.py
similarity index 100%
rename from 3-experiments/4-text-to-speech/native/test_objc.py
rename to 3-experiments/4-text-to-speech/native-osx/test_objc.py
diff --git a/3-experiments/4-text-to-speech/native/test_pyttsx3.py b/3-experiments/4-text-to-speech/native-osx/test_pyttsx3.py
similarity index 100%
rename from 3-experiments/4-text-to-speech/native/test_pyttsx3.py
rename to 3-experiments/4-text-to-speech/native-osx/test_pyttsx3.py
diff --git a/3-experiments/4-text-to-speech/native/tts_osx_native.py b/3-experiments/4-text-to-speech/native-osx/tts_osx_native.py
similarity index 100%
rename from 3-experiments/4-text-to-speech/native/tts_osx_native.py
rename to 3-experiments/4-text-to-speech/native-osx/tts_osx_native.py
diff --git a/3-experiments/4-text-to-speech/transformers-tacotron2-nomodel.py b/3-experiments/4-text-to-speech/transformers-tacotron2-nomodel.py
new file mode 100644
index 0000000..db14b9a
--- /dev/null
+++ b/3-experiments/4-text-to-speech/transformers-tacotron2-nomodel.py
@@ -0,0 +1,24 @@
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+
+cache_dir = ".cache"
+
+# Name of the TTS model
+model_name = "espnet/kan-bayashi_ljspeech_tts_train_tacotron2"
+
+# Load the model and the tokenizer
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_dir)
+tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+
+# Create a text-to-speech pipeline
+text_to_speech = pipeline("text2speech", model=model, tokenizer=tokenizer)
+
+# Example text
+text = "Hello, how are you?"
+
+# Generate the audio
+outputs = text_to_speech(text)
+
+# Save the audio to a file
+with open("output.wav", "wb") as f:
+    f.write(outputs["audio"])
+print("Audio saved to output.wav")
diff --git a/3-experiments/4-text-to-speech/huggingface-tts.py b/3-experiments/4-text-to-speech/transformers-wav2vec-nofeatureextractor.py
similarity index 100%
rename from 3-experiments/4-text-to-speech/huggingface-tts.py
rename to 3-experiments/4-text-to-speech/transformers-wav2vec-nofeatureextractor.py
diff --git a/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-overlayed.py b/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-overlayed.py
new file mode 100644
index 0000000..96919f2
--- /dev/null
+++ b/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-overlayed.py
@@ -0,0 +1,90 @@
+import requests
+from PIL import Image, ImageDraw, ImageOps, ImageFont
+from dotenv import find_dotenv, load_dotenv
+import os
+import base64
+import io
+import random
+import numpy as np
+
+# Load environment variables
+load_dotenv(find_dotenv())
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+API_URL = "https://api-inference.huggingface.co/models/facebook/mask2former-swin-tiny-coco-panoptic"
+headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+def segment_image(image_path):
+    with open(image_path, "rb") as f:
+        data = f.read()
+    response = requests.post(API_URL, headers=headers, data=data)
+    return response.json()
+
+# The inference API returns each mask as a base64-encoded PNG string.
+def decode_mask(mask_str, size):
+    mask_data = base64.b64decode(mask_str)
+    mask_image = Image.open(io.BytesIO(mask_data))
+    mask_image = mask_image.resize(size).convert("L")
+    return mask_image
+
+def overlay_masks_on_image(image_path, segments, transparency=0.4):
+    original_image = Image.open(image_path).convert("RGBA")
+    overlay = Image.new("RGBA", original_image.size, (255, 255, 255, 0))
+
+    # New layer for the label text
+    text_layer = Image.new("RGBA", original_image.size, (255, 255, 255, 0))
+
+    for segment in segments:
+        print(segment['label'] + " " + str(segment['score']))
+        mask_str = segment['mask']
+        mask_image = decode_mask(mask_str, original_image.size)
+        color = generate_random_color()
+
+        color_mask = ImageOps.colorize(mask_image, black="black", white=color)
+        color_mask.putalpha(mask_image)
+
+        overlay = Image.alpha_composite(overlay, color_mask)
+
+        # Compute the centroid of the mask (np.where returns row indices, then column indices)
+        ys, xs = np.where(np.array(mask_image) > 0)
+        centroid_x = xs.mean()
+        centroid_y = ys.mean()
+
+        # Draw the label and the score on the text layer
+        font_size = 30
+        draw = ImageDraw.Draw(text_layer)
+        font_path = "/System/Library/Fonts/Arial.ttf"  # Path to Arial font on macOS
+        font = ImageFont.truetype(font_path, font_size)
+        label = segment['label']
+        score = segment['score']
+        text = f"{label}: {score}"
+
+        # Rough hard-coded estimate of the text size
+        text_width = 500
+        text_height = 100
+        draw.text((centroid_x - text_width / 2, centroid_y - text_height / 2), text, fill=(255, 255, 255, 255), font=font)
+
+    # Adjust the transparency of the overlay layer
+    overlay = Image.blend(original_image, overlay, transparency)
+
+    # Combine the overlay layer with the text layer
+    final_image = Image.alpha_composite(overlay, text_layer)
+
+    return final_image
+
+def generate_random_color():
+    return (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+
+def main():
+    image_path = "cats.jpg"
+    response = segment_image(image_path)
+
+    if isinstance(response, list):
+        overlayed_image = overlay_masks_on_image(image_path, response)
+        overlayed_image.show()
+        overlayed_image.save("overlayed_image.png")
+    else:
+        print("Error in segmentation:", response)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-segments.py b/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-segments.py
new file mode 100644
index 0000000..ea58770
--- /dev/null
+++ b/3-experiments/5-image-segmentation/1-panoptic-segmentation/facebook-swing-tiny-coco/hf-iapi-facebook-swig-tiny-coco-segments.py
@@ -0,0 +1,84 @@
+import requests
+from PIL import Image, ImageDraw, ImageOps
+from dotenv import find_dotenv, load_dotenv
+import os
+import base64
+import io
+
+# Load environment variables
+load_dotenv(find_dotenv())
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+API_URL = "https://api-inference.huggingface.co/models/facebook/mask2former-swin-tiny-coco-panoptic"
+headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+def segment_image(image_path):
+    with open(image_path, "rb") as f:
+        data = f.read()
+    response = requests.post(API_URL, headers=headers, data=data)
+    return response.json()
+
+# Unused first attempt: assumes a response format the API does not actually return.
+def draw_segmented_image(image_path, segments):
+    image = Image.open(image_path).convert("RGBA")
+    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
+    draw = ImageDraw.Draw(overlay)
+
+    for segment in segments:
+        mask = Image.open(io.BytesIO(segment['mask']['data']))
+        mask = mask.resize(image.size)
+        color = tuple(segment['color'])
+        mask = mask.convert("L").point(lambda p: p > 128 and 255)
+        overlay = Image.composite(Image.new("RGBA", image.size, color + (128,)), overlay, mask)
+
+    combined = Image.alpha_composite(image, overlay)
+    return combined
+
+def overlay_mask_on_image(original_image, mask_image, color=(255, 0, 0), alpha=0.5):
+    # Create a color version of the mask
+    color_mask = ImageOps.colorize(mask_image, black="black", white=color)
+    # Convert the mask to have an alpha channel
+    color_mask.putalpha(mask_image)
+    # Resize the mask to match the original image size
+    color_mask = color_mask.resize(original_image.size, resample=Image.BILINEAR)
+    # Composite the mask with the original image
+    overlay_image = Image.alpha_composite(original_image.convert("RGBA"), color_mask)
+    return overlay_image
+
+# The inference API returns each mask as a base64-encoded PNG string.
+def decode_mask(mask_str, size):
+    mask_data = base64.b64decode(mask_str)
+    mask_image = Image.open(io.BytesIO(mask_data))
+    mask_image = mask_image.resize(size).convert("L")
+    return mask_image
+
+def decode_and_display_mask(mask_str):
+    mask_data = base64.b64decode(mask_str)
+    mask_image = Image.open(io.BytesIO(mask_data)).convert("L")
+    mask_image.show()  # Display the mask image
+
+image_path = "cats.jpg"
+response = segment_image(image_path)
+
+if isinstance(response, list):  # Check if the result is a list of segments
+    original_image = Image.open(image_path).convert("RGBA")
+    for segment in response:
+        label = segment['label']
+        score = segment['score']
+        mask = segment['mask']
+        print(f"Label: {label}, Score: {score}")
+        # decode_and_display_mask(mask)
+
+        mask_image = decode_mask(mask, original_image.size)
+        overlay_image = overlay_mask_on_image(original_image, mask_image)
+
+        overlay_image.show()  # Display the image with overlay
+        overlay_image.save(f"overlay_{label}.png")
+else:
+    print("Error in segmentation:", response)
+# Earlier attempt, kept for reference:
+# if 'segments' in segments:
+#     segmented_image = draw_segmented_image(image_path, segments['segments'])
+#     segmented_image.show()
+# else:
+#     print("Error in segmentation:", segments)
\ No newline at end of file