Skip to content

Commit ea0b3de

Browse files
committed
fix(datasets): move storage to gcs
1 parent ef81df8 commit ea0b3de

File tree

15 files changed

+743
-754
lines changed

15 files changed

+743
-754
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
setup(
1212
name="sign-language-datasets",
1313
packages=packages,
14-
version="0.1.8",
14+
version="0.2.0",
1515
description="TFDS Datasets for sign language",
1616
author="Amit Moryossef",
1717
author_email="amitmoryossef@gmail.com",

sign_language_datasets/datasets/autsl/autsl.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from tqdm import tqdm
1313

1414
from ..warning import dataset_warning
15-
from ...datasets.config import SignDatasetConfig
15+
from ...datasets.config import SignDatasetConfig, cloud_bucket_file
1616
from ...utils.features import PoseFeature
1717

1818
_DESCRIPTION = """
@@ -44,16 +44,16 @@
4444
_TRAIN_LABELS = "http://158.109.8.102/AuTSL/data/train/train_labels.csv"
4545

4646
_VALID_VIDEOS = "http://158.109.8.102/AuTSL/data/validation/val_set_bjhfy68.zip" # 3 files
47-
_VALID_LABELS = "https://nlp.biu.ac.il/~amit/datasets/public/autsl_validation_labels.csv"
47+
_VALID_LABELS = cloud_bucket_file("public/autsl_validation_labels.csv")
4848

4949
_TEST_VIDEOS = "http://158.109.8.102/AuTSL/data/test/test_set_xsaft57.zip" # 3 files
50-
_TEST_LABELS = "https://nlp.biu.ac.il/~amit/datasets/public/autsl_test_labels.csv"
50+
_TEST_LABELS = cloud_bucket_file("public/autsl_test_labels.csv")
5151

5252
_CLASSES = "https://data.chalearnlap.cvc.uab.cat/AuTSL/data/SignList_ClassId_TR_EN.csv"
5353

5454
_POSE_URLS = {
55-
"holistic": "https://nlp.biu.ac.il/~amit/datasets/poses/holistic/autsl.tar.gz",
56-
"openpose": "https://nlp.biu.ac.il/~amit/datasets/poses/openpose/autsl.tar.gz",
55+
"holistic": cloud_bucket_file("poses/holistic/autsl.tar.gz"),
56+
"openpose": cloud_bucket_file("poses/openpose/autsl.tar.gz")
5757
}
5858
_POSE_HEADERS = {
5959
"holistic": path.join(path.dirname(path.realpath(__file__)), "holistic.poseheader"),

sign_language_datasets/datasets/autsl/checksums.tsv

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,3 @@ http://158.109.8.102/AuTSL/data/train/train_set_vfbha39.zip.018 688988403 4cc
2424
http://158.109.8.102/AuTSL/data/validation/val_set_bjhfy68.zip.001 1048576000 007a96f8e15055e370adaa3b7e8ca9e3b76abd269f677bc33808fc739401ce49 val_set_bjhfy68.zip.001
2525
http://158.109.8.102/AuTSL/data/validation/val_set_bjhfy68.zip.002 1048576000 c138b73f87f952008fe21b18466bc924d539532fd348c9a7e775761c27294591 val_set_bjhfy68.zip.002
2626
http://158.109.8.102/AuTSL/data/validation/val_set_bjhfy68.zip.003 1028644777 ed1549f54ddc5b4cf71d89743888daacde11f28616e52d9b5133bccf2f9fd0b1 val_set_bjhfy68.zip.003
27-
https://nlp.biu.ac.il/~amit/datasets/poses/holistic/autsl.tar.gz 14820092818 4e80233393dcf83c4d98cbde80e7e9bc202bb4d6782f71e343d148ab212b3f9f autsl.tar.gz
28-
https://nlp.biu.ac.il/~amit/datasets/poses/openpose/autsl.tar.gz 1107613563 8678569831453dd9befe3b53a2856e320d052d76f63c5e4c0d45c6add0f9feda autsl.tar.gz
29-
https://nlp.biu.ac.il/~amit/datasets/public/autsl_test_labels.csv 86676 a3fe15717484beb2565fd812aba720afcccb36c43173f891ce31163af9137651 autsl_test_labels.csv
30-
https://nlp.biu.ac.il/~amit/datasets/public/autsl_validation_labels.csv 102264 12e74248fb6199ad41fa58aa85b151966e6da31912ec109b790c6c6d627c773b autsl_validation_labels.csv

sign_language_datasets/datasets/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import cv2
44
import tensorflow_datasets as tfds
55

6+
def cloud_bucket_file(file_name: str) -> str:
7+
escaped_file_name = file_name.replace("/", "%2F")
8+
return f"https://firebasestorage.googleapis.com/v0/b/sign-language-datasets/o/{escaped_file_name}?alt=media"
69

710
class SignDatasetConfig(tfds.core.BuilderConfig):
811
"""General BuilderConfig for sign language datasets."""

sign_language_datasets/datasets/dgs_corpus/create_index.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import re
55
import urllib.request
66

7+
from sign_language_datasets.datasets.config import cloud_bucket_file
8+
79
corpus_path = "https://www.sign-lang.uni-hamburg.de/meinedgs/"
810

911
index_data = {}
@@ -33,12 +35,12 @@
3335

3436
# Add holistic
3537
for c in ["a", "b"]:
36-
holistic_path = "https://nlp.biu.ac.il/~amit/datasets/poses/holistic/dgs_corpus/" + tr_id + "_" + c + ".pose"
38+
holistic_path = cloud_bucket_file(f"poses/holistic/dgs_corpus/{tr_id}_{c}.pose")
3739
index_data[tr_id]["holistic_" + c] = holistic_path if index_data[tr_id]["video_" + c] is not None else None
3840

3941
# Make sure parsing worked
4042
if index_data[tr_id]["openpose"] is not None:
4143
assert index_data[tr_id]["openpose"].endswith(".json.gz")
4244

43-
with open("data.json", "w") as f:
45+
with open("dgs.json", "w") as f:
4446
json.dump(index_data, f)

sign_language_datasets/datasets/dgs_corpus/dgs.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

sign_language_datasets/datasets/dgs_corpus/dgs_corpus.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from pose_format.pose import Pose
2222

2323
from ..warning import dataset_warning
24-
from ...datasets.config import SignDatasetConfig
24+
from ...datasets.config import SignDatasetConfig, cloud_bucket_file
2525
from ...utils.features import PoseFeature
2626

2727
try:
@@ -49,7 +49,7 @@
4949
_HOMEPAGE = "https://www.sign-lang.uni-hamburg.de/meinedgs/"
5050

5151
# This `dgs.json` file was created using `create_index.py`
52-
INDEX_URL = "https://nlp.biu.ac.il/~amit/datasets/dgs.json"
52+
INDEX_PATH = path.join(path.dirname(path.realpath(__file__)), "dgs.json")
5353

5454
_POSE_HEADERS = {
5555
"holistic": path.join(path.dirname(path.realpath(__file__)), "holistic.poseheader"),
@@ -279,9 +279,7 @@ def _split_generators(self, dl_manager: tfds.download.DownloadManager):
279279
"""Returns SplitGenerators."""
280280
dataset_warning(self)
281281

282-
index_path = dl_manager.download(INDEX_URL)
283-
284-
with open(index_path, "r", encoding="utf-8") as f:
282+
with open(INDEX_PATH, "r", encoding="utf-8") as f:
285283
index_data = json.load(f)
286284

287285
# No need to download HTML pages

0 commit comments

Comments (0)