Skip to content

Commit 3f32ecd

Browse files
committed
datasets(signbank): add sign language code
1 parent a81d308 commit 3f32ecd

File tree

1 file changed

+144
-142
lines changed

1 file changed

+144
-142
lines changed

sign_language_datasets/datasets/signbank/signbank.py

Lines changed: 144 additions & 142 deletions
Original file line number | Diff line number | Diff line change
@@ -29,147 +29,147 @@ def is_signwriting(fsw: str) -> bool:
2929

3030

3131
PUDDLES = {
32-
2: ["my", "mm"], # Myanmar Dictionary",
33-
4: ["en", "us"], # Dictionary US",
34-
5: ["en", "us"], # Literature US",
35-
11: ["en", "sg"], # Singapore Sign Language (SgSL) Dictionary",
36-
12: ["zh-CN", "hk"], # Hong Kong Dictionary",
37-
13: ["zh-CN", "hk"], # Hong Kong Literature",
38-
14: ["en", "sg"], # Singapore Sign Language (SgSL) Literature",
39-
16: ["es", "hn"], # Diccionario Honduras",
40-
17: ["", ""], # Deaf Harbor",
41-
18: ["am", "et"], # Dictionary Ethiopia",
42-
19: ["pl", "pl"], # Słownik PL",
43-
20: ["fr", "ch"], # Littérature CH-fr",
44-
21: ["en", "us"], # Encyclopedia US",
45-
22: ["fr", "ch"], # Encyclopédie CH-fr",
46-
23: ["no", "no"], # Litteratur NO",
47-
24: ["no", "no"], # Leksikon NO",
48-
25: ["en", "us"], # LLCN & SignTyp",
49-
26: ["de", "de"], # Literatur DE",
50-
27: ["de", "de"], # Enzyklopädie DE",
51-
28: ["en", "us"], # ASL Bible Dictionary",
52-
29: ["de", "at"], # Wörterbuch AT",
53-
30: ["da", "dk"], # Ordbog Danmark",
54-
31: ["mt", "mt"], # Dictionary Malta",
55-
32: ["en", "ng"], # Dictionary Nigeria",
56-
33: ["pt", "pt"], # Dicionário Portugal",
57-
34: ["th", "th"], # Dictionary Thailand",
58-
35: ["en", "isl"], # Dictionary International",
59-
36: ["cs", "cz"], # Literatura CZ",
60-
37: ["cs", "cz"], # Encyklopedie CZ",
61-
38: ["pl", "pl"], # Literatura PL",
62-
39: ["pl", "pl"], # Encyklopedie PL",
63-
40: ["ar", "sa"], # Dictionary Saudi Arabia",
64-
41: ["es", "ar"], # Diccionario Argentina",
65-
42: ["en", "au"], # Dictionary Australia",
66-
43: ["fr", "be"], # Dictionnaire BE-fr",
67-
44: ["nl", "be"], # Woordenboek Flanders",
68-
45: ["es", "bo"], # Diccionario Bolivia",
69-
46: ["pt", "br"], # Dicionário Brasil",
70-
47: ["fr", "ca"], # Dictionnaire Quebec",
71-
48: ["de", "ch"], # Wörterbuch CH-de",
72-
49: ["fr", "ch"], # Dictionnaire CH-fr",
73-
50: ["it", "ch"], # Dizionario CH-it",
74-
51: ["es", "co"], # Diccionario Colombia",
75-
52: ["sk", "sk"], # Slovník CZ",
76-
53: ["de", "de"], # Wörterbuch DE",
77-
54: ["", ""], # Vortaro",
78-
55: ["es", "es"], # Diccionario España",
79-
56: ["ca", "es"], # Diccionario Catalán",
80-
57: ["fi", "fi"], # Dictionary Finland",
81-
58: ["fr", "fr"], # Dictionnaire FR",
82-
59: ["en", "gb"], # Dictionary Great Britain",
83-
60: ["en", "ie"], # Dictionary Northern Ireland",
84-
61: ["gr", "gr"], # Dictionary Greece",
85-
62: ["en", "ir"], # Dictionary Ireland",
86-
63: ["it", "it"], # Dizionario IT",
87-
64: ["ja", "jp"], # Dictionary Japan",
88-
65: ["es", "mx"], # Diccionario Mexico",
89-
66: ["ms", "my"], # Dictionary Malaysia",
90-
67: ["es", "ni"], # Diccionario Nicaragua",
91-
68: ["nl", "nl"], # Woordenboek NL",
92-
69: ["", ""], # Ordbok NO",
93-
70: ["en", "nz"], # Dictionary New Zealand",
94-
71: ["es", "pe"], # Diccionario Peru",
95-
72: ["fil", "ph"], # Dictionary Philippines",
96-
73: ["sv", "se"], # Ordbok Sverige",
97-
74: ["sl", "sl"], # Slovar Slovenia",
98-
75: ["zh-tw", "tw"], # Dictionary Taiwan",
99-
76: ["es", "ve"], # Diccionario Venezuela",
100-
77: ["en", "za"], # Dictionary South Africa",
101-
78: ["ko", "kr"], # Dictionary Korea",
102-
79: ["sw", "ke"], # Dictionary Kenya",
103-
80: ["", ""], # Project 2 Dictionary Sorting",
104-
81: ["fr", "ca"], # Littérature Quebec",
105-
82: ["sq", "al"], # Dictionary Albania",
106-
83: ["zh-cn", "cn"], # Dictionary China",
107-
84: ["ar", "eg"], # Dictionary Egypt",
108-
85: ["hi", "in"], # Dictionary India",
109-
86: ["ar", "jo"], # Dictionary Jordan",
110-
87: ["ur", "pk"], # Dictionary Pakistan",
111-
88: ["ru", "ru"], # Dictionary Russia",
112-
89: ["sk", "sk"], # Dictionary Slovakia",
113-
90: ["tr", "tr"], # Dictionary Turkey",
114-
91: ["ar", "sa"], # Literature Saudi Arabia",
115-
92: ["ar", "jo"], # Literature Jordan",
116-
93: ["es", "es"], # Literatura España",
117-
94: ["ca", "es"], # Literatura Catalán",
118-
95: ["fr", "be"], # Littérature BE-fr",
119-
96: ["de", "ch"], # Literatur CH-de",
120-
98: ["nl", "be"], # Literatuur Flanders",
121-
99: ["ja", "jp"], # Literature Japan",
122-
100: ["am", "et"], # Literature Ethiopia",
123-
103: ["mt", "mt"], # Malta LSM Private Puddle",
124-
104: ["ar", "tn"], # Dictionnaire Tunisien",
125-
105: ["", ""], # DAC Private Puddle",
126-
106: ["ps", "af"], # Dictionary Afghanistan",
127-
107: ["lt", "lt"], # Dictionary Lithuania",
128-
108: ["lv", "lv"], # Dictionary Latvia",
129-
109: ["et", "et"], # Dictionary Estonia",
130-
110: ["he", "il"], # Dictionary Israel",
131-
111: ["", ""], # Project 1 Translate Wiki",
132-
112: ["es", "gt"], # Dictionary Guatemala",
133-
113: ["ht", "ht"], # Dictionary Haiti",
134-
114: ["pt", "br"], # Literatura Brasil",
135-
115: ["pt", "pt"], # Literatura Portugal",
136-
116: ["pt", "br"], # Enciclopédia Brasil",
137-
117: ["pt", "pt"], # Enciclopédia Portugal",
138-
118: ["da", "dk"], # Litteratur Danmark",
139-
119: ["es", "ni"], # Literatura Nicaragua",
140-
120: ["es", "mx"], # Literatura Mexico",
141-
122: ["hu", "hu"], # Dictionary Hungary",
142-
123: ["hu", "hu"], # Literature Hungary",
143-
124: ["fr", "fr"], # Literature France",
144-
125: ["en", "gb"], # Literature Great Britain",
145-
126: ["ar", "tn"], # Littérature Tunisien",
146-
127: ["mt", "mt"], # Literature Malta",
147-
128: ["mw", "mw"], # Dictionary Malawi",
148-
129: ["gn", "py"], # Diccionario Paraguay",
149-
130: ["uk", "ua"], # Dictionary Ukraine",
150-
131: ["", ""], # Ordabók IS",
151-
132: ["ro", "ro"], # Dictionary Romania",
152-
133: ["ne", "np"], # Dictionary Nepal",
153-
134: ["bg", "bg"], # Dictionary Bulgaria",
154-
135: ["es", "cl"], # Diccionario Chile",
155-
136: ["es", "ec"], # Diccionario Ecuador",
156-
137: ["es", "sv"], # Diccionario El Salvador",
157-
138: ["ro", "ro"], # Literature Romania",
158-
139: ["ro", "ro"], # Encyclopedia Romania",
159-
140: ["fr", "ca"], # Encyclopédie Quebec",
160-
141: ["ru", "ru"], # Literature Russia",
161-
142: ["ru", "ru"], # Encyclopedia Russia",
162-
143: ["es", "uy"], # Diccionario Uruguay",
163-
144: ["es", "uy"], # Literatura Uruguay",
164-
145: ["es", "ar"], # Literatura Argentina",
165-
146: ["es", "ar"], # Enciclopedia Argentina",
166-
147: ["mt", "mt"], # Literature Malta Archive",
167-
148: ["sl", "sl"], # Besedila Slovenia",
168-
149: ["sl", "sl"], # Enciklopedija Slovenia",
169-
150: ["", ""], # Anthropology Book Project",
170-
151: ["en", "us"], # ASL Bible Books NLT",
171-
152: ["en", "us"], # ASL Bible Books Shores Deaf Church",
172-
153: ["vn", "vn"], # Dictionary Vietnam"
32+
2: ["my", "mm", "ysm"], # Myanmar Dictionary
33+
4: ["en", "us", "ase"], # Dictionary US
34+
5: ["en", "us", "ase"], # Literature US
35+
11: ["en", "sg", "sls"], # Singapore Sign Language (SgSL) Dictionary
36+
12: ["zh-CN", "hk", "hks"], # Hong Kong Dictionary
37+
13: ["zh-CN", "hk", "hks"], # Hong Kong Literature
38+
14: ["en", "sg", "sls"], # Singapore Sign Language (SgSL) Literature
39+
16: ["es", "hn", "hds"], # Diccionario Honduras
40+
17: ["en", "us", "ase"], # Deaf Harbor
41+
18: ["am", "et", "eth"], # Dictionary Ethiopia
42+
19: ["pl", "pl", "pso"], # Słownik PL
43+
20: ["fr", "ch", "ssr"], # Littérature CH-fr
44+
21: ["en", "us", "ase"], # Encyclopedia US
45+
22: ["fr", "ch", "ssr"], # Encyclopédie CH-fr
46+
23: ["no", "no", "nsl"], # Litteratur NO
47+
24: ["no", "no", "nsl"], # Leksikon NO
48+
25: ["en", "us", "ase"], # LLCN & SignTyp
49+
26: ["de", "de", "gsg"], # Literatur DE
50+
27: ["de", "de", "gsg"], # Enzyklopädie DE
51+
28: ["en", "us", "ase"], # ASL Bible Dictionary
52+
29: ["de", "at", "asq"], # Wörterbuch AT
53+
30: ["da", "dk", "dsl"], # Ordbog Danmark
54+
31: ["mt", "mt", "mdl"], # Dictionary Malta
55+
32: ["en", "ng", "nsi"], # Dictionary Nigeria
56+
33: ["pt", "pt", "psr"], # Dicionário Portugal
57+
34: ["th", "th", "tsq"], # Dictionary Thailand
58+
35: ["en", "isl", "ase"], # Dictionary International
59+
36: ["cs", "cz", "cse"], # Literatura CZ
60+
37: ["cs", "cz", "cse"], # Encyklopedie CZ
61+
38: ["pl", "pl", "pso"], # Literatura PL
62+
39: ["pl", "pl", "pso"], # Encyklopedie PL
63+
40: ["ar", "sa", "sdl"], # Dictionary Saudi Arabia
64+
41: ["es", "ar", "aed"], # Diccionario Argentina
65+
42: ["en", "au", "asf"], # Dictionary Australia
66+
43: ["fr", "be", "sfb"], # Dictionnaire BE-fr
67+
44: ["nl", "be", "vgt"], # Woordenboek Flanders
68+
45: ["es", "bo", "bvl"], # Diccionario Bolivia
69+
46: ["pt", "br", "bzs"], # Dicionário Brasil
70+
47: ["fr", "ca", "fcs"], # Dictionnaire Quebec
71+
48: ["de", "ch", "sgg"], # Wörterbuch CH-de
72+
49: ["fr", "ch", "ssr"], # Dictionnaire CH-fr
73+
50: ["it", "ch", "slf"], # Dizionario CH-it
74+
51: ["es", "co", "csn"], # Diccionario Colombia
75+
52: ["sk", "sk", "svk"], # Slovník CZ
76+
53: ["de", "de", "gsg"], # Wörterbuch DE
77+
54: ["eo", "", "ase"], # Vortaro (esperanto)
78+
55: ["es", "es", "ssp"], # Diccionario España
79+
56: ["ca", "es", "csc"], # Diccionario Catalán
80+
57: ["fi", "fi", "fse"], # Dictionary Finland
81+
58: ["fr", "fr", "fsl"], # Dictionnaire FR
82+
59: ["en", "gb", "bfi"], # Dictionary Great Britain
83+
60: ["en", "ie", "isg"], # Dictionary Northern Ireland
84+
61: ["gr", "gr", "gss"], # Dictionary Greece
85+
62: ["en", "ir", "psc"], # Dictionary Ireland
86+
63: ["it", "it", "ise"], # Dizionario IT
87+
64: ["ja", "jp", "jsl"], # Dictionary Japan
88+
65: ["es", "mx", "mfs"], # Diccionario Mexico
89+
66: ["ms", "my", "xml"], # Dictionary Malaysia
90+
67: ["es", "ni", "ncs"], # Diccionario Nicaragua
91+
68: ["nl", "nl", "dse"], # Woordenboek NL
92+
69: ["no", "no", "nsl"], # Ordbok NO
93+
70: ["en", "nz", "nzs"], # Dictionary New Zealand
94+
71: ["es", "pe", "prl"], # Diccionario Peru
95+
72: ["fil", "ph", "psp"], # Dictionary Philippines
96+
73: ["sv", "se", "swl"], # Ordbok Sverige
97+
74: ["sl", "sl", "ysl"], # Slovar Slovenia
98+
75: ["zh-TW", "tw", "tss"], # Dictionary Taiwan
99+
76: ["es", "ve", "vsl"], # Diccionario Venezuela
100+
77: ["en", "za", "sfs"], # Dictionary South Africa
101+
78: ["ko", "kr", "kvk"], # Dictionary Korea
102+
79: ["sw", "ke", "xki"], # Dictionary Kenya
103+
80: ["pt", "pt", "psr"], # Project 2 Dictionary Sorting
104+
81: ["fr", "ca", "fcs"], # Littérature Quebec
105+
82: ["sq", "al", "sqk"], # Dictionary Albania
106+
83: ["zh-CN", "cn", "csl"], # Dictionary China
107+
84: ["ar", "eg", "esl"], # Dictionary Egypt
108+
85: ["hi", "in", "ins"], # Dictionary India
109+
86: ["ar", "jo", "jos"], # Dictionary Jordan
110+
87: ["ur", "pk", "pks"], # Dictionary Pakistan
111+
88: ["ru", "ru", "rsl"], # Dictionary Russia
112+
89: ["sk", "sk", "svk"], # Dictionary Slovakia
113+
90: ["tr", "tr", "tsm"], # Dictionary Turkey
114+
91: ["ar", "sa", "sdl"], # Literature Saudi Arabia
115+
92: ["ar", "jo", "jos"], # Literature Jordan
116+
93: ["es", "es", "ssp"], # Literatura España
117+
94: ["ca", "es", "csc"], # Literatura Catalán
118+
95: ["fr", "be", "sfb"], # Littérature BE-fr
119+
96: ["de", "ch", "sgg"], # Literatur CH-de
120+
98: ["nl", "be", "vgt"], # Literatuur Flanders
121+
99: ["ja", "jp", "jsl"], # Literature Japan
122+
100: ["am", "et", "eth"], # Literature Ethiopia
123+
103: ["mt", "mt", "mdl"], # Malta LSM Private Puddle
124+
104: ["ar", "tn", "tse"], # Dictionnaire Tunisien
125+
105: ["en", "us", "ase"], # DAC Private Puddle
126+
106: ["ps", "af", "afg"], # Dictionary Afghanistan
127+
107: ["lt", "lt", "lls"], # Dictionary Lithuania
128+
108: ["lv", "lv", "lsl"], # Dictionary Latvia
129+
109: ["et", "et", "eso"], # Dictionary Estonia
130+
110: ["he", "il", "isr"], # Dictionary Israel
131+
111: ["en", "us", "ase"], # Project 1 Translate Wiki
132+
112: ["es", "gt", "gsm"], # Dictionary Guatemala
133+
113: ["ht", "ht", ""], # Dictionary Haiti
134+
114: ["pt", "br", "bzs"], # Literatura Brasil
135+
115: ["pt", "pt", "psr"], # Literatura Portugal
136+
116: ["pt", "br", "bzs"], # Enciclopédia Brasil
137+
117: ["pt", "pt", "psr"], # Enciclopédia Portugal
138+
118: ["da", "dk", "dsl"], # Litteratur Danmark
139+
119: ["es", "ni", "ncs"], # Literatura Nicaragua
140+
120: ["es", "mx", "mfs"], # Literatura Mexico
141+
122: ["hu", "hu", "hsh"], # Dictionary Hungary
142+
123: ["hu", "hu", "hsh"], # Literature Hungary
143+
124: ["fr", "fr", "fsl"], # Literature France
144+
125: ["en", "gb", "bfi"], # Literature Great Britain
145+
126: ["ar", "tn", "tse"], # Littérature Tunisien
146+
127: ["mt", "mt", "mdl"], # Literature Malta
147+
128: ["mw", "mw", "lws"], # Dictionary Malawi
148+
129: ["gn", "py", "pys"], # Diccionario Paraguay
149+
130: ["uk", "ua", "ukl"], # Dictionary Ukraine
150+
131: ["is", "is", "icl"], # Ordabók IS
151+
132: ["ro", "ro", "rms"], # Dictionary Romania
152+
133: ["ne", "np", "nsp"], # Dictionary Nepal
153+
134: ["bg", "bg", "bqn"], # Dictionary Bulgaria
154+
135: ["es", "cl", "csg"], # Diccionario Chile
155+
136: ["es", "ec", "ecs"], # Diccionario Ecuador
156+
137: ["es", "sv", "esn"], # Diccionario El Salvador
157+
138: ["ro", "ro", "rms"], # Literature Romania
158+
139: ["ro", "ro", "rms"], # Encyclopedia Romania
159+
140: ["fr", "ca", "fcs"], # Encyclopédie Quebec
160+
141: ["ru", "ru", "rsl"], # Literature Russia
161+
142: ["ru", "ru", "rsl"], # Encyclopedia Russia
162+
143: ["es", "uy", "ugy"], # Diccionario Uruguay
163+
144: ["es", "uy", "ugy"], # Literatura Uruguay
164+
145: ["es", "ar", "aed"], # Literatura Argentina
165+
146: ["es", "ar", "aed"], # Enciclopedia Argentina
166+
147: ["mt", "mt", "mdl"], # Literature Malta Archive
167+
148: ["sl", "sl", "ysl"], # Besedila Slovenia
168+
149: ["sl", "sl", "ysl"], # Enciklopedija Slovenia
169+
150: ["", "", ""], # Anthropology Book Project
170+
151: ["en", "us", "ase"], # ASL Bible Books NLT
171+
152: ["en", "us", "ase"], # ASL Bible Books Shores Deaf Church
172+
153: ["vn", "vn", "haf"], # Dictionary Vietnam
173173
}
174174

175175
CACHE_BUSTER = str(datetime.today().date())
@@ -191,6 +191,7 @@ def _info(self) -> tfds.core.DatasetInfo:
191191
"puddle": tf.int32,
192192
"id": tfds.features.Text(),
193193
"assumed_spoken_language_code": tfds.features.Text(),
194+
"sign_language_code": tfds.features.Text(),
194195
"country_code": tfds.features.Text(),
195196
"created_date": tfds.features.Text(),
196197
"modified_date": tfds.features.Text(),
@@ -255,12 +256,13 @@ def _generate_examples(self, spmls: List[str]):
255256
# print(child.attrib)
256257
if len(texts) > 0:
257258
sample_id = "_".join([str(puddle), _id, str(i)])
258-
assumed_spoken_language_code, country_code = PUDDLES[puddle] if puddle in PUDDLES else ["", ""]
259+
assumed_spoken_language_code, country_code, sign_language_code = PUDDLES[puddle] if puddle in PUDDLES else ["", "", ""]
259260
i += 1
260261
yield sample_id, {
261262
"puddle": puddle,
262263
"id": _id,
263264
"assumed_spoken_language_code": assumed_spoken_language_code,
265+
"sign_language_code": sign_language_code,
264266
"country_code": country_code,
265267
"created_date": str(datetime.fromtimestamp(cdt)),
266268
"modified_date": str(datetime.fromtimestamp(mdt)),

0 commit comments

Comments
 (0)