@@ -29,147 +29,147 @@ def is_signwriting(fsw: str) -> bool:
2929
3030
# Per-puddle metadata, keyed by the integer puddle id.
# Each value is [assumed_spoken_language_code, country_code, sign_language_code],
# in exactly the order the example generator unpacks it. An empty string means
# the code is unknown / not applicable. The trailing comment is the puddle name.
#
# Codes must not carry surrounding whitespace: they are emitted verbatim as
# dataset features, so "en " and "en" would be treated as different languages.
#
# NOTE(review): a few entries look inconsistent and should be confirmed against
# ISO 639 / ISO 3166 before being relied on (values left as found):
#   - 62 (Ireland) uses country "ir" and sign language "psc", which look like
#     the codes for Iran / Persian Sign Language rather than Ireland ("ie"/"isg").
#   - 60 (Northern Ireland) uses country "ie".
#   - 61 uses "gr" and 153 uses "vn" as spoken-language codes; these look like
#     country codes rather than language codes.
PUDDLES = {
    2: ["my", "mm", "ysm"],  # Myanmar Dictionary
    4: ["en", "us", "ase"],  # Dictionary US
    5: ["en", "us", "ase"],  # Literature US
    11: ["en", "sg", "sls"],  # Singapore Sign Language (SgSL) Dictionary
    12: ["zh-CN", "hk", "hks"],  # Hong Kong Dictionary
    13: ["zh-CN", "hk", "hks"],  # Hong Kong Literature
    14: ["en", "sg", "sls"],  # Singapore Sign Language (SgSL) Literature
    16: ["es", "hn", "hds"],  # Diccionario Honduras
    17: ["en", "us", "ase"],  # Deaf Harbor
    18: ["am", "et", "eth"],  # Dictionary Ethiopia
    19: ["pl", "pl", "pso"],  # Słownik PL
    20: ["fr", "ch", "ssr"],  # Littérature CH-fr
    21: ["en", "us", "ase"],  # Encyclopedia US
    22: ["fr", "ch", "ssr"],  # Encyclopédie CH-fr
    23: ["no", "no", "nsl"],  # Litteratur NO
    24: ["no", "no", "nsl"],  # Leksikon NO
    25: ["en", "us", "ase"],  # LLCN & SignTyp
    26: ["de", "de", "gsg"],  # Literatur DE
    27: ["de", "de", "gsg"],  # Enzyklopädie DE
    28: ["en", "us", "ase"],  # ASL Bible Dictionary
    29: ["de", "at", "asq"],  # Wörterbuch AT
    30: ["da", "dk", "dsl"],  # Ordbog Danmark
    31: ["mt", "mt", "mdl"],  # Dictionary Malta
    32: ["en", "ng", "nsi"],  # Dictionary Nigeria
    33: ["pt", "pt", "psr"],  # Dicionário Portugal
    34: ["th", "th", "tsq"],  # Dictionary Thailand
    35: ["en", "isl", "ase"],  # Dictionary International
    36: ["cs", "cz", "cse"],  # Literatura CZ
    37: ["cs", "cz", "cse"],  # Encyklopedie CZ
    38: ["pl", "pl", "pso"],  # Literatura PL
    39: ["pl", "pl", "pso"],  # Encyklopedie PL
    40: ["ar", "sa", "sdl"],  # Dictionary Saudi Arabia
    41: ["es", "ar", "aed"],  # Diccionario Argentina
    42: ["en", "au", "asf"],  # Dictionary Australia
    43: ["fr", "be", "sfb"],  # Dictionnaire BE-fr
    44: ["nl", "be", "vgt"],  # Woordenboek Flanders
    45: ["es", "bo", "bvl"],  # Diccionario Bolivia
    46: ["pt", "br", "bzs"],  # Dicionário Brasil
    47: ["fr", "ca", "fcs"],  # Dictionnaire Quebec
    48: ["de", "ch", "sgg"],  # Wörterbuch CH-de
    49: ["fr", "ch", "ssr"],  # Dictionnaire CH-fr
    50: ["it", "ch", "slf"],  # Dizionario CH-it
    51: ["es", "co", "csn"],  # Diccionario Colombia
    52: ["sk", "sk", "svk"],  # Slovník CZ
    53: ["de", "de", "gsg"],  # Wörterbuch DE
    54: ["eo", "", "ase"],  # Vortaro (esperanto)
    55: ["es", "es", "ssp"],  # Diccionario España
    56: ["ca", "es", "csc"],  # Diccionario Catalán
    57: ["fi", "fi", "fse"],  # Dictionary Finland
    58: ["fr", "fr", "fsl"],  # Dictionnaire FR
    59: ["en", "gb", "bfi"],  # Dictionary Great Britain
    60: ["en", "ie", "isg"],  # Dictionary Northern Ireland
    61: ["gr", "gr", "gss"],  # Dictionary Greece
    62: ["en", "ir", "psc"],  # Dictionary Ireland
    63: ["it", "it", "ise"],  # Dizionario IT
    64: ["ja", "jp", "jsl"],  # Dictionary Japan
    65: ["es", "mx", "mfs"],  # Diccionario Mexico
    66: ["ms", "my", "xml"],  # Dictionary Malaysia
    67: ["es", "ni", "ncs"],  # Diccionario Nicaragua
    68: ["nl", "nl", "dse"],  # Woordenboek NL
    69: ["no", "no", "nsl"],  # Ordbok NO
    70: ["en", "nz", "nzs"],  # Dictionary New Zealand
    71: ["es", "pe", "prl"],  # Diccionario Peru
    72: ["fil", "ph", "psp"],  # Dictionary Philippines
    73: ["sv", "se", "swl"],  # Ordbok Sverige
    74: ["sl", "sl", "ysl"],  # Slovar Slovenia
    75: ["zh-TW", "tw", "tss"],  # Dictionary Taiwan
    76: ["es", "ve", "vsl"],  # Diccionario Venezuela
    77: ["en", "za", "sfs"],  # Dictionary South Africa
    78: ["ko", "kr", "kvk"],  # Dictionary Korea
    79: ["sw", "ke", "xki"],  # Dictionary Kenya
    80: ["pt", "pt", "psr"],  # Project 2 Dictionary Sorting
    81: ["fr", "ca", "fcs"],  # Littérature Quebec
    82: ["sq", "al", "sqk"],  # Dictionary Albania
    83: ["zh-CN", "cn", "csl"],  # Dictionary China
    84: ["ar", "eg", "esl"],  # Dictionary Egypt
    85: ["hi", "in", "ins"],  # Dictionary India
    86: ["ar", "jo", "jos"],  # Dictionary Jordan
    87: ["ur", "pk", "pks"],  # Dictionary Pakistan
    88: ["ru", "ru", "rsl"],  # Dictionary Russia
    89: ["sk", "sk", "svk"],  # Dictionary Slovakia
    90: ["tr", "tr", "tsm"],  # Dictionary Turkey
    91: ["ar", "sa", "sdl"],  # Literature Saudi Arabia
    92: ["ar", "jo", "jos"],  # Literature Jordan
    93: ["es", "es", "ssp"],  # Literatura España
    94: ["ca", "es", "csc"],  # Literatura Catalán
    95: ["fr", "be", "sfb"],  # Littérature BE-fr
    96: ["de", "ch", "sgg"],  # Literatur CH-de
    98: ["nl", "be", "vgt"],  # Literatuur Flanders
    99: ["ja", "jp", "jsl"],  # Literature Japan
    100: ["am", "et", "eth"],  # Literature Ethiopia
    103: ["mt", "mt", "mdl"],  # Malta LSM Private Puddle
    104: ["ar", "tn", "tse"],  # Dictionnaire Tunisien
    105: ["en", "us", "ase"],  # DAC Private Puddle
    106: ["ps", "af", "afg"],  # Dictionary Afghanistan
    107: ["lt", "lt", "lls"],  # Dictionary Lithuania
    108: ["lv", "lv", "lsl"],  # Dictionary Latvia
    109: ["et", "et", "eso"],  # Dictionary Estonia
    110: ["he", "il", "isr"],  # Dictionary Israel
    111: ["en", "us", "ase"],  # Project 1 Translate Wiki
    112: ["es", "gt", "gsm"],  # Dictionary Guatemala
    113: ["ht", "ht", ""],  # Dictionary Haiti
    114: ["pt", "br", "bzs"],  # Literatura Brasil
    115: ["pt", "pt", "psr"],  # Literatura Portugal
    116: ["pt", "br", "bzs"],  # Enciclopédia Brasil
    117: ["pt", "pt", "psr"],  # Enciclopédia Portugal
    118: ["da", "dk", "dsl"],  # Litteratur Danmark
    119: ["es", "ni", "ncs"],  # Literatura Nicaragua
    120: ["es", "mx", "mfs"],  # Literatura Mexico
    122: ["hu", "hu", "hsh"],  # Dictionary Hungary
    123: ["hu", "hu", "hsh"],  # Literature Hungary
    124: ["fr", "fr", "fsl"],  # Literature France
    125: ["en", "gb", "bfi"],  # Literature Great Britain
    126: ["ar", "tn", "tse"],  # Littérature Tunisien
    127: ["mt", "mt", "mdl"],  # Literature Malta
    128: ["mw", "mw", "lws"],  # Dictionary Malawi
    129: ["gn", "py", "pys"],  # Diccionario Paraguay
    130: ["uk", "ua", "ukl"],  # Dictionary Ukraine
    131: ["is", "is", "icl"],  # Ordabók IS
    132: ["ro", "ro", "rms"],  # Dictionary Romania
    133: ["ne", "np", "nsp"],  # Dictionary Nepal
    134: ["bg", "bg", "bqn"],  # Dictionary Bulgaria
    135: ["es", "cl", "csg"],  # Diccionario Chile
    136: ["es", "ec", "ecs"],  # Diccionario Ecuador
    137: ["es", "sv", "esn"],  # Diccionario El Salvador
    138: ["ro", "ro", "rms"],  # Literature Romania
    139: ["ro", "ro", "rms"],  # Encyclopedia Romania
    140: ["fr", "ca", "fcs"],  # Encyclopédie Quebec
    141: ["ru", "ru", "rsl"],  # Literature Russia
    142: ["ru", "ru", "rsl"],  # Encyclopedia Russia
    143: ["es", "uy", "ugy"],  # Diccionario Uruguay
    144: ["es", "uy", "ugy"],  # Literatura Uruguay
    145: ["es", "ar", "aed"],  # Literatura Argentina
    146: ["es", "ar", "aed"],  # Enciclopedia Argentina
    147: ["mt", "mt", "mdl"],  # Literature Malta Archive
    148: ["sl", "sl", "ysl"],  # Besedila Slovenia
    149: ["sl", "sl", "ysl"],  # Enciklopedija Slovenia
    150: ["", "", ""],  # Anthropology Book Project
    151: ["en", "us", "ase"],  # ASL Bible Books NLT
    152: ["en", "us", "ase"],  # ASL Bible Books Shores Deaf Church
    153: ["vn", "vn", "haf"],  # Dictionary Vietnam
}
174174
175175CACHE_BUSTER = str (datetime .today ().date ())
@@ -191,6 +191,7 @@ def _info(self) -> tfds.core.DatasetInfo:
191191 "puddle" : tf .int32 ,
192192 "id" : tfds .features .Text (),
193193 "assumed_spoken_language_code" : tfds .features .Text (),
194+ "sign_language_code" : tfds .features .Text (),
194195 "country_code" : tfds .features .Text (),
195196 "created_date" : tfds .features .Text (),
196197 "modified_date" : tfds .features .Text (),
@@ -255,12 +256,13 @@ def _generate_examples(self, spmls: List[str]):
255256 # print(child.attrib)
256257 if len (texts ) > 0 :
257258 sample_id = "_" .join ([str (puddle ), _id , str (i )])
258- assumed_spoken_language_code , country_code = PUDDLES [puddle ] if puddle in PUDDLES else ["" , "" ]
259+ assumed_spoken_language_code , country_code , sign_language_code = PUDDLES [puddle ] if puddle in PUDDLES else ["" , "" , "" ]
259260 i += 1
260261 yield sample_id , {
261262 "puddle" : puddle ,
262263 "id" : _id ,
263264 "assumed_spoken_language_code" : assumed_spoken_language_code ,
265+ "sign_language_code" : sign_language_code ,
264266 "country_code" : country_code ,
265267 "created_date" : str (datetime .fromtimestamp (cdt )),
266268 "modified_date" : str (datetime .fromtimestamp (mdt )),
0 commit comments