@@ -138,8 +138,6 @@ def test_dataset_processor_t5_style():
138
138
"<task 0>convert to text2text\n Example:\n foo\n Label:\n " ,
139
139
"<task 0>convert to text2text\n Example:\n bar\n Label:\n " ,
140
140
],
141
- "input_col" : ["foo" , "bar" ],
142
- "output_col" : ["baz" , "qux" ],
143
141
"model_output" : ["baz" , "qux" ],
144
142
}
145
143
),
@@ -149,8 +147,6 @@ def test_dataset_processor_t5_style():
149
147
"<task 0>convert to text2text\n Example:\n foo\n Label:\n " ,
150
148
"<task 0>convert to text2text\n Example:\n bar\n Label:\n " ,
151
149
],
152
- "input_col" : ["foo" , "bar" ],
153
- "output_col" : ["baz" , "qux" ],
154
150
"model_output" : ["baz" , "qux" ],
155
151
}
156
152
),
@@ -164,8 +160,6 @@ def test_dataset_processor_t5_style():
164
160
"<task 1>convert to text2text\n Example:\n spam\n Label:\n " ,
165
161
"<task 1>convert to text2text\n Example:\n eggs\n Label:\n " ,
166
162
],
167
- "input_col" : ["spam" , "eggs" ],
168
- "output_col" : ["ham" , "sau" ],
169
163
"model_output" : ["ham" , "sau" ],
170
164
}
171
165
),
@@ -175,8 +169,6 @@ def test_dataset_processor_t5_style():
175
169
"<task 1>convert to text2text\n Example:\n spam\n Label:\n " ,
176
170
"<task 1>convert to text2text\n Example:\n eggs\n Label:\n " ,
177
171
],
178
- "input_col" : ["spam" , "eggs" ],
179
- "output_col" : ["ham" , "sau" ],
180
172
"model_output" : ["ham" , "sau" ],
181
173
}
182
174
),
@@ -188,6 +180,88 @@ def test_dataset_processor_t5_style():
188
180
gc .collect ()
189
181
190
182
183
def test_dataset_processor_with_numerical_column():
    """Test process_dataset_dict with numerical column values.

    Two dataset dicts are processed by a `TextualizeProcessor` created with
    `has_encoder=True`: the first has string labels in `output_col`, the
    second has integer labels. The processed splits are concatenated across
    the two tasks and compared with the expected dataset dict, in which the
    integer labels appear as strings.
    """
    t5_processor = TextualizeProcessor(has_encoder=True)
    raw_dataset_dicts = [
        # Task 0: labels are already strings.
        datasets.DatasetDict(
            {
                "train": datasets.Dataset.from_dict(
                    {
                        "input_col": ["foo", "bar"],
                        "output_col": ["baz", "qux"],
                    }
                ),
                "test": datasets.Dataset.from_dict(
                    {
                        "input_col": ["spam", "eggs"],
                        "output_col": ["ham", "sau"],
                    }
                ),
            }
        ),
        # Task 1: labels are integers.
        datasets.DatasetDict(
            {
                "train": datasets.Dataset.from_dict(
                    {
                        "input_col": ["foo", "bar"],
                        "output_col": [0, 1],
                    }
                ),
                "test": datasets.Dataset.from_dict(
                    {
                        "input_col": ["spam", "eggs"],
                        "output_col": [1, 2],
                    }
                ),
            }
        ),
    ]
    t5_modified_dataset_dicts = t5_processor.process_dataset_dict(
        INSTRUCTION, raw_dataset_dicts
    )
    expected_dataset_dict = datasets.DatasetDict(
        {
            "train": datasets.Dataset.from_dict(
                {
                    "model_input": [
                        "<task 0>convert to text2text\n Example:\n foo\n Label:\n ",
                        "<task 0>convert to text2text\n Example:\n bar\n Label:\n ",
                        "<task 1>convert to text2text\n Example:\n foo\n Label:\n ",
                        "<task 1>convert to text2text\n Example:\n bar\n Label:\n ",
                    ],
                    # Task 0 labels come from its `output_col` ("baz", "qux"),
                    # not from its inputs; task 1 labels are the stringified
                    # integers.
                    "model_output": ["baz", "qux", "0", "1"],
                }
            ),
            "test": datasets.Dataset.from_dict(
                {
                    "model_input": [
                        "<task 0>convert to text2text\n Example:\n spam\n Label:\n ",
                        "<task 0>convert to text2text\n Example:\n eggs\n Label:\n ",
                        "<task 1>convert to text2text\n Example:\n spam\n Label:\n ",
                        "<task 1>convert to text2text\n Example:\n eggs\n Label:\n ",
                    ],
                    "model_output": ["ham", "sau", "1", "2"],
                }
            ),
        }
    )
    # Gather each split across tasks so it can be compared as one dataset.
    training_datasets = [
        modified_dataset_dict["train"]
        for modified_dataset_dict in t5_modified_dataset_dicts
    ]
    test_datasets = [
        modified_dataset_dict["test"]
        for modified_dataset_dict in t5_modified_dataset_dicts
    ]

    concatenated_training_dataset = datasets.concatenate_datasets(training_datasets)
    concatenated_test_dataset = datasets.concatenate_datasets(test_datasets)
    actual_dataset_dict = datasets.DatasetDict(
        {"train": concatenated_training_dataset, "test": concatenated_test_dataset}
    )
    # NOTE(review): the helper's return type is not visible from this test.
    # Its result used to be discarded, so a mismatch reported as a bool would
    # pass silently. `is not False` fails in that case and still passes if the
    # helper asserts internally and returns None -- confirm and simplify.
    comparison = are_dataset_dicts_identical(
        expected_dataset_dict, actual_dataset_dict
    )
    assert comparison is not False

    gc.collect()
263
+
264
+
191
265
def test_dataset_processor_decoder_only_style ():
192
266
"""Test the `process_dataset_dict` function of a GPT-type `TextualizeProcessor`."""
193
267
_ , gpt2_tokenizer = create_gpt2_model_and_tokenizer ()
@@ -213,8 +287,6 @@ def test_dataset_processor_decoder_only_style():
213
287
"<task 0>convert to text2text\n Example:\n foo\n Label:\n baz<|endoftext|>" , # noqa: E501
214
288
"<task 0>convert to text2text\n Example:\n bar\n Label:\n qux<|endoftext|>" , # noqa: E501
215
289
],
216
- "input_col" : ["foo" , "bar" ],
217
- "output_col" : ["baz" , "qux" ],
218
290
"model_output" : ["baz<|endoftext|>" , "qux<|endoftext|>" ],
219
291
}
220
292
),
@@ -224,8 +296,6 @@ def test_dataset_processor_decoder_only_style():
224
296
"<task 0>convert to text2text\n Example:\n foo\n Label:\n " ,
225
297
"<task 0>convert to text2text\n Example:\n bar\n Label:\n " ,
226
298
],
227
- "input_col" : ["foo" , "bar" ],
228
- "output_col" : ["baz" , "qux" ],
229
299
"model_output" : ["baz" , "qux" ],
230
300
}
231
301
),
@@ -239,8 +309,6 @@ def test_dataset_processor_decoder_only_style():
239
309
"<task 1>convert to text2text\n Example:\n spam\n Label:\n ham<|endoftext|>" , # noqa: E501
240
310
"<task 1>convert to text2text\n Example:\n eggs\n Label:\n sau<|endoftext|>" , # noqa: E501
241
311
],
242
- "input_col" : ["spam" , "eggs" ],
243
- "output_col" : ["ham" , "sau" ],
244
312
"model_output" : ["ham<|endoftext|>" , "sau<|endoftext|>" ],
245
313
}
246
314
),
@@ -250,8 +318,6 @@ def test_dataset_processor_decoder_only_style():
250
318
"<task 1>convert to text2text\n Example:\n spam\n Label:\n " ,
251
319
"<task 1>convert to text2text\n Example:\n eggs\n Label:\n " ,
252
320
],
253
- "input_col" : ["spam" , "eggs" ],
254
- "output_col" : ["ham" , "sau" ],
255
321
"model_output" : ["ham" , "sau" ],
256
322
}
257
323
),
@@ -341,8 +407,6 @@ def test_empty_filter_t5_type():
341
407
"model_input" : [
342
408
"<task 0>convert to text2text\n Example:\n test\n Label:\n " ,
343
409
],
344
- "input_col" : ["test" ],
345
- "output_col" : ["key" ],
346
410
"model_output" : ["key" ],
347
411
}
348
412
),
@@ -351,12 +415,6 @@ def test_empty_filter_t5_type():
351
415
"model_input" : [
352
416
"<task 0>convert to text2text\n Example:\n foo\n Label:\n " ,
353
417
],
354
- "input_col" : [
355
- "foo" ,
356
- ],
357
- "output_col" : [
358
- "baz" ,
359
- ],
360
418
"model_output" : [
361
419
"baz" ,
362
420
],
@@ -369,8 +427,6 @@ def test_empty_filter_t5_type():
369
427
"train" : datasets .Dataset .from_dict (
370
428
{
371
429
"model_input" : [],
372
- "input_col" : [],
373
- "output_col" : [],
374
430
"model_output" : [],
375
431
}
376
432
),
@@ -403,8 +459,6 @@ def test_empty_filter_decoder_only_style():
403
459
"model_input" : [
404
460
"<task 0>convert to text2text\n Example:\n test\n Label:\n key<|endoftext|>" , # noqa: E501
405
461
],
406
- "input_col" : ["test" ],
407
- "output_col" : ["key" ],
408
462
"model_output" : ["key<|endoftext|>" ],
409
463
}
410
464
),
@@ -413,8 +467,6 @@ def test_empty_filter_decoder_only_style():
413
467
"model_input" : [
414
468
"<task 0>convert to text2text\n Example:\n foo\n Label:\n " ,
415
469
],
416
- "input_col" : ["foo" ],
417
- "output_col" : ["baz" ],
418
470
"model_output" : ["baz" ],
419
471
}
420
472
),
@@ -425,8 +477,6 @@ def test_empty_filter_decoder_only_style():
425
477
"train" : datasets .Dataset .from_dict (
426
478
{
427
479
"model_input" : [],
428
- "input_col" : [],
429
- "output_col" : [],
430
480
"model_output" : [],
431
481
}
432
482
),
0 commit comments