@@ -328,52 +328,6 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
328328 )
329329 if current_format == DatasetFormat .PREFERENCE_OPENAI :
330330 validate_preference_openai (json_line , idx )
331- elif current_format == DatasetFormat .PREFERENCE :
332- for column in JSONL_REQUIRED_COLUMNS_MAP [current_format ]:
333- if not isinstance (json_line [column ], list ):
334- raise InvalidFileFormatError (
335- message = f"The dataset is malformed, the column `{ column } ` must be a list." ,
336- line_number = idx + 1 ,
337- error_source = "key_value" ,
338- )
339- if len (json_line [column ]) == 0 :
340- raise InvalidFileFormatError (
341- message = f"The dataset is malformed, the column `{ column } ` must not be empty." ,
342- line_number = idx + 1 ,
343- error_source = "key_value" ,
344- )
345- validate_messages (json_line [column ], idx )
346- if not json_line [column ][- 1 ].get ("role" ) == "assistant" :
347- raise InvalidFileFormatError (
348- message = f"The last message in { column } must be from an assistant" ,
349- line_number = idx + 1 ,
350- error_source = "key_value" ,
351- )
352- # Check that all messages except the last one are the same for "chosen" and "rejected"
353- chosen_messages = json_line ["chosen" ]
354- rejected_messages = json_line ["rejected" ]
355-
356- if len (chosen_messages ) != len (rejected_messages ):
357- raise InvalidFileFormatError (
358- message = "The 'chosen' and 'rejected' lists must have the same number of messages." ,
359- line_number = idx + 1 ,
360- error_source = "key_value" ,
361- )
362-
363- # Count discrepancies between messages using a generator
364- discrepancies = sum (
365- 1
366- for i in range (len (chosen_messages ) - 1 )
367- if chosen_messages [i ] != rejected_messages [i ]
368- )
369-
370- if discrepancies > 1 :
371- raise InvalidFileFormatError (
372- message = f"Found { discrepancies } different messages between 'chosen' and 'rejected'. "
373- "Only the last message should differ." ,
374- line_number = idx + 1 ,
375- error_source = "key_value" ,
376- )
377331 elif current_format == DatasetFormat .CONVERSATION :
378332 message_column = JSONL_REQUIRED_COLUMNS_MAP [
379333 DatasetFormat .CONVERSATION
0 commit comments