File tree Expand file tree Collapse file tree 1 file changed +25
-0
lines changed
Expand file tree Collapse file tree 1 file changed +25
-0
lines changed Original file line number Diff line number Diff line change @@ -304,6 +304,31 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
304304 line_number = idx + 1 ,
305305 error_source = "key_value" ,
306306 )
# Every message before the final one is shared context and must be identical
# in "chosen" and "rejected"; only the final message is allowed to differ.
chosen_messages = json_line["chosen"]
rejected_messages = json_line["rejected"]

if len(chosen_messages) != len(rejected_messages):
    raise InvalidFileFormatError(
        message="The 'chosen' and 'rejected' lists must have the same number of messages.",
        line_number=idx + 1,
        error_source="key_value",
    )

# Count mismatches among all messages except the last one.
# NOTE(review): slicing assumes both values are lists -- confirm that the
# earlier validation in this function guarantees it.
discrepancies = sum(
    1
    for chosen_message, rejected_message in zip(
        chosen_messages[:-1], rejected_messages[:-1]
    )
    if chosen_message != rejected_message
)

# The final message is already excluded from the count, so a single mismatch
# is enough to make the line invalid (a `> 1` threshold would let exactly one
# differing non-final message pass validation).
if discrepancies > 0:
    raise InvalidFileFormatError(
        message=f"Found {discrepancies} different messages between 'chosen' and 'rejected'. "
        "Only the last message should differ.",
        line_number=idx + 1,
        error_source="key_value",
    )
307332 elif current_format == DatasetFormat .CONVERSATION :
308333 message_column = JSONL_REQUIRED_COLUMNS_MAP [
309334 DatasetFormat .CONVERSATION
You can’t perform that action at this time.
0 commit comments