Skip to content

Commit ee470fc

Browse files
committed
Add check that the prompt is the same for the PREFERENCE dataset format
1 parent fdbdc8e commit ee470fc

File tree

1 file changed

+25
-0
lines changed

1 file changed

+25
-0
lines changed

src/together/utils/files.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,31 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
304304
line_number=idx + 1,
305305
error_source="key_value",
306306
)
307+
# Check that all messages except the last one are the same for "chosen" and "rejected"
308+
chosen_messages = json_line["chosen"]
309+
rejected_messages = json_line["rejected"]
310+
311+
if len(chosen_messages) != len(rejected_messages):
312+
raise InvalidFileFormatError(
313+
message="The 'chosen' and 'rejected' lists must have the same number of messages.",
314+
line_number=idx + 1,
315+
error_source="key_value",
316+
)
317+
318+
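# Count the positions, excluding the final message, where "chosen" and "rejected" differ
# NOTE(review): the check below uses `> 1`, which tolerates one differing prompt message - confirm whether `> 0` was intended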
319+
discrepancies = sum(
320+
1
321+
for i in range(len(chosen_messages) - 1)
322+
if chosen_messages[i] != rejected_messages[i]
323+
)
324+
325+
if discrepancies > 1:
326+
raise InvalidFileFormatError(
327+
message=f"Found {discrepancies} different messages between 'chosen' and 'rejected'. "
328+
"Only the last message should differ.",
329+
line_number=idx + 1,
330+
error_source="key_value",
331+
)
307332
elif current_format == DatasetFormat.CONVERSATION:
308333
message_column = JSONL_REQUIRED_COLUMNS_MAP[
309334
DatasetFormat.CONVERSATION

0 commit comments

Comments
 (0)