Skip to content

Commit 5320437

Browse files
committed
Remove PREFERENCE dataset support
1 parent f1f8d9a commit 5320437

File tree

1 file changed

+0
-46
lines changed

1 file changed

+0
-46
lines changed

src/together/utils/files.py

Lines changed: 0 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -328,52 +328,6 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
328328
)
329329
if current_format == DatasetFormat.PREFERENCE_OPENAI:
330330
validate_preference_openai(json_line, idx)
331-
elif current_format == DatasetFormat.PREFERENCE:
332-
for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
333-
if not isinstance(json_line[column], list):
334-
raise InvalidFileFormatError(
335-
message=f"The dataset is malformed, the column `{column}` must be a list.",
336-
line_number=idx + 1,
337-
error_source="key_value",
338-
)
339-
if len(json_line[column]) == 0:
340-
raise InvalidFileFormatError(
341-
message=f"The dataset is malformed, the column `{column}` must not be empty.",
342-
line_number=idx + 1,
343-
error_source="key_value",
344-
)
345-
validate_messages(json_line[column], idx)
346-
if not json_line[column][-1].get("role") == "assistant":
347-
raise InvalidFileFormatError(
348-
message=f"The last message in {column} must be from an assistant",
349-
line_number=idx + 1,
350-
error_source="key_value",
351-
)
352-
# Check that all messages except the last one are the same for "chosen" and "rejected"
353-
chosen_messages = json_line["chosen"]
354-
rejected_messages = json_line["rejected"]
355-
356-
if len(chosen_messages) != len(rejected_messages):
357-
raise InvalidFileFormatError(
358-
message="The 'chosen' and 'rejected' lists must have the same number of messages.",
359-
line_number=idx + 1,
360-
error_source="key_value",
361-
)
362-
363-
# Count discrepancies between messages using a generator
364-
discrepancies = sum(
365-
1
366-
for i in range(len(chosen_messages) - 1)
367-
if chosen_messages[i] != rejected_messages[i]
368-
)
369-
370-
if discrepancies > 1:
371-
raise InvalidFileFormatError(
372-
message=f"Found {discrepancies} different messages between 'chosen' and 'rejected'. "
373-
"Only the last message should differ.",
374-
line_number=idx + 1,
375-
error_source="key_value",
376-
)
377331
elif current_format == DatasetFormat.CONVERSATION:
378332
message_column = JSONL_REQUIRED_COLUMNS_MAP[
379333
DatasetFormat.CONVERSATION

0 commit comments

Comments (0)