Move OpenAI preference tests to a separate file; add more test cases for the OpenAI format

VProv · VProv · commit 7357926dc3ce · 2025-03-11T07:12:07.000-07:00
diff --git a/src/together/utils/files.py b/src/together/utils/files.py
@@ -96,21 +96,9 @@ def check_file(
     return report_dict
 
 
-def _has_weights(messages: List[Dict[str, str | bool]]) -> bool:
-    """Check if any message in the conversation has a weight parameter.
-
-    Args:
-        messages (List[Dict[str, str]]): List of messages to check.
-
-    Returns:
-        bool: True if any message has a weight parameter, False otherwise.
-    """
-    return any("weight" in message for message in messages)
-
-
 def validate_messages(
-    messages: List[Dict[str, str | bool]], idx: int = 0
-) -> tuple[List[Dict[str, str | bool]], bool]:
+    messages: List[Dict[str, str | bool]], idx: int
+) -> None:
     """Validate the messages column."""
     if not isinstance(messages, list):
         raise InvalidFileFormatError(
@@ -127,10 +115,7 @@ def validate_messages(
             error_source="key_value",
         )
 
-    has_weights = False
-    # Check for weights in messages
-    if _has_weights(messages):
-        has_weights = True
+    has_weights = any("weight" in message for message in messages)
 
     previous_role = None
     for message in messages:
@@ -189,10 +174,8 @@ def validate_messages(
             )
         previous_role = message["role"]
 
-    return messages, has_weights
 
-
-def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> Dict[str, Any]:
+def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
     """Validate the OpenAI preference dataset format.
 
     Args:
@@ -201,9 +184,6 @@ def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> Dict[st
 
     Raises:
         InvalidFileFormatError: If the dataset format is invalid.
-
-    Returns:
-        Dict[str, Any]: The validated example.
     """
     if not isinstance(example["input"], dict):
         raise InvalidFileFormatError(
@@ -219,43 +199,38 @@ def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> Dict[st
             error_source="key_value",
         )
 
-    example["input"]["messages"], _ = validate_messages(
-        example["input"]["messages"], idx
-    )
-
-    if not isinstance(example["preferred_output"], list):
-        raise InvalidFileFormatError(
-            message="The dataset is malformed, the `preferred_output` field must be a list.",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
-
-    if not isinstance(example["non_preferred_output"], list):
-        raise InvalidFileFormatError(
-            message="The dataset is malformed, the `non_preferred_output` field must be a list.",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
+    validate_messages(example["input"]["messages"], idx)
 
-    if len(example["preferred_output"]) != 1:
-        raise InvalidFileFormatError(
-            message="The dataset is malformed, the `preferred_output` list must contain exactly one message.",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
+    for output_field in ["preferred_output", "non_preferred_output"]:
+        if not isinstance(example[output_field], list):
+            raise InvalidFileFormatError(
+                message=f"The dataset is malformed, the `{output_field}` field must be a list.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
 
-    if len(example["non_preferred_output"]) != 1:
-        raise InvalidFileFormatError(
-            message="The dataset is malformed, the `non_preferred_output` list must contain exactly one message.",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
+        if len(example[output_field]) != 1:
+            raise InvalidFileFormatError(
+                message=f"The dataset is malformed, the `{output_field}` list must contain exactly one message.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+        if "role" not in example[output_field][0]:
+            raise InvalidFileFormatError(
+                message=f"The dataset is malformed, the `{output_field}` message is missing the `role` field.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+        elif example[output_field][0]["role"] != "assistant":
+            raise InvalidFileFormatError(
+                message=f"The dataset is malformed, the `{output_field}` must contain an assistant message.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+        
 
-    example["preferred_output"], _ = validate_messages(example["preferred_output"], idx)
-    example["non_preferred_output"], _ = validate_messages(
-        example["non_preferred_output"], idx
-    )
-    return example
+    validate_messages(example["preferred_output"], idx)
+    validate_messages(example["non_preferred_output"], idx)
 
 
 def _check_jsonl(file: Path) -> Dict[str, Any]:
@@ -332,9 +307,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                     message_column = JSONL_REQUIRED_COLUMNS_MAP[
                         DatasetFormat.CONVERSATION
                     ][0]
-                    messages, has_weights = validate_messages(
-                        json_line[message_column], idx
-                    )
+                    validate_messages(json_line[message_column], idx)
                 else:
                     for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
                         if not isinstance(json_line[column], str):
diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py
@@ -5,54 +5,6 @@
 from together.constants import MIN_SAMPLES
 from together.utils.files import check_file
 
-_TEST_PREFERENCE_OPENAI_CONTENT = [
-    {
-        "input": {
-            "messages": [
-                {"role": "user", "content": "Hi there, I have a question."},
-                {"role": "assistant", "content": "Hello, how is your day going?"},
-                {
-                    "role": "user",
-                    "content": "Hello, can you tell me how cold San Francisco is today?",
-                },
-            ],
-        },
-        "preferred_output": [
-            {
-                "role": "assistant",
-                "content": "Today in San Francisco, it is not quite cold as expected. Morning clouds will give away "
-                "to sunshine, with a high near 68°F (20°C) and a low around 57°F (14°C).",
-            }
-        ],
-        "non_preferred_output": [
-            {
-                "role": "assistant",
-                "content": "It is not particularly cold in San Francisco today.",
-            }
-        ],
-    },
-    {
-        "input": {
-            "messages": [
-                {
-                    "role": "user",
-                    "content": "What's the best way to learn programming?",
-                },
-            ],
-        },
-        "preferred_output": [
-            {
-                "role": "assistant",
-                "content": "The best way to learn programming is through consistent practice, working on real projects, "
-                "and breaking down complex problems into smaller parts. Start with a beginner-friendly language like Python.",
-            }
-        ],
-        "non_preferred_output": [
-            {"role": "assistant", "content": "Just read some books and you'll be fine."}
-        ],
-    },
-]
-
 
 def test_check_jsonl_valid_general(tmp_path: Path):
     # Create a valid JSONL file
@@ -128,149 +80,45 @@ def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path):
 def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path):
     # Create a valid JSONL file with conversational format and multiple user-assistant turn pairs
     file = tmp_path / "valid_conversational_multiple_turns.jsonl"
-    content = _TEST_PREFERENCE_OPENAI_CONTENT
-    with file.open("w") as f:
-        f.write("\n".join(json.dumps(item) for item in content))
-
-    report = check_file(file)
-
-    assert report["is_check_passed"]
-    assert report["utf8"]
-    assert report["num_samples"] == len(content)
-    assert report["has_min_samples"]
-
-
-def test_check_jsonl_valid_preference_openai(tmp_path: Path):
-    file = tmp_path / "valid_preference_openai.jsonl"
-    content = _TEST_PREFERENCE_OPENAI_CONTENT
-    with file.open("w") as f:
-        f.write("\n".join(json.dumps(item) for item in content))
-
-    report = check_file(file)
-
-    assert report["is_check_passed"]
-    assert report["utf8"]
-    assert report["num_samples"] == len(content)
-    assert report["has_min_samples"]
-
-
-def test_check_jsonl_invalid_preference_openai_missing_fields(tmp_path: Path):
-    # Test all required fields in OpenAI preference format
-    required_fields = [
-        ("input", "Missing input field"),
-        ("preferred_output", "Missing preferred_output field"),
-        ("non_preferred_output", "Missing non_preferred_output field"),
-    ]
-
-    for field_to_remove, description in required_fields:
-        file = tmp_path / f"invalid_preference_openai_missing_{field_to_remove}.jsonl"
-        content = [item.copy() for item in _TEST_PREFERENCE_OPENAI_CONTENT]
-
-        # Remove the specified field from the first item
-        del content[0][field_to_remove]
-
-        with file.open("w") as f:
-            f.write("\n".join(json.dumps(item) for item in content))
-
-        report = check_file(file)
-
-        assert not report["is_check_passed"], f"Test should fail when {description}"
-
-
-def test_check_jsonl_invalid_preference_openai_structural_issues(tmp_path: Path):
-    # Test various structural issues in OpenAI preference format
-    test_cases = [
-        {
-            "name": "empty_messages",
-            "modifier": lambda item: item.update({"input": {"messages": []}}),
-            "description": "Empty messages array",
-        },
-        {
-            "name": "missing_role_preferred",
-            "modifier": lambda item: item.update(
-                {"preferred_output": [{"content": "Missing role field"}]}
-            ),
-            "description": "Missing role in preferred_output",
-        },
-        {
-            "name": "missing_role_non_preferred",
-            "modifier": lambda item: item.update(
-                {"non_preferred_output": [{"content": "Missing role field"}]}
-            ),
-            "description": "Missing role in non_preferred_output",
-        },
-        {
-            "name": "wrong_output_format_preferred",
-            "modifier": lambda item: item.update(
-                {"preferred_output": "Not an array but a string"}
-            ),
-            "description": "Wrong format for preferred_output",
-        },
-        {
-            "name": "wrong_output_format_non_preferred",
-            "modifier": lambda item: item.update(
-                {"non_preferred_output": "Not an array but a string"}
-            ),
-            "description": "Wrong format for non_preferred_output",
-        },
-        {
-            "name": "missing_content",
-            "modifier": lambda item: item.update(
-                {"input": {"messages": [{"role": "user"}]}}
-            ),
-            "description": "Missing content in messages",
-        },
-        {
-            "name": "multiple_preferred_outputs",
-            "modifier": lambda item: item.update(
-                {
-                    "preferred_output": [
-                        {"role": "assistant", "content": "First response"},
-                        {"role": "assistant", "content": "Second response"},
-                    ]
-                }
-            ),
-            "description": "Multiple messages in preferred_output",
-        },
+    content = [
         {
-            "name": "multiple_non_preferred_outputs",
-            "modifier": lambda item: item.update(
+            "messages": [
+                {"role": "user", "content": "Is it going to rain today?"},
                 {
-                    "non_preferred_output": [
-                        {"role": "assistant", "content": "First response"},
-                        {"role": "assistant", "content": "Second response"},
-                    ]
-                }
-            ),
-            "description": "Multiple messages in non_preferred_output",
+                    "role": "assistant",
+                    "content": "Yes, expect showers in the afternoon.",
+                },
+                {"role": "user", "content": "What is the weather like in Tokyo?"},
+                {"role": "assistant", "content": "It is sunny with a chance of rain."},
+            ]
         },
         {
-            "name": "empty_preferred_output",
-            "modifier": lambda item: item.update({"preferred_output": []}),
-            "description": "Empty preferred_output array",
+            "messages": [
+                {"role": "user", "content": "Who won the game last night?"},
+                {"role": "assistant", "content": "The home team won by two points."},
+                {"role": "user", "content": "What is the weather like in Amsterdam?"},
+                {"role": "assistant", "content": "It is cloudy with a chance of snow."},
+            ]
         },
         {
-            "name": "empty_non_preferred_output",
-            "modifier": lambda item: item.update({"non_preferred_output": []}),
-            "description": "Empty non_preferred_output array",
+            "messages": [
+                {"role": "system", "content": "You are a kind AI"},
+                {"role": "user", "content": "Who won the game last night?"},
+                {"role": "assistant", "content": "The home team won by two points."},
+                {"role": "user", "content": "What is the weather like in Amsterdam?"},
+                {"role": "assistant", "content": "It is cloudy with a chance of snow."},
+            ]
         },
     ]
+    with file.open("w") as f:
+        f.write("\n".join(json.dumps(item) for item in content))
 
-    for test_case in test_cases:
-        file = tmp_path / f"invalid_preference_openai_{test_case['name']}.jsonl"
-        content = [item.copy() for item in _TEST_PREFERENCE_OPENAI_CONTENT]
-
-        # Apply the modification to the first item
-        test_case["modifier"](content[0])
-
-        with file.open("w") as f:
-            f.write("\n".join(json.dumps(item) for item in content))
-
-        report = check_file(file)
+    report = check_file(file)
 
-        assert not report[
-            "is_check_passed"
-        ], f"Test should fail with {test_case['description']}"
+    assert report["is_check_passed"]
+    assert report["utf8"]
+    assert report["num_samples"] == len(content)
+    assert report["has_min_samples"]
 
 
 def test_check_jsonl_empty_file(tmp_path: Path):
diff --git a/tests/unit/test_preference_openai.py b/tests/unit/test_preference_openai.py