From acb7361664388279e33431afe76a8b9398f37d6f Mon Sep 17 00:00:00 2001 From: unidel2035 Date: Sat, 1 Nov 2025 16:12:54 +0000 Subject: [PATCH 1/3] Initial commit with task details for issue #714 Adding CLAUDE.md with task information for AI processing. This file will be removed when the task is complete. Issue: https://github.com/OpenSPG/KAG/issues/714 --- CLAUDE.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..d59a512c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,7 @@ +Issue to solve: undefined +Your prepared branch: issue-714-7cbca4c6 +Your prepared working directory: /tmp/gh-issue-solver-1762012808189 +Your forked repository: unidel2035/KAG +Original repository (upstream): OpenSPG/KAG + +Proceed. \ No newline at end of file From b3a8222e60515d22d04c3deb9103b16c6895c7c2 Mon Sep 17 00:00:00 2001 From: unidel2035 Date: Sat, 1 Nov 2025 16:19:04 +0000 Subject: [PATCH 2/3] fix(builder): handle both dict and string formats for core_entities in knowledge unit extractor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix AttributeError when LLM returns core_entities as dict instead of string. 
The code now handles both formats: - String format (Chinese): "entity1,entity2,entity3" - Dict format (English): {"entity1": "Type1", "entity2": "Type2"} This resolves the issue where knowledge extraction would fail with: AttributeError: 'dict' object has no attribute 'split' Fixes #714 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- experiments/test_core_entities_handling.py | 115 +++++++++++ experiments/test_fix.py | 136 +++++++++++++ .../extractor/knowledge_unit_extractor.py | 22 ++- .../test_knowledge_unit_core_entities.py | 185 ++++++++++++++++++ 4 files changed, 454 insertions(+), 4 deletions(-) create mode 100644 experiments/test_core_entities_handling.py create mode 100644 experiments/test_fix.py create mode 100644 tests/unit/builder/component/test_knowledge_unit_core_entities.py diff --git a/experiments/test_core_entities_handling.py b/experiments/test_core_entities_handling.py new file mode 100644 index 00000000..c3d0875c --- /dev/null +++ b/experiments/test_core_entities_handling.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +""" +Test script to verify the core_entities handling issue +""" + +# Simulate the scenario where core_entities can be either string or dict + +def test_string_format(): + """Test with string format (Chinese example)""" + knowledge_value = { + "content": "test content", + "knowledgetype": "事实性知识", + "core_entities": "火电发电量,同比增长率,2019年" # String format + } + + print("Testing STRING format:") + print(f" core_entities type: {type(knowledge_value.get('core_entities'))}") + print(f" core_entities value: {knowledge_value.get('core_entities')}") + + try: + # This is what the current code does + for item in knowledge_value.get("core_entities", "").split(","): + if not item.strip(): + continue + print(f" - Entity: {item.strip()}") + print(" ✓ SUCCESS: String format works") + except AttributeError as e: + print(f" ✗ FAILED: {e}") + print() + +def test_dict_format(): + """Test with dict format (English 
example)""" + knowledge_value = { + "content": "test content", + "knowledgetype": "Factual Knowledge", + "core_entities": { # Dict format + "T.I.": "Person", + "No Mediocre": "Culture and Entertainment", + "Paperwork": "Culture and Entertainment" + } + } + + print("Testing DICT format (THIS WILL FAIL WITH CURRENT CODE):") + print(f" core_entities type: {type(knowledge_value.get('core_entities'))}") + print(f" core_entities value: {knowledge_value.get('core_entities')}") + + try: + # This is what the current code does - will fail! + for item in knowledge_value.get("core_entities", "").split(","): + if not item.strip(): + continue + print(f" - Entity: {item.strip()}") + print(" ✓ SUCCESS: Dict format works") + except AttributeError as e: + print(f" ✗ FAILED: {e}") + print() + +def test_fixed_approach(): + """Test with fixed approach that handles both formats""" + test_cases = [ + { + "name": "String format", + "knowledge_value": { + "core_entities": "火电发电量,同比增长率,2019年" + } + }, + { + "name": "Dict format", + "knowledge_value": { + "core_entities": { + "T.I.": "Person", + "No Mediocre": "Culture and Entertainment" + } + } + }, + { + "name": "Empty string", + "knowledge_value": { + "core_entities": "" + } + }, + { + "name": "Missing field", + "knowledge_value": {} + } + ] + + print("Testing FIXED approach that handles both formats:") + for test_case in test_cases: + print(f"\n Test: {test_case['name']}") + knowledge_value = test_case['knowledge_value'] + core_entities = {} + + try: + core_entities_raw = knowledge_value.get("core_entities", "") + + # Handle both string and dict formats + if isinstance(core_entities_raw, dict): + # Dict format: use as-is + core_entities = core_entities_raw + elif isinstance(core_entities_raw, str): + # String format: parse comma-separated values + for item in core_entities_raw.split(","): + if item.strip(): + core_entities[item.strip()] = "Others" + + print(f" Parsed entities: {core_entities}") + print(f" ✓ SUCCESS") + except Exception 
as e: + print(f" ✗ FAILED: {e}") + +if __name__ == "__main__": + test_string_format() + test_dict_format() + test_fixed_approach() diff --git a/experiments/test_fix.py b/experiments/test_fix.py new file mode 100644 index 00000000..7daa18ce --- /dev/null +++ b/experiments/test_fix.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +""" +Test script to verify the fix works correctly +""" +import sys +import logging + +# Set up logging +logging.basicConfig(level=logging.WARNING) +logger = logging.getLogger(__name__) + +def simulate_fixed_code(knowledge_value): + """Simulates the fixed code logic""" + core_entities = {} + core_entities_raw = knowledge_value.get("core_entities", "") + + # Handle both string and dict formats for core_entities + if isinstance(core_entities_raw, dict): + # Dict format: {entity_name: entity_type} + core_entities = core_entities_raw + elif isinstance(core_entities_raw, str): + # String format: comma-separated values + for item in core_entities_raw.split(","): + if not item.strip(): + continue + core_entities[item.strip()] = "Others" + else: + # Handle unexpected types gracefully + logger.warning( + f"Unexpected type for core_entities: {type(core_entities_raw)}, " + f"expected str or dict. 
Value: {core_entities_raw}" + ) + + return core_entities + +def test_all_scenarios(): + """Test all possible scenarios""" + test_cases = [ + { + "name": "Chinese format (string)", + "knowledge_value": { + "content": "2019年全国火电发电量51654亿千瓦时", + "knowledgetype": "事实性知识", + "core_entities": "火电发电量,同比增长率,2019年" + }, + "expected": { + "火电发电量": "Others", + "同比增长率": "Others", + "2019年": "Others" + } + }, + { + "name": "English format (dict)", + "knowledge_value": { + "content": "No Mediocre is a song by T.I.", + "knowledgetype": "Factual Knowledge", + "core_entities": { + "T.I.": "Person", + "No Mediocre": "Culture and Entertainment", + "Paperwork": "Culture and Entertainment", + "DJ Mustard": "Person" + } + }, + "expected": { + "T.I.": "Person", + "No Mediocre": "Culture and Entertainment", + "Paperwork": "Culture and Entertainment", + "DJ Mustard": "Person" + } + }, + { + "name": "Empty string", + "knowledge_value": { + "core_entities": "" + }, + "expected": {} + }, + { + "name": "Missing field", + "knowledge_value": {}, + "expected": {} + }, + { + "name": "String with extra spaces", + "knowledge_value": { + "core_entities": " entity1 , entity2 , entity3 " + }, + "expected": { + "entity1": "Others", + "entity2": "Others", + "entity3": "Others" + } + }, + { + "name": "Invalid type (should log warning)", + "knowledge_value": { + "core_entities": 123 + }, + "expected": {} + }, + { + "name": "List type (should log warning)", + "knowledge_value": { + "core_entities": ["entity1", "entity2"] + }, + "expected": {} + } + ] + + all_passed = True + for i, test_case in enumerate(test_cases, 1): + try: + result = simulate_fixed_code(test_case["knowledge_value"]) + expected = test_case["expected"] + + if result == expected: + print(f"✓ Test {i}: {test_case['name']} - PASSED") + else: + print(f"✗ Test {i}: {test_case['name']} - FAILED") + print(f" Expected: {expected}") + print(f" Got: {result}") + all_passed = False + except Exception as e: + print(f"✗ Test {i}: {test_case['name']} - 
EXCEPTION: {e}") + all_passed = False + + print() + if all_passed: + print("🎉 All tests passed!") + return 0 + else: + print("❌ Some tests failed!") + return 1 + +if __name__ == "__main__": + sys.exit(test_all_scenarios()) diff --git a/kag/builder/component/extractor/knowledge_unit_extractor.py b/kag/builder/component/extractor/knowledge_unit_extractor.py index 6d99c03a..7854d424 100644 --- a/kag/builder/component/extractor/knowledge_unit_extractor.py +++ b/kag/builder/component/extractor/knowledge_unit_extractor.py @@ -584,10 +584,24 @@ def triple_to_knowledge_unit(triple): {"name": knowledge_id, "category": "KnowledgeUnit"} ) core_entities = {} - for item in knowledge_value.get("core_entities", "").split(","): - if not item.strip(): - continue - core_entities[item.strip()] = "Others" + core_entities_raw = knowledge_value.get("core_entities", "") + + # Handle both string and dict formats for core_entities + if isinstance(core_entities_raw, dict): + # Dict format: {entity_name: entity_type} + core_entities = core_entities_raw + elif isinstance(core_entities_raw, str): + # String format: comma-separated values + for item in core_entities_raw.split(","): + if not item.strip(): + continue + core_entities[item.strip()] = "Others" + else: + # Handle unexpected types gracefully + logger.warning( + f"Unexpected type for core_entities: {type(core_entities_raw)}, " + f"expected str or dict. Value: {core_entities_raw}" + ) for core_entity, ent_type in core_entities.items(): if core_entity == "": diff --git a/tests/unit/builder/component/test_knowledge_unit_core_entities.py b/tests/unit/builder/component/test_knowledge_unit_core_entities.py new file mode 100644 index 00000000..bd00db1b --- /dev/null +++ b/tests/unit/builder/component/test_knowledge_unit_core_entities.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- +""" +Unit tests for core_entities handling in knowledge unit extractor. 
+Tests fix for issue #714: https://github.com/OpenSPG/KAG/issues/714 +""" +import logging +from unittest.mock import MagicMock +from kag.builder.model.sub_graph import SubGraph +from kag.builder.component.extractor.knowledge_unit_extractor import KnowledgeUnitExtractor + + +def test_core_entities_string_format(): + """Test handling of core_entities in string format (Chinese example)""" + # Create a minimal extractor instance + extractor = _create_minimal_extractor() + + # Simulate knowledge_units with string format + input_knowledge_units = { + "2019年全国火电发电量": { + "content": "2019年全国火电发电量51654亿千瓦时", + "knowledgetype": "事实性知识", + "core_entities": "火电发电量,同比增长率,2019年" # String format + } + } + + sub_graph = SubGraph() + source_entities = [] + triples = [] + + # This should not raise AttributeError + try: + result = extractor.assemble_knowledge_unit( + sub_graph, + source_entities, + input_knowledge_units, + triples + ) + assert isinstance(result, list), "Result should be a list" + assert len(result) > 0, "Should have at least one knowledge unit node" + except AttributeError as e: + if "'dict' object has no attribute 'split'" in str(e): + raise AssertionError(f"String format handling failed: {e}") + raise + + +def test_core_entities_dict_format(): + """Test handling of core_entities in dict format (English example)""" + # Create a minimal extractor instance + extractor = _create_minimal_extractor() + + # Simulate knowledge_units with dict format + input_knowledge_units = { + "No Mediocre Song Details": { + "content": "No Mediocre is a song by T.I.", + "knowledgetype": "Factual Knowledge", + "core_entities": { # Dict format + "T.I.": "Person", + "No Mediocre": "Culture and Entertainment", + "Paperwork": "Culture and Entertainment" + } + } + } + + sub_graph = SubGraph() + source_entities = [] + triples = [] + + # This should not raise AttributeError + try: + result = extractor.assemble_knowledge_unit( + sub_graph, + source_entities, + input_knowledge_units, + triples + ) 
+ assert isinstance(result, list), "Result should be a list" + assert len(result) > 0, "Should have at least one knowledge unit node" + except AttributeError as e: + if "'dict' object has no attribute 'split'" in str(e): + raise AssertionError(f"Dict format handling failed: {e}") + raise + + +def test_core_entities_empty_string(): + """Test handling of empty core_entities""" + extractor = _create_minimal_extractor() + + input_knowledge_units = { + "Test Knowledge": { + "content": "Test content", + "knowledgetype": "Factual Knowledge", + "core_entities": "" # Empty string + } + } + + sub_graph = SubGraph() + source_entities = [] + triples = [] + + # Should handle empty string gracefully + result = extractor.assemble_knowledge_unit( + sub_graph, + source_entities, + input_knowledge_units, + triples + ) + assert isinstance(result, list), "Result should be a list" + + +def test_core_entities_missing_field(): + """Test handling of missing core_entities field""" + extractor = _create_minimal_extractor() + + input_knowledge_units = { + "Test Knowledge": { + "content": "Test content", + "knowledgetype": "Factual Knowledge" + # core_entities field is missing + } + } + + sub_graph = SubGraph() + source_entities = [] + triples = [] + + # Should handle missing field gracefully + result = extractor.assemble_knowledge_unit( + sub_graph, + source_entities, + input_knowledge_units, + triples + ) + assert isinstance(result, list), "Result should be a list" + + +def test_core_entities_invalid_type(caplog): + """Test handling of invalid type for core_entities (should log warning)""" + extractor = _create_minimal_extractor() + + input_knowledge_units = { + "Test Knowledge": { + "content": "Test content", + "knowledgetype": "Factual Knowledge", + "core_entities": 123 # Invalid type + } + } + + sub_graph = SubGraph() + source_entities = [] + triples = [] + + # Should handle invalid type gracefully and log warning + with caplog.at_level(logging.WARNING): + result = 
extractor.assemble_knowledge_unit( + sub_graph, + source_entities, + input_knowledge_units, + triples + ) + assert isinstance(result, list), "Result should be a list" + assert any("Unexpected type for core_entities" in record.message for record in caplog.records), \ + "Should log warning for unexpected type" + + +def _create_minimal_extractor(): + """Create a minimal KnowledgeUnitExtractor instance for testing""" + # Mock the LLM client + mock_llm = MagicMock() + + # Create extractor with minimal configuration + extractor = KnowledgeUnitExtractor( + llm=mock_llm, + ner_prompt=None, + kn_prompt=None, + triple_prompt=None, + external_graph=None + ) + + # Mock the get_stand_schema method + extractor.get_stand_schema = MagicMock(return_value="Others") + + # Mock the assemble_sub_graph_with_spg_properties method + extractor.assemble_sub_graph_with_spg_properties = MagicMock() + + return extractor From fbbff98ca49366b7d1ad4c2f40332fa7ff0f9286 Mon Sep 17 00:00:00 2001 From: unidel2035 Date: Sat, 1 Nov 2025 16:22:44 +0000 Subject: [PATCH 3/3] Revert "Initial commit with task details for issue #714" This reverts commit acb7361664388279e33431afe76a8b9398f37d6f. --- CLAUDE.md | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index d59a512c..00000000 --- a/CLAUDE.md +++ /dev/null @@ -1,7 +0,0 @@ -Issue to solve: undefined -Your prepared branch: issue-714-7cbca4c6 -Your prepared working directory: /tmp/gh-issue-solver-1762012808189 -Your forked repository: unidel2035/KAG -Original repository (upstream): OpenSPG/KAG - -Proceed. \ No newline at end of file