From acb7361664388279e33431afe76a8b9398f37d6f Mon Sep 17 00:00:00 2001
From: unidel2035 <izhkar2012@gmail.com>
Date: Sat, 1 Nov 2025 16:12:54 +0000
Subject: [PATCH 1/3] Initial commit with task details for issue #714

Adding CLAUDE.md with task information for AI processing.
This file will be removed when the task is complete.

Issue: https://github.com/OpenSPG/KAG/issues/714
---
 CLAUDE.md | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 00000000..d59a512c
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,7 @@
+Issue to solve: undefined
+Your prepared branch: issue-714-7cbca4c6
+Your prepared working directory: /tmp/gh-issue-solver-1762012808189
+Your forked repository: unidel2035/KAG
+Original repository (upstream): OpenSPG/KAG
+
+Proceed.
\ No newline at end of file

From b3a8222e60515d22d04c3deb9103b16c6895c7c2 Mon Sep 17 00:00:00 2001
From: unidel2035 <izhkar2012@gmail.com>
Date: Sat, 1 Nov 2025 16:19:04 +0000
Subject: [PATCH 2/3] fix(builder): handle both dict and string formats for
 core_entities in knowledge unit extractor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

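Fix AttributeError when the LLM returns core_entities as a dict instead of a string.
The code now handles both formats:
- String format (Chinese): "entity1,entity2,entity3" (each entity is typed "Others")
- Dict format (English): {"entity1": "Type1", "entity2": "Type2"} (used as-is)

Any other type is logged as a warning and treated as having no core entities.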

This resolves the issue where knowledge extraction would fail with:
AttributeError: 'dict' object has no attribute 'split'

Fixes #714

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 experiments/test_core_entities_handling.py    | 115 +++++++++++
 experiments/test_fix.py                       | 136 +++++++++++++
 .../extractor/knowledge_unit_extractor.py     |  22 ++-
 .../test_knowledge_unit_core_entities.py      | 185 ++++++++++++++++++
 4 files changed, 454 insertions(+), 4 deletions(-)
 create mode 100644 experiments/test_core_entities_handling.py
 create mode 100644 experiments/test_fix.py
 create mode 100644 tests/unit/builder/component/test_knowledge_unit_core_entities.py

diff --git a/experiments/test_core_entities_handling.py b/experiments/test_core_entities_handling.py
new file mode 100644
index 00000000..c3d0875c
--- /dev/null
+++ b/experiments/test_core_entities_handling.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+"""
+Test script to verify the core_entities handling issue
+"""
+
+# Simulate the scenario where core_entities can be either string or dict
+
+def test_string_format():
+    """Test with string format (Chinese example)"""
+    knowledge_value = {
+        "content": "test content",
+        "knowledgetype": "事实性知识",
+        "core_entities": "火电发电量,同比增长率,2019年"  # String format
+    }
+
+    print("Testing STRING format:")
+    print(f"  core_entities type: {type(knowledge_value.get('core_entities'))}")
+    print(f"  core_entities value: {knowledge_value.get('core_entities')}")
+
+    try:
+        # This is what the current code does
+        for item in knowledge_value.get("core_entities", "").split(","):
+            if not item.strip():
+                continue
+            print(f"  - Entity: {item.strip()}")
+        print("  ✓ SUCCESS: String format works")
+    except AttributeError as e:
+        print(f"  ✗ FAILED: {e}")
+    print()
+
+def test_dict_format():
+    """Test with dict format (English example)"""
+    knowledge_value = {
+        "content": "test content",
+        "knowledgetype": "Factual Knowledge",
+        "core_entities": {  # Dict format
+            "T.I.": "Person",
+            "No Mediocre": "Culture and Entertainment",
+            "Paperwork": "Culture and Entertainment"
+        }
+    }
+
+    print("Testing DICT format (THIS WILL FAIL WITH CURRENT CODE):")
+    print(f"  core_entities type: {type(knowledge_value.get('core_entities'))}")
+    print(f"  core_entities value: {knowledge_value.get('core_entities')}")
+
+    try:
+        # This is what the current code does - will fail!
+        for item in knowledge_value.get("core_entities", "").split(","):
+            if not item.strip():
+                continue
+            print(f"  - Entity: {item.strip()}")
+        print("  ✓ SUCCESS: Dict format works")
+    except AttributeError as e:
+        print(f"  ✗ FAILED: {e}")
+    print()
+
+def test_fixed_approach():
+    """Test with fixed approach that handles both formats"""
+    test_cases = [
+        {
+            "name": "String format",
+            "knowledge_value": {
+                "core_entities": "火电发电量,同比增长率,2019年"
+            }
+        },
+        {
+            "name": "Dict format",
+            "knowledge_value": {
+                "core_entities": {
+                    "T.I.": "Person",
+                    "No Mediocre": "Culture and Entertainment"
+                }
+            }
+        },
+        {
+            "name": "Empty string",
+            "knowledge_value": {
+                "core_entities": ""
+            }
+        },
+        {
+            "name": "Missing field",
+            "knowledge_value": {}
+        }
+    ]
+
+    print("Testing FIXED approach that handles both formats:")
+    for test_case in test_cases:
+        print(f"\n  Test: {test_case['name']}")
+        knowledge_value = test_case['knowledge_value']
+        core_entities = {}
+
+        try:
+            core_entities_raw = knowledge_value.get("core_entities", "")
+
+            # Handle both string and dict formats
+            if isinstance(core_entities_raw, dict):
+                # Dict format: use as-is
+                core_entities = core_entities_raw
+            elif isinstance(core_entities_raw, str):
+                # String format: parse comma-separated values
+                for item in core_entities_raw.split(","):
+                    if item.strip():
+                        core_entities[item.strip()] = "Others"
+
+            print(f"    Parsed entities: {core_entities}")
+            print(f"    ✓ SUCCESS")
+        except Exception as e:
+            print(f"    ✗ FAILED: {e}")
+
+if __name__ == "__main__":
+    test_string_format()
+    test_dict_format()
+    test_fixed_approach()
diff --git a/experiments/test_fix.py b/experiments/test_fix.py
new file mode 100644
index 00000000..7daa18ce
--- /dev/null
+++ b/experiments/test_fix.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""
+Test script to verify the fix works correctly
+"""
+import sys
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.WARNING)
+logger = logging.getLogger(__name__)
+
+def simulate_fixed_code(knowledge_value):
+    """Simulates the fixed code logic"""
+    core_entities = {}
+    core_entities_raw = knowledge_value.get("core_entities", "")
+
+    # Handle both string and dict formats for core_entities
+    if isinstance(core_entities_raw, dict):
+        # Dict format: {entity_name: entity_type}
+        core_entities = core_entities_raw
+    elif isinstance(core_entities_raw, str):
+        # String format: comma-separated values
+        for item in core_entities_raw.split(","):
+            if not item.strip():
+                continue
+            core_entities[item.strip()] = "Others"
+    else:
+        # Handle unexpected types gracefully
+        logger.warning(
+            f"Unexpected type for core_entities: {type(core_entities_raw)}, "
+            f"expected str or dict. Value: {core_entities_raw}"
+        )
+
+    return core_entities
+
+def test_all_scenarios():
+    """Test all possible scenarios"""
+    test_cases = [
+        {
+            "name": "Chinese format (string)",
+            "knowledge_value": {
+                "content": "2019年全国火电发电量51654亿千瓦时",
+                "knowledgetype": "事实性知识",
+                "core_entities": "火电发电量,同比增长率,2019年"
+            },
+            "expected": {
+                "火电发电量": "Others",
+                "同比增长率": "Others",
+                "2019年": "Others"
+            }
+        },
+        {
+            "name": "English format (dict)",
+            "knowledge_value": {
+                "content": "No Mediocre is a song by T.I.",
+                "knowledgetype": "Factual Knowledge",
+                "core_entities": {
+                    "T.I.": "Person",
+                    "No Mediocre": "Culture and Entertainment",
+                    "Paperwork": "Culture and Entertainment",
+                    "DJ Mustard": "Person"
+                }
+            },
+            "expected": {
+                "T.I.": "Person",
+                "No Mediocre": "Culture and Entertainment",
+                "Paperwork": "Culture and Entertainment",
+                "DJ Mustard": "Person"
+            }
+        },
+        {
+            "name": "Empty string",
+            "knowledge_value": {
+                "core_entities": ""
+            },
+            "expected": {}
+        },
+        {
+            "name": "Missing field",
+            "knowledge_value": {},
+            "expected": {}
+        },
+        {
+            "name": "String with extra spaces",
+            "knowledge_value": {
+                "core_entities": " entity1 , entity2  ,  entity3  "
+            },
+            "expected": {
+                "entity1": "Others",
+                "entity2": "Others",
+                "entity3": "Others"
+            }
+        },
+        {
+            "name": "Invalid type (should log warning)",
+            "knowledge_value": {
+                "core_entities": 123
+            },
+            "expected": {}
+        },
+        {
+            "name": "List type (should log warning)",
+            "knowledge_value": {
+                "core_entities": ["entity1", "entity2"]
+            },
+            "expected": {}
+        }
+    ]
+
+    all_passed = True
+    for i, test_case in enumerate(test_cases, 1):
+        try:
+            result = simulate_fixed_code(test_case["knowledge_value"])
+            expected = test_case["expected"]
+
+            if result == expected:
+                print(f"✓ Test {i}: {test_case['name']} - PASSED")
+            else:
+                print(f"✗ Test {i}: {test_case['name']} - FAILED")
+                print(f"  Expected: {expected}")
+                print(f"  Got:      {result}")
+                all_passed = False
+        except Exception as e:
+            print(f"✗ Test {i}: {test_case['name']} - EXCEPTION: {e}")
+            all_passed = False
+
+    print()
+    if all_passed:
+        print("🎉 All tests passed!")
+        return 0
+    else:
+        print("❌ Some tests failed!")
+        return 1
+
+if __name__ == "__main__":
+    sys.exit(test_all_scenarios())
diff --git a/kag/builder/component/extractor/knowledge_unit_extractor.py b/kag/builder/component/extractor/knowledge_unit_extractor.py
index 6d99c03a..7854d424 100644
--- a/kag/builder/component/extractor/knowledge_unit_extractor.py
+++ b/kag/builder/component/extractor/knowledge_unit_extractor.py
@@ -584,10 +584,24 @@ def triple_to_knowledge_unit(triple):
                 {"name": knowledge_id, "category": "KnowledgeUnit"}
             )
             core_entities = {}
-            for item in knowledge_value.get("core_entities", "").split(","):
-                if not item.strip():
-                    continue
-                core_entities[item.strip()] = "Others"
+            core_entities_raw = knowledge_value.get("core_entities", "")
+
+            # Handle both string and dict formats for core_entities
+            if isinstance(core_entities_raw, dict):
+                # Dict format: {entity_name: entity_type}
+                core_entities = core_entities_raw
+            elif isinstance(core_entities_raw, str):
+                # String format: comma-separated values
+                for item in core_entities_raw.split(","):
+                    if not item.strip():
+                        continue
+                    core_entities[item.strip()] = "Others"
+            else:
+                # Handle unexpected types gracefully
+                logger.warning(
+                    f"Unexpected type for core_entities: {type(core_entities_raw)}, "
+                    f"expected str or dict. Value: {core_entities_raw}"
+                )
 
             for core_entity, ent_type in core_entities.items():
                 if core_entity == "":
diff --git a/tests/unit/builder/component/test_knowledge_unit_core_entities.py b/tests/unit/builder/component/test_knowledge_unit_core_entities.py
new file mode 100644
index 00000000..bd00db1b
--- /dev/null
+++ b/tests/unit/builder/component/test_knowledge_unit_core_entities.py
@@ -0,0 +1,185 @@
+# -*- coding: utf-8 -*-
+"""
+Unit tests for core_entities handling in knowledge unit extractor.
+Tests fix for issue #714: https://github.com/OpenSPG/KAG/issues/714
+"""
+import logging
+from unittest.mock import MagicMock
+from kag.builder.model.sub_graph import SubGraph
+from kag.builder.component.extractor.knowledge_unit_extractor import KnowledgeUnitExtractor
+
+
+def test_core_entities_string_format():
+    """Test handling of core_entities in string format (Chinese example)"""
+    # Create a minimal extractor instance
+    extractor = _create_minimal_extractor()
+
+    # Simulate knowledge_units with string format
+    input_knowledge_units = {
+        "2019年全国火电发电量": {
+            "content": "2019年全国火电发电量51654亿千瓦时",
+            "knowledgetype": "事实性知识",
+            "core_entities": "火电发电量,同比增长率,2019年"  # String format
+        }
+    }
+
+    sub_graph = SubGraph()
+    source_entities = []
+    triples = []
+
+    # This should not raise AttributeError
+    try:
+        result = extractor.assemble_knowledge_unit(
+            sub_graph,
+            source_entities,
+            input_knowledge_units,
+            triples
+        )
+        assert isinstance(result, list), "Result should be a list"
+        assert len(result) > 0, "Should have at least one knowledge unit node"
+    except AttributeError as e:
+        if "'dict' object has no attribute 'split'" in str(e):
+            raise AssertionError(f"String format handling failed: {e}")
+        raise
+
+
+def test_core_entities_dict_format():
+    """Test handling of core_entities in dict format (English example)"""
+    # Create a minimal extractor instance
+    extractor = _create_minimal_extractor()
+
+    # Simulate knowledge_units with dict format
+    input_knowledge_units = {
+        "No Mediocre Song Details": {
+            "content": "No Mediocre is a song by T.I.",
+            "knowledgetype": "Factual Knowledge",
+            "core_entities": {  # Dict format
+                "T.I.": "Person",
+                "No Mediocre": "Culture and Entertainment",
+                "Paperwork": "Culture and Entertainment"
+            }
+        }
+    }
+
+    sub_graph = SubGraph()
+    source_entities = []
+    triples = []
+
+    # This should not raise AttributeError
+    try:
+        result = extractor.assemble_knowledge_unit(
+            sub_graph,
+            source_entities,
+            input_knowledge_units,
+            triples
+        )
+        assert isinstance(result, list), "Result should be a list"
+        assert len(result) > 0, "Should have at least one knowledge unit node"
+    except AttributeError as e:
+        if "'dict' object has no attribute 'split'" in str(e):
+            raise AssertionError(f"Dict format handling failed: {e}")
+        raise
+
+
+def test_core_entities_empty_string():
+    """Test handling of empty core_entities"""
+    extractor = _create_minimal_extractor()
+
+    input_knowledge_units = {
+        "Test Knowledge": {
+            "content": "Test content",
+            "knowledgetype": "Factual Knowledge",
+            "core_entities": ""  # Empty string
+        }
+    }
+
+    sub_graph = SubGraph()
+    source_entities = []
+    triples = []
+
+    # Should handle empty string gracefully
+    result = extractor.assemble_knowledge_unit(
+        sub_graph,
+        source_entities,
+        input_knowledge_units,
+        triples
+    )
+    assert isinstance(result, list), "Result should be a list"
+
+
+def test_core_entities_missing_field():
+    """Test handling of missing core_entities field"""
+    extractor = _create_minimal_extractor()
+
+    input_knowledge_units = {
+        "Test Knowledge": {
+            "content": "Test content",
+            "knowledgetype": "Factual Knowledge"
+            # core_entities field is missing
+        }
+    }
+
+    sub_graph = SubGraph()
+    source_entities = []
+    triples = []
+
+    # Should handle missing field gracefully
+    result = extractor.assemble_knowledge_unit(
+        sub_graph,
+        source_entities,
+        input_knowledge_units,
+        triples
+    )
+    assert isinstance(result, list), "Result should be a list"
+
+
+def test_core_entities_invalid_type(caplog):
+    """Test handling of invalid type for core_entities (should log warning)"""
+    extractor = _create_minimal_extractor()
+
+    input_knowledge_units = {
+        "Test Knowledge": {
+            "content": "Test content",
+            "knowledgetype": "Factual Knowledge",
+            "core_entities": 123  # Invalid type
+        }
+    }
+
+    sub_graph = SubGraph()
+    source_entities = []
+    triples = []
+
+    # Should handle invalid type gracefully and log warning
+    with caplog.at_level(logging.WARNING):
+        result = extractor.assemble_knowledge_unit(
+            sub_graph,
+            source_entities,
+            input_knowledge_units,
+            triples
+        )
+        assert isinstance(result, list), "Result should be a list"
+        assert any("Unexpected type for core_entities" in record.message for record in caplog.records), \
+            "Should log warning for unexpected type"
+
+
+def _create_minimal_extractor():
+    """Create a minimal KnowledgeUnitExtractor instance for testing"""
+    # Mock the LLM client
+    mock_llm = MagicMock()
+
+    # Create extractor with minimal configuration
+    extractor = KnowledgeUnitExtractor(
+        llm=mock_llm,
+        ner_prompt=None,
+        kn_prompt=None,
+        triple_prompt=None,
+        external_graph=None
+    )
+
+    # Mock the get_stand_schema method
+    extractor.get_stand_schema = MagicMock(return_value="Others")
+
+    # Mock the assemble_sub_graph_with_spg_properties method
+    extractor.assemble_sub_graph_with_spg_properties = MagicMock()
+
+    return extractor

From fbbff98ca49366b7d1ad4c2f40332fa7ff0f9286 Mon Sep 17 00:00:00 2001
From: unidel2035 <izhkar2012@gmail.com>
Date: Sat, 1 Nov 2025 16:22:44 +0000
Subject: [PATCH 3/3] Revert "Initial commit with task details for issue #714"

This reverts commit acb7361664388279e33431afe76a8b9398f37d6f.
---
 CLAUDE.md | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
deleted file mode 100644
index d59a512c..00000000
--- a/CLAUDE.md
+++ /dev/null
@@ -1,7 +0,0 @@
-Issue to solve: undefined
-Your prepared branch: issue-714-7cbca4c6
-Your prepared working directory: /tmp/gh-issue-solver-1762012808189
-Your forked repository: unidel2035/KAG
-Original repository (upstream): OpenSPG/KAG
-
-Proceed.
\ No newline at end of file