Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions experiments/test_core_entities_handling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python3
"""
Test script to verify the core_entities handling issue
"""

# Simulate the scenario where core_entities can be either string or dict

def test_string_format():
    """Demonstrate the legacy parsing loop on a string-valued ``core_entities``.

    Builds a sample record whose ``core_entities`` is a comma-separated
    string, prints its type and value, then runs the legacy
    ``.split(",")`` loop over it and prints every non-blank entity.

    The function only prints; it returns ``None`` and asserts nothing.
    """
    knowledge_value = {
        "content": "test content",
        "knowledgetype": "事实性知识",
        "core_entities": "火电发电量,同比增长率,2019年",  # String format
    }

    print("Testing STRING format:")
    print(f" core_entities type: {type(knowledge_value.get('core_entities'))}")
    print(f" core_entities value: {knowledge_value.get('core_entities')}")

    try:
        # This is what the current code does: it assumes the value is a str.
        for item in knowledge_value.get("core_entities", "").split(","):
            if not item.strip():
                continue
            print(f" - Entity: {item.strip()}")
        print(" ✓ SUCCESS: String format works")
    except AttributeError as e:
        # Raised when the value has no .split(), i.e. it is not a string.
        print(f" ✗ FAILED: {e}")
    print()

def test_dict_format():
    """Demonstrate that the legacy parsing loop breaks on a dict-valued ``core_entities``.

    Builds a sample record whose ``core_entities`` is a
    ``{entity_name: entity_type}`` dict, prints its type and value, then
    runs the legacy ``.split(",")`` loop over it. A dict has no ``split``
    method, so the loop raises ``AttributeError``, which is caught and
    reported as a failure line.

    The function only prints; it returns ``None`` and asserts nothing.
    """
    knowledge_value = {
        "content": "test content",
        "knowledgetype": "Factual Knowledge",
        "core_entities": {  # Dict format
            "T.I.": "Person",
            "No Mediocre": "Culture and Entertainment",
            "Paperwork": "Culture and Entertainment",
        },
    }

    print("Testing DICT format (THIS WILL FAIL WITH CURRENT CODE):")
    print(f" core_entities type: {type(knowledge_value.get('core_entities'))}")
    print(f" core_entities value: {knowledge_value.get('core_entities')}")

    try:
        # This is what the current code does - it will fail because the
        # value is a dict and .split() does not exist on dicts.
        for item in knowledge_value.get("core_entities", "").split(","):
            if not item.strip():
                continue
            print(f" - Entity: {item.strip()}")
        print(" ✓ SUCCESS: Dict format works")
    except AttributeError as e:
        print(f" ✗ FAILED: {e}")
    print()

def test_fixed_approach():
    """Run the str-or-dict parsing logic over sample inputs and print the results.

    Four inputs are exercised: a comma-separated string, a
    ``{entity_name: entity_type}`` dict, an empty string, and a record
    with no ``core_entities`` key at all. For each one the parsed
    ``{entity_name: entity_type}`` mapping is printed; entities that come
    from the string form get the type ``"Others"``.

    The function only prints; it returns ``None`` and asserts nothing.
    """
    test_cases = [
        {
            "name": "String format",
            "knowledge_value": {
                "core_entities": "火电发电量,同比增长率,2019年",
            },
        },
        {
            "name": "Dict format",
            "knowledge_value": {
                "core_entities": {
                    "T.I.": "Person",
                    "No Mediocre": "Culture and Entertainment",
                },
            },
        },
        {
            "name": "Empty string",
            "knowledge_value": {
                "core_entities": "",
            },
        },
        {
            "name": "Missing field",
            "knowledge_value": {},
        },
    ]

    print("Testing FIXED approach that handles both formats:")
    for test_case in test_cases:
        print(f"\n Test: {test_case['name']}")
        knowledge_value = test_case["knowledge_value"]
        core_entities = {}

        try:
            core_entities_raw = knowledge_value.get("core_entities", "")

            # Handle both string and dict formats
            if isinstance(core_entities_raw, dict):
                # Dict format: already {name: type}. Copy it so the parsed
                # result never aliases the input record.
                core_entities = dict(core_entities_raw)
            elif isinstance(core_entities_raw, str):
                # String format: comma-separated names, blanks skipped.
                for item in core_entities_raw.split(","):
                    entity_name = item.strip()
                    if entity_name:
                        core_entities[entity_name] = "Others"

            print(f" Parsed entities: {core_entities}")
            print(" ✓ SUCCESS")
        except Exception as e:
            # Per-case boundary: report the problem and move on to the next case.
            print(f" ✗ FAILED: {e}")

if __name__ == "__main__":
    # Run the three demonstrations in order: legacy loop on a string,
    # legacy loop on a dict, then the str-or-dict approach.
    for run_demo in (test_string_format, test_dict_format, test_fixed_approach):
        run_demo()
136 changes: 136 additions & 0 deletions experiments/test_fix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Test script to verify the fix works correctly
"""
import sys
import logging

# Set up logging
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

def simulate_fixed_code(knowledge_value: dict) -> dict:
    """Normalise ``knowledge_value["core_entities"]`` to ``{entity_name: entity_type}``.

    Args:
        knowledge_value: Mapping that may carry a ``core_entities`` entry.
            The entry is accepted in two shapes:

            * ``dict`` - taken as ``{entity_name: entity_type}`` already.
            * ``str`` - comma-separated entity names; each name is stripped,
              blank names are skipped, and every name gets type ``"Others"``.

            A missing entry is treated like an empty string.

    Returns:
        A new dict mapping entity name to entity type. For any other value
        type a warning is logged and an empty dict is returned.
    """
    core_entities_raw = knowledge_value.get("core_entities", "")

    if isinstance(core_entities_raw, dict):
        # Return a copy: handing back the caller's own dict would let later
        # mutations of the result silently change the input record.
        return dict(core_entities_raw)

    if isinstance(core_entities_raw, str):
        stripped_names = (item.strip() for item in core_entities_raw.split(","))
        return {name: "Others" for name in stripped_names if name}

    # Handle unexpected types gracefully: report and fall back to "no entities".
    logger.warning(
        "Unexpected type for core_entities: %s, expected str or dict. Value: %s",
        type(core_entities_raw),
        core_entities_raw,
    )
    return {}

class _WarningCollector(logging.Handler):
    """Logging handler that keeps every WARNING-or-higher record it receives."""

    def __init__(self):
        super().__init__(level=logging.WARNING)
        self.records = []

    def emit(self, record):
        self.records.append(record)


def test_all_scenarios():
    """Run ``simulate_fixed_code`` over a table of inputs and report the results.

    Each case supplies a ``knowledge_value`` and the ``expected`` parsed
    mapping. Cases flagged with ``expect_warning`` must additionally cause
    a warning on the module ``logger``; all other cases must not log one.
    One PASSED / FAILED / EXCEPTION line is printed per case, followed by
    a summary line.

    Returns:
        ``0`` when every case passed, ``1`` otherwise (usable as a process
        exit code).
    """
    test_cases = [
        {
            "name": "Chinese format (string)",
            "knowledge_value": {
                "content": "2019年全国火电发电量51654亿千瓦时",
                "knowledgetype": "事实性知识",
                "core_entities": "火电发电量,同比增长率,2019年",
            },
            "expected": {
                "火电发电量": "Others",
                "同比增长率": "Others",
                "2019年": "Others",
            },
        },
        {
            "name": "English format (dict)",
            "knowledge_value": {
                "content": "No Mediocre is a song by T.I.",
                "knowledgetype": "Factual Knowledge",
                "core_entities": {
                    "T.I.": "Person",
                    "No Mediocre": "Culture and Entertainment",
                    "Paperwork": "Culture and Entertainment",
                    "DJ Mustard": "Person",
                },
            },
            "expected": {
                "T.I.": "Person",
                "No Mediocre": "Culture and Entertainment",
                "Paperwork": "Culture and Entertainment",
                "DJ Mustard": "Person",
            },
        },
        {
            "name": "Empty string",
            "knowledge_value": {
                "core_entities": "",
            },
            "expected": {},
        },
        {
            "name": "Missing field",
            "knowledge_value": {},
            "expected": {},
        },
        {
            "name": "String with extra spaces",
            "knowledge_value": {
                "core_entities": "  entity1 ,  entity2  , entity3  ",
            },
            "expected": {
                "entity1": "Others",
                "entity2": "Others",
                "entity3": "Others",
            },
        },
        {
            "name": "Invalid type (should log warning)",
            "knowledge_value": {
                "core_entities": 123,
            },
            "expected": {},
            "expect_warning": True,
        },
        {
            "name": "List type (should log warning)",
            "knowledge_value": {
                "core_entities": ["entity1", "entity2"],
            },
            "expected": {},
            "expect_warning": True,
        },
    ]

    # Attach a collector to the module logger so the "(should log warning)"
    # cases are actually verified rather than only named that way.
    collector = _WarningCollector()
    logger.addHandler(collector)
    all_passed = True
    try:
        for i, test_case in enumerate(test_cases, 1):
            name = test_case["name"]
            expected = test_case["expected"]
            expect_warning = test_case.get("expect_warning", False)
            del collector.records[:]  # start each case with a clean slate

            try:
                result = simulate_fixed_code(test_case["knowledge_value"])
            except Exception as e:
                # Per-case boundary: one crashing case must not hide the rest.
                print(f"✗ Test {i}: {name} - EXCEPTION: {e}")
                all_passed = False
                continue

            warned = bool(collector.records)
            if result == expected and warned == expect_warning:
                print(f"✓ Test {i}: {name} - PASSED")
                continue

            print(f"✗ Test {i}: {name} - FAILED")
            if result != expected:
                print(f" Expected: {expected}")
                print(f" Got: {result}")
            if warned != expect_warning:
                print(f" Expected warning: {expect_warning}")
                print(f" Got warning: {warned}")
            all_passed = False
    finally:
        logger.removeHandler(collector)

    print()
    if all_passed:
        print("🎉 All tests passed!")
        return 0
    print("❌ Some tests failed!")
    return 1

if __name__ == "__main__":
    # Exit with the status reported by the scenario run (0 = all passed, 1 = failures).
    raise SystemExit(test_all_scenarios())
22 changes: 18 additions & 4 deletions kag/builder/component/extractor/knowledge_unit_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,10 +584,24 @@ def triple_to_knowledge_unit(triple):
{"name": knowledge_id, "category": "KnowledgeUnit"}
)
core_entities = {}
for item in knowledge_value.get("core_entities", "").split(","):
if not item.strip():
continue
core_entities[item.strip()] = "Others"
core_entities_raw = knowledge_value.get("core_entities", "")

# Handle both string and dict formats for core_entities
if isinstance(core_entities_raw, dict):
# Dict format: {entity_name: entity_type}
core_entities = core_entities_raw
elif isinstance(core_entities_raw, str):
# String format: comma-separated values
for item in core_entities_raw.split(","):
if not item.strip():
continue
core_entities[item.strip()] = "Others"
else:
# Handle unexpected types gracefully
logger.warning(
f"Unexpected type for core_entities: {type(core_entities_raw)}, "
f"expected str or dict. Value: {core_entities_raw}"
)

for core_entity, ent_type in core_entities.items():
if core_entity == "":
Expand Down
Loading