Skip to content

Commit

Permalink
Merge pull request #54 from neavo/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
neavo authored Aug 30, 2024
2 parents 8f836c0 + f095afc commit dc1a31e
Show file tree
Hide file tree
Showing 12 changed files with 268 additions and 128 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/manaul.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
- name: Install Model
shell: cmd
run: |
.\resource\aria2c.exe https://github.com/neavo/KeywordGachaModel/releases/download/kg_ner_20240819/kg_ner_cpu.zip -o kg_ner_cpu.zip
.\resource\aria2c.exe https://github.com/neavo/KeywordGachaModel/releases/download/kg_ner_20240826/kg_ner_cpu.zip -o kg_ner_cpu.zip
powershell -Command "Expand-Archive -Path 'kg_ner_cpu.zip' -DestinationPath 'dist\KeywordGacha\resource\kg_ner_cpu'"
powershell -Command "Remove-Item -Path 'kg_ner_cpu.zip' -Recurse -Force -ErrorAction SilentlyContinue"
Expand Down Expand Up @@ -71,7 +71,7 @@ jobs:
.\dist\KeywordGacha\env\python.exe -m pip install torch --index-url https://download.pytorch.org/whl/cu121
.\dist\KeywordGacha\env\python.exe -m pip cache purge
.\resource\aria2c.exe https://github.com/neavo/KeywordGachaModel/releases/download/kg_ner_20240819/kg_ner_gpu.zip -o kg_ner_gpu.zip
.\resource\aria2c.exe https://github.com/neavo/KeywordGachaModel/releases/download/kg_ner_20240826/kg_ner_gpu.zip -o kg_ner_gpu.zip
powershell -Command "Expand-Archive -Path 'kg_ner_gpu.zip' -DestinationPath 'dist\KeywordGacha\resource\kg_ner_gpu'"
powershell -Command "Remove-Item -Path 'kg_ner_gpu.zip' -Recurse -Force -ErrorAction SilentlyContinue"
echo > .\dist\KeywordGacha\gpuboost.txt
Expand Down
7 changes: 4 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
__pycache__

/env*
/dist*
/build*
/input*
/output*
__pycache__

/*log*
/debug*
/gpuboost*
/words_dict*
/words_all*
/config_dev*
/resource/kg_ner_*
/resource/*ner_*
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@
- 具体可见 [Wiki - 支持的文件格式](https://github.com/neavo/KeywordGacha/wiki/%E6%94%AF%E6%8C%81%E7%9A%84%E6%96%87%E4%BB%B6%E6%A0%BC%E5%BC%8F)

## 近期更新 📅
- 20240826 v0.4.0
- 新增 - 初步完成对 `韩文` 的支持
- 完全不懂 `韩文`,所以无法评估表现水平
- 寻求懂 `韩文` 的用户协助测试
- 调整 - 优化了 NER 实体识别步骤的执行速度
- `CPU` 和 `GPU` 版本都提速了一倍左右

- 20240820 v0.3.0
- 调整 - NER 模型更新至 20240819
- 调整 - 移除了一些不再需要的步骤以节约处理时间
Expand Down Expand Up @@ -126,7 +133,7 @@
- [X] 添加 对 组织、道具、地域 等其他名词类型的支持
- [X] 添加 对 `英文内容` 的支持
- [X] 添加 对 `中文内容` 的支持
- [ ] 添加 对 `韩文内容` 的支持
- [X] 添加 对 `韩文内容` 的支持
- [ ] 添加 对 `俄文内容` 的支持
- [X] 添加 对 GPU 加速的支持
- [X] 添加 全自动生成模式
Expand Down
76 changes: 52 additions & 24 deletions helper/TestHelper.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
from helper.LogHelper import LogHelper
from helper.TextHelper import TextHelper


class TestHelper:

@staticmethod
def check_duplicates(*args):
a = {
"ブルースライム": "蓝史莱姆",
"ワイバーン": "双足飞龙",
"ミスリル": "秘银",
"ポーション": "药水",
"ハイポーション": "高级药水",
"魔付き": "魔附者",
x = set(
{
# "ブルースライム": "蓝史莱姆",
# "ワイバーン": "双足飞龙",
# "ミスリル": "秘银",
# "ポーション": "药水",
# "ハイポーション": "高级药水",
# "魔付き": "魔附者",
"オルディネ": "奥迪涅",
"イシュラナ": "伊修拉纳",
"エリルキア": "艾利尔齐亚",
Expand Down Expand Up @@ -155,23 +155,51 @@ def check_duplicates(*args):
"ノワルスール": "诺瓦尔苏尔",
"ヴェントルジェント": "文特尔金特",
}
)

b = {}

if len(a) == 0 or len(b) == 0:
return
@staticmethod
def check_score_threshold(words, path):
thresholds = [
0.50,
0.55,
0.60,
0.65,
0.70,
0.75,
0.80,
0.85,
0.90,
0.95,
]

keys_a = set(a.keys())
keys_b = set(b.keys())
with open(path, "w", encoding="utf-8") as writer:
for threshold in thresholds:
y = {
word.surface
for word in words
if word.ner_type == "PER" and word.score > threshold
}

LogHelper.print(f"第一个词典独有的键 - {len(keys_a - keys_b)}")
LogHelper.print(f"{keys_a - keys_b}")
LogHelper.print(f"")
writer.write(f"当置信度阈值设置为 {threshold:.4f} 时:\n")
writer.write(f"第一个词典独有的键 - {len(TestHelper.x - y)}\n")
writer.write(f"{TestHelper.x - y}\n")
writer.write(f"第二个词典独有的键 - {len(y - TestHelper.x)}\n")
writer.write(f"{y - TestHelper.x}\n")
writer.write(f"两个字典共有的键 - {len(TestHelper.x & y)}\n")
writer.write(f"{TestHelper.x & y}\n")
writer.write(f"\n")
writer.write(f"\n")

LogHelper.print(f"第二个词典独有的键 - {len(keys_b - keys_a)}")
LogHelper.print(f"{keys_b - keys_a}")
LogHelper.print(f"")
@staticmethod
def check_result_duplication(words, path):
with open(path, "w", encoding="utf-8") as writer:
y = {word.surface for word in words if word.ner_type == "PER"}

LogHelper.print(f"两个字典共有的键 - {len(keys_a & keys_b)}")
LogHelper.print(f"{keys_a & keys_b}")
LogHelper.print(f"")
writer.write(f"第一个词典独有的键 - {len(TestHelper.x - y)}\n")
writer.write(f"{TestHelper.x - y}\n")
writer.write(f"第二个词典独有的键 - {len(y - TestHelper.x)}\n")
writer.write(f"{y - TestHelper.x}\n")
writer.write(f"两个字典共有的键 - {len(TestHelper.x & y)}\n")
writer.write(f"{TestHelper.x & y}\n")
writer.write(f"\n")
writer.write(f"\n")
50 changes: 50 additions & 0 deletions helper/TextHelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,21 @@ class TextHelper:
# 濁音和半浊音符号
VOICED_SOUND_MARKS = ("\u309B", "\u309C")

# Hangul Jamo block, stored as an inclusive (first, last) code point pair
HANGUL_JAMO = ("\u1100", "\u11FF")

# Hangul Jamo Extended-A block, inclusive (first, last) pair
HANGUL_JAMO_EXTENDED_A = ("\uA960", "\uA97F")

# Hangul Jamo Extended-B block, inclusive (first, last) pair
HANGUL_JAMO_EXTENDED_B = ("\uD7B0", "\uD7FF")

# Hangul Syllables block (precomposed syllables), inclusive (first, last) pair
HANGUL_SYLLABLES = ("\uAC00", "\uD7AF")

# Hangul Compatibility Jamo block, inclusive (first, last) pair
HANGUL_COMPATIBILITY_JAMO = ("\u3130", "\u318F")

# 中日韩统一表意文字
CJK = ("\u4E00", "\u9FFF")

Expand Down Expand Up @@ -247,4 +262,39 @@ def strip_not_latin(text):
while text and not TextHelper.is_latin(text[-1]):
text = text[:-1]

return text.strip()

# Check whether a single character counts as Korean text
@staticmethod
def is_korean(ch):
    """Return True if ``ch`` lies in any range treated as Korean.

    The accepted ranges are the CJK Unified Ideographs range
    (``TextHelper.CJK``) plus the five Hangul ranges declared on
    ``TextHelper``. ``ch`` is expected to be a single character; it is
    compared directly against the range bounds.
    """
    # Same ranges, in the same order, as the original chained comparison.
    korean_ranges = (
        TextHelper.CJK,
        TextHelper.HANGUL_JAMO,
        TextHelper.HANGUL_JAMO_EXTENDED_A,
        TextHelper.HANGUL_JAMO_EXTENDED_B,
        TextHelper.HANGUL_SYLLABLES,
        TextHelper.HANGUL_COMPATIBILITY_JAMO,
    )

    for bounds in korean_ranges:
        if bounds[0] <= ch <= bounds[1]:
            return True

    return False

# Check whether every character of the string is Korean
@staticmethod
def is_all_korean(text):
    """Return True if every character of ``text`` passes ``is_korean``.

    An empty ``text`` yields True, because there is no character that
    fails the check.
    """
    for ch in text:
        if not TextHelper.is_korean(ch):
            return False

    return True

# Check whether the string contains at least one Korean character
@staticmethod
def has_any_korean(text):
    """Return True if at least one character of ``text`` passes ``is_korean``.

    An empty ``text`` yields False.
    """
    for ch in text:
        if TextHelper.is_korean(ch):
            return True

    return False

# Remove non-Korean characters from both ends of the string
@staticmethod
def strip_not_korean(text):
    """Trim leading and trailing characters that are not Korean.

    Surrounding whitespace is stripped first, then characters failing
    ``TextHelper.is_korean`` are dropped from the start and from the end.
    Non-Korean characters in the middle are kept. Returns an empty string
    when ``text`` contains no Korean character at all.
    """
    text = text.strip()

    # Move two indices inward and slice once, instead of re-slicing the
    # string for every removed character (which is quadratic on long input).
    start = 0
    end = len(text)

    while start < end and not TextHelper.is_korean(text[start]):
        start += 1

    while end > start and not TextHelper.is_korean(text[end - 1]):
        end -= 1

    # The trailing strip() is kept so the result matches the original exactly.
    return text[start:end].strip()
Binary file modified image/01.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit dc1a31e

Please sign in to comment.