From 6df47e65f971de4fa68246c2052f41f9d69a0191 Mon Sep 17 00:00:00 2001 From: jiangxinfa <425776024@qq.com> Date: Tue, 10 Nov 2020 21:33:49 +0800 Subject: [PATCH] simbert --- nlpcda.egg-info/PKG-INFO | 46 +++++++++++++++++++++++++++++-- nlpcda.egg-info/SOURCES.txt | 5 +++- nlpcda.egg-info/requires.txt | 1 + nlpcda/tools/simbert/generator.py | 5 ---- requirements.txt | 6 ++-- setup.py | 4 +-- 6 files changed, 53 insertions(+), 14 deletions(-) diff --git a/nlpcda.egg-info/PKG-INFO b/nlpcda.egg-info/PKG-INFO index ec8fee3..b107842 100644 --- a/nlpcda.egg-info/PKG-INFO +++ b/nlpcda.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: nlpcda -Version: 2.4.2 +Version: 2.5.1 Summary: NLP Chinese Data Augmentation.一键中文数据增强.NLP数据增强 Home-page: https://github.com/425776024/nlpcda Author: Jiang.XinFa @@ -27,11 +27,11 @@ Description: # NLP Chinese Data Augmentation 一键中文数据增强工具 - 6.随机置换邻近的字:**研表究明,汉字序顺并不定一影响文字的阅读理解**<<是乱序的 - 7.百度中英翻译互转实现的增强 - 8.中文等价字替换(1 一 壹 ①,2 二 贰 ②) + - 9.使用`UniLM`做生成式相似句生成 `经过细节特殊处理,比如不改变年月日数字,尽量保证不改变原文语义。即使改变也能被猜出来、能被猜出来、能被踩出来、能被菜粗来、被菜粗、能菜粗来` ## 计划中的未来内容 - - 使用`UniLM`做生成式相似句生成 - 增加多线程操作,一键操作 - [使用 WordNet数据库 来做同义词替换](http://openkg.cn/dataset/chinese-wordnet) - 随机噪声注入?随机插入一些字符,太简单实现了。 @@ -288,6 +288,46 @@ Description: # NLP Chinese Data Augmentation 一键中文数据增强工具 ``` + ### 9.simbert + [来源:https://github.com/ZhuiyiTechnology/pretrained-models](https://github.com/ZhuiyiTechnology/pretrained-models) + + 下载其中任意模型,解压到任意位置赋值给`model_path`变量: + + | 名称 | 训练数据大小 | 词表大小 | 模型大小 | 下载地址 | + | :----------: |:---------:| :------: | :------: | :------: | + | SimBERT Tiny | 2200万相似句组 | 13685 | 26MB | [百度网盘](https://pan.baidu.com/s/1z_agqTuBTuyHANwrS-gPcg)(1tp7) | + | SimBERT Small | 2200万相似句组 | 13685 | 49MB | [百度网盘](https://pan.baidu.com/s/1kq_EQDI0gpiZBLFd_AxwrA)(nu67) | + | SimBERT Base | 2200万相似句组 | 13685 | 344MB | [百度网盘](https://pan.baidu.com/s/1uGfQmX1Kxcv_cXTVsvxTsQ)(6xhq) | + + 参数: + - config:model_path(上述下载的模型位置),设备(GPU/CPU)、最大长度、随机种子 + - sent:需要增强的句子数量 + - k:增强数据 + - threhold:阈值 + ```python + from nlpcda import Simbert + config = { + 'model_path': '/xxxx/chinese_simbert_L-12_H-768_A-12', + 'device': 'cpu', + 'max_len': 32, + 'seed': 1 + } + simbert = Simbert(config=config) + sent = '把我的一个亿存银行安全吗' + synonyms = simbert.replace(sent=sent, k=5, threhold=0.85) + print(synonyms) + ''' + [('我的一个亿,存银行,安全吗', 0.9871675372123718), + ('把一个亿存到银行里安全吗', 0.9352194666862488), + ('一个亿存银行安全吗', 0.9330801367759705), + ('一个亿的存款存银行安全吗', 0.92387855052948), + ('我的一千万存到银行安不安全', 0.9014463424682617)] + ''' + + + ``` + + ### 添加自定义词典 用于使用之前,增加分词效果 ```python @@ -304,6 +344,6 @@ Description: # NLP Chinese Data Augmentation 一键中文数据增强工具 ``` -Keywords: pip,nlptool,nlpcda,nlp, +Keywords: pip,nlptool,nlpcda,nlp,数据增强 Platform: any Description-Content-Type: text/markdown diff --git a/nlpcda.egg-info/SOURCES.txt b/nlpcda.egg-info/SOURCES.txt index 81d1db0..6137fc4 100644 --- a/nlpcda.egg-info/SOURCES.txt +++ b/nlpcda.egg-info/SOURCES.txt @@ -20,8 +20,11 @@ nlpcda/tools/Homophone.py nlpcda/tools/Ner.py nlpcda/tools/Random_delete_char.py nlpcda/tools/Random_word.py +nlpcda/tools/Simbert.py nlpcda/tools/Similar_word.py nlpcda/tools/Translate.py nlpcda/tools/__init__.py nlpcda/tools/homophone.py -nlpcda/tools/ner.py \ No newline at end of file +nlpcda/tools/ner.py +nlpcda/tools/simbert/__init__.py +nlpcda/tools/simbert/generator.py \ No newline at end of file diff --git a/nlpcda.egg-info/requires.txt b/nlpcda.egg-info/requires.txt index aba7ffe..624037b 100644 --- a/nlpcda.egg-info/requires.txt +++ b/nlpcda.egg-info/requires.txt @@ -1,2 +1,3 @@ jieba requests +bert4keras==0.7.7 diff --git a/nlpcda/tools/simbert/generator.py b/nlpcda/tools/simbert/generator.py index 6d7dda9..e324691 100644 --- a/nlpcda/tools/simbert/generator.py +++ b/nlpcda/tools/simbert/generator.py @@ -10,14 +10,9 @@ def setup_seed(seed): try: import random - import torch import numpy as np - import tensorflow np.random.seed(seed) random.seed(seed) - tensorflow.set_random_seed(seed) - torch.manual_seed(seed) # cpu - torch.backends.cudnn.deterministic = True # cpu/gpu结果一致 except Exception as e: pass diff --git a/requirements.txt b/requirements.txt index 7a5814b..56a1488 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -numpy==1.18.5 requests==2.24.0 -bert4keras==0.9.1 -jieba==0.42.1 +numpy==1.18.5 +bert4keras==0.7.7 +jieba==0.42.1 \ No newline at end of file diff --git a/setup.py b/setup.py index 4ab7234..d8d1298 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ setup( name="nlpcda", - version="2.5.0", + version="2.5.1", keywords=("pip", "nlptool", "nlpcda", "nlp", '数据增强'), description="NLP Chinese Data Augmentation.一键中文数据增强.NLP数据增强", long_description=long_description, @@ -21,5 +21,5 @@ packages=find_packages(), include_package_data=True, platforms="any", - install_requires=['jieba', 'requests', 'bert4keras==0.9.1'] + install_requires=['jieba', 'requests', 'bert4keras==0.7.7'] )