simbert

425776024 · Nov 10, 2020 · 6df47e6 · 6df47e6
1 parent 462ffea
commit 6df47e6
Show file tree

Hide file tree

Showing 6 changed files with 53 additions and 14 deletions.
diff --git a/nlpcda.egg-info/PKG-INFO b/nlpcda.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nlpcda
-Version: 2.4.2
+Version: 2.5.1
 Summary: NLP Chinese Data Augmentation.一键中文数据增强.NLP数据增强
 Home-page: https://github.com/425776024/nlpcda
 Author: Jiang.XinFa
@@ -27,11 +27,11 @@ Description: # NLP Chinese Data Augmentation 一键中文数据增强工具
         - 6.随机置换邻近的字：**研表究明，汉字序顺并不定一影响文字的阅读理解**<<是乱序的
         - 7.百度中英翻译互转实现的增强
         - 8.中文等价字替换（1	一	壹	①，2	二	贰	②）
+        - 9.使用`UniLM`做生成式相似句生成
 
         `经过细节特殊处理，比如不改变年月日数字，尽量保证不改变原文语义。即使改变也能被猜出来、能被猜出来、能被踩出来、能被菜粗来、被菜粗、能菜粗来`
 
         ## 计划中的未来内容
-        - 使用`UniLM`做生成式相似句生成
         - 增加多线程操作，一键操作
         - [使用 WordNet数据库 来做同义词替换](http://openkg.cn/dataset/chinese-wordnet)
         - 随机噪声注入？随机插入一些字符，太简单实现了。
@@ -288,6 +288,46 @@ Description: # NLP Chinese Data Augmentation 一键中文数据增强工具
 
         ```
 
+        ### 9.simbert
+        [来源：https://github.com/ZhuiyiTechnology/pretrained-models](https://github.com/ZhuiyiTechnology/pretrained-models)
+
+        下载其中任意模型并解压到任意位置，然后将解压后的目录路径赋值给`model_path`变量：
+
+        | 名称           | 训练数据大小 | 词表大小 | 模型大小 | 下载地址 |
+        | :----------:  |:---------:| :------: | :------: | :------: |
+        | SimBERT Tiny  | 2200万相似句组  | 13685   | 26MB   | [百度网盘](https://pan.baidu.com/s/1z_agqTuBTuyHANwrS-gPcg)(1tp7) |
+        | SimBERT Small |  2200万相似句组 | 13685  | 49MB  | [百度网盘](https://pan.baidu.com/s/1kq_EQDI0gpiZBLFd_AxwrA)(nu67) |
+        | SimBERT Base  |  2200万相似句组 | 13685  | 344MB | [百度网盘](https://pan.baidu.com/s/1uGfQmX1Kxcv_cXTVsvxTsQ)(6xhq) |
+
+        参数：
+        - config：`model_path`（上述下载的模型位置）、`device`（设备，GPU/CPU）、`max_len`（最大长度）、`seed`（随机种子）
+        - sent：需要增强的句子
+        - k：生成的增强句子数量
+        - threhold：相似度阈值（注意：参数名拼写即为`threhold`）
+        ```python
+        from nlpcda import Simbert
+        config = {
+                'model_path': '/xxxx/chinese_simbert_L-12_H-768_A-12',
+                'device': 'cpu',
+                'max_len': 32,
+                'seed': 1
+        }
+        simbert = Simbert(config=config)
+        sent = '把我的一个亿存银行安全吗'
+        synonyms = simbert.replace(sent=sent, k=5, threhold=0.85)
+        print(synonyms)
+        '''
+        [('我的一个亿，存银行，安全吗', 0.9871675372123718), 
+        ('把一个亿存到银行里安全吗', 0.9352194666862488), 
+        ('一个亿存银行安全吗', 0.9330801367759705), 
+        ('一个亿的存款存银行安全吗', 0.92387855052948),
+         ('我的一千万存到银行安不安全', 0.9014463424682617)]
+        '''
+
+
+        ```
+
+
         ### 添加自定义词典
         用于使用之前，增加分词效果
         ```python
@@ -304,6 +344,6 @@ Description: # NLP Chinese Data Augmentation 一键中文数据增强工具
 
         ```
 
-Keywords: pip,nlptool,nlpcda,nlp,
+Keywords: pip,nlptool,nlpcda,nlp,数据增强
 Platform: any
 Description-Content-Type: text/markdown
diff --git a/nlpcda.egg-info/SOURCES.txt b/nlpcda.egg-info/SOURCES.txt
@@ -20,8 +20,11 @@ nlpcda/tools/Homophone.py
 nlpcda/tools/Ner.py
 nlpcda/tools/Random_delete_char.py
 nlpcda/tools/Random_word.py
+nlpcda/tools/Simbert.py
 nlpcda/tools/Similar_word.py
 nlpcda/tools/Translate.py
 nlpcda/tools/__init__.py
 nlpcda/tools/homophone.py
-nlpcda/tools/ner.py
+nlpcda/tools/ner.py
+nlpcda/tools/simbert/__init__.py
+nlpcda/tools/simbert/generator.py
diff --git a/nlpcda.egg-info/requires.txt b/nlpcda.egg-info/requires.txt
@@ -1,2 +1,3 @@
 jieba
 requests
+bert4keras==0.7.7
diff --git a/nlpcda/tools/simbert/generator.py b/nlpcda/tools/simbert/generator.py
@@ -10,14 +10,9 @@
 def setup_seed(seed):
     try:
         import random
-        import torch
         import numpy as np
-        import tensorflow
         np.random.seed(seed)
         random.seed(seed)
-        tensorflow.set_random_seed(seed)
-        torch.manual_seed(seed)  # cpu
-        torch.backends.cudnn.deterministic = True  # cpu/gpu结果一致
     except Exception as e:
         pass
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-numpy==1.18.5
 requests==2.24.0
-bert4keras==0.9.1
-jieba==0.42.1
+numpy==1.18.5
+bert4keras==0.7.7
+jieba==0.42.1
diff --git a/setup.py b/setup.py
@@ -9,7 +9,7 @@
 
 setup(
     name="nlpcda",
-    version="2.5.0",
+    version="2.5.1",
     keywords=("pip", "nlptool", "nlpcda", "nlp", '数据增强'),
     description="NLP Chinese Data Augmentation.一键中文数据增强.NLP数据增强",
     long_description=long_description,
@@ -21,5 +21,5 @@
     packages=find_packages(),
     include_package_data=True,
     platforms="any",
-    install_requires=['jieba', 'requests', 'bert4keras==0.9.1']
+    install_requires=['jieba', 'requests', 'bert4keras==0.7.7']
 )