From 6df47e65f971de4fa68246c2052f41f9d69a0191 Mon Sep 17 00:00:00 2001
From: jiangxinfa <425776024@qq.com>
Date: Tue, 10 Nov 2020 21:33:49 +0800
Subject: [PATCH] simbert

---
 nlpcda.egg-info/PKG-INFO          | 46 +++++++++++++++++++++++++++++--
 nlpcda.egg-info/SOURCES.txt       |  5 +++-
 nlpcda.egg-info/requires.txt      |  1 +
 nlpcda/tools/simbert/generator.py |  5 ----
 requirements.txt                  |  6 ++--
 setup.py                          |  4 +--
 6 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/nlpcda.egg-info/PKG-INFO b/nlpcda.egg-info/PKG-INFO
index ec8fee3..b107842 100644
--- a/nlpcda.egg-info/PKG-INFO
+++ b/nlpcda.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nlpcda
-Version: 2.4.2
+Version: 2.5.1
 Summary: NLP Chinese Data Augmentation.一键中文数据增强.NLP数据增强
 Home-page: https://github.com/425776024/nlpcda
 Author: Jiang.XinFa
@@ -27,11 +27,11 @@ Description: # NLP Chinese Data Augmentation 一键中文数据增强工具
         - 6.随机置换邻近的字：**研表究明，汉字序顺并不定一影响文字的阅读理解**<<是乱序的
         - 7.百度中英翻译互转实现的增强
         - 8.中文等价字替换（1	一	壹	①，2	二	贰	②）
+        - 9.使用`UniLM`做生成式相似句生成
         
         `经过细节特殊处理，比如不改变年月日数字，尽量保证不改变原文语义。即使改变也能被猜出来、能被猜出来、能被踩出来、能被菜粗来、被菜粗、能菜粗来`
         
         ## 计划中的未来内容
-        - 使用`UniLM`做生成式相似句生成
         - 增加多线程操作，一键操作
         - [使用 WordNet数据库 来做同义词替换](http://openkg.cn/dataset/chinese-wordnet)
         - 随机噪声注入？随机插入一些字符，太简单实现了。
@@ -288,6 +288,46 @@ Description: # NLP Chinese Data Augmentation 一键中文数据增强工具
         
         ```
         
+        ### 9.simbert
+        [来源：https://github.com/ZhuiyiTechnology/pretrained-models](https://github.com/ZhuiyiTechnology/pretrained-models)
+        
+        下载其中任意模型，解压到任意位置赋值给`model_path`变量：
+        
+        | 名称           | 训练数据大小 | 词表大小 | 模型大小 | 下载地址 |
+        | :----------:  |:---------:| :------: | :------: | :------: |
+        | SimBERT Tiny  | 2200万相似句组  | 13685   | 26MB   | [百度网盘](https://pan.baidu.com/s/1z_agqTuBTuyHANwrS-gPcg)(1tp7) |
+        | SimBERT Small |  2200万相似句组 | 13685  | 49MB  | [百度网盘](https://pan.baidu.com/s/1kq_EQDI0gpiZBLFd_AxwrA)(nu67) |
+        | SimBERT Base  |  2200万相似句组 | 13685  | 344MB | [百度网盘](https://pan.baidu.com/s/1uGfQmX1Kxcv_cXTVsvxTsQ)(6xhq) |
+        
+        参数：
+        - config：model_path（上述下载的模型位置），设备（GPU/CPU）、最大长度、随机种子
+        - sent：需要增强的句子
+        - k：生成的增强句子数量
+        - threhold：阈值
+        ```python
+        from nlpcda import Simbert
+        config = {
+                'model_path': '/xxxx/chinese_simbert_L-12_H-768_A-12',
+                'device': 'cpu',
+                'max_len': 32,
+                'seed': 1
+        }
+        simbert = Simbert(config=config)
+        sent = '把我的一个亿存银行安全吗'
+        synonyms = simbert.replace(sent=sent, k=5, threhold=0.85)
+        print(synonyms)
+        '''
+        [('我的一个亿，存银行，安全吗', 0.9871675372123718), 
+        ('把一个亿存到银行里安全吗', 0.9352194666862488), 
+        ('一个亿存银行安全吗', 0.9330801367759705), 
+        ('一个亿的存款存银行安全吗', 0.92387855052948),
+         ('我的一千万存到银行安不安全', 0.9014463424682617)]
+        '''
+        
+        
+        ```
+        
+        
         ### 添加自定义词典
         用于使用之前，增加分词效果
         ```python
@@ -304,6 +344,6 @@ Description: # NLP Chinese Data Augmentation 一键中文数据增强工具
         
         ```
         
-Keywords: pip,nlptool,nlpcda,nlp,
+Keywords: pip,nlptool,nlpcda,nlp,数据增强
 Platform: any
 Description-Content-Type: text/markdown
diff --git a/nlpcda.egg-info/SOURCES.txt b/nlpcda.egg-info/SOURCES.txt
index 81d1db0..6137fc4 100644
--- a/nlpcda.egg-info/SOURCES.txt
+++ b/nlpcda.egg-info/SOURCES.txt
@@ -20,8 +20,11 @@ nlpcda/tools/Homophone.py
 nlpcda/tools/Ner.py
 nlpcda/tools/Random_delete_char.py
 nlpcda/tools/Random_word.py
+nlpcda/tools/Simbert.py
 nlpcda/tools/Similar_word.py
 nlpcda/tools/Translate.py
 nlpcda/tools/__init__.py
 nlpcda/tools/homophone.py
-nlpcda/tools/ner.py
\ No newline at end of file
+nlpcda/tools/ner.py
+nlpcda/tools/simbert/__init__.py
+nlpcda/tools/simbert/generator.py
\ No newline at end of file
diff --git a/nlpcda.egg-info/requires.txt b/nlpcda.egg-info/requires.txt
index aba7ffe..624037b 100644
--- a/nlpcda.egg-info/requires.txt
+++ b/nlpcda.egg-info/requires.txt
@@ -1,2 +1,3 @@
 jieba
 requests
+bert4keras==0.7.7
diff --git a/nlpcda/tools/simbert/generator.py b/nlpcda/tools/simbert/generator.py
index 6d7dda9..e324691 100644
--- a/nlpcda/tools/simbert/generator.py
+++ b/nlpcda/tools/simbert/generator.py
@@ -10,14 +10,9 @@
 def setup_seed(seed):
     try:
         import random
-        import torch
         import numpy as np
-        import tensorflow
         np.random.seed(seed)
         random.seed(seed)
-        tensorflow.set_random_seed(seed)
-        torch.manual_seed(seed)  # cpu
-        torch.backends.cudnn.deterministic = True  # cpu/gpu结果一致
     except Exception as e:
         pass
 
diff --git a/requirements.txt b/requirements.txt
index 7a5814b..56a1488 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-numpy==1.18.5
 requests==2.24.0
-bert4keras==0.9.1
-jieba==0.42.1
+numpy==1.18.5
+bert4keras==0.7.7
+jieba==0.42.1
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 4ab7234..d8d1298 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 
 setup(
     name="nlpcda",
-    version="2.5.0",
+    version="2.5.1",
     keywords=("pip", "nlptool", "nlpcda", "nlp", '数据增强'),
     description="NLP Chinese Data Augmentation.一键中文数据增强.NLP数据增强",
     long_description=long_description,
@@ -21,5 +21,5 @@
     packages=find_packages(),
     include_package_data=True,
     platforms="any",
-    install_requires=['jieba', 'requests', 'bert4keras==0.9.1']
+    install_requires=['jieba', 'requests', 'bert4keras==0.7.7']
 )