Merge branch 'master' of github.com:eaudeweb/NLPService

eaudeweb · Jul 10, 2019 · 82fff41 · 82fff41
2 parents 9374256 + 9d5e7ae
commit 82fff41
Show file tree

Hide file tree

Showing 11 changed files with 213 additions and 97 deletions.
diff --git a/nlpservice/nlp/classify.py b/nlpservice/nlp/classify.py
@@ -20,7 +20,9 @@
 from tensorflow.keras.utils import to_categorical
 from textacy.preprocess import normalize_whitespace
 
+from .fasttext import main as train_kv
 from .models import get_model, gpu_session, nongpu_session
+from .prepare import main as prepare_text
 from .prepare import text_tokenize
 from .utils import get_lemmatized_kg
 
@@ -202,9 +204,9 @@ def get_doc_labels(doc, kg):
 
 
 def read_corpus(path):
-    """ Returns a list of documents. A doc is a list of sentences.
+    """ Takes a text file and returns a list of documents.
 
-    A sentence is space separated tokens (words)
+    A doc is a list of sentences. A sentence is space separated tokens (words)
     """
 
     logger.info('Loading corpus')
@@ -269,9 +271,6 @@ def main(output, ftpath, corpus, kg_url, cpu):
     ft_model = FastText.load(ftpath)
     kg = get_lemmatized_kg(kg_url)
 
-    # import pdb
-    # pdb.set_trace()
-
     if cpu:
         sess = nongpu_session()
     else:
@@ -419,9 +418,14 @@ def predict(text):
 
         return list(pairs)
 
+    def train():
+        """ Placeholder 'train' entry: always raises NotImplementedError.
+        """
+        raise NotImplementedError
+
     return {
         'predict': predict,
-        'train': lambda: None,
+        'train': train,
         'metadata': {},
     }
 
@@ -435,6 +439,7 @@ def kg_classifier_keras(config):
     model_path = settings['nlp.kg_model_path']
     ft_model_path = settings['nlp.kg_kv_path']
     kg_url = settings['nlp.kg_url']
+    kg_elastic = settings['nlp.kg_elastic']
 
     kg = get_lemmatized_kg(kg_url)
     labels = list(sorted(kg.keys()))
@@ -448,6 +453,8 @@ def kg_classifier_keras(config):
     vocab = kv_model.wv.index2word
     label_encoder = make_labelencoder(labels)
 
+    corpus_path = settings['nlp.kg_corpus']
+
     def predict(text):
         maxlen = model.inputs[0].get_shape()[1].value
 
@@ -458,8 +465,34 @@ def predict(text):
 
         return list(pairs)
 
+    def train():
+        """ Prepares the corpus text, the kv model, then trains the classifier.
+
+        Each step goes through the ``callback`` attribute of an imported
+        ``main`` entry point, using the paths and urls read from the settings
+        above. Returns whatever the classifier training entry point returns.
+        """
+        # pipeline is: get text from elastic, prepare kv model, train on text
+        logger.warning('Preparing corpus text')
+        # NOTE(review): the meaning of the third argument (None) is defined by
+        # prepare.main, which is not visible here -- confirm
+        prepare_text.callback(corpus_path, kg_elastic, None)
+
+        logger.warning('Preparing kv model')
+        # fasttext.main(textfile, output): corpus file in, kv model file out
+        train_kv.callback(corpus_path, ft_model_path)
+
+        logger.warning('Training Keras classifier')
+        # the last positional argument is main's ``cpu`` flag
+        out = main.callback(model_path, ft_model_path, corpus_path, kg_url,
+                            False)
+
+        return out
+
     return {
         'predict': predict,
         'metadata': {},
-        'train': lambda: None,
+        'train': train,
     }
+#
+#
+# @click.command()
+# @click.argument('model', nargs=-1, required=True)
+# def retrain(model):
+#     # TODO: we can't properly get models without an .ini file
+#
+#     for name in model:
+#         suite = get_model(name)
+#         train = suite['train']
+#         logger.warning("Retraining %s", name)
+#         train()
diff --git a/nlpservice/nlp/fasttext.py b/nlpservice/nlp/fasttext.py
@@ -2,7 +2,6 @@
 """
 
 import logging
-from pathlib import Path
 
 import click
 from tqdm import tqdm
@@ -22,43 +21,6 @@ def trim(word, count, min_count):
         return RULE_DEFAULT
 
 
-# count = 0
-#
-#
-# def counter(it):
-#     global count
-#
-#     for line in it:
-#         line = line.strip()
-#
-#         if not line:
-#             continue
-#
-#         count += 1
-#         yield line.split(' ')
-#
-
-@click.command()
-@click.argument('textfile')
-@click.argument('output')
-def main(textfile, output):
-    """ A script to generate FastText-based word embedings
-    """
-
-    logger.setLevel(logging.WARNING)
-    model = FastText(size=100, window=3, sg=True, min_count=5,
-                     seed=0, word_ngrams=True, trim_rule=trim)
-
-    with open(textfile) as f:
-        sentences = [l.strip().split(' ') for l in list(f)]
-
-    model.build_vocab(sentences=tqdm(sentences))
-    model.train(sentences=tqdm(sentences),
-                epochs=10, total_examples=len(sentences), workers=7)
-
-    model.save(output)
-
-
 def similar_by_word(word, model):
     ft = get_model(model)
     wv = ft['model'].wv
@@ -89,3 +51,24 @@ def corpus_kv_settings(config):
     settings = config.get_settings()
 
     return settings['nlp.kg_kv_path']
+
+
+@click.command()
+@click.argument('textfile')
+@click.argument('output')
+def main(textfile, output):
+    """ A script to generate FastText-based word embeddings
+
+    TEXTFILE is read as one sentence per line, with tokens separated by
+    single spaces. The trained model is saved to OUTPUT.
+    """
+
+    logger.setLevel(logging.WARNING)
+    model = FastText(size=100, window=3, sg=True, min_count=5,
+                     seed=0, word_ngrams=True, trim_rule=trim)
+
+    # The sentences are kept in a list because they are iterated twice
+    # (vocabulary pass, then training) and their count is passed as
+    # total_examples.
+    # NOTE(review): a blank line becomes the sentence [''] -- confirm that
+    # the corpus file never contains blank lines.
+    with open(textfile) as f:
+        sentences = [line.strip().split(' ') for line in f]
+
+    model.build_vocab(sentences=tqdm(sentences))
+    model.train(sentences=tqdm(sentences),
+                epochs=10, total_examples=len(sentences), workers=7)
+
+    model.save(output)
diff --git a/nlpservice/nlp/prepare.py b/nlpservice/nlp/prepare.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 
 import click
+
 import ftfy
 import syntok.segmenter as segmenter
 import textacy as tc

diff --git a/nlpservice/nlpservice-ui/src/App.vue b/nlpservice/nlpservice-ui/src/App.vue
@@ -25,6 +25,9 @@
       <v-btn flat>
         <router-link to="/classify">Classify</router-link>
       </v-btn>
+      <v-btn flat>
+        <router-link to="/classify/retrain">Retrain</router-link>
+      </v-btn>
 
     </v-toolbar>
 

diff --git a/nlpservice/nlpservice-ui/src/components/Classify.vue b/nlpservice/nlpservice-ui/src/components/Classify.vue
@@ -2,7 +2,7 @@
   <v-container grid-list-md>
     <v-layout row wrap>
       <v-flex xs12 md12>
-      <h2 class="headline font-weight-bold mb-3">Similarity</h2>
+      <h2 class="headline font-weight-bold mb-3">Classify</h2>
       </v-flex>
 
       <v-flex xs12 md6>
@@ -15,7 +15,7 @@
 
       <v-flex xs12 md6>
         <v-sheet color="purple lighten-3" elevation="1" min-height="10em">
-          <div v-for="s in scores">
+          <div v-for="s in scores" :key="s">
             {{ s[0] }} - {{ s[1] }}
           </div>
         </v-sheet>
@@ -48,23 +48,20 @@
       },
       submit() {
         axios
-          .post('http://localhost:6543/classify',
+          .post('http://localhost:6543/classify/' + this.model,
             {
               'text': this.text,
-              'model': this.model,
             })
           .then((resp) => {
             this.scores = resp.data.result
-            console.log(resp)
           })
       }
     },
     mounted() {
         axios
-          .get('http://localhost:6543/list-classifiers')
+          .get('http://localhost:6543/classify/')
           .then((resp) => {
             this.models = resp.data.result
-            console.log(resp)
           })
     }
   }

diff --git a/nlpservice/nlpservice-ui/src/components/KeyedVectors.vue b/nlpservice/nlpservice-ui/src/components/KeyedVectors.vue
@@ -14,7 +14,7 @@
 
       <v-flex xs12 md6>
         <v-sheet color="purple lighten-3" elevation="1" min-height="10em">
-          <div v-for="s in scores" key="s">
+          <div v-for="s in scores" :key="s">
             {{ s[0] }} - {{ s[1] }}
           </div>
         </v-sheet>
@@ -49,7 +49,6 @@
             })
           .then((resp) => {
             this.scores = resp.data.result
-            console.log(resp)
           })
       }
     }

diff --git a/nlpservice/nlpservice-ui/src/components/Retrain.vue b/nlpservice/nlpservice-ui/src/components/Retrain.vue
@@ -0,0 +1,59 @@
+<template>
+  <v-container grid-list-md>
+    <v-layout row wrap>
+      <v-flex xs12 md12>
+      <h2 class="headline font-weight-bold mb-3">Retrain</h2>
+      </v-flex>
+
+      <v-flex xs12 md6>
+        <v-sheet color="green lighten-3" elevation="1" min-height="10em">
+          <h4>Set models for retraining</h4>
+          <v-select multiple :items="models" label="Model" @change="setModel"></v-select>
+        </v-sheet>
+      </v-flex>
+
+      <v-flex xs12 md12>
+        <v-btn primary @click="submit">Submit</v-btn>
+      </v-flex>
+
+    </v-layout>
+  </v-container>
+</template>
+<script>
+  import axios from 'axios'
+
+  // Retrain page: lets the user pick models and posts the selection to the
+  // service.
+  export default {
+    components: {
+    },
+    data () {
+      return {
+        // items offered in the model select; filled by the GET in mounted()
+        'models': [],
+        // current selection, updated by setModel()
+        'retrain': [],
+        // result stored by submit(); declared here so that the property is
+        // reactive instead of being added to the instance after creation
+        'scores': []
+      }
+    },
+    methods: {
+      // change handler of the model select
+      setModel(value) {
+        this.retrain = value
+      },
+      // posts the selected models and keeps the `result` field of the reply
+      submit() {
+        axios
+          .post('http://localhost:6543/classify/',
+            {
+              'models': this.retrain,
+            })
+          .then((resp) => {
+            this.scores = resp.data.result
+          })
+      }
+    },
+    // loads the list of models when the component is mounted
+    mounted() {
+      axios
+        .get('http://localhost:6543/classify/')
+        .then((resp) => {
+          this.models = resp.data.result
+        })
+    }
+  }
+</script>
+<style>
+</style>
diff --git a/nlpservice/nlpservice-ui/src/router/routes.js b/nlpservice/nlpservice-ui/src/router/routes.js
@@ -1,10 +1,11 @@
-import Start from '../components/Start'
-import Summarize from '../components/summarize/Main'
-import Duplicate from '../components/duplicate/Main'
-import Similarity from '../components/Similarity'
-import KeyedVectors from '../components/KeyedVectors'
 import Classify from '../components/Classify'
 import Clusterize from '../components/cluster/Main'
+import Duplicate from '../components/duplicate/Main'
+import KeyedVectors from '../components/KeyedVectors'
+import Retrain from '../components/Retrain'
+import Similarity from '../components/Similarity'
+import Summarize from '../components/summarize/Main'
+import Start from '../components/Start'
 
 const routes = [
   {
@@ -35,6 +36,10 @@ const routes = [
     path: '/classify',
     component: Classify,
   },
+  {
+    path: '/classify/retrain',
+    component: Retrain,
+  },
 ]
 
 export default routes