merge with master, fix conflicts
vinvinod committed Jun 13, 2017
2 parents 4db7ec4 + a764bff commit 39d143c
Showing 32 changed files with 330 additions and 287 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -33,3 +33,4 @@ models/
test_models/model_*

rasa_nlu/tmbo_test.py
.mypy_cache/
4 changes: 3 additions & 1 deletion CHANGELOG.rst
@@ -18,7 +18,9 @@ Changed
- replaced existing CRF library (python-crfsuite) with sklearn-crfsuite (due to better windows support)
- updated to spacy 1.8.2
- logging format of logged requests now includes model name and timestamp
- use module specific loggers instead of default python root logger
- output format of the duckling extractor changed. the ``value`` field now includes the complete value from duckling instead of just the text (so this property is now an object instead of a string) and includes granularity information
- deprecated ``intent_examples`` and ``entity_examples`` sections in training data. all examples should go into the ``common_examples`` section
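For illustration only (the exact fields depend on the duckling dimension; ``grain`` is shown here as an assumed example for time values), an extracted time entity's ``value`` might now look like this instead of a plain string such as ``"tomorrow"``:

.. code-block:: json

    {
      "value": "2017-06-14T00:00:00.000Z",
      "grain": "day"
    }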
Removed
-------

27 changes: 27 additions & 0 deletions README.md
@@ -113,3 +113,30 @@ Releasing a new version is quite simple, as the packages are built and distributed
```
## License
Licensed under the Apache License, Version 2.0. Copyright 2016 LastMile Technologies Ltd. [Copy of the license](LICENSE.txt).

As a reference, the following lists the licenses of the different dependencies as of this writing.
Licenses of minimal dependencies:

| required package | License |
|------------------|------------------------|
| gevent | MIT |
| flask | BSD 3-clause |
| boto3 | Apache License 2.0 |
| typing | PSF |
| future | MIT |
| six | MIT |
| jsonschema | MIT |

Licenses of optional dependencies (only required for certain components of Rasa NLU):

| optional package | License |
|----------------------|----------------------------|
| MITIE | Boost Software License 1.0 |
| spacy | MIT |
| scikit-learn | BSD 3-clause |
| scipy | BSD 3-clause |
| numpy | BSD 3-clause |
| duckling | Apache License 2.0 |
| sklearn-crfsuite | MIT |
| cloudpickle | BSD 3-clause |
| google-cloud-storage | Apache License 2.0 |
34 changes: 6 additions & 28 deletions _pytest/test_components.py
@@ -42,58 +42,36 @@ def test_all_components_are_in_all_components_template():


@pytest.mark.parametrize("component_class", registry.component_classes)
def test_all_arguments_can_be_satisfied_during_init(component_class, default_config, component_builder):
    """Check that `pipeline_init` method parameters can be filled from the context.
    The parameters declared on `pipeline_init` are not filled directly; rather, the method is called via reflection.
    During the reflection, the parameters are filled from a so-called context that is created when creating the
    pipeline and gets initialized with the configuration values. To make sure all arguments `pipeline_init` declares
    can be provided during the reflection, we do a 'dry run' where we check all parameters are part of the context."""

    # All available context arguments that will ever be generated during init
    component = component_builder.create_component(component_class.name, default_config)
    context_arguments = {}
    for clz in registry.component_classes:
        for ctx_arg in clz.context_provides.get("pipeline_init", []):
            context_arguments[ctx_arg] = None

    filled_args = fill_args(component.pipeline_init_args(), context_arguments, default_config.as_dict())
    assert len(filled_args) == len(component.pipeline_init_args())


@pytest.mark.parametrize("component_class", registry.component_classes)
def test_all_arguments_can_be_satisfied_during_train(component_class, default_config, component_builder):
def test_all_arguments_can_be_satisfied_during_train(component_class, default_config):
    """Check that `train` method parameters can be filled from the context. Similar to the `pipeline_init` test."""

    # All available context arguments that will ever be generated during train
    # it might still happen that in certain pipeline configurations arguments cannot be satisfied!
    component = component_builder.create_component(component_class.name, default_config)
    context_arguments = {"training_data": None}
    for clz in registry.component_classes:
        for ctx_arg in clz.context_provides.get("pipeline_init", []):
            context_arguments[ctx_arg] = None
        for ctx_arg in clz.context_provides.get("train", []):
            context_arguments[ctx_arg] = None

    filled_args = fill_args(component.train_args(), context_arguments, default_config.as_dict())
    assert len(filled_args) == len(component.train_args())
    filled_args = fill_args(component_class.train_args(), context_arguments, default_config.as_dict())
    assert len(filled_args) == len(component_class.train_args())


@pytest.mark.parametrize("component_class", registry.component_classes)
def test_all_arguments_can_be_satisfied_during_parse(component_class, default_config, component_builder):
def test_all_arguments_can_be_satisfied_during_parse(component_class, default_config):
    """Check that `parse` method parameters can be filled from the context. Similar to the `pipeline_init` test."""

    # All available context arguments that will ever be generated during parse
    component = component_builder.create_component(component_class.name, default_config)
    context_arguments = {"text": None, "time": None}
    for clz in registry.component_classes:
        for ctx_arg in clz.context_provides.get("pipeline_init", []):
            context_arguments[ctx_arg] = None
        for ctx_arg in clz.context_provides.get("process", []):
            context_arguments[ctx_arg] = None

    filled_args = fill_args(component.process_args(), context_arguments, default_config.as_dict())
    assert len(filled_args) == len(component.process_args())
    filled_args = fill_args(component_class.process_args(), context_arguments, default_config.as_dict())
    assert len(filled_args) == len(component_class.process_args())
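For intuition, the context filling these tests exercise can be sketched as follows; this is a simplified reconstruction for illustration, not necessarily rasa NLU's actual `fill_args` implementation:

```python
class MissingArgumentError(ValueError):
    """Raised when an argument can be filled from neither context nor config."""


def fill_args(arg_names, context, config):
    # Resolve each declared argument name against the context first,
    # then the configuration; fail loudly if neither provides it.
    filled = []
    for name in arg_names:
        if name in context:
            filled.append(context[name])
        elif name in config:
            filled.append(config[name])
        else:
            raise MissingArgumentError("Can not fill argument '{}'".format(name))
    return filled
```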


def test_all_extractors_use_previous_entities():
2 changes: 1 addition & 1 deletion _pytest/test_extractors.py
@@ -20,7 +20,7 @@ def test_crf_extractor(spacy_nlp):
"intent": "restaurant_search",
"entities": [{"start": 0, "end": 7, "value": "central", "entity": "location"}]
}]
ext.train(TrainingData(entity_examples_only=examples), spacy_nlp, True, ext.crf_features)
ext.train(TrainingData(training_examples=examples), spacy_nlp, True, ext.crf_features)
crf_format = ext._from_text_to_crf('anywhere in the west', spacy_nlp)
assert ([word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west'])
feats = ext._sentence_to_features(crf_format)
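For background, sklearn-crfsuite (which, per this commit's changelog, replaces python-crfsuite) expects each sentence as a list of per-token feature dicts plus a parallel list of tags. A minimal sketch of that input format, with illustrative feature names rather than rasa NLU's actual ones:

```python
def token_features(tokens, i):
    # Each token is described by a dict of string-keyed features.
    return {
        "word.lower": tokens[i].lower(),
        "word.istitle": tokens[i].istitle(),
        "BOS": i == 0,                 # beginning of sentence
        "EOS": i == len(tokens) - 1,   # end of sentence
    }

tokens = "anywhere in the west".split()
X = [[token_features(tokens, i) for i in range(len(tokens))]]  # one sentence
y = [["O", "O", "O", "B-location"]]  # BIO tags, parallel to X
```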
5 changes: 1 addition & 4 deletions data/test/api_converted_to_rasa.json
@@ -66,10 +66,7 @@
"entity": "location"
}
]
}
],
"entity_examples": [],
"intent_examples": [
},
{
"text": "great",
"intent": "affirm"
17 changes: 7 additions & 10 deletions data/test/luis_converted_to_rasa.json
@@ -96,22 +96,19 @@
"entity": "cuisine"
}
]
},
},
{
"text": "any indonesian places?",
"intent": "inform",
"text": "any indonesian places?",
"intent": "inform",
"entities": [
{
"start": 4,
"end": 14,
"value": "indonesian",
"start": 4,
"end": 14,
"value": "indonesian",
"entity": "cuisine"
}
]
}
],
"entity_examples": [],
"intent_examples": [
},
{
"text": "hello",
"intent": "greet"
7 changes: 2 additions & 5 deletions data/test/wit_converted_to_rasa.json
@@ -26,9 +26,7 @@
"entity": "datetime"
}
]
}
],
"entity_examples": [
},
{
"text": "i'm looking for a flight from london to amsterdam next monday",
"entities": [
@@ -78,7 +76,6 @@
}
]
}
],
"intent_examples": []
]
}
}
49 changes: 7 additions & 42 deletions docs/dataformat.rst
@@ -3,33 +3,27 @@
Training Data Format
====================

The training data for rasa NLU has three arrays inside of a top level object ``common_examples``, ``intent_examples``, and ``entity_examples``. Not all three are required, you can use each of them as needed by the model you are trying to train.
The training data for rasa NLU is structured into different parts. The most important one is ``common_examples``.

.. code-block:: json
{
"rasa_nlu_data": {
"common_examples": [],
"intent_examples": [],
"entity_examples": []
"common_examples": []
}
}
The ``common_examples`` are used to train both the entity and the intent models while the other arrays target intents and entities exclusively.

In many cases it's fine to put all of your training examples in the ``common_examples`` array.
However, if you need lots and lots of examples to train a good entity recogniser, that can mess up
your intent model because your classes would become unbalanced. In that case it makes sense
to split up these lists.
The ``common_examples`` are used to train both the entity and the intent models. You should put all of your training
examples in the ``common_examples`` array. The next section describes in detail what an example looks like.

Common Examples
---------------

Common examples have three components: ``text``, ``intent``, and ``entities``. The first two are strings while the last one is an array.

- The *text* is the search query; an example of what would be submitted for parsing.
- The *intent* is the intent that should be associated with the text.
- The *entities* are specific parts of the text which need to be identified.
- The *text* is the search query; an example of what would be submitted for parsing. [required]
- The *intent* is the intent that should be associated with the text. [optional]
- The *entities* are specific parts of the text which need to be identified. [optional]
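Putting these parts together, a complete common example looks like this (values are illustrative):

.. code-block:: json

    {
      "text": "show me chinese restaurants",
      "intent": "restaurant_search",
      "entities": [
        {
          "start": 8,
          "end": 15,
          "value": "chinese",
          "entity": "cuisine"
        }
      ]
    }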

Entities are specified with a ``start`` and ``end`` value, which together make a python
style range to apply to the string, e.g. in the example below, with ``text="show me chinese
@@ -51,35 +45,6 @@ That way you can map synonyms, or misspellings, to the same ``value``.
}
]
}
Intent Examples
---------------
Omit the entire entities array:

.. code-block:: json
{
"text": "show me chinese restaurants",
"intent": "restaurant_search"
}
Entity Examples
---------------
Simply omit the ``intent`` section:

.. code-block:: json
{
"text": "show me chinese restaurants",
"entities": [
{
"start": 8,
"end": 15,
"value": "chinese",
"entity": "cuisine"
}
]
}
Entity Synonyms
---------------
13 changes: 9 additions & 4 deletions rasa_nlu/__init__.py
@@ -1,7 +1,12 @@
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging

import rasa_nlu.version

__version__ = version.__version__
logging.getLogger(__name__).addHandler(logging.NullHandler())

__version__ = rasa_nlu.version.__version__
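For context, attaching a ``NullHandler`` as done above is the standard library-logging convention in Python: the package emits log records but leaves handler and level configuration to the application. A minimal sketch of both sides:

```python
import logging

# Library side: a no-op handler prevents "No handlers could be found"
# warnings when the importing application configures no logging.
logging.getLogger("rasa_nlu").addHandler(logging.NullHandler())

# Application side: opt in to the library's log output.
logging.basicConfig(level=logging.INFO)
logging.getLogger("rasa_nlu").info("now visible through the root handlers")
```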
6 changes: 4 additions & 2 deletions rasa_nlu/classifiers/sklearn_intent_classifier.py
@@ -18,6 +18,8 @@
from rasa_nlu.components import Component
from rasa_nlu.training_data import TrainingData

logger = logging.getLogger(__name__)

# How many intents are at max put into the output intent ranking, everything else will be cut off
INTENT_RANKING_LENGTH = 10

@@ -84,8 +86,8 @@ def train(self, training_data, intent_features, num_threads):
labels = [e["intent"] for e in training_data.intent_examples]

if len(set(labels)) < 2:
logging.warn("Can not train an intent classifier. Need at least 2 different classes. " +
"Skipping training of intent classifier.")
logger.warn("Can not train an intent classifier. Need at least 2 different classes. " +
"Skipping training of intent classifier.")
else:
y = self.transform_labels_str2num(labels)
X = intent_features
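For context, mapping string intent labels to numeric ids (as ``transform_labels_str2num`` suggests) is commonly done with scikit-learn's ``LabelEncoder``; a sketch under that assumption, not necessarily this file's exact implementation:

```python
from sklearn.preprocessing import LabelEncoder

labels = ["greet", "restaurant_search", "greet", "affirm"]
assert len(set(labels)) >= 2  # training is skipped below this threshold

encoder = LabelEncoder()
y = encoder.fit_transform(labels)    # classes sorted alphabetically: array([1, 2, 1, 0])
print(encoder.inverse_transform(y))  # back to the original string labels
```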