merge with master, fix conflicts
vinvinod committed Jun 13, 2017
2 parents 4db7ec4 + a764bff commit 39d143c
Showing 32 changed files with 330 additions and 287 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -33,3 +33,4 @@ models/
test_models/model_*

rasa_nlu/tmbo_test.py
.mypy_cache/
4 changes: 3 additions & 1 deletion CHANGELOG.rst
@@ -18,7 +18,9 @@ Changed
- replaced existing CRF library (python-crfsuite) with sklearn-crfsuite (due to better windows support)
- updated to spacy 1.8.2
- logging format of logged requests now includes model name and timestamp
- use module specific loggers instead of default python root logger
- output format of the duckling extractor changed. the ``value`` field now includes the complete value from duckling instead of just the text (so this property is now an object instead of a string) and includes granularity information
- deprecated ``intent_examples`` and ``entity_examples`` sections in training data. all examples should go into the ``common_examples`` section
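For illustration only (the exact fields depend on the duckling dimension; ``grain`` is shown here as an assumed example for time values), an extracted time entity's ``value`` might now look like this instead of a plain string such as ``"tomorrow"``:

.. code-block:: json

    {
      "value": "2017-06-14T00:00:00.000Z",
      "grain": "day"
    }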
Removed
-------

27 changes: 27 additions & 0 deletions README.md
@@ -113,3 +113,30 @@ Releasing a new version is quite simple, as the packages are built and distributed
```
## License
Licensed under the Apache License, Version 2.0. Copyright 2016 LastMile Technologies Ltd. [Copy of the license](LICENSE.txt).

As a reference, the following lists the licenses of the different dependencies as of this writing.
Licenses of minimal dependencies:

| required package | License |
|------------------|------------------------|
| gevent | MIT |
| flask | BSD 3-clause |
| boto3 | Apache License 2.0 |
| typing | PSF |
| future | MIT |
| six | MIT |
| jsonschema | MIT |

Licenses of optional dependencies (only required for certain components of Rasa NLU):

| optional package | License |
|----------------------|----------------------------|
| MITIE | Boost Software License 1.0 |
| spacy | MIT |
| scikit-learn | BSD 3-clause |
| scipy | BSD 3-clause |
| numpy | BSD 3-clause |
| duckling | Apache License 2.0 |
| sklearn-crfsuite | MIT |
| cloudpickle | BSD 3-clause |
| google-cloud-storage | Apache License 2.0 |
34 changes: 6 additions & 28 deletions _pytest/test_components.py
@@ -42,58 +42,36 @@ def test_all_components_are_in_all_components_template():


@pytest.mark.parametrize("component_class", registry.component_classes)
def test_all_arguments_can_be_satisfied_during_init(component_class, default_config, component_builder):
    """Check that `pipeline_init` method parameters can be filled from the context.
    The parameters declared on `pipeline_init` are not filled directly; rather, the method is called via reflection.
    During the reflection, the parameters are filled from a so-called context that is created when creating the
    pipeline and gets initialized with the configuration values. To make sure all arguments `pipeline_init` declares
    can be provided during the reflection, we do a 'dry run' where we check all parameters are part of the context."""

    # All available context arguments that will ever be generated during init
    component = component_builder.create_component(component_class.name, default_config)
    context_arguments = {}
    for clz in registry.component_classes:
        for ctx_arg in clz.context_provides.get("pipeline_init", []):
            context_arguments[ctx_arg] = None

    filled_args = fill_args(component.pipeline_init_args(), context_arguments, default_config.as_dict())
    assert len(filled_args) == len(component.pipeline_init_args())


@pytest.mark.parametrize("component_class", registry.component_classes)
def test_all_arguments_can_be_satisfied_during_train(component_class, default_config, component_builder):
def test_all_arguments_can_be_satisfied_during_train(component_class, default_config):
    """Check that `train` method parameters can be filled from the context. Similar to the `pipeline_init` test."""

    # All available context arguments that will ever be generated during train
    # it might still happen that in certain pipeline configurations arguments cannot be satisfied!
    component = component_builder.create_component(component_class.name, default_config)
    context_arguments = {"training_data": None}
    for clz in registry.component_classes:
        for ctx_arg in clz.context_provides.get("pipeline_init", []):
            context_arguments[ctx_arg] = None
        for ctx_arg in clz.context_provides.get("train", []):
            context_arguments[ctx_arg] = None

    filled_args = fill_args(component.train_args(), context_arguments, default_config.as_dict())
    assert len(filled_args) == len(component.train_args())
    filled_args = fill_args(component_class.train_args(), context_arguments, default_config.as_dict())
    assert len(filled_args) == len(component_class.train_args())


@pytest.mark.parametrize("component_class", registry.component_classes)
def test_all_arguments_can_be_satisfied_during_parse(component_class, default_config, component_builder):
def test_all_arguments_can_be_satisfied_during_parse(component_class, default_config):
    """Check that `parse` method parameters can be filled from the context. Similar to the `pipeline_init` test."""

    # All available context arguments that will ever be generated during parse
    component = component_builder.create_component(component_class.name, default_config)
    context_arguments = {"text": None, "time": None}
    for clz in registry.component_classes:
        for ctx_arg in clz.context_provides.get("pipeline_init", []):
            context_arguments[ctx_arg] = None
        for ctx_arg in clz.context_provides.get("process", []):
            context_arguments[ctx_arg] = None

    filled_args = fill_args(component.process_args(), context_arguments, default_config.as_dict())
    assert len(filled_args) == len(component.process_args())
    filled_args = fill_args(component_class.process_args(), context_arguments, default_config.as_dict())
    assert len(filled_args) == len(component_class.process_args())
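For intuition, the context filling these tests exercise can be sketched as follows; this is a simplified reconstruction for illustration, not necessarily rasa NLU's actual `fill_args` implementation:

```python
class MissingArgumentError(ValueError):
    """Raised when an argument can be filled from neither context nor config."""


def fill_args(arg_names, context, config):
    # Resolve each declared argument name against the context first,
    # then the configuration; fail loudly if neither provides it.
    filled = []
    for name in arg_names:
        if name in context:
            filled.append(context[name])
        elif name in config:
            filled.append(config[name])
        else:
            raise MissingArgumentError("Can not fill argument '{}'".format(name))
    return filled
```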


def test_all_extractors_use_previous_entities():
2 changes: 1 addition & 1 deletion _pytest/test_extractors.py
@@ -20,7 +20,7 @@ def test_crf_extractor(spacy_nlp):
"intent": "restaurant_search",
"entities": [{"start": 0, "end": 7, "value": "central", "entity": "location"}]
}]
ext.train(TrainingData(entity_examples_only=examples), spacy_nlp, True, ext.crf_features)
ext.train(TrainingData(training_examples=examples), spacy_nlp, True, ext.crf_features)
crf_format = ext._from_text_to_crf('anywhere in the west', spacy_nlp)
assert ([word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west'])
feats = ext._sentence_to_features(crf_format)
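For background, sklearn-crfsuite (which, per this commit's changelog, replaces python-crfsuite) expects each sentence as a list of per-token feature dicts plus a parallel list of tags. A minimal sketch of that input format, with illustrative feature names rather than rasa NLU's actual ones:

```python
def token_features(tokens, i):
    # Each token is described by a dict of string-keyed features.
    return {
        "word.lower": tokens[i].lower(),
        "word.istitle": tokens[i].istitle(),
        "BOS": i == 0,                 # beginning of sentence
        "EOS": i == len(tokens) - 1,   # end of sentence
    }

tokens = "anywhere in the west".split()
X = [[token_features(tokens, i) for i in range(len(tokens))]]  # one sentence
y = [["O", "O", "O", "B-location"]]  # BIO tags, parallel to X
```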
5 changes: 1 addition & 4 deletions data/test/api_converted_to_rasa.json
@@ -66,10 +66,7 @@
"entity": "location"
}
]
}
],
"entity_examples": [],
"intent_examples": [
},
{
"text": "great",
"intent": "affirm"
17 changes: 7 additions & 10 deletions data/test/luis_converted_to_rasa.json
@@ -96,22 +96,19 @@
"entity": "cuisine"
}
]
},
},
{
"text": "any indonesian places?",
"intent": "inform",
"text": "any indonesian places?",
"intent": "inform",
"entities": [
{
"start": 4,
"end": 14,
"value": "indonesian",
"start": 4,
"end": 14,
"value": "indonesian",
"entity": "cuisine"
}
]
}
],
"entity_examples": [],
"intent_examples": [
},
{
"text": "hello",
"intent": "greet"
7 changes: 2 additions & 5 deletions data/test/wit_converted_to_rasa.json
@@ -26,9 +26,7 @@
"entity": "datetime"
}
]
}
],
"entity_examples": [
},
{
"text": "i'm looking for a flight from london to amsterdam next monday",
"entities": [
@@ -78,7 +76,6 @@
}
]
}
],
"intent_examples": []
]
}
}
49 changes: 7 additions & 42 deletions docs/dataformat.rst
@@ -3,33 +3,27 @@
Training Data Format
====================

The training data for rasa NLU has three arrays inside of a top level object ``common_examples``, ``intent_examples``, and ``entity_examples``. Not all three are required, you can use each of them as needed by the model you are trying to train.
The training data for rasa NLU is structured into different parts. The most important one is ``common_examples``.

.. code-block:: json
{
"rasa_nlu_data": {
"common_examples": [],
"intent_examples": [],
"entity_examples": []
"common_examples": []
}
}
The ``common_examples`` are used to train both the entity and the intent models while the other arrays target intents and entities exclusively.

In many cases it's fine to put all of your training examples in the ``common_examples`` array.
However, if you need lots and lots of examples to train a good entity recogniser, that can mess up
your intent model because your classes would become unbalanced. In that case it makes sense
to split up these lists.
The ``common_examples`` are used to train both the entity and the intent models. You should put all of your training
examples in the ``common_examples`` array. The next section describes in detail what an example looks like.

Common Examples
---------------

Common examples have three components: ``text``, ``intent``, and ``entities``. The first two are strings while the last one is an array.

- The *text* is the search query; an example of what would be submitted for parsing.
- The *intent* is the intent that should be associated with the text.
- The *entities* are specific parts of the text which need to be identified.
- The *text* is the search query; an example of what would be submitted for parsing. [required]
- The *intent* is the intent that should be associated with the text. [optional]
- The *entities* are specific parts of the text which need to be identified. [optional]
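Putting these parts together, a complete common example looks like this (values are illustrative):

.. code-block:: json

    {
      "text": "show me chinese restaurants",
      "intent": "restaurant_search",
      "entities": [
        {
          "start": 8,
          "end": 15,
          "value": "chinese",
          "entity": "cuisine"
        }
      ]
    }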

Entities are specified with a ``start`` and ``end`` value, which together make a python
style range to apply to the string, e.g. in the example below, with ``text="show me chinese
@@ -51,35 +45,6 @@ That way you can map synonyms, or misspellings, to the same ``value``.
}
]
}
Intent Examples
---------------
Omit the entire entities array:

.. code-block:: json
{
"text": "show me chinese restaurants",
"intent": "restaurant_search"
}
Entity Examples
---------------
Simply omit the ``intent`` section:

.. code-block:: json
{
"text": "show me chinese restaurants",
"entities": [
{
"start": 8,
"end": 15,
"value": "chinese",
"entity": "cuisine"
}
]
}
Entity Synonyms
---------------
13 changes: 9 additions & 4 deletions rasa_nlu/__init__.py
@@ -1,7 +1,12 @@
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging

import rasa_nlu.version

__version__ = version.__version__
logging.getLogger(__name__).addHandler(logging.NullHandler())

__version__ = rasa_nlu.version.__version__
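For context, attaching a ``NullHandler`` as done above is the standard library-logging convention in Python: the package emits log records but leaves handler and level configuration to the application. A minimal sketch of both sides:

```python
import logging

# Library side: a no-op handler prevents "No handlers could be found"
# warnings when the importing application configures no logging.
logging.getLogger("rasa_nlu").addHandler(logging.NullHandler())

# Application side: opt in to the library's log output.
logging.basicConfig(level=logging.INFO)
logging.getLogger("rasa_nlu").info("now visible through the root handlers")
```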
6 changes: 4 additions & 2 deletions rasa_nlu/classifiers/sklearn_intent_classifier.py
@@ -18,6 +18,8 @@
from rasa_nlu.components import Component
from rasa_nlu.training_data import TrainingData

logger = logging.getLogger(__name__)

# How many intents are at max put into the output intent ranking, everything else will be cut off
INTENT_RANKING_LENGTH = 10

@@ -84,8 +86,8 @@ def train(self, training_data, intent_features, num_threads):
labels = [e["intent"] for e in training_data.intent_examples]

if len(set(labels)) < 2:
logging.warn("Can not train an intent classifier. Need at least 2 different classes. " +
"Skipping training of intent classifier.")
logger.warn("Can not train an intent classifier. Need at least 2 different classes. " +
"Skipping training of intent classifier.")
else:
y = self.transform_labels_str2num(labels)
X = intent_features
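For context, mapping string intent labels to numeric ids (as ``transform_labels_str2num`` suggests) is commonly done with scikit-learn's ``LabelEncoder``; a sketch under that assumption, not necessarily this file's exact implementation:

```python
from sklearn.preprocessing import LabelEncoder

labels = ["greet", "restaurant_search", "greet", "affirm"]
assert len(set(labels)) >= 2  # training is skipped below this threshold

encoder = LabelEncoder()
y = encoder.fit_transform(labels)    # classes sorted alphabetically: array([1, 2, 1, 0])
print(encoder.inverse_transform(y))  # back to the original string labels
```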