Merge branch 'master' into metada-on-session-start-action

degiz · web-flow · commit 2b79796bb8ca · 2020-04-16T10:33:48.000+02:00
diff --git a/changelog/5587.misc.rst b/changelog/5587.misc.rst
@@ -0,0 +1,5 @@
+Fixed a misleading error message shown when ``rasa train nlu --nlu`` is given a file in the wrong format.
+
+Before this change the output message was always: ``No NLU data given.``
+
+Now, if the format is wrong, the command prints: ``Path 'nlu_data' doesn't contain valid NLU data in it. Please verify the data format. The NLU model training will be skipped now.``
diff --git a/changelog/5614.misc.rst b/changelog/5614.misc.rst
@@ -0,0 +1,2 @@
+Deprecate ``TrainingData.filter_by_intent`` in favor of the more general ``TrainingData.filter_training_examples``,
+which filters training examples using an arbitrary condition function.
diff --git a/data/test/markdown_single_sections/incorrect_nlu_format.md b/data/test/markdown_single_sections/incorrect_nlu_format.md
@@ -0,0 +1,4 @@
+## deny
+- non, merci
+- non merci
+- non
diff --git a/examples/formbot/actions.py b/examples/formbot/actions.py
@@ -29,7 +29,9 @@ def slot_mappings(self) -> Dict[Text, Union[Dict, List[Dict]]]:
         return {
             "cuisine": self.from_entity(entity="cuisine", not_intent="chitchat"),
             "num_people": [
-                self.from_entity(entity="number", intent=["inform", "request_restaurant"]),
+                self.from_entity(
+                    entity="number", intent=["inform", "request_restaurant"]
+                ),
             ],
             "outdoor_seating": [
                 self.from_entity(entity="seating"),
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -67,15 +67,15 @@ showcontent = false
 python = "^3.6"
 boto3 = "^1.12"
 requests = "^2.23"
-matplotlib = "~3.1"
+matplotlib = ">=3.1,<3.3"
 attrs = "~19.3"
 jsonpickle = "~1.3"
 redis = "^3.4"
 numpy = "^1.16"
 scipy = "^1.4.1"
 absl-py = "^0.9"
 apscheduler = "~3.6"
-tqdm = "~4.31.0"
+tqdm = ">=4.31,<4.46"
 networkx = "~2.4.0"
 fbmessenger = "~6.0.0"
 pykwalify = "~1.7.0"
@@ -87,7 +87,7 @@ python-telegram-bot = "^11.1"
 twilio = "~6.26"
 webexteamssdk = "~1.1.1"
 mattermostwrapper = "~2.2"
-rocketchat_API = "~0.6.31"
+rocketchat_API = ">=0.6.31,<1.4.0"
 colorhash = "~1.0.2"
 pika = "~1.1.0"
 jsonschema = "~3.2"
@@ -104,7 +104,7 @@ cloudpickle = ">=1.2,<1.4"
 multidict = "^4.6"
 aiohttp = "~3.6"
 questionary = "~1.5.1"
-python-socketio = "~4.4"
+python-socketio = ">=4.4,<4.6"
 python-engineio = ">=3.11,<3.13"
 pydot = "~1.4"
 async_generator = "~1.10"
@@ -126,15 +126,15 @@ oauth2client = "4.1.3"
 [tool.poetry.dev-dependencies]
 pytest-cov = "^2.8.1"
 pytest-localserver = "^0.5.0"
-pytest-sanic = "^1.6.0"
+pytest-sanic = "^1.6.1"
 pytest-asyncio = "^0.10.0"
 pytest-xdist = "^1.31.0"
 pytest = "^5.3.4"
 freezegun = "^0.3.14"
 responses = "^0.10.9"
 nbsphinx = "~0.5"
 aioresponses = "^0.6.2"
-moto = "==1.3.8"
+moto = "==1.3.14"
 fakeredis = "^1.4.0"
 mongomock = "^3.18.0"
 black = "^19.10b0"
@@ -152,7 +152,7 @@ sphinxcontrib-programoutput = "==0.11"
 pygments = "^2.6.1"
 sphinxcontrib-httpdomain = "==1.6.1"
 sphinxcontrib-websupport = "==1.1.0"
-sphinxcontrib-trio = "==1.0.2"
+sphinxcontrib-trio = "==1.1.1"
 sphinx-tabs = "==1.1.13"
 sphinx-autodoc-typehints = "==1.6.0"
 rasabaster = "^0.7.23"
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
@@ -272,7 +272,9 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         """
 
         if self.retrieval_intent:
-            training_data = training_data.filter_by_intent(self.retrieval_intent)
+            training_data = training_data.filter_training_examples(
+                lambda ex: self.retrieval_intent == ex.get(INTENT)
+            )
         else:
             # retrieval intent was left to its default value
             logger.info(
diff --git a/rasa/nlu/training_data/training_data.py b/rasa/nlu/training_data/training_data.py
@@ -4,11 +4,11 @@
 from collections import Counter, OrderedDict
 from copy import deepcopy
 from os.path import relpath
-from typing import Any, Dict, List, Optional, Set, Text, Tuple
+from typing import Any, Dict, List, Optional, Set, Text, Tuple, Callable
 
 import rasa.nlu.utils
 from rasa.utils.common import raise_warning, lazy_property
-from rasa.nlu.constants import RESPONSE, RESPONSE_KEY_ATTRIBUTE
+from rasa.nlu.constants import ENTITIES, INTENT, RESPONSE, RESPONSE_KEY_ATTRIBUTE
 from rasa.nlu.training_data.message import Message
 from rasa.nlu.training_data.util import check_duplicate_synonym
 from rasa.nlu.utils import list_to_str
@@ -75,21 +75,35 @@ def merge(self, *others: "TrainingData") -> "TrainingData":
             nlg_stories,
         )
 
-    def filter_by_intent(self, intent: Text):
-        """Filter training examples """
+    def filter_training_examples(
+        self, condition: Callable[[Message], bool]
+    ) -> "TrainingData":
+        """Filter training examples.
 
-        training_examples = []
-        for ex in self.training_examples:
-            if ex.get("intent") == intent:
-                training_examples.append(ex)
+        Args:
+            condition: A function that will be applied to filter training examples.
+
+        Returns:
+            TrainingData: A TrainingData with filtered training examples.
+        """
 
         return TrainingData(
-            training_examples,
+            list(filter(condition, self.training_examples)),
             self.entity_synonyms,
             self.regex_features,
             self.lookup_tables,
         )
 
+    def filter_by_intent(self, intent: Text) -> "TrainingData":
+        """Filter training examples."""
+        raise_warning(
+            "The `filter_by_intent` function is deprecated. "
+            "Please use `filter_training_examples` instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.filter_training_examples(lambda ex: intent == ex.get(INTENT))
+
     def __hash__(self) -> int:
         from rasa.core import utils as core_utils
 
@@ -105,49 +119,49 @@ def sanitize_examples(examples: List[Message]) -> List[Message]:
         Remove trailing whitespaces from intent and response annotations and drop duplicate examples."""
 
         for ex in examples:
-            if ex.get("intent"):
-                ex.set("intent", ex.get("intent").strip())
+            if ex.get(INTENT):
+                ex.set(INTENT, ex.get(INTENT).strip())
 
-            if ex.get("response"):
-                ex.set("response", ex.get("response").strip())
+            if ex.get(RESPONSE):
+                ex.set(RESPONSE, ex.get(RESPONSE).strip())
 
         return list(OrderedDict.fromkeys(examples))
 
     @lazy_property
     def intent_examples(self) -> List[Message]:
-        return [ex for ex in self.training_examples if ex.get("intent")]
+        return [ex for ex in self.training_examples if ex.get(INTENT)]
 
     @lazy_property
     def response_examples(self) -> List[Message]:
-        return [ex for ex in self.training_examples if ex.get("response")]
+        return [ex for ex in self.training_examples if ex.get(RESPONSE)]
 
     @lazy_property
     def entity_examples(self) -> List[Message]:
-        return [ex for ex in self.training_examples if ex.get("entities")]
+        return [ex for ex in self.training_examples if ex.get(ENTITIES)]
 
     @lazy_property
     def intents(self) -> Set[Text]:
         """Returns the set of intents in the training data."""
-        return {ex.get("intent") for ex in self.training_examples} - {None}
+        return {ex.get(INTENT) for ex in self.training_examples} - {None}
 
     @lazy_property
     def responses(self) -> Set[Text]:
         """Returns the set of responses in the training data."""
-        return {ex.get("response") for ex in self.training_examples} - {None}
+        return {ex.get(RESPONSE) for ex in self.training_examples} - {None}
 
     @lazy_property
     def retrieval_intents(self) -> Set[Text]:
         """Returns the total number of response types in the training data"""
         return {
-            ex.get("intent")
+            ex.get(INTENT)
             for ex in self.training_examples
-            if ex.get("response") is not None
+            if ex.get(RESPONSE) is not None
         }
 
     @lazy_property
     def examples_per_intent(self) -> Dict[Text, int]:
         """Calculates the number of examples per intent."""
-        intents = [ex.get("intent") for ex in self.training_examples]
+        intents = [ex.get(INTENT) for ex in self.training_examples]
         return dict(Counter(intents))
 
     @lazy_property
@@ -299,7 +313,7 @@ def sorted_intent_examples(self) -> List[Message]:
         """Sorts the intent examples by the name of the intent and then response"""
 
         return sorted(
-            self.intent_examples, key=lambda e: (e.get("intent"), e.get("response"))
+            self.intent_examples, key=lambda e: (e.get(INTENT), e.get(RESPONSE))
         )
 
     def validate(self) -> None:
@@ -393,7 +407,7 @@ def split_nlu_examples(
     ) -> Tuple[list, list]:
         train, test = [], []
         for intent, count in self.examples_per_intent.items():
-            ex = [e for e in self.intent_examples if e.data["intent"] == intent]
+            ex = [e for e in self.intent_examples if e.data[INTENT] == intent]
             if random_seed is not None:
                 random.Random(random_seed).shuffle(ex)
             else:
diff --git a/rasa/train.py b/rasa/train.py
@@ -424,6 +424,13 @@ async def _train_nlu_async(
     fixed_model_name: Optional[Text] = None,
     persist_nlu_training_data: bool = False,
 ):
+    if not nlu_data:
+        print_error(
+            "No NLU data given. Please provide NLU data in order to train "
+            "a Rasa NLU model using the '--nlu' argument."
+        )
+        return
+
     # training NLU only hence the training files still have to be selected
     file_importer = TrainingDataImporter.load_nlu_importer_from_config(
         config, training_data_paths=[nlu_data]
@@ -432,8 +439,9 @@ async def _train_nlu_async(
     training_datas = await file_importer.get_nlu_data()
     if training_datas.is_empty():
         print_error(
-            "No NLU data given. Please provide NLU data in order to train "
-            "a Rasa NLU model using the '--nlu' argument."
+            f"Path '{nlu_data}' doesn't contain valid NLU data in it. "
+            "Please verify the data format. "
+            "The NLU model training will be skipped now."
         )
         return
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -39,6 +39,7 @@
     DEFAULT_STORIES_FILE,
     END_TO_END_STORY_FILE,
     MOODBOT_MODEL_PATH,
+    INCORRECT_NLU_DATA,
 )
 from tests.utilities import update_number_of_epochs
 
@@ -142,6 +143,11 @@ def default_nlu_data() -> Text:
     return DEFAULT_NLU_DATA
 
 
+@pytest.fixture(scope="session")
+def incorrect_nlu_data() -> Text:
+    return INCORRECT_NLU_DATA
+
+
 @pytest.fixture(scope="session")
 def end_to_end_story_file() -> Text:
     return END_TO_END_STORY_FILE
diff --git a/tests/core/conftest.py b/tests/core/conftest.py
@@ -37,6 +37,8 @@
 
 DEFAULT_NLU_DATA = "examples/moodbot/data/nlu.md"
 
+INCORRECT_NLU_DATA = "data/test/markdown_single_sections/incorrect_nlu_format.md"
+
 END_TO_END_STORY_FILE = "data/test_evaluations/end_to_end_story.md"
 
 E2E_STORY_FILE_UNKNOWN_ENTITY = "data/test_evaluations/story_unknown_entity.md"
diff --git a/tests/nlu/training_data/test_training_data.py b/tests/nlu/training_data/test_training_data.py
@@ -4,7 +4,7 @@
 import tempfile
 from jsonschema import ValidationError
 
-from rasa.nlu.constants import TEXT
+from rasa.nlu.constants import TEXT, RESPONSE_KEY_ATTRIBUTE
 from rasa.nlu import training_data
 from rasa.nlu.convert import convert_training_data
 from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
@@ -174,6 +174,37 @@ def test_demo_data(files):
     ]
 
 
+@pytest.mark.parametrize(
+    "files",
+    [
+        [
+            "data/examples/rasa/demo-rasa.json",
+            "data/examples/rasa/demo-rasa-responses.md",
+        ],
+        [
+            "data/examples/rasa/demo-rasa.md",
+            "data/examples/rasa/demo-rasa-responses.md",
+        ],
+    ],
+)
+def test_demo_data_filter_out_retrieval_intents(files):
+    from rasa.importers.utils import training_data_from_paths
+
+    td = training_data_from_paths(files, language="en")
+    assert len(td.training_examples) == 46
+
+    td1 = td.filter_training_examples(lambda ex: ex.get(RESPONSE_KEY_ATTRIBUTE) is None)
+    assert len(td1.training_examples) == 42
+
+    td2 = td.filter_training_examples(
+        lambda ex: ex.get(RESPONSE_KEY_ATTRIBUTE) is not None
+    )
+    assert len(td2.training_examples) == 4
+
+    # make sure filtering operation doesn't mutate the source training data
+    assert len(td.training_examples) == 46
+
+
 @pytest.mark.parametrize(
     "filepaths",
     [["data/examples/rasa/demo-rasa.md", "data/examples/rasa/demo-rasa-responses.md"]],
diff --git a/tests/test_train.py b/tests/test_train.py
@@ -3,6 +3,7 @@
 from typing import Text
 
 import pytest
+from _pytest.capture import CaptureFixture
 from _pytest.monkeypatch import MonkeyPatch
 
 import rasa.model
@@ -129,3 +130,38 @@ def test_train_nlu_temp_files(
     )
 
     assert count_temp_rasa_files(tempfile.tempdir) == 0
+
+
+def test_train_nlu_wrong_format_error_message(
+    capsys: CaptureFixture,
+    tmp_path: Text,
+    monkeypatch: MonkeyPatch,
+    default_stack_config: Text,
+    incorrect_nlu_data: Text,
+):
+    monkeypatch.setattr(tempfile, "tempdir", tmp_path)
+
+    train_nlu(
+        default_stack_config,
+        incorrect_nlu_data,
+        output="test_train_nlu_temp_files_models",
+    )
+
+    captured = capsys.readouterr()
+    assert "Please verify the data format" in captured.out
+
+
+def test_train_nlu_no_nlu_file_error_message(
+    capsys: CaptureFixture,
+    tmp_path: Text,
+    monkeypatch: MonkeyPatch,
+    default_stack_config: Text,
+):
+    monkeypatch.setattr(tempfile, "tempdir", tmp_path)
+
+    train_nlu(
+        default_stack_config, "", output="test_train_nlu_temp_files_models",
+    )
+
+    captured = capsys.readouterr()
+    assert "No NLU data given" in captured.out

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	+Replace ``TrainingData.filter_by_intent`` function with a more general function which filters training
	`2`	`+examples using a filtering function.`
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +## deny
 +- non, merci
 +- non merci
 +- non