
Commit e85f442

Merge branch 'master' into patch-release-1.9.3
2 parents: 805de56 + 46c00a3

File tree: 6 files changed, +225 additions, -10 deletions

README.md

Lines changed: 2 additions & 0 deletions

@@ -14,6 +14,8 @@
 Rasa is an open source machine learning framework to automate text- and voice-based conversations. With Rasa, you can build contextual assistants on:
 - Facebook Messenger
 - Slack
+- Google Hangouts
+- Webex Teams
 - Microsoft Bot Framework
 - Rocket.Chat
 - Mattermost

changelog/5475.bugfix.rst

Lines changed: 5 additions & 0 deletions (new file)

One word can have only one entity label.

If you are using, for example, the ``ConveRTTokenizer``, words can be split into multiple tokens.
Our entity extractors assign entity labels per token, so a word that was split into two tokens
could end up with two different entity labels. This is now fixed: one word gets only one entity
label at a time.
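As a quick illustration of the new behaviour, here is a minimal sketch using the ``clean_up_entities`` helper added in this commit; the entity dictionaries are hypothetical extractor output mirroring the new test cases further below.

```python
from rasa.nlu.extractors.extractor import EntityExtractor

# Hypothetical extractor output: "Aarhus" was split into the sub-word tokens
# "Aar" and "hus", and each token received its own entity label.
entities = [
    {"entity": "city", "start": 0, "end": 3, "confidence": 0.87, "value": "Aar"},
    {"entity": "iata", "start": 3, "end": 6, "confidence": 0.43, "value": "hus"},
]

extractor = EntityExtractor()

# With keep=True (the default), the label with the highest confidence wins
# and is stretched to cover the whole word.
print(extractor.clean_up_entities(entities))
# [{'entity': 'city', 'start': 0, 'end': 6, 'confidence': 0.87, 'value': 'Aarhus'}]
```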

rasa/nlu/classifiers/diet_classifier.py

Lines changed: 8 additions & 7 deletions
@@ -329,11 +329,11 @@ def _tag_id_index_mapping(self, training_data: TrainingData) -> Dict[Text, int]:
         if self.component_config[BILOU_FLAG]:
             return bilou_utils.build_tag_id_dict(training_data)
 
-        distinct_tag_ids = set(
+        distinct_tag_ids = {
             e["entity"]
             for example in training_data.entity_examples
             for e in example.get(ENTITIES)
-        ) - {None}
+        } - {None}
 
         tag_id_dict = {
             tag_id: idx for idx, tag_id in enumerate(sorted(distinct_tag_ids), 1)
@@ -662,7 +662,7 @@ def _predict(self, message: Message) -> Optional[Dict[Text, tf.Tensor]]:
                 "There is no trained model: component is either not trained or "
                 "didn't receive enough training data."
             )
-            return
+            return None
 
         # create session data from message and convert it into a batch of 1
         model_data = self._create_model_data([message])
@@ -744,9 +744,8 @@ def _predict_entities(
 
         return entities
 
-    @staticmethod
     def _convert_tags_to_entities(
-        text: Text, tokens: List[Token], tags: List[Text]
+        self, text: Text, tokens: List[Token], tags: List[Text]
     ) -> List[Dict[Text, Any]]:
         entities = []
         last_tag = NO_ENTITY_TAG
@@ -774,7 +773,7 @@ def _convert_tags_to_entities(
         for entity in entities:
            entity["value"] = text[entity["start"] : entity["end"]]
 
-        return entities
+        return self.clean_up_entities(entities)
 
     def process(self, message: Message, **kwargs: Any) -> None:
         """Return the most likely label and its similarity to the input."""
@@ -1191,7 +1190,7 @@ def _combine_sparse_dense_features(
 
     def _features_as_seq_ids(
         self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text
-    ) -> tf.Tensor:
+    ) -> Optional[tf.Tensor]:
         """Creates dense labels for negative sampling."""
 
         # if there are dense features - we can use them
@@ -1206,6 +1205,8 @@ def _features_as_seq_ids(
                     self._tf_layers[f"sparse_to_dense_ids.{name}"](f)
                 )
 
+        return None
+
     def _create_bow(
         self,
         features: List[Union[tf.Tensor, tf.SparseTensor]],

rasa/nlu/extractors/crf_entity_extractor.py

Lines changed: 3 additions & 2 deletions
@@ -165,8 +165,9 @@ def extract_entities(self, message: Message) -> List[Dict[Text, Any]]:
         if self.ent_tagger is not None:
             text_data = self._from_text_to_crf(message)
             features = self._sentence_to_features(text_data)
-            ents = self.ent_tagger.predict_marginals_single(features)
-            return self._from_crf_to_json(message, ents)
+            entities = self.ent_tagger.predict_marginals_single(features)
+            entities = self._from_crf_to_json(message, entities)
+            return self.clean_up_entities(entities)
         else:
             return []

rasa/nlu/extractors/extractor.py

Lines changed: 88 additions & 1 deletion
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Text, Tuple
+from typing import Any, Dict, List, Text, Tuple, Optional
 
 from rasa.nlu.components import Component
 from rasa.nlu.constants import EXTRACTOR, ENTITIES
@@ -21,6 +21,93 @@ def add_processor_name(self, entity: Dict[Text, Any]) -> Dict[Text, Any]:
 
         return entity
 
+    def clean_up_entities(
+        self, entities: List[Dict[Text, Any]], keep: bool = True
+    ) -> List[Dict[Text, Any]]:
+        """
+        Checks if multiple entity labels are assigned to one word.
+
+        This might happen if you are using a tokenizer that splits up words into
+        sub-words and different entity labels are assigned to the individual sub-words.
+        In such a case keep the entity label with the highest confidence as entity
+        label for that word. If you set 'keep' to 'False', all entity labels for
+        that word will be removed.
+
+        Args:
+            entities: list of entities
+            keep:
+                If set to 'True', the entity label with the highest confidence is kept
+                if multiple entity labels are assigned to one word. If set to 'False'
+                all entity labels for that word will be removed.
+
+        Returns: updated list of entities
+        """
+        if len(entities) <= 1:
+            return entities
+
+        entity_indices: List[List[int]] = []
+
+        # get indices of entity labels that belong to one word
+        for idx in range(1, len(entities)):
+            if entities[idx]["start"] == entities[idx - 1]["end"]:
+                if entity_indices and entity_indices[-1][-1] == idx - 1:
+                    entity_indices[-1].append(idx)
+                else:
+                    entity_indices.append([idx - 1, idx])
+
+        entity_indices_to_remove = set()
+
+        for indices in entity_indices:
+            if not keep:
+                entity_indices_to_remove.update(indices)
+                continue
+
+            # get start, end, and value of entity matching the complete word
+            start = entities[indices[0]]["start"]
+            end = entities[indices[-1]]["end"]
+            value = "".join(entities[idx]["value"] for idx in indices)
+            idx = self._get_highest_confidence_idx(entities, indices)
+
+            if idx is None:
+                entity_indices_to_remove.update(indices)
+            else:
+                # We just want to keep the entity with the highest confidence value
+                indices.remove(idx)
+                entity_indices_to_remove.update(indices)
+                # update that entity to cover the complete word
+                entities[idx]["start"] = start
+                entities[idx]["end"] = end
+                entities[idx]["value"] = value
+
+        # sort indices to remove entries at the end of the list first
+        # to avoid index out of range errors
+        for idx in sorted(entity_indices_to_remove, reverse=True):
+            entities.remove(entities[idx])
+
+        return entities
+
+    @staticmethod
+    def _get_highest_confidence_idx(
+        entities: List[Dict[Text, Any]], indices: List[int]
+    ) -> Optional[int]:
+        """
+        Args:
+            entities: the full list of entities
+            indices: the indices to consider
+
+        Returns: the idx of the entity label with the highest confidence.
+        """
+        confidences = [
+            entities[idx]["confidence"]
+            for idx in indices
+            if "confidence" in entities[idx]
+        ]
+
+        if len(confidences) != len(indices):
+            return None
+
+        return confidences.index(max(confidences))
+
     @staticmethod
     def filter_irrelevant_entities(extracted: list, requested_dimensions: set) -> list:
         """Only return dimensions the user configured"""

New test file (path not shown in this view)

Lines changed: 119 additions & 0 deletions (new file)

from typing import Any, Text, Dict, List

import pytest

from rasa.nlu.extractors.extractor import EntityExtractor


@pytest.mark.parametrize(
    "entities, keep, expected_entities",
    [
        (
            [
                {"entity": "iata", "start": 0, "end": 3, "value": "Aar"},
                {"entity": "city", "start": 3, "end": 6, "value": "hus"},
            ],
            False,
            [],
        ),
        (
            [
                {"entity": "iata", "start": 0, "end": 3, "value": "Aar"},
                {"entity": "city", "start": 3, "end": 6, "value": "hus"},
            ],
            True,
            [],
        ),
        (
            [
                {"entity": "city", "start": 0, "end": 3, "value": "Aarhus"},
                {"entity": "type", "start": 4, "end": 9, "value": "city"},
            ],
            False,
            [
                {"entity": "city", "start": 0, "end": 3, "value": "Aarhus"},
                {"entity": "type", "start": 4, "end": 9, "value": "city"},
            ],
        ),
        (
            [
                {
                    "entity": "city",
                    "start": 0,
                    "end": 3,
                    "confidence": 0.87,
                    "value": "Aar",
                },
                {
                    "entity": "iata",
                    "start": 3,
                    "end": 6,
                    "confidence": 0.43,
                    "value": "hus",
                },
            ],
            True,
            [
                {
                    "entity": "city",
                    "start": 0,
                    "end": 6,
                    "confidence": 0.87,
                    "value": "Aarhus",
                }
            ],
        ),
        (
            [
                {
                    "entity": "iata",
                    "start": 0,
                    "end": 2,
                    "confidence": 0.32,
                    "value": "Aa",
                },
                {
                    "entity": "city",
                    "start": 2,
                    "end": 3,
                    "confidence": 0.87,
                    "value": "r",
                },
                {
                    "entity": "iata",
                    "start": 3,
                    "end": 5,
                    "confidence": 0.21,
                    "value": "hu",
                },
                {
                    "entity": "city",
                    "start": 5,
                    "end": 6,
                    "confidence": 0.43,
                    "value": "s",
                },
            ],
            True,
            [
                {
                    "entity": "city",
                    "start": 0,
                    "end": 6,
                    "confidence": 0.87,
                    "value": "Aarhus",
                }
            ],
        ),
    ],
)
def test_convert_tags_to_entities(
    entities: List[Dict[Text, Any]],
    keep: bool,
    expected_entities: List[Dict[Text, Any]],
):
    extractor = EntityExtractor()

    updated_entities = extractor.clean_up_entities(entities, keep)

    assert updated_entities == expected_entities
