
Commit 328b49b

Merge pull request RasaHQ#5511 from RasaHQ/fix-entity-recognition-prediction
Entity applies to complete word, not just parts of it
2 parents f4eaee4 + 1dfe96c commit 328b49b

8 files changed, +357 −67 lines changed

changelog/5509.bugfix.rst

Lines changed: 5 additions & 0 deletions
```diff
@@ -0,0 +1,5 @@
+An entity label should always cover a complete word.
+
+If you are using, for example, the ``ConveRTTokenizer``, words can be split into multiple tokens.
+Our entity extractors assign entity labels per token, so it could happen that only part of a word
+received an entity label. This is now fixed: an entity label always covers a complete word.
```
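
To make the fix concrete, here is a minimal before/after sketch. The token offsets and the `weather_item` label are invented for illustration; they are not actual `ConveRTTokenizer` output.

```python
# Minimal sketch of the bug this changelog entry describes.
# Offsets and the "weather_item" label are illustrative only.
text = "play forecast"

# A sub-word tokenizer may split "forecast" into two tokens:
tokens = [
    {"text": "play", "start": 0, "end": 4},
    {"text": "fore", "start": 5, "end": 9},
    {"text": "cast", "start": 9, "end": 13},
]

# Before the fix, an extractor could label just one sub-token:
entities_before = [{"entity": "weather_item", "start": 5, "end": 9, "value": "fore"}]

# After the fix, the label is expanded to cover the complete word:
entities_after = [{"entity": "weather_item", "start": 5, "end": 13, "value": "forecast"}]
```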

rasa/nlu/classifiers/diet_classifier.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -739,13 +739,15 @@ def _predict_entities(
             message.text, message.get(TOKENS_NAMES[TEXT], []), tags
         )
 
-        extracted = self.add_extractor_name(entities)
-        entities = message.get(ENTITIES, []) + extracted
+        entities = self.add_extractor_name(entities)
+        entities = self.clean_up_entities(message, entities)
+        entities = message.get(ENTITIES, []) + entities
 
         return entities
 
+    @staticmethod
     def _convert_tags_to_entities(
-        self, text: Text, tokens: List[Token], tags: List[Text]
+        text: Text, tokens: List[Token], tags: List[Text]
     ) -> List[Dict[Text, Any]]:
         entities = []
         last_tag = NO_ENTITY_TAG
@@ -773,7 +775,7 @@ def _convert_tags_to_entities(
         for entity in entities:
             entity["value"] = text[entity["start"] : entity["end"]]
 
-        return self.clean_up_entities(entities)
+        return entities
 
     def process(self, message: Message, **kwargs: Any) -> None:
         """Return the most likely label and its similarity to the input."""
```

rasa/nlu/extractors/crf_entity_extractor.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -156,8 +156,9 @@ def _create_dataset(self, examples: List[Message]) -> List[List[CRFToken]]:
         return dataset
 
     def process(self, message: Message, **kwargs: Any) -> None:
-        extracted = self.add_extractor_name(self.extract_entities(message))
-        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)
+        entities = self.add_extractor_name(self.extract_entities(message))
+        entities = self.clean_up_entities(message, entities)
+        message.set(ENTITIES, message.get(ENTITIES, []) + entities, add_to_output=True)
 
     def extract_entities(self, message: Message) -> List[Dict[Text, Any]]:
         """Take a sentence and return entities in json format"""
@@ -166,8 +167,7 @@ def extract_entities(self, message: Message) -> List[Dict[Text, Any]]:
             text_data = self._from_text_to_crf(message)
             features = self._sentence_to_features(text_data)
             entities = self.ent_tagger.predict_marginals_single(features)
-            entities = self._from_crf_to_json(message, entities)
-            return self.clean_up_entities(entities)
+            return self._from_crf_to_json(message, entities)
         else:
             return []
```

rasa/nlu/extractors/duckling_http_extractor.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -186,9 +186,8 @@ def process(self, message: Message, **kwargs: Any) -> None:
         )
 
         extracted = self.add_extractor_name(extracted)
-        message.set(
-            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
-        )
+        extracted = self.clean_up_entities(message, extracted)
+        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)
 
     @classmethod
     def load(
```

rasa/nlu/extractors/extractor.py

Lines changed: 199 additions & 41 deletions
```diff
@@ -1,7 +1,8 @@
-from typing import Any, Dict, List, Text, Tuple, Optional
+from typing import Any, Dict, List, Text, Tuple, Optional, Union
 
+from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.components import Component
-from rasa.nlu.constants import EXTRACTOR, ENTITIES
+from rasa.nlu.constants import EXTRACTOR, ENTITIES, TOKENS_NAMES, TEXT
 from rasa.nlu.training_data import Message
 
 
@@ -22,62 +23,60 @@ def add_processor_name(self, entity: Dict[Text, Any]) -> Dict[Text, Any]:
         return entity
 
     def clean_up_entities(
-        self, entities: List[Dict[Text, Any]], keep: bool = True
+        self, message: Message, entities: List[Dict[Text, Any]], keep: bool = True
     ) -> List[Dict[Text, Any]]:
         """
-        Checks if multiple entity labels are assigned to one word.
+        Check if multiple entity labels are assigned to one word, if an entity label
+        is assigned to just a part of a word, or if an entity label covers multiple
+        words but one of them only partly.
 
         This might happen if you are using a tokenizer that splits up words into
         sub-words and different entity labels are assigned to the individual sub-words.
-        In such a case keep the entity label with the highest confidence as entity
-        label for that word. If you set 'keep' to 'False', all entity labels for
-        that word will be removed.
+        If multiple entity labels are assigned to one word, we keep the entity label
+        with the highest confidence as the entity label for that word. If just a part
+        of the word is annotated, that entity label is taken for the complete word.
+        If you set 'keep' to 'False', all entity labels for the word will be removed.
 
         Args:
+            message: message object
             entities: list of entities
             keep:
                 If set to 'True', the entity label with the highest confidence is kept
                 if multiple entity labels are assigned to one word. If set to 'False'
                 all entity labels for that word will be removed.
 
-        Returns: updated list of entities
+        Returns:
+            Updated entities.
         """
-        if len(entities) <= 1:
-            return entities
-
-        entity_indices: List[List[int]] = []
-
-        # get indices of entity labels that belong to one word
-        for idx in range(1, len(entities)):
-            if entities[idx]["start"] == entities[idx - 1]["end"]:
-                if entity_indices and entity_indices[-1][-1] == idx - 1:
-                    entity_indices[-1].append(idx)
-                else:
-                    entity_indices.append([idx - 1, idx])
+        misaligned_entities = self._get_misaligned_entities(
+            message.get(TOKENS_NAMES[TEXT]), entities
+        )
 
         entity_indices_to_remove = set()
 
-        for indices in entity_indices:
+        for misaligned_entity in misaligned_entities:
+            # entity indices involved in the misalignment
+            entity_indices = misaligned_entity["entity_indices"]
+
             if not keep:
-                entity_indices_to_remove.update(indices)
+                entity_indices_to_remove.update(entity_indices)
                 continue
 
-            # get start, end, and value of entity matching the complete word
-            start = entities[indices[0]]["start"]
-            end = entities[indices[-1]]["end"]
-            value = "".join(entities[idx]["value"] for idx in indices)
-            idx = self._get_highest_confidence_idx(entities, indices)
+            idx = self._entity_index_to_keep(entities, entity_indices)
 
             if idx is None:
-                entity_indices_to_remove.update(indices)
+                entity_indices_to_remove.update(entity_indices)
             else:
-                # We just want to keep the entity with the highest confidence value
-                indices.remove(idx)
-                entity_indices_to_remove.update(indices)
-                # update that entity to cover the complete word
-                entities[idx]["start"] = start
-                entities[idx]["end"] = end
-                entities[idx]["value"] = value
+                # keep just one entity
+                entity_indices.remove(idx)
+                entity_indices_to_remove.update(entity_indices)
+
+                # update that entity to cover the complete word(s)
+                entities[idx]["start"] = misaligned_entity["start"]
+                entities[idx]["end"] = misaligned_entity["end"]
+                entities[idx]["value"] = message.text[
+                    misaligned_entity["start"] : misaligned_entity["end"]
+                ]
 
         # sort indices to remove entries at the end of the list first
         # to avoid index out of range errors
@@ -86,24 +85,183 @@ def clean_up_entities(
 
         return entities
 
+    def _get_misaligned_entities(
+        self, tokens: List[Token], entities: List[Dict[Text, Any]]
+    ) -> List[Dict[Text, Any]]:
+        """Identify entities and tokens that are misaligned.
+
+        Misaligned entities are those that apply only to a part of a word, i.e.
+        a sub-word.
+
+        Args:
+            tokens: list of tokens
+            entities: list of entities detected by the entity extractor
+
+        Returns:
+            Misaligned entities, including the start and end position
+            of the final entity in the text and the entity indices that are part of
+            this misalignment.
+        """
+        if not tokens:
+            return []
+
+        # group tokens: one token cluster corresponds to one word
+        token_clusters = self._token_clusters(tokens)
+
+        # added for tests, should only happen if tokens are not set or len(tokens) == 1
+        if not token_clusters:
+            return []
+
+        misaligned_entities = []
+        for entity_idx, entity in enumerate(entities):
+            # get all tokens that are covered/touched by the entity
+            entity_tokens = self._tokens_of_entity(entity, token_clusters)
+
+            if len(entity_tokens) == 1:
+                # entity covers exactly one word
+                continue
+
+            # get start and end position of the complete word
+            # needed to update the final entity later
+            start_position = entity_tokens[0].start
+            end_position = entity_tokens[-1].end
+
+            # check if an entity was already found that covers the exact same word(s)
+            _idx = self._misaligned_entity_index(
+                misaligned_entities, start_position, end_position
+            )
+
+            if _idx is None:
+                misaligned_entities.append(
+                    {
+                        "start": start_position,
+                        "end": end_position,
+                        "entity_indices": [entity_idx],
+                    }
+                )
+            else:
+                misaligned_entities[_idx]["entity_indices"].append(entity_idx)
+
+        return misaligned_entities
+
     @staticmethod
-    def _get_highest_confidence_idx(
-        entities: List[Dict[Text, Any]], indices: List[int]
+    def _misaligned_entity_index(
+        word_entity_cluster: List[Dict[Text, Union[int, List[int]]]],
+        start_position: int,
+        end_position: int,
     ) -> Optional[int]:
+        """Get the index of the matching misaligned entity.
+
+        Args:
+            word_entity_cluster: word entity cluster
+            start_position: start position
+            end_position: end position
+
+        Returns:
+            Index of the misaligned entity that matches the provided start and end
+            position.
+        """
+        for idx, cluster in enumerate(word_entity_cluster):
+            if cluster["start"] == start_position and cluster["end"] == end_position:
+                return idx
+        return None
+
+    @staticmethod
+    def _tokens_of_entity(
+        entity: Dict[Text, Any], token_clusters: List[List[Token]]
+    ) -> List[Token]:
+        """Get all tokens of the token clusters that are covered by the entity.
+
+        The entity can cover them completely or just partly.
+
+        Args:
+            entity: the entity
+            token_clusters: list of token clusters
+
+        Returns:
+            Tokens of the token clusters that belong to the provided entity.
+        """
+        entity_tokens = []
+        for token_cluster in token_clusters:
+            entity_starts_inside_cluster = (
+                token_cluster[0].start <= entity["start"] <= token_cluster[-1].end
+            )
+            entity_ends_inside_cluster = (
+                token_cluster[0].start <= entity["end"] <= token_cluster[-1].end
+            )
+
+            if entity_starts_inside_cluster or entity_ends_inside_cluster:
+                entity_tokens += token_cluster
+        return entity_tokens
+
+    @staticmethod
+    def _token_clusters(tokens: List[Token]) -> List[List[Token]]:
+        """Build clusters of tokens that belong to one word.
+
+        Args:
+            tokens: list of tokens
+
+        Returns:
+            Token clusters.
+        """
+        # token cluster = list of token indices that belong to one word
+        token_index_clusters = []
+
+        # start at 1 in order to check if the current token and the previous token
+        # belong to the same word
+        for token_idx in range(1, len(tokens)):
+            previous_token_idx = token_idx - 1
+            # two tokens belong to the same word if there is no other character
+            # between them
+            if tokens[token_idx].start == tokens[previous_token_idx].end:
+                # a word was split into multiple tokens
+                token_cluster_already_exists = (
+                    token_index_clusters
+                    and token_index_clusters[-1][-1] == previous_token_idx
+                )
+                if token_cluster_already_exists:
+                    token_index_clusters[-1].append(token_idx)
+                else:
+                    token_index_clusters.append([previous_token_idx, token_idx])
+            else:
+                # the token corresponds to a single word
+                if token_idx == 1:
+                    token_index_clusters.append([previous_token_idx])
+                token_index_clusters.append([token_idx])
+
+        return [[tokens[idx] for idx in cluster] for cluster in token_index_clusters]
+
+    @staticmethod
+    def _entity_index_to_keep(
+        entities: List[Dict[Text, Any]], entity_indices: List[int]
+    ) -> Optional[int]:
+        """
+        Determine the entity index to keep.
+
+        If we just have one entity index, i.e. one candidate, we return the index of
+        that candidate. If we have multiple candidates, we return the index of the
+        entity value with the highest confidence score. If no confidence score is
+        present, no entity label will be kept.
+
         Args:
             entities: the full list of entities
-            indices: the indices to consider
+            entity_indices: the entity indices to consider
 
-        Returns: the idx of the entity label with the highest confidence.
+        Returns: the index of the entity to keep
         """
+        if len(entity_indices) == 1:
+            return entity_indices[0]
+
         confidences = [
             entities[idx]["confidence"]
-            for idx in indices
+            for idx in entity_indices
             if "confidence" in entities[idx]
         ]
 
-        if len(confidences) != len(indices):
+        # we don't have confidence values for all entity labels
+        if len(confidences) != len(entity_indices):
             return None
 
         return confidences.index(max(confidences))
```
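
An end-to-end illustration of what the new `clean_up_entities` is meant to produce. The tokens, labels, and confidences are invented; `(start, end)` pairs stand in for `Token` objects:

```python
# "play forecast", with "forecast" split into the sub-tokens "fore" + "cast";
# (start, end) pairs stand in for Token objects.
tokens = [(0, 4), (5, 9), (9, 13)]
text = "play forecast"

# Two competing sub-word labels on the same word:
entities = [
    {"entity": "item", "start": 5, "end": 9, "value": "fore", "confidence": 0.4},
    {"entity": "time", "start": 9, "end": 13, "value": "cast", "confidence": 0.9},
]

# Both labels touch the token cluster spanning (5, 13), so they form one
# misaligned entity. With keep=True, only the higher-confidence label
# survives, expanded to cover the complete word:
expected = [
    {"entity": "time", "start": 5, "end": 13, "value": "forecast", "confidence": 0.9}
]

# With keep=False, both labels would be removed instead.
```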

rasa/nlu/extractors/mitie_entity_extractor.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -142,9 +142,8 @@ def process(self, message: Message, **kwargs: Any) -> None:
             message.text, self._tokens_without_cls(message), mitie_feature_extractor
         )
         extracted = self.add_extractor_name(ents)
-        message.set(
-            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
-        )
+        extracted = self.clean_up_entities(message, extracted)
+        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)
 
     @classmethod
     def load(
```

rasa/nlu/extractors/spacy_entity_extractor.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -32,13 +32,12 @@ def process(self, message: Message, **kwargs: Any) -> None:
         spacy_nlp = kwargs.get("spacy_nlp", None)
         doc = spacy_nlp(message.text)
         all_extracted = self.add_extractor_name(self.extract_entities(doc))
+        all_extracted = self.clean_up_entities(message, all_extracted)
         dimensions = self.component_config["dimensions"]
         extracted = SpacyEntityExtractor.filter_irrelevant_entities(
             all_extracted, dimensions
         )
-        message.set(
-            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
-        )
+        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)
 
     @staticmethod
     def extract_entities(doc: "Doc") -> List[Dict[Text, Any]]:
```
