@@ -63,8 +63,16 @@ def create_argument_parser():
     parser.add_argument('-f', '--folds', required=False, default=10,
                         help="number of CV folds (crossvalidation only)")
 
+    parser.add_argument('--report', required=False, nargs='?',
+                        const="report.json", default=False,
+                        help="output path to save the metrics report")
+
+    parser.add_argument('--successes', required=False, nargs='?',
+                        const="successes.json", default=False,
+                        help="output path to save successful predictions")
+
     parser.add_argument('--errors', required=False, default="errors.json",
-                        help="output path for the json with wrong predictions")
+                        help="output path to save model errors")
 
     parser.add_argument('--histogram', required=False, default="hist.png",
                         help="output path for the confidence histogram")
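For reference, the `nargs='?'` / `const` / `default=False` pattern used for the new `--report` and `--successes` options yields three states: flag omitted (stays `False`, output disabled), flag given bare (falls back to the `const` filename), or flag given with an explicit path. A minimal standalone sketch of that argparse behaviour, separate from the module's full parser:

```python
import argparse

# Standalone illustration of the argparse pattern above, not the full parser.
parser = argparse.ArgumentParser()
parser.add_argument('--report', required=False, nargs='?',
                    const="report.json", default=False,
                    help="output path to save the metrics report")

print(parser.parse_args([]).report)                        # False (flag omitted)
print(parser.parse_args(['--report']).report)              # 'report.json' (const)
print(parser.parse_args(['--report', 'out.json']).report)  # 'out.json'
```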
@@ -163,14 +171,15 @@ def log_evaluation_table(report, # type: Text
     logger.info("Classification report: \n{}".format(report))
 
 
-def get_evaluation_metrics(targets, predictions):  # pragma: no cover
+def get_evaluation_metrics(targets, predictions, output_dict=False):  # pragma: no cover
     """Compute the f1, precision, accuracy and summary report from sklearn."""
     from sklearn import metrics
 
     targets = clean_intent_labels(targets)
     predictions = clean_intent_labels(predictions)
 
-    report = metrics.classification_report(targets, predictions)
+    report = metrics.classification_report(targets, predictions,
+                                            output_dict=output_dict)
    precision = metrics.precision_score(targets, predictions,
                                        average='weighted')
    f1 = metrics.f1_score(targets, predictions, average='weighted')
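The new `output_dict` pass-through relies on scikit-learn's `classification_report` accepting `output_dict=True` (available since scikit-learn 0.20), which returns a nested dict of per-label metrics instead of a formatted string, so the report can be written out as JSON. A small sketch with toy labels:

```python
import json
from sklearn import metrics

targets = ["greet", "greet", "goodbye", "affirm"]
predictions = ["greet", "goodbye", "goodbye", "affirm"]

# With output_dict=True the report is a plain dict (per label plus averages),
# so it can be dumped straight to JSON rather than logged as a table.
report = metrics.classification_report(targets, predictions, output_dict=True)
print(report["greet"]["precision"])
print(json.dumps(report, indent=4))
```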
@@ -213,37 +222,50 @@ def drop_intents_below_freq(td, cutoff=5):
     return TrainingData(keep_examples, td.entity_synonyms, td.regex_features)
 
 
-def save_nlu_errors(errors, filename):
-    """Write out nlu classification errors to a file."""
+def save_json(data, filename):
+    """Write out nlu classification results to a file."""
 
     utils.write_to_file(filename,
-                        json.dumps(errors, indent=4, ensure_ascii=False))
-    logger.info("Model prediction errors saved to {}.".format(filename))
+                        json.dumps(data, indent=4, ensure_ascii=False))
+
+
+def collect_nlu_successes(intent_results, successes_filename):
+    """Log messages which result in successful predictions
+    and save them to file."""
+
+    successes = [{"text": r.message,
+                  "intent": r.target,
+                  "intent_prediction": {"name": r.prediction,
+                                        "confidence": r.confidence}}
+                 for r in intent_results if r.target == r.prediction]
+
+    if successes:
+        save_json(successes, successes_filename)
+        logger.info("Model prediction successes saved to {}."
+                    .format(successes_filename))
+        logger.debug("\n\nSuccessfully predicted the following "
+                     "intents: \n{}".format(successes))
+    else:
+        logger.info("Your model made no successful predictions")
 
 
-def collect_nlu_errors(intent_results):  # pragma: no cover
+def collect_nlu_errors(intent_results, errors_filename):
     """Log messages which result in wrong predictions and save them to file"""
 
-    # it could be interesting to include entity-errors later
-    # therefore we start with a "intent_errors" key
-    intent_errors = [{"text": r.message,
-                      "intent": r.target,
-                      "intent_prediction": {
-                          "name": r.prediction,
-                          "confidence": r.confidence
-                      }}
-                     for r in intent_results if r.target != r.prediction]
-
-    if intent_errors:
-        logger.info("There were some nlu intent classification errors. "
-                    "Use `--verbose` to show them in the log.")
-        logger.debug("\n\nThese intent examples could not be classified "
-                     "correctly \n{}".format(intent_errors))
+    errors = [{"text": r.message,
+               "intent": r.target,
+               "intent_prediction": {"name": r.prediction,
+                                     "confidence": r.confidence}}
+              for r in intent_results if r.target != r.prediction]
 
-        return {'intent_errors': intent_errors}
+    if errors:
+        save_json(errors, errors_filename)
+        logger.info("Model prediction errors saved to {}."
+                    .format(errors_filename))
+        logger.debug("\n\nThese intent examples could not be classified "
+                     "correctly: \n{}".format(errors))
     else:
-        logger.info("No prediction errors were found. You are AWESOME!")
-        return None
+        logger.info("Your model made no errors")
 
 
 def plot_intent_confidences(intent_results, intent_hist_filename):
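The two collectors above split one list of intent results on whether `target` equals `prediction`. A minimal sketch of that filtering, using a namedtuple as a hypothetical stand-in for the module's result objects (assumed here to expose `target`, `prediction`, `message` and `confidence`):

```python
from collections import namedtuple

# Hypothetical stand-in for the evaluation result objects used above.
Result = namedtuple("Result", "target prediction message confidence")

intent_results = [
    Result("greet", "greet", "hello there", 0.95),
    Result("goodbye", "affirm", "yes, see you", 0.42),
]

successes = [r for r in intent_results if r.target == r.prediction]
errors = [r for r in intent_results if r.target != r.prediction]
print(len(successes), len(errors))  # 1 1
```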
@@ -262,6 +284,8 @@ def plot_intent_confidences(intent_results, intent_hist_filename):
 
 
 def evaluate_intents(intent_results,
+                     report_filename,
+                     successes_filename,
                      errors_filename,
                      confmat_filename,
                      intent_hist_filename):  # pragma: no cover
@@ -284,16 +308,27 @@ def evaluate_intents(intent_results,
 
     targets, predictions = _targets_predictions_from(intent_results)
 
-    report, precision, f1, accuracy = get_evaluation_metrics(targets,
-                                                              predictions)
+    if report_filename:
+        report, precision, f1, accuracy = get_evaluation_metrics(targets,
+                                                                  predictions,
+                                                                  output_dict=True)
 
-    log_evaluation_table(report, precision, f1, accuracy)
+        save_json(report, report_filename)
+        logger.info("Classification report saved to {}."
+                    .format(report_filename))
+
+    else:
+        report, precision, f1, accuracy = get_evaluation_metrics(targets,
+                                                                  predictions)
+        log_evaluation_table(report, precision, f1, accuracy)
 
-    # log and save misclassified samples to file for debugging
-    errors = collect_nlu_errors(intent_results)
+    if successes_filename:
+        # save correctly classified samples to file for debugging
+        collect_nlu_successes(intent_results, successes_filename)
 
-    if errors and errors_filename:
-        save_nlu_errors(errors, errors_filename)
+    if errors_filename:
+        # log and save misclassified samples to file for debugging
+        collect_nlu_errors(intent_results, errors_filename)
 
     if confmat_filename:
         from sklearn.metrics import confusion_matrix
@@ -673,6 +708,8 @@ def remove_duckling_entities(entity_predictions):
 
 
 def run_evaluation(data_path, model,
+                   report_filename=None,
+                   successes_filename=None,
                    errors_filename='errors.json',
                    confmat_filename=None,
                    intent_hist_filename=None,
@@ -706,6 +743,8 @@ def run_evaluation(data_path, model,
 
         logger.info("Intent evaluation results:")
         result['intent_evaluation'] = evaluate_intents(intent_results,
+                                                       report_filename,
+                                                       successes_filename,
                                                        errors_filename,
                                                        confmat_filename,
                                                        intent_hist_filename)
@@ -919,6 +958,8 @@ def main():
     elif cmdline_args.mode == "evaluation":
         run_evaluation(cmdline_args.data,
                        cmdline_args.model,
+                       cmdline_args.report,
+                       cmdline_args.successes,
                        cmdline_args.errors,
                        cmdline_args.confmat,
                        cmdline_args.histogram)
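With the CLI wiring above, the same outputs can presumably also be requested when calling `run_evaluation` directly; a hedged call sketch with placeholder paths, assuming this module is importable as `rasa_nlu.evaluate`:

```python
from rasa_nlu.evaluate import run_evaluation  # assumed import path

# Placeholder paths -- substitute a real NLU test set and trained model.
run_evaluation("data/test_data.json", "models/current/nlu",
               report_filename="report.json",
               successes_filename="successes.json",
               errors_filename="errors.json",
               confmat_filename="confmat.png",
               intent_hist_filename="hist.png")
```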