@@ -63,8 +63,16 @@ def create_argument_parser():
     parser.add_argument('-f', '--folds', required=False, default=10,
                         help="number of CV folds (crossvalidation only)")
 
+    parser.add_argument('--report', required=False, nargs='?',
+                        const="report.json", default=False,
+                        help="output path to save the metrics report")
+
+    parser.add_argument('--successes', required=False, nargs='?',
+                        const="successes.json", default=False,
+                        help="output path to save successful predictions")
+
     parser.add_argument('--errors', required=False, default="errors.json",
-                        help="output path for the json with wrong predictions")
+                        help="output path to save model errors")
 
     parser.add_argument('--histogram', required=False, default="hist.png",
                         help="output path for the confidence histogram")
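For reference, the `nargs='?'` / `const` / `default=False` pattern used for the new `--report` and `--successes` options yields three states: flag omitted (stays `False`, output disabled), flag given bare (falls back to the `const` filename), or flag given with an explicit path. A minimal standalone sketch of that argparse behaviour, separate from the module's full parser:

```python
import argparse

# Standalone illustration of the argparse pattern above, not the full parser.
parser = argparse.ArgumentParser()
parser.add_argument('--report', required=False, nargs='?',
                    const="report.json", default=False,
                    help="output path to save the metrics report")

print(parser.parse_args([]).report)                        # False (flag omitted)
print(parser.parse_args(['--report']).report)              # 'report.json' (const)
print(parser.parse_args(['--report', 'out.json']).report)  # 'out.json'
```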
@@ -163,14 +171,15 @@ def log_evaluation_table(report, # type: Text
     logger.info("Classification report: \n{}".format(report))
 
 
-def get_evaluation_metrics(targets, predictions):  # pragma: no cover
+def get_evaluation_metrics(targets, predictions, output_dict=False):  # pragma: no cover
     """Compute the f1, precision, accuracy and summary report from sklearn."""
     from sklearn import metrics
 
     targets = clean_intent_labels(targets)
     predictions = clean_intent_labels(predictions)
 
-    report = metrics.classification_report(targets, predictions)
+    report = metrics.classification_report(targets, predictions,
+                                            output_dict=output_dict)
    precision = metrics.precision_score(targets, predictions,
                                        average='weighted')
    f1 = metrics.f1_score(targets, predictions, average='weighted')
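The new `output_dict` pass-through relies on scikit-learn's `classification_report` accepting `output_dict=True` (available since scikit-learn 0.20), which returns a nested dict of per-label metrics instead of a formatted string, so the report can be written out as JSON. A small sketch with toy labels:

```python
import json
from sklearn import metrics

targets = ["greet", "greet", "goodbye", "affirm"]
predictions = ["greet", "goodbye", "goodbye", "affirm"]

# With output_dict=True the report is a plain dict (per label plus averages),
# so it can be dumped straight to JSON rather than logged as a table.
report = metrics.classification_report(targets, predictions, output_dict=True)
print(report["greet"]["precision"])
print(json.dumps(report, indent=4))
```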
@@ -213,37 +222,50 @@ def drop_intents_below_freq(td, cutoff=5):
     return TrainingData(keep_examples, td.entity_synonyms, td.regex_features)
 
 
-def save_nlu_errors(errors, filename):
-    """Write out nlu classification errors to a file."""
+def save_json(data, filename):
+    """Write out nlu classification results to a file."""
 
     utils.write_to_file(filename,
-                        json.dumps(errors, indent=4, ensure_ascii=False))
-    logger.info("Model prediction errors saved to {}.".format(filename))
+                        json.dumps(data, indent=4, ensure_ascii=False))
+
+
+def collect_nlu_successes(intent_results, successes_filename):
+    """Log messages which result in successful predictions
+    and save them to file."""
+
+    successes = [{"text": r.message,
+                  "intent": r.target,
+                  "intent_prediction": {"name": r.prediction,
+                                        "confidence": r.confidence}}
+                 for r in intent_results if r.target == r.prediction]
+
+    if successes:
+        save_json(successes, successes_filename)
+        logger.info("Model prediction successes saved to {}."
+                    .format(successes_filename))
+        logger.debug("\n\nSuccessfully predicted the following "
+                     "intents: \n{}".format(successes))
+    else:
+        logger.info("Your model made no successful predictions")
 
 
-def collect_nlu_errors(intent_results):  # pragma: no cover
+def collect_nlu_errors(intent_results, errors_filename):
     """Log messages which result in wrong predictions and save them to file"""
 
-    # it could be interesting to include entity-errors later
-    # therefore we start with a "intent_errors" key
-    intent_errors = [{"text": r.message,
-                      "intent": r.target,
-                      "intent_prediction": {
-                          "name": r.prediction,
-                          "confidence": r.confidence
-                      }}
-                     for r in intent_results if r.target != r.prediction]
-
-    if intent_errors:
-        logger.info("There were some nlu intent classification errors. "
-                    "Use `--verbose` to show them in the log.")
-        logger.debug("\n\nThese intent examples could not be classified "
-                     "correctly \n{}".format(intent_errors))
+    errors = [{"text": r.message,
+               "intent": r.target,
+               "intent_prediction": {"name": r.prediction,
+                                     "confidence": r.confidence}}
+              for r in intent_results if r.target != r.prediction]
 
-        return {'intent_errors': intent_errors}
+    if errors:
+        save_json(errors, errors_filename)
+        logger.info("Model prediction errors saved to {}."
+                    .format(errors_filename))
+        logger.debug("\n\nThese intent examples could not be classified "
+                     "correctly: \n{}".format(errors))
     else:
-        logger.info("No prediction errors were found. You are AWESOME!")
-        return None
+        logger.info("Your model made no errors")
 
 
 def plot_intent_confidences(intent_results, intent_hist_filename):
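The two collectors above split one list of intent results on whether `target` equals `prediction`. A minimal sketch of that filtering, using a namedtuple as a hypothetical stand-in for the module's result objects (assumed here to expose `target`, `prediction`, `message` and `confidence`):

```python
from collections import namedtuple

# Hypothetical stand-in for the evaluation result objects used above.
Result = namedtuple("Result", "target prediction message confidence")

intent_results = [
    Result("greet", "greet", "hello there", 0.95),
    Result("goodbye", "affirm", "yes, see you", 0.42),
]

successes = [r for r in intent_results if r.target == r.prediction]
errors = [r for r in intent_results if r.target != r.prediction]
print(len(successes), len(errors))  # 1 1
```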
@@ -262,6 +284,8 @@ def plot_intent_confidences(intent_results, intent_hist_filename):
 
 
 def evaluate_intents(intent_results,
+                     report_filename,
+                     successes_filename,
                      errors_filename,
                      confmat_filename,
                      intent_hist_filename):  # pragma: no cover
@@ -284,16 +308,27 @@ def evaluate_intents(intent_results,
 
     targets, predictions = _targets_predictions_from(intent_results)
 
-    report, precision, f1, accuracy = get_evaluation_metrics(targets,
-                                                              predictions)
+    if report_filename:
+        report, precision, f1, accuracy = get_evaluation_metrics(targets,
+                                                                  predictions,
+                                                                  output_dict=True)
 
-    log_evaluation_table(report, precision, f1, accuracy)
+        save_json(report, report_filename)
+        logger.info("Classification report saved to {}."
+                    .format(report_filename))
+
+    else:
+        report, precision, f1, accuracy = get_evaluation_metrics(targets,
+                                                                  predictions)
+        log_evaluation_table(report, precision, f1, accuracy)
 
-    # log and save misclassified samples to file for debugging
-    errors = collect_nlu_errors(intent_results)
+    if successes_filename:
+        # save correctly classified samples to file for debugging
+        collect_nlu_successes(intent_results, successes_filename)
 
-    if errors and errors_filename:
-        save_nlu_errors(errors, errors_filename)
+    if errors_filename:
+        # log and save misclassified samples to file for debugging
+        collect_nlu_errors(intent_results, errors_filename)
 
     if confmat_filename:
         from sklearn.metrics import confusion_matrix
@@ -673,6 +708,8 @@ def remove_duckling_entities(entity_predictions):
 
 
 def run_evaluation(data_path, model,
+                   report_filename=None,
+                   successes_filename=None,
                    errors_filename='errors.json',
                    confmat_filename=None,
                    intent_hist_filename=None,
@@ -706,6 +743,8 @@ def run_evaluation(data_path, model,
 
         logger.info("Intent evaluation results:")
         result['intent_evaluation'] = evaluate_intents(intent_results,
+                                                       report_filename,
+                                                       successes_filename,
                                                        errors_filename,
                                                        confmat_filename,
                                                        intent_hist_filename)
@@ -919,6 +958,8 @@ def main():
     elif cmdline_args.mode == "evaluation":
         run_evaluation(cmdline_args.data,
                        cmdline_args.model,
+                       cmdline_args.report,
+                       cmdline_args.successes,
                        cmdline_args.errors,
                        cmdline_args.confmat,
                        cmdline_args.histogram)
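With the CLI wiring above, the same outputs can presumably also be requested when calling `run_evaluation` directly; a hedged call sketch with placeholder paths, assuming this module is importable as `rasa_nlu.evaluate`:

```python
from rasa_nlu.evaluate import run_evaluation  # assumed import path

# Placeholder paths -- substitute a real NLU test set and trained model.
run_evaluation("data/test_data.json", "models/current/nlu",
               report_filename="report.json",
               successes_filename="successes.json",
               errors_filename="errors.json",
               confmat_filename="confmat.png",
               intent_hist_filename="hist.png")
```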