made it possible to specify C values in tune_segmentation_model.py

dchartash · Aug 5, 2014 · ef82193
1 parent 6a49d06
commit ef82193
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -8,6 +8,8 @@ This repository is pip-installable.  To make it work properly, I recommend runni
 
 Additionally, the syntactic parsing code must be set up to use ZPar.  The simplest but least efficient way is to put the ZPar distribution (version 0.6) in a subdirectory `zpar` (or symbolic link) in the current working directory, along with the English models in a subdirectory `zpar/english`.  For efficiency, a better method is to use the `python-zpar` wrapper, which is currently available at `https://bitbucket.org/desilinguist/python-zpar`.  To set this up, run make and then either a) set an environment variable `ZPAR_LIBRARY_DIR` equal to the directory where `zpar.so` is created (e.g., `/Users/USER1/python-zpar/dist`) to run ZPar as part of the discourse parser, or b) start a separate server using python-zpar's `zpar_server.py`.
 
+Finally, CRF++ (version 0.58) should be installed, and its `bin` directory should be added to your `PATH` environment variable.  See `http://crfpp.googlecode.com/svn/trunk/doc/index.html`.
+
 Input Preparation
 =================
 

diff --git a/discourseparsing/tune_segmentation_model.py b/discourseparsing/tune_segmentation_model.py
@@ -8,7 +8,8 @@
 
 from sklearn.metrics import f1_score, precision_score, recall_score
 
-from discourseparsing.make_segmentation_crfpp_template import make_segmentation_crfpp_template
+from discourseparsing.make_segmentation_crfpp_template \
+    import make_segmentation_crfpp_template
 
 
 def main():
@@ -17,12 +18,21 @@ def main():
     parser.add_argument('train_path',
                         help='The path to the training set .tsv file for CRF++')
     parser.add_argument('dev_path',
-                        help='The path to the development set .tsv file for CRF++')
+                        help='The path to the development set .tsv file for' +
+                        ' CRF++')
     parser.add_argument('model_path_prefix',
-                        help='The path prefix for where the models should be stored.  Multiple files will be saved, for different hyperparameter settings.')
+                        help='The path prefix for where the models should be ' +
+                        'stored.  Multiple files will be saved, for ' +
+                        'different hyperparameter settings.')
     parser.add_argument('--template_path',
-                        help='path to the CRF++ template for segmentation (this will be created if the file does not exist)',
+                        help='path to the CRF++ template for segmentation ' +
+                        '(this will be created if the file does not exist)',
                         default='segmentation_crfpp_template.txt')
+    parser.add_argument('-C', '--C_values',
+                        help='comma-separated list of model complexity ' +
+                        'parameter settings to evaluate.',
+                        default=','.join([str(2.0 ** x)
+                                          for x in range(-6, 7)]))
     args = parser.parse_args()
 
     best_f1 = -1
@@ -34,15 +44,17 @@ def main():
     if not os.path.exists(args.template_path):
         make_segmentation_crfpp_template(args.template_path)
 
-    for C in [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]:
-        model_path = "{}.C{}".format(args.model_path_prefix, C)
+    C_values = [float(x) for x in args.C_values.split(',')]
+    for C_value in C_values:
+        model_path = "{}.C{}".format(args.model_path_prefix, C_value)
         subprocess.call(shlex.split(
             'crf_learn {} {} {} -c {}'.format(args.template_path,
                                               args.train_path,
                                               model_path,
-                                              C)))
+                                              C_value)))
         crf_test_output = subprocess.check_output(shlex.split(
-            'crf_test -m {} {}'.format(model_path, args.dev_path))).decode('utf-8')
+            'crf_test -m {} {}'.format(model_path, args.dev_path))) \
+            .decode('utf-8')
         output_split = [re.split(r'\t', x)[-2:]
                         for x in re.split(r'\n+', crf_test_output)
                         if x.strip()]
@@ -57,11 +69,11 @@ def main():
             best_f1 = f1
             best_precision = precision
             best_recall = recall
-            best_C = C
+            best_C = C_value
             best_model_path = model_path
 
         print("model path = {}".format(model_path))
-        print("C = {}".format(C))
+        print("C = {}".format(C_value))
         print("precision (B-EDU class) = {}".format(precision))
         print("recall (B-EDU class) = {}".format(recall))
         print("F1 (B-EDU class) = {}".format(f1))