add annotation method

kermitt2 · kermitt2 · May 23, 2022 · May 21, 2022 · May 21, 2022 · May 21, 2022
commit 1ba1def4c5bdb10e43d3dfa27c80a0f0c4806044
diff --git a/delft/applications/datasetTagger.py b/delft/applications/datasetTagger.py
@@ -48,6 +48,7 @@ def configure(architecture, output_path=None, max_sequence_length=-1, batch_size
 
     return batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, early_stop
 
+
 # train a model with all available data
 def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
                input_path=None, output_path=None, fold_count=1,
@@ -96,6 +97,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
     else:
         model.save()
 
+
 # split data, train a model and evaluate it
 def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
                input_path=None, output_path=None, fold_count=1,
@@ -154,10 +156,37 @@ def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=Non
     else:
         model.save()
 
+
 def eval_(input_path=None, architecture=None):
     return
 
 
+# annotate a list of texts
+def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False):
+    """Tag `texts` with the saved 'datasets-<architecture>' model and time the tagging call.
+
+    `model` is kept in the signature but is never read: the name of the model to load
+    is always built from 'datasets', `architecture` and `use_ELMo`.
+    With output_format 'json' the runtime is stored in the result, else it is printed.
+    """
+    model_name = 'datasets-' + architecture
+    if use_ELMo:
+        model_name += '-with_ELMo'
+
+    # keep the loaded tagger in its own local rather than overwriting the `model` argument
+    tagger = Sequence(model_name)
+    tagger.load()
+
+    start_time = time.time()
+    annotations = tagger.tag(texts, output_format, features=features)
+    runtime = round(time.time() - start_time, 3)
+    if output_format == 'json':
+        annotations["runtime"] = runtime  # assumes the json result supports item assignment
+    else:
+        print("runtime: %s seconds " % (runtime))
+    return annotations
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description = "Trainer for dataset recognition models using the DeLFT library")