Merge pull request #1 from shabnam-b/sb

added scripts for 3 tasks
shijie-wu · Oct 21, 2021 · 6fd7ea7 · 6fd7ea7
2 parents ccd91ba + a74a31e
commit 6fd7ea7
Show file tree

Hide file tree

Showing 16 changed files with 2,813 additions and 6 deletions.
diff --git a/example/data-projection/README.md b/example/data-projection/README.md
@@ -71,6 +71,15 @@ done
 for split in train dev test; do
   bash $SCRIPT_DIR/project-ner.sh $split $TGT $ALIGN_ENCODER $ALIGN_LAYER $ALIGN_NAME
 done
+
+# for Arabic as a target language, we could use the following two tasks:
+# IE (ACE)
+bash $SCRIPT_DIR/project-ace.sh $split $TGT $ALIGN_ENCODER $ALIGN_LAYER $ALIGN_NAME
+
+# IE (BETTER abstract)
+for split in train analysis devtest; do
+  bash $SCRIPT_DIR/project-better.sh $split $TGT $ALIGN_ENCODER $ALIGN_LAYER $ALIGN_NAME
+done
 ```
 ### Self-training
 ```bash

diff --git a/example/data-projection/project-ace.sh b/example/data-projection/project-ace.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+# Project English ACE annotations onto machine-translated text in a target
+# language (default target: "ar").
+#
+# Usage: project-ace.sh [split] [tgt] [encoder] [align_layer] [align_system]
+#
+# Every intermediate file is cached under $DIR: a step is skipped when its
+# output file already exists, so delete that file to force the step to re-run.
+src="en"
+
+split=${1:-"devtest"}
+tgt=${2:-"ar"}
+encoder=${3:-"bert-base-multilingual-cased"}
+align_layer=${4:-8}
+align_system=${5:-"mbert_l8"}
+mt_system="helsinki_opus"
+
+max_len=500
+# ACE data dir (left empty here; set it before running)
+DATA_DIR=""
+# temp dir where outputs are saved in, after each step
+DIR="intermediary/ace"
+# path to where final projection file will be saved
+FINAL_DIR="projection/ace"
+mkdir -p "$DIR" "$FINAL_DIR"
+
+# dir containing the splits info (left empty here; set it before running)
+SPLITS_DIR=""
+
+# Step 1: run process_ace.py on the English data, then dump the source text
+# of the requested split, one sentence per line.
+if [ -f "$DIR/$src.$split.text" ]; then
+    echo "$DIR/$src.$split.text exists."
+else
+    python scripts/process_ace.py \
+        --input "$DATA_DIR" \
+        --output "$DIR" \
+        --lang "english"
+
+    python scripts/extract-text.py \
+        --task ace \
+        --path "$DIR" \
+        --lang "$src" \
+        --split "$split" \
+        >"$DIR/$src.$split.text"
+fi
+
+# Step 2: translate the source text with the Helsinki-NLP OPUS-MT model.
+if [ -f "$DIR/$src.to_$tgt.$mt_system.$split.text" ]; then
+    echo "$DIR/$src.to_$tgt.$mt_system.$split.text exists."
+else
+    python scripts/translate.py \
+        --infile "$DIR/$src.$split.text" \
+        --model_name "Helsinki-NLP/opus-mt-$src-$tgt" \
+        --src "$src" \
+        --tgt "$tgt" \
+        >"$DIR/$src.to_$tgt.$mt_system.$split.text"
+fi
+
+# Step 3: join source and translation into a single bitext file.
+if [ -f "$DIR/$src.and_$tgt.$mt_system.$split.text" ]; then
+    echo "$DIR/$src.and_$tgt.$mt_system.$split.text exists."
+else
+    python scripts/bitext-concat.py \
+        --src_fp "$DIR/$src.$split.text" \
+        --tgt_fp "$DIR/$src.to_$tgt.$mt_system.$split.text" \
+        >"$DIR/$src.and_$tgt.$mt_system.$split.text"
+fi
+
+# Step 4: word-align the bitext using the given encoder and layer.
+if [ -f "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" ]; then
+    echo "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align exists."
+else
+    python scripts/awesome-align.py \
+        --data_file "$DIR/$src.and_$tgt.$mt_system.$split.text" \
+        --align_layer "$align_layer" \
+        --model_name_or_path "$encoder" \
+        --max_len "$max_len" \
+        --output_file "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align"
+fi
+
+# Step 5: project the source labels through the alignment (always re-run).
+python scripts/project-label.py \
+    --task ace \
+    --path "$DIR" \
+    --lang "$src" \
+    --split "$split" \
+    --bitext "$DIR/$src.and_$tgt.$mt_system.$split.text" \
+    --alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" \
+    --output_path "$FINAL_DIR" \
+    --name "$tgt.from_$src.$mt_system.$align_system"
+
+# Step 6: rename the projected output and run process_ace.py on it.
+# NOTE(review): "arabic" is hard-coded in the file name and --lang below, so
+# this step ignores $tgt -- confirm before using another target language.
+mv "$FINAL_DIR/out.json" "$FINAL_DIR/arabic.json"
+python scripts/process_ace.py \
+    --input "$DATA_DIR" \
+    --output "$FINAL_DIR" \
+    --lang "arabic" \
+    --split "$SPLITS_DIR"
diff --git a/example/data-projection/project-better.sh b/example/data-projection/project-better.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+# Project English BETTER annotations onto machine-translated text in a target
+# language (default target: "ar") and write a silver dataset to $FINAL_DIR.
+#
+# Usage: project-better.sh [split] [tgt] [encoder] [align_layer] [align_system] [subtask]
+#
+# Every intermediate file is cached under $DIR: a step is skipped when its
+# output file already exists, so delete that file to force the step to re-run.
+src="en"
+
+split=${1:-"analysis"}
+tgt=${2:-"ar"}
+encoder=${3:-"bert-base-multilingual-cased"}
+align_layer=${4:-8}
+align_system=${5:-"mbert_l8"}
+# NOTE(review): the task name passed below is "better-$subtask"; confirm that
+# scripts/extract-text.py knows the subtask before using anything but "abstract".
+subtask=${6:-"abstract"}
+mt_system="helsinki_opus"
+
+max_len=500
+# better data dir (left empty here; set it before running)
+DATA_DIR=""
+# temp dir where outputs are saved in, after each step
+DIR="intermediary/better-$subtask"
+# path to where final projection file will be saved
+FINAL_DIR="projection/better-$subtask"
+mkdir -p "$DIR" "$FINAL_DIR"
+
+# Step 1: tokenize the English bp.json file for the requested split.
+if [ -f "$DIR/$split.tok" ]; then
+    echo "$DIR/$split.tok exists."
+else
+    python scripts/tokenize_en.py \
+        --bpjson "$DATA_DIR/$subtask-8d-inclusive.$split.update2.bp.json" \
+        --output "$DIR/$split.tok"
+fi
+
+# Step 2: dump the source text of the split, one sentence per line.
+if [ -f "$DIR/$src.$split.text" ]; then
+    echo "$DIR/$src.$split.text exists."
+else
+    python scripts/extract-text.py \
+        --task "better-$subtask" \
+        --path "$DIR" \
+        --lang "$src" \
+        --split "$split" \
+        >"$DIR/$src.$split.text"
+fi
+
+# Step 3: translate the source text with the Helsinki-NLP OPUS-MT model.
+if [ -f "$DIR/$src.to_$tgt.$mt_system.$split.text" ]; then
+    echo "$DIR/$src.to_$tgt.$mt_system.$split.text exists."
+else
+    python scripts/translate.py \
+        --infile "$DIR/$src.$split.text" \
+        --model_name "Helsinki-NLP/opus-mt-$src-$tgt" \
+        --src "$src" \
+        --tgt "$tgt" \
+        >"$DIR/$src.to_$tgt.$mt_system.$split.text"
+fi
+
+# Step 4: join source and translation into a single bitext file.
+if [ -f "$DIR/$src.and_$tgt.$mt_system.$split.text" ]; then
+    echo "$DIR/$src.and_$tgt.$mt_system.$split.text exists."
+else
+    python scripts/bitext-concat.py \
+        --src_fp "$DIR/$src.$split.text" \
+        --tgt_fp "$DIR/$src.to_$tgt.$mt_system.$split.text" \
+        >"$DIR/$src.and_$tgt.$mt_system.$split.text"
+fi
+
+# Step 5: word-align the bitext using the given encoder and layer.
+if [ -f "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" ]; then
+    echo "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align exists."
+else
+    python scripts/awesome-align.py \
+        --data_file "$DIR/$src.and_$tgt.$mt_system.$split.text" \
+        --align_layer "$align_layer" \
+        --model_name_or_path "$encoder" \
+        --max_len "$max_len" \
+        --output_file "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align"
+fi
+
+# Step 6: project the source labels through the alignment (always re-run).
+# The output goes to $DIR because it is filtered before the final export.
+python scripts/project-label.py \
+    --task "better-$subtask" \
+    --path "$DIR" \
+    --lang "$src" \
+    --split "$split" \
+    --bitext "$DIR/$src.and_$tgt.$mt_system.$split.text" \
+    --alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" \
+    --output_path "$DIR" \
+    --name "$tgt.from_$src.$mt_system.$align_system"
+
+# Step 7: filter the projected bp.json.
+# (presumably silver-temp-$split.json is written by the projection step above
+# -- verify against scripts/project-label.py)
+python scripts/filter_bpjson.py \
+    --input "$DIR/silver-temp-$split.json" \
+    --outputdir "$DIR"
+
+# Step 8: tokenize the filtered file into the final silver dataset.
+python scripts/tokenize_en.py \
+    --bpjson "$DIR/silver-temp-$split.valid.bp.json" \
+    --output "$FINAL_DIR/$subtask.$split.silver"
diff --git a/example/data-projection/project-muc.sh b/example/data-projection/project-muc.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Project English MUC annotations onto machine-translated text in a target
+# language (default target: "ar").
+#
+# Usage: project-muc.sh [split] [tgt] [encoder] [align_layer] [align_system]
+#
+# Every intermediate file is cached under $DIR: a step is skipped when its
+# output file already exists, so delete that file to force the step to re-run.
+src="en"
+
+split=${1:-"dev"}
+tgt=${2:-"ar"}
+encoder=${3:-"bert-base-multilingual-cased"}
+align_layer=${4:-8}
+align_system=${5:-"mbert_l8"}
+mt_system="helsinki_opus"
+
+max_len=500
+# data from https://github.com/xinyadu/doc_event_role/tree/master/data/processed
+# (left empty here; set it before running)
+DATA_DIR=""
+# temp dir where outputs are saved in, after each step
+DIR="intermediary/muc"
+# path to where final projection file will be saved
+FINAL_DIR="projection/muc"
+mkdir -p "$DIR" "$FINAL_DIR"
+
+# Step 1: dump the source text of the split, one sentence per line.
+if [ -f "$DIR/$src.$split.text" ]; then
+    echo "$DIR/$src.$split.text exists."
+else
+    python scripts/extract-text.py \
+        --task muc \
+        --path "$DATA_DIR" \
+        --lang "$src" \
+        --split "$split" \
+        >"$DIR/$src.$split.text"
+fi
+
+# Step 2: translate the source text with the Helsinki-NLP OPUS-MT model.
+if [ -f "$DIR/$src.to_$tgt.$mt_system.$split.text" ]; then
+    echo "$DIR/$src.to_$tgt.$mt_system.$split.text exists."
+else
+    python scripts/translate.py \
+        --infile "$DIR/$src.$split.text" \
+        --model_name "Helsinki-NLP/opus-mt-$src-$tgt" \
+        --src "$src" \
+        --tgt "$tgt" \
+        >"$DIR/$src.to_$tgt.$mt_system.$split.text"
+fi
+
+# Step 3: join source and translation into a single bitext file.
+if [ -f "$DIR/$src.and_$tgt.$mt_system.$split.text" ]; then
+    echo "$DIR/$src.and_$tgt.$mt_system.$split.text exists."
+else
+    python scripts/bitext-concat.py \
+        --src_fp "$DIR/$src.$split.text" \
+        --tgt_fp "$DIR/$src.to_$tgt.$mt_system.$split.text" \
+        >"$DIR/$src.and_$tgt.$mt_system.$split.text"
+fi
+
+# Step 4: word-align the sentence-level bitext.
+if [ -f "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" ]; then
+    echo "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align exists."
+else
+    python scripts/awesome-align.py \
+        --data_file "$DIR/$src.and_$tgt.$mt_system.$split.text" \
+        --align_layer "$align_layer" \
+        --model_name_or_path "$encoder" \
+        --max_len "$max_len" \
+        --output_file "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align"
+fi
+
+# Step 5: convert the bitext and alignment into the ".doc" / ".doc.align"
+# files consumed by the projection step below (always re-run).
+python scripts/muc_convert_to_doc.py \
+    --path "$DATA_DIR" \
+    --lang "$src" \
+    --split "$split" \
+    --bitext "$DIR/$src.and_$tgt.$mt_system.$split.text" \
+    --alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" \
+    --output_bitext "$DIR/$src.and_$tgt.$mt_system.$split.doc" \
+    --output_alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.doc.align"
+
+# Step 6: project the source labels through the converted alignment.
+python scripts/project-label.py \
+    --task muc \
+    --path "$DATA_DIR" \
+    --lang "$src" \
+    --split "$split" \
+    --bitext "$DIR/$src.and_$tgt.$mt_system.$split.doc" \
+    --alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.doc.align" \
+    --output_path "$FINAL_DIR" \
+    --name "$tgt.from_$src.$mt_system.$align_system"
diff --git a/scripts/ace_align_util.py b/scripts/ace_align_util.py
@@ -0,0 +1,55 @@
+# Read word alignments in Pharaoh format.
+
+# Inputs:
+#
+# 1. Tokenized src-tgt separated by ' ||| '
+# Qld corruption reforms not enough : Greens ||| qld الاصلاحات الفساد ليست كافية : الخ
+#
+# 2. Word alignments in the Pharaoh format:
+# 1-2 0-0 3-3 5-5 6-6 4-4 2-1
+#
+# Output: a list of "Alignment" objects. The ith item of the list is the
+# alignment for the ith line of the input.
+
+
+import sys
+from dataclasses import dataclass
+from typing import Any, Dict
+
+
+@dataclass
+class Alignment:
+    """Token sequences of one sentence pair plus the links between them."""
+
+    # Source-side tokens of the sentence pair.
+    src_tok: list
+    # Target-side tokens of the sentence pair.
+    tgt_tok: list
+    # Source token id -> list of aligned target token ids
+    # (load_aligns stores both sides 1-based).
+    align: dict
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the three fields as a plain dict (values are not copied)."""
+        return dict(
+            src_tok=self.src_tok,
+            tgt_tok=self.tgt_tok,
+            align=self.align,
+        )
+
+
+def load_aligns(src_tgt_path, pha_path):
+    """Load sentence pairs and their word alignments as Alignment objects.
+
+    Args:
+        src_tgt_path: path to a file with one tokenized "src ||| tgt" pair
+            per line.
+        pha_path: path to a file with one line of Pharaoh "i-j" links per
+            sentence pair.
+
+    Returns:
+        A list with one Alignment per line pair, in file order. The indices
+        read from ``pha_path`` are shifted by one before being stored, so
+        ``align`` maps ``i + 1`` to the list of ``j + 1`` values. Reading
+        stops at the end of the shorter file.
+
+    Raises:
+        IndexError: if a bitext line has no " ||| " separator or a link has
+            no "-".
+        ValueError: if a link index is not an integer.
+    """
+    all_alignments = []
+    # Read as UTF-8 explicitly (the target side is non-ASCII text) and make
+    # sure both files are closed even when parsing fails.
+    with open(src_tgt_path, encoding="utf-8") as src_tgt_file, open(
+        pha_path, encoding="utf-8"
+    ) as pha_file:
+        for ph, s_t in zip(pha_file, src_tgt_file):
+            parts = s_t.strip().split(" ||| ")
+            # Tokens are split on single spaces so that their positions keep
+            # matching the indices used in the alignment file.
+            src_toks = parts[0].split(" ")
+            tgt_toks = parts[1].split(" ")
+            align = {}  # src tok id to its list of target toks ids
+            # split() without an argument yields no links for a blank line;
+            # split(" ") returned [""] there and crashed in int("").
+            for link in ph.split():
+                fields = link.split("-")
+                s = int(fields[0])
+                t = int(fields[1])
+                align.setdefault(s + 1, []).append(t + 1)
+            all_alignments.append(Alignment(src_toks, tgt_toks, align))
+    return all_alignments
+
+
+if __name__ == "__main__":
+    # CLI: <"src ||| tgt" bitext file> <Pharaoh alignment file>
+    bitext_path, pharaoh_path = sys.argv[1], sys.argv[2]
+    all_alignments = load_aligns(bitext_path, pharaoh_path)
diff --git a/scripts/extract-text.py b/scripts/extract-text.py
@@ -4,14 +4,24 @@
 import fire
 
 sys.path.append("src")
-from dataset import Dataset, ParsingDataset, WikiAnnNER  # noqa: E402
+from dataset import (  # noqa: E402
+    ACEDataset,
+    BetterDataset,
+    Dataset,
+    MUCDataset,
+    ParsingDataset,
+    WikiAnnNER,
+)
 
 
 def main(task: str, path: str, lang: str, split: str):
 
     MAPPING: Dict[str, Type[Dataset]] = {
         "wikiann": WikiAnnNER,
         "ud": ParsingDataset,
+        "better-abstract": BetterDataset,
+        "ace": ACEDataset,
+        "muc": MUCDataset,
     }
     assert task in MAPPING
     CLASS = MAPPING[task]
@@ -21,7 +31,13 @@ def main(task: str, path: str, lang: str, split: str):
         print("Empty file path")
         exit()
     for example in CLASS.read_file(file_path, lang, split):
-        if task == "ud" or task == "wikiann":
+        if task == "ace":
+            for s in example["sentences"]:
+                print(" ".join(s["tokens"]))
+        elif task == "muc":
+            for i in example["sent"]:
+                print(" ".join(i))
+        else:
             print(" ".join(example["sent"]))