Merge pull request #1 from shabnam-b/sb
added scripts for 3 tasks
shijie-wu authored Oct 21, 2021
2 parents ccd91ba + a74a31e commit 6fd7ea7
Showing 16 changed files with 2,813 additions and 6 deletions.
9 changes: 9 additions & 0 deletions example/data-projection/README.md
@@ -71,6 +71,15 @@ done
for split in train dev test; do
bash $SCRIPT_DIR/project-ner.sh $split $TGT $ALIGN_ENCODER $ALIGN_LAYER $ALIGN_NAME
done

# For Arabic as the target language, the following two tasks are also available:
# IE (ACE)
bash $SCRIPT_DIR/project-ace.sh $split $TGT $ALIGN_ENCODER $ALIGN_LAYER $ALIGN_NAME

# IE (BETTER abstract)
for split in train analysis devtest; do
bash $SCRIPT_DIR/project-better.sh $split $TGT $ALIGN_ENCODER $ALIGN_LAYER $ALIGN_NAME
done
```
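The variables used above (`$SCRIPT_DIR`, `$TGT`, `$ALIGN_ENCODER`, `$ALIGN_LAYER`, `$ALIGN_NAME`) are expected to be set earlier in the README (not shown in this diff); a minimal sketch of illustrative values, mirroring the defaults hard-coded in the new scripts below, might look like:

```bash
# Illustrative values only; the actual definitions live earlier in this README.
SCRIPT_DIR=example/data-projection          # assumed location of the project-*.sh scripts
TGT=ar                                      # target language
ALIGN_ENCODER=bert-base-multilingual-cased  # encoder used for word alignment
ALIGN_LAYER=8                               # alignment layer
ALIGN_NAME=mbert_l8                         # tag used in output file names
```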
### Self-training
```bash
85 changes: 85 additions & 0 deletions example/data-projection/project-ace.sh
@@ -0,0 +1,85 @@
#!/bin/bash
src="en"

split=${1:-"devtest"}
tgt=${2:-"ar"}
encoder=${3:-"bert-base-multilingual-cased"}
align_layer=${4:-8}
align_system=${5:-"mbert_l8"}
mt_system="helsinki_opus"

max_len=500
# path to the ACE data directory
DATA_DIR=""
# temporary directory where intermediate outputs from each step are saved
DIR="intermediary/ace"
# directory where the final projection file will be saved
FINAL_DIR="projection/ace"
mkdir -p "$DIR" "$FINAL_DIR"

# directory containing the ACE split information
SPLITS_DIR=""

if [ -f "$DIR/$src.$split.text" ]; then
echo "$DIR/$src.$split.text exists."
else
python scripts/process_ace.py \
--input "$DATA_DIR" \
--output "$DIR" \
--lang "english"

python scripts/extract-text.py \
--task ace \
--path "$DIR" \
--lang $src \
--split "$split" \
>"$DIR/$src.$split.text"
fi

if [ -f "$DIR/$src.to_$tgt.$mt_system.$split.text" ]; then
echo "$DIR/$src.to_$tgt.$mt_system.$split.text exists."
else
python scripts/translate.py \
--infile "$DIR/$src.$split.text" \
--model_name "Helsinki-NLP/opus-mt-$src-$tgt" \
--src $src \
--tgt "$tgt" \
>"$DIR/$src.to_$tgt.$mt_system.$split.text"
fi

if [ -f "$DIR/$src.and_$tgt.$mt_system.$split.text" ]; then
echo "$DIR/$src.and_$tgt.$mt_system.$split.text exists."
else
python scripts/bitext-concat.py \
--src_fp "$DIR/$src.$split.text" \
--tgt_fp "$DIR/$src.to_$tgt.$mt_system.$split.text" \
>"$DIR/$src.and_$tgt.$mt_system.$split.text"
fi

if [ -f "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" ]; then
echo "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align exists."
else
python scripts/awesome-align.py \
--data_file "$DIR/$src.and_$tgt.$mt_system.$split.text" \
--align_layer "$align_layer" \
--model_name_or_path "$encoder" \
--max_len $max_len \
--output_file "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align"
fi

python scripts/project-label.py \
--task ace \
--path "$DIR" \
--lang $src \
--split "$split" \
--bitext "$DIR/$src.and_$tgt.$mt_system.$split.text" \
--alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" \
--output_path $FINAL_DIR \
--name "$tgt.from_$src.$mt_system.$align_system"

mv "$FINAL_DIR/out.json" "$FINAL_DIR/arabic.json"
python scripts/process_ace.py \
--input "$DATA_DIR" \
--output "$FINAL_DIR" \
--lang "arabic" \
--split "$SPLITS_DIR"
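For orientation, the five positional arguments of `project-ace.sh` are the split, target language, alignment encoder, alignment layer, and alignment-system name; `DATA_DIR` and `SPLITS_DIR` must be filled in inside the script before running. A minimal sketch of a direct invocation, using the script's own defaults:

```bash
# DATA_DIR and SPLITS_DIR must be set inside project-ace.sh first; values below mirror its defaults.
bash example/data-projection/project-ace.sh devtest ar bert-base-multilingual-cased 8 mbert_l8
```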
88 changes: 88 additions & 0 deletions example/data-projection/project-better.sh
@@ -0,0 +1,88 @@
#!/bin/bash
src="en"

split=${1:-"analysis"}
tgt=${2:-"ar"}
encoder=${3:-"bert-base-multilingual-cased"}
align_layer=${4:-8}
align_system=${5:-"mbert_l8"}
subtask=${6:-"abstract"}
mt_system="helsinki_opus"

max_len=500
# path to the BETTER data directory
DATA_DIR=""
# temporary directory where intermediate outputs from each step are saved
DIR="intermediary/better-$subtask"
# directory where the final projection file will be saved
FINAL_DIR="projection/better-$subtask"
mkdir -p "$DIR" "$FINAL_DIR"

if [ -f "$DIR/$split.tok" ]; then
echo "$DIR/$split.tok exists."
else
mkdir -p "$DIR"
python scripts/tokenize_en.py \
--bpjson "$DATA_DIR/$subtask-8d-inclusive.$split.update2.bp.json" \
--output "$DIR/$split.tok"
fi

if [ -f "$DIR/$src.$split.text" ]; then
echo "$DIR/$src.$split.text exists."
else
python scripts/extract-text.py \
--task "better-$subtask" \
--path "$DIR" \
--lang $src \
--split "$split" \
>"$DIR/$src.$split.text"
fi

if [ -f "$DIR/$src.to_$tgt.$mt_system.$split.text" ]; then
echo "$DIR/$src.to_$tgt.$mt_system.$split.text exists."
else
python scripts/translate.py \
--infile "$DIR/$src.$split.text" \
--model_name "Helsinki-NLP/opus-mt-$src-$tgt" \
--src $src \
--tgt "$tgt" \
>"$DIR/$src.to_$tgt.$mt_system.$split.text"
fi

if [ -f "$DIR/$src.and_$tgt.$mt_system.$split.text" ]; then
echo "$DIR/$src.and_$tgt.$mt_system.$split.text exists."
else
python scripts/bitext-concat.py \
--src_fp "$DIR/$src.$split.text" \
--tgt_fp "$DIR/$src.to_$tgt.$mt_system.$split.text" \
>"$DIR/$src.and_$tgt.$mt_system.$split.text"
fi

if [ -f "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" ]; then
echo "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align exists."
else
python scripts/awesome-align.py \
--data_file "$DIR/$src.and_$tgt.$mt_system.$split.text" \
--align_layer "$align_layer" \
--model_name_or_path "$encoder" \
--max_len $max_len \
--output_file "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align"
fi

python scripts/project-label.py \
--task "better-$subtask" \
--path "$DIR" \
--lang $src \
--split "$split" \
--bitext "$DIR/$src.and_$tgt.$mt_system.$split.text" \
--alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" \
--output_path "$DIR" \
--name "$tgt.from_$src.$mt_system.$align_system"

python scripts/filter_bpjson.py \
--input "$DIR/silver-temp-$split.json" \
--outputdir "$DIR"

python scripts/tokenize_en.py \
--bpjson "$DIR/silver-temp-$split.valid.bp.json" \
--output "$FINAL_DIR/$subtask.$split.silver"
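`project-better.sh` additionally takes a sixth positional argument selecting the BETTER subtask (default `abstract`). A sketch of projecting all three BETTER splits explicitly, assuming the same alignment settings as above:

```bash
# The sixth argument picks the BETTER subtask; "abstract" is the default.
for split in train analysis devtest; do
  bash example/data-projection/project-better.sh $split ar bert-base-multilingual-cased 8 mbert_l8 abstract
done
```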
79 changes: 79 additions & 0 deletions example/data-projection/project-muc.sh
@@ -0,0 +1,79 @@
#!/bin/bash
src="en"

split=${1:-"dev"}
tgt=${2:-"ar"}
encoder=${3:-"bert-base-multilingual-cased"}
align_layer=${4:-8}
align_system=${5:-"mbert_l8"}
mt_system="helsinki_opus"

max_len=500
# MUC data from https://github.com/xinyadu/doc_event_role/tree/master/data/processed
DATA_DIR=""
# temporary directory where intermediate outputs from each step are saved
DIR="intermediary/muc"
# directory where the final projection file will be saved
FINAL_DIR="projection/muc"
mkdir -p "$DIR" "$FINAL_DIR"

if [ -f "$DIR/$src.$split.text" ]; then
echo "$DIR/$src.$split.text exists."
else
python scripts/extract-text.py \
--task muc \
--path "$DATA_DIR" \
--lang $src \
--split "$split" \
>"$DIR/$src.$split.text"
fi

if [ -f "$DIR/$src.to_$tgt.$mt_system.$split.text" ]; then
echo "$DIR/$src.to_$tgt.$mt_system.$split.text exists."
else
python scripts/translate.py \
--infile "$DIR/$src.$split.text" \
--model_name "Helsinki-NLP/opus-mt-$src-$tgt" \
--src $src \
--tgt "$tgt" \
>"$DIR/$src.to_$tgt.$mt_system.$split.text"
fi

if [ -f "$DIR/$src.and_$tgt.$mt_system.$split.text" ]; then
echo "$DIR/$src.and_$tgt.$mt_system.$split.text exists."
else
python scripts/bitext-concat.py \
--src_fp "$DIR/$src.$split.text" \
--tgt_fp "$DIR/$src.to_$tgt.$mt_system.$split.text" \
>"$DIR/$src.and_$tgt.$mt_system.$split.text"
fi

if [ -f "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" ]; then
echo "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align exists."
else
python scripts/awesome-align.py \
--data_file "$DIR/$src.and_$tgt.$mt_system.$split.text" \
--align_layer "$align_layer" \
--model_name_or_path "$encoder" \
--max_len $max_len \
--output_file "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align"
fi

python scripts/muc_convert_to_doc.py \
--path "$DATA_DIR" \
--lang $src \
--split "$split" \
--bitext "$DIR/$src.and_$tgt.$mt_system.$split.text" \
--alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" \
--output_bitext "$DIR/$src.and_$tgt.$mt_system.$split.doc" \
--output_alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.doc.align"

python scripts/project-label.py \
--task muc \
--path "$DATA_DIR" \
--lang $src \
--split "$split" \
--bitext "$DIR/$src.and_$tgt.$mt_system.$split.doc" \
--alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.doc.align" \
--output_path $FINAL_DIR \
--name "$tgt.from_$src.$mt_system.$align_system"
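Unlike the other two scripts, `project-muc.sh` first merges the sentence-level bitext and alignments into document-level files (via `muc_convert_to_doc.py`) before projecting labels. A sketch of running it, assuming the train/dev/test split names used by the doc_event_role release:

```bash
# DATA_DIR must point to the processed doc_event_role data; the split names are assumed.
for split in train dev test; do
  bash example/data-projection/project-muc.sh $split ar bert-base-multilingual-cased 8 mbert_l8
done
```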
55 changes: 55 additions & 0 deletions scripts/ace_align_util.py
@@ -0,0 +1,55 @@
# Read word alignments in Pharaoh format.

# Inputs:
#
# 1. Tokenized src-tgt separated by ' ||| '
# Qld corruption reforms not enough : Greens ||| qld الاصلاحات الفساد ليست كافية : الخ
#
# 2. Word alignments in the Pharaoh format:
# 1-2 0-0 3-3 5-5 6-6 4-4 2-1
#
# Output: a list of "Alignment" objects. The ith item of the list is the
# alignment for the ith line of the input.


import sys
from dataclasses import dataclass
from typing import Any, Dict


@dataclass
class Alignment:
    src_tok: list
    tgt_tok: list
    align: dict

    def to_dict(self) -> Dict[str, Any]:
        return {"src_tok": self.src_tok, "tgt_tok": self.tgt_tok, "align": self.align}


def load_aligns(src_tgt_path, pha_path):
    all_alignments = []
    with open(src_tgt_path) as src_tgt_file, open(pha_path) as pha_file:
        for ph, s_t in zip(pha_file, src_tgt_file):
            ph = ph.strip()
            s_t = s_t.strip()
            src = s_t.split(" ||| ")[0]
            src_toks = src.split(" ")
            tgt = s_t.split(" ||| ")[1]
            tgt_toks = tgt.split(" ")
            align = {}  # 1-based src token id -> list of 1-based target token ids
            for ph_i in ph.split(" "):
                s = int(ph_i.split("-")[0])
                t = int(ph_i.split("-")[1])
                if s + 1 in align:
                    align[s + 1].append(t + 1)
                else:
                    align[s + 1] = [t + 1]
            alignment = Alignment(src_toks, tgt_toks, align)
            all_alignments.append(alignment)
    return all_alignments


if __name__ == "__main__":
    all_alignments = load_aligns(sys.argv[1], sys.argv[2])
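As a rough usage sketch of `ace_align_util.py` (the file names are placeholders that follow the naming scheme of the projection scripts, and `scripts/` is assumed to be on the Python path), each returned `Alignment` maps 1-based source token indices to lists of 1-based target token indices:

```python
from ace_align_util import load_aligns  # assumes scripts/ is on PYTHONPATH

# Placeholder file names following the project-ace.sh naming scheme.
aligns = load_aligns(
    "intermediary/ace/en.and_ar.helsinki_opus.devtest.text",
    "intermediary/ace/en.and_ar.helsinki_opus.mbert_l8.devtest.align",
)
first = aligns[0]
print(first.src_tok)    # source tokens, e.g. ['Qld', 'corruption', 'reforms', ...]
print(first.align)      # e.g. {2: [3], 1: [1], ...}  (1-based indices)
print(first.to_dict())  # plain-dict view of the same record
```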
20 changes: 18 additions & 2 deletions scripts/extract-text.py
@@ -4,14 +4,24 @@
import fire

sys.path.append("src")
from dataset import Dataset, ParsingDataset, WikiAnnNER # noqa: E402
from dataset import (  # noqa: E402
    ACEDataset,
    BetterDataset,
    Dataset,
    MUCDataset,
    ParsingDataset,
    WikiAnnNER,
)


def main(task: str, path: str, lang: str, split: str):

    MAPPING: Dict[str, Type[Dataset]] = {
        "wikiann": WikiAnnNER,
        "ud": ParsingDataset,
        "better-abstract": BetterDataset,
        "ace": ACEDataset,
        "muc": MUCDataset,
    }
    assert task in MAPPING
    CLASS = MAPPING[task]
@@ -21,7 +31,13 @@ def main(task: str, path: str, lang: str, split: str):
print("Empty file path")
exit()
for example in CLASS.read_file(file_path, lang, split):
if task == "ud" or task == "wikiann":
if task == "ace":
for s in example["sentences"]:
print(" ".join(s["tokens"]))
elif task == "muc":
for i in example["sent"]:
print(" ".join(i))
else:
print(" ".join(example["sent"]))