-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from shabnam-b/sb
added scripts for 3 tasks
- Loading branch information
Showing
16 changed files
with
2,813 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#!/bin/bash
# Label-projection pipeline for ACE.
#
# Steps: preprocess ACE + extract the source text, translate it, concatenate
# source and translation into a bitext, word-align the bitext, project the
# labels through the alignment, then post-process the projected file.
#
# Positional arguments (all optional):
#   $1 split         (default: devtest)
#   $2 tgt           target language code (default: ar)
#   $3 encoder       model used by the aligner (default: bert-base-multilingual-cased)
#   $4 align_layer   (default: 8)
#   $5 align_system  tag used in the alignment file name (default: mbert_l8)
#
# Intermediate files are cached under $DIR: a step is skipped when its output
# file already exists.

# Stop at the first failing step instead of running later steps on bad inputs.
set -euo pipefail

src="en"

split=${1:-"devtest"}
tgt=${2:-"ar"}
encoder=${3:-"bert-base-multilingual-cased"}
align_layer=${4:-8}
align_system=${5:-"mbert_l8"}
mt_system="helsinki_opus"

max_len=500
#ACE data_dir
DATA_DIR=""
#temp dir where outputs are saved in, after each step
DIR="intermediary/ace"
#path to where final projection file will be saved
FINAL_DIR="projection/ace"
mkdir -p "$DIR" "$FINAL_DIR"

#dir containing the splits info
SPLITS_DIR=""

# Cached intermediate files, named once so that the existence checks and the
# commands that create them cannot drift apart.
src_text="$DIR/${src}.${split}.text"
mt_text="$DIR/${src}.to_${tgt}.${mt_system}.${split}.text"
bitext="$DIR/${src}.and_${tgt}.${mt_system}.${split}.text"
align_file="$DIR/${src}.and_${tgt}.${mt_system}.${align_system}.${split}.align"

# Run a command with its stdout captured in file $1. The output is written to a
# temporary name and only renamed on success, so a crashed step never leaves a
# partial file that a later run would mistake for a valid cache entry.
run_to_file() {
    local out="$1"
    shift
    if ! "$@" >"$out.tmp"; then
        rm -f "$out.tmp"
        return 1
    fi
    mv "$out.tmp" "$out"
}

if [ -f "$src_text" ]; then
    echo "$src_text exists."
else
    python scripts/process_ace.py \
        --input "$DATA_DIR" \
        --output "$DIR" \
        --lang "english"

    run_to_file "$src_text" python scripts/extract-text.py \
        --task ace \
        --path "$DIR" \
        --lang "$src" \
        --split "$split"
fi

if [ -f "$mt_text" ]; then
    echo "$mt_text exists."
else
    run_to_file "$mt_text" python scripts/translate.py \
        --infile "$src_text" \
        --model_name "Helsinki-NLP/opus-mt-$src-$tgt" \
        --src "$src" \
        --tgt "$tgt"
fi

if [ -f "$bitext" ]; then
    echo "$bitext exists."
else
    run_to_file "$bitext" python scripts/bitext-concat.py \
        --src_fp "$src_text" \
        --tgt_fp "$mt_text"
fi

if [ -f "$align_file" ]; then
    echo "$align_file exists."
else
    python scripts/awesome-align.py \
        --data_file "$bitext" \
        --align_layer "$align_layer" \
        --model_name_or_path "$encoder" \
        --max_len "$max_len" \
        --output_file "$align_file"
fi

python scripts/project-label.py \
    --task ace \
    --path "$DIR" \
    --lang "$src" \
    --split "$split" \
    --bitext "$bitext" \
    --alignment "$align_file" \
    --output_path "$FINAL_DIR" \
    --name "${tgt}.from_${src}.${mt_system}.${align_system}"

# NOTE(review): the file name and --lang below are hard-coded to Arabic even
# though the target language is configurable via $2 -- confirm what
# process_ace.py expects before running with another target language.
mv "$FINAL_DIR/out.json" "$FINAL_DIR/arabic.json"
python scripts/process_ace.py \
    --input "$DATA_DIR" \
    --output "$FINAL_DIR" \
    --lang "arabic" \
    --split "$SPLITS_DIR"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#!/bin/bash
# Label-projection pipeline for the BETTER data.
#
# Steps: tokenize the English bp.json, extract the source text, translate it,
# concatenate source and translation into a bitext, word-align the bitext,
# project the labels through the alignment, filter the projected bp.json and
# tokenize the filtered result into $FINAL_DIR.
#
# Positional arguments (all optional):
#   $1 split         (default: analysis)
#   $2 tgt           target language code (default: ar)
#   $3 encoder       model used by the aligner (default: bert-base-multilingual-cased)
#   $4 align_layer   (default: 8)
#   $5 align_system  tag used in the alignment file name (default: mbert_l8)
#   $6 subtask       (default: abstract)
#
# Intermediate files are cached under $DIR: a step is skipped when its output
# file already exists.

# Stop at the first failing step instead of running later steps on bad inputs.
set -euo pipefail

src="en"

split=${1:-"analysis"}
tgt=${2:-"ar"}
encoder=${3:-"bert-base-multilingual-cased"}
align_layer=${4:-8}
align_system=${5:-"mbert_l8"}
subtask=${6:-"abstract"}
mt_system="helsinki_opus"

max_len=500
#better data dir
DATA_DIR=""
#temp dir where outputs are saved in, after each step
DIR="intermediary/better-$subtask"
#path to where final projection file will be saved
FINAL_DIR="projection/better-$subtask"
mkdir -p "$DIR" "$FINAL_DIR"

# Cached intermediate files, named once so that the existence checks and the
# commands that create them cannot drift apart.
tok_file="$DIR/${split}.tok"
src_text="$DIR/${src}.${split}.text"
mt_text="$DIR/${src}.to_${tgt}.${mt_system}.${split}.text"
bitext="$DIR/${src}.and_${tgt}.${mt_system}.${split}.text"
align_file="$DIR/${src}.and_${tgt}.${mt_system}.${align_system}.${split}.align"

# Run a command with its stdout captured in file $1. The output is written to a
# temporary name and only renamed on success, so a crashed step never leaves a
# partial file that a later run would mistake for a valid cache entry.
run_to_file() {
    local out="$1"
    shift
    if ! "$@" >"$out.tmp"; then
        rm -f "$out.tmp"
        return 1
    fi
    mv "$out.tmp" "$out"
}

if [ -f "$tok_file" ]; then
    echo "$tok_file exists."
else
    # $DIR already exists at this point (created above), so no mkdir is needed.
    python scripts/tokenize_en.py \
        --bpjson "$DATA_DIR/${subtask}-8d-inclusive.${split}.update2.bp.json" \
        --output "$tok_file"
fi

if [ -f "$src_text" ]; then
    echo "$src_text exists."
else
    run_to_file "$src_text" python scripts/extract-text.py \
        --task "better-$subtask" \
        --path "$DIR" \
        --lang "$src" \
        --split "$split"
fi

if [ -f "$mt_text" ]; then
    echo "$mt_text exists."
else
    run_to_file "$mt_text" python scripts/translate.py \
        --infile "$src_text" \
        --model_name "Helsinki-NLP/opus-mt-$src-$tgt" \
        --src "$src" \
        --tgt "$tgt"
fi

if [ -f "$bitext" ]; then
    echo "$bitext exists."
else
    run_to_file "$bitext" python scripts/bitext-concat.py \
        --src_fp "$src_text" \
        --tgt_fp "$mt_text"
fi

if [ -f "$align_file" ]; then
    echo "$align_file exists."
else
    python scripts/awesome-align.py \
        --data_file "$bitext" \
        --align_layer "$align_layer" \
        --model_name_or_path "$encoder" \
        --max_len "$max_len" \
        --output_file "$align_file"
fi

python scripts/project-label.py \
    --task "better-$subtask" \
    --path "$DIR" \
    --lang "$src" \
    --split "$split" \
    --bitext "$bitext" \
    --alignment "$align_file" \
    --output_path "$DIR" \
    --name "${tgt}.from_${src}.${mt_system}.${align_system}"

# NOTE(review): silver-temp-$split.json is presumably written by the
# project-label step above, and silver-temp-$split.valid.bp.json by
# filter_bpjson.py -- confirm against those scripts.
python scripts/filter_bpjson.py \
    --input "$DIR/silver-temp-${split}.json" \
    --outputdir "$DIR"

python scripts/tokenize_en.py \
    --bpjson "$DIR/silver-temp-${split}.valid.bp.json" \
    --output "$FINAL_DIR/${subtask}.${split}.silver"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#!/bin/bash
# Label-projection pipeline for MUC.
#
# Steps: extract the source text, translate it, concatenate source and
# translation into a bitext, word-align the bitext, convert bitext and
# alignment to their ".doc" counterparts, then project the labels.
#
# Positional arguments (all optional):
#   $1 split         (default: dev)
#   $2 tgt           target language code (default: ar)
#   $3 encoder       model used by the aligner (default: bert-base-multilingual-cased)
#   $4 align_layer   (default: 8)
#   $5 align_system  tag used in the alignment file name (default: mbert_l8)
#
# Intermediate files are cached under $DIR: a step is skipped when its output
# file already exists.

# Stop at the first failing step instead of running later steps on bad inputs.
set -euo pipefail

src="en"

split=${1:-"dev"}
tgt=${2:-"ar"}
encoder=${3:-"bert-base-multilingual-cased"}
align_layer=${4:-8}
align_system=${5:-"mbert_l8"}
mt_system="helsinki_opus"

max_len=500
#data from https://github.com/xinyadu/doc_event_role/tree/master/data/processed
DATA_DIR=""
#temp dir where outputs are saved in, after each step
DIR="intermediary/muc"
#path to where final projection file will be saved
FINAL_DIR="projection/muc"
mkdir -p "$DIR" "$FINAL_DIR"

# Cached intermediate files, named once so that the existence checks and the
# commands that create them cannot drift apart.
src_text="$DIR/${src}.${split}.text"
mt_text="$DIR/${src}.to_${tgt}.${mt_system}.${split}.text"
bitext="$DIR/${src}.and_${tgt}.${mt_system}.${split}.text"
align_file="$DIR/${src}.and_${tgt}.${mt_system}.${align_system}.${split}.align"
doc_bitext="$DIR/${src}.and_${tgt}.${mt_system}.${split}.doc"
doc_align_file="$DIR/${src}.and_${tgt}.${mt_system}.${align_system}.${split}.doc.align"

# Run a command with its stdout captured in file $1. The output is written to a
# temporary name and only renamed on success, so a crashed step never leaves a
# partial file that a later run would mistake for a valid cache entry.
run_to_file() {
    local out="$1"
    shift
    if ! "$@" >"$out.tmp"; then
        rm -f "$out.tmp"
        return 1
    fi
    mv "$out.tmp" "$out"
}

if [ -f "$src_text" ]; then
    echo "$src_text exists."
else
    run_to_file "$src_text" python scripts/extract-text.py \
        --task muc \
        --path "$DATA_DIR" \
        --lang "$src" \
        --split "$split"
fi

if [ -f "$mt_text" ]; then
    echo "$mt_text exists."
else
    run_to_file "$mt_text" python scripts/translate.py \
        --infile "$src_text" \
        --model_name "Helsinki-NLP/opus-mt-$src-$tgt" \
        --src "$src" \
        --tgt "$tgt"
fi

if [ -f "$bitext" ]; then
    echo "$bitext exists."
else
    run_to_file "$bitext" python scripts/bitext-concat.py \
        --src_fp "$src_text" \
        --tgt_fp "$mt_text"
fi

if [ -f "$align_file" ]; then
    echo "$align_file exists."
else
    python scripts/awesome-align.py \
        --data_file "$bitext" \
        --align_layer "$align_layer" \
        --model_name_or_path "$encoder" \
        --max_len "$max_len" \
        --output_file "$align_file"
fi

# These two steps are not cached: they run on every invocation.
python scripts/muc_convert_to_doc.py \
    --path "$DATA_DIR" \
    --lang "$src" \
    --split "$split" \
    --bitext "$bitext" \
    --alignment "$align_file" \
    --output_bitext "$doc_bitext" \
    --output_alignment "$doc_align_file"

python scripts/project-label.py \
    --task muc \
    --path "$DATA_DIR" \
    --lang "$src" \
    --split "$split" \
    --bitext "$doc_bitext" \
    --alignment "$doc_align_file" \
    --output_path "$FINAL_DIR" \
    --name "${tgt}.from_${src}.${mt_system}.${align_system}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# Read word alignments in Pharaoh format. | ||
|
||
# Inputs: | ||
# | ||
# 1. Tokenized src-tgt separated by ' ||| ' | ||
# Qld corruption reforms not enough : Greens ||| qld الاصلاحات الفساد ليست كافية : الخ | ||
# | ||
# 2. Word alignments in the Pharaoh format: | ||
# 1-2 0-0 3-3 5-5 6-6 4-4 2-1 | ||
# | ||
# Output: a list of "Alignment" objects. The ith item of the list is the | ||
# alignment for the ith line of the input. | ||
|
||
|
||
import sys | ||
from dataclasses import dataclass | ||
from typing import Any, Dict | ||
|
||
|
||
@dataclass
class Alignment:
    """Word alignment for a single source/target sentence pair."""

    # Tokens of the source sentence.
    src_tok: list
    # Tokens of the target sentence.
    tgt_tok: list
    # Maps a source token id to the list of target token ids aligned to it.
    align: dict

    def to_dict(self) -> Dict[str, Any]:
        """Return the three fields as a plain dict.

        The values are the very objects held by this instance (no copy is
        made), so mutating them also mutates the Alignment.
        """
        return {"src_tok": self.src_tok, "tgt_tok": self.tgt_tok, "align": self.align}
|
||
|
||
def load_aligns(src_tgt_path, pha_path):
    """Load a bitext file and its Pharaoh-format word alignments.

    Args:
        src_tgt_path: Path to the bitext file; each line holds the tokenized
            source and target sentences separated by ' ||| '.
        pha_path: Path to the alignment file; each line holds
            whitespace-separated ``i-j`` pairs of 0-based source/target
            token indices.

    Returns:
        A list of Alignment objects, one per line pair. In each ``align``
        dict both the source id (key) and the target ids (values) are the
        file's indices shifted by one, i.e. 1-based.

    Raises:
        IndexError: If a bitext line has no ' ||| ' separator, or an
            alignment item has no '-'.
        ValueError: If an alignment index is not an integer.

    Note:
        The two files are read in lockstep with zip(), so when they differ in
        length the extra lines of the longer one are silently ignored.
    """
    all_alignments = []
    # Explicit UTF-8: the bitext contains non-ASCII text, so relying on the
    # locale's default encoding can fail. The context manager also guarantees
    # both handles are closed.
    with open(src_tgt_path, encoding="utf-8") as src_tgt_file, open(
        pha_path, encoding="utf-8"
    ) as pha_file:
        for ph, s_t in zip(pha_file, src_tgt_file):
            sides = s_t.strip().split(" ||| ")
            src_toks = sides[0].split(" ")
            tgt_toks = sides[1].split(" ")
            align = {}  # src tok id to its list of target toks ids
            # split() without an argument yields nothing for a blank line, so a
            # sentence pair without any alignment gets an empty dict instead
            # of crashing on int("").
            for pair in ph.split():
                indices = pair.split("-")
                s = int(indices[0])
                t = int(indices[1])
                align.setdefault(s + 1, []).append(t + 1)
            all_alignments.append(Alignment(src_toks, tgt_toks, align))
    return all_alignments
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: argv[1] is the ' ||| '-separated bitext file,
    # argv[2] the Pharaoh alignment file.
    if len(sys.argv) < 3:
        # Exit with a usage message rather than a bare IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} SRC_TGT_FILE PHARAOH_ALIGN_FILE")
    all_alignments = load_aligns(sys.argv[1], sys.argv[2])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.