Update evaluation scripts and other minor edits for EMNLP (castorini#27)
* Update README

* Merge unified code to split documents into sentences

* Remove matplotlib from requirements.txt

* Remove unnecessary args from commands

* Update evaluation scripts

* Update evaluation commands

* Reorganize repo

* Add option to filter exact matches in searcher

* Add notes in the README

* Revert to 5 fold for Core17/18
zeynepakkalyoncu authored Sep 1, 2019
1 parent d8d3837 commit 2dd0401
Showing 21 changed files with 259 additions and 254 deletions.
49 changes: 36 additions & 13 deletions README.md
@@ -1,6 +1,7 @@
# Birch

-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3372764.svg)](https://doi.org/10.5281/zenodo.3372764)
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3381673.svg)](https://doi.org/10.5281/zenodo.3381673)


Document ranking via sentence modeling using BERT

@@ -21,6 +22,7 @@ source birch_env/bin/activate
pip install Cython # jnius dependency
pip install -r requirements.txt
# For inference, the Python-only apex build can also be used
git clone https://github.com/NVIDIA/apex
cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
@@ -31,41 +33,42 @@ cd eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd .
# Download data and models
cd data
-wget https://zenodo.org/record/3372764/files/emnlp_bert4ir.tar.gz
-tar -xzvf emnlp_bert4ir.tar.gz
+wget https://zenodo.org/record/3381673/files/emnlp_bert4ir_v2.tar.gz
+tar -xzvf emnlp_bert4ir_v2.tar.gz
cd ..
```

Experiment Names:
-- large_mb_robust04, large_mb_core17, large_mb_core18
-- large_car_mb_robust04, large_car_mb_core17, large_car_mb_core18
-- large_msmarco_mb_robust04, large_msmarco_mb_core17, large_msmarco_mb_core18
-- large_car_robust04, large_car_core17, large_car_core18
-- large_msmarco_robust04, large_msmarco_core17, large_msmarco_core18
+- mb_robust04, mb_core17, mb_core18
+- car_mb_robust04, car_mb_core17, car_mb_core18
+- msmarco_mb_robust04, msmarco_mb_core17, msmarco_mb_core18
+- robust04, car_core17, car_core18
+- msmarco_robust04, msmarco_core17, msmarco_core18


## Training

For BERT(MB):

```
-export CUDA_VISIBLE_DEVICES=0; experiment=${experiment}; \
+export CUDA_VISIBLE_DEVICES=0; experiment=mb; \
nohup python -u src/main.py --mode training --experiment ${experiment} --collection mb \
--local_model models/bert-large-uncased.tar.gz \
--local_tokenizer models/bert-large-uncased-vocab.txt --batch_size 16 \
--data_path data --predict_path data/predictions/predict.${experiment} \
---model_path models/saved.${experiment} --eval_steps 1000 --qrels_file qrels.microblog.txt \
+--model_path models/saved.${experiment} --eval_steps 1000 \
--device cuda --output_path logs/out.${experiment} > logs/${experiment}.log 2>&1 &
```

For BERT(CAR -> MB) and BERT(MS MARCO -> MB):

```
-export CUDA_VISIBLE_DEVICES=0; experiment=${experiment}; \
+export CUDA_VISIBLE_DEVICES=0; experiment=<car_mb, msmarco_mb>; \
nohup python -u src/main.py --mode training --experiment ${experiment} --collection mb \
--local_model <models/pytorch_msmarco.tar.gz, models/pytorch_car.tar.gz> \
--local_tokenizer models/bert-large-uncased-vocab.txt --batch_size 16 \
--data_path data --predict_path data/predictions/predict.${experiment} \
---model_path models/saved.${experiment} --eval_steps 1000 --qrels_file qrels.microblog.txt \
+--model_path models/saved.${experiment} --eval_steps 1000 \
--device cuda --output_path logs/out.${experiment} > logs/${experiment}.log 2>&1 &
```

@@ -76,7 +79,7 @@ For BERT(MB), BERT(CAR -> MB) and BERT(MS MARCO -> MB):
```
export CUDA_VISIBLE_DEVICES=0; experiment=<experiment_name>; \
nohup python -u src/main.py --mode inference --experiment ${experiment} --collection <robust04, core17, core18> \
---load_trained --model_path <models/saved.large_mb_2, models/saved.car_mb_1, models/saved.msmarco_mb_2> \
+--load_trained --model_path <models/saved.mb_1, models/saved.car_mb_1, models/saved.msmarco_mb_1> \
--batch_size 4 --data_path data --predict_path data/predictions/predict.${experiment} \
--device cuda --output_path logs/out.${experiment} > logs/${experiment}.log 2>&1 &
```
@@ -95,14 +98,34 @@ nohup python -u src/main.py --mode inference --experiment ${experiment} --collec
Note that this step takes a long time.
If you don't want to evaluate the pretrained models, you may skip to the next step and evaluate with our predictions under `data/predictions`.

## Retrieve sentences from top candidate documents

```
python src/utils/split_docs.py --collection <robust04, core17, core18> \
--index <path/to/index> --data_path data --anserini_path <path/to/anserini/root>
```

## Evaluation

```
experiment=<experiment_name>
collection=<robust04, core17, core18>
anserini_path=<path/to/anserini/root>
index_path=<path/to/lucene/index>
data_path=<path/to/data/root>
```

### BM25+RM3 Baseline

```
./eval_scripts/baseline.sh ${collection} ${index_path} ${anserini_path} ${data_path}
./eval_scripts/eval.sh baseline ${collection} ${anserini_path} ${data_path}
```

### Sentence Evidence

```
# Tune hyperparameters
./eval_scripts/train.sh ${experiment} ${collection} ${anserini_path}
21 changes: 4 additions & 17 deletions eval_scripts/baseline.sh
@@ -1,21 +1,8 @@
#!/usr/bin/env bash

-anserini_path=$1
+collection=$1
index_path=$2
-num_folds=$3
+anserini_path=$3
+data_path=$4

-birch_path=$(pwd)
-cd ${anserini_path}
-
-if [ ${num_folds} == '5' ] ; then
-folds_path="src/main/resources/fine_tuning/robust04-paper2-folds.json"
-params_path="src/main/resources/fine_tuning/robust04-paper2-folds-map-params.json"
-else
-folds_path="src/main/resources/fine_tuning/robust04-paper1-folds.json"
-params_path="src/main/resources/fine_tuning/robust04-paper1-folds-map-params.json"
-fi
-
-python3 src/main/python/fine_tuning/reconstruct_robus04_tuned_run.py --index ${index_path} --folds ${folds_path} --params ${params_path}
-rm run.robust04.bm25+rm3.fold*
-mkdir --parents ${birch_path}/runs
-mv run.robust04.bm25+rm3.txt ${birch_path}/runs/run.bm25+rm3_${num_folds}cv.txt
+${anserini_path}/target/appassembler/bin/SearchCollection -topicreader Trec -index ${index_path} -topics "${data_path}/topics/topics.${collection}.txt" -output "runs/run.${collection}.bm25+rm3.txt" -bm25 -rm3
4 changes: 2 additions & 2 deletions eval_scripts/eval.sh
@@ -5,9 +5,9 @@ data_path=$4

echo "Experiment: ${experiment}"

-if [[ ${experiment} == *"bm25+rm3"* ]] ; then
+if [[ ${experiment} == "baseline" ]] ; then
echo "BM25+RM3:"
-${anserini_path}/eval/trec_eval.9.0.4/trec_eval -M1000 -m map -m P.20 -m ndcg_cut.20 "${data_path}/qrels/qrels.${collection}.txt" "runs/run.${experiment}.txt"
+${anserini_path}/eval/trec_eval.9.0.4/trec_eval -M1000 -m map -m P.20 -m ndcg_cut.20 "${data_path}/qrels/qrels.${collection}.txt" "runs/run.${collection}.bm25+rm3.txt"
else
echo "1S:"
${anserini_path}/eval/trec_eval.9.0.4/trec_eval -M1000 -m map -m P.20 -m ndcg_cut.20 "${data_path}/qrels/qrels.${collection}.txt" "runs/run.${experiment}.cv.a"
25 changes: 25 additions & 0 deletions eval_scripts/sig_test.sh
@@ -0,0 +1,25 @@
#!/usr/bin/env bash

experiment=$1
collection=$2
anserini_path=$3
data_path=$4
metric=$5

birch_path=$(pwd)
cd ${anserini_path}

python3.6 src/main/python/compare_runs.py --base ${birch_path}/runs/run.${collection}.bm25+rm3.txt \
--comparison ${birch_path}/runs/run.${experiment}.cv.a --metric ${metric} \
--qrels ${birch_path}/${data_path}/qrels/qrels.${collection}.txt > temp_sigtext.txt
tail -6 temp_sigtext.txt > ${birch_path}/${data_path}/sigtest/${experiment}_${metric}_a

python3.6 src/main/python/compare_runs.py --base ${birch_path}/runs/run.${collection}.bm25+rm3.txt \
--comparison ${birch_path}/runs/run.${experiment}.cv.ab --metric ${metric} \
--qrels ${birch_path}/${data_path}/qrels/qrels.${collection}.txt > temp_sigtext.txt
tail -6 temp_sigtext.txt > ${birch_path}/${data_path}/sigtest/${experiment}_${metric}_ab

python3.6 src/main/python/compare_runs.py --base ${birch_path}/runs/run.${collection}.bm25+rm3.txt \
--comparison ${birch_path}/runs/run.${experiment}.cv.abc --metric ${metric} \
--qrels ${birch_path}/${data_path}/qrels/qrels.${collection}.txt > temp_sigtext.txt
tail -6 temp_sigtext.txt > ${birch_path}/${data_path}/sigtest/${experiment}_${metric}_abc
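
Anserini's `compare_runs.py` compares two runs on their per-topic metric values; the script above runs it once per sentence setting (a, ab, abc) against the BM25+RM3 baseline and keeps the last six lines of output. The sketch below illustrates the kind of paired test presumably involved. This is hypothetical code, not the Anserini script itself: the `paired_sig_test` helper and its inputs (per-topic scores, e.g. parsed from `trec_eval -q` output) are assumptions, and SciPy is assumed available.

```
# Illustrative only: a paired significance test over per-topic metric values.
from scipy import stats

def paired_sig_test(base, comparison):
    """base, comparison: dicts mapping topic id -> metric value (e.g. AP)."""
    topics = sorted(set(base) & set(comparison))  # topics scored in both runs
    a = [base[t] for t in topics]
    b = [comparison[t] for t in topics]
    # Paired t-test; positive t-statistic favors the comparison run.
    return stats.ttest_rel(b, a)
```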
14 changes: 1 addition & 13 deletions eval_scripts/test.sh
@@ -8,8 +8,7 @@ declare -a sents=("a" "ab" "abc")

for i in "${sents[@]}"
do
if [[ "${collection}" == "robust04" ]] ; then
for j in $(seq 0 4)
for j in $(seq 0 4)
do
while IFS= read -r line
do
@@ -21,16 +20,5 @@ do
python src/main.py --mode retrieval --experiment ${experiment} --collection ${collection} --anserini_path ${anserini_path} 3 ${alpha} ${beta} ${gamma} $j test
done
cat runs/run.${experiment}.cv.test.* > runs/run.${experiment}.cv.$i
-else
-while IFS= read -r line
-do
-alpha=$(echo ${line#?} | cut -d" " -f1)
-beta=$(echo ${line#?} | cut -d" " -f2)
-gamma=$(echo ${line#?} | cut -d" " -f3)
-done < "run_logs/${experiment}/${i}_best.txt"
-
-python src/main.py --mode retrieval --experiment ${experiment} --collection ${collection} --anserini_path ${anserini_path} 3 ${alpha} ${beta} ${gamma} 0 all
-mv runs/run.${experiment}.cv.all runs/run.${experiment}.cv.$i
-fi
done

18 changes: 2 additions & 16 deletions eval_scripts/train.sh
@@ -8,8 +8,7 @@ if [ ! -d "run_logs/${experiment}" ] ; then
mkdir -p "run_logs/${experiment}"
fi

if [[ "${collection}" == "robust04" ]] ; then
for i in $(seq 0 4)
for i in $(seq 0 4)
do
python src/main.py --mode retrieval --experiment ${experiment} --collection ${collection} --anserini_path ${anserini_path} 3 1.0 0.1 0.1 $i train > "run_logs/${experiment}/eval${i}a.txt"
cat "run_logs/${experiment}/eval${i}a.txt" | sort -k5r,5 -k3,3 | head -1 > "run_logs/${experiment}/${i}a_best.txt"
@@ -22,17 +21,4 @@ if [[ "${collection}" == "robust04" ]] ; then
python src/main.py --mode retrieval --experiment ${experiment} --collection ${collection} --anserini_path ${anserini_path} 3 1.0 1.0 1.0 $i train > "run_logs/${experiment}/eval${i}abc.txt"
cat "run_logs/${experiment}/eval${i}abc.txt" | sort -k5r,5 -k3,3 | head -1 > "run_logs/${experiment}/${i}abc_best.txt"
rm "runs/run.${experiment}.cv.train"
-done
-else
-python src/main.py --mode retrieval --experiment ${experiment} --collection ${collection} --anserini_path ${anserini_path} 3 1.0 0.1 0.1 0 train > "run_logs/${experiment}/evala.txt"
-cat "run_logs/${experiment}/evala.txt" | sort -k5r,5 -k3,3 | head -1 > "run_logs/${experiment}/a_best.txt"
-rm "runs/run.${experiment}.cv.train"
-
-python src/main.py --mode retrieval --experiment ${experiment} --collection ${collection} --anserini_path ${anserini_path} 3 1.0 1.0 0.1 0 train > "run_logs/${experiment}/evalab.txt"
-cat "run_logs/${experiment}/evalab.txt" | sort -k5r,5 -k3,3 | head -1 > "run_logs/${experiment}/ab_best.txt"
-rm "runs/run.${experiment}.cv.train"
-
-python src/main.py --mode retrieval --experiment ${experiment} --collection ${collection} --anserini_path ${anserini_path} 3 1.0 1.0 1.0 0 train > "run_logs/${experiment}/evalabc.txt"
-cat "run_logs/${experiment}/evalabc.txt" | sort -k5r,5 -k3,3 | head -1 > "run_logs/${experiment}/abc_best.txt"
-rm "runs/run.${experiment}.cv.train"
-fi
+done
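
With this change, `train.sh` always tunes per fold: each `--mode retrieval ... train` call sweeps the sentence weights on a training fold, and `sort -k5r,5 -k3,3 | head -1` keeps the best-scoring line; `eval_scripts/test.sh` then applies those weights to the held-out fold. A rough sketch of the same cross-validation logic follows; this is hypothetical Python, with an assumed weight grid and a stand-in `evaluate` function, not the repo's actual code.

```
# Illustrative cross-validated weight selection, mirroring train.sh/test.sh.
from itertools import product

def cross_validate(folds, evaluate):
    """folds: list of (train_topics, test_topics) pairs.
    evaluate(topics, weights) -> metric value such as MAP."""
    grid = list(product((0.1, 0.5, 1.0), repeat=3))  # assumed (alpha, beta, gamma) grid
    test_scores = []
    for train_topics, test_topics in folds:
        # Pick the weights that score best on the training topics...
        best = max(grid, key=lambda w: evaluate(train_topics, w))
        # ...then score the held-out topics with those weights.
        test_scores.append(evaluate(test_topics, best))
    return test_scores
```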
4 changes: 2 additions & 2 deletions reproduce_arxiv.md
@@ -53,13 +53,13 @@ If you don't want to evaluate the pretrained models, you may skip to the next st

## Evaluation

-### BM25+RM3:
+### BM25+RM3 Baseline

```
./eval_scripts/baseline.sh <path/to/anserini> <path/to/index> <2, 5>
```

-### Sentence Evidence:
+### Sentence Evidence

- Compute document score

2 changes: 0 additions & 2 deletions requirements.txt
@@ -1,4 +1,3 @@
-apex==0.1
backcall==0.1.0
boto3==1.9.130
botocore==1.12.130
@@ -13,7 +12,6 @@ ipython-genutils==0.2.0
jedi==0.15.1
jmespath==0.9.4
kiwisolver==1.1.0
-matplotlib==3.1.1
nltk==3.4.3
numpy==1.16.2
parso==0.5.1
5 changes: 2 additions & 3 deletions src/args.py
@@ -13,17 +13,16 @@ def get_args():
# Interactive
parser.add_argument('--interactive', action='store_true', default=False, help='Batch evaluation if not set')
parser.add_argument('--query', default='hubble space telescope', help='Query string')
-    parser.add_argument('--interactive_path', default='data/datasets/query_sents.csv', help='Path to output sentence results from query')
+    parser.add_argument('--interactive_name', default='query_sents', help='Name of output sentence results from query')

# Retrieval
parser.add_argument('--experiment', default=None, help='Experiment name for logging')
parser.add_argument('--index_path', default='lucene-index.robust04.pos+docvectors+rawdocs', help='Path to Lucene index')
parser.add_argument('--cv_fold', default=5)

# Training
parser.add_argument('--device', default='cpu', help='[cuda, cpu]')
parser.add_argument('--model_path', default='models/saved.tmp', help='Path to pretrained model')
-    parser.add_argument('--predict_path', default='predict.tmp')
+    parser.add_argument('--predict_path', default='data/predictions/predict.tmp')
parser.add_argument('--batch_size', default=16, type=int)
parser.add_argument('--learning_rate', default=1e-5, type=float)
parser.add_argument('--num_train_epochs', default=3, type=int)
4 changes: 2 additions & 2 deletions src/eval_bert.py
@@ -47,7 +47,7 @@ def load_bert_scores(pred_file, query_dict, sent_dict):

def calc_q_doc_bert(score_dict, run_file, topics, top_doc_dict, bm25_dict,
topKSent, alpha, beta, gamma):
-    run_file = open(os.path.join('runs', run_file), "w")
+    run_file = open(os.path.join('runs', run_file), 'w')
for q in topics:
doc_score_dict = {}
for d in top_doc_dict[q]:
@@ -64,7 +64,7 @@ def calc_q_doc_bert(score_dict, run_file, topics, top_doc_dict, bm25_dict,
doc_score_dict = sorted(doc_score_dict.items(), key=operator.itemgetter(1), reverse=True)
rank = 1
for doc, score in doc_score_dict:
run_file.write("{} Q0 {} {} {} BERT\n".format(q, doc, rank, score))
run_file.write('{} Q0 {} {} {} BERT\n'.format(q, doc, rank, score))
rank += 1

run_file.close()
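
For context, `calc_q_doc_bert` re-scores each candidate document by combining its BM25 score with the BERT scores of its top sentences, weighted by `alpha`, `beta`, `gamma`, before writing the re-ranked run. A minimal sketch of one plausible interpolation is below; this is hypothetical code inferred from the function signature, not the repo's exact scoring.

```
# Hypothetical sketch of the per-document interpolation (an assumption,
# not calc_q_doc_bert itself): weight the top-k BERT sentence scores by
# (alpha, beta, gamma) and combine them with the document's BM25 score.
def interpolate_doc_score(bm25_score, sent_scores, weights=(1.0, 1.0, 1.0)):
    """sent_scores: BERT scores for the document's sentences."""
    top = sorted(sent_scores, reverse=True)[:len(weights)]
    return bm25_score + sum(w * s for w, s in zip(weights, top))
```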