diff --git a/Makefile b/Makefile index 0fe1424..0164276 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,4 @@ DATA_DIR = data -ANNOTATIONS_DIR = $(DATA_DIR)/annotations ARXIV_DIR = $(DATA_DIR)/arxiv ARCHIVES_DIR = $(ARXIV_DIR)/sources UNPACKED_DIR = $(ARXIV_DIR)/unpacked_sources @@ -16,7 +15,7 @@ TABLES := $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES)) TEXTS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(TEXTS_DIR)/%/text.json,$(ARCHIVES)) .PHONY: all -all: $(ANNOTATIONS_DIR)/pdfs-urls.csv $(ANNOTATIONS_DIR)/sources-urls.csv extract_all +all: extract_all .PHONY: test test: DATA_DIR = test/data @@ -56,24 +55,11 @@ unpack_all: $(UNPACKS) $(UNPACKS): $(UNPACKED_DIR)/%: $(ARCHIVES_DIR)/%.gz ./unpack-sources.sh $^ $@ -$(ANNOTATIONS_DIR)/pdfs-urls.csv: $(ANNOTATIONS_DIR)/papers-urls.csv - sed -e 's#/abs/#/pdf/#' -e 's#$$#.pdf#' $^ > $@ +.PHONY: pull_images +pull_images: + docker pull arxivvanity/engrafo:b3db888fefa118eacf4f13566204b68ce100b3a6 + docker pull zenika/alpine-chrome:73 -$(ANNOTATIONS_DIR)/sources-urls.csv: $(ANNOTATIONS_DIR)/papers-urls.csv - sed -e 's#/abs/#/e-print/#' $^ > $@ - -$(ANNOTATIONS_DIR)/papers-urls.csv: $(ANNOTATIONS_DIR)/evaluation-tables.json get_papers_links.sh - ./get_papers_links.sh $< > $@ - -$(ANNOTATIONS_DIR)/%: $(ANNOTATIONS_DIR)/%.gz - gunzip -kf $^ - -$(ANNOTATIONS_DIR)/evaluation-tables.json.gz: - $(shell mkdir -p "$(ANNOTATIONS_DIR)") - wget https://paperswithcode.com/media/about/evaluation-tables.json.gz -O $@ - - -.PHONY : clean +.PHONY: clean clean : - cd "$(ANNOTATIONS_DIR)" && rm -f *.json *.csv #rm -f *.gz diff --git a/README.md b/README.md index f237d8e..ea2df7c 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,13 @@ # Scripts for extracting tables Dependencies: - * [jq](https://stedolan.github.io/jq/) (`sudo apt install jq`) - * docker (run without `sudo`) + * docker * [conda](https://www.anaconda.com/distribution/) Directory structure: ``` . └── data -    ├── annotations -    │   └── evaluation-tables.json.gz # current annotations    └── arxiv    ├── sources # gzip archives with e-prints    ├── unpacked\_sources # automatically extracted latex sources @@ -20,8 +17,9 @@ Directory structure: ``` -To preprocess data and extract tables, run: +To preprocess data and extract tables and texts, run: ``` +make pull_images conda env create -f environment.yml source activate xtables make -j 8 -i extract_all > stdout.log 2> stderr.log @@ -33,3 +31,5 @@ To test the whole extraction on a single file run ``` make test ``` + +See `extraction-pipeline.ipynb` for an example on how to work with the produced files. 
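For quick orientation, the workflow in `extraction-pipeline.ipynb` (added later in this diff) boils down to a few calls. The sketch below only mirrors the notebook's own cells (`PaperCollection.from_files` on the `test/data/arxiv` layout, then `text` and `tables`); treat it as an illustration of the intended usage, not a full API reference, and note it assumes `make extract_all` has already produced the extracted files.

```
# Minimal sketch mirroring extraction-pipeline.ipynb; assumes the extraction
# step has already populated test/data/arxiv (the bundled test paper).
from sota_extractor2.data.paper_collection import PaperCollection

# Load every extracted paper; annotations are skipped here, as in the notebook.
pc = PaperCollection.from_files('test/data/arxiv', load_annotations=False)

paper = pc[0]                  # PaperCollection behaves like a list of papers
print(paper.text.title)        # extracted title
print(paper.text.abstract)     # extracted abstract

# Each extracted table is backed by a DataFrame of Cell objects.
values = paper.tables[0].df.applymap(lambda cell: cell.value)
print(values)
```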
diff --git a/clean_html.sh b/clean_html.sh index 6c1df1c..6ceb913 100755 --- a/clean_html.sh +++ b/clean_html.sh @@ -4,4 +4,4 @@ SOURCE=$(realpath "$1") mkdir -p $(dirname "$2") OUTPUT=$(realpath "$2") -docker run --rm -v "$SOURCE":/files/index.html:ro --entrypoint '' zenika/alpine-chrome:73 timeout -t 20 -s KILL chromium-browser --headless --disable-gpu --disable-software-rasterizer --no-sandbox --timeout=30000 --dump-dom /files/index.html > "$OUTPUT" +docker run --rm -v "$SOURCE":/files/index.html:ro --entrypoint '' zenika/alpine-chrome:73 timeout -s KILL 20 chromium-browser --headless --disable-gpu --disable-software-rasterizer --no-sandbox --timeout=30000 --dump-dom /files/index.html > "$OUTPUT" diff --git a/docker-latex2html.sh b/docker-latex2html.sh index 6a30c92..2e2899c 100755 --- a/docker-latex2html.sh +++ b/docker-latex2html.sh @@ -6,4 +6,4 @@ OUTPUT=$(realpath "$2") #~/arxiv/htmls/1701/1701.xyz.html OUTPUT_DIR=$(dirname "$OUTPUT") #~/arxiv/htmls/1701 FILENAME=$(basename "$OUTPUT") #1701.xyz.html -docker run --rm -v $PWD/latex2html.sh:/files/latex2html.sh:ro -v $PWD/guess_main.py:/files/guess_main.py:ro -v $PWD/patches:/files/patches:ro -v "$SOURCE_DIR":/files/ro-source:ro -v "$OUTPUT_DIR":/files/htmls arxivvanity/engrafo /files/latex2html.sh "$FILENAME" +docker run --rm -v $PWD/latex2html.sh:/files/latex2html.sh:ro -v $PWD/guess_main.py:/files/guess_main.py:ro -v $PWD/patches:/files/patches:ro -v "$SOURCE_DIR":/files/ro-source:ro -v "$OUTPUT_DIR":/files/htmls arxivvanity/engrafo:b3db888fefa118eacf4f13566204b68ce100b3a6 /files/latex2html.sh "$FILENAME" diff --git a/download_arxiv_ml_papers.sh b/download_arxiv_ml_papers.sh deleted file mode 100755 index dcc62a5..0000000 --- a/download_arxiv_ml_papers.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -index_dir="index" -papers_dir="papers" -src_dir="src" -mkdir -p "${index_dir}" "${papers_dir}" "${src_dir}" - -jq -r '.[] | select(.arxiv_id) | "/"+.arxiv_id+"."' pwc/papers-with-abstracts.json | sort -u > wildcards.txt -aws s3 cp --request-payer requester s3://arxiv/src/arXiv_src_manifest.xml . -xmllint --xpath '//filename/text()' arXiv_src_manifest.xml > tars.txt - -process_file () { - path="$1" - archive_name=$(basename "${path}") - file="${src_dir}/${archive_name}" - echo "Processing ${file}..." - [ -e "${file}" ] && echo "Already exists, skipping..." && return - aws s3 cp --request-payer requester "s3://arxiv/${path}" "${src_dir}" - tar -tvf "${file}" > "${index_dir}/${archive_name}.ls" - tar -tf "${file}" > "${index_dir}/${archive_name}.txt" - fgrep -f wildcards.txt "${index_dir}/${archive_name}.txt" > to_extract.txt && xargs -a to_extract.txt -- tar xf "${file}" -C "${papers_dir}" -} - -while read file -do - process_file "${file}" -done stdout.log 2> stderr.log```\n", + "\n", + "where `8` is a number of processes to use. Under `mydata` there should be a directory named `arxiv/sources` containing packed LaTeX sources, one archive per paper. After running the test above you can see the directory structure by inspecting `test/data/arxiv` directory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the extraction is finished we can load all the papers. In the example below we load the paper from `test` directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sota_extractor2.data.paper_collection import PaperCollection\n", + "pc = PaperCollection.from_files('test/data/arxiv', load_annotations=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`PaperCollection` is a list of papers. In the test example there's only one paper with two tables:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DILBERT: Distilling Inner Latent BERT variables\n", + "In this paper we achieve state-of-the-art performance in random number generation.\n" + ] + } + ], + "source": [ + "print(pc[0].text.title)\n", + "print(pc[0].text.abstract)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
leftcenterright
123
456
789
abc
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pc[0].tables[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
bold textitalic textbold italic text
red textgreen textblue text
5.4%3.8%11.2±0.15
an italic text inside boldbold red
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pc[0].tables[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Manual extraction\n", + "\n", + "We can access the extraction pipeline programatically. In case of batch mode presented above we don't provide downloading of arXiv papers on purpose as for batch mode one should use arXiv's S3 bucket. In the example below we show how to load a paper directly from arXiv url, but use of it should be limited." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from sota_extractor2.data.paper_collection import PaperCollection, TempPaper\n", + "from pathlib import Path\n", + "from tempfile import TemporaryDirectory\n", + "from sota_extractor2.helpers import LatexConverter, Unpack\n", + "from sota_extractor2.errors import *\n", + "from urllib.request import urlretrieve as download_url\n", + "\n", + "unpack = Unpack()\n", + "latex = LatexConverter(Path().absolute())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "arxiv_url_re = re.compile(r\"^(https?://(www.)?arxiv.org/(abs|pdf|e-print)/)?(?P\\d{4}\\.\\d+(v\\d+)?)(\\.pdf)?$\")\n", + "\n", + "def get_paper_html(path):\n", + " \"\"\"Get an HTML source of a given paper identified by either local path or arXiv url.\n", + " \n", + " :param path - either a local path to an archive with LaTeX source or arXiv url / id.\n", + " In the later case the LaTeX source is downloaded automatically.\n", + " \n", + " :returns html - a string containing paper's HTML source code.\n", + " \"\"\"\n", + " with TemporaryDirectory() as workdir:\n", + " workdir = Path(workdir)\n", + " m = arxiv_url_re.match(path)\n", + " if m:\n", + " path = workdir / \"source\"\n", + " workdir = workdir / \"unpack\"\n", + " download_url(f\"https://arxiv.org/e-print/{m['arxiv_id']}\", path) \n", + " unpack(path, workdir)\n", + " html = latex.to_html(workdir)\n", + " # Skipped step - reference normalization\n", + " return html\n", + "\n", + "def pipeline(path):\n", + " \"\"\"Run the whole pipeline, from path to proposals\n", + "\n", + " :param path - either a local path to an archive with LaTeX source or arXiv url / id.\n", + " \n", + " :returns paper - Paper object with labelled tables\n", + " :returns proposals - DataFrame containing extracted sota records\n", + " \"\"\"\n", + " try:\n", + " html = get_paper_html(path)\n", + " paper = TempPaper(html)\n", + " return paper\n", + " except UnpackError:\n", + " print(\"Unable to unpack sources\")\n", + " return None\n", + " except LatexConversionError:\n", + " print(\"Unable to convert to html\")\n", + " return None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The extraction may take up to a few minutes, as it needs to:\n", + "* download the LaTeX source\n", + "* unpack them\n", + "* convert LaTeX to html\n", + "* clean the html\n", + "* extract paper's text, title, abstract, sections and references\n", + "* extract tables" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/anaconda3/envs/xxt/lib/python3.7/site-packages/bs4/__init__.py:177: UserWarning: You provided Unicode markup but also provided a value for from_encoding. 
Your from_encoding will be ignored.\n", + " warnings.warn(\"You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.\")\n" + ] + } + ], + "source": [ + "paper = pipeline(\"https://arxiv.org/abs/1903.08469\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'In Defense of Pre-trained ImageNet Architecturesfor Real-time Semantic Segmentation of Road-driving Images'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paper.text.title" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
modelsubsetmIoUFPSFPS normGPUresolutionGFLOPsGFLOPs@1Mpx# params
D* [35]val68.4--TitanX M1024x5125.811.60.5M
DG2s [35]val70.6--TitanX M1024x51219.0381.2M
Ladder DenseNet†[16]val72.831.030.1TitanX1024x512--9.8M
ICNet [40]test69.530.349.7TitanX M2048x1024---
ESPNet [24]test60.3112108.7TitanX1024x512--0.4M
ERFNet [28]test68.011.218.4TitanX M1024x51227.755.420M
GUNet†[23]test70.437.333.3TitanXP1024x512---
ERFNet†[28]test69.711.218.4TitanX M1024x51227.755.420M
SwiftNetRN-18val70.439.939.3GTX 1080Ti2048x1024104.052.011.8M
SwiftNetMN V2val69.427.727.7GTX 1080Ti2048x102441.020.52.4M
SwiftNetRN-18†val70.2134.9134.9GTX 1080Ti1024x51226.052.011.8M
SwiftNetRN-18 pyr†val74.434.034.0GTX 1080Ti2048x1024114.057.012.9M
SwiftNetMN V2†val75.327.727.7GTX 1080Ti2048x102441.020.52.4M
SwiftNetRN-18†val75.439.939.3GTX 1080Ti2048x1024104.052.011.8M
SwiftNetRN-18 pyr†test75.134.034.0GTX 1080Ti2048x1024114.057.012.9M
SwiftNetRN-18†test75.539.939.3GTX 1080Ti2048x1024104.052.011.8M
SwiftNetRN-18 ens†test76.518.418.4GTX 1080Ti2048x1024218.0109.024.7M
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paper.tables[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Internally the cells are stored as a pandas dataframe of `Cell`s:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Cell(value='model', raw_value='model', gold_tags='', refs=[], layout='border-r align-left header')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paper.tables[0].df.iloc[0,0]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789
0modelsubsetmIoUFPSFPS normGPUresolutionGFLOPsGFLOPs@1Mpx# params
1D* [35]val68.4--TitanX M1024x5125.811.60.5M
2DG2s [35]val70.6--TitanX M1024x51219.0381.2M
3Ladder DenseNet†[16]val72.831.030.1TitanX1024x512--9.8M
4ICNet [40]test69.530.349.7TitanX M2048x1024---
5ESPNet [24]test60.3112108.7TitanX1024x512--0.4M
6ERFNet [28]test68.011.218.4TitanX M1024x51227.755.420M
7GUNet†[23]test70.437.333.3TitanXP1024x512---
8ERFNet†[28]test69.711.218.4TitanX M1024x51227.755.420M
9SwiftNetRN-18val70.439.939.3GTX 1080Ti2048x1024104.052.011.8M
10SwiftNetMN V2val69.427.727.7GTX 1080Ti2048x102441.020.52.4M
11SwiftNetRN-18†val70.2134.9134.9GTX 1080Ti1024x51226.052.011.8M
12SwiftNetRN-18 pyr†val74.434.034.0GTX 1080Ti2048x1024114.057.012.9M
13SwiftNetMN V2†val75.327.727.7GTX 1080Ti2048x102441.020.52.4M
14SwiftNetRN-18†val75.439.939.3GTX 1080Ti2048x1024104.052.011.8M
15SwiftNetRN-18 pyr†test75.134.034.0GTX 1080Ti2048x1024114.057.012.9M
16SwiftNetRN-18†test75.539.939.3GTX 1080Ti2048x1024104.052.011.8M
17SwiftNetRN-18 ens†test76.518.418.4GTX 1080Ti2048x1024218.0109.024.7M
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 \\\n", + "0 model subset mIoU FPS FPS norm GPU \n", + "1 D* [35] val 68.4 - - TitanX M \n", + "2 DG2s [35] val 70.6 - - TitanX M \n", + "3 Ladder DenseNet†[16] val 72.8 31.0 30.1 TitanX \n", + "4 ICNet [40] test 69.5 30.3 49.7 TitanX M \n", + "5 ESPNet [24] test 60.3 112 108.7 TitanX \n", + "6 ERFNet [28] test 68.0 11.2 18.4 TitanX M \n", + "7 GUNet†[23] test 70.4 37.3 33.3 TitanXP \n", + "8 ERFNet†[28] test 69.7 11.2 18.4 TitanX M \n", + "9 SwiftNetRN-18 val 70.4 39.9 39.3 GTX 1080Ti \n", + "10 SwiftNetMN V2 val 69.4 27.7 27.7 GTX 1080Ti \n", + "11 SwiftNetRN-18† val 70.2 134.9 134.9 GTX 1080Ti \n", + "12 SwiftNetRN-18 pyr† val 74.4 34.0 34.0 GTX 1080Ti \n", + "13 SwiftNetMN V2† val 75.3 27.7 27.7 GTX 1080Ti \n", + "14 SwiftNetRN-18† val 75.4 39.9 39.3 GTX 1080Ti \n", + "15 SwiftNetRN-18 pyr† test 75.1 34.0 34.0 GTX 1080Ti \n", + "16 SwiftNetRN-18† test 75.5 39.9 39.3 GTX 1080Ti \n", + "17 SwiftNetRN-18 ens† test 76.5 18.4 18.4 GTX 1080Ti \n", + "\n", + " 6 7 8 9 \n", + "0 resolution GFLOPs GFLOPs@1Mpx # params \n", + "1 1024x512 5.8 11.6 0.5M \n", + "2 1024x512 19.0 38 1.2M \n", + "3 1024x512 - - 9.8M \n", + "4 2048x1024 - - - \n", + "5 1024x512 - - 0.4M \n", + "6 1024x512 27.7 55.4 20M \n", + "7 1024x512 - - - \n", + "8 1024x512 27.7 55.4 20M \n", + "9 2048x1024 104.0 52.0 11.8M \n", + "10 2048x1024 41.0 20.5 2.4M \n", + "11 1024x512 26.0 52.0 11.8M \n", + "12 2048x1024 114.0 57.0 12.9M \n", + "13 2048x1024 41.0 20.5 2.4M \n", + "14 2048x1024 104.0 52.0 11.8M \n", + "15 2048x1024 114.0 57.0 12.9M \n", + "16 2048x1024 104.0 52.0 11.8M \n", + "17 2048x1024 218.0 109.0 24.7M " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paper.tables[0].df.applymap(lambda cell: cell.value)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The paper's text can be accessed as well:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "# xxanchor-S1 1 Introduction" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-S0F1 Figure 1: Speed-accuracy trade-off for different semantic semantic segmentation approaches on Cityscapes test on GTX1080Ti (except for DG2s which reports only validation performance). Red dots represent our method. Other methods are displayed in green, whereas blue dots show estimated frame rates of the corresponding methods on our GPU (please refer to subsection xxref-S4SS2 for details). Our submissions achieve the best accuracy and the best speed among all approaches aiming at real-time operation. Semantic segmentation is an important dense prediction task in which the inference targets posterior distribution over a known set of classes in each image pixel [xxref-bibbib6, xxref-bibbib20, xxref-bibbib3]. Currently, the best results are achieved with deep fully convolutional models which require extraordinary computational resources. Many important applications such as autonomous navigation or driver assistance require inference on very large images in order to cover a wide field of view and perceive pedestrians at distances of over 200m. At the same time, these applications require a very low latency in order to be able to bring real-time decisions. The resulting requirements intensify computational strain and make real-time implementations a challenging research objective." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "1000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Many real-time semantic segmentation approaches [xxref-bibbib28, xxref-bibbib40, xxref-bibbib24, xxref-bibbib32] address this goal by introducing custom lightweight architectures which are not suited for large-scale visual recognition. Most of these approaches initialize training from scratch, and thus miss a huge regularization opportunity offered by knowledge transfer [xxref-bibbib26] from larger and more diverse recognition datasets [xxref-bibbib30]. Consequently, these methods incur a comparatively large overfitting risk. Some approaches alleviate this shortcoming by pre-training on ImageNet [xxref-bibbib28]. However, our experiments suggest that the resulting benefits tend to be smaller than in architectures which are designed for competitive ImageNet performance." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "1001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "A simple model for semantic segmentation starts with a fully convolutional encoder which gradually decreases the resolution and increases the number of feature maps of the resulting representation. Instead of performing global pooling (as we would do in image-wide classification) one can proceed by attaching a pixel-wise loss to obtain the predictions [xxref-bibbib6]. The resulting model would lead to a very fast evaluation on modern hardware, however its accuracy would be rather low due to the following problems. Firstly, small objects (e.g. distant traffic signs) would not be recognized due to low resolution of pixel-wise predictions, which is usually 32 times smaller than the input image. Secondly, the receptive field of such models would not be large enough to classify pixels at large objects (e.g. nearby buses or trucks). These problems can be alleviated with various techniques such as dilated convolutions [xxref-bibbib37], learned upsampling [xxref-bibbib20], lateral connections [xxref-bibbib27, xxref-bibbib29, xxref-bibbib19, xxref-bibbib16] and resolution pyramids [xxref-bibbib6]. However, not all of these techniques are equally suited for real-time operation." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "1002" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "In this paper, we argue that a competitive blend of efficiency and prediction accuracy can be achieved by models based on lightweight ImageNet-grade classification architectures [xxref-bibbib8, xxref-bibbib31]. Additionally, we propose a novel approach to increase the receptive field of deep model predictions, based on a resolution pyramid with shared parameters [xxref-bibbib6]. 
The proposed approach incurs a very modest increase of the model capacity and is therefore especially suited for datasets with large objects and few annotated images. Finally, we show that the resolution of the predictions can be efficiently and accurately upsampled by a lightweight decoder with lateral connections [xxref-bibbib27, xxref-bibbib29]. We show that the resulting semantic segmentation models can be evaluated under various computing budgets, and that they are feasible even on embedded GPU platforms. We present experiments both with ImageNet pre-training and learning from scratch on different road driving datasets. Our experiments achieve state-of-the art semantic segmentation accuracy among all existing approaches aiming at real-time execution." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "1003" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S2 2 Related Work" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "As described in the introduction, semantic segmentation models have to face two major problems: restoring the input resolution and increasing the receptive field. The simplest way to restore input resolution is to avoid downsampling. This is usually achieved by replacing stride-2 poolings with non-strided poolings, and doubling the dilation factor in subsequent convolutions [xxref-bibbib4, xxref-bibbib38]. However, this approach increases the resolution of deep latent representations, which implies a large computational complexity. Furthermore, dilated convolutions incur significant additional slow-down due to necessity to rearrange the image data before and after calling optimized implementations." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "2000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Another way to achieve dense image prediction relies on trained upsampling [xxref-bibbib20], which results in an encoder-decoder architecture. The idea is to perform recognition on subsampled latent representation to reduce complexity and then to restore the resolution by upsampling the representation (or the predictions). This setup can be naturally augmented by introducing lateral connections [xxref-bibbib27, xxref-bibbib29, xxref-bibbib19, xxref-bibbib16] to blend semantically rich deep layers with spatially rich shallow layers. The upsampling path has to be as lean as possible (to achieve efficiency and prevent overfitting) but no leaner (to avoid underfitting). It turns out that the sweet spot is computationally inexpensive, which makes this setup especially well-suited for real-time operation." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "2001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Early approaches to enlarge the receptive field of logit activations were based on dilated convolutions [xxref-bibbib37, xxref-bibbib4, xxref-bibbib38]. A more involved approach is known as spatial pyramid pooling (SPP) [xxref-bibbib7]. SPP averages features over aligned grids with different granularities [xxref-bibbib17]. We use a convolutional adaptation of that idea, in which the feature pyramid is upsampled to the original resolution [xxref-bibbib41] and then concatenated with the input features. Thus, subsequent convolutions obtain access to broad spatial pools and that increases their receptive field. The combination of dilated convolutions and SPP is known as à trous SPP, or ASPP for short [xxref-bibbib3]. However, SPP and ASPP may hurt generalization due to large capacity. In this paper, we study resolution pyramids as an alternative way to increase the receptive field and at the same time promote scale invariance [xxref-bibbib34, xxref-bibbib15, xxref-bibbib18, xxref-bibbib4]. Most previous pyramidal approaches to semantic segmentation [xxref-bibbib6, xxref-bibbib40, xxref-bibbib23] fuse only the deepest representations extracted at different resolutions. Different from them, we combine representations from different abstraction levels before joining the upsampling path within the decoder. This results in a better gradient flow throughout the pyramidal representation which is advantageous when the objects are large and the training data is scarce." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "2002" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Efficient recognition architectures leverage optimized building blocks which aim at reducing computational load while preserving accuracy. Grouped convolutions reduce the number of floating point operations and the number of parameters by enclosing the information flow within smaller groups of feature maps. Various methods have been proposed to discover prominent inter-group connections. ShuffleNet [xxref-bibbib39] uses channel shuffling to pass information across convolutional groups. CondenseNet [xxref-bibbib11] incorporates a training strategy which locates important connections in grouped convolutions and prunes those which are redundant. Neural architecture search [xxref-bibbib42] is an approach that leverages reinforcement learning to jointly learn the model architecture and the corresponding parameters. The resulting architectures achieve competitive ImageNet performance when the computational budget is restricted. Depthwise separable convolutions [xxref-bibbib33, xxref-bibbib36] decrease computational complexity by splitting a regular convolution in two. Firstly, a k×k convolution is separably applied to each input channel. This can be viewed as a group convolution where the number of groups corresponds to the number of channels C. 
In other words, there are C kernels k×k×1. Secondly, a 1×1 convolution is applied to propagate inter-channel information. Replacing standard convolutions with depthwise separable convolutions lowers the number of parameters and increases the inference speed at the cost of some drop in performance [xxref-bibbib10]. Strong regularization effect of depthwise separable convolutions can be relaxed by inverted residual blocks [xxref-bibbib31] which lead to compact residual models suitable for mobile applications." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "2003" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Most semantic segmentation approaches aiming at real-time operation refrain from using encoders designed for competitive ImageNet performance. ICNet [xxref-bibbib40] proposes a custom encoder which processes an image pyramid with shared parameters and fuses multi-scale representations before entering the decoder which restores the resolution. ERFNet [xxref-bibbib28] redefines a residual block as a composition of a 3×1 followed by a 1×3 convolution, which yields a 33% reduction in parameters. Vallurupalli et al. [xxref-bibbib35] propose the DG2s approach as an efficient ERFNet variant with the following modifications in residual blocks: i) depthwise separable convolutions, and ii) channel shuffling operation before pointwise convolutions. ESPNet [xxref-bibbib24] factorizes convolutions in a similar manner and refrains from shared parameters across the image pyramid in order to produce a fast and compact architecture." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "2004" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Our method is most related to semantic segmentation approaches which use lightweight encoders trained on ImageNet and benefit from such initialization. Similar to our work, Nekrasov et al. [xxref-bibbib25] rely on MobileNet V2 [xxref-bibbib31] and NASNet [xxref-bibbib42] encoders and feature a thick decoder with lateral connections. This is similar to our single scale model, however, our decoder has much less capacity, which allows us to report a half of their number of floating point operations without sacrificing recognition accuracy on road driving datasets. LinkNet [xxref-bibbib2] uses a small ResNet-18 backbone and a lightweight decoder to achieve satisfying performance-speed ratio. Our single scale model is similar to LinkNet, however we omit convolutional layers at full resolution in order to substantially reduce memory imprint and greatly increase the processing speed. Mazzini et al. [xxref-bibbib23] use a dilated encoder initialized from the DRN-D-22 model [xxref-bibbib38], and a decoder with one lateral connection. They also learn nonlinear upsampling to improve accuracy at object boundaries. Instead of using dilated convolutions, our decoder upsamples predictions by exclusively relying on lateral connections, which results in a 4-fold speed-up. 
Additionally, we succeed to leverage full resolution images during training which achieves an improvement of 5 percentage points on Cityscapes test." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "2005" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S3 3 The proposed segmentation method" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Our method assumes the following requirements. The model should be based on an ImageNet pre-trained encoder in order to benefit from regularization induced by transfer learning. The decoder should restore the resolution of encoded features in order for the predictions to retain detail. The upsampling procedure must be as simple as possible in order to maintain real-time processing speed. Gradient flow should be promoted throughout the network to support training from random initialization in an unusual event that ImageNet pre-training turns out not to be useful." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "3000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S3SS1 3.1 Basic building blocks" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "The proposed segmentation method is conceived around three basic building blocks which we briefly describe in the following paragraphs. These building blocks are going to be used in our two models which we propose in subsequent subsections." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "4000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-S3SS1SSS0Px1 Recognition encoder We advocate the usage of compact ImageNet pre-trained model as segmentation encoders. We propose to use ResNet-18 [xxref-bibbib8] and MobileNet V2 [xxref-bibbib31] for a number of reasons. These models are a good fit for fine tuning due to pre-trained parameters being publicly available. They are also suitable for training from scratch due to moderate depth and residual structure. Finally, these models are compatible with real-time operation due to small operation footprint. Computationally, ResNet-18 is around six times more complex than MobileNet V2. However, MobileNet V2 uses depthwise separable convolutions which are not directly supported in GPU firmware (the cuDNN library). Therefore, MobileNet V2 tends to be slower than ResNet-18 in most experimental setups. Note that the same issue disqualifies usage of the DenseNet architecture [xxref-bibbib12], since it requires efficient convolution over a non-contiguous tensor, which is still not supported in cuDNN." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "4001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-S3SS1SSS0Px2 Upsampling decoder The recognition encoder transforms the input image into semantically rich visual features. These features must have a coarse spatial resolution in order to save memory and processing time. The purpose of the decoder is to upsample these features to the input resolution. We advocate a simple decoder organized as a sequence of upsampling modules with lateral connections [xxref-bibbib27, xxref-bibbib29]. The proposed ladder-style upsampling modules have two inputs: the low resolution features (which should be upsampled), and the lateral features from an earlier layer of the encoder. The low resolution features are first upsampled with bilinear interpolation to the same resolution as the lateral features. Upsampled input features and lateral encoder features are then mixed with elementwise summation and finally blended with a 3×3 convolution." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "4002" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "We propose to route the lateral features from the output of the elementwise sum within the last residual block at the corresponding level of subsampling, as shown in Figure xxref-S3F2. Note that routing the lateral features from the output of the subsequent ReLU leads to a considerable drop in validation accuracy. Replacing the standard 3×3 convolution with either a 1×1 convolution, or a depthwise separable convolution also decreases the validation accuracy." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "4003" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-S3F2 Figure 2: Structural diagram of the last residual unit within a convolutional block operating on common spatial resolution. We do not use pre-activation [xxref-bibbib9] since we could not find a pre-trained parameterization for ResNet-18. The lateral connection is taken from the output of the elementwise sum after the last residual block. The output of the ReLU node is forwarded to the next residual block." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "4004" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-S3SS1SSS0Px3 Module for increasing the receptive field As discussed before, there are two viable possibilities for increasing the receptive field while maintaining real-time speed: i) spatial pyramid pooling, and ii) pyramid fusion. The SPP block gathers the features produced by the encoder at several pooling levels and produces a representation with a varying level of detail. We demonstrate the use of SPP in our single scale model. Our SPP block is a simplified version of the pyramid pooling module from PSPNet [xxref-bibbib41]. The pyramid fusion produces genuine multi-scale representations which need to be carefully fused within the decoder in order to avoid overfitting to unsuitable level of detail. We propose a pyramid pooling approach which blends representations at different levels of abstraction and thus enlarges the receptive field without sacrificing spatial resolution." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "4005" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S3SS2 3.2 Single scale model" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "The proposed single scale model transforms the input image into dense semantic predictions throughout a downsampling recognition encoder and upsampling decoder, as shown in Figure xxref-S3F3. Yellow trapezoids designate convolution groups, that is, parts of the encoder which operate on the same spatial resolution. All considered encoders consist of four such convolution groups. The first convolution group produces features at the H/4×W/4 resolution, while each following group increases the subsampling by the factor of 2. Thus the features at the far end of the encoder are H/32×W/32. These features are fed into the spatial pyramid pooling layer (designated by a green diamond) in order to increase the model receptive field. The resulting tensor is finally routed to the decoder whose upsampling modules are shown in blue." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "5000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Note that decoder and encoder are asymmetric: the encoder has many convolutions per convolution group while decoder has only one convolution per upsampling module. Furthermore, the dimensionality of encoder features increases along the downsampling path, while the dimensionality of the decoder features is constant. Therefore, lateral connections have to adjust dimensionality with 1×1 convolutions designated with red squares. 
Upsampling modules operate in three steps: i) the low resolution representation is bilinearly upsampled, ii) the obtained representations is summed with the lateral connection, iii) the summation is blended using a 3×3 convolution." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "5001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-S3F3 Figure 3: Structural diagram of the proposed single scale model. Yellow trapezoids designate convolution groups within the encoder which may be pre-trained on ImageNet. The green diamond designates the spatial pyramid pooling layer, the red squares designate bottleneck layers, and blue trapezoids designate lightweight upsampling modules. Logits are upsampled to original image resolution with bilinear interpolation." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "5002" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S3SS3 3.3 Interleaved pyramid fusion model" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "While using a compact encoder is beneficial for fast inference, this also results in a decreased receptive field and a smaller capacity compared to general purpose convolutional models for visual recognition. To counteract these drawbacks, we propose to exploit image pyramids to enlarge the receptive field of the model and reduce the model capacity requirements." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "6000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "The proposed model is shown in Figure xxref-S3F4. Two encoder instances (yellow) are applied to the input image at different levels of the resolution pyramid. This results in increased receptive field of the activations which sense the lowest resolution of the image pyramid. Furthermore, shared parameters enable recognition of objects of different sizes with the common set of parameters, which may relax the demand for model capacity. In order to enforce lateral connections and improve the gradient flow throughout the encoder, we concatenate the feature tensors from neighbouring levels of different encoders (we can do that since they have equal spatial resolution). This concatenation is designated with green circles. After concatenation, interleaved encoder features are projected onto the decoder feature space by 1×1 convolutions designated with red squares. The decoder (blue) operates in the same manner as in the single-scale model, however now we have an additional upsampling module for each additional level of the image pyramid." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "6001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-S3F4 Figure 4: Structural diagram of the proposed model with interleaved pyramidal fusion. Encoder parameters (yellow) are shared across all pyramid levels and may be pre-trained on Imagenet. Features of the same resolutions are concatenated (green circles), fed into a 1×1 bottleneck convolution (red squares) and blended within the decoder (blue)." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "6002" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S4 4 Experiments" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "We conduct semantic segmentation experiments on two datasets: Cityscapes [xxref-bibbib5] and CamVid [xxref-bibbib1]. We report mIoU accuracy, computational complexity and the execution speed of the trained models. The speed measurements are performed on a desktop GPU (GTX 1080Ti) and on an embedded System on a chip module (Jetson TX2). We also present ablation and validation experiments which provide a more detailed insight into the impact of various design choices. Please note that additional experiments can be found in the supplement." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "7000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S4SS1 4.1 Training details" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "We train all our models with the Adam [xxref-bibbib14] optimizer with the learning rate set to 4⋅10−4. We decay the learning rate with cosine annealing [xxref-bibbib21] to the minimum value of 1⋅10−6 in the last epoch (we do not perform any warm restarts). The weight decay is set to 1⋅10−4. In experiments with ImageNet pre-training, we update pre-trained parameters with 4 times smaller learning rate and apply 4 times smaller weight decay. We train on jittered square crops with batch size 12. The jittering consists of random horizontal flipping, and scaling with random factors between 0.5 and 2. We use 768×768 crops for full Cityscapes resolution, and 448×448 crops for half Cityscapes resolution and CamVid. We train for 200 epochs on Cityscapes and 400 epochs on CamVid. We train for additional 200 epochs in experiments without ImageNet pre-training." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "8000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S4SS2 4.2 Measuring the computational complexity" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "We express the computational complexity of the trained models with two metrics: i) billions of floating point operations (GFLOP), and ii) number of processed frames per second (FPS). The GFLOP metric provides the number of fused multiply-add operations required to evaluate the model. Such platform-agnostic measure of the computational complexity is suitable for CPUs where all multiplications require roughly equal processing time. Unfortunately, the GFLOP metric poorly corresponds with the actual processing time on GPU platforms, since efficient implementations are available only for a small subset of all building blocks used to express current deep models. Consequently, both metrics are required for a complete description of algorithm suitability for real-time operation." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "9000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "The FPS metric directly corresponds to the processing time on a particular hardware platform. Such metric does not necessarily correlate across platforms, although rough estimations can be done, as we show below. We simulate real-time applications by setting batch size to 1. We measure the time elapsed between transferring the input data to the GPU, and receiving the semantic predictions into RAM as shown in the code snippet shown in Figure xxref-S4F5." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "9001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-S4F5 ⬇ model.\\ttbeval() model.to(device) with torch.no_grad(): \\ttbinput = model.prepare_data(batch).to(device) logits = model.forward(\\ttbinput) torch.cuda.synchronize() t0 = 1000 * perf_counter() \\ttbfor _ \\ttbin \\ttbrange(n): \\ttbinput = model.prepare_data(batch).to(device) logits = model.forward(\\ttbinput) _, pred = logits.\\ttbmax(1) out = pred.data.byte().cpu() torch.cuda.synchronize() t1 = 1000 * perf_counter() fps = (1000 * n) / (t1 - t0) \\ttm Figure 5: Measurement of the processing time under PyTorch." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "9002" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "We conduct all measurements on a single GTX1080Ti with CUDA 10.0, CUDNN 7.3 and PyTorch 1.0rc1. We exclude the batch normalization layers [xxref-bibbib13] from measurements since in real-time applications they would be fused with preceding convolutional layers. We report mean FPS values over 1000 forward passes. Results are shown in Table xxref-S4T1. The column FPS norm provides a rough estimate on how would other methods perform on our hardware. The scaling factors are: 1.0 for GTX1080Ti, 0.61 for TitanX Maxwell, 1.03 for TitanX Pascal, and 1.12 for Titan XP. These scaling factors were calculated using publicly available benchmarks: goo.gl/N6ukTz, goo.gl/BaopYQ. The column GFLOPs@1MPx shows an estimated number of FLOPs for an input image of 1MPx, as a resolution-agnostic metric of computational complexity." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "9003" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxtable-xxanchor-S4T1 Table 1: Results of semantic segmentation on Cityscapes. We evaluate our best result on the online test benchmark and compare it with relevant previous work, where possible. We also report the computational complexity (GFLOP, FPS) GPU on which the inference was performed, and the image resolution on which the training and inference were performed. The column GFLOPs@1Mpx shows the GFLOPs metric when the input resolution is 1MPx. The column FPS norm shows or estimates the FPS metric on GTX 1080Ti. The default SwiftNet configuration is the single scale model presented in xxref-S3SS2. Label pyr denotes the pyramid fusion model presented in xxref-S3SS3. Label ens denotes the ensemble of the single scale model and the pyramid model. The symbol † designates ImageNet pre-training." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "9004" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S4SS3 4.3 Cityscapes" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "The Cityscapes dataset is a collection of high resolution images taken from the driver’s perspective during daytime and fine weather. It consists of 2975 training, 500 validation, and 1525 test images with labels from 19 classes. It also provides 20000 coarsely labeled images which we do not use during our experiments. Table xxref-S4T1 evaluates the accuracy (class mIoU) and efficiency (GFLOP, FPS) of our methods and compares them to other real-time methods. 
Our single scale method based on the ResNet-18 encoder achieves 75.5% test mIoU, and delivers 39.9 FPS on full Cityscapes resolution (1024×2048 pixels). To the best of our knowledge, this result outperforms all other approaches aiming at real-time operation. The corresponding submission to the Cityscapes evaluation server is entitled SwiftNetRN-18. Table xxref-S4T1 also presents experiments in which our models are trained from scratch. The accuracy decreases for 5 mIoU percentage points (pp) with respect to the corresponding experiments with ImageNet pre-trained initialization. This shows that ImageNet pre-training represents an important ingredient for reaching highly accurate predictions. We notice that custom encoders like ERFNet [xxref-bibbib28] get less benefits from ImageNet pre-training: only 1.7% pp as shown in Table xxref-S4T1. Figure xxref-S4F7 presents examples of segmentations on Cityscapes val images. We show examples for both single scale and pyramid models. We did not achieve measurable improvements with the pyramid model over the single scale model on the Cityscapes dataset." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "10000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S4SS4 4.4 CamVid" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "The CamVid dataset contains 701 densely annotated frames. We use the usual split into 367 train, 101 validation and 233 test images. We train on combined train and validation subsets and evaluate semantic segmentation into 11 classes on the test subset. Table xxref-S4T2 shows that we obtain an improvement of roughly 1.5 pp mIoU when using the pyramid model with pre-trained ResNet-18 and MobileNetV2 backbones. Figure xxref-S4F8 shows frames from the CamVid test subset where the pyramid model performed better." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "11000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Table xxref-S4T2 also shows that ImageNet pre-training contributes more on CamVid than on Cityscapes (7-9pp of mIoU performance). This is not surprising since CamVid has almost 20 times less training pixels." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "11001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "A small size of the dataset poses a considerable challenge when training from scratch due to high overfitting risk. Table xxref-S4T2 shows that the pyramid model achieves better results than the single scale model. This supports our choice of sharing encoder parameters across pyramid levels." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "11002" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxtable-xxanchor-S4T2 Table 2: Semantic segmentation accuracy on CamVid test using ImageNet pre-training (mIoU†) and training from scratch (mIoU)." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "11003" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S4SS5 4.5 Validation of the upsampling capacity" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "The number of feature maps along the upsampling path is the most important design choice of the decoder. We validate this hyper-parameter and report the results in Table xxref-S4T3. The results show that the model accuracy saturates at 128 dimensions. Consequently, we pick this value as a sensible speed-accuracy trade-off in all other experiments." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "12000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxtable-xxanchor-S4T3 Table 3: Validation of the number of feature maps in the upsampling path. The models were trained on Cityscapes train subset at 512×1024 while the evaluation is performed on Cityscapes val. All models use ImageNet initialization." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "12001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S4SS6 4.6 Ablation of lateral connections" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "To demonstrate the importance of lateral connections between the encoder and the decoder, we train a single scale model without lateral connections. For this experiment, we discard the 1×1 convolution layers located on the skip connections and abandon the elementwise summations in upsampling modules. Training such a model on full Cityscapes train images causes the validation accuracy to drop from 75.35% to 72.93%." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "13000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S4SS7 4.7 Execution profile" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "To obtain a better insight into the execution time of our models, we report separate processing times and the GFLOP metrics for the downsampling path (encoder and SPP), and the upsampling path (decoder). Table xxref-S4T4 shows the results for the single scale model and input resolution of 2048×1024. Note the striking discrepancy of time and GFLOPs for the two downsampling paths. ResNet-18 is almost twice as fast than MobileNet v2 despite requiring 6 times more multiplications. Note also that our decoder is twice as fast as the ResNet-18 encoder." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "14000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxtable-xxanchor-S4T4 Table 4: Inference speed along the downsampling (encoder and SPP) and the upsampling (decoder) paths for the single scale model. The columns dn time and up time display the execution times, while the columns dn FLOPs and up FLOPs show the number of floating point operations for 2048×1024 images." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "14001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S4SS8 4.8 Size of the receptive field" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "We estimate the effective receptive field of our models by considering the central pixel in each image X of Cityscapes val. The estimate is based on gradients ∂yi∂X[xxref-bibbib22] where y are the logits for the central pixel while i is argmax(y). Table xxref-S4T5 expresses the effective receptive fields as standard deviations of pixels with top 5% gradient ∂yi∂X. The results show that both SPP and interleaved pyramidal fusion substantially increase the receptive field." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "15000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxtable-xxanchor-S4T5 Table 5: Effective receptive fields (ERF) expressed as standard deviation of pixels with top 5% image gradients with respect to the dominant class of the central pixel, measured on Cityscapes val." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "15001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S4SS9 4.9 Processing speed on Jetson TX2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "We evaluate the proposed methods on NVIDIA Jetson TX2 module under CUDA 9.0, CUDNN 7.1, and PyTorch v1.0rc1. Due to limited number of CUDA cores, all bilinear interpolations had to be replaced with nearest neighbour interpolation. Results are reported in Figure xxref-S4F6. The MobileNet V2 backbone outperforms ResNet-18 for 20-30% on most resolutions due to lower number of FLOPs. However, ResNet-18 is faster on the lowest resolution. Note that our implementations do not use TensorRT optimizations." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "16000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-S4F6 Figure 6: Processing speed in frames per second of the proposed architecture on NVIDIA Jetson TX2 module for two different backbones and various input resolutions." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "16001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-S4F7 Figure 7: Semantic segmentation results on Cityscapes val. The columns correspond to input image, ground truth annotation, the output of the pyramid model, and the output of the single scale model. The most significant improvements occur on pixels of the class truck." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "16002" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-S4F8 Figure 8: Semantic segmentation results on CamVid test. The columns correspond to input, ground truth, the output of the pyramid model, and the output of the single scale model. The most significant improvements occur on pixels of classes bus (top) and tree (bottom)." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "16003" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-S5 5 Conclusion" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Real-time performance is a very important trait of semantic segmentation models aiming at applications in robotics and intelligent transportation systems. Most previous work in the field involves custom convolutional encoders trained from scratch, and decoders without lateral skip-connections. However, we argue that a better speed-accuracy trade-off is achieved with i) compact encoders designed for competitive ImageNet performance and ii) lightweight decoders with lateral skip-connections. Additionally, we propose a novel interleaved pyramidal fusion scheme which is able to further improve the results on large objects close to the camera. We provide a detailed analysis of prediction accuracy and processing time on Cityscapes and CamVid datasets for models based on ResNet-18 and MobileNetv2. Our Cityscapes test submission achieves 75.5% mIoU by processing 1024×2048 images at 39.9 Hz on a GTX1080Ti. To the best of our knowledge, this result outperforms all previous approaches aiming at real-time application. The source code is available at https://github.com/orsic/swiftnet." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "17000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-Sx1 Acknowledgment" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "This work has been supported by the European Regional Development Fund under the project System for increased driving safety in public urban rail traffic (SafeTRAM) under grant KK.01.2.1.01.0022, and by European Regional Development Fund (DATACROSS) under grant KK.01.1.1.01.0009, and Microblink Ltd. We would like to thank Josip Krapac for helpful discussions. The Titan Xp used to train some of the evaluated models was donated by NVIDIA Corporation." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "18000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "# xxanchor-bib References" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib1 [1] G. J. Brostow, J. Fauqueur, and R. Cipolla. Semantic object classes in video: A high-definition ground truth database. Pattern Recognition Letters, xx(x):xx–xx, 2008." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19000" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib2 [2] A. Chaurasia and E. Culurciello. Linknet: Exploiting encoder representations for efficient semantic segmentation. In 2017 IEEE Visual Communications and Image Processing, VCIP 2017, St. Petersburg, FL, USA, December 10-13, 2017, pages 1–4, 2017." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19001" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib3 [3] L. Chen, G. Papandreou, F. Schroff, and H. Adam. Rethinking atrous convolution for semantic image segmentation. CoRR, abs/1706.05587, 2017." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19002" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib4 [4] L. Chen, Y. Yang, J. Wang, W. Xu, and A. L. Yuille. Attention to scale: Scale-aware semantic image segmentation. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, June 27-30, 2016, pages 3640–3649, 2016." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19003" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib5 [5] M. Cordts, M. Omran, S. Ramos, T. Scharwächter, M. Enzweiler, R. Benenson, U. Franke, S. Roth, and B. Schiele. The cityscapes dataset. In CVPRW, 2015." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19004" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib6 [6] C. Farabet, C. Couprie, L. Najman, and Y. LeCun. Learning hierarchical features for scene labeling. IEEE transactions on pattern analysis and machine intelligence, 35(8):1915–1929, 2013." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19005" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib7 [7] K. He, X. Zhang, S. Ren, and J. Sun. 
Spatial pyramid pooling in deep convolutional networks for visual recognition. PAMI, 2015." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19006" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib8 [8] K. He, X. Zhang, S. Ren, and J. Sun. Deep residual learning for image recognition. In CVPR, pages 770–778, 2016." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19007" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib9 [9] K. He, X. Zhang, S. Ren, and J. Sun. Identity mappings in deep residual networks. In ECCV, pages 630–645, 2016." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19008" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib10 [10] A. G. Howard, M. Zhu, B. Chen, D. Kalenichenko, W. Wang, T. Weyand, M. Andreetto, and H. Adam. Mobilenets: Efficient convolutional neural networks for mobile vision applications. CoRR, abs/1704.04861, 2017." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19009" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib11 [11] G. Huang, S. Liu, L. van der Maaten, and K. Q. Weinberger. Condensenet: An efficient densenet using learned group convolutions. arXiv preprint arXiv:1711.09224, 2017." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19010" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib12 [12] G. Huang, Z. Liu, L. van der Maaten, and K. Q. Weinberger. Densely connected convolutional networks. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, July 21-26, 2017, pages 2261–2269, 2017." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19011" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib13 [13] S. Ioffe and C. Szegedy. 
Batch normalization: Accelerating deep network training by reducing internal covariate shift. In ICML, pages 448–456, 2015." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19012" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib14 [14] D. P. Kingma and J. Ba. Adam: A method for stochastic optimization. CoRR, abs/1412.6980, 2014." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19013" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib15 [15] I. Kreso, D. Causevic, J. Krapac, and S. Segvic. Convolutional scale invariance for semantic segmentation. In GCPR, pages 64–75, 2016." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19014" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib16 [16] I. Kreso, J. Krapac, and S. Segvic. Ladder-style densenets for semantic segmentation of large natural images. In ICCVW CVRSUAD, pages 238–245, 2017." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19015" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib17 [17] S. Lazebnik, C. Schmid, and J. Ponce. Beyond bags of features: Spatial pyramid matching for recognizing natural scene categories. In CVPR, pages 2169–2178, 2006." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19016" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib18 [18] G. Lin, A. Milan, C. Shen, and I. D. Reid. Refinenet: Multi-path refinement networks for high-resolution semantic segmentation. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, July 21-26, 2017, pages 5168–5177, 2017." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19017" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib19 [19] T.-Y. Lin, P. Dollár, R. B. Girshick, K. He, B. Hariharan, and S. J. Belongie. 
Feature pyramid networks for object detection. In CVPR, volume 1, page 4, 2017." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19018" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib20 [20] J. Long, E. Shelhamer, and T. Darrell. Fully convolutional networks for semantic segmentation. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 3431–3440, 2015." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19019" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib21 [21] I. Loshchilov and F. Hutter. SGDR: stochastic gradient descent with restarts. CoRR, abs/1608.03983, 2016." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19020" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib22 [22] W. Luo, Y. Li, R. Urtasun, and R. S. Zemel. Understanding the effective receptive field in deep convolutional neural networks. In NIPS, pages 4898–4906, 2016." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19021" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib23 [23] D. Mazzini. Guided upsampling network for real-time semantic segmentation. In British Machine Vision Conference 2018, BMVC 2018, Northumbria University, Newcastle, UK, September 3-6, 2018, page 117, 2018." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19022" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib24 [24] S. Mehta, M. Rastegari, A. Caspi, L. G. Shapiro, and H. Hajishirzi. Espnet: Efficient spatial pyramid of dilated convolutions for semantic segmentation. In Computer Vision - ECCV 2018 - 15th European Conference, Munich, Germany, September 8-14, 2018, Proceedings, Part X, pages 561–580, 2018." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19023" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib25 [25] V. Nekrasov, C. Shen, and I. D. Reid. Light-weight refinenet for real-time semantic segmentation. In British Machine Vision Conference 2018, BMVC 2018, Northumbria University, Newcastle, UK, September 3-6, 2018, page 125, 2018." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19024" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib26 [26] M. Oquab, L. Bottou, I. Laptev, and J. Sivic. Learning and transferring mid-level image representations using convolutional neural networks. In 2014 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2014, Columbus, OH, USA, June 23-28, 2014, pages 1717–1724, 2014." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19025" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib27 [27] A. Rasmus, M. Berglund, M. Honkala, H. Valpola, and T. Raiko. Semi-supervised learning with ladder networks. In Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada, pages 3546–3554, 2015." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19026" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib28 [28] E. Romera, J. M. Alvarez, L. M. Bergasa, and R. Arroyo. Erfnet: Efficient residual factorized convnet for real-time semantic segmentation. IEEE Transactions on Intelligent Transportation Systems, 19(1):263–272, 2018." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19027" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib29 [29] O. Ronneberger, P. Fischer, and T. Brox. U-net: Convolutional networks for biomedical image segmentation. In Medical Image Computing and Computer-Assisted Intervention - MICCAI 2015 - 18th International Conference Munich, Germany, October 5 - 9, 2015, Proceedings, Part III, pages 234–241, 2015." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19028" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib30 [30] O. Russakovsky, J. Deng, H. Su, J. Krause, S. Satheesh, S. Ma, Z. Huang, A. Karpathy, A. Khosla, M. Bernstein, A. C. Berg, and L. Fei-Fei. ImageNet Large Scale Visual Recognition Challenge. International Journal of Computer Vision (IJCV), 115(3):211–252, 2015." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19029" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib31 [31] M. Sandler, A. G. Howard, M. Zhu, A. Zhmoginov, and L. Chen. Inverted residuals and linear bottlenecks: Mobile networks for classification, detection and segmentation. CoRR, abs/1801.04381, 2018." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19030" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib32 [32] M. Siam, M. Gamal, M. Abdel-Razek, S. Yogamani, M. Jagersand, H. Zhang, N. Vallurupalli, S. Annamaneni, G. Varma, C. Jawahar, et al. A comparative study of real-time semantic segmentation for autonomous driving. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pages 587–597, 2018." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19031" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib33 [33] L. Sifre and S. Mallat. Rigid-motion scattering for texture classification. CoRR, abs/1403.1687, 2014." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19032" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib34 [34] B. Singh and L. S. Davis. An analysis of scale invariance in object detection–snip. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pages 3578–3587, 2018." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19033" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib35 [35] N. Vallurupalli, S. Annamaneni, G. Varma, C. Jawahar, M. Mathew, and S. Nagori. Efficient semantic segmentation using gradual grouping. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) Workshops, June 2018." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19034" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib36 [36] M. Wang, B. Liu, and H. Foroosh. Factorized convolutional neural networks. In ICCV Workshops, pages 545–553, 2017." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19035" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib37 [37] F. Yu and V. Koltun. Multi-scale context aggregation by dilated convolutions. In ICLR, 2016." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19036" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib38 [38] F. Yu, V. Koltun, and T. A. Funkhouser. Dilated residual networks. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, July 21-26, 2017, pages 636–644, 2017." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19037" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib39 [39] X. Zhang, X. Zhou, M. Lin, and J. Sun. Shufflenet: An extremely efficient convolutional neural network for mobile devices. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2018." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19038" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib40 [40] H. Zhao, X. Qi, X. Shen, J. Shi, and J. Jia. Icnet for real-time semantic segmentation on high-resolution images. arXiv preprint arXiv:1704.08545, 2017." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19039" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib41 [41] H. Zhao, J. Shi, X. Qi, X. Wang, and J. Jia. Pyramid scene parsing network. In ICCV, 2017." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19040" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "xxanchor-bibbib42 [42] B. Zoph, V. Vasudevan, J. Shlens, and Q. V. Le. Learning transferable architectures for scalable image recognition. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2018." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "19041" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "paper.text.fragments.print()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xxanchor-S1 1 Introduction\n", + "xxanchor-S2 2 Related Work\n", + "xxanchor-S3 3 The proposed segmentation method\n", + "xxanchor-S3SS1 3.1 Basic building blocks\n", + "xxanchor-S3SS2 3.2 Single scale model\n", + "xxanchor-S3SS3 3.3 Interleaved pyramid fusion model\n", + "xxanchor-S4 4 Experiments\n", + "xxanchor-S4SS1 4.1 Training details\n", + "xxanchor-S4SS2 4.2 Measuring the computational complexity\n", + "xxanchor-S4SS3 4.3 Cityscapes\n", + "xxanchor-S4SS4 4.4 CamVid\n", + "xxanchor-S4SS5 4.5 Validation of the upsampling capacity\n", + "xxanchor-S4SS6 4.6 Ablation of lateral connections\n", + "xxanchor-S4SS7 4.7 Execution profile\n", + "xxanchor-S4SS8 4.8 Size of the receptive field\n", + "xxanchor-S4SS9 4.9 Processing speed on Jetson TX2\n", + "xxanchor-S5 5 Conclusion\n", + "xxanchor-Sx1 Acknowledgment\n", + "xxanchor-bib References\n" + ] + } + ], + "source": [ + "for header in paper.text.fragments.get_toc():\n", + " print(header)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/flatten_evaltab.sh b/flatten_evaltab.sh deleted file mode 100755 index 083a919..0000000 --- a/flatten_evaltab.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -jq -c '.. | select(.datasets?) | .task as $task | .datasets | .[] | .dataset as $dataset | .sota.rows[] | {paper_url, paper_title, model_name} as $paper | .metrics | . 
as $metrics | keys[] | {dataset: $dataset, metric_name: ., metric_value: $metrics[.], paper_url: $paper.paper_url, paper_title: $paper.paper_title, model_name: $paper.model_name, task: $task }' "$1" | grep arxiv\.org | jq -s '.' diff --git a/get_papers_links.sh b/get_papers_links.sh deleted file mode 100755 index e79a657..0000000 --- a/get_papers_links.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -jq '.. | select(.sota?) | .sota.rows[] | .paper_url' "$1" | grep arxiv | sed -e 's#"##g' -e 's#http:#https:#' | sort -u diff --git a/helpers.py b/helpers.py deleted file mode 100644 index f38aa10..0000000 --- a/helpers.py +++ /dev/null @@ -1,43 +0,0 @@ -from fire import Fire -from pathlib import Path -from sota_extractor2.data.paper_collection import PaperCollection -from sota_extractor2.data.structure import CellEvidenceExtractor -from elasticsearch_dsl import connections -from tqdm import tqdm -import pandas as pd -from joblib import delayed, Parallel - -class Helper: - def split_pc_pickle(self, path, outdir="pc-parts", parts=8): - outdir = Path(outdir) - outdir.mkdir(parents=True, exist_ok=True) - pc = PaperCollection.from_pickle(path) - step = (len(pc) + parts - 1) // parts - for idx, i in enumerate(range(0, len(pc), step)): - part = PaperCollection(pc[i:i + step]) - part.to_pickle(outdir / f"pc-part-{idx:02}.pkl") - - def _evidences_for_pc(self, path): - path = Path(path) - pc = PaperCollection.from_pickle(path) - cell_evidences = CellEvidenceExtractor() - connections.create_connection(hosts=['10.0.1.145'], timeout=20) - raw_evidences = [] - for paper in tqdm(pc): - raw_evidences.append(cell_evidences(paper, paper.tables, paper_limit=100, corpus_limit=20)) - raw_evidences = pd.concat(raw_evidences) - path = path.with_suffix(".evidences.pkl") - raw_evidences.to_pickle(path) - - def evidences_for_pc(self, pattern="pc-parts/pc-part-??.pkl", jobs=-1): - pickles = sorted(Path(".").glob(pattern)) - Parallel(backend="multiprocessing", n_jobs=jobs)(delayed(self._evidences_for_pc)(path) for path in pickles) - - def merge_evidences(self, output="evidences.pkl", pattern="pc-parts/pc-part-*.evidences.pkl"): - pickles = sorted(Path(".").glob(pattern)) - evidences = [pd.read_pickle(pickle) for pickle in pickles] - evidences = pd.concat(evidences) - evidences.to_pickle(output) - - -if __name__ == "__main__": Fire(Helper()) diff --git a/label_tables.py b/label_tables.py deleted file mode 100755 index 5cfcead..0000000 --- a/label_tables.py +++ /dev/null @@ -1,355 +0,0 @@ -#!/usr/bin/env python - -import fire -from sota_extractor.taskdb import TaskDB -from pathlib import Path -import json -import re -import pandas as pd -import sys -from decimal import Decimal, ROUND_DOWN, ROUND_HALF_UP, InvalidOperation -from collections import Counter, namedtuple -from joblib import delayed, Parallel -from sota_extractor2.data.paper_collection import PaperCollection, remove_arxiv_version -from functools import reduce - -arxiv_url_re = re.compile(r"^https?://(www.)?arxiv.org/(abs|pdf|e-print)/(?P\d{4}\.[^./]*)(\.pdf)?$") - -def get_sota_tasks(filename): - db = TaskDB() - db.load_tasks(filename) - return db.tasks_with_sota() - - -def get_metadata(filename): - with open(filename, "r") as f: - j = json.load(f) - metadata = {x["filename"]:x["caption"] for x in j} - return metadata - - -def get_table(filename): - try: - return pd.read_csv(filename, header=None, dtype=str).fillna('') - except pd.errors.EmptyDataError: - return pd.DataFrame() - - -# all_metadata[arxiv_id] = {'table_01.csv': 'Table 1: ...', ...} -# 
all_tables[arxiv_id] = {'table_01.csv': DataFrame(...), ...} -def get_tables(tables_dir): - tables_dir = Path(tables_dir) - all_metadata = {} - all_tables = {} - for metadata_filename in tables_dir.glob("*/metadata.json"): - metadata = get_metadata(metadata_filename) - basedir = metadata_filename.parent - arxiv_id = basedir.name - all_metadata[arxiv_id] = metadata - all_tables[arxiv_id] = {t:get_table(basedir / t) for t in metadata} - return all_metadata, all_tables - - -metric_na = ['-',''] - - -# problematic values of metrics found in evaluation-tables.json -# F0.5, 70.14 (measured by Ge et al., 2018) -# Test Time, 0.33s/img -# Accuracy, 77,62% -# Electronics, 85,06 -# BLEU-1, 54.60/55.55 -# BLEU-4, 26.71/27.78 -# MRPC, 78.6/84.4 -# MRPC, 76.2/83.1 -# STS, 78.9/78.6 -# STS, 75.8/75.5 -# BLEU score,41.0* -# BLEU score,28.5* -# SemEval 2007,**55.6** -# Senseval 2,**69.0** -# Senseval 3,**66.9** -# MAE, 2.42±0.01 - -## multiple times -# Number of params, 0.8B -# Number of params, 88M -# Parameters, 580k -# Parameters, 3.1m -# Params, 22M - - - -float_value_re = re.compile(r"([+-]?\s*((\d{1,2}(,\d{3})+|\d+)(\.\d*)?|\.\d+)([eE][+-]?\d+)?)") -letters_re = re.compile("[^\W\d_]", re.UNICODE) - -# float value possibly with std -metric_value_re = re.compile(float_value_re.pattern + r"(\s*±\s*" + float_value_re.pattern + ")?") -whitespace_re = re.compile(r"\s+") - - -def normalize_float_value(s): - match = metric_value_re.search(s) - if match: - return whitespace_re.sub("", match.group(1)).replace(",", ""), match.group(0).strip() - return '-', None - - -def test_near(x, precise): - for rounding in [ROUND_DOWN, ROUND_HALF_UP]: - try: - if x == precise.quantize(x, rounding=rounding): - return True - except InvalidOperation: - pass - return False - - -def fuzzy_match(metric, metric_value, target_value): - metric_value, _ = normalize_float_value(str(metric_value)) - if metric_value in metric_na: - return False - metric_value = Decimal(metric_value) - - for match in metric_value_re.findall(target_value): - value = whitespace_re.sub("", match[0]) - value = Decimal(value) - - if test_near(metric_value, value): - return True - if test_near(metric_value.shift(2), value): - return True - if test_near(metric_value, value.shift(2)): - return True - - return False -# -# if metric_value in metric_na or target_value in metric_na: -# return False -# if metric_value != target_value and metric_value in target_value: -# print(f"|{metric_value}|{target_value}|") -# return metric_value in target_value - - -def match_metric(metric, tables, value): - matching_tables = [] - for table in tables: - for col in tables[table]: - for row in tables[table][col]: - if fuzzy_match(metric, value, row): - matching_tables.append(table) - break - else: - continue - break - - return matching_tables - - -comparators = { - "a=b": test_near, - "100a=b": lambda metric, target: test_near(metric.shift(2), target), - "a=100b": lambda metric, target: test_near(metric, target.shift(2)), - "1-a=b": lambda metric, target: test_near(Decimal("1") - metric, target), - "100-a=b": lambda metric, target: test_near(Decimal("100") - metric, target), - "100-100a=b": lambda metric, target: test_near(Decimal("100") - metric.shift(2), target), - "100-a=100b": lambda metric, target: test_near(Decimal("100") - metric, target.shift(2)) -} - - -def empty_celltags_like(table): - return pd.DataFrame().reindex_like(table).fillna('') - - -def mark_with_comparator(task_name, dataset_name, metric_name, arxiv_id, table, values, comp_name): - comparator = 
comparators[comp_name] - rows, cols = table.shape - hits = 0 - cell_tags = empty_celltags_like(table) - for col in range(cols): - for row in range(rows): - for val, val_str in table.iloc[row, col]: - for record in values: - if comparator(record.normalized, val): - hits += 1 - tags = f"{record.value}" +\ - f"{record.arxiv_id}" +\ - f"{record.model}" +\ - f"{metric_name}" +\ - f"{dataset_name}" +\ - f"{task_name}" - if arxiv_id == record.arxiv_id: - tags += "" - tags += f"{comp_name}" +\ - f"{val}" +\ - f"{val_str}" - cell_tags.iloc[row, col] += tags - return cell_tags, hits - - -def mark_with_best_comparator(task_name, dataset_name, metric_name, arxiv_id, table, values): - max_hits = 0 - best_tags = None - - for comp_name in comparators: - cell_tags, hits = mark_with_comparator(task_name, dataset_name, metric_name, arxiv_id, table, values, comp_name) - if max_hits < hits: - max_hits = hits - best_tags = cell_tags - - return best_tags - - -def mark_with_all_comparators(task_name, dataset_name, metric_name, arxiv_id, table, values): - all_tags = empty_celltags_like(table) - for comp_name in comparators: - cell_tags, _ = mark_with_comparator(task_name, dataset_name, metric_name, arxiv_id, table, values, comp_name) - all_tags += cell_tags - - return all_tags - -def normalize_string(s): - return s.lower.strip() - - -def match_str(a, b): - return normalize_string(a) == normalize_string(b) - - -def mark_strings(table, tags, values): - cell_tags = empty_celltags_like(table) - beg, end = tags - rows, cols = table.shape - for col in range(cols): - for row in range(rows): - for s in values: - real = table.iloc[row, col] - if match_str(real, s): - cell_tags += f"{beg}{s}{end}" - return cell_tags - - -metatables = {} -def match_many(task_name, dataset_name, metric_name, tables, values): - metatables = {} - for arxiv_id in tables: - for table in tables[arxiv_id]: - tags = mark_with_all_comparators(task_name, dataset_name, metric_name, arxiv_id, tables[arxiv_id][table], values) - key = (arxiv_id, table) - if key in metatables: - metatables[key] += tags - else: - metatables[key] = tags - return metatables - - -def normalize_metric(value): - value, _ = normalize_float_value(str(value)) - if value in metric_na: - return Decimal("NaN") - return Decimal(value) - - -def normalize_cell(cell): - matches = metric_value_re.findall(cell) - matches = [normalize_float_value(match[0]) for match in matches] - values = [(Decimal(value[0]), value[1]) for value in matches if value not in metric_na] - return values - - -def normalize_table(table): - return table.applymap(normalize_cell) - - -celltags_re = re.compile(r"(?P.*?)(?P.*?)(?P.*?)(?P.*?)(?P.*?)(?P.*?)(?P)?(?P.*?)(?P.*?)(?P.*?)") -def parse_celltags(v): - r = [] - for m in celltags_re.finditer(v): - d = m.groupdict() - d['this_paper'] = d['this_paper'] is not None - r.append(d) - return r - - -def celltags_to_json(df): - tags = [] - for r, row in df.iterrows(): - for c, cell in enumerate(row): - if cell != "": - tags.append(dict(row=r, col=c, hits=parse_celltags(cell))) - return tags - - - -# for each task with sota row -# arxivs <- list of papers related to the task -# for each (dataset_name, metric_name) of the task: -# for each table in arxivs -# for each fuzzy_comparator -# count number of task's sota rows found in the table using comparator -# comparator <- comparator with the largest number of hits -# if hits > hits_threshold: -# mark table with a given dataset_name and metric_name -# mark hit cells with sota-tag, model_name and paper_id -# if table.arxiv_id 
== paper_id: mark with this-tag -PaperResult = namedtuple("PaperResult", ["arxiv_id", "model", "value", "normalized"]) - -arxivs_by_metrics = {} -tables = {} - -def match_for(task, dataset, metric): - records = arxivs_by_metrics[(task, dataset, metric)] - tabs = {r.arxiv_id: tables[r.arxiv_id] for r in records if r.arxiv_id in tables} - return match_many(task, dataset, metric, tabs, records) - - -def label_tables(tasksfile, papers_dir, output, jobs=-1): - print("Reading PwC entries...", file=sys.stderr) - tasks = get_sota_tasks(tasksfile) - print("Reading tables from files...", file=sys.stderr) - pc = PaperCollection.from_files(papers_dir, load_texts=False, load_annotations=False, jobs=jobs) - - # share data between processes to avoid costly joblib serialization - global arxivs_by_metrics, tables - - print("Normalizing tables...", file=sys.stderr) - tables = {p.arxiv_no_version: {tab.name: normalize_table(tab.matrix) for tab in p.tables} for p in pc} - - print("Aggregating papers...", file=sys.stderr) - for task in tasks: - for dataset in task.datasets: - for row in dataset.sota.rows: - match = arxiv_url_re.match(row.paper_url) - if match is not None: - arxiv_id = remove_arxiv_version(match.group("arxiv_id")) - for metric in row.metrics: - arxivs_by_metrics.setdefault((task.name, dataset.name, metric), set()).add( - PaperResult(arxiv_id=arxiv_id, model=row.model_name, value=row.metrics[metric], - normalized=normalize_metric(row.metrics[metric]) - ) - ) - - print("Matching results...", file=sys.stderr) - metatables_list = Parallel(n_jobs=jobs, backend="multiprocessing")( - [delayed(match_for)(task, dataset, metric) - for task, dataset, metric in arxivs_by_metrics]) - - print("Aggregating results...", file=sys.stderr) - metatables = {} - for mt in metatables_list: - for k, v in mt.items(): - metatables[k] = metatables.get(k, "") + v - grouped_metatables = {} - for (arxiv_id, tablename), df in metatables.items(): - grouped_metatables.setdefault(arxiv_id, {})[tablename] = celltags_to_json(df) - - with open(output, 'wt') as f: - json.dump(grouped_metatables, f) - # print("Saving matches...", file=sys.stderr) - # for (arxiv_id, table), best in metatables.items(): - # out = output_dir / arxiv_id - # out.mkdir(parents=True, exist_ok=True) - # best.to_csv(out / table.replace("table", "celltags"), header=None, index=None) - - -if __name__ == "__main__": fire.Fire(label_tables) diff --git a/normalize_metrics.py b/normalize_metrics.py deleted file mode 100755 index 058ec34..0000000 --- a/normalize_metrics.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python - -import fire -from label_tables import get_sota_tasks - - -def normalize_metrics(tasksfile): - tasks = get_sota_tasks(tasksfile) - - print("task\tdataset\tmetric") - for task in tasks: - for dataset in task.datasets: - for row in dataset.sota.rows: - for metric in row.metrics: - print(f"{task.name}\t{dataset.name}\t{metric}") - - -if __name__ == "__main__": fire.Fire(normalize_metrics) diff --git a/normalize_references.py b/normalize_references.py deleted file mode 100644 index 506259d..0000000 --- a/normalize_references.py +++ /dev/null @@ -1,84 +0,0 @@ -import fire -from unidecode import unidecode -from pathlib import Path -import string -import ahocorasick -import pickle -from multiprocessing import Pool -from sota_extractor2.data.doc_utils import get_text, read_html - -punctuation_table = str.maketrans('', '', string.punctuation) - -def normalize_title(title): - return unidecode(title.strip().lower().replace(' ', 
'')).translate(punctuation_table) - -def resolve_references(reference_trie, bibitems): - if len(bibitems) == 0: - return {} - bib_ids = list(bibitems.keys()) - texts = list(bibitems.values()) - found = 0 - resolved = {} - for bib_id, text in zip(bib_ids, texts): - references = [ref for _, ref in reference_trie.iter(normalize_title(text)) if len(normalize_title(ref['title'])) >= 6] - references = sorted(references, key=lambda ref: len(normalize_title(ref['title'])), reverse=True) - for ref in references: - for author in ref['authors']: - if normalize_title(author['name'].split(' ')[-1]) not in normalize_title(text): - break - else: - found += 1 - resolved[bib_id] = ref['id'] - break - print(f"Found {found} ({found / len(bibitems)})") - return resolved - -def bib_elems(html): - return html.select(".ltx_bibliography .ltx_bibitem[id]") - -def update_references(html, mapping): - anchors = html.select('[href^="#"]') - for anchor in anchors: - target = anchor['href'][1:] - anchor['href'] = '#' + mapping.get(target, target) - anchors = bib_elems(html) - for anchor in anchors: - bib_id = anchor['id'] - anchor['id'] = mapping.get(bib_id, bib_id) - -def get_bibitems(html): - elems = bib_elems(html) - bibitems = {} - for elem in elems: - bib_id = elem['id'] - bibitems[bib_id] = get_text(elem) - return bibitems - -def save_html(path, html): - with open(path, 'w') as f: - f.write(str(html)) - -def resolve_references_in_html(args): - file, output = args - output.parent.mkdir(exist_ok=True, parents=True) - html = read_html(file) - bibitems = get_bibitems(html) - mapping = resolve_references(reference_trie, bibitems) - update_references(html, mapping) - save_html(output, html) - -#DUMP_REFERENCES_PATH = Path("/home/ubuntu/pwc/mycache/references-short.json") - -#TRIE_PATH = Path("/home/ubuntu/pwc/mycache/automaton.pkl") - -def normalize_references(source_path, target_path, automaton, jobs=1): - global reference_trie - source_path = Path(source_path) - target_path = Path(target_path) - with open(automaton, 'rb') as f: - reference_trie = pickle.load(f) - with Pool(jobs) as p: - params = [(file, target_path / file.relative_to(source_path)) for file in source_path.glob("**/*.html")] - p.map(resolve_references_in_html, params) - -if __name__ == "__main__": fire.Fire(normalize_references) diff --git a/notebooks/papers-api.ipynb b/notebooks/papers-api.ipynb deleted file mode 100644 index de5762c..0000000 --- a/notebooks/papers-api.ipynb +++ /dev/null @@ -1,901 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Papers with Code ML papers dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/ubuntu/paperswithcode/paper-extractor\n" - ] - } - ], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "%cd .." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from sota_extractor2.data.paper_collection import PaperCollection\n", - "from pathlib import Path\n", - "\n", - "DATA_PATH = Path(\"data/arxiv\")\n", - "PICKLE_PATH = Path(\"data/pc.pkl\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset\n", - "The dataset was created by parsing 75K arXiv papers related to machine learning. Due to parsing errors, the dataset contains texts and tables extracted from 56K papers. 
\n", - "```\n", - ".\n", - "└── arxiv\n", - " ├── papers\n", - " │ ├── 0709\n", - " │ │ ├── 0709.1667\n", - " │ │ │ ├── text.json\n", - " │ │ │ ├── metadata.json\n", - " │ │ │ ├── table_01.csv\n", - " │ │ │ ...\n", - " │ │ ...\n", - " │ ...\n", - " └── structure-annotations.json\n", - "```\n", - "\n", - "`text.json` files contains papers' content organized into sections. `metadata.json` list tables and their captions found in a given paper. `table_xx.csv` contains data of a given table (nested tables are flattened). We provide a simple API to load and access the dataset. Due to large number of papers it is recommended to load the dataset in parallel (default uses number of processes equal to number of CPU cores) and store it in a pickle file. Set `jobs=1` to disable multiprocessing. PaperCollection is a wrapper for `list` of papers with additional functions added for convenience. " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 4min 58s, sys: 12.4 s, total: 5min 11s\n", - "Wall time: 7min 28s\n" - ] - }, - { - "data": { - "text/plain": [ - "56696" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%time pc = PaperCollection.from_files(DATA_PATH)\n", - "len(pc)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "pc.to_pickle(PICKLE_PATH)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 3min 11s, sys: 9.39 s, total: 3min 20s\n", - "Wall time: 3min 20s\n" - ] - } - ], - "source": [ - "#%time pc = PaperCollection.from_pickle(PICKLE_PATH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The path is searched recursively for papers, so it is easy to specify smaller dataset to play with. In this case, however, a path to `structure-annotations.json` file needs to be specified." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 2.35 s, sys: 2.08 s, total: 4.43 s\n", - "Wall time: 8.62 s\n" - ] - }, - { - "data": { - "text/plain": [ - "555" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#%time pc_small = PaperCollection.from_files(DATA_PATH / \"papers\" / \"1602\", annotations_path=DATA_PATH / \"structure-annotations.json\")\n", - "#len(pc_small)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tables\n", - "Each `Paper` contains `text` and `tables` fields. Tables can be displayed with color-coded labels." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Model                                                     d     |θ|M   Train   Test
Classifier with handcrafted features [12]                 -     -      99.7    78.2
LSTM encoders [12]                                        300   3.0M   83.9    80.6
Dependency Tree CNN encoders [13]                         300   3.5M   83.3    82.1
SPINN-PI encoders [14]                                    300   3.7M   89.2    83.2
NSE                                                       300   3.4M   86.2    84.6
MMA-NSE                                                   300   6.3M   87.1    84.8
LSTM attention [15]                                       100   242K   85.4    82.3
LSTM word-by-word attention [15]                          100   252K   85.3    83.5
MMA-NSE attention                                         300   6.5M   86.9    85.4
mLSTM word-by-word attention [16]                         300   1.9M   92.0    86.1
LSTMN with deep attention fusion [17]                     450   3.4M   89.5    86.3
Decomposable attention model [18]                         200   582K   90.5    86.8
Full tree matching NTI-SLSTM-LSTM global attention [19]   300   3.2M   88.5    87.3
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "paper = pc.get_by_id('1607.04315')\n", - "table = paper.tables[0]\n", - "table.display()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Tag               description
model-best        the model whose results the authors most likely want to highlight
model-paper       an example of a generic model (like LSTM)
model-competing   a model from another paper, used for comparison
dataset-task      Task
dataset           Dataset
dataset-sub       Subdataset
dataset-metric    Metric
model-params      Params, e.g., number of layers or inference time
table-meta        Cell describing other header cells
trash             Parsing errors
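These tags also appear in the cell-level `gold_tags` annotations shown below, so they can be used to filter cells programmatically. The following is a rough sketch under an assumed API: it takes each cell of `table.df` to be a `Cell` with `value` and `gold_tags` attributes, as in the example that follows.

```
# Sketch (assumed cell attributes): collect all cells of a table annotated with a given gold tag.
def cells_with_tag(table, tag):
    hits = []
    rows, cols = table.df.shape
    for r in range(rows):
        for c in range(cols):
            cell = table.df.iloc[r, c]
            if tag in cell.gold_tags:  # gold_tags is a plain string, e.g. 'model-competing'
                hits.append((r, c, cell.value))
    return hits

cells_with_tag(paper.tables[0], 'model-best')
```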
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "PaperCollection.cells_gold_tags_legend()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Table's data is stored in `.df` pandas `DataFrame`. Each cell contains its content `value`, annotated `gold_tags` and references `refs` to other papers. Most of the references were normalized across all papers." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Cell(value='SPINN-PI encoders [14]', gold_tags='model-competing', refs=['xxref-23c141141f4f63c061d3cce14c71893959af5721'])" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.df.iloc[4,0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Additionally, each table contains `gold_tags` describing what is the content of the table." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'sota'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.gold_tags" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Content\n", - "Papers' content is represented using elastic search document classes (can be easily `save()`'ed to an existing elastic search instance). Each `text` contains `title`, `abstract`, and 'authors'. Paper's text is split into `fragments`." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Abstract We present a memory augmented neural network for natural language understanding: Neural Semantic Encoders. NSE is equipped with a novel memory update rule and has a variable sized encoding memory that evolves over time and maintains the understanding of input sequences through read , compose and write operations. NSE can also access 1 xxanchor-x1-2f1 multiple and shared memories. In this paper, we demonstrated the effectiveness and the flexibility of NSE on five different natural language tasks: natural language inference, question answering, sentence classification, document sentiment analysis and machine translation where NSE achieved state-of-the-art performance when evaluated on publically available benchmarks. 
For example, our shared-memory model showed an encouraging result on neural machine translation, improving an attention-based baseline by approximately 1.0 BLEU.'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "paper.text.abstract" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "1 xxanchor-x1-10001 Introduction" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "2 xxanchor-x1-20002 Related Work" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "3 xxanchor-x1-30003 Proposed Approach" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "3.1 xxanchor-x1-40003.1 Read, Compose and Write" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "3.2 xxanchor-x1-50003.2 Shared and Multiple Memory Accesses" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4 xxanchor-x1-60004 Experiments" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4.1 xxanchor-x1-70004.1 Natural Language Inference" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4.2 xxanchor-x1-80004.2 Answer Sentence Selection" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4.3 xxanchor-x1-90004.3 Sentence Classification" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4.4 xxanchor-x1-100004.4 Document Sentiment Analysis" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4.5 xxanchor-x1-110004.5 Machine Translation" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "5.1 xxanchor-x1-130005.1 Memory Access and Compositionality" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "6 xxanchor-x1-140006 Conclusion" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "xxanchor-x1-150006 Acknowledgments" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "xxanchor-x1-160006 References" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "A xxanchor-x1-17000A Step-by-step visualization of memory states in NSE" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "paper.text.print_toc()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "# 4.5 xxanchor-x1-110004.5 Machine Translation" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - 
}, - { - "data": { - "text/markdown": [ - "Lastly, we conducted an experiment on neural machine translation (NMT). The NMT problem is mostly defined within the encoder-decoder framework [ xxref-4b9b7eed30feee37db3452b74503d0db9f163074 , xxref-0b544dfe355a5070b60986319a3f51fb45d1348e , xxref-39dba6f22d72853561a4ed684be265e179a39e4f ]. The encoder provides the semantic and syntactic information about the source sentences to the decoder and the decoder generates the target sentences by conditioning on this information and its partially produced translation. For an efficient encoding, the attention-based NTM was introduced [ xxref-071b16f25117fb6133480c6259227d54fc2a5ea0 ]." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "11000" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "For NTM, we implemented three different models. The first model is a baseline model and is similar to the one proposed in [ xxref-071b16f25117fb6133480c6259227d54fc2a5ea0 ] (RNNSearch). This model (LSTM-LSTM) has two LSTM for the encoder/decoder and has the soft attention neural net, which attends over the source sentence and constructs a focused encoding vector for each target word. The second model is an NSE-LSTM encoder-decoder which encodes the source sentence with NSE and generates the targets with the LSTM network by using the NSE output states and the attention network. The last model is an NSE-NSE setup, where the encoding part is the same as the NSE-LSTM while the decoder NSE now uses the output state and has an access to the encoder memory, i.e., the encoder and the decoder NSEs access a shared memory. The memory is encoded by the first NSEs and then read/written by the decoder NSEs. We used the English-German translation corpus from the IWSLT 2014 evaluation campaign [ xxref-c64d27b122d5b6ef0be135e63df05c3b24bd80c5 ]. The corpus consists of sentence-aligned translation of TED talks. The data was pre-processed and lowercased with the Moses toolkit. 9 xxanchor-x1-11001f9 We merged the dev2010 and dev2012 sets for development and the tst2010, tst2011 and tst2012 sets for test data 10 xxanchor-x1-11002f10 . Sentence pairs with length longer than 25 words were filtered out. This resulted in 110,439/4,998/4,793 pairs for train/dev/test sets. We kept the most frequent 25,000 words for the German dictionary. The English dictionary has 51,821 words. The 300-D Glove 840B vectors were used for embedding the words in the source sentence whereas a lookup embedding layer was used for the target German words. Note that the word embeddings are usually optimized along with the NMT models. However, for the evaluation purpose we in this experiment do not optimize the English word embeddings. Besides, we do not use a beam search to generate the target sentences." 
- ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "11001" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "xxanchor-x1-110032 Figure 2: Word association or composition graphs produced by NSE memory access. The directed arcs connect the words that are composed via compose module. The source nodes are input words and the destination nodes (pointed by the arrows) correspond to the accessed memory slots. < S > denotes the beginning of sequence." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "11002" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "The LSTM encoder/decoders have two layers with 300 units. The NSE read/write modules are two one-layer LSTM with the same number of units as the LSTM encoder/decoders. This ensures that the number of parameters of the models is roughly the equal. The models were trained to minimize word-level cross entropy loss and were regularized by 20% input dropouts and the 30% output dropouts. We set the batch size to 128, the initial learning rate to 1e-3 for LSTM-LSTM and 3e-4 for the other models and l 2 regularizer strength to 3e-5, and train each model for 40 epochs. We report BLEU score for each models. 11 xxanchor-x1-11004f11" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "11003" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "Table xxref-x1-100035 reports our results. The baseline LSTM-LSTM encoder-decoder (with attention) obtained 17.02 BLEU on the test set. The NSE-LSTM improved the baseline slightly. Given this very small improvement of the NSE-LSTM, it is unclear whether the NSE encoder is helpful in NMT. However, if we replace the LSTM decoder with another NSE and introduce the shared memory access to the encoder-decoder model (NSE-NSE), we improve the baseline result by almost 1.0 BLEU. The NSE-NSE model also yields an increasing BLEU score on dev set. The result demonstrates that the attention-based NMT systems can be improved by a shared-memory encoder-decoder model. In addition, memory-based NMT systems should perform well on translation of long sequences by preserving long term dependencies." 
- ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "11004" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "paper.text.print_section(\"Machine Translation\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fragments can be accessed separately" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "# 1 xxanchor-x1-10001 Introduction,\n", - "Recently several studies have explored ways of extending the neural networks with an external memory [ xxref-6eedf0a4fe861335f7f7664c14de7f71c00b7932 – xxref-950ebd31505dfc0733c391ad9b7a16571c46002e ]. Unlike LSTM, the short term memories and the training parameters of such a neural network are no longer coupled and can be adapted. In this paper we propose a novel class of memory augmented neural networks called Neural Semantic Encoders (NSE) for natural language understanding. NSE offers several desirable properties. NSE has a variable sized encoding memory which allows the model to access entire input sequence during the reading process; therefore efficiently delivering long-term dependencies over time. The encoding memory evolves over time and maintains the memory of the input sequence through read , compose and write operations. NSE sequentially processes the input and supports word compositionality inheriting both temporal and hierarchical nature of human language. NSE can read from and write to a set of relevant encoding memories simultaneously or multiple NSEs can access a shared encoding memory effectively supporting knowledge and representation sharing. NSE is flexible, robust and suitable for practical NLU tasks and can be trained easily by any gradient descent optimizer." 
- ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "paper.text.fragments[1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/setup.py b/setup.py deleted file mode 100644 index 6bd0111..0000000 --- a/setup.py +++ /dev/null @@ -1,47 +0,0 @@ -import os -import re - -import setuptools - -directory = os.path.dirname(os.path.abspath(__file__)) - -# Extract version information -# path = os.path.join(directory, 'sota_extractor2', '__init__.py') -# with open(path) as read_file: -# text = read_file.read() -# pattern = re.compile(r"^__version__ = ['\"]([^'\"]*)['\"]", re.MULTILINE) -# version = pattern.search(text).group(1) -version="2.0-alpha" - -# # Extract long_description -# path = os.path.join(directory, 'README.md') -# with open(path) as read_file: -# long_description = read_file.read() -long_description = "" -setuptools.setup( - name='sota_extractor2', - version=version, - url='https://...', - description='System for extracting data from arxiv papers', - long_description_content_type='text/markdown', - long_description=long_description, - license='???', - packages=setuptools.find_packages(), - include_package_data=True, - - keywords='machine-learning ai information-extraction weak-supervision', - classifiers=[ - 'Intended Audience :: Science/Research', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Topic :: Scientific/Engineering :: Information Analysis', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python :: 3', - ], - - project_urls={ # Optional - 'Homepage': 'https://...', - 'Source': 'https://...', - 'Bug Reports': 'https://...', - 'Citation': 'https://...', - }, -) \ No newline at end of file diff --git a/sota_extractor2/config.py b/sota_extractor2/config.py index fd40971..e69de29 100644 --- a/sota_extractor2/config.py +++ b/sota_extractor2/config.py @@ -1,32 +0,0 @@ -import logging -from pathlib import Path - -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - level=logging.WARN) - -# used only to dynamically fetch graph ql data -graphql_url = 'http://10.0.1.145:8001/graphql/' - -# otherwise use this files -data = Path("/mnt/efs/pwc/data") -goldtags_dump = data / "dumps" / "goldtags-2019.10.15_2227.json.gz" - - -elastic = dict(hosts=['localhost'], timeout=20) - - -arxiv = data/'arxiv' -htmls_raw = arxiv/'htmls' -htmls_clean = arxiv/'htmls-clean' - -datasets = data/"datasets" -datasets_structure = datasets/"structure" -structure_models = datasets / "structure" / "models" - -mocks = datasets / "mocks" - -linking_models = datasets / "linking" / "models" -linking_data = datasets / "linking" / "data" - -autodict = linking_data / "autodict" diff --git a/sota_extractor2/data/__init__.py b/sota_extractor2/data/__init__.py index 9501a35..e69de29 100644 --- a/sota_extractor2/data/__init__.py +++ b/sota_extractor2/data/__init__.py @@ -1,10 +0,0 @@ -import logging -from .. 
import config # to get logging init - -logger = logging.getLogger(__name__) - -try: - from db import * -except: - logger.info("Unable to intialise django falling back to json data") - from json import * \ No newline at end of file diff --git a/sota_extractor2/data/json.py b/sota_extractor2/data/json.py deleted file mode 100644 index e647e66..0000000 --- a/sota_extractor2/data/json.py +++ /dev/null @@ -1,129 +0,0 @@ -#%% -import json -import re -import gzip -import pprint -import requests -from sota_extractor2 import config -#%% -def to_snake_case(name): - #https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() - -def to_camel_case(name): - components = name.split('_') - return components[0] + ''.join(x.title() for x in components[1:]) - -def wrap_dict(d): - if "edges" in d: - return EdgeWrap(d['edges']) - elif "node" in d: - return NodeWrap(d["node"]) - return NodeWrap(d) - -class EdgeWrap(list): - - def all(self): - return self - - def filter(self, **kwargs): - return [n for n in self if n.matches(**kwargs)] - - def __add__(self, rhs): - return EdgeWrap(list.__add__(self, rhs)) - - def __iter__(self): - return (wrap_dict(d) for d in super().__iter__()) - - def __getitem__(self, key): - vals = list.__getitem__(self, key) - if isinstance(vals, dict): - return wrap_dict(vals) - if isinstance(vals, list): - return EdgeWrap(vals) - return vals - - def __repr__(self): - val = "\n".join(repr(k) for k in self) - return f"EdgeWrap([{val}])" - - -class NodeWrap(dict): - - def matches(self, **kwargs): - return all(getattr(self, k) == v for k,v in kwargs.items()) - - def __getattr__(self, name): - camel_name = to_camel_case(name) - if camel_name in self: - val = self[camel_name] - if isinstance(val, (dict)): - return wrap_dict(val) - return val - return super().__getattribute__(name) - - def __repr__(self): - def cut(s, length=20): - return s[:length] if len(s) <= length else s[:length] + '...' 
- vals = pprint.pformat({to_snake_case(k): cut(str(self[k])) for k in self.keys()}) - return f"NodeWrap({vals})" - -def load_gql_dump(data_or_file, compressed=True): - if isinstance(data_or_file, dict): - papers_data = data_or_file - else: - open_fn = gzip.open if compressed else open - with open_fn(data_or_file, "rt") as f: - papers_data = json.load(f) - data = papers_data["data"] - return {k:wrap_dict(v) for k,v in data.items()} - -#%% -def gql(query, **variables): - query = { 'query' : query} - r = requests.post(url=config.graphql_url, json=query) - return json.loads(r.text) - -def gql_papers(goldtags_regex="sst2"): - return gql(""" -query{ - papers: allPapers(goldTags_Regex:"sst2", first:10) { - edges{ - node{ - arxivId - title - abstract - tableSet { - edges { - node { - id - matrix - desc - matrixGoldTags - goldTags - cellsGoldTags - } - } - } - } - } - } -} -""", goldTags_Regex=goldtags_regex) - -def load_annotated_papers(): - return load_gql_dump(config.goldtags_dump)["allPapers"] - -def test__wrapping(): - papers = load_gql_dump(papers_data) - assert papers[0].arxiv_id == '1511.08630v2' - papers[0].table_set[0].matrix - papers[0].table_set[0].matrix_gold_tags - - a = load_gql_dump(d)['allPapers'] - a[0].arxiv_id - next(iter(a)).arxiv_id - a - -#%% diff --git a/sota_extractor2/data/paper_collection.py b/sota_extractor2/data/paper_collection.py index 336822e..dde0bee 100644 --- a/sota_extractor2/data/paper_collection.py +++ b/sota_extractor2/data/paper_collection.py @@ -1,6 +1,5 @@ from .elastic import Paper as PaperText, Fragments from .table import Table, read_tables -from .json import load_gql_dump from pathlib import Path import re import pickle diff --git a/sota_extractor2/data/structure.py b/sota_extractor2/data/structure.py deleted file mode 100644 index 1ed5d1a..0000000 --- a/sota_extractor2/data/structure.py +++ /dev/null @@ -1,202 +0,0 @@ -import re -import pandas as pd -from collections import namedtuple -import hashlib -from fastai.text import progress_bar -from .elastic import Fragment, setup_default_connection -from .json import * -from .table import reference_re, remove_text_styles, remove_references, style_tags_re - -def get_all_tables(papers): - for paper in papers: - for table in paper.table_set.filter(parser="latexml"): - if 'trash' not in table.gold_tags and table.gold_tags != '': - table.paper_id = paper.arxiv_id - yield table - -def consume_cells(table): - Cell = namedtuple('AnnCell', 'row col vals') - for row_id, row in enumerate(table.df.values): - for col_id, cell in enumerate(row): - vals = [ - remove_text_styles(remove_references(cell.raw_value)), - cell.gold_tags, - cell.refs[0] if cell.refs else "", - cell.layout, - bool(style_tags_re.search(cell.raw_value)) - ] - yield Cell(row=row_id, col=col_id, vals=vals) - - -reference_re = re.compile(r"\[[^]]*\]") -ours_re = re.compile(r"\(ours?\)") -all_parens_re = re.compile(r"\([^)]*\)") - - -def clear_cell(s): - for pat in [reference_re, all_parens_re]: - s = pat.sub("", s) - s = s.strip() - return s - - -def empty_fragment(paper_id): - fragment = Fragment(paper_id=paper_id) - fragment.meta['highlight'] = {'text': ['']} - return fragment - - -def normalize_query(query): - if isinstance(query, list): - return tuple(normalize_query(x) for x in query) - if isinstance(query, dict): - return tuple([(normalize_query(k), normalize_query(v)) for k,v in query.items()]) - return query - -_evidence_cache = {} -_cache_miss = 0 -_cache_hit = 0 -def get_cached_or_execute(query): - global _evidence_cache, _cache_hit, 
_cache_miss - n = normalize_query(query.to_dict()) - if n not in _evidence_cache: - _evidence_cache[n] = list(query) - _cache_miss += 1 - else: - _cache_hit += 1 - return _evidence_cache[n] - - -def fetch_evidence(cell_content, cell_reference, paper_id, table_name, row, col, paper_limit=10, corpus_limit=10, - cache=False): - if not filter_cells(cell_content): - return [empty_fragment(paper_id)] - cell_content = clear_cell(cell_content) - if cell_content == "" and cell_reference == "": - return [empty_fragment(paper_id)] - - cached_query = get_cached_or_execute if cache else lambda x: x - evidence_query = Fragment.search().highlight( - 'text', pre_tags="", post_tags="", fragment_size=400) - cell_content = cell_content.replace("\xa0", " ") - query = { - "query": cell_content, - "slop": 2 - } - paper_fragments = list(cached_query(evidence_query - .filter('term', paper_id=paper_id) - .query('match_phrase', text=query)[:paper_limit])) - if cell_reference != "": - reference_fragments = list(cached_query(evidence_query - .filter('term', paper_id=paper_id) - .query('match_phrase', text={ - "query": cell_reference, - "slop": 1 - })[:paper_limit])) - else: - reference_fragments = [] - other_fagements = list(cached_query(evidence_query - .exclude('term', paper_id=paper_id) - .query('match_phrase', text=query)[:corpus_limit])) - - ext_id = f"{paper_id}/{table_name}/{row}.{col}" - ####print(f"{ext_id} |{cell_content}|: {len(paper_fragments)} paper fragments, {len(reference_fragments)} reference fragments, {len(other_fagements)} other fragments") - # if not len(paper_fragments) and not len(reference_fragments) and not len(other_fagements): - # print(f"No evidences for '{cell_content}' of {paper_id}") - if not len(paper_fragments) and not len(reference_fragments): - paper_fragments = [empty_fragment(paper_id)] - return paper_fragments + reference_fragments + other_fagements - -fix_refs_re = re.compile('\(\?\)|\s[?]+(\s|$)') - - -def fix_refs(text): - return fix_refs_re.sub(' xref-unkown ', fix_refs_re.sub(' xref-unkown ', text)) - - -highlight_re = re.compile("") -partial_highlight_re = re.compile(r"\xxref\-(?!\)") - - -def fix_reference_hightlight(s): - return partial_highlight_re.sub("xxref-", s) - - -evidence_columns = ["text_sha1", "text_highlited", "text", "header", "cell_type", "cell_content", "cell_reference", - "cell_layout", "cell_styles", "this_paper", "row", "col", "row_context", "col_context", "ext_id"] - - -def create_evidence_records(textfrag, cell, paper_id, table): - for text_highlited in textfrag.meta['highlight']['text']: - text_highlited = fix_reference_hightlight(fix_refs(text_highlited)) - text = highlight_re.sub("", text_highlited) - text_sha1 = hashlib.sha1(text.encode("utf-8")).hexdigest() - - cell_ext_id = f"{paper_id}/{table.name}/{cell.row}/{cell.col}" - - yield {"text_sha1": text_sha1, - "text_highlited": text_highlited, - "text": text, - "header": textfrag.header, - "cell_type": cell.vals[1], - "cell_content": fix_refs(cell.vals[0]), - "cell_reference": cell.vals[2], - "cell_layout": cell.vals[3], - "cell_styles": cell.vals[4], - "this_paper": textfrag.paper_id == paper_id, - "row": cell.row, - "col": cell.col, - "row_context": " border ".join([str(s) for s in table.matrix.values[cell.row]]), - "col_context": " border ".join([str(s) for s in table.matrix.values[:, cell.col]]), - "ext_id": cell_ext_id - #"table_id":table_id - } - - -def filter_cells(cell_content): - return re.search("[a-zA-Z]{2,}", cell_content) is not None - - -interesting_types = ["model-paper", 
"model-best", "model-competing", "dataset", "dataset-sub", "dataset-task"] - - -def evidence_for_table(paper_id, table, paper_limit, corpus_limit, cache=False): - records = [ - record - for cell in consume_cells(table) - for evidence in fetch_evidence(cell.vals[0], cell.vals[2], paper_id=paper_id, table_name=table.name, - row=cell.row, col=cell.col, paper_limit=paper_limit, corpus_limit=corpus_limit, - cache=cache) - for record in create_evidence_records(evidence, cell, paper_id=paper_id, table=table) - ] - df = pd.DataFrame.from_records(records, columns=evidence_columns) - return df - - -def prepare_data(tables, csv_path, cache=False): - data = [evidence_for_table(table.paper_id, table, - paper_limit=100, - corpus_limit=20, cache=cache) for table in progress_bar(tables)] - if len(data): - df = pd.concat(data) - else: - df = pd.DataFrame(columns=evidence_columns) - #moved to experiment preprocessing - #df = df.drop_duplicates( - # ["cell_content", "text_highlited", "cell_type", "this_paper"]) - print("Number of text fragments ", len(df)) - - csv_path.parent.mkdir(parents=True, exist_ok=True) - df.to_csv(csv_path, index=None) - - -class CellEvidenceExtractor: - def __init__(self): - # todo: make sure can be called more than once or refactor to singleton - setup_default_connection() - - def __call__(self, paper, tables, paper_limit=30, corpus_limit=10): - dfs = [evidence_for_table(paper.paper_id, table, paper_limit, corpus_limit) for table in tables] - if len(dfs): - return pd.concat(dfs) - return pd.DataFrame(columns=evidence_columns) diff --git a/sota_extractor2/helpers/explainers.py b/sota_extractor2/helpers/explainers.py deleted file mode 100644 index 412e40e..0000000 --- a/sota_extractor2/helpers/explainers.py +++ /dev/null @@ -1,210 +0,0 @@ -from sota_extractor2.models.linking.metrics import Metrics -from ..models.structure import TableType -from ..loggers import StructurePredictionEvaluator, LinkerEvaluator, FilteringEvaluator -import pandas as pd -import numpy as np -from ..helpers.jupyter import table_to_html -from sota_extractor2.models.linking.format import extract_value -from sota_extractor2.helpers.optimize import optimize_filters - - -class Reason: - pass - - -class IrrelevantTable(Reason): - def __init__(self, paper, table, table_type, probs): - self.paper = paper - self.table = table - self.table_type = table_type - self.probs = pd.DataFrame(probs, columns=["type", "probability"]) - - def __str__(self): - return f"Table {self.table.name} was labelled as {self.table_type.name}." - - def _repr_html_(self): - prediction = f'
{self}
' - caption = f'
Caption: {self.table.caption}
' - probs = self.probs.style.format({"probability": "{:.2f}"})._repr_html_() - return prediction + caption + probs - - -class MislabeledCell(Reason): - def __init__(self, paper, table, row, col, probs): - self.paper = paper - self.table = table - - -class TableExplanation: - def __init__(self, paper, table, table_type, proposals, reasons, topk): - self.paper = paper - self.table = table - self.table_type = table_type - self.proposals = proposals - self.reasons = reasons - self.topk = topk - - def _format_tooltip(self, proposal): - return f"dataset: {proposal.dataset}\n" \ - f"metric: {proposal.metric}\n" \ - f"task: {proposal.task}\n" \ - f"score: {proposal.parsed}\n" \ - f"confidence: {proposal.confidence:0.2f}" - - def _format_topk(self, topk): - return "" - - def _repr_html_(self): - matrix = self.table.matrix_html.values - predictions = np.zeros_like(matrix, dtype=object) - tooltips = np.zeros_like(matrix, dtype=object) - for cell_ext_id, proposal in self.proposals.iterrows(): - paper_id, table_name, rc = cell_ext_id.split("/") - row, col = [int(x) for x in rc.split('.')] - if cell_ext_id in self.reasons: - reason = self.reasons[cell_ext_id] - tooltips[row, col] = reason - if reason.startswith("replaced by "): - tooltips[row, col] += "\n\n" + self._format_tooltip(proposal) - elif reason.startswith("confidence "): - tooltips[row, col] += "\n\n" + self._format_topk(self.topk[row, col]) - else: - predictions[row, col] = 'final-proposal' - tooltips[row, col] = self._format_tooltip(proposal) - - table_type_html = f'
Table {self.table.name} was labelled as {self.table_type.name}.
' - caption_html = f'
Caption: {self.table.caption}
' - table_html = table_to_html(matrix, - self.table.matrix_tags.values, - self.table.matrix_layout.values, - predictions, - tooltips) - html = table_type_html + caption_html + table_html - proposals = self.proposals[~self.proposals.index.isin(self.reasons.index)] - if len(proposals): - proposals = proposals[["dataset", "metric", "task", "model", "parsed"]]\ - .reset_index(drop=True).rename(columns={"parsed": "score"}) - html2 = proposals._repr_html_() - return f"
{html}
Proposals
{html2}
" - return html - - -class Explainer: - _sota_record_columns = ['task', 'dataset', 'metric', 'format', 'model', 'model_type', 'raw_value', 'parsed'] - - def __init__(self, pipeline_logger, paper_collection, gold_sota_records=None): - self.paper_collection = paper_collection - self.gold_sota_records = gold_sota_records - self.spe = StructurePredictionEvaluator(pipeline_logger, paper_collection) - self.le = LinkerEvaluator(pipeline_logger) - self.fe = FilteringEvaluator(pipeline_logger) - - def explain(self, paper, cell_ext_id): - paper_id, table_name, rc = cell_ext_id.split('/') - if paper.paper_id != paper_id: - return "No such cell" - - table_type, probs = self.spe.get_table_type_predictions(paper_id, table_name) - - if table_type == TableType.IRRELEVANT: - return IrrelevantTable(paper, paper.table_by_name(table_name), table_type, probs) - - all_proposals = self.le.proposals[paper_id] - reasons = self.fe.reason - table_ext_id = f"{paper_id}/{table_name}" - table_proposals = all_proposals[all_proposals.index.str.startswith(table_ext_id+"/")] - topk = {(row, col): topk for (pid, tn, row, col), topk in self.le.topk.items() - if (pid, tn) == (paper_id, table_name)} - - return TableExplanation(paper, paper.table_by_name(table_name), table_type, table_proposals, reasons, topk) - - row, col = [int(x) for x in rc.split('.')] - - reason = self.fe.reason.get(cell_ext_id) - if reason is None: - pass - else: - return reason - - def _get_table_sota_records(self, table): - - first_model = lambda x: ([a for a in x if a.startswith('model')] + [''])[0] - if len(table.sota_records): - matrix = table.matrix.values - tags = table.matrix_tags - model_type_col = tags.apply(first_model) - model_type_row = tags.T.apply(first_model) - sota_records = table.sota_records.copy() - sota_records['model_type'] = '' - sota_records['raw_value'] = '' - for cell_ext_id, record in sota_records.iterrows(): - name, rc = cell_ext_id.split('/') - row, col = [int(x) for x in rc.split('.')] - record.model_type = model_type_col[col] or model_type_row[row] - record.raw_value = matrix[row, col] - - sota_records["parsed"] = sota_records[["raw_value", "format"]].apply( - lambda row: float(extract_value(row.raw_value, row.format)), axis=1) - - sota_records = sota_records[sota_records["parsed"] == sota_records["parsed"]] - - strip_cols = ["task", "dataset", "format", "metric", "raw_value", "model", "model_type"] - sota_records = sota_records.transform( - lambda x: x.str.strip() if x.name in strip_cols else x) - return sota_records[self._sota_record_columns] - else: - empty = pd.DataFrame(columns=self._sota_record_columns) - empty.index.rename("cell_ext_id", inplace=True) - return empty - - def _get_sota_records(self, paper): - if not len(paper.tables): - empty = pd.DataFrame(columns=self._sota_record_columns) - empty.index.rename("cell_ext_id", inplace=True) - return empty - records = [self._get_table_sota_records(table) for table in paper.tables] - records = pd.concat(records) - records.index = paper.paper_id + "/" + records.index - records.index.rename("cell_ext_id", inplace=True) - return records - - def linking_metrics(self, experiment_name="unk"): - paper_ids = list(self.le.proposals.keys()) - - proposals = pd.concat(self.le.proposals.values()) - proposals = proposals[~proposals.index.isin(self.fe.reason.index)] - - papers = {paper_id: self.paper_collection.get_by_id(paper_id) for paper_id in paper_ids} - missing = [paper_id for paper_id, paper in papers.items() if paper is None] - if missing: - print("Missing papers in paper 
collection:") - print(", ".join(missing)) - papers = [paper for paper in papers.values() if paper is not None] - - # if not len(papers): - # gold_sota_records = pd.DataFrame(columns=self._sota_record_columns) - # gold_sota_records.index.rename("cell_ext_id", inplace=True) - # else: - # gold_sota_records = pd.concat([self._get_sota_records(paper) for paper in papers]) - if self.gold_sota_records is None: - gold_sota_records = pd.DataFrame(columns=self._sota_record_columns) - gold_sota_records.index.rename("cell_ext_id", inplace=True) - else: - - gold_sota_records = self.gold_sota_records - which = gold_sota_records.index.to_series().str.split("/", expand=True)[0]\ - .isin([paper.paper_id for paper in papers]) - gold_sota_records = gold_sota_records[which] - - df = gold_sota_records.merge(proposals, 'outer', left_index=True, right_index=True, suffixes=['_gold', '_pred']) - df = df.reindex(sorted(df.columns), axis=1) - df = df.fillna('not-present') - if "experiment_name" in df.columns: - del df["experiment_name"] - - metrics = Metrics(df, experiment_name=experiment_name) - return metrics - - def optimize_filters(self, metrics_info): - results = optimize_filters(self, metrics_info) - return results diff --git a/sota_extractor2/helpers/interpret.py b/sota_extractor2/helpers/interpret.py deleted file mode 100644 index e6d5f14..0000000 --- a/sota_extractor2/helpers/interpret.py +++ /dev/null @@ -1,59 +0,0 @@ -from fastai.text.interpret import TextClassificationInterpretation as AbsTextClassificationInterpretation, _eval_dropouts -from fastai.basic_data import DatasetType -import torch - - -__all__ = ["TextClassificationInterpretation", "TextMultiClassificationInterpretation"] - - -class TextClassificationInterpretation(AbsTextClassificationInterpretation): - @classmethod - def from_learner(cls, learner): - empty_preds = torch.Tensor([[1]]) - return cls(learner, empty_preds, None, None) - - def intrinsic_attention(self, text:str, class_id:int=None): - """Calculate the intrinsic attention of the input w.r.t to an output `class_id`, or the classification given by the model if `None`. - Similar as in base class, but does not apply abs() before summing gradients. - """ - self.model.train() - _eval_dropouts(self.model) - self.model.zero_grad() - self.model.reset() - ids = self.data.one_item(text)[0] - emb = self.model[0].module.encoder(ids).detach().requires_grad_(True) - lstm_output = self.model[0].module(emb, from_embeddings=True) - self.model.eval() - cl = self.model[1](lstm_output + (torch.zeros_like(ids).byte(),))[0].softmax(dim=-1) - if class_id is None: class_id = cl.argmax() - cl[0][class_id].backward() - # attn = emb.grad.squeeze().abs().sum(dim=-1) - # attn /= attn.max() - attn = emb.grad.squeeze().sum(dim=-1) - attn = attn / attn.abs().max() * 0.5 + 0.5 - tokens = self.data.single_ds.reconstruct(ids[0]) - return tokens, attn - - -class TextMultiClassificationInterpretation(TextClassificationInterpretation): - def intrinsic_attention(self, text:str, class_id:int=None): - """Calculate the intrinsic attention of the input w.r.t to an output `class_id`, or the classification given by the model if `None`. - Similar as in base class, but uses sigmoid instead of softmax and does not apply abs() before summing gradients. 
- """ - self.model.train() - _eval_dropouts(self.model) - self.model.zero_grad() - self.model.reset() - ids = self.data.one_item(text)[0] - emb = self.model[0].module.encoder(ids).detach().requires_grad_(True) - lstm_output = self.model[0].module(emb, from_embeddings=True) - self.model.eval() - cl = self.model[1](lstm_output + (torch.zeros_like(ids).byte(),))[0].sigmoid() - if class_id is None: class_id = cl.argmax() - cl[0][class_id].backward() - # attn = emb.grad.squeeze().abs().sum(dim=-1) - # attn /= attn.max() - attn = emb.grad.squeeze().sum(dim=-1) - attn = attn / attn.abs().max() * 0.5 + 0.5 - tokens = self.data.single_ds.reconstruct(ids[0]) - return tokens, attn diff --git a/sota_extractor2/helpers/latex_converter.py b/sota_extractor2/helpers/latex_converter.py index abb5e4d..bbc0386 100644 --- a/sota_extractor2/helpers/latex_converter.py +++ b/sota_extractor2/helpers/latex_converter.py @@ -33,7 +33,7 @@ def latex2html(self, source_dir, output_dir): output_dir.mkdir(parents=True, exist_ok=True) filename = "index.html" command = ["/files/latex2html.sh", filename] - self.client.containers.run("arxivvanity/engrafo", command, remove=True, volumes=volumes) + self.client.containers.run("arxivvanity/engrafo:b3db888fefa118eacf4f13566204b68ce100b3a6", command, remove=True, volumes=volumes) # todo: check for errors @@ -43,7 +43,7 @@ def clean_html(self, path): path.resolve(): ro_bind("/files/index.html"), } - command = "timeout -t 20 -s KILL chromium-browser --headless" \ + command = "timeout -s KILL 20 chromium-browser --headless" \ " --disable-gpu --disable-software-rasterizer --no-sandbox" \ " --timeout=30000 --dump-dom /files/index.html" data = self.client.containers.run("zenika/alpine-chrome:73", command, remove=True, entrypoint="", diff --git a/sota_extractor2/helpers/optimize.py b/sota_extractor2/helpers/optimize.py deleted file mode 100644 index df3e2cb..0000000 --- a/sota_extractor2/helpers/optimize.py +++ /dev/null @@ -1,266 +0,0 @@ -import pandas as pd, numpy as np -from dataclasses import dataclass, replace -from sota_extractor2.models.linking.metrics import CM -from matplotlib import pyplot as plt - - -def annotations(matrix, structure, r, c, type='model'): - ann = [] - for nc in range(0, c): - if type in structure[r, nc]: - ann.append(matrix[r, nc]) - for nr in range(0, r): - if type in structure[nr, c]: - ann.append(matrix[nr, c]) - return ' '.join(ann) - - -def estimate_noises(extracted_values, gold_values, short_forms): - if not len(extracted_values): - return {} - extracted_values = set(extracted_values) - gold_values = set(gold_values) - - return {gold: 1 - len(extracted_values & set(short_forms.get(gold, set()))) / len(extracted_values) for gold in - gold_values} - - -def estimate_context_noise(context, records): - context = context or "" - abbrvs = context_search.extract_acronyms(context) - context = normalize_cell_ws(normalize_dataset(context)) - dss = set(cs.find_datasets(context)) | set(abbrvs.keys()) - mss = set(cs.find_metrics(context)) - dss -= mss - dss = set([normalize_cell(ds) for ds in dss]) - mss = set([normalize_cell(ms) for ms in mss]) - - gold_ds = set(records.dataset.values) - gold_ms = set(records.metric.values) - ds_noises = estimate_noises(dss, gold_ds, cs.datasets) - ms_noises = estimate_noises(mss, gold_ms, cs.metrics) - - return ds_noises, ms_noises - - -def estimate_paper_context_noise(paper, gold_sota_records): - records = gold_sota_records[gold_sota_records.paper_id == paper.paper_id] - datasets = de.from_paper(paper) - context = " 
".join(datasets) - return estimate_context_noise(context, records) - - -def estimate_caption_context_noise(paper, table, gold_sota_records): - table_ext_id = f"{paper.paper_id}/{table.name}/" - records = gold_sota_records[gold_sota_records.index.str.startswith(table_ext_id)] - return estimate_context_noise(table.caption, records) - - -def estimate_cell_context_noise(paper, table, row, col, gold_sota_records): - cell_ext_id = f"{paper.paper_id}/{table.name}/{row}.{col}" - records = gold_sota_records[gold_sota_records.index == cell_ext_id] - value = annotations(table.matrix.values, table.matrix_gold_tags.values, row, col, 'dataset') - return estimate_context_noise(value, records) - - -def average_dicts(dicts): - sums = {} - for d in dicts: - for k, v in d.items(): - sums.setdefault(k, []).append(v) - return {k: np.mean(v) for k, v in sums.items()} - - -def all_equal(row): - cols = ["model_type", "dataset", "metric", "task", "parsed"] - return np.all([row[f"{name}_pred"] == row[f"{name}_gold"] for name in cols]) - - -def merge_gold_records(explainer): - paper_ids = list(explainer.le.proposals.keys()) - - proposals = pd.concat(explainer.le.proposals.values()) - - papers = {paper_id: explainer.paper_collection.get_by_id(paper_id) for paper_id in paper_ids} - missing = [paper_id for paper_id, paper in papers.items() if paper is None] - if missing: - print("Missing papers in paper collection:") - print(", ".join(missing)) - papers = [paper for paper in papers.values() if paper is not None] - - if explainer.gold_sota_records is None: - print("gold_sota_records is missing") - return - else: - gold_sota_records = explainer.gold_sota_records - which = gold_sota_records.index.to_series().str.split("/", expand=True)[0] \ - .isin([paper.paper_id for paper in papers]) - gold_sota_records = gold_sota_records[which] - - df = gold_sota_records.merge(proposals, 'outer', left_index=True, right_index=True, suffixes=['_gold', '_pred']) - df = df.reindex(sorted(df.columns), axis=1) - df.confidence = df.confidence.fillna(0.0) - df = df.fillna('not-present') - df["equal"] = df.apply(all_equal, axis=1) - df["pred_positive"] = df["model_type_pred"].str.contains("model-best") - df["gold_positive"] = df["model_type_gold"].str.contains("model-best") - return df - - -def find_threshold_intervals(proposals, metrics_info, context="paper"): - # maximal threshold to have this proposal returned - proposals["max_threshold"] = proposals.confidence - - proposals["min_threshold"] = 0.0 - - ignore = (proposals.model_type_pred != 'model-best') | (proposals.struct_model_type == '') | \ - (proposals.struct_dataset.str.contains('dev')) | (proposals.struct_dataset.str.contains('train')) - - # this proposal won't be ever returned due to structure or model type filters - proposals.loc[ignore, "min_threshold"] = 1.0 - proposals.loc[ignore, "max_threshold"] = 0.0 - - all_proposals = proposals - proposals = proposals[~ignore] - - if context == "paper": - context_column = proposals.index.to_series().str.split('/', expand=False).apply(lambda x: x[0]) - else: - context_column = proposals.index.to_series().str.split('/', expand=False).apply(lambda x: x[0] + "/" + x[1]) - - for i, p in proposals.iterrows(): - key = (p.task_pred, p.dataset_pred, p.metric_pred) - proposals_context = proposals[context_column == context_column[p.name]] - proposals_context = proposals_context[~proposals_context.parsed_pred.isna()] - proposals_context = proposals_context[ - (proposals_context.task_pred == p.task_pred) & - (proposals_context.dataset_pred == 
p.dataset_pred) & - (proposals_context.metric_pred == p.metric_pred) - ] - d = 0 - if key in metrics_info: - d = metrics_info[key] - elif p.metric_pred in metrics_info: - d = metrics_info[p.metric_pred] - elif 'error' in p.metric_pred.lower(): - d = -1 - elif 'accuracy' in p.metric_pred.lower(): - d = 1 - - if d >= 0: - d = 1 - else: - d = -1 - - # the minimal threshold above which all superior results are ignored - which = d * proposals_context.parsed_pred > d * p.parsed_pred - if np.any(which.values): - all_proposals.at[i, "min_threshold"] = proposals_context[which].confidence.values.max() - else: - which = proposals_context[proposals_context.parsed_pred == p.parsed_pred].iloc[0] - if which.name != p.name: - all_proposals.at[i, "min_threshold"] = which.confidence - - return all_proposals - - -def update_cm(proposal, cm, is_activated): - d = 1 if is_activated else -1 - if proposal.equal and proposal.pred_positive and proposal.gold_positive: - cm = replace(cm, tp=cm.tp + d, fn=cm.fn - d) - if proposal.equal and not proposal.pred_positive and not proposal.gold_positive: - cm = replace(cm, tn=cm.tn + d) - if proposal.pred_positive and (not proposal.equal or not proposal.gold_positive): - cm = replace(cm, fp=cm.fp + d) - # if proposal.gold_positive and (not proposal.equal or not proposal.pred_positive): - # cm = replace(cm, fn = cm.fn+d) - return cm - - -def sweep_thresholds(df): - cm = CM(fn=sum(df.gold_positive)) - df = df[df.min_threshold < df.max_threshold] - - sweeps = df.reset_index().melt(id_vars="cell_ext_id", value_vars=["min_threshold", "max_threshold"], - var_name="threshold_type", value_name="threshold") - - sweeps = sweeps.sort_values(by=["threshold", "threshold_type"]).reset_index(drop=True) - - steps = sweeps.threshold.drop_duplicates().index - - results = [] - for i, idx1 in enumerate(steps[:-1]): - th1 = sweeps.threshold[idx1] - - to_restore = cm - for j, idx2 in enumerate(steps[i + 1:], i + 1): - th2 = sweeps.threshold[idx2] - precision = cm.tp / (cm.tp + cm.fp + 1e-8) - recall = cm.tp / (cm.tp + cm.fn + 1e-8) - f1 = 2 * precision * recall / (precision + recall + 1e-8) - - result = dict(threshold1=th1, threshold2=sweeps.threshold[idx2 - 1], tp=cm.tp, tn=cm.tn, fp=cm.fp, fn=cm.fn, - precision=precision, recall=recall, f1=f1) - results.append(result) - for _, row in sweeps[sweeps.threshold == sweeps.threshold[idx2 - 1]].iterrows(): - proposal = df.loc[row.cell_ext_id] - is_activated = row.threshold_type == 'min_threshold' - if not is_activated and proposal.min_threshold < th1: - cm = update_cm(proposal, cm, is_activated) - - precision = cm.tp / (cm.tp + cm.fp + 1e-8) - recall = cm.tp / (cm.tp + cm.fn + 1e-8) - f1 = 2 * precision * recall / (precision + recall + 1e-8) - - result = dict(threshold1=th1, threshold2=th2, tp=cm.tp, tn=cm.tn, fp=cm.fp, fn=cm.fn, - precision=precision, recall=recall, f1=f1) - results.append(result) - - cm = to_restore - - for _, row in sweeps[sweeps.threshold == th1].iterrows(): - proposal = df.loc[row.cell_ext_id] - - is_activated = row.threshold_type == 'min_threshold' - cm = update_cm(proposal, cm, is_activated) - - return df, sweeps, steps, pd.DataFrame(results) - - -class PRResults: - def __init__(self, results): - self.results = results - - def plot(self): - plt.figure(figsize=(6, 6)) - plt.plot(self.results["precision"], self.results["recall"], '.') - plt.xlabel("precision") - plt.ylabel("recall") - - def _best(self, results, metric): - b = results.loc[results[metric].idxmax()] - x = ["precision", "recall", "f1"] - x.remove(metric) - y = [b[m] 
for m in x] - print(f"Best {metric}={b[metric]:0.2f} (with {x[0]}={y[0]:.2f} and {x[1]}={y[1]:.2f})" - f" is achieved with threshold1={b.threshold1} and threshold2={b.threshold2}") - - def best(self, min_precision=0, min_recall=0, min_f1=0): - results = self.results - results = results[ - (results.precision >= min_precision) & - (results.recall >= min_recall) & - (results.f1 >= min_f1) - ] - if not len(results): - print("No results with this criteria") - else: - self._best(results, "precision") - self._best(results, "recall") - self._best(results, "f1") - -def optimize_filters(explainer, metrics_info): - df = merge_gold_records(explainer) - df = find_threshold_intervals(df, metrics_info, context="paper") - df, sweeps, steps, results = sweep_thresholds(df) - return PRResults(results) diff --git a/sota_extractor2/helpers/reannotate.py b/sota_extractor2/helpers/reannotate.py deleted file mode 100644 index fdc3b58..0000000 --- a/sota_extractor2/helpers/reannotate.py +++ /dev/null @@ -1,59 +0,0 @@ -import requests -from sota_extractor2 import config -from sota_extractor2.data.paper_collection import _load_annotated_papers - - -def run_graphql_query(query): - request = requests.post(config.graphql_url, json={'query': query}) - if request.status_code == 200: - return request.json() - else: - raise Exception(f"Query error: status code {request.status_code}") - - -def reannotate_paper(paper, annotations): - paper._annotations = annotations - paper.gold_tags = annotations.gold_tags.strip() - for table in paper.tables: - table._set_annotations(annotations.table_set.filter(name=table.name, parser="latexml")[0]) - - -def reannotate_papers(papers, annotations): - for paper in papers: - ann = annotations.get(paper.arxiv_no_version) - if ann is not None: - reannotate_paper(paper, ann) - - -def query_annotations(): - raw = run_graphql_query(""" - query { - allPapers { - edges { - node { - arxivId - goldTags - tableSet { - edges { - node { - name - datasetText - notes - goldTags - matrixGoldTags - cellsSotaRecords - parser - } - } - } - } - } - } - } - """) - return _load_annotated_papers(raw) - - -def reannotate_papers_with_db(papers): - annotations = query_annotations() - reannotate_papers(papers, annotations) diff --git a/sota_extractor2/helpers/training.py b/sota_extractor2/helpers/training.py deleted file mode 100644 index 7962c42..0000000 --- a/sota_extractor2/helpers/training.py +++ /dev/null @@ -1,14 +0,0 @@ - -def set_seed(seed, name, quiet=False, all_gpus=True): - import torch - import numpy as np - import random - if not quiet: - print(f"Setting {name} seed to {seed}") - torch.manual_seed(seed) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - np.random.seed(seed) - random.seed(seed) - if all_gpus: - torch.cuda.manual_seed_all(seed) \ No newline at end of file diff --git a/sota_extractor2/loggers.py b/sota_extractor2/loggers.py deleted file mode 100644 index 6090516..0000000 --- a/sota_extractor2/loggers.py +++ /dev/null @@ -1,192 +0,0 @@ -import sys -import pandas as pd -from .models.structure.experiment import Experiment, label_map, Labels -from .models.structure.type_predictor import TableType -from copy import deepcopy -import pickle - - - -class BaseLogger: - def __init__(self, pipeline_logger, pattern=".*"): - pipeline_logger.register(pattern, self) - - def __call__(self, step, **kwargs): - raise NotImplementedError() - - -class StdoutLogger: - def __init__(self, pipeline_logger, file=sys.stdout): - self.file = file - pipeline_logger.register(".*", self) 
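For context on the numbers being swept above: the precision/recall/F1 values recorded at each threshold step are plain confusion-matrix arithmetic with a small smoothing constant. A minimal, self-contained sketch follows; `ToyCM` and `prf1` are illustrative stand-ins for the `CM` dataclass and the inline formulas used by `sweep_thresholds`, not part of the removed module.

```python
# Illustrative only: mirrors the smoothed precision/recall/F1 arithmetic
# applied at every threshold step in sweep_thresholds() above.
from dataclasses import dataclass, replace

@dataclass
class ToyCM:
    tp: int = 0
    fn: int = 0
    fp: int = 0
    tn: int = 0

def prf1(cm, eps=1e-8):
    precision = cm.tp / (cm.tp + cm.fp + eps)
    recall = cm.tp / (cm.tp + cm.fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)
    return precision, recall, f1

cm = ToyCM(tp=8, fp=2, fn=4)   # 8 correct links, 2 spurious, 4 missed
print(prf1(cm))                # roughly (0.80, 0.67, 0.73)

# Activating a proposal that turns out to be correct mirrors
# update_cm(..., is_activated=True) for the tp/fn case:
cm = replace(cm, tp=cm.tp + 1, fn=cm.fn - 1)
print(prf1(cm))                # both precision and recall improve
```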
- - def __call__(self, step, **kwargs): - print(f"[STEP] {step}: {kwargs}", file=self.file) - - -class SessionRecorder: - def __init__(self, pipeline_logger): - self.pipeline_logger = pipeline_logger - self.session = [] - self._recording = False - - def __call__(self, step, **kwargs): - self.session.append((step, deepcopy(kwargs))) - - def reset(self): - self.session = [] - - def record(self): - if not self._recording: - self.pipeline_logger.register(".*", self) - self._recording = True - - def stop(self): - if self._recording: - self.pipeline_logger.unregister(".*", self) - self._recording = False - - def replay(self): - self.stop() - for step, kwargs in self.session: - self.pipeline_logger(step, **kwargs) - - def save_session(self, path): - with open(path, "wb") as f: - pickle.dump(self.session, f) - - def load_session(self, path): - with open(path, "rb") as f: - self.session = pickle.load(f) - - -class StructurePredictionEvaluator: - def __init__(self, pipeline_logger, pc): - pipeline_logger.register("structure_prediction::evidences_split", self.on_evidences_split) - pipeline_logger.register("structure_prediction::tables_labeled", self.on_tables_labeled) - pipeline_logger.register("type_prediction::predicted", self.on_type_predicted) - pipeline_logger.register("type_prediction::multiclass_predicted", self.on_type_multiclass_predicted) - self.pc = pc - self.results = {} - self.type_predictions = {} - self.type_multiclass_predictions = {} - self.evidences = pd.DataFrame() - - def on_type_multiclass_predicted(self, step, paper, tables, threshold, predictions): - for table, prediction in zip(tables, predictions): - self.type_multiclass_predictions[paper.paper_id, table.name] = { - TableType.SOTA: prediction[0], - TableType.ABLATION: prediction[1], - TableType.IRRELEVANT: threshold - } - - def on_type_predicted(self, step, paper, tables, predictions): - for table, prediction in zip(tables, predictions): - self.type_predictions[paper.paper_id, table.name] = prediction - - def on_evidences_split(self, step, evidences, evidences_num): - self.evidences = pd.concat([self.evidences, evidences]) - - def on_tables_labeled(self, step, paper, labeled_tables): - golds = [p for p in self.pc if p.text.title == paper.text.title] - paper_id = paper.paper_id - type_results = [] - cells_results = [] - labeled_tables = {table.name: table for table in labeled_tables} - if len(golds) == 1: - gold = golds[0] - for gold_table, table, in zip(gold.tables, paper.tables): - table_type = self.type_predictions[paper.paper_id, table.name] - is_important = table_type == TableType.SOTA or table_type == TableType.ABLATION - gold_is_important = "sota" in gold_table.gold_tags or "ablation" in gold_table.gold_tags - type_results.append({"predicted": is_important, "gold": gold_is_important, "name": table.name}) - if not is_important: - continue - table = labeled_tables[table.name] - rows, cols = table.df.shape - for r in range(rows): - for c in range(cols): - cells_results.append({ - "predicted": table.df.iloc[r, c].gold_tags, - "gold": gold_table.df.iloc[r, c].gold_tags, - "ext_id": f"{table.name}/{r}.{c}", - "content": table.df.iloc[r, c].value - }) - - self.results[paper_id] = { - 'type': pd.DataFrame.from_records(type_results), - 'cells': pd.DataFrame.from_records(cells_results) - } - - def map_tags(self, tags): - mapping = dict(label_map) - mapping[""] = Labels.EMPTY.value - return tags.str.strip().apply(lambda x: mapping.get(x, 0)) - - def metrics(self, paper_id): - if paper_id not in self.results: - print(f"No 
annotations for {paper_id}") - return - print("Structure prediction:") - results = self.results[paper_id] - cells_df = results['cells'] - e = Experiment() - e._set_results(paper_id, self.map_tags(results['cells'].predicted), self.map_tags(results['cells'].gold)) - e.show_results(paper_id, normalize=True) - - def get_table_type_predictions(self, paper_id, table_name): - prediction = self.type_predictions.get((paper_id, table_name)) - multi_predictions = self.type_multiclass_predictions.get((paper_id, table_name)) - if prediction is not None: - multi_predictions = sorted(multi_predictions.items(), key=lambda x: x[1], reverse=True) - return prediction, [(k.name, v) for k, v in multi_predictions - ] - - -class LinkerEvaluator: - def __init__(self, pipeline_logger): - pipeline_logger.register("linking::call", self.on_before_linking) - pipeline_logger.register("linking::taxonomy_linking::call", self.on_before_taxonomy) - pipeline_logger.register("linking::taxonomy_linking::topk", self.on_taxonomy_topk) - pipeline_logger.register("linking::linked", self.on_after_linking) - self.proposals = {} - self.topk = {} - self.queries = {} - - def on_before_linking(self, step, paper, tables): - pass - - def on_after_linking(self, step, paper, tables, proposals): - self.proposals[paper.paper_id] = proposals.copy(deep=True) - - def on_before_taxonomy(self, step, ext_id, query, datasets, caption): - self.queries[ext_id] = (query, datasets, caption) - - def on_taxonomy_topk(self, step, ext_id, topk): - paper_id, table_name, rc = ext_id.split('/') - row, col = [int(x) for x in rc.split('.')] - self.topk[paper_id, table_name, row, col] = topk.copy(deep=True) - - def top_matches(self, paper_id, table_name, row, col): - return self.topk[(paper_id, table_name, row, col)] - - -class FilteringEvaluator: - def __init__(self, pipeline_logger): - pipeline_logger.register("filtering::.*::filtered", self.on_filtered) - self.proposals = {} - self.which = {} - self.reason = pd.Series(dtype=str) - - def on_filtered(self, step, proposals, which, reason, **kwargs): - _, filter_step, _ = step.split('::') - if filter_step != "compound_filtering": - if filter_step in self.proposals: - self.proposals[filter_step] = pd.concat([self.proposals[filter_step], proposals]) - self.which[filter_step] = pd.concat([self.which[filter_step], which]) - else: - self.proposals[filter_step] = proposals - self.which[filter_step] = which - self.reason = self.reason.append(reason) - - diff --git a/sota_extractor2/models/linking/__init__.py b/sota_extractor2/models/linking/__init__.py deleted file mode 100644 index a1a4653..0000000 --- a/sota_extractor2/models/linking/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .taxonomy import Taxonomy -from .linker import Linker -from .context_search import ContextSearch, DatasetExtractor -from .proposals_filters import * - -__all__ = ["Taxonomy", "Linker", "ContextSearch", "DatasetExtractor", "ProposalsFilter", "NopFilter", - "BestResultFilter", "StructurePredictionFilter", "ConfidenceFilter", "CompoundFilter"] diff --git a/sota_extractor2/models/linking/acronym_extractor.py b/sota_extractor2/models/linking/acronym_extractor.py deleted file mode 100644 index 4b709e6..0000000 --- a/sota_extractor2/models/linking/acronym_extractor.py +++ /dev/null @@ -1,20 +0,0 @@ -import spacy -from scispacy.abbreviation import AbbreviationDetector -from .utils import normalize_cell, normalize_dataset - -class AcronymExtractor: - def __init__(self): - self.nlp = spacy.load("en_core_sci_sm") - abbreviation_pipe = 
AbbreviationDetector(self.nlp) - self.nlp.add_pipe(abbreviation_pipe) - self.nlp.disable_pipes("tagger", "ner", "parser") - - def __call__(self, text): - doc = self.nlp(text) - abbrvs = {} - for abrv in doc._.abbreviations: - # abbrvs.setdefault(normalize_cell(str(abrv)), Counter())[str(abrv._.long_form)] += 1 - norm = normalize_cell(normalize_dataset(str(abrv))) - if norm != '': - abbrvs[norm] = normalize_cell(normalize_dataset(str(abrv._.long_form))) - return abbrvs diff --git a/sota_extractor2/models/linking/bm25_naive.py b/sota_extractor2/models/linking/bm25_naive.py deleted file mode 100644 index 7fd87c3..0000000 --- a/sota_extractor2/models/linking/bm25_naive.py +++ /dev/null @@ -1,320 +0,0 @@ -import re -from decimal import Decimal -from dataclasses import dataclass -import numpy as np -import pandas as pd -from elasticsearch import Elasticsearch, client -import logging -#from .extractors import DatasetExtractor -import spacy -from scispacy.abbreviation import AbbreviationDetector -from sota_extractor2.models.linking.format import extract_value - - -@dataclass() -class Value: - type: str - value: str - def __str__(self): - return self.value - - -@dataclass() -class Cell: - cell_ext_id: str - table_ext_id: str - row: int - col: int - - -@dataclass() -class Proposal: - cell: Cell - dataset_values: list - table_description: str - model_values: list # best paper competing - model_params: dict = None - raw_value: str = "" - - def __post_init__(self): - if self.model_params is None: - self.model_params = {} - - @property - def dataset(self): - return ' '.join(map(str, self.dataset_values)).strip() - - @property - def model_name(self): - return ' '.join(map(str, self.model_values)).strip() - - @property - def model_type(self): - types = [v.type for v in self.model_values] + [''] - if 'model-competing' in types: - return 'model-competing' # competing model is different from model-paper and model-best so we return it first - return types[0] - - def __str__(self): - return f"{self.model_name}: {self.raw_value} on {self.dataset}" - -def mkquery_ngrams(query): - return { - "query": { - "multi_match": { - "query": query, - "fields": ["dataset^3", "dataset.ngrams^1", "metric^1", "metric.ngrams^1", "task^1", - "task.ngrams^1"] - } - } - } - - -def mkquery_fullmatch(query): - return { - "query": { - "multi_match": { - "query": query, - "fields": ["dataset^3", "metric^1", "task^1"] - } - } - } - -class MatchSearch: - def __init__(self, mkquery=mkquery_ngrams, es=None): - self.case = True - self.all_fields = True - self.es = es or Elasticsearch() - self.log = logging.getLogger(__name__) - self.mkquery = mkquery - - self.nlp = spacy.load("en_core_web_sm") - abbreviation_pipe = AbbreviationDetector(self.nlp) - self.nlp.add_pipe(abbreviation_pipe) - self.nlp.disable_pipes("tagger", "ner", "parser") - - def match_abrv(self, dataset, datasets): - abrvs = [] - for ds in datasets: - # "!" is a workaround to scispacy error - doc = self.nlp(f"! {ds} ({dataset})") - for abrv in doc._.abbreviations: - if str(abrv) == dataset and str(abrv._.long_form) == ds: - abrvs.append(str(abrv._.long_form)) - abrvs = list(set(abrvs)) - if len(abrvs) == 1: - print(f"abrv. for {dataset}: {abrvs[0]}") - return abrvs[0] - elif len(abrvs) == 0: - return None - else: - print(f"Multiple abrvs. 
for {dataset}: {abrvs}") - return None - - def preproc(self, val, datasets=None): - val = val.strip(',- ') - val = re.sub("dataset", '', val, flags=re.I) - if datasets: - abrv = self.match_abrv(val, datasets) - if abrv: - val += " " + abrv - # if self.case: - # val += (" " +re.sub("([a-z])([A-Z])", r'\1 \2', val) - # +" " +re.sub("([a-zA-Z])([0-9])", r'\1 \2', val) - # ) - return val - - def search(self, query, explain_doc_id=None): - body = self.mkquery(query) - if explain_doc_id is not None: - return self.es.explain('et_taxonomy', doc_type='doc', id=explain_doc_id, body=body) - return self.es.search('et_taxonomy', doc_type='doc', body=body)["hits"] - - def __call__(self, query, datasets, caption): - split_re = re.compile('([^a-zA-Z0-9])') - query = self.preproc(query, datasets).strip() - if caption: - query += " " + self.preproc(caption).strip()[:400] - results = self.search(query) - hits = results["hits"][:3] - df = pd.DataFrame.from_records([ - dict(**hit["_source"], - confidence=hit["_score"] / len(split_re.split(query)), - # Roughly normalize the score not to ignore query length - evidence=query) for hit in hits - ], columns=["dataset", "metric", "task", "confidence", "evidence"]) - if not len(df): - self.log.debug("Elastic query didn't produce any output", query, hits) - else: - scores = [] - for dataset in df["dataset"]: - r = self.search(dataset) - scores.append( - dict(ok_score=r['hits'][0]['_score'] / len(split_re.split(dataset)), - bad_score=r['hits'][1]['_score'] / len(split_re.split(dataset)))) - - scores = pd.DataFrame.from_records(scores) - df['confidence'] = ((scores['ok_score'] - scores['bad_score']) / scores['bad_score']) * df['confidence'] / scores['ok_score'] - return df[["dataset", "metric", "task", "confidence", "evidence"]] - -float_pm_re = re.compile(r"(±?)([+-]?\s*(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)\s*(%?)") -whitespace_re = re.compile(r"\s+") -def handle_pm(value): - "handle precentage metric" - for match in float_pm_re.findall(value): - if not match[0]: - try: - yield Decimal(whitespace_re.sub("", match[1])) / (100 if match[-1] else 1) - except: - pass - # %% - - -proposal_columns = ['dataset', 'metric', 'task', 'format', 'raw_value', 'model', 'model_type', 'cell_ext_id', - 'confidence', 'parsed', 'struct_model_type', 'struct_dataset'] - - -def generate_proposals_for_table(table_ext_id, matrix, structure, desc, taxonomy_linking, datasets): - # %% - # Proposal generation - def consume_cells(matrix): - for row_id, row in enumerate(matrix): - for col_id, cell in enumerate(row): - yield (row_id, col_id, cell) - - - def annotations(r, c, type='model'): - for nc in range(0, c): - if type in structure[r, nc]: - yield Value(structure[r, nc], matrix[r, nc]) - for nr in range(0, r): - if type in structure[nr, c]: - yield Value(structure[nr, c], matrix[nr, c]) - - - number_re = re.compile(r'^[± Ee /()^0-9.%±_-]{2,}$') - - proposals = [Proposal( - cell=Cell(cell_ext_id=f"{table_ext_id}/{r}.{c}", - table_ext_id=table_ext_id, - row=r, - col=c - ), - # TODO Add table type: sota / error ablation - table_description=desc, - model_values=list(annotations(r, c, 'model')), - dataset_values=list(annotations(r, c, 'dataset')), - raw_value=val) - for r, c, val in consume_cells(matrix) - if structure[r, c] == '' and number_re.match(matrix[r, c].strip())] - - # def empty_proposal(cell_ext_id, reason): - # np = "not-present" - # return dict( - # dataset=np, metric=np, task=np, format=np, raw_value=np, model=np, - # model_type=np, cell_ext_id=cell_ext_id, confidence=-1, 
debug_reason=reason - # ) - - def linked_proposals(proposals): - for prop in proposals: - df = taxonomy_linking(prop.dataset, datasets, desc, debug_info=prop) - assert len(df) == 1 - - metric = df['metric'][0] - - # heuristyic to handle accuracy vs error - first_num = (list(handle_pm(prop.raw_value)) + [0])[0] - format = "{x}" - # if first_num > 1: - # first_num /= 100 - # format = "{x/100}" - if first_num < 1 and '%' not in prop.raw_value: - first_num *= 100 - format = "{100*x}" - if '%' in prop.raw_value: - format += '%' - - # if ("error" in metric or "Error" in metric) and (first_num > 0.5): - if (metric.strip().lower() == "error") and (first_num > 0.5): - metric = "Accuracy" - - linked = { - 'dataset': df['dataset'][0], - 'metric': metric, - 'task': df['task'][0], - 'format': format, - 'raw_value': prop.raw_value, - 'model': prop.model_name, - 'model_type': prop.model_type, - 'cell_ext_id': prop.cell.cell_ext_id, - 'confidence': df['confidence'][0], - 'struct_model_type': prop.model_type, - 'struct_dataset': prop.dataset - } - yield linked - - # specify columns in case there's no proposal - - proposals = pd.DataFrame.from_records(list(linked_proposals(proposals)), columns=proposal_columns) - - if len(proposals): - proposals["parsed"]=proposals[["raw_value", "format"]].apply( - lambda row: float(extract_value(row.raw_value, row.format)), axis=1) - return proposals - - -def linked_proposals(paper_ext_id, paper, annotated_tables, taxonomy_linking=MatchSearch(), - dataset_extractor=None): - # dataset_extractor=DatasetExtractor()): - proposals = [] - datasets = dataset_extractor.from_paper(paper) - #print(f"Extracted datasets: {datasets}") - for idx, table in enumerate(annotated_tables): - matrix = np.array(table.matrix) - structure = np.array(table.matrix_tags) - tags = 'sota' - desc = table.caption - table_ext_id = f"{paper_ext_id}/{table.name}" - - if 'sota' in tags and 'no_sota_records' not in tags: # only parse tables that are marked as sota - proposals.append(generate_proposals_for_table(table_ext_id, matrix, structure, desc, taxonomy_linking, datasets)) - if len(proposals): - return pd.concat(proposals) - return pd.DataFrame(columns=proposal_columns) - - -def test_link_taxonomy(): - link_taxonomy_raw = MatchSearch() - results = link_taxonomy_raw.search(link_taxonomy_raw.preproc("miniImageNet 5-way 1-shot")) - # assert "Mini-ImageNet - 1-Shot Learning" == results["hits"][0]["_source"]["dataset"], results - results = link_taxonomy_raw.search(link_taxonomy_raw.preproc("CoNLL2003")) - assert "CoNLL 2003 (English)" == results["hits"][0]["_source"]["dataset"], results - results = link_taxonomy_raw.search(link_taxonomy_raw.preproc("AGNews")) - assert "AG News" == results["hits"][0]["_source"]["dataset"], results - link_taxonomy_raw("miniImageNet 5-way 1-shot") - # %% - split_re = re.compile('([^a-zA-Z0-9])') - - # %% - q = "miniImageNet 5-way 1-shot Mini ImageNet 1-Shot Learning" * 1 - r = link_taxonomy_raw.search(q) - f = len(split_re.split(q)) - r['hits'][0]['_score'] / f, r['hits'][1]['_score'] / f, r['hits'][0]['_source'] - # %% - q = "Mini ImageNet 1-Shot Learning" * 1 - r = link_taxonomy_raw.search(q) - f = len(split_re.split(q)) - r['hits'][0]['_score'] / f, r['hits'][1]['_score'] / f, r['hits'][0]['_source'] - # %% - q = "Mini ImageNet 1-Shot" * 1 - r = link_taxonomy_raw.search(q) - f = len(split_re.split(q)) - r['hits'][0]['_score'] / f, r['hits'][1]['_score'] / f, r['hits'][0]['_source'] - # - # # %% - # prop = proposals[1] - # print(prop) - # # todo issue with STS-B matching 
IJB-B - # link_taxonomy_raw(prop.dataset) - - diff --git a/sota_extractor2/models/linking/context_search.py b/sota_extractor2/models/linking/context_search.py deleted file mode 100644 index cb8e732..0000000 --- a/sota_extractor2/models/linking/context_search.py +++ /dev/null @@ -1,381 +0,0 @@ -# metrics[taxonomy name] is a list of normalized evidences for taxonomy name -from collections import Counter - -from sota_extractor2.models.linking.acronym_extractor import AcronymExtractor -from sota_extractor2.models.linking.probs import get_probs, reverse_probs -from sota_extractor2.models.linking.utils import normalize_dataset, normalize_cell, normalize_cell_ws -from scipy.special import softmax -import re -import pandas as pd -import numpy as np -import ahocorasick -from numba import njit, typed, types - -from sota_extractor2.pipeline_logger import pipeline_logger - -metrics = { - 'BLEU': ['bleu'], - 'BLEU score': ['bleu'], - 'Character Error Rate': ['cer', 'cers'], - 'Error': ['error'], - 'Exact Match Ratio': ['exact match'], - 'F1': ['f1', 'f1 score'], - 'F1 score': ['f1', 'f1 score'], - 'MAP': ['map'], - 'Percentage error': ['wer', 'per', 'wers', 'pers', 'word error rate', 'word error rates', 'phoneme error rates', - 'phoneme error rate', 'error', 'error rate', 'error rates'], - 'Word Error Rate': ['wer', 'wers', 'word error rate', 'word error rates', 'error', 'error rate', 'error rates'], - 'Word Error Rate (WER)': ['wer', 'wers', 'word error rate', 'word error rates', 'error', 'error rate', 'error rates'], - 'ROUGE-1': ['r1'], - 'ROUGE-2': ['r2'], - 'ROUGE-F': ['rf'], - 'Precision': ['precision'], - 'Recall': ['recall'], - # RAIN REMOVAL - 'PSNR': ['psnr', 'psnr (db)', 'mean psnr'], - 'SSIM': ['ssim'], - 'UQI': ['uqi'], - 'VIF': ['vif'], - 'SSEQ': ['sseq'], - 'NIQE': ['niqe'], - 'BLINDS-II': ['blinds-ii'], - 'FSIM': ['fsim'], - # SEMANTIC SEGMENTATION - 'Mean iOU': ['miou', 'mean iou', 'mean iu'], - 'Pixel Accuracy': ['pixel accuracy', 'pixel acc', 'pixel acc.'], - 'Class iOU': ['class iou', 'iou cla.'], - 'Category iOU': ['cat iou', 'iou cat.'], - 'Class iiOU': ['class iiou', 'iiou cla.'], - 'Category iiOU': ['cat iiou', 'iiou cat.'], -} - -# datasets[taxonomy name] is a list of normalized evidences for taxonomy name -datasets = { - 'Hub5\'00 Average': ['avg', 'full', 'hub5', 'sum', 'evaluation'], - 'Hub5\'00 Switchboard': ['swbd', 'swb', 'hub5 swb', 'hub5 swbd', 'switchboard'], - 'Hub5\'00 CallHome': ['ch', 'hub5 ch', 'call home', 'chm'], - 'TIMIT': ['timit'], - 'WSJ eval92': ['wsj eval 92', 'eval 92', 'wsj'], - 'WSJ eval93': ['wsj eval 93', 'eval 93', 'wsj'], - 'LibriSpeech test-clean': ['libri speech test clean', 'libri speech', 'test', 'tst', 'clean', 'test clean'], - 'LibriSpeech test-other': ['libri speech test other', 'libri speech', 'test', 'tst', 'other', 'test other', - 'noisy'], - 'Babel Cebuano': ['babel cebuano', 'babel', 'cebuano', 'ceb'], - 'Babel Kazakh': ['babel kazakh', 'babel', 'kazakh', 'kaz'], - 'Babel Kurmanji': ['babel kurmanji', 'babel', 'kurmanji', 'kur'], - 'Babel Lithuanian': ['babel lithuanian', 'babel', 'lithuanian', 'lit'], - 'Babel Telugu': ['babel telugu', 'babel', 'telugu', 'tel'], - 'Babel Tok Pisin': ['babel tok pisin', 'babel', 'tok pisin', 'tok'], - - 'Ask Ubuntu': ['ask ubuntu', 'ask u', 'ubuntu'], - 'Chatbot': ['chatbot'], - 'Web Apps': ['web apps'], - 'CHiME clean': ['chime clean', 'chime', 'clean'], - 'CHiME real': ['chime real', 'chime', 'real'], - 'CHiME simu': ['chime simu', 'chime', 'simu', 'sim', 'simulated'], - 'CHiME-4 real 6ch': ['chime 4 
real 6 ch', 'chime 4', 'real', '6 channel'], - 'AG News': ['ag news', 'ag'], - 'GigaWord': ['gigaword', 'giga'], - 'GEOTEXT': ['geotext', 'geo'], - 'IWSLT 2015 English-Vietnamese': ["iwslt 2015 english vietnamese", "iwslt", "2015", "english vietnamese", "en vi", - "iwslt 15 english vietnamese", "iwslt 15 en vi", "english", "en", "vietnamese", - "vi"], - 'IWSLT2011 English TED Talks': ["iwslt 2011 english ted talks", "iwslt", "2011", "english", "en", "eng", "ted", - "ted talks", "english ted talks"], - 'IWSLT2012 English TED Talks': ["iwslt 2012 english ted talks", "iwslt", "2012", "english", "en", "eng", "ted", - "ted talks", "english ted talks"], - 'IWSLT2014 English-German': ["iwslt 2014 english german", "iwslt", "2014", "english german", "en de", "en", "de", - "english", "german"], - 'Rich Transcription 2002': ["rich transcription 2002", "rich transcription 02", "rt 2002", "2002", "rt 02", "rich", - "transcription"], - 'Rich Transcription 2003': ["richt ranscription 2003", "rich transcription 03", "rt 2003", "2003", "rt 03", "rich", - "transcription"], - 'Rich Transcription 2004': ["rich transcription 2004", "rich transcription 04", "rt 2004", "2004", "rt 04", "rich", - "transcription"], - 'DIRHA English WSJ real': ['dirha english wsj real', 'dirha', 'english', 'en', 'eng', 'real', 'wsj'], - 'DIRHA English WSJ simu': ['dirha english wsj simu', 'dirha', 'english', 'en', 'eng', 'simu', 'wsj', 'simulated'], - 'VCTK clean': ["vctk clean", "vctk", "clean"], - 'VCTK noisy': ["vctk noisy", "vctk", "noisy"], - 'VoxForge American-Canadian': ["vox forge american canadian", "vox forge", "vox", "forge", "american canadian", - "american", "canadian", "us ca"], - 'VoxForge Commonwealth': ["vox forge common wealth", "vox forge", "common wealth", "vox", "forge", "common", - "wealth"], - 'VoxForge European': ["vox forge european", "vox forge", "european", "vox", "forge", "eu"], - 'VoxForge Indian': ["vox forge indian", "vox forge", "indian", "vox", "forge"], - # RAIN REMOVAL - 'Raindrop': ['raindrop'], - 'Rain100H': ['rain100h'], - 'Rain100L': ['rain100l'], - 'Rain12': ['rain12'], - 'Rain800': ['rain800'], - 'Rain1400': ['rain1400'], - 'Real Rain': ['real rain'], - 'Rain in Surveillance': ['ris'], - 'Rain in Driving': ['rid'], - 'DID-MDN': ['did-mdn'], - 'SOTS': ['sots'], - 'Test 1': ['test 1'], - 'RainSynLight25': ['rainsynlight25'], - 'RainSynComplex25': ['rainsyncomplex25'], - 'NTURain': ['nturain'], - 'RainSynAll100': ['rainsynall100'], - 'SPA-DATA': ['spa-data'], - 'LasVR': ['lasvar'], - # SEMANTIC SEGMENTATION - 'PASCAL VOC 2012': ['voc 2012', 'pascal voc 2012'], - 'ADE20K': ['ade20k'], - 'ImageNet': ['imagenet'], - 'Cityscapes': ['cityscapes'], - 'PASCAL-Context': ['pascal-context'], - 'PASCAL-Person-Part': ['pascal-person-part'], - 'ParseNet': ['parsenet'], - 'LIP': ['lip'], -} - -datasets = {k:(v+['test']) for k,v in datasets.items()} -datasets.update({ - 'LibriSpeech dev-clean': ['libri speech dev clean', 'libri speech', 'dev', 'clean', 'dev clean', 'development'], - 'LibriSpeech dev-other': ['libri speech dev other', 'libri speech', 'dev', 'other', 'dev other', 'development', 'noisy'], -}) - -tasks = {} - -# escaped_ws_re = re.compile(r'\\\s+') -# def name_to_re(name): -# return re.compile(r'(?:^|\s+)' + escaped_ws_re.sub(r'\\s*', re.escape(name.strip())) + r'(?:$|\s+)', re.I) - -#all_datasets = set(k for k,v in merged_p.items() if k != '' and not re.match("^\d+$", k) and v.get('NOMATCH', 0.0) < 0.9) -all_datasets = set(normalize_cell_ws(normalize_dataset(y)) for x in datasets.values() for y 
in x) -all_metrics = set(normalize_cell_ws(y) for x in metrics.values() for y in x) -all_tasks = set(normalize_cell_ws(normalize_dataset(y)) for x in tasks.values() for y in x) - -#all_metrics = set(metrics_p.keys()) - -# all_datasets_re = {x:name_to_re(x) for x in all_datasets} -# all_metrics_re = {x:name_to_re(x) for x in all_metrics} -#all_datasets = set(x for v in merged_p.values() for x in v) - -# def find_names(text, names_re): -# return set(name for name, name_re in names_re.items() if name_re.search(text)) - - -def make_trie(names): - trie = ahocorasick.Automaton() - for name in names: - norm = name.replace(" ", "") - trie.add_word(norm, (len(norm), name)) - trie.make_automaton() - return trie - - -single_letter_re = re.compile(r"\b\w\b") -init_letter_re = re.compile(r"\b\w") -end_letter_re = re.compile(r"\w\b") -letter_re = re.compile(r"\w") - - -def find_names(text, names_trie): - text = text.lower() - profile = letter_re.sub("i", text) - profile = init_letter_re.sub("b", profile) - profile = end_letter_re.sub("e", profile) - profile = single_letter_re.sub("x", profile) - text = text.replace(" ", "") - profile = profile.replace(" ", "") - s = set() - for (end, (l, word)) in names_trie.iter(text): - if profile[end] in ['e', 'x'] and profile[end - l + 1] in ['b', 'x']: - s.add(word) - return s - - -all_datasets_trie = make_trie(all_datasets) -all_metrics_trie = make_trie(all_metrics) -all_tasks_trie = make_trie(all_tasks) - - -def find_datasets(text): - return find_names(text, all_datasets_trie) - -def find_metrics(text): - return find_names(text, all_metrics_trie) - -def find_tasks(text): - return find_names(text, all_tasks_trie) - -def dummy_item(reason): - return pd.DataFrame(dict(dataset=[reason], task=[reason], metric=[reason], evidence=[""], confidence=[0.0])) - - - -@njit -def compute_logprobs(taxonomy, reverse_merged_p, reverse_metrics_p, reverse_task_p, - dss, mss, tss, noise, ms_noise, ts_noise, ds_pb, ms_pb, ts_pb, logprobs): - empty = typed.Dict.empty(types.unicode_type, types.float64) - for i, (task, dataset, metric) in enumerate(taxonomy): - logprob = 0.0 - short_probs = reverse_merged_p.get(dataset, empty) - met_probs = reverse_metrics_p.get(metric, empty) - task_probs = reverse_task_p.get(task, empty) - for ds in dss: - # for abbrv, long_form in abbrvs.items(): - # if ds == abbrv: - # ds = long_form - # break - # if merged_p[ds].get('NOMATCH', 0.0) < 0.5: - logprob += np.log(noise * ds_pb + (1 - noise) * short_probs.get(ds, 0.0)) - for ms in mss: - logprob += np.log(ms_noise * ms_pb + (1 - ms_noise) * met_probs.get(ms, 0.0)) - for ts in tss: - logprob += np.log(ts_noise * ts_pb + (1 - ts_noise) * task_probs.get(ts, 0.0)) - logprobs[i] += logprob - #logprobs[(dataset, metric)] = logprob - - -class ContextSearch: - def __init__(self, taxonomy, context_noise=(0.5, 0.2, 0.1), metrics_noise=None, task_noise=None, - ds_pb=0.001, ms_pb=0.01, ts_pb=0.01, debug_gold_df=None): - merged_p = \ - get_probs({k: Counter([normalize_cell(normalize_dataset(x)) for x in v]) for k, v in datasets.items()})[1] - metrics_p = \ - get_probs({k: Counter([normalize_cell(normalize_dataset(x)) for x in v]) for k, v in metrics.items()})[1] - tasks_p = \ - get_probs({k: Counter([normalize_cell(normalize_dataset(x)) for x in v]) for k, v in tasks.items()})[1] - - self.queries = {} - self.taxonomy = taxonomy - self._taxonomy = typed.List() - for t in self.taxonomy.taxonomy: - self._taxonomy.append(t) - self.extract_acronyms = AcronymExtractor() - self.context_noise = context_noise - 
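As an aside on the evidence lookup defined above: `make_trie()` indexes every evidence phrase with its whitespace stripped, and `find_names()` scans the equally stripped text with an Aho-Corasick automaton, so "AG News" in a caption matches the evidence "agnews" regardless of spacing. The sketch below shows only that core idea and deliberately omits the `b`/`i`/`e`/`x` word-boundary profile check; the names and text are made up for illustration.

```python
# Simplified sketch of the make_trie()/find_names() lookup (word-boundary
# filtering omitted). Uses the same pyahocorasick API as the code above.
import ahocorasick

names = ["ag news", "imagenet", "librispeech test clean"]
trie = ahocorasick.Automaton()
for name in names:
    key = name.replace(" ", "")            # index names without whitespace
    trie.add_word(key, (len(key), name))
trie.make_automaton()

text = "We evaluate on AG News and ImageNet."
haystack = text.lower().replace(" ", "")   # strip whitespace from the text too
found = {name for _end, (_length, name) in trie.iter(haystack)}
print(found)                               # expected: {'ag news', 'imagenet'}
```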
self.metrics_noise = metrics_noise if metrics_noise else context_noise - self.task_noise = task_noise if task_noise else context_noise - self.ds_pb = ds_pb - self.ms_pb = ms_pb - self.ts_pb = ts_pb - self.reverse_merged_p = self._numba_update_nested_dict(reverse_probs(merged_p)) - self.reverse_metrics_p = self._numba_update_nested_dict(reverse_probs(metrics_p)) - self.reverse_tasks_p = self._numba_update_nested_dict(reverse_probs(tasks_p)) - self.debug_gold_df = debug_gold_df - - def _numba_update_nested_dict(self, nested): - d = typed.Dict() - for key, dct in nested.items(): - d2 = typed.Dict() - d2.update(dct) - d[key] = d2 - return d - - def _numba_extend_list(self, lst): - l = typed.List.empty_list(types.unicode_type) - for x in lst: - l.append(x) - return l - - def compute_context_logprobs(self, context, noise, ms_noise, ts_noise, logprobs): - context = context or "" - abbrvs = self.extract_acronyms(context) - context = normalize_cell_ws(normalize_dataset(context)) - dss = set(find_datasets(context)) | set(abbrvs.keys()) - mss = set(find_metrics(context)) - tss = set(find_tasks(context)) - dss -= mss - dss -= tss - dss = [normalize_cell(ds) for ds in dss] - mss = [normalize_cell(ms) for ms in mss] - tss = [normalize_cell(ts) for ts in tss] - ###print("dss", dss) - ###print("mss", mss) - dss = self._numba_extend_list(dss) - mss = self._numba_extend_list(mss) - tss = self._numba_extend_list(tss) - compute_logprobs(self._taxonomy, self.reverse_merged_p, self.reverse_metrics_p, self.reverse_tasks_p, - dss, mss, tss, noise, ms_noise, ts_noise, self.ds_pb, self.ms_pb, self.ts_pb, logprobs) - - def match(self, contexts): - assert len(contexts) == len(self.context_noise) - n = len(self._taxonomy) - context_logprobs = np.zeros(n) - - for context, noise, ms_noise, ts_noise in zip(contexts, self.context_noise, self.metrics_noise, self.task_noise): - self.compute_context_logprobs(context, noise, ms_noise, ts_noise, context_logprobs) - keys = self.taxonomy.taxonomy - logprobs = context_logprobs - #keys, logprobs = zip(*context_logprobs.items()) - probs = softmax(np.array(logprobs)) - return zip(keys, probs) - - def __call__(self, query, datasets, caption, debug_info=None): - cellstr = debug_info.cell.cell_ext_id - pipeline_logger("linking::taxonomy_linking::call", ext_id=cellstr, query=query, datasets=datasets, caption=caption) - datasets = " ".join(datasets) - key = (datasets, caption, query) - ###print(f"[DEBUG] {cellstr}") - ###print("[DEBUG]", debug_info) - ###print("query:", query, caption) - if key in self.queries: - # print(self.queries[key]) - # for context in key: - # abbrvs = self.extract_acronyms(context) - # context = normalize_cell_ws(normalize_dataset(context)) - # dss = set(find_datasets(context)) | set(abbrvs.keys()) - # mss = set(find_metrics(context)) - # dss -= mss - ###print("dss", dss) - ###print("mss", mss) - - ###print("Taking result from cache") - p = self.queries[key] - else: - dist = self.match(key) - topk = sorted(dist, key=lambda x: x[1], reverse=True)[0:5] - - entries = [] - for it, prob in topk: - task, dataset, metric = it - entry = dict(task=task, dataset=dataset, metric=metric) - entry.update({"evidence": "", "confidence": prob}) - entries.append(entry) - - # best, best_p = sorted(dist, key=lambda x: x[1], reverse=True)[0] - # entry = et[best] - # p = pd.DataFrame({k:[v] for k, v in entry.items()}) - # p["evidence"] = "" - # p["confidence"] = best_p - p = pd.DataFrame(entries) - - self.queries[key] = p - - ###print(p) - - # error analysis only - if 
self.debug_gold_df is not None: - if cellstr in self.debug_gold_df.index: - gold_record = self.debug_gold_df.loc[cellstr] - if p.iloc[0].dataset == gold_record.dataset: - print("[EA] Matching gold sota record (dataset)") - else: - print( - f"[EA] Proposal dataset ({p.iloc[0].dataset}) and gold dataset ({gold_record.dataset}) mismatch") - else: - print("[EA] No gold sota record found for the cell") - # end of error analysis only - pipeline_logger("linking::taxonomy_linking::topk", ext_id=cellstr, topk=p) - return p.head(1) - - -# todo: compare regex approach (old) with find_datasets(.) (current) -class DatasetExtractor: - def __init__(self): - self.dataset_prefix_re = re.compile(r"[A-Z]|[a-z]+[A-Z]+|[0-9]") - self.dataset_name_re = re.compile(r"\b(the)\b\s*(?P((?!(the)\b)\w+\W+){1,10}?)(test|val(\.|idation)?|dev(\.|elopment)?|train(\.|ing)?\s+)?\bdata\s*set\b", re.IGNORECASE) - - def from_paper(self, paper): - text = paper.text.abstract - if hasattr(paper.text, "fragments"): - text += " ".join(f.text for f in paper.text.fragments) - return self(text) - - def __call__(self, text): - return find_datasets(normalize_cell_ws(normalize_dataset(text))) diff --git a/sota_extractor2/models/linking/execution.py b/sota_extractor2/models/linking/execution.py deleted file mode 100644 index cdf41cd..0000000 --- a/sota_extractor2/models/linking/execution.py +++ /dev/null @@ -1,95 +0,0 @@ -import pandas as pd -from django.db import connection -from IPython.core.display import display - -from sota_extractor2.models.linking.metrics import Metrics -from sota_extractor2.models.linking.format import extract_value - - -def q(query, limit=10, index_col=None): - if limit is not None: - query = query.rstrip(" ;") + f" LIMIT {limit}" - return pd.read_sql(query, connection, index_col=index_col) - -def execute_model_on_papers(model, papers): - proposals = [] - for paper in papers: - print("Parsing ", paper.paper_id) - paper_proposals = model(paper.paper_id, paper, paper.tables) - proposals.append(paper_proposals) - proposals = pd.concat(proposals) - proposals["experiment_name"] = model.__name__ - return proposals.set_index('cell_ext_id') - - -def fetch_gold_sota_records(): - gold_sota_records = q(""" - SELECT sc.id as cell_id, - st.paper_id, - CONCAT(st.paper_id, '/', st.name, '/', sr.row,'.', sr.col) as cell_ext_id, - (SELECT gold_tags FROM sota_cell WHERE (row=sc.row or col=sc.col) and table_id=sc.table_id and gold_tags LIKE 'model%' LIMIT 1) as model_type, - task, dataset, metric, model, format, sc.value as raw_value - FROM - sota_record sr - JOIN sota_cell sc USING (table_id, row, col) - JOIN sota_table st ON (sc.table_id=st.id) - WHERE parser = 'latexml' and dataset != '' and task != '' and metric != '' and model != '';""", limit=None) - gold_sota_records["parsed"] = gold_sota_records[["raw_value", "format"]].apply( - lambda row: float(extract_value(row.raw_value, row.format)), axis=1) - - unparsed = gold_sota_records[gold_sota_records["parsed"] != gold_sota_records["parsed"]] - if len(unparsed): - print("Found unparsed values") - display(unparsed.style.format({'cell_ext_id': - lambda x: f'{x}'}) - ) - - gold_sota_records = gold_sota_records[gold_sota_records["parsed"] == gold_sota_records["parsed"]] - - strip_cols=["task", "dataset", "format", "metric", "raw_value", "model", "model_type"] - gold_sota_records = gold_sota_records.transform( - lambda x: x.str.strip() if x.name in strip_cols else x) - gold_sota_records = gold_sota_records.set_index('cell_ext_id') - return gold_sota_records - -def 
fetch_gold_sota_papers(): - return q(""" - SELECT st.paper_id - FROM - sota_record sr - JOIN sota_cell sc USING (table_id, row, col) - JOIN sota_table st ON (sc.table_id=st.id) - WHERE parser = 'latexml' and dataset != '' and task != '' and metric != '' and model != '' - GROUP BY st.paper_id;""", limit=None)["paper_id"].tolist() - -class Evaluator(): - def __init__(self, model, paper_collection): - self.model = model - self.pc = paper_collection - self.annotated_papers = fetch_gold_sota_papers() - self.raw_proposals = None - - def run_model(self): - papers = [paper for paper in self.pc if paper.paper_id in self.annotated_papers] - self.raw_proposals = execute_model_on_papers(model=self.model, papers=papers) - - def evaluate(self, proposals_filter, track_proposals=False): - if self.raw_proposals is None: - self.run_model() - if track_proposals: - all_proposals = self.raw_proposals.copy(deep=True) - else: - all_proposals = None - proposals = proposals_filter(self.raw_proposals, all_proposals) - gold_sota_records = fetch_gold_sota_records() - df = gold_sota_records.merge(proposals, 'outer', left_index=True, right_index=True, suffixes=['_gold', '_pred']) - df = df.reindex(sorted(df.columns), axis=1) - df = df.fillna('not-present') - if "experiment_name" in df.columns: - del df["experiment_name"] - - metrics = Metrics(df, experiment_name=self.model.__name__) - if track_proposals: - return metrics, all_proposals - else: - return metrics \ No newline at end of file diff --git a/sota_extractor2/models/linking/extractors.py b/sota_extractor2/models/linking/extractors.py deleted file mode 100644 index 695a1b5..0000000 --- a/sota_extractor2/models/linking/extractors.py +++ /dev/null @@ -1,43 +0,0 @@ -import re - -dataset_name_re = re.compile(r"\b(the)\b\s*(?P((?!(the)\b)\w+\W+){1,10}?)(test|val(\.|idation)?|dev(\.|elopment)?|train(\.|ing)?\s+)?\bdata\s*set\b", re.IGNORECASE) - -parens_re = re.compile(r"\([^)]*?\)|\[[^]]*?\]") -def remove_parens(text): - return parens_re.sub("", text) - -def clean_name(name): - return remove_parens(name.strip()).strip() - -year_2k_re = re.compile(r"20(\d\d)") -hyphens_re = re.compile(r"[-_'`–’→]") -ws_re = re.compile(r"\s+") -dataset_prefix_re = re.compile(r"[A-Z]|[a-z]+[A-Z]+|[0-9]") - -def normalize_dataset(name): - name = hyphens_re.sub(" ", name) - name = year_2k_re.sub(r"\1", name) - name = ws_re.sub(" ", name) - return name.strip().lower() - -## temporarily moved to notebook -# class DatasetExtractor: -# def from_paper(self, paper): -# text = paper.text.abstract -# if hasattr(paper.text, "fragments"): -# text += " ".join(f.text for f in paper.text.fragments) -# return self(text) -# -# def __call__(self, text): -# extracted = [clean_name(m.group("name")) for m in dataset_name_re.finditer(text)] -# print("Extracted:", extracted) -# cleaned = [x for x in extracted if dataset_prefix_re.match(x)] -# print("Cleaned:", cleaned) -# return cleaned -# filtered = list(set([x for x in cleaned if normalize_dataset(x) in normalized_datasets])) -# print("Filtered:", filtered) -# return filtered - - -datasets = ['VOT2016', 'Penn Treebank', 'DIV2K', 'SCUT-FBP5500', 'SCUT-FBP', 'ImageNet', 'KITTI', 'Cityscapes', 'Street View House Number', 'MNIST', '1000-class ImageNet', 'CIFAR-10', 'Berkeley Segmentation', 'AFLW', 'BIWI', '300W-LP', 'AFLW2000', 'AFW', 'Stanford Question Answering', 'SQuAD', '80 million tiny images', 'PASCAL VOC 2012', 'ILSVRC-2012 ImageNet', 'CIFAR-100', 'NewsQA', 'COCO', 'Market-1501', 'LSUN', 'Matterport3D', 'Market1501', 'bAbI', 'WikiHop', 'MICC', 'Wild', 
'Yelp', 'SNLI', 'MultiNLI', 'Age', 'Yahoo', 'OMNIGLOT', 'DSTC2', 'Cars', 'CBT', 'CNN', 'Daily Mail', 'Jester', 'Adult', 'LSUN bedroom', 'CUB', 'Caltech-UCSD Birds-200-2011', 'Street View House Numbers', 'TREC QA', 'Realtor360', 'PanoContext', 'Stanford 2D-3D', 'Camelyon16', 'COCO-Stuff', 'Flickr Landscapes', 'ADE20K', 'MSRA', 'OntoNotes', 'Visual Question Answering', 'VQA', 'VQA v2.0', 'Indian Pines', 'Pavia University', 'MR', 'PASCAL3D+', 'PASCAL VOC 2007', 'VOC 2007', 'LSP', 'VIPeR', 'PASCAL VOC', 'ImageNet detection', 'MS-COCO', 'Caltech-UCSD Birds', 'MPII Human Pose', 'CoNLL 2003 NER', 'FCE', 'Cora', 'Wikipedia', 'Switchboard', '1B word', 'SVHN', 'Caltech pedestrian', 'Set5', 'Urban100', 'AVA', 'Charades', 'MMI', 'Extended Cohn-Kanade', 'CKP', 'ICDAR 2015', 'SwDA', 'MRDA', 'ModelNet', 'PASCAL 3D', 'ShapeNet', 'TriviaQA', 'Facescrub', 'NYUV2', 'ShapeNet part', 'WSJ', 'CoNLL03 NER', 'NER', 'CoNLL03', 'LibriSpeech', '300W', 'WN18', 'ILSVRC 2012 classification', 'Penn Tree Bank', 'Cifar-10', 'SQuAD 2.0', 'PTB', 'DukeMTMC-reID', 'CUHK03', 'SearchQA', 'Stanford Natural Language Inference', 'NYU', 'ICVL', 'NYU hand pose', 'WN18RR', 'CoNLL-2005 shared task', 'CoNLL-2012 shared task', 'CoNLL-2005', 'CoNLL-2012', 'ImageNet 2012', '300-W', 'AFLW2000-3D', 'LFW', 'Omniglot', 'PROMISE 2012', 'Twitter', 'Florence', 'SUN-RGBD', 'Microsoft COCO', 'ImageNet classification', 'Something-Something', 'MRC', 'MS MARCO', 'Amazon', 'Alibaba', 'Netflix', 'PASCAL-Person-Part', 'CIHP', 'Pascal VOC', 'MS-Celeb-1M', 'CASIA', 'MegaFace', 'IJB-B', 'ImageNet-1k', 'Places365-Standard', 'SciTail', 'GTSRB', 'GRID', 'BSD', 'LIVE1', 'CNN/Daily Mail', 'Caltech', 'MS COCO', 'Restaurant', 'JSB Chorales', 'CUHK', 'CUFSF', 'JFT-300M', 'CelebA', 'RaFD', 'Amazon Reviews', 'Amazon reviews', 'SemEval', 'Tobacco-3482', 'RVL-CDIP', 'Douban', 'Company\xe2\x88\x97', 'Criteo', 'Semantic Boundaries', 'Caltech-UCSD birds', 'IMDb', 'VGG-Face', 'MoFA', 'FERET', 'iNat2017', 'ScanNet', 'TIMIT', 'VOC 2012', 'SICK', 'IJB-A', 'CACD', 'MSCeleb', 'YTF', 'CACD-VS', 'CityScapes', 'COCO detection', 'Bosch', 'LISA', 'Tsinghua-Tencent', 'FDDB', 'Mikolajczyk', 'Middlebury', 'Kitti', 'ILSVRC2012', 'BSD100', 'LineMod', 'Occlusion', 'GTAV', 'CityPersons', 'ETH', 'INRIA', 'ILSVRC CLS-LOC', 'Caltech-USA', 'BlogCatalog', 'CoNLL', 'MPII', 'Cityscapes', 'Cityscapes', 'CamVid', 'Amazon Review', 'STL-10', 'Imagenet', 'ShapeNet-Part', 'ModelNet40', 'BUS 2017', 'Quora Question Pairs', 'SST', 'MARS', 'PRW', 'BSD68', 'IMDB', 'ASPEC', 'OTB-2015', 'VOT-2017 public', 'Tejani', 'LineMOD', 'CASIA WebFace', 'Flying Chairs', 'FLIC', 'Set14 \xc3\x974', 'Human3.6M', 'Google News', 'Jobs', 'WikiText-2', 'Rotten Tomatoes', 'RCV1', 'WIDER FACE val', 'WIDER FACE', 'COCO', 'PoseTrack', 'HPatches', 'MHP v2.0', 'Buffy', 'ShapeNetCore', 'EVAL', 'MAFA', 'iPinYou', 'CASIA-WebFace', 'JANUS CS2', 'Cross-City', 'GTA5', 'SYNTHIA', 'MovieLens-100k', 'MovieLens-1M', 'LAMBADA', 'bAbi', 'Visual Genome', 'Visual-7W', 'Google-Ref', 'CelebA-HQ', 'PASCAL', 'QASent', 'WikiQA', 'Online Products', 'FB15k-237', 'MovieLens 1M', 'REST', 'Yosemite', 'PASCAL faces', 'MusicNet', 'Multi-MNIST', 'CLEVR', 'Quora', 'Who Did What', 'Children\xe2\x80\x99s Book', 'Set14', 'CFP', 'CTW1500', 'Weizmann Horse', 'ReVerb45K', 'AG\xe2\x80\x99s News', 'WMT En\xe2\x86\x92Fr', 'WMT En\xe2\x86\x92De', 'CNN/DailyMail', 'NYT', 'ECCV HotOrNot', 'bAbI story-based QA', 'PPI', 'Mini-ImageNet', 'ITOP', 'YCB-Video', 'DFW', 'ACL-ARC', 'SciCite', 'HumanEva', 'LINEMOD', 'Occlusion LINEMOD', 'Face Detection', 'UP-3D', 'WT2', 
'PASCAL-Context', 'TREC', 'WDW', 'Shoulder-Pain', 'MovieLens', 'CT-150', 'WMT', 'CMU-MOSI', 'IEMOCAP', 'MPII Multi-Person Pose', '91-image', 'CoNLL 2003', 'COCO keypoint detection', 'WiderFace', 'Extended Yale B', 'Hutter Prize', 'SST-1', 'CUB-200-2011', 'Cars196', 'Stanford Online Products', 'Caltech and KITTI', 'BRATS', 'E2E', 'TV', 'Laptop', 'CIFAR', 'CHALL_H80K', 'VQA v2', 'NYU depth', 'NYUD', 'Cityscape', 'IBUG', 'BP4D', 'CAF', 'LexNorm2015', 'YouTube Face', 'DAQUAR', 'NYUDv2', 'SmallTobacco', 'BigTobacco', 'TID2013', 'CK+', 'PubMed 20k', 'WAF', 'MPII Multi-Person', 'GTA', 'PCSO mugshot', 'CIFAR100', 'ImageNet', 'MHP', 'CompCars', 'CUB200-2011 bird', 'CUHK03 labeled', 'Stanford 2D-3D annotation', 'Reddit', 'Stanford SQuAD', 'Graph Reachability', 'AIDA-B', 'VGG face', 'Yahoo! Answer', 'AR', 'Caltech Pedestrian', 'CARS-196', 'Pascal Context', 'Scan2CAD', 'Tiny Images', 'CAT', 'CIFAR10', 'JFT', 'PA-100K', 'VOC2007', 'Wikihop', 'PASCAL face', 'MPQA', 'NELL995', 'NELL-995', 'ShanghaiTech', 'SARC', 'Pol', 'CUHK03 detected', 'Celeb-Seq', 'ICDAR2015 Incidental Scene Text', 'Stanford Sentiment Treebank', 'CoQA', 'Massachusetts roads', 'MPIIGaze', 'SBD', 'InsuranceQA', 'ETHZ', 'Landmarks', 'H36M', 'OccludedLINEMOD', 'UCF101', 'RGBD', 'USPS', 'Visual QA', 'COCO-QA', 'Vid4', 'DAVIS-10'] -normalized_datasets = [normalize_dataset(ds) for ds in datasets] diff --git a/sota_extractor2/models/linking/format.py b/sota_extractor2/models/linking/format.py deleted file mode 100644 index 3c07fd0..0000000 --- a/sota_extractor2/models/linking/format.py +++ /dev/null @@ -1,37 +0,0 @@ -import re -from decimal import Decimal, ROUND_DOWN, ROUND_HALF_UP, InvalidOperation - -float_value_re = re.compile(r"([+-]?(?:(?:\d{1,2}(?:,\d{3})+|\d+)(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)") -float_value_nc = re.compile(r"(?:[+-]?(?:(?:\d{1,2}(?:,\d{3})+|\d+)(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)") -par_re = re.compile(r"\{([^\}]*)\}") -escaped_whitespace_re = re.compile(r"(\\\s)+") - -def format_to_regexp(format): - placeholders = par_re.split(format.strip()) - regexp = "" - fn=lambda x: x - for i, s in enumerate(placeholders): - if i % 2 == 0: - if s.strip() == "": - regexp += escaped_whitespace_re.sub(r"\\s+", re.escape(s)) - else: - regexp += escaped_whitespace_re.sub(r"\\s*", re.escape(s)) - elif s.strip() == "": - regexp += float_value_nc.pattern - else: - regexp += float_value_re.pattern - ss = s.strip() - if ss == "100*x" or ss == "100x": - fn = lambda x: 100*x - elif ss == "x/100": - fn = lambda x: x/100 - #return re.compile('^'+regexp+'$'), fn - return re.compile('^' + regexp), fn - -def extract_value(cell_value, format): - cell_value = re.sub(r"\s+%", "%", cell_value) - regexp, fn = format_to_regexp(format) - match = regexp.match(cell_value.strip()) - if match is None or not len(match.groups()): - return Decimal('NaN') - return fn(Decimal(match.group(1))) \ No newline at end of file diff --git a/sota_extractor2/models/linking/linker.py b/sota_extractor2/models/linking/linker.py deleted file mode 100644 index 5a12aa3..0000000 --- a/sota_extractor2/models/linking/linker.py +++ /dev/null @@ -1,19 +0,0 @@ -from .bm25_naive import linked_proposals -from ...pipeline_logger import pipeline_logger - - -class Linker: - step = "linking" - - def __init__(self, name, taxonomy_linking, dataset_extractor): - self.taxonomy_linking = taxonomy_linking - self.dataset_extractor = dataset_extractor - self.__name__ = name - - def __call__(self, paper, tables): - pipeline_logger(f"{Linker.step}::call", paper=paper, tables=tables) - proposals = 
linked_proposals(paper.paper_id, paper, tables, - taxonomy_linking=self.taxonomy_linking, - dataset_extractor=self.dataset_extractor).set_index('cell_ext_id') - pipeline_logger(f"{Linker.step}::linked", paper=paper, tables=tables, proposals=proposals) - return proposals diff --git a/sota_extractor2/models/linking/metrics.py b/sota_extractor2/models/linking/metrics.py deleted file mode 100644 index 2003f6b..0000000 --- a/sota_extractor2/models/linking/metrics.py +++ /dev/null @@ -1,139 +0,0 @@ -from fastai.text import * -from sklearn.metrics import confusion_matrix -import seaborn as sn -import pandas as pd -import matplotlib.pyplot as plt -from dataclasses import dataclass -from IPython.display import HTML, display - -@dataclass -class CM: - tp: float = 0 - fn: float = 0 - fp: float = 0 - tn: float = 0 - -class Metrics: - def __init__(self, df, experiment_name="unk"): - # TODO fix this, it mask the fact that our model may return more values than it should for "model - #self.df = df[~df["model_type_gold"].str.contains('not-present') | df["model_type_pred"].str.contains('model-best')] - self.df = df[df["model_type_gold"].str.contains('model-best') | df["model_type_pred"].str.contains('model-best')] - self.experiment_name = experiment_name - self.metric_type = 'best' - - def matching(self, *col_names): - return np.all([self.df[f"{name}_pred"] == self.df[f"{name}_gold"] for name in col_names], axis=0) - - def matching_fraction(self, *col_names): - return self.matching(*col_names).sum() / len(self.df) - - def is_predicted_as_relevant(self, *col_names): - np.all([self.df[f"{name}_pred"]]) - - def binary_confusion_matrix(self, *col_names, best_only=True): - relevant_gold = self.df["model_type_gold"].str.contains('model-best') - if best_only: - relevant_pred = self.df["model_type_pred"].str.contains('model-best') - else: - relevant_pred = relevant_gold - # present_pred = np.all([self.df[f"{name}_pred"] != 'not-present' for name in col_names], axis=0) - - pred_positive = relevant_pred # & present_pred - gold_positive = relevant_gold - equal = self.matching(*col_names) - - tp = (equal & pred_positive & gold_positive).sum() - tn = (equal & ~pred_positive & ~gold_positive).sum() - fp = (pred_positive & (~equal | ~gold_positive)).sum() - fn = (gold_positive & (~equal | ~pred_positive)).sum() - - return CM(tp=tp, tn=tn, fp=fp, fn=fn) - - def calc_metric(self, metric_name, metric_fn, *col_names, best_only=True): - prefix = "best_" if best_only else "" - result = {f"{prefix}{metric_name}_{col}": metric_fn(self.binary_confusion_matrix(col, best_only=best_only)) for col in col_names} - if len(col_names) > 1: - cm = self.binary_confusion_matrix(*col_names, best_only=best_only) - result[f"{prefix}{metric_name}_all"] = metric_fn(cm) - result[f"{prefix}TP_all"] = cm.tp - result[f"{prefix}FP_all"] = cm.fp - - # Hack to present count on which precision is done - relevant_gold = self.df["model_type_gold"].str.contains('model-best') - if best_only: - relevant_pred = self.df["model_type_pred"].str.contains('model-best') - else: - relevant_pred = relevant_gold - result[f"{prefix}count"] = (relevant_pred | relevant_gold).sum() - - return result - - def accuracy(self, *col_names): - result = {f"matching_accuracy_{col}": self.matching_fraction(col) for col in col_names} - if len(col_names) > 1: - result['matching_accuracy_all'] = self.matching_fraction(*col_names) - result["matching_count"] = len(self.df) - return result - - # True Positive - m - # False Positive - cell marked as relevant but with incorrect values - - 
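For reference, the `format.py` helpers removed a little above turn a format string such as `"{x}"` or `"{100*x}"` into a regexp plus a rescaling function, which is how raw cell values are converted into the `parsed` column used by the metrics below. A small usage sketch, run against the pre-removal module (the import path no longer exists after this diff); the sample cell values are made up.

```python
# Illustrative usage of extract_value() from the removed
# sota_extractor2/models/linking/format.py.
from decimal import Decimal
from sota_extractor2.models.linking.format import extract_value  # pre-removal path

print(extract_value("85.3", "{x}"))       # Decimal('85.3')
print(extract_value("0.853", "{100*x}"))  # Decimal('85.300'), i.e. rescaled to percent
print(extract_value("12.3 %", "{x}%"))    # whitespace before '%' is normalised -> Decimal('12.3')
print(extract_value("n/a", "{x}"))        # Decimal('NaN') when no number matches
```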
def confusion_matrix(self, name): - pred_y = np.array(self.df[f"{name}_pred"]) - true_y = np.array(self.df[f"{name}_gold"]) - labels = list(sorted(set(list(true_y) + list(pred_y)))) - cm = confusion_matrix(true_y, pred_y, labels) - return cm, labels - - def plot_confusion_matrix(self, name): - cm, target_names = self.confusion_matrix(name) - # cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] - df_cm = pd.DataFrame(cm, index=[i for i in target_names], - columns=[i for i in target_names]) - plt.figure(figsize=(20, 20)) - ax = sn.heatmap(df_cm, - annot=True, - square=True, - fmt="d", - cmap="YlGnBu", - mask=cm == 0, - linecolor="black", - linewidths=0.01) - ax.set_ylabel("True") - ax.set_xlabel("Predicted") - - def precision(self, *col_names, best_only=True): - return self.calc_metric("precision", lambda cm: cm.tp / (cm.tp + cm.fp), *col_names, best_only=best_only) - - def recall(self, *col_names, best_only=True): - return self.calc_metric("recall", lambda cm: cm.tp / (cm.tp + cm.fn), *col_names, best_only=best_only) - - def metrics(self): - cols = ["model_type", "dataset", "metric", "task", "parsed"] - m = self.accuracy(*cols) - m.update(self.precision(*cols, best_only=True)) - m.update(self.recall(*cols, best_only=True)) - - m["experiment_name"] = self.experiment_name - m["test_type"] = self.metric_type - - df = pd.DataFrame([(k,v) for k,v in m.items()], columns=["metric", "value"]).set_index("metric") - return df - - def errors(self, *col_names): - cols = col_names - if not cols: - cols = ["model_type", "dataset", "metric", "task", "parsed"] - return self.df[~self.matching(*cols)] - - def show(self, df): - df = df.copy() - df['cell_id'] = df.index.map( - lambda x: 'link'.format(x)) - old_width = pd.get_option('display.max_colwidth') - pd.set_option('display.max_colwidth', -1) - display(HTML(df.to_html(escape=False))) - pd.set_option('display.max_colwidth', old_width) - - def show_errors(self): - self.show(self.errors()) \ No newline at end of file diff --git a/sota_extractor2/models/linking/probs.py b/sota_extractor2/models/linking/probs.py deleted file mode 100644 index 14a4fd0..0000000 --- a/sota_extractor2/models/linking/probs.py +++ /dev/null @@ -1,53 +0,0 @@ -from collections import Counter - - -def get_probs(occurrences): - """ - Computes conditional probabilities based on frequency of co-occurrences - - Parameters - ---------- - occurrences: occurences[x][y] number of times with (X=x and Y=y) - - Returns - ------- - probs : probs[x][y] = Pr(Y=y | X=x) - reverse_probs : reverse_probs[y][x] = Pr(X=x | Y=y) - """ - probs = {} - reverse_probs = {} - y_occ = Counter() - for x, ys in occurrences.items(): - total = sum(ys.values()) - probs[x] = {} - for y, occ in ys.items(): - probs[x][y] = occ / total - y_occ[y] += occ - for x, ys in occurrences.items(): - for y, occ in ys.items(): - reverse_probs.setdefault(y, {})[x] = occ / y_occ[y] - - return probs, reverse_probs - - -def reverse_probs(probs): - """ - Reverses the conditional probability assuming that variables are uniformly distributed - - Parameters - ---------- - probs : probs[x][y] = Pr(Y=y | X=x) - - Returns - ------- - reverse : reverse[y][x] = Pr(X=x | Y=y) assuming X is uniform - """ - reverse = {} - for x, probs_x in probs.items(): - for y, p in probs_x.items(): - reverse.setdefault(y, {})[x] = p - for y, probs_y in reverse.items(): - norm = sum(probs_y.values()) - for x, p in probs_y.items(): - probs_y[x] = p / norm - return reverse diff --git a/sota_extractor2/models/linking/proposals_filters.py 
b/sota_extractor2/models/linking/proposals_filters.py deleted file mode 100644 index 649bcea..0000000 --- a/sota_extractor2/models/linking/proposals_filters.py +++ /dev/null @@ -1,140 +0,0 @@ -from ...pipeline_logger import pipeline_logger -import pandas as pd -from enum import Enum - - -class FilterOutReason(Enum): - TrainDataset = "train-dataset" - DevDataset = "dev-dataset" - EmptyModelName = "empty-model-name" - ModelCompeting = "model-competing" - - -class ProposalsFilter: - step = "proposals_filtering" - - def _filter(self, proposals): - raise NotImplementedError - - def filter(self, proposals): - which, reason = self._filter(proposals) - self.log(proposals=proposals, which=which, reason=reason) - return which, reason - - def __rshift__(self, other): - return CompoundFilter([self, other]) - - def __call__(self, proposals): - which, reason = self.filter(proposals) - return proposals[which] - - def log(self, **kwargs): - pipeline_logger(f"filtering::{self.step}::filtered", **kwargs) - - -class CompoundFilter(ProposalsFilter): - step = "compound_filtering" - - def __init__(self, filters): - self.filters = filters - - def _filter(self, proposals): - agg_which = pd.Series(data=True, index=proposals.index) - agg_reason = pd.Series(data="", index=proposals.index) - - for f in self.filters: - which, reason = f.filter(proposals) - agg_reason[agg_which & ~which] = reason - agg_which &= which - proposals = proposals[which] - return agg_which, agg_reason[~agg_which] - - -class NopFilter(ProposalsFilter): - step = "nop_filtering" - - def _filter(self, proposals): - which = pd.Series(data=True, index=proposals.index) - reason = pd.Series() - return which, reason - - -# filter proposals for which structure prediction -# * was unable to find model type or -# * found dataset cell containing "dev" or "train" -# this filter could be applied before taxonomy linking, -# but to make error analysis easier it's applied after -class StructurePredictionFilter(ProposalsFilter): - step = "structure_filtering" - - def _filter(self, proposals): - which = (proposals.struct_model_type != '') \ - & ~proposals.struct_dataset.str.contains('dev') \ - & ~proposals.struct_dataset.str.contains('train') - reason = pd.Series(data="", index=proposals.index) - reason[proposals.struct_dataset.str.contains('train')] = "train-dataset" - reason[proposals.struct_dataset.str.contains('dev')] = "dev-dataset" - reason[proposals.struct_model_type == ''] = "empty-model-type" - - return which, reason[~which] - - -class ConfidenceFilter(ProposalsFilter): - step = "confidence_filtering" - - def __init__(self, confidence=-1): - self.confidence = confidence - - def _filter(self, proposals): - which = proposals.confidence >= self.confidence - reason = "confidence " + proposals[~which].confidence.round(2).astype(str) + f" < {self.confidence}" - return which, reason[~which] - - def log(self, **kwargs): - super().log(**kwargs, confidence=self.confidence) - - -class BestResultFilter(ProposalsFilter): - step = "best_result_filtering" - - def __init__(self, taxonomy, context="paper"): - assert context in ["paper", "table"] - self.metrics_info = taxonomy.metrics_info - self.context = context - - def _filter(self, proposals): - reason = pd.Series(data="", index=proposals.index) - indices = [] - - if self.context == "paper": - context_column = proposals.index.to_series().str.split('/', expand=False).apply(lambda x: x[0]) - else: - context_column = proposals.index.to_series().str.split('/', expand=False).apply(lambda x: x[0] + "/" + x[1]) - - for 
key_all, group in proposals[(proposals.model_type == 'model-best') & ~proposals.parsed.isna()].groupby( - by=["dataset", "metric", "task", context_column]): - dataset, metric, task, paper = key_all - key = (task, dataset, metric) - d = 0 - if key in self.metrics_info: - d = self.metrics_info[key] - elif metric in self.metrics_info: - d = self.metrics_info[metric] - elif 'error' in metric.lower(): - d = -1 - elif 'accuracy' in metric.lower(): - d = 1 - - if d >= 0: - index = group.parsed.idxmax() - else: - index = group.parsed.idxmin() - indices.append(index) - reason[group.index[group.index != index]] = "replaced by " + str(index) - - reason[proposals.struct_model_type == 'model-competing'] = "model-competing" - which = proposals.index.to_series().isin(indices) - return which, reason[~which] - - def log(self, **kwargs): - super().log(**kwargs, context=self.context) diff --git a/sota_extractor2/models/linking/taxonomy.py b/sota_extractor2/models/linking/taxonomy.py deleted file mode 100644 index 58f6794..0000000 --- a/sota_extractor2/models/linking/taxonomy.py +++ /dev/null @@ -1,28 +0,0 @@ -from pathlib import Path -import json -from collections import OrderedDict - - - -class Taxonomy: - def __init__(self, taxonomy, metrics_info): - self.taxonomy = self._read_taxonomy(taxonomy) - self.metrics_info = self._read_metrics_info(metrics_info) - - def _read_json(self, path): - with open(path, "rt") as f: - return json.load(f) - - def _read_taxonomy(self, path): - records = self._read_json(path) - return [(r["task"], r["dataset"], r["metric"]) for r in records] - - def _read_metrics_info(self, path): - records = self._read_json(path) - metrics_info = {} - for r in records: - task, dataset, metric = r['task'], r['dataset'], r['metric'] - d = 1 if r['higher_is_better'] else -1 - metrics_info[(task, dataset, metric)] = d - metrics_info[metric] = metrics_info.get(metric, 0) + d - return metrics_info diff --git a/sota_extractor2/models/linking/utils.py b/sota_extractor2/models/linking/utils.py deleted file mode 100644 index bec7693..0000000 --- a/sota_extractor2/models/linking/utils.py +++ /dev/null @@ -1,54 +0,0 @@ -from unidecode import unidecode -import re - -# cleaning & normalization -parens_re = re.compile(r"\([^)]*?\)|\[[^]]*?\]") - -strip_nonalnum_re = re.compile(r"^\W*(\w.*\b)\W*$") -def strip_nonalnum(s): - m = strip_nonalnum_re.match(s) - if m: - return m.group(1) - return "" - -def remove_parens(text): - return parens_re.sub("", text) - -def clean_name(name): - return remove_parens(unidecode(name).strip()).strip() - -def clean_cell(cell): - return strip_nonalnum(clean_name(cell)) - -year_2k_re = re.compile(r"20(\d\d)") -hyphens_re = re.compile(r"[-_'`–’→]") -ws_re = re.compile(r"\s+") - - -refs_re = re.compile(r"(xxtable-)?xxanchor-[^ ]*|xxref-[^ ]*") - -def remove_references(s): - return refs_re.sub("", s) - -def normalize_dataset_ws(name): - name = remove_references(name) - name = hyphens_re.sub(" ", name) - name = year_2k_re.sub(r"\1", name) - name = ws_re.sub(" ", name) - return unidecode(name.strip().lower()) - -def normalize_dataset(name): - name = remove_references(name) - name = hyphens_re.sub("", name) - name = year_2k_re.sub(r"\1", name) - name = ws_re.sub(" ", name) - return unidecode(name.strip().lower()) - - -def normalize_cell(s): - return unidecode("".join([x for x in s if x.isalnum()])) - -def normalize_cell_ws(s): - return unidecode("".join([x for x in s if x.isalnum() or x.isspace()])) - -# end of cleaning & normalization \ No newline at end of file diff --git 
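The deleted `BestResultFilter` keeps, per (task, dataset, metric) group, the proposal whose parsed value is best according to the taxonomy's higher-is-better direction. A simplified sketch of that selection (the `metrics_info` map and proposal values are hypothetical, and the fallback heuristics for unknown metrics are reduced to a default):

```python
import pandas as pd

# +1 means higher is better, -1 means lower is better
metrics_info = {("Image Classification", "ImageNet", "Top 1 Accuracy"): 1, "Error": -1}

proposals = pd.DataFrame({
    "task": ["Image Classification"] * 3,
    "dataset": ["ImageNet"] * 3,
    "metric": ["Top 1 Accuracy"] * 3,
    "parsed": [76.1, 74.3, 75.8],
})

best_rows = []
for (task, dataset, metric), group in proposals.groupby(["task", "dataset", "metric"]):
    direction = metrics_info.get((task, dataset, metric), metrics_info.get(metric, 1))
    best_rows.append(group.parsed.idxmax() if direction >= 0 else group.parsed.idxmin())

print(proposals.loc[best_rows])  # keeps the 76.1 row
```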
a/sota_extractor2/models/structure/__init__.py b/sota_extractor2/models/structure/__init__.py deleted file mode 100644 index a6bbda1..0000000 --- a/sota_extractor2/models/structure/__init__.py +++ /dev/null @@ -1,70 +0,0 @@ -import re -import numpy as np -import pandas as pd -from ...helpers.training import set_seed -from ... import config -from .type_predictor import TableTypePredictor, TableType -from .structure_predictor import TableStructurePredictor - -__all__ = ["TableType", "TableTypePredictor", "TableStructurePredictor"] - - -def split_by_cell_content(df, seed=42, split_column="cell_content"): - set_seed(seed, "val_split", quiet=True) - contents = np.random.permutation(df[split_column].unique()) - val_split = int(len(contents)*0.1) - val_keys = contents[:val_split] - split = df[split_column].isin(val_keys) - valid_df = df[split] - train_df = df[~split] - len(train_df), len(valid_df) - return train_df, valid_df - - -label_map_4 = { - "model-paper": 1, - "model-best": 1, - "model-competing": 2, - "dataset": 3, - "dataset-sub": 3, - "dataset-task": 3, -} - - -label_map_3 = { - "model-paper": 1, - "model-best": 1, - "model-competing": 2, -} - -label_map_2 = { - "model-paper": 1, - "model-best": 1, - "model-competing": 1, -} - - -class DataBunch: - def __init__(self, train_name, test_name, label_map): - self.label_map = label_map - self.train_df = pd.read_csv(config.datasets_structure/train_name) - self.test_df = pd.read_csv(config.datasets_structure/test_name) - self.transform(self.normalize) - self.transform(self.label) - - def transform(self, fun): - self.train_df = fun(self.train_df) - self.test_df = fun(self.test_df) - - def normalize(self, df): - df = df.drop_duplicates(["text", "cell_content", "cell_type"]).fillna("") - df = df.replace(re.compile(r"(xxref|xxanchor)-[\w\d-]*"), "\\1 ") - df = df.replace(re.compile(r"(^|[ ])\d+\.\d+\b"), " xxnum ") - df = df.replace(re.compile(r"(^|[ ])\d\b"), " xxnum ") - df = df.replace(re.compile(r"\bdata set\b"), " dataset ") - return df - - def label(self, df): - df["label"] = df["cell_type"].apply(lambda x: self.label_map.get(x, 0)) - df["label"] = pd.Categorical(df["label"]) - return df diff --git a/sota_extractor2/models/structure/experiment.py b/sota_extractor2/models/structure/experiment.py deleted file mode 100644 index ca5b9a5..0000000 --- a/sota_extractor2/models/structure/experiment.py +++ /dev/null @@ -1,371 +0,0 @@ -import dataclasses -from dataclasses import dataclass -import json -from pathlib import Path -import numpy as np -import pandas as pd -from sota_extractor2.models.structure.nbsvm import * -from sklearn.metrics import confusion_matrix -from matplotlib import pyplot as plt -import seaborn as sn -from enum import Enum -import pickle - -class Labels(Enum): - OTHER=0 - DATASET=1 - PAPER_MODEL=2 - COMPETING_MODEL=3 - METRIC=4 - EMPTY=5 - - -class LabelsExt(Enum): - OTHER=0 - PARAMS=6 - TASK=7 - DATASET=1 - SUBDATASET=8 - PAPER_MODEL=2 - BEST_MODEL=9 - ENSEMBLE_MODEL=10 - COMPETING_MODEL=3 - METRIC=4 - EMPTY=5 - - -label_map = { - "dataset": Labels.DATASET.value, - "dataset-sub": Labels.DATASET.value, - "model-paper": Labels.PAPER_MODEL.value, - "model-best": Labels.PAPER_MODEL.value, - "model-ensemble": Labels.PAPER_MODEL.value, - "model-competing": Labels.COMPETING_MODEL.value, - "dataset-metric": Labels.METRIC.value -} - -label_map_ext = { - "dataset": LabelsExt.DATASET.value, - "dataset-sub": LabelsExt.SUBDATASET.value, - "model-paper": LabelsExt.PAPER_MODEL.value, - "model-best": LabelsExt.BEST_MODEL.value, - 
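The deleted `split_by_cell_content` holds out whole cell contents, so the same cell text never appears in both train and validation. A small stand-alone sketch of the same idea (the toy frame and `valid_frac` are assumptions for illustration):

```python
import numpy as np
import pandas as pd

def split_by_cell_content(df, seed=42, split_column="cell_content", valid_frac=0.25):
    """Hold out whole cell contents so a cell string never leaks across splits."""
    rng = np.random.RandomState(seed)
    contents = rng.permutation(df[split_column].unique())
    val_keys = set(contents[: int(len(contents) * valid_frac)])
    is_valid = df[split_column].isin(val_keys)
    return df[~is_valid], df[is_valid]

df = pd.DataFrame({"cell_content": ["ResNet-50", "ResNet-50", "BERT", "ImageNet", "CIFAR-10"],
                   "label": [1, 1, 1, 3, 3]})
train_df, valid_df = split_by_cell_content(df)
print(len(train_df), len(valid_df))
```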
"model-ensemble": LabelsExt.ENSEMBLE_MODEL.value, - "model-competing": LabelsExt.COMPETING_MODEL.value, - "dataset-metric": LabelsExt.METRIC.value, - "model-params": LabelsExt.PARAMS.value, - "dataset-task": LabelsExt.TASK.value -} - -# put here to avoid recompiling, used only in _limit_context -elastic_tag_split_re = re.compile("(.*?)") - -# e = Experiment(remove_num=False, drop_duplicates=False, vectorizer='count', -# this_paper=True, merge_fragments=True, merge_type='concat', -# evidence_source='text_highlited', split_btags=True, fixed_tokenizer=True, -# fixed_this_paper=True, mask=False, evidence_limit=None, context_tokens=None, -# analyzer='word', lowercase=True, class_weight='balanced', multinomial_type='multinomial', -# solver='lbfgs', C=0.1, dual=False, penalty='l2', ngram_range=[1, 3], -# min_df=10, max_df=0.9, max_iter=1000, results={}, has_model=False) - -# ULMFiT related parameters -# remove_num, drop_duplicates, this_paper, merge_fragments, merge_type, evidence_source, split_btags -# fixed_tokenizer?, fixed_this_paper (remove), mask, evidence_limit, context_tokens, lowercase -# class_weight? (consider adding support), - -@dataclass -class Experiment: - this_paper: bool = False - merge_fragments: bool = False - merge_type: str = "concat" # "concat", "vote_maj", "vote_avg", "vote_max" - evidence_source: str = "text" # "text" or "text_highlited" - split_btags: bool = False # Test -> Test - fixed_tokenizer: bool = False # if True, and are not split into < b > and < / b > - fixed_this_paper: bool = False # if True and this_paper, filter this_paper before merging fragments - mask: bool = False # if True and evidence_source = "text_highlited", replace ... with xxmask - evidence_limit: int = None # maximum number of evidences per cell (grouped by (ext_id, this_paper)) - context_tokens: int = None # max. 
number of words before and after - lowercase: bool = True - remove_num: bool = True - drop_duplicates: bool = True - mark_this_paper: bool = False - distinguish_model_source: bool = True - - results: dict = dataclasses.field(default_factory=dict) - - has_model: bool = False # either there's already pretrained model or it's a saved experiment and there's a saved model as well - name: str = None - - def _get_next_exp_name(self, dir_path): - dir_path = Path(dir_path) - files = [f.name for f in dir_path.glob("*.exp.json")] - for i in range(100000): - name = f"{i:05d}.exp.json" - if name not in files: - return dir_path / name - raise Exception("You have too many files in this dir, really!") - - @staticmethod - def _dump_pickle(obj, path): - with open(path, 'wb') as f: - pickle.dump(obj, f) - - @staticmethod - def _load_pickle(path): - with open(path, 'rb') as f: - return pickle.load(f) - - def _save_model(self, path): - self._dump_pickle(self._model, path) - - def _load_model(self, path): - self._model = self._load_pickle(path) - return self._model - - def load_model(self): - path = self._path.parent / f"{self._path.stem}.model" - return self._load_model(path) - - def save_model(self, path): - if hasattr(self, "_model"): - self._save_model(path) - - def save(self, dir_path): - dir_path = Path(dir_path) - dir_path.mkdir(exist_ok=True, parents=True) - filename = self._get_next_exp_name(dir_path) - j = dataclasses.asdict(self) - with open(filename, "wt") as f: - json.dump(j, f) - self.save_model(dir_path / f"{filename.stem}.model") - return filename.name - - def to_df(self): - d = dataclasses.asdict(self) - res = d.pop("results") - d.update(res) - row = pd.DataFrame({k: [v] for k, v in d.items()}) - return row - - def new_experiment(self, **kwargs): - # reset this fields unless their provided in load() - kwargs.setdefault("has_model", False) - kwargs.setdefault("results", {}) - return dataclasses.replace(self, **kwargs) - - def update_results(self, **kwargs): - self.results.update(**kwargs) - - def train_model(self, train_df, valid_df): - raise NotImplementedError("train_model should be implemented in subclass") - - def get_trained_model(self, train_df, valid_df): - self._model = self.train_model(train_df, valid_df) - self.has_model = True - return self._model - - def _limit_context(self, text): - parts = elastic_tag_split_re.split(text) - new_parts = [] - end = len(parts) - for i, part in enumerate(parts): - if i % 2 == 0: - toks = tokenize(part) - if i == 0: - toks = toks[-self.context_tokens:] - elif i == end: - toks = toks[:self.context_tokens] - else: - j = len(toks) - 2 * self.context_tokens - if j > 0: - toks = toks[:self.context_tokens] + toks[-self.context_tokens:] - new_parts.append(' '.join(toks)) - else: - new_parts.append(part) - return ' '.join(new_parts) - - - - def _transform_df(self, df): - df.cell_reference = (df.cell_reference != '').astype(str) - df.cell_styles = df.cell_styles.astype(str) - if self.merge_type not in ["concat", "vote_maj", "vote_avg", "vote_max"]: - raise Exception(f"merge_type must be one of concat, vote_maj, vote_avg, vote_max, but {self.merge_type} was given") - if self.mark_this_paper and (self.merge_type != "concat" or self.this_paper): - raise Exception("merge_type must be 'concat' and this_paper must be false") - #df = df[df["cell_type"] != "table-meta"] # otherwise we get precision 0 on test set - if self.evidence_limit is not None: - df = df.groupby(by=["ext_id", "this_paper"]).head(self.evidence_limit) - if self.context_tokens is not None: - 
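The deleted `Experiment.save` serialises the dataclass to the next free, zero-padded `*.exp.json` file and reloads it field by field. A minimal sketch of that persistence pattern with a hypothetical, much smaller dataclass (`TinyExperiment` and the `/tmp/experiments` path are examples, not part of the codebase):

```python
import dataclasses, json
from dataclasses import dataclass, field
from pathlib import Path

@dataclass
class TinyExperiment:
    this_paper: bool = False
    merge_type: str = "concat"
    results: dict = field(default_factory=dict)

    def save(self, dir_path):
        dir_path = Path(dir_path)
        dir_path.mkdir(parents=True, exist_ok=True)
        taken = {f.name for f in dir_path.glob("*.exp.json")}
        name = next(f"{i:05d}.exp.json" for i in range(100000)
                    if f"{i:05d}.exp.json" not in taken)
        with open(dir_path / name, "wt") as f:
            json.dump(dataclasses.asdict(self), f)
        return name

    @classmethod
    def load(cls, path):
        with open(path, "rt") as f:
            return cls(**json.load(f))

e = TinyExperiment(this_paper=True, results={"valid_accuracy": 0.91})
fname = e.save("/tmp/experiments")
print(TinyExperiment.load(Path("/tmp/experiments") / fname))
```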
df.loc["text_highlited"] = df["text_highlited"].apply(self._limit_context) - df.loc["text"] = df["text_highlited"].str.replace("", " ").replace("", " ") - if self.evidence_source != "text": - df = df.copy(True) - if self.mask: - df["text"] = df[self.evidence_source].replace(re.compile(".*?"), " xxmask ") - else: - df["text"] = df[self.evidence_source] - elif self.mask: - raise Exception("Masking with evidence_source='text' makes no sense") - - duplicates_columns = ["text", "cell_content", "cell_type", "row_context", "col_context", "cell_reference", "cell_layout", "cell_styles"] - columns_to_keep = ["ext_id", "cell_content", "cell_type", "row_context", "col_context", "cell_reference", "cell_layout", "cell_styles"] - - if self.mark_this_paper: - df = df.groupby(by=columns_to_keep + ["this_paper"]).text.apply( - lambda x: "\n".join(x.values)).reset_index() - this_paper_map = { - True: "this paper", - False: "other paper" - } - df.text = "xxfld 3 " + df.this_paper.apply(this_paper_map.get) + " " + df.text - df = df.groupby(by=columns_to_keep).text.apply( - lambda x: " ".join(x.values)).reset_index() - elif not self.fixed_this_paper: - if self.merge_fragments and self.merge_type == "concat": - df = df.groupby(by=columns_to_keep + ["this_paper"]).text.apply( - lambda x: "\n".join(x.values)).reset_index() - if self.drop_duplicates: - df = df.drop_duplicates(duplicates_columns).fillna("") - if self.this_paper: - df = df[df.this_paper] - else: - if self.this_paper: - df = df[df.this_paper] - if self.merge_fragments and self.merge_type == "concat": - df = df.groupby(by=columns_to_keep).text.apply( - lambda x: "\n".join(x.values)).reset_index() - if self.drop_duplicates: - df = df.drop_duplicates(duplicates_columns).fillna("") - - if self.split_btags: - df["text"] = df["text"].replace(re.compile(r"(\)"), r" \1 ") - df = df.replace(re.compile(r"(xxref|xxanchor)-[\w\d-]*"), "\\1 ") - if self.remove_num: - df = df.replace(re.compile(r"(^|[ ])\d+\.\d+(\b|%)"), " xxnum ") - df = df.replace(re.compile(r"(^|[ ])\d+(\b|%)"), " xxnum ") - df = df.replace(re.compile(r"\bdata set\b"), " dataset ") - df["label"] = df["cell_type"].apply(lambda x: label_map.get(x, 0)) - if not self.distinguish_model_source: - df["label"] = df["label"].apply(lambda x: x if x != Labels.COMPETING_MODEL.value else Labels.PAPER_MODEL.value) - df["label"] = pd.Categorical(df["label"]) - return df - - def transform_df(self, *dfs): - transformed = [self._transform_df(df) for df in dfs] - if len(transformed) == 1: - return transformed[0] - return transformed - - def _set_results(self, prefix, preds, true_y, true_y_ext=None): - m = metrics(preds, true_y) - r = {} - r[f"{prefix}_accuracy"] = m["accuracy"] - r[f"{prefix}_precision"] = m["precision"] - r[f"{prefix}_recall"] = m["recall"] - r[f"{prefix}_cm"] = confusion_matrix(true_y, preds, labels=[x.value for x in Labels]).tolist() - if true_y_ext is not None: - r[f"{prefix}_cm_full"] = confusion_matrix(true_y_ext, preds, labels=[x.value for x in LabelsExt]).tolist() - self.update_results(**r) - - def evaluate(self, model, train_df, valid_df, test_df): - for prefix, tdf in zip(["train", "valid", "test"], [train_df, valid_df, test_df]): - probs = model.predict_proba(tdf["text"]) - preds = np.argmax(probs, axis=1) - - if self.merge_fragments and self.merge_type != "concat": - if self.merge_type == "vote_maj": - vote_results = preds_for_cell_content(tdf, probs) - elif self.merge_type == "vote_avg": - vote_results = preds_for_cell_content_multi(tdf, probs) - elif self.merge_type == "vote_max": - 
vote_results = preds_for_cell_content_max(tdf, probs) - preds = vote_results["pred"] - true_y = vote_results["true"] - else: - true_y = tdf["label"] - true_y_ext = tdf["cell_type"].apply(lambda x: label_map_ext.get(x, 0)) - self._set_results(prefix, preds, true_y, true_y_ext) - - def show_results(self, *ds, normalize=True, full_cm=True): - if not len(ds): - ds = ["train", "valid", "test"] - for prefix in ds: - print(f"{prefix} dataset") - print(f" * accuracy: {self.results[f'{prefix}_accuracy']:.3f}") - print(f" * μ-precision: {self.results[f'{prefix}_precision']:.3f}") - print(f" * μ-recall: {self.results[f'{prefix}_recall']:.3f}") - suffix = '_full' if full_cm and f'{prefix}_cm_full' in self.results else '' - self._plot_confusion_matrix(np.array(self.results[f'{prefix}_cm{suffix}']), normalize=normalize) - - def _plot_confusion_matrix(self, cm, normalize, fmt=None): - if normalize: - s = cm.sum(axis=1)[:, None] - s[s == 0] = 1 - cm = cm / s - if fmt is None: - fmt = "0.2f" if normalize else "d" - - if len(cm) == 6: - target_names = ["OTHER", "DATASET", "MODEL (paper)", "MODEL (comp.)", "METRIC", "EMPTY"] - else: - target_names = ["OTHER", "params", "task", "DATASET", "subdataset", "MODEL (paper)", "model (best)", - "model (ens.)", "MODEL (comp.)", "METRIC", "EMPTY"] - df_cm = pd.DataFrame(cm, index=[i for i in target_names], - columns=[i for i in target_names]) - plt.figure(figsize=(10, 10)) - ax = sn.heatmap(df_cm, - annot=True, - square=True, - fmt=fmt, - cmap="YlGnBu", - mask=cm == 0, - linecolor="black", - linewidths=0.01) - ax.set_ylabel("True") - ax.set_xlabel("Predicted") - - @classmethod - def load_all(cls, dir_path): - dir_path = Path(dir_path) - return [cls.load(f) for f in dir_path.glob("*.exp.json")] - - @classmethod - def load(cls, path): - # a new field added to the class should not change - # the default behaviour of experiment, so that we - # can load older experiments by setting missing fields - # to their default values - e = cls() - path = Path(path) - with open(path, "rt") as f: - j = json.load(f) - j["name"] = path.name - e = e.new_experiment(**j) - e._path = path - return e - - @classmethod - def experiments_to_df(cls, exps): - dfs = [e.to_df() for e in exps] - df = pd.concat(dfs) - return df - -@dataclass -class NBSVMExperiment(Experiment): - vectorizer: str = "tfidf" - analyzer: str = "word" # "char", "word" or "char_wb" - class_weight: str = None - multinomial_type: str = "manual" # "manual", "ovr", "multinomial" - solver: str = "liblinear" # 'lbfgs' - large, liblinear for small datasets - C: float = 4.0 - dual: bool = True - penalty: str = "l2" - ngram_range: tuple = (1, 2) - min_df: int = 3 - max_df: float = 0.9 - max_iter: int = 1000 - - def train_model(self, train_df, valid_df=None): - nbsvm = NBSVM(experiment=self) - nbsvm.fit(train_df["text"], train_df["label"]) - return nbsvm diff --git a/sota_extractor2/models/structure/nbsvm.py b/sota_extractor2/models/structure/nbsvm.py deleted file mode 100644 index ed4f79b..0000000 --- a/sota_extractor2/models/structure/nbsvm.py +++ /dev/null @@ -1,236 +0,0 @@ -import re -import string -from fastai.text import * # just for utilty functions pd, np, Path etc. 
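`show_results` renders the stored confusion matrices as row-normalised seaborn heatmaps. A compact sketch of that plotting helper on made-up counts:

```python
import numpy as np
import pandas as pd
import seaborn as sn
from matplotlib import pyplot as plt

def plot_confusion_matrix(cm, target_names, normalize=True):
    """Row-normalised heatmap in the style of _plot_confusion_matrix."""
    cm = np.asarray(cm)
    fmt = "d"
    if normalize:
        row_sums = cm.sum(axis=1, keepdims=True).astype(float)
        row_sums[row_sums == 0] = 1.0  # avoid division by zero on empty rows
        cm = cm / row_sums
        fmt = "0.2f"
    df_cm = pd.DataFrame(cm, index=target_names, columns=target_names)
    plt.figure(figsize=(6, 6))
    ax = sn.heatmap(df_cm, annot=True, square=True, fmt=fmt, cmap="YlGnBu",
                    mask=cm == 0, linecolor="black", linewidths=0.01)
    ax.set_ylabel("True")
    ax.set_xlabel("Predicted")

plot_confusion_matrix([[50, 2], [5, 43]], ["OTHER", "DATASET"])
plt.show()
```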
- -from sklearn.linear_model import LogisticRegression -from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer - -from ...helpers.training import set_seed - -def transform_df(df): - df=df.replace(re.compile(r"(xxref|xxanchor)-[\w\d-]*"), "\\1 ") - df=df.replace(re.compile(r"(^|[ ])\d+\.\d+\b"), " xxnum ") - df=df.replace(re.compile(r"(^|[ ])\d\b"), " xxnum ") - df=df.replace(re.compile(r"\bdata set\b"), " dataset ") - df = df.drop_duplicates(["text", "cell_content", "cell_type"]).fillna("") - return df - -def train_valid_split(df, seed=42, by="cell_content"): - set_seed(seed, "val_split") - contents = np.random.permutation(df[by].unique()) - val_split = int(len(contents)*0.1) - val_keys = contents[:val_split] - split = df[by].isin(val_keys) - valid_df = df[split] - train_df = df[~split] - len(train_df), len(valid_df) - return train_df, valid_df - -def get_class_column(y, classIdx): - if len(y.shape) == 1: - return y == classIdx - else: - return y.iloc[:, classIdx] - -def get_number_of_classes(y): - if len(y.shape) == 1: - return len(np.unique(y)) - else: - return y.shape[1] - -re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])') -re_tok_fixed = re.compile( - f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])'.replace('<', '').replace('>', '').replace('/', '')) - -def tokenize(s): - return re_tok.sub(r' \1 ', s).split() - -def tokenize_fixed(s): - return re_tok_fixed.sub(r' \1 ', s).split() - - -class NBSVM: - def __init__(self, experiment): - self.experiment = experiment - - - def pr(self, y_i, y): - p = self.trn_term_doc[y == y_i].sum(0) - return (p+1) / ((y == y_i).sum()+1) - - def get_mdl(self, y): - y = y.values - r = np.log(self.pr(1, y) / self.pr(0, y)) - m = LogisticRegression(C=self.experiment.C, penalty=self.experiment.penalty, - dual=self.experiment.dual, solver=self.experiment.solver, - max_iter=self.experiment.max_iter, class_weight=self.experiment.class_weight) - x_nb = self.trn_term_doc.multiply(r) - return m.fit(x_nb, y), r - - def bow(self, X_train): - self.n = X_train.shape[0] - - tokenizer = tokenize_fixed if self.experiment.fixed_tokenizer else tokenize - if self.experiment.vectorizer == "tfidf": - self.vec = TfidfVectorizer(ngram_range=self.experiment.ngram_range, - tokenizer=tokenizer, - lowercase=self.experiment.lowercase, - analyzer=self.experiment.analyzer, - min_df=self.experiment.min_df, max_df=self.experiment.max_df, - strip_accents='unicode', use_idf=1, - smooth_idf=1, sublinear_tf=1) - elif self.experiment.vectorizer == "count": - self.vec = CountVectorizer(ngram_range=self.experiment.ngram_range, tokenizer=tokenizer, - analyzer=self.experiment.analyzer, - lowercase=self.experiment.lowercase, - min_df=self.experiment.min_df, max_df=self.experiment.max_df, - strip_accents='unicode') - else: - raise Exception(f"Unknown vectorizer type: {self.experiment.vectorizer}") - - return self.vec.fit_transform(X_train) - - def train_models(self, y_train): - self.models = [] - if self.experiment.multinomial_type == "manual": - for i in range(0, self.c): - #print('fit', i) - m, r = self.get_mdl(get_class_column(y_train, i)) - self.models.append((m, r)) - elif self.experiment.multinomial_type == "multinomial" or self.experiment.multinomial_type == "ovr": - m = LogisticRegression(C=self.experiment.C, penalty=self.experiment.penalty, - dual=self.experiment.dual, solver=self.experiment.solver, - max_iter=self.experiment.max_iter, - multi_class=self.experiment.multinomial_type, class_weight=self.experiment.class_weight) - x_nb = self.trn_term_doc - 
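The deleted `nbsvm.py` implements the classic NB-SVM trick: compute the naive-Bayes log-count ratio per feature and fit a logistic regression on count features scaled by that ratio. A toy, binary-label sketch of the core idea (texts and labels below are invented):

```python
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

texts = ["best model on ImageNet", "our approach wins",
         "baseline from prior work", "previous method"]
y = np.array([1, 1, 0, 0])

vec = CountVectorizer()
X = vec.fit_transform(texts)

def pr(X, y, y_i):
    """Smoothed per-class feature counts."""
    p = X[y == y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)

r = np.log(pr(X, y, 1) / pr(X, y, 0))  # naive Bayes log-count ratio
clf = LogisticRegression(C=4.0, max_iter=1000).fit(X.multiply(r), y)
print(clf.predict(vec.transform(["our new best model"]).multiply(r)))
```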
self.models.append(m.fit(x_nb, y_train)) - else: - raise Exception(f"Unsupported multinomial_type {self.experiment.multinomial_type}") - - def fit(self, X_train, y_train): - self.trn_term_doc = self.bow(X_train) - self.c = get_number_of_classes(y_train) - self.train_models(y_train) - - def predict_proba(self, X_test): - test_term_doc = self.vec.transform(X_test) - if self.experiment.multinomial_type == "manual": - preds = np.zeros((len(X_test), self.c)) - for i in range(0, self.c): - m, r = self.models[i] - preds[:, i] = m.predict_proba(test_term_doc.multiply(r))[:, 1] - elif self.experiment.multinomial_type == "multinomial" or self.experiment.multinomial_type == "ovr": - preds = self.models[0].predict_proba(test_term_doc) - else: - raise Exception(f"Unsupported multinomial_type {self.experiment.multinomial_type}") - return preds - - def sort_features_by_importance(self, label): - label = label.value - names = np.array(self.vec.get_feature_names()) - if self.experiment.multinomial_type == "manual": - m, r = self.models[label] - f = m.coef_[0] * np.array(r)[0] - elif self.experiment.multinomial_type == "multinomial": - f = self.models[0].coef_[label] - else: - raise Exception(f"Unsupported multinomial_type {self.experiment.multinomial_type}") - if self.experiment.vectorizer == "tfidf": - f *= self.vec.idf_ - indices = f.argsort()[::-1] - return names[indices], f[indices] - - def get_mismatched(self, df, true_label, predicted_label): - if self.experiment.merge_fragments and self.experiment.merge_type != "concat": - print("warning: the returned results are before merging") - true_label = true_label.value - predicted_label = predicted_label.value - - probs = self.predict_proba(df["text"]) - preds = np.argmax(probs, axis=1) - true_y = df["label"] - - mismatched_indices = (true_y == true_label) & (preds == predicted_label) - mismatched = df[mismatched_indices] - diff = probs[mismatched_indices, true_label] - probs[mismatched_indices, predicted_label] - indices = diff.argsort() - mismatched = mismatched.iloc[indices] - mismatched["pr_diff"] = diff[indices] - return mismatched - - def validate(self, X_test, y_test): - acc = (np.argmax(self.predict_proba(X_test), axis=1) == y_test).mean() - return acc - -def metrics(preds, true_y): - y = true_y - p = preds - acc = (p == y).mean() - tp = ((y != 0) & (p == y)).sum() - fp = ((p != 0) & (p != y)).sum() - fn = ((y != 0) & (p == 0)).sum() - - prec = tp / (fp + tp) - reca = tp / (fn + tp) - return { - "precision": prec, - "accuracy": acc, - "recall": reca, - "TP": tp, - "FP": fp, - } - - -def preds_for_cell_content(test_df, probs, group_by=["cell_content"]): - test_df = test_df.copy() - test_df["pred"] = np.argmax(probs, axis=1) - grouped_preds = test_df.groupby(group_by)["pred"].agg( - lambda x: x.value_counts().index[0]) - grouped_counts = test_df.groupby(group_by)["pred"].count() - results = pd.DataFrame({'true': test_df.groupby(group_by)["label"].agg(lambda x: x.value_counts().index[0]), - 'pred': grouped_preds, - 'counts': grouped_counts}) - return results - -def preds_for_cell_content_multi(test_df, probs, group_by=["cell_content"]): - test_df = test_df.copy() - probs_df = pd.DataFrame(probs, index=test_df.index) - test_df = pd.concat([test_df, probs_df], axis=1) - grouped_preds = np.argmax(test_df.groupby( - group_by)[probs_df.columns].sum().values, axis=1) - grouped_counts = test_df.groupby(group_by)["label"].count() - results = pd.DataFrame({'true': test_df.groupby(group_by)["label"].agg(lambda x: x.value_counts().index[0]), - 'pred': 
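The deleted `metrics` helper computes micro precision/recall with class 0 ("other") as the negative class; note that a non-zero cell predicted as the wrong non-zero class counts as a false positive but not as a false negative. A sketch mirroring that convention:

```python
import numpy as np

def metrics(preds, true_y):
    """Micro precision/recall where class 0 ("other") is the negative class."""
    p, y = np.asarray(preds), np.asarray(true_y)
    acc = (p == y).mean()
    tp = ((y != 0) & (p == y)).sum()
    fp = ((p != 0) & (p != y)).sum()
    fn = ((y != 0) & (p == 0)).sum()
    return {"accuracy": acc,
            "precision": tp / (tp + fp) if tp + fp else 0.0,
            "recall": tp / (tp + fn) if tp + fn else 0.0}

print(metrics(preds=[1, 2, 0, 0, 2], true_y=[1, 2, 1, 0, 3]))
# accuracy 0.6, precision 2/3, recall 2/3
```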
grouped_preds, - 'counts': grouped_counts}) - return results - -def preds_for_cell_content_max(test_df, probs, group_by=["cell_content"]): - test_df = test_df.copy() - probs_df = pd.DataFrame(probs, index=test_df.index) - test_df = pd.concat([test_df, probs_df], axis=1) - grouped_preds = np.argmax(test_df.groupby( - group_by)[probs_df.columns].max().values, axis=1) - grouped_counts = test_df.groupby(group_by)["label"].count() - results = pd.DataFrame({'true': test_df.groupby(group_by)["label"].agg(lambda x: x.value_counts().index[0]), - 'pred': grouped_preds, - 'counts': grouped_counts}) - return results - -def test_model(model, tdf): - probs = model(tdf["text"]) - preds = np.argmax(probs, axis=1) - print("Results of categorisation on text fagment level") - print(metrics(preds, tdf.label)) - - print("Results per cell_content grouped using majority voting") - results = preds_for_cell_content(tdf, probs) - print(metrics(results["pred"], results["true"])) - - print("Results per cell_content grouped with multi category mean") - results = preds_for_cell_content_multi(tdf, probs) - print(metrics(results["pred"], results["true"])) - - print("Results per cell_content grouped with multi category mean - only on fragments from the same paper that the coresponding table") - results = preds_for_cell_content_multi( - tdf[tdf.this_paper], probs[tdf.this_paper]) - print(metrics(results["pred"], results["true"])) diff --git a/sota_extractor2/models/structure/structure_predictor.py b/sota_extractor2/models/structure/structure_predictor.py deleted file mode 100644 index 8463646..0000000 --- a/sota_extractor2/models/structure/structure_predictor.py +++ /dev/null @@ -1,205 +0,0 @@ -from fastai.text import * -from pathlib import Path -import pandas as pd -import numpy as np -import pickle -from .experiment import Labels, label_map -from .ulmfit_experiment import ULMFiTExperiment -import re -from .ulmfit import ULMFiT_SP -from ...pipeline_logger import pipeline_logger -from copy import deepcopy - - -def load_crf(path): - with open(path, "rb") as f: - return pickle.load(f) - - -with_letters_re = re.compile(r"(?:^\s*[a-zA-Z])|(?:[a-zA-Z]{2,})") - -def cut_ulmfit_head(model): - pooling = PoolingLinearClassifier([1], []) - pooling.layers = model[1].layers[:-2] - return SequentialRNN(model[0], pooling) - - -# todo: move to TSP -n_ulmfit_features = 50 -n_fasttext_features = 0 -n_layout_features = 16 -n_features = n_ulmfit_features + n_fasttext_features + n_layout_features -n_classes = 5 - - -class TableStructurePredictor(ULMFiT_SP): - step = "structure_prediction" - - def __init__(self, path, file, crf_path=None, crf_model="crf.pkl", - sp_path=None, sp_model="spm.model", sp_vocab="spm.vocab"): - super().__init__(path, file, sp_path, sp_model, sp_vocab) - - self._full_learner = deepcopy(self.learner) - self.learner.model = cut_ulmfit_head(self.learner.model) - self.learner.loss_func = None - - #todo: make CRF optional - crf_path = Path(path) if crf_path is None else Path(crf_path) - self.crf = load_crf(crf_path / crf_model) - - # todo: clean Experiment from older approaches - self._e = ULMFiTExperiment(remove_num=False, drop_duplicates=False, - this_paper=True, merge_fragments=True, merge_type='concat', - evidence_source='text_highlited', split_btags=True, fixed_tokenizer=True, - fixed_this_paper=True, mask=True, evidence_limit=None, context_tokens=None, - lowercase=True, drop_mult=0.15, fp16=True, train_on_easy=False) - - def preprocess_df(self, raw_df): - return self._e.transform_df(raw_df) - - def 
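Fragment-level predictions are aggregated per cell by the `preds_for_cell_content*` helpers (majority vote, averaged probabilities, or max). A sketch of the majority-vote variant on hypothetical fragments:

```python
import numpy as np
import pandas as pd

# hypothetical per-fragment predictions for two table cells
test_df = pd.DataFrame({"cell_content": ["ResNet-50", "ResNet-50", "ResNet-50", "BLEU"],
                        "label":        [2, 2, 2, 4]})
probs = np.array([[0.1, 0.2, 0.7, 0.0, 0.0],
                  [0.1, 0.6, 0.3, 0.0, 0.0],
                  [0.0, 0.1, 0.9, 0.0, 0.0],
                  [0.2, 0.0, 0.0, 0.0, 0.8]])

def preds_for_cell_content(test_df, probs, group_by=["cell_content"]):
    """Majority vote of fragment-level predictions per cell."""
    df = test_df.copy()
    df["pred"] = np.argmax(probs, axis=1)
    grouped = df.groupby(group_by)
    return pd.DataFrame({
        "true": grouped["label"].agg(lambda x: x.value_counts().index[0]),
        "pred": grouped["pred"].agg(lambda x: x.value_counts().index[0]),
        "counts": grouped["pred"].count(),
    })

print(preds_for_cell_content(test_df, probs))
# ResNet-50 -> pred 2 (two of three fragments vote for class 2), BLEU -> pred 4
```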
keep_alphacells(self, df): - # which = df.cell_content.str.contains(with_letters_re) - which = df.cell_content.str.contains(with_letters_re) - return df[which], df[~which] - - def df2tl(self, df): - text_cols = ["cell_styles", "cell_layout", "text", "cell_content", "row_context", "col_context", - "cell_reference"] - df = df[text_cols] - return TextList.from_df(df, cols=text_cols) - - def get_features(self, evidences, use_crf=True): - if use_crf: - learner = self.learner - else: - learner = self._full_learner - if len(evidences): - tl = self.df2tl(evidences) - learner.data.add_test(tl) - - preds, _ = learner.get_preds(DatasetType.Test, ordered=True) - return preds.cpu().numpy() - return np.zeros((0, n_ulmfit_features if use_crf else n_classes)) - - def to_tables(self, df, transpose=False, n_ulmfit_features=n_ulmfit_features): - X_tables = [] - Y_tables = [] - ids = [] - C_tables = [] - for table_id, frame in df.groupby("table_id"): - rows, cols = frame.row.max()+1, frame.col.max()+1 - x_table = np.zeros((rows, cols, n_features)) - ###y_table = np.ones((rows, cols), dtype=np.int) * n_classes - c_table = np.full((rows, cols), "", dtype=np.object) - for i, r in frame.iterrows(): - x_table[r.row, r.col, :n_ulmfit_features] = r.features - c_table[r.row, r.col] = r.cell_content - #x_table[r.row, r.col, n_ulmfit_features:n_ulmfit_features+n_fasttext_features] = ft_model[r.text] - # if n_fasttext_features > 0: - # x_table[r.row, r.col, n_ulmfit_features:n_ulmfit_features+n_fasttext_features] = ft_model[r.cell_content] - ###y_table[r.row, r.col] = r.label - if n_layout_features > 0: - offset = n_ulmfit_features+n_fasttext_features - layout = r.cell_layout - x_table[r.row, r.col, offset] = 1 if 'border-t' in layout or 'border-tt' in layout else -1 - x_table[r.row, r.col, offset+1] = 1 if 'border-b' in layout or 'border-bb' in layout else -1 - x_table[r.row, r.col, offset+2] = 1 if 'border-l' in layout or 'border-ll' in layout else -1 - x_table[r.row, r.col, offset+3] = 1 if 'border-r' in layout or 'border-rr' in layout else -1 - x_table[r.row, r.col, offset+4] = 1 if r.cell_reference == "True" else -1 - x_table[r.row, r.col, offset+5] = 1 if r.cell_styles == "True" else -1 - for span_idx, span in enumerate(["cb", "ci", "ce", "rb", "ri", "re"]): - x_table[r.row, r.col, offset+6+span_idx] = 1 if f'span-{span}' in r.cell_layout else -1 - x_table[r.row, r.col, offset+12] = 1 if r.row == 0 else -1 - x_table[r.row, r.col, offset+13] = 1 if r.row == rows-1 else -1 - x_table[r.row, r.col, offset+14] = 1 if r.col == 0 else -1 - x_table[r.row, r.col, offset+15] = 1 if r.col == cols-1 else -1 - #x_table[r.row, r.col, -n_fasttext_features:] = ft_model[r.cell_content] - X_tables.append(x_table) - ###Y_tables.append(y_table) - C_tables.append(c_table) - ids.append(table_id) - if transpose: - X_tables.append(x_table.transpose((1, 0, 2))) - ###Y_tables.append(y_table.transpose()) - C_tables.append(c_table.transpose()) - ids.append(table_id) - ###return (X_tables, Y_tables), C_tables, ids - return X_tables, C_tables, ids - - def merge_with_preds(self, df, preds): - if not len(df): - return [] - ext_id = df.ext_id.str.split("/", expand=True) - return list(zip(ext_id[0] + "/" + ext_id[1], ext_id[2].astype(int), ext_id[3].astype(int), - preds, df.text, df.cell_content, df.cell_layout, df.cell_styles, df.cell_reference, df.label)) - - def merge_all_with_preds(self, df, df_num, preds, use_crf=True): - columns = ["table_id", "row", "col", "features", "text", "cell_content", "cell_layout", - "cell_styles", "cell_reference", 
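`to_tables` scatters per-cell feature vectors into a dense `(rows, cols, features)` tensor for each table before the CRF step. A reduced sketch of that scattering (the feature size and toy frame are assumptions; the extra layout features are omitted):

```python
import numpy as np
import pandas as pd

# hypothetical cell-level frame: one row per table cell with a feature vector
cells = pd.DataFrame({
    "table_id": ["1901.00001/table_01.csv"] * 4,
    "row": [0, 0, 1, 1],
    "col": [0, 1, 0, 1],
    "features": [np.random.rand(8) for _ in range(4)],
})

def to_table_tensor(frame, n_features=8):
    """Scatter per-cell feature vectors into a dense (rows, cols, features) grid."""
    rows, cols = frame.row.max() + 1, frame.col.max() + 1
    grid = np.zeros((rows, cols, n_features))
    for _, r in frame.iterrows():
        grid[r.row, r.col] = r.features
    return grid

for table_id, frame in cells.groupby("table_id"):
    print(table_id, to_table_tensor(frame).shape)  # (2, 2, 8)
```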
"label"] - - alpha = self.merge_with_preds(df, preds) - nums = self.merge_with_preds(df_num, np.zeros((len(df_num), n_ulmfit_features if use_crf else n_classes))) - - df1 = pd.DataFrame(alpha, columns=columns) - df2 = pd.DataFrame(nums, columns=columns) - df2.label = n_classes - return df1.append(df2, ignore_index=True) - - # todo: fix numeric cells being labelled as meta / other - def format_predictions(self, tables_preds, test_ids): - num2label = {v: k for k, v in label_map.items()} - num2label[0] = "table-meta" - num2label[Labels.PAPER_MODEL.value] = 'model-paper' - num2label[Labels.DATASET.value] = 'dataset' - num2label[max(label_map.values()) + 1] = '' - - flat = [] - for preds, ext_id in zip(tables_preds, test_ids): - paper_id, table_id = ext_id.split("/") - labels = pd.DataFrame(preds).applymap(num2label.get).values - flat.extend( - [(paper_id, table_id, r, c, labels[r, c]) for r in range(len(labels)) for c in range(len(labels[r])) if - labels[r, c]]) - return pd.DataFrame(flat, columns=["paper", "table", "row", "col", "predicted_tags"]) - - def predict_tags(self, raw_evidences, use_crf=True): - evidences, evidences_num = self.keep_alphacells(self.preprocess_df(raw_evidences)) - pipeline_logger(f"{TableStructurePredictor.step}::evidences_split", evidences=evidences, evidences_num=evidences_num) - features = self.get_features(evidences, use_crf) - df = self.merge_all_with_preds(evidences, evidences_num, features, use_crf) - tables, contents, ids = self.to_tables(df, n_ulmfit_features=n_ulmfit_features if use_crf else n_classes) - if use_crf: - preds = self.crf.predict(tables) - else: - preds = [] - for table in tables: - p = table[..., :n_classes].argmax(axis=-1) - p[table[..., :n_classes].max(axis=-1) == 0.0] = n_classes - preds.append(p) - return self.format_predictions(preds, ids) - - # todo: consider adding sota/ablation information - def label_table(self, paper, table, annotations, in_place): - structure = pd.DataFrame().reindex_like(table.matrix).fillna("") - ext_id = (paper.paper_id, table.name) - if ext_id in annotations: - for _, entry in annotations[ext_id].iterrows(): - # todo: add model-ensemble support - structure.iloc[entry.row, entry.col] = entry.predicted_tags if entry.predicted_tags != "model-paper" else "model-best" - if not in_place: - table = deepcopy(table) - table.set_tags(structure.values) - return table - - # todo: take EvidenceExtractor in constructor - def label_tables(self, paper, tables, raw_evidences, in_place=False, use_crf=True): - pipeline_logger(f"{TableStructurePredictor.step}::label_tables", paper=paper, tables=tables, raw_evidences=raw_evidences) - if len(raw_evidences): - tags = self.predict_tags(raw_evidences, use_crf) - annotations = dict(list(tags.groupby(by=["paper", "table"]))) - else: - annotations = {} # just deep-copy all tables - pipeline_logger(f"{TableStructurePredictor.step}::annotations", paper=paper, tables=tables, annotations=annotations) - labeled = [self.label_table(paper, table, annotations, in_place) for table in tables] - pipeline_logger(f"{TableStructurePredictor.step}::tables_labeled", paper=paper, labeled_tables=labeled) - return labeled diff --git a/sota_extractor2/models/structure/transfo_experiment.py b/sota_extractor2/models/structure/transfo_experiment.py deleted file mode 100644 index 443b67b..0000000 --- a/sota_extractor2/models/structure/transfo_experiment.py +++ /dev/null @@ -1,716 +0,0 @@ -import time - -from .experiment import Experiment -from .nbsvm import preds_for_cell_content, preds_for_cell_content_max, 
preds_for_cell_content_multi -import dataclasses -from dataclasses import dataclass -from typing import Tuple -from sota_extractor2.helpers.training import set_seed -from fastai.text import * -import numpy as np -from pathlib import Path -import json - -import argparse -import glob -import logging -import os -import random - -import numpy as np -import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) -from torch.utils.data.distributed import DistributedSampler - -from fastai.text import * # for utilty functions - -try: - from torch.utils.tensorboard import SummaryWriter -except: - from tensorboardX import SummaryWriter - -from tqdm import tqdm, trange -import tensorflow_datasets - -from transformers import (WEIGHTS_NAME, BertConfig, - BertForSequenceClassification, BertTokenizer, - RobertaConfig, - RobertaForSequenceClassification, - RobertaTokenizer, - XLMConfig, XLMForSequenceClassification, - XLMTokenizer, XLNetConfig, - XLNetForSequenceClassification, - XLNetTokenizer, - DistilBertConfig, - DistilBertForSequenceClassification, - DistilBertTokenizer, DataProcessor, InputExample, AutoConfig) - -from transformers import AdamW, WarmupLinearSchedule - -from transformers import glue_compute_metrics as compute_metrics -from transformers import glue_output_modes as output_modes -from transformers import glue_processors as processors -from transformers import glue_convert_examples_to_features as convert_examples_to_features -from transformers import AutoTokenizer, AutoModelForSequenceClassification, glue_convert_examples_to_features -from transformers.data.processors.glue import glue_processors - - -logger = logging.getLogger(__name__) - - -def train(args, train_dataset, valid_dataset, model, tokenizer): - """ Train the model """ - if args.local_rank in [-1, 0]: - tb_writer = args.get_summary_writer() - - train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) - - if args.max_steps > 0: - t_total = args.max_steps - args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 - else: - t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - - # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) - if args.fp16: - try: - from apex import amp - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) - - # multi-gpu training (should be after apex fp16 initialization) - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Distributed training (should be after apex fp16 initialization) - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - 
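The deleted `train` function groups parameters so that biases and LayerNorm weights receive no weight decay. A minimal PyTorch sketch of that grouping on a toy model, using `torch.optim.AdamW` rather than the older `transformers.AdamW` imported above:

```python
import torch
from torch import nn

model = nn.Sequential(nn.Linear(16, 32), nn.LayerNorm(32), nn.Linear(32, 2))

# exclude biases and LayerNorm weights from weight decay;
# the name patterns below match transformer checkpoints, the toy model only matches "bias"
no_decay = ["bias", "LayerNorm.weight"]
grouped_parameters = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(grouped_parameters, lr=5e-5, eps=1e-8)
```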
find_unused_parameters=True) - - # Train! - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_dataset)) - logger.info(" Num Epochs = %d", args.num_train_epochs) - logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) - logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) - logger.info(" Total optimization steps = %d", t_total) - - global_step = 0 - tr_loss, logging_loss = 0.0, 0.0 - model.zero_grad() - train_iterator = range(int(args.num_train_epochs)) - set_seed(args.seed, "Training", all_gpus=(args.n_gpu > 1)) # Added here for reproductibility (even between python 2 and 3) - mb = master_bar(train_iterator) - mb.first_bar.comment = f'Epochs' - results={} - for epoch in mb: - epoch_iterator = progress_bar(train_dataloader, display=args.local_rank not in [-1, 0], parent=mb) - - for step, batch in enumerate(epoch_iterator): - model.train() - batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids - outputs = model(**inputs) - loss = outputs[0] # model outputs are always tuple in transformers (see doc) - - if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - if args.fp16: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - - tr_loss += loss.item() - if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu: - if args.fp16: - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) - else: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) - - optimizer.step() - scheduler.step() # Update learning rate schedule - model.zero_grad() - global_step += 1 - - if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: - # Log metrics - mb.child.comment = f"loss: {loss}" - tb_writer.add_scalar('train/lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('train/loss', (tr_loss - logging_loss)/args.logging_steps, global_step) - logging_loss = tr_loss - - if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training - model_to_save.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) - logger.info("Saving model checkpoint to %s", output_dir) - #mb.first_bar.comment = f'first bar stat' - #mb.write(f'Finished loop {i}.') - if args.tpu: - args.xla_model.optimizer_step(optimizer, barrier=True) - model.zero_grad() - global_step += 1 - - if args.max_steps > 0 and global_step > args.max_steps: - epoch_iterator.close() - break - if args.local_rank == -1 and args.evaluate_during_training: # Only 
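The inner loop above combines gradient accumulation, gradient clipping and a scheduler step. A stripped-down PyTorch version of the same pattern on a toy model (no fp16, no distributed training, hypothetical data):

```python
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

model = nn.Linear(16, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()
data = TensorDataset(torch.randn(64, 16), torch.randint(0, 2, (64,)))
loader = DataLoader(data, batch_size=8, shuffle=True)

gradient_accumulation_steps = 4
max_grad_norm = 1.0

model.zero_grad()
for step, (x, y) in enumerate(loader):
    loss = loss_fn(model(x), y) / gradient_accumulation_steps  # scale for accumulation
    loss.backward()
    if (step + 1) % gradient_accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()       # a warmup scheduler step would follow here
        model.zero_grad()
```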
evaluate when single GPU otherwise metrics may not average well - results = evaluate(args, model, valid_dataset) - for key, value in results.items(): - tb_writer.add_scalar('eval/{}'.format(key), value, global_step) - mb.first_bar.comment = str(results['acc']) - mb.write(f"Epoch: {epoch} {loss} Accuracy: {results.get('acc', 0)}") - - if args.max_steps > 0 and global_step > args.max_steps: - train_iterator.close() - break - hparams_dict = {k: v for k, v in dataclasses.asdict(args).items() if isinstance(v, (int, float, str, bool,))} - tb_writer.add_hparams(hparam_dict=hparams_dict, metric_dict=results) - - if args.local_rank in [-1, 0]: - tb_writer.close() - - return global_step, tr_loss / global_step - - -def evaluate(args, model, eval_dataset, prefix="", eval_output_dir="/tmp/out"): - # Loop to handle MNLI double evaluation (matched, mis-matched) - results = {} - eval_task = args.task_name - if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: - os.makedirs(eval_output_dir) - - args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - - # Eval! - logger.info("***** Running evaluation {} *****".format(prefix)) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - preds = None - out_label_ids = None - mb = progress_bar(eval_dataloader) - for batch in mb: - model.eval() - batch = tuple(t.to(args.device) for t in batch) - - with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids - outputs = model(**inputs) - tmp_eval_loss, logits = outputs[:2] - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - if args.output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif args.output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(eval_task, preds, out_label_ids) - results.update(result) - results['loss'] = eval_loss - output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results {} *****".format(prefix)) - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - return results - -def prepare_glue_examples(tokenizer, task_name='mrpc', split_name='train'): - processor = glue_processors[task_name]() - - def tf_mrpc_to_pytorch(d): - for ex in d: - ex = processor.get_example_from_tensor_dict(ex) - # ex = processor.tfds_map(ex) - yield ex - - tf_data = tensorflow_datasets.load(f"glue/{task_name}")[split_name] - examples = tf_mrpc_to_pytorch(tf_data) - features = glue_convert_examples_to_features(examples, - tokenizer, - max_length=128, - 
task='mrpc') - - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - all_labels = torch.tensor([f.label for f in features], dtype=torch.long) - - dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) - return dataset - - -def strip_tensors(r): - nr = {} - for k,v in r.items(): - v = v.numpy() - if isinstance(v, bytes): - v = v.decode("utf-8") - else: - v = v.item() - nr[k] = v - return nr - -def glue_dataset_to_df(task_name): - data = tensorflow_datasets.load(f"glue/{task_name}") - new_dict = {} - for name, dataset in data.items(): - new_dict[name] = pd.DataFrame.from_records([strip_tensors(r) for r in dataset], - columns=dataset.output_shapes.keys(), - index='idx') - return new_dict.get('train', None), new_dict.get('validation', None), new_dict.get('test', None) - -def convert_df_to_examples(df, text_a='sentence1', text_b='sentence2', label='label'): - return [InputExample( - idx, - row[text_a], - row[text_b], - str(row[label])) - for idx, row in df.iterrows()] - -def convert_df_to_dataset(tokenizer, df, max_length=128, task='mrpc', text_a='sentence1', text_b='sentence2', label='label', return_labels=False): - label_list = list(sorted(map(str, df[label].unique()))) - examples = convert_df_to_examples(df, text_a, text_b, label) - features = glue_convert_examples_to_features(examples, - tokenizer, - max_length=max_length, - label_list=label_list, - output_mode='classification', - task=None) - - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - all_labels = torch.tensor([f.label for f in features], dtype=torch.long) - - dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) - if return_labels: - return dataset, label_list - return dataset - -@dataclass -class TransfoLearner(): - model: nn.Module - tokenizer: Any - data: Any - -def get_preds(args, model, dataset, ordered=True): - eval_dataset = dataset - eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - - if isinstance(eval_sampler, DistributedSampler) and ordered: - # Note that DistributedSampler samples randomly - raise ValueError("Unable to run distributed get_preds with ordered == True") - logger.info("Num examples = %d", len(eval_dataset)) - logger.info("Batch size = %d", args.eval_batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - mb = progress_bar(eval_dataloader) - preds = [] - labels = [] - try: - with torch.no_grad(): - model.to(args.device) - model.eval() - for batch in mb: - batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None - # XLM, DistilBERT and RoBERTa don't use segment_ids - outputs = model(**inputs) - tmp_eval_loss, logits = outputs[:2] - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - preds.append(logits.detach().cpu()) - 
labels.append(inputs['labels'].detach().cpu()) # add non_blocking=True but somehow it isn't avaliabe in our torch - return torch.cat(preds, dim=0), torch.cat(labels, dim=0) - finally: - model.to("cpu") - -@dataclass -class TransfoDatabunch(): - num_labels: int - train_ds: Any - valid_ds: Any - test_ds: Any - -@dataclass -class TransfoExperiment(Experiment): - test_split: str = None - valid_split: str = None - text_a: str = 'text' - text_b: str = 'cell_content' - label: str = 'label' - #@help("Model type selected in the list: ...") - model_type: str = None - #@help("Path to pre-trained model or shortcut name selected in the list: ...") - pretrained_name: str = None - #@help("The name of the task to train selected in the list: " + "".join(processors.keys())) - task_name: str = None - #@help("Pretrained config name or path if not the same as model_name") - config_name: str = "" - #@help("Pretrained tokenizer name or path if not the same as model_name") - tokenizer_name: str = "" - #@help("Where do you want to store the pre-trained models downloaded from s3") - cache_dir: str = "" - #@help("The maximum total input sequence length after tokenization. Sequences longer than this will be truncated sequences shorter will be padded.") - max_seq_length: int = 128 - #@help("Whether to run training.") - do_train: bool = False - #@help("Whether to run eval on the dev set.") - do_eval: bool = False - #@help("Rul evaluation during training at each logging step.") - evaluate_during_training: bool = False - #@help("Batch size per GPU/CPU for training.") - per_gpu_train_batch_size: int = 8 - #@help("Batch size per GPU/CPU for evaluation.") - per_gpu_eval_batch_size: int = 8 - #@help("Number of updates steps to accumulate before performing a backward/update pass.") - gradient_accumulation_steps: int = 1 - #@help("The initial learning rate for Adam.") - learning_rate: float = 5e-5 - #@help("Weight deay if we apply some.") - weight_decay: float = 0.0 - #@help("Epsilon for Adam optimizer.") - adam_epsilon: float = 1e-8 - #@help("Max gradient norm.") - max_grad_norm: float = 1.0 - #@help("Total number of training epochs to perform.") - num_train_epochs: float = 3.0 - #@help("If > 0: set total number of training steps to perform. Override num_train_epochs.") - max_steps: int = -1 - #@help("Linear warmup over warmup_steps.") - warmup_steps: int = 0 - #@help("Log every X updates steps.") - logging_steps: int = 10 - #@help("Save checkpoint every X updates steps.") - save_steps: int = 50 - #@help("Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - eval_all_checkpoints: bool = False - #@help("Avoid using CUDA when available") - no_cuda: bool = False - #@help("Overwrite the cached training and evaluation sets") - overwrite_cache: bool = False - #@help("random seed for initialization") - seed: int = 42 - #@help("Whether to run on the TPU defined in the environment variables") - tpu: bool = False - #@help("TPU IP address if none are set in the environment variables") - tpu_ip_address: str = '' - #@help("TPU name if none are set in the environment variables") - tpu_name: str = '' - #@help("XRT TPU config if none are set in the environment variables") - xrt_tpu_config: str = '' - - #@help("Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - fp16: bool = False - #@help("For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2' and 'O3']. 
See details at https://nvidia.github.io/apex/amp.html") - fp16_opt_level: str = 'O1' - #@help("For distributed training: local_rank") - local_rank: int = -1 - #@help("For distant debugging.") - server_ip: str = '' - #@help("For distant debugging.") - server_port: str = '' - - seed: int = 42 - # Unused - - #@help("The input data dir. Should contain the .tsv files (or other data files) for the task.") - data_dir: str = "/tmp/data" - - #@help("The output directory where the model predictions and checkpoints will be written.") - output_dir: str = "/tmp/tmp_output_dir" - - #@help("Overwrite the content of the output directory") - overwrite_output_dir: bool = True - - def __post_init__(self): - if os.path.exists(self.output_dir) and os.listdir( - self.output_dir) and self.do_train and not self.overwrite_output_dir: - raise ValueError( - "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( - self.output_dir)) - - # Setup distant debugging if needed - if self.server_ip and self.server_port: - # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script - import ptvsd - print("Waiting for debugger attach") - ptvsd.enable_attach(address=(self.server_ip, self.server_port), redirect_output=True) - ptvsd.wait_for_attach() - - # Setup CUDA, GPU & distributed training - if self.local_rank == -1 or self.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not self.no_cuda else "cpu") - self.n_gpu = torch.cuda.device_count() - else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs - torch.cuda.set_device(self.local_rank) - device = torch.device("cuda", self.local_rank) - torch.distributed.init_process_group(backend='nccl') - self.n_gpu = 1 - self.device = device - self.output_mode = "classification" - - self.train_batch_size = self.per_gpu_train_batch_size * max(1, self.n_gpu) - self.eval_batch_size = self.per_gpu_eval_batch_size * max(1, self.n_gpu) - self._tokenizer = None - self._model = None - self._data_cache = None - self.train_started = None - - @property - def tokenizer(self): - if self._tokenizer is None: - self._tokenizer = AutoTokenizer.from_pretrained(self.pretrained_name) - return self._tokenizer - - @property - def experiment_name(self): - from datetime import datetime - import socket - if not self.name: - now = datetime.now() - d = now.strftime("%y%m%d_%H%M%S") - h = "_".join(socket.gethostname().split('-')) - - def short_name(name): - return "".join([p[0] for p in name.split('_')]) - - def short_val(val): - if isinstance(val, bool): - return int(val) - return val - - relevant_params = {k: v for k, v in dataclasses.asdict(self).items() - if not k.startswith('_') and hasattr(TransfoExperiment, k) and getattr(TransfoExperiment, - k) != v} - params = [f"{short_name(k)}_{v}" for k, v in relevant_params.items() if not isinstance(v, bool)] - bool_flags = [f"{short_name(k)}" for k, v in relevant_params.items() if isinstance(v, bool) and v] - params_str = ".".join(params + bool_flags) - - self.name = f"{d}.{h}.{params_str}" - return self.name - - def get_summary_writer(self): - return SummaryWriter("runs/"+self.experiment_name) - - def _save_predictions(self, path): - self._dump_pickle([self._preds, self._phases], path) - - def _load_predictions(self, path): - self._preds, self._phases = self._load_pickle(path) - return self._preds - - def load_predictions(self): - path = self._path.parent / f"{self._path.stem}.preds" - return 
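`experiment_name` builds a run name from the timestamp, host name and whichever hyper-parameters differ from their defaults, abbreviating field names to their initials. A sketch of that naming scheme on a hypothetical reduced dataclass (host name omitted):

```python
import dataclasses
from dataclasses import dataclass
from datetime import datetime

@dataclass
class Args:
    learning_rate: float = 5e-5
    num_train_epochs: float = 3.0
    fp16: bool = False

def experiment_name(args):
    """Timestamp plus the fields that differ from their defaults, abbreviated."""
    short = lambda name: "".join(p[0] for p in name.split("_"))
    changed = {f.name: getattr(args, f.name)
               for f in dataclasses.fields(args)
               if getattr(args, f.name) != f.default}
    parts = [f"{short(k)}_{v}" for k, v in changed.items() if not isinstance(v, bool)]
    flags = [short(k) for k, v in changed.items() if isinstance(v, bool) and v]
    return datetime.now().strftime("%y%m%d_%H%M%S") + "." + ".".join(parts + flags)

print(experiment_name(Args(learning_rate=2e-5, fp16=True)))
# e.g. 240101_120000.lr_2e-05.f
```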
self._load_predictions(path) - - # todo: make it compatible with Experiment - def get_trained_model(self, data: TransfoDatabunch): - self._model = self.train_model(data) - self.has_model = True - return self._model - - def get_glue_databunch(self): - return TransfoDatabunch( - train_ds = prepare_glue_examples(self.tokenizer, self.task_name, 'train'), - valid_ds = prepare_glue_examples(self.tokenizer, self.task_name, 'validation'), - test_ds = None - ) - - def get_databunch(self, train_df, valid_df, test_df): - data_key = (id(train_df), id(valid_df), id(test_df)) - - if self._data_cache is not None and self._data_cache.key != data_key: - self._data_cache = None - - self.tokenizer.max_len = 999999 - if self._data_cache is None: - common_args = dict(text_a=self.text_a, text_b=self.text_b, label=self.label) - train_ds, label_list = convert_df_to_dataset(self.tokenizer, train_df, return_labels=True, **common_args) - data = TransfoDatabunch( - num_labels=len(label_list), - train_ds=train_ds, - valid_ds=convert_df_to_dataset(self.tokenizer, valid_df, **common_args), - test_ds=convert_df_to_dataset(self.tokenizer, test_df, **common_args) - ) - data.key = data_key - self._data_cache = data - return self._data_cache - - def new_experiment(self, **kwargs): - #kwargs.setdefault("has_predictions", False) - return super().new_experiment(**kwargs) - - def _add_phase(self, state): - del state['opt'] - del state['train_dl'] - self._phases.append(state) - - def set_seed(self, name): - return set_seed(self.seed, name, all_gpus=(self.n_gpu > 1)) - - # todo: make it compatible with Experiment - def train_model(self, data: TransfoDatabunch): - self.set_seed("class") - self.train_started = time.time() - num_labels = data.num_labels - config = AutoConfig.from_pretrained(self.pretrained_name, num_labels=num_labels) #, finetuning_task=args.task_name - model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_name, config=config) - train(self, data.train_ds, data.valid_ds, model.to(self.device), self._tokenizer) - model.to("cpu") - return model - - def _save_model(self, path): - model_to_save = self._model.module if hasattr(self._model, - 'module') else self._model # Take care of distributed/parallel training - model_to_save.save_pretrained(path) - logger.info("Saving model checkpoint to %s", path) - - # todo: move to Experiment - def save(self, dir_path): - dir_path = Path(dir_path) - dir_path.mkdir(exist_ok=True, parents=True) - filename = self._get_next_exp_name(dir_path) - j = dataclasses.asdict(self) - with open(filename, "wt") as f: - json.dump(j, f) - self._save_model(dir_path / f"{filename.stem}.model") - if hasattr(self, "_preds"): - self._save_predictions(dir_path / f"{filename.stem}.preds") - - return filename.name - - def evaluate_transformers(self, data): - return evaluate(self, self._model.to(self.device), data.valid_ds, prefix="") - - def evaluate(self, model, train_df, valid_df, test_df): - data = self.get_databunch(train_df, valid_df, test_df) - valid_probs = get_preds(self, model, data.valid_ds, ordered=True)[0].cpu().numpy() - test_probs = get_preds(self, model, data.test_ds, ordered=True)[0].cpu().numpy() - train_probs = get_preds(self, model, data.train_ds, ordered=True)[0].cpu().numpy() - self._preds = [] - - for prefix, tdf, probs in zip(["train", "valid", "test"], - [train_df, valid_df, test_df], - [train_probs, valid_probs, test_probs]): - preds = np.argmax(probs, axis=1) - - if self.merge_fragments and self.merge_type != "concat": - if self.merge_type == "vote_maj": - 
vote_results = preds_for_cell_content(tdf, probs) - elif self.merge_type == "vote_avg": - vote_results = preds_for_cell_content_multi(tdf, probs) - elif self.merge_type == "vote_max": - vote_results = preds_for_cell_content_max(tdf, probs) - preds = vote_results["pred"] - true_y = vote_results["true"] - else: - true_y = tdf["label"] - print(true_y.shape) - self._set_results(prefix, preds, true_y) - self._preds.append(probs) - -# # schedule: Tuple = ( -# # (1, 1e-2), # (a,b) -> fit_one_cyclce(a, b) -# # (1, 5e-3/2., 5e-3), # (a, b) -> freeze_to(-2); fit_one_cycle(a, b) -# # (8, 2e-3/100, 2e-3) # (a, b) -> unfreeze(); fit_one_cyccle(a, b) -# # ) -# # # drop_mult: float = 0.75 -# # fp16: bool = False -# pretrained_lm: str = "bert_base_cased" -# # dataset: str = None -# # train_on_easy: bool = True -# # BS: int = 64 -# # -# # has_predictions: bool = False # similar to has_model, but to avoid storing pretrained models we only keep predictions -# # # that can be later used by CRF - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_example_from_tensor_dict(self, tensor_dict): - """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['premise'].numpy().decode('utf-8'), - tensor_dict['hypothesis'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - diff --git a/sota_extractor2/models/structure/type_predictor.py b/sota_extractor2/models/structure/type_predictor.py deleted file mode 100644 index 49885e8..0000000 --- a/sota_extractor2/models/structure/type_predictor.py +++ /dev/null @@ -1,43 +0,0 @@ -from fastai.text import * -from pathlib import Path -import pandas as pd -from .ulmfit import ULMFiT_SP -from ...pipeline_logger import pipeline_logger -import torch -from enum import Enum - - -class TableType(Enum): - SOTA = 0 - ABLATION = 1 - IRRELEVANT = 2 - - -def multipreds2preds(preds, threshold=0.5): - bs = preds.shape[0] - return torch.cat([preds, preds.new_full((bs,1), threshold)], dim=-1).argmax(dim=-1) - - -class TableTypePredictor(ULMFiT_SP): - step = "type_prediction" - - def __init__(self, path, file, sp_path=None, sp_model="spm.model", sp_vocab="spm.vocab", threshold=0.5): - super().__init__(path, file, sp_path, sp_model, sp_vocab) - self.threshold = threshold - - def predict(self, paper, tables): - pipeline_logger(f"{TableTypePredictor.step}::predict", paper=paper, tables=tables) - if len(tables) == 0: - predictions = [] - else: - column = "caption" - df = pd.DataFrame({column: [table.caption if table.caption else "Table" for table in tables]}) - inputs = df.iloc[:, df_names_to_idx(column, df)] - tl = TextList(items=inputs.values[:, 0], path='.', inner_df=df, processor=None) - 
self.learner.data.add_test(tl) - preds, _ = self.learner.get_preds(DatasetType.Test, ordered=True) - pipeline_logger(f"{TableTypePredictor.step}::multiclass_predicted", paper=paper, tables=tables, - threshold=self.threshold, predictions=preds.cpu().numpy()) - predictions = [TableType(x) for x in multipreds2preds(preds, self.threshold).cpu().numpy()] - pipeline_logger(f"{TableTypePredictor.step}::predicted", paper=paper, tables=tables, predictions=predictions) - return predictions diff --git a/sota_extractor2/models/structure/ulmfit.py b/sota_extractor2/models/structure/ulmfit.py deleted file mode 100644 index ee87052..0000000 --- a/sota_extractor2/models/structure/ulmfit.py +++ /dev/null @@ -1,18 +0,0 @@ -from fastai.text import * -from pathlib import Path - -class ULMFiT_SP: - def __init__(self, path, file, sp_path=None, sp_model="spm.model", sp_vocab="spm.vocab"): - path = Path(path) - sp_path = path if sp_path is None else Path(sp_path) - self.learner = load_learner(path=path, file=file) - self._fix_sp_processor(sp_path, sp_model, sp_vocab) - - def _fix_sp_processor(self, sp_path, sp_model, sp_vocab): - for processor in self.learner.data.label_list.valid.x.processor: - if isinstance(processor, SPProcessor): - processor.sp_model = sp_path / sp_model - processor.sp_vocab = sp_path / sp_vocab - - #todo: see why it wasn't set on save - processor.mark_fields = True diff --git a/sota_extractor2/models/structure/ulmfit_experiment.py b/sota_extractor2/models/structure/ulmfit_experiment.py deleted file mode 100644 index 3422d56..0000000 --- a/sota_extractor2/models/structure/ulmfit_experiment.py +++ /dev/null @@ -1,136 +0,0 @@ -from .experiment import Experiment, label_map_ext -from .nbsvm import preds_for_cell_content, preds_for_cell_content_max, preds_for_cell_content_multi -import dataclasses -from dataclasses import dataclass -from typing import Tuple -from sota_extractor2.helpers.training import set_seed -from fastai.text import * -from fastai.text.learner import _model_meta -import numpy as np -from pathlib import Path -import json - - -@dataclass -class ULMFiTExperiment(Experiment): - seed: int = 42 - schedule: Tuple = ( - (1, 1e-2), # (a,b) -> fit_one_cyclce(a, b) - (1, 5e-3/2., 5e-3), # (a, b) -> freeze_to(-2); fit_one_cycle(a, b) - (8, 2e-3/100, 2e-3) # (a, b) -> unfreeze(); fit_one_cyccle(a, b) - ) - drop_mult: float = 0.75 - fp16: bool = False - pretrained_lm: str = "pretrained-on-papers_enc.pkl" - dataset: str = None - train_on_easy: bool = True - BS: int = 64 - valid_split: str = 'speech_rec' - test_split: str = 'img_class' - n_layers: int = 3 - - has_predictions: bool = False # similar to has_model, but to avoid storing pretrained models we only keep predictions - # that can be later used by CRF - - def _save_predictions(self, path): - self._dump_pickle([self._preds, self._phases], path) - - def _load_predictions(self, path): - self._preds, self._phases = self._load_pickle(path) - return self._preds - - def load_predictions(self): - path = self._path.parent / f"{self._path.stem}.preds" - return self._load_predictions(path) - - # todo: make it compatible with Experiment - def get_trained_model(self, data_clas): - self._model = self.train_model(data_clas) - self.has_model = True - return self._model - - def new_experiment(self, **kwargs): - kwargs.setdefault("has_predictions", False) - return super().new_experiment(**kwargs) - - def _schedule(self, clas, i): - s = self.schedule[i] - if len(s) == 2: - clas.fit_one_cycle(s[0], s[1]) - else: - clas.fit_one_cycle(s[0], slice(s[1], 
s[2])) - - def _add_phase(self, state): - del state['opt'] - del state['train_dl'] - self._phases.append(state) - - # todo: make it compatible with Experiment - def train_model(self, data_clas): - set_seed(self.seed, "clas") - cfg = _model_meta[AWD_LSTM]['config_clas'].copy() - cfg['n_layers'] = self.n_layers - - clas = text_classifier_learner(data_clas, AWD_LSTM, config=cfg, drop_mult=self.drop_mult) - clas.load_encoder(self.pretrained_lm) - if self.fp16: - clas = clas.to_fp16() - - self._schedule(clas, 0) - self._phases = [] - self._add_phase(clas.recorder.get_state()) - - clas.freeze_to(-2) - self._schedule(clas, 1) - self._add_phase(clas.recorder.get_state()) - - clas.unfreeze() - self._schedule(clas, 2) - self._add_phase(clas.recorder.get_state()) - - return clas - - def _save_model(self, path): - self._model.save(path) - - - # todo: move to Experiment - def save(self, dir_path): - dir_path = Path(dir_path) - dir_path.mkdir(exist_ok=True, parents=True) - filename = self._get_next_exp_name(dir_path) - j = dataclasses.asdict(self) - with open(filename, "wt") as f: - json.dump(j, f) - self.save_model(dir_path / f"{filename.stem}.model") - if hasattr(self, "_preds"): - self._save_predictions(dir_path / f"{filename.stem}.preds") - - return filename.name - - - def evaluate(self, model, train_df, valid_df, test_df): - valid_probs = model.get_preds(ds_type=DatasetType.Valid, ordered=True)[0].cpu().numpy() - test_probs = model.get_preds(ds_type=DatasetType.Test, ordered=True)[0].cpu().numpy() - train_probs = model.get_preds(ds_type=DatasetType.Train, ordered=True)[0].cpu().numpy() - self._preds = [] - - for prefix, tdf, probs in zip(["train", "valid", "test"], - [train_df, valid_df, test_df], - [train_probs, valid_probs, test_probs]): - preds = np.argmax(probs, axis=1) - - if self.merge_fragments and self.merge_type != "concat": - if self.merge_type == "vote_maj": - vote_results = preds_for_cell_content(tdf, probs) - elif self.merge_type == "vote_avg": - vote_results = preds_for_cell_content_multi(tdf, probs) - elif self.merge_type == "vote_max": - vote_results = preds_for_cell_content_max(tdf, probs) - preds = vote_results["pred"] - true_y = vote_results["true"] - else: - true_y = tdf["label"] - true_y_ext = tdf["cell_type"].apply(lambda x: label_map_ext.get(x, 0)) - self._set_results(prefix, preds, true_y, true_y_ext) - self._preds.append(probs) diff --git a/tables2json.py b/tables2json.py deleted file mode 100755 index 8050f35..0000000 --- a/tables2json.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python - -import fire -from sota_extractor.taskdb import TaskDB -from pathlib import Path -import json -import re -import pandas as pd - -from label_tables import get_table, get_metadata - -def get_celltags(filename): - filename = Path(filename) - if filename.exists(): - - try: - celltags = pd.read_csv(filename, header=None, dtype=str).fillna('') - except pd.errors.EmptyDataError: - return pd.DataFrame() - return celltags - else: - return pd.DataFrame() - - -def get_tables(tables_dir): - tables_dir = Path(tables_dir) - all_metadata = {} - all_tables = {} - all_celltags = {} - for metadata_filename in tables_dir.glob("*/metadata.json"): - metadata = get_metadata(metadata_filename) - for k in metadata: - if metadata[k] is None: - metadata[k] = '' - basedir = metadata_filename.parent - arxiv_id = basedir.name - all_metadata[arxiv_id] = metadata - all_tables[arxiv_id] = {t:get_table(basedir / t) for t in metadata} - all_celltags[arxiv_id] = {t:get_celltags(basedir / t.replace("table", 
"celltags")) for t in metadata} - return all_metadata, all_tables, all_celltags - -def t2j(df): - rows, cols = df.shape - if rows == 0 or cols == 0: - return [[""]] - return [[df.iloc[r, c] for c in range(cols)] for r in range(rows)] - - -def tables2json(tables_dir): - metadata, tables, celltags = get_tables(tables_dir) - all_data = [] - for arxiv_id in metadata: - tabs = [] - for tab in metadata[arxiv_id]: - table = dict( - name=tab, - caption=metadata[arxiv_id][tab], - values=t2j(tables[arxiv_id][tab]), - tags=t2j(celltags[arxiv_id][tab]) - ) - tabs.append(table) - all_data.append(dict(paper_id=arxiv_id, tables=tabs)) - print(json.dumps(all_data)) - -if __name__ == '__main__': fire.Fire(tables2json)