diff --git a/.gitignore b/.gitignore index 839763c..54c38bd 100644 --- a/.gitignore +++ b/.gitignore @@ -98,3 +98,6 @@ venv.bak/ .mypy_cache/ .idea/* .vscode/settings.json + +# pytest +.pytest_cache diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..989e2c5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..ddf8d64 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include axcell/scripts/* +include axcell/scripts/patches/* diff --git a/Makefile b/Makefile deleted file mode 100644 index dd795c2..0000000 --- a/Makefile +++ /dev/null @@ -1,81 +0,0 @@ -DATA_DIR = data -ANNOTATIONS_DIR = $(DATA_DIR)/annotations -ARXIV_DIR = $(DATA_DIR)/arxiv -ARCHIVES_DIR = $(ARXIV_DIR)/sources -UNPACKED_DIR = $(ARXIV_DIR)/unpacked_sources -HTMLS_DIR = $(ARXIV_DIR)/htmls -FIXED_HTMLS_DIR = $(ARXIV_DIR)/htmls-clean -TABLES_DIR = $(ARXIV_DIR)/papers -TEXTS_DIR = $(ARXIV_DIR)/papers - -ARCHIVES := $(shell find $(ARCHIVES_DIR) -name '*.gz' -type f 2>/dev/null) -UNPACKS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(UNPACKED_DIR)/%,$(ARCHIVES)) -HTMLS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(HTMLS_DIR)/%.html,$(ARCHIVES)) -FIXED_HTMLS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(FIXED_HTMLS_DIR)/%.html,$(ARCHIVES)) -TABLES := $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES)) -TEXTS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(TEXTS_DIR)/%/text.json,$(ARCHIVES)) - -.PHONY: all -all: $(ANNOTATIONS_DIR)/pdfs-urls.csv $(ANNOTATIONS_DIR)/sources-urls.csv extract_all - -.PHONY: test -test: DATA_DIR = test/data -test: TABLE_FILE = $(TABLES_DIR)/paper/table_01.csv -test: - mkdir -p $(ARCHIVES_DIR) - tar czf $(ARCHIVES_DIR)/paper.gz -C test/src . 
- $(MAKE) DATA_DIR=$(DATA_DIR) --always-make extract_all - cat $(TABLE_FILE) - diff $(TABLE_FILE) test/src/table_01.csv - -.PHONY: extract_all extract_texts extract_tables fix_htmls_all convert_all unpack_all - -extract_all: extract_tables extract_texts - -extract_texts: $(TEXTS) - -$(TEXTS): $(TEXTS_DIR)/%/text.json: $(FIXED_HTMLS_DIR)/%.html - python ./extract_texts.py $^ $@ - - -extract_tables: $(TABLES) - -fix_htmls_all: $(FIXED_HTMLS) - -convert_all: $(HTMLS) - -$(TABLES): $(TABLES_DIR)/%: $(FIXED_HTMLS_DIR)/%.html - python ./extract_tables.py $^ --outdir $@ - -$(FIXED_HTMLS): $(FIXED_HTMLS_DIR)/%: $(HTMLS_DIR)/% - ./clean_html.sh $^ $@ - -$(HTMLS): $(HTMLS_DIR)/%.html: $(UNPACKED_DIR)/% - ./docker-latex2html.sh $^ $@ - -unpack_all: $(UNPACKS) - -$(UNPACKS): $(UNPACKED_DIR)/%: $(ARCHIVES_DIR)/%.gz - ./unpack-sources.sh $^ $@ - -$(ANNOTATIONS_DIR)/pdfs-urls.csv: $(ANNOTATIONS_DIR)/papers-urls.csv - sed -e 's#/abs/#/pdf/#' -e 's#$$#.pdf#' $^ > $@ - -$(ANNOTATIONS_DIR)/sources-urls.csv: $(ANNOTATIONS_DIR)/papers-urls.csv - sed -e 's#/abs/#/e-print/#' $^ > $@ - -$(ANNOTATIONS_DIR)/papers-urls.csv: $(ANNOTATIONS_DIR)/evaluation-tables.json get_papers_links.sh - ./get_papers_links.sh $< > $@ - -$(ANNOTATIONS_DIR)/%: $(ANNOTATIONS_DIR)/%.gz - gunzip -kf $^ - -$(ANNOTATIONS_DIR)/evaluation-tables.json.gz: - $(shell mkdir -p "$(ANNOTATIONS_DIR)") - wget https://paperswithcode.com/media/about/evaluation-tables.json.gz -O $@ - - -.PHONY : clean -clean : - cd "$(ANNOTATIONS_DIR)" && rm -f *.json *.csv - #rm -f *.gz diff --git a/README.md b/README.md index f237d8e..5f94e69 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,70 @@ -# Scripts for extracting tables +# AxCell: Automatic Extraction of Results from Machine Learning Papers 
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/axcell-automatic-extraction-of-results-from/scientific-results-extraction-on-pwc)](https://paperswithcode.com/sota/scientific-results-extraction-on-pwc?p=axcell-automatic-extraction-of-results-from) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/axcell-automatic-extraction-of-results-from/scientific-results-extraction-on-nlp-tdms-exp)](https://paperswithcode.com/sota/scientific-results-extraction-on-nlp-tdms-exp?p=axcell-automatic-extraction-of-results-from) -Dependencies: - * [jq](https://stedolan.github.io/jq/) (`sudo apt install jq`) - * docker (run without `sudo`) - * [conda](https://www.anaconda.com/distribution/) +This repository is the official implementation of [AxCell: Automatic Extraction of Results from Machine Learning Papers](https://arxiv.org/abs/2004.14356). -Directory structure: -``` -. -└── data -    ├── annotations -    │   └── evaluation-tables.json.gz # current annotations -    └── arxiv -    ├── sources # gzip archives with e-prints -    ├── unpacked\_sources # automatically extracted latex sources -    ├── htmls # automatically generated htmls -    ├── htmls-clean # htmls fixed by chromium -    └── tables # extracted tables -``` +![pipeline](https://user-images.githubusercontent.com/13535078/81287158-33e01000-905a-11ea-8573-d716373efbdd.png) +## Requirements -To preprocess data and extract tables, run: -``` +To create a [conda](https://www.anaconda.com/distribution/) environment named `axcell` and install requirements run: + +```setup conda env create -f environment.yml -source activate xtables -make -j 8 -i extract_all > stdout.log 2> stderr.log ``` -where `8` is number of jobs to run simultaneously. Optionally one can specify path to data directory, f.e., `make DATA_DIR=mydata ...`. 
-## Test -To test the whole extraction on a single file run -``` -make test +Additionally, `axcell` requires `docker` (that can be run without `sudo`). Run `scripts/pull_docker_images.sh` to download necessary images. + +## Datasets +We publish the following datasets: +* [ArxivPapers](https://github.com/paperswithcode/axcell/releases/download/v1.0/arxiv-papers.csv.xz) +* [SegmentedTables & LinkedResults](https://github.com/paperswithcode/axcell/releases/download/v1.0/segmented-tables.json.xz) +* [PWCLeaderboards](https://github.com/paperswithcode/axcell/releases/download/v1.0/pwc-leaderboards.json.xz) + +See [datasets](notebooks/datasets.ipynb) notebook for an example of how to load the datasets provided below. The [extraction](notebooks/extraction.ipynb) notebook shows how to use `axcell` to extract text and tables from papers. + +## Evaluation + +See the [evaluation](notebooks/evaluation.ipynb) notebook for the full example on how to evaluate AxCell on the PWCLeaderboards dataset. + +## Training + +* [pre-training language model](notebooks/training/lm.ipynb) on the ArxivPapers dataset +* [table type classifier](notebooks/training/table-type-classifier.ipynb) and [table segmentation](notebooks/training/table-segmentation.ipynb) on the SegmentedResults dataset + +## Pre-trained Models + +You can download pretrained models here: + +- [axcell](https://github.com/paperswithcode/axcell/releases/download/v1.0/models.tar.xz) — an archive containing the taxonomy, abbreviations, table type classifier and table segmentation model. 
See the [results-extraction](notebooks/results-extraction.ipynb) notebook for an example of how to load and run the models +- [language model](https://github.com/paperswithcode/axcell/releases/download/v1.0/lm.pth.xz) — [ULMFiT](https://arxiv.org/abs/1801.06146) language model pretrained on the ArxivPapers dataset + +## Results + +AxCell achieves the following performance: + +### + + +| Dataset | Macro F1 | Micro F1 | +| ---------- |---------------- | -------------- | +| [PWC Leaderboards](https://paperswithcode.com/sota/scientific-results-extraction-on-pwc) | 21.1 | 28.7 | +| [NLP-TDMS](https://paperswithcode.com/sota/scientific-results-extraction-on-nlp-tdms-exp) | 19.7 | 25.8 | + + + +## License + +AxCell is released under the [Apache 2.0 license](LICENSE). + +## Citation +The pipeline is described in the following paper: +```bibtex +@article{axcell, + title={AxCell: Automatic Extraction of Results from Machine Learning Papers}, + author={Marcin Kardas and Piotr Czapla and Pontus Stenetorp and Sebastian Ruder and Sebastian Riedel and Ross Taylor and Robert Stojnic}, + year={2020}, + journal={arXiv preprint arXiv:2004.14356} +} ``` diff --git a/axcell/__init__.py b/axcell/__init__.py new file mode 100644 index 0000000..73596bf --- /dev/null +++ b/axcell/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + diff --git a/sota_extractor2/config.py b/axcell/config.py similarity index 52% rename from sota_extractor2/config.py rename to axcell/config.py index fb977b6..faf9f2c 100644 --- a/sota_extractor2/config.py +++ b/axcell/config.py @@ -1,5 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + import logging -from pathlib import Path +from pathlib import Path logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', @@ -10,11 +12,11 @@ # otherwise use this files data = Path("/mnt/efs/pwc/data") -goldtags_dump = data / "dumps" / "goldtags-2019.07.16_2214.json.gz" - +goldtags_dump = data / "dumps" / "goldtags-2019.10.15_2227.json.gz" -elastic = dict(hosts=['localhost'], timeout=20) +elastic = dict(hosts=['127.0.0.1'], timeout=20) +grobid = dict(host='grobid') arxiv = data/'arxiv' htmls_raw = arxiv/'htmls' @@ -22,3 +24,11 @@ datasets = data/"datasets" datasets_structure = datasets/"structure" +structure_models = datasets / "structure" / "models" + +mocks = datasets / "mocks" + +linking_models = datasets / "linking" / "models" +linking_data = datasets / "linking" / "data" + +autodict = linking_data / "autodict" diff --git a/sota_extractor2/data/__init__.py b/axcell/data/__init__.py similarity index 68% rename from sota_extractor2/data/__init__.py rename to axcell/data/__init__.py index 9501a35..84edcf2 100644 --- a/sota_extractor2/data/__init__.py +++ b/axcell/data/__init__.py @@ -1,3 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + import logging from .. import config # to get logging init @@ -7,4 +9,4 @@ from db import * except: logger.info("Unable to intialise django falling back to json data") - from json import * \ No newline at end of file + from json import * diff --git a/axcell/data/db.py b/axcell/data/db.py new file mode 100644 index 0000000..14bb6ab --- /dev/null +++ b/axcell/data/db.py @@ -0,0 +1,3 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +raise NotImplementedError() diff --git a/axcell/data/doc_utils.py b/axcell/data/doc_utils.py new file mode 100644 index 0000000..13d9229 --- /dev/null +++ b/axcell/data/doc_utils.py @@ -0,0 +1,329 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +import re +from bs4 import BeautifulSoup, Comment, Tag, NavigableString +import codecs + +def _handle_reference(el): + if el.get('href', "").startswith("#"): + r = str(el.get('href')) + el.clear() # to remove it's content from the descendants iterator + return "xxref-" + _simplify_anchor(r[1:]) + + +_anchor_like_classes = { + 'ltx_appendix', 'ltx_bibliography', 'ltx_figure', 'ltx_float', 'ltx_graphics', 'ltx_note', + 'ltx_paragraph', 'ltx_picture', 'ltx_section', 'ltx_subsection', 'ltx_subsubsection', 'ltx_theorem', + 'ltx_title_section', 'ltx_title_subsection' +} + +def _insert_anchor(el, anchor_id, prefix="xxanchor"): + el.insert(0, NavigableString(f' {prefix}-{anchor_id} ')) + +def put_dummy_anchors(soup): + for elem in soup.select( + '.ltx_bibitem, ' + \ + '.ltx_figure, .ltx_float, ' + \ + '.ltx_picture, .ltx_theorem'): + id_str = elem.get('id', '') + if id_str: + _insert_anchor(elem, _simplify_anchor(id_str)) + for elem in soup.select('h2, h3, h4, h5, h6'): + sec = elem.find_parent("section") + if sec: + id_str = sec.get('id') + if id_str: + _insert_anchor(elem, _simplify_anchor(id_str)) + for elem in soup.select(".ltx_table"): + id_str = elem.get('id', "xxunk") + _insert_anchor(elem, _simplify_anchor(id_str), "xxtable-xxanchor") + for elem in soup.select(".ltx_tabular"): + elem.extract() + + for elem in soup.select('a[href^="#"]'): + r = str(elem.get('href')) + elem.string = "xxref-" + _simplify_anchor(r[1:]) + + put_footnote_anchors(soup) + +def put_footnote_anchors(soup): + for elem in soup.select('.ltx_note_content > .ltx_note_mark'): + elem.extract() + + for elem in soup.select('.ltx_role_footnote > .ltx_note_mark'): + ft = elem.parent + id_str = ft.get('id') + if id_str: + elem.string = f" xxref-{_simplify_anchor(id_str)} " + + for elem in soup.select('.ltx_note_content > .ltx_tag_note'): + ft = elem.find_parent(class_="ltx_role_footnote") + if ft: + id_str = ft.get('id') + elem.string = f" xxanchor-{_simplify_anchor(id_str)} " + 
+# remove . from latexml ids (f.e., S2.SS5) so they can be searched for in elastic +# without disambiguations +def _simplify_anchor(s): + return s.replace('.', '') + + +def _handle_anchor(el): + if el.name.lower() == 'a' and el.get('id', ""): + id_str = el.get('id', "") + el.clear() # to remove it's content from the descendants iterator + return "xxanchor-" + id_str +# classes = get_classes(el) +# id_str = el.get('id') +# if 'ltx_title_section' in classes or 'ltx_title_subsection' in classes: +# print(el.get_text()) +# print(el.name) +# if 'ltx_title_section' in classes or 'ltx_title_subsection' in classes: +# print(el.get_text()) +# # this is workaround to deal with differences between +# # htlatex and latexml html structure +# # it would be better to make use of latexml structure +# sec = el.find_parent("section") +# if sec: +# id_str = sec.get('id') +# print(id_str, el.get_text()) +# +# if id_str and classes: +# classes = set(classes) +# if classes.intersection(_anchor_like_classes): +# print('xxanchor-'+id_str) +# el.clear() # to remove it's content from the descendants iterator +# return "xxanchor-" + id_str + + +def _handle_table(el): + if 'ltx_table' in get_classes(el): + id_str = el.get('id', "xxunk") + el.clear() # to remove it's content from the descendants iterator + return f"xxtable-xxanchor-" + id_str + + +_transforms_el = [ + _handle_reference, + _handle_table, + _handle_anchor, +] + + +def transform(el): + if isinstance(el, Tag): +# for f in _transforms_el: +# r = f(el) +# if r is not None: +# return transform(r) + return el.get_text() + elif not isinstance(el, Comment): + return str(el) + return '' + + +def clean_abstract(t): + return re.sub("^\s*[aA]bstract ?", "", t) + + +def get_text(*els): +# t = " ".join([transform(t) +# for el in els for t in getattr(el, 'descendants', [el])]) + t = " ".join([transform(e) for e in els]) + t = re.sub("[ \n\xa0]+", " ", t) + t = re.sub("[;,()]* (#[A-Za-z0-9]+) [;,()]*", r" \1 ", t) + t = re.sub(r" 
(#[A-Za-z0-9]+) *\1 ", r" \1 ", t) + return t.strip() + + +def content_in_section(header, names=['h2', 'h3'], skip_comments=True): + for el in header.next_siblings: + if getattr(el, 'name', '') in names: + break + if skip_comments and isinstance(el, Comment): + continue + yield el + + +def get_classes(el): + if hasattr(el, 'get'): + return el.get('class', []) + else: + return [] + + +def get_name(el): + return hasattr(el, 'name') and el.name or '' + + +def _group_bibliography(el): + if 'ltx_bibliography' in get_classes(el): + return [get_text(i) for i in el.select('li.ltx_bibitem')] + return [] + + +def _group_table(el): + if 'ltx_table' in get_classes(el): + return [get_text(el)] + return [] + + +class ParagraphGrouper: + def __init__(self): + self.els = [] + self.join_next_p = False + + def collect(self, el): + if get_name(el) == 'table': + self.join_next_p = True + elif 'ltx_para' in get_classes(el): + if self.join_next_p: + self.join_next_p = False + self.els.append(el) + else: + return self.flush(new_els=[el]) + else: + self.els.append(el) + return [] + + def flush(self, new_els=None): + text = get_text(*self.els) + if new_els is None: + new_els = [] + if isinstance(new_els, Tag): # allow for one tag to be passed + new_els = [new_els] + self.els = new_els + if text: + return [text] + return [] + + def reset(self): + self.els = [] + + +_group_el = [ + _group_bibliography, + _group_table, +] + + +def group_content2(elements): + par_gruop = ParagraphGrouper() + for el in elements: + fragments = [frag for grouper in _group_el for frag in grouper(el)] + if fragments: + fragments = par_gruop.flush() + fragments + else: + fragments = par_gruop.collect(el) + for frag in fragments: + yield frag + + for frag in par_gruop.flush(): + yield frag + + +def walk(elem): + for el in elem.children: + classes = get_classes(el) + if el.name == 'section' or 'ltx_biblist' in classes: + yield from walk(el) + else: + yield el + +class Grouper: + def __init__(self): + self.out = [] + 
self.section_idx = -1 + self.subsection_idx = 0 + self.header = "" + self.in_section = False # move elements before first section into that section + self.section_output = False # if a section is empty and new section begins, output it for keep header + + def get_output_text(self): + return " ".join(self.out) + + def flush(self): + if self.in_section: + r = max(self.section_idx, 0), self.subsection_idx, self.header, self.get_output_text() + self.out = [] + self.section_output = True + self.subsection_idx += 1 + yield r + + def new_section(self, header_el): + if not self.section_output or self.out: # output (possibly) empty section so header won't be lost + yield from self.flush() + self.section_output = False + self.in_section = True + self.section_idx += 1 + self.subsection_idx = 0 + self.header = get_text(header_el) + + def append(self, el): + t = get_text(el).strip() + if t != "": + self.out.append(t) + return True + return False + + def group_content(self, doc): + for el in walk(doc): + classes = get_classes(el) + if el.name in ["h2", "h3"]: + yield from self.new_section(el) + elif el.name == "h1": + continue + elif 'ltx_para' in classes or el.name == "figure" or 'ltx_bibitem' in classes: + has_content = self.append(el) + if has_content: + yield from self.flush() + else: + self.append(el) + self.in_section = True + if not self.section_output or self.out: + yield from self.flush() + + +def group_content(doc): + yield from Grouper().group_content(doc) + +def group_content3(doc): + out = [] + section_idx = -1 + subsection_idx = 0 + header = "" + has_paragraph = False + for el in walk(doc): + classes = get_classes(el) + if el.name in ["h2", "h3"]: + if len(out) and has_paragraph: + yield (max(section_idx, 0), subsection_idx, header, " ".join([get_text(o) for o in out])) + out = [] + section_idx += 1 + subsection_idx = 0 + header = get_text(el) + continue + elif 'ltx_title' in classes and el.name != "h1": + if len(out) and has_paragraph: + yield (max(section_idx, 
0), subsection_idx, header, " ".join([get_text(o) for o in out])) + out = [] + out += [el] + + elif 'ltx_title_document' in classes: + continue + elif 'ltx_para' in classes or el.name == "figure" or 'ltx_bibitem' in classes: + if len(out) and has_paragraph: + yield (max(section_idx, 0), subsection_idx, header, " ".join([get_text(o) for o in out])) + subsection_idx += 1 + out = [] + has_paragraph = True + out += [el] + else: + out.append(el) + if len(out): + yield (max(section_idx, 0), subsection_idx, header, " ".join([get_text(o) for o in out])) + +def read_html(file): + with codecs.open(file, 'r', encoding='UTF-8') as f: + text = f.read() + return BeautifulSoup(text, "html.parser") diff --git a/sota_extractor2/data/elastic.py b/axcell/data/elastic.py similarity index 64% rename from sota_extractor2/data/elastic.py rename to axcell/data/elastic.py index 36ba396..74eacce 100644 --- a/sota_extractor2/data/elastic.py +++ b/axcell/data/elastic.py @@ -1,17 +1,22 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from bs4 import BeautifulSoup import pandas as pd import re +from dataclasses import asdict from elasticsearch_dsl import Document, Boolean, Object, \ - analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter + analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter, Date from elasticsearch_dsl.serializer import serializer from IPython.display import display, Markdown from elasticsearch_dsl import connections -from sota_extractor2.data.doc_utils import get_text, content_in_section, group_content, set_ids_by_labels, read_html +from axcell.data.doc_utils import get_text, content_in_section, group_content, read_html, put_dummy_anchors, clean_abstract from .. 
import config from pathlib import Path +import sys def setup_default_connection(): @@ -105,7 +110,11 @@ class Fragment(Document): ) outer_headers = Text(analyzer=html_strip, ) + class Meta: + doc_type = '_doc' + class Index: + doc_type = '_doc' name = 'paper-fragments' @classmethod @@ -130,12 +139,16 @@ def __repr__(self): class Paper(Document): title = Text() - authors = Keyword() + authors = Keyword() #TODO: change this to Text() otherwise we can't search using this field. abstract = Text( analyzer=html_strip ) + class Meta: + doc_type = '_doc' + class Index: + doc_type = '_doc' name = 'papers' def to_json(self): @@ -161,9 +174,10 @@ def from_json(cls, json, paper_id=None): return paper @classmethod - def from_file(cls, path): + def from_file(cls, path, paper_id=None): path = Path(path) - paper_id = path.parent.name + if paper_id is None: + paper_id = path.parent.name with open(path, "rt") as f: json = f.read() return cls.from_json(json, paper_id) @@ -186,25 +200,37 @@ def save(self, **kwargs): else: return super().save(**kwargs) + def delete(self, **kwargs): + if hasattr(self, 'fragments'): + for f in self.fragments: + f.delete() + return super().delete(**kwargs) + @classmethod def parse_html(cls, soup, paper_id): - set_ids_by_labels(soup) - abstract = soup.select("div.abstract") - author = soup.select("div.author") + put_dummy_anchors(soup) + abstract = soup.select("div.ltx_abstract") + author = soup.select("div.ltx_authors") p = cls(title=get_text(soup.title), authors=get_text(*author), - abstract=get_text(*abstract), + abstract=clean_abstract(get_text(*abstract)), meta={'id': paper_id}) for el in abstract + author: el.extract() fragments = Fragments() - for idx, h in enumerate(soup.find_all(['h3', 'h4'])): - section_header = get_text(h) - if p.abstract == "" and section_header.lower() == "abstract": - p.abstract = get_text(*list(content_in_section(h))) - else: - for idx2, content in enumerate(group_content(content_in_section(h))): + doc = soup.find("article") 
+ if doc: + footnotes = doc.select(".ltx_role_footnote > .ltx_note_outer") + for ft in footnotes: + ft.extract() + + idx = 0 + for idx, idx2, section_header, content in group_content(doc): + content = content.strip() + if p.abstract == "" and "abstract" in section_header.lower(): + p.abstract = clean_abstract(content) + else: order = (idx + 1) * 1000 + idx2 f = Fragment( paper_id=paper_id, @@ -214,6 +240,21 @@ def parse_html(cls, soup, paper_id): meta={'id': f"{paper_id}-{order}"} ) fragments.append(f) + idx += 1 + idx2 = 0 + for ft in footnotes: + order = (idx + 1) * 1000 + idx2 + f = Fragment( + paper_id=paper_id, + order=order, + header="xxanchor-footnotes Footnotes", + text=get_text(ft), + meta={'id': f"{paper_id}-{order}"} + ) + fragments.append(f) + idx2 += 1 + else: + print(f"No article found for {paper_id}", file=sys.stderr) p.fragments = fragments return p @@ -232,9 +273,17 @@ def read_html(cls, file): return read_html(file) @classmethod - def parse_paper(cls, file): + def from_html(cls, html, paper_id): + soup = BeautifulSoup(html, "html.parser") + return cls.parse_html(soup, paper_id) + + @classmethod + def parse_paper(cls, file, paper_id=None): + file = Path(file) soup = cls.read_html(file) - return cls.parse_html(soup, file.stem) + if paper_id is None: + paper_id = file.stem + return cls.parse_html(soup, paper_id) class Author(InnerDoc): @@ -251,12 +300,90 @@ class Reference(Document): urls = Keyword() is_ml = Boolean() + class Meta: + doc_type = '_doc' + class Index: + doc_type = '_doc' name = 'references' def __repr__(self): return f"{self.title} / {self.authors}" + +ID_LIMIT=480 + + +class Author2(InnerDoc): + forenames = Text(fields={'keyword': Keyword()}) + surname = Text(fields={'keyword': Keyword()}) + + +class Reference2(Document): + title = Text() + authors = Object(Author2) + + idno = Keyword() + date = Date() + ptr = Keyword() + + arxiv_id = Keyword() + pwc_slug = Keyword() + orig_refs = Text() + + class Meta: + doc_type = '_doc' + + class 
    def add_ref(self, ref):
        """Merge fields of a parsed PReference into this elastic document.

        Truthy fields of `ref` overwrite the stored ones; the originating
        reference string is accumulated (deduplicated) in `orig_refs`.
        """
        # if not hasattr(self, 'refs'):
        #     self.refs = []
        # self.refs.append(asdict(ref))
        if ref.arxiv_id:
            self.arxiv_id = ref.arxiv_id
        if ref.pwc_slug:
            self.pwc_slug = ref.pwc_slug
        if ref.idno:
            if hasattr(ref.idno, 'values'):
                # dict of identifiers: keep the last http(s) URL, or None if there is none
                self.idno = ([None]+[v for v in ref.idno.values() if v.startswith("http")]).pop()
            elif isinstance(ref.idno, str):
                self.idno = ref.idno
        # if ref.date:
        #     self.date = ref.date
        if ref.ptr:
            self.ptr = ref.ptr
        # accumulate unique originating reference strings; set round-trip
        # drops duplicates but does not preserve order
        self.orig_refs = self.orig_refs if self.orig_refs else []
        self.orig_refs.append(ref.orig_ref)
        self.orig_refs = list(set(self.orig_refs))

        # TODO Update authors
        # titles = Counter([norm_title] + [normalize_title(ref.title) for ref in merged])
        # norm_title = titles.most_common(1)[0][0]
{f.header}{cell_type}
" body = " ... ".join(f.meta.highlight.text) @@ -314,3 +441,19 @@ def display_fragment(f, cell_type="", display=True): if display: display_html(html) return html + + +def query_for_evidences(paper_id, values, topk=5, fragment_size=50): + evidence_query = Fragment.search().highlight( + 'text', pre_tags="", post_tags="", fragment_size=fragment_size) + + query = { + "query": ' '.join(values) + } + + fragments = list(evidence_query + .filter('term', paper_id=paper_id) + .query('match', text=query)[:topk] + ) + + return '\n'.join([' '.join(f.meta['highlight']['text']) for f in fragments]) diff --git a/extract_tables.py b/axcell/data/extract_tables.py similarity index 63% rename from extract_tables.py rename to axcell/data/extract_tables.py index 5241d9a..3ead7c2 100755 --- a/extract_tables.py +++ b/axcell/data/extract_tables.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + import sys from bs4 import BeautifulSoup, Comment, NavigableString import fire @@ -13,8 +15,7 @@ from dataclasses import dataclass from typing import Set -from tabular import Tabular - +from axcell.data.table import Table # begin of dirty hack # pandas parsing of html tables is really nice @@ -80,9 +81,9 @@ class LayoutCell: span: Set[str] def __str__(self): - borders = ['border-'+x for x in self.borders] - align = ['align-'+x for x in self.align] - span = ['span-'+x for x in self.span] + borders = ['border-'+x for x in sorted(list(self.borders))] + align = ['align-'+x for x in sorted(list(self.align))] + span = ['span-'+x for x in sorted(list(self.span))] header = ["header"] if self.header else [] return ' '.join(borders + align + span + header) @@ -134,9 +135,20 @@ def fix_layout(layout): cell.borders -= {"b", "bb", "t", "tt"} +# does not deal with nested tags +# f.e., +# or +whitespace_tag_re = re.compile(r"<(bold|italic|red|green|blue)>(\s*)") +dummy_close_tag_re = re.compile(r"(\s*)<\1>") +def clear_cell(s): + s = 
whitespace_tag_re.sub(r"\2", s) + s = dummy_close_tag_re.sub(r"\2", s) + return s.strip() + + def decouple_layout(df): split = df.applymap(lambda x: ("", "") if x == "" else x.split(";", 1)) - tab = split.applymap(lambda x: x[1]) + tab = split.applymap(lambda x: clear_cell(x[1])) layout = split.applymap(lambda x: to_layout(x[0])) fix_layout(layout) return tab, layout @@ -149,6 +161,9 @@ def fix_table(df): return decouple_layout(df) +def is_table_empty(df): + return (df.applymap(lambda x: x.strip()).values == "").all() + def fix_id(s): return s.replace(".", "-") @@ -163,19 +178,60 @@ def move_out_references(table): wrap_elem_content(anchor, f"", "") -#def move_out_text_styles(table): -# ltx_font = 'ltx_font_' -# font_selector = f'[class*="{ltx_font}"]' -# -# for elem in table.select(f"span{font_selector}, a{font_selector}, em{font_selector}"): -# for c in set(elem.attrs["class"]): -# if c == ltx_font + 'bold': -# wrap_elem_content(elem, "", "") -# elif c == ltx_font + 'italic': -# wrap_elem_content(elem, "", "") - - -def move_out_styles(table): +bold_font_weight_re = re.compile(r"(^|;)\s*font-weight:\s*(bold|700|800|900)\s*(;|$)") +bold_mathjax_font_re = re.compile(r"^MJXc-TeX-\w*-BI?$") +italic_font_style_re = re.compile(r"(^|;)\s*font-style:\s*italic\s*(;|$)") +italic_mathjax_font_re = re.compile(r"^MJXc-TeX-\w*-B?I$") + +def _has_font_class(classes, font_re): + return any(font_re.match(cls) for cls in classes) + + +font_color_re = re.compile(r"(^|;)\s*color:\s*(?P#[0-9A-Fa-f]{3,6}|red|green|blue)\s*(;|$)") +def _extract_color_from_style(style): + m = font_color_re.search(style) + if m: + color = m["color"] + if color[0] == "#": + color = color[1:] + if len(color) != 6: + color = (color + color)[:6] + r, g, b = int(color[0:2], 16), int(color[2:4], 16), int(color[4:6], 16) + if r > 2 * g and r > 2 * b: + color = "red" + elif g > 2 * r and g > 2 * b: + color = "green" + elif b > 2 * r and b > 2 * g: + color = "blue" + else: + return + return color + return + + +def 
def save_tables(data, outdir):
    """Persist each extracted table (values + layout CSVs) into `outdir`
    and write a metadata.json index describing them.

    Tables are numbered from 1; each Table saves itself via table.save().
    """
    entries = []
    for idx, tbl in enumerate(data, 1):
        data_name = f"table_{idx:02}.csv"
        layout_name = f"layout_{idx:02}.csv"
        tbl.save(outdir, data_name, layout_name)
        entries.append({
            "filename": data_name,
            "layout": layout_name,
            "caption": tbl.caption,
            "figure_id": tbl.figure_id,
        })
    with open(outdir / "metadata.json", "w") as out:
        json.dump(entries, out)
alg_id_re = re.compile(r"^alg(orithm)?[0-9]+")


def perhaps_not_tabular(table, float_div):
    """Heuristically decide whether `table` inside float `float_div` is NOT a
    genuine data table (e.g. a figure image, biography box, algorithm or
    code listing).

    Returns False for proper ltx_table floats, True for likely non-tables.
    """
    classes = float_div.attrs.get("class", [])
    if 'ltx_table' in classes:
        return False
    if 'ltx_figure' in classes:
        if table.find("img", class_="ltx_graphics"):
            return True
    if 'ltx_float' in classes:
        if 'biography' in classes:
            return True
        # BUG FIX: these two tests were bare non-empty string literals
        # (always truthy), so every ltx_float was treated as non-tabular.
        if 'ltx_float_algorithm' in classes:
            return True
        if 'ltx_lstlisting' in classes:
            return True
    # BUG FIX: bs4 Tags expose attributes via .get()/['id']; dot access
    # (float_div.id) searches for a child *tag* named 'id' and was
    # effectively always None.
    div_id = float_div.get("id")
    if div_id and alg_id_re.match(div_id):
        return True
    return False
clear_ws(cap_el.get_text()) figure_id = table.get("data-figure-id") - data.append(Tabular(tab, layout, caption, figure_id)) + data.append(Table(f"table_{len(data)+1:02}", tab, layout.applymap(str), caption, figure_id)) + return data - save_tables(data, outdir) +def extract_tables_cmd(filename, outdir): + with open(filename, "rb") as f: + html = f.read() + tables = extract_tables(html) + outdir = Path(outdir) + outdir.mkdir(parents=True, exist_ok=True) + save_tables(tables, outdir) -if __name__ == "__main__": fire.Fire(extract_tables) +if __name__ == "__main__": fire.Fire(extract_tables_cmd) diff --git a/sota_extractor2/data/json.py b/axcell/data/json.py similarity index 67% rename from sota_extractor2/data/json.py rename to axcell/data/json.py index e647e66..974571d 100644 --- a/sota_extractor2/data/json.py +++ b/axcell/data/json.py @@ -1,10 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + #%% import json import re import gzip import pprint import requests -from sota_extractor2 import config +from axcell import config #%% def to_snake_case(name): #https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case @@ -69,13 +71,62 @@ def cut(s, length=20): vals = pprint.pformat({to_snake_case(k): cut(str(self[k])) for k in self.keys()}) return f"NodeWrap({vals})" + +def _annotations_to_gql(annotations): + nodes = [] + for a in annotations: + tables = [] + for t in a['tables']: + tags = [] + if t['leaderboard']: + tags.append('leaderboard') + if t['ablation']: + tags.append('ablation') + if not tags: + tags = ['irrelevant'] + + records = {} + for r in t['records']: + d = dict(r) + del d['row'] + del d['column'] + records[f'{r["row"]}.{r["column"]}'] = d + table = { + 'node': { + 'name': f'table_{t["index"] + 1:02}.csv', + 'datasetText': t['dataset_text'], + 'notes': '', + 'goldTags': ' '.join(tags), + 'matrixGoldTags': t['segmentation'], + 'cellsSotaRecords': json.dumps(records), + 'parser': 
'latexml' + } + } + tables.append(table) + node = { + 'arxivId': a['arxiv_id'], + 'goldTags': a['fold'], + 'tableSet': {'edges': tables} + } + nodes.append({'node': node}) + return { + 'data': { + 'allPapers': { + 'edges': nodes + } + } + } + + def load_gql_dump(data_or_file, compressed=True): - if isinstance(data_or_file, dict): + if isinstance(data_or_file, dict) or isinstance(data_or_file, list): papers_data = data_or_file else: open_fn = gzip.open if compressed else open with open_fn(data_or_file, "rt") as f: - papers_data = json.load(f) + papers_data = json.load(f) + if "data" not in papers_data: + papers_data = _annotations_to_gql(papers_data) data = papers_data["data"] return {k:wrap_dict(v) for k,v in data.items()} diff --git a/axcell/data/paper_collection.py b/axcell/data/paper_collection.py new file mode 100644 index 0000000..baece02 --- /dev/null +++ b/axcell/data/paper_collection.py @@ -0,0 +1,175 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from .elastic import Paper as PaperText, Fragments +from .table import Table, read_tables +from .json import load_gql_dump +from pathlib import Path +import re +import pickle +from joblib import Parallel, delayed +from collections import UserList +from ..helpers.jupyter import display_table +import string +import random +from axcell.data.extract_tables import extract_tables + + +class Paper: + def __init__(self, paper_id, text, tables, annotations): + self.paper_id = paper_id + self.arxiv_no_version = remove_arxiv_version(paper_id) + if text is not None: + self.text = text + else: + self.text = PaperText() + self.text.fragments = Fragments() + self.tables = tables + self._annotations = annotations + if annotations is not None: + self.gold_tags = annotations.gold_tags.strip() + else: + self.gold_tags = '' + + def table_by_name(self, name): + for table in self.tables: + if table.name == name: + return table + return None + + +# todo: make sure multithreading/processing won't cause 
arxiv_version_re = re.compile(r"v\d+$")


def remove_arxiv_version(arxiv_id):
    """Strip a trailing arXiv version suffix, e.g. '1810.04805v2' -> '1810.04805'."""
    match = arxiv_version_re.search(arxiv_id)
    return arxiv_id if match is None else arxiv_id[:match.start()]
    @classmethod
    def _from_files(cls, path, annotations=None, annotations_path=None, load_texts=True, load_tables=True, load_annotations=True, jobs=-1, migrate=False):
        """Build a PaperCollection from extracted texts/tables on disk.

        path: root directory scanned for per-paper text.json / metadata.json.
        annotations: in-memory annotation dump (GraphQL shape or flat list).
        annotations_path: file fallback, defaults to path/structure-annotations.json.
        load_texts / load_tables: toggle loading of each component.
        load_annotations: when True (and tables are loaded), annotations are
            (re)loaded from annotations_path.
        jobs: joblib parallelism; migrate is forwarded to table loading.
        """
        path = Path(path)
        if annotations_path is None:
            annotations_path = path / "structure-annotations.json"
        else:
            annotations_path = Path(annotations_path)
        if load_texts:
            texts = _load_texts(path, jobs)
        else:
            texts = {}

        if annotations is None:
            annotations = {}
        else:
            annotations = _load_annotated_papers(annotations)
        if load_tables:
            # NOTE(review): when load_annotations is True this silently
            # replaces any `annotations` argument with the file contents —
            # confirm that override is intended.
            if load_annotations:
                annotations = _load_annotated_papers(annotations_path)
            tables = _load_tables(path, annotations, jobs, migrate)
        else:
            tables = {}
        # union of paper ids seen in either texts or tables ("outer join");
        # missing side is filled with an empty text / empty table list
        outer_join = set(texts).union(set(tables))

        papers = [Paper(k, texts.get(k), tables.get(k, []), annotations.get(k)) for k in outer_join]
        return cls(papers)
def to_normal_dict(d):
    """Recursively convert OrderedDict instances (as produced by xmltodict)
    into plain dicts; lists are converted element-wise, everything else is
    returned unchanged.
    """
    if isinstance(d, list):
        return [to_normal_dict(item) for item in d]
    if isinstance(d, OrderedDict):
        converted = {}
        for key, value in d.items():
            converted[key] = to_normal_dict(value)
        return converted
    return d
def pop_first(dictionary, *path):
    """Walk `path` through nested dicts, popping each key as it descends.

    Single-element lists/tuples are unwrapped to their first element; a
    string encountered mid-path is returned immediately (the TEI data may
    store either a plain string or a nested structure). Returns None for a
    None input or a missing key. NOTE: mutates the input via pop().
    """
    if dictionary is None:
        return None
    try:
        node = dictionary
        for key in path:
            node = node.pop(key, None)
            if node is None:
                return None
            if isinstance(node, (list, tuple)):
                node = node[0]
            if isinstance(node, str):
                return node
        return node
    except Exception as err:
        warn(f"{err} - Unable to pop path {path}, from {dictionary}")
conferences = [
    'nips', 'neurips', 'emnlp', 'acl', 'corr', 'ai magazine', 'machine learning', 'arxiv.org'
]
_venue_alternatives = "|".join(map(re.escape, conferences))
conferences_re = re.compile(f'\\b({_venue_alternatives})[\\s0-9.]*', re.IGNORECASE)


def strip_conferences(title):
    """Remove known venue names (plus trailing digits, dots and whitespace)
    from a reference title, case-insensitively."""
    return conferences_re.sub('', title)
def post_process_title(title, is_surname, is_publication_venue):
    """Pick the most title-like sentence from a (possibly multi-sentence)
    parsed reference title and strip trailing dots/spaces.

    Sentences are scored: earlier position and more words score higher;
    all-caps acronyms add a little; title-cased words, surnames, venue
    names and "in proceedings" subtract. Returns None unchanged.
    """
    if title is None:
        return title
    parts = title.split('. ')
    if len(parts) > 1:
        def sentence_score(sentence, position):
            words = sentence.split(' ')
            score = (10 - position) + 2 * len(words)
            score += sum(2 for w in words if w.isupper() and len(w) > 2)
            score -= sum(1 for w in words if w.istitle())
            score -= sum(4 for w in words if is_surname(w))
            score -= sum(10 for w in words if is_publication_venue(w))
            if "in proceedings" in sentence.lower():
                score -= 100
            return score

        ranked = [(sentence_score(part, pos), part) for pos, part in enumerate(parts)]
        # ties break lexicographically on the sentence text (tuple max)
        title = max(ranked)[1]
    else:
        title = parts[0]
    return title.rstrip(' .')
+ + if not isinstance(title, str): + bibstruct['note'] = title # note was not string so let's revers pop + title = None + + return cls( + title=title, + authors=[PAuthor.from_tei_dict(a) for a in ensure_list(paper.pop('author', []))], + idno=pop_first(paper, 'idno'), + date=pop_first(paper, 'imprint', 'date', '@when'), + ptr=pop_first(paper, 'ptr', '@target'), + extra={k: paper[k] for k in paper if k not in ['title', 'author']}, + alt=bibstruct, + **kwargs + ) + + @classmethod + def parse_ref_str(cls, ref_str, grobid_client, orig_key=None, is_surname=None, is_publication_venue=is_publication_venue): + try: + clean_ref_str = strip_latex_artefacts(ref_str) + clean_ref_str = strip_anchor(clean_ref_str) + clean_ref_str, arxiv_id = extract_arxivid(clean_ref_str) + d = grobid_client.parse_ref_str_to_tei_dict(clean_ref_str) + ref = cls.from_tei_dict(d, orig_ref=ref_str, arxiv_id=arxiv_id) + ref.orig_key = orig_key + + ref.title = post_process_title(ref.title, is_surname, is_publication_venue) + + return ref + except (KeyError, TypeError,ValueError) as err: + warn(f"{err} - Unable to parse {d} as Ref") + + +nonalphanumeric_re = re.compile(r'[^a-z0-9 ]', re.IGNORECASE) +def until_first_nonalphanumeric(string): + return nonalphanumeric_re.split(string)[0] + +class ReferenceStore: + def __init__(self, grobid_client, + surnames_path='/mnt/efs/pwc/data/ref-names.json', + use_cache=True): + self.grobid_client = grobid_client + self.refdb = {} + self.tosync = [] + self.surnames_db = defaultdict(lambda: 0) + self._load_surnames(surnames_path) + self.use_cache = use_cache + + def _load_surnames(self, path): + with Path(path).open() as f: + self.preloaded_surnames_db = json.load(f) + + def is_surname(self, word): + return word in self.preloaded_surnames_db #or self.surnames_db[word] > 5 + + def get_reference(self, key): + if self.use_cache: + if key not in self.refdb: + self.refdb[key] = Reference2.mget([key])[0] + return self.refdb[key] + return Reference2.mget([key])[0] + + def 
def get_refstrings(p):
    """Yield the bibliography fragment texts of a paper, last-to-first.

    Accepts either a Paper wrapper (with a .text attribute) or the text
    document itself. Scans fragments from the end, yielding those whose
    header starts with 'xxanchor-bib' and stopping once the (trailing)
    bibliography section has been passed.
    """
    paper = p.text if hasattr(p, 'text') else p
    if not hasattr(paper, 'fragments'):
        return
    in_bibliography = False
    for fragment in reversed(paper.fragments):
        if fragment.header.startswith('xxanchor-bib'):
            in_bibliography = True
            yield fragment.text
        elif in_bibliography:
            # todo: check if a paper can have multiple bibliography sections
            # (f.e., one in the main paper and one in the appendix)
            break  # the refsection is only at the end of paper
re.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$') +def extract_refs(p): + for ref in get_refstrings(p): + m = _ref_re.match(ref) + if m: + ref_id, ref_str = m.groups() + yield { + "ref_id": ref_id, + "ref_str": ref_str.strip(r'\s') + } diff --git a/axcell/data/structure.py b/axcell/data/structure.py new file mode 100644 index 0000000..af1eb36 --- /dev/null +++ b/axcell/data/structure.py @@ -0,0 +1,205 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import re +import pandas as pd +from collections import namedtuple +import hashlib +from fastai.text import progress_bar +from .elastic import Fragment, setup_default_connection +from .json import * +from .table import reference_re, remove_text_styles, remove_references, style_tags_re + +def get_all_tables(papers): + for paper in papers: + for table in paper.table_set.filter(parser="latexml"): + if 'trash' not in table.gold_tags and table.gold_tags != '': + table.paper_id = paper.arxiv_id + yield table + +def consume_cells(table): + Cell = namedtuple('AnnCell', 'row col vals') + for row_id, row in enumerate(table.df.values): + for col_id, cell in enumerate(row): + vals = [ + remove_text_styles(remove_references(cell.raw_value)), + cell.gold_tags, + cell.refs[0] if cell.refs else "", + cell.layout, + bool(style_tags_re.search(cell.raw_value)) + ] + yield Cell(row=row_id, col=col_id, vals=vals) + + +reference_re = re.compile(r"\[[^]]*\]") +ours_re = re.compile(r"\(ours?\)") +all_parens_re = re.compile(r"\([^)]*\)") + + +def clear_cell(s): + for pat in [reference_re, all_parens_re]: + s = pat.sub("", s) + s = s.strip() + return s + + +def empty_fragment(paper_id): + fragment = Fragment(paper_id=paper_id) + fragment.meta['highlight'] = {'text': ['']} + return fragment + + +def normalize_query(query): + if isinstance(query, list): + return tuple(normalize_query(x) for x in query) + if isinstance(query, dict): + return tuple([(normalize_query(k), normalize_query(v)) for k,v in 
query.items()]) + return query + +_evidence_cache = {} +_cache_miss = 0 +_cache_hit = 0 +def get_cached_or_execute(query): + global _evidence_cache, _cache_hit, _cache_miss + n = normalize_query(query.to_dict()) + if n not in _evidence_cache: + _evidence_cache[n] = list(query) + _cache_miss += 1 + else: + _cache_hit += 1 + return _evidence_cache[n] + + +def fetch_evidence(cell_content, cell_reference, paper_id, table_name, row, col, paper_limit=10, corpus_limit=10, + cache=False): + if not filter_cells(cell_content): + return [empty_fragment(paper_id)] + cell_content = clear_cell(cell_content) + if cell_content == "" and cell_reference == "": + return [empty_fragment(paper_id)] + + cached_query = get_cached_or_execute if cache else lambda x: x + evidence_query = Fragment.search().highlight( + 'text', pre_tags="", post_tags="", fragment_size=400) + cell_content = cell_content.replace("\xa0", " ") + query = { + "query": cell_content, + "slop": 2 + } + paper_fragments = list(cached_query(evidence_query + .filter('term', paper_id=paper_id) + .query('match_phrase', text=query)[:paper_limit])) + if cell_reference != "": + reference_fragments = list(cached_query(evidence_query + .filter('term', paper_id=paper_id) + .query('match_phrase', text={ + "query": cell_reference, + "slop": 1 + })[:paper_limit])) + else: + reference_fragments = [] + other_fagements = list(cached_query(evidence_query + .exclude('term', paper_id=paper_id) + .query('match_phrase', text=query)[:corpus_limit])) + + ext_id = f"{paper_id}/{table_name}/{row}.{col}" + ####print(f"{ext_id} |{cell_content}|: {len(paper_fragments)} paper fragments, {len(reference_fragments)} reference fragments, {len(other_fagements)} other fragments") + # if not len(paper_fragments) and not len(reference_fragments) and not len(other_fagements): + # print(f"No evidences for '{cell_content}' of {paper_id}") + if not len(paper_fragments) and not len(reference_fragments): + paper_fragments = [empty_fragment(paper_id)] + return 
paper_fragments + reference_fragments + other_fagements + +fix_refs_re = re.compile('\(\?\)|\s[?]+(\s|$)') + + +def fix_refs(text): + return fix_refs_re.sub(' xref-unkown ', fix_refs_re.sub(' xref-unkown ', text)) + + +highlight_re = re.compile("") +partial_highlight_re = re.compile(r"\xxref\-(?!\)") + + +def fix_reference_hightlight(s): + return partial_highlight_re.sub("xxref-", s) + + +evidence_columns = ["text_sha1", "text_highlited", "text", "header", "cell_type", "cell_content", "cell_reference", + "cell_layout", "cell_styles", "this_paper", "row", "col", "row_context", "col_context", "ext_id"] + + +def create_evidence_records(textfrag, cell, paper_id, table): + for text_highlited in textfrag.meta['highlight']['text']: + text_highlited = fix_reference_hightlight(fix_refs(text_highlited)) + text = highlight_re.sub("", text_highlited) + text_sha1 = hashlib.sha1(text.encode("utf-8")).hexdigest() + + cell_ext_id = f"{paper_id}/{table.name}/{cell.row}/{cell.col}" + + yield {"text_sha1": text_sha1, + "text_highlited": text_highlited, + "text": text, + "header": textfrag.header, + "cell_type": cell.vals[1], + "cell_content": fix_refs(cell.vals[0]), + "cell_reference": cell.vals[2], + "cell_layout": cell.vals[3], + "cell_styles": cell.vals[4], + "this_paper": textfrag.paper_id == paper_id, + "row": cell.row, + "col": cell.col, + "row_context": " border ".join([str(s) for s in table.matrix.values[cell.row]]), + "col_context": " border ".join([str(s) for s in table.matrix.values[:, cell.col]]), + "ext_id": cell_ext_id + #"table_id":table_id + } + + +def filter_cells(cell_content): + return re.search("[a-zA-Z]{2,}", cell_content) is not None + + +interesting_types = ["model-paper", "model-best", "model-competing", "dataset", "dataset-sub", "dataset-task"] + + +def evidence_for_table(paper_id, table, paper_limit, corpus_limit, cache=False): + records = [ + record + for cell in consume_cells(table) + for evidence in fetch_evidence(cell.vals[0], cell.vals[2], 
paper_id=paper_id, table_name=table.name, + row=cell.row, col=cell.col, paper_limit=paper_limit, corpus_limit=corpus_limit, + cache=cache) + for record in create_evidence_records(evidence, cell, paper_id=paper_id, table=table) + ] + df = pd.DataFrame.from_records(records, columns=evidence_columns) + return df + + +def prepare_data(tables, csv_path, cache=False): + data = [evidence_for_table(table.paper_id, table, + paper_limit=100, + corpus_limit=20, cache=cache) for table in progress_bar(tables)] + if len(data): + df = pd.concat(data) + else: + df = pd.DataFrame(columns=evidence_columns) + #moved to experiment preprocessing + #df = df.drop_duplicates( + # ["cell_content", "text_highlited", "cell_type", "this_paper"]) + print("Number of text fragments ", len(df)) + + csv_path.parent.mkdir(parents=True, exist_ok=True) + df.to_csv(csv_path, index=None) + + +class CellEvidenceExtractor: + def __init__(self, setup_connection=True): + # todo: make sure can be called more than once or refactor to singleton + if setup_connection: + setup_default_connection() + + def __call__(self, paper, tables, paper_limit=30, corpus_limit=10): + dfs = [evidence_for_table(paper.paper_id, table, paper_limit, corpus_limit) for table in tables] + if len(dfs): + return pd.concat(dfs) + return pd.DataFrame(columns=evidence_columns) diff --git a/axcell/data/table.py b/axcell/data/table.py new file mode 100644 index 0000000..6480503 --- /dev/null +++ b/axcell/data/table.py @@ -0,0 +1,359 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +import pandas as pd +import numpy as np +import json +from pathlib import Path +import re +from dataclasses import dataclass, field +from typing import List +from ..helpers.jupyter import display_html, table_to_html +from copy import deepcopy + + +@dataclass +class Cell: + value: str + raw_value: str + gold_tags: str = '' + refs: List[str] = field(default_factory=list) + layout: str = '' + + +reference_re = re.compile(r"(.*?)") +num_re = re.compile(r"^\d+$") + + +def extract_references(s): + parts = reference_re.split(s) + refs = [r.replace('-', '') for r in parts[1::3]] + text = [] + for i, x in enumerate(parts): + if i % 3 == 0: + text.append(x) + elif i % 3 == 2: + s = x.strip() + if num_re.match(s): + text.append(s) + else: + text.append(f"[{s}]") + text = ''.join(text) + return text, refs + + +empty_paren_re = re.compile(r"\(\s*\)|\[\s*\]") +def remove_references(s): + s = reference_re.sub("", s) + return empty_paren_re.sub("", s) + + +style_tags_re = re.compile(r"") +def remove_text_styles(s): + return style_tags_re.sub("", s) + + +reference_id_re = re.compile(r"") +def raw_value_to_html(s): + s = style_tags_re.sub(lambda x: "" if x[0].startswith("', s) + s = s.replace("", "") + s = reference_id_re.sub(r'', s) + return s + + +def str2cell(s): + value, refs = extract_references(s) + value = remove_text_styles(value) + value = unidecode(value) + return Cell(value=value, raw_value=s, refs=refs) + +def read_str_csv(filename): + try: + df = pd.read_csv(filename, header=None, dtype=str, keep_default_na=False) + except pd.errors.EmptyDataError: + df = pd.DataFrame() + return df + + +class CellDataFrame(pd.DataFrame): + """We subclass pandas DataFrame in order to make deepcopy recursively copy cells""" + def __deepcopy__(self, memodict={}): + return CellDataFrame(self.applymap(lambda cell: deepcopy(cell, memodict))) + + +class Table: + def __init__(self, name, df, layout, caption=None, figure_id=None, annotations=None, migrate=False, 
old_name=None, guessed_tags=None): + self.name = name + self.caption = caption + self.figure_id = figure_id + self.df = CellDataFrame(df.applymap(str2cell)) + + if migrate: + self.old_name = old_name + + if layout is not None: + self.set_layout(layout) + + self._set_annotations(annotations, migrate=migrate, old_name=old_name, guessed_tags=guessed_tags) + + def _set_annotations(self, annotations, migrate=False, old_name=None, guessed_tags=None): + if annotations is not None: + self.gold_tags = annotations.gold_tags.strip() + self.dataset_text = annotations.dataset_text.strip() + self.notes = annotations.notes.strip() + + sota_records = json.loads(annotations.cells_sota_records) + + if guessed_tags is not None: + tags = guessed_tags.values + else: + tags = annotations.matrix_gold_tags + gt_rows = len(tags) + if gt_rows == 0 and len(self.df) > 0: + print(f"Gold tags size mismatch: 0 vs {len(self.df)} in old name {old_name}") + if migrate: + self.old_name = None + elif gt_rows > 0: + gt_cols = len(tags[0]) + if self.df.shape != (0,0) and self.df.shape == (gt_rows, gt_cols): + self.set_tags(tags) + else: + print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}") + # print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}") + # print(annotations.matrix_gold_tags) + # print(self.df.applymap(lambda c:c.value)) + if migrate: + self.old_name = None + else: + self.gold_tags = '' + self.dataset_text = '' + self.notes = '' + sota_records = {} + + sota_records = pd.DataFrame(sota_records.values(), index=sota_records.keys(), + columns=['task', 'dataset', 'metric', 'format', 'model', 'value']) + sota_records.index = self.name + "/" + sota_records.index + sota_records.index.rename("cell_ext_id", inplace=True) + sota_records.rename(columns={"value": "raw_value"}, inplace=True) + + self.sota_records = sota_records.replace("", np.nan).dropna(subset=["model", "metric", "task", "dataset"]) + + + def set_layout(self, layout): + for r, row in 
layout.iterrows(): + for c, cell in enumerate(row): + self.df.iloc[r, c].layout = cell + + def set_tags(self, tags): + for r, row in enumerate(tags): + for c, cell in enumerate(row): + # todo: change gold_tags to tags to avoid confusion + self.df.iloc[r,c].gold_tags = cell.strip() + + @property + def shape(self): + return self.df.shape + + @property + def matrix(self): + return self.df.applymap(lambda x: x.value) + + @property + def matrix_html(self): + return self.df.applymap(lambda x: raw_value_to_html(x.raw_value)) + + @property + def matrix_layout(self): + return self.df.applymap(lambda x: x.layout) + + @property + def matrix_gold_tags(self): + return self.df.applymap(lambda x: x.gold_tags) + + # todo: remove gold_tags + @property + def matrix_tags(self): + return self.matrix_gold_tags + + @classmethod + def from_file(cls, path, metadata, annotations=None, migrate=False, match_name=None, guessed_tags=None): + path = Path(path) + filename = path / metadata['filename'] + df = read_str_csv(filename) + if 'layout' in metadata: + layout = read_str_csv(path / metadata['layout']) + else: + layout = None + if annotations is not None: + if not migrate: + # TODO: remove parser after migration is fully finished + table_ann = annotations.table_set.filter(name=metadata['filename'], parser="latexml") + [None] + table_ann = table_ann[0] + elif match_name is not None: + table_ann = annotations.table_set.filter(name=match_name) + [None] + table_ann = table_ann[0] + else: + table_ann = None + else: + table_ann = None + return cls(metadata['filename'], df, layout, metadata.get('caption'), metadata.get('figure_id'), table_ann, migrate, match_name, guessed_tags) + + def _repr_html_(self): + return table_to_html(self.matrix_html.values, self.matrix_tags.values, self.matrix_layout.values) + + def display(self): + display_html(self._repr_html_()) + + def _save_df(self, df, filename): + df.to_csv(filename, header=None, index=None) + + def save(self, path, table_name, layout_name): + 
#####
# this code is used to migrate table annotations from
# tables parsed by htlatex to tables parsed by
# latexml. Once all annotated tables have been successfully
# migrated, we will switch back to matching by name
row_context = np.stack([cells[1:-1, s] for s in slices], axis=-1) + col_context = np.stack([cells[s, 1:-1] for s in slices], axis=-1) + box_context = np.stack([cells[s1, s2] for s1 in slices for s2 in slices], axis=-1) + return box_context, row_context, col_context, cell_context[...,None] + +def map_context(context, values): + ctx_len = context.shape[-1] + mapping = {} + for ctx, val in zip(context.reshape((-1, ctx_len)), values.reshape(-1)): + mapping.setdefault(tuple(ctx), set()).add(val) + return mapping + +REANNOTATE_TAG = 'reannotate' + +def guess_annotations(old_table, gold_tags, new_table): + df = pd.DataFrame().reindex_like(new_table).fillna(REANNOTATE_TAG) + if old_table.empty: + return 0, df + old_contexts = create_cell_contexts(old_table) + old_mappings = [map_context(ctx, gold_tags.values) for ctx in old_contexts] + new_contexts = create_cell_contexts(new_table) + + rows, cols = new_table.shape + matched = 0 + for row in range(rows): + for col in range(cols): + for mapping, context in zip(old_mappings, new_contexts): + ctx = tuple(context[row, col]) + values = mapping.get(ctx, set()) + if len(values) == 1: + (val,) = values + df.iloc[row, col] = val + matched += 1 + break + return matched, df + +# end of guess annotations mapping + + +def same_table(old_table, new_table): + return old_table.equals(new_table) + +DEB_PAPER="1607.00036v2" + +def deb(path, old_name, old_table, new_name, new_table): + if path.name == DEB_PAPER and old_name == "table_02.csv" == new_name: + print(old_table) + print(new_table) + +def _match_tables_by_content(path, annotations, metadata): + if annotations is None: + return {}, {} + old_tables = {x.name: (pd.DataFrame(x.matrix).applymap(normalize_cell), pd.DataFrame(x.matrix_gold_tags)) for x in annotations.table_set} + new_tables = {m['filename']: Table.from_file(path, m, None, None).df.applymap(lambda c: normalize_cell(c.value)) for m in metadata} + matched = {} + new_tags = {} + for new_name, new_table in new_tables.items(): + 
max_hits = 0 + matched_name = None + size = np.prod(new_table.shape) + guessed_tags = None + for old_name, (old_table, gold_tags) in old_tables.items(): + hits, tags = guess_annotations(old_table, gold_tags, new_table) + if hits > max_hits: + max_hits = hits + matched_name = old_name + guessed_tags = tags + if max_hits > size / 2: + matched[new_name] = matched_name + new_tags[new_name] = guessed_tags + #deb(path, old_name, old_table, new_name, new_table) + #if same_table(old_table, new_table): + # if new_name in matched: + # print(f"Multiple matches for {path}/{new_name}: {matched[new_name]}, {old_name}") + # else: + # matched[new_name] = old_name + return matched, new_tags +#### + +def read_tables(path, annotations, migrate=False): + path = Path(path) + with open(path / "metadata.json", "r") as f: + metadata = json.load(f) + + if migrate: + _matched_names_by_captions = {} #_match_tables_by_captions(annotations, metadata) + _matched_names_by_content, _guessed_tags = _match_tables_by_content(path, annotations, metadata) + _matched_names = _matched_names_by_captions + for new_name, old_name in _matched_names_by_content.items(): + if new_name in _matched_names and _matched_names[new_name] != old_name: + print(f"Multiple matches for table {path}/{new_name}: {_matched_names[new_name]} by caption and {old_name} by content") + else: + _matched_names[new_name] = old_name + else: + _matched_names = {} + _guessed_tags = {} + return [Table.from_file(path, m, annotations, migrate=migrate, match_name=_matched_names.get(m["filename"]), guessed_tags=_guessed_tags.get(m["filename"])) for m in metadata] diff --git a/axcell/errors.py b/axcell/errors.py new file mode 100644 index 0000000..bcfa3ea --- /dev/null +++ b/axcell/errors.py @@ -0,0 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
# these functions are used to cache various results
# of the corresponding pipeline steps, to make it faster to
# rerun the pipeline or to run it on a batch of papers with
# various steps on different machines. The exchange formats are
# ad hoc and can be changed.
+ + +def _load_json(path): + with Path(path).open('rt') as f: + return json.load(f) + + +def _save_json(obj, path): + with Path(path).open('wt') as f: + json.dump(obj, f) + + +def load_references(path): + return _load_json(path) + + +def save_references(references, path): + _save_json(references, path) + + +def load_tags(path): + return _load_json(path) + + +def save_tags(tags, path): + _save_json(tags, path) + + +def load_structure(path): + return _load_json(path) + + +def save_structure(structure, path): + _save_json(structure, path) + + +def load_proposals(path): + dtypes = defaultdict(lambda: str) + dtypes['confidence'] = float + dtypes['parsed'] = float + + na_values = {'confidence': '', 'parsed': ''} + proposals = pd.read_csv(path, index_col=0, dtype=dtypes, na_values=na_values, keep_default_na=False) + return proposals + + +def save_proposals(proposals, path): + proposals.to_csv(path) diff --git a/axcell/helpers/datasets.py b/axcell/helpers/datasets.py new file mode 100644 index 0000000..9e89e56 --- /dev/null +++ b/axcell/helpers/datasets.py @@ -0,0 +1,11 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import pandas as pd + + +def read_arxiv_papers(path): + return pd.read_csv(path) + + +def read_tables_annotations(path): + return pd.read_json(path) diff --git a/axcell/helpers/evaluate.py b/axcell/helpers/evaluate.py new file mode 100644 index 0000000..738c620 --- /dev/null +++ b/axcell/helpers/evaluate.py @@ -0,0 +1,99 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import re +import pandas as pd + +from axcell.data.paper_collection import remove_arxiv_version + + +def norm_score_str(x): + x = str(x) + if re.match('^(\+|-|)(\d+)\.9{5,}$', x): + x = re.sub('^(\+|-|)(\d+)\.9{5,}$', lambda a: a.group(1)+str(int(a.group(2))+1), x) + elif x.endswith('9' * 5) and '.' in x: + x = re.sub(r'([0-8])9+$', lambda a: str(int(a.group(1))+1), x) + if '.' 
in x: + x = re.sub(r'0+$', '', x) + if x[-1] == '.': + x = x[:-1] + if x == '-0': + x = '0' + return x + + +epsilon = 1e-10 + + +def precision(tp, fp): + pred_positives = tp + fp + epsilon + return ((1.0 * tp) / pred_positives) + + +def recall(tp, fn): + true_positives = tp + fn + epsilon + return ((1.0 * tp) / true_positives) + + +def f1(prec, recall): + norm = prec + recall + epsilon + return (2 * prec * recall / norm) + + +def stats(predictions, ground_truth, axis=None): + gold = pd.DataFrame(ground_truth, columns=["paper", "task", "dataset", "metric", "value"]) + pred = pd.DataFrame(predictions, columns=["paper", "task", "dataset", "metric", "value"]) + + if axis == 'tdm': + columns = ['paper', 'task', 'dataset', 'metric'] + elif axis == 'tdms' or axis is None: + columns = ['paper', 'task', 'dataset', 'metric', 'value'] + else: + columns = ['paper', axis] + gold = gold[columns].drop_duplicates() + pred = pred[columns].drop_duplicates() + + results = gold.merge(pred, on=columns, how="outer", indicator=True) + + is_correct = results["_merge"] == "both" + no_pred = results["_merge"] == "left_only" + no_gold = results["_merge"] == "right_only" + + results["TP"] = is_correct.astype('int8') + results["FP"] = no_gold.astype('int8') + results["FN"] = no_pred.astype('int8') + + m = results.groupby(["paper"]).agg({"TP": "sum", "FP": "sum", "FN": "sum"}) + m["precision"] = precision(m.TP, m.FP) + m["recall"] = recall(m.TP, m.FN) + m["f1"] = f1(m.precision, m.recall) + + TP_ALL = m.TP.sum() + FP_ALL = m.FP.sum() + FN_ALL = m.FN.sum() + + prec, reca = precision(TP_ALL, FP_ALL), recall(TP_ALL, FN_ALL) + return { + 'Micro Precision': prec, + 'Micro Recall': reca, + 'Micro F1': f1(prec, reca), + 'Macro Precision': m.precision.mean(), + 'Macro Recall': m.recall.mean(), + 'Macro F1': m.f1.mean() + } + + +def evaluate(predictions, ground_truth): + predictions = predictions.copy() + ground_truth = ground_truth.copy() + predictions['value'] = predictions['score' if 'score' in 
predictions else 'value'].apply(norm_score_str) + ground_truth['value'] = ground_truth['score' if 'score' in ground_truth else 'value'].apply(norm_score_str) + predictions['paper'] = predictions['arxiv_id'].apply(remove_arxiv_version) + ground_truth['paper'] = ground_truth['arxiv_id'].apply(remove_arxiv_version) + + metrics = [] + for axis in [None, "tdm", "task", "dataset", "metric"]: + s = stats(predictions, ground_truth, axis) + s['type'] = {'tdms': 'TDMS', 'tdm': 'TDM', 'task': 'Task', 'dataset': 'Dataset', 'metric': 'Metric'}.get(axis) + metrics.append(s) + columns = ['Micro Precision', 'Micro Recall', 'Micro F1', 'Macro Precision', 'Macro Recall', 'Macro F1'] + return pd.DataFrame(metrics, columns=columns) diff --git a/axcell/helpers/explainers.py b/axcell/helpers/explainers.py new file mode 100644 index 0000000..a88636a --- /dev/null +++ b/axcell/helpers/explainers.py @@ -0,0 +1,218 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from axcell.models.linking.metrics import Metrics +from ..models.structure import TableType +from ..loggers import StructurePredictionEvaluator, LinkerEvaluator, FilteringEvaluator +import pandas as pd +import numpy as np +from ..helpers.jupyter import table_to_html +from axcell.models.linking.format import extract_value +from axcell.helpers.optimize import optimize_filters + + +class Reason: + pass + + +class IrrelevantTable(Reason): + def __init__(self, paper, table, table_type, probs): + self.paper = paper + self.table = table + self.table_type = table_type + self.probs = pd.DataFrame(probs, columns=["type", "probability"]) + + def __str__(self): + return f"Table {self.table.name} was labelled as {self.table_type.name}." + + def _repr_html_(self): + prediction = f'
{self}
' + caption = f'
Caption: {self.table.caption}
' + probs = self.probs.style.format({"probability": "{:.2f}"})._repr_html_() + return prediction + caption + probs + + +class MislabeledCell(Reason): + def __init__(self, paper, table, row, col, probs): + self.paper = paper + self.table = table + + +class TableExplanation: + def __init__(self, paper, table, table_type, proposals, reasons, topk): + self.paper = paper + self.table = table + self.table_type = table_type + self.proposals = proposals + self.reasons = reasons + self.topk = topk + + def _format_tooltip(self, proposal): + return f"dataset: {proposal.dataset}\n" \ + f"metric: {proposal.metric}\n" \ + f"task: {proposal.task}\n" \ + f"score: {proposal.parsed}\n" \ + f"confidence: {proposal.confidence:0.2f}" + + def _format_topk(self, topk): + return "" + + def _repr_html_(self): + matrix = self.table.matrix_html.values + predictions = np.zeros_like(matrix, dtype=object) + tooltips = np.zeros_like(matrix, dtype=object) + for cell_ext_id, proposal in self.proposals.iterrows(): + paper_id, table_name, rc = cell_ext_id.split("/") + row, col = [int(x) for x in rc.split('.')] + if cell_ext_id in self.reasons: + reason = self.reasons[cell_ext_id] + tooltips[row, col] = reason + if reason.startswith("replaced by "): + tooltips[row, col] += "\n\n" + self._format_tooltip(proposal) + elif reason.startswith("confidence "): + tooltips[row, col] += "\n\n" + self._format_topk(self.topk[row, col]) + else: + predictions[row, col] = 'final-proposal' + tooltips[row, col] = self._format_tooltip(proposal) + + table_type_html = f'
Table {self.table.name} was labelled as {self.table_type.name}.
' + caption_html = f'
Caption: {self.table.caption}
' + table_html = table_to_html(matrix, + self.table.matrix_tags.values, + self.table.matrix_layout.values, + predictions, + tooltips) + html = table_type_html + caption_html + table_html + proposals = self.proposals[~self.proposals.index.isin(self.reasons.index)] + if len(proposals): + proposals = proposals[["dataset", "metric", "task", "model", "parsed"]]\ + .reset_index(drop=True).rename(columns={"parsed": "score"}) + html2 = proposals._repr_html_() + return f"
{html}
Proposals
{html2}
" + return html + + +class Explainer: + _sota_record_columns = ['task', 'dataset', 'metric', 'format', 'model', 'model_type', 'raw_value', 'parsed'] + + def __init__(self, pipeline_logger, paper_collection, gold_sota_records=None): + self.paper_collection = paper_collection + self.gold_sota_records = gold_sota_records + self.spe = StructurePredictionEvaluator(pipeline_logger, paper_collection) + self.le = LinkerEvaluator(pipeline_logger) + self.fe = FilteringEvaluator(pipeline_logger) + + def explain(self, paper, cell_ext_id): + paper_id, table_name, rc = cell_ext_id.split('/') + if paper.paper_id != paper_id: + return "No such cell" + + table_type, probs = self.spe.get_table_type_predictions(paper_id, table_name) + + if table_type == TableType.IRRELEVANT: + return IrrelevantTable(paper, paper.table_by_name(table_name), table_type, probs) + + all_proposals = self.le.proposals[paper_id] + reasons = self.fe.reason + table_ext_id = f"{paper_id}/{table_name}" + table_proposals = all_proposals[all_proposals.index.str.startswith(table_ext_id+"/")] + topk = {(row, col): topk for (pid, tn, row, col), topk in self.le.topk.items() + if (pid, tn) == (paper_id, table_name)} + + return TableExplanation(paper, paper.table_by_name(table_name), table_type, table_proposals, reasons, topk) + + row, col = [int(x) for x in rc.split('.')] + + reason = self.fe.reason.get(cell_ext_id) + if reason is None: + pass + else: + return reason + + def _get_table_sota_records(self, table): + + first_model = lambda x: ([a for a in x if a.startswith('model')] + [''])[0] + if len(table.sota_records): + matrix = table.matrix.values + tags = table.matrix_tags + model_type_col = tags.apply(first_model) + model_type_row = tags.T.apply(first_model) + sota_records = table.sota_records.copy() + sota_records['model_type'] = '' + sota_records['raw_value'] = '' + for cell_ext_id, record in sota_records.iterrows(): + name, rc = cell_ext_id.split('/') + row, col = [int(x) for x in rc.split('.')] + 
record.model_type = model_type_col[col] or model_type_row[row] + record.raw_value = matrix[row, col] + + sota_records["parsed"] = sota_records[["raw_value", "format"]].apply( + lambda row: float(extract_value(row.raw_value, row.format)), axis=1) + + sota_records = sota_records[sota_records["parsed"] == sota_records["parsed"]] + + strip_cols = ["task", "dataset", "format", "metric", "raw_value", "model", "model_type"] + sota_records = sota_records.transform( + lambda x: x.str.strip() if x.name in strip_cols else x) + return sota_records[self._sota_record_columns] + else: + empty = pd.DataFrame(columns=self._sota_record_columns) + empty.index.rename("cell_ext_id", inplace=True) + return empty + + def _get_sota_records(self, paper): + if not len(paper.tables): + empty = pd.DataFrame(columns=self._sota_record_columns) + empty.index.rename("cell_ext_id", inplace=True) + return empty + records = [self._get_table_sota_records(table) for table in paper.tables] + records = pd.concat(records) + records.index = paper.paper_id + "/" + records.index + records.index.rename("cell_ext_id", inplace=True) + return records + + def linking_metrics(self, experiment_name="unk", topk_metrics=False, filtered=True, confidence=0.0): + paper_ids = list(self.le.proposals.keys()) + + proposals = pd.concat(self.le.proposals.values()) + + # if not topk_metrics: + if filtered: + proposals = proposals[~proposals.index.isin(self.fe.reason.index)] + if confidence: + proposals = proposals[proposals.confidence > confidence] + + papers = {paper_id: self.paper_collection.get_by_id(paper_id) for paper_id in paper_ids} + missing = [paper_id for paper_id, paper in papers.items() if paper is None] + if missing: + print("Missing papers in paper collection:") + print(", ".join(missing)) + papers = [paper for paper in papers.values() if paper is not None] + + # if not len(papers): + # gold_sota_records = pd.DataFrame(columns=self._sota_record_columns) + # gold_sota_records.index.rename("cell_ext_id", 
inplace=True) + # else: + # gold_sota_records = pd.concat([self._get_sota_records(paper) for paper in papers]) + if self.gold_sota_records is None: + gold_sota_records = pd.DataFrame(columns=self._sota_record_columns) + gold_sota_records.index.rename("cell_ext_id", inplace=True) + else: + + gold_sota_records = self.gold_sota_records + which = gold_sota_records.index.to_series().str.split("/", expand=True)[0]\ + .isin([paper.paper_id for paper in papers]) + gold_sota_records = gold_sota_records[which] + + df = gold_sota_records.merge(proposals, 'outer', left_index=True, right_index=True, suffixes=['_gold', '_pred']) + df = df.reindex(sorted(df.columns), axis=1) + df = df.fillna('not-present') + if "experiment_name" in df.columns: + del df["experiment_name"] + + metrics = Metrics(df, experiment_name=experiment_name, topk_metrics=topk_metrics) + return metrics + + + def optimize_filters(self, metrics_info): + results = optimize_filters(self, metrics_info) + return results diff --git a/axcell/helpers/interpret.py b/axcell/helpers/interpret.py new file mode 100644 index 0000000..396ebe8 --- /dev/null +++ b/axcell/helpers/interpret.py @@ -0,0 +1,61 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from fastai.text.interpret import TextClassificationInterpretation as AbsTextClassificationInterpretation, _eval_dropouts +from fastai.basic_data import DatasetType +import torch + + +__all__ = ["TextClassificationInterpretation", "TextMultiClassificationInterpretation"] + + +class TextClassificationInterpretation(AbsTextClassificationInterpretation): + @classmethod + def from_learner(cls, learner): + empty_preds = torch.Tensor([[1]]) + return cls(learner, empty_preds, None, None) + + def intrinsic_attention(self, text:str, class_id:int=None): + """Calculate the intrinsic attention of the input w.r.t to an output `class_id`, or the classification given by the model if `None`. 
+ Similar as in base class, but does not apply abs() before summing gradients. + """ + self.model.train() + _eval_dropouts(self.model) + self.model.zero_grad() + self.model.reset() + ids = self.data.one_item(text)[0] + emb = self.model[0].module.encoder(ids).detach().requires_grad_(True) + lstm_output = self.model[0].module(emb, from_embeddings=True) + self.model.eval() + cl = self.model[1](lstm_output + (torch.zeros_like(ids).byte(),))[0].softmax(dim=-1) + if class_id is None: class_id = cl.argmax() + cl[0][class_id].backward() + # attn = emb.grad.squeeze().abs().sum(dim=-1) + # attn /= attn.max() + attn = emb.grad.squeeze().sum(dim=-1) + attn = attn / attn.abs().max() * 0.5 + 0.5 + tokens = self.data.single_ds.reconstruct(ids[0]) + return tokens, attn + + +class TextMultiClassificationInterpretation(TextClassificationInterpretation): + def intrinsic_attention(self, text:str, class_id:int=None): + """Calculate the intrinsic attention of the input w.r.t to an output `class_id`, or the classification given by the model if `None`. + Similar as in base class, but uses sigmoid instead of softmax and does not apply abs() before summing gradients. 
+ """ + self.model.train() + _eval_dropouts(self.model) + self.model.zero_grad() + self.model.reset() + ids = self.data.one_item(text)[0] + emb = self.model[0].module.encoder(ids).detach().requires_grad_(True) + lstm_output = self.model[0].module(emb, from_embeddings=True) + self.model.eval() + cl = self.model[1](lstm_output + (torch.zeros_like(ids).byte(),))[0].sigmoid() + if class_id is None: class_id = cl.argmax() + cl[0][class_id].backward() + # attn = emb.grad.squeeze().abs().sum(dim=-1) + # attn /= attn.max() + attn = emb.grad.squeeze().sum(dim=-1) + attn = attn / attn.abs().max() * 0.5 + 0.5 + tokens = self.data.single_ds.reconstruct(ids[0]) + return tokens, attn diff --git a/sota_extractor2/helpers/jupyter.py b/axcell/helpers/jupyter.py similarity index 51% rename from sota_extractor2/helpers/jupyter.py rename to axcell/helpers/jupyter.py index 29d3e81..6409a09 100644 --- a/sota_extractor2/helpers/jupyter.py +++ b/axcell/helpers/jupyter.py @@ -1,5 +1,10 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + from IPython.core.display import display, HTML from .table_style import table_style +import numpy as np + + def set_seed(seed, name): import torch import numpy as np @@ -9,11 +14,11 @@ def set_seed(seed, name): torch.backends.cudnn.benchmark = False np.random.seed(seed) -def display_html(s): return display(HTML(s)) +def display_html(s): return display(HTML(s)) -def display_table(table, structure=None): +def table_to_html(table, structure=None, layout=None, predictions=None, tooltips=None): """ matrix - 2d ndarray with cell values structure - 2d ndarray with structure annotation @@ -23,15 +28,23 @@ def display_table(table, structure=None): else: matrix = table if structure is None: structure = table.matrix_gold_tags + if layout is None: layout = np.zeros_like(matrix, dtype=str) + if predictions is None: predictions = np.zeros_like(matrix, dtype=str) + if tooltips is None: tooltips = np.zeros_like(matrix, dtype=str) html = [] html.append(table_style) html.append('
') html.append("") - for row,struc_row in zip(matrix, structure): + for row,struc_row, layout_row, preds_row, tt_row in zip(matrix, structure, layout, predictions, tooltips): html.append("") - for cell,struct in zip(row,struc_row): - html.append(f'') + for cell,struct,layout,preds, tt in zip(row,struc_row,layout_row,preds_row, tt_row): + html.append(f'') html.append("") html.append("
{cell}{cell}
") html.append('
') - display_html("\n".join(html)) + return "\n".join(html) + + +def display_table(table, structure=None, layout=None): + html = table_to_html(table, structure, layout) + display_html(html) diff --git a/axcell/helpers/latex_converter.py b/axcell/helpers/latex_converter.py new file mode 100644 index 0000000..6eed27e --- /dev/null +++ b/axcell/helpers/latex_converter.py @@ -0,0 +1,87 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import docker +from docker.errors import ContainerError, ImageNotFound +from pathlib import Path +from tempfile import TemporaryDirectory +from bs4 import BeautifulSoup + +from axcell.errors import LatexConversionError + + +def ro_bind(path): return dict(bind=path, mode='ro') + + +def rw_bind(path): return dict(bind=path, mode='rw') + + +# magic constant used in latex2html.sh to signal that +# conversion failed on LaTeXML step +MAGIC_EXIT_ERROR = 117 + + +class LatexConverter: + def __init__(self, scripts_path=None): + # pull arxivvanity/engrafo image + self.client = docker.from_env() + if scripts_path is None: + self._scripts_path =\ + Path(__file__).resolve().parent.parent / 'scripts' + else: + self._scripts_path = Path(scripts_path) + + def latex2html(self, source_dir, output_dir, use_named_volumes=False): + base = self._scripts_path + source_dir = Path(source_dir) + output_dir = Path(output_dir) + scriptname = "/files/latex2html.sh" + filename = "index.html" + + volumes = { + base / "latex2html.sh": ro_bind("/files/latex2html.sh"), + base / "guess_main.py": ro_bind("/files/guess_main.py"), # todo: run guess_main outside of docker + base / "patches": ro_bind("/files/patches") # todo: see which patches can be dropped + } + + # In case of fully dockerized pipeline we use named volumes to share files between the steps. + # This, however, requires as to mount specific volumes with all papers, not only the currently processed one. 
+ # (see https://github.com/moby/moby/issues/32582) + if use_named_volumes: + volumes.update({ + "pwc_unpacked_sources": ro_bind("/data/arxiv/unpacked_sources"), + "pwc_htmls": rw_bind("/data/arxiv/htmls") + }) + command = [scriptname, filename, str(source_dir), str(output_dir)] + else: + volumes.update({ + source_dir.resolve(): ro_bind("/files/ro-source"), + output_dir.resolve(): rw_bind("/files/htmls") + }) + command = [scriptname, filename] + + output_dir.mkdir(parents=True, exist_ok=True) + + try: + self.client.containers.run("arxivvanity/engrafo:b3db888fefa118eacf4f13566204b68ce100b3a6", command, remove=True, volumes=volumes) + except ContainerError as err: + if err.exit_status == MAGIC_EXIT_ERROR: + raise LatexConversionError("LaTeXML was unable to convert source code of this paper") + if "Unable to find any suitable tex file" in err.stderr.decode('utf-8'): + raise LatexConversionError("Unable to find any suitable tex file") + raise + + # todo: check for errors + def clean_html(self, path): + path = Path(path) + with path.open("rb") as file: + soup = BeautifulSoup(file, "html5lib") + return str(soup) + + def to_html(self, source_dir): + with TemporaryDirectory() as output_dir: + output_dir = Path(output_dir) + try: + self.latex2html(source_dir, output_dir) + return self.clean_html(output_dir / "index.html") + except ContainerError as err: + raise LatexConversionError from err diff --git a/axcell/helpers/optimize.py b/axcell/helpers/optimize.py new file mode 100644 index 0000000..56c564a --- /dev/null +++ b/axcell/helpers/optimize.py @@ -0,0 +1,283 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +import pandas as pd, numpy as np +from dataclasses import dataclass, replace +from axcell.models.linking.metrics import CM +from matplotlib import pyplot as plt +import matplotlib.tri as tri + + +def annotations(matrix, structure, r, c, type='model'): + ann = [] + for nc in range(0, c): + if type in structure[r, nc]: + ann.append(matrix[r, nc]) + for nr in range(0, r): + if type in structure[nr, c]: + ann.append(matrix[nr, c]) + return ' '.join(ann) + + +def estimate_noises(extracted_values, gold_values, short_forms): + if not len(extracted_values): + return {} + extracted_values = set(extracted_values) + gold_values = set(gold_values) + + return {gold: 1 - len(extracted_values & set(short_forms.get(gold, set()))) / len(extracted_values) for gold in + gold_values} + + +def estimate_context_noise(context, records): + context = context or "" + abbrvs = context_search.extract_acronyms(context) + context = normalize_cell_ws(normalize_dataset(context)) + dss = set(cs.find_datasets(context)) | set(abbrvs.keys()) + mss = set(cs.find_metrics(context)) + dss -= mss + dss = set([normalize_cell(ds) for ds in dss]) + mss = set([normalize_cell(ms) for ms in mss]) + + gold_ds = set(records.dataset.values) + gold_ms = set(records.metric.values) + ds_noises = estimate_noises(dss, gold_ds, cs.datasets) + ms_noises = estimate_noises(mss, gold_ms, cs.metrics) + + return ds_noises, ms_noises + + +def estimate_paper_context_noise(paper, gold_sota_records): + records = gold_sota_records[gold_sota_records.paper_id == paper.paper_id] + datasets = de.from_paper(paper) + context = " ".join(datasets) + return estimate_context_noise(context, records) + + +def estimate_caption_context_noise(paper, table, gold_sota_records): + table_ext_id = f"{paper.paper_id}/{table.name}/" + records = gold_sota_records[gold_sota_records.index.str.startswith(table_ext_id)] + return estimate_context_noise(table.caption, records) + + +def estimate_cell_context_noise(paper, table, row, col, 
gold_sota_records): + cell_ext_id = f"{paper.paper_id}/{table.name}/{row}.{col}" + records = gold_sota_records[gold_sota_records.index == cell_ext_id] + value = annotations(table.matrix.values, table.matrix_gold_tags.values, row, col, 'dataset') + return estimate_context_noise(value, records) + + +def average_dicts(dicts): + sums = {} + for d in dicts: + for k, v in d.items(): + sums.setdefault(k, []).append(v) + return {k: np.mean(v) for k, v in sums.items()} + + +def all_equal(row): + cols = ["model_type", "dataset", "metric", "task", "parsed"] + return np.all([row[f"{name}_pred"] == row[f"{name}_gold"] for name in cols]) + + +def merge_gold_records(explainer): + paper_ids = list(explainer.le.proposals.keys()) + + proposals = pd.concat(explainer.le.proposals.values()) + + papers = {paper_id: explainer.paper_collection.get_by_id(paper_id) for paper_id in paper_ids} + missing = [paper_id for paper_id, paper in papers.items() if paper is None] + if missing: + print("Missing papers in paper collection:") + print(", ".join(missing)) + papers = [paper for paper in papers.values() if paper is not None] + + if explainer.gold_sota_records is None: + print("gold_sota_records is missing") + return + else: + gold_sota_records = explainer.gold_sota_records + which = gold_sota_records.index.to_series().str.split("/", expand=True)[0] \ + .isin([paper.paper_id for paper in papers]) + gold_sota_records = gold_sota_records[which] + + df = gold_sota_records.merge(proposals, 'outer', left_index=True, right_index=True, suffixes=['_gold', '_pred']) + df = df.reindex(sorted(df.columns), axis=1) + df.confidence = df.confidence.fillna(0.0) + df = df.fillna('not-present') + df["equal"] = df.apply(all_equal, axis=1) + df["pred_positive"] = df["model_type_pred"].str.contains("model-best") + df["gold_positive"] = df["model_type_gold"].str.contains("model-best") + return df + + +def find_threshold_intervals(proposals, metrics_info, context="paper"): + # maximal threshold to have this proposal 
returned + proposals["max_threshold"] = proposals.confidence + + proposals["min_threshold"] = 0.0 + + ignore = (proposals.model_type_pred != 'model-best') | (proposals.struct_model_type == '') | \ + (proposals.struct_dataset.str.contains('dev')) | (proposals.struct_dataset.str.contains('train')) + + # this proposal won't be ever returned due to structure or model type filters + proposals.loc[ignore, "min_threshold"] = 1.0 + proposals.loc[ignore, "max_threshold"] = 0.0 + + all_proposals = proposals + proposals = proposals[~ignore] + + if context == "paper": + context_column = proposals.index.to_series().str.split('/', expand=False).apply(lambda x: x[0]) + else: + context_column = proposals.index.to_series().str.split('/', expand=False).apply(lambda x: x[0] + "/" + x[1]) + + for i, p in proposals.iterrows(): + key = (p.task_pred, p.dataset_pred, p.metric_pred) + proposals_context = proposals[context_column == context_column[p.name]] + proposals_context = proposals_context[~proposals_context.parsed_pred.isna()] + proposals_context = proposals_context[ + (proposals_context.task_pred == p.task_pred) & + (proposals_context.dataset_pred == p.dataset_pred) & + (proposals_context.metric_pred == p.metric_pred) + ] + d = 0 + if key in metrics_info: + d = metrics_info[key] + elif p.metric_pred in metrics_info: + d = metrics_info[p.metric_pred] + elif 'error' in p.metric_pred.lower(): + d = -1 + elif 'accuracy' in p.metric_pred.lower(): + d = 1 + + if d >= 0: + d = 1 + else: + d = -1 + + # the minimal threshold above which all superior results are ignored + which = d * proposals_context.parsed_pred > d * p.parsed_pred + if np.any(which.values): + all_proposals.at[i, "min_threshold"] = proposals_context[which].confidence.values.max() + else: + which = proposals_context[proposals_context.parsed_pred == p.parsed_pred].iloc[0] + if which.name != p.name: + all_proposals.at[i, "min_threshold"] = which.confidence + + return all_proposals + + +def update_cm(proposal, cm, is_activated): 
+ d = 1 if is_activated else -1 + if proposal.equal and proposal.pred_positive and proposal.gold_positive: + cm = replace(cm, tp=cm.tp + d, fn=cm.fn - d) + if proposal.equal and not proposal.pred_positive and not proposal.gold_positive: + cm = replace(cm, tn=cm.tn + d) + if proposal.pred_positive and (not proposal.equal or not proposal.gold_positive): + cm = replace(cm, fp=cm.fp + d) + # if proposal.gold_positive and (not proposal.equal or not proposal.pred_positive): + # cm = replace(cm, fn = cm.fn+d) + return cm + + +def sweep_thresholds(df): + cm = CM(fn=sum(df.gold_positive)) + df = df[df.min_threshold < df.max_threshold] + + sweeps = df.reset_index().melt(id_vars="cell_ext_id", value_vars=["min_threshold", "max_threshold"], + var_name="threshold_type", value_name="threshold") + + sweeps = sweeps.sort_values(by=["threshold", "threshold_type"]).reset_index(drop=True) + + steps = sweeps.threshold.drop_duplicates().index + + results = [] + for i, idx1 in enumerate(steps[:-1]): + th1 = sweeps.threshold[idx1] + + to_restore = cm + for j, idx2 in enumerate(steps[i + 1:], i + 1): + th2 = sweeps.threshold[idx2] + precision = cm.tp / (cm.tp + cm.fp + 1e-8) + recall = cm.tp / (cm.tp + cm.fn + 1e-8) + f1 = 2 * precision * recall / (precision + recall + 1e-8) + + result = dict(threshold1=th1, threshold2=sweeps.threshold[idx2 - 1], tp=cm.tp, tn=cm.tn, fp=cm.fp, fn=cm.fn, + precision=precision, recall=recall, f1=f1) + results.append(result) + for _, row in sweeps[sweeps.threshold == sweeps.threshold[idx2 - 1]].iterrows(): + proposal = df.loc[row.cell_ext_id] + is_activated = row.threshold_type == 'min_threshold' + if not is_activated and proposal.min_threshold < th1: + cm = update_cm(proposal, cm, is_activated) + + precision = cm.tp / (cm.tp + cm.fp + 1e-8) + recall = cm.tp / (cm.tp + cm.fn + 1e-8) + f1 = 2 * precision * recall / (precision + recall + 1e-8) + + result = dict(threshold1=th1, threshold2=th2, tp=cm.tp, tn=cm.tn, fp=cm.fp, fn=cm.fn, + precision=precision, 
recall=recall, f1=f1) + results.append(result) + + cm = to_restore + + for _, row in sweeps[sweeps.threshold == th1].iterrows(): + proposal = df.loc[row.cell_ext_id] + + is_activated = row.threshold_type == 'min_threshold' + cm = update_cm(proposal, cm, is_activated) + + return df, sweeps, steps, pd.DataFrame(results) + + +class PRResults: + def __init__(self, results): + self.results = results + + def plot(self): + plt.figure(figsize=(6, 6)) + plt.plot(self.results["precision"], self.results["recall"], '.') + plt.xlabel("precision") + plt.ylabel("recall") + + def _best(self, results, metric): + b = results.loc[results[metric].idxmax()] + x = ["precision", "recall", "f1"] + x.remove(metric) + y = [b[m] for m in x] + print(f"Best {metric}={b[metric]:0.2f} (with {x[0]}={y[0]:.2f} and {x[1]}={y[1]:.2f})" + f" is achieved with threshold1={b.threshold1} and threshold2={b.threshold2}") + + def best(self, min_precision=0, min_recall=0, min_f1=0): + results = self.results + results = results[ + (results.precision >= min_precision) & + (results.recall >= min_recall) & + (results.f1 >= min_f1) + ] + if not len(results): + print("No results with this criteria") + else: + self._best(results, "precision") + self._best(results, "recall") + self._best(results, "f1") + + def threshold_map(self, metric): + lin = np.linspace(0, 1, 64) + + triang = tri.Triangulation(self.results.threshold1.values, self.results.threshold2.values) + interpolator = tri.LinearTriInterpolator(triang, self.results[metric]) + Xi, Yi = np.meshgrid(lin, lin) + zi = interpolator(Xi, Yi) + plt.figure(figsize=(6, 6)) + img = plt.imshow(zi[::-1], extent=[0, 1, 0, 1]) + plt.colorbar(img) + plt.xlabel("threshold1") + plt.ylabel("threshold2") + + +def optimize_filters(explainer, metrics_info): + df = merge_gold_records(explainer) + df = find_threshold_intervals(df, metrics_info, context="paper") + df, sweeps, steps, results = sweep_thresholds(df) + return PRResults(results) diff --git 
a/axcell/helpers/paper_extractor.py b/axcell/helpers/paper_extractor.py new file mode 100644 index 0000000..483a531 --- /dev/null +++ b/axcell/helpers/paper_extractor.py @@ -0,0 +1,56 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from pathlib import Path +from axcell.helpers import LatexConverter, Unpack +from axcell.errors import UnpackError, LatexConversionError +from axcell.data.elastic import Paper as PaperText +import axcell.data.extract_tables as table_extraction + +import re +import warnings + +arxiv_re = re.compile(r"^(?P\d{4}\.\d+(v\d+)?)(\..*)?$") + + +class PaperExtractor: + def __init__(self, root): + self.root = Path(root) + self.unpack = Unpack() + self.latex = LatexConverter() + + def __call__(self, source): + source = Path(source) + + m = arxiv_re.match(source.name) + if not m: + warnings.warn(f'Unable to infer arxiv_id from "{source.name}" filename') + arxiv_id = source.name + else: + arxiv_id = m.group('arxiv_id') + + subpath = source.relative_to(self.root / 'sources').parent / arxiv_id + unpack_path = self.root / 'unpacked_sources' / subpath + try: + self.unpack(source, unpack_path) + except UnpackError as e: + if e.args[0].startswith('The paper has been withdrawn'): + return 'withdrawn' + return 'no-tex' + html_path = self.root / 'htmls' / subpath / 'index.html' + try: + html = self.latex.to_html(unpack_path) + html_path.parent.mkdir(parents=True, exist_ok=True) + html_path.write_text(html, 'utf-8') + except LatexConversionError: + return 'processing-error' + + text_path = self.root / 'papers' / subpath / 'text.json' + doc = PaperText.from_html(html, arxiv_id) + text_path.parent.mkdir(parents=True, exist_ok=True) + text_path.write_text(doc.to_json(), 'utf-8') + + tables_path = self.root / 'papers' / subpath + tables = table_extraction.extract_tables(html) + table_extraction.save_tables(tables, tables_path) + + return 'success' diff --git a/axcell/helpers/precompute_evidences.py 
b/axcell/helpers/precompute_evidences.py new file mode 100644 index 0000000..8e95a1d --- /dev/null +++ b/axcell/helpers/precompute_evidences.py @@ -0,0 +1,45 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from fire import Fire +from pathlib import Path +from axcell.data.paper_collection import PaperCollection +from axcell.data.structure import CellEvidenceExtractor +from elasticsearch_dsl import connections +from tqdm import tqdm +import pandas as pd +from joblib import delayed, Parallel + +class Helper: + def split_pc_pickle(self, path, outdir="pc-parts", parts=8): + outdir = Path(outdir) + outdir.mkdir(parents=True, exist_ok=True) + pc = PaperCollection.from_pickle(path) + step = (len(pc) + parts - 1) // parts + for idx, i in enumerate(range(0, len(pc), step)): + part = PaperCollection(pc[i:i + step]) + part.to_pickle(outdir / f"pc-part-{idx:02}.pkl") + + def _evidences_for_pc(self, path): + path = Path(path) + pc = PaperCollection.from_pickle(path) + cell_evidences = CellEvidenceExtractor() + connections.create_connection(hosts=['10.0.1.145'], timeout=20) + raw_evidences = [] + for paper in tqdm(pc): + raw_evidences.append(cell_evidences(paper, paper.tables, paper_limit=100, corpus_limit=20)) + raw_evidences = pd.concat(raw_evidences) + path = path.with_suffix(".evidences.pkl") + raw_evidences.to_pickle(path) + + def evidences_for_pc(self, pattern="pc-parts/pc-part-??.pkl", jobs=-1): + pickles = sorted(Path(".").glob(pattern)) + Parallel(backend="multiprocessing", n_jobs=jobs)(delayed(self._evidences_for_pc)(path) for path in pickles) + + def merge_evidences(self, output="evidences.pkl", pattern="pc-parts/pc-part-*.evidences.pkl"): + pickles = sorted(Path(".").glob(pattern)) + evidences = [pd.read_pickle(pickle) for pickle in pickles] + evidences = pd.concat(evidences) + evidences.to_pickle(output) + + +if __name__ == "__main__": Fire(Helper()) diff --git a/axcell/helpers/reannotate.py b/axcell/helpers/reannotate.py new file mode 
100644 index 0000000..4f23e6b --- /dev/null +++ b/axcell/helpers/reannotate.py @@ -0,0 +1,61 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import requests +from axcell import config +from axcell.data.paper_collection import _load_annotated_papers + + +def run_graphql_query(query): + request = requests.post(config.graphql_url, json={'query': query}) + if request.status_code == 200: + return request.json() + else: + raise Exception(f"Query error: status code {request.status_code}") + + +def reannotate_paper(paper, annotations): + paper._annotations = annotations + paper.gold_tags = annotations.gold_tags.strip() + for table in paper.tables: + table._set_annotations(annotations.table_set.filter(name=table.name, parser="latexml")[0]) + + +def reannotate_papers(papers, annotations): + for paper in papers: + ann = annotations.get(paper.arxiv_no_version) + if ann is not None: + reannotate_paper(paper, ann) + + +def query_annotations(): + raw = run_graphql_query(""" + query { + allPapers { + edges { + node { + arxivId + goldTags + tableSet { + edges { + node { + name + datasetText + notes + goldTags + matrixGoldTags + cellsSotaRecords + parser + } + } + } + } + } + } + } + """) + return _load_annotated_papers(raw) + + +def reannotate_papers_with_db(papers): + annotations = query_annotations() + reannotate_papers(papers, annotations) diff --git a/axcell/helpers/results_extractor.py b/axcell/helpers/results_extractor.py new file mode 100644 index 0000000..78b651c --- /dev/null +++ b/axcell/helpers/results_extractor.py @@ -0,0 +1,49 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +from axcell.data.structure import CellEvidenceExtractor +from axcell.models.structure import TableType, TableStructurePredictor, TableTypePredictor +from axcell.models.linking import * +from pathlib import Path + + +class ResultsExtractor: + def __init__(self, models_path): + models_path = Path(models_path) + self.cell_evidences = CellEvidenceExtractor() + self.ttp = TableTypePredictor(models_path, "table-type-classifier.pth") + self.tsp = TableStructurePredictor(models_path, "table-structure-classifier.pth") + self.taxonomy = Taxonomy(taxonomy=models_path / "taxonomy.json", metrics_info=models_path / "metrics.json") + + self.evidence_finder = EvidenceFinder(self.taxonomy, abbreviations_path=models_path / "abbreviations.json") + self.context_search = ContextSearch(self.taxonomy, self.evidence_finder) + self.dataset_extractor = DatasetExtractor(self.evidence_finder) + + self.linker = Linker("linking", self.context_search, self.dataset_extractor) + self.filters = StructurePredictionFilter() >> ConfidenceFilter(0.8) >> \ + BestResultFilter(self.taxonomy, context="paper") >> ConfidenceFilter(0.85) + + def __call__(self, paper, tables=None, in_place=False): + if tables is None: + tables = paper.tables + tables_types = self.ttp.predict(paper, tables) + if in_place: + types = { + TableType.SOTA: 'leaderboard', + TableType.ABLATION: 'ablation', + TableType.IRRELEVANT: 'irrelevant' + } + for table, table_type in zip(paper.tables, tables_types): + table.gold_tags = types[table_type] + sota_tables = [ + table for table, table_type in zip(paper.tables, tables_types) + if table_type != TableType.IRRELEVANT + ] + paper.text.save() + evidences = self.cell_evidences(paper, sota_tables) + labeled_tables = self.tsp.label_tables(paper, sota_tables, evidences, in_place=in_place, use_crf=False) + + proposals = self.linker(paper, labeled_tables) + proposals = self.filters(proposals) + proposals = proposals[["dataset", "metric", "task", "model", "parsed"]] \ + 
.reset_index(drop=True).rename(columns={"parsed": "score"}) + return proposals diff --git a/axcell/helpers/table_style.py b/axcell/helpers/table_style.py new file mode 100644 index 0000000..a51a994 --- /dev/null +++ b/axcell/helpers/table_style.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +table_style=""" +""" diff --git a/axcell/helpers/training.py b/axcell/helpers/training.py new file mode 100644 index 0000000..313f4d5 --- /dev/null +++ b/axcell/helpers/training.py @@ -0,0 +1,16 @@ + +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +def set_seed(seed, name, quiet=False, all_gpus=True): + import torch + import numpy as np + import random + if not quiet: + print(f"Setting {name} seed to {seed}") + torch.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + np.random.seed(seed) + random.seed(seed) + if all_gpus: + torch.cuda.manual_seed_all(seed) diff --git a/axcell/helpers/unpack.py b/axcell/helpers/unpack.py new file mode 100644 index 0000000..74107ae --- /dev/null +++ b/axcell/helpers/unpack.py @@ -0,0 +1,39 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +from magic import Magic +import tarfile +import gzip +from pathlib import Path +from shutil import copyfileobj +from axcell.errors import UnpackError +from ..pipeline_logger import pipeline_logger + + +class Unpack: + step = "unpack" + + def __init__(self): + self.magic = Magic(mime=True, uncompress=True) + self.magic_formatted = Magic(mime=False, uncompress=True) + + def __call__(self, source, dest): + pipeline_logger(f"{Unpack.step}::call", source=source, dest=dest) + source = Path(source) + dest = Path(dest) + mime = self.magic.from_file(str(source)) + pipeline_logger(f"{Unpack.step}::detect_mime", source=source, mime=mime) + if mime == 'application/x-tar': + dest.mkdir(parents=True, exist_ok=True) + with tarfile.open(source, "r:*") as tar: + tar.extractall(dest) + elif mime == 'text/x-tex': + dest.mkdir(parents=True, exist_ok=True) + with gzip.open(source, "rb") as src, open(dest / "main.tex", "wb") as dst: + copyfileobj(src, dst) + elif mime == 'application/pdf': + raise UnpackError(f"No LaTeX source code available for this paper, PDF only") + elif mime == 'text/plain' and 'withdrawn' in self.magic_formatted.from_file(str(source)): + raise UnpackError(f"The paper has been withdrawn and there is" + f" no LaTeX source code available") + else: + raise UnpackError(f"Cannot unpack file of type {mime}") diff --git a/axcell/loggers.py b/axcell/loggers.py new file mode 100644 index 0000000..8e30b11 --- /dev/null +++ b/axcell/loggers.py @@ -0,0 +1,194 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +import sys +import pandas as pd +from .models.structure.experiment import Experiment, label_map, Labels +from .models.structure.type_predictor import TableType +from copy import deepcopy +import pickle + + + +class BaseLogger: + def __init__(self, pipeline_logger, pattern=".*"): + pipeline_logger.register(pattern, self) + + def __call__(self, step, **kwargs): + raise NotImplementedError() + + +class StdoutLogger: + def __init__(self, pipeline_logger, file=sys.stdout): + self.file = file + pipeline_logger.register(".*", self) + + def __call__(self, step, **kwargs): + print(f"[STEP] {step}: {kwargs}", file=self.file) + + +class SessionRecorder: + def __init__(self, pipeline_logger): + self.pipeline_logger = pipeline_logger + self.session = [] + self._recording = False + + def __call__(self, step, **kwargs): + self.session.append((step, deepcopy(kwargs))) + + def reset(self): + self.session = [] + + def record(self): + if not self._recording: + self.pipeline_logger.register(".*", self) + self._recording = True + + def stop(self): + if self._recording: + self.pipeline_logger.unregister(".*", self) + self._recording = False + + def replay(self): + self.stop() + for step, kwargs in self.session: + self.pipeline_logger(step, **kwargs) + + def save_session(self, path): + with open(path, "wb") as f: + pickle.dump(self.session, f) + + def load_session(self, path): + with open(path, "rb") as f: + self.session = pickle.load(f) + + +class StructurePredictionEvaluator: + def __init__(self, pipeline_logger, pc): + pipeline_logger.register("structure_prediction::evidences_split", self.on_evidences_split) + pipeline_logger.register("structure_prediction::tables_labeled", self.on_tables_labeled) + pipeline_logger.register("type_prediction::predicted", self.on_type_predicted) + pipeline_logger.register("type_prediction::multiclass_predicted", self.on_type_multiclass_predicted) + self.pc = pc + self.results = {} + self.type_predictions = {} + 
self.type_multiclass_predictions = {} + self.evidences = pd.DataFrame() + + def on_type_multiclass_predicted(self, step, paper, tables, threshold, predictions): + for table, prediction in zip(tables, predictions): + self.type_multiclass_predictions[paper.paper_id, table.name] = { + TableType.SOTA: prediction[0], + TableType.ABLATION: prediction[1], + TableType.IRRELEVANT: threshold + } + + def on_type_predicted(self, step, paper, tables, predictions): + for table, prediction in zip(tables, predictions): + self.type_predictions[paper.paper_id, table.name] = prediction + + def on_evidences_split(self, step, evidences, evidences_num): + self.evidences = pd.concat([self.evidences, evidences]) + + def on_tables_labeled(self, step, paper, labeled_tables): + golds = [p for p in self.pc if p.text.title == paper.text.title] + paper_id = paper.paper_id + type_results = [] + cells_results = [] + labeled_tables = {table.name: table for table in labeled_tables} + if len(golds) == 1: + gold = golds[0] + for gold_table, table, in zip(gold.tables, paper.tables): + table_type = self.type_predictions[paper.paper_id, table.name] + is_important = table_type == TableType.SOTA or table_type == TableType.ABLATION + gold_is_important = "sota" in gold_table.gold_tags or "ablation" in gold_table.gold_tags + type_results.append({"predicted": is_important, "gold": gold_is_important, "name": table.name}) + if not is_important: + continue + table = labeled_tables[table.name] + rows, cols = table.df.shape + for r in range(rows): + for c in range(cols): + cells_results.append({ + "predicted": table.df.iloc[r, c].gold_tags, + "gold": gold_table.df.iloc[r, c].gold_tags, + "ext_id": f"{table.name}/{r}.{c}", + "content": table.df.iloc[r, c].value + }) + + self.results[paper_id] = { + 'type': pd.DataFrame.from_records(type_results), + 'cells': pd.DataFrame.from_records(cells_results) + } + + def map_tags(self, tags): + mapping = dict(label_map) + mapping[""] = Labels.EMPTY.value + return 
class LinkerEvaluator:
    """Captures linking-stage pipeline events: queries, final proposals and
    per-cell top-k taxonomy candidates, keyed for later inspection."""

    def __init__(self, pipeline_logger):
        pipeline_logger.register("linking::call", self.on_before_linking)
        pipeline_logger.register("linking::taxonomy_linking::call", self.on_before_taxonomy)
        pipeline_logger.register("linking::taxonomy_linking::topk", self.on_taxonomy_topk)
        pipeline_logger.register("linking::linked", self.on_after_linking)
        self.proposals = {}
        self.topk = {}
        self.queries = {}

    def on_before_linking(self, step, paper, tables):
        pass

    def on_after_linking(self, step, paper, tables, proposals):
        # Deep copy: the pipeline keeps mutating the frame afterwards.
        self.proposals[paper.paper_id] = proposals.copy(deep=True)

    def on_before_taxonomy(self, step, ext_id, query, paper_context, abstract_context, table_context, caption):
        self.queries[ext_id] = (query, paper_context, abstract_context, table_context, caption)

    def on_taxonomy_topk(self, step, ext_id, topk):
        # ext_id has the form "<paper_id>/<table_name>/<row>.<col>".
        paper_id, table_name, coords = ext_id.split('/')
        row_str, col_str = coords.split('.')
        self.topk[paper_id, table_name, int(row_str), int(col_str)] = topk.copy(deep=True)

    def top_matches(self, paper_id, table_name, row, col):
        return self.topk[(paper_id, table_name, row, col)]
class FilteringEvaluator:
    """Accumulates, per filtering step, the proposals it saw, the boolean
    keep-masks and the textual rejection reasons."""

    def __init__(self, pipeline_logger):
        pipeline_logger.register("filtering::.*::filtered", self.on_filtered)
        self.proposals = {}
        self.which = {}
        # Maps proposal id -> human-readable rejection reason.
        self.reason = pd.Series(dtype=str)

    def on_filtered(self, step, proposals, which, reason, **kwargs):
        # step looks like "filtering::<filter_step>::filtered".
        _, filter_step, _ = step.split('::')
        # compound_filtering only aggregates its sub-filters, so recording it
        # would double-count everything.
        if filter_step != "compound_filtering":
            if filter_step in self.proposals:
                self.proposals[filter_step] = pd.concat([self.proposals[filter_step], proposals])
                self.which[filter_step] = pd.concat([self.which[filter_step], which])
            else:
                self.proposals[filter_step] = proposals
                self.which[filter_step] = which
            # Fix: Series.append was deprecated in pandas 1.4 and removed in
            # pandas 2.0; pd.concat is the supported equivalent.
            self.reason = pd.concat([self.reason, reason])
class AcronymExtractor:
    """Extracts abbreviation -> expansion pairs from text with scispacy's
    AbbreviationDetector, returning normalized short/long forms."""

    def __init__(self):
        self.nlp = spacy.load("en_core_sci_sm")
        abbreviation_pipe = AbbreviationDetector(self.nlp)
        self.nlp.add_pipe(abbreviation_pipe)
        # Only abbreviation detection is needed; drop the heavy components.
        self.nlp.disable_pipes("tagger", "ner", "parser")

    def __call__(self, text):
        doc = self.nlp(text)
        found = {}
        for abrv in doc._.abbreviations:
            short_form = normalize_cell(normalize_dataset(str(abrv)))
            if short_form:
                found[short_form] = normalize_cell(normalize_dataset(str(abrv._.long_form)))
        return found
@dataclass()
class Value:
    # An annotated table-cell fragment: `type` is the structure tag
    # (e.g. 'dataset', 'model-best'), `value` the raw cell text.
    type: str
    value: str

    def __str__(self):
        return self.value


@dataclass()
class Cell:
    # Addresses one table cell: its global ext-id, the table's ext-id, (row, col).
    cell_ext_id: str
    table_ext_id: str
    row: int
    col: int


@dataclass()
class Proposal:
    """A candidate sota record extracted from a single table cell."""
    cell: Cell
    dataset_values: list
    table_description: str
    model_values: list  # best / paper / competing annotations
    model_params: dict = None
    raw_value: str = ""

    def __post_init__(self):
        # Fill the mutable default here to avoid the shared-dict pitfall.
        if self.model_params is None:
            self.model_params = {}

    @property
    def dataset(self):
        return ' '.join(map(str, self.dataset_values)).strip()

    @property
    def model_name(self):
        return ' '.join(map(str, self.model_values)).strip()

    @property
    def model_type(self):
        types = [v.type for v in self.model_values] + ['']
        if 'model-competing' in types:
            return 'model-competing'  # competing model is different from model-paper and model-best so we return it first
        return types[0]

    def __str__(self):
        return f"{self.model_name}: {self.raw_value} on {self.dataset}"


class MetricValue:
    """A parsed metric value with an optional unit.

    unit is '%' for explicit percentages and None when no unit was given, in
    which case the conversions below have to guess the intended scale.
    """
    value: Decimal
    unit: str = None

    def __init__(self, value, unit):
        self.value = value
        self.unit = unit

    def to_unitless(self):
        return self.value

    def to_absolute(self):
        # Bug fix: the original compared `self.unit is '%'`; identity of string
        # literals is an interning accident (and a SyntaxWarning on CPython 3.8+).
        return self.value / Decimal(100) if self.unit == '%' else self.value

    # unit = None means that no unit was specified, so we have to guess the unit.
    # if there's a value "21" in a table's cell, then we guess if it's 21 or 0.21 (i.e., 21%)
    # based on the target metric properties.
    def to_percentage(self):
        if self.unit is None and 0 < self.value < 1:
            return self.value * 100
        return self.value

    def complement(self):
        """Return the complementary value (e.g. accuracy <-> error)."""
        if self.unit is None:
            # No unit: guess the scale from the magnitude.
            if 1 < self.value < 100:
                value = 100 - self.value
            else:
                value = 1 - self.value
        else:
            value = 100 - self.value
        return MetricValue(value, self.unit)

    def __repr__(self):
        return f"MetricValue({self.value}, {repr(self.unit)})"

    def __str__(self):
        return str(self.value)


def mkquery_ngrams(query):
    """Elasticsearch multi_match body over dataset/metric/task and their ngram subfields."""
    return {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["dataset^3", "dataset.ngrams^1", "metric^1", "metric.ngrams^1", "task^1",
                           "task.ngrams^1"]
            }
        }
    }


def mkquery_fullmatch(query):
    """Elasticsearch multi_match body over the full dataset/metric/task fields only."""
    return {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["dataset^3", "metric^1", "task^1"]
            }
        }
    }
float_pm_re = re.compile(r"(±?)([+-]?\s*(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)\s*(%?)")
whitespace_re = re.compile(r"\s+")


def handle_pm(value):
    """Handle percentage metrics: yield a MetricValue for each number in `value`.

    Numbers prefixed with '±' (uncertainty annotations) are skipped. A trailing
    '%' both marks the unit and scales the number into [0, 1].
    """
    for match in float_pm_re.findall(value):
        if not match[0]:  # group 1 is the '±' prefix — skip error-bar values
            try:
                percent = bool(match[-1])
                number = Decimal(whitespace_re.sub("", match[1])) / (100 if percent else 1)
                yield MetricValue(number, "%" if percent else None)
            except InvalidOperation:
                # Fix: narrowed from a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt. Malformed numbers are ignored.
                pass
    # %%


def convert_metric(raw_value, rng, complementary):
    """Parse `raw_value` and normalize it into the metric's target range.

    rng is '0-1', '1-100', 'abs' or '' (leave unitless); `complementary`
    first converts e.g. accuracy into error.
    """
    fmt = "{x}"  # renamed from `format` to stop shadowing the builtin

    percentage = '%' in raw_value
    if percentage:
        fmt += '%'

    with localcontext() as ctx:
        # Don't raise on malformed decimals during parsing/arithmetic.
        ctx.traps[InvalidOperation] = 0
        parsed = extract_value(raw_value, fmt)
        parsed = MetricValue(parsed, '%' if percentage else None)

        if complementary:
            parsed = parsed.complement()
        if rng == '0-1':
            parsed = parsed.to_percentage() / 100
        elif rng == '1-100':
            parsed = parsed.to_percentage()
        elif rng == 'abs':
            parsed = parsed.to_absolute()
        else:
            parsed = parsed.to_unitless()
    return parsed


proposal_columns = ['dataset', 'metric', 'task', 'format', 'raw_value', 'model', 'model_type', 'cell_ext_id',
                    'confidence', 'parsed', 'struct_model_type', 'struct_dataset']
c].strip())] + return proposals + + +def link_cells_proposals(proposals, desc, taxonomy_linking, + paper_context, abstract_context, table_context, topk=1): + for prop in proposals: + # heuristyic to handle accuracy vs error + format = "{x}" + + percentage = '%' in prop.raw_value + if percentage: + format += '%' + + df = taxonomy_linking(prop.dataset, paper_context, abstract_context, table_context, + desc, topk=topk, debug_info=prop) + for _, row in df.iterrows(): + raw_value = prop.raw_value + task = row['task'] + dataset = row['dataset'] + metric = row['metric'] + + complementary = False + if metric != row['true_metric']: + metric = row['true_metric'] + complementary = True + + # todo: pass taxonomy directly to proposals generation + ranges = taxonomy_linking.taxonomy.metrics_range + key = (task, dataset, metric) + rng = ranges.get(key, '') + if not rng: rng = ranges.get(metric, '') + + parsed = float(convert_metric(raw_value, rng, complementary)) + + linked = { + 'dataset': dataset, + 'metric': metric, + 'task': task, + 'format': format, + 'raw_value': raw_value, + 'model': prop.model_name, + 'model_type': prop.model_type, + 'cell_ext_id': prop.cell.cell_ext_id, + 'confidence': row['confidence'], + 'struct_model_type': prop.model_type, + 'struct_dataset': prop.dataset, + 'parsed': parsed + } + yield linked + + + +def generate_proposals_for_table(table_ext_id, matrix, structure, desc, taxonomy_linking, + paper_context, abstract_context, table_context, topk=1): + + # def empty_proposal(cell_ext_id, reason): + # np = "not-present" + # return dict( + # dataset=np, metric=np, task=np, format=np, raw_value=np, model=np, + # model_type=np, cell_ext_id=cell_ext_id, confidence=-1, debug_reason=reason + # ) + + + + proposals = generate_cells_proposals(table_ext_id, matrix, structure, desc) + proposals = link_cells_proposals(proposals, desc, taxonomy_linking, paper_context, abstract_context, + table_context, topk=topk) + + # specify columns in case there's no proposal + 
proposals = pd.DataFrame.from_records(list(proposals), columns=proposal_columns) + return proposals + + +def linked_proposals(paper_ext_id, paper, annotated_tables, taxonomy_linking=None, + dataset_extractor=None, topk=1): + # dataset_extractor=DatasetExtractor()): + proposals = [] + paper_context, abstract_context = dataset_extractor.from_paper(paper) + table_contexts = dataset_extractor.get_table_contexts(paper, annotated_tables) + #print(f"Extracted datasets: {datasets}") + for idx, (table, table_context) in enumerate(zip(annotated_tables, table_contexts)): + matrix = np.array(table.matrix) + structure = np.array(table.matrix_tags) + tags = 'sota' + desc = table.caption + table_ext_id = f"{paper_ext_id}/{table.name}" + + if 'sota' in tags and 'no_sota_records' not in tags: # only parse tables that are marked as sota + proposals.append( + generate_proposals_for_table( + table_ext_id, matrix, structure, desc, taxonomy_linking, + paper_context, abstract_context, table_context, + topk=topk + ) + ) + if len(proposals): + return pd.concat(proposals) + return pd.DataFrame(columns=proposal_columns) + + +def test_link_taxonomy(): + link_taxonomy_raw = MatchSearch() + results = link_taxonomy_raw.search(link_taxonomy_raw.preproc("miniImageNet 5-way 1-shot")) + # assert "Mini-ImageNet - 1-Shot Learning" == results["hits"][0]["_source"]["dataset"], results + results = link_taxonomy_raw.search(link_taxonomy_raw.preproc("CoNLL2003")) + assert "CoNLL 2003 (English)" == results["hits"][0]["_source"]["dataset"], results + results = link_taxonomy_raw.search(link_taxonomy_raw.preproc("AGNews")) + assert "AG News" == results["hits"][0]["_source"]["dataset"], results + link_taxonomy_raw("miniImageNet 5-way 1-shot") + # %% + split_re = re.compile('([^a-zA-Z0-9])') + + # %% + q = "miniImageNet 5-way 1-shot Mini ImageNet 1-Shot Learning" * 1 + r = link_taxonomy_raw.search(q) + f = len(split_re.split(q)) + r['hits'][0]['_score'] / f, r['hits'][1]['_score'] / f, r['hits'][0]['_source'] + 
# %% + q = "Mini ImageNet 1-Shot Learning" * 1 + r = link_taxonomy_raw.search(q) + f = len(split_re.split(q)) + r['hits'][0]['_score'] / f, r['hits'][1]['_score'] / f, r['hits'][0]['_source'] + # %% + q = "Mini ImageNet 1-Shot" * 1 + r = link_taxonomy_raw.search(q) + f = len(split_re.split(q)) + r['hits'][0]['_score'] / f, r['hits'][1]['_score'] / f, r['hits'][0]['_source'] + # + # # %% + # prop = proposals[1] + # print(prop) + # # todo issue with STS-B matching IJB-B + # link_taxonomy_raw(prop.dataset) + + diff --git a/axcell/models/linking/context_search.py b/axcell/models/linking/context_search.py new file mode 100644 index 0000000..2ff1963 --- /dev/null +++ b/axcell/models/linking/context_search.py @@ -0,0 +1,481 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +# metrics[taxonomy name] is a list of normalized evidences for taxonomy name +from collections import Counter, OrderedDict + +from axcell.models.linking.acronym_extractor import AcronymExtractor +from axcell.models.linking.probs import get_probs, reverse_probs +from axcell.models.linking.utils import normalize_dataset, normalize_dataset_ws, normalize_cell, normalize_cell_ws +from scipy.special import softmax +import re +import pandas as pd +import numpy as np +import json +import ahocorasick +from numba import njit, typed, types +from pathlib import Path + +from axcell.pipeline_logger import pipeline_logger + +from axcell.models.linking import manual_dicts + + +def dummy_item(reason): + return pd.DataFrame(dict(dataset=[reason], task=[reason], metric=[reason], evidence=[""], confidence=[0.0])) + + +class EvidenceFinder: + single_letter_re = re.compile(r"\b\w\b") + init_letter_re = re.compile(r"\b\w") + end_letter_re = re.compile(r"\w\b") + letter_re = re.compile(r"\w") + + def __init__(self, taxonomy, abbreviations_path=None, use_manual_dicts=False): + self.abbreviations_path = abbreviations_path + self.use_manual_dicts = use_manual_dicts + self._init_structs(taxonomy) + + 
@staticmethod + def evidences_from_name(key): + x = normalize_dataset_ws(key) + y = [w for w in x.split() if w not in manual_dicts.stop_words] + return [x] + y if len(y) > 1 else [x] + + @staticmethod + def get_basic_dicts(taxonomy): + tasks = {ts: [normalize_dataset_ws(ts)] for ts in taxonomy.tasks} + datasets = {ds: EvidenceFinder.evidences_from_name(ds) for ds in taxonomy.datasets} + metrics = {ms: EvidenceFinder.evidences_from_name(ms) for ms in taxonomy.metrics} + return tasks, datasets, metrics + + @staticmethod + def merge_evidences(target, source): + for name, evs in source.items(): + target.setdefault(name, []).extend(evs) + + @staticmethod + def make_trie(names): + trie = ahocorasick.Automaton() + for name in names: + norm = name.replace(" ", "") + trie.add_word(norm, (len(norm), name)) + trie.make_automaton() + return trie + + @staticmethod + def get_auto_evidences(name, abbreviations, abbrvs_trie): + frags = EvidenceFinder.find_names(normalize_dataset_ws(name), abbrvs_trie) + evidences = [] + for f in frags: + evidences.extend(abbreviations[f]) + return list(set(evidences)) + + @staticmethod + def find_names(text, names_trie): + text = text.lower() + profile = EvidenceFinder.letter_re.sub("i", text) + profile = EvidenceFinder.init_letter_re.sub("b", profile) + profile = EvidenceFinder.end_letter_re.sub("e", profile) + profile = EvidenceFinder.single_letter_re.sub("x", profile) + text = text.replace(" ", "") + profile = profile.replace(" ", "") + s = Counter() + for (end, (l, word)) in names_trie.iter(text): + if profile[end] in ['e', 'x'] and profile[end - l + 1] in ['b', 'x']: + s[word] += 1 + return s + + def find_datasets(self, text): + return EvidenceFinder.find_names(text, self.all_datasets_trie) + + def find_metrics(self, text): + return EvidenceFinder.find_names(text, self.all_metrics_trie) + + def find_tasks(self, text): + return EvidenceFinder.find_names(text, self.all_tasks_trie) + + def init_evidence_dicts(self, taxonomy): + self.tasks, 
self.datasets, self.metrics = EvidenceFinder.get_basic_dicts(taxonomy) + + if self.use_manual_dicts: + EvidenceFinder.merge_evidences(self.tasks, manual_dicts.tasks) + EvidenceFinder.merge_evidences(self.datasets, manual_dicts.datasets) + EvidenceFinder.merge_evidences(self.metrics, manual_dicts.metrics) + + if self.abbreviations_path is not None: + with Path(self.abbreviations_path).open('rt') as f: + abbreviations = json.load(f) + abbrvs_trie = EvidenceFinder.make_trie(list(abbreviations.keys())) + + ds_auto = {x: EvidenceFinder.get_auto_evidences(x, abbreviations, abbrvs_trie) for x in taxonomy.datasets} + ms_auto = {x: EvidenceFinder.get_auto_evidences(x, abbreviations, abbrvs_trie) for x in taxonomy.metrics} + + EvidenceFinder.merge_evidences(self.datasets, ds_auto) + EvidenceFinder.merge_evidences(self.metrics, ms_auto) + + self.datasets = {k: (v + ['test'] if 'val' not in k else v + ['validation', 'dev', 'development']) for k, v in + self.datasets.items()} + if self.use_manual_dicts: + self.datasets.update({ + 'LibriSpeech dev-clean': ['libri speech dev clean', 'libri speech', 'dev', 'clean', 'dev clean', 'development'], + 'LibriSpeech dev-other': ['libri speech dev other', 'libri speech', 'dev', 'other', 'dev other', 'development', 'noisy'], + }) + + def _init_structs(self, taxonomy): + self.init_evidence_dicts(taxonomy) + + self.datasets = {k: set(v) for k, v in self.datasets.items()} + self.metrics = {k: set(v) for k, v in self.metrics.items()} + self.tasks = {k: set(v) for k, v in self.tasks.items()} + + self.all_datasets = set(normalize_cell_ws(normalize_dataset(y)) for x in self.datasets.values() for y in x) + self.all_metrics = set(normalize_cell_ws(y) for x in self.metrics.values() for y in x) + self.all_tasks = set(normalize_cell_ws(normalize_dataset(y)) for x in self.tasks.values() for y in x) + + self.all_datasets_trie = EvidenceFinder.make_trie(self.all_datasets) + self.all_metrics_trie = EvidenceFinder.make_trie(self.all_metrics) + 
self.all_tasks_trie = EvidenceFinder.make_trie(self.all_tasks) + + +@njit +def axis_logprobs(evidences_for, reverse_probs, found_evidences, noise, pb, max_repetitions): + logprob = 0.0 + empty = typed.Dict.empty(types.unicode_type, types.float64) + short_probs = reverse_probs.get(evidences_for, empty) + for evidence, count in found_evidences.items(): + logprob += min(count, max_repetitions) * np.log(noise * pb + (1 - noise) * short_probs.get(evidence, 0.0)) + return logprob + + +# compute log-probabilities in a given context and add them to logprobs +@njit +def compute_logprobs(taxonomy, tasks, datasets, metrics, + reverse_merged_p, reverse_metrics_p, reverse_task_p, + dss, mss, tss, noise, ms_noise, ts_noise, ds_pb, ms_pb, ts_pb, + max_repetitions): + task_cache = typed.Dict.empty(types.unicode_type, types.float64) + dataset_cache = typed.Dict.empty(types.unicode_type, types.float64) + metric_cache = typed.Dict.empty(types.unicode_type, types.float64) + logprobs = np.zeros(len(taxonomy)) + axes_logprobs = ( + np.zeros(len(tasks)), + np.zeros(len(datasets)), + np.zeros(len(metrics)) + ) + for i, (task, dataset, metric) in enumerate(taxonomy): + if dataset not in dataset_cache: + dataset_cache[dataset] = axis_logprobs(dataset, reverse_merged_p, dss, noise, ds_pb, 1) + if metric not in metric_cache: + metric_cache[metric] = axis_logprobs(metric, reverse_metrics_p, mss, ms_noise, ms_pb, 1) + if task not in task_cache: + task_cache[task] = axis_logprobs(task, reverse_task_p, tss, ts_noise, ts_pb, max_repetitions) + + logprobs[i] += dataset_cache[dataset] + metric_cache[metric] + task_cache[task] + for i, task in enumerate(tasks): + axes_logprobs[0][i] += task_cache[task] + + for i, dataset in enumerate(datasets): + axes_logprobs[1][i] += dataset_cache[dataset] + + for i, metric in enumerate(metrics): + axes_logprobs[2][i] += metric_cache[metric] + return logprobs, axes_logprobs + + +def _to_typed_list(iterable): + l = typed.List() + for i in iterable: + l.append(i) + 
class LRUCache:
    """A fixed-capacity mapping that evicts the least-recently-used entry.

    Both reads and writes count as a "use"; once the size exceeds
    `capacity` the stalest entry is discarded.
    """

    def __init__(self, capacity):
        self.cache = OrderedDict()
        self.capacity = capacity

    def __getitem__(self, key):
        # Touch first so the entry becomes most-recently-used
        # (raises KeyError for missing keys, like a plain dict).
        self.cache.move_to_end(key)
        return self.cache[key]

    def __setitem__(self, key, value):
        self.cache[key] = value
        self.cache.move_to_end(key)
        while len(self.cache) > self.capacity:
            # last=False pops from the LRU end of the OrderedDict.
            self.cache.popitem(last=False)

    def __contains__(self, item):
        return item in self.cache

    def __repr__(self):
        return f"LRUCache(capacity={self.capacity}, {repr(dict(self.cache))})"
self._numba_update_nested_dict(reverse_probs(merged_p)) + self.reverse_metrics_p = self._numba_update_nested_dict(reverse_probs(metrics_p)) + self.reverse_tasks_p = self._numba_update_nested_dict(reverse_probs(tasks_p)) + self.debug_gold_df = debug_gold_df + self.max_repetitions = 3 + self.include_independent = include_independent + + def _numba_update_nested_dict(self, nested): + d = typed.Dict() + for key, dct in nested.items(): + d2 = typed.Dict() + d2.update(dct) + d[key] = d2 + return d + + def _numba_extend_list(self, lst): + l = typed.List.empty_list((types.unicode_type, types.int32)) + for x in lst: + l.append(x) + return l + + def _numba_extend_dict(self, dct): + d = typed.Dict.empty(types.unicode_type, types.int64) + d.update(dct) + return d + + def _hash_counter(self, d): + items = list(d.items()) + items = sorted(items) + return ";".join([x[0]+":"+str(x[1]) for x in items]) + + def compute_context_logprobs(self, context, noise, ms_noise, ts_noise, logprobs, axes_logprobs): + if isinstance(context, str) or context is None: + context = context or "" + #abbrvs = self.extract_acronyms(context) + context = normalize_cell_ws(normalize_dataset_ws(context)) + #dss = set(self.evidence_finder.find_datasets(context)) | set(abbrvs.keys()) + dss = self.evidence_finder.find_datasets(context) + mss = self.evidence_finder.find_metrics(context) + tss = self.evidence_finder.find_tasks(context) + + dss -= mss + dss -= tss + else: + tss, dss, mss = context + + dss = {normalize_cell(ds): count for ds, count in dss.items()} + mss = {normalize_cell(ms): count for ms, count in mss.items()} + tss = {normalize_cell(ts): count for ts, count in tss.items()} + ###print("dss", dss) + ###print("mss", mss) + dss = self._numba_extend_dict(dss) + mss = self._numba_extend_dict(mss) + tss = self._numba_extend_dict(tss) + + key = (self._hash_counter(tss), self._hash_counter(dss), self._hash_counter(mss), noise, ms_noise, ts_noise) + if key not in self.logprobs_cache: + lp, alp = 
compute_logprobs(self._taxonomy, self._taxonomy_tasks, self._taxonomy_datasets, self._taxonomy_metrics, + self.reverse_merged_p, self.reverse_metrics_p, self.reverse_tasks_p, + dss, mss, tss, noise, ms_noise, ts_noise, self.ds_pb, self.ms_pb, self.ts_pb, + self.max_repetitions) + self.logprobs_cache[key] = (lp, alp) + else: + lp, alp = self.logprobs_cache[key] + logprobs += lp + axes_logprobs[0] += alp[0] + axes_logprobs[1] += alp[1] + axes_logprobs[2] += alp[2] + + def match(self, contexts): + assert len(contexts) == len(self.context_noise) + n = len(self._taxonomy) + context_logprobs = np.zeros(n) + axes_context_logprobs = _to_typed_list([ + np.zeros(len(self._taxonomy_tasks)), + np.zeros(len(self._taxonomy_datasets)), + np.zeros(len(self._taxonomy_metrics)), + ]) + + for context, noise, ms_noise, ts_noise in zip(contexts, self.context_noise, self.metrics_noise, self.task_noise): + self.compute_context_logprobs(context, noise, ms_noise, ts_noise, context_logprobs, axes_context_logprobs) + keys = self.taxonomy.taxonomy + logprobs = context_logprobs + #keys, logprobs = zip(*context_logprobs.items()) + probs = softmax(np.array(logprobs)) + axes_probs = [softmax(np.array(a)) for a in axes_context_logprobs] + return ( + zip(keys, probs), + zip(self._taxonomy_tasks, axes_probs[0]), + zip(self._taxonomy_datasets, axes_probs[1]), + zip(self._taxonomy_metrics, axes_probs[2]) + ) + + def __call__(self, query, paper_context, abstract_context, table_context, caption, topk=1, debug_info=None): + cellstr = debug_info.cell.cell_ext_id + pipeline_logger("linking::taxonomy_linking::call", ext_id=cellstr, query=query, + paper_context=paper_context, abstract_context=abstract_context, table_context=table_context, + caption=caption) + + paper_hash = ";".join(",".join(sorted(s.elements())) for s in paper_context) + abstract_hash = ";".join(",".join(sorted(s.elements())) for s in abstract_context) + mentions_hash = ";".join(",".join(sorted(s.elements())) for s in table_context) + key = 
(paper_hash, abstract_hash, mentions_hash, caption, query, topk) + ###print(f"[DEBUG] {cellstr}") + ###print("[DEBUG]", debug_info) + ###print("query:", query, caption) + if key in self.queries: + # print(self.queries[key]) + # for context in key: + # abbrvs = self.extract_acronyms(context) + # context = normalize_cell_ws(normalize_dataset(context)) + # dss = set(find_datasets(context)) | set(abbrvs.keys()) + # mss = set(find_metrics(context)) + # dss -= mss + ###print("dss", dss) + ###print("mss", mss) + + ###print("Taking result from cache") + p = self.queries[key] + else: + dists = self.match((paper_context, abstract_context, table_context, caption, query)) + + all_top_results = [sorted(list(dist), key=lambda x: x[1], reverse=True)[:max(topk, 5)] for dist in dists] + top_results, top_results_t, top_results_d, top_results_m = all_top_results + + entries = [] + for it, prob in top_results: + task, dataset, metric = it + entry = dict(task=task, dataset=dataset, metric=metric) + entry.update({"evidence": "", "confidence": prob}) + entries.append(entry) + + if self.include_independent: + best_independent = dict( + task=top_results_t[0][0], + dataset=top_results_d[0][0], + metric=top_results_m[0][0]) + best_independent.update({ + "evidence": "", + "confidence": 0.79 + }) + entries.append(best_independent) + + # entries = [] + # for i in range(5): + # best_independent = dict( + # task=top_results_t[i][0], + # dataset=top_results_d[i][0], + # metric=top_results_m[i][0]) + # best_independent.update({ + # "evidence": "", + # "confidence": np.power(top_results_t[i][1] * top_results_d[i][1] * top_results_m[i][1], 1.0/3.0) + # }) + # entries.append(best_independent) + #entries = [best_independent] + entries + + # best, best_p = sorted(dist, key=lambda x: x[1], reverse=True)[0] + # entry = et[best] + # p = pd.DataFrame({k:[v] for k, v in entry.items()}) + # p["evidence"] = "" + # p["confidence"] = best_p + p = pd.DataFrame(entries).sort_values("confidence", ascending=False) + 
+ self.queries[key] = p + + ###print(p) + + # error analysis only + if self.debug_gold_df is not None: + if cellstr in self.debug_gold_df.index: + gold_record = self.debug_gold_df.loc[cellstr] + if p.iloc[0].dataset == gold_record.dataset: + print("[EA] Matching gold sota record (dataset)") + else: + print( + f"[EA] Proposal dataset ({p.iloc[0].dataset}) and gold dataset ({gold_record.dataset}) mismatch") + else: + print("[EA] No gold sota record found for the cell") + # end of error analysis only + pipeline_logger("linking::taxonomy_linking::topk", ext_id=cellstr, topk=p.head(5)) + + q = p.head(topk).copy() + q["true_metric"] = q.apply(lambda row: self.taxonomy.normalize_metric(row.task, row.dataset, row.metric), axis=1) + return q + + +# todo: compare regex approach (old) with find_datasets(.) (current) +# todo: rename it +class DatasetExtractor: + def __init__(self, evidence_finder): + self.evidence_finder = evidence_finder + self.dataset_prefix_re = re.compile(r"[A-Z]|[a-z]+[A-Z]+|[0-9]") + self.dataset_name_re = re.compile(r"\b(the)\b\s*(?P((?!(the)\b)\w+\W+){1,10}?)(test|val(\.|idation)?|dev(\.|elopment)?|train(\.|ing)?\s+)?\bdata\s*set\b", re.IGNORECASE) + + def find_references(self, text, references): + refs = r"\bxxref-(" + "|".join([re.escape(ref) for ref in references]) + r")\b" + return set(re.findall(refs, text)) + + def get_table_contexts(self, paper, tables): + ref_tables = [table for table in tables if table.figure_id and table.figure_id.replace(".", "")] + refs = [table.figure_id.replace(".", "") for table in ref_tables] + if not refs: + return [[Counter(), Counter(), Counter()] for table in tables] + ref_contexts = {ref: [Counter(), Counter(), Counter()] for ref in refs} + if hasattr(paper.text, "fragments"): + for fragment in paper.text.fragments: + found_refs = self.find_references(fragment.text, refs) + if found_refs: + ts, ds, ms = self(fragment.header + "\n" + fragment.text) + for ref in found_refs: + ref_contexts[ref][0] += ts + 
ref_contexts[ref][1] += ds + ref_contexts[ref][2] += ms + table_contexts = [ + ref_contexts.get( + table.figure_id.replace(".", ""), + [Counter(), Counter(), Counter()] + ) if table.figure_id else [Counter(), Counter(), Counter()] + for table in tables + ] + return table_contexts + + def from_paper(self, paper): + abstract = paper.text.abstract + text = "" + if hasattr(paper.text, "fragments"): + text += " ".join(f.text for f in paper.text.fragments) + return self(text), self(abstract) + + def __call__(self, text): + text = normalize_cell_ws(normalize_dataset_ws(text)) + ds = self.evidence_finder.find_datasets(text) + ts = self.evidence_finder.find_tasks(text) + ms = self.evidence_finder.find_metrics(text) + ds -= ts + ds -= ms + return ts, ds, ms diff --git a/axcell/models/linking/execution.py b/axcell/models/linking/execution.py new file mode 100644 index 0000000..ceac4a5 --- /dev/null +++ b/axcell/models/linking/execution.py @@ -0,0 +1,97 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import pandas as pd
from django.db import connection
from IPython.core.display import display

from axcell.models.linking.metrics import Metrics
from axcell.models.linking.format import extract_value


def q(query, limit=10, index_col=None):
    """Run *query* against the default django connection, return a DataFrame.

    ``limit=None`` runs the query unmodified; otherwise trailing spaces and
    semicolons are stripped so the appended ``LIMIT`` clause parses.
    """
    if limit is not None:
        query = query.rstrip(" ;") + f" LIMIT {limit}"
    return pd.read_sql(query, connection, index_col=index_col)

def execute_model_on_papers(model, papers):
    """Run the linking *model* on every paper and concatenate the per-paper
    proposals into one DataFrame indexed by ``cell_ext_id``."""
    proposals = []
    for paper in papers:
        print("Parsing ", paper.paper_id)
        # model is expected to be callable as model(paper_id, paper, tables)
        paper_proposals = model(paper.paper_id, paper, paper.tables)
        proposals.append(paper_proposals)
    proposals = pd.concat(proposals)
    proposals["experiment_name"] = model.__name__
    return proposals.set_index('cell_ext_id')


def fetch_gold_sota_records():
    """Fetch manually annotated (gold) sota records from the database.

    Records whose raw cell value cannot be parsed with their declared format
    (``parsed`` is NaN) are printed for inspection and then dropped.
    """
    gold_sota_records = q("""
    SELECT sc.id as cell_id,
     st.paper_id,
     CONCAT(st.paper_id, '/', st.name, '/', sr.row,'.', sr.col) as cell_ext_id,
     (SELECT gold_tags FROM sota_cell WHERE (row=sc.row or col=sc.col) and table_id=sc.table_id and gold_tags LIKE 'model%' LIMIT 1) as model_type,
     task, dataset, metric, model, format, sc.value as raw_value
    FROM
     sota_record sr
     JOIN sota_cell sc USING (table_id, row, col)
     JOIN sota_table st ON (sc.table_id=st.id)
    WHERE parser = 'latexml' and dataset != '' and task != '' and metric != '' and model != '';""", limit=None)
    gold_sota_records["parsed"] = gold_sota_records[["raw_value", "format"]].apply(
        lambda row: float(extract_value(row.raw_value, row.format)), axis=1)

    # NaN != NaN, so this selects exactly the rows extract_value failed on.
    unparsed = gold_sota_records[gold_sota_records["parsed"] != gold_sota_records["parsed"]]
    if len(unparsed):
        print("Found unparsed values")
        # NOTE(review): the formatter lambda reduces to the identity f-string;
        # it looks like an HTML anchor template was lost in extraction — confirm
        # against version control.
        display(unparsed.style.format({'cell_ext_id':
            lambda x: f'{x}'})
        )

    # Keep only the successfully parsed rows (parsed == parsed filters out NaN).
    gold_sota_records = gold_sota_records[gold_sota_records["parsed"] == gold_sota_records["parsed"]]

    strip_cols=["task", "dataset", "format", "metric", "raw_value", "model", "model_type"]
    gold_sota_records = gold_sota_records.transform(
        lambda x: x.str.strip() if x.name in strip_cols else x)
    gold_sota_records = gold_sota_records.set_index('cell_ext_id')
    return gold_sota_records

def fetch_gold_sota_papers():
    """Return the ids of all papers having at least one gold sota record."""
    return q("""
    SELECT st.paper_id
    FROM
     sota_record sr
     JOIN sota_cell sc USING (table_id, row, col)
     JOIN sota_table st ON (sc.table_id=st.id)
    WHERE parser = 'latexml' and dataset != '' and task != '' and metric != '' and model != ''
    GROUP BY st.paper_id;""", limit=None)["paper_id"].tolist()

class Evaluator():
    """Runs a linking model on the gold-annotated papers and scores it."""

    def __init__(self, model, paper_collection):
        self.model = model
        self.pc = paper_collection
        self.annotated_papers = fetch_gold_sota_papers()  # hits the database
        self.raw_proposals = None  # lazily filled by run_model()

    def run_model(self):
        """Run the model on the papers that actually have gold annotations."""
        papers = [paper for paper in self.pc if paper.paper_id in self.annotated_papers]
        self.raw_proposals = execute_model_on_papers(model=self.model, papers=papers)

    def evaluate(self, proposals_filter, track_proposals=False):
        """Score filtered proposals against the gold records.

        Returns a Metrics object; when ``track_proposals`` is True, also
        returns a deep copy of the unfiltered proposals.

        NOTE(review): ``proposals_filter`` is invoked with two arguments here,
        while ``ProposalsFilter.__call__`` elsewhere takes one — confirm the
        expected filter interface.
        """
        if self.raw_proposals is None:
            self.run_model()
        if track_proposals:
            all_proposals = self.raw_proposals.copy(deep=True)
        else:
            all_proposals = None
        proposals = proposals_filter(self.raw_proposals, all_proposals)
        gold_sota_records = fetch_gold_sota_records()
        # Outer merge keeps cells present on only one side; those become
        # 'not-present' after fillna and count as errors in Metrics.
        df = gold_sota_records.merge(proposals, 'outer', left_index=True, right_index=True, suffixes=['_gold', '_pred'])
        df = df.reindex(sorted(df.columns), axis=1)
        df = df.fillna('not-present')
        if "experiment_name" in df.columns:
            del df["experiment_name"]

        metrics = Metrics(df, experiment_name=self.model.__name__)
        if track_proposals:
            return metrics, all_proposals
        else:
            return metrics
--- /dev/null +++ b/axcell/models/linking/extractors.py @@ -0,0 +1,45 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import re + +dataset_name_re = re.compile(r"\b(the)\b\s*(?P((?!(the)\b)\w+\W+){1,10}?)(test|val(\.|idation)?|dev(\.|elopment)?|train(\.|ing)?\s+)?\bdata\s*set\b", re.IGNORECASE) + +parens_re = re.compile(r"\([^)]*?\)|\[[^]]*?\]") +def remove_parens(text): + return parens_re.sub("", text) + +def clean_name(name): + return remove_parens(name.strip()).strip() + +year_2k_re = re.compile(r"20(\d\d)") +hyphens_re = re.compile(r"[-_'`–’→]") +ws_re = re.compile(r"\s+") +dataset_prefix_re = re.compile(r"[A-Z]|[a-z]+[A-Z]+|[0-9]") + +def normalize_dataset(name): + name = hyphens_re.sub(" ", name) + name = year_2k_re.sub(r"\1", name) + name = ws_re.sub(" ", name) + return name.strip().lower() + +## temporarily moved to notebook +# class DatasetExtractor: +# def from_paper(self, paper): +# text = paper.text.abstract +# if hasattr(paper.text, "fragments"): +# text += " ".join(f.text for f in paper.text.fragments) +# return self(text) +# +# def __call__(self, text): +# extracted = [clean_name(m.group("name")) for m in dataset_name_re.finditer(text)] +# print("Extracted:", extracted) +# cleaned = [x for x in extracted if dataset_prefix_re.match(x)] +# print("Cleaned:", cleaned) +# return cleaned +# filtered = list(set([x for x in cleaned if normalize_dataset(x) in normalized_datasets])) +# print("Filtered:", filtered) +# return filtered + + +datasets = ['VOT2016', 'Penn Treebank', 'DIV2K', 'SCUT-FBP5500', 'SCUT-FBP', 'ImageNet', 'KITTI', 'Cityscapes', 'Street View House Number', 'MNIST', '1000-class ImageNet', 'CIFAR-10', 'Berkeley Segmentation', 'AFLW', 'BIWI', '300W-LP', 'AFLW2000', 'AFW', 'Stanford Question Answering', 'SQuAD', '80 million tiny images', 'PASCAL VOC 2012', 'ILSVRC-2012 ImageNet', 'CIFAR-100', 'NewsQA', 'COCO', 'Market-1501', 'LSUN', 'Matterport3D', 'Market1501', 'bAbI', 'WikiHop', 'MICC', 'Wild', 'Yelp', 'SNLI', 
'MultiNLI', 'Age', 'Yahoo', 'OMNIGLOT', 'DSTC2', 'Cars', 'CBT', 'CNN', 'Daily Mail', 'Jester', 'Adult', 'LSUN bedroom', 'CUB', 'Caltech-UCSD Birds-200-2011', 'Street View House Numbers', 'TREC QA', 'Realtor360', 'PanoContext', 'Stanford 2D-3D', 'Camelyon16', 'COCO-Stuff', 'Flickr Landscapes', 'ADE20K', 'MSRA', 'OntoNotes', 'Visual Question Answering', 'VQA', 'VQA v2.0', 'Indian Pines', 'Pavia University', 'MR', 'PASCAL3D+', 'PASCAL VOC 2007', 'VOC 2007', 'LSP', 'VIPeR', 'PASCAL VOC', 'ImageNet detection', 'MS-COCO', 'Caltech-UCSD Birds', 'MPII Human Pose', 'CoNLL 2003 NER', 'FCE', 'Cora', 'Wikipedia', 'Switchboard', '1B word', 'SVHN', 'Caltech pedestrian', 'Set5', 'Urban100', 'AVA', 'Charades', 'MMI', 'Extended Cohn-Kanade', 'CKP', 'ICDAR 2015', 'SwDA', 'MRDA', 'ModelNet', 'PASCAL 3D', 'ShapeNet', 'TriviaQA', 'Facescrub', 'NYUV2', 'ShapeNet part', 'WSJ', 'CoNLL03 NER', 'NER', 'CoNLL03', 'LibriSpeech', '300W', 'WN18', 'ILSVRC 2012 classification', 'Penn Tree Bank', 'Cifar-10', 'SQuAD 2.0', 'PTB', 'DukeMTMC-reID', 'CUHK03', 'SearchQA', 'Stanford Natural Language Inference', 'NYU', 'ICVL', 'NYU hand pose', 'WN18RR', 'CoNLL-2005 shared task', 'CoNLL-2012 shared task', 'CoNLL-2005', 'CoNLL-2012', 'ImageNet 2012', '300-W', 'AFLW2000-3D', 'LFW', 'Omniglot', 'PROMISE 2012', 'Twitter', 'Florence', 'SUN-RGBD', 'Microsoft COCO', 'ImageNet classification', 'Something-Something', 'MRC', 'MS MARCO', 'Amazon', 'Alibaba', 'Netflix', 'PASCAL-Person-Part', 'CIHP', 'Pascal VOC', 'MS-Celeb-1M', 'CASIA', 'MegaFace', 'IJB-B', 'ImageNet-1k', 'Places365-Standard', 'SciTail', 'GTSRB', 'GRID', 'BSD', 'LIVE1', 'CNN/Daily Mail', 'Caltech', 'MS COCO', 'Restaurant', 'JSB Chorales', 'CUHK', 'CUFSF', 'JFT-300M', 'CelebA', 'RaFD', 'Amazon Reviews', 'Amazon reviews', 'SemEval', 'Tobacco-3482', 'RVL-CDIP', 'Douban', 'Company\xe2\x88\x97', 'Criteo', 'Semantic Boundaries', 'Caltech-UCSD birds', 'IMDb', 'VGG-Face', 'MoFA', 'FERET', 'iNat2017', 'ScanNet', 'TIMIT', 'VOC 2012', 'SICK', 'IJB-A', 'CACD', 
'MSCeleb', 'YTF', 'CACD-VS', 'CityScapes', 'COCO detection', 'Bosch', 'LISA', 'Tsinghua-Tencent', 'FDDB', 'Mikolajczyk', 'Middlebury', 'Kitti', 'ILSVRC2012', 'BSD100', 'LineMod', 'Occlusion', 'GTAV', 'CityPersons', 'ETH', 'INRIA', 'ILSVRC CLS-LOC', 'Caltech-USA', 'BlogCatalog', 'CoNLL', 'MPII', 'Cityscapes', 'Cityscapes', 'CamVid', 'Amazon Review', 'STL-10', 'Imagenet', 'ShapeNet-Part', 'ModelNet40', 'BUS 2017', 'Quora Question Pairs', 'SST', 'MARS', 'PRW', 'BSD68', 'IMDB', 'ASPEC', 'OTB-2015', 'VOT-2017 public', 'Tejani', 'LineMOD', 'CASIA WebFace', 'Flying Chairs', 'FLIC', 'Set14 \xc3\x974', 'Human3.6M', 'Google News', 'Jobs', 'WikiText-2', 'Rotten Tomatoes', 'RCV1', 'WIDER FACE val', 'WIDER FACE', 'COCO', 'PoseTrack', 'HPatches', 'MHP v2.0', 'Buffy', 'ShapeNetCore', 'EVAL', 'MAFA', 'iPinYou', 'CASIA-WebFace', 'JANUS CS2', 'Cross-City', 'GTA5', 'SYNTHIA', 'MovieLens-100k', 'MovieLens-1M', 'LAMBADA', 'bAbi', 'Visual Genome', 'Visual-7W', 'Google-Ref', 'CelebA-HQ', 'PASCAL', 'QASent', 'WikiQA', 'Online Products', 'FB15k-237', 'MovieLens 1M', 'REST', 'Yosemite', 'PASCAL faces', 'MusicNet', 'Multi-MNIST', 'CLEVR', 'Quora', 'Who Did What', 'Children\xe2\x80\x99s Book', 'Set14', 'CFP', 'CTW1500', 'Weizmann Horse', 'ReVerb45K', 'AG\xe2\x80\x99s News', 'WMT En\xe2\x86\x92Fr', 'WMT En\xe2\x86\x92De', 'CNN/DailyMail', 'NYT', 'ECCV HotOrNot', 'bAbI story-based QA', 'PPI', 'Mini-ImageNet', 'ITOP', 'YCB-Video', 'DFW', 'ACL-ARC', 'SciCite', 'HumanEva', 'LINEMOD', 'Occlusion LINEMOD', 'Face Detection', 'UP-3D', 'WT2', 'PASCAL-Context', 'TREC', 'WDW', 'Shoulder-Pain', 'MovieLens', 'CT-150', 'WMT', 'CMU-MOSI', 'IEMOCAP', 'MPII Multi-Person Pose', '91-image', 'CoNLL 2003', 'COCO keypoint detection', 'WiderFace', 'Extended Yale B', 'Hutter Prize', 'SST-1', 'CUB-200-2011', 'Cars196', 'Stanford Online Products', 'Caltech and KITTI', 'BRATS', 'E2E', 'TV', 'Laptop', 'CIFAR', 'CHALL_H80K', 'VQA v2', 'NYU depth', 'NYUD', 'Cityscape', 'IBUG', 'BP4D', 'CAF', 'LexNorm2015', 'YouTube Face', 
'DAQUAR', 'NYUDv2', 'SmallTobacco', 'BigTobacco', 'TID2013', 'CK+', 'PubMed 20k', 'WAF', 'MPII Multi-Person', 'GTA', 'PCSO mugshot', 'CIFAR100', 'ImageNet', 'MHP', 'CompCars', 'CUB200-2011 bird', 'CUHK03 labeled', 'Stanford 2D-3D annotation', 'Reddit', 'Stanford SQuAD', 'Graph Reachability', 'AIDA-B', 'VGG face', 'Yahoo! Answer', 'AR', 'Caltech Pedestrian', 'CARS-196', 'Pascal Context', 'Scan2CAD', 'Tiny Images', 'CAT', 'CIFAR10', 'JFT', 'PA-100K', 'VOC2007', 'Wikihop', 'PASCAL face', 'MPQA', 'NELL995', 'NELL-995', 'ShanghaiTech', 'SARC', 'Pol', 'CUHK03 detected', 'Celeb-Seq', 'ICDAR2015 Incidental Scene Text', 'Stanford Sentiment Treebank', 'CoQA', 'Massachusetts roads', 'MPIIGaze', 'SBD', 'InsuranceQA', 'ETHZ', 'Landmarks', 'H36M', 'OccludedLINEMOD', 'UCF101', 'RGBD', 'USPS', 'Visual QA', 'COCO-QA', 'Vid4', 'DAVIS-10'] +normalized_datasets = [normalize_dataset(ds) for ds in datasets] diff --git a/axcell/models/linking/format.py b/axcell/models/linking/format.py new file mode 100644 index 0000000..f390457 --- /dev/null +++ b/axcell/models/linking/format.py @@ -0,0 +1,40 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import re
from decimal import Decimal, ROUND_DOWN, ROUND_HALF_UP, InvalidOperation

# A float literal (optional sign, optional thousands separators, optional
# exponent) in a capturing and a non-capturing variant.
float_value_re = re.compile(r"([+-]?(?:(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)")
float_value_nc = re.compile(r"(?:[+-]?(?:(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)")
# A {placeholder}; splitting on it keeps the placeholder body at odd indices.
par_re = re.compile(r"\{([^\}]*)\}")
escaped_whitespace_re = re.compile(r"(\\\s)+")

def format_to_regexp(format):
    """Compile an annotation format like ``{x}%`` into ``(regexp, transform)``.

    Even split-chunks are literal text (escaped; runs of whitespace relaxed to
    ``\\s+``/``\\s*``); odd chunks are placeholders. ``{x}`` captures the value,
    a blank placeholder ``{}`` matches a number without capturing it, and
    ``{100x}``/``{100*x}``/``{x/100}`` additionally select a rescaling
    ``transform`` applied to the captured value.
    """
    chunks = par_re.split(format.strip())
    pieces = []
    transform = lambda x: x
    for index, chunk in enumerate(chunks):
        if index % 2 == 0:
            # Literal chunk: blank literals must match some whitespace,
            # non-blank literals tolerate whitespace loosely.
            ws_pattern = r"\\s+" if chunk.strip() == "" else r"\\s*"
            pieces.append(escaped_whitespace_re.sub(ws_pattern, re.escape(chunk)))
        elif chunk.strip() == "":
            pieces.append(float_value_nc.pattern)
        else:
            pieces.append(float_value_re.pattern)
            spec = chunk.strip()
            if spec in ("100*x", "100x"):
                transform = lambda x: 100*x
            elif spec == "x/100":
                transform = lambda x: x/100
    #return re.compile('^'+regexp+'$'), fn
    return re.compile('^' + "".join(pieces)), transform

def extract_value(cell_value, format):
    """Extract the numeric value of *cell_value* according to *format*.

    Returns a ``Decimal`` (rescaled per the format), or ``Decimal('NaN')``
    when the cell does not match or the format captures nothing.
    """
    # Normalize the raw cell: glue '%' to the number, drop thousands commas,
    # and treat parentheses as spacing.
    normalized = re.sub(r"\s+%", "%", cell_value).replace(",", "")
    normalized = normalized.replace("(", " ").replace(")", " ").strip()
    regexp, transform = format_to_regexp(format)
    match = regexp.match(normalized)
    if match is None or not len(match.groups()):
        return Decimal('NaN')
    return transform(Decimal(match.group(1)))
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from .bm25_naive import linked_proposals
from ...pipeline_logger import pipeline_logger


class Linker:
    """Pipeline step that links table cells to taxonomy entries.

    Delegates the actual matching to ``linked_proposals`` and logs the input
    and output through the pipeline logger.
    """

    step = "linking"

    def __init__(self, name, taxonomy_linking, dataset_extractor):
        self.taxonomy_linking = taxonomy_linking
        self.dataset_extractor = dataset_extractor
        self.__name__ = name

    def __call__(self, paper, tables, topk=1):
        """Return the top-k linking proposals per cell, indexed by cell_ext_id."""
        pipeline_logger(f"{Linker.step}::call", paper=paper, tables=tables)

        linked = linked_proposals(
            paper.paper_id,
            paper,
            tables,
            taxonomy_linking=self.taxonomy_linking,
            dataset_extractor=self.dataset_extractor,
            topk=topk,
        ).set_index('cell_ext_id')

        pipeline_logger(f"{Linker.step}::linked", paper=paper, tables=tables, proposals=linked)
        return linked

    def get_best_proposals(self, proposals):
        """Keep only the highest-ranked proposal for each cell."""
        return proposals.groupby('cell_ext_id').head(1)
All Rights Reserved + +metrics = { + 'Accuracy': ['acc', 'accuracy'], + 'BLEU': ['bleu'], + 'BLEU score': ['bleu'], + 'Character Error Rate': ['cer', 'cers'], + 'Error': ['error', 'err', 'error rate'], + 'Exact Match Ratio': ['exact match'], + 'F1': ['f1', 'f1 score'], + 'F1 score': ['f1', 'f1 score'], + 'MAP': ['map'], + 'Percentage error': ['wer', 'per', 'wers', 'pers', 'word error rate', 'word error rates', 'phoneme error rates', + 'phoneme error rate', 'error', 'error rate', 'error rates'], + 'Word Error Rate': ['wer', 'wers', 'word error rate', 'word error rates', 'error', 'error rate', 'error rates'], + 'Word Error Rate (WER)': ['wer', 'wers', 'word error rate', 'word error rates', 'error', 'error rate', 'error rates'], + 'Word Accuracy': ['accuracy', 'word accuracy', 'acc', 'word acc'], + 'ROUGE-1': ['r1'], + 'ROUGE-2': ['r2'], + 'ROUGE-L': ['rl'], + 'Precision': ['precision'], + 'Recall': ['recall'], + # RAIN REMOVAL + 'PSNR': ['psnr', 'psnr (db)', 'mean psnr'], + 'SSIM': ['ssim'], + 'UQI': ['uqi'], + 'VIF': ['vif'], + 'SSEQ': ['sseq'], + 'NIQE': ['niqe'], + 'BLINDS-II': ['blinds-ii'], + 'FSIM': ['fsim'], + # SEMANTIC SEGMENTATION + 'Mean IoU': ['miou', 'mean iou', 'mean iu', 'class iou', 'iou cla', 'cla iou'], + 'Pixel Accuracy': ['pixel accuracy', 'pixel acc', 'pixel acc.', 'pixacc', 'pixel'], + 'Category IoU': ['cat iou', 'iou cat'], + 'class iIoU': ['class iiou', 'iiou cla'], + 'Category iIoU': ['cat iiou', 'iiou cat'], + 'Mean Accuracy': ['mean acc', 'mean', 'acc', 'accuracy', 'mean accuracy'], + 'Mean Error': ['mean err', 'mean', 'err', 'mean error', 'error'], + 'Top-1 Accuracy': ['top 1 accuracy', 'top 1', 'top 1 acc'], + 'Top-5 Accuracy': ['top 5 accuracy', 'top 5', 'top 5 acc'], + 'Top-1 Error Rate': ['top 1 error', 'top 1', 'top 1 err'], + 'Top-5 Error': ['top 5 error', 'top 5', 'top 5 err'] +} + +# datasets[taxonomy name] is a list of normalized evidences for taxonomy name +datasets = { + 'Hub5\'00 Average': ['avg', 'full', 'hub5', 'sum', 
'evaluation'], + 'Hub5\'00 Switchboard': ['swbd', 'swb', 'hub5 swb', 'hub5 swbd', 'switchboard'], + 'Hub5\'00 CallHome': ['ch', 'hub5 ch', 'call home', 'chm'], + 'TIMIT': ['timit'], + 'WSJ eval92': ['wsj eval 92', 'eval 92', 'wsj'], + 'WSJ eval93': ['wsj eval 93', 'eval 93', 'wsj'], + 'LibriSpeech test-clean': ['libri speech test clean', 'libri speech', 'test', 'tst', 'clean', 'test clean'], + 'LibriSpeech test-other': ['libri speech test other', 'libri speech', 'test', 'tst', 'other', 'test other', + 'noisy'], + 'Babel Cebuano': ['babel cebuano', 'babel', 'cebuano', 'ceb'], + 'Babel Kazakh': ['babel kazakh', 'babel', 'kazakh', 'kaz'], + 'Babel Kurmanji': ['babel kurmanji', 'babel', 'kurmanji', 'kur'], + 'Babel Lithuanian': ['babel lithuanian', 'babel', 'lithuanian', 'lit'], + 'Babel Telugu': ['babel telugu', 'babel', 'telugu', 'tel'], + 'Babel Tok Pisin': ['babel tok pisin', 'babel', 'tok pisin', 'tok'], + + 'Ask Ubuntu': ['ask ubuntu', 'ask u', 'ubuntu'], + 'Chatbot': ['chatbot'], + 'Web Apps': ['web apps'], + 'CHiME clean': ['chime clean', 'chime', 'clean'], + 'CHiME real': ['chime real', 'chime', 'real'], + 'CHiME simu': ['chime simu', 'chime', 'simu', 'sim', 'simulated'], + 'CHiME-4 real 6ch': ['chime 4 real 6 ch', 'chime 4', 'real', '6 channel'], + 'AG News': ['ag news', 'ag'], + 'GigaWord': ['gigaword', 'giga'], + 'GEOTEXT': ['geotext', 'geo'], + 'IWSLT2015 English-Vietnamese': ["iwslt 2015 english vietnamese", "iwslt", "2015", "english vietnamese", "en vi", + "iwslt 15 english vietnamese", "iwslt 15 en vi", "english", "en", "vietnamese", + "vi"], + 'IWSLT2011 English TED Talks': ["iwslt 2011 english ted talks", "iwslt", "2011", "english", "en", "eng", "ted", + "ted talks", "english ted talks"], + 'IWSLT2012 English TED Talks': ["iwslt 2012 english ted talks", "iwslt", "2012", "english", "en", "eng", "ted", + "ted talks", "english ted talks"], + 'IWSLT2014 English-German': ["iwslt 2014 english german", "iwslt", "2014", "english german", "en de", "en", "de", 
+ "english", "german"], + 'Rich Transcription 2002': ["rich transcription 2002", "rich transcription 02", "rt 2002", "2002", "rt 02", "rich", + "transcription"], + 'Rich Transcription 2003': ["richt ranscription 2003", "rich transcription 03", "rt 2003", "2003", "rt 03", "rich", + "transcription"], + 'Rich Transcription 2004': ["rich transcription 2004", "rich transcription 04", "rt 2004", "2004", "rt 04", "rich", + "transcription"], + 'DIRHA English WSJ real': ['dirha english wsj real', 'dirha', 'english', 'en', 'eng', 'real', 'wsj'], + 'DIRHA English WSJ simu': ['dirha english wsj simu', 'dirha', 'english', 'en', 'eng', 'simu', 'wsj', 'simulated'], + 'VCTK clean': ["vctk clean", "vctk", "clean", "voice bank", "voice", "bank", "corpus"], + 'VCTK noisy': ["vctk noisy", "vctk", "noisy", "voice bank", "voice", "bank", "corpus"], + 'VoxForge American-Canadian': ["vox forge american canadian", "vox forge", "vox", "forge", "american canadian", + "american", "canadian", "us ca"], + 'VoxForge Commonwealth': ["vox forge common wealth", "vox forge", "common wealth", "vox", "forge", "common", + "wealth"], + 'VoxForge European': ["vox forge european", "vox forge", "european", "vox", "forge", "eu"], + 'VoxForge Indian': ["vox forge indian", "vox forge", "indian", "vox", "forge"], + + # Face Alignment + 'AFLW test': ['annotated faces in the wild', 'annotated facial landmarks in the wild', 'facial', 'landmark', + 'annotated', 'faces', 'in the wild', 'faces in the wild'], + + # Human Part Segmentation + 'CIHP val': ['crowd instance level human parsing'], + 'MHP v2.0 val': ['multi human parsing'], + + # Image Generation + # 'LSUN Bedroom': ['large scale scene understanding'], + + # RAIN REMOVAL + 'Raindrop': ['raindrop'], + 'Rain100H': ['rain100h'], + 'Rain100L': ['rain100l'], + 'Rain12': ['rain12'], + 'Rain800': ['rain800'], + 'Rain1400': ['rain1400'], + 'Real Rain': ['real rain'], + 'Rain in Surveillance': ['ris'], + 'Rain in Driving': ['rid'], + 'DID-MDN': ['did-mdn'], + 
'SOTS': ['sots'], + 'Test 1': ['test 1'], + 'RainSynLight25': ['rainsynlight25'], + 'RainSynComplex25': ['rainsyncomplex25'], + 'NTURain': ['nturain'], + 'RainSynAll100': ['rainsynall100'], + 'SPA-DATA': ['spa-data'], + 'LasVR': ['lasvar'], + # SEMANTIC SEGMENTATION + 'ADE20K': ['ade20k'], + 'ADE20K test': ['ade20k test', 'ade20k', 'test'], + + 'COCO-Stuff': ['coco stuff', 'coco', 'stuff'], + 'PASCAL VOC 2012': ['voc 12', 'pascal voc 12', 'voc12' 'pascal'], + 'PASCAL VOC 2012 test': ['voc 12', 'pascal voc 12', 'voc12' 'pascal'], + 'PASCAL VOC 2011': ['voc 11', 'pascal voc 11', 'voc11' 'pascal'], + 'PASCAL VOC 2011 test': ['voc 11', 'pascal voc 11', 'voc11', 'pascal'], + + 'ImageNet': ['imagenet'], + 'Cityscapes test': ['cityscapes'], + 'PASCAL Context': ['pascal context', 'pascal', 'context', 'pascalcontext', 'pascal-context'], + 'PASCAL Context val': ['pascal context', 'pascal', 'context', 'pascalcontext', 'val', 'pascal-context'], + 'PASCAL Person-Part': ['pascal person part'], + 'ParseNet': ['parsenet'], + 'LIP': ['lip'], + 'SUN-RGBD': ['sun rgbd', 'sunrgbd', 'sunrgb d'], + 'NYU Depth v2': ['nyudv2'], + + # Gender Classification + 'LFWA': ['labeled faces in the wild', 'faces in the wild', 'faces', 'in the wild'], + + # Recommendation Systems + 'NeurIPS Co-authorship': ['nips'], + + # Sentiment Analysis + 'Pang and Lee 2004': ['pl04'], + + # Semantic Textual Similarity + 'STS Benchmark': ['sts b'], + + # Paraphrase Identification + 'Microsoft Research Paraphrase Corpus': ['MRPC'], +} + +tasks = {} + +complementary_metrics = {k.lower(): v for k, v in { + 'Accuracy': 'Error', + 'Error': 'Accuracy', + 'Acc': 'Err', + 'Err': 'Acc', + 'Percentage Error': 'Accuracy', + 'Error rate': 'Accuracy', + 'Word Error Rate': 'Word Accuracy', + 'Word Error Rate (WER)': 'Word Accuracy', + 'Top-1 Accuracy': 'Top-1 Error Rate', + 'Top-3 Accuracy': 'Top-3 Error Rate', + 'Top-5 Accuracy': 'Top-5 Error Rate', + 'Top 1 Accuracy': 'Top 1 Error Rate', + 'Top 3 Accuracy': 'Top 3 Error 
Rate', + 'Top 5 Accuracy': 'Top 5 Error Rate', + 'Top-1 Error Rate': 'Top-1 Accuracy', + 'Top-3 Error Rate': 'Top-3 Accuracy', + 'Top-5 Error Rate': 'Top-5 Accuracy', + 'Top 1 Error Rate': 'Top 1 Accuracy', + 'Top 3 Error Rate': 'Top 3 Accuracy', + 'Top 5 Error Rate': 'Top 5 Accuracy', + 'Top-1 Error': 'Top-1 Accuracy', + 'Top-3 Error': 'Top-3 Accuracy', + 'Top-5 Error': 'Top-5 Accuracy', + 'Top 1 Error': 'Top 1 Accuracy', + 'Top 3 Error': 'Top 3 Accuracy', + 'Top 5 Error': 'Top 5 Accuracy', + 'Classification Accuracy': 'Classification Error', + 'Classification Error': 'Classification Accuracy', +}.items()} + +stop_words = { + "a", "an", "and", "are", "as", "at", "be", "but", "by", + "for", "if", "in", "into", "is", "it", + "no", "not", "of", "on", "or", "such", + "that", "the", "their", "then", "there", "these", + "they", "this", "to", "was", "will", "with" +} diff --git a/axcell/models/linking/metrics.py b/axcell/models/linking/metrics.py new file mode 100644 index 0000000..6496b9f --- /dev/null +++ b/axcell/models/linking/metrics.py @@ -0,0 +1,147 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from fastai.text import *
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
from dataclasses import dataclass
from IPython.display import HTML, display

@dataclass
class CM:
    """Binary confusion-matrix counts (floats so derived ratios divide cleanly)."""
    tp: float = 0
    fn: float = 0
    fp: float = 0
    tn: float = 0

class Metrics:
    """Scores predicted linking proposals against gold sota records.

    ``df`` is expected to be the outer merge of gold records and proposals,
    with per-column ``*_gold``/``*_pred`` pairs (e.g. ``dataset_gold`` /
    ``dataset_pred``) and missing values filled with 'not-present'.
    Note: ``np`` comes from the ``fastai.text`` wildcard import above.
    """

    def __init__(self, df, experiment_name="unk", topk_metrics=False):
        # TODO fix this, it mask the fact that our model may return more values than it should for "model
        #self.df = df[~df["model_type_gold"].str.contains('not-present') | df["model_type_pred"].str.contains('model-best')]
        # Keep only rows where either side claims the cell is a best-model result.
        self.df = df[df["model_type_gold"].str.contains('model-best') | df["model_type_pred"].str.contains('model-best')]
        self.experiment_name = experiment_name
        self.metric_type = 'best'
        self.topk_metrics = topk_metrics

    def matching(self, *col_names):
        """Boolean vector: rows where prediction equals gold in ALL given columns."""
        return np.all([self.df[f"{name}_pred"] == self.df[f"{name}_gold"] for name in col_names], axis=0)

    def matching_fraction(self, *col_names):
        """Fraction of rows fully matching on the given columns."""
        return self.matching(*col_names).sum() / len(self.df)

    def is_predicted_as_relevant(self, *col_names):
        # NOTE(review): the np.all result is discarded and nothing is returned —
        # this method looks unfinished; confirm intent before relying on it.
        np.all([self.df[f"{name}_pred"]])

    def binary_confusion_matrix(self, *col_names, best_only=True):
        """Confusion counts for "relevant cell predicted with correct values".

        A row counts as positive when its model type is 'model-best'
        (prediction side only when ``best_only``); a positive is true only
        if additionally all ``col_names`` match between pred and gold.
        """
        relevant_gold = self.df["model_type_gold"].str.contains('model-best')
        if best_only:
            relevant_pred = self.df["model_type_pred"].str.contains('model-best')
        else:
            relevant_pred = relevant_gold
        # present_pred = np.all([self.df[f"{name}_pred"] != 'not-present' for name in col_names], axis=0)

        pred_positive = relevant_pred  # & present_pred
        gold_positive = relevant_gold
        equal = self.matching(*col_names)

        if self.topk_metrics:
            # With top-k proposals per cell, a cell counts as matched if ANY of
            # its k proposals matches; positives collapse to one row per cell.
            equal = pd.Series(equal, index=pred_positive.index).groupby('cell_ext_id').max()
            pred_positive = pred_positive.groupby('cell_ext_id').head(1)
            gold_positive = gold_positive.groupby('cell_ext_id').head(1)

        tp = (equal & pred_positive & gold_positive).sum()
        tn = (equal & ~pred_positive & ~gold_positive).sum()
        fp = (pred_positive & (~equal | ~gold_positive)).sum()
        fn = (gold_positive & (~equal | ~pred_positive)).sum()

        return CM(tp=tp, tn=tn, fp=fp, fn=fn)

    def calc_metric(self, metric_name, metric_fn, *col_names, best_only=True):
        """Apply *metric_fn* (a CM -> float function) per column and, when more
        than one column is given, jointly over all columns ("..._all")."""
        prefix = "best_" if best_only else ""
        result = {f"{prefix}{metric_name}_{col}": metric_fn(self.binary_confusion_matrix(col, best_only=best_only)) for col in col_names}
        if len(col_names) > 1:
            cm = self.binary_confusion_matrix(*col_names, best_only=best_only)
            result[f"{prefix}{metric_name}_all"] = metric_fn(cm)
            result[f"{prefix}TP_all"] = cm.tp
            result[f"{prefix}FP_all"] = cm.fp

        # Hack to present count on which precision is done
        relevant_gold = self.df["model_type_gold"].str.contains('model-best')
        if best_only:
            relevant_pred = self.df["model_type_pred"].str.contains('model-best')
        else:
            relevant_pred = relevant_gold
        result[f"{prefix}count"] = (relevant_pred | relevant_gold).sum()

        return result

    def accuracy(self, *col_names):
        """Per-column and joint exact-match accuracy over the filtered rows."""
        result = {f"matching_accuracy_{col}": self.matching_fraction(col) for col in col_names}
        if len(col_names) > 1:
            result['matching_accuracy_all'] = self.matching_fraction(*col_names)
        result["matching_count"] = len(self.df)
        return result

    # True Positive - m
    # False Positive - cell marked as relevant but with incorrect values

    def confusion_matrix(self, name):
        """Multi-class confusion matrix for a single column; returns (cm, labels)."""
        pred_y = np.array(self.df[f"{name}_pred"])
        true_y = np.array(self.df[f"{name}_gold"])
        labels = list(sorted(set(list(true_y) + list(pred_y))))
        # NOTE(review): labels is passed positionally; newer scikit-learn
        # requires the labels= keyword — confirm the pinned sklearn version.
        cm = confusion_matrix(true_y, pred_y, labels)
        return cm, labels

    def plot_confusion_matrix(self, name):
        """Render the confusion matrix for *name* as a seaborn heatmap."""
        cm, target_names = self.confusion_matrix(name)
        # cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        df_cm = pd.DataFrame(cm, index=[i for i in target_names],
                             columns=[i for i in target_names])
        plt.figure(figsize=(20, 20))
        ax = sn.heatmap(df_cm,
                        annot=True,
                        square=True,
                        fmt="d",
                        cmap="YlGnBu",
                        mask=cm == 0,  # hide zero cells for readability
                        linecolor="black",
                        linewidths=0.01)
        ax.set_ylabel("True")
        ax.set_xlabel("Predicted")

    def precision(self, *col_names, best_only=True):
        """Precision = TP / (TP + FP) per column and jointly."""
        return self.calc_metric("precision", lambda cm: cm.tp / (cm.tp + cm.fp), *col_names, best_only=best_only)

    def recall(self, *col_names, best_only=True):
        """Recall = TP / (TP + FN) per column and jointly."""
        return self.calc_metric("recall", lambda cm: cm.tp / (cm.tp + cm.fn), *col_names, best_only=best_only)

    def metrics(self):
        """Summary DataFrame (metric name -> value) over the standard columns."""
        cols = ["model_type", "dataset", "metric", "task", "parsed"]
        m = self.accuracy(*cols)
        m.update(self.precision(*cols, best_only=True))
        m.update(self.recall(*cols, best_only=True))

        m["experiment_name"] = self.experiment_name
        m["test_type"] = self.metric_type

        df = pd.DataFrame([(k,v) for k,v in m.items()], columns=["metric", "value"]).set_index("metric")
        return df

    def errors(self, *col_names):
        """Rows where at least one of the given (or standard) columns mismatches."""
        cols = col_names
        if not cols:
            cols = ["model_type", "dataset", "metric", "task", "parsed"]
        return self.df[~self.matching(*cols)]

    def show(self, df):
        """Render *df* as HTML with untruncated columns."""
        df = df.copy()
        # NOTE(review): 'link'.format(x) ignores x and always yields 'link';
        # an HTML anchor template was likely lost in extraction — confirm.
        df['cell_id'] = df.index.map(
            lambda x: 'link'.format(x))
        old_width = pd.get_option('display.max_colwidth')
        # -1 disables truncation (deprecated alias of None in newer pandas)
        pd.set_option('display.max_colwidth', -1)
        display(HTML(df.to_html(escape=False)))
        pd.set_option('display.max_colwidth', old_width)

    def show_errors(self):
        """Convenience: render all mismatching rows."""
        self.show(self.errors())
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from collections import Counter


def get_probs(occurrences):
    """
    Computes conditional probabilities based on frequency of co-occurrences

    Parameters
    ----------
    occurrences: occurences[x][y] number of times with (X=x and Y=y)

    Returns
    -------
    probs : probs[x][y] = Pr(Y=y | X=x)
    reverse_probs : reverse_probs[y][x] = Pr(X=x | Y=y)
    """
    forward = {}
    backward = {}
    y_totals = Counter()

    # First pass: normalize each row into Pr(Y | X=x) and accumulate
    # the marginal counts of each y.
    for x, ys in occurrences.items():
        row_total = sum(ys.values())
        forward[x] = {y: count / row_total for y, count in ys.items()}
        y_totals.update(ys)

    # Second pass: normalize by the y marginals to get Pr(X | Y=y).
    for x, ys in occurrences.items():
        for y, count in ys.items():
            backward.setdefault(y, {})[x] = count / y_totals[y]

    return forward, backward


def reverse_probs(probs):
    """
    Reverses the conditional probability assuming that variables are uniformly distributed

    Parameters
    ----------
    probs : probs[x][y] = Pr(Y=y | X=x)

    Returns
    -------
    reverse : reverse[y][x] = Pr(X=x | Y=y) assuming X is uniform
    """
    # Transpose the nested mapping, then renormalize each y-row so it sums
    # to one (Bayes with a uniform prior over x).
    reverse = {}
    for x, row in probs.items():
        for y, p in row.items():
            reverse.setdefault(y, {})[x] = p
    for y_row in reverse.values():
        total = sum(y_row.values())
        for x in y_row:
            y_row[x] /= total
    return reverse
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import pandas as pd
from enum import Enum


class FilterOutReason(Enum):
    """Machine-readable reasons for which a proposal can be filtered out."""
    TrainDataset = "train-dataset"
    DevDataset = "dev-dataset"
    EmptyModelName = "empty-model-name"
    ModelCompeting = "model-competing"


class ProposalsFilter:
    """Base class for proposal filters.

    Subclasses implement ``_filter(proposals)`` returning a pair
    ``(which, reason)``: ``which`` is a boolean mask over ``proposals.index``
    (True = keep) and ``reason`` is a string Series indexed by the rejected
    rows only.
    """
    step = "proposals_filtering"

    def _filter(self, proposals):
        raise NotImplementedError

    def filter(self, proposals):
        which, reason = self._filter(proposals)
        self.log(proposals=proposals, which=which, reason=reason)
        return which, reason

    def __rshift__(self, other):
        # ``f1 >> f2`` composes two filters into a single one
        return CompoundFilter([self, other])

    def __call__(self, proposals):
        which, reason = self.filter(proposals)
        return proposals[which]

    def log(self, **kwargs):
        # Imported lazily so the filters can be imported (and unit-tested)
        # without pulling in the whole pipeline.
        from ...pipeline_logger import pipeline_logger
        pipeline_logger(f"filtering::{self.step}::filtered", **kwargs)


class CompoundFilter(ProposalsFilter):
    """Applies a sequence of filters; a row is kept only if all of them keep it."""
    step = "compound_filtering"

    def __init__(self, filters):
        self.filters = filters

    def _filter(self, proposals):
        agg_which = pd.Series(data=True, index=proposals.index)
        agg_reason = pd.Series(data="", index=proposals.index)

        for f in self.filters:
            which, reason = f.filter(proposals)
            # record the reason of the first filter that rejects a row
            agg_reason[agg_which & ~which] = reason
            agg_which &= which
            proposals = proposals[which]
        return agg_which, agg_reason[~agg_which]


class NopFilter(ProposalsFilter):
    """A no-op filter that accepts every proposal."""
    step = "nop_filtering"

    def _filter(self, proposals):
        which = pd.Series(data=True, index=proposals.index)
        # explicit dtype: an empty ``pd.Series()`` has an ambiguous
        # (and deprecated) default dtype
        reason = pd.Series(dtype=object)
        return which, reason


# filter proposals for which structure prediction
# * was unable to find model type or
# * found dataset cell containing "dev" or "train"
# this filter could be applied before taxonomy linking,
# but to make error analysis easier it's applied after
class StructurePredictionFilter(ProposalsFilter):
    step = "structure_filtering"

    def _filter(self, proposals):
        # compute each mask once; reasons are assigned in increasing
        # priority order (train < dev < empty-model-type)
        has_train = proposals.struct_dataset.str.contains('train')
        has_dev = proposals.struct_dataset.str.contains('dev')
        empty_type = proposals.struct_model_type == ''

        which = ~empty_type & ~has_dev & ~has_train
        reason = pd.Series(data="", index=proposals.index)
        reason[has_train] = "train-dataset"
        reason[has_dev] = "dev-dataset"
        reason[empty_type] = "empty-model-type"

        return which, reason[~which]


class ConfidenceFilter(ProposalsFilter):
    """Rejects proposals whose linking confidence is below a threshold."""
    step = "confidence_filtering"

    def __init__(self, confidence=-1):
        self.confidence = confidence

    def _filter(self, proposals):
        which = proposals.confidence >= self.confidence
        reason = "confidence " + proposals[~which].confidence.round(2).astype(str) + f" < {self.confidence}"
        return which, reason[~which]

    def log(self, **kwargs):
        super().log(**kwargs, confidence=self.confidence)


class BestResultFilter(ProposalsFilter):
    """Keeps, per (dataset, metric, task, context) group, only the single best
    'model-best' proposal; everything else in the group is rejected.

    The direction of "best" (higher vs. lower) comes from the taxonomy's
    metrics info, with name-based heuristics as a fallback.
    """
    step = "best_result_filtering"

    def __init__(self, taxonomy, context="paper"):
        assert context in ["paper", "table"]
        self.metrics_info = taxonomy.metrics_info
        self.context = context

    def _direction(self, task, dataset, metric):
        # returns >= 0 when higher is better, < 0 when lower is better
        key = (task, dataset, metric)
        if key in self.metrics_info:
            return self.metrics_info[key]
        if metric in self.metrics_info:
            return self.metrics_info[metric]
        if 'error' in metric.lower():
            return -1
        if 'accuracy' in metric.lower():
            return 1
        return 0

    def _filter(self, proposals):
        reason = pd.Series(data="", index=proposals.index)
        indices = []

        # group either by paper id or by paper/table id, extracted from the
        # "paper/table/row/col" index
        if self.context == "paper":
            context_column = proposals.index.to_series().str.split('/', expand=False).apply(lambda x: x[0])
        else:
            context_column = proposals.index.to_series().str.split('/', expand=False).apply(lambda x: x[0] + "/" + x[1])

        candidates = proposals[(proposals.model_type == 'model-best') & ~proposals.parsed.isna()]
        for key_all, group in candidates.groupby(by=["dataset", "metric", "task", context_column]):
            dataset, metric, task, paper = key_all
            if self._direction(task, dataset, metric) >= 0:
                index = group.parsed.idxmax()
            else:
                index = group.parsed.idxmin()
            indices.append(index)
            reason[group.index[group.index != index]] = "replaced by " + str(index)

        reason[proposals.struct_model_type == 'model-competing'] = "model-competing"
        which = proposals.index.to_series().isin(indices)
        return which, reason[~which]

    def log(self, **kwargs):
        super().log(**kwargs, context=self.context)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from unidecode import unidecode
import re

# --- cell / name cleaning & normalization helpers ---

# parenthesised or bracketed fragments, e.g. "(ours)" or "[1]"
parens_re = re.compile(r"\([^)]*?\)|\[[^]]*?\]")

# the word-ish core of a string, used to drop surrounding decoration
strip_nonalnum_re = re.compile(r"^\W*(\w.*\b)\W*$")


def strip_nonalnum(s):
    """Strip leading/trailing non-word characters; '' if no word characters remain."""
    m = strip_nonalnum_re.match(s)
    return m.group(1) if m else ""


def remove_parens(text):
    """Drop any (...) or [...] fragments."""
    return parens_re.sub("", text)


def clean_name(name):
    """ASCII-fold a name and drop parenthesised fragments."""
    return remove_parens(unidecode(name).strip()).strip()


def clean_cell(cell):
    """Fully clean a table cell: ASCII-fold, drop parens, strip decoration."""
    return strip_nonalnum(clean_name(cell))


year_2k_re = re.compile(r"20(\d\d)")
hyphens_re = re.compile(r"[-_'`–’→]")
ws_re = re.compile(r"\s+")


refs_re = re.compile(r"(xxtable-)?xxanchor-[^ ]*|xxref-[^ ]*")


def remove_references(s):
    """Drop xxanchor-/xxref- reference markers."""
    return refs_re.sub("", s)


def normalize_dataset_ws(name):
    """Normalize a dataset name, turning hyphen-like characters into spaces."""
    name = remove_references(name)
    name = hyphens_re.sub(" ", name)
    name = year_2k_re.sub(r"\1", name)
    name = ws_re.sub(" ", name)
    return unidecode(name.strip().lower())


def normalize_dataset(name):
    """Normalize a dataset name, removing hyphen-like characters altogether."""
    name = remove_references(name)
    name = year_2k_re.sub(r"\1", name)
    name = hyphens_re.sub("", name)
    name = ws_re.sub(" ", name)
    return unidecode(name.strip().lower())


def normalize_cell(s):
    """Keep only alphanumeric characters, ASCII-folded."""
    return unidecode("".join(ch for ch in s if ch.isalnum()))


def normalize_cell_ws(s):
    """Keep only alphanumeric and whitespace characters, ASCII-folded."""
    return unidecode("".join(ch for ch in s if ch.isalnum() or ch.isspace()))

# end of cleaning & normalization
class LabelsExt(Enum):
    """Fine-grained cell labels used for the extended confusion matrix.

    Member order is significant: ``[x.value for x in LabelsExt]`` is used as
    the label order of the extended confusion matrix.
    """
    OTHER = 0
    PARAMS = 6
    TASK = 7
    DATASET = 1
    SUBDATASET = 8
    PAPER_MODEL = 2
    BEST_MODEL = 9
    ENSEMBLE_MODEL = 10
    COMPETING_MODEL = 3
    METRIC = 4
    EMPTY = 5


# Annotated cell type -> fine-grained LabelsExt value; callers fall back to
# OTHER (0) for any unmapped cell type via ``label_map_ext.get(x, 0)``.
label_map_ext = {
    "dataset": LabelsExt.DATASET.value,
    "dataset-sub": LabelsExt.SUBDATASET.value,
    "model-paper": LabelsExt.PAPER_MODEL.value,
    "model-best": LabelsExt.BEST_MODEL.value,
    "model-ensemble": LabelsExt.ENSEMBLE_MODEL.value,
    "model-competing": LabelsExt.COMPETING_MODEL.value,
    "dataset-metric": LabelsExt.METRIC.value,
    "model-params": LabelsExt.PARAMS.value,
    "dataset-task": LabelsExt.TASK.value,
}
analyzer='word', lowercase=True, class_weight='balanced', multinomial_type='multinomial', +# solver='lbfgs', C=0.1, dual=False, penalty='l2', ngram_range=[1, 3], +# min_df=10, max_df=0.9, max_iter=1000, results={}, has_model=False) + +# ULMFiT related parameters +# remove_num, drop_duplicates, this_paper, merge_fragments, merge_type, evidence_source, split_btags +# fixed_tokenizer?, fixed_this_paper (remove), mask, evidence_limit, context_tokens, lowercase +# class_weight? (consider adding support), + @dataclass class Experiment: - vectorizer: str = "tfidf" this_paper: bool = False merge_fragments: bool = False merge_type: str = "concat" # "concat", "vote_maj", "vote_avg", "vote_max" @@ -43,22 +86,11 @@ class Experiment: mask: bool = False # if True and evidence_source = "text_highlited", replace ... with xxmask evidence_limit: int = None # maximum number of evidences per cell (grouped by (ext_id, this_paper)) context_tokens: int = None # max. number of words before and after - analyzer: str = "word" # "char", "word" or "char_wb" lowercase: bool = True remove_num: bool = True drop_duplicates: bool = True mark_this_paper: bool = False - - class_weight: str = None - multinomial_type: str = "manual" # "manual", "ovr", "multinomial" - solver: str = "liblinear" # 'lbfgs' - large, liblinear for small datasets - C: float = 4.0 - dual: bool = True - penalty: str = "l2" - ngram_range: tuple = (1, 2) - min_df: int = 3 - max_df: float = 0.9 - max_iter: int = 1000 + distinguish_model_source: bool = True results: dict = dataclasses.field(default_factory=dict) @@ -74,19 +106,31 @@ def _get_next_exp_name(self, dir_path): return dir_path / name raise Exception("You have too many files in this dir, really!") - def _save_model(self, path): + @staticmethod + def _dump_pickle(obj, path): with open(path, 'wb') as f: - pickle.dump(self._model, f) + pickle.dump(obj, f) - def _load_model(self, path): + @staticmethod + def _load_pickle(path): with open(path, 'rb') as f: - self._model = 
pickle.load(f) - return self._model + return pickle.load(f) + + def _save_model(self, path): + self._dump_pickle(self._model, path) + + def _load_model(self, path): + self._model = self._load_pickle(path) + return self._model def load_model(self): path = self._path.parent / f"{self._path.stem}.model" return self._load_model(path) + def save_model(self, path): + if hasattr(self, "_model"): + self._save_model(path) + def save(self, dir_path): dir_path = Path(dir_path) dir_path.mkdir(exist_ok=True, parents=True) @@ -94,9 +138,7 @@ def save(self, dir_path): j = dataclasses.asdict(self) with open(filename, "wt") as f: json.dump(j, f) - if hasattr(self, "_model"): - fn = filename.stem - self._save_model(dir_path / f"{fn}.model") + self.save_model(dir_path / f"{filename.stem}.model") return filename.name def to_df(self): @@ -115,12 +157,13 @@ def new_experiment(self, **kwargs): def update_results(self, **kwargs): self.results.update(**kwargs) - def get_trained_model(self, train_df): - nbsvm = NBSVM(experiment=self) - nbsvm.fit(train_df["text"], train_df["label"]) - self._model = nbsvm + def train_model(self, train_df, valid_df): + raise NotImplementedError("train_model should be implemented in subclass") + + def get_trained_model(self, train_df, valid_df): + self._model = self.train_model(train_df, valid_df) self.has_model = True - return nbsvm + return self._model def _limit_context(self, text): parts = elastic_tag_split_re.split(text) @@ -145,6 +188,8 @@ def _limit_context(self, text): def _transform_df(self, df): + df.cell_reference = (df.cell_reference != '').astype(str) + df.cell_styles = df.cell_styles.astype(str) if self.merge_type not in ["concat", "vote_maj", "vote_avg", "vote_max"]: raise Exception(f"merge_type must be one of concat, vote_maj, vote_avg, vote_max, but {self.merge_type} was given") if self.mark_this_paper and (self.merge_type != "concat" or self.this_paper): @@ -164,32 +209,35 @@ def _transform_df(self, df): elif self.mask: raise 
Exception("Masking with evidence_source='text' makes no sense") + duplicates_columns = ["text", "cell_content", "cell_type", "row_context", "col_context", "cell_reference", "cell_layout", "cell_styles"] + columns_to_keep = ["ext_id", "cell_content", "cell_type", "row_context", "col_context", "cell_reference", "cell_layout", "cell_styles"] + if self.mark_this_paper: - df = df.groupby(by=["ext_id", "cell_content", "cell_type", "this_paper"]).text.apply( + df = df.groupby(by=columns_to_keep + ["this_paper"]).text.apply( lambda x: "\n".join(x.values)).reset_index() this_paper_map = { True: "this paper", False: "other paper" } df.text = "xxfld 3 " + df.this_paper.apply(this_paper_map.get) + " " + df.text - df = df.groupby(by=["ext_id", "cell_content", "cell_type"]).text.apply( + df = df.groupby(by=columns_to_keep).text.apply( lambda x: " ".join(x.values)).reset_index() elif not self.fixed_this_paper: if self.merge_fragments and self.merge_type == "concat": - df = df.groupby(by=["ext_id", "cell_content", "cell_type", "this_paper"]).text.apply( + df = df.groupby(by=columns_to_keep + ["this_paper"]).text.apply( lambda x: "\n".join(x.values)).reset_index() if self.drop_duplicates: - df = df.drop_duplicates(["text", "cell_content", "cell_type"]).fillna("") + df = df.drop_duplicates(duplicates_columns).fillna("") if self.this_paper: df = df[df.this_paper] else: if self.this_paper: df = df[df.this_paper] if self.merge_fragments and self.merge_type == "concat": - df = df.groupby(by=["ext_id", "cell_content", "cell_type"]).text.apply( + df = df.groupby(by=columns_to_keep).text.apply( lambda x: "\n".join(x.values)).reset_index() if self.drop_duplicates: - df = df.drop_duplicates(["text", "cell_content", "cell_type"]).fillna("") + df = df.drop_duplicates(duplicates_columns).fillna("") if self.split_btags: df["text"] = df["text"].replace(re.compile(r"(\)"), r" \1 ") @@ -199,6 +247,8 @@ def _transform_df(self, df): df = df.replace(re.compile(r"(^|[ ])\d+(\b|%)"), " xxnum ") df = 
df.replace(re.compile(r"\bdata set\b"), " dataset ") df["label"] = df["cell_type"].apply(lambda x: label_map.get(x, 0)) + if not self.distinguish_model_source: + df["label"] = df["label"].apply(lambda x: x if x != Labels.COMPETING_MODEL.value else Labels.PAPER_MODEL.value) df["label"] = pd.Categorical(df["label"]) return df @@ -208,13 +258,15 @@ def transform_df(self, *dfs): return transformed[0] return transformed - def _set_results(self, prefix, preds, true_y): + def _set_results(self, prefix, preds, true_y, true_y_ext=None): m = metrics(preds, true_y) r = {} r[f"{prefix}_accuracy"] = m["accuracy"] r[f"{prefix}_precision"] = m["precision"] r[f"{prefix}_recall"] = m["recall"] - r[f"{prefix}_cm"] = confusion_matrix(true_y, preds).tolist() + r[f"{prefix}_cm"] = confusion_matrix(true_y, preds, labels=[x.value for x in Labels]).tolist() + if true_y_ext is not None: + r[f"{prefix}_cm_full"] = confusion_matrix(true_y_ext, preds, labels=[x.value for x in LabelsExt]).tolist() self.update_results(**r) def evaluate(self, model, train_df, valid_df, test_df): @@ -233,9 +285,10 @@ def evaluate(self, model, train_df, valid_df, test_df): true_y = vote_results["true"] else: true_y = tdf["label"] - self._set_results(prefix, preds, true_y) + true_y_ext = tdf["cell_type"].apply(lambda x: label_map_ext.get(x, 0)) + self._set_results(prefix, preds, true_y, true_y_ext) - def show_results(self, *ds, normalize=True): + def show_results(self, *ds, normalize=True, full_cm=True): if not len(ds): ds = ["train", "valid", "test"] for prefix in ds: @@ -243,14 +296,26 @@ def show_results(self, *ds, normalize=True): print(f" * accuracy: {self.results[f'{prefix}_accuracy']:.3f}") print(f" * μ-precision: {self.results[f'{prefix}_precision']:.3f}") print(f" * μ-recall: {self.results[f'{prefix}_recall']:.3f}") - self._plot_confusion_matrix(np.array(self.results[f'{prefix}_cm']), normalize=normalize) + suffix = '_full' if full_cm and f'{prefix}_cm_full' in self.results else '' + 
def experiments_grid(base_experiment, transform=None, **params):
    """Yield experiments for the Cartesian product of hyper-parameter values.

    Parameters
    ----------
    base_experiment : experiment object; new configurations are derived from
        it via its ``new_experiment`` method
    transform : optional dict mapping a parameter name to a function
        ``(name, value) -> dict`` of attribute updates
    **params : iterables of values to sweep over, one keyword per parameter
    """
    if not params:
        yield base_experiment
        return
    # peel off one parameter and recurse over the remaining ones
    name, values = next(iter(params.items()))
    remaining = {k: v for k, v in params.items() if k != name}
    for value in values:
        if transform and name in transform:
            updates = transform[name](name, value)
        else:
            updates = {name: value}
        derived = base_experiment.new_experiment(**updates)
        yield from experiments_grid(derived, transform, **remaining)
sp_vocab) + + self._full_learner = deepcopy(self.learner) + self.learner.model = cut_ulmfit_head(self.learner.model) + self.learner.loss_func = None + + if crf_model is not None: + crf_path = Path(path) if crf_path is None else Path(crf_path) + self.crf = load_crf(crf_path / crf_model) + else: + self.crf = None + + # todo: clean Experiment from older approaches + self._e = ULMFiTExperiment(remove_num=False, drop_duplicates=False, + this_paper=True, merge_fragments=True, merge_type='concat', + evidence_source='text_highlited', split_btags=True, fixed_tokenizer=True, + fixed_this_paper=True, mask=True, evidence_limit=None, context_tokens=None, + lowercase=True, drop_mult=0.15, fp16=True, train_on_easy=False) + + def preprocess_df(self, raw_df): + return self._e.transform_df(raw_df) + + @staticmethod + def keep_alphacells(df): + # which = df.cell_content.str.contains(with_letters_re) + which = df.cell_content.str.contains(with_letters_re) + return df[which], df[~which] + + def df2tl(self, df): + text_cols = ["cell_styles", "cell_layout", "text", "cell_content", "row_context", "col_context", + "cell_reference"] + df = df[text_cols] + return TextList.from_df(df, cols=text_cols) + + def get_features(self, evidences, use_crf=True): + if use_crf: + learner = self.learner + else: + learner = self._full_learner + if len(evidences): + tl = self.df2tl(evidences) + learner.data.add_test(tl) + + preds, _ = learner.get_preds(DatasetType.Test, ordered=True) + return preds.cpu().numpy() + return np.zeros((0, n_ulmfit_features if use_crf else n_classes)) + + @staticmethod + def to_tables(df, transpose=False, n_ulmfit_features=n_ulmfit_features): + X_tables = [] + Y_tables = [] + ids = [] + C_tables = [] + for table_id, frame in df.groupby("table_id"): + rows, cols = frame.row.max()+1, frame.col.max()+1 + x_table = np.zeros((rows, cols, n_features)) + ###y_table = np.ones((rows, cols), dtype=np.int) * n_classes + c_table = np.full((rows, cols), "", dtype=np.object) + for i, r in 
frame.iterrows(): + x_table[r.row, r.col, :n_ulmfit_features] = r.features + c_table[r.row, r.col] = r.cell_content + #x_table[r.row, r.col, n_ulmfit_features:n_ulmfit_features+n_fasttext_features] = ft_model[r.text] + # if n_fasttext_features > 0: + # x_table[r.row, r.col, n_ulmfit_features:n_ulmfit_features+n_fasttext_features] = ft_model[r.cell_content] + ###y_table[r.row, r.col] = r.label + if n_layout_features > 0: + offset = n_ulmfit_features+n_fasttext_features + layout = r.cell_layout + x_table[r.row, r.col, offset] = 1 if 'border-t' in layout or 'border-tt' in layout else -1 + x_table[r.row, r.col, offset+1] = 1 if 'border-b' in layout or 'border-bb' in layout else -1 + x_table[r.row, r.col, offset+2] = 1 if 'border-l' in layout or 'border-ll' in layout else -1 + x_table[r.row, r.col, offset+3] = 1 if 'border-r' in layout or 'border-rr' in layout else -1 + x_table[r.row, r.col, offset+4] = 1 if r.cell_reference == "True" else -1 + x_table[r.row, r.col, offset+5] = 1 if r.cell_styles == "True" else -1 + for span_idx, span in enumerate(["cb", "ci", "ce", "rb", "ri", "re"]): + x_table[r.row, r.col, offset+6+span_idx] = 1 if f'span-{span}' in r.cell_layout else -1 + x_table[r.row, r.col, offset+12] = 1 if r.row == 0 else -1 + x_table[r.row, r.col, offset+13] = 1 if r.row == rows-1 else -1 + x_table[r.row, r.col, offset+14] = 1 if r.col == 0 else -1 + x_table[r.row, r.col, offset+15] = 1 if r.col == cols-1 else -1 + #x_table[r.row, r.col, -n_fasttext_features:] = ft_model[r.cell_content] + X_tables.append(x_table) + ###Y_tables.append(y_table) + C_tables.append(c_table) + ids.append(table_id) + if transpose: + X_tables.append(x_table.transpose((1, 0, 2))) + ###Y_tables.append(y_table.transpose()) + C_tables.append(c_table.transpose()) + ids.append(table_id) + ###return (X_tables, Y_tables), C_tables, ids + return X_tables, C_tables, ids + + @staticmethod + def merge_with_preds(df, preds): + if not len(df): + return [] + ext_id = df.ext_id.str.split("/", 
expand=True) + return list(zip(ext_id[0] + "/" + ext_id[1], ext_id[2].astype(int), ext_id[3].astype(int), + preds, df.text, df.cell_content, df.cell_layout, df.cell_styles, df.cell_reference, df.label)) + + @staticmethod + def merge_all_with_preds(df, df_num, preds, use_crf=True): + columns = ["table_id", "row", "col", "features", "text", "cell_content", "cell_layout", + "cell_styles", "cell_reference", "label"] + + alpha = TableStructurePredictor.merge_with_preds(df, preds) + nums = TableStructurePredictor.merge_with_preds(df_num, np.zeros((len(df_num), n_ulmfit_features if use_crf else n_classes))) + + df1 = pd.DataFrame(alpha, columns=columns) + df2 = pd.DataFrame(nums, columns=columns) + df2.label = n_classes + return df1.append(df2, ignore_index=True) + + # todo: fix numeric cells being labelled as meta / other + @staticmethod + def format_predictions(tables_preds, test_ids): + num2label = {v: k for k, v in label_map.items()} + num2label[0] = "table-meta" + num2label[Labels.PAPER_MODEL.value] = 'model-paper' + num2label[Labels.DATASET.value] = 'dataset' + num2label[max(label_map.values()) + 1] = '' + + flat = [] + for preds, ext_id in zip(tables_preds, test_ids): + paper_id, table_id = ext_id.split("/") + labels = pd.DataFrame(preds).applymap(num2label.get).values + flat.extend( + [(paper_id, table_id, r, c, labels[r, c]) for r in range(len(labels)) for c in range(len(labels[r])) if + labels[r, c]]) + return pd.DataFrame(flat, columns=["paper", "table", "row", "col", "predicted_tags"]) + + def predict_tags(self, raw_evidences, use_crf=True): + evidences, evidences_num = self.keep_alphacells(self.preprocess_df(raw_evidences)) + pipeline_logger(f"{TableStructurePredictor.step}::evidences_split", evidences=evidences, evidences_num=evidences_num) + features = self.get_features(evidences, use_crf) + df = self.merge_all_with_preds(evidences, evidences_num, features, use_crf) + tables, contents, ids = self.to_tables(df, n_ulmfit_features=n_ulmfit_features if use_crf 
else n_classes) + if use_crf: + preds = self.crf.predict(tables) + else: + preds = [] + for table in tables: + p = table[..., :n_classes].argmax(axis=-1) + p[table[..., :n_classes].max(axis=-1) == 0.0] = n_classes + preds.append(p) + return self.format_predictions(preds, ids) + + # todo: consider adding sota/ablation information + @staticmethod + def label_table(paper, table, annotations, in_place): + structure = pd.DataFrame().reindex_like(table.matrix).fillna("") + ext_id = (paper.paper_id, table.name) + if ext_id in annotations: + for _, entry in annotations[ext_id].iterrows(): + # todo: add model-ensemble support + structure.iloc[entry.row, entry.col] = entry.predicted_tags if entry.predicted_tags != "model-paper" else "model-best" + if not in_place: + table = deepcopy(table) + table.set_tags(structure.values) + return table + + # todo: take EvidenceExtractor in constructor + def label_tables(self, paper, tables, raw_evidences, in_place=False, use_crf=True): + pipeline_logger(f"{TableStructurePredictor.step}::label_tables", paper=paper, tables=tables, raw_evidences=raw_evidences) + if len(raw_evidences): + tags = self.predict_tags(raw_evidences, use_crf) + annotations = dict(list(tags.groupby(by=["paper", "table"]))) + else: + annotations = {} # just deep-copy all tables + pipeline_logger(f"{TableStructurePredictor.step}::annotations", paper=paper, tables=tables, annotations=annotations) + labeled = [self.label_table(paper, table, annotations, in_place) for table in tables] + pipeline_logger(f"{TableStructurePredictor.step}::tables_labeled", paper=paper, labeled_tables=labeled) + return labeled diff --git a/axcell/models/structure/transfo_experiment.py b/axcell/models/structure/transfo_experiment.py new file mode 100644 index 0000000..fe63850 --- /dev/null +++ b/axcell/models/structure/transfo_experiment.py @@ -0,0 +1,718 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +import time + +from .experiment import Experiment +from .nbsvm import preds_for_cell_content, preds_for_cell_content_max, preds_for_cell_content_multi +import dataclasses +from dataclasses import dataclass +from typing import Tuple +from axcell.helpers.training import set_seed +from fastai.text import * +import numpy as np +from pathlib import Path +import json + +import argparse +import glob +import logging +import os +import random + +import numpy as np +import torch +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, + TensorDataset) +from torch.utils.data.distributed import DistributedSampler + +from fastai.text import * # for utilty functions + +try: + from torch.utils.tensorboard import SummaryWriter +except: + from tensorboardX import SummaryWriter + +from tqdm import tqdm, trange +import tensorflow_datasets + +from transformers import (WEIGHTS_NAME, BertConfig, + BertForSequenceClassification, BertTokenizer, + RobertaConfig, + RobertaForSequenceClassification, + RobertaTokenizer, + XLMConfig, XLMForSequenceClassification, + XLMTokenizer, XLNetConfig, + XLNetForSequenceClassification, + XLNetTokenizer, + DistilBertConfig, + DistilBertForSequenceClassification, + DistilBertTokenizer, DataProcessor, InputExample, AutoConfig) + +from transformers import AdamW, WarmupLinearSchedule + +from transformers import glue_compute_metrics as compute_metrics +from transformers import glue_output_modes as output_modes +from transformers import glue_processors as processors +from transformers import glue_convert_examples_to_features as convert_examples_to_features +from transformers import AutoTokenizer, AutoModelForSequenceClassification, glue_convert_examples_to_features +from transformers.data.processors.glue import glue_processors + + +logger = logging.getLogger(__name__) + + +def train(args, train_dataset, valid_dataset, model, tokenizer): + """ Train the model """ + if args.local_rank in [-1, 0]: + tb_writer = 
args.get_summary_writer() + + train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) + + if args.max_steps > 0: + t_total = args.max_steps + args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 + else: + t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs + + # Prepare optimizer and schedule (linear warmup and decay) + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, + {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) + scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + if args.fp16: + try: + from apex import amp + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) + + # multi-gpu training (should be after apex fp16 initialization) + if args.n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Distributed training (should be after apex fp16 initialization) + if args.local_rank != -1: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True) + + # Train! 
+ logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_dataset)) + logger.info(" Num Epochs = %d", args.num_train_epochs) + logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) + logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) + logger.info(" Total optimization steps = %d", t_total) + + global_step = 0 + tr_loss, logging_loss = 0.0, 0.0 + model.zero_grad() + train_iterator = range(int(args.num_train_epochs)) + set_seed(args.seed, "Training", all_gpus=(args.n_gpu > 1)) # Added here for reproductibility (even between python 2 and 3) + mb = master_bar(train_iterator) + mb.first_bar.comment = f'Epochs' + results={} + for epoch in mb: + epoch_iterator = progress_bar(train_dataloader, display=args.local_rank not in [-1, 0], parent=mb) + + for step, batch in enumerate(epoch_iterator): + model.train() + batch = tuple(t.to(args.device) for t in batch) + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + 'labels': batch[3]} + if args.model_type != 'distilbert': + inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids + outputs = model(**inputs) + loss = outputs[0] # model outputs are always tuple in transformers (see doc) + + if args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + if args.fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + tr_loss += loss.item() + if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu: + if args.fp16: + 
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) + else: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) + + optimizer.step() + scheduler.step() # Update learning rate schedule + model.zero_grad() + global_step += 1 + + if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: + # Log metrics + mb.child.comment = f"loss: {loss}" + tb_writer.add_scalar('train/lr', scheduler.get_lr()[0], global_step) + tb_writer.add_scalar('train/loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + logging_loss = tr_loss + + if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: + # Save model checkpoint + output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(output_dir) + torch.save(args, os.path.join(output_dir, 'training_args.bin')) + logger.info("Saving model checkpoint to %s", output_dir) + #mb.first_bar.comment = f'first bar stat' + #mb.write(f'Finished loop {i}.') + if args.tpu: + args.xla_model.optimizer_step(optimizer, barrier=True) + model.zero_grad() + global_step += 1 + + if args.max_steps > 0 and global_step > args.max_steps: + epoch_iterator.close() + break + if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + results = evaluate(args, model, valid_dataset) + for key, value in results.items(): + tb_writer.add_scalar('eval/{}'.format(key), value, global_step) + mb.first_bar.comment = str(results['acc']) + mb.write(f"Epoch: {epoch} {loss} Accuracy: {results.get('acc', 0)}") + + if args.max_steps > 0 and global_step > args.max_steps: + train_iterator.close() + break + hparams_dict = {k: v for k, v in 
dataclasses.asdict(args).items() if isinstance(v, (int, float, str, bool,))} + tb_writer.add_hparams(hparam_dict=hparams_dict, metric_dict=results) + + if args.local_rank in [-1, 0]: + tb_writer.close() + + return global_step, tr_loss / global_step + + +def evaluate(args, model, eval_dataset, prefix="", eval_output_dir="/tmp/out"): + # Loop to handle MNLI double evaluation (matched, mis-matched) + results = {} + eval_task = args.task_name + if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: + os.makedirs(eval_output_dir) + + args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) + # Note that DistributedSampler samples randomly + eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) + + # Eval! + logger.info("***** Running evaluation {} *****".format(prefix)) + logger.info(" Num examples = %d", len(eval_dataset)) + logger.info(" Batch size = %d", args.eval_batch_size) + eval_loss = 0.0 + nb_eval_steps = 0 + preds = None + out_label_ids = None + mb = progress_bar(eval_dataloader) + for batch in mb: + model.eval() + batch = tuple(t.to(args.device) for t in batch) + + with torch.no_grad(): + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + 'labels': batch[3]} + if args.model_type != 'distilbert': + inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids + outputs = model(**inputs) + tmp_eval_loss, logits = outputs[:2] + + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + if preds is None: + preds = logits.detach().cpu().numpy() + out_label_ids = inputs['labels'].detach().cpu().numpy() + else: + preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) + + eval_loss = 
def strip_tensors(r):
    """Convert a dict of 0-d tensors into a dict of plain Python values.

    Values are unwrapped via ``.numpy()``: byte strings (as produced by TF
    string tensors) are decoded as UTF-8, everything else is reduced to a
    Python scalar with ``.item()``.
    """
    def to_python(tensor):
        raw = tensor.numpy()
        return raw.decode("utf-8") if isinstance(raw, bytes) else raw.item()

    return {key: to_python(value) for key, value in r.items()}
def convert_df_to_examples(df, text_a='sentence1', text_b='sentence2', label='label'):
    """Wrap each dataframe row into a transformers ``InputExample``.

    The dataframe index is used as the example guid; the label is stringified.
    """
    examples = []
    for idx, row in df.iterrows():
        examples.append(InputExample(idx, row[text_a], row[text_b], str(row[label])))
    return examples


def convert_df_to_dataset(tokenizer, df, max_length=128, task='mrpc', text_a='sentence1', text_b='sentence2', label='label', return_labels=False):
    """Tokenize a sentence-pair dataframe into a ``TensorDataset``.

    Labels are mapped through the sorted list of distinct (stringified) label
    values.  When ``return_labels`` is true, returns ``(dataset, label_list)``.
    """
    label_list = list(sorted(map(str, df[label].unique())))
    examples = convert_df_to_examples(df, text_a, text_b, label)
    features = glue_convert_examples_to_features(examples,
                                                 tokenizer,
                                                 max_length=max_length,
                                                 label_list=label_list,
                                                 output_mode='classification',
                                                 task=None)

    # Stack one feature attribute across all examples into a LongTensor.
    def stack(attr):
        return torch.tensor([getattr(f, attr) for f in features], dtype=torch.long)

    dataset = TensorDataset(stack('input_ids'), stack('attention_mask'),
                            stack('token_type_ids'), stack('label'))
    if return_labels:
        return dataset, label_list
    return dataset


@dataclass
class TransfoLearner():
    """Bundle of a transformers model, its tokenizer and the databunch."""
    model: nn.Module
    tokenizer: Any
    data: Any
@dataclass
class TransfoDatabunch():
    """Train/valid/test datasets plus the number of target classes."""
    num_labels: int  # number of distinct labels in train_ds
    train_ds: Any
    valid_ds: Any
    test_ds: Any
Sequences longer than this will be truncated sequences shorter will be padded.") + max_seq_length: int = 128 + #@help("Whether to run training.") + do_train: bool = False + #@help("Whether to run eval on the dev set.") + do_eval: bool = False + #@help("Rul evaluation during training at each logging step.") + evaluate_during_training: bool = False + #@help("Batch size per GPU/CPU for training.") + per_gpu_train_batch_size: int = 8 + #@help("Batch size per GPU/CPU for evaluation.") + per_gpu_eval_batch_size: int = 8 + #@help("Number of updates steps to accumulate before performing a backward/update pass.") + gradient_accumulation_steps: int = 1 + #@help("The initial learning rate for Adam.") + learning_rate: float = 5e-5 + #@help("Weight deay if we apply some.") + weight_decay: float = 0.0 + #@help("Epsilon for Adam optimizer.") + adam_epsilon: float = 1e-8 + #@help("Max gradient norm.") + max_grad_norm: float = 1.0 + #@help("Total number of training epochs to perform.") + num_train_epochs: float = 3.0 + #@help("If > 0: set total number of training steps to perform. 
Override num_train_epochs.") + max_steps: int = -1 + #@help("Linear warmup over warmup_steps.") + warmup_steps: int = 0 + #@help("Log every X updates steps.") + logging_steps: int = 10 + #@help("Save checkpoint every X updates steps.") + save_steps: int = 50 + #@help("Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") + eval_all_checkpoints: bool = False + #@help("Avoid using CUDA when available") + no_cuda: bool = False + #@help("Overwrite the cached training and evaluation sets") + overwrite_cache: bool = False + #@help("random seed for initialization") + seed: int = 42 + #@help("Whether to run on the TPU defined in the environment variables") + tpu: bool = False + #@help("TPU IP address if none are set in the environment variables") + tpu_ip_address: str = '' + #@help("TPU name if none are set in the environment variables") + tpu_name: str = '' + #@help("XRT TPU config if none are set in the environment variables") + xrt_tpu_config: str = '' + + #@help("Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") + fp16: bool = False + #@help("For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2' and 'O3']. See details at https://nvidia.github.io/apex/amp.html") + fp16_opt_level: str = 'O1' + #@help("For distributed training: local_rank") + local_rank: int = -1 + #@help("For distant debugging.") + server_ip: str = '' + #@help("For distant debugging.") + server_port: str = '' + + seed: int = 42 + # Unused + + #@help("The input data dir. 
Should contain the .tsv files (or other data files) for the task.") + data_dir: str = "/tmp/data" + + #@help("The output directory where the model predictions and checkpoints will be written.") + output_dir: str = "/tmp/tmp_output_dir" + + #@help("Overwrite the content of the output directory") + overwrite_output_dir: bool = True + + def __post_init__(self): + if os.path.exists(self.output_dir) and os.listdir( + self.output_dir) and self.do_train and not self.overwrite_output_dir: + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + self.output_dir)) + + # Setup distant debugging if needed + if self.server_ip and self.server_port: + # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script + import ptvsd + print("Waiting for debugger attach") + ptvsd.enable_attach(address=(self.server_ip, self.server_port), redirect_output=True) + ptvsd.wait_for_attach() + + # Setup CUDA, GPU & distributed training + if self.local_rank == -1 or self.no_cuda: + device = torch.device("cuda" if torch.cuda.is_available() and not self.no_cuda else "cpu") + self.n_gpu = torch.cuda.device_count() + else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.cuda.set_device(self.local_rank) + device = torch.device("cuda", self.local_rank) + torch.distributed.init_process_group(backend='nccl') + self.n_gpu = 1 + self.device = device + self.output_mode = "classification" + + self.train_batch_size = self.per_gpu_train_batch_size * max(1, self.n_gpu) + self.eval_batch_size = self.per_gpu_eval_batch_size * max(1, self.n_gpu) + self._tokenizer = None + self._model = None + self._data_cache = None + self.train_started = None + + @property + def tokenizer(self): + if self._tokenizer is None: + self._tokenizer = AutoTokenizer.from_pretrained(self.pretrained_name) + return self._tokenizer + + @property + def experiment_name(self): + from 
datetime import datetime + import socket + if not self.name: + now = datetime.now() + d = now.strftime("%y%m%d_%H%M%S") + h = "_".join(socket.gethostname().split('-')) + + def short_name(name): + return "".join([p[0] for p in name.split('_')]) + + def short_val(val): + if isinstance(val, bool): + return int(val) + return val + + relevant_params = {k: v for k, v in dataclasses.asdict(self).items() + if not k.startswith('_') and hasattr(TransfoExperiment, k) and getattr(TransfoExperiment, + k) != v} + params = [f"{short_name(k)}_{v}" for k, v in relevant_params.items() if not isinstance(v, bool)] + bool_flags = [f"{short_name(k)}" for k, v in relevant_params.items() if isinstance(v, bool) and v] + params_str = ".".join(params + bool_flags) + + self.name = f"{d}.{h}.{params_str}" + return self.name + + def get_summary_writer(self): + return SummaryWriter("runs/"+self.experiment_name) + + def _save_predictions(self, path): + self._dump_pickle([self._preds, self._phases], path) + + def _load_predictions(self, path): + self._preds, self._phases = self._load_pickle(path) + return self._preds + + def load_predictions(self): + path = self._path.parent / f"{self._path.stem}.preds" + return self._load_predictions(path) + + # todo: make it compatible with Experiment + def get_trained_model(self, data: TransfoDatabunch): + self._model = self.train_model(data) + self.has_model = True + return self._model + + def get_glue_databunch(self): + return TransfoDatabunch( + train_ds = prepare_glue_examples(self.tokenizer, self.task_name, 'train'), + valid_ds = prepare_glue_examples(self.tokenizer, self.task_name, 'validation'), + test_ds = None + ) + + def get_databunch(self, train_df, valid_df, test_df): + data_key = (id(train_df), id(valid_df), id(test_df)) + + if self._data_cache is not None and self._data_cache.key != data_key: + self._data_cache = None + + self.tokenizer.max_len = 999999 + if self._data_cache is None: + common_args = dict(text_a=self.text_a, text_b=self.text_b, 
label=self.label) + train_ds, label_list = convert_df_to_dataset(self.tokenizer, train_df, return_labels=True, **common_args) + data = TransfoDatabunch( + num_labels=len(label_list), + train_ds=train_ds, + valid_ds=convert_df_to_dataset(self.tokenizer, valid_df, **common_args), + test_ds=convert_df_to_dataset(self.tokenizer, test_df, **common_args) + ) + data.key = data_key + self._data_cache = data + return self._data_cache + + def new_experiment(self, **kwargs): + #kwargs.setdefault("has_predictions", False) + return super().new_experiment(**kwargs) + + def _add_phase(self, state): + del state['opt'] + del state['train_dl'] + self._phases.append(state) + + def set_seed(self, name): + return set_seed(self.seed, name, all_gpus=(self.n_gpu > 1)) + + # todo: make it compatible with Experiment + def train_model(self, data: TransfoDatabunch): + self.set_seed("class") + self.train_started = time.time() + num_labels = data.num_labels + config = AutoConfig.from_pretrained(self.pretrained_name, num_labels=num_labels) #, finetuning_task=args.task_name + model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_name, config=config) + train(self, data.train_ds, data.valid_ds, model.to(self.device), self._tokenizer) + model.to("cpu") + return model + + def _save_model(self, path): + model_to_save = self._model.module if hasattr(self._model, + 'module') else self._model # Take care of distributed/parallel training + model_to_save.save_pretrained(path) + logger.info("Saving model checkpoint to %s", path) + + # todo: move to Experiment + def save(self, dir_path): + dir_path = Path(dir_path) + dir_path.mkdir(exist_ok=True, parents=True) + filename = self._get_next_exp_name(dir_path) + j = dataclasses.asdict(self) + with open(filename, "wt") as f: + json.dump(j, f) + self._save_model(dir_path / f"{filename.stem}.model") + if hasattr(self, "_preds"): + self._save_predictions(dir_path / f"{filename.stem}.preds") + + return filename.name + + def 
evaluate_transformers(self, data): + return evaluate(self, self._model.to(self.device), data.valid_ds, prefix="") + + def evaluate(self, model, train_df, valid_df, test_df): + data = self.get_databunch(train_df, valid_df, test_df) + valid_probs = get_preds(self, model, data.valid_ds, ordered=True)[0].cpu().numpy() + test_probs = get_preds(self, model, data.test_ds, ordered=True)[0].cpu().numpy() + train_probs = get_preds(self, model, data.train_ds, ordered=True)[0].cpu().numpy() + self._preds = [] + + for prefix, tdf, probs in zip(["train", "valid", "test"], + [train_df, valid_df, test_df], + [train_probs, valid_probs, test_probs]): + preds = np.argmax(probs, axis=1) + + if self.merge_fragments and self.merge_type != "concat": + if self.merge_type == "vote_maj": + vote_results = preds_for_cell_content(tdf, probs) + elif self.merge_type == "vote_avg": + vote_results = preds_for_cell_content_multi(tdf, probs) + elif self.merge_type == "vote_max": + vote_results = preds_for_cell_content_max(tdf, probs) + preds = vote_results["pred"] + true_y = vote_results["true"] + else: + true_y = tdf["label"] + print(true_y.shape) + self._set_results(prefix, preds, true_y) + self._preds.append(probs) + +# # schedule: Tuple = ( +# # (1, 1e-2), # (a,b) -> fit_one_cyclce(a, b) +# # (1, 5e-3/2., 5e-3), # (a, b) -> freeze_to(-2); fit_one_cycle(a, b) +# # (8, 2e-3/100, 2e-3) # (a, b) -> unfreeze(); fit_one_cyccle(a, b) +# # ) +# # # drop_mult: float = 0.75 +# # fp16: bool = False +# pretrained_lm: str = "bert_base_cased" +# # dataset: str = None +# # train_on_easy: bool = True +# # BS: int = 64 +# # +# # has_predictions: bool = False # similar to has_model, but to avoid storing pretrained models we only keep predictions +# # # that can be later used by CRF + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return InputExample(tensor_dict['idx'].numpy(), + 
tensor_dict['premise'].numpy().decode('utf-8'), + tensor_dict['hypothesis'].numpy().decode('utf-8'), + str(tensor_dict['label'].numpy())) + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + diff --git a/axcell/models/structure/type_predictor.py b/axcell/models/structure/type_predictor.py new file mode 100644 index 0000000..abc224d --- /dev/null +++ b/axcell/models/structure/type_predictor.py @@ -0,0 +1,45 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
class TableType(Enum):
    """Coarse table categories produced by the type classifier."""
    SOTA = 0
    ABLATION = 1
    IRRELEVANT = 2


def multipreds2preds(preds, threshold=0.5):
    """Collapse per-class scores into a single class index per row.

    A constant `threshold` column is appended before the argmax, so a row in
    which no score exceeds the threshold maps to the extra class index
    ``preds.shape[1]`` (the "irrelevant" fallback).
    """
    rows = preds.shape[0]
    fallback = preds.new_full((rows, 1), threshold)
    return torch.cat([preds, fallback], dim=-1).argmax(dim=-1)


class TableTypePredictor(ULMFiT_SP):
    """ULMFiT-based classifier assigning a :class:`TableType` to each table caption."""

    step = "type_prediction"

    def __init__(self, path, file, sp_path=None, sp_model="spm.model", sp_vocab="spm.vocab", threshold=0.5):
        super().__init__(path, file, sp_path, sp_model, sp_vocab)
        self.threshold = threshold  # minimum score before falling back to IRRELEVANT

    def predict(self, paper, tables):
        """Return one TableType per table; empty list when there are no tables."""
        pipeline_logger(f"{TableTypePredictor.step}::predict", paper=paper, tables=tables)
        if not tables:
            predictions = []
        else:
            column = "caption"
            # Empty captions are replaced by the generic placeholder "Table".
            captions = [table.caption if table.caption else "Table" for table in tables]
            df = pd.DataFrame({column: captions})
            inputs = df.iloc[:, df_names_to_idx(column, df)]
            test_items = TextList(items=inputs.values[:, 0], path='.', inner_df=df, processor=None)
            self.learner.data.add_test(test_items)
            preds, _ = self.learner.get_preds(DatasetType.Test, ordered=True)
            pipeline_logger(f"{TableTypePredictor.step}::multiclass_predicted", paper=paper, tables=tables,
                            threshold=self.threshold, predictions=preds.cpu().numpy())
            predictions = [TableType(x) for x in multipreds2preds(preds, self.threshold).cpu().numpy()]
        pipeline_logger(f"{TableTypePredictor.step}::predicted", paper=paper, tables=tables, predictions=predictions)
        return predictions
class ULMFiT_SP:
    """Base wrapper for a fastai learner exported with a SentencePiece processor.

    Loads the serialized learner, repairs the SentencePiece paths inside the
    pickled processor (they point at the training machine's filesystem) and
    disables DataLoader multiprocessing.
    """

    def __init__(self, path, file, sp_path=None, sp_model="spm.model", sp_vocab="spm.vocab"):
        path = Path(path)
        sp_path = path if sp_path is None else Path(sp_path)
        self.learner = load_learner(path=path, file=file)
        import os
        import sys
        print(f"[PID {os.getpid()}] Load model {file}", file=sys.stderr)
        sys.stderr.flush()
        self._fix_sp_processor(sp_path, sp_model, sp_vocab)

        # disable multiprocessing to avoid celery deamon issues
        for dataloader in self.learner.data.dls:
            dataloader.num_workers = 0

    def _fix_sp_processor(self, sp_path, sp_model, sp_vocab):
        """Point every serialized SPProcessor at local model/vocab files, single CPU."""
        for proc in self.learner.data.label_list.valid.x.processor:
            if not isinstance(proc, SPProcessor):
                continue
            proc.sp_model = sp_path / sp_model
            proc.sp_vocab = sp_path / sp_vocab
            proc.n_cpus = 1
            # todo: see why it wasn't set on save
            proc.mark_fields = True
All Rights Reserved +from functools import partial + +from .experiment import Experiment, label_map_ext +from axcell.models.structure.nbsvm import * +from sklearn.metrics import confusion_matrix +from .nbsvm import preds_for_cell_content, preds_for_cell_content_max, preds_for_cell_content_multi +import dataclasses +from dataclasses import dataclass +from typing import Tuple +from axcell.helpers.training import set_seed +from fastai.text import * +from fastai.text.learner import _model_meta +import torch +import numpy as np +from pathlib import Path +import json + + +@dataclass +class ULMFiTExperiment(Experiment): + seed: int = 42 + schedule: Tuple = ( + (1, 1e-2), # (a,b) -> fit_one_cyclce(a, b) + (1, 5e-3/2., 5e-3), # (a, b) -> freeze_to(-2); fit_one_cycle(a, b) + (8, 2e-3/100, 2e-3) # (a, b) -> unfreeze(); fit_one_cyccle(a, b) + ) + moms: Tuple = None + drop_mult: float = 0.75 + fp16: bool = False + pretrained_lm: str = "pretrained-on-papers_enc.pkl" + dataset: str = None + train_on_easy: bool = True + BS: int = 64 + valid_split: str = 'speech_rec' + test_split: str = 'img_class' + n_layers: int = 3 + + has_predictions: bool = False # similar to has_model, but to avoid storing pretrained models we only keep predictions + # that can be later used by CRF + + def _save_predictions(self, path): + self._dump_pickle([self._preds, self._phases], path) + + def _load_predictions(self, path): + self._preds, self._phases = self._load_pickle(path) + return self._preds + + def load_predictions(self): + path = self._path.parent / f"{self._path.stem}.preds" + return self._load_predictions(path) + + # todo: make it compatible with Experiment + def get_trained_model(self, data_clas): + self._model = self.train_model(data_clas) + self.has_model = True + return self._model + + def new_experiment(self, **kwargs): + kwargs.setdefault("has_predictions", False) + return super().new_experiment(**kwargs) + + def _schedule(self, clas, i): + s = self.schedule[i] + cyc_len = s[0] + if 
len(s) == 2: + max_lr = s[1] + else: + max_lr = slice(s[1], s[2]) + + if self.moms is None: + clas.fit_one_cycle(cyc_len, max_lr) + else: + clas.fit_one_cycle(cyc_len, max_lr, moms=self.moms) + + def _add_phase(self, state): + del state['opt'] + del state['train_dl'] + self._phases.append(state) + + def _get_train_metrics(self): + return None + + # todo: make it compatible with Experiment + def train_model(self, data_clas): + set_seed(self.seed, "clas") + cfg = _model_meta[AWD_LSTM]['config_clas'].copy() + cfg['n_layers'] = self.n_layers + + metrics = self._get_train_metrics() + clas = text_classifier_learner(data_clas, AWD_LSTM, config=cfg, drop_mult=self.drop_mult, metrics=metrics) + clas.load_encoder(self.pretrained_lm) + if self.fp16: + clas = clas.to_fp16() + + self._phases = [] + + if self.schedule[0][0]: + self._schedule(clas, 0) + self._add_phase(clas.recorder.get_state()) + + if self.schedule[1][0]: + clas.freeze_to(-2) + self._schedule(clas, 1) + self._add_phase(clas.recorder.get_state()) + + if self.schedule[2][0]: + clas.unfreeze() + self._schedule(clas, 2) + self._add_phase(clas.recorder.get_state()) + + return clas + + def _save_model(self, path): + self._model.save(path) + + + # todo: move to Experiment + def save(self, dir_path): + dir_path = Path(dir_path) + dir_path.mkdir(exist_ok=True, parents=True) + filename = self._get_next_exp_name(dir_path) + j = dataclasses.asdict(self) + with open(filename, "wt") as f: + json.dump(j, f) + self.save_model(dir_path / f"{filename.stem}.model") + if hasattr(self, "_preds"): + self._save_predictions(dir_path / f"{filename.stem}.preds") + + return filename.name + + + def evaluate(self, model, train_df, valid_df, test_df): + valid_probs = model.get_preds(ds_type=DatasetType.Valid, ordered=True)[0].cpu().numpy() + test_probs = model.get_preds(ds_type=DatasetType.Test, ordered=True)[0].cpu().numpy() + train_probs = model.get_preds(ds_type=DatasetType.Train, ordered=True)[0].cpu().numpy() + self._preds = [] + + for 
prefix, tdf, probs in zip(["train", "valid", "test"], + [train_df, valid_df, test_df], + [train_probs, valid_probs, test_probs]): + preds = np.argmax(probs, axis=1) + + if self.merge_fragments and self.merge_type != "concat": + if self.merge_type == "vote_maj": + vote_results = preds_for_cell_content(tdf, probs) + elif self.merge_type == "vote_avg": + vote_results = preds_for_cell_content_multi(tdf, probs) + elif self.merge_type == "vote_max": + vote_results = preds_for_cell_content_max(tdf, probs) + preds = vote_results["pred"] + true_y = vote_results["true"] + else: + true_y = tdf["label"] + true_y_ext = tdf["cell_type"].apply(lambda x: label_map_ext.get(x, 0)) + self._set_results(prefix, preds, true_y, true_y_ext) + self._preds.append(probs) + + +def multipreds2preds(preds, threshold=0.5): + bs = preds.shape[0] + return torch.cat([preds, preds.new_full((bs,1), threshold)], dim=-1).argmax(dim=-1) + + +def accuracy_multilabel(input, target, sigmoid=True, irrelevant_as_class=False, threshold=0.5): + if sigmoid: + if irrelevant_as_class: + input = torch.sigmoid(input).argmax(dim=-1) + target = target.argmax(dim=-1) + return (input == target).float().mean() + else: + input = torch.sigmoid(input) + input = multipreds2preds(input, threshold) + targs = multipreds2preds(target, threshold) + return (input == targs).float().mean() + else: + return accuracy(input, target) + + +def accuracy_binary(input, target, sigmoid=True, irrelevant_as_class=False, threshold=0.5): + if sigmoid: + if irrelevant_as_class: + input = torch.sigmoid(input).argmax(dim=-1) + target = target.argmax(dim=-1) + input[input == 1] = 0 + target[target == 1] = 0 + return (input == target).float().mean() + else: + input = torch.sigmoid(input) + input = multipreds2preds(input, threshold) + target = multipreds2preds(target, threshold) + input[input == 1] = 0 + target[target == 1] = 0 + return (input == target).float().mean() + else: + input = input.argmax(dim=-1) + input[input == 1] = 0 + target[target == 
1] = 0 + return (input == target).float().mean() + + +@dataclass +class ULMFiTTableTypeExperiment(ULMFiTExperiment): + sigmoid: bool = True + distinguish_ablation: bool = True + irrelevant_as_class: bool = False + caption: bool = True + first_row: bool = False + first_column: bool = False + referencing_sections: bool = False + dedup_seqs: bool = False + + def _save_model(self, path): + pass + + def _get_train_metrics(self): + if self.distinguish_ablation: + return [ + partial(accuracy_multilabel, sigmoid=self.sigmoid, irrelevant_as_class=self.irrelevant_as_class), + partial(accuracy_binary, sigmoid=self.sigmoid, irrelevant_as_class=self.irrelevant_as_class) + ] + else: + return [accuracy] + + def _transform_df(self, df): + df = df.copy(True) + if self.distinguish_ablation: + df["label"] = 2 + df.loc[df.ablation, "label"] = 1 + df.loc[df.sota, "label"] = 0 + else: + df["label"] = 1 + df.loc[df.sota, "label"] = 0 + df.loc[df.ablation, "label"] = 0 + + if self.sigmoid: + if self.irrelevant_as_class: + df["irrelevant"] = ~(df["sota"] | df["ablation"]) + if not self.distinguish_ablation: + df["sota"] = df["sota"] | df["ablation"] + df = df.drop(columns=["ablation"]) + else: + df["class"] = df["label"] + + drop_columns = [] + if not self.caption: + drop_columns.append("caption") + if not self.first_column: + drop_columns.append("col0") + if not self.first_row: + drop_columns.append("row0") + if not self.referencing_sections: + drop_columns.append("sections") + df = df.drop(columns=drop_columns) + return df + + def evaluate(self, model, train_df, valid_df, test_df): + valid_probs = model.get_preds(ds_type=DatasetType.Valid, ordered=True)[0].cpu().numpy() + test_probs = model.get_preds(ds_type=DatasetType.Test, ordered=True)[0].cpu().numpy() + train_probs = model.get_preds(ds_type=DatasetType.Train, ordered=True)[0].cpu().numpy() + self._preds = [] + + def multipreds2preds(preds, threshold=0.5): + bs = preds.shape[0] + return np.concatenate([probs, np.ones((bs, 1)) * 
threshold], axis=-1).argmax(-1) + + for prefix, tdf, probs in zip(["train", "valid", "test"], + [train_df, valid_df, test_df], + [train_probs, valid_probs, test_probs]): + + if self.sigmoid and not self.irrelevant_as_class: + preds = multipreds2preds(probs) + else: + preds = np.argmax(probs, axis=1) + + true_y = tdf["label"] + self._set_results(prefix, preds, true_y) + self._preds.append(probs) + + def _set_results(self, prefix, preds, true_y, true_y_ext=None): + def metrics(preds, true_y): + y = true_y + p = preds + + if self.distinguish_ablation: + g = {0: 0, 1: 0, 2: 1}.get + bin_y = np.array([g(x) for x in y]) + bin_p = np.array([g(x) for x in p]) + irr = 2 + else: + bin_y = y + bin_p = p + irr = 1 + + acc = (p == y).mean() + tp = ((y != irr) & (p == y)).sum() + fp = ((p != irr) & (p != y)).sum() + fn = ((y != irr) & (p == irr)).sum() + + bin_acc = (bin_p == bin_y).mean() + bin_tp = ((bin_y != 1) & (bin_p == bin_y)).sum() + bin_fp = ((bin_p != 1) & (bin_p != bin_y)).sum() + bin_fn = ((bin_y != 1) & (bin_p == 1)).sum() + + prec = tp / (fp + tp) + reca = tp / (fn + tp) + bin_prec = bin_tp / (bin_fp + bin_tp) + bin_reca = bin_tp / (bin_fn + bin_tp) + return { + "precision": prec, + "accuracy": acc, + "recall": reca, + "TP": tp, + "FP": fp, + "bin_precision": bin_prec, + "bin_accuracy": bin_acc, + "bin_recall": bin_reca, + "bin_TP": bin_tp, + "bin_FP": bin_fp, + } + + m = metrics(preds, true_y) + r = {} + r[f"{prefix}_accuracy"] = m["accuracy"] + r[f"{prefix}_precision"] = m["precision"] + r[f"{prefix}_recall"] = m["recall"] + r[f"{prefix}_bin_accuracy"] = m["bin_accuracy"] + r[f"{prefix}_bin_precision"] = m["bin_precision"] + r[f"{prefix}_bin_recall"] = m["bin_recall"] + r[f"{prefix}_cm"] = confusion_matrix(true_y, preds).tolist() + self.update_results(**r) + + def get_cm_labels(self, cm): + if len(cm) == 3: + return ["SOTA", "ABLATION", "IRRELEVANT"] + else: + return ["SOTA", "IRRELEVANT"] diff --git a/axcell/pipeline_logger.py b/axcell/pipeline_logger.py new 
file mode 100644 index 0000000..1577491 --- /dev/null +++ b/axcell/pipeline_logger.py @@ -0,0 +1,32 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import re + + +class PipelineLogger: + def __init__(self): + self.observers = [] + + def reset(self): + self.observers = [] + + def register(self, pattern, observer): + if isinstance(pattern, str): + pattern = re.compile(pattern) + self.observers.append((pattern, observer)) + + def unregister(self, pattern, observer): + if pattern is None: + self.observers = [(p, o) for p, o in self.observers if o != observer] + else: + if isinstance(pattern, str): + pattern = re.compile(pattern) + self.observers = [(p, o) for p, o in self.observers if o != observer or p.pattern != pattern.pattern] + + def __call__(self, step, **args): + for pattern, observer in self.observers: + if pattern.match(step): + observer(step, **args) + + +pipeline_logger = PipelineLogger() diff --git a/axcell/scripts/guess_main.py b/axcell/scripts/guess_main.py new file mode 100644 index 0000000..afc9638 --- /dev/null +++ b/axcell/scripts/guess_main.py @@ -0,0 +1,58 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +from pathlib import Path +import re +import sys +import codecs + +doccls = re.compile(r"\s*\\documentclass") +docbeg = re.compile(r"\s*\\begin\s*\{\s*document\s*\}") +title = re.compile(r"\s*\\(icml)?title\s*\{(?P[^%}]*)") + +aux = re.compile(r"(rebuttal\s+|instructions\s+(for\s+\\confname|.*proceedings)|(supplementary|supplemental)\s+materials?|appendix|author\s+guidelines|ieeetran\.cls|formatting\s+instructions)") + +def aux_title(t): + t = t.strip().lower() + return bool(aux.search(t)) + + +def calc_priority(path): + priority = 0 + if path.name.lower() == "ms.tex": + return 30 + with codecs.open(path, 'r', encoding='utf8', errors='ignore') as f: + for line in f: + if doccls.match(line): + priority += 10 + break + for line in f: + m = title.match(line) + if m: + priority += 5 + t = m["title"] + if aux_title(t): + priority = 5 + break + return priority + + +def guess_main(path): + path = Path(path) + files = sorted(path.glob("*.tex"), key=lambda p: p.stem.lower()) + if len(files) > 1: + with_priority = [(f, calc_priority(f)) for f in files] + with_priority = sorted(with_priority, key=lambda fp: fp[1], reverse=True) + files = [fp[0] for fp in with_priority] + + return files[0] if len(files) else None + +if __name__ == '__main__': + if len(sys.argv) != 2: + print(f"Usage:\n\t{sys.argv[0]} DIR", file=sys.stderr) + exit(1) + main = guess_main(sys.argv[1]) + if not main: + print("Unable to find any suitable tex file", file=sys.stderr) + exit(1) + else: + print(main) diff --git a/axcell/scripts/latex2html.sh b/axcell/scripts/latex2html.sh new file mode 100755 index 0000000..097bdbe --- /dev/null +++ b/axcell/scripts/latex2html.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +OUTNAME="$1" +echo $OUTNAME +RO_SOURCE_DIR="${2:-/files/ro-source}" +SOURCE_DIR="/files/source" +OUTPUT_DIR="${3:-/files/htmls}" + +mkdir -p /files +cp -r "$RO_SOURCE_DIR" "$SOURCE_DIR" + +# turn tikzpciture instances into comments +find "$SOURCE_DIR" -iname '*.tex' -print0 | xargs -0 sed -i \ + -e 's/\\begin{document}/\\usepackage{verbatim}\0/g' \ + -e 's/\\begin\(\[[^]]*\]\)\?{tikzpicture}/\\begin{comment}/g' \ + -e 's/\\end{tikzpicture}/\\end{comment}/g' + +# temporary fixes +# https://github.com/brucemiller/LaTeXML/pull/1171 +# https://github.com/brucemiller/LaTeXML/pull/1173 +# https://github.com/brucemiller/LaTeXML/pull/1177 +for patch in /files/patches/* +do + patch -i $patch -p 3 -d /usr/local/share/perl/5.28.1/LaTeXML +done + +MAINTEX=$(python3 /files/guess_main.py "$SOURCE_DIR") +[ ! -f "$MAINTEX" ] && exit 1 + +timeout -s KILL 300 engrafo "$MAINTEX" /files/output + +[ ! -f /files/output/index.html ] && exit 117 +cp /files/output/index.html "$OUTPUT_DIR/$OUTNAME" diff --git a/axcell/scripts/patches/1171.patch b/axcell/scripts/patches/1171.patch new file mode 100644 index 0000000..7c0d4c7 --- /dev/null +++ b/axcell/scripts/patches/1171.patch @@ -0,0 +1,188 @@ +From d715def1f4ddd18336e5e49b54baf0efd9acfb94 Mon Sep 17 00:00:00 2001 +From: Deyan Ginev <d.ginev@jacobs-university.de> +Date: Sun, 21 Jul 2019 16:14:17 -0400 +Subject: [PATCH] neurips binding and reliably preload main article + dependencies for bibliography post-processing + +--- + MANIFEST | 1 + + lib/LaTeXML/Package/neurips.sty.ltxml | 34 +++++++++++++++++++++++++++ + lib/LaTeXML/Post.pm | 22 +++++++++-------- + lib/LaTeXML/Post/MakeBibliography.pm | 11 +++++---- + 4 files changed, 53 insertions(+), 15 deletions(-) + create mode 100644 lib/LaTeXML/Package/neurips.sty.ltxml + +diff --git a/MANIFEST b/MANIFEST +index f944d07aa..69b5bdd51 100644 +--- a/MANIFEST ++++ b/MANIFEST +@@ -548,6 +548,7 @@ lib/LaTeXML/Package/multido.sty.ltxml + lib/LaTeXML/Package/multirow.sty.ltxml + 
lib/LaTeXML/Package/nameref.sty.ltxml + lib/LaTeXML/Package/natbib.sty.ltxml ++lib/LaTeXML/Package/neurips.sty.ltxml + lib/LaTeXML/Package/newcent.sty.ltxml + lib/LaTeXML/Package/newfloat.sty.ltxml + lib/LaTeXML/Package/newlfont.sty.ltxml +diff --git a/lib/LaTeXML/Package/neurips.sty.ltxml b/lib/LaTeXML/Package/neurips.sty.ltxml +new file mode 100644 +index 000000000..b642e4dd3 +--- /dev/null ++++ b/lib/LaTeXML/Package/neurips.sty.ltxml +@@ -0,0 +1,34 @@ ++# -*- mode: Perl -*- ++# /=====================================================================\ # ++# | neurips_2019.sty | # ++# | Implementation for LaTeXML | # ++# |=====================================================================| # ++# | Part of LaTeXML: | # ++# | Public domain software, produced as part of work done by the | # ++# | United States Government & not subject to copyright in the US. | # ++# |---------------------------------------------------------------------| # ++# | Bruce Miller <bruce.miller@nist.gov> #_# | # ++# | http://dlmf.nist.gov/LaTeXML/ (o o) | # ++# \=========================================================ooo==U==ooo=/ # ++package LaTeXML::Package::Pool; ++use strict; ++use warnings; ++use LaTeXML::Package; ++ ++#====================================================================== ++RequirePackage('natbib'); ++RequirePackage('geometry'); ++RequirePackage('lineno'); ++# /--------------------------------------------------------------------\ ++# | Drafted by texscan --stub neurips_2019.sty | ++# \--------------------------------------------------------------------/ ++DefMacro('\AND', Tokens()); ++DefMacro('\And', Tokens()); ++DefMacro('\bottomfraction', Tokens()); ++DefMacro('\patchAmsMathEnvironmentForLineno', Tokens()); ++DefMacro('\patchBothAmsMathEnvironmentsForLineno', Tokens()); ++DefMacroI('\subsubsubsection', undef, '\@startsection{subsubsubsection}{4}{}{}{}{}', locked => 1); ++DefMacro('\textfraction', Tokens()); ++DefMacro('\topfraction', Tokens()); 
++#====================================================================== ++1; +diff --git a/lib/LaTeXML/Post.pm b/lib/LaTeXML/Post.pm +index a1dc74c1c..ec12bf2a8 100644 +--- a/lib/LaTeXML/Post.pm ++++ b/lib/LaTeXML/Post.pm +@@ -56,7 +56,7 @@ sub ProcessChain_internal { + foreach my $doc (@docs) { + local $LaTeXML::Post::DOCUMENT = $doc; + if (my @nodes = grep { $_ } $processor->toProcess($doc)) { # If there are nodes to process +- my $n = scalar(@nodes); ++ my $n = scalar(@nodes); + my $msg = join(' ', $processor->getName || '', + $doc->siteRelativeDestination || '', + ($n > 1 ? "$n to process" : 'processing')); +@@ -198,7 +198,7 @@ sub generateResourcePathname { + my $subdir = $$self{resource_directory} || ''; + my $prefix = $$self{resource_prefix} || "x"; + my $counter = join('_', "_max", $subdir, $prefix, "counter_"); +- my $n = $doc->cacheLookup($counter) || 0; ++ my $n = $doc->cacheLookup($counter) || 0; + my $name = $prefix . ++$n; + $doc->cacheStore($counter, $n); + return pathname_make(dir => $subdir, name => $name, type => $type); } +@@ -218,11 +218,12 @@ sub find_documentclass_and_packages { + $classoptions = $$entry{options} || 'onecolumn'; + $oldstyle = $$entry{oldstyle}; } + elsif ($$entry{package}) { +- push(@packages, [$$entry{package}, $$entry{options} || '']); } +- } ++ push(@packages, [$$entry{package} . ".sty", $$entry{options} || '']); } } + if (!$class) { + Warn('expected', 'class', undef, "No document class found; using article"); + $class = 'article'; } ++ if ($class !~ /\.cls$/) { ++ $class = $class . ".cls"; } + return ([$class, $classoptions, $oldstyle], @packages); } + + #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +@@ -336,7 +337,7 @@ sub processNode { + # XMath will be removed (LATER!), but mark its ids as reusable. 
+ $doc->preremoveNodes($xmath); + if ($$self{parallel}) { +- my $primary = $self->convertNode($doc, $xmath); ++ my $primary = $self->convertNode($doc, $xmath); + my @secondaries = (); + foreach my $proc (@{ $$self{secondary_processors} }) { + local $LaTeXML::Post::MATHPROCESSOR = $proc; +@@ -425,7 +426,7 @@ sub convertXMTextContent { + my $tag = $doc->getQName($node); + if ($tag eq 'ltx:XMath') { + my $conversion = $self->convertNode($doc, $node); +- my $xml = $$conversion{xml}; ++ my $xml = $$conversion{xml}; + # And if no xml ???? + push(@result, $self->outerWrapper($doc, $node, $xml)); } + else { +@@ -516,7 +517,7 @@ sub associateNode { + $document->generateNodeID($sourcenode, '', 1); } # but the ID is reusable + if (my $sourceid = $sourcenode->getAttribute('fragid')) { # If source has ID + my $nodeid = $currentnode->getAttribute('fragid') || $sourceid; +- my $id = $document->uniquifyID($nodeid, $self->IDSuffix); ++ my $id = $document->uniquifyID($nodeid, $self->IDSuffix); + if ($isarray) { + $$node[1]{'xml:id'} = $id; } + else { +@@ -775,7 +776,7 @@ sub setDocument_internal { + my ($tag, $attributes, @children) = @$root; + my ($prefix, $localname) = $tag =~ /^(.*):(.*)$/; + my $nsuri = $$self{namespaces}{$prefix}; +- my $node = $$self{document}->createElementNS($nsuri, $localname); ++ my $node = $$self{document}->createElementNS($nsuri, $localname); + $$self{document}->setDocumentElement($node); + map { $$attributes{$_} && $node->setAttribute($_ => $$attributes{$_}) } keys %$attributes + if $attributes; +@@ -927,7 +928,7 @@ sub idcheck { + my %missing = (); + foreach my $node ($self->findnodes("//*[\@xml:id]")) { + my $id = $node->getAttribute('xml:id'); +- $dups{$id} = 1 if $idcache{$id}; ++ $dups{$id} = 1 if $idcache{$id}; + $idcache{$id} = 1; } + foreach my $id (keys %{ $$self{idcache} }) { + $missing{$id} = 1 unless $idcache{$id}; } +@@ -1181,13 +1182,14 @@ sub prependNodes { + sub cloneNode { + my ($self, $node, $idsuffix, %options) = @_; + return $node 
unless ref $node; ++ return $node if ref $node eq 'ARRAY'; # Should we deep clone if we get an array? Just return for now + my $copy = $node->cloneNode(1); + my $nocache = $options{nocache}; + #### $idsuffix = '' unless defined $idsuffix; + # Find all id's defined in the copy and change the id. + my %idmap = (); + foreach my $n ($self->findnodes('descendant-or-self::*[@xml:id]', $copy)) { +- my $id = $n->getAttribute('xml:id'); ++ my $id = $n->getAttribute('xml:id'); + my $newid = $self->uniquifyID($id, $idsuffix); + $idmap{$id} = $newid; + $self->recordID($newid => $n) unless $nocache; +diff --git a/lib/LaTeXML/Post/MakeBibliography.pm b/lib/LaTeXML/Post/MakeBibliography.pm +index 37c70b92e..6bf6d96fc 100644 +--- a/lib/LaTeXML/Post/MakeBibliography.pm ++++ b/lib/LaTeXML/Post/MakeBibliography.pm +@@ -162,13 +162,14 @@ sub convertBibliography { + my ($self, $doc, $bib) = @_; + require LaTeXML; + require LaTeXML::Common::Config; +- my @packages = +- my @preload = (); +- # Might want/need to preload more (all?) packages, but at least do inputenc! ++ my @preload = (); # custom macros often used in e.g. 
howpublished field ++ # need to preload all packages used by the main article + foreach my $po ($self->find_documentclass_and_packages($doc)) { + my ($pkg, $options) = @$po; +- if ($pkg eq 'inputenc') { +- push(@preload, "[$options]$pkg"); } } ++ if ($options) { ++ push(@preload, "[$options]$pkg"); } ++ else { ++ push(@preload, "$pkg"); } } + NoteProgress(" [Converting bibliography $bib ..."); + my $bib_config = LaTeXML::Common::Config->new( + cache_key => 'BibTeX', diff --git a/axcell/scripts/patches/1173.patch b/axcell/scripts/patches/1173.patch new file mode 100644 index 0000000..d6b07c1 --- /dev/null +++ b/axcell/scripts/patches/1173.patch @@ -0,0 +1,76 @@ +From 6eeebce933599340b44a0d61d69ad409f6944d44 Mon Sep 17 00:00:00 2001 +From: Deyan Ginev <d.ginev@jacobs-university.de> +Date: Wed, 24 Jul 2019 12:49:40 -0400 +Subject: [PATCH] avoid Mouth time-travel bug when preparing url from an XUntil + context + +--- + lib/LaTeXML/Package/hyperref.sty.ltxml | 9 ++++----- + lib/LaTeXML/Package/url.sty.ltxml | 5 ++--- + 2 files changed, 6 insertions(+), 8 deletions(-) + +diff --git a/lib/LaTeXML/Package/hyperref.sty.ltxml b/lib/LaTeXML/Package/hyperref.sty.ltxml +index 48a9af302..d07afc015 100644 +--- a/lib/LaTeXML/Package/hyperref.sty.ltxml ++++ b/lib/LaTeXML/Package/hyperref.sty.ltxml +@@ -112,7 +112,7 @@ DefConstructor('\@add@PDF@RDFa@triples', sub { + if (my $entry = ($pdfkey_property{$key})) { + my ($property, $object, $datatype) = @$entry; + my $value = LookupMapping('Hyperref_options', $key); +- my $node = $document->openElementAt($root, 'ltx:rdf', ++ my $node = $document->openElementAt($root, 'ltx:rdf', + property => $property, $object => $value, + ($datatype ? 
(datatype => $datatype) : ())); + # Must do directly; $document->setAttribute omits empty attributes +@@ -136,17 +136,16 @@ DefMacro('\href Verbatim {}', '\@@Url\href{}{}{#1}{#2}'); + # Redefine \@url to sanitize the argument less + DefMacro('\@Url Token', sub { + my ($gullet, $cmd) = @_; +- my $mouth = $gullet->getMouth; + my ($open, $close, $url); + $open = $gullet->readToken; + StartSemiverbatim('%'); + Let('~', T_OTHER('~')); # Needs special protection? + if ($open->equals(T_BEGIN)) { + $open = T_OTHER('{'); $close = T_OTHER('}'); +- $url = $gullet->readBalanced(1); } # Expand as we go! ++ $url = $gullet->readBalanced(1); } # Expand as we go! + else { + $close = $open = T_OTHER($open->getString); +- $url = $mouth->readTokens($close); } ++ $url = $gullet->readUntil($close); } + EndSemiverbatim(); + my @toks = grep { $_->getCatcode != CC_SPACE; } $url->unlist; + # Identical with url's \@Url except, let CS's through! +@@ -212,7 +211,7 @@ DefConstructor('\autoref Semiverbatim', + + DefMacro('\lx@autorefnum@@{}', sub { + my ($gullet, $type) = @_; +- my $type_s = ToString($type); ++ my $type_s = ToString($type); + my $counter = LookupMapping('counter_for_type', $type_s) || $type_s; + return Tokens( + (LookupDefinition(T_CS('\\' . $type_s . 
'autorefname')) +diff --git a/lib/LaTeXML/Package/url.sty.ltxml b/lib/LaTeXML/Package/url.sty.ltxml +index da6223359..c208a8dc5 100644 +--- a/lib/LaTeXML/Package/url.sty.ltxml ++++ b/lib/LaTeXML/Package/url.sty.ltxml +@@ -46,16 +46,15 @@ DefMacro('\DeclareUrlCommand{}{}', '\def#1{\begingroup #2\@Url#1}'); + # In any case, we read the verbatim arg, and build a Whatsit for @@Url + DefMacro('\@Url Token', sub { + my ($gullet, $cmd) = @_; +- my $mouth = $gullet->getMouth; + my ($open, $close, $url); + StartSemiverbatim('%'); + $open = $gullet->readToken; + if ($open->equals(T_BEGIN)) { + $open = T_OTHER('{'); $close = T_OTHER('}'); +- $url = $gullet->readBalanced; } ++ $url = $gullet->readBalanced; } + else { + $close = $open = T_OTHER($open->getString); +- $url = $mouth->readTokens($close); } ++ $url = $gullet->readUntil($close); } + EndSemiverbatim(); + + my @toks = grep { $_->getCatcode != CC_SPACE; } $url->unlist; diff --git a/axcell/scripts/patches/1177.patch b/axcell/scripts/patches/1177.patch new file mode 100644 index 0000000..a4ac126 --- /dev/null +++ b/axcell/scripts/patches/1177.patch @@ -0,0 +1,26 @@ +From 4b3a98e90e790eccb18eb16636c783ec7dfceb3b Mon Sep 17 00:00:00 2001 +From: Marcin Kardas <marcin.kardas@gmail.com> +Date: Sun, 28 Jul 2019 16:51:49 +0200 +Subject: [PATCH] Use algorithmic instead of algorithmicx in icml*.sty +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +All icml style files I've tested (2013-2019) require `algorithmic` package, but `algorithmicx` doesn’t define any `algorithmic` commands. [Here's](https://arxiv.org/pdf/1402.5766v1.pdf) an example of paper (using icml2014.sty) on which LaTeXML hangs during processing when `algorithmicx` is used. 
+--- + lib/LaTeXML/Package/icml_support.sty.ltxml | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/LaTeXML/Package/icml_support.sty.ltxml b/lib/LaTeXML/Package/icml_support.sty.ltxml +index 176498a05..3696de4b7 100644 +--- a/lib/LaTeXML/Package/icml_support.sty.ltxml ++++ b/lib/LaTeXML/Package/icml_support.sty.ltxml +@@ -20,7 +20,7 @@ RequirePackage('times'); + RequirePackage('fancyhdr'); + RequirePackage('color'); + RequirePackage('algorithm'); +-RequirePackage('algorithmicx'); ++RequirePackage('algorithmic'); + RequirePackage('natbib'); + # RequirePackage('eso-pic'); + # RequirePackage('forloop'); diff --git a/clean_html.sh b/clean_html.sh deleted file mode 100755 index 6c1df1c..0000000 --- a/clean_html.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash -SOURCE=$(realpath "$1") -[ ! -f "$SOURCE" ] && echo "File $SOURCE not found." >&2 && exit 1 -mkdir -p $(dirname "$2") -OUTPUT=$(realpath "$2") - -docker run --rm -v "$SOURCE":/files/index.html:ro --entrypoint '' zenika/alpine-chrome:73 timeout -t 20 -s KILL chromium-browser --headless --disable-gpu --disable-software-rasterizer --no-sandbox --timeout=30000 --dump-dom /files/index.html > "$OUTPUT" diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..d293d63 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,18 @@ +version: '3' +services: + # search engine + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.7.1 + entrypoint: + - elasticsearch + - -Ehttp.port=9200 + - -Etransport.host=localhost + - -Ehttp.host=0.0.0.0 + - -Ebootstrap.system_call_filter=false + - -Ehttp.cors.enabled=true + - -Ehttp.cors.allow-origin=* + - -Ehttp.cors.allow-headers=* + - -Ehttp.cors.allow-credentials=true + user: elasticsearch + ports: + - '127.0.0.1:9200:9200' diff --git a/docker-latex2html.sh b/docker-latex2html.sh deleted file mode 100755 index 463ef82..0000000 --- a/docker-latex2html.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash 
-SOURCE_DIR=$(realpath "$1") #~/arxiv/unpacked/1701/1701.xyz -[ ! -d "$SOURCE_DIR" ] && echo "Directory $SOURCE_DIR not found." >&2 && exit 1 -mkdir -p $(dirname "$2") -OUTPUT=$(realpath "$2") #~/arxiv/htmls/1701/1701.xyz.html -OUTPUT_DIR=$(dirname "$OUTPUT") #~/arxiv/htmls/1701 -FILENAME=$(basename "$OUTPUT") #1701.xyz.html - -docker run --rm -v $PWD/latex2html.sh:/files/latex2html.sh:ro -v "$SOURCE_DIR":/files/ro-source:ro -v "$OUTPUT_DIR":/files/htmls arxivvanity/engrafo /files/latex2html.sh "$FILENAME" diff --git a/environment.yml b/environment.yml index 0d2b4c6..c439f78 100644 --- a/environment.yml +++ b/environment.yml @@ -1,18 +1,42 @@ -name: xtables +name: axcell channels: - conda-forge - defaults dependencies: +- pip=21.0.1 - fire=0.1.3 - html5lib=1.0.1 - beautifulsoup4=4.7.1 - lxml=4.3.4 -- pandas=0.24.2 +- pandas=0.25.3 - beautifulsoup4=4.7.1 -- numpy=1.15.4 +- numpy=1.18.1 - python=3.7.1 - pyahocorasick=1.4.0 - Unidecode=1.0.23 -- elasticsearch-dsl=7.0.0 +- elasticsearch-dsl=6.3.1 - ipython=7.5.0 - joblib=0.13.2 +- python-magic=0.4.15 +- docker-py=4.1.0 +- python-magic=0.4.15 +- seaborn=0.9.0 +- docker-compose=1.25.5 +- freetype=2.10.1 +- numba=0.48.0 +- spacy=2.2.3 +- pip: + - sentencepiece==0.1.83 + - elasticsearch==6.3.1 + - elasticsearch-dsl==6.3.1 + - scikit-learn==0.21.3 + - regex==2019.11.1 + - scispacy==0.2.4 + - psutil==5.6.7 + - "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz" + - "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz" + - torch==1.3.1 + - torchvision==0.4.2 + - fastai==1.0.55 + - fastprogress==0.1.21 + - . 
diff --git a/extract_texts.py b/extract_texts.py deleted file mode 100755 index e8102dc..0000000 --- a/extract_texts.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import fire -from sota_extractor2.data.elastic import Paper -from pathlib import Path - -def extract_text(source, target): - source = Path(source) - target = Path(target) - target.parent.mkdir(exist_ok=True, parents=True) - - arxiv_id = source.stem - doc = Paper.parse_paper(source) - with open(target, 'wt') as f: - f.write(doc.to_json()) - -if __name__ == "__main__": fire.Fire(extract_text) diff --git a/flatten_evaltab.sh b/flatten_evaltab.sh deleted file mode 100755 index 083a919..0000000 --- a/flatten_evaltab.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -jq -c '.. | select(.datasets?) | .task as $task | .datasets | .[] | .dataset as $dataset | .sota.rows[] | {paper_url, paper_title, model_name} as $paper | .metrics | . as $metrics | keys[] | {dataset: $dataset, metric_name: ., metric_value: $metrics[.], paper_url: $paper.paper_url, paper_title: $paper.paper_title, model_name: $paper.model_name, task: $task }' "$1" | grep arxiv\.org | jq -s '.' diff --git a/get_papers_links.sh b/get_papers_links.sh deleted file mode 100755 index e79a657..0000000 --- a/get_papers_links.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -jq '.. | select(.sota?) 
| .sota.rows[] | .paper_url' "$1" | grep arxiv | sed -e 's#"##g' -e 's#http:#https:#' | sort -u diff --git a/label_tables.py b/label_tables.py deleted file mode 100755 index 3e2a8c1..0000000 --- a/label_tables.py +++ /dev/null @@ -1,308 +0,0 @@ -#!/usr/bin/env python - -import fire -from sota_extractor.taskdb import TaskDB -from pathlib import Path -import json -import re -import pandas as pd -import sys -from decimal import Decimal, ROUND_DOWN, ROUND_HALF_UP, InvalidOperation -from collections import Counter, namedtuple - - -arxiv_url_re = re.compile(r"^https?://(www.)?arxiv.org/(abs|pdf|e-print)/(?P<arxiv_id>\d{4}\.[^./]*)(\.pdf)?$") - -def get_sota_tasks(filename): - db = TaskDB() - db.load_tasks(filename) - return db.tasks_with_sota() - - -def get_metadata(filename): - with open(filename, "r") as f: - j = json.load(f) - metadata = {x["filename"]:x["caption"] for x in j} - return metadata - - -def get_table(filename): - try: - return pd.read_csv(filename, header=None, dtype=str).fillna('') - except pd.errors.EmptyDataError: - return pd.DataFrame() - - -def get_tables(tables_dir): - tables_dir = Path(tables_dir) - all_metadata = {} - all_tables = {} - for metadata_filename in tables_dir.glob("*/metadata.json"): - metadata = get_metadata(metadata_filename) - basedir = metadata_filename.parent - arxiv_id = basedir.name - all_metadata[arxiv_id] = metadata - all_tables[arxiv_id] = {t:get_table(basedir / t) for t in metadata} - return all_metadata, all_tables - - -metric_na = ['-',''] - - -# problematic values of metrics found in evaluation-tables.json -# F0.5, 70.14 (measured by Ge et al., 2018) -# Test Time, 0.33s/img -# Accuracy, 77,62% -# Electronics, 85,06 -# BLEU-1, 54.60/55.55 -# BLEU-4, 26.71/27.78 -# MRPC, 78.6/84.4 -# MRPC, 76.2/83.1 -# STS, 78.9/78.6 -# STS, 75.8/75.5 -# BLEU score,41.0* -# BLEU score,28.5* -# SemEval 2007,**55.6** -# Senseval 2,**69.0** -# Senseval 3,**66.9** -# MAE, 2.42±0.01 - -## multiple times -# Number of params, 0.8B -# Number of 
params, 88M -# Parameters, 580k -# Parameters, 3.1m -# Params, 22M - - - -float_value_re = re.compile(r"([+-]?\s*((\d{1,2}(,\d{3})+|\d+)(\.\d*)?|\.\d+)([eE][+-]?\d+)?)") -letters_re = re.compile("[^\W\d_]", re.UNICODE) - -# float value possibly with std -metric_value_re = re.compile(float_value_re.pattern + r"(\s*±\s*" + float_value_re.pattern + ")?") -whitespace_re = re.compile(r"\s+") - - -def normalize_float_value(s): - match = metric_value_re.search(s) - if match: - return whitespace_re.sub("", match.group(1)).replace(",", ""), match.group(0).strip() - return '-', None - - -def test_near(x, precise): - for rounding in [ROUND_DOWN, ROUND_HALF_UP]: - try: - if x == precise.quantize(x, rounding=rounding): - return True - except InvalidOperation: - pass - return False - - -def fuzzy_match(metric, metric_value, target_value): - metric_value, _ = normalize_float_value(str(metric_value)) - if metric_value in metric_na: - return False - metric_value = Decimal(metric_value) - - for match in metric_value_re.findall(target_value): - value = whitespace_re.sub("", match[0]) - value = Decimal(value) - - if test_near(metric_value, value): - return True - if test_near(metric_value.shift(2), value): - return True - if test_near(metric_value, value.shift(2)): - return True - - return False -# -# if metric_value in metric_na or target_value in metric_na: -# return False -# if metric_value != target_value and metric_value in target_value: -# print(f"|{metric_value}|{target_value}|") -# return metric_value in target_value - - -def match_metric(metric, tables, value): - matching_tables = [] - for table in tables: - for col in tables[table]: - for row in tables[table][col]: - if fuzzy_match(metric, value, row): - matching_tables.append(table) - break - else: - continue - break - - return matching_tables - - -comparators = { - "a=b": test_near, - "100a=b": lambda metric, target: test_near(metric.shift(2), target), - "a=100b": lambda metric, target: test_near(metric, target.shift(2)), 
- "1-a=b": lambda metric, target: test_near(Decimal("1") - metric, target), - "100-a=b": lambda metric, target: test_near(Decimal("100") - metric, target), - "100-100a=b": lambda metric, target: test_near(Decimal("100") - metric.shift(2), target), - "100-a=100b": lambda metric, target: test_near(Decimal("100") - metric, target.shift(2)) -} - - -def empty_celltags_like(table): - return pd.DataFrame().reindex_like(table).fillna('') - - -def mark_with_comparator(task_name, dataset_name, metric_name, arxiv_id, table, values, comp_name): - comparator = comparators[comp_name] - rows, cols = table.shape - hits = 0 - cell_tags = empty_celltags_like(table) - for col in range(cols): - for row in range(rows): - for val, val_str in table.iloc[row, col]: - for record in values: - if comparator(record.normalized, val): - hits += 1 - tags = f"<hit><sota>{record.value}</sota>" +\ - f"<paper>{record.arxiv_id}</paper>" +\ - f"<model>{record.model}</model>" +\ - f"<metric>{metric_name}</metric>" +\ - f"<dataset>{dataset_name}</dataset>" +\ - f"<task>{task_name}</task>" - if arxiv_id == record.arxiv_id: - tags += "<this_paper/>" - tags += f"<comparator>{comp_name}</comparator>" +\ - f"<matched_cell>{val}</matched_cell>" +\ - f"<matched_str>{val_str}</matched_str></hit>" - cell_tags.iloc[row, col] += tags - return cell_tags, hits - - -def mark_with_best_comparator(task_name, dataset_name, metric_name, arxiv_id, table, values): - max_hits = 0 - best_tags = None - - for comp_name in comparators: - cell_tags, hits = mark_with_comparator(task_name, dataset_name, metric_name, arxiv_id, table, values, comp_name) - if max_hits < hits: - max_hits = hits - best_tags = cell_tags - - return best_tags - - -def mark_with_all_comparators(task_name, dataset_name, metric_name, arxiv_id, table, values): - all_tags = empty_celltags_like(table) - for comp_name in comparators: - cell_tags, _ = mark_with_comparator(task_name, dataset_name, metric_name, arxiv_id, table, values, comp_name) - all_tags += 
cell_tags - - return all_tags - -def normalize_string(s): - return s.lower.strip() - - -def match_str(a, b): - return normalize_string(a) == normalize_string(b) - - -def mark_strings(table, tags, values): - cell_tags = empty_celltags_like(table) - beg, end = tags - rows, cols = table.shape - for col in range(cols): - for row in range(rows): - for s in values: - real = table.iloc[row, col] - if match_str(real, s): - cell_tags += f"{beg}{s}{end}" - return cell_tags - - -metatables = {} -def match_many(output_dir, task_name, dataset_name, metric_name, tables, values): - for arxiv_id in tables: - for table in tables[arxiv_id]: - tags = mark_with_all_comparators(task_name, dataset_name, metric_name, arxiv_id, tables[arxiv_id][table], values) - global metatables - key = (arxiv_id, table) - if key in metatables: - metatables[key] += tags - else: - metatables[key] = tags - - -def normalize_metric(value): - value, _ = normalize_float_value(str(value)) - if value in metric_na: - return Decimal("NaN") - return Decimal(value) - - -def normalize_cell(cell): - matches = metric_value_re.findall(cell) - matches = [normalize_float_value(match[0]) for match in matches] - values = [(Decimal(value[0]), value[1]) for value in matches if value not in metric_na] - return values - - -def normalize_table(table): - return table.applymap(normalize_cell) - - -# for each task with sota row -# arxivs <- list of papers related to the task -# for each (dataset_name, metric_name) of the task: -# for each table in arxivs -# for each fuzzy_comparator -# count number of task's sota rows found in the table using comparator -# comparator <- comparator with the largest number of hits -# if hits > hits_threshold: -# mark table with a given dataset_name and metric_name -# mark hit cells with sota-tag, model_name and paper_id -# if table.arxiv_id == paper_id: mark with this-tag -PaperResult = namedtuple("PaperResult", ["arxiv_id", "model", "value", "normalized"]) - - -def label_tables(tasksfile, 
tables_dir): - output_dir = Path(tables_dir) - tasks = get_sota_tasks(tasksfile) - metadata, tables = get_tables(tables_dir) - - arxivs_by_metrics = {} - - tables = {arxiv_id: {tab: normalize_table(tables[arxiv_id][tab]) for tab in tables[arxiv_id]} for arxiv_id in tables} - - for task in tasks: - for dataset in task.datasets: - for row in dataset.sota.rows: - match = arxiv_url_re.match(row.paper_url) - if match is not None: - arxiv_id = match.group("arxiv_id") - for metric in row.metrics: - arxivs_by_metrics.setdefault((task.name, dataset.name, metric), set()).add( - PaperResult(arxiv_id=arxiv_id, model=row.model_name, value=row.metrics[metric], - normalized=normalize_metric(row.metrics[metric]) - ) - ) - - for task, dataset, metric in arxivs_by_metrics: - records = arxivs_by_metrics[(task, dataset, metric)] - tabs = {r.arxiv_id: tables[r.arxiv_id] for r in records if r.arxiv_id in tables} - match_many(output_dir, task, dataset, metric, tabs, records) - - global metatables - - for (arxiv_id, table), best in metatables.items(): - out = output_dir / arxiv_id - out.mkdir(parents=True, exist_ok=True) - best.to_csv(out / table.replace("table", "celltags"), header=None, index=None) - - -if __name__ == "__main__": fire.Fire(label_tables) diff --git a/latex2html.sh b/latex2html.sh deleted file mode 100755 index 01249ac..0000000 --- a/latex2html.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -OUTNAME="$1" -echo $OUTNAME -SOURCE_DIR="/files/ro-source" -OUTPUT_DIR="/files/htmls" - -cd "$SOURCE_DIR" - -if [ -f "$SOURCE_DIR/ms.tex" ] -then - MAINTEX="$SOURCE_DIR/ms.tex" -elif [ -f "$SOURCE_DIR/main.tex" ] -then - MAINTEX="$SOURCE_DIR/main.tex" -elif [ -f "$SOURCE_DIR/00_main.tex" ] -then - MAINTEX="$SOURCE_DIR/00_main.tex" -else - MAINTEX=$(find $SOURCE_DIR -maxdepth 1 -type f -iname "*.tex" -print0 | xargs -0 grep -l documentclass | head -1) -fi -timeout -s KILL 300 engrafo "$MAINTEX" /files/output - -cp /files/output/index.html "$OUTPUT_DIR/$OUTNAME" diff --git 
a/normalize_metrics.py b/normalize_metrics.py deleted file mode 100755 index 058ec34..0000000 --- a/normalize_metrics.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python - -import fire -from label_tables import get_sota_tasks - - -def normalize_metrics(tasksfile): - tasks = get_sota_tasks(tasksfile) - - print("task\tdataset\tmetric") - for task in tasks: - for dataset in task.datasets: - for row in dataset.sota.rows: - for metric in row.metrics: - print(f"{task.name}\t{dataset.name}\t{metric}") - - -if __name__ == "__main__": fire.Fire(normalize_metrics) diff --git a/normalize_references.py b/normalize_references.py deleted file mode 100644 index 506259d..0000000 --- a/normalize_references.py +++ /dev/null @@ -1,84 +0,0 @@ -import fire -from unidecode import unidecode -from pathlib import Path -import string -import ahocorasick -import pickle -from multiprocessing import Pool -from sota_extractor2.data.doc_utils import get_text, read_html - -punctuation_table = str.maketrans('', '', string.punctuation) - -def normalize_title(title): - return unidecode(title.strip().lower().replace(' ', '')).translate(punctuation_table) - -def resolve_references(reference_trie, bibitems): - if len(bibitems) == 0: - return {} - bib_ids = list(bibitems.keys()) - texts = list(bibitems.values()) - found = 0 - resolved = {} - for bib_id, text in zip(bib_ids, texts): - references = [ref for _, ref in reference_trie.iter(normalize_title(text)) if len(normalize_title(ref['title'])) >= 6] - references = sorted(references, key=lambda ref: len(normalize_title(ref['title'])), reverse=True) - for ref in references: - for author in ref['authors']: - if normalize_title(author['name'].split(' ')[-1]) not in normalize_title(text): - break - else: - found += 1 - resolved[bib_id] = ref['id'] - break - print(f"Found {found} ({found / len(bibitems)})") - return resolved - -def bib_elems(html): - return html.select(".ltx_bibliography .ltx_bibitem[id]") - -def update_references(html, mapping): - 
anchors = html.select('[href^="#"]') - for anchor in anchors: - target = anchor['href'][1:] - anchor['href'] = '#' + mapping.get(target, target) - anchors = bib_elems(html) - for anchor in anchors: - bib_id = anchor['id'] - anchor['id'] = mapping.get(bib_id, bib_id) - -def get_bibitems(html): - elems = bib_elems(html) - bibitems = {} - for elem in elems: - bib_id = elem['id'] - bibitems[bib_id] = get_text(elem) - return bibitems - -def save_html(path, html): - with open(path, 'w') as f: - f.write(str(html)) - -def resolve_references_in_html(args): - file, output = args - output.parent.mkdir(exist_ok=True, parents=True) - html = read_html(file) - bibitems = get_bibitems(html) - mapping = resolve_references(reference_trie, bibitems) - update_references(html, mapping) - save_html(output, html) - -#DUMP_REFERENCES_PATH = Path("/home/ubuntu/pwc/mycache/references-short.json") - -#TRIE_PATH = Path("/home/ubuntu/pwc/mycache/automaton.pkl") - -def normalize_references(source_path, target_path, automaton, jobs=1): - global reference_trie - source_path = Path(source_path) - target_path = Path(target_path) - with open(automaton, 'rb') as f: - reference_trie = pickle.load(f) - with Pool(jobs) as p: - params = [(file, target_path / file.relative_to(source_path)) for file in source_path.glob("**/*.html")] - p.map(resolve_references_in_html, params) - -if __name__ == "__main__": fire.Fire(normalize_references) diff --git a/notebooks/datasets.ipynb b/notebooks/datasets.ipynb new file mode 100644 index 0000000..fba6a3c --- /dev/null +++ b/notebooks/datasets.ipynb @@ -0,0 +1,238 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Datasets\n", + "\n", + "We published four datasets for training and evaluating extraction of performance results from machine learning papers. In this notebook we describe the format and show how to use our python API to conveniently work with the datasets. 
Due to the licensing the datasets consist of metadata and annotations, but do not include papers and data extracted from them. However, we made a special effort in our extraction pipeline to get reproducible results.
The dataset is a CSV file with one row per paper and the following fields:\n", + "* arxiv_id - arXiv identifier with version\n", + "* archive_size - the file size in bytes of the e-print archive\n", + "* sha256 - SHA-256 hash of the e-print archive\n", + "* title - paper's title\n", + "* status - the text and tables extraction status for this paper, one of:\n", + " + success,\n", + " + no-tex - LaTeX source is unavailable,\n", + " + processing-error - extraction issues,\n", + " + withdrawn - the paper is withdrawn from arXiv\n", + "* sections - number of extracted sections and subsections\n", + "* tables - number of extracted tables" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of papers: 104710\n", + "└── with LaTeX source: 93811\n", + "Number of extracted tables: 277946\n" + ] + } + ], + "source": [ + "arxiv_papers = read_arxiv_papers(ARXIV_PAPERS_URL)\n", + "\n", + "print(f'Number of papers: {len(arxiv_papers):8}')\n", + "print(f'└── with LaTeX source: {(~arxiv_papers.status.isin([\"no-tex\", \"withdrawn\"])).sum():8}')\n", + "print(f'Number of extracted tables: {arxiv_papers.tables.sum():8}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The arXiv id can be used to generate links to e-prints. Please read https://arxiv.org/help/bulk_data and play nice." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "104705 http://export.arxiv.org/e-print/2002.08204v1\n", + "104706 http://export.arxiv.org/e-print/2002.08253v1\n", + "104707 http://export.arxiv.org/e-print/2002.08264v1\n", + "104708 http://export.arxiv.org/e-print/2002.08301v1\n", + "104709 http://export.arxiv.org/e-print/2002.08325v1\n", + "dtype: object" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def get_eprint_link(paper):\n", + " return f'http://export.arxiv.org/e-print/{paper.arxiv_id}'\n", + "\n", + "links = arxiv_papers.apply(get_eprint_link, axis=1)\n", + "links.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SegmentedTables & LinkedResults\n", + "\n", + "The **SegmentedTables** dataset contains annotations of almost 2,000 tables. The dataset is a JSON array with one item per paper and the following fields:\n", + "* arxiv_id - arXiv identifier with version. The version can be different than in **ArxivTables**,\n", + "* sha256 - SHA-256 hash of the e-print archive\n", + "* fold - one of 11 folds, f.e., img_class or speech_rec. Each paper has exactly one fold, even if it's related to more than one task,\n", + "* tables - array of tables annotations\n", + " + index - 0-based index of tables extracted from paper,\n", + " + leaderboard - a boolean denoting if this table is a leaderboard table,\n", + " + ablation - a boolean denoting if this table is an ablation table (a table can be both a leaderboard and an ablation table),\n", + " + dataset_text - datasets mentioned in table's caption, not normalized\n", + " + segmentation - for leaderboard tables, a 2D array (list of lists) with one label per cell\n", + "\n", + "Additionally we annotated part of the tables with performance results, called simply the **LinkedResults** dataset. 
Each table contains a 'records' array with items containing:\n", + "* task, dataset, metric - task, dataset and metric names normalized across all papers from the **LinkedResults** dataset,\n", + "* value - normalized metric value,\n", + "* model - model name,\n", + "* row, column - 0-based cell location with this result." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of papers: 352\n", + "Number of tables: 1994\n", + "├── leaderboards: 796\n", + "└── ablations: 468\n", + "Linked results: 1591\n" + ] + } + ], + "source": [ + "from axcell.helpers.datasets import read_tables_annotations\n", + "\n", + "segmented_tables_annotations = read_tables_annotations(SEGMENTED_TABLES_URL)\n", + "\n", + "leaderboards = (segmented_tables_annotations.tables.apply(\n", + " lambda tables: len([t for t in tables if t['leaderboard']])\n", + ").sum())\n", + "ablations = (segmented_tables_annotations.tables.apply(\n", + " lambda tables: len([t for t in tables if t['ablation']])\n", + ").sum())\n", + "records = (segmented_tables_annotations.tables.apply(\n", + " lambda tables: sum([len(t['records']) for t in tables])\n", + ").sum())\n", + "\n", + "print(f'Number of papers: {len(segmented_tables_annotations):8}')\n", + "print(f'Number of tables: {segmented_tables_annotations.tables.apply(len).sum():8}')\n", + "print(f'├── leaderboards: {leaderboards:8}')\n", + "print(f'└── ablations: {ablations:8}')\n", + "print(f'Linked results: {records:8}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PWCLeaderboards\n", + "\n", + "The **PWCLeaderboards** dataset is similar in structure to the **LinkedResults** dataset. It's a JSON array with one item per paper, containing:\n", + "* arxiv_id - arXiv identifier with version. 
The version corresponds to the version in **ArxivTables**,\n", + "* tables\n", + " + index - 0-based table index\n", + " + records - as in **LinkedResults**" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of papers: 731\n", + "Number of tables: 1278\n", + "Linked results: 5393\n" + ] + } + ], + "source": [ + "pwc_leaderboards = read_tables_annotations(PWC_LEADERBOARDS_URL)\n", + "\n", + "records = (pwc_leaderboards.tables.apply(\n", + " lambda tables: sum([len(t['records']) for t in tables])\n", + ").sum())\n", + "\n", + "print(f'Number of papers: {len(pwc_leaderboards):8}')\n", + "print(f'Number of tables: {pwc_leaderboards.tables.apply(len).sum():8}')\n", + "print(f'Linked results: {records:8}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/evaluation.ipynb b/notebooks/evaluation.ipynb new file mode 100644 index 0000000..1988d44 --- /dev/null +++ b/notebooks/evaluation.ipynb @@ -0,0 +1,301 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Performance Evaluation on PWCLeaderboards dataset\n", + "\n", + "This notebook runs AxCell on the **PWCLeaderboards** dataset." + ] + }, + { + "cell_type": "markdown", + "source": [ + "For the pipeline to work we need a running elasticsearch instance. Run `docker-compose up -d` from the `axcell` repository to start a new instance." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.helpers.datasets import read_tables_annotations\n", + "from pathlib import Path\n", + "\n", + "V1_URL = 'https://github.com/paperswithcode/axcell/releases/download/v1.0/'\n", + "PWC_LEADERBOARDS_URL = V1_URL + 'pwc-leaderboards.json.xz'\n", + "\n", + "pwc_leaderboards = read_tables_annotations(PWC_LEADERBOARDS_URL)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# path to root directory containing e-prints\n", + "PWC_LEADERBOARDS_ROOT_PATH = Path('pwc-leaderboards')\n", + "PWC_LEADERBOARDS_ROOT_PATH = Path.home() / 'data/pwc-leaderboards'\n", + "SOURCES_PATH = PWC_LEADERBOARDS_ROOT_PATH / 'sources'\n", + "\n", + "from axcell.helpers.paper_extractor import PaperExtractor\n", + "extract = PaperExtractor(PWC_LEADERBOARDS_ROOT_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 2.02 s, sys: 1.07 s, total: 3.09 s\n", + "Wall time: 12min 43s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from joblib import delayed, Parallel\n", + "\n", + "# access extract from the global context to avoid serialization\n", + "def extract_single(file): return extract(file)\n", + "\n", + "files = sorted([path for path in SOURCES_PATH.glob('**/*') if path.is_file()])\n", + "\n", + "statuses = Parallel(backend='multiprocessing', n_jobs=-1)(delayed(extract_single)(file) for file in files)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "assert statuses == [\"success\"] * 731" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download and unpack the archive with trained models (table type classifier, table segmentation), taxonomy and 
abbreviations." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[PID 10700] Load model table-structure-classifier.pth\n" + ] + } + ], + "source": [ + "MODELS_URL = V1_URL + 'models.tar.xz'\n", + "MODELS_ARCHIVE = 'models.tar.xz'\n", + "MODELS_PATH = Path('models')\n", + "\n", + "from fastai.core import download_url\n", + "import tarfile\n", + "\n", + "download_url(MODELS_URL, MODELS_ARCHIVE)\n", + "with tarfile.open(MODELS_ARCHIVE, 'r:*') as archive:\n", + " archive.extractall()\n", + "\n", + "from axcell.helpers.results_extractor import ResultsExtractor\n", + "extract_results = ResultsExtractor(MODELS_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "papers = []\n", + "our_taxonomy = set(extract_results.taxonomy.taxonomy)\n", + "gold_records = []\n", + "for _, paper in pwc_leaderboards.iterrows():\n", + " for table in paper.tables:\n", + " for record in table['records']:\n", + " r = dict(record)\n", + " r['arxiv_id'] = paper.arxiv_id\n", + " tdm = (record['task'], record['dataset'], record['metric'])\n", + " if tdm in our_taxonomy:\n", + " gold_records.append(r)\n", + " papers.append(paper.arxiv_id)\n", + "gold_records = pd.DataFrame(gold_records)\n", + "papers = sorted(set(papers))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.data.paper_collection import PaperCollection\n", + "pc = PaperCollection.from_files(PWC_LEADERBOARDS_ROOT_PATH / \"papers\")\n", + "pc = PaperCollection([pc.get_by_id(p) for p in papers])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 21.4 s, sys: 17.8 s, total: 39.2 s\n", + "Wall time: 43min 27s\n" + ] + } + ], + "source": [ 
+ "%%time\n", + "\n", + "from joblib import delayed, Parallel\n", + "\n", + "def process_single(index):\n", + " extract_results = ResultsExtractor(MODELS_PATH)\n", + " return extract_results(pc[index])\n", + "\n", + "results = Parallel(backend='multiprocessing', n_jobs=-1)(delayed(process_single)(index) for index in range(len(pc)))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "predicted_records = []\n", + "for paper, records in zip(pc, results):\n", + " r = records.copy()\n", + " r['arxiv_id'] = paper.arxiv_no_version\n", + " predicted_records.append(r)\n", + "predicted_records = pd.concat(predicted_records)\n", + "predicted_records.to_json('axcell-predictions-on-pwc-leaderboards.json.xz', orient='records')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<style type=\"text/css\" >\n", + "</style><table id=\"T_362da586_9049_11ea_bd4b_51531a44b57f\" ><thead> <tr> <th class=\"blank level0\" ></th> <th class=\"col_heading level0 col0\" >Micro Precision</th> <th class=\"col_heading level0 col1\" >Micro Recall</th> <th class=\"col_heading level0 col2\" >Micro F1</th> <th class=\"col_heading level0 col3\" >Macro Precision</th> <th class=\"col_heading level0 col4\" >Macro Recall</th> <th class=\"col_heading level0 col5\" >Macro F1</th> </tr></thead><tbody>\n", + " <tr>\n", + " <th id=\"T_362da586_9049_11ea_bd4b_51531a44b57flevel0_row0\" class=\"row_heading level0 row0\" >0</th>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow0_col0\" class=\"data row0 col0\" >39.35%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow0_col1\" class=\"data row0 col1\" >24.18%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow0_col2\" class=\"data row0 col2\" >29.95%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow0_col3\" class=\"data row0 col3\" >24.18%</td>\n", + " <td 
id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow0_col4\" class=\"data row0 col4\" >22.13%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow0_col5\" class=\"data row0 col5\" >21.34%</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_362da586_9049_11ea_bd4b_51531a44b57flevel0_row1\" class=\"row_heading level0 row1\" >1</th>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow1_col0\" class=\"data row1 col0\" >67.83%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow1_col1\" class=\"data row1 col1\" >47.35%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow1_col2\" class=\"data row1 col2\" >55.77%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow1_col3\" class=\"data row1 col3\" >47.94%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow1_col4\" class=\"data row1 col4\" >46.50%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow1_col5\" class=\"data row1 col5\" >43.62%</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_362da586_9049_11ea_bd4b_51531a44b57flevel0_row2\" class=\"row_heading level0 row2\" >2</th>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow2_col0\" class=\"data row2 col0\" >70.79%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow2_col1\" class=\"data row2 col1\" >57.27%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow2_col2\" class=\"data row2 col2\" >63.32%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow2_col3\" class=\"data row2 col3\" >60.78%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow2_col4\" class=\"data row2 col4\" >62.72%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow2_col5\" class=\"data row2 col5\" >59.60%</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_362da586_9049_11ea_bd4b_51531a44b57flevel0_row3\" class=\"row_heading level0 row3\" >3</th>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow3_col0\" class=\"data row3 col0\" >70.28%</td>\n", + " <td 
id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow3_col1\" class=\"data row3 col1\" >48.06%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow3_col2\" class=\"data row3 col2\" >57.08%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow3_col3\" class=\"data row3 col3\" >53.64%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow3_col4\" class=\"data row3 col4\" >52.79%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow3_col5\" class=\"data row3 col5\" >50.02%</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_362da586_9049_11ea_bd4b_51531a44b57flevel0_row4\" class=\"row_heading level0 row4\" >4</th>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow4_col0\" class=\"data row4 col0\" >68.48%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow4_col1\" class=\"data row4 col1\" >58.09%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow4_col2\" class=\"data row4 col2\" >62.86%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow4_col3\" class=\"data row4 col3\" >58.22%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow4_col4\" class=\"data row4 col4\" >60.53%</td>\n", + " <td id=\"T_362da586_9049_11ea_bd4b_51531a44b57frow4_col5\" class=\"data row4 col5\" >56.38%</td>\n", + " </tr>\n", + " </tbody></table>" + ], + "text/plain": [ + "<pandas.io.formats.style.Styler at 0x7f0880fa7f98>" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from axcell.helpers.evaluate import evaluate\n", + "evaluate(predicted_records, gold_records).style.format('{:.2%}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 
"3.6.8" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/extraction.ipynb b/notebooks/extraction.ipynb new file mode 100644 index 0000000..b97f341 --- /dev/null +++ b/notebooks/extraction.ipynb @@ -0,0 +1,592 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Text and Tables Extraction\n", + "\n", + "This notebook presents how to use our pipeline to extract text and tables from arXiv papers with available LaTeX source code." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from axcell.helpers.paper_extractor import PaperExtractor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Structure of Directories\n", + "\n", + "We cache the artifacts produced by successful execution of the intermediate steps of extraction pipeline. 
The `root` argument of `PaperExtractor` is a path under which the following directory structure is created:
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;36mdata\u001b[00m\r\n", + "├── \u001b[01;34mhtmls\u001b[00m\r\n", + "│   └── \u001b[01;34m1903\u001b[00m\r\n", + "│   └── \u001b[01;34m1903.11816v1\u001b[00m\r\n", + "│   └── index.html\r\n", + "├── \u001b[01;34mpapers\u001b[00m\r\n", + "│   └── \u001b[01;34m1903\u001b[00m\r\n", + "│   └── \u001b[01;34m1903.11816v1\u001b[00m\r\n", + "│   ├── layout_01.csv\r\n", + "│   ├── layout_02.csv\r\n", + "│   ├── layout_03.csv\r\n", + "│   ├── layout_04.csv\r\n", + "│   ├── layout_05.csv\r\n", + "│   ├── metadata.json\r\n", + "│   ├── table_01.csv\r\n", + "│   ├── table_02.csv\r\n", + "│   ├── table_03.csv\r\n", + "│   ├── table_04.csv\r\n", + "│   ├── table_05.csv\r\n", + "│   └── text.json\r\n", + "├── \u001b[01;34msources\u001b[00m\r\n", + "│   └── \u001b[01;34m1903\u001b[00m\r\n", + "│   └── 1903.11816v1\r\n", + "└── \u001b[01;34munpacked_sources\u001b[00m\r\n", + " └── \u001b[01;34m1903\u001b[00m\r\n", + " └── \u001b[01;34m1903.11816v1\u001b[00m\r\n", + " ├── eso-pic.sty\r\n", + " ├── iccv.sty\r\n", + " ├── iccv_eso.sty\r\n", + " ├── ieee.bst\r\n", + " ├── \u001b[01;34mimages\u001b[00m\r\n", + " ├── submission_465.bbl\r\n", + " └── submission_465.tex\r\n", + "\r\n", + "12 directories, 20 files\r\n" + ] + } + ], + "source": [ + "!tree -L 4 {ROOT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The extracted data is stored in `papers` directory. We can read it using `PaperCollection` class. `PaperCollection` is a wrapper for `list` of papers with additional functions added for convenience. Due to large number of papers it is recommended to load the dataset in parallel (default uses number of processes equal to number of CPU cores) and store it in a pickle file. Set jobs=1 to disable multiprocessing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.data.paper_collection import PaperCollection\n", + "\n", + "PAPERS_PATH = ROOT_PATH / 'papers'\n", + "pc = PaperCollection.from_files(PAPERS_PATH)\n", + "# pc.to_pickle('mypapers.pkl')\n", + "# pc = PaperCollection.from_pickle('mypapers.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "paper = pc.get_by_id('1903.11816')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paper.text.title" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<style>\n", + "body{margin:0;padding:0;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen,Ubuntu,Cantarell,Fira Sans,Droid Sans,Helvetica Neue,sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,Courier New,monospace}.tableWrapper{-overflow:auto}.tableWrapper .has-annotations{color:#ff3860}.tableWrapper .model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .model-best{background-color:#ff3860;color:rgba(0,0,0,.7)}.tableWrapper .model-ensemble{background-color: #aa38ff;color: #fff;}.tableWrapper .model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .model-paper{background-color:#ff3860;color:#fff}.tableWrapper .dataset-sub{background-color:#23d160;color:#fff}.tableWrapper .dataset-metric{background-color:#209cee;color:#fff}.tableWrapper 
.dataset{background-color:#02bd43;color:#fff}.tableWrapper .trash{background-color:#363636;color:#f5f5f5}.tableWrapper .wtf{background-color:#f0f;color:#f5f5f5}.tableWrapper .dataset-task{background-color:#77ecdd;color:rgba(0,0,0,.7)}.tableWrapper .dataset-paper{background-color:#e4ffee;color:rgba(0,0,0,.7)}.tableWrapper td.focused-cell{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}.tableWrapper span.text-bold{font-weight:700}.tableWrapper span.text-italic{font-style:italic}.tableWrapper span.text-red{color:red}.tableWrapper span.text-green{color:green}.tableWrapper span.text-blue{color:#00f}.predict-dataset,.predict-dataset-metric,.predict-model-competing,.predict-model-paper,.predict-model-params,.predict-table-meta{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}.tableWrapper .predict-model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .predict-table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .predict-model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .predict-model-paper{background-color:#ff3860;color:#fff}.tableWrapper .predict-dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .predict-dataset{background-color:#02bd43;color:#fff}.tableWrapper td{border:inherit}.tableWrapper table tr td.border-l{border-left:1px solid #000}.tableWrapper table tr td.border-r{border-right:1px solid #000}.tableWrapper table tr td.border-t{border-top:1px solid #000}.tableWrapper table tr td.border-b{border-bottom:1px solid #000}.tableWrapper table tr td.border-ll{border-left:2px solid #000}.tableWrapper table tr td.border-rr{border-right:2px solid #000}.tableWrapper table tr td.border-tt{border-top:2px solid #000}.tableWrapper table tr td.border-bb{border-bottom:2px solid #000}.tableWrapper table tr td.align-left{text-align:left}.tableWrapper table tr td.align-right{text-align:right}.tableWrapper table tr td.align-center{text-align:center}.tableWrapper 
table tr td.align-justify{text-align:justify}div.form-group>input.form-control.input-sm{border-radius:2px;font-size:.75rem;background-color:#fff;color:#363636;box-shadow:inset 0 1px 2px rgba(10,10,10,.1);max-width:100%;width:100%;height:2.25em;padding:calc(.375em - 1px) calc(.625em - 1px);position:relative;border:1px solid #b5b5b5}div.form-group>input.form-control.input-sm:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.ht_clone_top{z-index:20}.evaluation-tables{overflow:scroll;max-height:20vh;border-top:1px solid #a9a9a9}.navbar.is-fixed-bottom,.navbar.is-fixed-top{z-index:200}body{padding-bottom:20vh}\n", + ".tableWrapper .final-proposal{ background: lightgreen }\n", + "</style>\n", + "\n", + "<div class=\"tableWrapper\">\n", + "<table>\n", + "<tr>\n", + "<td class=\" border-t align-center header \" title=\"\">Rank</td>\n", + "<td class=\" border-t align-left header \" title=\"\">Team</td>\n", + "<td class=\" border-r border-t align-center header \" title=\"\">Single Model</td>\n", + "<td class=\" border-t align-center header \" title=\"\">Final Score</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\" border-t align-center header \" title=\"\">1</td>\n", + "<td class=\" border-t align-left header \" title=\"\">CASIA_IVA_JD</td>\n", + "<td class=\" border-r border-t align-center \" title=\"\">✗</td>\n", + "<td class=\" border-t align-center \" title=\"\">0.5547</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\" align-center header \" title=\"\">2</td>\n", + "<td class=\" align-left header \" title=\"\">WinterIsComing</td>\n", + "<td class=\" border-r align-center \" title=\"\">✗</td>\n", + "<td class=\" align-center \" title=\"\">0.5544</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\" align-center header \" title=\"\">-</td>\n", + "<td class=\" align-left header \" title=\"\">PSPNet [<a title=\"bib-bib38\">38</a>]</td>\n", + "<td class=\" border-r align-center \" title=\"\">ResNet-269</td>\n", + "<td class=\" align-center \" 
title=\"\">0.5538</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\" align-center header \" title=\"\">-</td>\n", + "<td class=\" align-left header \" title=\"\">EncNet [<a title=\"bib-bib36\">36</a>]</td>\n", + "<td class=\" border-r align-center \" title=\"\">ResNet-101</td>\n", + "<td class=\" align-center \" title=\"\">0.5567</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\" border-b border-t align-center header \" title=\"\">-</td>\n", + "<td class=\" border-b border-t align-left header \" title=\"\">Ours</td>\n", + "<td class=\" border-b border-r border-t align-center \" title=\"\">ResNet-101</td>\n", + "<td class=\" border-b border-t align-center \" title=\"\"><span class=\"text-bold\">0.5584</span></td>\n", + "</tr>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "<axcell.data.table.Table at 0x7feccce790f0>" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paper.tables[4]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As *FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation* (Wu et al., 2019) is present in our **SegmentedTables** dataset, we can use `PaperCollection` to import annotations (table segmentation and results):" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.helpers.datasets import read_tables_annotations\n", + "\n", + "V1_URL = 'https://github.com/paperswithcode/axcell/releases/download/v1.0/'\n", + "SEGMENTED_TABLES_URL = V1_URL + 'segmented-tables.json.xz'\n", + "\n", + "segmented_tables = read_tables_annotations(SEGMENTED_TABLES_URL)\n", + "\n", + "pc = PaperCollection.from_files(PAPERS_PATH, annotations=segmented_tables.to_dict('records'))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<style>\n", + 
"body{margin:0;padding:0;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen,Ubuntu,Cantarell,Fira Sans,Droid Sans,Helvetica Neue,sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,Courier New,monospace}.tableWrapper{-overflow:auto}.tableWrapper .has-annotations{color:#ff3860}.tableWrapper .model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .model-best{background-color:#ff3860;color:rgba(0,0,0,.7)}.tableWrapper .model-ensemble{background-color: #aa38ff;color: #fff;}.tableWrapper .model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .model-paper{background-color:#ff3860;color:#fff}.tableWrapper .dataset-sub{background-color:#23d160;color:#fff}.tableWrapper .dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .dataset{background-color:#02bd43;color:#fff}.tableWrapper .trash{background-color:#363636;color:#f5f5f5}.tableWrapper .wtf{background-color:#f0f;color:#f5f5f5}.tableWrapper .dataset-task{background-color:#77ecdd;color:rgba(0,0,0,.7)}.tableWrapper .dataset-paper{background-color:#e4ffee;color:rgba(0,0,0,.7)}.tableWrapper td.focused-cell{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}.tableWrapper span.text-bold{font-weight:700}.tableWrapper span.text-italic{font-style:italic}.tableWrapper span.text-red{color:red}.tableWrapper span.text-green{color:green}.tableWrapper span.text-blue{color:#00f}.predict-dataset,.predict-dataset-metric,.predict-model-competing,.predict-model-paper,.predict-model-params,.predict-table-meta{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}.tableWrapper .predict-model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .predict-table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper 
.predict-model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .predict-model-paper{background-color:#ff3860;color:#fff}.tableWrapper .predict-dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .predict-dataset{background-color:#02bd43;color:#fff}.tableWrapper td{border:inherit}.tableWrapper table tr td.border-l{border-left:1px solid #000}.tableWrapper table tr td.border-r{border-right:1px solid #000}.tableWrapper table tr td.border-t{border-top:1px solid #000}.tableWrapper table tr td.border-b{border-bottom:1px solid #000}.tableWrapper table tr td.border-ll{border-left:2px solid #000}.tableWrapper table tr td.border-rr{border-right:2px solid #000}.tableWrapper table tr td.border-tt{border-top:2px solid #000}.tableWrapper table tr td.border-bb{border-bottom:2px solid #000}.tableWrapper table tr td.align-left{text-align:left}.tableWrapper table tr td.align-right{text-align:right}.tableWrapper table tr td.align-center{text-align:center}.tableWrapper table tr td.align-justify{text-align:justify}div.form-group>input.form-control.input-sm{border-radius:2px;font-size:.75rem;background-color:#fff;color:#363636;box-shadow:inset 0 1px 2px rgba(10,10,10,.1);max-width:100%;width:100%;height:2.25em;padding:calc(.375em - 1px) calc(.625em - 1px);position:relative;border:1px solid #b5b5b5}div.form-group>input.form-control.input-sm:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.ht_clone_top{z-index:20}.evaluation-tables{overflow:scroll;max-height:20vh;border-top:1px solid #a9a9a9}.navbar.is-fixed-bottom,.navbar.is-fixed-top{z-index:200}body{padding-bottom:20vh}\n", + ".tableWrapper .final-proposal{ background: lightgreen }\n", + "</style>\n", + "\n", + "<div class=\"tableWrapper\">\n", + "<table>\n", + "<tr>\n", + "<td class=\"table-meta border-t align-center header \" title=\"\">Rank</td>\n", + "<td class=\"table-meta border-t align-left header \" title=\"\">Team</td>\n", + "<td class=\"model-params border-r border-t 
align-center header \" title=\"\">Single Model</td>\n", + "<td class=\"dataset-metric border-t align-center header \" title=\"\">Final Score</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\" border-t align-center header \" title=\"\">1</td>\n", + "<td class=\"model-competing border-t align-left header \" title=\"\">CASIA_IVA_JD</td>\n", + "<td class=\" border-r border-t align-center \" title=\"\">✗</td>\n", + "<td class=\" border-t align-center \" title=\"\">0.5547</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\" align-center header \" title=\"\">2</td>\n", + "<td class=\"model-competing align-left header \" title=\"\">WinterIsComing</td>\n", + "<td class=\" border-r align-center \" title=\"\">✗</td>\n", + "<td class=\" align-center \" title=\"\">0.5544</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\" align-center header \" title=\"\">-</td>\n", + "<td class=\"model-competing align-left header \" title=\"\">PSPNet [<a title=\"bib-bib38\">38</a>]</td>\n", + "<td class=\" border-r align-center \" title=\"\">ResNet-269</td>\n", + "<td class=\" align-center \" title=\"\">0.5538</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\" align-center header \" title=\"\">-</td>\n", + "<td class=\"model-competing align-left header \" title=\"\">EncNet [<a title=\"bib-bib36\">36</a>]</td>\n", + "<td class=\" border-r align-center \" title=\"\">ResNet-101</td>\n", + "<td class=\" align-center \" title=\"\">0.5567</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\" border-b border-t align-center header \" title=\"\">-</td>\n", + "<td class=\"model-best border-b border-t align-left header \" title=\"\">Ours</td>\n", + "<td class=\" border-b border-r border-t align-center \" title=\"\">ResNet-101</td>\n", + "<td class=\" border-b border-t align-center \" title=\"\"><span class=\"text-bold\">0.5584</span></td>\n", + "</tr>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "<axcell.data.table.Table at 0x7feccd878518>" + ] + }, + "execution_count": 12, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "paper = pc.get_by_id('1903.11816')\n", + "paper.tables[4]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<style>\n", + "body{margin:0;padding:0;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen,Ubuntu,Cantarell,Fira Sans,Droid Sans,Helvetica Neue,sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,Courier New,monospace}.tableWrapper{-overflow:auto}.tableWrapper .has-annotations{color:#ff3860}.tableWrapper .model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .model-best{background-color:#ff3860;color:rgba(0,0,0,.7)}.tableWrapper .model-ensemble{background-color: #aa38ff;color: #fff;}.tableWrapper .model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .model-paper{background-color:#ff3860;color:#fff}.tableWrapper .dataset-sub{background-color:#23d160;color:#fff}.tableWrapper .dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .dataset{background-color:#02bd43;color:#fff}.tableWrapper .trash{background-color:#363636;color:#f5f5f5}.tableWrapper .wtf{background-color:#f0f;color:#f5f5f5}.tableWrapper .dataset-task{background-color:#77ecdd;color:rgba(0,0,0,.7)}.tableWrapper .dataset-paper{background-color:#e4ffee;color:rgba(0,0,0,.7)}.tableWrapper td.focused-cell{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}.tableWrapper span.text-bold{font-weight:700}.tableWrapper span.text-italic{font-style:italic}.tableWrapper span.text-red{color:red}.tableWrapper span.text-green{color:green}.tableWrapper span.text-blue{color:#00f}.predict-dataset,.predict-dataset-metric,.predict-model-competing,.predict-model-paper,.predict-model-params,.predict-table-meta{outline:2px solid 
#9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}.tableWrapper .predict-model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .predict-table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .predict-model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .predict-model-paper{background-color:#ff3860;color:#fff}.tableWrapper .predict-dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .predict-dataset{background-color:#02bd43;color:#fff}.tableWrapper td{border:inherit}.tableWrapper table tr td.border-l{border-left:1px solid #000}.tableWrapper table tr td.border-r{border-right:1px solid #000}.tableWrapper table tr td.border-t{border-top:1px solid #000}.tableWrapper table tr td.border-b{border-bottom:1px solid #000}.tableWrapper table tr td.border-ll{border-left:2px solid #000}.tableWrapper table tr td.border-rr{border-right:2px solid #000}.tableWrapper table tr td.border-tt{border-top:2px solid #000}.tableWrapper table tr td.border-bb{border-bottom:2px solid #000}.tableWrapper table tr td.align-left{text-align:left}.tableWrapper table tr td.align-right{text-align:right}.tableWrapper table tr td.align-center{text-align:center}.tableWrapper table tr td.align-justify{text-align:justify}div.form-group>input.form-control.input-sm{border-radius:2px;font-size:.75rem;background-color:#fff;color:#363636;box-shadow:inset 0 1px 2px rgba(10,10,10,.1);max-width:100%;width:100%;height:2.25em;padding:calc(.375em - 1px) calc(.625em - 1px);position:relative;border:1px solid #b5b5b5}div.form-group>input.form-control.input-sm:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.ht_clone_top{z-index:20}.evaluation-tables{overflow:scroll;max-height:20vh;border-top:1px solid #a9a9a9}.navbar.is-fixed-bottom,.navbar.is-fixed-top{z-index:200}body{padding-bottom:20vh}\n", + ".tableWrapper .final-proposal{ background: lightgreen }\n", + "</style>\n", + "\n", + "<div class=\"tableWrapper\">\n", + 
"<table>\n", + "<tr>\n", + "<td class=\" \" title=\"\">Tag</td>\n", + "<td class=\" \" title=\"\">description</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-best \" title=\"\">model-best</td>\n", + "<td class=\" \" title=\"\">the best performing model introduced in the paper</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-paper \" title=\"\">model-paper</td>\n", + "<td class=\" \" title=\"\">model introduced in the paper</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-ensemble \" title=\"\">model-ensemble</td>\n", + "<td class=\" \" title=\"\">ensemble of models introduced in the paper</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-competing \" title=\"\">model-competing</td>\n", + "<td class=\" \" title=\"\">model from another paper used for comparison</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"dataset-task \" title=\"\">dataset-task</td>\n", + "<td class=\" \" title=\"\">Task</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"dataset \" title=\"\">dataset</td>\n", + "<td class=\" \" title=\"\">Dataset</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"dataset-sub \" title=\"\">dataset-sub</td>\n", + "<td class=\" \" title=\"\">Subdataset</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"dataset-metric \" title=\"\">dataset-metric</td>\n", + "<td class=\" \" title=\"\">Metric</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-params \" title=\"\">model-params</td>\n", + "<td class=\" \" title=\"\">Params, e.g., number of layers or inference time</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"table-meta \" title=\"\">table-meta</td>\n", + "<td class=\" \" title=\"\">Cell describing other header cells</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"trash \" title=\"\">trash</td>\n", + "<td class=\" \" title=\"\">Parsing errors</td>\n", + "</tr>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + 
"pc.cells_gold_tags_legend()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>task</th>\n", + " <th>dataset</th>\n", + " <th>metric</th>\n", + " <th>format</th>\n", + " <th>model</th>\n", + " <th>raw_value</th>\n", + " </tr>\n", + " <tr>\n", + " <th>cell_ext_id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>table_05.csv/5.3</th>\n", + " <td>Semantic Segmentation</td>\n", + " <td>ADE20K test</td>\n", + " <td>Test Score</td>\n", + " <td>NaN</td>\n", + " <td>EncNet + JPU</td>\n", + " <td>0.5584</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " task dataset metric format \\\n", + "cell_ext_id \n", + "table_05.csv/5.3 Semantic Segmentation ADE20K test Test Score NaN \n", + "\n", + " model raw_value \n", + "cell_ext_id \n", + "table_05.csv/5.3 EncNet + JPU 0.5584 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paper.tables[4].sota_records" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Parallel Extraction\n", + "\n", + "For a single paper extraction can take from several seconds to a few minutes (the longest phase of converting LaTeX source into HTML is timed-out after 5 minutes), so to process multiple files we run extraction in parallel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 100 ms, sys: 40.5 ms, total: 141 ms\n", + "Wall time: 30.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from joblib import delayed, Parallel\n", + "\n", + "# access extract from the global context to avoid serialization\n", + "def extract_single(file): return extract(file)\n", + "\n", + "files = sorted([path for path in SOURCES_PATH.glob('**/*') if path.is_file()])\n", + "\n", + "statuses = Parallel(backend='multiprocessing', n_jobs=-1)(delayed(extract_single)(file) for file in files)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/papers-api.ipynb b/notebooks/papers-api.ipynb deleted file mode 100644 index de5762c..0000000 --- a/notebooks/papers-api.ipynb +++ /dev/null @@ -1,901 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Papers with Code ML papers dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/ubuntu/paperswithcode/paper-extractor\n" - ] - } - ], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "%cd .." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from sota_extractor2.data.paper_collection import PaperCollection\n", - "from pathlib import Path\n", - "\n", - "DATA_PATH = Path(\"data/arxiv\")\n", - "PICKLE_PATH = Path(\"data/pc.pkl\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset\n", - "The dataset was created by parsing 75K arXiv papers related to machine learning. Due to parsing errors, the dataset contains texts and tables extracted from 56K papers. \n", - "```\n", - ".\n", - "└── arxiv\n", - " ├── papers\n", - " │ ├── 0709\n", - " │ │ ├── 0709.1667\n", - " │ │ │ ├── text.json\n", - " │ │ │ ├── metadata.json\n", - " │ │ │ ├── table_01.csv\n", - " │ │ │ ...\n", - " │ │ ...\n", - " │ ...\n", - " └── structure-annotations.json\n", - "```\n", - "\n", - "`text.json` files contains papers' content organized into sections. `metadata.json` list tables and their captions found in a given paper. `table_xx.csv` contains data of a given table (nested tables are flattened). We provide a simple API to load and access the dataset. Due to large number of papers it is recommended to load the dataset in parallel (default uses number of processes equal to number of CPU cores) and store it in a pickle file. Set `jobs=1` to disable multiprocessing. PaperCollection is a wrapper for `list` of papers with additional functions added for convenience. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 4min 58s, sys: 12.4 s, total: 5min 11s\n", - "Wall time: 7min 28s\n" - ] - }, - { - "data": { - "text/plain": [ - "56696" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%time pc = PaperCollection.from_files(DATA_PATH)\n", - "len(pc)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "pc.to_pickle(PICKLE_PATH)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 3min 11s, sys: 9.39 s, total: 3min 20s\n", - "Wall time: 3min 20s\n" - ] - } - ], - "source": [ - "#%time pc = PaperCollection.from_pickle(PICKLE_PATH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The path is searched recursively for papers, so it is easy to specify smaller dataset to play with. In this case, however, a path to `structure-annotations.json` file needs to be specified." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 2.35 s, sys: 2.08 s, total: 4.43 s\n", - "Wall time: 8.62 s\n" - ] - }, - { - "data": { - "text/plain": [ - "555" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#%time pc_small = PaperCollection.from_files(DATA_PATH / \"papers\" / \"1602\", annotations_path=DATA_PATH / \"structure-annotations.json\")\n", - "#len(pc_small)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tables\n", - "Each `Paper` contains `text` and `tables` fields. Tables can be displayed with color-coded labels." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<style>body{margin:0;padding:0;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen,Ubuntu,Cantarell,Fira Sans,Droid Sans,Helvetica Neue,sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,Courier New,monospace}.tableWrapper{-overflow:auto}.tableWrapper .model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .model-best{background-color:#ff3860;color:rgba(0,0,0,.7)}.tableWrapper .model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .model-paper{background-color:#ff3860;color:#fff}.tableWrapper .dataset-sub{background-color:#23d160;color:#fff}.tableWrapper .dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .dataset{background-color:#02bd43;color:#fff}.tableWrapper .trash{background-color:#363636;color:#f5f5f5}.tableWrapper .wtf{background-color:#f0f;color:#f5f5f5}.tableWrapper .dataset-task{background-color:#77ecdd;color:rgba(0,0,0,.7)}.tableWrapper .dataset-paper{background-color:#e4ffee;color:rgba(0,0,0,.7)}.tableWrapper td.focused-cell{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}div.form-group>input.form-control.input-sm{border-radius:2px;font-size:.75rem;background-color:#fff;color:#363636;box-shadow:inset 0 1px 2px rgba(10,10,10,.1);max-width:100%;width:100%;height:2.25em;padding:calc(.375em - 1px) calc(.625em - 1px);position:relative;border:1px solid #b5b5b5}div.form-group>input.form-control.input-sm:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)}\n", - "</style>\n", - "\n", - "<div class=\"tableWrapper\">\n", - "<table>\n", - "<tr>\n", - "<td class=\"table-meta\">Model</td>\n", - "<td class=\"model-params\">d</td>\n", - "<td 
class=\"model-params\">|θ|M</td>\n", - "<td class=\"dataset-sub\">Train</td>\n", - "<td class=\"dataset-sub\">Test</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-competing\">Classifier with handcrafted features [12]</td>\n", - "<td class=\"\">-</td>\n", - "<td class=\"\">-</td>\n", - "<td class=\"\">99.7</td>\n", - "<td class=\"\">78.2</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-competing\">LSTM encoders [12]</td>\n", - "<td class=\"\">300</td>\n", - "<td class=\"\">3.0M</td>\n", - "<td class=\"\">83.9</td>\n", - "<td class=\"\">80.6</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-competing\">Dependency Tree CNN encoders [13]</td>\n", - "<td class=\"\">300</td>\n", - "<td class=\"\">3.5M</td>\n", - "<td class=\"\">83.3</td>\n", - "<td class=\"\">82.1</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-competing\">SPINN-PI encoders [14]</td>\n", - "<td class=\"\">300</td>\n", - "<td class=\"\">3.7M</td>\n", - "<td class=\"\">89.2</td>\n", - "<td class=\"\">83.2</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-paper\">NSE</td>\n", - "<td class=\"\">300</td>\n", - "<td class=\"\">3.4M</td>\n", - "<td class=\"\">86.2</td>\n", - "<td class=\"\">84.6</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-paper\">MMA-NSE</td>\n", - "<td class=\"\">300</td>\n", - "<td class=\"\">6.3M</td>\n", - "<td class=\"\">87.1</td>\n", - "<td class=\"\">84.8</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-competing\">LSTM attention [15]</td>\n", - "<td class=\"\">100</td>\n", - "<td class=\"\">242K</td>\n", - "<td class=\"\">85.4</td>\n", - "<td class=\"\">82.3</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-competing\">LSTM word-by-word attention [15]</td>\n", - "<td class=\"\">100</td>\n", - "<td class=\"\">252K</td>\n", - "<td class=\"\">85.3</td>\n", - "<td class=\"\">83.5</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-best\">MMA-NSE attention</td>\n", - "<td class=\"\">300</td>\n", - "<td class=\"\">6.5M</td>\n", - "<td 
class=\"\">86.9</td>\n", - "<td class=\"\">85.4</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-competing\">mLSTM word-by-word attention [16]</td>\n", - "<td class=\"\">300</td>\n", - "<td class=\"\">1.9M</td>\n", - "<td class=\"\">92.0</td>\n", - "<td class=\"\">86.1</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-competing\">LSTMN with deep attention fusion [17]</td>\n", - "<td class=\"\">450</td>\n", - "<td class=\"\">3.4M</td>\n", - "<td class=\"\">89.5</td>\n", - "<td class=\"\">86.3</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-competing\">Decomposable attention model [18]</td>\n", - "<td class=\"\">200</td>\n", - "<td class=\"\">582K</td>\n", - "<td class=\"\">90.5</td>\n", - "<td class=\"\">86.8</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-competing\">Full tree matching NTI-SLSTM-LSTM global attention [19]</td>\n", - "<td class=\"\">300</td>\n", - "<td class=\"\">3.2M</td>\n", - "<td class=\"\">88.5</td>\n", - "<td class=\"\">87.3</td>\n", - "</tr>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - "<IPython.core.display.HTML object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "paper = pc.get_by_id('1607.04315')\n", - "table = paper.tables[0]\n", - "table.display()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "<style>body{margin:0;padding:0;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen,Ubuntu,Cantarell,Fira Sans,Droid Sans,Helvetica Neue,sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,Courier New,monospace}.tableWrapper{-overflow:auto}.tableWrapper .model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .model-best{background-color:#ff3860;color:rgba(0,0,0,.7)}.tableWrapper 
.model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .model-paper{background-color:#ff3860;color:#fff}.tableWrapper .dataset-sub{background-color:#23d160;color:#fff}.tableWrapper .dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .dataset{background-color:#02bd43;color:#fff}.tableWrapper .trash{background-color:#363636;color:#f5f5f5}.tableWrapper .wtf{background-color:#f0f;color:#f5f5f5}.tableWrapper .dataset-task{background-color:#77ecdd;color:rgba(0,0,0,.7)}.tableWrapper .dataset-paper{background-color:#e4ffee;color:rgba(0,0,0,.7)}.tableWrapper td.focused-cell{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}div.form-group>input.form-control.input-sm{border-radius:2px;font-size:.75rem;background-color:#fff;color:#363636;box-shadow:inset 0 1px 2px rgba(10,10,10,.1);max-width:100%;width:100%;height:2.25em;padding:calc(.375em - 1px) calc(.625em - 1px);position:relative;border:1px solid #b5b5b5}div.form-group>input.form-control.input-sm:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)}\n", - "</style>\n", - "\n", - "<div class=\"tableWrapper\">\n", - "<table>\n", - "<tr>\n", - "<td class=\"\">Tag</td>\n", - "<td class=\"\">description</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-best\">model-best</td>\n", - "<td class=\"\">model that has results that author most likely would like to have exposed</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-paper\">model-paper</td>\n", - "<td class=\"\">an example of a generic model, (like LSTM)</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-competing\">model-competing</td>\n", - "<td class=\"\">model from another paper used for comparison</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"dataset-task\">dataset-task</td>\n", - "<td class=\"\">Task</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"dataset\">dataset</td>\n", - "<td class=\"\">Dataset</td>\n", - "</tr>\n", - "<tr>\n", - "<td 
class=\"dataset-sub\">dataset-sub</td>\n", - "<td class=\"\">Subdataset</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"dataset-metric\">dataset-metric</td>\n", - "<td class=\"\">Metric</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"model-params\">model-params</td>\n", - "<td class=\"\">Params, f.e., number of layers or inference time</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"table-meta\">table-meta</td>\n", - "<td class=\"\">Cell describing other header cells</td>\n", - "</tr>\n", - "<tr>\n", - "<td class=\"trash\">trash</td>\n", - "<td class=\"\">Parsing erros</td>\n", - "</tr>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - "<IPython.core.display.HTML object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "PaperCollection.cells_gold_tags_legend()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Table's data is stored in `.df` pandas `DataFrame`. Each cell contains its content `value`, annotated `gold_tags` and references `refs` to other papers. Most of the references were normalized across all papers." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Cell(value='SPINN-PI encoders [14]', gold_tags='model-competing', refs=['xxref-23c141141f4f63c061d3cce14c71893959af5721'])" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.df.iloc[4,0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Additionally, each table contains `gold_tags` describing what is the content of the table." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'sota'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.gold_tags" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Content\n", - "Papers' content is represented using elastic search document classes (can be easily `save()`'ed to an existing elastic search instance). Each `text` contains `title`, `abstract`, and 'authors'. Paper's text is split into `fragments`." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Abstract We present a memory augmented neural network for natural language understanding: Neural Semantic Encoders. NSE is equipped with a novel memory update rule and has a variable sized encoding memory that evolves over time and maintains the understanding of input sequences through read , compose and write operations. NSE can also access 1 xxanchor-x1-2f1 multiple and shared memories. In this paper, we demonstrated the effectiveness and the flexibility of NSE on five different natural language tasks: natural language inference, question answering, sentence classification, document sentiment analysis and machine translation where NSE achieved state-of-the-art performance when evaluated on publically available benchmarks. 
For example, our shared-memory model showed an encouraging result on neural machine translation, improving an attention-based baseline by approximately 1.0 BLEU.'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "paper.text.abstract" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "1 xxanchor-x1-10001 Introduction" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "2 xxanchor-x1-20002 Related Work" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "3 xxanchor-x1-30003 Proposed Approach" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "3.1 xxanchor-x1-40003.1 Read, Compose and Write" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "3.2 xxanchor-x1-50003.2 Shared and Multiple Memory Accesses" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4 xxanchor-x1-60004 Experiments" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4.1 xxanchor-x1-70004.1 Natural Language Inference" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4.2 xxanchor-x1-80004.2 Answer Sentence Selection" - ], - "text/plain": [ - 
"<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4.3 xxanchor-x1-90004.3 Sentence Classification" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4.4 xxanchor-x1-100004.4 Document Sentiment Analysis" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "4.5 xxanchor-x1-110004.5 Machine Translation" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "5.1 xxanchor-x1-130005.1 Memory Access and Compositionality" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "6 xxanchor-x1-140006 Conclusion" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "xxanchor-x1-150006 Acknowledgments" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "xxanchor-x1-160006 References" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "A xxanchor-x1-17000A Step-by-step visualization of memory states in NSE" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "paper.text.print_toc()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - 
"data": { - "text/markdown": [ - "# 4.5 xxanchor-x1-110004.5 Machine Translation" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "Lastly, we conducted an experiment on neural machine translation (NMT). The NMT problem is mostly defined within the encoder-decoder framework [ xxref-4b9b7eed30feee37db3452b74503d0db9f163074 , xxref-0b544dfe355a5070b60986319a3f51fb45d1348e , xxref-39dba6f22d72853561a4ed684be265e179a39e4f ]. The encoder provides the semantic and syntactic information about the source sentences to the decoder and the decoder generates the target sentences by conditioning on this information and its partially produced translation. For an efficient encoding, the attention-based NTM was introduced [ xxref-071b16f25117fb6133480c6259227d54fc2a5ea0 ]." - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "11000" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "For NTM, we implemented three different models. The first model is a baseline model and is similar to the one proposed in [ xxref-071b16f25117fb6133480c6259227d54fc2a5ea0 ] (RNNSearch). This model (LSTM-LSTM) has two LSTM for the encoder/decoder and has the soft attention neural net, which attends over the source sentence and constructs a focused encoding vector for each target word. The second model is an NSE-LSTM encoder-decoder which encodes the source sentence with NSE and generates the targets with the LSTM network by using the NSE output states and the attention network. 
The last model is an NSE-NSE setup, where the encoding part is the same as the NSE-LSTM while the decoder NSE now uses the output state and has an access to the encoder memory, i.e., the encoder and the decoder NSEs access a shared memory. The memory is encoded by the first NSEs and then read/written by the decoder NSEs. We used the English-German translation corpus from the IWSLT 2014 evaluation campaign [ xxref-c64d27b122d5b6ef0be135e63df05c3b24bd80c5 ]. The corpus consists of sentence-aligned translation of TED talks. The data was pre-processed and lowercased with the Moses toolkit. 9 xxanchor-x1-11001f9 We merged the dev2010 and dev2012 sets for development and the tst2010, tst2011 and tst2012 sets for test data 10 xxanchor-x1-11002f10 . Sentence pairs with length longer than 25 words were filtered out. This resulted in 110,439/4,998/4,793 pairs for train/dev/test sets. We kept the most frequent 25,000 words for the German dictionary. The English dictionary has 51,821 words. The 300-D Glove 840B vectors were used for embedding the words in the source sentence whereas a lookup embedding layer was used for the target German words. Note that the word embeddings are usually optimized along with the NMT models. However, for the evaluation purpose we in this experiment do not optimize the English word embeddings. Besides, we do not use a beam search to generate the target sentences." 
- ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "11001" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "xxanchor-x1-110032 Figure 2: Word association or composition graphs produced by NSE memory access. The directed arcs connect the words that are composed via compose module. The source nodes are input words and the destination nodes (pointed by the arrows) correspond to the accessed memory slots. < S > denotes the beginning of sequence." - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "11002" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "The LSTM encoder/decoders have two layers with 300 units. The NSE read/write modules are two one-layer LSTM with the same number of units as the LSTM encoder/decoders. This ensures that the number of parameters of the models is roughly the equal. The models were trained to minimize word-level cross entropy loss and were regularized by 20% input dropouts and the 30% output dropouts. We set the batch size to 128, the initial learning rate to 1e-3 for LSTM-LSTM and 3e-4 for the other models and l 2 regularizer strength to 3e-5, and train each model for 40 epochs. We report BLEU score for each models. 
11 xxanchor-x1-11004f11" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "11003" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "Table xxref-x1-100035 reports our results. The baseline LSTM-LSTM encoder-decoder (with attention) obtained 17.02 BLEU on the test set. The NSE-LSTM improved the baseline slightly. Given this very small improvement of the NSE-LSTM, it is unclear whether the NSE encoder is helpful in NMT. However, if we replace the LSTM decoder with another NSE and introduce the shared memory access to the encoder-decoder model (NSE-NSE), we improve the baseline result by almost 1.0 BLEU. The NSE-NSE model also yields an increasing BLEU score on dev set. The result demonstrates that the attention-based NMT systems can be improved by a shared-memory encoder-decoder model. In addition, memory-based NMT systems should perform well on translation of long sequences by preserving long term dependencies." 
- ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "11004" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "paper.text.print_section(\"Machine Translation\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fragments can be accessed separately" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "# 1 xxanchor-x1-10001 Introduction,\n", - "Recently several studies have explored ways of extending the neural networks with an external memory [ xxref-6eedf0a4fe861335f7f7664c14de7f71c00b7932 – xxref-950ebd31505dfc0733c391ad9b7a16571c46002e ]. Unlike LSTM, the short term memories and the training parameters of such a neural network are no longer coupled and can be adapted. In this paper we propose a novel class of memory augmented neural networks called Neural Semantic Encoders (NSE) for natural language understanding. NSE offers several desirable properties. NSE has a variable sized encoding memory which allows the model to access entire input sequence during the reading process; therefore efficiently delivering long-term dependencies over time. The encoding memory evolves over time and maintains the memory of the input sequence through read , compose and write operations. NSE sequentially processes the input and supports word compositionality inheriting both temporal and hierarchical nature of human language. 
NSE can read from and write to a set of relevant encoding memories simultaneously or multiple NSEs can access a shared encoding memory effectively supporting knowledge and representation sharing. NSE is flexible, robust and suitable for practical NLU tasks and can be trained easily by any gradient descent optimizer.<Fragment(meta.id=1607.04315_1001, order=1001)>" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "paper.text.fragments[1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/results-extraction.ipynb b/notebooks/results-extraction.ipynb new file mode 100644 index 0000000..150a9b0 --- /dev/null +++ b/notebooks/results-extraction.ipynb @@ -0,0 +1,290 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Results Extraction\n", + "\n", + "This notebook presents how to use AxCell for retrieval of machine learning results." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the pipeline to work we need a running elasticsearch instance. Run `docker-compose up -d` from the `axcell` repository to start a new instance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.data.paper_collection import PaperCollection\n", + "from pathlib import Path\n", + "\n", + "ROOT_PATH = Path('data')\n", + "PAPERS_PATH = ROOT_PATH / 'papers'\n", + "pc = PaperCollection.from_files(PAPERS_PATH)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download and unpack the archive with trained models (table type classifier, table segmentation), taxonomy and abbreviations." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "MODELS_URL = 'https://github.com/paperswithcode/axcell/releases/download/v1.0/models.tar.xz'\n", + "MODELS_ARCHIVE = 'models.tar.xz'\n", + "MODELS_PATH = Path('models')\n", + "\n", + "from fastai.core import download_url\n", + "import tarfile\n", + "\n", + "download_url(MODELS_URL, MODELS_ARCHIVE)\n", + "with tarfile.open(MODELS_ARCHIVE, 'r:*') as archive:\n", + " archive.extractall()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[PID 21480] Load model table-structure-classifier.pth\n" + ] + } + ], + "source": [ + "from axcell.helpers.results_extractor import ResultsExtractor\n", + "\n", + "extract_results = ResultsExtractor(MODELS_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "paper = pc.get_by_id('1903.11816')\n", + "results = extract_results(paper, in_place=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: 
right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>dataset</th>\n", + " <th>metric</th>\n", + " <th>task</th>\n", + " <th>model</th>\n", + " <th>score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>PASCAL Context</td>\n", + " <td>mIoU</td>\n", + " <td>Semantic Segmentation</td>\n", + " <td>EncNet+JPU (ours)</td>\n", + " <td>53.10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>ADE20K</td>\n", + " <td>Validation mIoU</td>\n", + " <td>Semantic Segmentation</td>\n", + " <td>Ours</td>\n", + " <td>80.99</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " dataset metric task model \\\n", + "0 PASCAL Context mIoU Semantic Segmentation EncNet+JPU (ours) \n", + "1 ADE20K Validation mIoU Semantic Segmentation Ours \n", + "\n", + " score \n", + "0 53.10 \n", + "1 80.99 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With `in_place=True` we can inspect the inferred segmentation of the tables predicted to be a leaderboard or ablation tables." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<style>\n", + "body{margin:0;padding:0;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen,Ubuntu,Cantarell,Fira Sans,Droid Sans,Helvetica Neue,sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,Courier New,monospace}.tableWrapper{-overflow:auto}.tableWrapper .has-annotations{color:#ff3860}.tableWrapper .model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .model-best{background-color:#ff3860;color:rgba(0,0,0,.7)}.tableWrapper .model-ensemble{background-color: #aa38ff;color: #fff;}.tableWrapper .model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .model-paper{background-color:#ff3860;color:#fff}.tableWrapper .dataset-sub{background-color:#23d160;color:#fff}.tableWrapper .dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .dataset{background-color:#02bd43;color:#fff}.tableWrapper .trash{background-color:#363636;color:#f5f5f5}.tableWrapper .wtf{background-color:#f0f;color:#f5f5f5}.tableWrapper .dataset-task{background-color:#77ecdd;color:rgba(0,0,0,.7)}.tableWrapper .dataset-paper{background-color:#e4ffee;color:rgba(0,0,0,.7)}.tableWrapper td.focused-cell{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}.tableWrapper span.text-bold{font-weight:700}.tableWrapper span.text-italic{font-style:italic}.tableWrapper span.text-red{color:red}.tableWrapper span.text-green{color:green}.tableWrapper span.text-blue{color:#00f}.predict-dataset,.predict-dataset-metric,.predict-model-competing,.predict-model-paper,.predict-model-params,.predict-table-meta{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}.tableWrapper 
.predict-model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .predict-table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .predict-model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .predict-model-paper{background-color:#ff3860;color:#fff}.tableWrapper .predict-dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .predict-dataset{background-color:#02bd43;color:#fff}.tableWrapper td{border:inherit}.tableWrapper table tr td.border-l{border-left:1px solid #000}.tableWrapper table tr td.border-r{border-right:1px solid #000}.tableWrapper table tr td.border-t{border-top:1px solid #000}.tableWrapper table tr td.border-b{border-bottom:1px solid #000}.tableWrapper table tr td.border-ll{border-left:2px solid #000}.tableWrapper table tr td.border-rr{border-right:2px solid #000}.tableWrapper table tr td.border-tt{border-top:2px solid #000}.tableWrapper table tr td.border-bb{border-bottom:2px solid #000}.tableWrapper table tr td.align-left{text-align:left}.tableWrapper table tr td.align-right{text-align:right}.tableWrapper table tr td.align-center{text-align:center}.tableWrapper table tr td.align-justify{text-align:justify}div.form-group>input.form-control.input-sm{border-radius:2px;font-size:.75rem;background-color:#fff;color:#363636;box-shadow:inset 0 1px 2px rgba(10,10,10,.1);max-width:100%;width:100%;height:2.25em;padding:calc(.375em - 1px) calc(.625em - 1px);position:relative;border:1px solid #b5b5b5}div.form-group>input.form-control.input-sm:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.ht_clone_top{z-index:20}.evaluation-tables{overflow:scroll;max-height:20vh;border-top:1px solid #a9a9a9}.navbar.is-fixed-bottom,.navbar.is-fixed-top{z-index:200}body{padding-bottom:20vh}\n", + ".tableWrapper .final-proposal{ background: lightgreen }\n", + "</style>\n", + "\n", + "<div class=\"tableWrapper\">\n", + "<table>\n", + "<tr>\n", + "<td class=\"table-meta border-r border-t 
align-right header \" title=\"\">Method</td>\n", + "<td class=\"table-meta border-r border-t align-left header \" title=\"\">Backbone</td>\n", + "<td class=\"dataset-metric border-t align-center header \" title=\"\">mIoU%</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-competing border-r border-t align-right \" title=\"\">FCN-8s [<a title=\"bib-bib22\">22</a>]</td>\n", + "<td class=\" border-r border-t \" title=\"\"></td>\n", + "<td class=\" border-t align-center \" title=\"\">37.8</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-competing border-r align-right \" title=\"\">CRF-RNN [<a title=\"bib-bib39\">39</a>]</td>\n", + "<td class=\" border-r \" title=\"\"></td>\n", + "<td class=\" align-center \" title=\"\">39.3</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-competing border-r align-right \" title=\"\">ParseNet [<a title=\"bib-bib21\">21</a>]</td>\n", + "<td class=\" border-r \" title=\"\"></td>\n", + "<td class=\" align-center \" title=\"\">40.4</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-competing border-r align-right \" title=\"\">BoxSup [<a title=\"bib-bib10\">10</a>]</td>\n", + "<td class=\" border-r \" title=\"\"></td>\n", + "<td class=\" align-center \" title=\"\">40.5</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-competing border-r align-right \" title=\"\">HO_CRF [<a title=\"bib-bib2\">2</a>]</td>\n", + "<td class=\" border-r \" title=\"\"></td>\n", + "<td class=\" align-center \" title=\"\">41.3</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-competing border-r align-right \" title=\"\">Piecewise [<a title=\"bib-bib19\">19</a>]</td>\n", + "<td class=\" border-r \" title=\"\"></td>\n", + "<td class=\" align-center \" title=\"\">43.3</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-competing border-r align-right \" title=\"\">VeryDeep [<a title=\"bib-bib32\">32</a>]</td>\n", + "<td class=\" border-r \" title=\"\"></td>\n", + "<td class=\" align-center \" title=\"\">44.5</td>\n", + "</tr>\n", + 
"<tr>\n", + "<td class=\"model-competing border-r align-right \" title=\"\">DeepLabV2 [<a title=\"bib-bib5\">5</a>]</td>\n", + "<td class=\"table-meta border-r align-left \" title=\"\">ResNet-101 + COCO</td>\n", + "<td class=\" align-center \" title=\"\">45.7</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-competing border-r align-right \" title=\"\">RefineNet [<a title=\"bib-bib18\">18</a>]</td>\n", + "<td class=\"table-meta border-r align-left \" title=\"\">ResNet-152</td>\n", + "<td class=\" align-center \" title=\"\">47.3</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-competing border-r align-right \" title=\"\">EncNet [<a title=\"bib-bib36\">36</a>]</td>\n", + "<td class=\"table-meta border-r align-left \" title=\"\">ResNet-101</td>\n", + "<td class=\" align-center \" title=\"\">51.7</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-competing border-r align-right \" title=\"\">DUpsampling [<a title=\"bib-bib29\">29</a>]</td>\n", + "<td class=\"table-meta border-r align-left \" title=\"\">Xception-71</td>\n", + "<td class=\" align-center \" title=\"\">52.5</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-best border-r border-t align-right \" title=\"\">EncNet+JPU (ours)</td>\n", + "<td class=\"table-meta border-r border-t align-left \" title=\"\">ResNet-50</td>\n", + "<td class=\" border-t align-center \" title=\"\">51.2</td>\n", + "</tr>\n", + "<tr>\n", + "<td class=\"model-best border-b border-r align-right \" title=\"\">EncNet+JPU (ours)</td>\n", + "<td class=\"table-meta border-b border-r align-left \" title=\"\">ResNet-101</td>\n", + "<td class=\" border-b align-center \" title=\"\"><span class=\"text-bold\">53.1</span></td>\n", + "</tr>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "<axcell.data.table.Table at 0x7f59ce044e10>" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paper.tables[2]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": 
"Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/training/lm.ipynb b/notebooks/training/lm.ipynb new file mode 100644 index 0000000..f01c6b3 --- /dev/null +++ b/notebooks/training/lm.ipynb @@ -0,0 +1,487 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Language Model Pre-training\n", + "\n", + "This notebook presents how to pretrain ULMFiT language model on the **ArxivPapers** dataset. You can download the pretrained model at https://github.com/paperswithcode/axcell/releases/download/v1.0/lm.pth.xz ." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "BATCH_SIZE = 256\n", + "BPTT = 80\n", + "VOCAB_SIZE = 30000\n", + "UNIGRAM_MODEL_SENTENCES = 5000000" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.data.paper_collection import PaperCollection\n", + "from pathlib import Path\n", + "\n", + "# path to extracted papers from ArxivPapers dataset\n", + "PAPERS_PATH = Path('./data/arxiv-papers/papers')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 2min 49s, sys: 11.3 s, total: 3min\n", + "Wall time: 5min 46s\n" + ] + } + ], + "source": [ + "%time pc = PaperCollection.from_files(PAPERS_PATH, load_tables=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.helpers.datasets import read_arxiv_papers\n", + "\n", + "V1_URL = 
'https://github.com/paperswithcode/axcell/releases/download/v1.0/'\n", + "ARXIV_PAPERS_URL = V1_URL + 'arxiv-papers.csv.xz'\n", + "arxiv_papers = read_arxiv_papers(ARXIV_PAPERS_URL)\n", + "\n", + "assert len(pc) == (arxiv_papers.status == 'success').sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "anchors_re = re.compile(r\"xxanchor-[^ ]*\")\n", + "refs_re = re.compile(r\"xxref-[^ ]*\")\n", + "\n", + "\n", + "def remove_anchors(s):\n", + " return anchors_re.sub(\"\", s)\n", + "\n", + "def replace_references(s):\n", + " return refs_re.sub(\"xxref\", s)\n", + "\n", + "def clean_text(s):\n", + " s = remove_anchors(s)\n", + " s = replace_references(s)\n", + " return s\n", + "\n", + "def get_texts(pc):\n", + " texts = []\n", + " for p in sorted(pc, key=lambda p: p.paper_id):\n", + " # do not include empty texts\n", + " if not hasattr(p.text, \"fragments\"):\n", + " continue\n", + " header = f\"Title\\n{p.text.title}\\n\\nAbstract\\n{p.text.abstract}\\n\\nBody\\n\"\n", + " last_section = None\n", + " fragments = []\n", + " for f in p.text.fragments:\n", + " if last_section != f.header:\n", + " fragments.append(f.header+\"\\n\")\n", + " last_section = f.header\n", + " fragments.append(f.text)\n", + " text = header + '\\n'.join(fragments)\n", + " text = clean_text(text)\n", + " texts.append(text)\n", + " return pd.DataFrame({'text': texts})" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1min 25s, sys: 3.19 s, total: 1min 28s\n", + "Wall time: 1min 28s\n" + ] + } + ], + "source": [ + "%time texts = get_texts(pc)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Title\n", + "VQA-LOL: Visual 
Question Answering under the Lens of Logic\n", + "\n", + "Abstract\n", + "Logical connectives and t...\n" + ] + } + ], + "source": [ + "print(texts.text.iloc[-1][:100]+'...')" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# texts.to_pickle(\"/data/arxiv/dumps/arxiv-papers-texts-dataframe.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "texts = pd.read_pickle(\"/data/arxiv/dumps/arxiv-papers-texts-dataframe.pkl\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Reduce number of sentences to avoid sentencepiece badalloc" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "sentences = [\n", + " sentence\n", + " for text in texts.text.values\n", + " for sentence in text.split('\\n')\n", + " if sentence.strip()\n", + "]\n", + "\n", + "np.random.seed(12345)\n", + "\n", + "indices = np.random.choice(range(len(sentences)), size=UNIGRAM_MODEL_SENTENCES, replace=False)\n", + "sentences = [sentences[index] for index in indices]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from fastai.text import *\n", + "\n", + "BASE_PATH = Path('./models')\n", + "BASE_PATH.mkdir(parents=True, exist_ok=True)\n", + "\n", + "processor = SPProcessor(vocab_sz=VOCAB_SIZE, mark_fields=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1h 40min 51s, sys: 36.1 s, total: 1h 41min 27s\n", + "Wall time: 41min 29s\n" + ] + }, + { + "data": { + "text/plain": [ + "PosixPath('models/tmp')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time processor.train_func(sentences, BASE_PATH)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "processor = SPProcessor(sp_model=BASE_PATH / \"tmp\" / \"spm.model\", sp_vocab=BASE_PATH / \"tmp\" / \"spm.vocab\", mark_fields=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 17min 29s, sys: 51 s, total: 18min 20s\n", + "Wall time: 26min 28s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "data_lm = (\n", + " TextList.from_df(\n", + " texts, BASE_PATH, cols=\"text\", processor=processor\n", + " ).split_by_rand_pct(0.1, seed=12345)\n", + " .label_for_lm()\n", + " .databunch(bs=BATCH_SIZE, bptt=BPTT)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# data_lm.save('arxiv-papers-texts-data_lm.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/anaconda3/envs/fastai/lib/python3.7/site-packages/torch/serialization.py:493: SourceChangeWarning: source code of class 'torch.nn.modules.loss.CrossEntropyLoss' has changed. 
you can retrieve the original source code by accessing the object's source attribute or set `torch.nn.Module.dump_patches = True` and use the patch tool to revert the changes.\n", + " warnings.warn(msg, SourceChangeWarning)\n" + ] + } + ], + "source": [ + "data_lm = load_data(BASE_PATH, 'arxiv-papers-texts-data_lm.pkl', bs=BATCH_SIZE, bptt=BPTT)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: left;\">\n", + " <th>epoch</th>\n", + " <th>train_loss</th>\n", + " <th>valid_loss</th>\n", + " <th>accuracy</th>\n", + " <th>perplexity</th>\n", + " <th>time</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>3.019458</td>\n", + " <td>3.264306</td>\n", + " <td>0.392344</td>\n", + " <td>26.161938</td>\n", + " <td>1:54:36</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>3.056603</td>\n", + " <td>3.422664</td>\n", + " <td>0.376507</td>\n", + " <td>30.651068</td>\n", + " <td>1:53:43</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>3.141768</td>\n", + " <td>3.550231</td>\n", + " <td>0.362796</td>\n", + " <td>34.821327</td>\n", + " <td>1:53:26</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>3.090492</td>\n", + " <td>3.525985</td>\n", + " <td>0.366870</td>\n", + " <td>33.987396</td>\n", + " <td>1:53:16</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>3.107407</td>\n", + " <td>3.491773</td>\n", + " <td>0.370532</td>\n", + " <td>32.844139</td>\n", + " <td>1:54:11</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>3.059378</td>\n", + " <td>3.445549</td>\n", + " <td>0.375365</td>\n", + " <td>31.360525</td>\n", + " <td>1:54:10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>3.030591</td>\n", + " <td>3.368207</td>\n", + " <td>0.382388</td>\n", + " <td>29.026358</td>\n", + 
" <td>1:53:57</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>2.965446</td>\n", + " <td>3.278792</td>\n", + " <td>0.391360</td>\n", + " <td>26.543692</td>\n", + " <td>1:53:37</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>2.919746</td>\n", + " <td>3.163137</td>\n", + " <td>0.404793</td>\n", + " <td>23.644709</td>\n", + " <td>1:53:10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>2.812866</td>\n", + " <td>3.019272</td>\n", + " <td>0.421912</td>\n", + " <td>20.476440</td>\n", + " <td>1:53:43</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>2.800652</td>\n", + " <td>2.874423</td>\n", + " <td>0.440786</td>\n", + " <td>17.715170</td>\n", + " <td>1:54:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>2.870245</td>\n", + " <td>2.789970</td>\n", + " <td>0.453570</td>\n", + " <td>16.280558</td>\n", + " <td>1:53:58</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>" + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn = language_model_learner(\n", + " data_lm, AWD_LSTM, drop_mult=0.1,\n", + " pretrained=False, metrics=[accuracy, Perplexity()]\n", + ").to_fp16(clip=0.1)\n", + "\n", + "learn.fit_one_cycle(cyc_len=12, max_lr=0.01, moms=(0.8, 0.7), div_factor=10, wd=0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "learn.save_encoder('pretrained-on-papers_enc')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "learn.save('pretrained-on-papers_learner_with_opt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + 
}, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/training/table-segmentation.ipynb b/notebooks/training/table-segmentation.ipynb new file mode 100644 index 0000000..f43dd95 --- /dev/null +++ b/notebooks/training/table-segmentation.ipynb @@ -0,0 +1,300 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Table Segmentation Training\n", + "\n", + "This notebook shows how to train a table segmentation model on the **SegmentedTables** dataset. You can download the model weights at https://github.com/paperswithcode/axcell/releases/download/v1.0/models.tar.xz." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.helpers.datasets import read_tables_annotations\n", + "from pathlib import Path\n", + "\n", + "V1_URL = 'https://github.com/paperswithcode/axcell/releases/download/v1.0/'\n", + "SEGMENTED_TABLES_URL = V1_URL + 'segmented-tables.json.xz'\n", + "\n", + "segmented_tables_annotations = read_tables_annotations(SEGMENTED_TABLES_URL)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.data.paper_collection import PaperCollection\n", + "\n", + "SEGMENTED_TABLES_PAPERS = Path('/mnt/efs/pwc/data/arxiv/sources/segmented-tables/papers')\n", + "pc = PaperCollection.from_files(SEGMENTED_TABLES_PAPERS, annotations=segmented_tables_annotations.to_dict(orient='record'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We convert papers and annotations into a dataframe with features: for each cell we get its content, row and column contexts, styles, references, etc. 
Additionally we query the elasticsearch instance with cell's content as query to search in a given paper for text fragments with this content. (Run `docker-compose up -d` from the `axcell` repository to start an elasticsearch instance.) We first ensure that all papers from `pc` are stored in the instance:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.data.structure import CellEvidenceExtractor\n", + "\n", + "cell_evidences = CellEvidenceExtractor() # sets up elasticsearch connection on creation\n", + "for paper in pc:\n", + " paper.text.save()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from joblib import delayed, Parallel\n", + " \n", + "def process_single(index):\n", + " cell_evidences = CellEvidenceExtractor()\n", + " paper = pc[index]\n", + " leaderboards = [table for table in paper.tables if 'leaderboard' in table.gold_tags.split(',')]\n", + " evidences = cell_evidences(paper, leaderboards, paper_limit=30, corpus_limit=0)\n", + " evidences['fold'] = paper.gold_tags\n", + " evidences.header.fillna('', inplace=True)\n", + " return evidences" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.46 s, sys: 284 ms, total: 1.74 s\n", + "Wall time: 3min 16s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "evidences = Parallel(backend='multiprocessing', n_jobs=-1)(delayed(process_single)(index) for index in range(len(pc)))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "evidences = pd.concat(evidences)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "evidences.to_pickle('/mnt/efs/pwc/data/arxiv/sources/segmented-tables/cell_evidences.pkl')" + 
] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "from collections import OrderedDict\n", + "from fastai.text import *\n", + "from axcell.models.structure.structure_predictor import TableStructurePredictor\n", + "\n", + "\n", + "# make sure the training dataframe is batch_size aligned\n", + "def align_df(df, batch_size):\n", + " aligned_len = ( len(df) // batch_size ) * batch_size\n", + " return df.iloc[:aligned_len]\n", + "\n", + "\n", + "def dataframes_to_databunch(base_path, train_df, valid_df, test_df, batch_size, processor):\n", + " classes=range(6)\n", + " columns = [\"label\", \"text\", \"cell_reference\", \"cell_styles\", \"cell_layout\", \"cell_content\", \"row_context\", \"col_context\"]\n", + " text_cols=[\"cell_styles\", \"cell_layout\", \"text\", \"cell_content\", \"row_context\", \"col_context\", \"cell_reference\"]\n", + " train_df, valid_df, test_df = train_df[columns], valid_df[columns], test_df[columns]\n", + " \n", + " label_cols = [\"label\"]\n", + " train_tl = TextList.from_df(train_df, base_path, cols=text_cols, processor=processor)\n", + " valid_tl = TextList.from_df(valid_df, base_path, cols=text_cols, processor=processor)\n", + " test_tl = TextList.from_df(test_df, base_path, cols=text_cols, processor=processor)\n", + " \n", + " src = ItemLists(base_path, train_tl, valid_tl)\\\n", + " .label_from_df(cols=label_cols)\n", + " src.add_test(test_tl)\n", + " \n", + " data_clas = src.databunch(bs=batch_size)\n", + " return data_clas\n", + "\n", + "def get_databunch(experiment, df, processor):\n", + " is_test = df.fold == experiment.test_split\n", + " is_valid = df.fold == experiment.valid_split\n", + " test_df_all = df[is_test].copy()\n", + " valid_df_all = df[is_valid].copy()\n", + " train_df_all = df[(~is_test) & (~is_valid)].copy()\n", + " train_df_all, valid_df_all, test_df_all = experiment.transform_df(train_df_all, valid_df_all, 
test_df_all)\n", + "\n", + " train_df, train_df_num = TableStructurePredictor.keep_alphacells(train_df_all)\n", + " valid_df, valid_df_num = TableStructurePredictor.keep_alphacells(valid_df_all)\n", + " test_df, test_df_num = TableStructurePredictor.keep_alphacells(test_df_all)\n", + " train_df = align_df(train_df, experiment.BS)\n", + "\n", + " data_clas = dataframes_to_databunch(BASE_DIR, train_df, valid_df, test_df, experiment.BS, processor)\n", + " return train_df, valid_df, test_df, data_clas" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.models.structure.experiment import experiments_grid\n", + "from axcell.models.structure.ulmfit_experiment import ULMFiTExperiment\n", + "\n", + "EXPERIMENTS_DIR = './experiments/segmentation'\n", + "BASE_DIR = Path('./models')\n", + "\n", + "processor = processor = SPProcessor(\n", + " sp_model=BASE_DIR / 'tmp' / 'spm.model',\n", + " sp_vocab=BASE_DIR / 'tmp' / 'spm.vocab',\n", + " mark_fields=True\n", + ")\n", + "\n", + "\n", + "experiment = ULMFiTExperiment(remove_num=False, drop_duplicates=False,\n", + " this_paper=True, merge_fragments=True, merge_type='concat',\n", + " evidence_source='text_highlited', split_btags=True, fixed_tokenizer=True,\n", + " fixed_this_paper=True, mask=True, evidence_limit=None, context_tokens=None,\n", + " lowercase=True, drop_mult=0.15, fp16=True, train_on_easy=False,\n", + " dataset=\"segmented-tables\",\n", + " test_split='img_class',\n", + " valid_split='speech_rec',\n", + " pretrained_lm='lm'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "train_df, valid_df, test_df, data_clas = get_databunch(experiment, evidences, processor)\n", + "model = experiment.get_trained_model(data_clas)\n", + "experiment.evaluate(model, train_df, valid_df, test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "scrolled": false 
+ }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train dataset\n", + " * accuracy: 0.996\n", + " * μ-precision: 0.995\n", + " * μ-recall: 1.000\n", + "valid dataset\n", + " * accuracy: 0.688\n", + " * μ-precision: 0.630\n", + " * μ-recall: 0.982\n", + "test dataset\n", + " * accuracy: 0.817\n", + " * μ-precision: 0.871\n", + " * μ-recall: 0.876\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAn0AAAJfCAYAAAAKF2DwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOzde7xVdZn48c/DAfKCIGBxQBGdPONlECkd1F/iBT2iYpq3Upspf1NDdvuNVpqNk6nVdNEkLSclpzFz1MK01GMpogKaoYgKaZqkJMjFUfCugYfn98fZ4ObAgc3I2fucvT5vX/vFXmt913c/3+/ZCx+edTmRmUiSJKm+9ah1AJIkSep8Jn2SJEkFYNInSZJUACZ9kiRJBWDSJ0mSVAA9ax2AJElSrW2+/UlVe5zJG89cG9X6rHJW+iRJkgrApE+SJKkAPL0rSZIKL6L+62D1P0JJkiRZ6ZMkSYoC1MHqf4SSJEmy0idJkuQ1fZIkSaoLVvokSVLhWemTJElSXTDpkyRJKgBP70qSpMKLqMmvw60qK32SJEkFYKVPkiSpAHWw+h+hJEmSrPRJkiT5yBZJkiTVBSt9kiSp8Kz0SZIkqS5Y6ZMkSYUXBaiD1f8IJUmSZKVPkiTJa/okSZJUF6z0SZKkwrPSJ0mSpLpgpU+SJBWelT5JkiTVBZM+SZKkAvD0riRJKrwgah1Cp7PSJ0mSVABW+iRJUuF5I4ckSZLqgpU+SZJUeFb6JEmSVBes9EmSpMKz0idJkqS6YKVPkiSpAHWw+h+hJEmSrPRJkiR5TZ8kSZLqgpU+SZJUeFb6JEmSVBes9EmSpMKLAtTB6n+EkiRJMumTJEkqAk/vSpKkwvNGDkmSJNUFK32SJKnwIqLWIXQ6k75uIiKy1jFIklQtmVn/WViVmfR1E5sNPbHWIXR5b86/jkxz40pEBJlP1DqMLi9iZ+epQm1z5fG3IW3HnvO0IbWounlNnyRJkuqClT5JklR4PpxZkiRJdcFKnyRJKjyv6ZMkSVJdsNInSZIKz0qfJEmS6oKVPkmSVHjevStJkqS6YKVPkiTJa/okSZJUD0z6JEmSCsDTu5IkqfB8ZIskSZLqgpU+SZJUeBFR6xA6nZU+SZKkArDSJ0mSCs+HM0uSJKkuWOmTJEmF5927kiRJqgtW+iRJkrx7V5IkSfXASp8kSVIBymAFGKIkSZKs9EmSJHlNnyRJkuqBlT5JkiQrfZIkSaoHVvokSZIKUAYrwBAlSZJk0idJklQAJn1ay2UXfIq/zLqMmZO/22Gb7533cf4wbQL33/YdRg7fYfX6jx6/P3OmXsScqRfx0eP3r0K06g6mTXuQsWNPpbl5PBMnTlpr+/LlKzjttO/Q3DyeE074IgsWLFm97fLLJ9HcPJ6xY09l+vRZ1Qy7JpwrqTYyomqvWjHpK4mI7SLi1xHxZET8OSIujoixEfFw6fVqRDxRen9VRBwYEbe06+PKiDi+9P7usvYPR8T1pfXnRsSzpXWPRcRJtRjv+vxs0lSO/ti3O9w+9qCRvHeHRobvfzqfO+vHXPLN
TwDQv9+WnH3asex/1FcZfdRXOfu0Y9m635bVCltdVGtrK+effxlXXHEuLS2Xcsst05g795k12kyadDt9+/Zh8uSJnHLK0Vx44ZUAzJ37DC0t02hpuZQrrjiX8877Ea2trTUYRXU4V5I6k0kfEBEB3AD8KjObgL8F+gCHZObIzBwJzAQ+Wlr+WIVdr2o/MjOPL1s/odTn0cDlEdFrEw7nHbv3/sdZ+uKrHW4/8tA9ueaX0wG4/6G59Ou7BY3v2ZrmA/ZgyvQ5LHvpNV586TWmTJ/DoQfsUa2w1UXNnv0kw4YNZujQRnr37sW4cfszZcqMNdrceecMjjnmYADGjv0A9933CJnJlCkzGDduf3r37sXQoY0MGzaY2bOfrMUwqsK5kmooqviqEZO+NmOANzPzvwAysxU4HfiniNiisz40M58EXgf6d9ZndIYhjQNYsOiF1cvPLl7KkMYBDGnsz4KFS99ev2gpQxq71dDUCZYseYHGxm1WLw8aNJAlS15Yq83gwW1tevZsYKuttmTZspfXse82a+1bT5wrSZ3JR7a0+TvgwfIVmflyRDwD7ATM7mC/0RHxcNny9kD5Kd//jog3Su8nZ+YZ5TtHxPuBJzPzuXV1HhHjgfEAPfvvRc8+O1U6HkmStDF6+HBmrd/0stO3I4Gb2m0vP71bnvCdHhGPAjOAb3bUeWZOzMy9MnOvrpTwLVy8lO0GD1y9vG3jABYuXsrCxcvYbsiAt9cPHsDCxctqEaK6kEGDBrJ48fOrl5cseYFBgwau1WbRorY2b73VyiuvvEb//n3Xse/za+1bT5wrSZ3JpK/NY8Ce5Ssioi9tlbu5nfB5EzLz74DjgP+MiM064TM6TcvkWZx83GgARr1vJ15+5XUWP/cik6c+wiGjR7B1vy3Zut+WHDJ6BJOnPlLjaFVru+/exLx5C5k/fzHLl6+gpWUaY8aMWqPNmDF7c+ONUwC47bZ72WefEUQEY8aMoqVlGsuXr2D+/MXMm7eQESOaajGMqnCupBqKqN6rRjy922YK8O2I+FhmXhURDcD3gCsz8/XO+tDMvCkiPgF8HLi8sz5nY/30B59n9L67sk3/rZg744d8/aLr6dWr7atyxdV38Ns7H2LsQSN5dPr3ef2Nv/KpL7WFvuyl1/jWJTdyz83fAODfL76BZS+9VrNxqGvo2bOBc845lU9+8mu0tq7kuOMOoalpGBdffDXDhzdx8MF7c/zxzZxxxkU0N4+nX78+TJhwJgBNTcM4/PD9OOKIz9DQ0NZPQ0NDjUfUeZwrSZ0pMrPWMXQJETEU+A9gF9oqoLcCX8rMv5a2311anllaPrC0fGRZH1cCt2Tm9aX2g4FV1/Q9n5mHRMS5wKuZeWFpnz2Ba4BdM3NlR/Ftvv1J/qA24M351+H3uTIRQeYTtQ6jy4vY2XmqUNtcefxtSNux5zxtSGmeqloSaxrz46r9YJ68859rUu6z0leSmfOBD65n+4Htlu8G7m637pSO2petP7fd8oPAzhsVrCRJ0kYy6ZMkSfLuXUmSJNUDK32SJEk1vKu2Wqz0SZIkFYCVPkmSpPov9FnpkyRJKgKTPkmSpALw9K4kSZKPbJEkSVI9sNInSZJU/4U+K32SJElFYKVPkiQVXvpwZkmSJNUDK32SJEnevStJkqRqi4jDIuKJiJgbEWetY/v2EXFXRDwUEbMj4ogN9WnSJ0mSFFV8bSiUiAbgUuBwYDfgpIjYrV2zfwN+kZnvA04E/mND/Zr0SZIkdS2jgLmZ+VRmLgeuA45u1yaBvqX3/YCFG+rUa/okSZKqePduRIwHxpetmpiZE8uWtwXmly0vAPZu1825wO0R8XlgS+CQDX2uSZ8kSVIVlRK8iRtsuH4nAVdm5vciYl/gZxExPDNXdrSDSZ8kSVLXunv3WWBo2fJ2pXXlPgEcBpCZ90XEZsA2wHMddeo1fZIkSV3LA0BTROwYEb1pu1HjpnZtngEOBoiIXYHNgP9ZX6dW+iRJkrpQ
oS8z34qIzwG3AQ3ATzLz0Yg4H5iZmTcBXwR+HBGn03ZTxymZmevr16RPkiSpi8nMW4Fb2607p+z9Y8AHNqZPT+9KkiQVgJU+SZKkKj6ypVas9EmSJBWAlT5JkiQrfZIkSaoHVvokSZIKUAYrwBAlSZJkpU+SJKkA1/SZ9HUTb86/rtYhdAtRgIN2U4nYudYhdAvOU+U8/irjPKlWTPq6iQ38ZhXR9hfpZkNPrHUY3cKb86/zO1WBiHCeKtQ2V0/UOowuL2Jnv1MVqEliXIBc3Gv6JEmSCsBKnyRJKrzsUf+lPit9kiRJBWClT5IkqQA32FjpkyRJKgArfZIkSfVf6LPSJ0mSVAQmfZIkSQXg6V1JkiQf2SJJkqR6YKVPkiTJR7ZIkiSpHljpkyRJqv9Cn5U+SZKkIrDSJ0mS5N27kiRJqgdW+iRJkqz0SZIkqR5Y6ZMkSYWX9V/os9InSZJUBFb6JEmSvKZPkiRJ9cBKnyRJkr97V5IkSfXApE+SJKkAPL0rSZLkjRySJEmqB1b6JEmSClAGK8AQJUmSZKVPkiTJR7ZIkiSpHpj0Sf9Ll13wKf4y6zJmTv5uh22+d97H+cO0Cdx/23cYOXyH1es/evz+zJl6EXOmXsRHj9+/CtFK9WXatAcZO/ZUmpvHM3HipLW2L1++gtNO+w7NzeM54YQvsmDBktXbLr98Es3N4xk79lSmT59VzbDVlfWI6r1qNcSafXIni4jWiHg4Ih6NiEci4osR0aNdm+9HxLOr1kfE/y3t83BELI+IOaX33y7b51cR8ft2/ewcEXeX2v4xIiaW1h8YES+V9flwRHyk7P3i0uevWu5djbnRpvGzSVM5+mPf7nD72ING8t4dGhm+/+l87qwfc8k3PwFA/35bcvZpx7L/UV9l9FFf5ezTjmXrfltWK2yp22ttbeX88y/jiivOpaXlUm65ZRpz5z6zRptJk26nb98+TJ48kVNOOZoLL7wSgLlzn6GlZRotLZdyxRXnct55P6K1tbUGo5Cqr26TPuCNzByZmX8HNAOHA19btbGU6B0DzAcOAMjM/yrtMxJYCBxUWj6rtM/WwJ5Av4j4m7LPugSYUGq7K/CDsm3TV/VZev287DMuK9tvZGYu76zJ0KZ37/2Ps/TFVzvcfuShe3LNL6cDcP9Dc+nXdwsa37M1zQfswZTpc1j20mu8+NJrTJk+h0MP2KNaYUvd3uzZTzJs2GCGDm2kd+9ejBu3P1OmzFijzZ13zuCYYw4GYOzYD3DffY+QmUyZMoNx4/and+9eDB3ayLBhg5k9+8laDENdTEZU7VUr9Zz0rZaZzwHjgc9FrJ7tA4FHgR8BJ1XY1bHAzcB1wIll6wcDC8o+b847DFl1YEjjABYsemH18rOLlzKkcQBDGvuzYOHSt9cvWsqQxv61CFHqlpYseYHGxm1WLw8aNJAlS15Yq83gwW1tevZsYKuttmTZspfXse82a+0r1atCJH0AmfkU0AC8p7TqJOBa4EZgXET0qqCbVftcy5qJ4gTgzoj4TUScXqoIrjK63end977jwUiSpE2rRxVfNVKYpK9c6dq5I4BfZebLwAxg7Ab2GQQ0Afdk5p+AFRExHNpOCwO7ApNoqyD+PiLeVdq1/endP29EnOMjYmZEzJw4ceJGjlK1tnDxUrYbPHD18raNA1i4eCkLFy9juyED3l4/eAALFy+rRYhStzRo0EAWL35+9fKSJS8waNDAtdosWtTW5q23Wnnlldfo37/vOvZ9fq19pXpVmKSvdA1eK/AcbQne1sCciJgH7MeGT/F+GOgPPF3aZ4fyfTJzYWb+JDOPBt4Chr/TmDNzYmbulZl7jR8//p12pyprmTyLk48bDcCo9+3Ey6+8zuLnXmTy1Ec4ZPQItu63JVv325JDRo9g8tRHahyt1H3svnsT8+YtZP78xSxfvoKWlmmMGTNqjTZjxuzNjTdOAeC22+5ln31GEBGMGTOKlpZpLF++gvnzFzNv3kJGjGiq
xTDU1RTg7t1CPJw5It5N200TP8zMjIiTgE9m5rWl7VvSlsxtkZmvd9DNScBhmXlfaZ8dgTuAsyPiMGBKZq6IiEZgIPAssEvnjky19NMffJ7R++7KNv23Yu6MH/L1i66nV6+2Q+qKq+/gt3c+xNiDRvLo9O/z+ht/5VNfuhyAZS+9xrcuuZF7bv4GAP9+8Q0se+m1mo1D6m569mzgnHNO5ZOf/BqtrSs57rhDaGoaxsUXX83w4U0cfPDeHH98M2eccRHNzePp168PEyacCUBT0zAOP3w/jjjiMzQ0tPXT0NBQ4xFJ1RGZWesYOkVEtAJzgF60Vd5+BlwEbEbbTRc7lE7trmp/A/DzzPx5aXkesFdmPh8ROwD3Attl2YRFxCzg08BHgHHAm6VNF2Tm1RFxIPBr4Omy0L6RmdeX9j8XeDUzL6xgSPX5g9qEIoLNhp644YbizfnXUa/H/qYUEc5Thdrm6olah9HlRezsd6oCpWOvqiWxHb90c9V+ME9f+MGalPvqttKXmR390+11YED7lZl5bLvlHcrezwO2Xcc+7y+9nQF8YR3b7wb6rSfGczvaJkmStCkV5po+SZKkIqvbSp8kSVLFaniDRbVY6ZMkSSoAK32SJEn1X+iz0idJklQEVvokSVLhpdf0SZIkqR5Y6ZMkSbLSJ0mSpHpgpU+SJCms9EmSJKkOWOmTJEkqQBmsAEOUJEmSlT5JkiSv6ZMkSVI9sNInSZLkc/okSZJUD0z6JEmSCsDTu5IkSZ7elSRJUj2w0idJkgovfWSLJEmS6oGVPkmSpAKUwQowREmSJFnpkyRJ8po+SZIk1QMrfZIkST6nT5IkSfXASp8kSZKVPkmSJNUDK32SJEn1X+gz6esuogC3km8Kb86/rtYhdBt+pyrjPFUuYudah9At+J1SrZj0dROZT9Q6hC4vYmcys9ZhdAsRwebbn1zrMLq8N565hpX5eK3D6BZ6xC4efxWICOepArVIjNNr+iRJklQPTPokSZIKwNO7kiRJBbjW0kqfJElSAVjpkyRJ8kYOSZIk1QMrfZIkSfVf6LPSJ0mSVARW+iRJUuH1KEAZrABDlCRJkpU+SZJUeAV4TJ+VPkmSpCKw0idJkgrPSp8kSZLqgpU+SZJUeFGAUp+VPkmSpC4mIg6LiCciYm5EnNVBmw9HxGMR8WhEXLOhPq30SZKkwutKhb6IaAAuBZqBBcADEXFTZj5W1qYJ+ArwgcxcFhHv2VC/VvokSZK6llHA3Mx8KjOXA9cBR7dr88/ApZm5DCAzn9tQpyZ9kiRJXcu2wPyy5QWldeX+FvjbiLg3In4fEYdtqFNP70qSpMKr5undiBgPjC9bNTEzJ25kNz2BJuBAYDtgWkTsnpkvrm8HSZIkVUkpwVtfkvcsMLRsebvSunILgBmZuQJ4OiL+RFsS+EBHnXp6V5IkFV70qN6rAg8ATRGxY0T0Bk4EbmrX5le0VfmIiG1oO9371Po6NemTJEnqQjLzLeBzwG3AH4FfZOajEXF+RBxVanYb8EJEPAbcBZyRmS+sr19P70qSpMLrSo9sAcjMW4Fb2607p+x9Al8ovSpipU+SJKkArPRJkqTC69HFKn2dwUqfJElSAVjpkyRJhdfVrunrDFb6JEmSCsBKnyRJKjwrfZIkSaoLVvokSVLhRQFKfVb6tJZp0x5k7NhTaW4ez8SJk9bavnz5Ck477Ts0N4/nhBO+yIIFS1Zvu/zySTQ3j2fs2FOZPn1WNcNWF9Z8wAgevvMC5kz9Hl/89AfX2j50221oueYrzPjtt/jtdWezbeOA1du+ftaJPHD7t3ng9m9z3JH7VDPsmpg+7UEOG/tpDm0ez8SJ16+1ffnyFZx+2nc5tHk8Hz7hS2sdf4c2j+ewsZ/2+JO0lk5P+iIiI+LqsuWeEfE/EXFL2boPRcTsiPhjRMyJiA+VbbsyIp6OiEci4k8RcVVEbFe2fV5pn4dLr0vK9jt+A7FtHhFTI6Jh0466MhGxe0Rc
WYvP7khrayvnn38ZV1xxLi0tl3LLLdOYO/eZNdpMmnQ7ffv2YfLkiZxyytFceOGVAMyd+wwtLdNoabmUK644l/PO+xGtra01GIW6kh49gglfP4UPffy7vP+QMznhqH3ZpWnbNdp86+yTueaX97D3YV/hW5fcyHlf/ggAh40ZycjhO7DP4f/KAUd/jdPGH8FWfTavwSiqo+34u5wfX/E1bmm5lJZ1HH/XT5pM3759uH3yRD5+ylF878KfAm3H360t07ml5VKuuOJrnH/eZR5/0kboYr97t1NU46NfA4ZHxKq/qZuBZ1dtjIg9gAuBozNzV+Ao4MKIGFHWxxmZuQewM/AQcGfpFxCvclBmjiy9/t9GxPZPwA2ZWfW/GSOiZ2bOAbaLiO2r/fkdmT37SYYNG8zQoY307t2LceP2Z8qUGWu0ufPOGRxzzMEAjB37Ae677xEykylTZjBu3P707t2LoUMbGTZsMLNnP1mLYagL2Wvke/nzvCXMm/8/rFjRyvU3/54jm/dco80uTdty9+8eBWDq7x5bvX2Xpm259/7HaW1dyetv/JU/PD6f5gNGrPUZ9WL27CfZvuz4O2Lc6LWOvyl3zuBDx4wB1j7+jhg3mt69e7Hd0Ea29/iT1E618s1bgXGl9ycB15Zt+xLw75n5NEDpz28BZ7TvJNtMABYDh2+CuD4K/BogIg6MiGkR0RIRT0TEZRFt+XhE/CgiZkbEoxFx3qqdS1XG75YqjfdHxE6l9e+OiF9GxAOl1wdK68+NiJ9FxL3Az0rd3AycuAnGskksWfICjY3brF4eNGggS5a8sFabwYPb2vTs2cBWW23JsmUvr2PfbdbaV8UzpHEAzy56+3vw7KKlDGnsv0abOX98hqMP+3sAjj5sL/putTkDtu7DnMeeofmAPdh8s94M7N+H/ffdje2GDKxq/NW0ZMkLDC47hhrXcQw9t47j78Vlr6xj37WPXUnFVq2k7zrgxIjYDBgBlP/T9e+AB9u1n1la35FZwC5ly3eVnd49vZKASpXCv8nMeWWrRwGfB3YD3gscW1p/dmbuVYr9gHZVyJcyc3fgh8D3S+suBiZk5t8DxwFXlLXfDTgkM08qG+voDmIcX0o2Z06c+PNKhiV1S//6jf9m9D67ct+t32S/vXfl2UVLaV25kinT53DbXQ9z1w3ncuUPPseMWU/S2rqy1uFKqkMR1XvVSlXu3s3M2RGxA21Vvls3QZftp+ygzHx+I/vYBnix3br7M/MpgIi4FtgPuB74cESMp22+BtOWuM0u7XNt2Z8TSu8PAXYruxOob0T0Kb2/KTPfKPvM54Ah6wowMycCE9uW/pQbN7z/nUGDBrJ48dtTuWTJCwwaNHCtNosWPU9j4za89VYrr7zyGv37913Hvs+vta+KZ+HipWw7+O3vwbaDB7Bw8bI12ix67kVO+lTbv5m23OJdfOjwUbz08usAfPeHv+a7P/w1AP91yWeZ+/SiKkVefYMGDWRR2TG0eB3H0HvWcfxt3X+rdey79rErqdiqeTnhTbRdu3dtu/WPAXu2W7cn8Oh6+nof8Md3GM8bwGbt1rVPrDIidqTtFPTBmTkCaGm3X67jfQ9gn7LrDLfNzFdL215r9xmblWLpEnbfvYl58xYyf/5ili9fQUvLNMaMGbVGmzFj9ubGG6cAcNtt97LPPiOICMaMGUVLyzSWL1/B/PmLmTdvISNGNNViGOpCHnzkKXbasZFhQ99Nr14NHP/BfWiZvGZxf2D/Pqsfl3DGZ4/iql/cDbTdBDJg67Z/Lw3fZSjDdxnKHdPmVDX+atp99yb+Mm8hC0rH360t0xkzZu812owZM4pf3Xgn0P7425tbW6azfPkKFsxfzF88/qSNYqVv0/oJ8GJmzomIA8vWXwhMiog7M3NeqSL4r8Bad95G2/8VPk9bte237ySYzFwWEQ0RsVlmvllaPaqU5P0F+AhtVba+tCVqL0XEINquJby7rKuPAN8u/Xlfad3tpTgvKMU9
MjMf7iCUvwX+8E7Gsin17NnAOeecyic/+TVaW1dy3HGH0NQ0jIsvvprhw5s4+OC9Of74Zs444yKam8fTr18fJkw4E4CmpmEcfvh+HHHEZ2hoaOunoaEmN0arC2ltXckXzrmSm676Mg0NPbjqF1P545PP8tUvHMes2U/TcscsRu+7G+ef+REyk3vvf5zTvnolAL169WTy9ecA8Morb/CJ035U16d3e/Zs4KvnfIpPfPJcVq4+/rbnkov/m+HDd2JM6fg784yLOLR5PP36bcVFE9ouf25q2p7DD9+PcUd81uNP0jpFZueeNYyIVzOzT7t1BwJfyswjS8vHAucBvYAVwNcy84bStiuBA4CXgS2A3wNfycwFpe3zgFeAVXfgzs7Mj5X2+yBvV9HmZ+a+7eL4T+DazLyjFNP5pb52Au4CPpOZK0t9/R9gPvASbadoryx99s9pSwT/CpyUmXMjYhvgUmBX2hLraZl5akScC7yamReWxfBD4LbMvHn9M1md07vdWcTOdPb3uV5EBJtvf3Ktw+jy3njmGlbm47UOo1voEbt4/FUgIpynCpTmqao1sZH/Pb1qP5iHPzq6JvW+Tk/6urKIeD9wemb+Y/tEtML95wF7/S+uJ1y1/7uAqcB+mfnW+lub9G2ISV/lTPoqY9JXOZO+ypj0Vcakr3MU+tewZeasiLirVg9nBrYHztpwwidJkjpTj/r/LWzFTvoAMvMnpbd3s+a1epXsu8M7/OwnAZ+eKkmSOl3hkz5JkqRa3lVbLTX8DXCSJEmqFit9kiSp8Kz0SZIkqS5Y6ZMkSYUXBbh910qfJElSAVjpkyRJhec1fZIkSaoLVvokSVLhWemTJElSXTDpkyRJKgBP70qSpMLz9K4kSZLqgpU+SZJUeAV4NrOVPkmSpCKw0idJkgrPa/okSZJUF6z0SZKkwosClMEKMERJkiRVXOmLiHdl5l87MxhJkqRa8Jo+ICJGRcQc4MnS8h4R8YNOj0ySJEmbTCWVvkuAI4FfAWTmIxFxUKdGJUmSVEVRgFJfJdf09cjMv7Rb19oZwUiSJKlzVFLpmx8Ro4CMiAbg88CfOjcsSZKk6ilAoa+iSt+ngS8A2wNLgH1K6yRJktRNbLDSl5nPASdWIRZJkqSaKEKlb4NJX0T8GMj26zNzfKdEJEmSpE2ukmv67ih7vxlwDDC/c8KRJElSZ6jk9O7Py5cj4mfAPZ0WkSRJUpV5enfddgQGbepAtH4RO9c6hG6hCM9Z2lTeeOaaWofQLfSIXWodQrfh8VcZ50m1Usk1fct4+5q+HsBS4KzODEpry3yi1iF0eRE7O08Vcq4qE7Ezmw31PrZKvDn/OjLXuvxb7USE81SBWiTGPQqQi6836Yu2Wd8DeLa0amX6bZUkSep21pv0ZWZGxK2ZObxaAUmSJFVbESp9lTyc+eGIeF+nRyJJkqRO02GlLyJ6ZuZbwPuAByLiz8BrQNBWBHx/lWKUJEnqVD2i/q9eW9/p3fuB9wNHVSkWSZIkdZL1JX0BkJl/rlIskiRJNVGEa/rWl/S9Ox/rkqgAACAASURBVCK+0NHGzLyoE+KRJElSJ1hf0tcA9KFU8ZMkSapXldzZ2t2tL+lblJnnVy0SSZIkdZoNXtMnSZJU74pw9+76qpkHVy0KSZIkdaoOK32ZubSagUiSJNVKEe7eLcJ1i5IkSYVn0idJklQA67uRQ5IkqRCKUAUrwhglSZIKz0qfJEkqPG/kkCRJUl2w0idJkgovCv5wZkmSJNUJK32SJKnwvKZPkiRJdcFKnyRJKrwiVMGKMEZJkqTCs9InSZIKr4d370qSJKkeWOmTJEmF5927kiRJqgtW+iRJUuEVoQpWhDFKkiQVnkmfJElSAZj0aS3Tpj3I2LGn0tw8nokTJ621ffnyFZx22ndobh7PCSd8kQULlqzedvnlk2huHs/YsacyffqsaoZdE85VZZynylx2waf4y6zLmDn5ux22+d55H+cP0yZw/23fYeTw
HVav/+jx+zNn6kXMmXoRHz1+/ypEK9WXHlG9V83G2FkdR0RGxNVlyz0j4n8i4paydR+KiNkR8ceImBMRHyrbdmVEPB0Rj0TEnyLiqojYrmz7vNI+D5del5Ttd/wGYts8IqZGRMOmHfXGiYjPRcQ/1TKG9lpbWzn//Mu44opzaWm5lFtumcbcuc+s0WbSpNvp27cPkydP5JRTjubCC68EYO7cZ2hpmUZLy6VcccW5nHfej2htba3BKKrDuaqM81S5n02aytEf+3aH28ceNJL37tDI8P1P53Nn/ZhLvvkJAPr325KzTzuW/Y/6KqOP+ipnn3YsW/fbslphS+omOrPS9xowPCI2Ly03A8+u2hgRewAXAkdn5q7AUcCFETGirI8zMnMPYGfgIeDOiOhdtv2gzBxZev2/jYjtn4AbMrPW//f4CfD5Gsewhtmzn2TYsMEMHdpI7969GDduf6ZMmbFGmzvvnMExxxwMwNixH+C++x4hM5kyZQbjxu1P7969GDq0kWHDBjN79pO1GEZVOFeVcZ4qd+/9j7P0xVc73H7koXtyzS+nA3D/Q3Pp13cLGt+zNc0H7MGU6XNY9tJrvPjSa0yZPodDD9ijWmFLdaFHZNVeNRtjJ/d/KzCu9P4k4NqybV8C/j0znwYo/fkt4Iz2nWSbCcBi4PBNENdHgV+vWoiIL5eqho9ExLdL60ZGxO9LlcgbI6J/af3dETEhImaWKpR/HxE3RMSTEfGNUpsdIuLxiPjvUpvrI2KLdYzrdWBeRIzaBGPaJJYseYHGxm1WLw8aNJAlS15Yq83gwW1tevZsYKuttmTZspfXse82a+1bT5yryjhPm86QxgEsWPT2+J9dvJQhjQMY0tifBQuXvr1+0VKGNPavRYiSurDOTvquA06MiM2AEUD5P+//DniwXfuZpfUdmQXsUrZ8V9np3dMrCahUKfybzJxXWj4cOBrYu1RVXHUxzVXAlzNzBDAH+FpZN8szcy/gMtqSx88Cw4FTImJgqc3OwH+UqpgvA5/pIKSZwOgOYh1fSi5nTpz480qGJ0mS/he8pu8dyszZwA60Vflu3QRdtp+q8tO7EyrsYxvgxbLlQ4D/KlXdyMylEdEP2Dozp5ba/BQovzL6ptKfc4BHM3NRZv4VeAoYWto2PzPvLb2/Gtivg3ieA4asa0NmTszMvTJzr/HjP1Lh8N6ZQYMGsnjx86uXlyx5gUGDBq7VZtGitjZvvdXKK6+8Rv/+fdex7/Nr7VtPnKvKOE+bzsLFS9lu8Nvj37ZxAAsXL2Xh4mVsN2TA2+sHD2Dh4mW1CFFSF1aNu3dvou3avWvbrX8M2LPduj2BR9fT1/uAP77DeN4ANnuHffy19OfKsverllc98Lr9SfuOTuJvVoqpS9h99ybmzVvI/PmLWb58BS0t0xgzZs2zz2PG7M2NN04B4Lbb7mWffUYQEYwZM4qWlmksX76C+fMXM2/eQkaMaKrFMKrCuaqM87TptEyexcnHtZ0YGPW+nXj5lddZ/NyLTJ76CIeMHsHW/bZk635bcsjoEUye+kiNo5W6lx5VfNVKNX4jx0+AFzNzTkQcWLb+QmBSRNyZmfMiYgfgX4G17ryNiKDthofBwG/fSTCZuSwiGiJis8x8E5gMnBMR/52Zr0fEgFK1b1lEjM7M6cA/AlPX3/Nato+IfTPzPuBk4J4O2v0tcG8H26quZ88GzjnnVD75ya/R2rqS4447hKamYVx88dUMH97EwQfvzfHHN3PGGRfR3Dyefv36MGHCmQA0NQ3j8MP344gjPkNDQ1s/DQ01vUG6UzlXlXGeKvfTH3ye0fvuyjb9t2LujB/y9Yuup1evtr+mr7j6Dn5750OMPWgkj07/Pq+/8Vc+9aXLAVj20mt865IbuefmbwDw7xffwLKXXqvZOCR1TZHZOXeRRMSrmdmn3boDgS9l5pGl5WOB84BewArga5l5Q2nblcABtF0PtwXwe+ArmbmgtH0e8Aqw6g7c2Zn5sdJ+
H+Tt6tn8zNy3XRz/CVybmXeUls8CPgYsB27NzH+NiJG0XbO3BW2nbf9vKWG8uzSGmesYz9203aDyPG3J6UzaqpePAf9YSirPB2Zm5k2lfWYBzZm5gavT/1S72326iYidyXyi1mF0C85VZSJ2ZrOhJ9Y6jG7hzfnX0Vn/P6knEeE8VaA0T1W9+u3Ue++q2g/msg8cVJMr+zot6evKIuL9wOmZ+Y+d1P8OwC2ZOXwD7d4HfKGyOEz6NsREpnLOVWVM+ipn0lcZk77KmPR1jmqc3u1yMnNWRNwVEQ01flbfNsBXa/j5kiSJ2t5VWy2FTPoAMvMnndj3PNoe4bKhdpM7KwZJkqRyhU36JEmSVilCpa+Wdw5LkiSpSqz0SZKkwitCFawIY5QkSSo8kz5JkqQC8PSuJEkqvB5R/89PtNInSZLUxUTEYRHxRETMLf3msI7aHRcRGRF7bahPK32SJKnwutIjWyKiAbgUaAYWAA9ExE2Z+Vi7dlsB/wLMqKRfK32SJEldyyhgbmY+lZnLgeuAo9fR7uvAd4A3K+nUpE+SJBVejyq+ImJ8RMwse41vF862wPyy5QWldatFxPuBoZnZUukYPb0rSZJURZk5EZj4v90/InoAFwGnbMx+Jn2SJKnwutI1fcCzwNCy5e1K61bZChgO3B0RAI3ATRFxVGbO7KhTT+9KkiR1LQ8ATRGxY0T0Bk4Eblq1MTNfysxtMnOHzNwB+D2w3oQPrPRJkiQRXeg5fZn5VkR8DrgNaAB+kpmPRsT5wMzMvGn9PaybSZ8kSVIXk5m3Are2W3dOB20PrKRPkz5JklR4Xeyavk7hNX2SJEkFYKVPkiQVXhGqYEUYoyRJUuFZ6ZMkSYXXowvdvdtZrPRJkiQVgEmfJElSAXh6V5IkFZ6PbJEkSVJdsNInSZIKz0qfJEmS6oKVPkmSVHgNtQ6gCqz0SZIkFYCVPkmSVHhFeDizSV83EbFzrUPoFpynyjlXlXlz/nW1DqHbiCjAlfCbgPOkWjHp6yYy6/9fIO9URLAyH6t1GN1Cj9jN71QFIsJ5qlBEsMX2/1DrMLq815+52u9UBWqRGHv3riRJkuqClT5JklR4VvokSZJUF6z0SZKkwmuw0idJkqR6YKVPkiQVntf0SZIkqS6Y9EmSJBWAp3clSVLhFeHXsFnpkyRJKgArfZIkqfC8kUOSJEl1wUqfJEkqvIZaB1AFVvokSZIKwEqfJEkqPK/pkyRJUl2w0idJkgrP5/RJkiSpLljpkyRJhdfgNX2SJEmqB1b6JElS4Xn3riRJkuqClT5JklR4VvokSZJUF0z6JEmSCsDTu5IkqfA8vStJkqS6YKVPkiQVXoO/hk2SJEn1wEqfJEkqvCJUwYowRkmSpMKz0idJkgrPu3clrdf0abM4bOxnOLT5VCZO/OVa25cvX8Hpp13Aoc2n8uETzmDBgiUALFv2Mh/7x3/j/e87kfPPn1jtsKVur/mA3Xnozu8ye+qFfPHTR661fei2A2m55ixm/Pab/Oa6f2VIY//V275+1kd44PZv8cDt3+K4I/euZthSTRU26YuIjIiry5Z7RsT/RMQtpeVTSssPl732KHu/NCKeLr2/IyJ2iIg3SsuPRcRVEdGr1NeBq/otLR8eETNL7R6KiO9Vfwb0TrW2tnL++Zfz4yvO4ZaWH9Byy3Tmzp2/RpvrJ02mb98+3D75Mj5+ylF878KrAHjXu3rzL/9yMmeeeUoNIpe6tx49gou+/nGO+fgF7HnIlznhqH3ZpWnIGm3+/eyTueaX97D3YWfz7Ut+xflf/jAAY8fswcjhO7DP4WdzwNHn8i/jj2CrPpvVYhjqYnpE9V41G2PtPrrmXgOGR8TmpeVm4Nl2bX6emSPLXo+seg/cBJxRWj6k1P7PpW27A9sBH27/oRExHPgh8A+ZuRuwFzB30w9PnW327CfZfthghg5tpHfvXhwxbj+mTJmxRpspd97Ph445CICxY/8P9903m8xkiy02Y8+9
dqP3u3rVInSpW9tr5Ht5at4S5s3/H1asaOX6m3/Pkc17rtFml6Yh3P27xwCY+rvHGFfavmvTttxz/+O0tq7k9Tf+yh8en0/zASOqPgapFoqc9AHcCowrvT8JuHZTdJqZrcD9wLbr2Hwm8M3MfHxV28z80ab4XFXXkiVLGdy4zerlxkEDWbJk6RptnluylMGD29r07NnAVlttwYvLXqlqnFK9GdLYnwWL3j7Wnl20lMFlp28B/vDHZzj6sL0AOOqwvei71eYM2LoPcx57huYDRrD5Zr0Z2L8P+++7K9sNGVjV+NU1NURW7VUrRU/6rgNOjIjNgBHAjHbbP9Lu9O7ma3extlJ/ewO/Xcfm4cCDFfYzvnQaeObEiV73JUmV+so3rmW/fXbhd7d+ndF778Kzi5bSunIlU6b/gdvueoQ7bziHK3/wWe6fNZfW1pW1DleqikLfvZuZsyNiB9qqfLeuo8nPM/NzG9HleyPiYWBHoCUzZ7/D+CYCq7K9+n9UeDczaNAAFi1+fvXy4iUvMGjQgDXavGfQABYtep7Gxm14661WXnnldbbuv1W1Q5XqysLFy9hu8NvH2raDB7Bo8bI12ix+7kVO/tQlAGy5xbs4+vC/56WXXwfggh/exAU/vAmA/7rk08x9enGVIldX5t27xXATcCGb5tTuqmv63gvsGRFHraPNo8Ce61ivbmb33Zv4y7xFLJi/hOXLV3Bryz2MGTNqjTZjxoziVzfeBcBtt/2OffbZnYgC/M0idaIHH3mK9+7YyLCh76ZXrwaO/+A+tEyetUabgf37rD7WvvTZD3LVL6YCbTeBDNi6DwDDdxnK8F22545pc6o7AKlGCl3pK/kJ8GJmzomIAzdFh5n5fEScBXyFtqSy3AXADRFxT2b+KSJ6AOMz87JN8dmqnp49G/jqOf/MJz55HitbWznuuENoatqeSy6+huHDd2LMwaM4/vhDOPOM73No86n067cVF0344ur9x4z5Z1579Q1WrHiLKXfM4D9/ci477TS0hiOSuofW1pV88Zyr+PVVZ9DQ0IOrfjGNPz75LP/2hWOZNftpbr3jIUbvuyvnnflhMpN773+C07/6UwB69erJ7df/GwCvvPIGnzjtR57eFVCMSl9kFvOsYUS8mpl92q07EPhSZh4ZEafQlqCV39H7mcz8XantlcAtmXl9aXmH0vLw0nIADwOfAxpW9VvadiRwHrAFbadtb8nMMzcQcjF/UBshIliZj9U6jG6hR+xGUY/9jRERzlOFIoIttv+HWofR5b3+zNV+pypQOvaqmobd/MxvqvaD+eD2h9ckxSxs0tcN+YPaAJO+ypn0Vcakr3ImfZUx6auMSV/n8PSuJEkqvCKc3vVGDkmSpAKw0idJkgqvwUqfJEmS6oGVPkmSVHg9avjr0arFSp8kSVIBWOmTJEmFV4QqWBHGKEmSVHhW+iRJUuH5nD5JkiTVBSt9kiSp8HxOnyRJkuqClT5JklR4PqdPkiRJdcFKnyRJKjzv3pUkSVJdsNInSZIKz0qfJEmS6oJJnyRJUgF4eleSJBVeEapgRRijJElS4VnpkyRJhRfeyCFJkqR6YKVPkiQVXgEKfVb6JEmSisBKnyRJKjyv6ZMkSVJdsNInSZIKrwhVsCKMUZIkqfCs9EmSpMKLyFqH0Oms9EmSJBWAlb5uIopwW9Em0CN2q3UI3Ybfqco4T5V7/Zmrax1Ct+B3qmsqwk/FpK+byKz/svM7FRHOU4Wcq8o4T5VzrirjPFXGxLhzmPRJkqTCK0Ke6TV9kiRJBWDSJ0mSVACe3pUkSYVXgLO7VvokSZKKwEqfJEkqvB4FKPVZ6ZMkSSoAK32SJKnwClDos9InSZJUBCZ9kiSp8CKq96osnjgsIp6IiLkRcdY6tn8hIh6LiNkRMSUihm2oT5M+SZKkLiQiGoBLgcOB3YCTItb65fIPAXtl5gjgeuC7G+rXpE+SJBVeVPFVgVHA3Mx8KjOXA9cBR5c3yMy7MvP1
0uLvge021KlJnyRJUhVFxPiImFn2Gt+uybbA/LLlBaV1HfkE8JsNfa5370qSpMKr5t27mTkRmLgp+oqIfwD2Ag7YUFuTPkmSpK7lWWBo2fJ2pXVriIhDgLOBAzLzrxvq1KRPkiQVXhf7jRwPAE0RsSNtyd6JwMnlDSLifcDlwGGZ+VwlnXpNnyRJUheSmW8BnwNuA/4I/CIzH42I8yPiqFKzC4A+wKSIeDgibtpQv5GZnRa0Nil/UBsQEfh9roxzVRnnqXLOVWWcp8qU5qmqtbcnX7qlaj+Ypn5H1qSuaKVPkiSpAEz6JEmSCsAbOSRJUuFF1P9pdyt9kiRJBWClT5IkFV7XemJL57DSJ0mSVABW+iRJUuFFAUp9VvokSZIKwEqfJEkqvCJUwYowRkmSpMKz0idJkgrPa/okSZJUF6z0SZKkwitAoc9KnyRJUhGY9JWJiNaIeLjsdVZp/d0R8UzE22f8I+JXEfFq6f0OEfFGaZ/HIuKyiNijrJ+lEfF06f0dEfF4ROxe1tcZEXF59UcsSZKg7Zq+ar1qxdO7a3ojM0d2sO1F4APAPRGxNTC43fY/Z+bIiOgJ3Am8d1VfEXElcEtmXl9aPgz4j4jYHxgCnArstclHI0mSVGKlr3LXASeW3h8L3LCuRpn5FvA7YKeOOsrM3wKLgI8BE4BzM3PZJo1WkiRVLKr4qhWTvjVt3u707kfKtk0B9o+IBtqSv5+vq4OI2AI4GJizgc86Dfgm8O7M/FkHfY2PiJkRMXPixIkbPRhJkqRVPL27pvWd3m0F7qEt4ds8M+fFmifm3xsRDwMJ/Dozf7O+D8rMhRFxJ3DLetpMBFZle1nhGCRJktZi0rdxrgNuBM5dx7Y/rydh7MjK0kuSJNVQjwI8s8XTuxtnOvAt4NpaByJJkrQxrPStafPSKdpVfpuZZ61ayMwELqx+WJIkqTMVoNBHtOUx6gb8QW1AROD3uTLOVWWcp8o5V5VxnipTmqeq5mGLXr+5aj+YwVt8sCY5ppU+SZJUeBH1n4x7TZ8kSVIBWOmTJEmFV4Rr+qz0SZIkFYCVPkmSVHhRgFKflT5JkqQCsNInSZIKrwCFPit9kiRJRWClT5IkFV4RqmBFGKMkSVLhWemTJEmF5927kiRJqgsmfZIkSQXg6V1JkqQCPLTFSp8kSVIBWOmTJEmFF1b6JEmSVA+s9EmSpMKLqP86WP2PUJIkSVb6JEmSvHtXkiRJdcFKnyRJKjzv3pUkSVJdsNInSZJkpU+SJEn1wEqfJEkqPJ/TJ0mSpLpgpU+SJMlr+iRJklQPTPokSZIKwNO7kiSp8Hw4s7qS6GqviPhUrWMof2VmzWPoDvPkXDlPRZkr56n7zlVpnrSJmfTpnRhf6wC6Ceepcs5VZZynyjhPlSv8XEUV/6sVkz5JkqQC8Jo+SZKkAtTB6n+E6kwTax1AN+E8Vc65qozzVBnnqXLOVQFEZtY6BkmSpJp67a2pVUuItux5QE0u7LPSJ0mSVABe0ydJklTDu2qrxUqfJElSAVjpkzahiOiZmW/VOg5J0sYpwm/kMOmTNq37gffXOojuICK2A04ERgNDgDeAPwAtwG8yc2UNw+tSImIv1p6nyZm5rKaBqVuKiB8BX87Ml2sdi6rL07uqSEQ0RMQ2Zcu9I2J8RPyxlnF1QfX/T8VNICL+C/gJsBz4DnAS8BngDuAw4J6I2L92EXYNEfF/I2IW8BVgc+AJ4DlgP+COiPhpRGxfyxi7ioi4oPSrxNqv/1REfLsWMXVhTwEPRsTJtQ6ka+lRxVdt+MgWbVBEnAhcDrwGPAl8k7b/YT8AfD0zZ9UwvC4lIhYAF3W0PTM73FYkETE8M/+wnu29ge0zc24Vw+pyIuKzwE8y840Oto8EBmbmlOpG1vVExIPAXtnuf2oR0QOYnZnDaxNZ1xQR29L2d9U2wI+A1ZX1zLyhVnHV0utv3Vu1hGiLnh+oSYHA07uqxL8Be2bm3Ih4P3AfcHxm3lzj
uLqiBqAPVvzWKzP/EBENwFWZ+dF1bF8OFDrhA8jMSzew/eFqxdINvKt9wgeQmSsjwuOxncx8NiJaaPtH/Ad5O+lLoJBJn9f0SW2Wr6q4ZOasiHjShK9DizLz/FoH0R1kZmtEDIuI3qUkTxshIo7MzFtqHUcX8kZENGXmk+UrI6KJtusgVRIRf0dbdW8hMCozF9U4JFWJSZ8q8Z6I+ELZ8tbly56yXEP9/1Nx03oKuDcibqLt8gHA71SF/h4w6XvbOcBvIuIbwIOldXvRdj3kaTWLqmuaBJyWmbfXOhBVl0mfKvFjYKv1LOtt31r1JiJ2zMyny5aPLeq1Muvx59KrB36nNkpmfq3WMXQlmfmbiPgQcAbw+dLqR4HjMnNO7SLrkp414VtbEa4C8EYOaROKiFmZ+f7279e1rLdFxBaZ+Xqt4+iqImIz2u5u3o+2a67uAX6UmW/WNDB1S/5dtG5vtt5XtYRos4Z9vZFDXVNE/CIzP1x6/53M/HLZttsz89DaRdflRAfv17VceBGxL/CftN38sn1E7AF8KjM/U9vIupyrgFeAH5SWTwZ+BpxQs4i6mIi4mbaEeJ0y86gqhtPVbR0Rx3a0sbhnJOr/r2iTPlWiqex9M/DlsuV3VzmWri47eL+uZcH3gbHATQCZ+YjP51un4Zm5W9nyXRHxWM2i6ZourHUA3Ug/4EjWneUU9u7dIjDpUyXWl6yYyKzpb0o3JUTZe0rLO9YurK4rM+e3u5amtVaxdGGzImKfzPw9QETszf9v796D7a7KM45/nwSEUAgXr1MEIhdFzASoiFS8AAJyKYKOVVIvXDrjrbZIwULVKYpTRXCsIBXFqoCOxPEGUVC0oggKGiBckgjIEECpVouiCBFI8vSP3zpk5+Tscwk7e/322c9n5sz8Lvvs/Z5Mzj7vftda74LrK8fUKrav6nZP0r79jGUA3GP7+NpBtI2GYL+KJH0xGZtJ2pNmsv2scqzyNatqZO1zZMfx6MpDKhHr+oWkFwGWtDFwApBdXtb1fODHku4t59sDt0u6FbDtefVCa4fS9/G1wLbAt0svyL8B3k3zPrVnzfhaZvqPY8aYspAjJiTpB4w/V2b//kUzWEoiM5dmtdxvasfTNmVrv7OBA2k+VFwBnGD7/qqBtYykHca7b/uefsXSVpIuALaj2f/6hTQ96PYCTrV9ScXQWkfSPGBHYGfgVttXVA6pFR5ZtahvCdEmM19QJfFO0hfRQ5I+CXzc9lJJW9LsXrIK2AY42fbFVQOMgSVpa5qk5vERmmyBuIakJcC8sgPHpsCvgZ3yAWJdkj4BPA/4MfBy4Bu2P1A3qvqGIenL8G5MaLxVXjDMK73G9BLbby3HxwF32D5K0jOAbwFJ+jpI2pGm0rcPTTX5WuBE23dVDaxlJH0AOJamp+HIHyYDB9SKqYUetb0awPafJd2VhK+rlwK7l11xNgOuBoY+6RuGPn1J+mIyjhh13LkFW1Z6ra1zO7GDaDrfY/vXw/CGsh6+CPwn8KpyfjRNYvzCahG102tpqlbZrq67XSXdUo4F7FTOReY9jvao7VUAth/O3sTDI0lfTMj2cSPHkhZ3nsc6HiiTx+8D9gX+HkDSRmTRy1g2s/35jvMvSHpXtWjaawmwFZB5od09t3YAAyQJ8pimf+6bpC+mKpNAx/cW4BzgGTR7W/66XH85cFm1qNrrW5JOBRbQ/N96HXC5pG0AbP+uZnAt8iFgcZm39sjIxTQcXsun0yh+0pIgD6ks5IgpyfY960/SC2wvqh1Hm0haPs5t296xb8G0mKSlwKeAW4HVI9fH6003bMooRNqyxHp7bPXiviVEG8/YMws5op1GbW/U2XAYSLVhPJJ2A+aXrwdoWkhEYTsNqyfnYdvn1A6i5bbM1mKTI+lB1h61UTkfGd6dXSWw2OBS6YsJSXrZePdTbVibpDmsSfQeA3YA9rJ9d72o2kvSXGA3YNORa7YvqhdR+0j6KM2w7kLWHt5Ny5ZC0v3A
pXTZWiw7UKwh6RKaKShfAxbYvneCbxkKj62+qY+Vvj3Spy/aSdIFto+tHccgkHQtMJtmjtoC2z+XtDwVrbFJOg3Yjybpuxw4FLjG9mtqxtU2kr4/xmXbTsuWIlNPpqb0EX01zYr5TYEv0bxnDe082mFI+jK8G5MxpCu51sv/0mwD9XTgqcDPyeKX8bwG2B1YbPs4SU8HvlA5ptbJrjeTMv2XXvaQ7T8An5N0IU3idw5N8vfRqoHFBpWkLyZjZO/dMd9UM8S0RmnEPPIJ+n2SdgG2krS37Z9WDq+NVpQdFFZKmk3TkmS72kG1Tfk/dRpNU12Aq4DTyx/uaBwzciBpE9uPdJzvY/u6OmG1U9nzej7wEuAa4FW2r64bVV0ags8NGd6NCZVJv4voPlcmQ0xdSHoaTWPd+cD2tpPQdCjbQb2bptJwEvAn4Kb0glybpK/S9Oq7sFx6I82OCuPuljNMOod3Rw/1Zuh3bZLuFF98yQAAC3RJREFUpllYtgC4EljZeX9YP8ivXH1z3xKijWbsnjl90U5phTB5kj5o+91d7u1g+55+xzQoygKY2bZvmeChQ0fSTbb3mOjaMOt8nxr9npX3sLVJ+gHdp50M7Qf5Vb6lbwnRTM3LnL5ot7KJ+c7l9E7bf64ZT0sdQlO5WkcSvrGVNhsvpvkjdA2QpG9dKyS92PY1AJL2BVZUjqlt3OV4rPOhZnu/2jFEHUn6YjJOkfRhmi3F7qEZ5t1O0ueA99h+rGp07TJT0tZ0n/84tCvjxlKGd3em2W8X4C2SDrT9DxXDaqO3AReWuX0AvweOrRdOKz1T0jk0v3sjx5TzbeuF1T6S/sX2meX4b21/ueNe19GK6W9G7QA2uAzvxoQkfQzYHDjR9oPl2mzgIzQT8U+oGV+bSHqEZt/dbvMfs8NEB0m3Ac91eSOSNANYajvbRI2h/N5h+4+1Y2kbSceMd9/2hePdHyaZ/zi2VV7Sx+HduRnejdY6HHi2Oz4h2P6jpLcBtwFJ+tZYlrlDU3InsD1NBRmalbt31gunnSR9EDjT9gPlfGvgJNvvrRtZeySpmxJ1OR7rfGgMw+rdJH0xGfYYJWHbqySlVBxPxBbAzyT9lGbe1d7A9SNb/WWLv8cd2jnkZvv3kg4DkvQVo7eHHC3/l9aS+Y9DKklfTMYySW8avTWWpDfQVPpijbPHulgWwRzROXcmAPi32gEMiJmdveckzQI2qRxT2/w18Aua+aE/YYgrVpOwu6Q/0vwbzSrHlPNNu3/bdDf9/8tkTl9MSNK2NHs0rgBuKJf3AmbRNPS8r1ZsbSZpJvAKmh59BwNXZ3uxhiSNVT2e6mOGhaRTgCOAz5VLxwELRybjx+O/bwfR/L7NAy4DLra9tGpgMTBWe1nf3m9maLf06Yt2k3QA8Lxyusz292rG01aSXgb8HXAY8FNgX2BH2w9XDaxFSp+wrwKXdm72LulJNO1bjgG+b/uCKgG2kKRDgAPL6XdtX1EznjaTtAlN8ncW8H7b51YOKQaA+VnfEiLx3CR9EYNO0i+Be4HzgEtsPyhpue1nVQ6tVcpw9/HA64Fn0ewOMIumZ8J3gE/YXlwvwnZIRXRqSrJ3OE3CNwdYCHw2oxExGUn6ImJKSnubo2i2zPoicClwa1q1dCdpY+ApNO1/HqgdT5ukIjp5ki4C5gKXAwtsL6kcUgwYc3sfk77nJOmLmA4kCdiPptpwGLAlTWPry23/qWJoMWBSEZ08SauBh8pp5x820XQgmN3/qGKQtC3pK1M6zgZmAv9l+4xR9zcBLgKeD9wPvM723eM+Z5K+iA2nVLEOAY4GXmH7KZVDigGVimjEhnZHHxOiZ4+b9JWFSXfQLE76JbAImG97Wcdj3g7Ms/1WSUfTLKx83XjPO/33HImoRNJTga1sf8P262kaD0esF9uP2f5VEr6IobA3zR73d9l+FFgAHDnqMUcCI03JvwK8vIw0dZU+fRE9VH7hTgPeQVOSR9JK4OO2T68Z26CQ
9CPb+9aOIyKGzfjVt16S9GbgzR2Xzrd9fsf5tjR9J0f8EnjhqKd5/DG2V0r6A/Bk4P+6vW6SvojeOpGmRcsLbC8HkLQjcJ6kE23/R9XoBsP2tQOIiNiQSoJ3/oQP7LEM70b01htp5l0sH7lg+y7gDcCbqkU1WDLReBIk/ah2DBGxwdzH2lOCnlmujfkYSRvRLBq8f7wnTaUvorc2tr1Oad32b8tE/AAkvbrbLZrVqTGxVEQjpq9FwC6SnkWT3B1N0/S/00Katk3XAq8BrpyoZ2eSvojeenQ97w2bI8a5982+RTHYUhGNmKbKHL13AFfQzA//rO2lkk4Hrre9EPgM8HlJdwK/o0kMx5WWLRE9JGkVa3qFrXUL2NR2qn0xaRNURD9p+6n9jCciBlsqfRE9ZHtm7RgGgaSP2X5nOT7B9tkd9y6wfWy14NolFdGI6JlU+iKi7yTdaPuvRh+PdR4REb2R1bsRUYO6HEeHspfzyPEJo+5d0PeAImKgJemLiBpmSNpa0pM7jreRtA2lqXUA8NKO42NG3ZvXz0AiYvBlTl9E1LAlcANrqnw3dtzLnJM1UhGNiJ5J0hcRfWd7Tu0YBsQMSVvTjMqMHI8kf6mIRsSUZCFHRFRROsgfCuxaLi0DrrC9sl5U7SLpbmA1Y1f5bHvH/kYUEYMsSV9E9J2kbYErgV8Bi2mSmj2BZwD72/6fiuFFRExLSfoiou/KytObbH9s1PV/Ap5ve/SihaGVimhE9EqSvojoO0m32d61y73bbT+n3zG1USqiEdFLWcgRETWsGOfew32Lov3+HTivS0X0Q6zbxiUioqskfRFRw5Zd9pUVMLvfwbTYPmNtSWf7HEm3V4gnIgZYkr6IqOEquu8r+8N+BtJyqYhGRM8k6YuIvrN9XO0YBkQqohHRM0n6IqIKSXOBdwHPK5eWAh+xfWu9qFonFdGI6Jms3o2IvpN0JPARmsUI15fLewH/Cpxs+9JasUVETFdJ+iKi7yTdDBxp++5R1+cAl9revUJYrZSKaET0yozaAUTEUNpodMIHUK5t3PdoWqpURL9OM8x7fPm6CvhauRcRMWmZ0xcRNayUtL3tezsvStoByE4Ta5wOHDQqQb5F0pXApeUrImJSkvRFRA2nAf8t6YPADeXaXsCpwCnVomqfrhVRSamIRsSUJOmLiL6zfYmk5cBJwD+Wy8uA19q+uV5krZOKaET0TBZyRES0lKSjgDOBMSuiti+pFVtEDJ4kfRHRd5IWjnff9iv7FUvbSdqdpiI6snp3Gc3q3VREI2JKkvRFRN9J+i3wC+Bi4Cc0O0w8zvZVNeKKiJjOkvRFRN9JmgkcBMwH5gGXARfbXlo1sJZJRTQieilJX0RUJWkTmuTvLOD9ts+tHFJrpCIaEb2UpC8iqijJ3uE0Cd8cYCHwWdv31YyrTVIRjYheStIXEX0n6SJgLnA5sMD2ksohtV4qohHxRCXpi4i+k7QaeKicdr4JCbDt2f2Pqp1SEY2IXknSFxHRUqmIRkQvJemLiGipVEQjopeS9EVEREQMgRm1A4iIiIiIDS9JX0RERMQQSNIXERERMQSS9EXEwJO0StJNkpZI+rKkzZ7Ac+0n6Zvl+JWSTh3nsVtJevt6vMb7JJ28vjFGRKyPJH0RMR2ssL2H7bnAo8BbO2+qMeX3O9sLbZ8xzkO2Aqac9EVE1JCkLyKmm6uBnSXNkXR76XW3BNhO0sGSrpV0Y6kIbg4g6RBJt0m6EXj1yBNJOlbSueX46ZK+Lunm8vUi4Axgp1JlPKs87l2SFkm6RdL7O57rPZLukHQN8Jy+/WtERBQb1Q4gIqJXJG0EHAp8u1zaBTjG9nWSngK8FzjQ9kOSTgH+WdKZwKeBA4A7gS91efpzgKtsv6rsibs5cCow1/Ye5fUPLq+5N00vvYWSXkrTa+9oYA+a990bgRt6+9NHRIwvSV9ETAezJN1Ujq8GPgP8JXCP7evK9X2A3YAfSQJ4EnAtsCuw3PbPASR9
AXjzGK9xAPAmANurgD9I2nrUYw4uX4vL+eY0SeAWwNdtP1xeY+ET+mkjItZDkr6ImA5WjFTbRpTE7qHOS8B3bc8f9bi1vu8JEvAh258a9Rrv7OFrRESsl8zpi4hhcR2wr6SdAST9haRnA7cBcyTtVB43v8v3fw94W/nemZK2BB6kqeKNuAI4vmOu4LaSngb8EDhK0ixJWwBH9Phni4iYUJK+iBgKtn8LHAtcLOkWytCu7T/TDOdeVhZy/KbLU5wA7C/pVpr5eLvZvp9muHiJpLNsfwf4InBtedxXgC1s30gzV/Bm4FvAog32g0ZEdJG9dyMiIiKGQCp9EREREUMgSV9ERETEEEjSFxERETEEkvRFREREDIEkfRERERFDIElfRERExBBI0hcRERExBP4fHf+FrQd43JcAAAAASUVORK5CYII=\n", + "text/plain": [ + "<Figure size 720x720 with 2 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoMAAAJfCAYAAAAEktzaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOzdd3xUVfrH8c+TRk8CgSQIiZREkSYqzYJClCIWQFBRdy2r4NrLqlhRURcLxYYF1J9lXRsioqCAIIIKCCJFioLUAEnooZpk5vz+mCEkocXVzATu9/16zStz733Ovc+ZDMOZ59x7Y845RERERMSbIsKdgIiIiIiEjwaDIiIiIh6mwaCIiIiIh2kwKCIiIuJhGgyKiIiIeFhUuBMQERERCbdKqZeF7PYqu1e/Z6E6VmmoMigiIiLiYRoMioiIiHiYpolFRETE88y8Wx/zbs9FRERERJVBEREREfNwfcy7PRcRERERVQZFREREdM6giIiIiHiSKoMiIiLieaoMioiIiIgnaTAoIiIi4mGaJhYRERHPMytXfy44pFQZFBEREfEwVQZFREREPFwf827PRURERESVQRERERHdWkZEREREPEmVQREREfE8VQZFRERExJNUGRQRERHPMw/Xx7zbcxERERFRZVBERERE5wyKiIiIiCepMigiIiKep8qgiIiIiHiSKoMiIiLieaoMioiIiIgnaTAoIiIi4mGaJhYRERHPMyzcKYSNKoMiIiIiHqbKoIiIiHieLiAREREREU9SZVBEREQ8T5VBEREREfEkVQZFRETE81QZFBERERFPUmVQRERExMP1Me/2XERERERUGRQRERHROYMiIiIi4kmqDIqIiIjnqTIoIiIiIp6kyqCIiIh4nnm4PubdnouIiIiIBoMiIiIiXqZpYhEREfE8XUAiIiIiIp6kyqCIiIh4npmFO4Ww0WDwCGFmLtw5iIiIhIpzzrujsxDTYPAIcexD48KdQrm36rGu5OZ9Fe40jgixMeewcfen4U6j3KtZqRsrcseEO40jQv3YC9ldMD3caZR7laJOxTl9tz+ccFTpdM6giIiIiHiSKoMiIiLiebrptIiIiIh4kiqDIiIi4nk6Z1BEREREPEmVQREREfE8VQZFRERExJNUGRQRERHP09XEIiIiIuJJqgyKiIiI6JxBEREREfEiDQZFREREPEzTxCIiIuJ5urWMiIiIiHiSKoMiIiLieWYW7hTCRpVBEREREQ9TZVBEREQ8TzedFhERERFPUmVQREREPE9XE4uIiIiIJ6kyKCIiIqKriUVERETEi1QZFBEREfFweczDXRcRERERVQZFREREdM6giIiIiHiRKoMiIiIiqgyKiI
iIiBepMigiIiLi4fKYh7suIiIiUj6ZWRcz+8XMlpnZvQfYPtTM5gYfv5rZ1iLbfEW2jTncsVQZFBERESlHzCwSGAZ0BDKBWWY2xjm3aG+Mc+6OIvG3ACcV2cVu51yL0h5Pg0HZz1lpNenf9QQizfhgTiYvT1u+X8x5TZK5vUM6DsfirO3cNnIeAD1b1OHmsxoC8OI3v/Hx3LUhzT3Uvv92IYOe/Ai/z9G952lcfV3nYtvz8vJ5+L63WLxoDXHxVRg46FqOqZNAQb6Pxx7+D0sWr8FX4OO8C9twTZ8uYepF2Zvx3RKefWoMPr+fC3q05sprM4pt/+nH5Tz39Bh+W7qeR5+6goyOzQH4dclannliFLt2/E5EpHHVdWdzTpdSf74dcWZ9v4RXBn2Kz+/n3O5tuPTq4q/Tgjm/8crgMSxftp77n7iCduecWLjt3NZ3Uy+tNgCJSfE8OvQfIc091L6bNp+nBr6L3+enR6+zuLbP+cW25+Xl88C9w1m8cCVx8VV5esiN1KlTi7VrN9Dj/PuoVy/wWjU7sSEPPXJ1GHog5Y0rXxeQtAaWOeeWA5jZ+0A3YNFB4i8DHv5fD6bBYJCZ1SUwCm9MYPr8c2Ac8FQwJA1YC+wG5gNvAHc5584vso83gc+dcyPNbApQOxgPgV9qLzN7BOgDbABigMecc++Vaef+gAiDAec34W9v/UBW7h7GXH8aE5fksGzDjsKYejUqc+OZDen52nRy9xSQUCUGgLhK0dzWPo0LXv0e5xyf//N0Ji7JJndPQbi6U6Z8Pj9PPf4Bw0bcSlJyPFde+hRndmhOg4a1C2M+HfU91WIrM/qLRxk/bjYvDPmEgYOv46sJc8jLK+CDTx5kz+48Lu42gM5dW3FMnYQw9qhs+Hx+Bv37E557tS+JSXFce/nztGvfhPoNkwpjkpPjefCxS/jvW98Ua1uxYgz9H+9NyrG12JCzjX9c9hxtTjuearGVQt2NMufz+Rn21CcMHNaXmklx3HLlc7Q9szHHNkgujKmVXJ1/PXIpI9/5Zr/2MRWiefm/d4Yy5bDx+fz8+/G3efW1e0hKqsHllz5C+w4n0TCtTmHMJx9PJTa2Cp+Pf4Yvxs3g2cEf8syQmwCom5LIh588Fq70RTCzvkDfIquGO+eGF1muA6wpspwJtDnIvo4F6gOTi6yuaGazgQLgSefc6EPlo3MGATMzYBQw2jmXDhwHVAXOcc61CJZaZwNXBJevLOWu98a3cM71KrJ+aHCf3YBXzSz6L+zOn9KibjyrNu9kzZbd5Pscny1YT6dGicVierdM4e2ZqwoHeZt25gGBiuK3v21k2+58cvcU8O1vG2mfXivkfQiVhQtWkpJai7opNYmOjqLTuafwzeR5xWK+mTyf87u1BeDsTifxw8xfcM6BwZ7dv1NQ4GPP73lER0dRpWrFcHSjzC36eTV1U2pSp24C0dFRnNOlBdOmLCwWU7tODdKOO4aIiOLfzFPr1SLl2MB7qFZiHNVrVGXrlh0cjX5ZuJpjUhKoHXyd2ndqwfRvir9OycfUoEH6/q+T1/y8YDkpqUnUTUkkOiaKLue2YcrkOcVivp48hwu7nwFAx06t+GHGosC/PZGDsdA9nHPDnXMtizyKDgT/qN7ASOecr8i6Y51zLYHLgWfNrOGhdqDBYEAGsMc5938AwRf0DuAfZla5rA7qnFsK7AKql9Ux/qikahVZt21P4fL63D0kxRYfpDRIqEL9mlUYeV1bPulzKmel1Qy0ja3IutxDtz2a5ORsJSl5368uMak6OTnbDhoTFRVJ1aqV2LZ1J+d0PJmKlSrQpcN9nN/xQf529TnExVUJaf6hsiEnl6Tk+MLlWolxbMjedogWB7ZowWry833USTn6qqcAm3K2UStp3+tUMzGejTmlf53y8gq4+e/PctvVz/P9lJ/LIsVyIyd7C8nJNQqXE5NrkJ2z5aAxUVGRVK1Wia1bA18k1q7dwCUXPcQ/rvw3c2b/ErrERUpvLZ
BSZLlucN2B9AaKzTA659YGfy4HplD8fML9aJo4oAnwY9EVzrlcM1tNYHp4/kHatTOzuUWWUwlML+/1rpntnSae6Jy7u2hjMzsZWOqcyznQzouWkWucdwvVTj63tP0pU5ERRv0alen9xkySYyvy4bVt6DLs23CndUT5ecFKIiMj+HLyQHJzd3HdVYNp3bYRdVNqhju1cmnjhlwGPPA+Dz5+KRER+g57IO989gA1E+NYn7mJfje8Qr20ZI6pq/dTSbVqxTN+0lDi46uyaOEKbr/leUaN+TdVqx59px7IH1S+Ku6zgHQzq09gENibQJWvGDNrRKCgNL3IuurALufc72ZWEzgdePpQB9On6p8zrcg0cAug5OXbRaeJiw4E7zCzhcBM4ImD7bxoGTlUA8Hs7Xs4Jm5fNa92bEWyi1T7ALJy9/DVLzkU+B2ZW3ezYtNO6tWoQnbuHo6JPXTbo0liYjzZWfuqETnZW0hMjDtoTEGBjx07dhMXX4Xx42Zx6umNiYqOpEZCNU5s0ZDFC1eFNP9QqZUYS3ZW4R0P2JCzjVpJcYdoUdzOHXu46+Y36HtLF5o2P7YsUiwXEhLj2JC973XamLOVmomlf532xtaum0DzUxry25Kj9+KtxKTqZGVtLlzOydpMUmL1g8YUFPjYsX038fFViYmJJj6+KgCNm9QnJSWRVSuzQpe8SCk45wqAm4HxwGLgQ+fcQjMbYGYXFgntDbzvip8DcQIw28zmAV8TOGfwYBeeABoM7rUIOKXoCjOLJVDpW1YGxxvqnGsC9AReN7NyM5c6b+026tWoQt34SkRHGhc0q83EJcULlxMWZ9O2XmD6pXrlaOonVGH1ll18s2wj7dJqElsxitiKUbRLq8k3yzaGoxsh0bjpsaxZncPazI3k5xcw4YsfObND82IxZ3ZozuefzgBg0oSfaNXmeMyMpNo1mP1DYHpq967f+Xn+CurVT9rvGEeDE5qkkLl6I+syN5OfX8BXX87ljLMal6ptfn4B997xFudecErhFcZHq+Mbp7B2zUay1m4iP7+AKRPm0vbMJqVquz13F3l5gXN4t23dycJ5K0ltcHS+nwCaNK3P6lXZZGZuID+vgC+/mMlZHYrPgrXvcBJjRgdmLCZOmEXrNidgZmzenIvP5wcgc00Oq1ZlUbfu0Xtus/wBZqF7lIJzbpxz7jjnXEPn3BPBdf2dc2OKxDzinLu3RLvvnXPNnHMnBn++frhjaZo4YBLwpJld6Zx7O3h/n8HAm865XWV1UOfcGDO7FrgKeLWsjvNH+PyO/mMX8faVrYiMMD6ck8nSDTu4IyOdBWu38dUvOYWDvok3t8PnHAPH/8LW3fkAPD/lN8Zcf1rw+TK2BdcfjaKiIrn7/ku55foX8fn8XNjjVBqmHcMrL37GCU2O5awOzel20Wn0v+9Nup/7MLFxlfn3M9cCcMllZ/Log+9wSbfHcM5xQfdTST++bph7VDaioiK5877u3HHDCHx+P+d3b02DtGRGDBtPoyZ1ade+CYt+XsN9d7zF9txdfPvNYl5/aQLvfnIXk8bPY+6c5eRu28m4MbMAeGDApRzXqM5hjnrkiYyK5Ka7e3D/LSPw+xydLmxFvYbJvPXKlxx3QgqnntWEXxauZsDdgddpxrRFvD18AiM+vJvVK3J4/t8jsQjD+R2XXtWh2FXIR5uoqEjue+Dv3NDnGfx+P917nElael2GvTCKJk3q0T7jZHr0PJMH+g3n/M53ExtfhacH3QjAnNm/MOyFUURHRWERxoMPX01csFIo4lWmq6sCzCwFeAloRKBiOo7ArWN+D26fElyeHVxuzx+7tcxG59w5wVvL7HDODQq2OQX4L3CCc85/sPzq9f9Cv6jDWPVYV3Lzvgp3GkeE2Jhz2Lj703CnUe7VrNSNFbmHvXm/APVjL2R3wfTDB3pcpahTdVVzKZgZzrmQnsSXnjEiZL+YpZP7lKsTFFUZDHLOrQEuOMT29iWWpxC4QqfouqsPFl
9k/SMlln8Ejv9DyYqIiIj8RTQYFBERESlfVxOHlC4gEREREfEwVQZFREREytffJg4pVQZFREREPEyVQRERERHvFgZVGRQRERHxMg0GRURERDxM08QiIiIiurWMiIiIiHiRKoMiIiIi3i0MqjIoIiIi4mWqDIqIiIjnOd10WkRERES8SJVBEREREV1NLCIiIiJepMqgiIiIiHcLg6oMioiIiHiZKoMiIiIiuppYRERERLxIlUERERERXU0sIiIiIl6kyqCIiIiIdwuDqgyKiIiIeJkGgyIiIiIepmliEREREd1aRkRERES8SJVBEREREVUGRURERMSLVBkUERER8XB5zMNdFxERERFVBkVEREQ8fM6gOefCnYOUgpnpFyUiIp7hnAvp6CztkndD9v/ssg+vKFcjT1UGjxB+tyTcKZR7EdaIiim9w53GEWHPmvf1niqFCGuEz/9zuNM4IkRGNGXr71+GO41yL75CF1SEOTwLR5WuXA3PQkvnDIqIiIh4mCqDIiIi4nkuwrulQVUGRURERDxMlUERERERD19NrMqgiIiIiIepMigiIiLi3cKgKoMiIiIiXqbBoIiIiIiHaZpYRERERLeWEREREREvUmVQRERERLeWEREREREvUmVQRERExLuFQVUGRURERLxMlUERERERXU0sIiIiIl6kyqCIiIiIKoMiIiIi4kWqDIqIiIjnOe8WBlUZFBEREfEyVQZFREREdM6giIiIiHiRKoMiIiIi+tvEIiIiIuJFGgyKiIiIeJimiUVERER0AYmIiIiIeJEqgyIiIiIeLo95uOsiIiIiosqgiIiIiG4tIyIiIiJepMGg7Gfa1B/p0vkGOnXsy/DhI/fbnpeXzx23P02njn255OK7yMzMLtz26qsf0aljX7p0voFp0+aEMu2w6HjWicz7ejA/Tx3KXTdeuN/2lGMS+PL9B5k+biA/jH+Kzh1aABAVFcmIITcwa8JT/DRpEHfd1C3UqYeU3lOlN23aHM7tcjOdO93IiOGj9tuel5fPHXcMonOnG7n0kn6szcwBYMuW7Vx1ZX9OOflyHhswItRph9z0bxfR64LHuajrAN56beJ+2/Py8rn/rv/joq4DuObywaxbuwmA/PwCBjz4Lpf1GMjlPZ/kx1lLQ526lFcRFrpHOXPUDgbNzGdmc81soZnNM7N/mVlEiZhnzWzt3vVmdk2wzVwzyzOzBcHnTxZpM9rMZpTYz/FmNiUYu9jMhgfXtzezbUX2OdfMLi3yPCt4/L3LMaF4bQ7F5/MxYMCrjHjtYT4fO4yxn09l2bLVxWJGfjSR2NiqTJg4nKuuvpDBg94CYNmy1YwbO43Pxw7jtdceZsCjr+Dz+cLRjZCIiDCeffwaul31FCedfRcXX3gajdLrFIvpd2sPPv58Bqd2vY8rb36e5x7/BwA9z2tDhZgoWnXqx2nn3c91l59Nat2a4ehGmdN7qvR8Ph+PDRjB8BEP8tnnzzF27DSWLVtTLGbkyK+Ii63K+AkvceVVFzBo8NsAVKgQza23Xcbd91wVjtRDyufz8/QTH/HcS//kg0/vZ/wXP7L8t/XFYsaMmkG12MqMGtefy/7enheHjgFg9MjvAXjvk/t4cfhNPPfMJ/j9/pD3QaQ8OWoHg8Bu51wL51wToCNwLvDw3o3BAWAPYA1wFoBz7v+CbVoA64AOweV7g23igVOAODNrUORYzwNDg7EnAC8U2TZt7z6Djw+KHOOVIu1aOOfyyurFKK3585eSemxtUlKSiYmJput57Zg0aWaxmEmTZ9K9RwYAnTufzvTp83DOMWnSTLqe146YmGjqpiSTemxt5s8/er91t2qRxm8rs1i5Oof8fB8ffTad8zu1LBbjnCO2WiUA4qpVZn32luB6qFy5ApGREVSqGENefgHbt+8OeR9CQe+p0ps/fxmpqUVeq65nMHnSD8ViJk+aRbfuHQDo3PlUZkxfgHOOypUrcsopJ1AhJjocqYfUwgWrqJtaizopNY
mOjqLTuScz9esFxWK++XoB513YGoCMji2YNfNXnHOs+C2Llm3SAaiRUI2qsZVZvHDNfscQ73FmIXuUN0fzYLCQcy4H6AvcbFb4W2gPLAReBi4r5a4uAj4D3gd6F1lfG8gscrwFHKGyszdRO3lfhSo5qSbZ2ZuKxeRkb6J27UBMVFQk1apVYeuW7Qdom7Bf26PJMcnVyVy3r39r12+iTlL1YjFPDP2Y3j3OYNnMF/nkrXu48+E3ARg1bia7dv3Oitkv8+uMF3h2+Ods2bYzlOmHjN5TpZeTvYnk2gmFy0nJCWRnby4Wk52zidrBmMBrVZmtW7eHNM9w25CzlaTk+MLlxKR4NmRvKxGzrTAmKiqSqlUrsm3rTtKPr8PUr3+moMDH2sxNLFm0huysLSHNX6S88cRgEMA5txyIBBKDqy4D3gM+Ac4zs9J8nd7b5j2KDyCHApPN7AszuyNYQdyrXYlp4oZ/ujNyxLjkwtP4z0dTSWtzMz2ueprXn70RM6NVi4b4fH4atLqRE06/jdv6nEe91MTD71BE/pQLerQlMSmeq3oPYuhTH9P8xPpERHjmv0I5lIgQPsqZcphS2Quem9cVGO2cywVmAp0P0yYJSAe+dc79CuSbWVMITC8DJwAfEag4zjCzCsGmJaeJf/sDefY1s9lmNnv48A/+YC//N0lJCazP2li4nJW9kaSkhGIxiUkJrF8fiCko8LF9+07iq1c7QNtN+7U9mqzL2kLdY/b1r07tBNZmF68wXNW7Ax9/Ph2AmXOWUrFCNDVrVOOSbqcz4Zt5FBT42LApl+mzf+WU5g04Guk9VXqJSQlkrd9X+czO2kRSUo1iMUmJCawPxgReq13Ex1cLaZ7hVisxnuysrYXLOdlbqZUUVyImrjCmoMDHjh17iIuvQlRUJHf2u4h3R/Zj0At92b59F6n1aoU0f5HyxjODweA5fj4gh8DALx5YYGYrgTM4/FTxJUB1YEWwTb2ibZxz65xzbzjnugEFQNM/m7NzbrhzrqVzrmXfvpf+2d2VSrNm6axauY7MNVnk5eUzbuw0MjLaFIvJyGjN6E8mAzB+/He0bdscMyMjow3jxk4jLy+fzDVZrFq5jubN00OSdzjMnvcbafWTOTalFtHRkVx8wamMnfhjsZg1azfS/vTAW+H4tGOoWCGGDZtyyVy3kfanNQGgcqUKtD45jV+WrQt5H0JB76nSa9YsjVWr1pOZmR14rcZ9S4eMVsViOmS04tPRXwMwfvx02rZthpXDc5DKUuOmqaxZtYG1mZvIzy9gwhdzaNe+WbGYM9s3ZeyYwPmWkyfOpWXrdMyMPbvz2L3rdwBmfr+EyMhIGjSsHfI+SDnk4auJPXHTaTOrReBijRedc87MLgOuc869F9xehcAgr7JzbtdBdnMZ0MU5Nz3Ypj7wFfCAmXUBJjnn8s0sGUgA1gKNyrZnf72oqEge6n891173CH6fn549zyE9PZXnn3uXpk3TyDi7Db16deSeu4fQqWNf4uKqMWTo3QCkp6dy7rlncF7Xm4iMjKR//38SGRkZ1v6UJZ/Pzx0Pvcln79xHZGQEb30whcW/ZvLQnb2Ys2AFYyf+yL2P/4eXnurDLdd1xTlHnztfBuCVtyYwfPA/+fGrZzCDdz78hp+XrD7MEY9Mek+VXlRUJA8+dB3XXTsAv9/PRT3PDrxWz79H06YNychoTa9eZ9Pvnufo3OlG4uKqMnjInYXtz864np07d5OfX8CkSTN57fWHSUtLCWOPykZUVCR339+LW//5En6fnwt6tKVhWm1efXEsJzRJ5cwOzbjwolN5+L53uKjrAGLjKvPE01cDsHnzdm7958tEmFErMY5HB/49vJ0RKQfMORfuHMqEmfmABUA0gUrdO8AQoCKBiz3qBaeI98aPAj5wzn0QXF4JtHTObTSzesB3QF1X5AUzsznADcClwHnAnuCmZ5xz/zGz9sCnwIoiqT3unBsZbP8IsMM5N+hw/XH8cnT+ov5CEd
aIiim9Dx8o7FnzPn63JNxplHsR1gif/+dwp3FEiIxoytbfvwx3GuVefIUuHK3/7/6VzAznXEhLaPXv+ixkv5gVgy4oV+XBo7Yy6Jw7WPlgF1Cj5Ern3EUllusVeb4SqFOiCc65k4NPZwJ3HmD7FCCu5Poi2x852DYRERGRUPDMOYMiIiIisr+jtjIoIiIiUmrl8MKOUFFlUERERMTDVBkUERER8W5hUJVBERERES9TZVBEREQ8z+mcQREREREpL8ysi5n9YmbLzOzeg8RcYmaLzGyhmf23yPqrzGxp8HHV4Y6lyqCIiIhIOaoMmlkkMAzoSOAPZcwyszHOuUVFYtKB+4DTnXNbzCwxuL4G8DDQEnDAj8G2Ww52PFUGRURERMqX1sAy59xy51we8D7QrURMH2DY3kGecy4nuL4zMNE5tzm4bSLQ5VAH02BQRERExCxkDzPra2azizz6lsimDrCmyHIm+/8ltOOA48zsOzObYWZd/kDbYjRNLCIiIhJCzrnhwPA/uZsoIB1oD9QFpppZs/91RyIiIiLeVr7mStcCKUWW6wbXFZUJzHTO5QMrzOxXAoPDtQQGiEXbTjnUwcpX10VERERkFpBuZvXNLAboDYwpETOa4KDPzGoSmDZeDowHOplZdTOrDnQKrjsoVQZFRERErPxcTeycKzCzmwkM4iKBN5xzC81sADDbOTeGfYO+RYAPuNs5twnAzB4jMKAEGOCc23yo42kwKCIiIlLOOOfGAeNKrOtf5LkD7gw+SrZ9A3ijtMfSYFBERESkHN1nMNR0zqCIiIiIh2kwKCIiIuJhmiYWERER0TSxiIiIiHiRKoMiIiLiea4c3Vom1FQZFBEREfEwVQZFREREPFwe83DXRURERESVQRERERGdMygiIiIiXqTKoIiIiIjuMygiIiIiXqTKoIiIiIgqgyIiIiLiRaoMioiIiHi3MKjB4JEiwhqFO4Ujwp4174c7hSOG3lOlExnRNNwpHDHiK3QJdwpHBPPwLUykfNJg8AhR4J8X7hTKvaiIE8na9Wm40zgiJFfuRv1rPwx3GuXeitcvYcHmz8KdxhGhWY0L8PkXhjuNci8yognOuXCnUe6FY8DsdM6giIiIiHiRBoMiIiIiHqZpYhEREREPn8upyqCIiIiIh6kyKCIiIqILSERERETEi1QZFBEREfFuYVCVQREREREvU2VQREREPC/Cw+UxD3ddRERERFQZFBEREc/z8G0GVRkUERER8TJVBkVERMTzVBkUEREREU9SZVBEREQ8zzxcGlRlUERERMTDVBkUERERz/NwYVCVQREREREv02BQRERExMM0TSwiIiKep2liEREREfEkVQZFRETE88zD5TEPd11EREREVBkUERERz9M5gyIiIiLiSaoMioiIiOdFqDIoIiIiIl6kyqCIiIh4ns4ZFBERERFPUmVQREREPE+VQRERERHxJFUGRURExPPMw6VBDQZlP9Om/cTAJ/4Pn99Pr15n06dvj2Lb8/LyubffCyxcuJz4+GoMGXIHdeom8v138xgy+F3y8wuIjo7irnv+Ttu2zcLUi9CY+d0Snn96DH6/n/N6tOZv/8gotn3uj8t54ZkxLF+6noefvIL2HZsDkLVuCw/c+RbO76egwE/Py06n28WnhqMLIXFmkyT6X3YSERHGh9OW88oXv+wX07VlXW67sAnOOZZkbuP2ETMBWDq8F79kbgNg3eZd9H3xu5DmHko/TV/CG2InaxoAACAASURBVENH4/f7OfvCNlx05dnFti/86Tf+b+inrPptPXc+9jdOzTixcNs7L37Oj98vBuDia87h9I4nhTT3UJs2bQ7/fuJ1/H4/vXqdQ5++PYttz8vLp1+/51i08Lfg59Rd1KmbyJYtudx+2zP8/PMyunfvwEP9+4apByLlR5kPBs3MAe865/4WXI4C1gMznXPnB9d1BwYA0UAB8JBzbnRw25vAWUAuUAmYAdzvnMsMbl8JbAd8wUNOdc7dGm
z3uXNu5CFyqwR8CWQ453wHiysrZtYM+Jdz7upQH/tgfD4fjw94ndfeeIikpBpcevF9dMhoSVpaSmHMxyMnExtblfETXmTc2O8YPPg/DBl6J/HVY3np5XtJTKrB0l9X0+e6x5kydXgYe1O2fD4/Qwd+wpBX+lIrKY6+VzzPGWc1oV7DpMKYpOR47h9wCe+//U2xtgm1qvHy2zcTExPFrl2/c3XPwZx+VmNqJsaFuhtlLsLg0StO5sohU8nasovRD57DV3PXsWz99sKYeolVuaFrIy5+cjK5u/JJqFahcNuePB/nD5gYjtRDyufzM2LQKPo/fz0JiXH0u+ZZWrVrQkr95MKYWknVufmh3oz575RibX/8bhHLf8lk8Nt3kp9fQP8bX+ak006gcpWKoe1EiPh8Ph4bMJzX33iEpKQELrn4HjpktC72OTVy5FfExVZh/ISXGTt2GoMGv83QoXdRoUIMt952GUuXrmbpr6vD2Aspb/S3icvWTqBpcOAF0BFYu3ejmZ0IDAK6OedOAC4EBplZ8yL7uNs5dyJwPPATMNnMYops7+CcaxF83PoHcvsHMCpMA8Eo59wCoK6ZpYb6+AezYP4yUlOTSUlJIiYmmnO7ns7kSbOLxUyeNIvu3c8CoFPntsyY/jPOORo3rk9iUg0A0tJT2PN7Hnl5+SHvQ6gs/nk1dVJqckzdBKKjozi7cwu+nbKwWEztOjVoeNwx+00/REdHERMT+C6Wn1eA37mQ5R1qJ9avwaqcHazZuJN8n+PzH9bQsUWdYjGXnlmfd77+jdxdgffLpu2/hyPVsFq2aDXJdRNIrhN4P53R8SRmTS3+fko8pgb10vd/P61ZkU3jkxoSGRVJxUoVODatNj9NXxLK9ENq/vylpKbWJiUlmZiYaLp2PYPJk34oFjN50g90694BgM6dT2PG9Pk456hcuSKnnNKYCjExB9q1iCeFahw8Djgv+Pwy4L0i2+4C/u2cWwEQ/DkQuLvkTlzAUCALOPcvyOsK4FMAM2tvZlPNbKyZ/WJmr5gFvieY2ctmNtvMFprZo3sbm9lKM3vazBaY2Q9mlhZcX8vMPjazWcHH6cH1j5jZO2b2HfBOcDefAb3/gr78JbKzN5NcO6FwOTm5BjnZm4rH5GwmuXZNAKKiIqlWrTJbt24vFjNh/AwaN25ATEx02ScdJhtzcklMji9crpUUx4acbaVun521lasvHkyvLk9w+dXtj8qqIEBy9Uqs37KrcHn9ll0kVa9ULKZ+UjXqJ1Xlw3s78PF9GZzZZF91tUJ0BJ8+eDYf35dBxxbHhCzvUNu8YRs1E/e9n2okxrFpQ+neT/XSj+Gn6Uv4fU8euVt38POPy9iUvbWsUg27nOx9n0EASckJZO/3ObWJ2of5nBKRgFANBt8HeptZRaA5MLPItibAjyXiZwfXH8wcoFGR5a/NbG7wcUdpEgpWFhs451YWWd0auAVoDDQELgquf8A51zKY+1klqpbbnHPNgBeBZ4PrngOGOudaAT2B14rENwbOcc5dVqSv7Q6SY9/gIHT2iOEHne0ud5YuXcOQwe/yyKM6F+dQkpLjefOjf/HemH58+dmPbN7k3f+ooiKMeonVuPyZKdw2Ygb/vqol1SoFvki06zeWbo9P4vYRM3modwtSa1UJc7blT4s2x3PyaSdwf58XGPrQfzi+6bFERHp4zkvkf2AWukd5E5ILSJxz882sHoGq4Li/YJclX8oOzrmNf3AfNYGSX51/cM4tBzCz94AzgJHAJWbWl8DrVZvAgG5+sM17RX4ODT4/B2hcZCon1syqBp+Pcc7tLnLMHOCA5Q7n3HBgOIDPzQ/JPGJSUg2y1u/7hp2VtZnEpITiMYk1yFq/keTkBAoKfGzfvov4+GrB+E3cevMzDHzqZlJTkzma1UyMJSdr31toQ/Y2av0P1b2aiXE0SEtm/pwVhReYHE2ytuymdvXKhcu1q1cme8vu/WLmrthMgc+RuXEXK7O3Uz+pKvNXbi
F76x4A1mzcyYxfNtAkNZ7VG3aGtA+hUKNWHBtz9r2fNudsI6FW6d9Pva45h17XnAPA0P7/oXZqrb88x/IiMSnwGbRXdtYmkvb7nEpg/fqNJCfX3O9zSkSKC+VXxzEEzg18r8T6RcApJdadAizk4E4CFv/JfHYDJc+uLjngcmZWn8BU9tnOuebA2BLt3AGeRwBti5zHWMc5tyO4reT/YhWDuZQLTZulsWrVejIzs8nLy+eLcd/RIaNlsZgOGS0ZPTpwQcSE8TNo07YpZkZu7k5uuH4gd/7rCk4+udGBdn9UadQkhczVG1m3djP5+QVMGj+X089qXKq2Odlb+X1P4Py47bm7mP/TClLqHZ3/ec9fuYV6SVWpW7My0ZHG+a1T+GreumIxE35aS5vjA/2vXjWGeknVWL1hJ7GVo4mJiihc3zItgaXrckPeh1BIOyGF9Ws2kr1uE/n5BXw78SdatjvUBMk+Pp+f7dsCHy0rl65j1bL1tGh9XFmmG1bNmqUX+5waN+5bOmS0KhbTIaMVn47+GoDx47+nbdtmnr51iByeKoOh8Qaw1Tm3wMzaF1k/CPjIzCY751YGK4j3A71K7sAC/5JvIVCd+/LPJOOc22JmkWZW0Tm3J7i6dXDwtwq4lEBVLpbAAG6bmSUROFdxSpFdXQo8Gfw5PbhuQjDPZ4J5t3DOzT1IKscBP/+ZvvyVoqIieeCha+lz7RP4/X569OxAenoKLzz/Pk2aNiQjoxU9e2XQ754X6NzpZuLjqjJoSGBm/r/vfsnq1Vm89NJHvPTSRwC89vpDJCQcnefCRUVFcvu93bnrhhH4/X66dmtN/bRkXn9pPMc3rssZ7Zuw+Oc1PHjnW2zP3cX3UxfzxssTeHvUXaxansOwIZ9hZjjn6H3lWTRMrx3uLpUJn9/xyH9/4q3bzyQiwvjouxUsXZfL7d2asGDlZibNW8/Uhdm0a5LM+AGd8fsdT340n6078zi5YQJP/P0U/M4RYcYrXywpdhXy0SQyKpLr7rqIx24bjt/vyDi/NakNknlv+JekNapLqzObsmzRap7q9yY7t+9m9reLeH/EeJ577x58BT4evH4YAJWqVOC2Ry4nMioyzD0qO1FRkTz4UB+uu/ZR/H4/F/U8m/T0VJ5//r80bZpGRkZrevU6h373PEvnTjcQF1eVwUP+Vdj+7Iy+7Ny5O/AlbtIPvPb6w8WuRBbxGnNlfBWjme1wzlUtsa49cFeRW8tcBDxK4NYy+cDDzrlRwW1vsu/WMpUJ3FrmvkPcWma+c+7KYLsL2Fd1W+OcK3YjNzN7HXjPOfdVMKcBwX2lAV8DNzrn/MF9nQasAbYRmOp9M3jsDwgMEH8HLnPOLTOzmsAw4AQCA+6pzrl/mtkjwA7n3KAiObwIjHfOfXao1zFU08RHsqiIE8na9Wm40zgiJFfuRv1rPwx3GuXeitcvYcHmQ/7TlKBmNS7A5z/UhI4AREYE7qUphxb8ohzSGlqLd6eF7Bcz94p25ao+WOaVwZIDweC6KRSprgUHfqMO0v7qw+y/3v/SLmgYcAfwVXA5d+8A9Q/s6xnnXL8S8RsJVApL7ueRostmVgFoCdxeilxFRERE/nKe/gskzrk5Zva1mYVrPiUVuNc5VxCm44uIiAiBG+R7lacHgwDOuTeCT6dQ/FzA0rSt9yePvRRY+mf2ISIiIvJneH4wKCIiIlIer/INFd2VVERERMTDVBkUERERz1NlUEREREQ8SZVBERER8Tzz8OXEqgyKiIiIeJgqgyIiIuJ5OmdQRERERDxJlUERERHxPFUGRURERMSTNBgUERER8TBNE4uIiIjnaZpYRERERDxJlUERERHxPA/fc1qVQREREREvU2VQREREPE/nDIqIiIiIJ6kyKCIiIp5nHi6PebjrIiIiIlLqyqCZVXDO/V6WyYiIiIiEg84ZPAQza21mC4ClweUTzeyFMs9MRERERMpcaaaJnwfOBzYBOOfmAR3KMikRERGRUDKzkD
1KmU8XM/vFzJaZ2b2HiOtpZs7MWgaX65nZbjObG3y8crhjlWaaOMI5t6pE8r5StBMRERGRP8jMIoFhQEcgE5hlZmOcc4tKxFUDbgNmltjFb865FqU9Xmkqg2vMrDXgzCzSzG4Hfi3tAURERETKO7PQPUqhNbDMObfcOZcHvA90O0DcY8BTwJ4/0/fSDAZvAO4EUoFsoG1wnYiIiIj89eoAa4osZwbXFTKzk4EU59zYA7Svb2Y/mdk3ZtbucAc77DSxcy4H6H24OBEREZEjVSivJjazvkDfIquGO+eG/4H2EcAQ4OoDbF4PpDrnNpnZKcBoM2vinMs92P4OOxg0sxGAK7neOdf3AOEiIiIicgjBgd+hBn9rgZQiy3WD6/aqBjQFpgSv6UgGxpjZhc652cDvweP8aGa/AccBsw92sNJcQPJVkecVgR4UL12KiIiIyF9nFpBuZvUJDAJ7A5fv3eic2wbU3LtsZlOAu5xzs82sFrDZOeczswZAOrD8UAcrzTTxB0WXzewd4NtSd0dERESknCtPN512zhWY2c3AeCASeMM5t9DMBgCznXNjDtH8TGCAmeUDfuCfzrnNhzqeObffDPAhmVlDYIJzruEfaih/ipn9sV+UiIjIEcw5F9LhWYdx34Xs/9mvu55ejoaepTtncAv7zhmMADYDB735oZSNAv+CcKdQ7kVFNGPg3AnhTuOIcF+LTuwu+D7caZR7laJO49h7Pw93GkeEVU+ej98tCXca5V6ENeKPFmG8qLQ3Zv4rRZSr4VloHXIwaIHfxonsO2nR7/QuFhERETlqHHIw6JxzZjbOOdc0VAmJiIiIhJqXK4Oluen0XDM7qcwzEREREZGQO2hl0MyinHMFwEkE/ibeb8BOwAgUDU8OUY4iIiIiZSrCw9dpHmqa+AfgZODCEOUiIiIiIiF2qMGgATjnfgtRLiIiIiJh4eVzBg81GKxlZncebKNzbkgZ5CMiIiIiIXSowWAkUJVghVBERETkaFWaK2qPVocaDK53zg0IWSYiIiIiEnKHPWdQRERE5Gjn5auJD1UVPTtkWYiIiIhIWBy0Muic2xzKRERERETCxctXE3v5fEkRERERz9NgUERERMTDDnUBiYiIiIgneLk65uW+i4iIiHieKoMiIiLiebqAREREREQ8SZVBERER8TzTTadFRERExItUGRQRERHP0zmDIiIiIuJJqgyKiIiI53m5OublvouIiIh4niqDIiIi4nkRuppYRERERLxIlUERERHxPF1NLCIiIiKepMqgiIiIeJ6Xq2Ne7ruIiIiI52kwKCIiIuJhmiaW/Uyb9hMDn3gDn99Pr15n06fvRcW25+Xlc2+/51m4cDnx8dUYMuRO6tRN5Pvv5jFk8H/Izy8gOjqKu+65krZtm4WpF6GROXcRM/9vJM7v57izT6N5907Fti+ZMI3F46cSERFBVMUKnH79ZcTXrc3a+Yv58d0x+AoKiIyKouXfu3NM0+PD1Iuy9920+Tw18L/4fX569DqTa/ucX2x7Xl4+D9w7gsULVxIXX5Wnh9xAnTq1WLt2Az3Ov5969ZIBaHZiQx565Oow9CA0zjquFv3Pb0xkhPHBrDW8/M1v+8Wc16w2t5+djgMWr8/ltg/mAvDWNa04KaU6s1Zt5tq3Zoc489CbNvVHnnjiNfx+H70u7kTfvr2Kbc/Ly6ffPUNZuHAZ8fGxDBl6N3XrJgHw6qsf8fHIiURERPLAg31o1+7kcHRByhkvX0BSZoNBM3PAu865vwWXo4D1wEzn3PnBdd2BAUA0UAA85JwbHdz2JnAWkAtUAmYA9zvnMoPbVwLbAV/wkFOdc7cG233unBt5iNwqAV8CGc4538HiypqZ3Qzscs69Ea4cSvL5fDw+YASvvdGfpKQELr24Hx0yWpGWllIY8/HIScTGVmX8hGGMG/stgwe/w5Ch/yK+ejVeevk+EpNqsPTX1fS57jGmTB0Rxt6ULb/fz4zXP6TzgzdTOSGez+57ht
SWzYivW7swpsEZLWnUqR0Aq2fP54e3RtHpgZuoWK0q5/S7nso14tmyeh0TnhjGpa8+Ea6ulCmfz8+/H3+HV1+7m6SkGlx+6aO073ASDdPqFMZ88vFUYmMr8/n4p/li3AyeHfwRzwy5EYC6KYl8+Mlj4Uo/ZCIMBlzYhL+9PpOs3D2MuekMJi7OZlnOjsKYegmVubF9Q3q+8j25ewpIqBJTuO3VqcupFB3J5W1Sw5F+SPl8PgYMeJU3/m8ASUkJXNzrX2RktCYtbV/fR340kdjYqkyYOJyxY6cyeNBbDH32HpYtW824sdP4fOwwcrI3cc01/fly/MtERkaGsUci4VWW08Q7gabBgRdAR2Dt3o1mdiIwCOjmnDsBuBAYZGbNi+zjbufcicDxwE/AZDOLKbK9g3OuRfBx6x/I7R/AqHAOBIPeAG4Jcw7FLJi/jNTUZFJSkomJiebcrmcwedKsYjGTJ/1A9+7tAejU+VRmTF+Ac47GjRuQmFQDgLT0FPb8nkdeXn6ouxAyG5etpFpyTaol1SQyKooGp53M6lnzi8XEVK5U+LxgTx5Y4KtnQv0UKteIByA+pTYFefn48o/O1+rnBctJSU2ibkoi0TFRdDm3DVMm/1Qs5uvJP3Fh9zMA6NipFT/MWIRz3roBbIuUeFZt2sWaLbvJ9zk+m7eOTickFYvp3SqVt6evIndPAQCbduYVbvv+t03s/L0gpDmHy/z5S0k9tnbh51TX89oxadLMYjGTJs+ke48MADp3Pp3p0+fhnGPSpJl0Pa8dMTHR1E1JJvXY2syfvzQc3ZByJsJcyB7lTVmfMzgOOC/4/DLgvSLb7gL+7ZxbARD8ORC4u+ROXMBQIAs49y/I6wrg070LZtbPzBaY2TwzezK4roWZzTCz+Wb2iZlVD66fYmZDzWy2mS02s1ZmNsrMlprZ48GYema2xMzeDcaMNLPKB+jXLmClmbX+C/r0l8jO3kxy7ZqFy8nJNcjJ3lQ8JmdfTFRUJNWqVWbr1u3FYiaMn0HjxvWJiYku+6TDZNfmbVRJqF64XDmhOjs3b9svbvGX3zDylkeY9e5o2lzTa7/tq2bOJaFBCpHRR+drlZO9heTkGoXLicnVyc7ZctCYqKhIqlarxNatgYrY2rUbuOSi/vzjyoHMmf1L6BIPsaTYiqzbtrtweX3uHpLiKhaLaVCzCvVrVmHk9afyyQ2ncdZxtUKdZrmQnb2J2slFPqeSapJd4nMqJ3sTtYt9TlVh65btB2ibsF9bEa8p68Hg+0BvM6sINAeKfnVrAvxYIn52cP3BzAEaFVn+2szmBh93lCahYGWxgXNuZXD5XKAb0CZYhXw6GPo20M851xxYADxcZDd5zrmWwCsEBpU3AU2Bq80sIRhzPPBSsOqZC9x4kJRmA+0Okmvf4KBz9ojhH5Wme+XC0qWrGTL4HR559J/hTqVcOKHLWfR64RFaXtGNeR9/WWzbljXrmf3up5zWp3eYsivfatWKZ/ykIXw4agB39buMe+95lR07dh++4VEqMtKoX7MKvUfM4Jb3f2Jgj2bEVtSp3yJ/hQgL3aO8KdPBoHNuPlCPQFVw3F+wy5IvYdFp4qGl3EdNYGuR5XOA/wtW6XDObTazOCDeOfdNMOYt4MwibcYEfy4AFjrn1jvnfgeWA3tPrlvjnPsu+Pw/wBkHyScHOOZAG5xzw51zLZ1zLfv0vbiU3ftzkpJqkLV+Y+FyVtZmEpMSisck7ospKPCxffsu4uOrBeM3cevNTzPwqVtJTU0OSc7hUrlGHDs37atw7dq0hSo14g4a3+C0U4pNI+/ctIXJg4bT7qa/E5t89FZ4EpOqk5W1uXA5J2sLSYnVDxpTUOBjx/bdxMdXJSYmmvj4qgA0blKPlJRarFqZFbrkQyg7dw/HxO07raB2bEWyt+0pFpO1bQ9fLc6mwO/I3LKbFRt3Uq9mlVCnGnZJSQmszyryOZW9kaQSn1OJSQmsL/Y5tZP46tUO0HbTfm1FvC
YUt5YZQ+DcwPdKrF8EnFJi3SnAwkPs6yRg8Z/MZzdQ8bBRh/Z78Ke/yPO9y3u/ppc8KeBgJwlUDOZULjRtlsaqVevJzMwmLy+fL8Z9S4eMlsViOmS0YvToKQBMGD+dNm2bYmbk5u7khuuf4M5//Y2TT250gL0fXWo2PJbc9RvYnrMRX0EBy7+fQ0rL5sVitq3PKXy+Zs5CYmsHBn2/79zFxCdf4ZTLu5HUqGFI8w61Jk3rs3pVNpmZG8jPK+DLL2ZyVoeTisW079CCMaO/BWDihFm0bnMCZsbmzbn4fH4AMtfksGpVNnXrHp0D53mZ26hXswp1q1ciOtK44MRjmLg4u1jMhEXZtG0QGLhUrxxN/ZpVWL15VzjSDatmzdJZtXIdmWuyyMvLZ9zYaWRktCkWk5HRmtGfTAZg/PjvaNu2OWZGRkYbxo2dRl5ePplrsli1ch3Nm6eHoxtSzkSE8FHehGJ+4Q1gq3NugZm1L7J+EPCRmU12zq00s3rA/cB+J1WZmRG40KI2gauA/2fOuS1mFmlmFZ1ze4CJQH8ze9c5t8vMagSrg1vMrJ1zbhrwd+CbQ+95P6lmdqpzbjpwOfDtQeKOA747yLaQi4qK5IGHrqPPtY/h9/vp0TOD9PRUXnj+PZo0TSMjoxU9e51Nv3uep3Onm4iPq8qgIYEZ+v+++wWrV2fx0ksf8dJLgWnt117vT0LCwatlR7KIyEja/uMSJjwxDOd3pHdoS/WU2sz54HNqNkwltWVzFn85lfULlhARGUlM1cq0u+lKABZ/OZXtWRuYN/IL5o38AoBOD95Mpbhq4exSmYiKiuS+B/7GDX0G4ff76d6jHWnpdRj2wiiaNKlP+4yT6NHzTB7oN5zzO99DbHwVnh50AwBzZv/CsBc+IToqEouI4MGHryIuWCk82vj8jv5jfubtf7Qm0owPZ2eyNGcHd5xzHAvWbuWrxTl88+sG2qXXZOLtZ+JzjoFfLGbrrsCFRx/2PZWGtapQpUIU0+/NoN/H85m6dONhjnpkioqK5KH+13PtdY/g9/np2fMc0tNTef65d2naNI2Ms9vQq1dH7rl7CJ069iUurhpDhgZOR09PT+Xcc8/gvK43ERkZSf/+/9SVxOJ5VlZX7JnZDudc1RLr2gN3Fbm1zEXAowRuLZMPPOycGxXc9ib7bi1TmcCtZe47xK1l5jvnrgy2u4B91bY1zrlTS+TxOvCec+6r4PK9wJVAHjDOOXe/mbUgcE5gZQLTv9cEB5JTgn2YfYD+TCFwYcxGAoPW2QSqnYuAvwcHmwOA2c65McE2c4COzrlDnsHscz+Xv8uPypmoiGYMnDsh3GkcEe5r0YndBd+HO41yr1LUaRx77+fhTuOIsOrJ8/G7JeFOo9yLsEaeu1L+f2FmOOdCenbdP7/7OmS/mFdO71Cuzhwss8pgyYFgcN0UYEqR5VHAqIO0v/ow+6/3v7QLGgbcAXwVbPMk8GSJ/cwF2h5g/+2LPJ9C8f60h8DVxEDB3nsslmjff+9zMzuJwDmHupRNREREwsKTl6E55+aY2ddmFhnmew3WBB4K4/FFRESE8nmVb6h4cjAIUJZ/9SN425qmpYibWFY5iIiIiJSGZweDIiIiInv9f3v3HS9ZXd9//PXepUe6KEqRahSQoggaFImKgogYYhRiImgewZqgxoLlZ41olBjFFkkkiIlsjFFZEEQNwYBCpArsBqRKEUUBG6Cwu5/fH3MuzF7uvTsLu3PO3fN68pgHp83MZ87e8rmfb+tzZbCLI5wlSZI0JlYGJUlS7/W5Otbnzy5JktR7JoOSJEk9ZjOxJEnqvTnp72TgVgYlSZJ6zMqgJEnqPaeWkSRJUi9ZGZQkSb3X5+pYnz+7JElS71kZlCRJvWefQUmSJPWSlUFJktR7cZ5BSZIk9ZGVQUmS1Hv2GZQkSVIvWRmUJEm91+fqWJ8/uyRJUu9ZGZQkSb03x9HEkiRJ6iOTQUmSpB6zmViSJPWeU8tIkiSpl6
wMSpKk3rMyKEmSpF6yMihJknpvbtsBtMjKoCRJUo9ZGZQkSb3X50mnU9XfDz+bJD3+KpUk9U5VjXVIx9GXfGtsv2ffvuu+nRquYmVwljBpX7YkLLjjlLbDmBV23PBAv6ZGkMT7NKIkPGrHd7YdRufdsuBv/ZoaQTL+XMnRxJIkSeolK4OSJKn3rAxKkiSpl6wMSpKk3ptrZVCSJEl9ZGVQkiT1nn0GJUmS1Esmg5IkST1mMihJknpvTmpsj1Ek2S/JlUmuTnLUFOdfleSyJJckOSfJDkPn3tY878okz13mZ1+uOyVJkqSVKslc4FPA/sAOwKHDyV7ji1X1hKraFfgw8NHmuTsAhwA7AvsBn25eb1omg5IkqffmZHyPEewBXF1V11bVPcA84KDhC6rqV0O7vwdMlBwPAuZV1e+q6jrg6ub1puVoYkmSpG7ZDLhxaP8mYM/JFyV5LfBGYA3gmUPPPW/Sczeb6c2sDEqSpN6bO8ZHkiOSXDD0OOLBxFxVn6qqbYG3Au98MK8BVgYlSZLGqqqOA46b4ZKbgS2G9jdvjk1nHvCZB/lcK4OSJEkd6zN4PrB9kq2TrMFgQMj84QuSbD+0ewBwVbM9HzgkyZpJtga2B74/05tZGZQkSeqQqlqU5HXAGQxalo+vqgVJ3gdcUFXzgdcleTZwL3AHxz+eWAAAIABJREFUcFjz3AVJvgQsBBYBr62qxTO9n8mgJEnqvVHn/xuXqjoNOG3SsXcNbR85w3M/AHxg1PeymViSJKnHrAxKkqTemztaX75VkpVBSZKkHrMyKEmSem/EUb6rJCuDkiRJPWZlUJIk9Z6VQUmSJPWSyaAkSVKP2UwsSZJ6z2ZiSZIk9ZKVQUmS1HtzO7Yc3ThZGZQkSeoxK4OSJKn3+lwd6/NnlyRJ6j0rg5Ikqff6PJrYZFB6CC469wo+99GvsWTJEp79gj3548OetdT5BRdfw/H/cDLXX30Lf/P+P+MPnrXLfec+/4lTuPC7/8eSKnbd47H8xRtfSNLjn0bScthnr+14/1EHMGduOOk/L+STnzt7qfObbbo+Hzv6YNZfd23mzA1H/8M3OfPsq9j7qdvy9tfvy+qrr8a99y7i/X9/Bt/9/nUtfQqpG3rbTJykkvzr0P5qSX6W5NRm//Bm/5Khxy5D27cnua7Z/naSrZLc3ewvTHJiktWb19pn4nWb/f2TXNBcd3GSvx//HdBDtXjxEo77yFf4fx/7S46d9xbO+ebF3HjtT5a6ZpNHbshf/b9D2Ps5uy11/IpLr+OKS6/nH/7tTXz8i2/mqoU3suCia8YZvjRrzZkTjn7ngbz01Seyzws+wUHP25ntt9lkqWuOfOUzOOWMy3nOn3yaV7/pS3zwnQcCcPsdd3HY6/6NZx38SY58x1c49oMvauMjqIPmZHyPrulzZfBOYKcka1fV3cC+wM2Trvn3qnrdpGO7AiQ5ATi1qr7c7G8FXFNVuyaZC3wLeDHwb8NPTrIT8EnggKq6orn2iBX5wTQeVy28gUdtvjGbbrYxAE/bdze+/z8L2GKbTe+75hGP3giATP7uT7jnd4tYdO9ioFi8aDHrb7TuuEKXZrXdnrA5199wGzfcdAcAJ59+Gc995uO56tqf3XdNFaz7e2sBsN66a/HTn/0agMuvuOW+a668+lbWWms11lh9Lvfcu3iMn0Dqlj4ngwCnAQcAXwYOBU4Cnv5QX7SqFif5PrDZFKffAnygqq6YuBb4zEN9T43f7bf+koc/coP79jd+xPr8cMENIz33cU/Yiic8aVteccB7oGD/P9mLLbZ+5EqKVFq1bPqI9fjxT3553/4tP/0lT3zC5ktd8/efPpOTjjucl//pnqyz9hq85C9PeMDrHLDvjly+8BYTQQHOM9hn84BDkqwF7Az876TzL5nUTLz2KC/avN6ewDemOL0TcOGIr3NE05x8wXHHHTfKUzRL3HLjz7np+lv551PexT+f+i4uu+BqFl58bdthSauMFz5vZ7
508kXs/uxj+PPXfIFPfPCPl+qT+9htH8E73vgc3vK+k1uMUuqGXlcGq+rSpnn3UAZVwsmmaiaeybZJLgG2Br5eVZc+xPiOAyaywP7+ydJRGz1ifX7+01/ct3/brb9k403WH+m55511GY/d6TGsvc6aADzxqY/jysuvZ4fdtlkpsUqrkp/c+iseven932uPeuT63HLrr5e65tCDn8RLX/V5AC78wY2sucZqbLThOtx2+5086pHr8bmPH8qRb/9PfnTjHWONXd3Vxb5849L3yiDAfOAYBk3ED9U1VbUrsC3wpCQvmOKaBcCTVsB7qWXbP34Lbrnx5/z0x7dx772LOOdbF/PkvXcc6bmbbLoBCy6+hsWLFrNo0WIWXHwNm29lM7E0iksuv5mtt9yYLTbbgNVXm8tB+z+Bb/73FUtdc/Mtv+Bpe24LwHbbbMKaa67GbbffyXrrrsWJn/5zjv7Ytzj/4tG6dUirul5XBhvHA7+oqsuS7LMiXrCqfp7kKOBtDJLNYR8BvpLknKr6YZI5wBFV9Y8r4r01PnNXm8tfvulg3vvXx7FkSfGsA/dgy2025Yuf/QbbPX5z9th7J65aeAN/95YT+M2v7+b8sxcy75/O4Nh5b+Gpz9yFyy64miNfegwh7PbU3+fJTx8tkZT6bvHiJbzj6FP54mcPY+7cOcz76kX88JpbefNrn8kPFvyYb551Be/9yDc45r0H8Zcv+wOo4g3v/AoALz90T7beYiPe+Kp9eOOr9gHgkCM+z22339niJ1IX9LkymKp+tj4m+U1VPWzSsX2AN1XV85McziBxGx5h/Jqq+l5z7Qk8cDTxqVW1U7Mf4BLgdcDciddtzj0feC+wDoPm31Or6i3LCLmf/1DLIQkL7jil7TBmhR03PJC+fu8vjyTepxEl4VE7vrPtMDrvlgV/69fUCJrvvbGmZ6fccPrY/mEO3HL/TqWeva0MTk4Em2NnAWc12ycAJ8zw/MMn7V/PYHDIxH4BuwxdctbQuVOBU5EkSWpZb5NBSZKkCX1uJnYAiSRJUo9ZGZQkSb0318qgJEmS+sjKoCRJ6r05LkcnSZKkPrIyKEmSeq/P1bE+f3ZJkqTeszIoSZJ6z3kGJUmS1EtWBiVJUu85z6AkSZJ6ycqgJEnqPecZlCRJUi9ZGZQkSb3naGJJkiT1kpVBSZLUe1YGJUmS1Esmg5IkST1mM7EkSeq9PlfH+vzZJUmSes/KoCRJ6r04gESSJEl9ZGVQkiT1Xo8Lg1YGJUmS+szKoCRJ6j37DEqSJKmXrAxKkqTe63N1rM+fXZIkqfesDEqSpN5Lqu0QWmNlUJIkqcdS1d9MeDZJn/9kkST1TlWNdXzvJbedOrbfs7tu/PxOjV22mXiWMGlftiTepxF5r0bjfRqd92o03qfRpM/zvLTAZFCSJPVen/NP+wxKkiT1mMmgJElSj9lMLEmSeq/HrcRWBiVJkvrMyqAkSeq9OT0uDVoZlCRJ6jErg5Ikqfd6XBi0MihJktRnVgYlSVLvOem0JEmSesnKoCRJ6r0eFwatDEqSJPWZlUFJktR7VgYlSZLUS1YGJUlS77kCiSRJknrJyqAkSeq9HhcGrQxKkiT1mcmgJElSj9lMLEmSei+ptkNojZVBSZKkHrMyKEmSes8BJJIkSeolK4OSJKn30uPSoJVBSZKkHjMZlCRJvTdnjI9RJNkvyZVJrk5y1BTn905yUZJFSV406dziJJc0j/nLei+biSVJkjokyVzgU8C+wE3A+UnmV9XCoctuAA4H3jTFS9xdVbuO+n4mg5Ikqfc61mdwD+DqqroWIMk84CDgvmSwqq5vzi15qG9mM7EkSdIYJTkiyQVDjyMmXbIZcOPQ/k3NsVGt1bzueUleuKyLrQxKkqTeG2dhsKqOA45biW/xmKq6Ock2wJlJLquqa6a72MqgJElSt9wMbDG0v3lzbCRVdXPz/2uBs4DdZrreZHDIpNE3l0yM3klyVpIbkvt7FCT5WpLfNN
tbJbm7ec7CJP+YZJeh17k9yXXN9reTXJHkCUOv9eYknx3/J5YkSTDoMziuxwjOB7ZPsnWSNYBDgGWOCh58jmyYZM1m++HAXgz1NZyKzcRLm2n0zS8Y3NBzkmwAPGrS+WuqatckqwFnAttOvFaSE4BTq+rLzf5+wKeT7A08GngVsPsK/zSSJGnWqapFSV4HnAHMBY6vqgVJ3gdcUFXzkzwZ+CqwIXBgkvdW1Y7A44HPNgNL5gAfmjQK+QFMBkc3j0Fmfg5wMPAVYMfJFzX/gN8DtpvuharqG0leAbwMOAB4T1XdsVKiliRJy9StwcRQVacBp0069q6h7fMZNB9Pft73gCdMPj4Tm4mXtvakZuKXDJ37L2DvZu6fQ4B/n+oFkqwDPAu4bBnv9XrgA8AmVfWFaV7rvtFGxx23MvuZSpKkvrIyuLSZmokXM6gKHgKsXVXXZ+mG/22TXAIUcHJVnT7TG1XVj5OcCZw6wzXDo41qxM8gSZI0MpPB5TOPQfv8e6Y4d83yzPbdWNI8JElSi+Z0rZ14jGwmXj5nAx8ETmo7EEmSpBXByuDS1m6aeid8o6ruWxy6qgo4ZvxhSZKklanHhUEyyG80C/gPtQxJ8Ot5NN6r0XifRue9Go33aTTNfRprfnbLXaeM7R/mUesc2Knc08qgJEnqvaS/Sbp9BiVJknrMyqAkSeq9TrXbjpmVQUmSpB6zMihJknovPS4NWhmUJEnqMSuDkiSp93pcGLQyKEmS1GdWBiVJUu/1uTrW588uSZLUe1YGJUlS7zmaWJIkSb1kMihJktRjNhNLkiT1eHIZK4OSJEk9ZmVQkiT1XqwMSpIkqY+sDEqSpN5L+lsf6+8nlyRJkpVBSZIkRxNLkiSpl6wMSpKk3nM0sSRJknrJyqAkSZKVQUmSJPWRlUFJktR7zjMoSZKkXrIyKEmSZJ9BSZIk9ZHJoCRJUo/ZTCxJknrPSac1G6RrjySvbDuG4UdVtR7DbLhP3ivvU1/ulfdp9t6r5j5pTEwG9VAc0XYAs4T3aXTeq9F4n0bjfRpd7+9Vxvhf15gMSpIk9Zh9BiVJknpcH+vvJ9eKcFzbAcwS3qfRea9G430ajfdpdN6rHktVtR2DJElSq+5c9J2xJUS/t9ozOtVx0MqgJElSj9lnUJIkqYOjfMfFyqAkSVKPWRmUVqAkq1XVorbjkCQtny7O/zcuJoPSivV94IltBzEbJNkcOAR4OvBo4G7gcuDrwOlVtaTF8Dolye488D59q6ruaDUwzUpJPgO8tap+1XYs6gabiTWSJHOTPHxof40kRyT5vzbj6qD+/mm5HJL8C3A8cA/wd8ChwGuAbwP7Aeck2bu9CLshycuTXAS8DVgbuBK4FXga8O0kn0+yZZsxdkWSjzRLqk0+/sokH2ojpg67FrgwyZ+2HUi3zBnjo1ucWkbLlOQQ4LPAncBVwAcY/CI/H3h/VV3UYnidkuQm4KPTna+qac/1SZKdquryGc6vAWxZVVePMazOSfJa4Piqunua87sCG1fVf403su5JciGwe036pZZkDnBpVe3UTmTdlGQzBj+rHg58BrivEl9VX2krrjbdtei7Y0uI1lltr04VDmwm1ijeCTypqq5O8kTgXOBFVXVKy3F10VzgYVghnFFVXZ5kLnBiVb10ivP3AL1OBAGq6lPLOH/JuGKZBdacnAgCVNWSJH4/TlJVNyf5OoM/7g/k/mSwgF4mg/YZlGZ2z0SFpqouSnKVieC0bqmq97UdxGxQVYuTPCbJGk3yp+WQ5PlVdWrbcXTI3Um2r6qrhg8m2Z5BP0s1kuzIoBr4Y2CPqrql5ZDUMpNBjeIRSd44tL/B8L5Nn0vp75+WD861wHeTzGfQDQHwa2pETwZMBu/3LuD0JH8LXNgc251Bf8vXtxZVN/0H8Pqq+mbbgagbTAY1in8C1p1hX/f74MRGkq2r6rqh/YP72hdnBtc0jzn4NbVcqurdbcfQJVV1epIXAm8G/qo5vAD446
q6rL3IOulmE8EH6nNvAgeQSCtQkouq6omTt6fa1/2SrFNVd7UdR1clWYvBaOunMejTdQ7wmar6bauBaVbyZ9HUfrv43LElRGvNfWqnMk8rg1qmJF+qqhc3239XVW8dOvfNqnpOe9F1TqbZnmq/95I8Ffgcg0E3WybZBXhlVb2m3cg650Tg18Anmv0/Bb4A/ElrEXVMklMYJMpTqqoXjDGcrtsgycHTnexvC0Z/f0SbDGoU2w9t7wu8dWh/kzHH0nU1zfZU+4KPAc8F5gNU1Q+cX3BKO1XVDkP7/51kYWvRdNMxbQcwi6wPPJ+ps5/ejibuM5NBjWKmJMYEZ2nbNIMhMrRNs791e2F1V1XdOKmvzuK2Yumwi5I8parOA0iyJ3BByzF1SlV9Z7pzSfYaZyyzwI+q6hVtB9E16eBk0ONiMqhRrJNkNwad/NduttM81m41su45aGh7cqXCysUD3ZjkD4BKsjpwJOCqNg/0JOB7SW5o9rcErkxyGVBVtXN7oXVDM2/li4HNgG80c1k+H3g7g59Tu7UZX8f0tz1UU3IAiZYpyVnM3BfnD8cXzezSJDg7MRi9d2vb8XRNs8Thx4FnM/hj4wzgyKq6rdXAOibJY2Y6X1U/GlcsXZXkBGALBuuD78lgDr3dgaOq6msthtY5SXYGtgG2Ay6rqjNaDqkTfrf4/LElRGvOfXKnEnKTQWkFSvKPwCeqakGS9Rms1rIY2Ah4U1Wd1GqAmrWSbMgg2bmvRcelIO+X5HJg52bFkbWAnwDb+ofFAyX5NLAj8D3gWcApVfX+dqNqX5+TQZuJtUwzjTqDPo88m9LTq+pVzfbLgR9W1QuTbAqcDpgMDkmyDYPK4FMYVJ/PBd5QVde2GljHJHk/cDiDORknfmEV8My2Yuqge6pqCUBV/TbJtSaC09ob2KVZBWgd4Gyg98lgn+cZNBnUKA6ctD28FJ0jz5Y2vKzavgxm+qeqftLnHzQz+CLwKeCPmv1DGCTMe7YWUTe9mEGVy2X7pve4JJc22wG2bfaD/Sonu6eqFgNU1V2u3SyTQS1TVb18YjvJxcP7eoBfNJ3Wbwb2Av4CIMlqONhmKutU1ReG9v81yZtbi6a7Lgc2AOx3Or3Htx3ALGLiPKX+5sQmg1pedjKd2SuBY4FNGaz9+ZPm+LOAr7cWVXednuQoYB6Dr62XAKcl2Qigqm5vM7gO+SBwcdMv7ncTB51IeSn/5AT4IzNx1lIcQKLl4jJGD16SJ1fV+W3H0SVJrpvhdFXVNmMLpsOSLAA+C1wGLJk4PtPcen3TtFo4fYwetHuXXDy2hGj1Obt1qgxpZVDLNGmZp+GJlAGrEzNJsgNwaPP4BYOpLtSoKifiHs1dVXVs20F03PousTaaJL9m6VaeNPsTzcTrtRKYWmNlUMuU5Bkznbc6sbQkW3F/Angv8Bhg96q6vr2ouivJTsAOwFoTx6rqxPYi6p4kH2XQPDyfpZuJnVqmkeQ24GSmWWLNFTful+RrDLqyfAWYV1U3LOMpvXDvkkvGWBnctVOVQZNBLVOSE6rq8LbjmA2SnAusx6AP3LyquirJdVbAppbk3cA+DJLB04D9gXOq6kVtxtU1Sf57isNVVU4t07ALy/Jp5kE9mMEI/rWAf2fwM6u3/XT7nAzaTKxR9HRk2YPyUwbLYT0S2AS4CgfdzORFwC7AxVX18iSPBP615Zg6x1V+RtKpX65dV1W/BP4lyecZJITHMkgKP9pqYGqFyaBGMbE28ZQ/bG2qul8zwfTEX9zvSbI9sEGSParq+y2H10V3NytGLEqyHoOpU7ZoO6iuab6m3s1gsmCA7wDva36ha+CwiY0ka1bV74b2n1JV57UTVjc1a4IfCjwdOAf4o6o6u92o2pUe/z1hM7GWqelsfD7T98WxqWoaSR7BYMLgQ4Etq8pEZ0izLNbbGVQm/gb4DXCJc1kuLcl/Mphr8PPNoT9nsILEjKsD9clwM/HkJmObkJeW5HoGA9rmAWcCi4
bP9/UP/EVLfjC2hGi1Obt0KvM0GdQyOWXD6JIcXVVvn+bcY6rqR+OOabZoBt6sV1WXLuPS3klySVXtuqxjfTb8c2ryzyx/hi0tyVlM332lt3/gL65Lx5YQzc3OnUoGbSbWyJrF37drdq+uqt+2GU9H7ceg0vUAJoJTa6YDeRqDX07nACaDD3R3kqdV1TkASfYC7m45pq6paban2u+1qtqn7RjULSaDGsVbk/wdg6XVfsSguXiLJP8CvKOq7m01um6Zm2RDpu9f2duRelNpmom3Y7AeMcArkzy7ql7bYlhd9Grg803fQYA7gMPbC6eTNk9yLIPvvYltmv3N2gure5K8pao+3Gz/SVX9x9C5aVs3Vn1z2g6gNTYTa5mSfAx4GPCGqvp1c2w94BgGAwCObDO+LknyOwbrEk/Xv9IVNYYkuQJ4fDU/iJLMARZUlctlTaH5vqOqftV2LF2T5LCZzlfV52c63yf2r5za4rp8jM3EO9lMrFnnAOCxNfSXQ1X9KsmrgSsAk8H7LbRv0nK5GtiSQcUZBiOJr24vnG5KcjTw4ar6RbO/IfA3VfXOdiPrDpO95ZJptqfa740+jyY2GdQoqqYoIVfV4iSWlvVQrAv8X5LvM+jXtQdwwcSShy51eJ/9h5vuquqOJM8DTAYbk5fJnMyvpaXYv1JLMRnUKBYmednkJcKS/BmDyqDu9/GpDjaDbw4c7psjAN7VdgCzxNzhufOSrA2s2XJMXfNU4EYG/U//lx5XuEawS5JfMbhHazfbNPtrTf+0VV1/v2TsM6hlSrIZgzUs7wYubA7vDqzNYKLSm9uKrcuSzAWey2COwecAZ7vM2kCSTFVtXt5r+iLJW4EDgX9pDr0cmD8xCED3fb/ty+D7bWfg68BJVbWg1cA0ayyphWP7eTMnO3Qq8zQZ1MiSPBPYsdldWFX/1WY8XZXkGcCfAs8Dvg/sBWxTVXe1GliHNPOc/SdwclXdMHR8DQbTzBwG/HdVndBKgB2UZD/g2c3ut6rqjDbj6bIkazJICj8CvLeqPtlySJoFiv8bW0IUHm8yKK2qktwE3AB8BvhaVf06yXVVtXXLoXVK02z+CuClwNYMVkNYm8HcDt8EPl1VF7cXYTdYQV0+TRJ4AINEcCtgPnC8rRcahcmgpBWimYbnhQyWDvsicDJwmVPKTC/J6sDDGUxT9Iu24+kSK6ijS3IisBNwGjCvqi5vOSTNMsWVY0wGf99kUFqVJQmwD4PqxPOA9RlM2H1aVf2mxdA0y1hBHV2SJcCdze7wL7YwmBFhvfFHpdmka8lg0zXk48Bc4J+r6kOTzq8JnAg8CbgNeElVXd+cexuD3zuLgb9eVrcSk0FpJWqqXvsBhwDPraqHtxySZikrqNLK9sMxJkSPnTEZbAZE/ZDBoKibgPOBQ6tq4dA1rwF2rqpXJTmEwYDOlyTZgcGo+j2ARwPfZjBX8OLp3q+/a69IK1mSTYANquqUqnopgwmVpQelqu6tqltMBKVe2AO4uqqurap7gHnAQZOuOQiYmGz9y8Czmpapgxh0lfhdVV3HYCL/PWZ6M+cZlFag5hvx3cDrGJT2SbII+ERVva/N2GaLJN+tqr3ajkNS38xcrVuRkhwBHDF06LiqOm5ofzMG82ZOuAnYc9LL3HdNVS1K8ktg4+b4eZOeO+P63CaD0or1BgZTyTy5+YuMJNsAn0nyhqr6h1ajmx22bDsASVqZmsTvuGVeOCY2E0sr1p8z6Ndx3cSBqroW+DPgZa1FNbvYkXkESb7bdgySVpqbWbpr0ebNsSmvSbIag8GKt4343KVYGZRWrNWr6ueTD1bVz5oBAAKSHDzdKQajZbVsVlClVdf5wPZJtmaQyB3CYDGDYfMZTC91LvAi4Myqqmad7i8m+SiDASTbM1gAYVomg9KKdc+DPNc3B85w7tSxRTG7WUGVVlFNH8DXAWcw6H9+fFUtSPI+4IKqmg98DvhCkquB2xkkjDTXfQlYCCwCXjvTSGJwah
lphUqymPvnOlvqFLBWVVkd1MiWUUH9x6raZJzxSFo1WRmUVqCqmtt2DLNBko9V1eub7SOr6uND506oqsNbC65brKBKWumsDEoauyQXVdUTJ29PtS9JWrkcTSypDZlmW0Oata4nto+cdO6EsQckaZVkMiipDXOSbJhk46HtjZJsRDNZtwDYe2j7sEnndh5nIJJWXfYZlNSG9YELub8qeNHQOfuu3M8KqqSVzmRQ0thV1VZtxzBLzEmyIYNWnIntiaTQCqqkFcIBJJJa0cyYvz/wuObQQuCMqlrUXlTdkuR6YAlTVwWrqrYZb0SSVkUmg5LGLslmwJnALcDFDJKd3YBNgT+sqh+3GJ4k9YrJoKSxa0bCXlJVH5t0/K+BJ1XV5MESvWUFVdLKZjIoaeySXFFVj5vm3JVV9fvjjqmLrKBKGgcHkEhqw90znLtrbFF03weAz0xTQf0gD5xuRpKWm8mgpDasP826uwHWG3cwHfaUqZbmq6pjk1zZQjySVkEmg5La8B2mX3f3f8YZSMdZQZW00pkMShq7qnp52zHMElZQJa10JoOSWpFkJ+DNwI7NoQXAMVV1WXtRdY4VVEkrnaOJJY1dkoOAYxgMgrigObw78DbgTVV1cluxSVLfmAxKGrskPwAOqqrrJx3fCji5qnZpIaxOsoIqaWWb03YAknpptcmJIEBzbPWxR9NRTQX1qwyai1/RPL4DfKU5J0kPmX0GJbVhUZItq+qG4YNJHgO4ssb93gfsOylxvjTJmcDJzUOSHhKTQUlteDfw7SRHAxc2x3YHjgLe2lpU3TNtBTWJFVRJK4TJoKSxq6qvJbkO+Bvgr5rDC4EXV9UP2ousc6ygSlrpHEAiSR2V5IXAh4EpK6hV9bW2YpO06jAZlDR2SebPdL6qXjCuWLouyS4MKqgTo4kXMhhNbAVV0gphMihp7JL8DLgROAn4XwYratynqr7TRlyS1Ecmg5LGLslcYF/gUGBn4OvASVW1oNXAOsYKqqRxMBmU1KokazJICj8CvLeqPtlySJ1hBVXSOJgMSmpFkwQewCAR3AqYDxxfVTe3GVeXWEGVNA4mg5LGLsmJwE7AacC8qrq85ZA6zwqqpJXFZFDS2CVZAtzZ7A7/EApQVbXe+KPqJiuoklY2k0FJ6igrqJLGwWRQkjrKCqqkcTAZlCRJ6rE5bQcgSZKk9pgMSpIk9ZjJoCRJUo+ZDEqa9ZIsTnJJksuT/EeSdR7Ca+2T5NRm+wVJjprh2g2SvOZBvMd7krzpwcYoSSuSyaCkVcHdVbVrVe0E3AO8avhkBpb7511Vza+qD81wyQbAcieDktQlJoOSVjVnA9sl2SrJlc1cfZcDWyR5TpJzk1zUVBAfBpBkvyRXJLkIOHjihZIcnuSTzfYjk3w1yQ+axx8AHwK2baqSH2mue3OS85NcmuS9Q6/1jiQ/THIO8PtjuxuStAyrtR2AJK0oSVYD9ge+0RzaHjisqs5L8nDgncCzq+rOJG8F3pjkw8A/Ac8Ergb+fZqXPxb4TlX9UbNm8MOAo4CdqmrX5v2f07znHgzmApyfZG8GcwUeAuzK4OfuRcCFK/bTS9KDYzIoaVWwdpJLmu2zgc8BjwZ+VFXnNcem63MkAAABQklEQVSfAuwAfDcJwBrAucDjgOuq6iqAJP8KHDHFezwTeBlAVS0Gfplkw0nXPKd5XNzsP4xBcrgu8NWquqt5j/kP6dNK0gpkMihpVXD3RHVuQpPw3Tl8CPhWVR066bqlnvcQBfhgVX120nu8fgW+hyStUPYZlNQX5wF7JdkOIMnvJXkscAWwVZJtm+sOneb5/wW8unnu3CTrA79mUPWbcAbwiqG+iJsleQTwP8ALk6ydZF3gwBX82STpQTMZlNQLVfUz4HDgpCSX0jQRV9VvGTQLf70ZQHLrNC9xJPCHSS5j0N9vh6q6jUGz8+VJPlJV3wS+CJzbXPdlYN2quohBX8QfAKcD56+0DypJy8m1iSVJknrMyqAkSV
KPmQxKkiT1mMmgJElSj5kMSpIk9ZjJoCRJUo+ZDEqSJPWYyaAkSVKP/X8LEE4lJXcukgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "<Figure size 720x720 with 2 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAn0AAAJfCAYAAAAKF2DwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOzdeXgV1fnA8e9JQlBkR0mQpajgguCKiFUrRAXBXWmrtVX7U7G2LnXDpa60rmVxxYp7W7dKFRdQVNCKFKy4sagVVDYlQUFERU1Izu+PXEISFrGSe8Od7+d57sOdOWfmvmeYuTl5z5lJiDEiSZKk7JaT6QAkSZJU9+z0SZIkJYCdPkmSpASw0ydJkpQAdvokSZISIC/TAUiSJGXaph2OTdvjTL6e92BI12dVZ6ZPkiQpAez0SZIkJYDDu5IkKfFCyP48WPa3UJIkSWb6JEmSQgLyYNnfQkmSJJnpkyRJck6fJEmSsoKZPkmSlHhm+iRJkpQV7PRJkiQlgMO7kiQp8ULIyJ/DTSszfZIkSQlgpk+SJCkBebDsb6EkSZLM9EmSJPnIFkmSJGUFM32SJCnxzPRJkiQpK5jpkyRJiRcSkAfL/hZKkiTJTJ8kSZJz+iRJkpQVzPRJkqTEM9MnSZKkrGCmT5IkJZ6ZPkmSJGUFO32SJEkJ4PCuJElKvEDIdAh1zkyfJElSApjpkyRJieeNHJIkScoKZvokSVLimemTJElSVjDTJ0mSEs9MnyRJkrKCmT5JkqQE5MGyv4WSJEky0ydJkuScPkmSJGUFM32SJCnxzPRJkiQpK5jpkyRJiRcSkAfL/hZKkiTJTp8kSVISOLwrSZISzxs5JEmSlBXM9EmSpMQLIWQ6hDpnp28jEUKImY5BkqR0iTFmfy8szez0bSRadj4j0yHUe0tm3cyKiumZDmOjkJfTjRUVb2U6jHovL2dnKuK7mQ5jo5ATtufb8qmZDqPea5jbnRj9Hf67ZCLr5pw+SZIkZQUzfZIkKfF8OLMkSZKygpk+SZKUeM7pkyRJUlYw0ydJkhLPTJ8kSZKygpk+SZKUeN69K0mSpKxgpk+SJMk5fZIkScoGdvokSZISwOFdSZKUeD6yRZIkSVnBTJ8kSUq8EEKmQ6hzZvokSZISwEyfJElKPB/OLEmSpKxgpk+SJCWed+9KkiQpK5jpkyRJ8u5dSZIkZQMzfZIkSQlIgyWgiZIkSTLTJ0mS5Jw+SZIkZQMzfZIkSWb6JEmSlA3M9EmSJCUgDZaAJkqSJMlOnyRJUgLY6dNqivbdgVee+QOvPncpZw08YLXydlu24LH7fsdLT1zA4387gy0LmleV/ePO0/hg6rU8cPvAdIacMRMnvkH/g86gb5/fccfIR1crLy0t45yzh9K3z+/4+c8u5KMFiwD496S3GHDU+Rx+6NkMOOp8pkyZnu7Q06ryOJ1J3z6nc8fIx1YrrzxOw+jb53R+/rOLah2nQRx+6DkMOGpQ1h8ngIkvvcZBfU+jz4EDGTly1GrlpaVlnP376+lz4EB+9tPzWLCgpKrs9tsfoc+BAzmo72lMnPh6OsNOu5cnvsWh/c6lf9+zufOOJ1YrLy0t47yzb6J/37P5xc8v5aOPPqlRvvDjT+mx+6+59+6n0hWy6rkYQtpemWKnLyWE0C6E8HgIYVYI4f0Qwo0hhL4hhDdTry9DCP9Nvf9rCKFXCOGpWvu4
N4QwIPX+xWr13wwhjEqtvyKE8FFq3dshhGMz0d61yckJXH/5T/nZKX/hx/2v5qhDdme7bQpr1Bl8wRE8PPpVfnLYdQy59RkuPe/QqrJb7hrPaef/Pd1hZ0R5eTl/GnwHt9/xB5586gbGjnmZ2bPn16jzz1Hjadq0MeOevZUTTjiEoUP/BkDzFk0YcdtFPP7kcK659gwuHHRTJpqQFpXH6a7UcRrO2DGT1nCcJqSO0y2p41R5DjVv0ZQRt13I408O45prT+fCQTdnoglpU15ezuDBt3PHnZfz1JhbGfPUS8yePa9GnVGPPEfTpo159rmRnHDiYQwdch8As2fPY+yYiTw15lbuvPNyBl/5F8rLyzPRjDpXXl7BVX+8hxEjB/H4k3/m6TH/5v3ZC2rUeXTUizRtthljxw3nV8f3Y/iQB2uU//m6v7PPvjunM2wp4+z0ASGEADwKjI4xdga2BRoDB8QYd4kx7gJMBY5LLR+/nrteWX+XGOOAauuHp/Z5OHB7CKHBBmzOD7LbTj/iw7mfMHf+YsrKynlszOv0O6BbjTrbdSrkpcnvATBxyiz67b+q/KXJ7/HlV9+kNeZMmT5tNh06FNK+fSH5+Q3o138fJox/tUadCeP/wxFH9AKgT9+9mDJ5OjFGunTZmtYFLQHo1Lk933xbSmlpWbqbkBarjlNB6jjtzYTxU2vUmTD+VY44Yj8A+vTtyZTJM1LHaavEHCeAadNm0eFHbarOqf4H78v48a/UqDN+wisccWQRAH377s3kyW8RY2T8+Ffof/C+5Oc3oF37Qjr8qA3Tps3KRDPqXOU5VUD79gU0yM+jX/+9eGHCazXqvDBhKocdvi8AB/bdk1emVJ5TAOOff5W27bagU6d2aY9d9VhI42t9wgnhoFTyaHYI4cI1lHcIIbwQQngjhDAthND/u/Zpp69SEfBNjPEegBhjOXA28H8hhEZ19aExxlnAcqBFXX3G99WmoDkfFS+tWv64eCltCprVqDPj3Y84pE/lb8iH9NmJJo03oUXzOjtM9VZJyRIK22xetVxY2JJFJYtr1lm0qk5eXi5NmjRi6dIvatR5dtwUunTZivz8etP336Aqj1OrquUfdpy2ztrjBFBSspg2hdXOqYLNKal1rBaVLKZNjWO1GUs/+2IN27ZabdtssWjRZxQWrjqnCgpaUlKypGadks+qzru8vFwap86p5V99w913Pslpvz06rTFL30cIIRe4FegHdAGODSF0qVXtEuAfMcZdgWOAEd+1Xzt9lXYEavyaGGNcBswDOq1ju32rDd++CRxWq/z+auV/rr1xCGE3YFaMcdGadh5CGBhCmBpCmPrN5zO+V4Pq0uXXjWbvHp14YfQgfrxHJz4uXkp5ecx0WBulWbPmMWzo37jiyt9kOpR6bdas+Qwbej9XXJmMuaKqOyNu/Se/OqE/jTbbJNOhqL7JCel7fbcewOwY4wcxxlLgISpHB6uLQNPU+2bAx9+1U5/T98NMjDEesnIhhHBvrfLjYoxTWd3ZIYRfUzmMfOgaygGIMY4ERgK02vbMtPSqFpYspW3hqhsztixszsKSz2vUKV60jBNOvwuAzRrlc2jfXVj2xdfpCK9eKShoSfHCT6uWi4uX0LqgVc06rSvrFBa2YsWKcr74YjnNmzdJ1V/MmadfzzXXnUmHDjXnTWaTyuO0KuP0vx2nP3PNdadn9XECKChoxcLiaudUyacU1DpWrQtasXDhpxQWbp46Vl/RvEWTNWy7eLVts0Xr1i0oLl51TpWULKEgNQ2gqk5BC4oXLq46p75MnVPTp83muXGvMHzIA3zxxXJCTiC/YQN+cVzfdDdDCRZCGAhU/y12ZOpn/kptgeqTnxcAe9bazRXAsyGEM4DNgNXvvKzFTF+lt4Hdq68IITQFOgCz6+DzhscYdwSOBu4KIdSbXznfmD6PrTtuQYd2LWnQIJcjD96Np8fXvGOyZYvNCKm7j35/6oHcP2pKJkLNuK7dOjF37kIW
LCihtLSMp8e+TO+i7jXq9C7ag9GjXwTg2XGT2bNnV0IILFv2FaedehXnnPtLdttt+wxEnz6rH6dJazhO3Rk9+l9A5TBuzeN0Deece1zWHyeAbt06M3fOxyyYX0xpaRljx0ykqKjm93xRUQ9GPzYBgHHjJtGz506EECgq2pOxYyZSWlrGgvnFzJ3zMTvt1DkTzahzXbttw9y5xSxYsIiy0hU8PXYyvXrX+AqnV+/deeLxiQA8N+4VevTckRAC9/39csaNv4lx42/il8cfxCkDD7fDp0ohpO0VYxwZY+xe7TXyuwNczbHAvTHGdkB/4G8hhHX268z0VRoPXBtCOD7G+NfUWPpQKg/m8rr60BjjEyGEk4ATgNvr6nO+j/LyCi4YPIpH7votubk5PDBqCv+dXcyFZ/bnzRnzeGbCDPbu0ZlLzz2EGGHy1PcZdMUjVds/9cBZdN66gM0a5TP9pcGcefEDvPDyuxlsUd3Jy8vlD5eezCkn/ZGKigqOPLqIzp07cPNND7Jj104UFe3B0QP254JBN9G3z+9o3qwxQ4adDcAD9z/NvHnFjBjxCCNGVB6/O++6jFatmq3rIzdKlcfpJE456arUcepN587tufmmh9ix6zap41TEBYNupm+f02sdp2fWcJwuzcrjBJXH6tLLTuWkk6+goryCo48+gM6dO3DTjffTtWsnivbfkwEDDmTQ+cPoc+BAmjVrwrDh5wPQuXMH+vXbh4P7/47c3Fwuu+w35ObmZrQ9dSUvL5eLLzmR35x8LeUVFRx5VC86dW7HLTc9wo5dt6Z30e4cNaAXF10wgv59z6ZZs824fugZmQ5b+j4+AtpXW26XWlfdScBBADHGyakE0ubAGqeMAYSVdzMlXQihPZWTILenMgM6FjgvxvhtqvzF1PLU1HKv1HLt4d2nYoyjUvXbACvHPT+NMR4QQrgC+DLGOCS1ze7AA8AOMcaKtcWXruHdjdmSWTezoiL7n+O2IeTldGNFxVuZDqPey8vZmYqYnb+0bGg5YXu+LV/TbBZV1zC3O/7c/W6hMhuW1gfadS66I23/MbMmnLLOtoUQ8oD3gP2p7Oy9CvwixjizWp2ngYdjjPeGEHagMoHVNq7jBDPTlxJjnM+659f1qrX8IvBirXUnrq1+tfVX1Fp+DdjuewUrSZKyVoxxRQjhdGAckAvcHWOcGUIYDEyNMT4BnAvcEUI4m8qbOk5cV4cP7PRJkiSt7121aRNjHEvlqGP1dZdVe/82sPf32ac3ckiSJCWAmT5JkqQM/k3cdDHTJ0mSlABm+iRJkrI/0WemT5IkKQns9EmSJCWAw7uSJEn17JEtdcFMnyRJUgKY6ZMkScr+RJ+ZPkmSpCQw0ydJkhIv+nBmSZIkZQMzfZIkSd69K0mSpGxgpk+SJCn7E31m+iRJkpLATJ8kSZJ370qSJCkbmOmTJEny7l1JkiRlAzN9kiRJ2Z/oM9MnSZKUBHb6JEmSEsDhXUmSJB/ZIkmSpGxgpk+SJMlMnyRJkrKBmT5JkqQEpMES0ERJkiSZ6ZMkSUrAnD47fRuJJbNuznQIG4W8nG6ZDmGjkZezc6ZD2CjkhO0zHcJGo2Fu90yHsFEICehcqH6y07eRWFHxVqZDqPfycnZmk/bHZDqMjcI38x+iIr6d6TDqvZzQhRhjpsPYKIQQKK+Ykekw6r3cnK6eU+shIx3jBPTFndMnSZKUAGb6JElS4sWc7E/1memTJElKADN9kiRJCbjBxkyfJElSApjpkyRJyv5En5k+SZKkJLDTJ0mSlAAO70qSJPnIFkmSJGUDM32SJEk+skWSJEnZwEyfJElS9if6zPRJkiQlgZk+SZIk796VJElSNjDTJ0mSZKZPkiRJ2cBMnyRJSryY/Yk+M32SJElJYKZPkiTJOX2SJEnKBmb6JEmS/Nu7kiRJygZ2+iRJkhLA4V1JkiRv5JAkSVI2MNMnSZKUgDRYApooSZIkM32SJEk+skWSJEnZwEyfVjNx4htc
c9U9lFdUMGDA/pwy8Mga5aWlZVx4wc3MnPkBzZs3Ydiws2nbrjX/nvQWw4beT1nZCho0yOO8Qb+iZ89uGWpFehy4384MueJ4cnNzuPehFxgy4oka5R3abs5fhpzK5i2b8tnSL/m/s27lo+IlAHz54f3MeHceAPM/XsxPTxqS9vjTZeJLr3PVVXdSUVHBgJ8eyMCBR9coLy0t44JBNzBz5vuV59Tw82jXroDPPlvGWWdez4wZszniyCIuu2xghlqg+mbixNe5+qq7K8+pAQdwysCjapSXlpZxwQU38nbV99S5tG3XmkmT3mTY0L9XfU+dP+iErP+e0nry7t2NVwihPITwZghhZgjhrRDCuSGEnFp1bgghfLRyfQjh16lt3gwhlIYQpqfeX1ttm9EhhCm19rNdCOHFVN13QggjU+t7hRA+r7bPN0MIP6/2vjj1+SuX89NxbNalvLycPw2+i9vv+ANPPjWcsWMmMXv2/Bp1/jlqAk2bNmbcs7dwwgmHMHTo3wFo3qIpI267kMefHMY1157OhYNuzkQT0iYnJ3DDn37N4Sdcx677n8dPD/sx23duW6PONZccx/3/nEiPvhdw9Y2PMvjCY6rKvv6mlJ79LqJnv4uyusNXXl7O4MG3c8edl/HUmJsZ89TE1c6pUY88R9OmjXn2ub9wwomHMXTIXwFo2DCfs876BYMGnZiByFVflZeX88fBdzDyjkt48qkbGTNmDefUqOdp1rQx454dwfEnHMqQoZXnVIsWTbnttot54skbuObaM7hg0I2ZaIKUEVnb6QO+jjHuEmPcETgQ6AdcvrIw1dE7EpgP7AcQY7wntc0uwMdA79TyhaltmgO7A81CCFtX+6ybgOGpujsA1Xs7E1fuM/V6uNpn/KXadrvEGEvr6mCsr+nTZtOhQyHt2xeQn9+Afv33ZsL4qTXqTBj/KkccsR8Affr2ZMrkGcQY6dJlK1oXtASgU+f2fPNtKaWlZWlvQ7rssUsn3p9TzJx5iygrK+eRJydzSJ/uNeps37kd/5o0A4B//Xsmhxy4eyZCzahp02bR4UdtaN++kPz8BvQ/eB/Gj3+lRp3xE/7DEUf2BqBv3x8zefI0Yow0arQJu3fvQn7DBpkIXfXUtGmz6dCh2jnVfx8mjP9PjToTxr/K4UesPKf2Ysrk6anvqa2rvqc6d+7At1n+PaX1F0NI2ytTsrnTVyXGuAgYCJweQtXR7gXMBG4Djl3PXR0FPAk8BBxTbX0bYEG1z5v+A0POmJKSJRS2aVW1XFjYkkUli2vWWbSEwjabA5CXl0uTJo1YuvSLGnWeHTeFLl22Jj8/e39Yb1nYggUfrzo2Hy1cTNuCFjXqTH97Lof36wHA4QftQdMmjWjZvDEAmzRswMtPXcW/Rg/m0FqdxWxSUrKENoWbVy0XFrSipGRJjTqLSpbQpvY59VnNc0paaVHJ4hrfUwWFq59TJYsW0yZVZ+3fU5PZIcu/p6TqEjOnL8b4QQghF2gNlFDZ0XsQeBy4OoTQIMb4Xb/uHQsMTm3/T+Dq1PrhwIQQwr+BZ4F7YoxLU2X7hhDerLaPo2OM72+QRtVTs2bNZ9jQ+7njrksyHUrGXXTV/QwffCK/HLAfk/7zDh8tXEx5RQUA2+11Bh+XfEbHDq155sFLmPHfeXw4d1GGI5aSYdaseQwd+jfuvOvy766sZEhAGiwBTVxdau5cf2B0jHEZ8ArQ9zu2KQA6Ay/HGN8DykIIXaFyWBjYAXiEygzilBBCw9SmtYd317vDF0IYGEKYGkKYesfIUd+zlf+bgoKWFC9clb0qLl5C64JWNeu0bknxwk8BWLGinC++WE7z5k1S9Rdz5ul/5prrTqdDh8K0xJwpHxd/RrstVx2btm1a8VHJZzXqLCz5jGNOHc5e/S/i8usfBuDzZcsrt0/VnTNvES9NeZtdduyYnsDTrKCgJQuLP61aLi5ZTEFqeG2l1gUtWVj7nGrRJK1xauPRuqBVje+pkuLVz6mC
1q1YmKqz+vfUp5xx+nVce92ZWf89JVWXmE5fag5eObCIyg5ec2B6CGEOsA/fPcT7M6AF8GFqm47Vt4kxfhxjvDvGeDiwAuj6Q2OOMY6MMXaPMXY/ZeCAH7q79dK1Wyfmzl3IggUllJaW8fTYSfQuqjn02LuoO6NH/wuoHMbds2dXQggsW/YVp516Deecexy77bZ9WuLNpKlvvU+nrQr5UfstaNAgl58euhdjnnutRp1WLZqwckbB+b87nPsefhGA5s02Iz8/r6rOXt235Z1ZH6U1/nTp1q0zc+csZMH8ynNq7JiXKSrqUaNOUVEPRj/2AgDjxv2bnj27ERLwzCz9b7rV+p4aO/ZlehftUaNO76I9eHz0ynNqctU5tWzZV/zm1Ks459xfsdtuO2QifNVXOSF9rwxJxPBuCGELKm+auCXGGEMIxwInxxgfTJVvRmVnrlGMcfladnMscFCMcXJqm62A54E/hBAOAsbHGMtCCIVAK+AjYKPr+eTl5fKHS0/ilJOuoqKigiOP7k3nzu25+aaH2LHrNhQV7cHRA4q4YNDN9O1zOs2bNWbIsLMBeOD+Z5g3r5gRIx5hxIhHALjzrktp1apZJptUZ8rLKzj70nt58m8XkZubw30Pv8g77y3g0nMG8Pr0Dxnz3Gv8ZK8dGHzBMcQIL7/yDr+/9B4Atu+0JTdfczIVFZGcnMCQEU/wbpZ2+vLycrn0slM46eQrqSgv5+ijD6Bz5w7cdOMDdO3aiaL9ezBgwAEMOv8G+hz4G5o1a8Kw4edWbV9UdApfffk1ZWUrGP/8K9x19xV06tQ+gy1SpuXl5XLJpSdz8kmDqaio4Kij9688p256kK5dt6GoqAcDBuzPBYNupG+f39KsWWOGDjsHgPvvH8u8ecXcNuIf3DbiHwDceddltGrVPJNNktIixBgzHUOdCCGUA9OBBlRm3v4GDAM2ofKmi46pod2V9R8FHo4xPpxangN0jzF+GkLoCEwC2sVqByyE8DpwGvBz4GDgm1TRn2OMfw8h9KJyzuCH1UL7U4xxVGr7K4AvY4zf+byO8jgtO/+jNqC8nJ3ZpP0x311RfDP/ISri25kOo97LCV3I1u/IDS2EQHnFjEyHUe/l5nT1nFoPIQRijGlNiW113pNp+4/5cMihGUn3ZW2mL8aYu5ai5UDL2itjjEfVWu5Y7f0coG2tTYgx7pZ6+wpwzhrKXwTWmuaKMV6xtjJJkqQNKTFz+iRJkpIsazN9kiRJ680/wyZJkqRsYKZPkiQp+xN9ZvokSZKSwEyfJElKvOicPkmSJGUDM32SJElm+iRJkpQNzPRJkiQFM32SJEnKAmb6JEmSEpAGS0ATJUmSZKZPkiTJOX2SJEnKBmb6JEmSfE6fJEmSsoGdPkmSpARweFeSJMnhXUmSJGUDM32SJCnxoo9skSRJUjYw0ydJkpSANFgCmihJkiQzfZIkSc7pkyRJUjYw0ydJkuRz+iRJkpQNzPRJkiSZ6ZMkSVI2MNMnSZKU/Yk+O30bi7ycnTMdwkbhm/kPZTqEjUZO6JLpEDYKIQGPcdhQcnO6ZjqEjYLnlDLFTt9G4r2lT2Y6hHpv2+aHUlr+eqbD2Cjk5+7GNkfcl+kw6r33R5/gObWe8nN3I8aY6TDqvRCCx2k9ZKJjHOvZnL4QwkHAjUAucGeM8do11PkZcAUQgbdijL9Y1z7t9EmSJNUjIYRc4FbgQGAB8GoI4YkY49vV6nQGLgL2jjF+FkJo/V379UYOSZKk+qUHMDvG+EGMsRR4CDi8Vp1TgFtjjJ8BxBgXfddO7fRJkiSFkLZXCGFgCGFqtdfAWtG0BeZXW16QWlfdtsC2IYRJIYQpqeHgdXJ4V5IkKY1ijCOBkT9wN3lAZ6AX0A54KYTQLca4dF0bSJIkJVv9upHjI6B9teV2qXXVLQBeiTGWAR+GEN6jshP46tp26vCuJElS/fIq0DmEsFUIIR84BniiVp3RVGb5CCFsTuVw7wfr2qmZPkmS
pHqU6IsxrgghnA6Mo/KRLXfHGGeGEAYDU2OMT6TK+oQQ3gbKgfNjjIvXtV87fZIkSfVMjHEsMLbWusuqvY/AOanXerHTJ0mSEi8nARPeEtBESZIkmemTJEmJl4Q/iWymT5IkKQHM9EmSpMQz0ydJkqSsYKZPkiQlXkhAqs9MnyRJUgKY6ZMkSYmXgESfmT5JkqQksNMnSZKUAA7vSpKkxHN4V5IkSVnBTJ8kSUq8kIA0WAKaKEmSJDN9kiQp8ZzTJ0mSpKxgpk+SJCVejpk+SZIkZQMzfZIkKfGc0ydJkqSsYKZPkiQlnpk+SZIkZQUzfZIkKfFCAlJ9dvq0mtcmv8vIoaOpqKigz+F78tMT9q9RPuP197lj+ON8OHshg/70S/bZf+eqsrtvepKpk96hIkZ27bEtA889IqsvpJcnvsm1V99HeUUFRw8o4uRTDq9RXlpaxkUX3Mrbb39I8+aNGTLsLNq2bc30abO54vI7AIgx8tvfDeCAA3tkoglp8ZNdt+SSk/YgNyfwj+dnc/ujM2qU/+HX3dmzWyEAmzbMo1WzTdjtlw8B8N9Rv+S/85YCsPCTrzj1mhfSG3yaeU5Jqit13ukLIUTg/hjjL1PLecBC4JUY4yGpdUcAg4EGwArg0hjj6FTZvcB+wDJgU2AKcHGMcUGqfA7wBVCe+siXYoxnprZ7KsY4ah2xbQo8AxTFGMvXVq+uhBC6AefGGE9M92evTXl5Bbdd/yh/uuVUWrVuxtkn3MCe++5Ih60Lq+psUdiC3192DI/+/cUa274z7UPemTaHmx84D4BBp9zC9NffZ6fdO6WzCWlTXl7Bn/54N3fc9QcKC1rx859dTO/eu7NNp3ZVdR4d9QJNmzXm6XE3MnbMvxk25AGGDv89nTq35+FHriYvL5dPFn3G0UdeQK/eu5OXl5vBFtWNnJzAFQP35IQrnqN48XIevb4/4/8zn9kLPq+qc9U9U6ve/6r/9nTZumXV8jel5Rx2zlNpjTlTPKekzPFv724YXwFdUx0sgAOBj1YWhhB2BoYAh8cYdwAOA4aEEHaqto/zY4w7A9sBbwATQgj51cp7xxh3Sb3O/B6x/R/waIY6fHkxxulAuxBCh3R//tq8N3Mebdq1orBtKxo0yOMnfXZlyksza9Qp2LIlW3XekpzVnmQZKC1dwYqycsrKVlC+opwWLZukL/g0mz5tNh06FNK+fQEN8vPo1//HTJgwtUadCROmcvjhPwGgT989eWXKTGKMbLppw6ofxt+WlmX1DOKdO7di7sIvmF/yJWUrKhjz8hwO6NF+rfUP3bcjT038MI0R1h+eU5LqUrr6tWOBg1PvjwUerFZ2HnB1jPFDgNS/1wDn195JrDQcKAb6bYC4jgMeBwgh9AohvBRCGDqnDzIAACAASURBVBNC+G8I4S8hVPb7Qwi3hRCmhhBmhhCuXLlxCGFOCOH6EML0EMJ/QgidUuu3CCH8M4Twauq1d2r9FSGEv4UQJgF/S+3mSeCYDdCWDWLxJ5+zRUHzquXNWzdj8Sefr2OLVXbYqSM77b4Nx/e/guP7XcluPbej/VYFdRVqxi1atITCwlZVywUFLVlUsqRmnZIlFLaprJOXl0vjJpuydOkXAEx7axaHH3IeRx5+PpddflLWZmQKWjZi4adfVS0XL15OQatGa6y75Rab0a51YyZPL65a1zA/l8f+3J9R1/ZbZ2cxG3hOSapL6er0PQQcE0LYBNgJeKVa2Y7Aa7XqT02tX5vXge2rLb8QQngz9Tp7fQJKZQq3jjHOqba6B3AG0AXYBjgqtf4PMcbuqdj3q5WF/DzG2A24Bbghte5GYHiMcQ/gaODOavW7AAfEGI+t1tZ91xLjwFRnc+pD9z6zPs3KqI/nf8r8OYu496nLuG/MZbw1dTYz3vgg02HVWzvt3JnHnxrCQ/+4mjvveJxvvy3NdEgZd8g+HXlm8jwqKmLVuv0G/pMjzx/L2cMncslJe9ChsHEGI6zfPKek/10I
6XtlSlo6fTHGaUBHKrN8YzfALmsfsurDu8PXcx+bA0trrftPjPGD1HDvg8A+qfU/CyG8TuXQ8o5UdtxWerDav3ul3h8A3BJCeBN4AmgaQlj5k+qJGOPX1bZfBGy5pgBjjCNjjN1jjN2POfGg9WzWD9Nqi2Z8UrLqsHy66HNabdFsvbad/OJ0tuv6IzZt1JBNGzWk+4+3593pc+oo0sxr3bolxcWLq5ZLSpbQuqBlzToFLSleWFlnxYpyvvzia5o3rznkvc02bWnUaBNmzZpf90FnQMmS5bTZfLOq5cJWjShZvHyNdQ/ZZyuerDW0W7Kk8nKZX/Ilr8wopstWLde0aVbwnJJUl9I5bfEJKufuPVhr/dvA7rXW7Q7MZO12Bd75gfF8DWxSa12svRxC2IrKIej9Y4w7AWNqbRfX8D4H6FmtI9o2xvhlquwratokFUu9sG2X9nw8/1OKP1pMWdkKXnr2Dfbcd11J11W2KGzOjNffp3xFOStWlDP99fezeni3a7dtmDe3mAULFlFWuoKnx/6b3r1rnsq9e+/O44+/BMCz415hz547EkJgwYJFrFhROZX0448+4cMPPqZt2y3S3oZ0mDZrMT9q04R2rRvTIC+Hg/fpyPhXV++MbN22KU0b5/PGfz+pWtd0s3zy8yq/plo0acju27dm9vz1m26wMfKckjInCZm+dD6y5W5gaYxxegihV7X1Q4BHQggTYoxzQggdgYuBAbV3ECqf/XEG0IbKu27/ZzHGz0IIuSGETWKM36RW90h18uYCPwdGAk2p7Kh9HkIooHIu4YvVdvVz4NrUv5NT655NxfnnVNy7xBjfXEso2wIz1lKWdrl5ufzm/KO47MyRVFREDjy0Bz/appC/3/4MnXdox54/6cp7b8/jqkH38uWyr/nPxLd5YOQ4Rjw8iL2Ldmba1Nn87hdDCCGwW8/t1rvDuDHKy8vl4kt+zaknX015RQVHHtWbTp3bc8tN/2DHrlvTu6g7Rw3ozUUX3Eq/vmfRrFlj/jy08j6j1197l7vueIK8BrnkhMAll/0fLVo0zXCL6kZ5ReTKO/7DPZcfQG5O4JHxs5k1/3POOnZnZsxezPhXFwCVWb4xL8+pse027Zrxp9N6UlERyckJ3P7ojBp3/WYbzylJdSnEWDu5tYE/IIQvY4yNa63rBZxX7ZEtRwFXUvnIljLg8hjjo6mye1n1yJZGVD6y5aJ1PLJlWozx+NR2h7IqizY/xrhy+HVlHHcBD8YYn0/FNDi1r07AC8BvY4wVqX39GJgPfE7lEO29qc9+mMqO4LfAsTHG2SGEzYFbgR2o7Fi/FGP8TQjhCuDLGOOQajHcAoyLMT65ruM46/On6vY/Kgts2/xQSstfz3QYG4X83N3Y5oj7Mh1Gvff+6BM8p9ZTfu5u1PXPk2wQQvA4rYfUcUprTmyX+yem7T/mzeP2zUi+r84zfbU7fKl1L1ItW5bq4D26lu1P/I79d/xftku5FTgbeD61vGxlR/R77OvPMcYLatX/lMrMX+39XFF9OYTQEOgO/H49YpUkSfqfJfovcsQYXw8hvBBCyNRzDToAF8YYV2To8yVJErDao2ezUKI7fQAxxrtTb1+k5ly99dm24w/87FnArB+yD0mSpPWR+E6fJElSEv6ITQL+0pwkSZLM9EmSpMQz0ydJkqSsYKZPkiQlXkjA7btm+iRJkhLATJ8kSUo85/RJkiQpK5jpkyRJiWemT5IkSVnBTp8kSVICOLwrSZISz+FdSZIkZQUzfZIkKfES8GxmM32SJElJYKZPkiQlnnP6JEmSlBXM9EmSpMQLCUiDJaCJkiRJWu9MXwihYYzx27oMRpIkKROc0weEEHqEEKYDs1LLO4cQbq7zyCRJkrTBrE+m7ybgEGA0QIzxrRBC7zqNSpIkKY1CAlJ96zOnLyfGOLfWuvK6CEaSJEl1Y30yffNDCD2AGELIBc4A3qvbsCRJktInAYm+9cr0nQacA3QASoCeqXWSJEna
SHxnpi/GuAg4Jg2xSJIkZUQSMn3f2ekLIdwBxNrrY4wD6yQiSZIkbXDrM6fv+WrvNwGOBObXTTiSJEmqC+szvPtw9eUQwt+Al+ssIkmSpDRzeHfNtgIKNnQgWrdtmx+a6RA2Cvm5u2U6hI3G+6NPyHQIGwXPqfWXhOecbQgeJ2XK+szp+4xVc/pygCXAhXUZlFa36OvHMx1Cvdd608OpiG9nOoyNQk7owpJvnsx0GPVey00OZZuj/pbpMDYK7z/6Kyriu5kOo97LCdsT42rT5FVLJjrGOQnoi6+z0xcqj/rOwEepVRXRs1WSJGmjs85OX4wxhhDGxhi7pisgSZKkdEtCpm99Hs78Zghh1zqPRJIkSXVmrZm+EEJejHEFsCvwagjhfeArIFCZBHR2syRJygo5Iftnr61rePc/wG7AYWmKRZIkSXVkXZ2+ABBjfD9NsUiSJGVEEub0ravTt0UI4Zy1FcYYh9VBPJIkSaoD6+r05QKNSWX8JEmSstX63Nm6sVtXp29hjHFw2iKRJElSnfnOOX2SJEnZLgl3764rm7l/2qKQJElSnVprpi/GuCSdgUiSJGVKEu7eTcK8RUmSpMSz0ydJkpQA67qRQ5IkKRGSkAVLQhslSZISz0yfJElKPG/kkCRJUlYw0ydJkhIvJPzhzJIkScoSZvokSVLiOadPkiRJWcFMnyRJSrwkZMGS0EZJkqTEM9MnSZISL8e7dyVJkpQNzPRJkqTE8+5dSZIkZQUzfZIkKfGSkAVLQhslSZISz06fJElSAji8q9VMmfQuN173BBUVFRxyZA9+dVJRjfI3X/uAm65/gvdnLeSK646j94E7AVD88WdcfPZ9VMQKVpRVMODYvTniZ3tloglpM/Gl17nqqjupqKhgwE8PZODAo2uUl5aWccGgG5g5832aN2/CsOHn0a5dAZ99toyzzryeGTNmc8SRRVx22cAMtSA9Jr/8LsOvG01FRQWHHbUnx5+0f43yN6a+z/DrH+f9WQv543W/pKjPzlVlv//NSGZMn8vOu27F0FtOTnfoafWTXdpwyf/tQW5O4B/jZ3P7YzNrlP/hxN3Zs2sBAJs2zKNVs03Y7fh/APDff/yC/85bCsDCT5dz6rUvpjX2dJv40mupa6+cAT/tw8CBA2qUV157w5k5czbNmzdl2PDzq1171zFjxqzUtfebDLVA9U0SbuSos05fCCEC98cYf5lazgMWAq/EGA9JrTsCGAw0AFYAl8YYR6fK7gX2A5YBmwJTgItjjAtS5XOAL4Dy1Ee+FGM8M7XdUzHGUeuIbVPgGaAoxli+tnp1LYRwOrA8xnh3pmKorby8gmFXP8bw2wfSuqAZJ//iJvbptSNbbVNQVaegsDkX//FnPHjfv2ps22qLJvzlb6eTn5/H8uXfcvzRQ9mnVxc2b90s3c1Ii/LycgYPvp2777mSgoJW/HTA+RQV9aBTp/ZVdUY98hxNmzbm2ef+wpgxExk65K8Mv+F8GjbM56yzfsGsWfN4b9a8DLai7pWXVzDk6ke5aeSptC5oxq+PvYF9e+3IVtsUVtUpaNOCS/90DA/c++Jq2x93Yi+++aaM0aMmpzHq9MvJCVxxSg9OGDye4sXLefS6fox/dQGzF3xeVeeqe1+rev+rftvRZasWVcvflJZz2Hlj0xpzpqy69ganrr1zU9deh6o6q669kYwZ8xJDh9zH8BsGpa6945g1ay7vzZqbwVZI6VeXw7tfAV1THSyAA4GPVhaGEHYGhgCHxxh3AA4DhoQQdqq2j/NjjDsD2wFvABNCCPnVynvHGHdJvc78HrH9H/BoJjt8KXcDZ2Q4hhremTGPdu03p227VjRokMcBB+3Cyy/WzDa0aduSTttuSU6tX4saNMgjP7/y94iy0hVUVGT3gy6nTZtFhx+1oX37QvLzG9D/4H0YP/6VGnXGT/gPRxzZG4C+fX/M5MnTiDHSqNEm7N69C/kNG2Qi9LR6e8Y82nVoVXVOHXjQrrz0Qs1z
asu2Lem87ZaENfyqvUfPbWm0WcN0hZsxO3dqxdziL5hf8iVlKyoY8/IcDtij3VrrH7pPR556eU76AqxHVr/29l3DtfcKRxxZOUrRt+/eTJ78Vq1rL39Nu1aC5YSYtlfG2ljH+x8LHJx6fyzwYLWy84CrY4wfAqT+vQY4v/ZOYqXhQDHQbwPEdRzw+MqFEMIFIYTpIYS3QgjXptbtEkKYEkKYFkJ4LITQIrX+xRDC8BDC1BDCOyGEPUIIj4YQZoUQ/pSq0zGE8G4I4f5UnVEhhEZraNdyYE4IoccGaNMG8cmiZbQubF61vEXrZnxS8vk6tqippHgpJwwYylF9r+K4X/fK2iwfQEnJEtoUbl61XFjQipKSJTXqLCpZQps2lXXy8nJp0qQRSz/7Iq1xZtonJZ/TumDVOdW6oBmfLFr/cyopClo2YuGny6uWi5csp6DVal8bAGy5xWa0K2jM5BklVesa5ufy2HX9GHVNXw7osfbOYjYoKVlc69rbnJKSxTXqLCpZXOva2yxx155UW113+h4CjgkhbALsBFT/VWxH4LVa9aem1q/N68D21ZZfCCG8mXqdvT4BpTKFW8cY56SW+wGHA3umsorXp6r+FbggxrgTMB24vNpuSmOM3YG/UNl5/B3QFTgxhNAqVWc7YEQqi7kM+O1aQpoK7LuWWAemOpdT/3rXuPVpXsYVFDbnvlHn8vCTF/DME6+xZLFfstKGdsjeP+KZyXNrZNP3+81jHHnB05x9wyQu+XV3OhQ0zmCE0sYnJ6TvlbE21uXOY4zTgI5UZvk2xGST2oeq+vDu8PXcx+bA0mrLBwD3pLJuxBiXhBCaAc1jjCsnrd0H/KTaNk+k/p0OzIwxLowxfgt8AKyc0DU/xjgp9f7vwD5riWcRsOWaCmKMI2OM3WOM3Y8/qe96Nu+H2aJ1UxYVrzo8nyz6nC0Kvn+2bvPWzdiqUyFvvf7hhgyvXikoaMnC4k+rlotLFlNQ0LJGndYFLVm4sLLOihXlfPHFcpq3aJLWODNti4JmLCpZdU4tKvmcLbI4A/y/KlmynDabr8rsFbZsRMni5Wuse8jeHXmy1tBuyZKvAZhf8iWvzCyhy1Yt17BldigoaFXr2vuUgoJWNeq0LmhV69r7KnHXnlRbOh7Z8gSVc/cerLX+bWD3Wut2B2aydrsC7/zAeL4GNvmB+/g29W9Ftfcrl1feHFN70H5tg/ibpGKqF7bfsT3z533KxwuWUFa2guefeZO99+uyXtsuKlnKt9+UAbBs2XKmvfEhHTpuUZfhZlS3bp2ZO2chC+aXUFpaxtgxL1NUVHOkvqioB6MfewGAceP+Tc+e3QghAbeIVbPDju2ZP/dTPl6wmLKyFTz3zBvs22tdCf1kmjZ7MT9q04R2rTejQV4OB+/TkfFTF6xWb+u2TWnaOJ83/ruq09N0s3zy8yq/zls0acju229R4waQbFN57X3MgvnFqWtvIkVFe9aoU3ntTQBg3LhJ9Oy5U+KuPX0/OWl8ZUo6HtlyN7A0xjg9hNCr2vohwCMhhAkxxjkhhI7AxcCA2jsIlVfqGUAbKu+6/Z/FGD8LIeSGEDaJMX4DPAdcFkK4P8a4PITQMpXt+yyEsG+McSLwK+Bf697zajqEEPaKMU4GfgG8vJZ62wKT1lKWdnl5uZxz0RGcc9odVFRUcPARPdi6UyF33jqO7Xdsxz69duSdGfO5+Oz7+GLZcib96x3uGvEsf3/sPOZ+sIhbhj4JIUCMHHvCfmzTuU2mm1Rn8vJyufSyUzjp5CupKC/n6KMPoHPnDtx04wN07dqJov17MGDAAQw6/wb6HPgbmjVrwrDh51ZtX1R0Cl99+TVlZSsY//wr3HX3FTXu/M0WeXm5nHfxUZx12kgqyiOHpM6pkbc+w/Zd2vGT3l15e8Y8Lvj9vXyx7Gte/tfb3HHbOB58bBAAp55wC3PnLOLr5d9y6AGD+cOVP6Pn3tt/x6dufMorIlfe+Sr3XLo/uTmBRya8
z6z5n3PWMTsxY/aSqg7gIXt3ZMykOTW23aZdU/506p5UxMqho9sfm5nVnb7Ka+9UTjr5CirKK6pde/enrr09GTDgQAadP4w+Bw5MXXurposXFZ3MV18ur3btXVnjzl+pPgghHATcCOQCd8YYr11LvaOBUcAeMcap69xnjHVzF0kI4csYY+Na63oB51V7ZMtRwJVUPrKlDLg8xvhoquxeVj2ypRGVj2y5aB2PbJkWYzw+td2hrMqezY8x1nhYXAjhLuDBGOPzqeULgeOBUmBsjPHiEMIuVM7Za0TlsO2vUx3GF1NtmLqG9rxI5Q0qn1LZOZ1KZfbybeBXqU7lYGBqjPGJ1DavAwfGGGvOQq7lk2+eyO5bYTeA1pseTkV8O9NhbBRyQheWfPNkpsOo91pucijbHPW3TIexUXj/0V9REd/NdBj1Xk7Ynrr6uZtNQgjEGNOamv3NpBfS9h/zl717r7NtIYRc4D0qn3yyAHgVODbGmj/kQghNgDFAPnD6d3X66izTV7vDl1r3IvBiteVHgUfXsv2J37H/jv/Ldim3AmcDz6e2uRao0YOOMb4J9FzD/ntVe/8iNdvTCyrv3gVWrHxGYa3tL1v5PoSwK5VzAtfZ4ZMkSYnSA5gdY/wAIITwEJU3ndbObPwRuI41PPlkTRL5Z9hijK9TeedvboZD2Ry4NMMxSJKUePXs7t22wPxqywtS66qEEHYD2scYx6xvGxP7Z9jq8q9gpB4H03U96j1XVzFIkqT6KYQwEKj+9zdHxhhHfo/tc4BhwInf53MT2+mTJElaKZ3Pz0t18NbVyfuIVY+AA2hHtb9qBjShMrn0Yuqu9ELgiRDCYeua15fI4V1JkqR67FWgcwhhq9QflTiGVc8IJsb4eYxx8xhjx9Q9DlOAdXb4wEyfJElSvcqCxRhXhBBOB8ZR+ciWu2OMM2s/AeT7stMnSZJUz8QYx1Lrr5lVfwJIrfW91mef9aljK0mSpDpipk+SJCVeTsj+h2ab6ZMkSUoAM32SJCnx0vnIlkwx0ydJkpQAZvokSVLiJSELloQ2SpIkJZ6ZPkmSlHjO6ZMkSVJWMNMnSZISL/icPkmSJGUDM32SJCnxnNMnSZKkrGCmT5IkJV4SsmBJaKMkSVLimemTJEmJl+Pdu5IkScoGdvokSZISwOFdSZKUeD6yRZIkSVnBTJ8kSUo8M32SJEnKCmb6JElS4uVmOoA0MNMnSZKUAGb6JElS4iXh4cwhxuxvZDYIIQFnoyRJKTHGtN5acfWbz6Xt5+zFuxyYkdtGzPRtJOycf7cQAku/fSbTYWwUmjc8iBUVb2U6jHovL2dnr731FEJgyx0vy3QY9d7HMwd7Tq2HENLfJ/LuXUmSJGUFM32SJCnxzPRJkiQpK5jpkyRJiZdrpk+SJEnZwEyfJElKPOf0SZIkKSvY6ZMkSUoAh3clSVLiJeHPsJnpkyRJSgAzfZIkKfG8kUOSJElZwUyfJElKvNxMB5AGZvokSZISwEyfJElKPOf0SZIkKSuY6ZMkSYnnc/okSZKUFcz0SZKkxMt1Tp8kSZKygZk+SZKUeN69K0mSpKxgpk+SJCWemT5JkiRlBTt9kiRJCeDwriRJSjyHdyVJkpQVzPRJkqTEy/XPsEmSJCkbmOmTJEmJl4QsWBLaKEmSlHhm+iRJUuIl4e5dO33SDzD55bcZet2jVJRXcPhRe3HCyQfWKH996myGX/8os9/7mD9dfwL799m1quypx1/hnpHPAvDrgX045PA90xp7Ok2c+AbXXHUP5RUVDBiwP6cMPLJGeWlpGRdecDMzZ35A8+ZNGDbsbNq2a82/J73FsKH3U1a2ggYN8jhv0K/o2bNbhlqh+qTX3p0YfGE/cnIDD/7zdW696+Ua5VsWNuPGq4+kaZNNyMkNXDP8eSZMnEVeXg5Drjycrju0IS8vh1FPvMUtd07MUCuk9Ers8G4IIYYQ/l5tOS+E8EkI4anU8omp5TervXau9n5JCOHD1Pvn
QwgdQwhfp5bfDiH8NYTQILWvXiv3m1ruF0KYmqr3RghhaPqPgH6o8vIKrr/qEW4c8Rsefvxixj39Gh+8v7BGncI2Lbjsj8fRp//uNdZ//vlX3HnbM9z9wDnc88C53HnbMyz7fHk6w0+b8vJy/jT4Lm6/4w88+dRwxo6ZxOzZ82vU+eeoCTRt2phxz97CCSccwtChlZdm8xZNGXHbhTz+5DCuufZ0Lhx0cyaaoHomJydw1SUH88vT/k7vw27liP7d6Lz1FjXqnHXqT3hy3Ez6/vQv/Pa8UVx9ycEAHNJnR/LzczngqBEc9LPb+eVPd6fdls0z0QzVMzkhfa+MtTFzH51xXwFdQwibppYPBD6qVefhGOMu1V5vrXwPPAGcn1o+IFX//VRZN6Ad8LPaHxpC6ArcAvwyxtgF6A7M3vDNU12bOX0u7TpsQdv2m9OgQR59+u3GSy9Mr1Fny7at6LxdW3JCzat8yqR32XOv7WjWbDOaNmvEnnttx+RJ76Qz/LSZPm02HToU0r59Afn5DejXf28mjJ9ao86E8a9yxBH7AdCnb0+mTJ5BjJEuXbaidUFLADp1bs8335ZSWlqW9jaoftm1W1vmzFvCvAWfUbainMefnkHfou1rVorQeLOGADRt0pCST76oXB0jjTbNJzc3h00b5lFWVs6XX36b7iZIGZHkTh/AWODg1PtjgQc3xE5jjOXAf4C2aygeBFwVY3x3Zd0Y420b4nOVXp8sWkpB4aoMQeuC5nxS8vl6b9u6sEXNbRct3eAx1gclJUsobNOqarmwsCWLShbXrLNoCYVtNgcgLy+XJk0asXTpFzXqPDtuCl26bE1+foO6D1r1WmHrpnxcvOpaW1jyOYWtm9SoM3TECxx1yE5Mff4c/jril1xy9VgAxjz3Nsu/LuWNF87jP8+dw1/u/TdLl32d1vhVP+WGmLZXpiS90/cQcEwIYRNgJ+CVWuU/rzW8u+nqu1hdan97As+sobgr8Np67mdgahh46siRI9dnEykrzZo1n2FD7+eKKwdmOhRtJI7o341HHn+T7gcM4/jf/p2brjmKEAK7dGtLeXlkt6Ih9DzoBk494cd0aNfiu3coZYFEd/pijNOAjlRm+cauoUrt4d3v+nVwmxDCm0AJsDC1/x8S38gYY/cYY/eBA/1hV99s0bo5JcWrsnOLSpayRUGz9d52UfFnNbdtnZ3zigoKWlK8cFVmr7h4Ca0LWtWs07olxQs/BWDFinK++GI5zZs3SdVfzJmn/5lrrjudDh0K0xe46q3iRcvYsnDVtdamoBnFi2pmho85ajeeHDcDgNfeWkDD/DxatmjEkf134sVJs1ixooLFS77i1TfnsfOOW6Y1ftVPzulLhieAIWyYod2Vc/q2AXYPIRy2hjozgd3XsF4bmS5dOzB/7id8tGAxZWUrePbp19m31/rdWdpz7+2ZMvldln2+nGWfL2fK5Hfpuff2373hRqhrt07MnbuQBQtKKC0t4+mxk+hd1L1Gnd5F3Rk9+l9A5TDunj27EkJg2bKvOO3Uazjn3OPYbbfsPD76/t6c8TFbdWhJ+7bNaZCXy+H/396dR1lWV3cbf77drQwi4JRBFJEhUegAKqIGg4oTJOAUo5BBUd+AURM0USFqomIcIiyjaFBIRNS8gq9GAUHEASGgGJlHAREQRYxGxQFQpHu/f5xT9u2ihttYfc+pOs+n113rTHVr39NVt/bdv2mvlXz2i1eudc2NN/2Yxz5qawC23fq+bLDBCn7ww1u48aYfs9uuzfGNNrobD9/xAVxz3f9O/DVIXXDKFjgGuLmqLk3y+IV4wqr63ySHAH9Pk1SOOgz4RJKzq+rqJMuAA6rqfQvxvTU5K1Ys51WveTZ/8+IjWb1qNfs889Fss+1vc9R7TuGhO2zJ7k/4Pa647Ju8+qB/5yc/vY2zzryMo488lY+e8Bo22+wevOjAp7L/focD8H8O3JPNNrtHx69o/VixYjmv/YcX8ZcvejOrV6/m
mX/8BLbb7oG8+4jj2WHlNuyxxyP542fvwcGvfjdPfcrL2HyzTTj8Ha8A4CP/9zPccMN3OfLIj3HkkR8D4N/f/w/c5z7jVVS1NK1atZrXveXTfOSov2DZ8mV89JMXcvU3vs8rX/oELr78O3zujKs49LDTOOyNT+Mvn/cYqopXvO4EAI497qv8yz89g9NPeCkJfPSEi/ja1f/T8StSHwxhnr5ULf0FhmeS5GdVtcm0Y48HXllVeyfZnyZBGx3R+5Kq+nJ77bHAyVX18XZ/q3Z/Zbsf4CLgZcDyqedtz+0NvBHYGKj26149T8jD/I9aB0m4+RczdaPUdJtvJZXfegAAGnVJREFUsCd3rL646zB6b8WynRjqe+S6SsL9d/jHrsPove9cfqg/U2NIQlVNNA371A2nTuw/Zp8t9+okxRxspW96wtceOwM4o90+Fjh2jq/ff9r+9TSDNKb2C9hp5JIzRs6dDJyMJEnShAw26ZMkSZoyhOZdB3JIkiQNgJU+SZI0eMut9EmSJGkpsNInSZIGb1mHy6NNipU+SZKkAbDSJ0mSBm8IVbAhvEZJkqTBs9InSZIGz3n6JEmStCRY6ZMkSYPnPH2SJElaEqz0SZKkwXOePkmSJC0JVvokSdLgOXpXkiRJS4KVPkmSNHhW+iRJkrQkmPRJkiQNgM27kiRp8IZQBRvCa5QkSRo8K32SJGnw4kAOSZIkLQVW+iRJ0uANoNBnpU+SJGkIrPRJkqTBs0+fJEmSlgQrfZIkafCGUAUbwmuUJEkaPCt9kiRp8JLqOoT1zkqfJEnSAKRq6We2S0GG8BFEkqRWVU10PO1FPzh5Yn9nd77P3p2MFbZ5d5EwOZ9fEu/TmLxX4/E+jc97NR7v03gyhPlTOmDSJ0mSBm8IeaZ9+iRJkgbApE+SJGkAbN6VJEmDN4DWXSt9kiRJfZNkzyRXJbkmySEznP/bJFckuSTJF5I8aL7nNOmTJEmDtyyTe8wnyXLgX4G9gO2B/ZJsP+2yC4FdqmpH4OPA2+d9jet6UyRJkrRe7QpcU1XXVtXtwPHA00cvqKovVtWt7e5XgAfM96QmfZIkafAyyUdyQJLzRh4HTAtnC+BbI/vfbo/N5kXAqfO9RgdySJIkTVBVHQ0cvRDPleTPgV2Ax813rUmfJEkavJ5Nznwj8MCR/Qe0x9aS5EnAa4HHVdUv5ntSm3clSZL65VxguyQPTnJ3YF/gpNELkjwMOAp4WlV9b5wntdInSZIGr0+Fvqq6I8nLgNOA5cAxVXV5kkOB86rqJOAwYBPgY+1axTdU1dPmet648POi4X/UPFzIfHzeq/F4n8bnvRqP92k87X2aaB72tZtPnth/zEM337uTHNNKnyRJGrw+VfrWF/v0SZIkDYCVPkmSNHjjrJSx2FnpkyRJGgArfZIkafAGUOiz0idJkjQEJn2SJEkDYPOuJEkavGTpz59opU+SJGkArPRJkqTBcyCHJEmSlgQrfZIkafAygFKflT5JkqQBsNInSZIGbwhVsCG8RkmSpMGz0idJkgbPPn2SJElaEqz0SZKkwRtAoc9KnyRJ0hCY9I1IsirJRSOPQ9rjZyS5IVnT4p/khCQ/a7e3SnJb+zVXJHlfkp1GnueHSa5rtz+f5MokvzfyXK9KctTkX7EkSYKmT9+kHl2xeXdtt1XVzrOcuxnYDTg7yebAb087/42q2jnJCuB0YJup50pyLHByVX283d8TODLJ7sD9gRcDuyz4q5EkSWpZ6Rvf8cC+7fazgE/MdFFV3QF8Gdh2tieqqs8ANwHPA/4FeENV/WhBo5UkSWPLBB9dMelb20bTmnefO3LuC8DuSZbTJH8fnekJkmwMPBG4dJ7v9XLgzcD9qurDszzXAUnOS3Le0Ucfvc4vRpIkaYrNu2ubq3l3FXA2TcK3UVVdn7Ub5rdJchFQwIlVdepc36iqvpPkdODkOa45GpjK9mrM1yBJknQnJn3r5njgk8AbZjj3jTkSxtmsbh+SJKlD
ywYwZ4vNu+vmLOCtwHFdByJJkrQurPStbaO2iXbKZ6rqkKmdqirg8MmHJUmS1qcBFPpIk8doEfA/ah5J8Od5PN6r8Xifxue9Go/3aTztfZpoHnbTrZ+a2H/Mb2+8Tyc5ppU+SZI0eMnST8bt0ydJkjQAVvokSdLgDaFPn5U+SZKkAbDSJ0mSBi8DKPVZ6ZMkSRoAK32SJGnwBlDos9InSZI0BFb6JEnS4A2hCjaE1yhJkjR4VvokSdLgOXpXkiRJS4JJnyRJ0gDYvCtJkjSASVus9EmSJA2AlT5JkjR4sdInSZKkpcBKnyRJGrxk6dfBlv4rlCRJkpU+SZIkR+9KkiRpSbDSJ0mSBs/Ru5IkSVoSrPRJkiRZ6ZMkSdJSYKVPkiQNnvP0SZIkaUmw0idJkmSfPkmSJC0FJn2SJEkDYPOuJEkaPCdnVp+kb48kB3Ydw+ijqjqPYTHcJ++V92ko98r7tHjvVXuftMBM+vTrOKDrABYJ79P4vFfj8T6Nx/s0vsHfq0zwX1dM+iRJkgbAPn2SJEkDqIMt/Veo9enorgNYJLxP4/Nejcf7NB7v0/i8VwOQquo6BkmSpE7dcseZE0uI7rHicZ107LPSJ0mSNAD26ZMkSepwVO2kWOmTJEkaACt90gJKsqKq7ug6DknSuhnCihwmfdLC+irw8K6DWAySPADYF/gD4P7AbcBlwCnAqVW1usPweiXJLtz5Pn2uqn7UaWBalJK8Fzi4qn7SdSyaLJt3NZYky5Pcd2T/7kkOSPK1LuPqoaX/UXEBJPkAcAxwO/DPwH7AS4DPA3sCZyfZvbsI+yHJC5JcAPw9sBFwFfA94LHA55N8MMmWXcbYF0kOa5cSm378wCRv6yKmHrsWOD/Jn3YdSL8sm+CjG07Zonkl2Rc4CrgF+DrwZpo/2OcCb6qqCzoMr1eSfBt4x2znq2rWc0OSZGVVXTbH+bsDW1bVNRMMq3eSvBQ4pqpum+X8zsB9quoLk42sf5KcD+xS0/6oJVkGXFJVK7uJrJ+SbEHzXnVf4L3AryrrVfWJruLq0q13fGliCdHGK3brpEBg867G8TrgEVV1TZKHA+cAz66qT3UcVx8tBzbBit+cquqyJMuBD1XVn81w/nZg0AkfQFX96zznL5pULIvABtMTPoCqWp3E38dpqurGJKfQfIjfhzVJXwGDTPrs0yc1bp+quFTVBUm+bsI3q5uq6tCug1gMqmpVkgcluXub5GkdJNm7qk7uOo4euS3JdlX19dGDSbaj6QepVpIdaKp73wF2raqbOg5JE2LSp3H8RpK/HdnffHTfJsu1LP2PigvrWuBLSU6i6T4A+DM1pkcCJn1r/CNwapJ/As5vj+1C0x/y5Z1F1U8fA15eVZ/tOhBNlkmfxvFvwD3n2Ncab53aSPLgqrpuZP9ZQ+0rM4dvtI9l+DO1Tqrq9V3H0CdVdWqSZwCvAv66PXw58MdVdWl3kfXSjSZ8dzaEXgAO5JAWUJILqurh07dn2tcaSTauqlu7jqOvkmxIM7r5sTR9rs4G3ltVP+80MC1KvhfN7OerzplYQrTh8sc4kEP9lOT/VdVz2u1/rqqDR859tqqe0l10vZNZtmfaH7wkjwHeTzP4ZcskOwEHVtVLuo2sdz4E/BR4d7v/p8CHgT/pLKKeSfIpmoR4RlX1tAmG03ebJ3nWbCeH2yKx9N+iTfo0ju1Gtp8MHDyyf78Jx9J3Ncv2TPuCdwJPBU4CqKqLnZ9vRiuravuR/S8muaKzaPrp8K4DWEQ2A/Zm5ixnsKN3h8CkT+OYK1kxkVnb1u2ghIxs0+4/uLuw+quqvjWtL82qrmLpsQuSPLqqvgKQ5FHAeR3H1CtVdeZs55LsNslYFoFvVtULuw6ibzKA9SpM+jSOjZM8jKaz/UbtdtrHRp1G1j9PH9meXnmwEnFn30ry+0AluRtwEOAqL3f2CODLSW5o97cErkpyKVBVtWN3ofVDO+/jc4AtgM+0c0HuDbyG5n3qYV3G
1zNLvx1TM3Igh+aV5Azm7ivzhMlFs7i0icxKmtFy3+s6nr5pl/Z7F/Akmg8VpwEHVdUPOg2sZ5I8aK7zVfXNScXSV0mOBR5Is/71o2jmoNsFOKSqTugwtN5JsiOwNbAtcGlVndZxSL3wi1XnTiwh2mD5IztJvE36pAWU5H3Au6vq8iSb0axesgq4N/DKqjqu0wC1aCW5F01S86sWGpdAXCPJZcCO7QocGwLfBbbxA8SdJTkS2AH4MvBE4FNV9aZuo+reEJI+m3c1r7lGecGQR3rN6A+q6sXt9guAq6vqGUl+CzgVMOkbkWRrmkrfo2mqyecAr6iqazsNrGeSvAnYn2ZOw6k/TAXs0VVMPXR7Va0GqKqfJ7nWhG9WuwM7tavibAycBQw+6RvCPH0mfRrHPtO2R5dgc6TX2kaXE3syzcz3VNV3h/CGchd8BPhX4Jnt/r40ifGjOouon55DU7VyubrZPSTJJe12gG3a/WC/x+lur6pVAFV1q2sTD4dJn+ZVVS+Y2k5y4ei+7uTmtvP4jcBuwIsAkqzAQS8z2biqPjyy/x9JXtVZNP11GbA5YL/Q2T206wAWERPkGS393NekT+vKTqBzOxA4AvgtmrUtv9sefyJwSmdR9depSQ4Bjqf52Xou8Okk9waoqh92GVyPvBW4sO239oupg044vJZ/c6L4sZkgD5QDObROXL7nrkvyyKo6t+s4+iTJdXOcrqraemLB9FiSy4GjgEuB1VPH55qbbmjaVginZdFd9svVF04sIbrbsoc5kEP9NG15o9EJhwGrDXNJsj2wX/u4mWYKCbWqygmrx3NrVR3RdRA9t5lLi40nyU9Zu9Um7f5U8+6mnQSm9c5Kn+aV5HFznbfasLYkW7Em0fsl8CBgl6q6vruo+ivJSmB7YMOpY1X1oe4i6p8k76Bp1j2JtZt3nbKlleQHwInMsrSYK1CskeQEmi4onwCOr6ob5vmSQfjl6osmWOnb2Xn61E9Jjq2q/buOYzFIcg6wKU0fteOr6utJrrOiNbMkrwceT5P0fRrYCzi7qp7dZVx9k+SLMxyuqnLKlpZdT9ZNO4/os2hGzG8IfJTmPWuw/WiHkPTZvKtxDHQk113yPzTLQP0mcD/g6zj4ZS7PBnYCLqyqFyT5TeA/Oo6pd1z1ZixLf+jlAqqqHwMfSPJBmsTvCJrk7x2dBqb1yqRP45hae3fGN1WbmNZoJ2Ke+gT9hiTbAZsn2bWqvtpxeH10W7uCwh1JNqWZkuSBXQfVN+3P1OtpJtUFOBM4tP3DrcbzpzaSbFBVvxjZf3RVfaWbsPqpXfN6P+APgLOBZ1bVWd1G1a0M4HODzbuaV9vp91xm7ytjE9MskvwGzcS6+wFbVpUJzYh2OajX0FQa/g74GXCRc0GuLcl/0szV98H20F/QrKgw52o5QzLavDu9qdem37UluZ5mYNnxwOnAHaPnh/pB/o7VF08sIVqxbCf79KmfnAphfEneUlWvmeXcg6rqm5OOabFoB8BsWlWXzHPp4CS5qKp2nu/YkI2+T01/z/I9bG1JzmD2bieD/SC/qi6ZWEK0PDvap0/91i5ivm27e01V/bzLeHpqT5rK1Z2Y8M2snWbjsTR/hM4GTPru7LYkj62qswGS7Abc1nFMfVOzbM+0P2hV9fiuY1A3TPo0joOT/DPNkmLfpGnmfWCSDwCvrapfdhpdvyxPci9m7/842JFxM2mbd7elWW8X4MAkT6qql3YYVh/9FfDBtm8fwI+A/bsLp5cekOQImt+9qW3a/S26C6t/kry6qt7ebv9JVX1s5NysrRVL37KuA1jvbN7VvJK8E9gEeEVV/bQ9tilwOE1H/IO6jK9PkvyCZt3d2fo/usLEiCRXAg+t9o0oyTLg8qpymagZtL93VNVPuo6lb5I8f67zVfXBuc4Pif0fZ7aqLptg8+5Km3fVW38E/E6NfEKoqp8k+SvgSsCkb40r7Du0Tq4BtqSpIEMzcvea7sLppyRvAd5eVTe3
+/cC/q6qXtdtZP1hUrdOMsv2TPuDMYTRuyZ9GkfVDCXhqlqVxFKxfh33BL6W5Ks0/a52Bc6bWurPJf5+Za/RJreq+lGSPwRM+lrTl4eczp+ltdj/caBM+jSOK5I8b/rSWEn+nKbSpzXeNdPBdhDMPqN9ZwTAP3YdwCKxfHTuuSQbARt0HFPfPAb4Fk3/0P9mwBWrMeyU5Cc092ijdpt2f8PZv2ypW/o/Mvbp07ySbEGzRuNtwPnt4V2AjWgm9Lyxq9j6LMly4Kk0c/Q9BTjL5cUaSTJT9XhdrxmKJAcD+wAfaA+9ADhpqjO+fvX79mSa37cdgVOA46rq8k4D06Kxuq6Y2PvNsmzvPH3qtyR7ADu0u1dU1Re6jKevkjwO+FPgD4GvArsBW1fVrZ0G1iPtPGH/CZw4uth7krvTTN/yfOCLVXVsJwH2UJI9gSe1u5+rqtO6jKfPkmxAk/wdBryxqt7TcUhaBIqvTSwhCg816ZMWuyTfBm4A3gucUFU/TXJdVT2449B6pW3ufiHwZ8CDaVYH2IhmzoTPAkdW1YXdRdgPVkTXTZvs/RFNwrcVcBJwjK0RGodJn6R10k5v8wyaJbM+ApwIXOpULbNLcjfgvjTT/9zcdTx9YkV0fEk+BKwEPg0cX1WXdRySFpniqgkmfb9r0ictBUkCPJ6m2vCHwGY0E1t/uqp+1mFoWmSsiI4vyWrglnZ39A9baGYg2HTyUWkx6VvS13bpeBewHPj3qnrbtPMbAB8CHgH8AHhuVV0/53Oa9EnrT1vF2hPYF3hqVd2345C0SFkRlda3qyeYEP3OnElfOzDpaprBSd8GzgX2q6orRq55CbBjVb04yb40AyufO9fzLv01R6SOJLkfsHlVfaqq/oxm4mHpLqmqX1bVTSZ80iDsSrPG/bVVdTtwPPD0adc8HZialPzjwBPblqZZOU+ftIDaX7jXAy+jKcmT5A7g3VV1aJexLRZJvlRVu3Udh6Shmbv6tpCSHAAcMHLo6Ko6emR/C5p5J6d8G3jUtKf51TVVdUeSHwP3Af53tu9r0ictrFfQTNHyyKq6DiDJ1sB7k7yiqv6l0+gWhy27DkCS1qc2wTt63gsXmM270sL6C5p+F9dNHaiqa4E/B57XWVSLix2Nx5DkS13HIGm9uZG1uwQ9oD024zVJVtAMGvzBXE9qpU9aWHerqjuV1qvq+21HfAFJnjXbKZrRqZqfFVFp6ToX2C7Jg2mSu31pJv0fdRLNtE3nAM8GTp9vzk6TPmlh3X4Xzw3NPnOcO3liUSxuVkSlJarto/cy4DSa/uHHVNXlSQ4Fzquqk4D3Ax9Ocg3wQ5rEcE5O2SItoCSrWDNX2FqngA2rymqfxjZPRfR9VXW/ScYjaXGz0ictoKpa3nUMi0GSd1bVy9vtg6rqXSPnjq2q/TsLrl+siEpaMFb6JE1ckguq6uHTt2falyQtDEfvSupCZtnWiHYt56ntg6adO3biAUla1Ez6JHVhWZJ7JbnPyPa9k9ybdlJrAbD7yPbzp53bcZKBSFr87NMnqQubAeezpsp3wcg5+5ysYUVU0oIx6ZM0cVW1VdcxLBLLktyLplVmansq+bMiKmmdOJBDUifaGeT3Ah7SHroCOK2q7uguqn5Jcj2wmpmrfFVVW082IkmLmUmfpIlLsgVwOnATcCFNUvMw4LeAJ1TVdzoMT5KWJJM+SRPXjjy9qKreOe343wCPqKrpgxYGy4qopIVi0idp4pJcWVUPmeXcVVX1u5OOqY+siEpaSA7kkNSF2+Y4d+vEoui/NwPvnaUi+lbuPI2LJM3KpE9SFzabZV3ZAJtOOpgee/RMS9JV1RFJruogHkmLmEmfpC6cyezryv7XJAPpOSuikhaMSZ+kiauqF3QdwyJhRVTSgjHpk9SJJCuBVwE7tIcuBw6vqku7i6p3rIhKWjCO3pU0cUmeDhxOMxjhvPbwLsDfA6+sqhO7ik2SliqTPkkTl+Ri4OlVdf2041sBJ1bVTh2E
1UtWRCUtlGVdByBpkFZMT/gA2mN3m3g0PdVWRD9J08z7wvZxJvCJ9pwkjc0+fZK6cEeSLavqhtGDSR4EuNLEGocCT56WIF+S5HTgxPYhSWMx6ZPUhdcDn0/yFuD89tguwCHAwZ1F1T+zVkSTWBGVtE5M+iRNXFWdkOQ64O+Av24PXwE8p6ou7i6y3rEiKmnBOJBDknoqyTOAtwMzVkSr6oSuYpO0+Jj0SZq4JCfNdb6qnjapWPouyU40FdGp0btX0IzetSIqaZ2Y9EmauCTfB74FHAf8N80KE79SVWd2EZckLWUmfZImLsly4MnAfsCOwCnAcVV1eaeB9YwVUUkLyaRPUqeSbECT/B0GvLGq3tNxSL1hRVTSQjLpk9SJNtn7I5qEbyvgJOCYqrqxy7j6xIqopIVk0idp4pJ8CFgJfBo4vqou6zik3rMiKunXZdInaeKSrAZuaXdH34QCVFVtOvmo+smKqKSFYtInST1lRVTSQjLpk6SesiIqaSGZ9EmSJA3Asq4DkCRJ0vpn0idJkjQAJn2SJEkDYNInadFLsirJRUkuS/KxJBv/Gs/1+CQnt9tPS3LIHNdunuQld+F7vCHJK+9qjJJ0V5j0SVoKbquqnatqJXA78OLRk2ms8/tdVZ1UVW+b45LNgXVO+iSpCyZ9kpaas4Btk2yV5Kp2rrvLgAcmeUqSc5Jc0FYENwFIsmeSK5NcADxr6omS7J/kPe32byb5ZJKL28fvA28DtmmrjIe1170qyblJLknyxpHnem2Sq5OcDfzuxO6GJLVWdB2AJC2UJCuAvYDPtIe2A55fVV9Jcl/gdcCTquqWJAcDf5vk7cC/AXsA1wAfneXpjwDOrKpntmvibgIcAqysqp3b7/+U9nvuSjOX3klJdqeZa29fYGea990LgPMX9tVL0txM+iQtBRsluajdPgt4P3B/4JtV9ZX2+KOB7YEvJQG4O3AO8BDguqr6OkCS/wAOmOF77AE8D6CqVgE/TnKvadc8pX1c2O5vQpME3hP4ZFXd2n6Pk36tVytJd4FJn6Sl4LapatuUNrG7ZfQQ8Lmq2m/adWt93a8pwFur6qhp3+PlC/g9JOkusU+fpKH4CrBbkm0Bktwjye8AVwJbJdmmvW6/Wb7+C8BftV+7PMlmwE9pqnhTTgNeONJXcIskvwH8F/CMJBsluSewzwK/Nkmal0mfpEGoqu8D+wPHJbmEtmm3qn5O05x7SjuQ43uzPMVBwBOSXErTH2/7qvoBTXPxZUkOq6rPAh8Bzmmv+zhwz6q6gKav4MXAqcC56+2FStIsXHtXkiRpAKz0SZIkDYBJnyRJ0gCY9EmSJA2ASZ8kSdIAmPRJkiQNgEmfJEnSAJj0SZIkDcD/Bz3mOGOUdNEAAAAAAElFTkSuQmCC\n", + "text/plain": [ + "<Figure size 720x720 with 2 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "experiment.show_results(full_cm=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } 
+ }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/training/table-type-classifier.ipynb b/notebooks/training/table-type-classifier.ipynb new file mode 100644 index 0000000..ed81da0 --- /dev/null +++ b/notebooks/training/table-type-classifier.ipynb @@ -0,0 +1,429 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Table Type Classifier Training \n", + "\n", + "This notebook shows how to train a table type classifier on the **SegmentedTables** dataset. You can download the model weights at https://github.com/paperswithcode/axcell/releases/download/v1.0/models.tar.xz." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.helpers.datasets import read_tables_annotations\n", + "from pathlib import Path\n", + "\n", + "V1_URL = 'https://github.com/paperswithcode/axcell/releases/download/v1.0/'\n", + "SEGMENTED_TABLES_URL = V1_URL + 'segmented-tables.json.xz'\n", + "\n", + "segmented_tables_annotations = read_tables_annotations(SEGMENTED_TABLES_URL)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.data.paper_collection import PaperCollection\n", + "\n", + "SEGMENTED_TABLES_PAPERS = Path('/mnt/efs/pwc/data/arxiv/sources/segmented-tables/papers')\n", + "pc = PaperCollection.from_files(SEGMENTED_TABLES_PAPERS, annotations=segmented_tables_annotations.to_dict(orient='record'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We convert papers and annotations into a dataframe with features: table caption, content of table headers, and headlines of sections referencing the table. We later specify in ULMFiTTableTypeExperiment which of these featues to use." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "from collections import OrderedDict\n", + "from fastai.text import *\n", + "\n", + "anchor_re = re.compile(r'^xxanchor-\\S* ')\n", + "\n", + "# find headlines (possibly with duplicates) of sections\n", + "# referencing a given table in paper's text\n", + "def find_sections(paper, table):\n", + " anchor = table.figure_id\n", + " if anchor is None:\n", + " return []\n", + " anchor = 'xxref-' + anchor.replace('.', '')\n", + " sections = [anchor_re.sub('', fragment.header) for fragment in paper.text.fragments if anchor in fragment.text]\n", + " sections = [section for section in sections if section]\n", + " return sections\n", + "\n", + "\n", + "def get_tabletype_features(papers, deduplicate=True):\n", + " records = []\n", + " dedup_fn = (lambda x: x) if not deduplicate else lambda x: list(OrderedDict.fromkeys(x))\n", + " for paper in sorted(papers, key=lambda p: p.paper_id):\n", + " for table in paper.tables:\n", + " tags = table.gold_tags.split()\n", + " record = dict(\n", + " id=f'{paper.paper_id}/{table.name}',\n", + " fold=paper.gold_tags,\n", + " caption=table.caption or '',\n", + " sota='leaderboard' in tags,\n", + " ablation='ablation' in tags,\n", + " sections=' ; '.join(dedup_fn(find_sections(paper, table))),\n", + " row0=' ; '.join(dedup_fn(table.matrix.iloc[:,0])),\n", + " col0=' ; '.join(dedup_fn(table.matrix.iloc[0,:]))\n", + " )\n", + " records.append(record)\n", + " return pd.DataFrame(records)\n", + "\n", + "\n", + "# make sure the training dataframe is batch_size aligned\n", + "def align_df(df, batch_size):\n", + " aligned_len = ( len(df) // batch_size ) * batch_size\n", + " return df.iloc[:aligned_len]\n", + "\n", + "\n", + "def dataframes_to_databunch(sigmoid, base_path, train_df, valid_df, test_df, batch_size, processor):\n", + " columns = []\n", + " if sigmoid:\n", + " label_cols = [x for x in 
[\"sota\", \"ablation\", \"irrelevant\"] if x in train_df.columns]\n", + " else:\n", + " label_cols = [\"class\"]\n", + " text_cols = [x for x in [\"caption\", \"sections\", \"row0\", \"col0\"] if x in train_df.columns]\n", + " columns = label_cols + text_cols\n", + " \n", + " train_df, valid_df, test_df = train_df[columns], valid_df[columns], test_df[columns]\n", + " \n", + " if len(label_cols) > 1:\n", + " classes = label_cols\n", + " else:\n", + " classes = None\n", + " train_tl = TextList.from_df(train_df, base_path, cols=text_cols, processor=processor)\n", + " valid_tl = TextList.from_df(valid_df, base_path, cols=text_cols, processor=processor)\n", + " test_tl = TextList.from_df(test_df, base_path, cols=text_cols, processor=processor)\n", + " \n", + " src = ItemLists(base_path, train_tl, valid_tl)\\\n", + " .label_from_df(cols=label_cols, classes=classes)\n", + " src.add_test(test_tl)\n", + " \n", + " data_clas = src.databunch(bs=batch_size)\n", + " return data_clas\n", + "\n", + "\n", + "def get_databunch(experiment, df, processor):\n", + " df = experiment.transform_df(df)\n", + " is_test = df.fold == experiment.test_split\n", + " is_valid = df.fold == experiment.valid_split\n", + " test_df = df[is_test]\n", + " valid_df = df[is_valid]\n", + " train_df = df[(~is_test) & (~is_valid)]\n", + " train_df = align_df(train_df, experiment.BS)\n", + " data_clas = dataframes_to_databunch(\n", + " experiment.sigmoid, BASE_DIR, train_df, valid_df,\n", + " test_df, experiment.BS, processor\n", + " )\n", + " \n", + " return train_df, valid_df, test_df, data_clas" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from axcell.models.structure.experiment import experiments_grid\n", + "from axcell.models.structure.ulmfit_experiment import ULMFiTTableTypeExperiment\n", + "\n", + "EXPERIMENTS_DIR = './experiments/tabletype'\n", + "BASE_DIR = Path('./models')\n", + "\n", + "processor = processor = SPProcessor(\n", + " 
sp_model=BASE_DIR / 'tmp' / 'spm.model',\n", + " sp_vocab=BASE_DIR / 'tmp' / 'spm.vocab',\n", + " mark_fields=True\n", + ")\n", + "\n", + "# parameters common for all experiments\n", + "base_experiment = ULMFiTTableTypeExperiment(\n", + " dataset=\"segmented-tables\",\n", + " drop_mult=1.0,\n", + " fp16=False,\n", + " test_split='img_class',\n", + " valid_split='speech_rec',\n", + " pretrained_lm='lm',\n", + " schedule=(\n", + " (1, 1e-2), # (a,b) -> fit_one_cyclce(a, b)\n", + " (1, 5e-3/2., 5e-3), # (a, b) -> freeze_to(-2); fit_one_cycle(a, b)\n", + " (10, 1e-2 / (2.6 ** 4), 1e-3) # (a, b) -> unfreeze(); fit_one_cyccle(a, b)\n", + " ),\n", + " sigmoid=False,\n", + " BS=128,\n", + " dedup_seqs=True,\n", + " caption=True,\n", + " first_row=False,\n", + " first_column=False,\n", + " referencing_sections=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "experiments = list(experiments_grid(\n", + " base_experiment,\n", + " seed=[1234, 6671347, 531609, 999999, 135792468]\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import gc\n", + "\n", + "EXPERIMENTS_DIR = './experiments/tabletype'\n", + "df_dedup = get_tabletype_features(pc, deduplicate=True)\n", + "df_nodedup = get_tabletype_features(pc, deduplicate=False)\n", + "\n", + "# folds = sorted(df_dedup.fold.unique())\n", + "# folds.remove(base_experiment.test_split)\n", + "\n", + "for i in range(len(experiments)):\n", + " experiment = experiments[i]\n", + " df = df_dedup if experiment.dedup_seqs else df_nodedup\n", + "\n", + " train_df, valid_df, test_df, data_clas = get_databunch(experiment, df, processor)\n", + " print(f'Running experiment {i+1} / {len(experiments)}')\n", + " model = experiment.get_trained_model(data_clas)\n", + " experiment.evaluate(model, train_df, valid_df, test_df)\n", + " experiment.save(EXPERIMENTS_DIR)\n", + "\n", + "# experiments[i] = None\n", 
+ "# experiment = None\n", + "# gc.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "edf = ULMFiTTableTypeExperiment.experiments_to_df(experiments).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>valid_accuracy</th>\n", + " <th>valid_bin_accuracy</th>\n", + " <th>test_accuracy</th>\n", + " <th>test_bin_accuracy</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>0.572368</td>\n", + " <td>0.763158</td>\n", + " <td>0.703226</td>\n", + " <td>0.819355</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0.625000</td>\n", + " <td>0.809211</td>\n", + " <td>0.748387</td>\n", + " <td>0.858065</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0.578947</td>\n", + " <td>0.750000</td>\n", + " <td>0.716129</td>\n", + " <td>0.825806</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0.592105</td>\n", + " <td>0.769737</td>\n", + " <td>0.748387</td>\n", + " <td>0.858065</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0.565789</td>\n", + " <td>0.703947</td>\n", + " <td>0.722581</td>\n", + " <td>0.858065</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " valid_accuracy valid_bin_accuracy test_accuracy test_bin_accuracy\n", + "0 0.572368 0.763158 0.703226 0.819355\n", + "1 0.625000 0.809211 0.748387 0.858065\n", + "2 0.578947 0.750000 0.716129 0.825806\n", + "3 
0.592105 0.769737 0.748387 0.858065\n", + "4 0.565789 0.703947 0.722581 0.858065" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edf[['valid_accuracy', 'valid_bin_accuracy', 'test_accuracy', 'test_bin_accuracy']]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "best_idx = edf.valid_bin_accuracy.idxmax()\n", + "best = experiments[best_idx]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train dataset\n", + " * accuracy: 0.751\n", + " * μ-precision: 0.700\n", + " * μ-recall: 0.950\n", + "valid dataset\n", + " * accuracy: 0.625\n", + " * μ-precision: 0.643\n", + " * μ-recall: 0.741\n", + "test dataset\n", + " * accuracy: 0.748\n", + " * μ-precision: 0.726\n", + " * μ-recall: 0.924\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjkAAAItCAYAAADbrGvgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3dd7wddZ3/8dfn3oSWQkKAJJDQQwldadKLKGJoIgLqKosaXXVXYUXhB7LCgsLSdBXU6CpioVlY2gJK7wjSQy+BQBIgJISSkHI/vz/OSbz3mk7mnHtmXk8f8+DMzPd8vzO548037/l+ZyIzkSRJKpu2Zh+AJElSEezkSJKkUrKTI0mSSslOjiRJKiU7OZIkqZR6NfsAJElSc6y41uENm2I9/YULo1FtzWWSI0mSSslOjiRJKiVvV0mSVFER5c46yn12kiSpskxyJEmqqCh51lHus5MkSZVlkiNJUkU5JkeSJKkFmeRIklRRJjmSJEktyCRHkqSKimj4mxYayiRHkiSVkkmOJEmVVe6so9xnJ0mSKstOjiRJKiVvV0mSVFFOIZckSWpBJjmSJFWUSY4kSVILMsmRJKmiouRZR7nPTpIkVZZJjiRJFeWYHEmSpBZkkiNJUkWZ5EiSJLUgkxxJkirKJEeSJKkFmeRIklRRQTT7EAplkiNJkkrJJEeSpIpyTI4kSVILspMjSZJKydtVkiRVlLerJEmSWpBJjiRJFWWSI0mS1IJMciRJqqxyZx3lPjtJklRZJjmSJFWUY3IkSZJakEmOJEkVZZIjSZLUgkxyJEmqqCh51lHus5MkSZVlkiNJUkU5JkeSJKkFmeRIklRREdHsQyiUSY4kSSqlHpvkRESuMPywZh+GSmLGixcBkPlEk49EZRGxkdeTlqnaNZXljlYarMd2ciRJUrEceCxJktSCTHIkSaooHwYoSZLUgkxyJEmqKMfkSJIktSCTHEmSKsokR5IkqQWZ5Ei
SVFHOrpIkSWpBJjmSJFWVY3IkSZJaj0mOJEkV5ewqSZKkFmSSI0lSRUVEsw+hUCY5kiSplOzkSJKkUvJ2lSRJFeXDACVJklqQSY4kSRXlFHJJkqQWZJIjSVJVOYVckiSp9ZjkSJJUVSWPOkp+epIkqapMciRJqirH5EiSJLUekxxJkqrKJEeSJKn1mORIklRVJY86Sn56kiSpqkxyJEmqqHRMjiRJUuuxkyNJkkrJ21WSJFVVue9WmeRIkqRyMsmRJKmq2sod5ZjkSJKkUjLJkSSpqpxCLkmS1Hrs5EiSVFXRwGVRhxKxT0Q8ERFPR8Sx89m/VkTcGBH3R8RDEbHvouq0kyNJkpoqItqBc4GPACOBwyNiZLdiJwCXZObWwGHAeYuq1zE5kiRVVc+ZXbUd8HRmPgsQERcBBwBjO5VJoH/988rAy4uq1CRHkiQVLiJGR8S9nZbRnXavCbzYaX18fVtn3wE+HRHjgauBf11UmyY5kiRVVQNnV2XmGGDMe6jicOD8zDwrIj4A/DoiNsvMjgV9wSRHkiQ120vA8E7rw+rbOvsccAlAZt4JrACsurBK7eRIklRVPWd21V+BERGxbkQsR21g8eXdyrwA7AUQEZtQ6+S8urBK7eRIkqSmyszZwFeBa4HHqM2iejQiTo6I/evF/h34QkQ8CFwIHJGZubB6HZMjSVJV9ZzZVWTm1dQGFHfedmKnz2OBnZakTpMcSZJUSiY5kiRVVc8JcgphkiNJkkrJTo4kSSolb1dJklRR2cCHATaDSY4kSSolkxxJkqqqB00hL4JJjiRJKiWTHEmSqqrcQY5JjiRJKieTHEmSqsrZVZIkSa3HJEeSpKpydpUkSVLrMcmRJKmqyh3kmORIkqRyMsmRJKmqnF0lSZLUekxyJEmqKpMcSZKk1mOS04PtvduWnPmdz9De3sb5F93Imedd3mX/Wmuuyk/O/CKrrtKfKVPf4sivnctLE18HYPgagzjvv0YzbOggkuTAz57OC+Nfa8ZpqAe55Zb7OPXUn9HR0cEhh+zN6NGHdNk/c+YsvvnNs3n00WcYMKAf55zzTYYNG8ztt9/PWWf9ilmzZtO7dy+OOeaf+cAHtmzSWagn8ZpST2aS00O1tQXfP+WfOeCzp7P1Xt/gkP13ZOMRa3Yp870TPsVv/3Ar2334W3z3B3/k5GMPm7fv5+d8mXN+eiVb7/UNdtnvBF59bVqjT0E9zJw5czj55J/w859/h6uuOpcrr7yFp59+oUuZSy+9jv79+/LnP4/hiCMO4Mwzzwdg4MD+/PjH3+aKK37EaacdxTe/eXYTzkA9jddUCbQ1cGkCOzk91LZbbcAzz0/k+RdeYdasOVx6xZ2M+tA2XcpsPGIYN9/+CAA33/Eoo/Z+f337mvTq1cYNtz4MwNvvvMv0GTMbewLqcR566CnWXnsow4cPYbnlevPRj+7K9dff3aXMDTfczUEH7QXAhz+8E3fe+SCZyciR6zN48CAARoxYi3ffncnMmbMafg7qWbym1NM1tJMTETtHxLmNbLNVrTFkIONfnjxv/aUJk1lz8MAuZR4eO44DPrIdAAfssy39+63EKgP6MmLdoUyd9g4X/fQo7rz6e3z3/32StpI/uluLNmnSZIYMWXXe+uDBg5g0afI/lBk6tFamV692+vXrw5QpXVPAa6+9g5Ej12e55XoXf9Dq0bymSiCicUsTFN7JiYitI+KMiHge+E/g8YWUHR0R90bEvUUfVxkcd+pv2WX7Tbjz6u+xyw6b8NKEyczp6KBXrzZ22nZjjj31t+y83/Gsu9bq/NMhuzX7cFUCTz01jjPPPJ+TT/5Ksw9FJeE1pSIVMvA4IjYEDq8vrwEXA5GZeyzse5k5BhhTryOLOLZW8fLEKQxbY9C89TWHDuKlSVO6lJkwaQqHffEcAPqstDwHfmQ73pj2Di9NeJ2Hxo7j+RdeAeDy6+5lu61H8KuLb2rY8avnGTx4EBMn/n3w+aRJk+fdLuhcZsK
E1xgyZFVmz57Dm2++zcCB/QGYOPE1vvrV73L66Uex1lpDG3rs6pm8pkqg5CF/UUnO48CewKjM3DkzfwjMKaitUrr3wWfYYN0hrD18NXr3bueQ/T7AVX++r0uZQQP7EfUI8JivHDCvE3Pvg8+wcv+VWHWVfgDsvuOmPP7U+IYev3qezTcfwfPPv8yLL05k5sxZXHXVLey553Zdyuy55/b86U/XA3Dttbezww5bEBFMm/YWo0efxL//+2d5//tHNuPw1QN5TamnK2oK+ceAw4AbI+Ia4CJK319ctubM6eCob5/PFb8+jvb2Nn518U089uR4vn30x/nbw89x1Z/vY9cPbMLJ3zqMTLjt7sf4+rd/CUBHR3Lcqb/l6gtPIALuf/g5fnHhDU0+IzVbr17tnHjil/j85/+DOXM6OPjgDzJixNr84Ae/YbPNRrDXXtvz8Y/vzTHHnM3ee49m5ZX7cs453wTgN7+5ihdemMC5517EuedeBMAvfnEygwYNaOYpqcm8plpflny8ZmQWd1coIvoAB1C7bbUncAHwp8y8bjG+mysMP2xRxaTFMuPF2i/RzCeafCQqi4iNvJ60TNWuqWxor2P9w3/XsKEhz1z4yYb3qAq5XRURvQAy8+3M/F1m7gcMA+4HvlVEm5IkaQk5u2qp3NN9Q2ZOycwxmblXQW1KkiTNU9SYnHLf5JMkqQxK/rd1UZ2c1SLi6AXtzEyf3y1JkgpVVCenHehL6fuIkiS1sJLPriqqkzMhM08uqG5JkqRFckyOJElV1aRZT41SVCdnL4CI2APYtL7t0cy8saD2JEmSuiiqk7NiRNwNzADmvovgkIg4HTgoM18qqF1JkiSguE7Oj4AfZ+b5nTdGxGeA86g9BVmSJDVTue9WFfYwwJHdOzgAmXkBsHFBbUqSJM1TVJIz385TRLRRm14uSZKareRTyItKcq6KiJ/VX9AJzHtZ50+AqwtqU5IkaZ6iOjnHAFOBcRFxX0TcBzwPTAO+UVCbkiRpSbRF45ZmnF5B9W4FnA0MB44Azqf2BvLlqD0JWZIkqVBFdXJ+CrybmdOBgcBx9W1vAGMKalOSJC2BjMYtzVDYu6sy8/X650OBMZn5B+APEfFAQW1KkiTNU1gnJyJ6ZeZsak8/Ht2ANiVJ0pIo+eyqojocFwI3R8RrwHTgVoCI2IDaLStJkqRCFdLJycxTI+J6YChwXWZmfVcb8K9FtClJkpaQL+hcOpl513y2PVlUe5IkSZ05PkaSpKoq+ZicoqaQS5IkNZVJjiRJVVXyqKPkpydJkqrKTo4kSSolb1dJklRVJZ9CbpIjSZJKySRHkqSqcgq5JElS6zHJkSSpotIxOZIkSa3HJEeSpKoqedRR8tOTJElVZZIjSVJVObtKkiSp9ZjkSJJUVc6ukiRJaj0mOZIkVZVjciRJklqPSY4kSVVV7iDHJEeSJJWTnRxJklRK3q6SJKmi0oHHkiRJrcckR5KkqjLJkSRJaj0mOZIkVZWvdZAkSWo9JjmSJFVVyaOOkp+eJEmqKpMcSZKqyjE5kiRJrcckR5KkqvI5OZIkSa3HJEeSpKoyyZEkSWo9JjmSJFVUOrtKkiSp9djJkSRJpeTtKkmSqqrkUUfJT0+SJFWVSY4kSVXlwGNJkqTWY5IjSVJV+TBASZKk1mOSI0lSVZnkSJIktR6THEmSqqrcQU7P7uTMePGiZh+CSiZio2YfgkrE60nq2Xp0J+eOSVc2+xBUEjsOHgXA2sdf3eQjUVmMO3Vf3p51S7MPQyXSp/euDW8zHZMjSZLUenp0kiNJkgrkE48lSZJaj0mOJElV5ZgcSZKk1mMnR5IklZKdHEmSqioauCzqUCL2iYgnIuLpiDh2AWU+ERFjI+LRiPjdoup0TI4kSWqqiGgHzgX2BsYDf42IyzNzbKcyI4DjgJ0yc0pErL6oeu3kSJJUUW09537OdsDTmfksQERcBBwAjO1U5gvAuZk5BSAzX1lUpT3n9CRJUmlFxOi
IuLfTMrrT7jWBFzutj69v62xDYMOIuD0i7oqIfRbVpkmOJEkV1chnAWbmGGDMe6iiFzAC2B0YBtwSEZtn5tQFfcEkR5IkNdtLwPBO68Pq2zobD1yembMy8zngSWqdngWykyNJUkVFNG5ZhL8CIyJi3YhYDjgMuLxbmcuopThExKrUbl89u7BK7eRIkqSmyszZwFeBa4HHgEsy89GIODki9q8XuxaYHBFjgRuBYzJz8sLqdUyOJEkVFT3oBZ2ZeTVwdbdtJ3b6nMDR9WWxmORIkqRSMsmRJKmielCQUwiTHEmSVEomOZIkVZRJjiRJUgsyyZEkqaKi5FFHyU9PkiRVlZ0cSZJUSt6ukiSpohx4LEmS1IJMciRJqqg2kxxJkqTWY5IjSVJFOSZHkiSpBZnkSJJUUSY5kiRJLcgkR5KkioqSRzkmOZIkqZRMciRJqihf0ClJktSCTHIkSaqokg/JMcmRJEnlZJIjSVJFmeRIkiS1IDs5kiSplLxdJUlSRXm7SpIkqQWZ5EiSVFFtJjmSJEmtxyRHkqSKckyOJElSCzLJkSSpokxyJEmSWpBJjiRJFRUln15lkiNJkkrJJEeSpIpyTI4kSVILMsmRJKmiTHIkSZJakEmOJEkVZZIjSZLUguzkSJKkUvJ2lSRJFVXyZwGa5EiSpHIyyZEkqaIceCxJktSCTHIkSaqoKHnUUfLTkyRJVbXYSU5ELJ+Z7xZ5MJIkqXEqPyYnIraLiIeBp+rrW0bEDws/MkmSpPdgcZKc/wZGAZcBZOaDEbFHoUclSZIKFyWPchZnTE5bZo7rtm1OEQcjSZK0rCxOkvNiRGwHZES0A/8KPFnsYUmSpKKVPMhZrCTnX4CjgbWAScAO9W2SJEk91iKTnMx8BTisAceibh66+zF+94PL6OjoYNdROzDq03t12X/NRTdxy5V309beRr8BffnccYey6pBVAJg8aQq/OP1iXn9lKkFw1BlfYLWhqzTjNNSD7DZiVU7cdxPa24KL7xvPj2959h/KfHSzIXx9zxFkJo9NfJOvXfogAGusvAKnHbQ5a/RfgQT++YJ7GT91eoPPQD3N7bc+whmnXUjHnA4OPHgXjvzCvl32z5w5i28f9z889ug4Vh7Ql9PP+iJrrLkqs2bO5pSTLmDso+OICL553GFss93GTTqL6ip7krPITk5E/AzI7tszc3QhRyQAOuZ08Ouz/8gx53yJVVZbmZO+cA5b77Qpa647ZF6ZtTdck//4+VEsv8Jy3PCn27nkx1fy5ZM+A8CYU37Hfp/5IJttuxEz3nmXKPtb2LRIbQEn77cpn/7lPUycNoPLv7Qjf37sFZ5+9a15ZdYZtBJf3nV9Dh5zJ9NmzGZQn+Xm7Tv741vwo5ue4bZnJrPScu105D/8WlDFzJnTwWmn/pYf/+xoBg8eyKcOPYXd9tiK9TdYY16Zy/5wG/369+Hya77HNVffww/O/j2nn/Ul/vj7WwC49LKTeH3yNL76pe/zm4tPoK3Nx7dp2Vmcq+kvwPX15XZgdcDn5RTs2cdeYPCaq7L6GoPo1bsX2++1Nfff9kiXMpu8bwTLr1D7S2j9Tdfm9VemAvDScxPpmNPBZttuBMAKKy0/r5yqa6thAxg3+W1enDKdWXOSKx6ewIc2Wb1LmcO2Gc4Fd49j2ozZAEx+eyYAG6zWl/a24LZnJgPwzsw5zJjV0dgTUI/zyMPPMXz46gwbvhq9l+vFh/fdjptufKBLmZtueID9DtgRgA9+6P3cc9fjZCbPPjOBbbffBIBVBvWnX7+VGPvI840+hcqLaNzSDItzu+rizusR8WvgtgWVj4hdF1HfLYt9dBU25dU3WGX1AfPWB642gGcf6z7J7e9uueputtih9gtj4ouvslLfFfnh8b/k1QmvM/L9I/jEl0bR1u6/kKpscP8VePmNGfPWJ0ybwVbDBnQps96gPgD8/gs70N4WfP+Gp7j5qddYb9WVmDZ9Nj85fGuGD1yJ2555jdOve4I
Ow5xKe2XSFAYPHThvffDggTzyUNdboK+8MoUhQ2plevVqp2+/FZk69S023GgYN9/4APvsux2TJr7O2LHjmDhxCptt0dBTUMktzbur1gUGL2T/MfPZlsAWwHCgfUFfjIjRgLfBltAd197Lc4+/yHE//CpQu9X15EPPctIv/p1Bqw/gvO9cwK3/dw+7jdqhyUeqnq69LVh30Eoc9j93M2TlFbjk89uzzw9vo72tjW3XGchHz72dl9+YwY8O3YqPv28Yl9w3vtmHrBZ1wMd25rlnJ/CpT5zC0DUGseVW69PuP8QaruwjGRZnTM4U/j4mpw14HTh2QeUzc79u398JOAGYSG36+QJl5hhgTP17lf434sDVVp53+wlgyqtTGbjqyv9Q7tF7n+SKX/+F4374FXovV/txDlx9ZdbaYA1WX2MQAO/beXOeGbvgFEjVMGnaDNZYeYV560P7r8CkaTO6lJk4bQYPjJ/K7I5k/JTpPPfa26wzqA8T35jBYxPe5MUptYHG1z02ia2HDeCShp6BeprVBw9k0oQp89YnTZrCaoMHdi2z+kAmTpzC4CGrMHv2HN56czoDBvQlIvjGsX+f0/LZT32PtdZe2L+fpSW30G5z1B6FuCWwWn0ZmJnrZeYif7dFxF4RcRNwCnB2Zu6QmVcsg2OuhHU3Hs6k8a/y6suTmT1rNndffz9b77xZlzLjnhzP+Wdcyte+9zn6D+w3b/t6G6/FO29NZ9qU2oDSx/72FGus4y+PqnvwpTdYZ1Afhg1ckd7twX6bD+XPj7/Spcx1j01ih3Vrs/AGrtSbdVftwwuvv8ODL02l/wq9WGWl2tiuHdcbxFOdBiyrmjbdbB1eeGESL41/lVkzZ3Pt1few+x5bdimz2x5bcsX/3gHAX667j22335iIYPr0d5n+Tm145113PEp7e1uXAcvSsrDQJCczMyKuzszNFlaus4j4KHA88AZwQmYucPyOFqy9VzufPupjnPnvY+jo6GCXj27HmusO4Y8//z/W3Xg4W++8GRefdwXvTn+Xc0/8FQCDBg/k66d9jrb2Ng79yv7819d/DCRrbzic3ffzVlXVzelITrxyLBd8dlva24JL7hvPU6+8xVF7jeDhl97gL4+/ws1PvcYuG6zKn/9tF+Z0JN+75gmmTp8FwKnXPM5vj9yWIHjk5Te46N4Xm3xGarZevdr51vGf5Mujv09HRwcHHLQT62+wJuf98DJGbroOu++5FQcevAsnHPtz9t/nOPqv3IfTzvwiAFNef5Mvjz6HtrZgtdUHcsppn2/y2VRT2W9XRS5iGmhE/AY4KzPvX6wKIzqA8cCDzH/q+f6LWU/eMenKxSkqLdKOg0cBsPbxVzf5SFQW407dl7dnOY9Cy06f3ruSmQ3tdux9ze0NGxry5312aniXaoFJTkT0yszZwNbAXyPiGeBtIKiFPO9bwFd9eackSS2greTDXxd2u+oe4H3AYiUvc2XmzQARsQKwQX3z05k5Y8HfkiRJWrYW1skJgMx8ZkkqjIhewHeBI4Fx9XqGR8QvgeMzc9ZSHqskSVqGyj4mZ2GdnNUi4ugF7czMsxew6wygH7BuZr4JEBH9gTPry9eW8lglSZIW28I6Oe1AX+qJzhIYBWyYnUY0Z+a0iPgX4HHs5EiS1COU/fGLC+vkTMjMk5eizsz5TNnKzDlVf8CfJElqnEWOyVkKYyPiM5l5QZfKIj5NLcmRJEk9QJVnV+21lHV+BfhjRBwJ3Ffftg2wInDQUtYpSZK0RBbYycnM15emwsx8Cdg+IvYENq1vvjozr1+a+iRJUjGqPLtqqUTEKvWPD9SXLtuXtvMkSZK0JJZ5J4faLaqk/mTkTtvnrq9XQJuSJGkJVXl21dLaPTPHFVCvJEnSYiuik/Mnaq+DkCRJPVjZx+QUkVSV/I9MkiS1giKSnDUj4r8XtDMz/62ANiVJkrooopMznb8/H0eSJPVQZX8RQRGdnMmZ+asC6pUkSVpsRXRyZhZQpyR
JWsYceLyEMnOH7tsiYv2I+HZEPLqs25MkSZqfwp4DFBFrRMRREfFX4NF6W4cV1Z4kSVoybQ1cmmGZtxsRoyPiRuAmYBDwOWBCZp6UmQ8v6/YkSZLmp4gxOT8C7gQ+mZn3AkTZh29LktSC2kr+13MRnZyhwCHAWRExBLgE6F1AO5IkSQtUxMDjyZn5k8zcDdgLmApMiojHIuK7y7o9SZK0dNqicUtTzq/IyjNzfGaelZnbAAdQe1CgJElS4Ro24DkznwS+0Kj2JEnSwjm7atkq+WOHJElST1HEwOOFKfcwbkmSWkjZn3i8zDs5EXEF8+/MBLXn5kiSJBWuiCTnzKXcJ0mSGsjn5CyhzLx5WdcpSZK0pIp4rcOIiPhlRJwdEcMi4v8i4q2IeDAitl3W7UmSJM1PEbOrfknttQ4vA3cDvwBWBb5B7ZUPkiSpB/BhgEuub2aOycwzgemZeWlmzsjMPwPLF9CeJEnSPyhi4HFHp8/TFrJPkiQ1UbMe0tcoRXRyNo6Ih6hNGV+//pn6+noFtCdJkvQPiujkbDKfbQEMB44roD1JkrQUnEK+hDJz3NzPEbE18EngEOA54A/Luj1JkqT5KeKJxxsCh9eX14CLgcjMPZZ1W5Ikaen5Wocl9zhwKzAqM58GiIijCmhHkiRpgYro5HwMOAy4MSKuAS7Ct49LktTjlD3JWeazxzLzssw8DNgYuBH4OrB6RPw4Ij60rNuTJEman8KmyGfm25n5u8zcDxgG3A98q6j2JEnSkmlr4NIMDWk3M6fUn4K8VyPakyRJKvvDDiVJ0gK0RTZsWZSI2CcinoiIpyPi2IWUOzgiMiK2WeT5LeGfhyRJ0jIVEe3AucBHgJHA4RExcj7l+gFfo/YC8EWykyNJUkX1oLeQbwc8nZnPZuZMajOzD5hPuf8ETgdmLNb5LcGfhSRJ0lKJiNERcW+nZXSn3WsCL3ZaH1/f1vn77wOGZ+ZVi9tmEc/JkSRJ6iIzxwBjlua7EdEGnA0csSTfs5MjSVJF9aDbOS9Re5H3XMPq2+bqB2wG3BQRAEOAyyNi/8y8d0GV9qDzkyRJFfVXYERErBsRy1F7c8Llc3dm5huZuWpmrpOZ6wB3AQvt4IBJjiRJldVTXuuQmbMj4qvAtUA78IvMfDQiTgbuzczLF17D/NnJkSRJTZeZVwNXd9t24gLK7r44ddrJkSSpomIxHtLXyhyTI0mSSskkR5KkiuopY3KKYpIjSZJKySRHkqSKKnvSUfbzkyRJFWWSI0lSRbU5u0qSJKn1mORIklRRzq6SJElqQSY5kiRVlEmOJElSC7KTI0mSSsnbVZIkVVR7sw+gYCY5kiSplExyJEmqqLI/DLBHd3J2HDyq2Yegkhl36r7NPgSVSJ/euzb7ECQtRI/u5Jz/5DXNPgSVxBEb7gPA27NuafKRqCz69N6Vdc+9qdmHoRJ57iu7N7xNp5BLkiS1oB6d5EiSpOKY5EiSJLUgkxxJkiqq3SRHkiSp9ZjkSJJUUY7JkSRJakEmOZIkVVTZn3hskiNJkkrJJEeSpIpyTI4kSVILspMjSZJKydtVkiRVVHuzD6BgJjmSJKmUTHIkSaooBx5LkiS1IJMcSZIqyocBSpIktSCTHEmSKqrdMTmSJEmtxyRHkqSKcnaVJElSCzLJkSSpokxyJEmSWpBJjiRJFWWSI0mS1IJMciRJqqh2n3gsSZLUeuzkSJKkUvJ2lSRJFVX2pKPs5ydJkirKJEeSpIpyCrkkSVILMsmRJKmiTHIkSZJakEmOJEkV5cMAJUmSWpBJjiRJFeWYHEmSpBZkkiNJUkWZ5EiSJLUgkxxJkirKJEeSJKkFmeRIklRR7SY5kiRJrcdOjiRJKiVvV0mSVFFtvtZBkiSp9ZjkSJJUUWVPOsp+fpIkqaJMciRJqigfBihJktSCTHIkSaooHwYoSZLUgkxyJEmqKJ+TI0mS1IJMciRJqihnV0mSJLUgkxxJkirKJEeSJKkFmeRIklR
RZU86yn5+kiSpouzkSJKkUvJ2lSRJFRUOPJYkSWo9JjmSJFVUyYMckxxJklROJjmSJFWUY3IkSZJakElOD/bMfWP585g/kh0dbPmhD7DjIXt32f+3q8HeO4MAABWqSURBVG/jvqtuJdraWG7F5fnIVw9ltbWG8tz9j3Pj+ZczZ/Yc2nu1s+eRB7LOlhs26SzUk9x+6yOccdqFdMzp4MCDd+HIL+zbZf/MmbP49nH/w2OPjmPlAX05/awvssaaqzJr5mxOOekCxj46jojgm8cdxjbbbdyks1BPsuvwgZy48wa0tQWXjJ3AT+5/scv+gzcazLE7rsekt2cCcMHDL3HJYxMB+NYO67L72oMA+NF947jq6Vcbe/AqfdJhJ6eH6pjTwbU/vpTDT/kK/QcN4JdHncmI7TdjtbWGziuz6e7v53377gzAk3c/zPU//xOHnfxlVuzfh0NO/CL9Bq3MK8+/zEUn/ph/u+A/m3Uq6iHmzOngtFN/y49/djSDBw/kU4eewm57bMX6G6wxr8xlf7iNfv37cPk13+Oaq+/hB2f/ntPP+hJ//P0tAFx62Um8PnkaX/3S9/nNxSfQ1lb2X5FamLaAk3YdwWeueIiJb73LZR9/H395fjJPT3mnS7mrnn6V79z6dJdte6y9Cpuu1o9Rl9zLcu1tXHjgltw87nXemjWnkaegkvM3VA/18pPjGDh0NQYOWZX23r0Yuev7eOquh7uUWX6lFed9njVj5rybq0PWH06/QSsDsNraQ5k9cxazZ81q3MGrR3rk4ecYPnx1hg1fjd7L9eLD+27HTTc+0KXMTTc8wH4H7AjABz/0fu6563Eyk2efmcC2228CwCqD+tOv30qMfeT5Rp+CepgtV+/PuDem8+K0GczqSK58+hX2XnfQYn13g4Ercc/LU5mTMH12B49Pfptd11ql4CNWdxHZsKUZCunkRMR1RdRbJW9Onkr/1QbMW++36gDenPzGP5S798pbOO/zJ3HDL/+XD40++B/2P377AwxZfxi9evcu9HjV870yaQqDhw6ctz548EBenTSla5lXpjBkSK1Mr17t9O23IlOnvsWGGw3j5hsfYPbsObw0/lXGjh3HxIldv6vqGdJnOSa89e689QlvvcvgPsv/Q7l91luVqw99P+d+eCRD+9b2P1bv1KzQq42BK/RihzUGzNsnLStF3a5abWm+FBGjgdHL+FhKbZtRu7LNqF159KZ7uf3i69jv6E/P2/fquAnceP7lHP6fX27iEaoMDvjYzjz37AQ+9YlTGLrGILbcan3a2w2CtWjXPz+ZK556hZkdyeEjh3LGnhvx6csf4rYXp7DF6v34/ce25vXps7h/0jQ6sjn/2q+ykk+uKqyTs3JEfGxBOzPzjwvYPgYYAxDNyrZ6iH6DBjDt1anz1t98beq8W1DzM3LX93HNeZfMW5/22hT+cOrP2e/of2Lg0KXqc6pkVh88kEkT/p6+TJo0hdUGD+xaZvWBTJw4hcFDVmH27Dm89eZ0BgzoS0TwjWMPm1fus5/6HmutPbhhx66eaeLbM7ukL0P7Ls+kt9/tUmbqu7Pnfb74sQkc+4H15q2fd98LnHffCwB8/4Mb89zU6QUfsaqmqH+KrQyMAvabzzKqoDZLZY0N12LKy68ydeJk5syazdhb/saI7TfvUub1l16Z9/npvz7KwDVqnZkZb73DJd/5KbsfsT/DR66HBLDpZuvwwguTeGn8q8yaOZtrr76H3ffYskuZ3fbYkiv+9w4A/nLdfWy7/cZEBNOnv8v0d2p/ed11x6O0t7d1GbCsanrolWmss/KKDOu3Ar3bglEbrM5fnpvcpcxqKy037/MH1xk0b1ByW8CA5Wv/zt54UB82GtSXW198vXEHL6A2lLNRSzMUleSMy8wjC6q7Etra2/nQlz7ORSeeR0dHB1vuvQOrrT2Um39zFUNHrMWG22/OvVfeyvMPPkFbezsr9F2R/Y6q3aq698pbmTLhNW678Bpuu/AaAA7
/zy/TZ0C/Zp6SmqxXr3a+dfwn+fLo79PR0cEBB+3E+husyXk/vIyRm67D7ntuxYEH78IJx/6c/fc5jv4r9+G0M78IwJTX3+TLo8+hrS1YbfWBnHLa55t8NuoJ5iR859an+dV+m9MWwaWPT+SpKe/w9W3X4eFX3+T65ydzxBZrstc6g5jTkUx9dzbH3PA4AL3agosP2gqAt2bO4ei/PMacSuf3KkJkAfdAI+L+zNz6PdaR5z95zbI6JFXcERvuA8Dbs25p8pGoLPr03pV1z72p2YehEnnuK7uTmQ3NPB56/cqGdS23WGVUw/Ocom5XfXrRRSRJkopT1O2qK7sNHA5g7npm5voFtStJkgQU18nZptt6G/AJ4BvA/QW1KUmSlkBbyeeQF9LJyczJABHRBvwTcAzwAPDRzBxbRJuSJEmdFdLJiYjewJHAUcBtwIGZ+fTCvyVJkhqp5EFOYberngNmA98HXgC2iIgt5u5c0MMAJUmSlpWiOjl/oTbQeMv60lkCdnIkSWqyZj2kr1GKGpNzxIL2RYTPgpckSYUrKsnpIiIGAAcDnwQ2AXwevCRJTVbyIKe4Tk5ErAgcQK1jszXQDzgQ8JGzkiSpcIU88Tgifgc8CewN/BBYB5iSmTdlZkcRbUqSpCUTDVyaoajXOowEpgCPAY9l5hz+/sRjSZKkLiJin4h4IiKejohj57P/6IgYGxEPRcT1EbH2ouospJOTmVtRe8JxP+AvEXEb0M9Bx5Ik9Rxt0bhlYSKiHTgX+Ai1oOTwiBjZrdj9wDaZuQXwe+C/Fnl+S/OHsjgy8/HM/I/M3Bj4GvAr4K8RcUdRbUqSpJa0HfB0Zj6bmTOBi6iN650nM2/MzHfqq3cBwxZVaVFjcrbtdmD3ZeY3gLWBf4igJElS4zVyTE5EjI6IezstozsdyprAi53Wx9e3LcjngP9b1PkVNbtqTET0pdYTu3Du+6oyM3F2lSRJlZOZY4Ax77WeiPg0tReB77aoskU9DHDriNgIOAz4fUTMAi4ELsrM54toU5IkLZmIHjMn6CVgeKf1YfVtXUTEB4Hjgd0y891FVVrkmJwnMvOkzBwJfAZYGbg+Im4vqk1JktSS/gqMiIh1I2I5aiHJ5Z0LRMTWwE+B/TPzlcWptPAnHkdEG7A6MBjoAyzWgUmSpGL1lCceZ+bsiPgqcC3QDvwiMx+NiJOBezPzcuAMoC9wadReuvVCZu6/sHqLfOLxLsDh1J5y/DC18TlHZeYbRbUpSZJaU2ZeDVzdbduJnT5/cEnrLKSTExEvAuOodWy+s7ixkiRJ0rJSVJKzc2aOm9+OiOiVmbMLaleSJC2m6Cn3qwpS1MDj3879EBG/7rbvnoLalCRJmqeoJKdPp8+bdttX8n6jJEmtobAp1j1EUee3sIn3PWZSviRJKq+ikpwBEXEQtU7UgIj4WH17UHtejiRJarKyj8kpqpNzM7B/p8/7ddrnax0kSVLhinqtwz8vaF9EHFxEm5IkacmUPMhpypijc5rQpiRJqpjCX+swH2XvOEqS1BLKPianGUmOs6skSVLhinqtw8PMvzMT1F7UKUmSmqzkQU5ht6tGFVSvJEnSYimqk7NiZj4OEBHLZ+a7c3dExA7UXt4pSZKaqK3kUU5RY3J+1+nznd32nVdQm5IkSfMUleTEAj7Pb12SJDVB2f9CbsS7q7oPQHZ2lSRJKlxRSc6wiPhvap3EuZ+pr69ZUJuSJEnzFNXJOabT53u77eu+LkmSmiCi3DdXinp31a8WtC8i1iqiTUmSpM4Ke+JxRHwgIj4eEavX17eIiN8BtxfVpiRJWnzRwKUZCunkRMQZwC+Ag4GrIuIU4DrgbmBEEW1KkiR1VtSYnI8CW2fmjIgYCLwIbJaZzxfUniRJWkK+oHPpzMjMGQCZOQV4yg6OJElqpKKSnPUi4vJO6+t2Xs/M/QtqV5IkLaaSBzmFdXIO6LZ+VkHtSJIkzVdRU8hvXtC+iNipiDYlSdKSKWyKdQ9
RSCcnItqBT1B7uvE1mflIRIwC/h+wIrB1Ee1KkiTNVdTtqv8BhgP3AP8dES8D2wDHZuZlBbUpSZKWQNlnVxXVydkG2CIzOyJiBWAisH5mTi6oPUmSpC6K6uTMzMwOgPqzcp61gyNJUk9T7iinqE7OxhHxUP1zAOvX1wPIzNyioHYlSZKA4jo5mxRUryRJWkbCJGfJZea4IuqVJElaXEVNIX8TyPntona7qn8R7UqSJM1VVJLTr4h6JUnSshNR7scBlvvsJElSZRU18FiSJPV45R54bJIjSZJKySRHkqSKKvsUcpMcSZJUSiY5kiRVlkmOJElSyzHJkSSponxOjiRJUgsyyZEkqbIckyNJktRyTHIkSaoon5MjSZLUgiIzm30Meo8iYnRmjmn2cagcvJ60rHlN9VxvzbqhYZ2Avr33bHhsZJJTDqObfQAqFa8nLWteU2oKOzmSJKmUHHgsSVJllTvrKPfZVYf3urUseT1pWfOaUlM48FiSpIp6e/bNDesE9Om1mwOPJUmSlgXH5EiSVFk+DFANFhHHR8SjEfFQRDwQEdtHxHIR8f2IeDoinoqI/42IYRExqF7mgYiYGBEvdVpfrl7fgRGREbFxs89Ny0b3n2lErBMR0+s/9wcj4o6I2Ki+b/eIuHIB9awaEbMi4kudtt1dr+eFiHi10/W0TkQ8HxGr1ssNq1+HT0XEMxHxg07X3O7149uvU71XRsTuBf6xaBEi4q36fztfL2Mj4oKI6F3ft3tEvFHf93hEnNnp+0d0uyYeiIiR9foemU9750fEc53K3lEvOz66vf567u+6busXzae+lyJi+fr6qvVrcvNObbzeqc2/LNs/QbUaOzk9TER8ABgFvC8ztwA+CLwIfBfoB2yUmSOAy4A/Aq9n5laZuRXwE+CcueuZObNe7eHAbfX/qhzm9zN9pv5z3xL4FfD/FqOeQ4C7OteTmdvXr6cTgYs7XU/Pzy0TEUHt+rusfj1uCPQFTu1U93jg+KU5OTXEM/Wf8+bAMOATnfbdWt+3NTAqInbqtK/zNbFVZo5dRDvHdCq7Y/06egHYZW6Beme9X2beXV/fBGgHdomIPt3qmwMc2XlDZj7c6ffg5Z3a/ODi/VFUVzTwf81gJ6fnGQq8lpnvAmTma8BU4J+BozJzTn37L4F3gT0XVllE9AV2Bj4HHFbgcatBFvNn2h+YshjVHQ78O7BmRAxbgsPYE5hRvw6pX5dHAUdGxEr1Mg8Cb0TE3ktQrxqs/rO7B1hzPvumAw/Mb997dCFdr93DgM6pzeHAr4HrgAO6fff7wFER4XALLZKdnJ7nOmB4RDwZEedFxG7ABsALmTmtW9l7gU0XUd8BwDWZ+SQwOSLev+wPWQ22oJ/p+vWI/hngaODshVUSEcOBoZl5D3AJcOgSHMOmwH2dN9SvzxeoXa9znQqcsAT1qsEiYgVge+Ca+ewbCIwAbum0+dBut6tWXEQTZ3Qq+9v6tkuAAzt1VA6l1vGh0/pF9W3dE+gXqKWY/7QYp6dFamvg0nh2cnqYzHwLeD+1x6C/ClwM7P4eqjycv/8L6SK8ZVUGC/qZzr1dtT7wdRb9bJJDqf1l072eZSYzbwGIiJ2Xdd16z9aPiAeAScCEzHyo075dIuJB4CXg2syc2Glf99tV0xfRTufbVZ8CyMxJwCPAXhGxFTA7Mx8BiIhtqKXZLwDXA1tHxCrd6vwecAz+HaZFMO7rgerx8U3ATRHxMPBFYK2I6JeZb3Yq+n5gvgNKAeq/GPYENo+IpHaPOyPimPQBSS1pQT9T4NxuRS8HfrmI6g4HhkTEp+rra0TEiMx8ajEOZSzw8W7H1h9YC3ga2K7TrrlpzuzFqFeN80xmblUfSH57ROyfmZfX992amaMiYl3groi4JDMfWMbtz71lNYmuKc7hwMYR8Xx9vT9wMPCzuQUy86l6B63zOCIthWaNlWkUe8E9TERsFBEjOm3aCniC2kDSsyOivV7uM8BKwA0Lqe7jwK8
zc+3MXCczhwPP0WnAn1rOgn6mw7uV2xl4ZkGVRMSGQN/MXLNezzrU/nW8uGnO9cBK9euQ+nV5FnB+Zr7TuWBmXgcMBLZYzLrVQPVxf8cCx81n33PAacC3Cmj6j8C+/P3WFPUZV58ANu90XR7A/K/LU4FvFHBcKhE7OT1PX+BX9WmdDwEjge9Q+wU0A3gyIp6iNivmoEUkMocDf+q27Q94y6qVLehnehx/H5PzILXZeJ/vVGav+rTd8RExfiH1LNa1Ub/uDgIOqV+PT1K7Phc0o+tU/rEjpp7jMmqd1vn9A+gnwK4RsU59vfuYnB3r2zfqfI1FxCH17Wd0K78cQGZOBe4EJmXms/WyuwAvZebLndq/BRgZEUM7H1RmPgr87T2ed+VFRMOWppyfdy0kSaqmGXPubFgnYIX2DzS8p+OYHEmSKssxOZIkSS3HTo4kSSolb1dJklRRUfKso9xnJ0mSKstOjtSiImJOfUruIxFxaad3Ri1NXfPeVB4R+0fEsQspOyAivrwUbXwnInyuidSjRAOXxrOTI7Wu6fVH5W8GzAS+1Hln1Czx/8cz8/LMPG0hRQYAS9zJkaRGs5MjlcOtwAYRsU5EPBERF1B7N9DwiPhQRNwZEX+rJz59ASJin4h4PCL+BnxsbkURcURE/Kj+eXBE/CkiHqwvO1J7Au7cBw+eUS93TET8NSIeioiTOtV1fNReNnsbsFHD/jQkLZayPwzQgcdSi4vam5w/wt/fIj0C+Gxm3hW19xKdAHwwM9+OiG8BR0fEf1F7F9Ce1N41dfECqv9v4ObMPChqr27oS+0VAJtl5lb19j9Ub3M7apn05RGxK/A2tXcTbUXtd83f6Pbmckkqkp0cqXWtGLWXFEItyfkfYA1gXGbeVd++A7VXg9xe/5fUctQepb8x8Nzcl3FGxG+ovfm+uz2Bz8C8F8e+EREDu5X5UH25v77el1qnpx/wp7nvsoqIy5HUw5T7YYB2cqTWNX1umjJXvSPzdudNwJ8z8/Bu5bp87z0K4HuZ+dNubXx9GbYhSUvMMTlSud0F7BQRGwBERJ+ovYH8cWCdiFi/Xm5BL+a8HviX+nfbI2Jl4E1qKc1c1wJHdhrrs2ZErE7txYoHRsSKEdEP2G8Zn5uk9yhoa9jSDHZypBLLzFeBI4ALo/ZW+zuBjTNzBrXbU1fVBx6/soAqvgbsEREPUxtPMzIzJ1O7/fVIRJyRmdcBvwPurJf7PdAvM/9GbazPg8D/AX8t7EQlaT58C7kkSRU1q+OBhnUCerdt1fABQCY5kiSplBx4LElSRUXJZ1eZ5EiSpFIyyZEkqaKa9STiRjHJkSRJpWQnR5IklZK3qyRJqqxyZx3lPjtJklRZJjmSJFWUU8glSZJakEmOJEmVZZIjSZLUckxyJEmqKB8GKEmS1IJMciRJqqxyZx3lPjtJklRZJjmSJFWUz8mRJElqQZGZzT4GSZKkZc4kR5IklZKdHEmSVEp2ciRJUinZyZEkSaVkJ0eSJJWSnRxJklRK/x8LM4NjPOXmeQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "<Figure size 720x720 with 2 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAkAAAAIuCAYAAABXZLccAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3dd7hdVbWw8Xeck0JJICGkkYQehNBCEVAEKYKoFAELqFex5eInFlQUFVFRVK4U8YJ6o6KoF1FUMJRLkQ7SQif0GhKSACkEpCQ5Z3x/7J2wc0w5Cdklme/PZz/stdZca851zjIZGbOsyEwkSZJK0tbsBkiSJDWaAZAkSSqOAZAkSSqOAZAkSSqOAZAkSSqOAZAkSSpOj2Y3QJIkNcfq6x/esLVwXpn4x1jS8YjYDzgdaAd+lZk/6nJ8A+AsYCAwA/hIZk6qHvsYcFy16Pcz8+yltccMkCRJaqqIaAfOBN4FjAIOj4hRXYqdDPwuM7cBTgB+WD13HeDbwM7ATsC3I6L/0uo0AJIkSc22E/BoZj6emXOAc4GDupQZBVxV/X51zfF3Aldk5ozMnAlcAey3tAoNgCRJKlREW8M+SzEMeLpme1J1X627gUOq3w8G+kbEgG6e+28MgCRJUt1FxJiIGF/zGbOMl/gK8PaIuBN4OzAZ6Fje9jgIWpKkQkUD8yCZORYYu5jDk4ERNdvDq/tqz3+GagYoIvoAh2bmrIiYDOzR5dxrltYeM0CSJKnZbgNGRsRGEdELOAwYV1sgItaN1/vSvk5lRhjAZcC+EdG/Ovh53+q+JTIDJElSoboxNqchMnNeRBxFJXBpB87KzAkRcQIwPjPHUcny/DAiErgO+Gz13BkR8T0qQRTACZk5Y2l1RmbDlgCQJEktpM+GH2tYEPDSk2cvcR2gRjMDJElSoVolA9QM5d65JEkqlhkgSZIKFdFSvVINZQZIkiQVxwyQJEnFKjcPUu6dS5KkYhkASZKk4tgFJklSoZwGL0mSVBAzQJIkFcoMkCRJUkHMAEmSVKgoOA9S7p1LkqRimQGSJKlQjgGSJEkqiBkgSZIKZQZIkiSpIGaAJEkqlBkgSZKkgpgBkiSpUEE0uwlNYwZIkiQVxwyQJEmFcgyQJElSQQyAJElScewCkySpUHaBSZIkFcQMkCRJhTIDJEmSVBAzQJIkFavcPEi5dy5JkoplBkiSpEI5BkiSJKkgZoAkSSqUGSBJkqSCmAGSJKlQUXAepNw7lyRJxTIDJElSoRwDJEmSVBAzQJIkFSoimt2EpjEDJEmSitOyGaCIyNVGHNbsZmgV8erT5wKQ+VCTW6JVRcSbeHnujc1uhlYha/TclcwsNyXTYC0bAEmSpPpyELQkSVJBzABJklQoF0KUJEkqiBkgSZIK5RggSZKkgpgBkiSpUGaAJEmSCmIGSJKkQjkLTJIkqSBmgCRJKpVjgCRJksphBkiSpEI5C0ySJKkgZoAkSSpURDS7CU1jBkiSJBXHAEiSJBXHLjBJkgrlQoiSJEkFMQMkSVKhnAYvSZJUEDNAkiSVymnwkiRJ5TADJElSqQpOgxR865IkqVRmgCRJKpVjgCRJksphBkiSpFKZAZIkSSqHGSBJkkpVcBqk4FuXJEmlMgMkSVKh0jFAkiRJ5TAAkiRJxbELTJKkUpXbA2YGSJIklccMkCRJpWorNwVkBkiSJBXHDJAkSaVyGrwkSVI5zABJklSqchNAZoAkSVJ5zABJklSqFpoFFhH7AacD7cCvMvNHXY6fBuxZ3VwDGJSZ/arHOoB7q8cmZuaBS6vPAEiSJDVVRLQDZwL7AJOA2yJiXGbeP79MZh5dU/5zwHY1l3glM0cvS512gUmSVKqIxn2WbCfg0cx8PDPnAOcCBy2h/OHAH9/IrRsASZKkZhsGPF2zPam6799ExAbARsBVNbtXi4jxEXFzRLy3OxXaBSZJUqkaOAQoIsYAY2p2jc3MsctxqcOAv2R
mR82+DTJzckRsDFwVEfdm5mNLuogBkCRJqrtqsLO4gGcyMKJme3h136IcBny2y7UnV//7eERcQ2V80BIDILvAJEkqVVs07rNktwEjI2KjiOhFJcgZ17VQRGwO9AduqtnXPyJ6V7+vC+wK3N/13K7MAEmSpKbKzHkRcRRwGZVp8Gdl5oSIOAEYn5nzg6HDgHMzM2tO3wL4n4jopJLY+VHt7LHFMQCSJKlUrbMMEJl5CXBJl33Hd9n+ziLO+yew9bLWZxeYJEkqjgGQJEkqjl1gkiQVKpe+QOEqywyQJEkqjhkgSZJK1UIvQ200M0CSJKk4ZoAkSSpVuQkgM0CSJKk8ZoAkSSqVs8AkSZLKYQZIkqRSOQtMkiSpHGaAJEkqVbkJIDNAkiSpPGaAJEkqlbPAJEmSymEGSJKkUpkBkiRJKocZoBa2z9u35eTvfJT29jZ+e+7VnPyzcQsdX3/Yuvzi5P9k3XXWYuasl/jEF85k8tQZAPz9d8ey03ab8s/xD3Hox3/cjOarBV133e2ceOIv6ezs5P3v34cxY96/0PE5c+by1a+eyoQJj9GvX19OO+2rDB8+mBtvvJNTTjmbuXPn0bNnD4455uO85S3bNuku1EpuvP5e/utH59DZ0cnBh+7OJz79noWOz5kzl+O+/ksemPAUa/frw0mnfIZhw9Zl7tx5fPf43/DgA0/R0dHJ/ge+lU9+ev8m3YVKZAaoRbW1BT/5/sc56GMnsd3eX+H9B76VzUcOW6jMD4/7MP/71+vZ6Z1f4wen/40Tjj1swbHT/udCPnn0zxrdbLWwjo4OTjjhF/zqV9/h4ovP5KKLruPRRycuVOa88y5nrbX6cMUVYzniiIM4+eTfAtC//1r8/Off4sILz+BHPzqar3711CbcgVpNR0cnPzzx95z5i6P527gTufSSW3js0ckLlTn/r9ez1lprcuGlJ/GRj+7L6af+GYArLruNuXPn8ZcLvs85f/42f/nzNUye/HwzbqNsbQ38tJgWbJIA3jx6Ux57cipPTnyWuXM7OO/Cm9h/3x0XKrP5yOFce+N9AFz7zwnsv88OC45dc+MEXnzplYa2Wa3tnnseYYMNhjJixBB69erJe96zO1deectCZa666hYOPnhvAN75zl256aa7yUxGjdqEwYMHADBy5Pq89toc5syZ2/B7UGu5797HGTFiEMNHDKJnrx688907cc3Vdy5U5pqr7uCAg3YF4B377sitNz9AZhIRvPLya8yb18Frr82lZ88e9FlztWbchgrV0AAoIt4WEWc2ss6V1XpD+jPpmekLtidPmc6wwf0XKnPv/U9x0Lt2AuCg/d7MWn3XYJ1+fRraTq08pk2bzpAh6y7YHjx4ANOmTf+3MkOHVsr06NFO375rMnPm7IXKXHbZPxk1ahN69epZ/0arpT07bSZDhq6zYHvw4HV4dtrMhcs8O4shQyplevRop0/f1Zk16yXese+OrL5Gb/bZ44vs944v89Ej9mNt//xqvIjGfVpM3QOgiNguIn4cEU8C3wMeXELZMRExPiLG17tdq4Kvn/i/7LbzFtx0yQ/ZbZctmDxlOh2dnc1ullZhjzzyFCef/FtOOOGzzW6KVnL33fsEbW1tXH71aVxy2Y/5/dmXMenpZ5vdLBWkLoOgI2Iz4PDq53ngT0Bk5p5LOi8zxwJjq9fIerRtZfHM1JkMX2/Agu1hQwcwucu/rKZMm8lh/3kaAGuu0Zv3vmsnXpj9ckPbqZXH4MEDmDr19TEW06ZNX9CtVVtmypTnGTJkXebN6+DFF/9F//5rATB16vMcddQPOOmko1l//aENbbta06DB/Zk6ZcaC7WnTZjCoS6Z60KB+TJ06g8FD1mHevA5eevEV+vXrw/9dfDO7vm1revbswToD1mL0dpsyYcKTDB8xqNG3UbbWS8w0TL0yQA8CewH7Z+bbMvO/gY461bVKGn/3Y2y60RA2GDGQnj3bef8Bb+HiK25fqMyA/n2JalrxmM8exNl
/uqYJLdXKYuutR/Lkk8/w9NNTmTNnLhdffB177bXTQmX22mtnzj//SgAuu+xGdtllGyKC2bNfYsyY7/LlL3+MHXYY1YzmqwVtudVGTJz4LJMnPcfcOfO47JJbefue2y1U5u17bseFf78RgH9cPp4377wFEcHQoetw6y0PAPDKy69x792Ps9FGBtZqnHpNgz8EOAy4OiIuBc6l6Dhz2XV0dHL0t37Lhb//Ou3tbZz9p2t44OFJfOtL7+OOe5/g4ituZ/e3bMEJXzuMTLjhlgf44rd+s+D8f/zl22y2yXr0WXM1Hr3lDI48Ziz/uO6eJt6Rmq1Hj3aOP/5IPvWpb9PR0cmhh76DkSM34PTT/8BWW41k77135n3v24djjjmVffYZw9pr9+G0074KwB/+cDETJ07hzDPP5cwzzwXgrLNOYMCAfs28JTVZjx7tHPvND/OZMafQ2dnJQQfvxqabDuNn/30+o7bckD322o6DD92dbx47lgP2+xprrb0mJ518JAAfPHxvjj/u1xxy4Dch4cCD38ZmbxrR5DsqT7aV+1dzZNavpyki1gQOotIVthfwO+D8zLy8G+fmaiMOW1oxqVtefbryl3bmQ01uiVYVEW/i5bk3NrsZWoWs0XNXMrOhEckmh5/TsOEmj/3xQy0VbdWlCywiegBk5r8y85zMPAAYDtwJfK0edUqSpGXkLLAV7tauOzJzZmaOzcy961SnJElSt9RrDFDrhXqSJGlhBf9tXa8AaGBEfGlxBzPTdfQlSVLT1CsAagf6UHRsKUlSiyt4Fli9AqApmXlCna4tSZL0hjgGSJKkUrXg7KxGqVcAtDdAROwJbFndNyEzr65TfZIkSd1WrwBo9Yi4BXgVmP/+hvdHxEnAwZk5uU71SpIkLVW9AqAzgJ9n5m9rd0bER4GfUVkdWpIkNVO5PWB1WwhxVNfgByAzfwdsXqc6JUmSuqVeGaBFBlYR0UZlirwkSWq2gqfB1ysDdHFE/LL6MlRgwYtRfwFcUqc6JUmSuqVeAdAxwCzgqYi4PSJuB54EZgNfqVOdkiRpWbRF4z4tpl4B0GjgVGAEcATwWypvgu9FZYVoSZKkpqlXAPQ/wGuZ+QrQH/h6dd8LwNg61SlJkpZBRuM+raZu7wLLzBnV7x8ExmbmX4G/RsRddapTkiSpW+oWAEVEj8ycR2VV6DENqFOSJC2LFhyb0yj1Ckb+CFwbEc8DrwDXA0TEplS6wSRJkpqmLgFQZp4YEVcCQ4HLMzOrh9qAz9WjTkmStIx8GeqKl5k3L2Lfw/WqT5IkqbscjyNJUqkKHgNUr2nwkiRJLcsMkCRJpSo4DVLwrUuSpFIZAEmSpOLYBSZJUqkKngZvBkiSJBXHDJAkSaVyGrwkSVI5zABJklSodAyQJElSOcwASZJUqoLTIAXfuiRJKpUZIEmSSuUsMEmSpHKYAZIkqVTOApMkSSqHGSBJkkrlGCBJkqRymAGSJKlU5SaAzABJkqTyGABJkqTi2AUmSVKh0kHQkiRJ5TADJElSqcwASZIklcMMkCRJpfJVGJIkSeUwAyRJUqkKToMUfOuSJKlUZoAkSSqVY4AkSZLKYQZIkqRSuQ6QJElSOcwASZJUKjNAkiRJzRMR+0XEQxHxaEQcu5gyH4iI+yNiQkScU7P/YxHxSPXzse7UZwZIkqRCZYvMAouIduBMYB9gEnBbRIzLzPtryowEvg7smpkzI2JQdf86wLeBHYEEbq+eO3NJdZoBkiRJzbYT8GhmPp6Zc4BzgYO6lPk0cOb8wCYzn63ufydwRWbOqB67AthvaRUaAEmSpGYbBjxdsz2puq/WZsBmEXFjRNwcEfstw7n/xi4wSZJK1cA0SESMAcbU7BqbmWOX4RI9gJHAHsBw4LqI2Hp522MAJEmS6q4a7Cwu4JkMjKjZHl7dV2sScEtmzgWeiIiHqQREk6kERbXnXrO09tgFJklSqSIa91my24CREbFRRPQCDgPGdSlzAdVAJyLWpdI
l9jhwGbBvRPSPiP7AvtV9S2QGSJIkNVVmzouIo6gELu3AWZk5ISJOAMZn5jheD3TuBzqAYzJzOkBEfI9KEAVwQmbOWFqdBkCSJJWqhRZCzMxLgEu67Du+5nsCX6p+up57FnDWstRnF5gkSSqOGSBJkkrVQhmgRjMDJEmSimMGSJKkUpWbAGrtAOjVp89tdhO0iol4U7OboFXIGj13bXYTJC2nlg6A9rvs+mY3QauIS9+5GwDXTbmoyS3RqmL3ofvz24cvbXYztAo5YrOlvr5qhUvHAEmSJJWjpTNAkiSpjpa+QvMqywyQJEkqjhkgSZJK5RggSZKkchgASZKk4tgFJklSqcrtATMDJEmSymMGSJKkQrUVnAYp+NYlSVKpzABJklSogtdBNAMkSZLKYwZIkqRCmQGSJEkqiBkgSZIKFQWngMwASZKk4pgBkiSpUAUngMwASZKk8pgBkiSpUGaAJEmSCmIGSJKkQkXBaZCCb12SJJXKAEiSJBXHLjBJkgrlIGhJkqSCmAGSJKlQbWaAJEmSymEGSJKkQjkGSJIkqSBmgCRJKpQZIEmSpIKYAZIkqVBRcArIDJAkSSqOGSBJkgrly1AlSZIKYgZIkqRCFTwEyAyQJEkqjxkgSZIKZQZIkiSpIAZAkiSpOHaBSZJUKLvAJEmSCmIGSJKkQrWZAZIkSSqHGSBJkgrlGCBJkqSCmAGSJKlQZoAkSZIKYgZIkqRCRcHTwMwASZKk4pgBkiSpUI4BkiRJKogZIEmSCmUGSJIkqSBmgCRJKpQZIEmSpIIYAEmSpOLYBSZJUqEKXgfRDJAkSSqPGSBJkgrlIGhJkqSCmAGSJKlQUXAapOBblyRJpep2Bigiemfma/VsjCRJahzHAC1BROwUEfcCj1S3t42I/657yyRJkuqkOxmgnwL7AxcAZObdEbFnXVslSZLqLgpOAXVnDFBbZj7VZV9HPRojSZLUCN3JAD0dETsBGRHtwOeAh+vbLEmSVG8FJ4C6lQH6DPAlYH1gGrBLdZ8kSdJKaakZoMx8FjisAW1RFzsM6MeRm29MWwSXTprGeU9OWmS5XQcN4LjRW/D5m+/ikdkvMWi13ozddXsm/esVAB584UXOeOCxRjZdLeq+Wx7gj/99AZ2dnez2nl1494f3Xuj45X+6husvvoW29jb69uvDx7/2QQYMWQeAT+/5ZYZvPBSAdQb153M//GTD26/W89jt93PF2L+RnZ1su+9beOv791no+B2X3MDtF19PtLXRa/XevOuoDzJw/cpz9M8/X87dV9xMtLWx75hD2XiHLZpxC0UrOQO01AAoIn4JZNf9mTmmLi0SUEnNfXaLTfjG7ffx/KtzOH2X0dzy3HQmVoOa+VZvb+egDdbjwVmzF9o/5ZVXOermuxrYYrW6zo5O/vcnf+NLpxxJ/4Fr8/3/PI3Ru27JehsOWVBm/ZHDOG7s0fRerRdXX3Aj5/3iIo78zkcB6NW7J9/+9Vea1Xy1oM6OTi77+Xkc/v3PstaAfvzm6JMZufNWCwIcgC332IHt3/02AB6+5V6u/NX5HHbC/+O5iVO4/7o7+PTPvs5L02dzznFncOT/fIu2dpenU2N050n7B3Bl9XMjMAhwPaA622ztvjzz8qtMfeU15mVy7dTn2GXQgH8r99FN1+e8JyYxp/PfYlRpIU88MJFBw9Zl4HoD6NGzBzvttR133XDfQmU2334kvVfrBcAmozZg5nOzmtFUrSSeefgp+g8dSP8h69Leswejdt+eR26+d6EyvddYfcH3ua/OWZByeOTmexm1+/b06NmTfkMG0H/oQJ55uOt8G9VbROM+raY7XWB/qt2OiN8DNyyufETsvpTrXdft1hVs3dV68dyrr8eZz7/6Gm9au+9CZTbpuybrrtab256fyfs2HL7QsSGrr8YZu4zm5XkdnP3oU0zokiFSeWY+/wL9B/VbsN1/YD8ef2Dxf+Fcf8ktbL3z610Sc+fM43tjTqW9vY13fWhvttt
t67q2V63vxemzWGvg689U33X78cxD//5Mjb/oOm694Go65nXw4ROPqp77AsM233BBmbXW7ceL0w241TjL8y6wjYDBSzh+zCL2JbANMAJoX9yJETEGsGutGwIY86aNOeW+f5+QN/O1OXz0utt4ce48Nu27JsdvN4ojb7yDlztcvUDdc9Pl43nqoac55vSjFuw76U/H0X9gP557ZjonH/0zhm08lEHD1m1iK7Wy2HH/3dlx/92ZcM14bvzT5RzwpY80u0mqamvBzEyjdGcM0ExeHwPUBswAjl1c+cw8oMv5uwLHAVOpTKFfrMwcC4ytnld0n87zr85h4Gq9F2yvu1pvpr82Z8H26j3a2aDPGvzXmyv/Cu/fqxffHr0F373rAR6Z/RJz584D4NEX/8WUl19l2Jqr88jslxp7E2op/dddm5nPvv4v7JnPzaL/umv/W7n7xz/Mxb//B1/96Wfp2ev1PyL6V/+lP3C9Abxp9KZMfGSyAVDh+g7ox+yabtIXn59F3wH//kzNN2r37bn0Z3+unrs2s5+bueDY7Odn0XdAv8WdKq1wSxwDFJUlIrcFBlY//TNz48z889IuHBF7R8Q1wPeBUzNzl8y8cAW0uQgPz36R9dZYncGr96ZHBG8fMpCbn52x4PjL8zo47JpbOOL68Rxx/XgefOHFBcHP2j17LPjFDlm9N+utsRpTXn61OTeilrHh5iOYNuk5npsynXlz53HrVXey7a5bLVRm4sOT+P0p5/G5H36Stfq/3uX6rxdfZu6cSlD94qyXePTeJ1hvwyUlglWC9TZbn5nPPMesqdPpmDuP+6+7g5E7L9w1OmPyswu+P3rbBPqvNxCAkTtvzf3X3cG8uXOZNXU6M595jvU226Ch7VfZlpgBysyMiEsyc6sllasVEe8Bvgm8AByXmYsdL6TF60z4+YOP8f3tt6I94PLJ05j4r5f5j03W5+HZL3HLczMWe+5W/dfmPzZdn3mdSQJnPPAYL82b17jGqyW192jnQ188hJ98ZSydnZ3s+u6dGLbREC749f+x4eYjGL3rVpz3iwt59ZXX+MW3zwZen+4+5alp/P7k84i2IDuTd314r4Vmj6lMbe3t7Hvk+zj3+J/R2dnJtvvswsANhnLtHy5m6Mj12WznrRl/0fU8efdDtLW3s1qf1Tng6Er318ANhrLFbtsx9jM/oK29nXd+5v3OAGuCkrvAInPJPU0R8QfglMy8s1sXjOgEJgF3s+jp8wd28zq532XXd6eotFSXvnM3AK6bclGTW6JVxe5D9+e3D1/a7GZoFXLEZvuRmQ0NSfa59MaGDTe5Yr9dl3hvEbEfcDqVscK/yswfLabcocBfgDdn5viI2BB4AHioWuTmzDxyae1ZbAYoInpk5jxgO+C2iHgM+BeV8beZmdsv5lRflCpJ0kqgrUWG21ZftXUmsA+VJMptETEuM+/vUq4v8AXgli6XeCwzRy9LnUvqArsV2B7oVsZmvsy8ttrI1YBNq7sfzUwHoUiSpEXZiUqs8DhARJwLHATc36Xc94CTWPSM82WypAAoADJzmd6hEBE9gB8AnwCeql5nRET8BvhmZs5dzrZKkqQVqIXGAA0Dnq7ZngTsXFsgIrYHRmTmxRHRNQDaKCLuBGZTGX+81DE0SwqABkbElxZ3MDNPXcyhHwN9gY0y88Vqo9cCTq5+vrC0RkmSpFXLItb6G1td/qY757YBpwJHLOLwFGD9zJweETsAF0TElpm5xBWAlxQAtQN9qGaClsH+wGZZM7o6M2dHxGeABzEAkiSpJTRy3l3tWn+LMJnKYsnzDa/um68vsBVwTWWFHoYA4yLiwMwcT/UVXZl5e3XM8mbA+CW1Z0kB0JTMPGFJJy9G1gY/NTs7Sl/cUJIkLdJtwMiI2IhK4HMY8KH5BzPzBWDByqvVdQa/Up0FNhCYUY0zNgZGAo8vrcKljgFaDvdHxEcz83cLXSziI1QyQJIkqQW0yiywzJwXEUcBl1HpgTorMyd
ExAnA+Mwct4TTdwdOiIi5QCdwZGYufrG8qiUFQHsvQ9trfRb4W0R8Ari9um9HYHXg4OW8piRJWoVl5iXAJV32Hb+YsnvUfP8r8NdlrW+xAVB3oqfFnDcZ2Dki9gK2rO6+JDOvXJ7rSZKk+mihWWANtzxvg1+iiFin+vWu6meh/csbWEmSJK0oKzwAotLtlVRXjK7ZP3974zrUKUmSllHJb1+rRwC0R2Y+VYfrSpIkrRD1CIDOp/IKDUmS1MJKHgNUj+xXwT9OSZK0MqhHBmhYRPx0cQcz8/N1qFOSJKnb6hEAvcLr6/9IkqQWVfILGuoRAE3PzLPrcF1JkqQVoh4B0Jw6XFOSJK1gDoJegTJzl677ImKTiPhWRExY0fVJkiQtq7qtgRQR60XE0RFxGzChWtdh9apPkiQtm7YGflrNCm9TRIyJiKuBa4ABwCeBKZn53cy8d0XXJ0mStKzqMQboDOAm4EOZOR4gSh5mLklSi2or+K/negRAQ4H3A6dExBDgz0DPOtQjSZK0XOoxCHp6Zv4iM98O7A3MAqZFxAMR8YMVXZ8kSVo+bdG4T6up67ikzJyUmadk5o7AQVQWSZQkSWqqhg3MzsyHgU83qj5JkrRkzgJrnBZMgkmSpNLUYxD0kpQ73FySpBbTimNzGmWFB0ARcSGLDnSCyrpAkiRJTVWPDNDJy3lMkiQ1kOsArUCZee2KvqYkSdKKVI9XYYyMiN9ExKkRMTwi/i8iXoqIuyPizSu6PkmSpGVVj1lgv6HyKoxngFuAs4B1ga9QeU2GJElqAS6EuGL1ycyxmXky8EpmnpeZr2bmFUDvOtQnSZK0TOoxCLqz5vvsJRyTJElN1IoLFDZKPQKgzSPiHirT3jepfqe6vXEd6pMkSVom9QiAtljEvgBGAF+vQ32SJGk5OA1+BcrMp+Z/j4jtgA8B7weeAP66ouuTJElaVvVYCXoz4PDq53ngT0Bk5p4rui5JkrT8WnF2VqPUowvsQeB6YP/MfBQgIo6uQz2SJEnLpR4B0CHAYcDVEXEpcC6+BV6SpJZTcgZohc+Ay8wLMvMwYHPgauCLwKCI+HlE7Lui65MkSVpWdVsCIDP/lZnnZOYBwHDgTuBr9RkGFxMAABoRSURBVKpPkiQtm7YGflpNQ9qUmTOrq0Pv3Yj6JEmSlqQeY4AkSdJKoOR1gFoxKyVJklRXZoAkSSqUs8AkSZIKYgAkSZKKYxeYJEmFKjkLUvK9S5KkQpkBkiSpUA6CliRJKogZIEmSChUuhChJklQOM0CSJBXKMUCSJEkFMQMkSVKhSs6ClHzvkiSpUGaAJEkqVJuzwCRJksphBkiSpEI5C0ySJKkgZoAkSSqUGSBJkqSCGABJkqTi2AUmSVKh2pvdgCYyAyRJkopjBkiSpEKVvBBiZLbmzUcU/FuRJBUpMxs6L+sHd13RsL9rvzF6n5aac9bSGaDpr45rdhO0ihiw2oEAzOm4o8kt0aqiV/v2rL/Nic1uhlYhE+/5ZsPrdBq8JElSQVo6AyRJkurHDJAkSVJBzABJklSodjNAkiRJ5TADJElSoRwDJEmSVBAzQJIkFarklaDNAEmSpOKYAZIkqVCOAZIkSSqIAZAkSSqOXWCSJBWqvdkNaCIzQJIkqThmgCRJKpSDoCVJkgpiBkiSpEK5EKIkSVJBzABJklSodscASZIklcMASJKkQrVF4z5LExH7RcRDEfFoRBy7iONHRsS9EXFXRNwQEaNqjn29et5DEfHObt37svygJEmSVrSIaAfOBN4FjAIOrw1wqs7JzK0zczTwX8Cp1XNHAYcBWwL7AT+rXm+JDIAkSSpUC2WAdgIezczHM3MOcC5wUG2BzJxds7kmMH8K20HAuZn5WmY+ATxavd6S7717PyJJkqTlFxFjImJ8zWdMzeFhwNM125Oq+7pe47MR8RiVDNDnl+XcrpwFJklSoRq5EnRmjgXGvsFrnAmcGREfAo4DPra81zIDJEmSmm0yMKJme3h
13+KcC7x3Oc8FDIAkSSpWe2TDPktxGzAyIjaKiF5UBjWPqy0QESNrNt8DPFL9Pg44LCJ6R8RGwEjg1qVVaBeYJElqqsycFxFHAZcB7cBZmTkhIk4AxmfmOOCoiHgHMBeYSbX7q1ruz8D9wDzgs5nZsbQ6DYAkSVLTZeYlwCVd9h1f8/0LSzj3RODEZanPAEiSpEKVPA6m5HuXJEmFMgMkSVKhGjkNvtWYAZIkScUxAyRJUqHMAEmSJBXEDJAkSYXqxgKFqywzQJIkqThmgCRJKpRjgCRJkgpiBkiSpEKZAZIkSSqIGSBJkgplBkiSJKkgZoAkSSpUuxkgSZKkchgASZKk4tgFJklSodp8FYYkSVI5zABJklSokrMgJd+7JEkqlBkgSZIK5UKIkiRJBTEDJElSoVwIUZIkqSBmgCRJKpTrAEmSJBXEDJAkSYVyFpgkSVJBzABJklQoM0CSJEkFMQMkSVKhSs6ClHzvkiSpUAZAkiSpOHaBSZJUqHAQtCRJUjnMAEmSVKiCE0BmgCRJUnnMAEmSVKiSxwAZALWwm254kJ+c9Hc6Ojs58JCd+egn91ro+J3jH+Mn/zWOxx6ZwgknfZi99t0WgCnPzODYL55NZjJvXgfvO3xXDvnAW5txC2oxN1x/Fz/6wdl0dHZy6Pv24lOfPmih43PmzOXrXzuT++9/gn79+nDyqV9g2LBBTJ78LAe+58tsuNF6AGyz7Ui+/Z1PNeMW1GLe/taN+fbX9qG9LTj3/Lv5+Vk3LXR8vSFrcer3D2Ctvr1pa2vjpNOv5uobHuNtu2zIsV/Yk54925k7t4MfnHYV/7z1qSbdhUpkANSiOjo6OeUH53P62DEMGrw2nzj8dHbbYxQbbTJkQZkhQ/vzre9/kP/97bULnbvuwLX45R8+R69ePXj55df48CEns9seWzJw0NqNvg21kI6OTr7/vbP45a+/yZDBA/jgB77BnnvuwCabDl9Q5m9/uZq11u7D/112Opdc/E9OPfkcTjntiwCMGDGYv55/UrOarxbU1hZ87xvv5MP/+UemTpvNuHM+zj+ueYRHHn9+QZnPfXpXLrrsAf5w3h2M3HhdfnPGB3jbu3/GzFmv8InPn8ezz73EZpsO5Pc/P4yd9/nvJt5NmUoeB1Pyvbe0+++byPD1BzBs+AB69uzBO/YbzXVXT1iozNBh67DpZuvR1uVlLj179qBXr0psO3fOPLIzG9Zuta5773mU9dcfwogRg+nZqwfvevdbueqq8QuVueqq8Rx00O4A7PvOnbnl5glk+vxo0UZvtR5PPj2TpyfPYu68Ti689H722WPkQmUS6NOnFwB9+/Tm2edeAmDCg9MWfH/40edYrXcPevVsb2j7Vba6ZIAi4vLM3Lce1y7Fc9NeYNDgfgu2Bw3ux4R7u58enjZ1Fl/+7K+Z9PTzHPWl/c3+iGefncGQIQMWbA8evA733vPowmWmzWDI0EqZHj3a6dN3dWbNehGAyZOf432HHEufNVfnc1/4ADvsuEXjGq+WNGRQX6ZMnb1ge8qzL7Ld1ustVOYnP7+O3//icI44fEfWWL0nHxrzx3+7zrvfsTn3PTCVOXM76t5mLSyi3H/g1KsLbGCdrqtuGjykH3/465d57tkX+NoXf8te+2zDOgP6NrtZWkkNHNifK648g379+zJhwuN8/qiT+fuFJ9OnzxrNbppa3IHv2pK/jLuHX/7uVrbfZhg/OfFA9jl0LPMTiyM3WZdjv7gnHzny3wMjqZ7q1QW2dkQcsrjP4k6KiDERMT4ixi+uTCkGDl6bZ6fNWrD97LRZy5XFGThobTbedAh33f74imyeVkKDBq3D1KnTF2xPmzaDQYPXWbjM4HWYOqVSZt68Dl568RX69etLr1496de/EkBvueXGjBgxmCefnNK4xqslTX32RYYOWWvB9tBBfZk67cWFynzw4G256LIHALjjnsn07t3OOv0rgfOQQX0Ze9qhfOm4C5k4aRZqvGjgp9XULQA
C9gcOWMRn/8WdlJljM3PHzNyxTu1aaWyx5Qiefup5npk0nblz5/GPS+9itz227Na5z06dxauvzgVg9uyXuefOJ1h/w0H1bK5WAlttvQkTn5rKpEnPMnfOPP7vkn+y5547LFRmzz134O9/vw6Ayy+7hZ132ZKIYMaM2XR0dALw9NPTmPjUVEYMH9zwe1BruXvCM2y0fn9GDFubnj3aOGC/UVxx7SMLlXlmymx23XlDADbdaAC9e/Vg+oyXWatvb35zxgc46fRrGH/XpCa0XqWrVxfYU5n5iTpduwg9erTz5W8czBc/80s6O5L93/tmNt50CGPPvJQtRo1gtz235P77JnLsF8/mxdkvc8O19/Orn1/OOecfw5NPPMtPT76QCMiED31sDzbdbGizb0lN1qNHO9847uP856d+QEdnJwcfsiebjhzBGT/9M1tutTF77rUjh7xvT77+tTN51zu/wNpr9+HHp3wegNvHP8AZPz2PHj3baYvg+O98irX79WnyHanZOjqS4394Ob/7+WG0t7Xx5wvu5pHHnudL/2937pkwhX9c+wjfP+VKfnT8u/jkR3YiE758/EUAfOywHdlw/f58fszb+PyYtwHwH5/5I9NnvNzMWypOyesART1meETEnZm53Ru8Rk5/ddyKapIKN2C1AwGY03FHk1uiVUWv9u1Zf5sTm90MrUIm3vNNMrOhIcndMy5q2CjobdfZv6XCrXplgD5Sp+tKkqQVpKUikgarVwB0USw8ty6oLAcBkJm5SZ3qlSRJWqp6BUBdBzG3AR8AvgLcWac6JUmSuqUuAVBmTgeIiDbgP4BjgLuA92Tm/fWoU5IkLZu2gvvA6rUSdE/gE8DRwA3AezPz0SWfJUmS1Bj16gJ7ApgH/ASYCGwTEdvMP5iZf6tTvZIkqZsKTgDVLQD6B5VBz9tWP7USMACSJElNU68xQEcs7lhEuHysJEktoOSFEOv1KoyFRES/iPhkRFyJs8AkSVKT1asLjIhYHTgI+BCwHdAXeC9wXb3qlCRJ3VdwAqg+GaCIOAd4GNgH+G9gQ2BmZl6TmZ31qFOSJKm76pUBGgXMBB4AHsjMji4rQ0uSpCYzA7SCZeZoKis/9wX+ERE3AH0dAC1JklpB3cYAZeaDwLeBb0fEDsDhwG0RMSkz31qveiVJUveUvBJ0vcYAvbl2OzNvz8yvABsAx9ajTkmSpO6qVwZobET0Ac4F/jj//V+ZmTgLTJKkllBwAqhuY4C2A/an8jqMv0TE3RFxbERsWI/6JEmSlkXdFkLMzIcy87uZOQr4KLA2cGVE3FivOiVJUvdFZMM+rabuK0FHRBswCBgMrAk8W+86JUmSlqSeK0HvRmXm13uBe6mMBzo6M1+oV52SJKn7Sh4DVJcAKCKeBp6iEvR8JzPN+kiSpJZRrwzQ2zLzqUUdiIgemTmvTvVKkiQtVb3GAP3v/C8R8fsux26tU52SJGkZRDTu02rqFQCtWfN9yy7HWvDHIEmSSlKvLrAlzXdrvblwkiQVqO5TwVtYvQKgfhFxMJWfbb+IOKS6P6isByRJktQ09QqArgUOrPl+QM0xX4UhSVILaMWxOY1SlwAoMz++uGMRcWg96pQkSequZnT/ndaEOiVJUhfRwE+raUYA1Io/B0mSVJC6vQpjCZwFJklSC3AM0AoWEfey6EAnqLwUVZIkqWnqlQHav07XlSRJK0jBCaC6BUCrZ+aDABHROzNfm38gInah8qJUSZKkpqjXIOhzar7f1OXYz+pUpyRJWgZt0bhPq6lXABSL+b6obUmSVLiI2C8iHoqIRyPi2EUc3z0i7oiIeRHxvi7HOiLirupnXHfqa8S7wLoOhnYWmCRJLaBVMhIR0Q6cCewDTAJui4hxmXl/TbGJwBHAVxZxiVcyc/Sy1FmvAGh4RPyUys92/neq28PqVKckSVo57QQ8mpmPA0TEucBBwIIAKDOfrB7rXBEV1isAOqbm+/gux7puS5KkVVxEjAHG1Owam5l
jq9+HAU/XHJsE7LwMl18tIsYD84AfZeYFSzuhXu8CO3txxyJi/XrUKUmSlk1E40alVIOdsUstuHw2yMzJEbExcFVE3JuZjy3phLq9CiMi3hIR74uIQdXtbSLiHODGetUpSZJWSpOBETXbw6v7uiUzJ1f/+zhwDbDd0s6pSwAUET8GzgIOBS6OiO8DlwO3ACPrUackSVo2LfQy1NuAkRGxUUT0Ag4DujWbKyL6R0Tv6vd1gV2pGTu0OPUaA/QeYLvMfDUi+lPp19tq/gAmSZKk+TJzXkQcBVwGtANnZeaEiDgBGJ+Z4yLizcD5QH/ggIj4bmZuCWwB/E91cHQblTFATQuAXs3MV6s3NTMiHjH4kSSptbTSy1Az8xLgki77jq/5fhuVrrGu5/0T2HpZ66tXALRxl4WINqrdzswD61SvJEnSUtUrADqoy/YpdapHkiQtpxZKADVcvabBX7u4YxGxaz3qlCRJ6q66BEDVJa0/QGVho0sz876I2B/4BrA63ZieJkmS6qtua+GsBOrVBfZrKvP5bwV+GhHPADsCx3ZndUZJkqR6qlcAtCOwTWZ2RsRqwFRgk8ycXqf6JEnSMmqlWWCNVq/s15zM7ASoTod/3OBHkiS1inplgDaPiHuq3wPYpLodQGbmNnWqV5IkdVu5KaB6BUBb1Om6kiRJb1i9psE/VY/rSpKkFSfMAK1YEfEikIs6RKULbK161CtJktQd9coA9a3HdSVJklaEeo0BkiRJLS6i3KUQy71zSZJULDNAkiQVq9xB0GaAJElSccwASZJUqJKnwZsBkiRJxTEDJElSscwASZIkFcMMkCRJhXIdIEmSpIKYAZIkqViOAZIkSSqGGSBJkgrlOkCSJEkFadkMUGaWG5Yuo4gYk5ljm92OVpaZzW7CSsPnqXt8prrPZ6q7vtHwGs0AaWU3ptkN0CrF50krms+UWo4BkCRJKk7LdoFJkqR6KzcPUu6dr1rsW9eK5POkFc1nSi0nHMgnSVKZ/jXv2oYFAWv2eHtLjbg2AyRJkorjGCBJkorVUkmZhjID1IIi4psRMSEi7omIuyJi54joFRE/iYhHI+KRiPh7RAyPiAHVMndFxNSImFyz3at6vfdGREbE5s2+N60YXX+nEbFhRLxS/b3fHRH/jIg3VY/tEREXLeY660bE3Ig4smbfLdXrTIyI52qepw0j4smIWLdabnj1OXwkIh6LiNNrnrk9qu07oOa6F0XEHnX8sWgpIuKl6n9rn5f7I+J3EdGzemyPiHiheuzBiDi55vwjujwTd0XEqOr17ltEfb+NiCdqyv6zWnZSdHkN+fw/67psn7uI602OiN7V7XWrz+TWNXXMqKnzHyv2J6hViQFQi4mItwD7A9tn5jbAO4CngR8AfYE3ZeZI4ALgb8CMzBydmaOBXwCnzd/OzDnVyx4O3FD9r1YNi/qdPlb9vW8LnE33VlV7P3Bz7XUyc+fq83Q88Kea5+nJ+WUiIqg8fxdUn8fNgD7AiTXXngR8c3luTg3xWPX3vDUwHPhAzbHrq8e2A/aPiF1rjtU+E6Mz8/6l1HNMTdm3Vp+jicBu8wtUA/m+mXlLdXsLoB3YLSLW7HK9DuATtTsy896aPwfH1dT5ju79KMoVDfxfqzEAaj1Dgecz8zWAzHwemAV8HDg6Mzuq+38DvAbstaSLRUQf4G3AJ4HD6thuNUg3f6drATO7cbnDgS8DwyJi+DI0Yy/g1epzSPW5PBr4RESsUS1zN/BCROyzDNdVg1V/d7cCwxZx7BXgrkUde4P+yMLP7mFAbbbncOD3wOXAQV3O/QlwdEQ4hENviAFQ67kcGBERD0fEzyLi7cCmwMTMnN2l7Hhgy6Vc7yDg0sx8GJgeETus+CarwRb3O92kmvZ/DPgScOqSLhIRI4ChmXkr8Gfgg8vQhi2B22t3VJ/PiVSe1/lOBI5bhuuqwSJiNWBn4NJFHOsPjASuq9n9wS5dYKsvpYof15T93+q+PwPvrQliPkg
lKKJm+9zqvq6Z64lUsp//0Y3b01K1NfDTWlqvRYXLzJeAHagsHf8c8CdgjzdwycN5/V9W52I32Kpgcb/T+V1gmwBfZOlrr3yQyl9EXa+zwmTmdQAR8bYVfW29YZtExF3ANGBKZt5Tc2y3iLgbmAxclplTa4517QJ7ZSn11HaBfRggM6cB9wF7R8RoYF5m3gcQETtSyYJPBK4EtouIdbpc84fAMfh3mN4AU4gtqJqSvga4JiLuBf4TWD8i+mbmizVFdwAWObgVoPqHxl7A1hGRVPrUMyKOSReAWikt7ncKnNml6DjgN0u53OHAkIj4cHV7vYgYmZmPdKMp9wPv69K2tYD1gUeBnWoOzc8CzevGddU4j2Xm6Oqg9hsj4sDMHFc9dn1m7h8RGwE3R8SfM/OuFVz//G6waSyc/Tkc2DwinqxurwUcCvxyfoHMfKQavNWOW9JyaMWxOY1i9NxiIuJNETGyZtdo4CEqg1pPjYj2armPAmsAVy3hcu8Dfp+ZG2Tmhpk5AniCmsGHWuks7nc6oku5twGPLe4iEbEZ0Cczh1WvsyGVf1V3Nwt0JbBG9Tmk+lyeAvw2M1+uLZiZlwP9gW26eW01UHWc4bHA1xdx7AngR8DX6lD134B383p3F9WZYR8Atq55Lg9i0c/licBX6tAuFcIAqPX0Ac6uTk29BxgFfIfKH06vAg9HxCNUZu8cvJRMzuHA+V32/RW7wVZmi/udfp3XxwDdTWXW4KdqyuxdnXo8KSImLeE63Xo2qs/dwcD7q8/jw1Sez8XNPDuRfw/S1DouoBLQLuofR78Ado+IDavbXccAvbW6/021z1hEvL+6/8ddyvcCyMxZwE3AtMx8vFp2N2ByZj5TU/91wKiIGFrbqMycANzxBu+7eBHRsE+r8VUYkiQV6tWOmxoWBKzW/paWioIcAyRJUrFaKiZpKLvAJElScQyAJElScewCkySpUFFwHqTcO5ckScUyAJJWUhHRUZ1WfF9EnFfzDq7ludaCN8ZHxIERcewSyvaLiP+3HHV8JyJct0VqKdHAT2sxAJJWXq9UXy+wFTAHOLL2YFQs8//HM3NcZv5oCUX6AcscAElSKzEAklYN1wObRsSGEfFQRPyOyruWRkTEvhFxU0TcUc0U9QGIiP0i4sGIuAM4ZP6FIuKIiDij+n1wRJwfEXdXP2+lsjLw/EUXf1wtd0xE3BYR90TEd2uu9c2ovNj3BuBNDftpSOqWkhdCdBC0tJKLyhu138Xrb/MeCXwsM2+OynuejgPekZn/ioivAV+KiP+i8m6lvai8u+tPi7n8T4FrM/PgqLzuog+V1yZslZmjq/XvW61zJyp57nERsTvwLyrvehpN5c+aO+jyBnlJahYDIGnltXpUXggJlQzQr4H1gKcy8+bq/l2ovE7lxuq/wHpRef3A5sAT8198GhF/AMYsoo69gI/Cgpf0vhAR/buU2bf6ubO63YdKQNQXOH/+u8EiYhySWkzrZWYaxQBIWnm9Mj8LM181yPlX7S7gisw8vEu5hc57gwL4YWb+T5c6vrgC65CkFcoxQNKq7WZg14jYFCAi1ozKm+AfBDaMiE2q5Rb3EtQrgc9Uz22PiLWBF6lkd+a7DPhEzdiiYRExiMpLLN8bEatHRF/ggBV8b5LeoKCtYZ9W03otkrTCZOZzwBHAHyPiHqrdX5n5KpUur4urg6CfXcwlvgDsGRH3Uhm/Myozp1PpUrsvIn6cmZcD5wA3Vcv9BeibmXdQGVt0N/B/wG11u1FJWka+DV6SpELN7byrYUFAz7bRLTXgyAyQJEkqjoOgJUkqVBQ8C8wMkCRJKo4ZIEmSCtWKKzQ3ihkgSZJUHAMgSZJUHLvAJEkqVrl5kHLvXJIkFcsMkCRJhXIavCRJUkHMAEmSVCwzQJIkScUwAyRJUqFcCFGSJKkgZoAkSSpWuXmQcu9ckiQVywyQJEmFch0gSZKkgkRmNrsNkiRJDWUGSJIkFccASJIkFccASJIkFcc
ASJIkFccASJIkFccASJIkFef/A4GQoXd7FytJAAAAAElFTkSuQmCC\n", + "text/plain": [ + "<Figure size 720x720 with 2 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAItCAYAAADR8MWyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3debxVdb3/8dfnHMQJZFIGGRyANFHTVLLMOYdyngqtW9262VzazdQsK5sszax+3lvUtaxbmal5Nc0hR5xnRXBEUEBBBURTFDnn8/tjb3BzZDgge58t39ezx36411rftb7fxVkdvrzX97tWZCaSJEklaenqBkiSJDWaHSBJklQcO0CSJKk4doAkSVJx7ABJkqTidOvqBkiSpK6x9rAjGzYVfN6Tf45G1dUZJkCSJKk4doAkSVJxvAUmSVKhIsrNQco9c0mSVCwTIEmSChUF5yDlnrkkSSqWCZAkSYVyDJAkSVJBTIAkSSqUCZAkSVJBTIAkSSpURFO9naKhTIAkSVJxTIAkSSpWuTlIuWcuSZKKZQdIkiQVx1tgkiQVymnwkiRJBTEBkiSpUCZAkiRJBTEBkiSpUFFwDlLumUuSpGKZAEmSVCjHAEmSJBXEBEiSpEKZAEmSJBXEBEiSpEKZAEmSJBXEBEiSpEIF0dVN6DImQJIkqTgmQJIkFcoxQJIkSQWxAyRJkorjLTBJkgrlLTBJkqSCmABJklQoEyBJkqSCmABJklSscnOQcs9ckiQVywRIkqRCOQZIkiSpICZAkiQVygRIkiSpICZAkiQVKgrOQco9c0mSVCwTIEmSCuUYIEmSpIKYAEmSVKiI6OomdBkTIEmSVJymTYAiItcaOqarm6HVxCtTzwUg8+EubolWFxGb0dY+oaubodVIa8soMrPcSKbBmrYDJEmS6stB0JIkSQUxAZIkqVA+CFGSJKkgJkCSJBXKMUCSJEkFMQGSJKlQJkCSJEkFMQGSJKlQzgKTJEkqiAmQJEmlcgyQJElSOUyAJEkqlLPAJEmSCmICJElSoSKiq5vQZUyAJElScewASZKk4ngLTJKkQvkgREmSpIKYAEmSVCinwUuSJBXEBEiSpFI5DV6SJKkcJkCSJJWq4Bik4FOXJEmlMgGSJKlUjgGSJEkqhwmQJEmlMgGSJEkqhwmQJEmlKjgGKfjUJUlSqUyAJEkqVDoGSJIkqRx2gCRJUnG8BSZJUqnKvQNmAiRJkspjAiRJUqlayo2ATIAkSVJxTIAkSSqV0+AlSZLKYQdIkqRSRQM/y2tKxL4R8XBEPBYRJyxh+08j4t7q55GIeL5mW1vNtos7c+reApMkSV0qIlqBs4C9gGnAHRFxcWZOXFgmM4+tKf9FYNuaQ8zLzG1WpE47QJIklap5ZoGNBh7LzMcBIuJc4CBg4lLKHwl8681U6C0wSZJUdxFxdETcWfM5umbzYGBqzfK06rolHWcjYBPgmprVa1WPeWtEHNyZ9pgASZJUqgbOAsvMscDYVXCoMcD5mdlWs26jzJweEZsC10TE+MyctKyDmABJkqSuNh0YWrM8pLpuScYAf65dkZnTq/99HLiOxccHLZEdIEmSStU8s8DuAEZGxCYR0Z1KJ+cNs7kiYnOgD3BLzbo+EbFm9fv6wE4sfezQIt4CkyRJXSozF0TEF4ArgFbg7MycEBGnAHdm5sLO0Bjg3MzMmt3fDvwqItqpBDun1s4eWxo7QJIklap5ZoGRmZcBl3VYd
3KH5W8vYb+bga1WtD5vgUmSpOKYAEmSVKrmCYAazgRIkiQVxw6QJEkqjrfAJEkqVDbwQYjNxgRIkiQVxwRIkqRSNdE0+EYzAZIkScUxAZIkqVTlBkAmQJIkqTwmQJIklcpZYJIkSeUwAZIkqVTOApMkSSqHCZAkSaUqNwAyAZIkSeUxAZIkqVTOApMkSSqHCZAkSaUyAZIkSSqHHaAmtteu7+C+a3/CAzf8lK9+7sA3bB82eH0u+/NJ3H7Fj7jiL99k8MC+i7Z9+PBdGH/9GYy//gw+fPgujWy2mtgNN9zFPvt8hr32OpqxY//6hu3z57/GMcf8iL32OpojjvhPpk2bCcC0aTPZeuvDOOigL3HQQV/i5JPPanTT1aTGjbub9+/7efbZ+7P8euwFb9g+f/5rHHvs6eyz92f50Ae/xvRpzyza9vDDUxjzoePZf/8vceABX+bVV+c3sukqnLfAmlRLS3Dm9/6d/T78A6Y/PYsbL/k+f7/qLh56dPqiMj/8xof54wXj+OP5N7Dre0Zxyglj+OQx/0WfXuty0jGHstN+J5HAzZd+n0uvuovn577UdSekLtfW1sYpp/yS3/72uwwY0I/DD/8Ke+zxLkaMGLaozF//eiXrrdeDq64ay6WX3sDpp/+OM888HoBhwwbyf//3865qvppQW1sb3z1lLP9z9rcZMKAfHzzia+y+x2hGjBi6qMz55/+TXuutyxVX/jeXXjqO03/ye37606+yYEEbXzvuTH704y+z+eabMGfOC3Tr1tqFZ1OogmOQgk+9ue2wzQgmTZnBlCef4bXX2vjrJbew/97bL1Zm85FDuP6mBwC4/uYJ7L/XdkAlObp63HjmzH2J5+e+xNXjxrP3ru9o+Dmoudx//6NstNEghg4dSPfua7Dffrtw9dW3LVbmmmtu45BD9gRgn3124pZb7iMzu6K5egu4//5HGTbs9WvqAx94L9dcfftiZa65+nYOOnh3APbZ5z3cesv9ZCY33XQvm222EZtvvgkAffqsR2urHSA1TkM7QBHx3ogwO++EDQf2YdpTsxYtT396FoMH9FmszPiJT3DQ+0cDcNC+O7Bez3Xo27tHdd/ZNfvOZsOBi++r8sycOYuBA9dftDxgQD9mzpz1hjKDBlXKdOvWSs+e6zJnzgtA5TbYwQd/mY985ATuvHNC4xqupvXMzNkMHFRzTQ1cwjX1TMdrah2ef/5Fpkx5CiL4j09+h0MP/U9+85u/NbTtqopo3KfJ1P0WWERsCxwFHAFMBi5cRtmjgaPr3abVxYnf/yM/PeXjfOTwXbnp9geZ/vQs2trbu7pZWg3179+Xa689mz591uOBBx7j85//PpdeehY9eqzT1U3TW1TbgjbuvutB/nr+aay11pr8+8dPZtSo4bz73Vt3ddNUiLokQBHxtoj4VkQ8BPwCeBKIzNw9M3+xtP0yc2xmbp+Z2y+tTCmemjGHIRv2W7Q8eFA/ps+cs1iZp2fOYcynf8q7P3Ai3/rxXwCY+8LL1X371uzbl6dmLL6vyjNgQD9mzHhu0fLMmbMYMKDfG8o8/XSlzIIFbbz44kv06bMe3buvQZ8+6wGw5ZYjGDZsIJMnT0dl6z+gLzOerrmmZizhmurf8Zp6md69ezJgYD+2334L+vRZj7XXXpNddt2OiRMnNbT9ovIqjEZ9mky9boE9BOwB7J+Z7612etrqVNdq6c77JjFik4FsNHQD1lijlSMOeDeXXnXXYmX69elJVGPF4z5/EOf85ToArrr+Pt6389b07rUuvXuty/t23pqrrr+v0aegJrPVViOZMuUppk6dwfz5r3HppTewxx6jFyuzxx7v4m9/uxqAK664iR133JqIYPbsubS1Vf4vPHXqDKZMeYqhQwc2/BzUXLbaaiRPPPE006bNZP7817jsshvZfY8dFiuz+x478H8XXQvAFVfczI47bkVE8N73bssjjz7JvHmvsmBBG3fcMYHhw4cuqRqpLup1C+xQYAxwbURcD
pxLU/b/mldbWzvHfvN3XPKHE2ltbeGcv1zHg49M45tfOZy7x0/m0qvuYpd3v51Tjh9DJtx424Mc883fAjBn7kv88Od/48ZLvgfAD352IXOcAVa8bt1aOfnkz/Af//Et2traOeyw9zFy5Eb87Gf/y5ZbjmTPPd/F4YfvxXHHncFeex1Nr149+OlPvwbAHXc8wM9//ke6detGS0vwne98nt69e3bxGamrdevWyje++Sn+45Pfob29nUMP25ORI4fx85//iS23HMEee4zm8MPfx/FfO5N99v4svXr14Cdn/CcAvXr14OMfP4AjjjiOCNhll+3Ybbfiw/+Gy5Zy/2qOes7wiIh1gYOAI6kkQr8H/paZV3Zi31xr6Ji6tU1leWXquQBkPtzFLdHqImIz2todDK5Vp7VlFJnZ0B7J8CP/1LBpnpP+fFRT9bbqNQaoG0BmvpSZf8rMA4AhwD3A8fWoU5IkraCCZ4HVawzQ7R1XZOac6iDnPetUpyRJUqfUawxQ83X1JEnS4gr+27peHaANIuIrS9uYmWfUqV5JkqTlqlcHqBXoQdF9S0mSmlzBs8Dq1QF6OjNPqdOxJUmS3hTHAEmSVKomnJ3VKPXqAO0JEBG7A6Oq6yZk5rV1qk+SJKnT6tUBWjsibgNeARa+v+GIiPgRcEhm+hIhSZLUZerVAfp/wH9n5u9qV0bER4H/ovJ0aEmS1JXKvQNWtwchbtGx8wOQmb8HNq9TnZIkSZ1SrwRoiR2riGihMkVekiR1tYKnwdcrAbo0In5dfRkqsOjFqL8ELqtTnZIkSZ1Srw7QccDzwBMRcVdE3AVMAV4AvlqnOiVJ0opoicZ9mky9OkDbAGcAQ4GPA7+j8ib47lSeEC1JktRl6tUB+hXwambOA/oAJ1bXzQXG1qlOSZK0AjIa92k2dXsXWGbOrn7/EDA2My8ALoiIe+tUpyRJUqfUrQMUEd0ycwGVp0If3YA6JUnSimjCsTmNUq/OyJ+B6yPiOWAeMA4gIkZQuQ0mSZLUZerSAcrM70fE1cAg4MrMzOqmFuCL9ahTkiStIF+Guupl5q1LWPdIveqTJEnqLMfjSJJUqoLHANVrGrwkSVLTMgGSJKlUBccgBZ+6JEkqlR0gSZJUHG+BSZJUqoKnwZsASZKk4pgASZJUKqfBS5IklcMESJKkQqVjgCRJksphAiRJUqkKjkEKPnVJklQqEyBJkkrlLDBJkqRymABJklQqZ4FJkiSVwwRIkqRSOQZIkiSpHCZAkiSVqtwAyARIkiSVxw6QJEkqjrfAJEkqVDoIWpIkqRwmQJIklcoESJIkqRwmQJIklcpXYUiSJJXDBEiSpFIVHIMUfOqSJKlUJkCSJJXKMUCSJEnlMAGSJKlUPgdIkiSpHCZAkiSVygRIkiSpHCZAkiQVKp0FJkmS1HUiYt+IeDgiHouIE5ZS5oMRMTEiJkTEn2rWfywiHq1+PtaZ+kyAJElSl4qIVuAsYC9gGnBHRFycmRNryowETgR2ysw5EdG/ur4v8C1geyCBu6r7zllWnSZAkiSVqqWBn2UbDTyWmY9n5nzgXOCgDmU+BZy1sGOTmc9U1+8DXJWZs6vbrgL27cypS5Ik1VVEHB0Rd9Z8jq7ZPBiYWrM8rbqu1tuAt0XETRFxa0TsuwL7voG3wCRJKlUDB0Fn5lhg7Js4RDdgJLAbMAS4ISK2WtmDmQBJkqSuNh0YWrM8pLqu1jTg4sx8LTMnA49Q6RB1Zt83sAMkSVKpWqJxn2W7AxgZEZtERHdgDHBxhzIXUUl/iIj1qdwSexy4Atg7IvpERB9g7+q6ZfIWmCRJ6lKZuSAivkCl49IKnJ2ZEyLiFODOzLyY1zs6E4E24LjMnAUQEd+l0okCOCUzZy+vTjtAkiSVqolehZGZlwGXdVh3cs33BL5S/XTc92zg7BWpz1tgkiSpOCZAkiSVqnkCoIZr6g7QK1PP7eomaDUTsVlXN0GrkdaWUV3dBEkrqak7QJucdV1XN
0Gricmf3w2A0++/smsbotXGV7fem1cW3N7VzdBqZK1uoxteZzbRGKBGcwyQJEkqTlMnQJIkqY4a+CToZmMCJEmSimMCJElSqRwDJEmSVA47QJIkqTjeApMkqVTl3gEzAZIkSeUxAZIkqVAtBccgBZ+6JEkqlQmQJEmFKvg5iCZAkiSpPCZAkiQVygRIkiSpICZAkiQVKgqOgEyAJElScUyAJEkqVMEBkAmQJEkqjwmQJEmFMgGSJEkqiAmQJEmFioJjkIJPXZIklcoOkCRJKo63wCRJKpSDoCVJkgpiAiRJUqFaTIAkSZLKYQIkSVKhHAMkSZJUEBMgSZIKZQIkSZJUEBMgSZIKFQVHQCZAkiSpOCZAkiQVypehSpIkFcQESJKkQhU8BMgESJIklccESJKkQpkASZIkFcQOkCRJKo63wCRJKpS3wCRJkgpiAiRJUqFaTIAkSZLKYQIkSVKhHAMkSZJUEBMgSZIKZQIkSZJUEBMgSZIKFQVPAzMBkiRJxTEBkiSpUI4BkiRJKogJkCRJhTIBkiRJKogJkCRJhTIBkiRJKogdIEmSVBxvgUmSVKiCn4NoAiRJkspjAiRJUqEcBC1JklQQEyBJkgoVBccgBZ+6JEkqVacToIhYMzNfrWdjJElS4zgGaBkiYnREjAcerS6/IyJ+UfeWSZIk1UlnEqCfA/sDFwFk5n0RsXtdWyVJkuouCo6AOjMGqCUzn+iwrq0ejZEkSWqEziRAUyNiNJAR0Qp8EXikvs2SJEn1VnAA1KkE6LPAV4BhwExgx+o6SZKkt6TlJkCZ+QwwpgFtUQe7DO3Dye8dQUtLcN7Ep/nlPVMX237YZgM44T2bMvOl+QD8fvx0zntwBjtu2JtvvHf4onLDe6/Dl66ayFWTZzW0/Wo+U++ZyM2/PZ9sb2fzPd/DNofsvdj2iVeMY8IVN9DS0kK3tdZkl08fSZ+hg3jxmVmcd8z36L1hfwD6j9yYnT99ZFecgprMjePu40c//APtbe0cevhufPJTBy62ff781zjphF8yccJkevXuyWlnfIHBgzfgtfkLOOXb/8OECZNpaWnh+BM/wg6jt+iisyhXyQnQcjtAEfFrIDuuz8yj69IiAZUX1H1nl5F89JL7mfGvV7no8HfyzymzeGzOy4uVu/SxZ/n2uMcWW3frU8+z/3l3AdBrzW5c++HRjJs6p2FtV3Nqb2vnxt+cx34nf4F1+/bmbyecxkbbb0WfoYMWlRmx8/Zssc/OAEy5435uOedCPvCNzwOw3oD1Oez0E7uk7WpObW3t/OB75zD2NycwYEBfjvzQyey2+3YMHzF4UZkLL7iO9dZbl0uvOIN/XHYLZ/7kXE4744tccP61le3/dyqzZs3lc58+jT+fdwotLT6eTo3RmSvtn8DV1c9NQH/A5wHV2Tv6r8cTc+cx9YVXeK09+ftjz7DXJv1W+DjvH74B1z85m1cWtNehlXorefaxKfQauD7rDVif1jW6MXyndzLljvsXK9N9nbUXfV/w6nyCgv95qOV6YPwkhg0bwJCh/Vmjezf2ff+OXHvNXYuVue6auznw4Eqneq+9R3PbrRPITCZNms7oHUcB0K9fL3r2XIcJD0xu+DmULqJxn2bTmVtgf6ldjog/ADcurXxE7LKc493Q6dYVbOC63Xn6X6/3M5/+16tsM2C9N5Tbd9P1Gb1hLyY/P4/v3TRpsX0A9h+xAWffN63u7VXze2n2XNZdv8+i5XX79eGZR6e8odyEf1zP/X+/lvYFC9j/219atP7FZ2ZxwVdPpfs6a7H9mP0ZtMWIRjRbTWzmzDkMGNh30fKAgX0Zf/+kpZbp1q2VHj3X4fnn/8Vmmw3jumvu5v0feDczZsziwYlTmDFjFlttPRypEVbmXWCbAAOWsf24JaxLYGtgKNC6tB0j4mjAW2uddPWUWVzy6DPMb0+O3GIQp+2xGR+5+PV/0W+wTnc267cuN3j7Sytg1Pt3ZdT7d+WxcXdw9/mXs/sXP
8o6fdbjqF+ewlo9e/DspCe58sdjOeKnJy2WGEkr4uBDd+Xxx5/iyCO+yaAN1+cd24yk1dtfDdfShMlMo3RmDNAcXh8D1ALMBk5YWvnMPKDD/jsB3wBmUJlCv1SZORYYW93vDeOOSjLjpfkM6rHmouVBPdZk5kuLpzvPv7pg0fe/PPg0J7x708W27zdiA658/DkWtBf9R6mqdfv24qXnXu8MvzRrDuv27bXU8sN32o5xv64EwK1rrEHrGmsAsMHwYaw3YH3mPvUMG4zYqL6NVlMbMKAPM2fMXrQ8c8Zs+vfvs8QyAwf2Y8GCNv714sv07t2DiOBrJ3xkUbl/O+o7bLTxIKRGWWZ3OyqPiHwHsEH10yczN83M85Z34IjYMyKuA74HnJGZO2bmJaugzUW4/5kX2LjX2gzpuRZrtAT7j+jPPzvM4tpgne6Lvr9v435vGCB9wIj+XPLosw1pr5rfBiM2Yu7Tz/LCzOdoe20Bk266m4122HqxMnOffmbR9yfvnkCvgRsAMG/ui7S3VcaRvTDzOebOeJaeA9ZvXOPVlEZtuSlPPDGDadOe4bX5C7j8H7ey2+7vXKzMbru/k4svGgfAVVfezuh3bUFEMG/eq7z88isA3HLzeFpbWxYbPC3V2zIToMzMiLgsM7fs7AEjYj/gJGAu8I3MXOp4IS1dW8K3xz3GOQdsRUsEf31oBo/OeZljdtiY8c++yNVTZvHxrQez58b9aGtPnn91Acdd89Ci/Qf3XJNBPdbktqee78KzUDNpaW1lp//4IP/43lm0tyeb7bEjfYcO4s5z/876w4ex8Q5bM+EfNzD9/odo6dZK93XXYbcvfhSApx98jLvOvZSWbq0Qwc5Hj2Gtnut28Rmpq3Xr1srXT/oYn/3Uj2lrb+fgQ3ZlxMghnPWL89li1Cbsvsd2HHLYrnz9+F+y3z5foVfvHvz49C8AMHv2C3zmUz+ipaWF/v378INTfbxcVyj5FlhkLvv2SET8L/CTzLynUweMaAemAfex5OnzB75hpyUfJzc567rOFJWWa/LndwPg9Puv7NqGaLXx1a335pUFt3d1M7QaWavbaDKzoV2SvS6/qWFjJK7ad6em6m4tNQGKiG6ZuQDYFrgjIiYBLwFBJRx651J29UWpkiS9BbQUPNx2WbfAbgfeCXQqsVkoM68HiIi1gIXzZB/LzFdWqoWSJEmr2LI6QAGQmZOWUeaNO0V0A34AfAJ4onqcoRHxW+CkzHxtJdsqSZJWoZLHAC2rA7RBRHxlaRsz84ylbDoN6AlskpkvAkTEesDp1c+XV7KtkiRJq8SyOkCtQA9Y4Wfh7w+8LWtGV2fmCxHxWeAh7ABJktQUSn705LI6QE9n5ikrcczMJUwty8y20h9uKEmSliwi9gV+RiWA+U1mnrqUcocB5wM7ZOadEbEx8CDwcLXIrZn5meXVt9wxQCthYkR8NDN/v9jBIj5CJQGSJElNoFlmgUVEK3AWsBeVR+ncEREXZ+bEDuV6UrmTdFuHQ0zKzG1WpM5ldYD2XJED1fg8cGFEfAJY+Frg7YG1gUNW8piSJGn1NZrKjPHHASLiXOAgYGKHct8FfsSS3zu6QpZ6+y8zZy9t27Jk5vTMfBdwCjCl+jklM0dn5vSVOaYkSVr1WqJxn4g4OiLurPnUvvx8MDC1Znladd0iEfFOYGhmXrqEU9kkIu6JiOsjYufOnPvKvA1+mSKib/XrvdXPYutXtmMlSZLeumpfeL6iIqIFOAP4+BI2Pw0My8xZEbEdcFFEjMrMF5Z1zFXeAaJy2yupPjG6Zv3C5U2XtJMkSWqsJpoFNh0YWrM8pLpuoZ7AlsB1lfe0MxC4OCIOzMw7gVcBMvOu6psr3gbcuawK69EB2i0zn6jDcSVJ0urpDmBkRGxCpeMzBjhq4cbMnAusv3A5Iq4DvlqdBbYBMLs623xTYCTw+PIqrEcH6G9UXqEhSZKaWLM8CTozF0TEF4ArqEyDPzszJ0TEKcCdmXnxM
nbfBTglIl4D2oHPdGa4TT06QE3yxylJkt4qMvMy4LIO605eStndar5fAFywovXVowM0OCJ+vrSNmfmlOtQpSZLUafXoAM3j9ef/SJKkJlXyCxrq0QGalZnn1OG4kiRJq0Q9OkDz63BMSZK0ijXLIOiusMofAZCZO3ZcFxHDI+KbETFhVdcnSZK0our2DKSI2DAijo2IO4AJ1brG1Ks+SZK0Yloa+Gk2q7xN1Xd9XAtcB/QDPgk8nZnfyczxq7o+SZKkFVWPMUD/D7gFOKr6eGqi5GHmkiQ1qZaC/3quRwdoEHAE8JOIGAicB6xRh3okSZJWSj0GQc/KzF9m5q7AnsDzwMyIeDAifrCq65MkSSunJRr3aTZ1HZeUmdMy8yeZuT1wEJWHJEqSJHWphg3MzsxHgE81qj5JkrRszgJrnCYMwSRJUmnqMQh6Wcodbi5JUpNpxrE5jbLKO0ARcQlL7ugElecCSZIkdal6JECnr+Q2SZLUQD4HaBXKzOtX9TElSZJWpXq8CmNkRPw2Is6IiCER8Y+I+FdE3BcRO6zq+iRJklZUPWaB/ZbKqzCeAm4DzgbWB75K5TUZkiSpCfggxFWrR2aOzczTgXmZ+dfMfCUzrwLWrEN9kiRJK6Qeg6Dba76/sIxtkiSpCzXjAwobpR4doM0j4n4q096HV79TXd60DvVJkiStkHp0gN6+hHUBDAVOrEN9kiRpJTgNfhXKzCcWfo+IbYGjgCOAycAFq7o+SZKkFVWPJ0G/DTiy+nkO+AsQmbn7qq5LkiStvGacndUo9bgF9hAwDtg/Mx8DiIhj61CPJEnSSqlHB+hQYAxwbURcDpyLb4GXJKnplJwArfIZcJl5UWaOATYHrgWOAfpHxH9HxN6ruj5JkqQVVbdHAGTmS5n5p8w8ABgC3AMcX6/6JEnSimlp4KfZNKRNmTmn+nToPRtRnyRJ0rLUYwyQJEl6Cyj5OUDNmEpJkiTVlQmQJEmFchaYJElSQewASZKk4ngLTJKkQpWcgpR87pIkqVAmQJIkFcpB0JIkSQUxAZIkqVDhgxAlSZLKYQIkSVKhHAMkSZJUEBMgSZIKVXIKUvK5S5KkQpkASZJUqBZngUmSJJXDBEiSpEI5C0ySJKkgJkCSJBXKBEiSJKkgdoAkSVJxvAUmSVKhWru6AV3IBEiSJBXHBEiSpEKV/CDEyGzOk48o+KciSSpSZjZ0XtYP7r2qYX/Xfn2bvZpqzllTJ0B3P3dJVzdBq4l3rn8AAK8suL2LW6LVxVrdRrPpJ87r6mZoNfL42R9seJ1Og5ckSSpIUydAkiSpfkyAJEmSCmICJElSoVpNgCRJksphAiRJUqEcAyRJklQQEyBJkhlFmHQAABjdSURBVApV8pOgTYAkSVJxTIAkSSqUY4AkSZIKYgdIkiQVx1tgkiQVqrWrG9CFTIAkSVJxTIAkSSqUg6AlSZIKYgIkSVKhfBCiJElSQUyAJEkqVKtjgCRJksphAiRJUqGcBSZJklQQEyBJkgplAiRJklQQEyBJkgplAiRJklQQEyBJkgrV6pOgJUmSymEHSJIkFcdbYJIkFarkFKTkc5ckSU0iIvaNiIcj4rGIOGEJ2z8TEeMj4t6IuDEitqjZdmJ1v4cjYp/O1GcCJElSoZplGnxEtAJnAXsB04A7IuLizJxYU+xPmfnLavkDgTOAfasdoTHAKGBD4J8R8bbMbFtWnSZAkiSpq40GHsvMxzNzPnAucFBtgcx8oWZxXWDhFLaDgHMz89XMnAw8Vj3eMpkASZJUqEYmQBFxNHB0zaqxmTm2+n0wMLVm2zTgXUs4xueBrwDdgT1q9r21w76Dl9ceO0CSJKnuqp2dscstuOxjnAWcFRFHAd8APrayx7IDJElSoZroQYjTgaE1y0Oq65bmXOC/V3JfwDFAkiSp690BjIyITSKiO5VBzRfXFoiIkTWL+wGPVr9fDIyJiDUjYhNgJHD78io0AZIkqVDNMgssMxdExBeAK4BW4
OzMnBARpwB3ZubFwBci4n3Aa8Acqre/quXOAyYCC4DPL28GGNgBkiRJTSAzLwMu67Du5JrvX17Gvt8Hvr8i9dkBkiSpUM2SAHUFxwBJkqTimABJklQoEyBJkqSCmABJklSoVhMgSZKkctgBkiRJxfEWmCRJhWppnldhNJwJkCRJKo4JkCRJhSo5BSn53CVJUqFMgCRJKpQPQpQkSSqICZAkSYXyQYiSJEkFMQGSJKlQPgdIkiSpICZAkiQVyllgkiRJBTEBkiSpUCZAkiRJBTEBkiSpUCWnICWfuyRJKpQdIEmSVBxvgUmSVKhwELQkSVI5TIAkSSpUwQGQCZAkSSqPCZAkSYVyDJAkSVJBTICa2L23PsQ5Z15Ee1s7exzwLg766J6Lbb/0z9dzzSW30draQs/e6/KZr3+IDQb1XbT95Zde4atH/Zjtd9mST/znoY1uvprQjePu40c//APtbe0cevhufPJTBy62ff781zjphF8yccJkevXuyWlnfIHBgzfgtfkLOOXb/8OECZNpaWnh+BM/wg6jt+iis1Az2WXLAXzzyG1pjeAv4x7nV/94+A1lPrD9EL500Cgyk4emzuXYX9/GjpttwEljtllUZvignnz5V7dy1T1PNbL5xSs5BbED1KTa29o5+/QLOelnn6Zf/158/ZNnst3OoxiyycBFZTZ+22B+cPYxrLlWd6688Gb++F9/55jvfnTR9vPGXs7m22zaFc1XE2pra+cH3zuHsb85gQED+nLkh05mt923Y/iIwYvKXHjBday33rpcesUZ/OOyWzjzJ+dy2hlf5ILzr61s/79TmTVrLp/79Gn8+bxTaGkp+denWgK+/eF38rGf3MCMOS/zt2++j6vvfYrHnn5xUZmN+/fgM/ttzgd/eA0vvPwa/XquCcCtDz/LAd+5CoBe667BNT/8AOMmzOyS81CZ/O3VpB6b+CQDh/RjwOB+dFujG+9537bcOW7CYmVGbTeCNdfqDsDIUcOY/czcRdsef2gqc2e/yNaj39bQdqt5PTB+EsOGDWDI0P6s0b0b+75/R6695q7Fylx3zd0cePDOAOy192huu3UCmcmkSdMZveMoAPr160XPnusw4YHJDT8HNZd3bNqXJ575F1Ofe4nX2pK/3z6V9207eLEyH9plE/73mkm88PJrAMx68dU3HOf92w3h+vFP88r8toa0W6+LyIZ9mk1dOkARcWU9jluS2c/Opd+A3ouW+27Qi9nPzl1q+Wv/fjvb7Lg5AO3t7fzhF5fwkS8eUPd26q1j5sw5DBj4+i3SAQP78swzc5Zaplu3Vnr0XIfnn/8Xm202jOuuuZsFC9qYNu0ZHpw4hRkzZjW0/Wo+A3qvzdOzX160PGPOywzovfZiZTYZ2JNNBvTgvBN25/yv78EuWw54w3H2Hz2MS26bWvf2SrXqdQtsg5XZKSKOBo5exW1Z7Y27/C4ef2gq3zrr8wBceeHNbPvuzenXv/dy9pQ65+BDd+Xxx5/iyCO+yaAN1+cd24yk1dtf6oTWlmDjAT056rTrGNhnbc49fnfef/KVvDivkght0Gst3jakF+MmzOjilpap4ElgdesA9YqIpY66zcwLl7J+LDAWIJoxL2ugvhv0YtbM5xctz352Ln036PWGcuPveIS/nfNPvnXW51ije+XH+egDU3jovslceeHNvDrvVRa81sZaa3fnqM/t37D2q/kMGNCHmTNmL1qeOWM2/fv3WWKZgQP7sWBBG/968WV69+5BRPC1Ez6yqNy/HfUdNtp4UMParuY08/l5DOq7zqLlgX3WYebz8xYrM2POPO59fDYL2pJpz73M5JkvsvGAHoyfUkkf99thCFfdPZ0FbUX/ylcXqFsHCNifJXcuE1hiB0ivG/72ocyY9hzPPDWLvhv04uZ/3sMXv/2RxcpMfngav/7R+Zz400/Rq2/PRetry1136e08/tA0Oz9i1Jab8sQTM5g27RkG9O/L5f+4lVN//LnFyuy2+zu5+KJxv
GObkVx15e2MftcWRATz5r1KZrLOOmtxy83jaW1tWWzwtMp0/+Q5bDygB0PWX4eZc+ax/+ihHDv2tsXKXHXPdA4YPYwLbppCnx7d2WRAT6Y++9Ki7fuPHsbpF4xvdNNVVfJzgOrVAXoiMz9Rp2MXobVbK//+lUP5wbFjaW9Ldt9/NEM3Hch5v76cTTcfwvY7b8kfz/o7r857lTO/8XsA1h/Qm+N+/MkubrmaVbdurXz9pI/x2U/9mLb2dg4+ZFdGjBzCWb84ny1GbcLue2zHIYftyteP/yX77fMVevXuwY9P/wIAs2e/wGc+9SNaWlro378PPzj1s118NmoGbe3Jd/54D787dhdaWoLzb5zMo0+9wDEHjWL8lNlcfd/T3PDATN47aiCXf3cf2tuTU/96P8+/NB+Awf3WYVDfdbjtkWe7+ExUoshc9bFjRNyTmdu+yWPk3c9dsqqapMK9c/3KgPBXFtzexS3R6mKtbqPZ9BPndXUztBp5/OwPkpkNzWTun/33ht173Lrv/k2VN9VrFONHll9EkiSpa9TrFtjfOwxiDipjfwAyM4fXqV5JkqTlqlcHaPsOyy3AB4GvAvfUqU5JkrQCWprqplRj1aUDlJmzACKiBfg34DjgXmC/zJxYjzolSZI6qy4doIhYA/gEcCxwI3BwZj5Wj7okSdLKKTgAqtstsMnAAuBM4Elg64jYeuHGpT0IUZIkqRHq1QH6J5VBz++ofmr5IERJkpqAD0JcxTLz40vbFhFvfBOeJElSA9UrAVpMRPQGDgOOAt4ObNiIeiVJ0tIVHADVrwMUEWsDB1Hp9GwL9AQOBm6oV52SJEmdUZcnQUfEn4BHgL2AXwAbA3My87rMbK9HnZIkacVEAz/Npl6vwtgCmAM8CDyYmW28/iRoSZKkLlWvQdDbRMTmwJHAPyPiOaBnRAzIzJn1qFOSJK2Ykp8EXa8EiMx8KDO/lZmbA18GzgHuiIib61WnJElSZ9TrSdA7ZOYdC5cz8y7grog4Dti5HnVKkqQVU3AAVLcEaGxEPBoR342ILRauzApngUmSpC5VrzFA20bEZsAY4PyIeA34M3BuZk6pR52SJGnFRJQ7P6meY4AezszvZOYWwEeBXsDVEXFTveqUJEnqjLo/CToiWoD+wABgXeCZetcpSZKWr+QxQPV8EvTOVKbBHwyMB84Fjs3MufWqU5IkqTPqNQtsKvAElU7PtzPT1EeSJDWNeiVA783MJ5a0ISK6ZeaCOtUrSZI6KQq+B1avQdB/XPglIv7QYdvtdapTkiSpU+qVAK1b831Uh20F9zclSWoedZsK/hZQr3Nf1oMFyn3ogCRJagr1SoB6R8QhVDpYvSPi0Or6oPI8IEmS1MVKHgNUrw7Q9cCBNd8PqNnmqzAkSVKXqterMP59adsi4rB61ClJklZMwQFQl4x/+mkX1ClJkrRI3V+FsQQldzglSWoaJY8B6ooEyFlgkiSpS9XrVRjjWXJHJ6i8FFWSJHWxggOgut0C279Ox5UkSXrT6tUBWjszHwKIiDUz89WFGyJiRyovSpUkSV2opeAIqF5jgP5U8/2WDtv+q051SpIkdUq9EqBYyvclLUuSpC5Q8l/IjXgXWMfB0M4CkyRJXapeCdCQiPg5lc7lwu9UlwfXqU5JkqROqVcH6Lia73d22NZxWZIkdYGIcm/K1OtdYOcsbVtEDKtHnZIkSZ1VtydBR8S7I+LwiOhfXd46Iv4E3FSvOiVJUudFAz/Npi4doIg4DTgbOAy4NCK+B1wJ3AaMrEedkiRJnVWvMUD7Adtm5isR0QeYCmyZmVPqVJ8kSVpBvgx11XslM18ByMw5wKN2fiRJUrOoVwK0aURcXLO8Se1yZh5Yp3olSVInFRwA1a0DdFCH5Z/UqR5JkqQVVq9p8NcvbVtE7FSPOiVJ0oqp21Twt4C6dIAiohX4IJWnPl+emQ9ExP7A14G1gW3rUa8kSVJn1OsW2P8AQ4HbgZ9HxFPA9sAJmXlRn
eqUJEkroORZYPXqAG0PbJ2Z7RGxFjADGJ6Zs+pUnyRJeguLiH2BnwGtwG8y89QO23cBzgS2BsZk5vk129qA8dXFJzsz2apeHaD5mdkOUH0W0ON2fiRJajbNEQFVh86cBewFTAPuiIiLM3NiTbEngY8DX13CIeZl5jYrUme9OkCbR8T91e8BDK8uB5CZuXWd6pUkSW89o4HHMvNxgIg4l8qM8kUdoIXPE4yI9lVRYb06QG+v03ElSdIqEg1MgCLiaODomlVjM3Ns9ftgKm+NWGga8K4VOPxaEXEnsAA4tTPjjes1Df6JehxXkiS9NVU7O2OXW3DlbJSZ0yNiU+CaiBifmZOWtUO9psG/COSSNlG5BbZePeqVJElvSdOpzB5faEh1Xadk5vTqfx+PiOuoPG6n8R2gzOxZj+NKkqRVJ6JpHoV4BzAyIjah0vEZAxzVmR2rL11/OTNfjYj1gZ2AHy9vv6Y5c0mSVKbMXAB8AbgCeBA4LzMnRMQpEXEgQETsEBHTgCOAX0XEhOrubwfujIj7gGupjAGa+MZaFlevQdCSJKnpNcc0eIDMvAy4rMO6k2u+30Hl1ljH/W4GtlrR+kyAJElScUyAJEkqVCOnwTcbEyBJklQcEyBJkoplAiRJklQMEyBJkgrVRM8Barhyz1ySJBXLBEiSpGI5BkiSJKkYJkCSJBXK5wBJkiQVpGkToMwst1u6giLi6Mwc29XtaGaZ2dVNeMvweuocr6nO85rqpP9p/DVlAqS3uqO7ugFarXg9aVXzmlLTsQMkSZKK07S3wCRJUr2Vm4OUe+arF++ta1XyetKq5jWlphMO5JMkqUwvLbi+YZ2Adbvt2lQjrk2AJElScRwDJElSsZoqlGkoE6AmFBEnRcSEiLg/Iu6NiHdFRPeIODMiHouIRyPi/yJiSET0q5a5NyJmRMT0muXu1eMdHBEZEZt39blp1ej4M42IjSNiXvXnfl9E3BwRm1W37RYRf1/KcdaPiNci4jM1626rHufJiHi25nraOCKmRMT61XJDqtfhoxExKSJ+VnPN7VZt3wE1x/17ROxWxz8WLUdE/Kv639rrZWJE/D4i1qhu2y0i5la3PRQRp9fs//EO18S9EbFF9XgPLKG+30XE5JqyN1fLTosOryFf+Luuw/K5Szje9IhYs7q8fvWa3Kqmjtk1df5z1f4JanViB6jJRMS7gf2Bd2bm1sD7gKnAD4CewGaZORK4CLgQmJ2Z22TmNsAvgZ8uXM7M+dXDHgncWP2vVg9L+plOqv7c3wGcA3y9E8c5Ari19jiZ+a7q9XQy8Jea62nKwjIREVSuv4uq1+PbgB7A92uOPQ04aWVOTg0xqfpz3goYAnywZtu46rZtgf0jYqeabbXXxDaZOXE59RxXU/Y91evoSWDnhQWqHfmemXlbdfntQCuwc0Ss2+F4bcAnaldk5via34MX19T5vs79UZQrGvi/ZmMHqPkMAp7LzFcBMvM54Hng34FjM7Otuv63wKvAHss6WET0AN4LfBIYU8d2q0E6+TNdD5jTicMdCfwnMDgihqxAM/YAXqleh1Svy2OBT0TEOtUy9wFzI2KvFTiuGqz6s7sdGLyEbfOAe5e07U36M4tfu2OA2rTnSOAPwJXAQR32PRM4NiIcwqE3xQ5Q87kSGBoRj0TEf0XErsAI4MnMfKFD2TuBUcs53kHA5Zn5CDArIrZb9U1Wgy3tZzq8GvtPAr4CnLGsg0TEUGBQZt4OnAd8aAXaMAq4q3ZF9fp8ksr1utD3gW+swHHVYBGxFvAu4PIlbOsDjARuqFn9oQ63wNZeThWn1ZT9Y3XdecDBNZ2YD1HpFFGzfG51Xcfk+kkq6ee/deL0tFwtDfw0l+ZrUeEy81/AdlQeHf8s8BdgtzdxyCN5/V9W5+JtsNXB0n6mC2+BDQeOYfnPXvkQlb+IOh5nlcnMGwAi4r2r+th604ZHxL3ATODpzLy/ZtvOEXEfMB24IjNn1GzreAts3nLqqb0F9
mGAzJwJPADsGRHbAAsy8wGAiNieSgr+JHA1sG1E9O1wzB8Cx+HfYXoTjBCbUDWSvg64LiLGA58GhkVEz8x8sabodsASB7cCVH9p7AFsFRFJ5Z56RsRx6QOg3pKW9jMFzupQ9GLgt8s53JHAwIj4cHV5w4gYmZmPdqIpE4HDO7RtPWAY8BgwumbTwhRoQSeOq8aZlJnbVAe13xQRB2bmxdVt4zJz/4jYBLg1Is7LzHtXcf0Lb4PNZPH050hg84iYUl1eDzgM+PXCApn5aLXzVjtuSSuhGcfmNIq95yYTEZtFxMiaVdsAD1MZ1HpGRLRWy30UWAe4ZhmHOxz4Q2ZulJkbZ+ZQYDI1gw/1lrO0n+nQDuXeC0xa2kEi4m1Aj8wcXD3OxlT+Vd3ZFOhqYJ3qdUj1uvwJ8LvMfLm2YGZeCfQBtu7ksdVA1XGGJwAnLmHbZOBU4Pg6VH0h8AFev91FdWbYB4Gtaq7Lg1jydfl94Kt1aJcKYQeo+fQAzqlOTb0f2AL4NpVfTq8Aj0TEo1Rm7xyynCTnSOBvHdZdgLfB3sqW9jM9kdfHAN1HZdbgf9SU2bM69XhaRExbxnE6dW1Ur7tDgCOq1+MjVK7Ppc08+z5v7KSpeVxEpUO7pH8c/RLYJSI2ri53HAP0nur6zWqvsYg4orr+tA7luwNk5vPALcDMzHy8WnZnYHpmPlVT/w3AFhExqLZRmTkBuPtNnnfxIqJhn2bjqzAkSSrUK223NKwTsFbru5uqF+QYIEmSitVUfZKG8haYJEkqjh0gSZJUHG+BSZJUqCg4Byn3zCVJUrHsAElvURHRVp1W/EBE/LXmHVwrc6xFb4yPiAMj4oRllO0dEZ9biTq+HRE+t0VqKtHAT3OxAyS9dc2rvl5gS2A+8JnajVGxwv8fz8yLM/PUZRTpDaxwB0iSmokdIGn1MA4YEREbR8TDEfF7Ku9aGhoRe0fELRFxdzUp6gEQEftGxEMRcTdw6MIDRcTHI+L/Vb8PiIi/RcR91c97qDwZeOFDF0+rljsuIu6IiPsj4js1xzopKi/2vRHYrGF/GpI6peQHIToIWnqLi8obtd/P62/zHgl8LDNvjcp7nr4BvC8zX4qI44GvRMSPqbxbaQ8q7+76y1IO/3Pg+sw8JCqvu+hB5bUJW2bmNtX6967WOZpKzn1xROwCvETlXU/bUPldczcd3iAvSV3FDpD01rV2VF4ICZUE6H+ADYEnMvPW6vodqbxO5abqv8C6U3n9wObA5IUvPo2I/wWOXkIdewAfhUUv6Z0bEX06lNm7+rmnutyDSoeoJ/C3he8Gi4iLkdRkmi+ZaRQ7QNJb17yFKcxC1U7OS7WrgKsy88gO5Rbb700K4IeZ+asOdRyzCuuQpFXKMUDS6u1WYKeIGAEQEetG5U3wDwEbR8TwarmlvQT1auCz1X1bI6IX8CKVdGehK4BP1IwtGhwR/am8xPLgiFg7InoCB6zic5P0JgUtDfs0m+ZrkaRVJjOfBT4O/Dki7qd6+yszX6Fyy+vS6iDoZ5ZyiC8Du0fEeCrjd7bIzFlUbqk9EBGnZeaVwJ+AW6rlzgd6ZubdVMYW3Qf8A7ijbicqSSvIt8FLklSo19rvbVgnYI2WbZpqwJEJkCRJKo6DoCVJKlQUPAvMBEiSJBXHBEiSpEI14xOaG8UESJIkFccOkCRJKo63wCRJKla5OUi5Zy5JkoplAiRJUqGcBi9JklQQEyBJkoplAiRJklQMEyBJkgrlgxAlSZIKYgIkSVKxys1Byj1zSZJULBMgSZIK5XOAJEmSChKZ2dVtkCRJaigTIEmSVBw7QJIkqTh2gCRJUnHsAEmSpOLYAZIkScWxAyRJkorz/wGyQUm1xARjrAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "<Figure size 720x720 with 2 Axes>" + ] + }, + "metadata": { + "needs_background": 
"light" + }, + "output_type": "display_data" + } + ], + "source": [ + "best.show_results()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best._model.export('table-type-classifier')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..c24fe5b --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore::DeprecationWarning diff --git a/download_arxiv_ml_papers.sh b/scripts/download_arxiv_s3_papers.sh similarity index 72% rename from download_arxiv_ml_papers.sh rename to scripts/download_arxiv_s3_papers.sh index dcc62a5..cfbe354 100755 --- a/download_arxiv_ml_papers.sh +++ b/scripts/download_arxiv_s3_papers.sh @@ -1,10 +1,20 @@ #!/bin/bash + +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + index_dir="index" papers_dir="papers" src_dir="src" mkdir -p "${index_dir}" "${papers_dir}" "${src_dir}" -jq -r '.[] | select(.arxiv_id) | "/"+.arxiv_id+"."' pwc/papers-with-abstracts.json | sort -u > wildcards.txt +python <<EOF +import pandas as pd, re +arxiv_no_version = re.compile(r'^(\d+\.\d+)(v\d+)?$') +ids = pd.read_csv('arxiv-papers.csv.xz')['arxiv_id'] +ids = ids.str.replace(arxiv_no_version, r'/\1.') +ids.to_csv('wildcards.txt', header=False, index=False) +EOF + aws s3 cp --request-payer requester s3://arxiv/src/arXiv_src_manifest.xml . 
xmllint --xpath '//filename/text()' arXiv_src_manifest.xml > tars.txt diff --git a/scripts/pull_docker_images.sh b/scripts/pull_docker_images.sh new file mode 100755 index 0000000..067695b --- /dev/null +++ b/scripts/pull_docker_images.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +docker pull arxivvanity/engrafo:b3db888fefa118eacf4f13566204b68ce100b3a6 diff --git a/setup.py b/setup.py index 6bd0111..87e81a9 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + import os import re @@ -6,12 +8,12 @@ directory = os.path.dirname(os.path.abspath(__file__)) # Extract version information -# path = os.path.join(directory, 'sota_extractor2', '__init__.py') +# path = os.path.join(directory, 'axcell', '__init__.py') # with open(path) as read_file: # text = read_file.read() # pattern = re.compile(r"^__version__ = ['\"]([^'\"]*)['\"]", re.MULTILINE) # version = pattern.search(text).group(1) -version="2.0-alpha" +version="1.0.0" # # Extract long_description # path = os.path.join(directory, 'README.md') @@ -19,13 +21,14 @@ # long_description = read_file.read() long_description = "" setuptools.setup( - name='sota_extractor2', + name='axcell', version=version, - url='https://...', - description='System for extracting data from arxiv papers', - long_description_content_type='text/markdown', - long_description=long_description, - license='???', + url='https://github.com/paperswithcode/axcell', + description='System for extracting machine learning results from arxiv papers', + author='Papers with Code', +# long_description_content_type='text/markdown', +# long_description=long_description, + license='Apache License 2.0', packages=setuptools.find_packages(), include_package_data=True, @@ -39,9 +42,8 @@ ], project_urls={ # Optional - 'Homepage': 'https://...', - 'Source': 'https://...', - 'Bug Reports': 'https://...', - 'Citation': 'https://...', + 
'Homepage': 'https://github.com/paperswithcode/axcell', + 'Source': 'https://github.com/paperswithcode/axcell', + 'Citation': 'https://arxiv.org/abs/2004.14356', }, -) \ No newline at end of file +) diff --git a/sota_extractor2/__init__.py b/sota_extractor2/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sota_extractor2/data/db.py b/sota_extractor2/data/db.py deleted file mode 100644 index c30d964..0000000 --- a/sota_extractor2/data/db.py +++ /dev/null @@ -1 +0,0 @@ -raise NotImplementedError() \ No newline at end of file diff --git a/sota_extractor2/data/doc_utils.py b/sota_extractor2/data/doc_utils.py deleted file mode 100644 index 4cfd3f2..0000000 --- a/sota_extractor2/data/doc_utils.py +++ /dev/null @@ -1,154 +0,0 @@ -import re -from bs4 import BeautifulSoup, Comment, Tag -import codecs - -def _handle_reference(el): - if el.get('href', "").startswith("#"): - r = str(el.get('href')) - el.clear() # to remove it's content from the descendants iterator - return "xxref-" + r[1:] - - -def _handle_anchor(el): - if el.get('id', ""): - id_str = el.get('id', "") - el.clear() # to remove it's content from the descendants iterator - return "xxanchor-" + id_str - - -def _handle_table(el): - if el.name.lower() == 'table': - id_str = el.get('id', "xxunk") - el.clear() # to remove it's content from the descendants iterator - return f"xxtable-xxanchor-" + id_str - - -_transforms_el = [ - _handle_reference, - _handle_table, - _handle_anchor, -] - - -def transform(el): - if isinstance(el, Tag): - for f in _transforms_el: - r = f(el) - if r is not None: - return transform(r) - elif not isinstance(el, Comment): - return str(el) - return '' - - -def get_text(*els): - t = " ".join([transform(t) - for el in els for t in getattr(el, 'descendants', [el])]) - t = re.sub("^[aA]bstract ?", "", t) - t = re.sub("[ \n\xa0]+", " ", t) - t = re.sub("[;,()]* (#[A-Za-z0-9]+) [;,()]*", r" \1 ", t) - t = re.sub(r" (#[A-Za-z0-9]+) *\1 ", r" \1 ", t) - return t.strip() - - 
-def content_in_section(header, names=['h3', 'h4'], skip_comments=True): - for el in header.next_siblings: - if getattr(el, 'name', '') in names: - break - if skip_comments and isinstance(el, Comment): - continue - yield el - - -def get_class(el): - if hasattr(el, 'get'): - # fixme: less convoluted way to return '' if calss is not found - return (el.get('class', [''])+[''])[0] - else: - return '' - - -def get_name(el): - return hasattr(el, 'name') and el.name or '' - - -def _group_bibliography(el): - if get_class(el) == 'thebibliography': - return [get_text(i) for i in el.select('p.bibitem')] - return [] - - -def _group_table(el): - if get_class(el) == 'table': - return [get_text(el)] - return [] - - -class ParagraphGrouper: - def __init__(self): - self.els = [] - self.join_next_p = False - - def collect(self, el): - if get_name(el) == 'table': - self.join_next_p = True - elif get_name(el) == "p": - if self.join_next_p: - self.join_next_p = False - self.els.append(el) - else: - return self.flush(new_els=[el]) - else: - self.els.append(el) - return [] - - def flush(self, new_els=None): - text = get_text(*self.els) - if new_els is None: - new_els = [] - if isinstance(new_els, Tag): # allow for one tag to be passed - new_els = [new_els] - self.els = new_els - if text: - return [text] - return [] - - def reset(self): - self.els = [] - - -_group_el = [ - _group_bibliography, - _group_table, -] - - -def group_content(elements): - par_gruop = ParagraphGrouper() - for el in elements: - fragments = [frag for grouper in _group_el for frag in grouper(el)] - if fragments: - fragments = par_gruop.flush() + fragments - else: - fragments = par_gruop.collect(el) - for frag in fragments: - yield frag - - for frag in par_gruop.flush(): - yield frag - - -def set_ids_by_labels(soup): - captions = soup.select(".caption") - prefix = "tex4ht:label?:" - for caption in captions: - el = caption.next_sibling - if isinstance(el, Comment) and el.string.startswith(prefix): - label = 
el.string[len(prefix):].strip() - for table in caption.parent.select("table"): - table["id"] = label - -def read_html(file): - with codecs.open(file, 'r', encoding='UTF-8') as f: - text = f.read() - return BeautifulSoup(text, "html.parser") diff --git a/sota_extractor2/data/paper_collection.py b/sota_extractor2/data/paper_collection.py deleted file mode 100644 index 332e904..0000000 --- a/sota_extractor2/data/paper_collection.py +++ /dev/null @@ -1,111 +0,0 @@ -from .elastic import Paper as PaperText, Fragments -from .table import Table, read_tables -from .json import load_gql_dump -from pathlib import Path -import re -import pickle -from joblib import Parallel, delayed -from collections import UserList -from ..helpers.jupyter import display_table - -class Paper: - def __init__(self, paper_id, text, tables, annotations): - self.paper_id = paper_id - if text is not None: - self.text = text - else: - self.text = PaperText() - self.text.fragments = Fragments() - self.tables = tables - self._annotations = annotations - if annotations is not None: - self.gold_tags = annotations.gold_tags.strip() - else: - self.gold_tags = '' - - -arxiv_version_re = re.compile(r"v\d+$") -def remove_arxiv_version(arxiv_id): - return arxiv_version_re.sub("", arxiv_id) - - -def _load_texts(path, jobs): - files = list(path.glob("**/text.json")) - texts = Parallel(n_jobs=jobs, prefer="processes")(delayed(PaperText.from_file)(f) for f in files) - return {remove_arxiv_version(text.meta.id): text for text in texts} - - -def _load_tables(path, annotations, jobs): - files = list(path.glob("**/metadata.json")) - tables = Parallel(n_jobs=jobs, prefer="processes")(delayed(read_tables)(f.parent, annotations.get(remove_arxiv_version(f.parent.name))) for f in files) - return {remove_arxiv_version(f.parent.name): tbls for f, tbls in zip(files, tables)} - -def _load_annotated_papers(path): - dump = load_gql_dump(path, compressed=False)["allPapers"] - annotations = {} - for a in dump: - arxiv_id = 
remove_arxiv_version(a.arxiv_id) - annotations[arxiv_id] = a - return annotations - - -class PaperCollection(UserList): - def __init__(self, data=None): - super().__init__(data) - - @classmethod - def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=True, jobs=-1): - path = Path(path) - if annotations_path is None: - annotations_path = path / "structure-annotations.json" - if load_texts: - texts = _load_texts(path, jobs) - else: - texts = {} - - annotations = _load_annotated_papers(annotations_path) - if load_tables: - tables = _load_tables(path, annotations, jobs) - else: - tables = {} - annotations = {} - outer_join = set(texts).union(set(tables)) - - papers = [Paper(k, texts.get(k), tables.get(k, []), annotations.get(k)) for k in outer_join] - return cls(papers) - - def get_by_id(self, paper_id): - paper_id = remove_arxiv_version(paper_id) - for p in self.data: - if p.paper_id == paper_id: - return p - return None - - @classmethod - def cells_gold_tags_legend(cls): - tags = [ - ("Tag", "description"), - ("model-best", "model that has results that author most likely would like to have exposed"), - ("model-paper", "an example of a generic model, (like LSTM)"), - ("model-competing", "model from another paper used for comparison"), - ("dataset-task", "Task"), - ("dataset", "Dataset"), - ("dataset-sub", "Subdataset"), - ("dataset-metric", "Metric"), - ("model-params", "Params, f.e., number of layers or inference time"), - ("table-meta", "Cell describing other header cells"), - ("trash", "Parsing erros") - ] - anns = [(t[0], "") for t in tags] - anns[0] = ("", "") - display_table(tags, anns) - - - def to_pickle(self, path): - with open(path, "wb") as f: - pickle.dump(self, f) - - @classmethod - def from_pickle(cls, path): - with open(path, "rb") as f: - return pickle.load(f) diff --git a/sota_extractor2/data/structure.py b/sota_extractor2/data/structure.py deleted file mode 100644 index a2e0a7c..0000000 --- a/sota_extractor2/data/structure.py 
+++ /dev/null @@ -1,144 +0,0 @@ -import re -import pandas as pd -from collections import namedtuple -import hashlib -from fastai.text import progress_bar -from .elastic import Fragment -from .json import * - -def get_all_tables(papers): - for paper in papers: - for table in paper.table_set.all(): - if 'trash' not in table.gold_tags and table.gold_tags != '': - table.paper_id = paper.arxiv_id - yield table - -def consume_cells(*matrix): - Cell = namedtuple('AnnCell', 'row col vals') - for row_id, row in enumerate(zip(*matrix)): - for col_id, cell_val in enumerate(zip(*row)): - yield Cell(row=row_id, col=col_id, vals=cell_val) - - -reference_re = re.compile(r"\[[^]]*\]") -ours_re = re.compile(r"\(ours?\)") -all_parens_re = re.compile(r"\([^)]*\)") - - -def clear_cell(s): - for pat in [reference_re, all_parens_re]: - s = pat.sub("", s) - s = s.strip() - return s - - -def empty_fragment(paper_id): - fragment = Fragment(paper_id=paper_id) - fragment.meta['highlight'] = {'text': ['']} - return fragment - - -def fetch_evidence(cell_content, cell_reference, paper_id, paper_limit=10, corpus_limit=10): - cell_content = clear_cell(cell_content) - if cell_content == "" and cell_reference == "": - return [] - - evidence_query = Fragment.search().highlight( - 'text', pre_tags="<b>", post_tags="</b>", fragment_size=400) - cell_content = cell_content.replace("\xa0", " ") - query = { - "query": cell_content, - "slop": 2 - } - paper_fragments = list(evidence_query - .filter('term', paper_id=paper_id) - .query('match_phrase', text=query)[:paper_limit]) - if cell_reference != "": - reference_fragments = list(evidence_query - .filter('term', paper_id=paper_id) - .query('match_phrase', text={ - "query": cell_reference, - "slop": 1 - })[:paper_limit]) - else: - reference_fragments = [] - other_fagements = list(evidence_query - .exclude('term', paper_id=paper_id) - .query('match_phrase', text=query)[:corpus_limit]) - if not len(paper_fragments) and not len(reference_fragments) and not 
len(other_fagements): - print(f"No evidences for '{cell_content}' of {paper_id}") - if not len(paper_fragments) and not len(reference_fragments): - paper_fragments = [empty_fragment(paper_id)] - return paper_fragments + reference_fragments + other_fagements - -fix_refs_re = re.compile('\(\?\)|\s[?]+(\s|$)') - - -def fix_refs(text): - return fix_refs_re.sub(' xref-unkown ', fix_refs_re.sub(' xref-unkown ', text)) - - -highlight_re = re.compile("</?b>") -partial_highlight_re = re.compile(r"\<b\>xxref\</b\>-(?!\<b\>)") - - -def fix_reference_hightlight(s): - return partial_highlight_re.sub("xxref-", s) - - -def create_evidence_records(textfrag, cell, table): - for text_highlited in textfrag.meta['highlight']['text']: - text_highlited = fix_reference_hightlight(fix_refs(text_highlited)) - text = highlight_re.sub("", text_highlited) - text_sha1 = hashlib.sha1(text.encode("utf-8")).hexdigest() - - cell_ext_id = f"{table.ext_id}/{cell.row}/{cell.col}" - - yield {"text_sha1": text_sha1, - "text_highlited": text_highlited, - "text": text, - "header": textfrag.header, - "cell_type": cell.vals[1], - "cell_content": fix_refs(cell.vals[0]), - "cell_reference": cell.vals[2], - "this_paper": textfrag.paper_id == table.paper_id, - "row": cell.row, - "col": cell.col, - "ext_id": cell_ext_id - #"table_id":table_id - } - - -def filter_cells(cell): - return re.search("[a-zA-Z]{2,}", cell.vals[1]) is not None - - -interesting_types = ["model-paper", "model-best", "model-competing", "dataset", "dataset-sub", "dataset-task"] - - -def evidence_for_table(table, paper_limit=10, corpus_limit=1, limit_type='interesting'): - def get_limits(cell_type): - if limit_type == 'interesting' and (cell_type.strip() in interesting_types) or (limit_type == 'max'): - return dict(paper_limit=1000, corpus_limit=1000) - return dict(paper_limit=paper_limit, corpus_limit=corpus_limit) - records = [ - record - for cell in consume_cells(table.matrix, table.matrix_gold_tags, table.matrix_references) if 
filter_cells(cell) - for evidence in fetch_evidence(cell.vals[0], cell.vals[2], paper_id=table.paper_id, **get_limits(cell.vals[1])) - for record in create_evidence_records(evidence, cell, table=table) - ] - df = pd.DataFrame.from_records(records) - return df - - -def prepare_data(tables, csv_path, limit_type='interesting'): - df = pd.concat([evidence_for_table(table, - paper_limit=100, - corpus_limit=20, - limit_type=limit_type) for table in progress_bar(tables)]) - #moved to experiment preprocessing - #df = df.drop_duplicates( - # ["cell_content", "text_highlited", "cell_type", "this_paper"]) - print("Number of text fragments ", len(df)) - csv_path.parent.mkdir(parents=True, exist_ok=True) - df.to_csv(csv_path, index=None) diff --git a/sota_extractor2/data/table.py b/sota_extractor2/data/table.py deleted file mode 100644 index ecc91c8..0000000 --- a/sota_extractor2/data/table.py +++ /dev/null @@ -1,138 +0,0 @@ -import pandas as pd -import json -from pathlib import Path -import re -from dataclasses import dataclass, field -from typing import List -from ..helpers.jupyter import display_table - -@dataclass -class Cell: - value: str - gold_tags: str = '' - refs: List[str] = field(default_factory=list) - layout: str = '' - - -reference_re = re.compile(r"<ref id='([^']*)'>(.*?)</ref>") -num_re = re.compile(r"^\d+$") - -def extract_references(s): - parts = reference_re.split(s) - refs = parts[1::3] - text = [] - for i, x in enumerate(parts): - if i % 3 == 0: - text.append(x) - elif i % 3 == 2: - s = x.strip() - if num_re.match(s): - text.append(s) - else: - text.append(f"[{s}]") - text = ''.join(text) - return text, refs - - -def str2cell(s): - value, refs = extract_references(s) - return Cell(value=value, refs=refs) - -def read_str_csv(filename): - try: - df = pd.read_csv(filename, header=None, dtype=str).fillna('') - except pd.errors.EmptyDataError: - df = pd.DataFrame() - return df - - - - -class Table: - def __init__(self, df, layout, caption=None, figure_id=None, 
annotations=None, old_name=None): - self.df = df - self.caption = caption - self.figure_id = figure_id - self.df = df.applymap(str2cell) - self.old_name = old_name - - if layout is not None: - self.layout = layout - for r, row in layout.iterrows(): - for c, cell in enumerate(row): - self.df.iloc[r,c].layout = cell - - if annotations is not None: - self.gold_tags = annotations.gold_tags.strip() - tags = annotations.matrix_gold_tags - gt_rows = len(annotations.matrix_gold_tags) - if gt_rows > 0: - gt_cols = len(annotations.matrix_gold_tags[0]) - if self.df.shape != (0,0) and self.df.shape == (gt_rows, gt_cols): - for r, row in enumerate(tags): - for c, cell in enumerate(row): - self.df.iloc[r,c].gold_tags = cell.strip() - else: - self.gold_tags = '' - - @classmethod - def from_file(cls, path, metadata, annotations=None, match_name=None): - path = Path(path) - filename = path / metadata['filename'] - df = read_str_csv(filename) - if 'layout' in metadata: - layout = read_str_csv(path / metadata['layout']) - else: - layout = None - if annotations is not None and match_name is not None: - table_ann = annotations.table_set.filter(name=match_name) + [None] - table_ann = table_ann[0] - else: - table_ann = None - return cls(df, layout, metadata.get('caption'), metadata.get('figure_id'), table_ann, match_name) - - def display(self): - display_table(self.df.applymap(lambda x: x.value).values, self.df.applymap(lambda x: x.gold_tags).values) - -##### -# this code is used to migrate table annotations from -# tables parsed by htlatex to tables parsed by -# latexml. 
After all annotated tables will be successfully -# migrated, we switch back to match-by-name - -from unidecode import unidecode -import string -from collections import Counter - -punctuation_table = str.maketrans('', '', string.punctuation) -def normalize_string(s): - if s is None: - return "" - return unidecode(s.strip().lower().replace(' ', '')).translate(punctuation_table) - -def _remove_almost_empty_values(d): - return {k:v for k,v in d.items() if len(v) >= 10} - -def _keep_unique_values(d): - c = Counter(d.values()) - unique = [k for k,v in c.items() if v == 1] - return {k: v for k,v in d.items() if v in unique} - -def _match_tables_by_captions(annotations, metadata): - if annotations is None: - return {} - old_captions = {x.name: normalize_string(x.desc) for x in annotations.table_set} - new_captions = {m['filename']: normalize_string(m['caption']) for m in metadata} - old_captions = _keep_unique_values(_remove_almost_empty_values(old_captions)) - new_captions = _keep_unique_values(_remove_almost_empty_values(new_captions)) - old_captions_reverse = {v:k for k,v in old_captions.items()} - return {new_name:old_captions_reverse[caption] for new_name, caption in new_captions.items() if caption in old_captions_reverse} - -#### - -def read_tables(path, annotations): - path = Path(path) - with open(path / "metadata.json", "r") as f: - metadata = json.load(f) - _match_names = _match_tables_by_captions(annotations, metadata) - return [Table.from_file(path, m, annotations, match_name=_match_names.get(m["filename"])) for m in metadata] diff --git a/sota_extractor2/helpers/__init__.py b/sota_extractor2/helpers/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sota_extractor2/helpers/table_style.py b/sota_extractor2/helpers/table_style.py deleted file mode 100644 index 41432bc..0000000 --- a/sota_extractor2/helpers/table_style.py +++ /dev/null @@ -1,3 +0,0 @@ 
-table_style="""<style>body{margin:0;padding:0;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen,Ubuntu,Cantarell,Fira Sans,Droid Sans,Helvetica Neue,sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,Courier New,monospace}.tableWrapper{-overflow:auto}.tableWrapper .model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .model-best{background-color:#ff3860;color:rgba(0,0,0,.7)}.tableWrapper .model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .model-paper{background-color:#ff3860;color:#fff}.tableWrapper .dataset-sub{background-color:#23d160;color:#fff}.tableWrapper .dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .dataset{background-color:#02bd43;color:#fff}.tableWrapper .trash{background-color:#363636;color:#f5f5f5}.tableWrapper .wtf{background-color:#f0f;color:#f5f5f5}.tableWrapper .dataset-task{background-color:#77ecdd;color:rgba(0,0,0,.7)}.tableWrapper .dataset-paper{background-color:#e4ffee;color:rgba(0,0,0,.7)}.tableWrapper td.focused-cell{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}div.form-group>input.form-control.input-sm{border-radius:2px;font-size:.75rem;background-color:#fff;color:#363636;box-shadow:inset 0 1px 2px rgba(10,10,10,.1);max-width:100%;width:100%;height:2.25em;padding:calc(.375em - 1px) calc(.625em - 1px);position:relative;border:1px solid #b5b5b5}div.form-group>input.form-control.input-sm:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)} -</style> -""" diff --git a/sota_extractor2/helpers/training.py b/sota_extractor2/helpers/training.py deleted file mode 100644 index 2584191..0000000 --- a/sota_extractor2/helpers/training.py +++ /dev/null @@ -1,10 +0,0 @@ - -def set_seed(seed, name, quiet=False): - import torch - import numpy as np - if not quiet: - 
print(f"Setting {name} seed to {seed}") - torch.manual_seed(seed) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - np.random.seed(seed) \ No newline at end of file diff --git a/sota_extractor2/models/linking/__init__.py b/sota_extractor2/models/linking/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sota_extractor2/models/structure/ulmfit.py b/sota_extractor2/models/structure/ulmfit.py deleted file mode 100644 index 5038c00..0000000 --- a/sota_extractor2/models/structure/ulmfit.py +++ /dev/null @@ -1 +0,0 @@ -from fastai.text import * diff --git a/tables2json.py b/tables2json.py deleted file mode 100755 index 8050f35..0000000 --- a/tables2json.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python - -import fire -from sota_extractor.taskdb import TaskDB -from pathlib import Path -import json -import re -import pandas as pd - -from label_tables import get_table, get_metadata - -def get_celltags(filename): - filename = Path(filename) - if filename.exists(): - - try: - celltags = pd.read_csv(filename, header=None, dtype=str).fillna('') - except pd.errors.EmptyDataError: - return pd.DataFrame() - return celltags - else: - return pd.DataFrame() - - -def get_tables(tables_dir): - tables_dir = Path(tables_dir) - all_metadata = {} - all_tables = {} - all_celltags = {} - for metadata_filename in tables_dir.glob("*/metadata.json"): - metadata = get_metadata(metadata_filename) - for k in metadata: - if metadata[k] is None: - metadata[k] = '' - basedir = metadata_filename.parent - arxiv_id = basedir.name - all_metadata[arxiv_id] = metadata - all_tables[arxiv_id] = {t:get_table(basedir / t) for t in metadata} - all_celltags[arxiv_id] = {t:get_celltags(basedir / t.replace("table", "celltags")) for t in metadata} - return all_metadata, all_tables, all_celltags - -def t2j(df): - rows, cols = df.shape - if rows == 0 or cols == 0: - return [[""]] - return [[df.iloc[r, c] for c in range(cols)] for r in range(rows)] - - 
-def tables2json(tables_dir): - metadata, tables, celltags = get_tables(tables_dir) - all_data = [] - for arxiv_id in metadata: - tabs = [] - for tab in metadata[arxiv_id]: - table = dict( - name=tab, - caption=metadata[arxiv_id][tab], - values=t2j(tables[arxiv_id][tab]), - tags=t2j(celltags[arxiv_id][tab]) - ) - tabs.append(table) - all_data.append(dict(paper_id=arxiv_id, tables=tabs)) - print(json.dumps(all_data)) - -if __name__ == '__main__': fire.Fire(tables2json) diff --git a/tabular.py b/tabular.py deleted file mode 100644 index 275e0dc..0000000 --- a/tabular.py +++ /dev/null @@ -1,22 +0,0 @@ -import pandas as pd -import numpy as np -import json - - -class Tabular: - def __init__(self, data, layout, caption, figure_id=None): - self.data = data - self.layout = layout - self.cell_tags = pd.DataFrame().reindex_like(data).fillna('') - self.datasets = set() - self.metrics = set() - self.caption = caption - self.figure_id = figure_id - - def mark_with_metric(self, metric_name): - self.metrics.add(metric_name) - - def mark_with_dataset(self, dataset_name): - self.datasets.add(dataset_name) - - diff --git a/test/src/table_01.csv b/test/src/table_01.csv deleted file mode 100644 index 1581ee8..0000000 --- a/test/src/table_01.csv +++ /dev/null @@ -1,5 +0,0 @@ -left,center,right -1,2,3 -4,5,6 -7,8,9 -a,b,c diff --git a/test/src/main.tex b/tests/data/main.tex similarity index 51% rename from test/src/main.tex rename to tests/data/main.tex index ac7a18d..9270799 100644 --- a/test/src/main.tex +++ b/tests/data/main.tex @@ -1,5 +1,6 @@ \documentclass{article} \usepackage{booktabs} +\usepackage{xcolor} \title{DILBERT: Distilling Inner Latent BERT variables} \author{John Doe} \begin{document} @@ -8,6 +9,9 @@ In this paper we achieve state-of-the-art performance in random number generation. \end{abstract} \section{Introduction} +\section{Model} +\subsection{Preprocessing} +\subsection{Architecture} \section{Experiments} In this section we present Table~\ref{tab}. 
\begin{table} @@ -21,4 +25,14 @@ \section{Experiments} \caption{A table.} \label{tab} \end{table} + \begin{table} + \begin{tabular}{lcr} \toprule + \textbf{bold text} & \textit{italic text} & \textbf{\textit{bold italic text}}\\\midrule + \textcolor{red}{red text} & \textcolor{green}{green text} & \textcolor{blue}{blue text}\\ + $\mathbf{5.4\%}$ & $\mathit{3.8\%}$ & $\mathbf{11.2}\pm 0.15$\\\midrule + \textbf{an \textit{italic} text inside bold} & {\bf \textcolor{red}{bold red}} & \\\bottomrule + \end{tabular} + \caption{A table.} + \label{tab2} + \end{table} \end{document} diff --git a/tests/test_extraction.py b/tests/test_extraction.py new file mode 100644 index 0000000..642efa8 --- /dev/null +++ b/tests/test_extraction.py @@ -0,0 +1,38 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import pytest +from pathlib import Path +from axcell.helpers.paper_extractor import PaperExtractor +from axcell.data.paper_collection import PaperCollection +from shutil import copyfileobj +import gzip + + +def test_extraction(tmpdir): + # pack main.tex to an archive + tmpdir = Path(tmpdir) + source = Path(__file__).resolve().parent / "data" / "main.tex" + paper_id = "1234.56789" + archive = tmpdir / "sources" / paper_id + archive.parent.mkdir() + with source.open("rb") as src, gzip.open(archive, "wb") as dst: + copyfileobj(src, dst) + + extract = PaperExtractor(tmpdir) + status = extract(archive) + assert status == "success" + + pc = PaperCollection.from_files(tmpdir / "papers") + extracted = len(pc) + assert extracted == 1, f"Expected to extract exactly one paper, found {extracted}" + + paper = pc[0] + assert paper.paper_id == paper_id + assert paper.text.title == "DILBERT: Distilling Inner Latent BERT variables" + assert len(paper.tables) == 2 + + assert paper.tables[0].caption == "Table 1: A table." + assert paper.tables[1].caption == "Table 2: A table." 
+ + assert paper.tables[0].shape == (5, 3) + assert paper.tables[1].shape == (4, 3) diff --git a/tests/test_metric_ranges.py b/tests/test_metric_ranges.py new file mode 100644 index 0000000..eb4fcc4 --- /dev/null +++ b/tests/test_metric_ranges.py @@ -0,0 +1,37 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import pytest +from decimal import Decimal +from axcell.models.linking.bm25_naive import convert_metric + +raw_values = ["0.21", "0.21%", "21", "21%"] +ranges = ["0-1", "1-100", "abs", ""] + +values = { + # 0.21 0.21% 21 21% + "0-1": [ "0.21", "0.0021", "0.21", "0.21"], + "1-100": [ "21", "0.21", "21", "21"], + "abs": [ "0.21", "0.0021", "21", "0.21"], + "": [ "0.21", "0.21", "21", "21"] +} + +comp_values = { + # 0.21 0.21% 21 21% + "0-1": [ "0.79", "0.9979", "0.79", "0.79"], + "1-100": [ "79", "99.79", "79", "79"], + "": [ "0.79", "99.79", "79", "79"] +} + +cases = [(raw_value, rng, complementary, Decimal(answer)) + for complementary, vals in zip([False, True], [values, comp_values]) + for rng in vals + for raw_value, answer in zip(raw_values, vals[rng]) +] + + +@pytest.mark.parametrize("raw_value,rng,complementary,expected", cases) +def test_ranges(raw_value, rng, complementary, expected): + value = convert_metric(raw_value, rng, complementary) + assert value == expected, (f"{'complement of ' if complementary else ''}" + f"raw value {raw_value}, assuming {rng if rng else 'empty'} range " + f"should be extracted as {expected}, not {value}") diff --git a/unpack-sources.sh b/unpack-sources.sh deleted file mode 100755 index 89c4608..0000000 --- a/unpack-sources.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash -archive="$1" -outdir="$2" - -mime_type=$(file --brief --uncompress --mime-type "$archive") -case "$mime_type" in - 'application/x-tar') - mkdir -p "$outdir" - tar -xf "$archive" -C "$outdir" - ;; - 'text/x-tex') - mkdir -p "$outdir" - gunzip -c "$archive" > "$outdir/main.tex" - ;; - 'application/pdf') - echo "File 
'$archive' is a PDF file, not a LaTeX source code. Skipping" 1>&2 - ;; - *) - echo "File '$archive' is of unknown type '$mime_type'. Skipping" 1>&2 - ;; -esac