
Commit

Merge branch 'dev' of github.com:ag-gipp/NLPLand into remove-spacy-trf #patch
trannel committed Nov 4, 2021
2 parents f47c630 + 078a9f2 commit a78f82d
Showing 6 changed files with 198 additions and 13 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/branch.yaml
@@ -0,0 +1,12 @@
on:
  issues:
    types: [created]

jobs:
  create_issue_branch_job:
    runs-on: ubuntu-latest
    steps:
      - name: Create Issue Branch
        uses: robvanderleek/create-issue-branch@main
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
17 changes: 10 additions & 7 deletions .github/workflows/tests.yml → .github/workflows/main.yml
@@ -1,9 +1,11 @@
-# .github/workflows/tests.yml
-name: Tests
-on: push
+name: CI
+on:
+  pull_request:
+    branches:
+      - dev
 jobs:
   tests:
-    runs-on: [self-hosted, dke]
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
@@ -25,9 +27,10 @@ jobs:
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
       - name: Run tests
         run: |
-          poetry run py.test tests -s --cov=nlpland --cov-report=term-missing --cov-fail-under 90
+          poetry run py.test tests -s --cov=nlpland --cov-report=xml --cov-fail-under 97
   linting:
-    runs-on: [self-hosted, dke]
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
@@ -51,7 +54,7 @@ jobs:
         run: |
           poetry run pylint nlpland -j 4 --reports=y
   typing:
-    runs-on: [self-hosted, dke]
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
73 changes: 73 additions & 0 deletions .github/workflows/release.yaml
@@ -0,0 +1,73 @@
name: Release
on:
  pull_request:
    branches:
      - main
    types: [closed]

jobs:
  fulltests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: 3.7
      - name: Install poetry
        uses: snok/install-poetry@v1
        with:
          virtualenvs-create: true
          virtualenvs-in-project: true
      - name: Load cached venv
        id: cached-poetry-dependencies
        uses: actions/cache@v2
        with:
          path: .venv
          key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
      - name: Install dependencies
        run: poetry install
        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
      - name: Run linting
        run: |
          poetry run pylint nlpland -j 4 --reports=y
      - name: Run tests
        run: |
          poetry run py.test tests -s --cov=nlpland --cov-report=term-missing --cov-fail-under 97
      - name: Upload coverage report
        uses: codecov/codecov-action@v2
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: ./coverage.xml
      - name: Run typing
        run: |
          poetry run pyright nlpland
  Release:
    if: github.event.pull_request.merged == true
    needs: [fulltests]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: "0"
      - uses: actions/setup-python@v2
        with:
          python-version: 3.8
      - name: Bump version and push tag
        uses: anothrNick/[email protected]
        id: tagging
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          WITH_V: true
      - name: Build Changelog
        id: build_changelog
        uses: mikepenz/release-changelog-builder-action@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Create Release
        uses: actions/create-release@v1
        with:
          release_name: ${{ steps.tagging.outputs.tag }}
          body: ${{ steps.build_changelog.outputs.changelog }}
          tag_name: ${{ steps.tagging.outputs.tag }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
22 changes: 22 additions & 0 deletions LICENSE
@@ -0,0 +1,22 @@
MIT License

Copyright (c) 2021 Jan Philip Wahle, Lennart Küll, Terry Ruas, and others

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
84 changes: 79 additions & 5 deletions README.md
@@ -1,15 +1,26 @@
# NLPLand

<p align="center">
<a href="https://codecov.io/gh/ag-gipp/NLPLand"><img src="https://codecov.io/gh/ag-gipp/NLPLand/branch/main/graph/badge.svg?token=7CL6B5LNKP"/></a>
<a href="https://github.com/ag-gipp/NLPLand/actions/workflows/release.yaml"><img alt="Actions Status" src="https://github.com/ag-gipp/NLPLand/actions/workflows/release.yaml/badge.svg"></a>
<a href="https://github.com/ag-gipp/NLPLand/actions/workflows/main.yml"><img alt="Actions Status" src="https://github.com/ag-gipp/NLPLand/actions/workflows/main.yml/badge.svg?branch=main"></a>
<a href="https://github.com/ag-gipp/NLPLand/releases"><img alt="Actions Status" src="https://img.shields.io/github/v/release/ag-gipp/NLPLand?sort=semver"></a>
<a href="https://github.com/ag-gipp/NLPLand/blob/master/LICENSE"><img alt="License: MIT" src="https://black.readthedocs.io/en/stable/_static/license.svg"></a>
<a href="https://github.com/psf/black"><img alt="Code style: black" src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
</p>

This repository is part of the project titled "NLPLand and its Secrets".
The project falls within the scope of the Information Retrieval course at the Bergische Universität Wuppertal in the summer semester of 2021.
The following project description gives a broad overview of the project but is subject to change.

### Project Description

The ACL Anthology (AA) is the largest single repository of articles on Natural Language Processing (NLP) and Computational Linguistics (CL). It contains valuable metadata (e.g., venue, author names, title) that can be used to better understand the field. NLP Scholar uses this data to examine the literature and identify broad trends in productivity, focus, and impact. We want to extend this analysis to specific components of NLP publications.

## Installation & Setup

### poetry

First, install poetry as explained here: https://python-poetry.org/docs/#installation

Also, make sure you have Python 3.7 installed.
@@ -21,39 +32,90 @@ If you are in a virtual environment it will install all dependencies there, othe
(Should poetry not be able to find a python 3.7 installation, specify the python path using `poetry env use <path>` to create a venv based on the given python version.)

If you were not already in a venv, execute `poetry shell` to activate the newly created one.
(If the command does not work, try to activate the venv manually or try another shell.)

### .env

You have to rename the file `empty.env` to `.env` and set your variables in it. (Hint: all path variables can be either absolute or relative paths.)

`PATH_PAPERS` is the path to the directory with the downloaded papers.
(Only used in abstract extraction)

`PATH_ANTHOLOGY` is the path to the `xml` [directory in the ACL Anthology](https://github.com/acl-org/acl-anthology/tree/master/data/xml).
(Only used in abstract extraction)

`PATH_DATASET` is the path to the `.txt` file of the [NLP Scholar dataset](http://saifmohammad.com/WebPages/nlpscholar.html).

`PATH_DATASET_EXPANDED` is the path to the `.txt` file of the expanded dataset or where it is supposed to be created.
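
For illustration, a filled-in `.env` might look like this (all paths below are hypothetical placeholders; point them at your local copies):

```
PATH_PAPERS=./data/papers
PATH_ANTHOLOGY=./acl-anthology/data/xml
PATH_DATASET=./data/nlp-scholar.txt
PATH_DATASET_EXPANDED=./data/nlp-scholar-expanded.txt
```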

## Code quality and checks

To maintain a consistent and well-tested repository, we run unit tests, linting, and type checks through GitHub Actions, using pytest for testing, pylint for linting, and pyright for typing.
Every time code gets pushed to our repository, these checks are executed and have to fulfill certain requirements before you can merge the code into the main branch.

We also use naming conventions for branches, commits, and pull requests to leverage GitHub workflow automation and keep the repository clean.

In the following, we describe how to run the checks locally and which naming conventions we use.

### Running test pipelines locally

To run the test pipeline locally, make sure to install act from [here](https://github.com/nektos/act).

To run the full check suite, execute:

```sh
act -P self-hosted=nektos/act-environments-ubuntu:18.04
```

To run a single check from the pipeline such as linting, execute:

```sh
act -j linting -P self-hosted=nektos/act-environments-ubuntu:18.04
```

### Repository and naming conventions

Each feature request, bug, enhancement, etc. has to be related to an issue. We provide templates for bug reports and feature requests when you create an issue on GitHub.
An issue should be a self-contained unit of work that one developer can implement in one day. If an issue is larger than that, split it into smaller components.
To identify the issue, we use labels such as `bug` or `enhancement`.

To start a new branch, please use the naming convention `{issue_number}`\_`{short_issue_acronym}`. When you are done working on the branch, include the text `fixes {issue_number}` in the last commit message. Then create a pull request including the text `resolves {issue_number}`.

We group issues using a task list in another issue that has the `Epic` label. These issues are larger components that need to be developed.
Each issue with the `Epic` label has a task list, with each element of the task list being an issue (e.g., this one [#47](https://github.com/ag-gipp/NLPLand/issues/47)).
Whenever a pull request with the above convention gets merged, the corresponding issue gets closed, and the task in the Epic gets checked.

To indicate whether the PR is a patch, minor, or major update, please use #patch, #minor, or #major in the last commit message of the PR and in the PR description.
See [here](https://github.com/anothrNick/github-tag-action) for more information.

To build changelogs, each pull request needs one of the labels "fix", "feature", or "test". See [here](https://github.com/mikepenz/release-changelog-builder-action) for more information.
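
As an illustration of these conventions, a hypothetical issue #42 titled "Improve abstract extraction" might be handled like this (the branch name and messages are made up):

```sh
# Create the branch for issue #42 following {issue_number}_{short_issue_acronym}
git checkout -b 42_abstract_extraction
# ...work on the issue, then reference it and the release type in the last commit
git commit -m "Improve abstract extraction, fixes #42 #patch"
# Open a pull request labeled "fix" whose description contains "resolves #42" and "#patch"
```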


### Getting Started

To get started, we recommend downloading the ACL Anthology XML files and the NLP Scholar dataset and entering their paths into the `.env` file.
Then set `PATH_DATASET_EXPANDED` in the `.env` to a path of your choice.
Next, run `cli extract anth --original` to create an extended dataset.
You can find out more about the command in the documentation further down.
Now you should be able to run all implemented analyses.
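
Assuming the `.env` from above is set up, the initial setup boils down to something like:

```sh
poetry install                 # install dependencies into the venv
poetry shell                   # activate the venv
cli extract anth --original    # create the extended dataset from the XML files
```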

## Commands

All commands are prefixed with `cli`.

### Paper download

The command `download` downloads and saves the papers to your computer.
The papers will be saved in the following structure: `<year>/<venue-name>/<paper-id>.pdf`.
Some special characters in the venue name and paper ID will be removed or replaced because of folder-name restrictions.

Example: `cli download --min-year 2015` will download all papers from 2015 onwards.
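
For instance, the download directory could then look like this (the paper IDs are hypothetical ACL Anthology IDs):

```
2015/ACL/P15-1001.pdf
2015/EMNLP/D15-1001.pdf
2016/ACL/P16-1001.pdf
```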

### Abstract extraction

The command `extract <mode>` adds the abstracts to the dataset.
There are two modes and multiple options:

@@ -74,6 +136,7 @@ Warning: This will overwrite everything once it saves.
Example: `cli extract rule --overwrite-rule` will add new abstracts and overwrite all abstracts previously extracted with the rule-based system.

### Counting

The command `count <k>` prints the term frequency of the top k grams/tokens.
It also prints the top k tf-idf scores. Both are calculated using [sklearn](https://github.com/scikit-learn/scikit-learn).

@@ -83,6 +146,7 @@ To set the lower and upper bounds of n one can use e.g. `--ngrams 1,2`.
Example: `cli count 10 --ngrams 2` prints the 10 bigrams with the highest term frequency and, separately, the 10 with the highest tf-idf score.
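
As a rough sketch of how such counts can be computed with sklearn, assuming the filtered texts are already available as a list of strings (the repository's actual implementation may differ):

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

texts = ["hypothetical abstract one ...", "hypothetical abstract two ..."]
k = 10

# Term frequency of bigrams; ngram_range sets the lower and upper bounds of n.
cv = CountVectorizer(ngram_range=(2, 2))
tf = cv.fit_transform(texts).sum(axis=0).A1
# get_feature_names_out requires sklearn >= 1.0; older versions use get_feature_names
print(sorted(zip(cv.get_feature_names_out(), tf), key=lambda p: -p[1])[:k])

# Summed tf-idf scores over the same bigram vocabulary.
tv = TfidfVectorizer(ngram_range=(2, 2))
tfidf = tv.fit_transform(texts).sum(axis=0).A1
print(sorted(zip(tv.get_feature_names_out(), tfidf), key=lambda p: -p[1])[:k])
```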

### Counts over time

The command `counts-time <k>` plots the top k grams over a specified time.
It counts the term frequency per year and plots all tokens that were in the top k in at least one year.
The time can be specified using the filters mentioned further down.
@@ -97,6 +161,7 @@ The option `--name <name>` or `-n <name>` allows to name the file the plot will
Example: `cli counts-time 10 --min-year 2011` plots all unigrams that were in the top 10 in any year from 2011 onwards.

### Scattertext

The command `scatter` uses the library [scattertext](https://github.com/JasonKessler/scattertext) to compare the term frequencies of two specified subsets in an interactive scatterplot.
The filters to specify the subsets are mentioned further down.

@@ -109,6 +174,7 @@ The option `--name <name>` or `-n` allows to name the file the plot will be save
Example: `cli scatter --venues ACL --year 2019 --venues2 ACL --year2 2020` will plot the ACL papers from 2019 against those from 2020.
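
A minimal sketch of such a comparison with scattertext, assuming the two subsets end up as rows of a pandas DataFrame (the column names and output file are made up for illustration):

```python
import pandas as pd
import scattertext as st
import spacy

df = pd.DataFrame({
    "subset": ["ACL 2019", "ACL 2019", "ACL 2020", "ACL 2020"],
    "text": ["hypothetical abstract ...", "...", "...", "..."],
})
nlp = spacy.load("en_core_web_sm")

corpus = st.CorpusFromPandas(df, category_col="subset", text_col="text", nlp=nlp).build()
html = st.produce_scattertext_explorer(
    corpus, category="ACL 2019", category_name="ACL 2019", not_category_name="ACL 2020"
)
with open("scatter.html", "w") as f:
    f.write(html)  # open in a browser for the interactive plot
```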

### Topic model training

The command `topic-train <k>` will train a topic model with `k` topics using an LDA implementation in [gensim](https://github.com/RaRe-Technologies/gensim).
It will also create an interactive plot using [pyLDAvis](https://github.com/bmabey/pyLDAvis).

@@ -117,12 +183,15 @@ The option `--name <name>` or `-n` allows to name the model and the file the plo
Example: `cli topic-train 10 --min-year 2010` will create a topic model with 10 topics from all the data available from 2010 onwards.
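
A minimal sketch of the training step with gensim and pyLDAvis, assuming pre-tokenized documents (the repository's actual preprocessing and parameters may differ):

```python
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models  # pyLDAvis >= 3.2 exposes the gensim helper here

docs = [["hypothetical", "tokenized", "paper"], ["another", "tokenized", "paper"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)  # k = 10 topics
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(vis, "topics.html")  # interactive pyLDAvis plot
```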

### Topics over time

WIP

### Semantic analysis

WIP

### Misc

These commands are mostly for development purposes and improving the rule-based system.

The command `checkdataset` prints a lot of information about the dataset and performs various checks.
@@ -137,19 +206,21 @@ The command `checkpaper <paper-path>` prints the raw text of the paper specified
The command `countabstractsanth` counts the number of abstracts and papers in the ACL Anthology based on the XML files.

## Filters

The following filters are applicable to all commands except the few under [Misc](#misc).
They will filter out rows that do not match the specified filters or mask certain attributes.
Different filters can be applied simultaneously.
The filters will then work additively, i.e. the more different filters are specified, the more restrictive the selection is.

### Data

The filter `--data <type>` allows selecting only specific parts of the data.
To do so it will mask all non-selected entries.
Combinations are possible, by listing multiple types (see the example).
In those cases the types are additive: the more types are listed, the less is masked.
The following types exist:

`all` selects everything and is equivalent to applying no `data` filter at all.

`titles` selects the titles of the papers.

@@ -162,13 +233,15 @@ The following types exist:
Example: `cli count 10 --data titles,abstracts-anth` will count all words in the titles and additionally all abstracts that were extracted from the XML files.

### Venues

The filter `--venues <name(s)>` allows you to select a subset of the data containing only papers from specific venues.
It is possible to select one or multiple venues (see the example).
The venue name must match the name in the dataset, not the name of the folder the papers are saved in, as some special characters had to be removed or replaced for the folder naming.

Example: `cli count 10 --venues ACL,EMNLP` will only count words from papers published at ACL and EMNLP.

### Years

To filter by year of publication, there are three filters one can use.

`--year <year>` selects papers that were published in that year according to the ACL Anthology.
@@ -181,6 +254,7 @@ This filter will overwrite the other two, should they be applied at the same tim
Example: `cli count 10 --min-year 2018 --max-year 2020` will count all words from papers published in 2018, 2019 and 2020.

### Authors

To filter the authors there are two options.
The filters ignore casing, but otherwise the name has to be an exact match.
In the NLP Scholar dataset nearly all authors are saved like `<lastname>, <firstname>`.
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -23,6 +23,7 @@ gensim = "^3.8.1"
pyLDAvis = "^3.3.1"
spacy = "^3.1.0"
llvmlite="^0.37.0"
dask="^2021.10.0"
umap-learn = {extras = ["plot"], version = "^0.5.1"}
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl"}
# en-core-web-trf = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.1.0/en_core_web_trf-3.1.0-py3-none-any.whl"}
@@ -44,4 +45,4 @@ requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 100
