Skip to content

Commit

Permalink
Improve auto-cite process (greenelab#120)
Browse files Browse the repository at this point in the history
- change auto-cite github actions workflow to install via requirements.txt (allows us to add more packages later if we need)
- update CFF file
- rebuild demo citations with new cache key
- add explicit cache matching key to sources, computed as a hash of the input source object, so that when any field in the source changes, the cache is invalidated. this is just to be safe and for simpler/less error-prone behavior.
- add special case for when `id` is not defined. in this case, do not pass to manubot (because it will throw error), and instead pass source through untouched.
- move the code that merges in extra/overridden input props into the sources loop to avoid another "match by id" situation (error prone)
- force pyyaml to not use references (shows up when there are duplicate data structures)
- rename "find_match" to "get_cached". match by explicit cache key instead of id. if key absent, do not count as match. if more than one match, do not count as match (needed because if there are multiple sources with the same id in the same file, but with different other props, the template could choose the wrong one and screw up the output).
- add requirements.txt
- reformat all python with black formatter
- updated docs to reflect the above.
  • Loading branch information
vincerubinetti authored Jun 24, 2022
1 parent 2a1beab commit b7ec652
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 32 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/auto-cite.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: 3.7
- name: Install Manubot
run: pip install --upgrade manubot
- name: Install packages
run: python -m pip install --upgrade --requirement ./auto-cite/requirements.txt
- name: Build updated citations
run: python ./auto-cite/auto-cite.py
- name: Commit updated citations
Expand Down
4 changes: 2 additions & 2 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ authors:
given-names: "Casey"
orcid: "https://orcid.org/0000-0001-8713-9213"
title: "Lab Website Template"
version: 0.4.1
date-released: 2021-08-23
version: 0.6.1
date-released: 2022-06-22
url: "https://github.com/greenelab/lab-website-template"
3 changes: 3 additions & 0 deletions _data/citations.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
link: https://greenelab.github.io/knowledge-graph-review/
tags:
- knowledge graphs
_cache: 371e391cc663116136dc43f4c93ebdc2998f52a3abd67c7add918c3b9f4f98d9
- id: doi:10.1371/journal.pcbi.1007128
title: Open collaborative writing with Manubot
authors:
Expand All @@ -37,6 +38,7 @@
link: https://github.com/greenelab/meta-review
- type: website
link: http://manubot.org/
_cache: 9b4d326347b4c43474de520edff579266c3c490e1baf4a2074ea151d6c90e318
- id: doi:10.7554/eLife.32822
title: Sci-Hub provides access to nearly all scholarly literature
authors:
Expand All @@ -62,3 +64,4 @@
- type: source
link: https://github.com/greenelab/scihub
text: Analyses source
_cache: 1510c88d1d7cdb29f090da47b0903118fa57243de166541c8ff31c614fc05604
37 changes: 23 additions & 14 deletions auto-cite/auto-cite.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from util import *
from importlib import import_module
from dict_hash import sha256

# config info for input/output files and plugins
config = {}
Expand Down Expand Up @@ -38,11 +39,15 @@
log(message, 3, "red")
exit(1)

# run plugin
plugin_sources = import_module(f"plugins.{name}").main(data)

log(f"Got {len(plugin_sources)} sources", 2, "green")

for source in plugin_sources:
# make unique key for cache matching
source["_cache"] = sha256({**source, "plugin": name, "input": file})
# add source
sources.append(source)

log("Generating citations for sources")
Expand All @@ -60,36 +65,40 @@
# go through sources
for index, source in enumerate(sources):
# show progress
log(f"Source {index + 1} of {len(sources)} - {source.get('id', '-')}", 2)
log(f"Source {index + 1} of {len(sources)} - {source.get('id', 'No ID')}", 2)

# new citation for source
new_citation = {}

# find same source in existing citations
cached = find_match(source, citations)
cached = get_cached(source, citations)

if cached:
# use existing citation to save time
log("Using existing citation", 3)
new_citations.append(cached)
new_citation = cached

else:
elif source.get("id", "").strip():
# use Manubot to generate new citation
log("Using Manubot to generate new citation", 3)
try:
new_citations.append(cite_with_manubot(source))
new_citation = cite_with_manubot(source)
except Exception as message:
log(message, 3, "red")
exit(1)
else:
# pass source through untouched
log("Passing source through", 3)

log("Exporting citations")

# go through new citations
for citation in new_citations:
# merge in properties from input source
citation.update(find_match(citation, sources))

new_citation.update(source)
# ensure date in proper format for correct date sorting
citation["date"] = clean_date(citation.get("date"))
new_citation["date"] = clean_date(new_citation.get("date"))

log(f"Exported {len(new_citations)} citations", 2, "green")
# add new citation to list
new_citations.append(new_citation)

log("Exporting citations")

# save new citations
try:
Expand All @@ -98,4 +107,4 @@
log(message, 2, "red")
exit(1)

log("Done!")
log(f"Exported {len(new_citations)} citations", 2, "green")
6 changes: 1 addition & 5 deletions auto-cite/plugins/orcid.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,7 @@ def main(data):

for index, entry in enumerate(data):
# show progress
log(
f"Orcid {index + 1} of {len(data)} - {entry.get('orcid', '-')}",
3,
"cyan"
)
log(f"Orcid {index + 1} of {len(data)} - {entry.get('orcid', '-')}", 3, "cyan")

# query api to get dois from orcid
url = endpoint.replace("$ORCID", entry.get("orcid", "-"))
Expand Down
3 changes: 3 additions & 0 deletions auto-cite/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
manubot==0.5.3
PyYAML==6.0
dict-hash==1.1.26
23 changes: 14 additions & 9 deletions auto-cite/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
from yaml.loader import SafeLoader
from datetime import datetime

# https://stackoverflow.com/questions/13518819/avoid-references-in-pyyaml
yaml.Dumper.ignore_aliases = lambda *args: True

# current working directory
directory = os.path.dirname(os.path.realpath(__file__))

Expand Down Expand Up @@ -44,12 +47,16 @@ def log(message="", level=1, color=""):
print(f"{(level - 1) * ' '}{palette[color]}{message}{palette['reset']}\n")


# find item in list that matches entry by id
def find_match(entry, list):
    """Return the first dict in ``list`` whose "id" equals ``entry``'s "id".

    Non-dict items are skipped. Any mapping derived from ``dict`` (for
    example ``OrderedDict``) is accepted, which an exact ``type(...) == dict``
    comparison would wrongly reject.

    Args:
        entry: dict-like object providing ``.get("id")``.
        list: iterable of candidate items (name kept for caller compatibility,
            even though it shadows the builtin inside this function).

    Returns:
        The first matching item, or an empty dict when nothing matches.
    """
    for item in list:
        # isinstance (not type ==) so dict subclasses are matched too
        if isinstance(item, dict) and item.get("id") == entry.get("id"):
            return item
    return {}
# find item in existing citations that matches source
def get_cached(source, citations):
    """Return the existing citation whose "_cache" key matches ``source``'s.

    A citation is only returned when the match is unambiguous: the source
    must carry a non-empty "_cache" key and exactly one citation must share
    it. Zero or multiple matches are treated as "not cached".

    Args:
        source: dict-like object providing ``.get("_cache")``.
        citations: iterable of existing citations; may be ``None`` or contain
            non-dict entries, both of which are tolerated and ignored.

    Returns:
        The uniquely matching citation dict, or ``None`` otherwise.
    """
    _cache = source.get("_cache")
    if not _cache:
        return None
    # match by cache key; skip anything that is not a dict so a malformed
    # entry in the citations list cannot raise AttributeError
    matches = [
        citation
        for citation in citations or []
        if isinstance(citation, dict) and citation.get("_cache") == _cache
    ]
    # only return if there is a unique match
    if len(matches) == 1:
        return matches[0]
    return None


# get date parts from Manubot citation
Expand Down Expand Up @@ -141,13 +148,11 @@ def cite_with_manubot(source):
# run Manubot and get results
try:
commands = ["manubot", "cite", id, "--log-level=WARNING"]
print(palette['gray'])
output = subprocess.Popen(commands, stdout=subprocess.PIPE).communicate()
print(palette['reset'])
except Exception as error:
log(error, 3, "gray")
raise Exception("Manubot could not generate citation")

# parse results as json
try:
manubot = json.loads(output[0])[0]
Expand Down

0 comments on commit b7ec652

Please sign in to comment.