Basically a fork of https://github.com/blochberger/sokman, but with the intention of adding a visual interface as well.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

111 lines
3.7 KiB

from pprint import pprint
from time import sleep
from django.db import transaction
from django.core.exceptions import ValidationError
from django.core.management.base import BaseCommand, CommandError
import sok.management.commands.dblpimport as dblp
from sok.management.commands.snowball import semanticscholar
from sok.models import Publication, PublicationReference, SemanticScholar
class Command(BaseCommand):
    """
    Management command that runs a series of repair steps on the database.

    The steps, in the order `handle` runs them:

    1. add starred references to the master of every referenced variant,
    2. print cite keys that are in the database but not in the DBLP dump,
    3. fill in missing DOIs from the DBLP dump,
    4. store Semantic Scholar paper IDs for publications that have a DOI.
    """

    help = (
        "Repair the publication database: add references to master variants, "
        "report entries missing from the DBLP dump, add missing DOIs and "
        "look up Semantic Scholar paper IDs."
    )

    def log_success(self, msg: str):
        """Write `msg` to the command's stdout using the SUCCESS style."""
        self.stdout.write(self.style.SUCCESS(msg))

    def log_info(self, msg: str, nl: bool = True):
        """
        Write `msg` to the command's stdout using the HTTP_INFO style.

        If `nl` is false, no trailing newline is written. The stream is
        flushed afterwards so that partial lines show up immediately.
        """
        self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '')
        self.stdout.flush()

    @transaction.atomic
    def fix_references(self) -> None:
        """
        Create relevant references to masters of referenced variants.

        If multiple variants of a publication exist, only the master variant
        is considered. However, relevant publications might reference a
        non-master variant, e. g., a preprint.

        This command adds references to the master variant, even though this
        reference is not actually present in the publication. The reference
        identifier is marked with a star, e. g., '[1]*'.

        Raises `CommandError` if a newly created reference fails validation.
        Because the method is atomic, that rolls back every reference added
        during the same call.
        """
        self.log_info("--- Searching for references to variants ---")

        # Fetch the master together with each variant to avoid one extra
        # query per publication.
        variants = (
            Publication.objects
            .filter(variant_of__isnull=False)
            .select_related('variant_of')
        )
        for publication in variants:
            master = publication.variant_of
            origs = (
                PublicationReference.objects
                .filter(reference=publication)
                .select_related('publication')
            )
            for orig in origs:
                # Skip citing publications that already reference the master.
                already_linked = PublicationReference.objects.filter(
                    reference=master,
                    publication=orig.publication,
                ).exists()
                if already_linked:
                    continue

                fixed = PublicationReference(
                    reference=master,
                    publication=orig.publication,
                    # The star marks the reference as synthesized.
                    identifier=('' if orig.identifier is None else orig.identifier) + "*",
                )
                try:
                    fixed.full_clean()
                    fixed.save()
                except ValidationError as e:
                    raise CommandError(
                        f"{publication} -- {fixed.identifier} -> {master}: {e}"
                    ) from e
                self.log_success(
                    f"Added reference: {publication} -- {fixed.identifier} -> {master}"
                )

    def fix_dblp(self):
        """
        Report DBLP cite keys that are in the database but not in the dump.

        Prints the number of DBLP-prefixed cite keys in the database, the
        number of cite keys in the dump at `dblp.DUMP_PATH`, and then the
        keys that only occur in the database. Nothing is modified.
        """
        # Local import: only this method needs it.
        from pprint import pformat

        self.log_info("--- Searching for entries not in the default DBLP dump ---")
        keys_in_db = set(
            Publication.objects.filter(
                cite_key__startswith=dblp.CITE_KEY_PREFIX
            ).values_list('cite_key', flat=True).distinct()
        )
        keys_in_dump = dblp.get_all_cite_keys(dblp.DUMP_PATH)

        self.stdout.write(f"DB: {len(keys_in_db):8d}")
        self.stdout.write(f"DBLP: {len(keys_in_dump):8d}")
        # Go through self.stdout (not a bare pprint to sys.stdout) so that the
        # report honours output redirection like every other message here.
        self.stdout.write(pformat(keys_in_db - keys_in_dump))

    def find_missing_dois(self):
        """
        Fill in DOIs from the DBLP dump for publications that have none.

        The cite keys of all publications without a DOI are passed through
        `dblp.strip_cite_key_prefix` and looked up in the dump at
        `dblp.DUMP_PATH`. Every result that carries a DOI is validated and
        saved on the matching publication.
        """
        self.log_info("--- Searching for missing DOIs ---")
        publications = Publication.objects.filter(doi__isnull=True)
        keys = {
            dblp.strip_cite_key_prefix(cite_key)
            for cite_key in publications.values_list('cite_key', flat=True)
        }

        self.log_info("Parsing DBLP dump...")
        results = dblp.PublicationResult.from_dump(dblp.DUMP_PATH, keys)
        self.log_info("done")

        for result in results:
            if doi := result.doi:
                publication = publications.get(cite_key=result.cite_key)
                publication.doi = doi
                publication.full_clean()
                publication.save()
                self.log_success(f"Added DOI '{doi}' to publication: {publication}")

    def find_semanticscholar_ids(self):
        """
        Store Semantic Scholar paper IDs for publications that lack one.

        For every publication that has a DOI but no related `SemanticScholar`
        object, the DOI is passed to `semanticscholar` and the returned
        'paperId' is validated and saved as a new `SemanticScholar` object.
        """
        self.log_info("--- Searching for paper IDs on Semantic Scholar ---")
        publications = Publication.objects.filter(
            doi__isnull=False,
            semanticscholar__isnull=True,
        )
        for publication in publications:
            data = semanticscholar(publication.doi)
            paper_id = data['paperId']

            obj = SemanticScholar(paper_id=paper_id, publication=publication)
            obj.full_clean()
            obj.save()
            self.log_success(
                f"Set semanticscholar ID for publication '{publication}': {paper_id}"
            )
            sleep(2)  # Throttle to avoid rate-limiting

    # BaseCommand

    def handle(self, *args, **options):
        """Run all repair steps in sequence; arguments and options are unused."""
        self.fix_references()
        self.fix_dblp()
        self.find_missing_dois()
        self.find_semanticscholar_ids()