from pprint import pprint
from time import sleep

from tqdm import tqdm

from django.db import transaction
from django.core.exceptions import ValidationError
from django.core.management.base import BaseCommand, CommandError

import sok.management.commands.dblpimport as dblp
from sok.management.commands.snowball import semanticscholar
from sok.models import Publication, PublicationReference, SemanticScholar


class Command(BaseCommand):
    """Run a series of consistency fixes over the stored publications.

    ``handle`` runs every step in order: repairing references to variants,
    fetching missing abstracts, looking up non-DBLP publications on DBLP,
    reporting cite keys missing from the DBLP dump, filling in missing DOIs
    and resolving Semantic Scholar paper IDs.
    """

    def log_success(self, msg: str):
        """Write *msg* to stdout using the SUCCESS style."""
        self.stdout.write(self.style.SUCCESS(msg))

    def log_info(self, msg: str, nl: bool = True):
        """Write *msg* to stdout using the HTTP_INFO style and flush.

        When *nl* is false no trailing newline is emitted.
        """
        self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '')
        self.stdout.flush()

    @transaction.atomic
    def fix_references(self) -> None:
        """
        Create relevant references to masters of referenced variants.

        If multiple variants of a publication exist, only the master variant
        is considered. However, relevant publications might reference a
        non-master variant, e. g., a preprint. This command adds references to
        the master variant, even though this reference is not actually present
        in the publication. The reference identifier is marked with a star,
        e. g., '[1]*'.

        Raises:
            CommandError: if a newly built reference fails model validation.
        """

        self.log_info("--- Searching for references to variants ---")
        for publication in Publication.objects.filter(variant_of__isnull=False):
            # `publication` is the non-master variant, `variant` its master.
            variant = publication.variant_of
            origs = PublicationReference.objects.filter(reference=publication)
            for orig in origs:
                # Skip citing publications that already reference the master.
                if PublicationReference.objects.filter(
                    reference=variant,
                    publication=orig.publication,
                ).exists():
                    continue

                fixed = PublicationReference(
                    reference=variant,
                    publication=orig.publication,
                    identifier=('' if orig.identifier is None else orig.identifier) + "*",
                )

                try:
                    fixed.full_clean()
                    fixed.save()
                except ValidationError as e:
                    # Dump the involved objects to ease debugging of the failure.
                    self.stderr.write(
                        f"{orig.publication} {origs} {variant} "
                        f"{fixed.identifier} {fixed.reference} {fixed.publication}"
                    )
                    raise CommandError(
                        f"{publication} -- {fixed.identifier} -> {variant}: {e}"
                    ) from e
                self.log_success(
                    f"Added reference: {publication} -- {fixed.identifier} -> {variant}"
                )

    @transaction.atomic
    def fix_abstracts(self) -> None:
        """
        Add abstracts to those publications that have one on SemanticScholar.

        If multiple variants of a publication exist, only the master variant
        is considered.

        Raises:
            CommandError: if a publication fails validation after the update.
        """

        self.log_info("--- Searching for publications without abstracts ---")
        without_abstract = Publication.objects.filter(abstract__isnull=True)
        # Filtering across the reverse relation can yield one row per related
        # SemanticScholar entry, hence the distinct().
        eligible = without_abstract.filter(
            variant_of__isnull=True,
            semanticscholar__isnull=False,
        ).distinct()
        self.log_info(
            f"{eligible.count()} eligible publications found, "
            f"{without_abstract.count()} without abstract"
        )

        for publication in tqdm(eligible, unit="abstract"):
            for semantic in publication.semanticscholar_set.all():
                data = semanticscholar(semantic.paper_id)
                # Error responses carry no 'abstract' key, so use .get().
                if abstract := data.get('abstract'):
                    publication.abstract = abstract
                    try:
                        publication.full_clean()
                        publication.save()
                    except ValidationError as e:
                        raise CommandError(f"{publication}: {e}") from e
                    self.log_success(f"Added abstract for: {publication}")
                sleep(2)  # Throttle to avoid rate-limiting

    @transaction.atomic
    def search_on_dblp(self, publication: Publication):
        """Search DBLP for *publication* by title and adopt a DOI-matched hit.

        A result whose DOI equals the publication's DOI (case-insensitively)
        overwrites the cite key, year, page range and, when known, the
        peer-review flag. Results that only match on title and year are
        reported but not applied.

        Raises:
            CommandError: if the updated publication fails validation.
        """
        # NOTE(review): the second argument is presumably a result limit;
        # confirm against dblp.PublicationResult.from_search.
        _query, results, total = dblp.PublicationResult.from_search(publication.title, 100)
        if total == 0:
            return

        for result in results:
            if publication.doi and result.doi:
                if publication.doi.lower() != result.doi.lower():
                    continue  # quite definitely not the same publication

                publication.cite_key = result.cite_key
                publication.year = result.year
                if result.is_peer_reviewed is not None:
                    publication.peer_reviewed = result.is_peer_reviewed
                publication.first_page = result.first_page
                publication.last_page = result.last_page
                self.stdout.write(f"{publication.peer_reviewed} {result}")
                # keep authors from semantic scholar for now, even though they might be a little broken?

                try:
                    publication.full_clean()
                    publication.save()
                except ValidationError as e:
                    raise CommandError(f"{publication}: {e}") from e
                self.log_success(f"Added DBLP info for: {publication}")
            elif publication.title == result.title and publication.year == result.year:
                self.stdout.write(
                    f"Not quite certain about {result.cite_key} for publication {publication.title}"
                )
            # Anything else: I'd rather look at that in detail for now

    def find_secondary_on_dblp(self):
        """Look up every non-excluded, non-DBLP publication on DBLP.

        Afterwards reports how many distinct cite keys still lack the DBLP
        cite-key prefix.
        """
        self.log_info("--- Searching for secondary and tertiary sources in the default DBLP dump ---")
        non_dblp = Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX)
        for publication in tqdm(non_dblp, unit="publication"):
            if publication.stage == 'excluded':
                continue
            self.search_on_dblp(publication)
            sleep(2)  # Throttle to avoid rate-limiting

        # Re-query: search_on_dblp may have replaced cite keys in the meantime.
        keys = set(
            Publication.objects.exclude(
                cite_key__startswith=dblp.CITE_KEY_PREFIX
            ).values_list('cite_key', flat=True).distinct()
        )
        n = len(keys)
        self.log_info(f"Found {n} publications that still need to be checked")

    def fix_dblp(self):
        """Print the DBLP-prefixed cite keys in the database that are absent from the dump."""
        self.log_info("--- Searching for entries not in the default DBLP dump ---")
        keys_in_db = set(
            Publication.objects.filter(
                cite_key__startswith=dblp.CITE_KEY_PREFIX
            ).values_list('cite_key', flat=True).distinct()
        )
        keys_in_dump = dblp.get_all_cite_keys(dblp.DUMP_PATH)

        self.stdout.write(f"DB:   {len(keys_in_db):8d}")
        self.stdout.write(f"DBLP: {len(keys_in_dump):8d}")
        pprint(keys_in_db - keys_in_dump)

    def find_missing_dois(self):
        """Fill in DOIs from the DBLP dump for publications that have none."""
        self.log_info("--- Searching for missing DOIs ---")
        publications = Publication.objects.filter(doi__isnull=True)
        keys = {
            dblp.strip_cite_key_prefix(cite_key)
            for cite_key in publications.values_list('cite_key', flat=True)
        }
        self.log_info("Parsing DBLP dump...")
        results = dblp.PublicationResult.from_dump(dblp.DUMP_PATH, keys)
        self.log_info("done")

        for result in results:
            if doi := result.doi:
                publication = publications.get(cite_key=result.cite_key)
                publication.doi = doi
                publication.full_clean()
                publication.save()
                self.log_success(f"Added DOI '{doi}' to publication: {publication}")

    def find_semanticscholar_ids(self):
        """Resolve Semantic Scholar paper IDs for publications that have a DOI but no ID yet."""
        self.log_info("--- Searching for paper IDs on Semantic Scholar ---")
        publications = Publication.objects.filter(
            doi__isnull=False,
            semanticscholar__isnull=True,
        )
        for publication in publications:
            data = semanticscholar(publication.doi)
            if 'error' not in data:
                paper_id = data['paperId']
                obj = SemanticScholar(paper_id=paper_id, publication=publication)
                obj.full_clean()
                obj.save()
                self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
            else:
                self.stderr.write(f"Semantic Scholar lookup failed for: {publication}")
            sleep(2)  # Throttle to avoid rate-limiting

    # BaseCommand

    def handle(self, *args, **options):
        """Run all repair steps in sequence."""
        self.fix_references()
        self.fix_abstracts()
        self.find_secondary_on_dblp()
        self.fix_dblp()
        self.find_missing_dois()
        self.find_semanticscholar_ids()