from pprint import pprint
from time import sleep
from typing import List, Optional, Tuple

import html
import re

import requests
from tqdm import tqdm

from django.db import transaction
from django.core.exceptions import ValidationError
from django.core.management.base import BaseCommand, CommandError, CommandParser
from django.db.models import Count

import sok.management.commands.dblpimport as dblp
from sok.management.commands.snowball import semanticscholar
from sok.models import Publication, PublicationReference, SemanticScholar, Author, PublicationAuthor


class Command(BaseCommand):
    """Fix and enrich publication metadata (references, abstracts, DBLP, authors)."""

    # --- logging helpers -------------------------------------------------

    def log_success(self, msg: str):
        self.stdout.write(self.style.SUCCESS(msg))

    def log_warn(self, msg: str):
        self.stdout.write(self.style.WARNING(msg))

    def log_info(self, msg: str, nl: bool = True):
        self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '')
        self.stdout.flush()

    # --- fixes -----------------------------------------------------------

    @transaction.atomic
    def fix_references(self) -> None:
        """
        Create relevant references to masters of referenced variants.

        If multiple variants of a publication exist, only the master variant
        is considered. However, relevant publications might reference a
        non-master variant, e. g., a preprint. This command adds references to
        the master-variant, even though this reference is not actually present
        in the publication. The reference identifier is marked with a star,
        e. g., '[1]*'.
        """
        self.log_info("--- Searching for references to variants ---")
        for publication in Publication.objects.filter(variant_of__isnull=False):
            variant = publication.variant_of
            origs = PublicationReference.objects.filter(reference=publication)
            for orig in origs:
                # Skip if the master-variant reference already exists.
                if PublicationReference.objects.filter(reference=variant, publication=orig.publication).exists():
                    continue
                fixed = PublicationReference(
                    reference=variant,
                    publication=orig.publication,
                    # Star marks a synthesized reference not literally present.
                    identifier=('' if orig.identifier is None else orig.identifier) + "*",
                )
                try:
                    fixed.full_clean()
                    fixed.save()
                    self.log_success(f"Added reference: {publication} -- {fixed.identifier} -> {variant}")
                except ValidationError as e:
                    raise CommandError(f"{publication} -- {fixed.identifier} -> {variant}: {e}")

    @transaction.atomic
    def fix_abstracts(self) -> None:
        """
        Add abstracts to those publications that have one on SemanticScholar.

        If multiple variants of a publication exist, only the master variant
        is considered.
        """
        self.log_info("--- Searching for publications without abstracts ---")
        eligible = Publication.objects.filter(
            abstract__isnull=True,
            variant_of__isnull=True,
            semanticscholar__isnull=False,
        )
        # .count() queries the DB instead of materializing the querysets.
        self.log_info(
            f"{eligible.count()} eligible publications found, "
            f"{Publication.objects.filter(abstract__isnull=True).count()} without abstract"
        )
        for publication in tqdm(eligible, unit="abstract"):
            for semantic in publication.semanticscholar_set.all():
                data = semanticscholar(semantic.paper_id)
                # .get(): the API response is not guaranteed to carry the key.
                if abstract := data.get('abstract'):
                    publication.abstract = abstract
                    if publication.peer_reviewed is None:
                        publication.peer_reviewed = False
                    try:
                        publication.full_clean()
                        publication.save()
                        self.log_success(f"Added abstract for: {publication}")
                    except ValidationError as e:
                        raise CommandError(f"{publication}: {e}")
                sleep(2)  # Throttle to avoid rate-limiting

    @transaction.atomic
    def search_on_dblp(self, publication: Publication):
        """Look up *publication* on DBLP and, on a unique match, copy its metadata and authors."""
        _query, results, total = dblp.PublicationResult.from_search(publication.title, 100)
        if total == 0:
            return

        # Collect candidates: a DOI match is definitive, a title+year match is tentative.
        plausible = []
        for result in results:
            if publication.doi and result.doi:
                if publication.doi.lower() == result.doi.lower():
                    plausible = [result]
                    break
                else:
                    continue  # quite definitely not the same publication
            elif publication.title == result.title and publication.year == result.year:
                plausible.append(result)
                self.log_warn(f"Not quite certain about {result.cite_key} for publication {publication.title}")
            else:
                continue  # I'd rather look at that in detail for now

        if len(plausible) != 1:
            return  # ambiguous or no candidate: leave for manual review
        result = plausible[0]
        if result.access.lower() == 'withdrawn':
            self.log_warn(f"Publication {publication.title} was WITHDRAWN!")
            return

        publication.cite_key = result.cite_key
        publication.year = result.year
        if result.is_peer_reviewed is not None:
            publication.peer_reviewed = result.is_peer_reviewed
        publication.first_page = result.first_page
        publication.last_page = result.last_page
        try:
            publication.full_clean()
            publication.save()
            self.log_success(f"Added DBLP info for: {publication}")
        except ValidationError as e:
            raise CommandError(f"{publication}: {e}")

        # Store authors
        authors: List[Author] = []
        for (pid, name) in result.authors:
            author, created = Author.objects.get_or_create(name=name)
            if created:
                self.log_success(f"Added author: {author}")
            else:
                self.log_info(f"Author '{author}' already known")
            if not author.dblp_id and pid is not None:
                try:
                    author.dblp_id = pid
                    author.full_clean()
                    author.save()
                    self.log_success(f"Added pid to author: {author}")
                except ValidationError as e:
                    raise CommandError(f"{author}: {e}")
            authors.append(author)

        # Replace authors for publication if DBLP lists more than we have.
        existing = PublicationAuthor.objects.filter(publication=publication)
        if len(authors) != existing.count():
            assert len(authors) >= existing.count()
            for publication_author in existing:
                self.log_warn(f"Will now try to delete {publication_author.author} from {publication}")
                try:
                    publication_author.delete()
                # BUG FIX: original read `except e:` (unbound name) and reported the
                # wrong author (stale loop variable) in the error message.
                except Exception as e:
                    raise CommandError(f"{publication} - {publication_author.author}: {e}")
        for position, author in enumerate(authors):
            publication_author, created = PublicationAuthor.objects.get_or_create(
                author=author,
                publication=publication,
                position=position,
            )
            if created:
                self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
            else:
                self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")

    def find_secondary_on_dblp(self):
        """Try to find snowballed (non-DBLP) publications on DBLP."""
        self.log_info("--- Searching for snowballed sources on DBLP ---")
        for publication in tqdm(Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX), unit="publication"):
            if publication.stage == 'excluded':
                continue
            self.search_on_dblp(publication)
            sleep(2)  # Throttle to avoid rate-limiting
        keys = set(
            Publication.objects.exclude(
                cite_key__startswith=dblp.CITE_KEY_PREFIX
            ).values_list('cite_key', flat=True).distinct()
        )
        self.log_info(f"Found {len(keys)} publications that still need to be verified")

    def fix_dblp(self):
        """Report cite keys present in the DB but missing from the DBLP dump."""
        self.log_info("--- Searching for entries not in the default DBLP dump ---")
        keys_in_db = set(
            Publication.objects.filter(
                cite_key__startswith=dblp.CITE_KEY_PREFIX
            ).values_list('cite_key', flat=True).distinct()
        )
        keys_in_dump = dblp.get_all_cite_keys(dblp.DUMP_PATH)
        self.stdout.write(f"DB:   {len(keys_in_db):8d}")
        self.stdout.write(f"DBLP: {len(keys_in_dump):8d}")
        pprint(keys_in_db - keys_in_dump)

    def find_missing_dois(self):
        """Fill in missing DOIs from the local DBLP dump."""
        self.log_info("--- Searching for missing DOIs ---")
        publications = Publication.objects.filter(doi__isnull=True)
        keys = {
            dblp.strip_cite_key_prefix(cite_key)
            for cite_key in publications.values_list('cite_key', flat=True)
        }
        self.log_info("Parsing DBLP dump...")
        results = dblp.PublicationResult.from_dump(dblp.DUMP_PATH, keys)
        self.log_info("done")
        for result in results:
            if doi := result.doi:
                publication = publications.get(cite_key=result.cite_key)
                publication.doi = doi
                publication.full_clean()
                publication.save()
                self.log_success(f"Added DOI '{doi}' to publication: {publication}")

    def find_semanticscholar_ids(self):
        """Resolve Semantic Scholar paper IDs for publications that have a DOI."""
        self.log_info("--- Searching for paper IDs on Semantic Scholar ---")
        publications = Publication.objects.filter(
            doi__isnull=False,
            semanticscholar__isnull=True,
        )
        for publication in publications:
            data = semanticscholar(publication.doi)
            if 'error' not in data:
                paper_id = data['paperId']
                obj = SemanticScholar(paper_id=paper_id, publication=publication)
                obj.full_clean()
                obj.save()
                self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
            else:
                self.log_warn(f"No Semantic Scholar entry for: {publication}")
            sleep(2)  # Throttle to avoid rate-limiting

    def find_author_on_dblp(
        self,
        name: str,
        limit: int = 100,
    ) -> Optional[List[dict]]:
        """
        Search the DBLP author API for *name*.

        Returns the list of hit 'info' dicts, or None if there are no hits.
        (Annotation fixed: the original claimed List[Tuple[str, str]].)
        """
        url = 'http://dblp.org/search/author/api'
        response = requests.get(
            url,
            params={
                'q': name,
                'f': 0,
                'h': limit,
                'c': 0,
                'format': 'json',
            },
        )
        response.raise_for_status()
        hits = response.json()['result']['hits']
        if hits['@total'] == '0':
            return None
        return [hit['info'] for hit in hits['hit']]

    def verify_author(self, pid, keys):
        """Check that every cite key in *keys* appears in the DBLP record for *pid*."""
        url = f"https://dblp.org/pid/{pid}.xml"
        response = requests.get(url)
        response.raise_for_status()
        # response.text decodes with the declared charset (original regexed over
        # the bytes' repr, which happened to work but was fragile).
        pubs = re.findall('key="([^"]*)"', response.text)
        sleep(1)  # Throttle to avoid rate-limiting
        return keys.intersection(pubs) == keys

    def find_pid_for_authors(self):
        """Find and store DBLP PIDs for authors that lack one."""
        self.log_info("--- Adding ID's to authors ---")
        authors = Author.objects.filter(dblp_id__exact="")
        for author in tqdm(authors, unit='author'):
            keys = {
                dblp.strip_cite_key_prefix(cite_key)
                for cite_key in PublicationAuthor.objects.filter(
                    author__exact=author,
                    publication__cite_key__startswith=dblp.CITE_KEY_PREFIX,
                ).values_list('publication__cite_key', flat=True).distinct()
            }
            if not keys:
                continue  # no DBLP publications to verify a candidate against
            options = self.find_author_on_dblp(author.name)
            if not options:
                self.log_warn(f"Could not find ID for author {author}")
                continue
            # Single loop handles one or many candidates; the original's
            # single-candidate branch dereferenced 'url' and the regex match
            # without guards.
            for opt in options:
                if 'url' not in opt:
                    continue
                match = re.match('https://dblp.org/pid/(.*)', opt['url'])
                if match is None:
                    continue
                pid = match.groups()[0]
                if self.verify_author(pid, keys):
                    self.log_success(f"Found {pid} for author {author}")
                    author.dblp_id = pid
                    author.full_clean()
                    author.save()
                    self.log_success(f"Added pid for author {author}")
                    break

    @transaction.atomic
    def merge(self, author, variant):
        """Move all publication links from *variant* to *author*, then delete *variant*."""
        for pub_author in PublicationAuthor.objects.filter(author=variant):
            publication = pub_author.publication
            if PublicationAuthor.objects.filter(author=author, publication=publication).exists():
                self.log_warn(f"{author} is already author of {publication}")
                # BUG FIX: original `break` left this (and all remaining) variant
                # links in place, so the assert below failed. The link is
                # redundant — drop it and keep processing.
                pub_author.delete()
                continue
            fixed = PublicationAuthor(
                author=author,
                publication=publication,
                position=pub_author.position,
            )
            try:
                pub_author.delete()
                fixed.full_clean()
                fixed.save()
                self.log_success(f"Changed author: {publication}: {variant} -> {fixed.author}")
            except ValidationError as e:
                raise CommandError(f"{publication}: {variant} -> {fixed.author}: {e}")
        assert not PublicationAuthor.objects.filter(author=variant).exists()
        variant.delete()

    def merge_authors(self) -> None:
        """Interactively merge author records that share a DBLP PID."""
        self.log_info("--- Searching for potential duplicate authors and merging their publications ---")
        dup_ids = Author.objects.exclude(dblp_id__exact="").values_list('dblp_id').annotate(count=Count('id')).filter(count__gt=1)
        try:
            for (pid, _count) in tqdm(dup_ids, unit="duplicate"):
                response = requests.get(f"https://dblp.org/pid/{pid}.xml")
                response.raise_for_status()
                # BUG FIX: DBLP serves UTF-8; ascii decoding crashed on
                # non-ASCII author names.
                dblp_person = re.search('dblpperson name="([^"]*)"', response.content.decode('utf-8')).groups()
                assert len(dblp_person) == 1
                dblp_name = html.unescape(dblp_person[0])
                variants = Author.objects.filter(dblp_id__exact=pid).exclude(name__exact=dblp_name)
                orig = Author.objects.filter(dblp_id__exact=pid, name__exact=dblp_name)
                assert len(orig) == 1
                self.log_info(f"Suggestion to merge {', '.join([v.name for v in variants])} into {orig[0]}")
                while True:
                    self.log_warn("Merge? [Y/n]")
                    choice = input().lower()
                    if choice in {'', 'y', 'yes'}:
                        for variant in variants:
                            self.merge(orig[0], variant)
                        break
                    # BUG FIX: '' was listed in both sets; the yes-branch is
                    # checked first, so it belongs only there (Enter = yes).
                    elif choice in {'n', 'no'}:
                        break
        except KeyboardInterrupt:
            raise CommandError("Aborted.")

    # BaseCommand

    def add_arguments(self, parser: CommandParser):
        parser.add_argument('-b', '--abstract', action='store_true')
        parser.add_argument('-d', '--dblp', action='store_true')
        parser.add_argument('-a', '--authors', action='store_true')
        parser.add_argument('-s', '--secondary', action='store_true')

    def handle(self, *args, **options):
        # Reference fix-up always runs; the rest is opt-in via flags.
        self.fix_references()
        if options['secondary']:
            self.find_secondary_on_dblp()
        if options['dblp']:
            self.fix_dblp()
            self.find_missing_dois()
            self.find_semanticscholar_ids()
        if options['authors']:
            self.find_pid_for_authors()
            self.merge_authors()
        if options['abstract']:
            self.fix_abstracts()