import hashlib
import json
import pickle
from pathlib import Path
from time import sleep
from typing import Any, Dict, List, Set

import requests
from django.core.management.base import BaseCommand, CommandParser, CommandError
from tqdm import tqdm

from sok.models import Publication, PublicationReference, SemanticScholar

# Seconds to wait for the Semantic Scholar API before giving up on a request.
# Without a timeout, `requests` may block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30.0


def semanticscholar(
    identifier: str,
    include_unknown_references: bool = False,
    timeout: float = REQUEST_TIMEOUT,
) -> Dict[str, Any]:
    """
    Retrieve information from the Semantic Scholar API.

    The identifier can be a DOI or the Semantic Scholar paper ID.

    See: https://api.semanticscholar.org

    Args:
        identifier: DOI or Semantic Scholar paper ID of the paper to look up.
        include_unknown_references: If true, the query parameter
            `include_unknown_references=true` is sent along with the request.
        timeout: Seconds to wait for the server before aborting the request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """

    url = f'https://api.semanticscholar.org/v1/paper/{identifier}'
    params: Dict[str, Any] = dict()
    if include_unknown_references:
        params['include_unknown_references'] = 'true'
    response = requests.get(url, params=params, timeout=timeout)
    # Must be *called*: a bare attribute reference silently ignores HTTP errors.
    response.raise_for_status()
    return response.json()


class Command(BaseCommand):
    """
    Link references and citations reported by Semantic Scholar.

    For every selected publication the references and citations are fetched
    from the Semantic Scholar API. Entries that match a known publication are
    linked automatically; for all others the user is asked whether the entry
    should be ignored in future runs.
    """

    def echo(self, msg: str, bold: bool = False, nl: bool = True):
        """Print `msg`, optionally highlighted and without trailing newline."""
        if bold:
            msg = self.style.HTTP_INFO(msg)
        # Written through tqdm (instead of self.stdout) so that messages do
        # not overlap with active progress bars.
        tqdm.write(msg, end='\n' if nl else '')

    def warn(self, msg: str):
        """Print `msg` using the warning style."""
        self.echo(self.style.WARNING(msg))

    def add_reference(
        self,
        publication: Publication,
        reference: Publication,
        is_reference: bool = True,
    ):
        """
        Ensure that `publication` is recorded as referencing `reference`.

        An existing relation is only reported; a missing one is validated
        and saved. `is_reference` merely selects the wording of the message
        (reference vs. citation) and which of the two publications is shown.
        """
        try:
            rel = PublicationReference.objects.get(
                publication=publication,
                reference=reference,
            )
            if is_reference:
                self.echo(f"Reference already known: {rel.identifier} {reference}")
            else:
                self.echo(f"Citation already known: {rel.identifier} {publication}")
        except PublicationReference.DoesNotExist:
            rel = PublicationReference(
                publication=publication,
                reference=reference,
            )
            rel.full_clean()
            rel.save()
            if is_reference:
                self.echo(f"Added reference: {reference}")
            else:
                self.echo(f"Added citation: {publication}")

    def display(self, obj: Dict[str, Any]):
        """
        Print a short summary of a paper object returned by the API.

        `obj` must contain the keys 'authors' (each with a 'name') and
        'title'; 'year', 'venue', 'doi' and 'paperId' are printed if present
        and non-empty.
        """
        self.echo("")
        authors = [author['name'] for author in obj['authors']]
        title = obj['title']
        self.echo(" " + ", ".join(authors))
        # The title line is left open so that the year can follow on it.
        self.echo(f" {title}", bold=True, nl=False)
        if year := obj.get('year', None):
            self.echo(f" ({year})")
        else:
            self.echo("")
        if venue := obj.get('venue', None):
            self.echo(f" {venue}")
        if doi := obj.get('doi', None):
            self.echo(f" {doi}")
        if paper_id := obj.get('paperId', None):
            self.echo(f" {paper_id}")

    def get_identifier(self, obj: Dict[str, Any]) -> str:
        """
        Return a stable identifier for a paper object.

        The Semantic Scholar paper ID is used if available; otherwise the
        BLAKE2b hex digest of the object's canonical (key-sorted) JSON form.
        """
        if paper_id := obj.get('paperId', None):
            return paper_id
        raw = json.dumps(obj, sort_keys=True)
        hasher = hashlib.blake2b()
        hasher.update(raw.encode())
        return hasher.hexdigest()

    def _link(self, base: Publication, other: Publication, is_reference: bool):
        """Record `other` as a reference of `base`, or as citing `base`."""
        if is_reference:
            self.add_reference(base, other)
        else:
            self.add_reference(other, base, is_reference)

    def _link_known(
        self,
        base: Publication,
        obj: Dict[str, Any],
        is_reference: bool,
    ) -> bool:
        """
        Link `obj` to `base` if it corresponds to a known publication.

        A publication is known if a Semantic Scholar entry with the same
        paper ID exists, or if a publication with the same DOI exists (in
        which case a new Semantic Scholar entry is created for it).

        Returns:
            True if the object was linked, False if it is unknown.
        """
        paper_id = obj.get('paperId')
        if not paper_id:
            return False

        try:
            known = SemanticScholar.objects.get(paper_id=paper_id)
        except SemanticScholar.DoesNotExist:
            pass
        else:
            self._link(base, known.publication, is_reference)
            return True

        doi = obj.get('doi')
        if not doi:
            return False
        try:
            publication = Publication.objects.get(doi=doi)
        except Publication.DoesNotExist:
            return False

        new = SemanticScholar(paper_id=paper_id, publication=publication)
        new.full_clean()
        new.save()
        self.echo(f"New Semantic Scholar entry: {paper_id}")
        self._link(base, new.publication, is_reference)
        return True

    def _prompt_ignore(self, obj: Dict[str, Any], identifier: str):
        """
        Ask whether the unknown paper `obj` should be ignored in the future.

        Answering yes (the default) stores `identifier` in the choice cache
        and persists the cache. Answering no leaves the cache untouched.
        If the paper has a paper ID, its abstract can be shown before
        deciding. Any other input repeats the question.
        """
        paper_id = obj.get('paperId')
        while True:
            self.echo("Ignore? [Y/n]", nl=False)
            if paper_id:
                self.echo(", Show abstract [a]", nl=False)
            self.echo(": ")
            choice = input().strip().lower()

            if choice in {'', 'y', 'yes'}:
                # Store choice
                self.cache.add(identifier)
                with self.cache_path.open('wb') as f:
                    pickle.dump(self.cache, f)
                return

            if choice in {'n', 'no'}:
                # TODO Import?
                return

            # The abstract is only offered (and only retrievable) when the
            # paper has an ID; otherwise 'a' is treated like any other
            # unrecognised input and the question is repeated.
            if choice == 'a' and paper_id:
                try:
                    data = semanticscholar(paper_id)
                except requests.RequestException as error:
                    self.warn(f"Could not retrieve abstract: {error}")
                    continue
                if abstract := data.get('abstract'):
                    self.echo(abstract)
                else:
                    self.echo("No abstract available.")

    def handle_objs(
        self,
        base: Publication,
        objs: List[Dict[str, Any]],
        is_reference: bool,
    ):
        """
        Process the references or citations of `base`.

        Known publications are linked automatically. Entries the user chose
        to ignore earlier are skipped. All remaining entries are displayed
        and the user is asked whether to ignore them.

        Args:
            base: Publication whose references/citations are processed.
            objs: Paper objects as returned by the Semantic Scholar API.
            is_reference: True if `objs` are referenced by `base`, False if
                they cite `base`.
        """
        title = "Reference" if is_reference else "Citation"
        if objs:
            self.echo(f"--- {title}s ---")

        for obj in tqdm(objs, unit=title.lower()):
            if self._link_known(base, obj, is_reference):
                continue

            identifier = self.get_identifier(obj)
            if identifier in self.cache:
                continue

            self.display(obj)
            self._prompt_ignore(obj, identifier)

    # BaseCommand

    def add_arguments(self, parser: CommandParser):
        """Register the command line flags of this command."""
        parser.add_argument('--reset-choices', action='store_true')
        parser.add_argument('--no-references', action='store_true')
        parser.add_argument('--no-citations', action='store_true')

    def handle(self, *args, **options):
        """
        Run the command.

        Loads (or resets) the cache of ignored entries, then walks over all
        publications that have a Semantic Scholar entry and no exclusion
        criteria and processes their references and/or citations.

        Raises:
            CommandError: If the user aborts with Ctrl-C.
        """
        reset_choices: bool = options['reset_choices']
        no_citations: bool = options['no_citations']
        no_references: bool = options['no_references']

        self.cache_path = Path('.choices.semanticscholar.pickle')
        self.cache: Set[str] = set()
        if reset_choices:
            self.cache_path.unlink(missing_ok=True)
        elif self.cache_path.exists():
            self.echo("Loading previous choices (reset with --reset-choices)...", nl=False)
            # SECURITY: unpickling can execute arbitrary code. This file is
            # written by this command itself; never point it at untrusted data.
            with self.cache_path.open('rb') as f:
                self.cache = pickle.load(f)
            self.echo("done", bold=True)

        publications = Publication.objects.filter(
            semanticscholar__isnull=False,
            exclusion_criteria__isnull=True,
        )
        try:
            for publication in tqdm(publications, unit="publication"):
                self.echo(f"=== Publication {publication} ===")
                for semantic in publication.semanticscholar_set.all():
                    try:
                        data = semanticscholar(semantic.paper_id)
                    except requests.RequestException as error:
                        # One failing lookup should not abort the whole run.
                        self.warn(f"Could not retrieve {semantic.paper_id}: {error}")
                    else:
                        if not no_references:
                            references: List[Dict[str, Any]] = data['references']
                            self.handle_objs(publication, references, is_reference=True)

                        if not no_citations:
                            citations: List[Dict[str, Any]] = data['citations']
                            self.handle_objs(publication, citations, is_reference=False)

                    sleep(2)  # Throttle: pause after every API request
        except KeyboardInterrupt:
            raise CommandError("Aborted.") from None