import hashlib
import json
import pickle

from pathlib import Path
from time import sleep
from typing import Any, Dict, List, Optional, Set

import requests

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError, CommandParser
from django.db import transaction
from tqdm import tqdm

from sok.models import (
    Author,
    Publication,
    PublicationAuthor,
    PublicationReference,
    PublicationSource,
    SemanticScholar,
    Source,
)


def semanticscholar(
    identifier: str,
    fields: Optional[str] = None,
    type: Optional[str] = None,
    offset: int = 0,
    include_unknown_references: bool = False,
) -> Dict[str, Any]:
    """
    Retrieve information from the Semantic Scholar API.

    The identifier can be a DOI or the Semantic Scholar paper ID.

    See: https://api.semanticscholar.org
    """
    url = f'https://api.semanticscholar.org/graph/v1/paper/{identifier}'
    params: Dict[str, Any] = dict()

    if type in ['citations', 'references']:
        url += '/' + type
        params['limit'] = '200'
        params['offset'] = str(offset)
        if include_unknown_references:
            params['include_unknown_references'] = 'true'
    if fields:
        params['fields'] = fields

    headers = {'x-api-key': settings.SCHOLAR_API_KEY}
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    return response.json()


class Command(BaseCommand):

    def echo(self, msg: str, bold: bool = False, nl: bool = True):
        if bold:
            msg = self.style.HTTP_INFO(msg)
        tqdm.write(msg, end='\n' if nl else '')

    def warn(self, msg: str):
        self.echo(self.style.WARNING(msg))

    def add_reference(
        self,
        publication: Publication,
        reference: Publication,
        is_reference: bool = True,
    ):
        try:
            rel = PublicationReference.objects.get(
                publication=publication,
                reference=reference,
            )
            if is_reference:
                self.echo(f"Reference already known: {rel.identifier} {reference}")
            else:
                self.echo(f"Citation already known: {rel.identifier} {publication}")
        except PublicationReference.DoesNotExist:
            rel = PublicationReference(
                publication=publication,
                reference=reference,
            )
            rel.full_clean()
            rel.save()
            if is_reference:
                self.echo(f"Added reference: {reference}")
            else:
                self.echo(f"Added citation: {publication}")

    def display(self, obj: Dict[str, Any]):
        self.echo("")
        authors = [author['name'] for author in obj['authors']]
        title = obj['title']
        self.echo(" " + ", ".join(authors))
        self.echo(f" {title}", bold=True, nl=False)
        if year := obj.get('year', None):
            self.echo(f" ({year})")
        else:
            self.echo("")
        if venue := obj.get('venue', None):
            self.echo(f" {venue}")
        if doi := obj.get('doi', None):
            self.echo(f" {doi}")
        if paper_id := obj.get('paperId', None):
            self.echo(f" {paper_id}")

    def get_identifier(self, obj: Dict[str, Any]) -> str:
        if paper_id := obj.get('paperId', None):
            return paper_id

        # Fall back to a stable hash of the raw entry if it has no paper ID
        raw = json.dumps(obj, sort_keys=True)
        hasher = hashlib.blake2b()
        hasher.update(raw.encode())
        return hasher.hexdigest()

    @transaction.atomic
    def add_publ(
        self,
        paper_id: str,
        base: Publication,
        is_reference: bool,
    ) -> Publication:
        data = semanticscholar(paper_id, fields='title,authors,year,abstract,externalIds')

        # Add authors to database
        # TODO: Use Semantic Scholar author IDs?
        authors: List[Author] = []
        for author in data.get('authors', []):
            if name := author.get('name', None):
                author, created = Author.objects.get_or_create(name=name)
                if created:
                    self.echo(f"Added author: {author}")
                else:
                    self.echo(f"Author '{author}' already known")
                authors.append(author)

        # Derive a cite key: <last name of first author><year><first word of title>
        cite_key = ''
        if authors:
            cite_key = authors[0].name.split(' ')[-1]
        cite_key += str(data.get('year'))
        title = data.get('title', '')
        cite_key += title.split(' ')[0]
        cite_key = cite_key.lower()

        # Disambiguate the cite key if it is already taken by a different publication
        try:
            for _ in range(10):
                publication = Publication.objects.get(cite_key=cite_key)
                if publication.title == title:
                    break
                else:
                    cite_key += '_1'
        except Publication.DoesNotExist:
            publication = None

        # Add publication to database
        doi = (data.get('externalIds') or {}).get('DOI', None)
        if not publication:
            publication = Publication.objects.create(
                cite_key=cite_key,
                title=title,
                year=data.get('year', 0),
                peer_reviewed=None,
                doi=doi,
                abstract=data.get('abstract', None),
            )
            self.echo(f"Added publication: {publication}")
        else:
            self.echo(f"Publication '{publication}' already known")

        # Assign authors
        for position, author in enumerate(list(set(authors))):
            publication_author, created = PublicationAuthor.objects.get_or_create(
                author=author,
                publication=publication,
                position=position,
            )
            if created:
                self.echo(f"Assigned author '{author}' to publication '{publication}' at position {position}")
            else:
                self.echo(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")

        # Add to Semantic Scholar and link publications
        new, created = SemanticScholar.objects.get_or_create(paper_id=paper_id, publication=publication)
        if created:
            new.full_clean()
            new.save()
            self.echo(f"New Semantic Scholar entry: {paper_id}")
        if is_reference:
            self.add_reference(base, new.publication)
        else:
            self.add_reference(new.publication, base, is_reference)
        return publication

    def handle_objs(
        self,
        base: Publication,
        objs: List[Dict[str, Any]],
        is_reference: bool,
    ):
        title = "Reference" if is_reference else "Citation"
        if 0 < len(objs):
            self.echo(f"--- {title}s ---")

        progress_iterator = tqdm(objs, unit=title.lower(), position=2, leave=False, desc=title + 's')
        for obj in progress_iterator:
            if paper_id := obj.get('paperId', None):
                try:
                    # This publication already exists and has a Semantic Scholar entry
                    existing = SemanticScholar.objects.get(paper_id=paper_id)
                    if is_reference:
                        self.add_reference(base, existing.publication)
                    else:
                        self.add_reference(existing.publication, base, is_reference)
                    continue
                except SemanticScholar.DoesNotExist:
                    # This publication may already exist but has no Semantic Scholar entry yet
                    if doi := obj.get('doi', None):
                        try:
                            publication = Publication.objects.get(doi=doi)
                            new = SemanticScholar(paper_id=paper_id, publication=publication)
                            new.full_clean()
                            new.save()
                            self.echo(f"New Semantic Scholar entry: {paper_id}")
                            if is_reference:
                                self.add_reference(base, new.publication)
                            else:
                                self.add_reference(new.publication, base, is_reference)
                            continue
                        except Publication.DoesNotExist:
                            # This publication does not exist yet, so it needs to be created
                            pass

            identifier = self.get_identifier(obj)
            if identifier in self.cache:
                continue

            self.display(obj)
            paper_id = obj.get('paperId', None)

            while True:
[Y/n]", nl=False) if paper_id: self.echo(", Show abstract [a]", nl=False) self.echo(": ") choice = input().lower() if choice in {'', 'y', 'yes'}: # Store choice self.cache.add(identifier) with self.cache_path.open('wb') as f: pickle.dump(self.cache, f) break elif choice in {'a'}: assert paper_id is not None if abstract := obj.get('abstract', None): self.echo(abstract) else: self.echo('Sorry, there is no abstract for this publication on Semantic Scholar') elif choice in {'', 'n', 'no'}: # DONE Import? copied and adapted from PR if paper_id is not None: self.add_publ(paper_id, base, is_reference) else: self.warn("Could not add this paper, please do it manually!") break # BaseCommand def add_arguments(self, parser: CommandParser): parser.add_argument('--reset-choices', action='store_true') parser.add_argument('--no-references', action='store_true') parser.add_argument('--no-citations', action='store_true') parser.add_argument('-s', '--stage', type=int, default=10000) def handle(self, *args, **options): reset_choices: bool = options['reset_choices'] no_citations: bool = options['no_citations'] no_references: bool = options['no_references'] stage: int = options['stage'] self.cache_path = Path('.choices.semanticscholar.pickle') self.cache: Set[str] = set() if reset_choices: self.cache_path.unlink(missing_ok=True) elif self.cache_path.exists(): self.echo("Loading previous choices (reset with --reset-choices)...", nl=False) with self.cache_path.open('rb') as f: self.cache = pickle.load(f) self.echo("done", bold=True) publications = Publication.objects.filter( semanticscholar__isnull=False, exclusion_criteria__isnull=True, ) if stage < 10000: publications = [p for p in publications if p.stage_added() == stage] self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====") try: progress_iterator = tqdm(publications, unit="publication", position=1, desc='Publications') for publication in progress_iterator: self.echo(f"=== Publication {publication}: {publication.title} ===") for semantic in publication.semanticscholar_set.all(): if not no_references: offset = 0 while True: data = semanticscholar(semantic.paper_id, type='references', fields='title,abstract', offset=offset) if not data.get('data', None): self.echo(self.style.WARNING("API did not return any references, verify manually!")) break references: List[Dict[str, Any]] = [d['citedPaper'] for d in data['data']] self.handle_objs(publication, references, is_reference=True) # Handle limitation if there are more than 200 paper references if 'next' in data: offset = data['next'] else: break if not no_citations: offset = 0 while True: data = semanticscholar(semantic.paper_id, type='citations', fields='title,abstract', offset=offset) if not data.get('data', None): self.echo(self.style.WARNING("API did not return any citations, verify manually!")) break citations: List[Dict[str, Any]] = [d['citingPaper'] for d in data['data']] self.handle_objs(publication, citations, is_reference=False) # Handle limitation if there are more than 200 paper references if 'next' in data: offset = data['next'] else: break sleep(1) # Throttle except KeyboardInterrupt: raise CommandError("Aborted.")