diff --git a/sok/management/commands/repair.py b/sok/management/commands/repair.py index 0b5138f..bbf01cc 100644 --- a/sok/management/commands/repair.py +++ b/sok/management/commands/repair.py @@ -223,9 +223,10 @@ class Command(BaseCommand): obj.full_clean() obj.save() self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}") - else: print(publication) + else: + self.log_warn(f"Could not find semanticscholar ID for publication '{publication.title}' with DOI '{publication.doi}'") - sleep(2) # Throttle to avoid rate-limiting + sleep(1) # Throttle to avoid rate-limiting (1/s with API Key) def find_author_on_dblp( self, @@ -360,6 +361,7 @@ class Command(BaseCommand): parser.add_argument('-d', '--dblp', action='store_true') parser.add_argument('-a', '--authors', action='store_true') parser.add_argument('-s', '--secondary', action='store_true') + parser.add_argument('-i', '--scholarid', action='store_true') def handle(self, *args, **options): self.fix_references() @@ -367,6 +369,7 @@ class Command(BaseCommand): if options['dblp']: self.fix_dblp() self.find_missing_dois() + if options['scholarid'] or options['dblp']: self.find_semanticscholar_ids() if options['authors']: self.find_pid_for_authors() diff --git a/sok/management/commands/snowball.py b/sok/management/commands/snowball.py index a3356ec..2258030 100644 --- a/sok/management/commands/snowball.py +++ b/sok/management/commands/snowball.py @@ -6,6 +6,8 @@ from pathlib import Path from time import sleep from typing import Any, Dict, List, Set +from django.conf import settings + import requests from django.core.management.base import BaseCommand, CommandParser, CommandError @@ -24,7 +26,7 @@ from sok.models import ( ) -def semanticscholar(identifier: str, include_unknown_references: bool = False) -> Dict[str, Any]: +def semanticscholar(identifier: str, fields: str = None, type: str = None, offset: int = 0, include_unknown_references: bool = False) -> Dict[str, Any]: """ Retrieve information from the Semantic Scholar API. @@ -33,11 +35,24 @@ def semanticscholar(identifier: str, include_unknown_references: bool = False) - See: https://api.semanticscholar.org """ - url = f'https://api.semanticscholar.org/v1/paper/{identifier}' + url = f'https://api.semanticscholar.org/graph/v1/paper/{identifier}' params: Dict[str, Any] = dict() + + if type in ['citations', 'references']: + url += ('/' + type) + params['limit'] = '200' + params['offset'] = str(offset) + if include_unknown_references: params['include_unknown_references'] = 'true' - response = requests.get(url, params=params) + if fields: + params['fields'] = fields + + headers = { + 'x-api-key': settings.SCHOLAR_API_KEY + } + + response = requests.get(url, params=params, headers=headers) response.raise_for_status return response.json() @@ -48,7 +63,6 @@ class Command(BaseCommand): if bold: msg = self.style.HTTP_INFO(msg) tqdm.write(msg, end='\n' if nl else '') - #self.stdout.write(msg, ending='\n' if nl else '') def warn(self, msg: str): self.echo(self.style.WARNING(msg)) @@ -113,8 +127,9 @@ class Command(BaseCommand): base: Publication, is_reference: bool, ) -> Publication: - data = semanticscholar(paper_id) + data = semanticscholar(paper_id, fields='title,authors,year,abstract,externalIds') # Add authors to database + ## TODO: semantic scholar author ids nutzen? authors: List[Author] = [] first = True cite_key = '' @@ -145,7 +160,7 @@ class Command(BaseCommand): publication = None # Add publication to database - doi = data.get('doi', None) + doi = data.get('externalIds', None).get('DOI', None) if not publication: publication = Publication.objects.create( cite_key=cite_key, @@ -192,9 +207,11 @@ class Command(BaseCommand): title = "Reference" if is_reference else "Citation" if 0 < len(objs): self.echo(f"--- {title}s ---") - for obj in tqdm(objs, unit=title.lower()): + progress_iterator = tqdm(objs, unit=title.lower(), position=2, leave=False, desc=title + 's') + for obj in progress_iterator: if paper_id := obj.get('paperId', None): try: + # This publication already exists and has a semantic scholar entry existing = SemanticScholar.objects.get(paper_id=paper_id) if is_reference: self.add_reference(base, existing.publication) @@ -202,6 +219,7 @@ class Command(BaseCommand): self.add_reference(existing.publication, base, is_reference) continue except SemanticScholar.DoesNotExist: + # This publication already exists but does not have a semantic scholar entry if doi := obj.get('doi', None): try: publication = Publication.objects.get(doi=doi) @@ -215,6 +233,7 @@ class Command(BaseCommand): self.add_reference(new.publication, base, is_reference) continue except Publication.DoesNotExist: + # This publication does not exist so we need to create it pass identifier = self.get_identifier(obj) @@ -225,11 +244,12 @@ class Command(BaseCommand): paper_id = obj.get('paperId', None) while True: - self.echo("Ignore? [Y/n]", nl=True) - if paper_id is not None: + self.echo("Ignore? [Y/n]", nl=False) + if paper_id: self.echo(", Show abstract [a]", nl=False) self.echo(": ") choice = input().lower() + if choice in {'', 'y', 'yes'}: # Store choice self.cache.add(identifier) @@ -238,9 +258,10 @@ class Command(BaseCommand): break elif choice in {'a'}: assert paper_id is not None - data = semanticscholar(paper_id) - if abstract := data.get('abstract', None): + if abstract := obj.get('abstract', None): self.echo(abstract) + else: + self.echo('Sorry, there is no abstract for this publication on Semantic Scholar') elif choice in {'', 'n', 'no'}: # DONE Import? copied and adapted from PR if paper_id is not None: @@ -277,23 +298,44 @@ class Command(BaseCommand): semanticscholar__isnull=False, exclusion_criteria__isnull=True, ) + if stage < 10000: publications = [p for p in publications if p.stage_added() == stage] self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====") try: - for publication in tqdm(publications, unit="publication"): + progress_iterator = tqdm(publications, unit="publication", position=1, desc='Publications') + for publication in progress_iterator: self.echo(f"=== Publication {publication}: {publication.title} ===") for semantic in publication.semanticscholar_set.all(): - data = semanticscholar(semantic.paper_id) if not no_references: - references: List[Dict[str, Any]] = data['references'] - self.handle_objs(publication, references, is_reference=True) + offset = 0 + while True: + data = semanticscholar(semantic.paper_id, type='references', fields='title,abstract', offset=offset) + if not data.get('data', None): + self.echo(self.style.WARNING("API did not return any references, verify manually!")) + break + references: List[Dict[str, Any]] = [d['citedPaper'] for d in data['data']] + self.handle_objs(publication, references, is_reference=True) + # Handle limitation if there are more than 200 paper references + if 'next' in data: offset = data['next'] + else: break if not no_citations: - citations: List[Dict[str, Any]] = data['citations'] - self.handle_objs(publication, citations, is_reference=False) + offset = 0 + while True: + data = semanticscholar(semantic.paper_id, type='citations', fields='title,abstract', offset=offset) + if not data.get('data', None): + self.echo(self.style.WARNING("API did not return any citations, verify manually!")) + break + citations: List[Dict[str, Any]] = [d['citingPaper'] for d in data['data']] + self.handle_objs(publication, citations, is_reference=False) + # Handle limitation if there are more than 200 paper references + if 'next' in data: offset = data['next'] + else: break + + sleep(1) # Throttle - sleep(2) # Throttle except KeyboardInterrupt: raise CommandError("Aborted.") + diff --git a/sok/management/commands/zimport.py b/sok/management/commands/zimport.py index 3fcef20..15135c0 100644 --- a/sok/management/commands/zimport.py +++ b/sok/management/commands/zimport.py @@ -55,13 +55,13 @@ class Command(BaseCommand): # BaseCommand def add_arguments(self, parser: CommandParser): - parser.add_argument('--search-term', default=None) + parser.add_argument('--search-term', default='Not specified') parser.add_argument('--source', default='Zotero') parser.add_argument('zfile') @transaction.atomic def handle(self, *args, **options): - source = Source.objects.get_or_create(name=options['source']) + source, created = Source.objects.get_or_create(name=options['source']) search_term: Optional[SearchTerm] = None if name := options['search_term']: @@ -142,6 +142,7 @@ class Command(BaseCommand): # Assign sources if search_term is not None: for publication in publications: + print(publication, search_term, source) publication_source, created = PublicationSource.objects.get_or_create( source=source, publication=publication, diff --git a/sokman/settings.py b/sokman/settings.py index 60660e1..933c83b 100644 --- a/sokman/settings.py +++ b/sokman/settings.py @@ -32,9 +32,17 @@ def get_or_generate_key() -> str: assert path.exists() return path.read_text() +def get_api_key() -> str: + path = Path(Path(__file__).parent,'api.secret') + + assert path.exists() + return path.read_text() + # SECURITY WARNING: keep the secret key used in production secret! SECRET_KEY = get_or_generate_key() +SCHOLAR_API_KEY = get_api_key().strip('\n') + # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True