Browse Source

Fix snowballing functionality

webofscience
Maya Herrscher 1 week ago
parent
commit
bc8a19ff83
  1. 7
      sok/management/commands/repair.py
  2. 74
      sok/management/commands/snowball.py
  3. 5
      sok/management/commands/zimport.py
  4. 8
      sokman/settings.py

7
sok/management/commands/repair.py

@ -223,9 +223,10 @@ class Command(BaseCommand):
obj.full_clean() obj.full_clean()
obj.save() obj.save()
self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}") self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
else: print(publication) else:
self.log_warn(f"Could not find semanticscholar ID for publication '{publication.title}' with DOI '{publication.doi}'")
sleep(2) # Throttle to avoid rate-limiting sleep(1) # Throttle to avoid rate-limiting (1/s with API Key)
def find_author_on_dblp( def find_author_on_dblp(
self, self,
@ -360,6 +361,7 @@ class Command(BaseCommand):
parser.add_argument('-d', '--dblp', action='store_true') parser.add_argument('-d', '--dblp', action='store_true')
parser.add_argument('-a', '--authors', action='store_true') parser.add_argument('-a', '--authors', action='store_true')
parser.add_argument('-s', '--secondary', action='store_true') parser.add_argument('-s', '--secondary', action='store_true')
parser.add_argument('-i', '--scholarid', action='store_true')
def handle(self, *args, **options): def handle(self, *args, **options):
self.fix_references() self.fix_references()
@ -367,6 +369,7 @@ class Command(BaseCommand):
if options['dblp']: if options['dblp']:
self.fix_dblp() self.fix_dblp()
self.find_missing_dois() self.find_missing_dois()
if options['scholarid'] or options['dblp']:
self.find_semanticscholar_ids() self.find_semanticscholar_ids()
if options['authors']: if options['authors']:
self.find_pid_for_authors() self.find_pid_for_authors()

74
sok/management/commands/snowball.py

@ -6,6 +6,8 @@ from pathlib import Path
from time import sleep from time import sleep
from typing import Any, Dict, List, Set from typing import Any, Dict, List, Set
from django.conf import settings
import requests import requests
from django.core.management.base import BaseCommand, CommandParser, CommandError from django.core.management.base import BaseCommand, CommandParser, CommandError
@ -24,7 +26,7 @@ from sok.models import (
) )
def semanticscholar(identifier: str, include_unknown_references: bool = False) -> Dict[str, Any]: def semanticscholar(identifier: str, fields: str = None, type: str = None, offset: int = 0, include_unknown_references: bool = False) -> Dict[str, Any]:
""" """
Retrieve information from the Semantic Scholar API. Retrieve information from the Semantic Scholar API.
@ -33,11 +35,24 @@ def semanticscholar(identifier: str, include_unknown_references: bool = False) -
See: https://api.semanticscholar.org See: https://api.semanticscholar.org
""" """
url = f'https://api.semanticscholar.org/v1/paper/{identifier}' url = f'https://api.semanticscholar.org/graph/v1/paper/{identifier}'
params: Dict[str, Any] = dict() params: Dict[str, Any] = dict()
if type in ['citations', 'references']:
url += ('/' + type)
params['limit'] = '200'
params['offset'] = str(offset)
if include_unknown_references: if include_unknown_references:
params['include_unknown_references'] = 'true' params['include_unknown_references'] = 'true'
response = requests.get(url, params=params) if fields:
params['fields'] = fields
headers = {
'x-api-key': settings.SCHOLAR_API_KEY
}
response = requests.get(url, params=params, headers=headers)
response.raise_for_status response.raise_for_status
return response.json() return response.json()
@ -48,7 +63,6 @@ class Command(BaseCommand):
if bold: if bold:
msg = self.style.HTTP_INFO(msg) msg = self.style.HTTP_INFO(msg)
tqdm.write(msg, end='\n' if nl else '') tqdm.write(msg, end='\n' if nl else '')
#self.stdout.write(msg, ending='\n' if nl else '')
def warn(self, msg: str): def warn(self, msg: str):
self.echo(self.style.WARNING(msg)) self.echo(self.style.WARNING(msg))
@ -113,8 +127,9 @@ class Command(BaseCommand):
base: Publication, base: Publication,
is_reference: bool, is_reference: bool,
) -> Publication: ) -> Publication:
data = semanticscholar(paper_id) data = semanticscholar(paper_id, fields='title,authors,year,abstract,externalIds')
# Add authors to database # Add authors to database
## TODO: use Semantic Scholar author IDs?
authors: List[Author] = [] authors: List[Author] = []
first = True first = True
cite_key = '' cite_key = ''
@ -145,7 +160,7 @@ class Command(BaseCommand):
publication = None publication = None
# Add publication to database # Add publication to database
doi = data.get('doi', None) doi = data.get('externalIds', None).get('DOI', None)
if not publication: if not publication:
publication = Publication.objects.create( publication = Publication.objects.create(
cite_key=cite_key, cite_key=cite_key,
@ -192,9 +207,11 @@ class Command(BaseCommand):
title = "Reference" if is_reference else "Citation" title = "Reference" if is_reference else "Citation"
if 0 < len(objs): if 0 < len(objs):
self.echo(f"--- {title}s ---") self.echo(f"--- {title}s ---")
for obj in tqdm(objs, unit=title.lower()): progress_iterator = tqdm(objs, unit=title.lower(), position=2, leave=False, desc=title + 's')
for obj in progress_iterator:
if paper_id := obj.get('paperId', None): if paper_id := obj.get('paperId', None):
try: try:
# This publication already exists and has a semantic scholar entry
existing = SemanticScholar.objects.get(paper_id=paper_id) existing = SemanticScholar.objects.get(paper_id=paper_id)
if is_reference: if is_reference:
self.add_reference(base, existing.publication) self.add_reference(base, existing.publication)
@ -202,6 +219,7 @@ class Command(BaseCommand):
self.add_reference(existing.publication, base, is_reference) self.add_reference(existing.publication, base, is_reference)
continue continue
except SemanticScholar.DoesNotExist: except SemanticScholar.DoesNotExist:
# This publication already exists but does not have a semantic scholar entry
if doi := obj.get('doi', None): if doi := obj.get('doi', None):
try: try:
publication = Publication.objects.get(doi=doi) publication = Publication.objects.get(doi=doi)
@ -215,6 +233,7 @@ class Command(BaseCommand):
self.add_reference(new.publication, base, is_reference) self.add_reference(new.publication, base, is_reference)
continue continue
except Publication.DoesNotExist: except Publication.DoesNotExist:
# This publication does not exist so we need to create it
pass pass
identifier = self.get_identifier(obj) identifier = self.get_identifier(obj)
@ -225,11 +244,12 @@ class Command(BaseCommand):
paper_id = obj.get('paperId', None) paper_id = obj.get('paperId', None)
while True: while True:
self.echo("Ignore? [Y/n]", nl=True) self.echo("Ignore? [Y/n]", nl=False)
if paper_id is not None: if paper_id:
self.echo(", Show abstract [a]", nl=False) self.echo(", Show abstract [a]", nl=False)
self.echo(": ") self.echo(": ")
choice = input().lower() choice = input().lower()
if choice in {'', 'y', 'yes'}: if choice in {'', 'y', 'yes'}:
# Store choice # Store choice
self.cache.add(identifier) self.cache.add(identifier)
@ -238,9 +258,10 @@ class Command(BaseCommand):
break break
elif choice in {'a'}: elif choice in {'a'}:
assert paper_id is not None assert paper_id is not None
data = semanticscholar(paper_id) if abstract := obj.get('abstract', None):
if abstract := data.get('abstract', None):
self.echo(abstract) self.echo(abstract)
else:
self.echo('Sorry, there is no abstract for this publication on Semantic Scholar')
elif choice in {'', 'n', 'no'}: elif choice in {'', 'n', 'no'}:
# DONE Import? copied and adapted from PR # DONE Import? copied and adapted from PR
if paper_id is not None: if paper_id is not None:
@ -277,23 +298,44 @@ class Command(BaseCommand):
semanticscholar__isnull=False, semanticscholar__isnull=False,
exclusion_criteria__isnull=True, exclusion_criteria__isnull=True,
) )
if stage < 10000: if stage < 10000:
publications = [p for p in publications if p.stage_added() == stage] publications = [p for p in publications if p.stage_added() == stage]
self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====") self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====")
try: try:
for publication in tqdm(publications, unit="publication"): progress_iterator = tqdm(publications, unit="publication", position=1, desc='Publications')
for publication in progress_iterator:
self.echo(f"=== Publication {publication}: {publication.title} ===") self.echo(f"=== Publication {publication}: {publication.title} ===")
for semantic in publication.semanticscholar_set.all(): for semantic in publication.semanticscholar_set.all():
data = semanticscholar(semantic.paper_id)
if not no_references: if not no_references:
references: List[Dict[str, Any]] = data['references'] offset = 0
while True:
data = semanticscholar(semantic.paper_id, type='references', fields='title,abstract', offset=offset)
if not data.get('data', None):
self.echo(self.style.WARNING("API did not return any references, verify manually!"))
break
references: List[Dict[str, Any]] = [d['citedPaper'] for d in data['data']]
self.handle_objs(publication, references, is_reference=True) self.handle_objs(publication, references, is_reference=True)
# Handle limitation if there are more than 200 paper references
if 'next' in data: offset = data['next']
else: break
if not no_citations: if not no_citations:
citations: List[Dict[str, Any]] = data['citations'] offset = 0
while True:
data = semanticscholar(semantic.paper_id, type='citations', fields='title,abstract', offset=offset)
if not data.get('data', None):
self.echo(self.style.WARNING("API did not return any citations, verify manually!"))
break
citations: List[Dict[str, Any]] = [d['citingPaper'] for d in data['data']]
self.handle_objs(publication, citations, is_reference=False) self.handle_objs(publication, citations, is_reference=False)
# Handle pagination if there are more than 200 citations
if 'next' in data: offset = data['next']
else: break
sleep(1) # Throttle
sleep(2) # Throttle
except KeyboardInterrupt: except KeyboardInterrupt:
raise CommandError("Aborted.") raise CommandError("Aborted.")

5
sok/management/commands/zimport.py

@ -55,13 +55,13 @@ class Command(BaseCommand):
# BaseCommand # BaseCommand
def add_arguments(self, parser: CommandParser): def add_arguments(self, parser: CommandParser):
parser.add_argument('--search-term', default=None) parser.add_argument('--search-term', default='Not specified')
parser.add_argument('--source', default='Zotero') parser.add_argument('--source', default='Zotero')
parser.add_argument('zfile') parser.add_argument('zfile')
@transaction.atomic @transaction.atomic
def handle(self, *args, **options): def handle(self, *args, **options):
source = Source.objects.get_or_create(name=options['source']) source, created = Source.objects.get_or_create(name=options['source'])
search_term: Optional[SearchTerm] = None search_term: Optional[SearchTerm] = None
if name := options['search_term']: if name := options['search_term']:
@ -142,6 +142,7 @@ class Command(BaseCommand):
# Assign sources # Assign sources
if search_term is not None: if search_term is not None:
for publication in publications: for publication in publications:
print(publication, search_term, source)
publication_source, created = PublicationSource.objects.get_or_create( publication_source, created = PublicationSource.objects.get_or_create(
source=source, source=source,
publication=publication, publication=publication,

8
sokman/settings.py

@ -32,9 +32,17 @@ def get_or_generate_key() -> str:
assert path.exists() assert path.exists()
return path.read_text() return path.read_text()
def get_api_key() -> str:
    """Return the raw contents of the ``api.secret`` file next to this module.

    The text is returned unmodified, including any trailing newline; the
    caller is expected to strip it as needed.

    Raises:
        FileNotFoundError: If ``api.secret`` does not exist.
    """
    path = Path(__file__).parent / 'api.secret'
    # Explicit check rather than ``assert``: assertions are removed when
    # Python runs with -O, which would silently skip this validation.
    if not path.exists():
        raise FileNotFoundError(
            f"API key file '{path}' is missing; create it and put the API key inside."
        )
    return path.read_text()
# SECURITY WARNING: keep the secret key used in production secret! # SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = get_or_generate_key() SECRET_KEY = get_or_generate_key()
SCHOLAR_API_KEY = get_api_key().strip('\n')
# SECURITY WARNING: don't run with debug turned on in production! # SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True DEBUG = True

Loading…
Cancel
Save