Browse Source

Fix snowballing functionality

webofscience
Maya Herrscher 1 week ago
parent
commit
bc8a19ff83
  1. 7
      sok/management/commands/repair.py
  2. 78
      sok/management/commands/snowball.py
  3. 5
      sok/management/commands/zimport.py
  4. 8
      sokman/settings.py

7
sok/management/commands/repair.py

@ -223,9 +223,10 @@ class Command(BaseCommand):
obj.full_clean()
obj.save()
self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
else: print(publication)
else:
self.log_warn(f"Could not find semanticscholar ID for publication '{publication.title}' with DOI '{publication.doi}'")
sleep(2) # Throttle to avoid rate-limiting
sleep(1) # Throttle to avoid rate-limiting (1/s with API Key)
def find_author_on_dblp(
self,
@ -360,6 +361,7 @@ class Command(BaseCommand):
parser.add_argument('-d', '--dblp', action='store_true')
parser.add_argument('-a', '--authors', action='store_true')
parser.add_argument('-s', '--secondary', action='store_true')
parser.add_argument('-i', '--scholarid', action='store_true')
def handle(self, *args, **options):
self.fix_references()
@ -367,6 +369,7 @@ class Command(BaseCommand):
if options['dblp']:
self.fix_dblp()
self.find_missing_dois()
if options['scholarid'] or options['dblp']:
self.find_semanticscholar_ids()
if options['authors']:
self.find_pid_for_authors()

78
sok/management/commands/snowball.py

@ -6,6 +6,8 @@ from pathlib import Path
from time import sleep
from typing import Any, Dict, List, Set
from django.conf import settings
import requests
from django.core.management.base import BaseCommand, CommandParser, CommandError
@ -24,7 +26,7 @@ from sok.models import (
)
def semanticscholar(identifier: str, include_unknown_references: bool = False) -> Dict[str, Any]:
def semanticscholar(identifier: str, fields: str = None, type: str = None, offset: int = 0, include_unknown_references: bool = False) -> Dict[str, Any]:
"""
Retrieve information from the Semantic Scholar API.
@ -33,11 +35,24 @@ def semanticscholar(identifier: str, include_unknown_references: bool = False) -
See: https://api.semanticscholar.org
"""
url = f'https://api.semanticscholar.org/v1/paper/{identifier}'
url = f'https://api.semanticscholar.org/graph/v1/paper/{identifier}'
params: Dict[str, Any] = dict()
if type in ['citations', 'references']:
url += ('/' + type)
params['limit'] = '200'
params['offset'] = str(offset)
if include_unknown_references:
params['include_unknown_references'] = 'true'
response = requests.get(url, params=params)
if fields:
params['fields'] = fields
headers = {
'x-api-key': settings.SCHOLAR_API_KEY
}
response = requests.get(url, params=params, headers=headers)
response.raise_for_status
return response.json()
@ -48,7 +63,6 @@ class Command(BaseCommand):
if bold:
msg = self.style.HTTP_INFO(msg)
tqdm.write(msg, end='\n' if nl else '')
#self.stdout.write(msg, ending='\n' if nl else '')
def warn(self, msg: str):
self.echo(self.style.WARNING(msg))
@ -113,8 +127,9 @@ class Command(BaseCommand):
base: Publication,
is_reference: bool,
) -> Publication:
data = semanticscholar(paper_id)
data = semanticscholar(paper_id, fields='title,authors,year,abstract,externalIds')
# Add authors to database
## TODO: use Semantic Scholar author IDs?
authors: List[Author] = []
first = True
cite_key = ''
@ -145,7 +160,7 @@ class Command(BaseCommand):
publication = None
# Add publication to database
doi = data.get('doi', None)
doi = data.get('externalIds', None).get('DOI', None)
if not publication:
publication = Publication.objects.create(
cite_key=cite_key,
@ -192,9 +207,11 @@ class Command(BaseCommand):
title = "Reference" if is_reference else "Citation"
if 0 < len(objs):
self.echo(f"--- {title}s ---")
for obj in tqdm(objs, unit=title.lower()):
progress_iterator = tqdm(objs, unit=title.lower(), position=2, leave=False, desc=title + 's')
for obj in progress_iterator:
if paper_id := obj.get('paperId', None):
try:
# This publication already exists and has a semantic scholar entry
existing = SemanticScholar.objects.get(paper_id=paper_id)
if is_reference:
self.add_reference(base, existing.publication)
@ -202,6 +219,7 @@ class Command(BaseCommand):
self.add_reference(existing.publication, base, is_reference)
continue
except SemanticScholar.DoesNotExist:
# This publication already exists but does not have a semantic scholar entry
if doi := obj.get('doi', None):
try:
publication = Publication.objects.get(doi=doi)
@ -215,6 +233,7 @@ class Command(BaseCommand):
self.add_reference(new.publication, base, is_reference)
continue
except Publication.DoesNotExist:
# This publication does not exist so we need to create it
pass
identifier = self.get_identifier(obj)
@ -225,11 +244,12 @@ class Command(BaseCommand):
paper_id = obj.get('paperId', None)
while True:
self.echo("Ignore? [Y/n]", nl=True)
if paper_id is not None:
self.echo("Ignore? [Y/n]", nl=False)
if paper_id:
self.echo(", Show abstract [a]", nl=False)
self.echo(": ")
choice = input().lower()
if choice in {'', 'y', 'yes'}:
# Store choice
self.cache.add(identifier)
@ -238,9 +258,10 @@ class Command(BaseCommand):
break
elif choice in {'a'}:
assert paper_id is not None
data = semanticscholar(paper_id)
if abstract := data.get('abstract', None):
if abstract := obj.get('abstract', None):
self.echo(abstract)
else:
self.echo('Sorry, there is no abstract for this publication on Semantic Scholar')
elif choice in {'', 'n', 'no'}:
# DONE Import? copied and adapted from PR
if paper_id is not None:
@ -277,23 +298,44 @@ class Command(BaseCommand):
semanticscholar__isnull=False,
exclusion_criteria__isnull=True,
)
if stage < 10000:
publications = [p for p in publications if p.stage_added() == stage]
self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====")
try:
for publication in tqdm(publications, unit="publication"):
progress_iterator = tqdm(publications, unit="publication", position=1, desc='Publications')
for publication in progress_iterator:
self.echo(f"=== Publication {publication}: {publication.title} ===")
for semantic in publication.semanticscholar_set.all():
data = semanticscholar(semantic.paper_id)
if not no_references:
references: List[Dict[str, Any]] = data['references']
self.handle_objs(publication, references, is_reference=True)
offset = 0
while True:
data = semanticscholar(semantic.paper_id, type='references', fields='title,abstract', offset=offset)
if not data.get('data', None):
self.echo(self.style.WARNING("API did not return any references, verify manually!"))
break
references: List[Dict[str, Any]] = [d['citedPaper'] for d in data['data']]
self.handle_objs(publication, references, is_reference=True)
# Handle limitation if there are more than 200 paper references
if 'next' in data: offset = data['next']
else: break
if not no_citations:
citations: List[Dict[str, Any]] = data['citations']
self.handle_objs(publication, citations, is_reference=False)
offset = 0
while True:
data = semanticscholar(semantic.paper_id, type='citations', fields='title,abstract', offset=offset)
if not data.get('data', None):
self.echo(self.style.WARNING("API did not return any citations, verify manually!"))
break
citations: List[Dict[str, Any]] = [d['citingPaper'] for d in data['data']]
self.handle_objs(publication, citations, is_reference=False)
# Handle limitation if there are more than 200 paper citations
if 'next' in data: offset = data['next']
else: break
sleep(1) # Throttle
sleep(2) # Throttle
except KeyboardInterrupt:
raise CommandError("Aborted.")

5
sok/management/commands/zimport.py

@ -55,13 +55,13 @@ class Command(BaseCommand):
# BaseCommand
def add_arguments(self, parser: CommandParser):
parser.add_argument('--search-term', default=None)
parser.add_argument('--search-term', default='Not specified')
parser.add_argument('--source', default='Zotero')
parser.add_argument('zfile')
@transaction.atomic
def handle(self, *args, **options):
source = Source.objects.get_or_create(name=options['source'])
source, created = Source.objects.get_or_create(name=options['source'])
search_term: Optional[SearchTerm] = None
if name := options['search_term']:
@ -142,6 +142,7 @@ class Command(BaseCommand):
# Assign sources
if search_term is not None:
for publication in publications:
print(publication, search_term, source)
publication_source, created = PublicationSource.objects.get_or_create(
source=source,
publication=publication,

8
sokman/settings.py

@ -32,9 +32,17 @@ def get_or_generate_key() -> str:
assert path.exists()
return path.read_text()
def get_api_key() -> str:
    """
    Read the API key from the `api.secret` file.

    The file is looked up in the directory that contains this module.
    The content is returned unmodified (including any trailing newline);
    stripping is left to the caller.

    Raises:
        FileNotFoundError: If `api.secret` does not exist.
    """
    path = Path(Path(__file__).parent, 'api.secret')
    # Explicit check instead of `assert`: assertions are removed when Python
    # runs with -O, and a missing key file is a configuration error that
    # deserves a clear message naming the expected location.
    if not path.exists():
        raise FileNotFoundError(f"API key file not found: {path}")
    return path.read_text()
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = get_or_generate_key()
SCHOLAR_API_KEY = get_api_key().strip('\n')
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

Loading…
Cancel
Save