Basically a fork of https://github.com/blochberger/sokman, but with the intention of adding a visual interface as well.
import hashlib
import json
import pickle

from pathlib import Path
from time import sleep
from typing import Any, Dict, List, Optional, Set

import requests

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError, CommandParser
from django.db import transaction
from tqdm import tqdm

from sok.models import (
    Author,
    Publication,
    PublicationAuthor,
    PublicationReference,
    PublicationSource,
    SemanticScholar,
    Source,
)
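
# This management command crawls Semantic Scholar for the references and
# citations of every publication that is already linked to a Semantic Scholar
# entry. Papers already in the database are linked automatically; unknown
# papers are shown interactively so they can be ignored (the choice is cached
# across runs) or imported.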


def semanticscholar(
    identifier: str,
    fields: Optional[str] = None,
    type: Optional[str] = None,
    offset: int = 0,
    include_unknown_references: bool = False,
) -> Dict[str, Any]:
    """
    Retrieve information from the Semantic Scholar API.

    The identifier can be a DOI or the Semantic Scholar paper ID.

    See: https://api.semanticscholar.org
    """
    url = f'https://api.semanticscholar.org/graph/v1/paper/{identifier}'
    params: Dict[str, Any] = dict()
    if type in ['citations', 'references']:
        url += '/' + type
        params['limit'] = '200'
        params['offset'] = str(offset)
        if include_unknown_references:
            params['include_unknown_references'] = 'true'
    if fields:
        params['fields'] = fields
    headers = {
        'x-api-key': settings.SCHOLAR_API_KEY,
    }
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    return response.json()
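

# Example (illustrative only, not executed anywhere in this module): fetching
# the first page of references for a paper. The identifier below is a
# placeholder; per the docstring it may be a DOI or a Semantic Scholar paper ID.
#
#   data = semanticscholar(
#       '<paper-id-or-doi>',          # placeholder identifier
#       fields='title,abstract',
#       type='references',
#   )
#   for entry in data.get('data', []):
#       print(entry['citedPaper'].get('title'))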


class Command(BaseCommand):

    def echo(self, msg: str, bold: bool = False, nl: bool = True):
        if bold:
            msg = self.style.HTTP_INFO(msg)
        tqdm.write(msg, end='\n' if nl else '')

    def warn(self, msg: str):
        self.echo(self.style.WARNING(msg))

    def add_reference(
        self,
        publication: Publication,
        reference: Publication,
        is_reference: bool = True,
    ):
        try:
            rel = PublicationReference.objects.get(
                publication=publication,
                reference=reference,
            )
            if is_reference:
                self.echo(f"Reference already known: {rel.identifier} {reference}")
            else:
                self.echo(f"Citation already known: {rel.identifier} {publication}")
        except PublicationReference.DoesNotExist:
            rel = PublicationReference(
                publication=publication,
                reference=reference,
            )
            rel.full_clean()
            rel.save()
            if is_reference:
                self.echo(f"Added reference: {reference}")
            else:
                self.echo(f"Added citation: {publication}")

    def display(self, obj: Dict[str, Any]):
        self.echo("")
        authors = [author['name'] for author in obj['authors']]
        title = obj['title']
        self.echo(" " + ", ".join(authors))
        self.echo(f" {title}", bold=True, nl=False)
        if year := obj.get('year', None):
            self.echo(f" ({year})")
        else:
            self.echo("")
        if venue := obj.get('venue', None):
            self.echo(f" {venue}")
        if doi := obj.get('doi', None):
            self.echo(f" {doi}")
        if paper_id := obj.get('paperId', None):
            self.echo(f" {paper_id}")

    def get_identifier(self, obj: Dict[str, Any]) -> str:
        # Papers without a Semantic Scholar ID are identified by a stable hash
        # of their JSON representation, so that ignore-choices can still be cached.
        if paper_id := obj.get('paperId', None):
            return paper_id
        raw = json.dumps(obj, sort_keys=True)
        hasher = hashlib.blake2b()
        hasher.update(raw.encode())
        return hasher.hexdigest()
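
    # add_publ creates the publication, its authors, and the Semantic Scholar
    # link inside a single database transaction, so a failure part-way through
    # does not leave partial rows behind.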
    @transaction.atomic
    def add_publ(
        self,
        paper_id: str,
        base: Publication,
        is_reference: bool,
    ) -> Publication:
        data = semanticscholar(paper_id, fields='title,authors,year,abstract,externalIds')

        # Add authors to the database
        # TODO: use Semantic Scholar author IDs?
        authors: List[Author] = []
        for author in data.get('authors', []):
            if name := author.get('name', None):
                author, created = Author.objects.get_or_create(name=name)
                if created:
                    self.echo(f"Added author: {author}")
                else:
                    self.echo(f"Author '{author}' already known")
                authors.append(author)

        # Build a cite key of the form <last name><year><first title word>,
        # e.g. "smith2020towards"
        cite_key = ''
        if authors:
            cite_key = authors[0].name.split(' ')[-1]
        cite_key += str(data.get('year'))
        title = data.get('title', '')
        cite_key += title.split(' ')[0]
        cite_key = cite_key.lower()

        # Disambiguate the cite key if it is already taken by a different publication
        try:
            for i in range(10):
                publication = Publication.objects.get(cite_key=cite_key)
                if publication.title == title:
                    break
                else:
                    cite_key += '_1'
        except Publication.DoesNotExist:
            publication = None

        # Add the publication to the database
        doi = (data.get('externalIds') or {}).get('DOI', None)
        if not publication:
            publication = Publication.objects.create(
                cite_key=cite_key,
                title=title,
                year=data.get('year', 0),
                peer_reviewed=None,
                doi=doi,
                abstract=data.get('abstract', None),
            )
            self.echo(f"Added publication: {publication}")
        else:
            self.echo(f"Publication '{publication}' already known")

        # Assign authors, de-duplicated but preserving the original author order
        for position, author in enumerate(dict.fromkeys(authors)):
            publication_author, created = PublicationAuthor.objects.get_or_create(
                author=author,
                publication=publication,
                position=position,
            )
            if created:
                self.echo(f"Assigned author '{author}' to publication '{publication}' at position {position}")
            else:
                self.echo(f"Author '{author}' already assigned to publication '{publication}' at position {position}")

        # Link the publication to its Semantic Scholar entry
        new, created = SemanticScholar.objects.get_or_create(paper_id=paper_id, publication=publication)
        if created:
            new.full_clean()
            new.save()
            self.echo(f"New Semantic Scholar entry: {paper_id}")
        if is_reference:
            self.add_reference(base, new.publication)
        else:
            self.add_reference(new.publication, base, is_reference)
        return new.publication
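
    # handle_objs walks one page of reference or citation entries: entries that
    # are already linked are connected directly, everything else is shown
    # interactively so the user can ignore it (remembered via the pickle cache)
    # or import it via add_publ.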
    def handle_objs(
        self,
        base: Publication,
        objs: List[Dict[str, Any]],
        is_reference: bool,
    ):
        title = "Reference" if is_reference else "Citation"
        if 0 < len(objs):
            self.echo(f"--- {title}s ---")
        progress_iterator = tqdm(objs, unit=title.lower(), position=2, leave=False, desc=title + 's')
        for obj in progress_iterator:
            if paper_id := obj.get('paperId', None):
                try:
                    # The publication already exists and has a Semantic Scholar entry
                    existing = SemanticScholar.objects.get(paper_id=paper_id)
                    if is_reference:
                        self.add_reference(base, existing.publication)
                    else:
                        self.add_reference(existing.publication, base, is_reference)
                    continue
                except SemanticScholar.DoesNotExist:
                    # The publication might already exist without a Semantic Scholar entry
                    if doi := obj.get('doi', None):
                        try:
                            publication = Publication.objects.get(doi=doi)
                            new = SemanticScholar(paper_id=paper_id, publication=publication)
                            new.full_clean()
                            new.save()
                            self.echo(f"New Semantic Scholar entry: {paper_id}")
                            if is_reference:
                                self.add_reference(base, new.publication)
                            else:
                                self.add_reference(new.publication, base, is_reference)
                            continue
                        except Publication.DoesNotExist:
                            # The publication does not exist yet, ask the user below
                            pass

            identifier = self.get_identifier(obj)
            if identifier in self.cache:
                continue
            self.display(obj)

            paper_id = obj.get('paperId', None)
            while True:
                self.echo("Ignore? [Y/n]", nl=False)
                if paper_id:
                    self.echo(", Show abstract [a]", nl=False)
                self.echo(": ")
                choice = input().lower()
                if choice in {'', 'y', 'yes'}:
                    # Remember the choice so the paper is not shown again
                    self.cache.add(identifier)
                    with self.cache_path.open('wb') as f:
                        pickle.dump(self.cache, f)
                    break
                elif choice == 'a':
                    assert paper_id is not None
                    if abstract := obj.get('abstract', None):
                        self.echo(abstract)
                    else:
                        self.echo('Sorry, there is no abstract for this publication on Semantic Scholar')
                elif choice in {'n', 'no'}:
                    # Import the publication (copied and adapted from PR)
                    if paper_id is not None:
                        self.add_publ(paper_id, base, is_reference)
                    else:
                        self.warn("Could not add this paper, please do it manually!")
                    break

    # BaseCommand

    def add_arguments(self, parser: CommandParser):
        parser.add_argument('--reset-choices', action='store_true')
        parser.add_argument('--no-references', action='store_true')
        parser.add_argument('--no-citations', action='store_true')
        parser.add_argument('-s', '--stage', type=int, default=10000)
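
    # Example invocation (illustrative; the actual command name is the file
    # name of this module under sok/management/commands/):
    #
    #   python manage.py <command_name> --stage 2 --no-citations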

    def handle(self, *args, **options):
        reset_choices: bool = options['reset_choices']
        no_citations: bool = options['no_citations']
        no_references: bool = options['no_references']
        stage: int = options['stage']

        self.cache_path = Path('.choices.semanticscholar.pickle')
        self.cache: Set[str] = set()
        if reset_choices:
            self.cache_path.unlink(missing_ok=True)
        elif self.cache_path.exists():
            self.echo("Loading previous choices (reset with --reset-choices)...", nl=False)
            with self.cache_path.open('rb') as f:
                self.cache = pickle.load(f)
            self.echo("done", bold=True)

        publications = Publication.objects.filter(
            semanticscholar__isnull=False,
            exclusion_criteria__isnull=True,
        )
        if stage < 10000:
            publications = [p for p in publications if p.stage_added() == stage]
            self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====")

        try:
            progress_iterator = tqdm(publications, unit="publication", position=1, desc='Publications')
            for publication in progress_iterator:
                self.echo(f"=== Publication {publication}: {publication.title} ===")
                for semantic in publication.semanticscholar_set.all():
                    if not no_references:
                        offset = 0
                        while True:
                            data = semanticscholar(semantic.paper_id, type='references', fields='title,abstract', offset=offset)
                            if not data.get('data', None):
                                self.echo(self.style.WARNING("API did not return any references, verify manually!"))
                                break
                            references: List[Dict[str, Any]] = [d['citedPaper'] for d in data['data']]
                            self.handle_objs(publication, references, is_reference=True)
                            # Paginate if there are more than 200 references
                            if 'next' in data:
                                offset = data['next']
                            else:
                                break
                    if not no_citations:
                        offset = 0
                        while True:
                            data = semanticscholar(semantic.paper_id, type='citations', fields='title,abstract', offset=offset)
                            if not data.get('data', None):
                                self.echo(self.style.WARNING("API did not return any citations, verify manually!"))
                                break
                            citations: List[Dict[str, Any]] = [d['citingPaper'] for d in data['data']]
                            self.handle_objs(publication, citations, is_reference=False)
                            # Paginate if there are more than 200 citations
                            if 'next' in data:
                                offset = data['next']
                            else:
                                break
                    sleep(1)  # Throttle API requests
        except KeyboardInterrupt:
            raise CommandError("Aborted.")