|
|
@ -6,6 +6,8 @@ from pathlib import Path |
|
|
from time import sleep |
|
|
from time import sleep |
|
|
from typing import Any, Dict, List, Set |
|
|
from typing import Any, Dict, List, Set |
|
|
|
|
|
|
|
|
|
|
|
from django.conf import settings |
|
|
|
|
|
|
|
|
import requests |
|
|
import requests |
|
|
|
|
|
|
|
|
from django.core.management.base import BaseCommand, CommandParser, CommandError |
|
|
from django.core.management.base import BaseCommand, CommandParser, CommandError |
|
|
@ -24,7 +26,7 @@ from sok.models import ( |
|
|
) |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def semanticscholar(
    identifier: str,
    fields: str = None,
    type: str = None,
    offset: int = 0,
    include_unknown_references: bool = False,
    timeout: float = 30.0,
) -> Dict[str, Any]:
    """
    Retrieve information from the Semantic Scholar Graph API.

    Without ``type`` the paper record itself is requested. With ``type`` set
    to ``'citations'`` or ``'references'`` the matching sub-resource is
    requested, one page of at most 200 entries at a time.

    :param identifier: Paper identifier, placed verbatim into the request URL.
    :param fields: Comma-separated field list sent as the ``fields`` query
        parameter; left out of the request when empty or ``None``.
    :param type: ``'citations'`` or ``'references'`` to query that
        sub-resource; any other value requests the paper itself. The name
        shadows the builtin but is kept because callers pass it by keyword.
    :param offset: Start index of the requested page; only sent together with
        a ``type`` of ``'citations'`` or ``'references'``.
    :param include_unknown_references: Adds ``include_unknown_references=true``
        to the query string when set.
    :param timeout: Seconds to wait for the server before giving up, so a
        stalled connection cannot block the caller forever.
    :return: The decoded JSON body of the response.
    :raises requests.HTTPError: If the API answers with a 4xx/5xx status.

    See: https://api.semanticscholar.org
    """
    url = f'https://api.semanticscholar.org/graph/v1/paper/{identifier}'
    params: Dict[str, Any] = {}

    if type in ('citations', 'references'):
        url = f'{url}/{type}'
        # Fixed page size; callers advance ``offset`` to fetch further pages.
        params['limit'] = '200'
        params['offset'] = str(offset)

    if include_unknown_references:
        # NOTE(review): this flag predates the switch to the Graph API URL;
        # confirm the Graph API still honours it.
        params['include_unknown_references'] = 'true'
    if fields:
        params['fields'] = fields

    headers = {'x-api-key': settings.SCHOLAR_API_KEY}

    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    # Must be *called*: the bare attribute access used previously was a no-op,
    # so error responses were handed to callers as if they were data.
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
|
|
@ -48,7 +63,6 @@ class Command(BaseCommand): |
|
|
if bold: |
|
|
if bold: |
|
|
msg = self.style.HTTP_INFO(msg) |
|
|
msg = self.style.HTTP_INFO(msg) |
|
|
tqdm.write(msg, end='\n' if nl else '') |
|
|
tqdm.write(msg, end='\n' if nl else '') |
|
|
#self.stdout.write(msg, ending='\n' if nl else '') |
|
|
|
|
|
|
|
|
|
|
|
def warn(self, msg: str): |
|
|
def warn(self, msg: str): |
|
|
self.echo(self.style.WARNING(msg)) |
|
|
self.echo(self.style.WARNING(msg)) |
|
|
@ -113,8 +127,9 @@ class Command(BaseCommand): |
|
|
base: Publication, |
|
|
base: Publication, |
|
|
is_reference: bool, |
|
|
is_reference: bool, |
|
|
) -> Publication: |
|
|
) -> Publication: |
|
|
data = semanticscholar(paper_id) |
|
|
data = semanticscholar(paper_id, fields='title,authors,year,abstract,externalIds') |
|
|
# Add authors to database |
|
|
# Add authors to database |
|
|
|
|
|
## TODO: use Semantic Scholar author IDs? |
|
|
authors: List[Author] = [] |
|
|
authors: List[Author] = [] |
|
|
first = True |
|
|
first = True |
|
|
cite_key = '' |
|
|
cite_key = '' |
|
|
@ -145,7 +160,7 @@ class Command(BaseCommand): |
|
|
publication = None |
|
|
publication = None |
|
|
|
|
|
|
|
|
# Add publication to database |
|
|
# Add publication to database |
|
|
doi = data.get('doi', None) |
|
|
doi = data.get('externalIds', None).get('DOI', None) |
|
|
if not publication: |
|
|
if not publication: |
|
|
publication = Publication.objects.create( |
|
|
publication = Publication.objects.create( |
|
|
cite_key=cite_key, |
|
|
cite_key=cite_key, |
|
|
@ -192,9 +207,11 @@ class Command(BaseCommand): |
|
|
title = "Reference" if is_reference else "Citation" |
|
|
title = "Reference" if is_reference else "Citation" |
|
|
if 0 < len(objs): |
|
|
if 0 < len(objs): |
|
|
self.echo(f"--- {title}s ---") |
|
|
self.echo(f"--- {title}s ---") |
|
|
for obj in tqdm(objs, unit=title.lower()): |
|
|
progress_iterator = tqdm(objs, unit=title.lower(), position=2, leave=False, desc=title + 's') |
|
|
|
|
|
for obj in progress_iterator: |
|
|
if paper_id := obj.get('paperId', None): |
|
|
if paper_id := obj.get('paperId', None): |
|
|
try: |
|
|
try: |
|
|
|
|
|
# This publication already exists and has a semantic scholar entry |
|
|
existing = SemanticScholar.objects.get(paper_id=paper_id) |
|
|
existing = SemanticScholar.objects.get(paper_id=paper_id) |
|
|
if is_reference: |
|
|
if is_reference: |
|
|
self.add_reference(base, existing.publication) |
|
|
self.add_reference(base, existing.publication) |
|
|
@ -202,6 +219,7 @@ class Command(BaseCommand): |
|
|
self.add_reference(existing.publication, base, is_reference) |
|
|
self.add_reference(existing.publication, base, is_reference) |
|
|
continue |
|
|
continue |
|
|
except SemanticScholar.DoesNotExist: |
|
|
except SemanticScholar.DoesNotExist: |
|
|
|
|
|
# This publication already exists but does not have a semantic scholar entry |
|
|
if doi := obj.get('doi', None): |
|
|
if doi := obj.get('doi', None): |
|
|
try: |
|
|
try: |
|
|
publication = Publication.objects.get(doi=doi) |
|
|
publication = Publication.objects.get(doi=doi) |
|
|
@ -215,6 +233,7 @@ class Command(BaseCommand): |
|
|
self.add_reference(new.publication, base, is_reference) |
|
|
self.add_reference(new.publication, base, is_reference) |
|
|
continue |
|
|
continue |
|
|
except Publication.DoesNotExist: |
|
|
except Publication.DoesNotExist: |
|
|
|
|
|
# This publication does not exist so we need to create it |
|
|
pass |
|
|
pass |
|
|
|
|
|
|
|
|
identifier = self.get_identifier(obj) |
|
|
identifier = self.get_identifier(obj) |
|
|
@ -225,11 +244,12 @@ class Command(BaseCommand): |
|
|
|
|
|
|
|
|
paper_id = obj.get('paperId', None) |
|
|
paper_id = obj.get('paperId', None) |
|
|
while True: |
|
|
while True: |
|
|
self.echo("Ignore? [Y/n]", nl=True) |
|
|
self.echo("Ignore? [Y/n]", nl=False) |
|
|
if paper_id is not None: |
|
|
if paper_id: |
|
|
self.echo(", Show abstract [a]", nl=False) |
|
|
self.echo(", Show abstract [a]", nl=False) |
|
|
self.echo(": ") |
|
|
self.echo(": ") |
|
|
choice = input().lower() |
|
|
choice = input().lower() |
|
|
|
|
|
|
|
|
if choice in {'', 'y', 'yes'}: |
|
|
if choice in {'', 'y', 'yes'}: |
|
|
# Store choice |
|
|
# Store choice |
|
|
self.cache.add(identifier) |
|
|
self.cache.add(identifier) |
|
|
@ -238,9 +258,10 @@ class Command(BaseCommand): |
|
|
break |
|
|
break |
|
|
elif choice in {'a'}: |
|
|
elif choice in {'a'}: |
|
|
assert paper_id is not None |
|
|
assert paper_id is not None |
|
|
data = semanticscholar(paper_id) |
|
|
if abstract := obj.get('abstract', None): |
|
|
if abstract := data.get('abstract', None): |
|
|
|
|
|
self.echo(abstract) |
|
|
self.echo(abstract) |
|
|
|
|
|
else: |
|
|
|
|
|
self.echo('Sorry, there is no abstract for this publication on Semantic Scholar') |
|
|
elif choice in {'', 'n', 'no'}: |
|
|
elif choice in {'', 'n', 'no'}: |
|
|
# DONE Import? copied and adapted from PR |
|
|
# DONE Import? copied and adapted from PR |
|
|
if paper_id is not None: |
|
|
if paper_id is not None: |
|
|
@ -277,23 +298,44 @@ class Command(BaseCommand): |
|
|
semanticscholar__isnull=False, |
|
|
semanticscholar__isnull=False, |
|
|
exclusion_criteria__isnull=True, |
|
|
exclusion_criteria__isnull=True, |
|
|
) |
|
|
) |
|
|
|
|
|
|
|
|
if stage < 10000: |
|
|
if stage < 10000: |
|
|
publications = [p for p in publications if p.stage_added() == stage] |
|
|
publications = [p for p in publications if p.stage_added() == stage] |
|
|
self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====") |
|
|
self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====") |
|
|
try: |
|
|
try: |
|
|
for publication in tqdm(publications, unit="publication"): |
|
|
progress_iterator = tqdm(publications, unit="publication", position=1, desc='Publications') |
|
|
|
|
|
for publication in progress_iterator: |
|
|
self.echo(f"=== Publication {publication}: {publication.title} ===") |
|
|
self.echo(f"=== Publication {publication}: {publication.title} ===") |
|
|
for semantic in publication.semanticscholar_set.all(): |
|
|
for semantic in publication.semanticscholar_set.all(): |
|
|
data = semanticscholar(semantic.paper_id) |
|
|
|
|
|
|
|
|
|
|
|
if not no_references: |
|
|
if not no_references: |
|
|
references: List[Dict[str, Any]] = data['references'] |
|
|
offset = 0 |
|
|
|
|
|
while True: |
|
|
|
|
|
data = semanticscholar(semantic.paper_id, type='references', fields='title,abstract', offset=offset) |
|
|
|
|
|
if not data.get('data', None): |
|
|
|
|
|
self.echo(self.style.WARNING("API did not return any references, verify manually!")) |
|
|
|
|
|
break |
|
|
|
|
|
references: List[Dict[str, Any]] = [d['citedPaper'] for d in data['data']] |
|
|
self.handle_objs(publication, references, is_reference=True) |
|
|
self.handle_objs(publication, references, is_reference=True) |
|
|
|
|
|
# Follow pagination: the API returns at most 200 references per request |
|
|
|
|
|
if 'next' in data: offset = data['next'] |
|
|
|
|
|
else: break |
|
|
|
|
|
|
|
|
if not no_citations: |
|
|
if not no_citations: |
|
|
citations: List[Dict[str, Any]] = data['citations'] |
|
|
offset = 0 |
|
|
|
|
|
while True: |
|
|
|
|
|
data = semanticscholar(semantic.paper_id, type='citations', fields='title,abstract', offset=offset) |
|
|
|
|
|
if not data.get('data', None): |
|
|
|
|
|
self.echo(self.style.WARNING("API did not return any citations, verify manually!")) |
|
|
|
|
|
break |
|
|
|
|
|
citations: List[Dict[str, Any]] = [d['citingPaper'] for d in data['data']] |
|
|
self.handle_objs(publication, citations, is_reference=False) |
|
|
self.handle_objs(publication, citations, is_reference=False) |
|
|
|
|
|
# Follow pagination: the API returns at most 200 citations per request |
|
|
|
|
|
if 'next' in data: offset = data['next'] |
|
|
|
|
|
else: break |
|
|
|
|
|
|
|
|
|
|
|
sleep(1) # Throttle |
|
|
|
|
|
|
|
|
sleep(2) # Throttle |
|
|
|
|
|
except KeyboardInterrupt: |
|
|
except KeyboardInterrupt: |
|
|
raise CommandError("Aborted.") |
|
|
raise CommandError("Aborted.") |
|
|
|
|
|
|
|
|
|