Compare commits

...

2 Commits

Author SHA1 Message Date
Maya Herrscher bc8a19ff83 Fix snowballing functionality 1 week ago
Maya Herrscher 0bc652f4d7 CSV import for Zotero files 1 week ago
  1. 7
      sok/management/commands/repair.py
  2. 74
      sok/management/commands/snowball.py
  3. 154
      sok/management/commands/zimport.py
  4. 14
      sokman/settings.py

7
sok/management/commands/repair.py

@ -223,9 +223,10 @@ class Command(BaseCommand):
obj.full_clean()
obj.save()
self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
else: print(publication)
else:
self.log_warn(f"Could not find semanticscholar ID for publication '{publication.title}' with DOI '{publication.doi}'")
sleep(2) # Throttle to avoid rate-limiting
sleep(1) # Throttle to avoid rate-limiting (1/s with API Key)
def find_author_on_dblp(
self,
@ -360,6 +361,7 @@ class Command(BaseCommand):
parser.add_argument('-d', '--dblp', action='store_true')
parser.add_argument('-a', '--authors', action='store_true')
parser.add_argument('-s', '--secondary', action='store_true')
parser.add_argument('-i', '--scholarid', action='store_true')
def handle(self, *args, **options):
self.fix_references()
@ -367,6 +369,7 @@ class Command(BaseCommand):
if options['dblp']:
self.fix_dblp()
self.find_missing_dois()
if options['scholarid'] or options['dblp']:
self.find_semanticscholar_ids()
if options['authors']:
self.find_pid_for_authors()

74
sok/management/commands/snowball.py

@ -6,6 +6,8 @@ from pathlib import Path
from time import sleep
from typing import Any, Dict, List, Set
from django.conf import settings
import requests
from django.core.management.base import BaseCommand, CommandParser, CommandError
@ -24,7 +26,7 @@ from sok.models import (
)
def semanticscholar(identifier: str, fields: str = None, type: str = None, offset: int = 0, include_unknown_references: bool = False) -> Dict[str, Any]:
    """
    Retrieve information from the Semantic Scholar API.

    :param identifier: paper identifier placed into the request URL.
    :param fields: optional comma-separated field list, sent as the
        ``fields`` query parameter when non-empty.
    :param type: ``'citations'`` or ``'references'`` to query the matching
        sub-resource of the paper; any other value queries the paper itself.
    :param offset: paging offset, sent only for the sub-resource requests.
    :param include_unknown_references: when true, adds
        ``include_unknown_references=true`` to the query.
    :return: the decoded JSON body of the response.
    :raises requests.HTTPError: if the API answers with an error status.

    See: https://api.semanticscholar.org
    """
    url = f'https://api.semanticscholar.org/graph/v1/paper/{identifier}'
    params: Dict[str, Any] = dict()

    if type in ['citations', 'references']:
        url += ('/' + type)
        # Paging parameters are only sent for the list-style sub-resources.
        # NOTE(review): nesting reconstructed from a de-indented diff - confirm.
        params['limit'] = '200'
        params['offset'] = str(offset)
    if include_unknown_references:
        params['include_unknown_references'] = 'true'
    if fields:
        params['fields'] = fields

    headers = {
        'x-api-key': settings.SCHOLAR_API_KEY
    }
    response = requests.get(url, params=params, headers=headers)
    # Must be *called*: the bare attribute reference was a no-op, so error
    # responses used to be returned to the caller as if they were data.
    response.raise_for_status()
    return response.json()
@ -48,7 +63,6 @@ class Command(BaseCommand):
if bold:
msg = self.style.HTTP_INFO(msg)
tqdm.write(msg, end='\n' if nl else '')
#self.stdout.write(msg, ending='\n' if nl else '')
def warn(self, msg: str):
self.echo(self.style.WARNING(msg))
@ -113,8 +127,9 @@ class Command(BaseCommand):
base: Publication,
is_reference: bool,
) -> Publication:
data = semanticscholar(paper_id)
data = semanticscholar(paper_id, fields='title,authors,year,abstract,externalIds')
# Add authors to database
## TODO: use Semantic Scholar author IDs?
authors: List[Author] = []
first = True
cite_key = ''
@ -145,7 +160,7 @@ class Command(BaseCommand):
publication = None
# Add publication to database
doi = data.get('doi', None)
doi = data.get('externalIds', None).get('DOI', None)
if not publication:
publication = Publication.objects.create(
cite_key=cite_key,
@ -192,9 +207,11 @@ class Command(BaseCommand):
title = "Reference" if is_reference else "Citation"
if 0 < len(objs):
self.echo(f"--- {title}s ---")
for obj in tqdm(objs, unit=title.lower()):
progress_iterator = tqdm(objs, unit=title.lower(), position=2, leave=False, desc=title + 's')
for obj in progress_iterator:
if paper_id := obj.get('paperId', None):
try:
# This publication already exists and has a semantic scholar entry
existing = SemanticScholar.objects.get(paper_id=paper_id)
if is_reference:
self.add_reference(base, existing.publication)
@ -202,6 +219,7 @@ class Command(BaseCommand):
self.add_reference(existing.publication, base, is_reference)
continue
except SemanticScholar.DoesNotExist:
# This publication already exists but does not have a semantic scholar entry
if doi := obj.get('doi', None):
try:
publication = Publication.objects.get(doi=doi)
@ -215,6 +233,7 @@ class Command(BaseCommand):
self.add_reference(new.publication, base, is_reference)
continue
except Publication.DoesNotExist:
# This publication does not exist so we need to create it
pass
identifier = self.get_identifier(obj)
@ -225,11 +244,12 @@ class Command(BaseCommand):
paper_id = obj.get('paperId', None)
while True:
self.echo("Ignore? [Y/n]", nl=True)
if paper_id is not None:
self.echo("Ignore? [Y/n]", nl=False)
if paper_id:
self.echo(", Show abstract [a]", nl=False)
self.echo(": ")
choice = input().lower()
if choice in {'', 'y', 'yes'}:
# Store choice
self.cache.add(identifier)
@ -238,9 +258,10 @@ class Command(BaseCommand):
break
elif choice in {'a'}:
assert paper_id is not None
data = semanticscholar(paper_id)
if abstract := data.get('abstract', None):
if abstract := obj.get('abstract', None):
self.echo(abstract)
else:
self.echo('Sorry, there is no abstract for this publication on Semantic Scholar')
elif choice in {'', 'n', 'no'}:
# DONE Import? copied and adapted from PR
if paper_id is not None:
@ -277,23 +298,44 @@ class Command(BaseCommand):
semanticscholar__isnull=False,
exclusion_criteria__isnull=True,
)
if stage < 10000:
publications = [p for p in publications if p.stage_added() == stage]
self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====")
try:
for publication in tqdm(publications, unit="publication"):
progress_iterator = tqdm(publications, unit="publication", position=1, desc='Publications')
for publication in progress_iterator:
self.echo(f"=== Publication {publication}: {publication.title} ===")
for semantic in publication.semanticscholar_set.all():
data = semanticscholar(semantic.paper_id)
if not no_references:
references: List[Dict[str, Any]] = data['references']
offset = 0
while True:
data = semanticscholar(semantic.paper_id, type='references', fields='title,abstract', offset=offset)
if not data.get('data', None):
self.echo(self.style.WARNING("API did not return any references, verify manually!"))
break
references: List[Dict[str, Any]] = [d['citedPaper'] for d in data['data']]
self.handle_objs(publication, references, is_reference=True)
# Handle limitation if there are more than 200 paper references
if 'next' in data: offset = data['next']
else: break
if not no_citations:
citations: List[Dict[str, Any]] = data['citations']
offset = 0
while True:
data = semanticscholar(semantic.paper_id, type='citations', fields='title,abstract', offset=offset)
if not data.get('data', None):
self.echo(self.style.WARNING("API did not return any citations, verify manually!"))
break
citations: List[Dict[str, Any]] = [d['citingPaper'] for d in data['data']]
self.handle_objs(publication, citations, is_reference=False)
# Handle limitation if there are more than 200 paper references
if 'next' in data: offset = data['next']
else: break
sleep(1) # Throttle
sleep(2) # Throttle
except KeyboardInterrupt:
raise CommandError("Aborted.")

154
sok/management/commands/zimport.py

@ -0,0 +1,154 @@
import html
import io
import string
import csv
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple, Union
import requests
from django.db import transaction
from django.core.management.base import BaseCommand, CommandParser, CommandError
from sok.models import (
Author,
Publication,
PublicationAuthor,
PublicationSource,
PublicationTag,
SearchTerm,
Source,
Tag,
)
# TODO: adapt for WoS?
# Set of record-type names.
# NOTE(review): nothing else in this module references PUBLICATIONS; presumably
# it was carried over from another import command - confirm before relying on it.
PUBLICATIONS = {
'article',
'inproceedings',
'proceedings',
'book',
'incollection',
'phdthesis',
'mastersthesis',
'www',
'person',
'data',
}
# Cite-key prefix constant.
# NOTE(review): the Command class in this module stores the CSV 'Key' column as
# cite_key without applying this prefix - confirm whether that is intended.
CITE_KEY_PREFIX = 'Z:'
class Command(BaseCommand):
    """
    Import publications from a CSV file (``zfile``) into the database.

    For every CSV row the command creates (or reuses) the authors, tags and
    the publication itself, links them together, and finally links every
    imported publication to the given source and search term.

    Columns read from the CSV: ``Key``, ``Title``, ``Publication Year``,
    ``Author``, ``Manual Tags``, ``Automatic Tags``, ``Pages``, ``DOI`` and
    ``Abstract Note``.
    """

    def log_success(self, msg: str):
        """Write *msg* to stdout using the SUCCESS style."""
        self.stdout.write(self.style.SUCCESS(msg))

    def log_info(self, msg: str, nl: bool = True):
        """Write *msg* to stdout using the HTTP_INFO style and flush.

        A trailing newline is emitted only when *nl* is true.
        """
        self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '')
        self.stdout.flush()

    # Helpers

    @staticmethod
    def _split_multi(value: Optional[str]) -> List[str]:
        """Split a ``'; '``-separated cell into unique, non-empty entries.

        The original order is preserved (first occurrence wins), which matters
        for authors because their index is stored as the author position.
        """
        entries = (entry.strip() for entry in (value or '').split('; '))
        return list(dict.fromkeys(entry for entry in entries if entry))

    @staticmethod
    def _parse_pages(pages: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
        """Return ``(first_page, last_page)`` for a ``'first-last'`` cell.

        Cells without a hyphen yield ``(None, None)``. Empty segments (as in
        ``'12-'`` or ``'12--34'``) never end up as an empty-string page: the
        first and last segments are used and empty ones become ``None``.
        """
        if not pages or '-' not in pages:
            return None, None
        segments = [segment.strip() for segment in pages.split('-')]
        return segments[0] or None, segments[-1] or None

    def _get_authors(self, publ: Dict[str, Any]) -> List[Author]:
        """Create or look up the authors of a CSV row, in CSV order."""
        authors: List[Author] = []
        for name in self._split_multi(publ['Author']):
            author, created = Author.objects.get_or_create(name=name)
            if created:
                self.log_success(f"Added author: {author}")
            else:
                self.log_info(f"Author '{author}' already known")
            authors.append(author)
        return authors

    def _get_tags(self, publ: Dict[str, Any]) -> List[Tag]:
        """Create or look up the manual and automatic tags of a CSV row."""
        names = dict.fromkeys(
            self._split_multi(publ['Manual Tags'])
            + self._split_multi(publ['Automatic Tags'])
        )
        tags: List[Tag] = []
        for tag_name in names:
            tag, created = Tag.objects.get_or_create(name=tag_name)
            if created:
                self.log_success(f"Added tag: {tag}")
            else:
                self.log_info(f"Tag '{tag}' already exists")
            tags.append(tag)
        return tags

    def _import_row(self, publ: Dict[str, Any]) -> Publication:
        """Import one CSV row and return the resulting publication."""
        authors = self._get_authors(publ)
        tags = self._get_tags(publ)
        first_page, last_page = self._parse_pages(publ['Pages'])

        # Add publication to database
        publication, created = Publication.objects.get_or_create(
            cite_key=publ['Key'],
            title=publ['Title'],
            year=publ['Publication Year'],
            # TODO: peer_reviewed=result.is_peer_reviewed,
            first_page=first_page,
            last_page=last_page,
            doi=publ['DOI'] or None,
            abstract=publ['Abstract Note'] or None,
        )
        if created:
            self.log_success(f"Added publication: {publication}")
        else:
            self.log_info(f"Publication '{publication}' already known")

        # Assign authors
        for position, author in enumerate(authors):
            _, created = PublicationAuthor.objects.get_or_create(
                author=author,
                publication=publication,
                position=position,
            )
            if created:
                self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
            else:
                self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")

        # Assign tags
        for tag in tags:
            _, created = PublicationTag.objects.get_or_create(
                tag=tag,
                publication=publication,
            )
            if created:
                self.log_success(f"Assigned tag '{tag}' to publication '{publication}'")
            else:
                self.log_info(f"Tag '{tag}' already assigned to publication '{publication}'")

        return publication

    # BaseCommand

    def add_arguments(self, parser: CommandParser):
        """Register ``--search-term``, ``--source`` and the positional ``zfile``."""
        parser.add_argument('--search-term', default='Not specified')
        parser.add_argument('--source', default='Zotero')
        parser.add_argument('zfile')

    @transaction.atomic
    def handle(self, *args, **options):
        """Run the import inside a single database transaction."""
        source, _ = Source.objects.get_or_create(name=options['source'])

        search_term: Optional[SearchTerm] = None
        if name := options['search_term']:
            search_term, created = SearchTerm.objects.get_or_create(name=name)
            if created:
                self.log_success(f"Created search term: {search_term}")

        publications: List[Publication] = []
        # newline='' is what the csv module requires for correct handling of
        # quoted line breaks. 'utf-8-sig' reads plain UTF-8 and also drops a
        # leading BOM, which would otherwise corrupt the first column name.
        # NOTE(review): assumes the export is UTF-8 encoded - confirm.
        with open(options['zfile'], 'r', newline='', encoding='utf-8-sig') as csvfile:
            for publ in csv.DictReader(csvfile):
                publications.append(self._import_row(publ))

        # Assign sources
        if search_term is not None:
            for publication in publications:
                _, created = PublicationSource.objects.get_or_create(
                    source=source,
                    publication=publication,
                    search_term=search_term,
                )
                if created:
                    self.log_success(f"Assigned source '{source}' to publication '{publication}' with search term '{search_term}'")
                else:
                    self.log_info(f"Source '{source}' already assigned to publication '{publication}' with search term '{search_term}'")

14
sokman/settings.py

@ -32,9 +32,17 @@ def get_or_generate_key() -> str:
assert path.exists()
return path.read_text()
def get_api_key() -> str:
    """
    Read the API key from the ``api.secret`` file located next to this module.

    :return: the file content with surrounding whitespace (including any
        ``\\r\\n`` line ending) removed.
    :raises FileNotFoundError: if ``api.secret`` does not exist.
    """
    path = Path(__file__).parent / 'api.secret'
    # Explicit check instead of ``assert``: assertions are removed when Python
    # runs with -O, and a clear message helps when the secret was not deployed.
    if not path.is_file():
        raise FileNotFoundError(f"API key file not found: {path}")
    return path.read_text(encoding='utf-8').strip()
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = get_or_generate_key()
SCHOLAR_API_KEY = get_api_key().strip('\n')
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
@ -89,9 +97,13 @@ WSGI_APPLICATION = 'sokman.wsgi.application'
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'jens.sqlite3',
},
'uc_sok': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
},
}

Loading…
Cancel
Save