Compare commits

...

2 Commits

Author SHA1 Message Date
Maya Herrscher bc8a19ff83 Fix snowballing functionality 1 week ago
Maya Herrscher 0bc652f4d7 CSV import for Zotero files 1 week ago
  1. 7
      sok/management/commands/repair.py
  2. 74
      sok/management/commands/snowball.py
  3. 154
      sok/management/commands/zimport.py
  4. 14
      sokman/settings.py

7
sok/management/commands/repair.py

@ -223,9 +223,10 @@ class Command(BaseCommand):
obj.full_clean() obj.full_clean()
obj.save() obj.save()
self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}") self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
else: print(publication) else:
self.log_warn(f"Could not find semanticscholar ID for publication '{publication.title}' with DOI '{publication.doi}'")
sleep(2) # Throttle to avoid rate-limiting sleep(1) # Throttle to avoid rate-limiting (1/s with API Key)
def find_author_on_dblp( def find_author_on_dblp(
self, self,
@ -360,6 +361,7 @@ class Command(BaseCommand):
parser.add_argument('-d', '--dblp', action='store_true') parser.add_argument('-d', '--dblp', action='store_true')
parser.add_argument('-a', '--authors', action='store_true') parser.add_argument('-a', '--authors', action='store_true')
parser.add_argument('-s', '--secondary', action='store_true') parser.add_argument('-s', '--secondary', action='store_true')
parser.add_argument('-i', '--scholarid', action='store_true')
def handle(self, *args, **options): def handle(self, *args, **options):
self.fix_references() self.fix_references()
@ -367,6 +369,7 @@ class Command(BaseCommand):
if options['dblp']: if options['dblp']:
self.fix_dblp() self.fix_dblp()
self.find_missing_dois() self.find_missing_dois()
if options['scholarid'] or options['dblp']:
self.find_semanticscholar_ids() self.find_semanticscholar_ids()
if options['authors']: if options['authors']:
self.find_pid_for_authors() self.find_pid_for_authors()

74
sok/management/commands/snowball.py

@ -6,6 +6,8 @@ from pathlib import Path
from time import sleep from time import sleep
from typing import Any, Dict, List, Set from typing import Any, Dict, List, Set
from django.conf import settings
import requests import requests
from django.core.management.base import BaseCommand, CommandParser, CommandError from django.core.management.base import BaseCommand, CommandParser, CommandError
@ -24,7 +26,7 @@ from sok.models import (
) )
def semanticscholar(identifier: str, include_unknown_references: bool = False) -> Dict[str, Any]: def semanticscholar(identifier: str, fields: str = None, type: str = None, offset: int = 0, include_unknown_references: bool = False) -> Dict[str, Any]:
""" """
Retrieve information from the Semantic Scholar API. Retrieve information from the Semantic Scholar API.
@ -33,11 +35,24 @@ def semanticscholar(identifier: str, include_unknown_references: bool = False) -
See: https://api.semanticscholar.org See: https://api.semanticscholar.org
""" """
url = f'https://api.semanticscholar.org/v1/paper/{identifier}' url = f'https://api.semanticscholar.org/graph/v1/paper/{identifier}'
params: Dict[str, Any] = dict() params: Dict[str, Any] = dict()
if type in ['citations', 'references']:
url += ('/' + type)
params['limit'] = '200'
params['offset'] = str(offset)
if include_unknown_references: if include_unknown_references:
params['include_unknown_references'] = 'true' params['include_unknown_references'] = 'true'
response = requests.get(url, params=params) if fields:
params['fields'] = fields
headers = {
'x-api-key': settings.SCHOLAR_API_KEY
}
response = requests.get(url, params=params, headers=headers)
response.raise_for_status response.raise_for_status
return response.json() return response.json()
@ -48,7 +63,6 @@ class Command(BaseCommand):
if bold: if bold:
msg = self.style.HTTP_INFO(msg) msg = self.style.HTTP_INFO(msg)
tqdm.write(msg, end='\n' if nl else '') tqdm.write(msg, end='\n' if nl else '')
#self.stdout.write(msg, ending='\n' if nl else '')
def warn(self, msg: str): def warn(self, msg: str):
self.echo(self.style.WARNING(msg)) self.echo(self.style.WARNING(msg))
@ -113,8 +127,9 @@ class Command(BaseCommand):
base: Publication, base: Publication,
is_reference: bool, is_reference: bool,
) -> Publication: ) -> Publication:
data = semanticscholar(paper_id) data = semanticscholar(paper_id, fields='title,authors,year,abstract,externalIds')
# Add authors to database # Add authors to database
# TODO: use Semantic Scholar author IDs?
authors: List[Author] = [] authors: List[Author] = []
first = True first = True
cite_key = '' cite_key = ''
@ -145,7 +160,7 @@ class Command(BaseCommand):
publication = None publication = None
# Add publication to database # Add publication to database
doi = data.get('doi', None) doi = data.get('externalIds', None).get('DOI', None)
if not publication: if not publication:
publication = Publication.objects.create( publication = Publication.objects.create(
cite_key=cite_key, cite_key=cite_key,
@ -192,9 +207,11 @@ class Command(BaseCommand):
title = "Reference" if is_reference else "Citation" title = "Reference" if is_reference else "Citation"
if 0 < len(objs): if 0 < len(objs):
self.echo(f"--- {title}s ---") self.echo(f"--- {title}s ---")
for obj in tqdm(objs, unit=title.lower()): progress_iterator = tqdm(objs, unit=title.lower(), position=2, leave=False, desc=title + 's')
for obj in progress_iterator:
if paper_id := obj.get('paperId', None): if paper_id := obj.get('paperId', None):
try: try:
# This publication already exists and has a semantic scholar entry
existing = SemanticScholar.objects.get(paper_id=paper_id) existing = SemanticScholar.objects.get(paper_id=paper_id)
if is_reference: if is_reference:
self.add_reference(base, existing.publication) self.add_reference(base, existing.publication)
@ -202,6 +219,7 @@ class Command(BaseCommand):
self.add_reference(existing.publication, base, is_reference) self.add_reference(existing.publication, base, is_reference)
continue continue
except SemanticScholar.DoesNotExist: except SemanticScholar.DoesNotExist:
# This publication already exists but does not have a semantic scholar entry
if doi := obj.get('doi', None): if doi := obj.get('doi', None):
try: try:
publication = Publication.objects.get(doi=doi) publication = Publication.objects.get(doi=doi)
@ -215,6 +233,7 @@ class Command(BaseCommand):
self.add_reference(new.publication, base, is_reference) self.add_reference(new.publication, base, is_reference)
continue continue
except Publication.DoesNotExist: except Publication.DoesNotExist:
# This publication does not exist so we need to create it
pass pass
identifier = self.get_identifier(obj) identifier = self.get_identifier(obj)
@ -225,11 +244,12 @@ class Command(BaseCommand):
paper_id = obj.get('paperId', None) paper_id = obj.get('paperId', None)
while True: while True:
self.echo("Ignore? [Y/n]", nl=True) self.echo("Ignore? [Y/n]", nl=False)
if paper_id is not None: if paper_id:
self.echo(", Show abstract [a]", nl=False) self.echo(", Show abstract [a]", nl=False)
self.echo(": ") self.echo(": ")
choice = input().lower() choice = input().lower()
if choice in {'', 'y', 'yes'}: if choice in {'', 'y', 'yes'}:
# Store choice # Store choice
self.cache.add(identifier) self.cache.add(identifier)
@ -238,9 +258,10 @@ class Command(BaseCommand):
break break
elif choice in {'a'}: elif choice in {'a'}:
assert paper_id is not None assert paper_id is not None
data = semanticscholar(paper_id) if abstract := obj.get('abstract', None):
if abstract := data.get('abstract', None):
self.echo(abstract) self.echo(abstract)
else:
self.echo('Sorry, there is no abstract for this publication on Semantic Scholar')
elif choice in {'', 'n', 'no'}: elif choice in {'', 'n', 'no'}:
# DONE Import? copied and adapted from PR # DONE Import? copied and adapted from PR
if paper_id is not None: if paper_id is not None:
@ -277,23 +298,44 @@ class Command(BaseCommand):
semanticscholar__isnull=False, semanticscholar__isnull=False,
exclusion_criteria__isnull=True, exclusion_criteria__isnull=True,
) )
if stage < 10000: if stage < 10000:
publications = [p for p in publications if p.stage_added() == stage] publications = [p for p in publications if p.stage_added() == stage]
self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====") self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====")
try: try:
for publication in tqdm(publications, unit="publication"): progress_iterator = tqdm(publications, unit="publication", position=1, desc='Publications')
for publication in progress_iterator:
self.echo(f"=== Publication {publication}: {publication.title} ===") self.echo(f"=== Publication {publication}: {publication.title} ===")
for semantic in publication.semanticscholar_set.all(): for semantic in publication.semanticscholar_set.all():
data = semanticscholar(semantic.paper_id)
if not no_references: if not no_references:
references: List[Dict[str, Any]] = data['references'] offset = 0
while True:
data = semanticscholar(semantic.paper_id, type='references', fields='title,abstract', offset=offset)
if not data.get('data', None):
self.echo(self.style.WARNING("API did not return any references, verify manually!"))
break
references: List[Dict[str, Any]] = [d['citedPaper'] for d in data['data']]
self.handle_objs(publication, references, is_reference=True) self.handle_objs(publication, references, is_reference=True)
# Handle limitation if there are more than 200 paper references
if 'next' in data: offset = data['next']
else: break
if not no_citations: if not no_citations:
citations: List[Dict[str, Any]] = data['citations'] offset = 0
while True:
data = semanticscholar(semantic.paper_id, type='citations', fields='title,abstract', offset=offset)
if not data.get('data', None):
self.echo(self.style.WARNING("API did not return any citations, verify manually!"))
break
citations: List[Dict[str, Any]] = [d['citingPaper'] for d in data['data']]
self.handle_objs(publication, citations, is_reference=False) self.handle_objs(publication, citations, is_reference=False)
# Handle pagination if there are more than 200 citations
if 'next' in data: offset = data['next']
else: break
sleep(1) # Throttle
sleep(2) # Throttle
except KeyboardInterrupt: except KeyboardInterrupt:
raise CommandError("Aborted.") raise CommandError("Aborted.")

154
sok/management/commands/zimport.py

@ -0,0 +1,154 @@
import html
import io
import string
import csv
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple, Union
import requests
from django.db import transaction
from django.core.management.base import BaseCommand, CommandParser, CommandError
from sok.models import (
Author,
Publication,
PublicationAuthor,
PublicationSource,
PublicationTag,
SearchTerm,
Source,
Tag,
)
# TODO: adapt for WoS?
# NOTE(review): neither constant below is referenced by the code visible in
# this file -- confirm whether they are still needed.

# Record types that count as publications.
PUBLICATIONS = {
    'article', 'inproceedings', 'proceedings', 'book', 'incollection',
    'phdthesis', 'mastersthesis',
    'www', 'person', 'data',
}

# Prefix intended for cite keys of imported entries.
CITE_KEY_PREFIX = 'Z:'
class Command(BaseCommand):
    """Import publications, authors and tags from a Zotero CSV export.

    Every CSV row becomes (or is matched to) a ``Publication``; its authors
    and tags are created on demand and linked to it. Afterwards every imported
    publication is linked to the given source and search term. The whole
    import runs in a single database transaction.
    """

    help = "Import publications from a Zotero CSV export."

    # Columns of the Zotero CSV export that the importer reads.
    REQUIRED_COLUMNS = (
        'Key',
        'Title',
        'Publication Year',
        'Author',
        'Pages',
        'DOI',
        'Abstract Note',
        'Manual Tags',
        'Automatic Tags',
    )

    def log_success(self, msg: str):
        """Write ``msg`` to stdout using the SUCCESS style."""
        self.stdout.write(self.style.SUCCESS(msg))

    def log_info(self, msg: str, nl: bool = True):
        """Write ``msg`` to stdout using the HTTP_INFO style and flush.

        Args:
            msg: Text to print.
            nl: Whether to terminate the message with a newline.
        """
        self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '')
        self.stdout.flush()

    # Parsing helpers

    @staticmethod
    def _split_values(raw: Optional[str]) -> List[str]:
        """Split a ``'; '``-separated cell into unique, non-empty values.

        The original order is preserved (first occurrence wins). This matters
        for authors, whose list index is stored as their position.
        """
        stripped = (part.strip() for part in (raw or '').split('; '))
        return list(dict.fromkeys(part for part in stripped if part))

    @staticmethod
    def _parse_pages(raw: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
        """Return ``(first_page, last_page)`` from a ``first-last`` cell.

        Cells without a dash yield ``(None, None)``. An empty side of the
        range yields ``None`` for that side instead of an empty string.
        """
        # Treat an en dash like a plain hyphen.
        text = (raw or '').replace('\u2013', '-')
        if '-' not in text:
            return None, None
        parts = text.split('-')
        return parts[0].strip() or None, parts[1].strip() or None

    def _read_rows(self, path: str) -> List[Dict[str, Any]]:
        """Read all rows of the CSV file at ``path``.

        Raises:
            CommandError: If the file cannot be read or decoded, or if it
                lacks one of ``REQUIRED_COLUMNS``.
        """
        try:
            # newline='' is what the csv module documents for file objects;
            # 'utf-8-sig' reads plain UTF-8 and drops a leading BOM if present.
            with open(path, newline='', encoding='utf-8-sig') as csvfile:
                reader = csv.DictReader(csvfile)
                header = reader.fieldnames or []
                missing = [col for col in self.REQUIRED_COLUMNS if col not in header]
                if missing:
                    raise CommandError(
                        f"'{path}' is missing required column(s): {', '.join(missing)}"
                    )
                return list(reader)
        except (OSError, UnicodeDecodeError) as err:
            raise CommandError(f"Could not read '{path}': {err}") from err

    # Import steps

    def _import_row(self, row: Dict[str, Any]) -> Publication:
        """Create or look up the publication described by ``row`` and link
        its authors and tags. Returns the publication."""
        authors: List[Author] = []
        for name in self._split_values(row['Author']):
            author, created = Author.objects.get_or_create(name=name)
            if created:
                self.log_success(f"Added author: {author}")
            else:
                self.log_info(f"Author '{author}' already known")
            authors.append(author)

        tags: List[Tag] = []
        tag_names = self._split_values(row['Manual Tags']) + self._split_values(row['Automatic Tags'])
        for tag_name in dict.fromkeys(tag_names):
            tag, created = Tag.objects.get_or_create(name=tag_name)
            if created:
                self.log_success(f"Added tag: {tag}")
            else:
                self.log_info(f"Tag '{tag}' already exists")
            tags.append(tag)

        first_page, last_page = self._parse_pages(row['Pages'])

        # Add publication to database
        # NOTE(review): every field is part of the lookup, so a row whose
        # cite key already exists with different data is treated as new --
        # confirm whether matching on cite_key with ``defaults=`` is wanted.
        publication, created = Publication.objects.get_or_create(
            cite_key=row['Key'],
            title=row['Title'],
            year=row['Publication Year'],
            # TODO: peer_reviewed=result.is_peer_reviewed,
            first_page=first_page,
            last_page=last_page,
            doi=row['DOI'] or None,
            abstract=row['Abstract Note'] or None,
        )
        if created:
            self.log_success(f"Added publication: {publication}")
        else:
            self.log_info(f"Publication '{publication}' already known")

        # Assign authors
        for position, author in enumerate(authors):
            _, created = PublicationAuthor.objects.get_or_create(
                author=author,
                publication=publication,
                position=position,
            )
            if created:
                self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
            else:
                self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")

        # Assign tags
        for tag in tags:
            _, created = PublicationTag.objects.get_or_create(
                tag=tag,
                publication=publication,
            )
            if created:
                self.log_success(f"Assigned tag '{tag}' to publication '{publication}'")
            else:
                self.log_info(f"Tag '{tag}' already assigned to publication '{publication}'")

        return publication

    # BaseCommand

    def add_arguments(self, parser: CommandParser):
        """Register ``--search-term``, ``--source`` and the CSV path ``zfile``."""
        parser.add_argument('--search-term', default='Not specified')
        parser.add_argument('--source', default='Zotero')
        parser.add_argument('zfile')

    @transaction.atomic
    def handle(self, *args, **options):
        """Run the import for the CSV file given as ``zfile``.

        Raises:
            CommandError: If the CSV file cannot be read or lacks a required
                column.
        """
        source, _ = Source.objects.get_or_create(name=options['source'])

        search_term: Optional[SearchTerm] = None
        if name := options['search_term']:
            search_term, created = SearchTerm.objects.get_or_create(name=name)
            if created:
                self.log_success(f"Created search term: {search_term}")

        publications: List[Publication] = []
        for row in self._read_rows(options['zfile']):
            publications.append(self._import_row(row))

        # Assign sources
        if search_term is None:
            return
        for publication in publications:
            _, created = PublicationSource.objects.get_or_create(
                source=source,
                publication=publication,
                search_term=search_term,
            )
            if created:
                self.log_success(f"Assigned source '{source}' to publication '{publication}' with search term '{search_term}'")
            else:
                self.log_info(f"Source '{source}' already assigned to publication '{publication}' with search term '{search_term}'")

14
sokman/settings.py

@ -32,9 +32,17 @@ def get_or_generate_key() -> str:
assert path.exists() assert path.exists()
return path.read_text() return path.read_text()
def get_api_key() -> str:
    """Return the raw contents of the ``api.secret`` file next to this module.

    The text is returned unmodified (including any trailing newline), so
    callers strip it themselves.

    Raises:
        FileNotFoundError: If ``api.secret`` does not exist. This is raised
            explicitly rather than via ``assert``, because assertions are
            removed when Python runs with ``-O``.
    """
    path = Path(__file__).parent / 'api.secret'
    if not path.exists():
        raise FileNotFoundError(
            f"API key file not found: {path}. "
            "Create it and put the API key inside."
        )
    return path.read_text()
# SECURITY WARNING: keep the secret key used in production secret! # SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = get_or_generate_key() SECRET_KEY = get_or_generate_key()
SCHOLAR_API_KEY = get_api_key().strip('\n')
# SECURITY WARNING: don't run with debug turned on in production! # SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True DEBUG = True
@ -89,9 +97,13 @@ WSGI_APPLICATION = 'sokman.wsgi.application'
DATABASES = { DATABASES = {
'default': { 'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'jens.sqlite3',
},
'uc_sok': {
'ENGINE': 'django.db.backends.sqlite3', 'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3', 'NAME': BASE_DIR / 'db.sqlite3',
} },
} }

Loading…
Cancel
Save