Basically a fork from https://github.com/blochberger/sokman but with the intention of adding a visual interface as well
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

235 lines
9.0 KiB

from pprint import pprint
from time import sleep
from tqdm import tqdm
from django.db import transaction
from django.core.exceptions import ValidationError
from django.core.management.base import BaseCommand, CommandError, CommandParser
from typing import List, Optional, Tuple
import sok.management.commands.dblpimport as dblp
from sok.management.commands.snowball import semanticscholar
from sok.models import Publication, PublicationReference, SemanticScholar, Author, PublicationAuthor
class Command(BaseCommand):
    """
    Management command that repairs and enriches stored publication data.

    Every run adds references to master variants, searches DBLP for
    publications whose cite key does not carry the DBLP prefix, and looks up
    Semantic Scholar paper IDs. Further steps are enabled through the flags
    declared in ``add_arguments``.
    """

    def log_success(self, msg: str):
        """Write ``msg`` to stdout in the SUCCESS style."""
        self.stdout.write(self.style.SUCCESS(msg))

    def log_warn(self, msg: str):
        """Write ``msg`` to stdout in the WARNING style."""
        self.stdout.write(self.style.WARNING(msg))

    def log_info(self, msg: str, nl: bool = True):
        """
        Write ``msg`` to stdout in the HTTP_INFO style and flush.

        If ``nl`` is false, no trailing newline is written.
        """
        self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '')
        self.stdout.flush()

    @transaction.atomic
    def fix_references(self) -> None:
        """
        Create relevant references to masters of referenced variants.

        If multiple variants of a publication exist, only the master variant is
        considered. However, relevant publications might reference a non-master
        variant, e. g., a preprint.

        This command adds references to the master-variant, even though this
        reference is not actually present in the publication. The reference
        identifier is marked with a star, e. g., '[1]*'.

        Raises:
            CommandError: If a newly built reference fails model validation.
        """
        self.log_info("--- Searching for references to variants ---")
        for publication in Publication.objects.filter(variant_of__isnull=False):
            master = publication.variant_of
            origs = PublicationReference.objects.filter(reference=publication)
            for orig in origs:
                # The citing publication already points at the master variant.
                if PublicationReference.objects.filter(reference=master, publication=orig.publication).exists():
                    continue
                fixed = PublicationReference(
                    reference=master,
                    publication=orig.publication,
                    identifier=('' if orig.identifier is None else orig.identifier) + "*",
                )
                try:
                    fixed.full_clean()
                    fixed.save()
                except ValidationError as e:
                    print(orig.publication, origs, master, fixed.identifier, fixed.reference, fixed.publication)
                    raise CommandError(f"{publication} -- {fixed.identifier} -> {master}: {e}") from e
                self.log_success(f"Added reference: {publication} -- {fixed.identifier} -> {master}")

    @transaction.atomic
    def fix_abstracts(self) -> None:
        """
        Add abstracts to those publications that have one on SemanticScholar.

        If multiple variants of a publication exist, only the master variant is
        considered.

        Raises:
            CommandError: If a publication fails model validation after its
                abstract was set.
        """
        self.log_info("--- Searching for publications without abstracts ---")
        eligible = Publication.objects.filter(
            abstract__isnull=True,
            variant_of__isnull=True,
            semanticscholar__isnull=False,
        )
        # COUNT queries instead of loading every row just to measure it.
        eligible_count = eligible.count()
        missing_count = Publication.objects.filter(abstract__isnull=True).count()
        self.log_info(f"{eligible_count} eligible publications found, {missing_count} without abstract")
        for publication in tqdm(eligible, unit="abstract"):
            for semantic in publication.semanticscholar_set.all():
                data = semanticscholar(semantic.paper_id)
                # .get() so that a response without an 'abstract' key (e.g. an
                # error payload) is skipped instead of raising KeyError.
                if abstract := data.get('abstract'):
                    publication.abstract = abstract
                    if publication.peer_reviewed is None:
                        publication.peer_reviewed = False
                    try:
                        publication.full_clean()
                        publication.save()
                    except ValidationError as e:
                        raise CommandError(f"{publication}: {e}") from e
                    self.log_success(f"Added abstract for: {publication}")
                # NOTE(review): placed once per API request; confirm this
                # matches the intended throttling granularity.
                sleep(2)  # Throttle to avoid rate-limiting

    @transaction.atomic
    def search_on_dblp(self, publication: Publication):
        """
        Search DBLP for ``publication`` by title and import the matching entry.

        A result is only imported when both sides have a DOI and the DOIs are
        equal (ignoring case). Results without a DOI on either side that agree
        on title and year are reported with a warning but not imported.

        Raises:
            CommandError: If validation of the publication or an author fails,
                if DBLP lists fewer authors than are currently assigned, or if
                an existing author assignment cannot be deleted.
        """
        _, results, total = dblp.PublicationResult.from_search(publication.title, 100)
        if total == 0:
            return
        for result in results:
            if not (publication.doi and result.doi):
                if publication.title == result.title and publication.year == result.year:
                    self.log_warn(f"Not quite certain about {result.cite_key} for publication {publication.title}")
                # I'd rather look at that in detail for now
                continue
            if publication.doi.lower() != result.doi.lower():
                continue  # quite definitely not the same publication
            self._apply_dblp_result(publication, result)

    def _apply_dblp_result(self, publication: Publication, result) -> None:
        """Copy metadata and authors of a matching DBLP ``result`` onto ``publication``."""
        publication.cite_key = result.cite_key
        publication.year = result.year
        if result.is_peer_reviewed is not None:
            publication.peer_reviewed = result.is_peer_reviewed
        publication.first_page = result.first_page
        publication.last_page = result.last_page
        print(publication.peer_reviewed, result)
        try:
            publication.full_clean()
            publication.save()
        except ValidationError as e:
            raise CommandError(f"{publication}: {e}") from e
        self.log_success(f"Added DBLP info for: {publication}")

        authors = self._store_dblp_authors(result)
        self._replace_publication_authors(publication, authors)

    def _store_dblp_authors(self, result) -> List[Author]:
        """Get or create an ``Author`` per entry of ``result.authors``, filling in missing DBLP ids."""
        authors: List[Author] = []
        for (pid, name) in result.authors:
            author, created = Author.objects.get_or_create(name=name)
            if created:
                self.log_success(f"Added author: {author}")
            else:
                self.log_info(f"Author '{author}' already known")
            if not author.dblp_id and pid is not None:
                try:
                    author.dblp_id = pid
                    author.full_clean()
                    author.save()
                except ValidationError as e:
                    raise CommandError(f"{author}: {e}") from e
                self.log_success(f"Added pid to author: {author}")
            authors.append(author)
        return authors

    def _replace_publication_authors(self, publication: Publication, authors: List[Author]) -> None:
        """Replace all author assignments of ``publication`` with ``authors``, in list order."""
        # Evaluate the current assignments once and reuse them below.
        existing = list(PublicationAuthor.objects.filter(publication=publication))
        if len(authors) != len(existing):
            print(authors, [pa.author for pa in existing])
        # Explicit check instead of ``assert`` so it also holds under ``python -O``.
        if len(authors) < len(existing):
            raise CommandError(
                f"{publication}: DBLP lists {len(authors)} authors, "
                f"but {len(existing)} are currently assigned"
            )
        for publication_author in existing:
            self.log_warn(f"Will now try to delete {publication_author.author} from {publication}")
            try:
                publication_author.delete()
            except Exception as e:
                raise CommandError(f"{publication} - {publication_author.author}: {e}") from e
        for position, author in enumerate(authors):
            publication_author, created = PublicationAuthor.objects.get_or_create(
                author=author,
                publication=publication,
                position=position,
            )
            if created:
                self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
            else:
                self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")

    def find_secondary_on_dblp(self):
        """
        Search DBLP for every non-excluded publication without a DBLP cite key.

        Afterwards, report how many distinct cite keys still lack the DBLP
        prefix.
        """
        self.log_info("--- Searching for snowballed sources on DBLP ---")
        for publication in tqdm(Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX), unit="publication"):
            if publication.stage == 'excluded':
                continue
            self.search_on_dblp(publication)
            sleep(2)  # Throttle to avoid rate-limiting
        keys = set(
            Publication.objects.exclude(
                cite_key__startswith=dblp.CITE_KEY_PREFIX
            ).values_list('cite_key', flat=True).distinct()
        )
        n = len(keys)
        self.log_info(f"Found {n} publications that still need to be verified")

    def fix_dblp(self):
        """Print the DBLP-prefixed cite keys stored in the database that are missing from the DBLP dump."""
        self.log_info("--- Searching for entries not in the default DBLP dump ---")
        keys_in_db = set(
            Publication.objects.filter(
                cite_key__startswith=dblp.CITE_KEY_PREFIX
            ).values_list('cite_key', flat=True).distinct()
        )
        keys_in_dump = dblp.get_all_cite_keys(dblp.DUMP_PATH)
        self.stdout.write(f"DB: {len(keys_in_db):8d}")
        self.stdout.write(f"DBLP: {len(keys_in_dump):8d}")
        pprint(keys_in_db - keys_in_dump)

    def find_missing_dois(self):
        """Fill in DOIs from the DBLP dump for publications that have none."""
        self.log_info("--- Searching for missing DOIs ---")
        publications = Publication.objects.filter(doi__isnull=True)
        keys = {
            dblp.strip_cite_key_prefix(cite_key)
            for cite_key in publications.values_list('cite_key', flat=True)
        }
        self.log_info("Parsing DBLP dump...")
        results = dblp.PublicationResult.from_dump(dblp.DUMP_PATH, keys)
        self.log_info("done")
        for result in results:
            if doi := result.doi:
                publication = publications.get(cite_key=result.cite_key)
                publication.doi = doi
                publication.full_clean()
                publication.save()
                self.log_success(f"Added DOI '{doi}' to publication: {publication}")

    def find_semanticscholar_ids(self):
        """
        Look up Semantic Scholar paper IDs for publications that have a DOI.

        Publications whose lookup response contains an 'error' key are printed
        and skipped.
        """
        self.log_info("--- Searching for paper IDs on Semantic Scholar ---")
        publications = Publication.objects.filter(
            doi__isnull=False,
            semanticscholar__isnull=True,
        )
        for publication in publications:
            data = semanticscholar(publication.doi)
            if 'error' not in data:
                paper_id = data['paperId']
                obj = SemanticScholar(paper_id=paper_id, publication=publication)
                obj.full_clean()
                obj.save()
                self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
            else:
                print(publication)
            sleep(2)  # Throttle to avoid rate-limiting

    # BaseCommand

    def add_arguments(self, parser: CommandParser):
        """Register the boolean flags that enable the optional repair steps."""
        parser.add_argument('-b', '--abstract', action='store_true')
        parser.add_argument('-d', '--dblp', action='store_true')
        parser.add_argument('-a', '--authors', action='store_true')
        # NOTE(review): --secondary is parsed but never consulted by handle();
        # find_secondary_on_dblp() currently runs on every invocation.
        parser.add_argument('-s', '--secondary', action='store_true')

    def handle(self, *args, **options):
        """
        Run the repair steps in order, honouring the command-line flags.

        Raises:
            CommandError: If ``--authors`` is given but the command has no
                ``fix_authors`` method, or if any step reports a failure.
        """
        self.fix_references()
        if options['abstract']:
            self.fix_abstracts()
        self.find_secondary_on_dblp()
        # NOTE(review): both dump-based steps are assumed to be gated by
        # --dblp; confirm against the intended behaviour.
        if options['dblp']:
            self.fix_dblp()
            self.find_missing_dois()
        self.find_semanticscholar_ids()
        if options['authors']:
            # No fix_authors() is defined on this class; fail with a clear
            # message instead of an AttributeError.
            fix_authors = getattr(self, 'fix_authors', None)
            if fix_authors is None:
                raise CommandError("--authors was requested, but this command does not implement fix_authors()")
            fix_authors()