A fork of https://github.com/blochberger/sokman, with the intention of also adding a visual interface.
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
235 lines
9.0 KiB
235 lines
9.0 KiB
from pprint import pprint
|
|
from time import sleep
|
|
from tqdm import tqdm
|
|
|
|
from django.db import transaction
|
|
from django.core.exceptions import ValidationError
|
|
from django.core.management.base import BaseCommand, CommandError, CommandParser
|
|
|
|
from typing import List, Optional, Tuple
|
|
|
|
import sok.management.commands.dblpimport as dblp
|
|
|
|
from sok.management.commands.snowball import semanticscholar
|
|
from sok.models import Publication, PublicationReference, SemanticScholar, Author, PublicationAuthor
|
|
|
|
|
|
class Command(BaseCommand):
|
|
|
|
def log_success(self, msg: str):
    """Write *msg* to stdout styled as a success message."""
    styled = self.style.SUCCESS(msg)
    self.stdout.write(styled)
|
def log_warn(self, msg: str):
    """Write *msg* to stdout styled as a warning."""
    styled = self.style.WARNING(msg)
    self.stdout.write(styled)
|
def log_info(self, msg: str, nl: bool = True):
    """Write *msg* in HTTP_INFO style and flush; newline unless ``nl`` is False."""
    ending = '\n' if nl else ''
    self.stdout.write(self.style.HTTP_INFO(msg), ending=ending)
    self.stdout.flush()
|
@transaction.atomic
def fix_references(self) -> None:
    """
    Create relevant references to masters of referenced variants.

    If multiple variants of a publication exist, only the master variant is
    considered. However, relevant publications might reference a non-master
    variant, e. g., a preprint.

    This command adds references to the master-variant, even though this
    reference is not actually present in the publication. The reference
    identifier is marked with a star, e. g., '[1]*'.

    Raises:
        CommandError: if the synthesized reference fails model validation.
    """
    self.log_info("--- Searching for references to variants ---")
    for publication in Publication.objects.filter(variant_of__isnull=False):
        variant = publication.variant_of
        origs = PublicationReference.objects.filter(reference=publication)
        for orig in origs:
            # Skip if the master variant is already referenced directly.
            if PublicationReference.objects.filter(reference=variant, publication=orig.publication).exists():
                continue
            fixed = PublicationReference(
                reference=variant,
                publication=orig.publication,
                # Trailing '*' marks a reference that was inferred rather than
                # actually present in the citing publication.
                identifier=('' if orig.identifier is None else orig.identifier) + "*",
            )
            try:
                fixed.full_clean()
                fixed.save()
                self.log_success(f"Added reference: {publication} -- {fixed.identifier} -> {variant}")
            except ValidationError as e:
                # Fold the diagnostic context (previously a stray print()) into
                # the raised error so it reaches the user via stderr.
                raise CommandError(
                    f"{publication} -- {fixed.identifier} -> {variant}: {e} "
                    f"(citing publication: {fixed.publication}, existing references: {list(origs)})"
                )
|
@transaction.atomic
def fix_abstracts(self) -> None:
    """
    Add abstracts to those publications that have one on SemanticScholar.

    If multiple variants of a publication exist, only the master variant is
    considered.

    Raises:
        CommandError: if the updated publication fails model validation.
    """
    self.log_info("--- Searching for publications without abstracts ---")
    # Build the filter once; use .count() so the database does the counting
    # instead of len() materializing the whole queryset.
    eligible = Publication.objects.filter(
        abstract__isnull=True,
        variant_of__isnull=True,
        semanticscholar__isnull=False,
    )
    missing = Publication.objects.filter(abstract__isnull=True).count()
    self.log_info(f"{eligible.count()} eligible publications found, {missing} without abstract")
    for publication in tqdm(eligible, unit="abstract"):
        for semantic in publication.semanticscholar_set.all():
            data = semanticscholar(semantic.paper_id)
            if abstract := data['abstract']:
                publication.abstract = abstract
                # NOTE(review): treats an unknown review status as
                # not-peer-reviewed once an abstract is found — mirrors the
                # original behavior; confirm this is intended.
                if publication.peer_reviewed is None:
                    publication.peer_reviewed = False
                try:
                    publication.full_clean()
                    publication.save()
                    self.log_success(f"Added abstract for: {publication}")
                except ValidationError as e:
                    raise CommandError(f"{publication}: {e}")
            sleep(2)  # Throttle to avoid rate-limiting
|
@transaction.atomic
def search_on_dblp(self, publication: Publication):
    """
    Search DBLP for *publication* and import its metadata on a certain match.

    A match is only considered certain when both DOIs are present and equal
    (case-insensitively); then the cite key, year, pages, peer-review status,
    and the author list (in DBLP order) are stored. A title+year match is
    merely logged for manual review.

    Raises:
        CommandError: if validation of any updated model fails, or if an
            existing publication-author assignment cannot be deleted.
    """
    query, results, total = dblp.PublicationResult.from_search(publication.title, 100)
    if total == 0:
        return
    for result in results:
        if publication.doi and result.doi:
            if publication.doi.lower() == result.doi.lower():
                publication.cite_key = result.cite_key
                publication.year = result.year
                if result.is_peer_reviewed is not None:
                    publication.peer_reviewed = result.is_peer_reviewed
                publication.first_page = result.first_page
                publication.last_page = result.last_page
                try:
                    publication.full_clean()
                    publication.save()
                    self.log_success(f"Added DBLP info for: {publication}")
                except ValidationError as e:
                    raise CommandError(f"{publication}: {e}")

                # Store Authors
                authors: List[Author] = []
                for (pid, name) in result.authors:
                    author, created = Author.objects.get_or_create(name=name)
                    if created:
                        self.log_success(f"Added author: {author}")
                    else:
                        self.log_info(f"Author '{author}' already known")
                    if not author.dblp_id and pid is not None:
                        try:
                            author.dblp_id = pid
                            author.full_clean()
                            author.save()
                            self.log_success(f"Added pid to author: {author}")
                        except ValidationError as e:
                            raise CommandError(f"{author}: {e}")
                    authors.append(author)

                # Replace authors for publication
                existing = PublicationAuthor.objects.filter(publication=publication)
                if len(authors) != existing.count():
                    self.log_warn(
                        f"Author count mismatch for {publication}: "
                        f"DBLP {authors} vs DB {[pa.author for pa in existing]}"
                    )
                # NOTE(review): assert is stripped under `python -O`; kept for
                # parity with the original guarantee that the author list
                # never shrinks — consider raising CommandError instead.
                assert len(authors) >= existing.count()
                for publication_author in existing:
                    self.log_warn(f"Will now try to delete {publication_author.author} from {publication}")
                    try:
                        publication_author.delete()
                    # Bug fix: original read `except e:` which raises NameError
                    # at failure time; also report the author actually being
                    # deleted, not the stale loop variable.
                    except Exception as e:
                        raise CommandError(f"{publication} - {publication_author.author}: {e}")

                for position, author in enumerate(authors):
                    publication_author, created = PublicationAuthor.objects.get_or_create(
                        author=author,
                        publication=publication,
                        position=position,
                    )
                    if created:
                        self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
                    else:
                        self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")
            else:
                continue  # quite definitely not the same publication
        elif publication.title == result.title and publication.year == result.year:
            self.log_warn(f"Not quite certain about {result.cite_key} for publication {publication.title}")
        else:
            continue  # I'd rather look at that in detail for now
|
def find_secondary_on_dblp(self):
    """Search DBLP for snowballed publications not yet imported from DBLP."""
    self.log_info("--- Searching for snowballed sources on DBLP ---")
    candidates = Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX)
    for publication in tqdm(candidates, unit="publication"):
        if publication.stage == 'excluded':
            continue
        self.search_on_dblp(publication)
        sleep(2)  # Throttle to avoid rate-limiting
    # Re-query: search_on_dblp may have rewritten cite keys above.
    keys = set(
        Publication.objects.exclude(
            cite_key__startswith=dblp.CITE_KEY_PREFIX
        ).values_list('cite_key', flat=True).distinct()
    )
    n = len(keys)
    self.log_info(f"Found {n} publications that still need to be verified")
|
def fix_dblp(self):
    """Report cite keys present in the DB but absent from the local DBLP dump."""
    self.log_info("--- Searching for entries not in the default DBLP dump ---")
    keys_in_db = set(
        Publication.objects.filter(cite_key__startswith=dblp.CITE_KEY_PREFIX)
        .values_list('cite_key', flat=True)
        .distinct()
    )
    keys_in_dump = dblp.get_all_cite_keys(dblp.DUMP_PATH)

    self.stdout.write(f"DB: {len(keys_in_db):8d}")
    self.stdout.write(f"DBLP: {len(keys_in_dump):8d}")
    # Keys the DB claims are DBLP entries but the dump does not contain.
    pprint(keys_in_db - keys_in_dump)
|
def find_missing_dois(self):
    """Fill in missing DOIs from the local DBLP dump where available."""
    self.log_info("--- Searching for missing DOIs ---")
    publications = Publication.objects.filter(doi__isnull=True)
    keys = {
        dblp.strip_cite_key_prefix(cite_key)
        for cite_key in publications.values_list('cite_key', flat=True)
    }
    self.log_info("Parsing DBLP dump...")
    results = dblp.PublicationResult.from_dump(dblp.DUMP_PATH, keys)
    self.log_info("done")

    for result in results:
        doi = result.doi
        if not doi:
            continue
        publication = publications.get(cite_key=result.cite_key)
        publication.doi = doi
        publication.full_clean()
        publication.save()
        self.log_success(f"Added DOI '{doi}' to publication: {publication}")
|
def find_semanticscholar_ids(self):
    """
    Look up Semantic Scholar paper IDs for publications that have a DOI but
    no SemanticScholar record yet, and store each ID found.
    """
    self.log_info("--- Searching for paper IDs on Semantic Scholar ---")
    publications = Publication.objects.filter(
        doi__isnull=False,
        semanticscholar__isnull=True,
    )
    for publication in publications:
        data = semanticscholar(publication.doi)
        if 'error' not in data:
            paper_id = data['paperId']
            obj = SemanticScholar(paper_id=paper_id, publication=publication)
            obj.full_clean()
            obj.save()
            self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
        else:
            # Was a bare debug `print(publication)` — surface the failure
            # through the command's normal logging channel instead.
            self.log_warn(f"Semantic Scholar lookup failed for: {publication}")

        sleep(2)  # Throttle to avoid rate-limiting
|
# BaseCommand
|
|
def add_arguments(self, parser: CommandParser):
    """Register the command's optional feature flags (all boolean)."""
    flags = (
        ('-b', '--abstract'),
        ('-d', '--dblp'),
        ('-a', '--authors'),
        ('-s', '--secondary'),
    )
    for short_opt, long_opt in flags:
        parser.add_argument(short_opt, long_opt, action='store_true')
|
def handle(self, *args, **options):
    """
    Entry point: always fixes variant references; the remaining maintenance
    steps are enabled by their respective command-line flags.
    """
    self.fix_references()
    if options['abstract']:
        self.fix_abstracts()
    # Bug fix: the '-s/--secondary' flag declared in add_arguments was parsed
    # but never consulted — the (slow, rate-limited) DBLP search previously
    # ran unconditionally.
    if options['secondary']:
        self.find_secondary_on_dblp()
    if options['dblp']:
        self.fix_dblp()
        self.find_missing_dois()
        self.find_semanticscholar_ids()
    if options['authors']:
        # NOTE(review): fix_authors is not defined anywhere in this file —
        # confirm it exists on a base class or elsewhere; otherwise passing
        # --authors raises AttributeError.
        self.fix_authors()
|
|
|
|