# Basically a fork of https://github.com/blochberger/sokman, but with the
# intention of adding a visual interface as well.

from pprint import pprint
from time import sleep
from typing import List, Optional

import html
import re

import requests
from tqdm import tqdm

from django.core.exceptions import ValidationError
from django.core.management.base import BaseCommand, CommandError, CommandParser
from django.db import transaction
from django.db.models import Count

import sok.management.commands.dblpimport as dblp
from sok.management.commands.snowball import semanticscholar
from sok.models import Author, Publication, PublicationAuthor, PublicationReference, SemanticScholar
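
# Usage (assuming this file lives in sok/management/commands/, so the Django
# command name is the file name, e.g. 'fix'):
#
#   python manage.py fix              # always fixes references to variants
#   python manage.py fix --secondary  # also search snowballed sources on DBLP
#   python manage.py fix --dblp       # also sync cite keys, DOIs, and Semantic Scholar IDs
#   python manage.py fix --authors    # also resolve DBLP PIDs and merge duplicate authors
#   python manage.py fix --abstract   # also fetch missing abstracts from Semantic Scholar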


class Command(BaseCommand):

    def log_success(self, msg: str):
        self.stdout.write(self.style.SUCCESS(msg))

    def log_warn(self, msg: str):
        self.stdout.write(self.style.WARNING(msg))

    def log_info(self, msg: str, nl: bool = True):
        self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '')
        self.stdout.flush()

    @transaction.atomic
    def fix_references(self) -> None:
        """
        Create relevant references to masters of referenced variants.

        If multiple variants of a publication exist, only the master variant is
        considered. However, relevant publications might reference a non-master
        variant, e.g., a preprint.

        This command adds a reference to the master variant, even though this
        reference is not actually present in the citing publication. The
        reference identifier is marked with a star, e.g., '[1]*'.
        """
self.log_info("--- Searching for references to variants ---")
for publication in Publication.objects.filter(variant_of__isnull=False):
variant = publication.variant_of
origs = PublicationReference.objects.filter(reference=publication)
for orig in origs:
if PublicationReference.objects.filter(reference=variant, publication=orig.publication).exists():
continue
fixed = PublicationReference(
reference=variant,
publication=orig.publication,
identifier=('' if orig.identifier is None else orig.identifier) + "*",
)
try:
fixed.full_clean()
fixed.save()
self.log_success(f"Added reference: {publication} -- {fixed.identifier} -> {variant}")
except ValidationError as e:
print(orig.publication, origs, variant, fixed.identifier, fixed.reference, fixed.publication)
raise CommandError(f"{publication} -- {fixed.identifier} -> {variant}: {e}")
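
    # Note: `semanticscholar` (imported from the snowball command) is assumed
    # to wrap the Semantic Scholar paper API, roughly:
    #
    #     def semanticscholar(identifier: str) -> dict:
    #         response = requests.get(f'https://api.semanticscholar.org/v1/paper/{identifier}')
    #         return response.json()  # 'paperId' and 'abstract' on success, or an 'error' key
    #
    # The exact behaviour is defined in sok/management/commands/snowball.py.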

    @transaction.atomic
    def fix_abstracts(self) -> None:
        """
        Add abstracts to publications that have one on Semantic Scholar.

        If multiple variants of a publication exist, only the master variant is
        considered.
        """
        self.log_info("--- Searching for publications without abstracts ---")
        eligible = Publication.objects.filter(
            abstract__isnull=True,
            variant_of__isnull=True,
            semanticscholar__isnull=False,
        )
        missing = Publication.objects.filter(abstract__isnull=True).count()
        self.log_info(f"{eligible.count()} eligible publications found, {missing} without abstract")
        for publication in tqdm(eligible, unit="abstract"):
            for semantic in publication.semanticscholar_set.all():
                data = semanticscholar(semantic.paper_id)
                if abstract := data.get('abstract'):
                    publication.abstract = abstract
                    if publication.peer_reviewed is None:
                        publication.peer_reviewed = False
                    try:
                        publication.full_clean()
                        publication.save()
                        self.log_success(f"Added abstract for: {publication}")
                    except ValidationError as e:
                        raise CommandError(f"{publication}: {e}")
                sleep(2)  # Throttle to avoid rate-limiting

    @transaction.atomic
    def search_on_dblp(self, publication: Publication):
        _, results, total = dblp.PublicationResult.from_search(publication.title, 100)
        if total == 0:
            return
        for result in results:
            if publication.doi and result.doi:
                if publication.doi.lower() != result.doi.lower():
                    continue  # different DOI, so quite definitely not the same publication
                publication.cite_key = result.cite_key
                publication.year = result.year
                if result.is_peer_reviewed is not None:
                    publication.peer_reviewed = result.is_peer_reviewed
                publication.first_page = result.first_page
                publication.last_page = result.last_page
                try:
                    publication.full_clean()
                    publication.save()
                    self.log_success(f"Added DBLP info for: {publication}")
                except ValidationError as e:
                    raise CommandError(f"{publication}: {e}")

                # Store authors
                authors: List[Author] = []
                for pid, name in result.authors:
                    author, created = Author.objects.get_or_create(name=name)
                    if created:
                        self.log_success(f"Added author: {author}")
                    else:
                        self.log_info(f"Author '{author}' already known")
                    if not author.dblp_id and pid is not None:
                        try:
                            author.dblp_id = pid
                            author.full_clean()
                            author.save()
                            self.log_success(f"Added pid to author: {author}")
                        except ValidationError as e:
                            raise CommandError(f"{author}: {e}")
                    authors.append(author)

                # Replace authors for publication
                assert len(authors) >= PublicationAuthor.objects.filter(publication=publication).count()
                for publication_author in PublicationAuthor.objects.filter(publication=publication):
                    self.log_warn(f"Will now try to delete {publication_author.author} from {publication}")
                    try:
                        publication_author.delete()
                    except Exception as e:
                        raise CommandError(f"{publication} - {publication_author.author}: {e}")
                for position, author in enumerate(authors):
                    publication_author, created = PublicationAuthor.objects.get_or_create(
                        author=author,
                        publication=publication,
                        position=position,
                    )
                    if created:
                        self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
                    else:
                        self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")
            elif publication.title == result.title and publication.year == result.year:
                self.log_warn(f"Not quite certain about {result.cite_key} for publication {publication.title}")
            else:
                continue  # I'd rather look at that in detail for now

    def find_secondary_on_dblp(self):
        self.log_info("--- Searching for snowballed sources on DBLP ---")
        for publication in tqdm(Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX), unit="publication"):
            if publication.stage == 'excluded':
                continue
            self.search_on_dblp(publication)
            sleep(2)  # Throttle to avoid rate-limiting
        keys = set(
            Publication.objects.exclude(
                cite_key__startswith=dblp.CITE_KEY_PREFIX
            ).values_list('cite_key', flat=True).distinct()
        )
        self.log_info(f"Found {len(keys)} publications that still need to be verified")

    def fix_dblp(self):
        self.log_info("--- Searching for entries not in the default DBLP dump ---")
        keys_in_db = set(
            Publication.objects.filter(
                cite_key__startswith=dblp.CITE_KEY_PREFIX
            ).values_list('cite_key', flat=True).distinct()
        )
        keys_in_dump = dblp.get_all_cite_keys(dblp.DUMP_PATH)
        self.stdout.write(f"DB:   {len(keys_in_db):8d}")
        self.stdout.write(f"DBLP: {len(keys_in_dump):8d}")
        pprint(keys_in_db - keys_in_dump)
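
    # The keys printed above are present in the database but missing from the
    # local DBLP dump; presumably the dump is outdated and needs to be
    # refreshed before those entries can be resolved.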

    def find_missing_dois(self):
        self.log_info("--- Searching for missing DOIs ---")
        publications = Publication.objects.filter(doi__isnull=True)
        keys = {
            dblp.strip_cite_key_prefix(cite_key)
            for cite_key in publications.values_list('cite_key', flat=True)
        }
        self.log_info("Parsing DBLP dump...")
        results = dblp.PublicationResult.from_dump(dblp.DUMP_PATH, keys)
        self.log_info("done")
        for result in results:
            if doi := result.doi:
                publication = publications.get(cite_key=result.cite_key)
                publication.doi = doi
                publication.full_clean()
                publication.save()
                self.log_success(f"Added DOI '{doi}' to publication: {publication}")

    def find_semanticscholar_ids(self):
        self.log_info("--- Searching for paper IDs on Semantic Scholar ---")
        publications = Publication.objects.filter(
            doi__isnull=False,
            semanticscholar__isnull=True,
        )
        for publication in publications:
            data = semanticscholar(publication.doi)
            if 'error' not in data:
                paper_id = data['paperId']
                obj = SemanticScholar(paper_id=paper_id, publication=publication)
                obj.full_clean()
                obj.save()
                self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
            else:
                self.log_warn(f"No Semantic Scholar entry found for: {publication}")
            sleep(2)  # Throttle to avoid rate-limiting

    def find_author_on_dblp(self, name: str, limit: int = 100) -> Optional[List[dict]]:
        url = 'https://dblp.org/search/author/api'
        response = requests.get(
            url,
            params={
                'q': name,
                'f': 0,
                'h': limit,
                'c': 0,
                'format': 'json',
            },
        )
        response.raise_for_status()
        search_result = response.json()['result']
        hits = search_result['hits']
        if hits['@total'] == '0':
            return None
        return [hit['info'] for hit in hits['hit']]
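
    # Each element returned above is the 'info' object of a DBLP search hit.
    # The fields relied on below ('author' and a 'url' of the form
    # 'https://dblp.org/pid/<pid>') are assumptions based on the DBLP author
    # search API, e.g.:
    #
    #   [{'author': 'Jane Doe', 'url': 'https://dblp.org/pid/12/3456', ...}]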

    def verify_author(self, pid: str, keys: set) -> bool:
        url = f"https://dblp.org/pid/{pid}.xml"
        response = requests.get(url)
        response.raise_for_status()
        pubs = re.findall('key="([^"]*)"', response.text)
        sleep(1)  # Throttle to avoid rate-limiting
        return keys.issubset(pubs)
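
    # A candidate pid is accepted only if every DBLP cite key already stored
    # for the author locally also occurs in that pid's publication record,
    # i.e. the locally known keys must be a subset of the keys in the XML.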

    def find_pid_for_authors(self):
        self.log_info("--- Adding IDs to authors ---")
        authors = Author.objects.filter(dblp_id__exact="")
        for author in tqdm(authors, unit='author'):
            keys = {
                dblp.strip_cite_key_prefix(cite_key)
                for cite_key in PublicationAuthor.objects.filter(
                    author__exact=author,
                    publication__cite_key__startswith=dblp.CITE_KEY_PREFIX,
                ).values_list('publication__cite_key', flat=True).distinct()
            }
            if len(keys) == 0:
                continue
            options = self.find_author_on_dblp(author.name)
            if not options:
                self.log_warn(f"Could not find ID for author {author}")
            elif len(options) == 1:
                pid = re.match('https://dblp.org/pid/(.*)', options[0]['url']).group(1)
                if self.verify_author(pid, keys):
                    author.dblp_id = pid
                    author.full_clean()
                    author.save()
                    self.log_success(f"Added pid {pid} to author {author}")
            else:
                for opt in options:
                    if 'url' not in opt:
                        continue
                    pid = re.match('https://dblp.org/pid/(.*)', opt['url']).group(1)
                    if self.verify_author(pid, keys):
                        author.dblp_id = pid
                        author.full_clean()
                        author.save()
                        self.log_success(f"Added pid {pid} to author {author}")
                        break

    @transaction.atomic
    def merge(self, author: Author, variant: Author) -> None:
        for pub_author in PublicationAuthor.objects.filter(author=variant):
            publication = pub_author.publication
            if PublicationAuthor.objects.filter(author=author, publication=publication).exists():
                self.log_warn(f"{author} is already an author of {publication}")
                # Drop the duplicate assignment so the variant can be deleted below.
                pub_author.delete()
                continue
            fixed = PublicationAuthor(
                author=author,
                publication=publication,
                position=pub_author.position,
            )
            try:
                pub_author.delete()
                fixed.full_clean()
                fixed.save()
                self.log_success(f"Changed author: {publication}: {variant} -> {fixed.author}")
            except ValidationError as e:
                raise CommandError(f"{publication}: {variant} -> {fixed.author}: {e}")
        assert not PublicationAuthor.objects.filter(author=variant).exists()
        variant.delete()
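
    # Example: merge(orig, variant) re-points every PublicationAuthor row from
    # `variant` to `orig` (keeping the position) and then deletes `variant`,
    # so only the canonical author record remains.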

    def merge_authors(self) -> None:
        self.log_info("--- Searching for potential duplicate authors and merging their publications ---")
        dup_ids = Author.objects.exclude(dblp_id__exact="").values_list('dblp_id').annotate(count=Count('id')).filter(count__gt=1)
        try:
            for pid, _count in tqdm(dup_ids, unit="duplicate"):
                response = requests.get(f"https://dblp.org/pid/{pid}.xml")
                response.raise_for_status()
                match = re.search('dblpperson name="([^"]*)"', response.text)
                assert match is not None
                dblp_name = html.unescape(match.group(1))
                variants = Author.objects.filter(dblp_id__exact=pid).exclude(name__exact=dblp_name)
                orig = Author.objects.filter(dblp_id__exact=pid, name__exact=dblp_name)
                assert len(orig) == 1
                self.log_info(f"Suggestion to merge {', '.join(v.name for v in variants)} into {orig[0]}")
                while True:
                    self.log_warn("Merge? [Y/n]")
                    choice = input().lower()
                    if choice in {'', 'y', 'yes'}:
                        for variant in variants:
                            self.merge(orig[0], variant)
                        break
                    elif choice in {'n', 'no'}:
                        break
        except KeyboardInterrupt:
            raise CommandError("Aborted.")

    # BaseCommand

    def add_arguments(self, parser: CommandParser):
        parser.add_argument('-b', '--abstract', action='store_true')
        parser.add_argument('-d', '--dblp', action='store_true')
        parser.add_argument('-a', '--authors', action='store_true')
        parser.add_argument('-s', '--secondary', action='store_true')

    def handle(self, *args, **options):
        self.fix_references()
        if options['secondary']:
            self.find_secondary_on_dblp()
        if options['dblp']:
            self.fix_dblp()
            self.find_missing_dois()
            self.find_semanticscholar_ids()
        if options['authors']:
            self.find_pid_for_authors()
            self.merge_authors()
        if options['abstract']:
            self.fix_abstracts()