Basically a fork of https://github.com/blochberger/sokman, but with the intention of adding a visual interface as well.
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
367 lines
13 KiB
367 lines
13 KiB
from pprint import pprint
|
|
from time import sleep
|
|
from tqdm import tqdm
|
|
|
|
import re, requests, html
|
|
|
|
from django.db import transaction
|
|
from django.core.exceptions import ValidationError
|
|
from django.core.management.base import BaseCommand, CommandError, CommandParser
|
|
from django.db.models import Count
|
|
|
|
from typing import List, Optional, Tuple
|
|
|
|
import sok.management.commands.dblpimport as dblp
|
|
|
|
from sok.management.commands.snowball import semanticscholar
|
|
from sok.models import Publication, PublicationReference, SemanticScholar, Author, PublicationAuthor
|
|
|
|
|
|
class Command(BaseCommand):
|
|
|
|
def log_success(self, msg: str):
	"""Write *msg* to stdout in the success (green) style."""
	styled = self.style.SUCCESS(msg)
	self.stdout.write(styled)
|
|
|
|
def log_warn(self, msg: str):
	"""Write *msg* to stdout in the warning (yellow) style."""
	styled = self.style.WARNING(msg)
	self.stdout.write(styled)
|
|
|
|
def log_info(self, msg: str, nl: bool = True):
	"""Write *msg* in HTTP_INFO style; omit the trailing newline when *nl* is False."""
	ending = '\n' if nl else ''
	self.stdout.write(self.style.HTTP_INFO(msg), ending=ending)
	# Flush immediately so progress messages appear even mid-line.
	self.stdout.flush()
|
|
|
|
@transaction.atomic
def fix_references(self) -> None:
	"""
	Create relevant references to masters of referenced variants.

	If multiple variants of a publication exist, only the master variant is
	considered. However, relevant publications might reference a non-master
	variant, e. g., a preprint.

	This command adds references to the master-variant, even though this
	reference is not actually present in the publication. The reference
	identifier is marked with a star, e. g., '[1]*'.
	"""
	self.log_info("--- Searching for references to variants ---")
	for variant_pub in Publication.objects.filter(variant_of__isnull=False):
		master = variant_pub.variant_of
		references = PublicationReference.objects.filter(reference=variant_pub)
		for reference in references:
			# Skip citations that already point at the master variant.
			already_present = PublicationReference.objects.filter(
				reference=master,
				publication=reference.publication,
			).exists()
			if already_present:
				continue
			# Star the identifier to mark the reference as synthesized.
			identifier = ('' if reference.identifier is None else reference.identifier) + "*"
			replacement = PublicationReference(
				reference=master,
				publication=reference.publication,
				identifier=identifier,
			)
			try:
				replacement.full_clean()
				replacement.save()
				self.log_success(f"Added reference: {variant_pub} -- {replacement.identifier} -> {master}")
			except ValidationError as e:
				print(reference.publication, references, master, replacement.identifier, replacement.reference, replacement.publication)
				raise CommandError(f"{variant_pub} -- {replacement.identifier} -> {master}: {e}")
|
|
|
|
@transaction.atomic
def fix_abstracts(self) -> None:
	"""
	Add abstracts to those publications that have one on SemanticScholar.

	If multiple variants of a publication exist, only the master variant is
	considered.

	Raises CommandError when the updated publication fails validation.
	"""
	self.log_info("--- Searching for publications without abstracts ---")
	eligible = Publication.objects.filter(
		abstract__isnull=True,
		variant_of__isnull=True,
		semanticscholar__isnull=False,
	)
	# .count() lets the database count rows instead of materializing the
	# whole queryset (the original used len(...) on both querysets).
	total_missing = Publication.objects.filter(abstract__isnull=True).count()
	self.log_info(f"{eligible.count()} eligible publications found, {total_missing} without abstract")
	for publication in tqdm(eligible, unit="abstract"):
		for semantic in publication.semanticscholar_set.all():
			data = semanticscholar(semantic.paper_id)
			if abstract := data['abstract']:
				publication.abstract = abstract
				# Unset peer-review status defaults to False once we touch
				# the record ("is None" instead of "== None").
				if publication.peer_reviewed is None:
					publication.peer_reviewed = False
				try:
					publication.full_clean()
					publication.save()
					self.log_success(f"Added abstract for: {publication}")
				except ValidationError as e:
					raise CommandError(f"{publication}: {e}")
			sleep(2)  # Throttle to avoid rate-limiting
|
|
|
|
@transaction.atomic
def search_on_dblp(self, publication: Publication):
	"""
	Search DBLP for *publication* and import its metadata on a certain match.

	A match is "certain" when both DOIs exist and compare equal
	(case-insensitively). In that case the cite key, year, page range and
	peer-review status are copied, and the publication's author assignments
	are replaced by the DBLP author list. A title+year match is only
	logged for manual review.

	Raises CommandError when a model fails validation or an author
	assignment cannot be deleted.
	"""
	query, results, total = dblp.PublicationResult.from_search(publication.title, 100)
	if total == 0:
		return
	for result in results:
		if publication.doi and result.doi:
			if publication.doi.lower() == result.doi.lower():
				publication.cite_key = result.cite_key
				publication.year = result.year
				# Only overwrite when DBLP actually knows the status.
				if result.is_peer_reviewed is not None:
					publication.peer_reviewed = result.is_peer_reviewed
				publication.first_page = result.first_page
				publication.last_page = result.last_page
				print(publication.peer_reviewed, result)
				try:
					publication.full_clean()
					publication.save()
					self.log_success(f"Added DBLP info for: {publication}")
				except ValidationError as e:
					raise CommandError(f"{publication}: {e}")

				# Store Authors
				authors: List[Author] = []
				for (pid, name) in result.authors:
					author, created = Author.objects.get_or_create(name=name)
					if created:
						self.log_success(f"Added author: {author}")
					else:
						self.log_info(f"Author '{author}' already known")
					if not author.dblp_id and pid is not None:
						try:
							author.dblp_id = pid
							author.full_clean()
							author.save()
							self.log_success(f"Added pid to author: {author}")
						except ValidationError as e:
							raise CommandError(f"{author}: {e}")
					authors.append(author)

				# Replace authors for publication
				existing = PublicationAuthor.objects.filter(publication=publication)
				if len(authors) != existing.count():
					print(authors, [pa.author for pa in existing])
				assert len(authors) >= existing.count()
				for publication_author in existing:
					self.log_warn(f"Will now try to delete {publication_author.author} from {publication}")
					try:
						publication_author.delete()
					# BUG FIX: was `except e:` which itself raises NameError;
					# also report the author actually being deleted rather
					# than the stale `author` loop variable.
					except Exception as e:
						raise CommandError(f"{publication} - {publication_author.author}: {e}")

				for position, author in enumerate(authors):
					publication_author, created = PublicationAuthor.objects.get_or_create(
						author=author,
						publication=publication,
						position=position,
					)
					if created:
						self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
					else:
						self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")
			else:
				continue  # quite definitely not the same publication
		elif publication.title == result.title and publication.year == result.year:
			self.log_warn(f"Not quite certain about {result.cite_key} for publication {publication.title}")
		else:
			continue  # I'd rather look at that in detail for now
|
|
|
|
def find_secondary_on_dblp(self):
	"""Try to locate snowballed (non-DBLP) publications on DBLP."""
	self.log_info("--- Searching for snowballed sources on DBLP ---")
	candidates = Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX)
	for publication in tqdm(candidates, unit="publication"):
		if publication.stage == 'excluded':
			continue
		self.search_on_dblp(publication)
		sleep(2)  # Throttle to avoid rate-limiting
	# Whatever still lacks a DBLP cite key needs manual verification.
	remaining = set(
		Publication.objects.exclude(
			cite_key__startswith=dblp.CITE_KEY_PREFIX
		).values_list('cite_key', flat=True).distinct()
	)
	self.log_info(f"Found {len(remaining)} publications that still need to be verified")
|
|
|
|
def fix_dblp(self):
	"""Report DB cite keys that are absent from the local DBLP dump."""
	self.log_info("--- Searching for entries not in the default DBLP dump ---")
	db_keys = set(
		Publication.objects.filter(
			cite_key__startswith=dblp.CITE_KEY_PREFIX
		).values_list('cite_key', flat=True).distinct()
	)
	dump_keys = dblp.get_all_cite_keys(dblp.DUMP_PATH)

	self.stdout.write(f"DB: {len(db_keys):8d}")
	self.stdout.write(f"DBLP: {len(dump_keys):8d}")
	# Keys present in the DB but missing from the dump.
	pprint(db_keys - dump_keys)
|
|
|
|
def find_missing_dois(self):
	"""Fill in missing DOIs by looking publications up in the DBLP dump."""
	self.log_info("--- Searching for missing DOIs ---")
	publications = Publication.objects.filter(doi__isnull=True)
	cite_keys = {
		dblp.strip_cite_key_prefix(cite_key)
		for cite_key in publications.values_list('cite_key', flat=True)
	}
	self.log_info("Parsing DBLP dump...")
	results = dblp.PublicationResult.from_dump(dblp.DUMP_PATH, cite_keys)
	self.log_info("done")

	for result in results:
		doi = result.doi
		if not doi:
			continue
		publication = publications.get(cite_key=result.cite_key)
		publication.doi = doi
		publication.full_clean()
		publication.save()
		self.log_success(f"Added DOI '{doi}' to publication: {publication}")
|
|
|
|
def find_semanticscholar_ids(self):
	"""Resolve Semantic Scholar paper IDs for publications that have a DOI."""
	self.log_info("--- Searching for paper IDs on Semantic Scholar ---")
	pending = Publication.objects.filter(
		doi__isnull=False,
		semanticscholar__isnull=True,
	)
	for publication in pending:
		data = semanticscholar(publication.doi)
		if 'error' in data:
			print(publication)
		else:
			paper_id = data['paperId']
			record = SemanticScholar(paper_id=paper_id, publication=publication)
			record.full_clean()
			record.save()
			self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")

		sleep(2)  # Throttle to avoid rate-limiting
|
|
|
|
def find_author_on_dblp(
	self,
	name: str,
	limit: int = 100,
) -> Optional[List[dict]]:
	"""
	Query the DBLP author-search API for *name*.

	Returns the list of 'info' dicts (one per hit), or None when there are
	no hits. (Annotation fixed: the previous List[Tuple[str, str]] did not
	match the actual return value.)

	Raises requests.HTTPError on a non-2xx response.
	"""
	url = 'http://dblp.org/search/author/api'
	response = requests.get(
		url,
		params={
			'q': name,
			'f': 0,            # offset of the first hit
			'h': limit,        # maximum number of hits
			'c': 0,            # no completion terms
			'format': 'json',
		},
	)
	response.raise_for_status()
	search_result = response.json()['result']
	hits = search_result['hits']
	# DBLP reports the total as a string.
	if hits['@total'] == '0':
		return None
	return [hit['info'] for hit in hits['hit']]
|
|
|
|
def verify_author(self, pid, keys):
	"""Return True iff every cite key in *keys* occurs in the DBLP record for *pid*."""
	response = requests.get(f"https://dblp.org/pid/{pid}.xml")
	response.raise_for_status()
	published_keys = re.findall('key="([^"]*)"', str(response.content))
	sleep(1)  # Throttle to avoid rate-limiting
	return keys.issubset(published_keys)
|
|
|
|
def _try_assign_pid(self, author, option: dict, keys) -> bool:
	"""Extract the PID from one DBLP search hit and assign it to *author* if verified.

	Returns True when the PID was verified and saved, False otherwise.
	"""
	# Guard against hits without a 'url' (the old single-option branch
	# would raise KeyError/AttributeError here).
	url = option.get('url')
	if not url:
		return False
	match = re.match('https://dblp.org/pid/(.*)', url)
	if match is None:
		return False
	pid = match.groups()[0]
	if not self.verify_author(pid, keys):
		return False
	self.log_success(f"Found {pid} for author {author}")
	author.dblp_id = pid
	author.full_clean()
	author.save()
	self.log_success(f"Added pid for author {author}")
	return True

def find_pid_for_authors(self):
	"""Find and verify DBLP PIDs for authors that do not have one yet."""
	self.log_info("--- Adding ID's to authors ---")
	authors = Author.objects.filter(dblp_id__exact="")
	for author in tqdm(authors, unit='author'):
		# Cite keys of this author's DBLP publications, used to verify
		# that a candidate PID really belongs to this author.
		keys = {
			dblp.strip_cite_key_prefix(cite_key)
			for cite_key in PublicationAuthor.objects.filter(
				author__exact=author,
				publication__cite_key__startswith=dblp.CITE_KEY_PREFIX,
			).values_list('publication__cite_key', flat=True).distinct()
		}
		if len(keys) == 0:
			continue
		options = self.find_author_on_dblp(author.name)
		if not options:
			self.log_warn(f"Could not find ID for author {author}")
			continue
		# Try each candidate until one verifies (deduplicates the former
		# separate single-option and multi-option branches).
		for option in options:
			if self._try_assign_pid(author, option, keys):
				break
|
|
|
|
@transaction.atomic
def merge(self, author, variant):
	"""
	Move every publication assignment from *variant* to *author*, then
	delete the now-orphaned *variant* author.

	Raises CommandError when a re-created assignment fails validation.
	"""
	for pub_author in PublicationAuthor.objects.filter(author=variant):
		publication = pub_author.publication
		if PublicationAuthor.objects.filter(author=author, publication=publication).exists():
			self.log_warn(f"{author} is already author of {publication}")
			# BUG FIX: this used to `break`, abandoning the remaining
			# variant assignments and tripping the assertion below. The
			# canonical author is already assigned, so the duplicate
			# assignment can simply be dropped.
			pub_author.delete()
			continue
		fixed = PublicationAuthor(
			author=author,
			publication=publication,
			position=pub_author.position,
		)
		try:
			# Delete first so the (publication, position) slot is free.
			pub_author.delete()
			fixed.full_clean()
			fixed.save()
			self.log_success(f"Changed author: {publication}: {variant} -> {fixed.author}")
		except ValidationError as e:
			raise CommandError(f"{publication}: {variant} -> {fixed.author}: {e}")
	assert not PublicationAuthor.objects.filter(author=variant).exists()
	variant.delete()
|
|
|
|
|
|
|
|
def merge_authors(self) -> None:
	"""
	Find authors that share a DBLP PID and interactively merge the name
	variants into the author whose name matches the DBLP record.

	Raises CommandError when the user aborts with Ctrl-C.
	"""
	self.log_info("--- Searching for potential duplicate authors and merging their publications ---")

	# PIDs assigned to more than one Author row.
	dup_ids = Author.objects.exclude(dblp_id__exact="").values_list('dblp_id').annotate(count=Count('id')).filter(count__gt=1)

	try:
		for (pid, counter) in tqdm(dup_ids, unit="duplicate"):

			response = requests.get(f"https://dblp.org/pid/{pid}.xml")
			response.raise_for_status()
			# The canonical spelling is the name on the DBLP person record.
			dblp_person = re.search('dblpperson name="([^"]*)"', response.content.decode('ascii')).groups()
			assert len(dblp_person) == 1
			dblp_name = html.unescape(dblp_person[0])

			variants = Author.objects.filter(dblp_id__exact=pid).exclude(name__exact=dblp_name)
			orig = Author.objects.filter(dblp_id__exact=pid, name__exact=dblp_name)
			print(variants, dblp_name, orig)
			assert len(orig) == 1

			self.log_info(f"Suggestion to merge {', '.join([v.name for v in variants])} into {orig[0]}")

			while True:
				self.log_warn("Merge? [Y/n]")
				choice = input().lower()
				if choice in {'', 'y', 'yes'}:
					for variant in variants:
						self.merge(orig[0], variant)
					break
				# BUG FIX: '' was also listed here, but the affirmative
				# branch above always matched it first — unreachable.
				elif choice in {'n', 'no'}:
					break
	except KeyboardInterrupt:
		raise CommandError("Aborted.")
|
|
|
|
# BaseCommand
|
|
def add_arguments(self, parser: CommandParser):
	"""Register the optional feature flags of this command."""
	flags = (
		('-b', '--abstract'),
		('-d', '--dblp'),
		('-a', '--authors'),
		('-s', '--secondary'),
	)
	for short_opt, long_opt in flags:
		parser.add_argument(short_opt, long_opt, action='store_true')
|
|
|
|
def handle(self, *args, **options):
	"""Entry point: always fix references, then run the requested steps."""
	self.fix_references()
	if options['secondary']:
		self.find_secondary_on_dblp()
	if options['dblp']:
		self.fix_dblp()
		self.find_missing_dois()
		self.find_semanticscholar_ids()
	if options['authors']:
		self.find_pid_for_authors()
		self.merge_authors()
	if options['abstract']:
		self.fix_abstracts()
|
|
|
|
|