|
|
@@ -2,9 +2,12 @@ from pprint import pprint
|
|
|
from time import sleep |
|
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
import re, requests, html |
|
|
|
|
|
|
|
from django.db import transaction |
|
|
|
from django.core.exceptions import ValidationError |
|
|
|
from django.core.management.base import BaseCommand, CommandError, CommandParser |
|
|
|
from django.db.models import Count |
|
|
|
|
|
|
|
from typing import List, Optional, Tuple |
|
|
|
|
|
|
@@ -216,6 +219,133 @@ class Command(BaseCommand):
|
|
|
|
|
|
|
sleep(2) # Throttle to avoid rate-limiting |
|
|
|
|
|
|
|
def find_author_on_dblp(
    self,
    name: str,
    limit: int = 100,
) -> Optional[List[dict]]:
    """Search the dblp author API for *name*.

    Args:
        name: Author name to search for.
        limit: Maximum number of hits to request.

    Returns:
        A list of per-hit ``info`` dicts (as returned by dblp), or
        ``None`` when dblp reports zero hits.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    # Use https directly; the plain-http endpoint only redirects
    # (and the rest of this command already uses https URLs).
    url = 'https://dblp.org/search/author/api'
    response = requests.get(
        url,
        params={
            'q': name,
            'f': 0,            # index of the first hit to return
            'h': limit,        # maximum number of hits
            'c': 0,            # no completion terms
            'format': 'json',
        },
    )
    response.raise_for_status()
    hits = response.json()['result']['hits']
    if hits['@total'] == '0':
        return None
    return [hit['info'] for hit in hits['hit']]
|
|
|
|
|
|
|
def verify_author(self, pid, keys):
    """Return True iff every cite key in *keys* occurs in the dblp record for *pid*.

    Args:
        pid: dblp person id (e.g. ``"12/3456"``).
        keys: Set of dblp cite keys expected to belong to this person.

    Raises:
        requests.HTTPError: If the record cannot be fetched.
    """
    url = f"https://dblp.org/pid/{pid}.xml"
    response = requests.get(url)
    response.raise_for_status()
    # Use response.text, not str(response.content): stringifying the bytes
    # object would embed the b'...' repr prefix and backslash escapes,
    # which can corrupt matched keys containing non-ASCII characters.
    pubs = re.findall(r'key="([^"]*)"', response.text)
    sleep(1)  # Throttle to avoid rate-limiting
    # All known keys must appear in the person's publication list.
    return keys.issubset(pubs)
|
|
|
|
|
|
|
def find_pid_for_authors(self):
    """Look up and store a dblp pid for every author that has none yet.

    For each such author, the set of dblp cite keys of their known
    publications is used to verify candidate matches returned by the
    dblp author search.
    """
    self.log_info("--- Adding ID's to authors ---")
    authors = Author.objects.filter(dblp_id__exact="")
    for author in tqdm(authors, unit='author'):
        # Cite keys (without the dblp prefix) of this author's publications;
        # used as ground truth to verify a candidate pid.
        keys = {
            dblp.strip_cite_key_prefix(cite_key)
            for cite_key in PublicationAuthor.objects.filter(
                author__exact=author,
                publication__cite_key__startswith=dblp.CITE_KEY_PREFIX,
            ).values_list('publication__cite_key', flat=True).distinct()
        }
        if not keys:
            continue  # Nothing to verify against; skip this author.

        options = self.find_author_on_dblp(author.name)
        if not options:
            self.log_warn(f"Could not find ID for author {author}")
            continue

        # Try each candidate until one is verified. (The original code
        # special-cased len(options) == 1 but omitted the 'url' guard
        # there, which raised KeyError on hits without a url.)
        for opt in options:
            if 'url' not in opt:
                continue
            match = re.match('https://dblp.org/pid/(.*)', opt['url'])
            if match is None:
                continue  # Not a person record URL; skip candidate.
            pid = match.groups()[0]
            if self.verify_author(pid, keys):
                self.log_success(f"Found {pid} for author {author}")
                author.dblp_id = pid
                author.full_clean()
                author.save()
                self.log_success(f"Added pid for author {author}")
                break
|
|
|
|
|
|
|
@transaction.atomic
def merge(self, author, variant):
    """Re-attach all of *variant*'s publication links to *author*, then delete *variant*.

    Runs in a single transaction so a failure leaves the database unchanged.

    Raises:
        CommandError: If a re-created PublicationAuthor fails validation.
    """
    for pub_author in PublicationAuthor.objects.filter(author=variant):
        publication = pub_author.publication
        if PublicationAuthor.objects.filter(author=author, publication=publication).exists():
            # *author* is already linked to this publication, so the
            # variant's link is redundant: drop it and keep processing.
            # (The original code used `break` here, which abandoned the
            # remaining links and then tripped the assert below.)
            self.log_warn(f"{author} is already author of {publication}")
            pub_author.delete()
            continue
        fixed = PublicationAuthor(
            author=author,
            publication=publication,
            position=pub_author.position,
        )
        try:
            # Delete first to free the (publication, position) slot
            # before saving the replacement link.
            pub_author.delete()
            fixed.full_clean()
            fixed.save()
            self.log_success(f"Changed author: {publication}: {variant} -> {fixed.author}")
        except ValidationError as e:
            raise CommandError(f"{publication}: {variant} -> {fixed.author}: {e}")
    # Every link must have been moved or removed before deleting the variant.
    assert not PublicationAuthor.objects.filter(author=variant).exists()
    variant.delete()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_authors(self) -> None:
    """Find authors sharing a dblp pid and interactively merge name variants.

    For each pid held by more than one Author row, fetch the canonical
    name from dblp and offer to merge all differently-named rows into
    the one matching the canonical name.

    Raises:
        CommandError: On Ctrl-C, or when the dblp record cannot be parsed
            or no unique canonical Author row exists.
    """
    self.log_info("--- Searching for potential duplicate authors and merging their publications ---")

    # dblp pids that occur on more than one Author row.
    dup_ids = (Author.objects.exclude(dblp_id__exact="")
               .values_list('dblp_id')
               .annotate(count=Count('id'))
               .filter(count__gt=1))

    try:
        for (pid, counter) in tqdm(dup_ids, unit="duplicate"):
            response = requests.get(f"https://dblp.org/pid/{pid}.xml")
            response.raise_for_status()
            # response.text handles the charset; decoding with 'ascii'
            # (as before) raised UnicodeDecodeError on non-ASCII bytes.
            match = re.search('dblpperson name="([^"]*)"', response.text)
            if match is None:
                raise CommandError(f"Could not parse dblp person record for pid {pid}")
            dblp_name = html.unescape(match.group(1))

            variants = Author.objects.filter(dblp_id__exact=pid).exclude(name__exact=dblp_name)
            orig = Author.objects.filter(dblp_id__exact=pid, name__exact=dblp_name)
            if len(orig) != 1:
                raise CommandError(f"Expected exactly one author named '{dblp_name}' for pid {pid}, found {len(orig)}")

            self.log_info(f"Suggestion to merge {', '.join([v.name for v in variants])} into {orig[0]}")

            # Prompt until we get an answer; empty input defaults to yes.
            while True:
                self.log_warn("Merge? [Y/n]")
                choice = input().lower()
                if choice in {'', 'y', 'yes'}:
                    for variant in variants:
                        self.merge(orig[0], variant)
                    break
                elif choice in {'n', 'no'}:
                    break
    except KeyboardInterrupt:
        raise CommandError("Aborted.")
|
|
|
|
|
|
|
# BaseCommand |
|
|
|
def add_arguments(self, parser: CommandParser): |
|
|
|
parser.add_argument('-b', '--abstract', action='store_true') |
|
|
@@ -225,11 +355,13 @@ class Command(BaseCommand):
|
|
|
|
|
|
|
def handle(self, *args, **options):
    """Entry point: run the requested fix-up passes in order.

    Each pass is gated on its command-line flag; fix_references always runs.
    """
    self.fix_references()
    # Each step only when its flag was given (stray unconditional
    # duplicates of find_secondary_on_dblp/fix_abstracts removed).
    if options['secondary']: self.find_secondary_on_dblp()
    if options['dblp']:
        self.fix_dblp()
        self.find_missing_dois()
        self.find_semanticscholar_ids()
    if options['authors']:
        self.find_pid_for_authors()
        self.merge_authors()
    if options['abstract']: self.fix_abstracts()
|
|
|
|
|
|
|