Review note: this patch was re-flowed from a whitespace-collapsed copy.
Indentation (4 spaces) and the position of blank context lines are a
best-effort reconstruction; the blob hashes on the `index` lines predate the
review changes made to the added code in repair.py.

diff --git a/sok/management/commands/dblpimport.py b/sok/management/commands/dblpimport.py
index 7af064a..084f0ad 100644
--- a/sok/management/commands/dblpimport.py
+++ b/sok/management/commands/dblpimport.py
@@ -166,7 +166,7 @@ class PublicationResult:

         url = f"https://dblp.uni-trier.de/rec/{key}.xml"
         response = requests.get(url)
-        response.raise_for_status
+        response.raise_for_status()

         parser = xml.sax.make_parser()
         handler = DBLPHandler({key})
@@ -223,10 +223,10 @@ class PublicationResult:
                 'format': 'json',
             },
         )
-        response.raise_for_status
+        response.raise_for_status()
         search_result = response.json()['result']
         hits = search_result['hits']
-        if hits['@total'] == '0': return (None, None, 0)
+        if hits['@total'] == '0' or hits['@sent'] == '0': return (None, None, 0)

         results = [cls.from_search_hit(hit) for hit in hits['hit']]
         total = hits['@total']
diff --git a/sok/management/commands/repair.py b/sok/management/commands/repair.py
index 78fffdf..d7352f8 100644
--- a/sok/management/commands/repair.py
+++ b/sok/management/commands/repair.py
@@ -2,9 +2,14 @@ from pprint import pprint
 from time import sleep
 from tqdm import tqdm

+import html
+import re
+import requests
+
 from django.db import transaction
 from django.core.exceptions import ValidationError
 from django.core.management.base import BaseCommand, CommandError, CommandParser
+from django.db.models import Count

 from typing import List, Optional, Tuple

@@ -216,6 +221,152 @@ class Command(BaseCommand):

             sleep(2) # Throttle to avoid rate-limiting

+    def find_author_on_dblp(
+        self,
+        name: str,
+        limit: int = 100,
+    ) -> Optional[List[dict]]:
+        """Search the DBLP author API for ``name``.
+
+        Return the ``info`` entry of each hit (at most ``limit`` are requested),
+        or ``None`` if DBLP reports no hits.
+        """
+        url = 'https://dblp.org/search/author/api'
+        response = requests.get(
+            url,
+            params={
+                'q': name,
+                'f': 0,
+                'h': limit,
+                'c': 0,
+                'format': 'json',
+            },
+        )
+        response.raise_for_status()
+        hits = response.json()['result']['hits']
+        # Same guard as the publication search: without sent hits there is no 'hit' list to read.
+        if hits['@total'] == '0' or hits['@sent'] == '0':
+            return None
+        return [hit['info'] for hit in hits['hit']]
+
+    def verify_author(self, pid: str, keys: set) -> bool:
+        """Return whether every key in ``keys`` occurs in the DBLP record of ``pid``."""
+        url = f"https://dblp.org/pid/{pid}.xml"
+        response = requests.get(url)
+        response.raise_for_status()
+        # Decode the payload instead of matching against the repr() of the raw bytes.
+        xml_text = response.content.decode('utf-8', errors='replace')
+        pubs = re.findall(r'key="([^"]*)"', xml_text)
+        sleep(1)  # Throttle to avoid rate-limiting
+        return keys.issubset(pubs)
+
+    def find_pid_for_authors(self):
+        """Look up and store the DBLP person ID of every author that lacks one.
+
+        A candidate ID is accepted only if the DBLP record behind it lists all
+        DBLP-keyed publications that are attributed to the author locally.
+        """
+        self.log_info("--- Adding ID's to authors ---")
+        authors = Author.objects.filter(dblp_id__exact="")
+        for author in tqdm(authors, unit='author'):
+            cite_keys = PublicationAuthor.objects.filter(
+                author__exact=author,
+                publication__cite_key__startswith=dblp.CITE_KEY_PREFIX,
+            ).values_list('publication__cite_key', flat=True).distinct()
+            keys = {dblp.strip_cite_key_prefix(cite_key) for cite_key in cite_keys}
+            if not keys:
+                continue
+
+            options = self.find_author_on_dblp(author.name)
+            if not options:
+                self.log_warn(f"Could not find ID for author {author}")
+                continue
+
+            for opt in options:
+                # Skip hits without a well-formed person URL instead of crashing on them.
+                match = re.match(r'https://dblp\.org/pid/(.*)', opt.get('url', ''))
+                if match is None:
+                    continue
+                pid = match.group(1)
+                if self.verify_author(pid, keys):
+                    self.log_success(f"Found {pid} for author {author}")
+                    author.dblp_id = pid
+                    author.full_clean()
+                    author.save()
+                    self.log_success(f"Added pid for author {author}")
+                    break
+
+    @transaction.atomic
+    def merge(self, author, variant):
+        """Move all publications of ``variant`` to ``author`` and delete ``variant``."""
+        for pub_author in PublicationAuthor.objects.filter(author=variant):
+            publication = pub_author.publication
+            if PublicationAuthor.objects.filter(author=author, publication=publication).exists():
+                self.log_warn(f"{author} is already author of {publication}")
+                # The variant's link is redundant: drop it and keep going, otherwise
+                # the variant keeps publications and can never be removed below.
+                pub_author.delete()
+                continue
+            fixed = PublicationAuthor(
+                author=author,
+                publication=publication,
+                position=pub_author.position,
+            )
+            try:
+                pub_author.delete()
+                fixed.full_clean()
+                fixed.save()
+            except ValidationError as e:
+                raise CommandError(f"{publication}: {variant} -> {fixed.author}: {e}") from e
+            self.log_success(f"Changed author: {publication}: {variant} -> {fixed.author}")
+        # Explicit check rather than `assert`, which is stripped under `python -O`.
+        if PublicationAuthor.objects.filter(author=variant).exists():
+            raise CommandError(f"{variant} still has publications, refusing to delete")
+        variant.delete()
+
+    def merge_authors(self) -> None:
+        """Interactively merge authors sharing a DBLP ID into the one carrying the DBLP name."""
+        self.log_info("--- Searching for potential duplicate authors and merging their publications ---")
+
+        dup_ids = (
+            Author.objects.exclude(dblp_id__exact="")
+            .values_list('dblp_id')
+            .annotate(count=Count('id'))
+            .filter(count__gt=1)
+        )
+
+        try:
+            for (pid, _count) in tqdm(dup_ids, unit="duplicate"):
+                response = requests.get(f"https://dblp.org/pid/{pid}.xml")
+                response.raise_for_status()
+                person_xml = response.content.decode('utf-8', errors='replace')
+                match = re.search(r'dblpperson name="([^"]*)"', person_xml)
+                if match is None:
+                    self.log_warn(f"Could not determine the DBLP name for {pid}")
+                    continue
+                dblp_name = html.unescape(match.group(1))
+
+                same_pid = Author.objects.filter(dblp_id__exact=pid)
+                variants = list(same_pid.exclude(name__exact=dblp_name))
+                orig = list(same_pid.filter(name__exact=dblp_name))
+                if len(orig) != 1:
+                    self.log_warn(f"Expected exactly one author named {dblp_name} for {pid}, found {len(orig)}")
+                    continue
+
+                self.log_info(f"Suggestion to merge {', '.join([v.name for v in variants])} into {orig[0]}")
+
+                while True:
+                    self.log_warn("Merge? [Y/n]")
+                    choice = input().strip().lower()
+                    if choice in {'', 'y', 'yes'}:
+                        for variant in variants:
+                            self.merge(orig[0], variant)
+                        break
+                    if choice in {'n', 'no'}:
+                        break
+        except KeyboardInterrupt:
+            raise CommandError("Aborted.")
+
     # BaseCommand
     def add_arguments(self, parser: CommandParser):
         parser.add_argument('-b', '--abstract', action='store_true')
@@ -225,9 +376,11 @@ class Command(BaseCommand):

     def handle(self, *args, **options):
         self.fix_references()
-        if options['abstract']: self.fix_abstracts()
-        self.find_secondary_on_dblp()
+        if options['secondary']: self.find_secondary_on_dblp()
         if options['dblp']: self.fix_dblp()
         self.find_missing_dois()
         self.find_semanticscholar_ids()
-        if options['authors']: self.fix_authors()
+        if options['authors']:
+            self.find_pid_for_authors()
+            self.merge_authors()
+        if options['abstract']: self.fix_abstracts()