Browse Source

Small fixes and author merging

master
Maya Herrscher 1 month ago
parent
commit
3bbcc454c7
  1. 6
      sok/management/commands/dblpimport.py
  2. 138
      sok/management/commands/repair.py

6
sok/management/commands/dblpimport.py

@ -166,7 +166,7 @@ class PublicationResult:
url = f"https://dblp.uni-trier.de/rec/{key}.xml" url = f"https://dblp.uni-trier.de/rec/{key}.xml"
response = requests.get(url) response = requests.get(url)
response.raise_for_status response.raise_for_status()
parser = xml.sax.make_parser() parser = xml.sax.make_parser()
handler = DBLPHandler({key}) handler = DBLPHandler({key})
@ -223,10 +223,10 @@ class PublicationResult:
'format': 'json', 'format': 'json',
}, },
) )
response.raise_for_status response.raise_for_status()
search_result = response.json()['result'] search_result = response.json()['result']
hits = search_result['hits'] hits = search_result['hits']
if hits['@total'] == '0': return (None, None, 0) if hits['@total'] == '0' or hits['@sent'] == '0': return (None, None, 0)
results = [cls.from_search_hit(hit) for hit in hits['hit']] results = [cls.from_search_hit(hit) for hit in hits['hit']]
total = hits['@total'] total = hits['@total']

138
sok/management/commands/repair.py

@ -2,9 +2,12 @@ from pprint import pprint
from time import sleep from time import sleep
from tqdm import tqdm from tqdm import tqdm
import re, requests, html
from django.db import transaction from django.db import transaction
from django.core.exceptions import ValidationError from django.core.exceptions import ValidationError
from django.core.management.base import BaseCommand, CommandError, CommandParser from django.core.management.base import BaseCommand, CommandError, CommandParser
from django.db.models import Count
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
@ -216,6 +219,133 @@ class Command(BaseCommand):
sleep(2) # Throttle to avoid rate-limiting sleep(2) # Throttle to avoid rate-limiting
def find_author_on_dblp(
    self,
    name: str,
    limit: int = 100,
) -> Optional[List[dict]]:
    """Query the DBLP author search API for ``name``.

    Args:
        name: Author name sent as the ``q`` query parameter.
        limit: Value sent as the ``h`` query parameter
            (presumably the maximum number of hits; confirm against
            the DBLP API documentation).

    Returns:
        A list with the ``info`` entry of every hit in the response, or
        ``None`` when the response reports zero matches or carries no
        hit list.

    Raises:
        requests.HTTPError: If the response has an error status
            (via ``raise_for_status``).
    """
    url = 'https://dblp.org/search/author/api'
    response = requests.get(
        url,
        params={
            'q': name,
            'f': 0,
            'h': limit,
            'c': 0,
            'format': 'json',
        },
    )
    response.raise_for_status()
    hits = response.json()['result']['hits']
    # A non-zero '@total' does not guarantee that a 'hit' list was sent,
    # so guard the lookup instead of failing with a KeyError.
    found = hits.get('hit')
    if hits['@total'] == '0' or not found:
        return None
    return [hit['info'] for hit in found]
def verify_author(self, pid, keys):
    """Check whether all of ``keys`` appear on the DBLP page of ``pid``.

    Args:
        pid: DBLP person id, interpolated into
            ``https://dblp.org/pid/{pid}.xml``.
        keys: Set of publication keys expected for this person.

    Returns:
        ``True`` if every element of ``keys`` occurs as the value of a
        ``key="..."`` attribute in the downloaded XML, else ``False``.

    Raises:
        requests.HTTPError: If the response has an error status.
    """
    url = f"https://dblp.org/pid/{pid}.xml"
    response = requests.get(url)
    response.raise_for_status()
    # Search the decoded document; str(response.content) would search the
    # repr of the raw bytes (b'...' with escape sequences) instead.
    pubs = re.findall(r'key="([^"]*)"', response.text)
    sleep(1)  # Throttle to avoid rate-limiting
    return keys.issubset(pubs)
def find_pid_for_authors(self):
    """Fill in ``dblp_id`` for authors that do not have one yet.

    For every author with an empty ``dblp_id``, the cite keys of their
    DBLP-prefixed publications are collected, the author name is looked up
    with ``find_author_on_dblp`` and the first candidate whose DBLP page
    lists all of those keys (``verify_author``) is stored as the author's
    ``dblp_id``. Authors without any DBLP-prefixed publication are skipped.
    """
    self.log_info("--- Adding ID's to authors ---")
    authors = Author.objects.filter(dblp_id__exact="")
    for author in tqdm(authors, unit='author'):
        cite_keys = PublicationAuthor.objects.filter(
            author__exact=author,
            publication__cite_key__startswith=dblp.CITE_KEY_PREFIX,
        ).values_list('publication__cite_key', flat=True).distinct()
        keys = {dblp.strip_cite_key_prefix(cite_key) for cite_key in cite_keys}
        if not keys:
            # Nothing to verify a candidate against.
            continue

        options = self.find_author_on_dblp(author.name)
        if not options:
            self.log_warn(f"Could not find ID for author {author}")
            continue

        # One loop handles both the single- and the multi-candidate case.
        for opt in options:
            match = re.match(r'https://dblp\.org/pid/(.*)', opt.get('url', ''))
            if match is None:
                # Candidate has no usable profile URL.
                continue
            pid = match.group(1)
            if self.verify_author(pid, keys):
                self.log_success(f"Found {pid} for author {author}")
                author.dblp_id = pid
                author.full_clean()
                author.save()
                self.log_success(f"Added pid for author {author}")
                break
        else:
            # No candidate listed all known publications; say so instead of
            # skipping the author silently.
            self.log_warn(f"Could not find ID for author {author}")
@transaction.atomic
def merge(self, author, variant):
    """Move all publication links of ``variant`` to ``author`` and delete ``variant``.

    Runs inside ``transaction.atomic``, so an exception raised here rolls
    back every change made by this call.

    Args:
        author: The author record that is kept.
        variant: The duplicate author record that is removed.

    Raises:
        CommandError: If a re-created link fails validation, or if
            ``variant`` still has publication links after the loop.
    """
    for pub_author in PublicationAuthor.objects.filter(author=variant):
        publication = pub_author.publication
        if PublicationAuthor.objects.filter(author=author, publication=publication).exists():
            # The kept author is already linked to this publication, so the
            # variant's link is redundant. Drop it and carry on; stopping
            # here would leave the remaining links unmerged and make the
            # final consistency check fail.
            self.log_warn(f"{author} is already author of {publication}")
            pub_author.delete()
            continue

        fixed = PublicationAuthor(
            author=author,
            publication=publication,
            position=pub_author.position,
        )
        try:
            # Remove the old link before validating the new one, which
            # reuses the same publication and position.
            pub_author.delete()
            fixed.full_clean()
            fixed.save()
        except ValidationError as e:
            raise CommandError(f"{publication}: {variant} -> {fixed.author}: {e}") from e
        self.log_success(f"Changed author: {publication}: {variant} -> {fixed.author}")

    # Explicit check instead of ``assert`` so it also runs under ``python -O``.
    if PublicationAuthor.objects.filter(author=variant).exists():
        raise CommandError(f"{variant} still has publications after merging into {author}")
    variant.delete()
def merge_authors(self) -> None:
    """Interactively merge author records that share a ``dblp_id``.

    For every non-empty ``dblp_id`` used by more than one author, the
    person's name is read from ``https://dblp.org/pid/{pid}.xml``. The
    author whose name equals that DBLP name is the merge target; all other
    authors with the same id are offered for merging and, after
    confirmation on stdin, passed to ``merge``.

    Raises:
        CommandError: If the user interrupts with Ctrl-C.
        requests.HTTPError: If DBLP answers with an error status.
    """
    self.log_info("--- Searching for potential duplicate authors and merging their publications ---")
    dup_ids = (
        Author.objects.exclude(dblp_id__exact="")
        .values_list('dblp_id')
        .annotate(count=Count('id'))
        .filter(count__gt=1)
    )
    try:
        for pid, _count in tqdm(dup_ids, unit="duplicate"):
            response = requests.get(f"https://dblp.org/pid/{pid}.xml")
            response.raise_for_status()
            sleep(1)  # Throttle to avoid rate-limiting

            match = re.search(r'dblpperson name="([^"]*)"', response.text)
            if match is None:
                self.log_warn(f"Could not read the DBLP name for {pid}")
                continue
            # The name is taken from XML, so resolve character references.
            dblp_name = html.unescape(match.group(1))

            same_pid = Author.objects.filter(dblp_id__exact=pid)
            variants = list(same_pid.exclude(name__exact=dblp_name))
            orig = list(same_pid.filter(name__exact=dblp_name))
            if len(orig) != 1:
                # Without exactly one record carrying the DBLP name there is
                # no unambiguous merge target; skip rather than abort the run.
                self.log_warn(f"No unique author named {dblp_name} for {pid}, skipping")
                continue
            target = orig[0]

            self.log_info(f"Suggestion to merge {', '.join([v.name for v in variants])} into {target}")
            while True:
                self.log_warn("Merge? [Y/n]")
                choice = input().strip().lower()
                if choice in {'', 'y', 'yes'}:
                    for variant in variants:
                        self.merge(target, variant)
                    break
                if choice in {'n', 'no'}:
                    break
    except KeyboardInterrupt:
        raise CommandError("Aborted.")
# BaseCommand # BaseCommand
def add_arguments(self, parser: CommandParser): def add_arguments(self, parser: CommandParser):
parser.add_argument('-b', '--abstract', action='store_true') parser.add_argument('-b', '--abstract', action='store_true')
@ -225,11 +355,13 @@ class Command(BaseCommand):
def handle(self, *args, **options): def handle(self, *args, **options):
self.fix_references() self.fix_references()
if options['abstract']: self.fix_abstracts() if options['secondary']: self.find_secondary_on_dblp()
self.find_secondary_on_dblp()
if options['dblp']: if options['dblp']:
self.fix_dblp() self.fix_dblp()
self.find_missing_dois() self.find_missing_dois()
self.find_semanticscholar_ids() self.find_semanticscholar_ids()
if options['authors']: self.fix_authors() if options['authors']:
self.find_pid_for_authors()
self.merge_authors()
if options['abstract']: self.fix_abstracts()

Loading…
Cancel
Save