Browse Source

First steps towards fixing authors

master
Maya Herrscher 2 months ago
parent
commit
66a1054472
  1. 5
      sok/management/commands/dblpimport.py
  2. 12
      sok/management/commands/dblpsearch.py
  3. 75
      sok/management/commands/repair.py
  4. 18
      sok/migrations/0007_author_dblp_id.py
  5. 1
      sok/models.py

5
sok/management/commands/dblpimport.py

@ -96,7 +96,7 @@ class PublicationResult:
year: int year: int
pages: Optional[Tuple[int, int]] pages: Optional[Tuple[int, int]]
dblp_doi: Optional[str] = None dblp_doi: Optional[str] = None
authors: List[str] = field(default_factory=list) authors: List[Tuple[str,str]] = field(default_factory=list)
urls: List[str] = field(default_factory=list) urls: List[str] = field(default_factory=list)
@property @property
@ -202,7 +202,7 @@ class PublicationResult:
year=int(info['year']), year=int(info['year']),
pages=pages, pages=pages,
dblp_doi=doi, dblp_doi=doi,
authors=[html.unescape(author['text']) for author in authors], authors=[(author['@pid'], html.unescape(author['text'])) for author in authors],
) )
@classmethod @classmethod
@ -453,6 +453,7 @@ class Command(BaseCommand):
# Add authors to database # Add authors to database
authors: List[Author] = [] authors: List[Author] = []
for name in result.authors: for name in result.authors:
# TODO: find the authors' IDs -> they are not available in the XML!
author, created = Author.objects.get_or_create(name=name) author, created = Author.objects.get_or_create(name=name)
if created: if created:
self.log_success(f"Added author: {author}") self.log_success(f"Added author: {author}")

12
sok/management/commands/dblpsearch.py

@ -34,7 +34,7 @@ class Command(BaseCommand):
self.stdout.write("") self.stdout.write("")
self.log_info(result.cite_key) self.log_info(result.cite_key)
if 0 < len(result.authors): if 0 < len(result.authors):
self.stdout.write(" " + ", ".join([name for name in result.authors])) self.stdout.write(" " + ", ".join([author.name for author in result.authors]))
self.log_info(" " + result.title, nl=False) self.log_info(" " + result.title, nl=False)
self.stdout.write(f" ({result.year})") self.stdout.write(f" ({result.year})")
@ -66,8 +66,16 @@ class Command(BaseCommand):
# Store Authors # Store Authors
authors: List[Author] = [] authors: List[Author] = []
for name in result.authors: for (pid, name) in result.authors:
author, created = Author.objects.get_or_create(name=name) author, created = Author.objects.get_or_create(name=name)
if not author.dblp_id and pid != None:
try:
author.dblp_id = pid
author.full_clean()
author.save()
except ValidationError as e:
raise CommandError(f"{author}: {e}")
if created: if created:
self.log_success(f"Added author: {author}") self.log_success(f"Added author: {author}")
else: else:

75
sok/management/commands/repair.py

@ -4,12 +4,14 @@ from tqdm import tqdm
from django.db import transaction from django.db import transaction
from django.core.exceptions import ValidationError from django.core.exceptions import ValidationError
from django.core.management.base import BaseCommand, CommandError from django.core.management.base import BaseCommand, CommandError, CommandParser
from typing import List, Optional, Tuple
import sok.management.commands.dblpimport as dblp import sok.management.commands.dblpimport as dblp
from sok.management.commands.snowball import semanticscholar from sok.management.commands.snowball import semanticscholar
from sok.models import Publication, PublicationReference, SemanticScholar from sok.models import Publication, PublicationReference, SemanticScholar, Author, PublicationAuthor
class Command(BaseCommand): class Command(BaseCommand):
@ -17,6 +19,9 @@ class Command(BaseCommand):
def log_success(self, msg: str): def log_success(self, msg: str):
self.stdout.write(self.style.SUCCESS(msg)) self.stdout.write(self.style.SUCCESS(msg))
def log_warn(self, msg: str):
    """Write *msg* to this command's stdout, rendered in the WARNING style."""
    styled = self.style.WARNING(msg)
    self.stdout.write(styled)
def log_info(self, msg: str, nl: bool = True): def log_info(self, msg: str, nl: bool = True):
self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '') self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '')
self.stdout.flush() self.stdout.flush()
@ -71,6 +76,7 @@ class Command(BaseCommand):
data = semanticscholar(semantic.paper_id) data = semanticscholar(semantic.paper_id)
if abstract := data['abstract']: if abstract := data['abstract']:
publication.abstract = abstract publication.abstract = abstract
if publication.peer_reviewed == None: publication.peer_reviewed = False
try: try:
publication.full_clean() publication.full_clean()
publication.save() publication.save()
@ -93,20 +99,61 @@ class Command(BaseCommand):
publication.first_page = result.first_page publication.first_page = result.first_page
publication.last_page = result.last_page publication.last_page = result.last_page
print(publication.peer_reviewed, result) print(publication.peer_reviewed, result)
# Keep the authors from Semantic Scholar for now, even though they might be slightly broken?
try: try:
publication.full_clean() publication.full_clean()
publication.save() publication.save()
self.log_success(f"Added DBLP info for: {publication}") self.log_success(f"Added DBLP info for: {publication}")
except ValidationError as e: except ValidationError as e:
raise CommandError(f"{publication}: {e}") raise CommandError(f"{publication}: {e}")
# Store Authors
authors: List[Author] = []
for (pid, name) in result.authors:
author, created = Author.objects.get_or_create(name=name)
if created:
self.log_success(f"Added author: {author}")
else:
self.log_info(f"Author '{author}' already known")
if not author.dblp_id and pid != None:
try:
author.dblp_id = pid
author.full_clean()
author.save()
self.log_success(f"Added pid to author: {author}")
except ValidationError as e:
raise CommandError(f"{author}: {e}")
authors.append(author)
# Replace authors for publication
if len(authors) != len(PublicationAuthor.objects.filter(publication=publication)): print(authors, [pa.author for pa in PublicationAuthor.objects.filter(publication=publication)])
assert len(authors) >= len(PublicationAuthor.objects.filter(publication=publication))
for publication_author in PublicationAuthor.objects.filter(publication=publication):
self.log_warn(f"Will now try to delete {publication_author.author} from {publication}")
try:
publication_author.delete()
except e:
raise CommandError(f"{publication} - {author}: {e}")
for position, author in enumerate(authors):
publication_author, created = PublicationAuthor.objects.get_or_create(
author=author,
publication=publication,
position=position,
)
if created:
self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
else:
self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")
else: continue # quite definitely not the same publication else: continue # quite definitely not the same publication
elif publication.title == result.title and publication.year == result.year: elif publication.title == result.title and publication.year == result.year:
print(f"Not quite certain about {result.cite_key} for publication {publication.title}") self.log_warn(f"Not quite certain about {result.cite_key} for publication {publication.title}")
else: continue # I'd rather look at that in detail for now else: continue # I'd rather look at that in detail for now
def find_secondary_on_dblp(self): def find_secondary_on_dblp(self):
self.log_info("--- Searching for secondary and tertiary sources in the default DBLP dump ---") self.log_info("--- Searching for snowballed sources on DBLP ---")
for publication in tqdm(Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX), unit="publication"): for publication in tqdm(Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX), unit="publication"):
if publication.stage == 'excluded': continue if publication.stage == 'excluded': continue
self.search_on_dblp(publication) self.search_on_dblp(publication)
@ -117,7 +164,7 @@ class Command(BaseCommand):
).values_list('cite_key', flat=True).distinct() ).values_list('cite_key', flat=True).distinct()
) )
n = len(keys) n = len(keys)
self.log_info(f"Found {n} publications that still need to be checked") self.log_info(f"Found {n} publications that still need to be verified")
def fix_dblp(self): def fix_dblp(self):
self.log_info("--- Searching for entries not in the default DBLP dump ---") self.log_info("--- Searching for entries not in the default DBLP dump ---")
@ -170,11 +217,19 @@ class Command(BaseCommand):
sleep(2) # Throttle to avoid rate-limiting sleep(2) # Throttle to avoid rate-limiting
# BaseCommand # BaseCommand
def add_arguments(self, parser: CommandParser):
    """Register the boolean command-line switches of this command.

    Every switch uses ``action='store_true'``, so the matching option is
    ``False`` unless the flag is passed on the command line.

    NOTE(review): the visible ``handle`` reads only 'abstract', 'dblp' and
    'authors'; 'secondary' appears to be unused there -- confirm.
    """
    switches = (
        ('-b', '--abstract'),
        ('-d', '--dblp'),
        ('-a', '--authors'),
        ('-s', '--secondary'),
    )
    for short_flag, long_flag in switches:
        parser.add_argument(short_flag, long_flag, action='store_true')
def handle(self, *args, **options): def handle(self, *args, **options):
self.fix_references() self.fix_references()
self.fix_abstracts() if options['abstract']: self.fix_abstracts()
self.find_secondary_on_dblp() self.find_secondary_on_dblp()
self.fix_dblp() if options['dblp']:
self.find_missing_dois() self.fix_dblp()
self.find_semanticscholar_ids() self.find_missing_dois()
self.find_semanticscholar_ids()
if options['authors']: self.fix_authors()

18
sok/migrations/0007_author_dblp_id.py

@ -0,0 +1,18 @@
# Generated by Django 5.2.1 on 2025-05-16 08:36
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the ``dblp_id`` character field (may be blank) to the ``author`` model."""

    # Must be applied after the previous migration of the 'sok' app.
    dependencies = [('sok', '0006_alter_publication_abstract')]

    operations = [
        migrations.AddField(
            field=models.CharField(max_length=127, blank=True),
            name='dblp_id',
            model_name='author',
        ),
    ]

1
sok/models.py

@ -7,6 +7,7 @@ from django.db.models.query import QuerySet
class Author(models.Model): class Author(models.Model):
name = models.CharField(max_length=255, unique=True) name = models.CharField(max_length=255, unique=True)
dblp_id = models.CharField(max_length=127, blank=True)
def __str__(self) -> str: def __str__(self) -> str:
return self.name return self.name

Loading…
Cancel
Save