Browse Source

First steps towards fixing authors

master
Maya Herrscher 2 months ago
parent
commit
66a1054472
  1. 5
      sok/management/commands/dblpimport.py
  2. 12
      sok/management/commands/dblpsearch.py
  3. 69
      sok/management/commands/repair.py
  4. 18
      sok/migrations/0007_author_dblp_id.py
  5. 1
      sok/models.py

5
sok/management/commands/dblpimport.py

@ -96,7 +96,7 @@ class PublicationResult:
year: int
pages: Optional[Tuple[int, int]]
dblp_doi: Optional[str] = None
authors: List[str] = field(default_factory=list)
authors: List[Tuple[str,str]] = field(default_factory=list)
urls: List[str] = field(default_factory=list)
@property
@ -202,7 +202,7 @@ class PublicationResult:
year=int(info['year']),
pages=pages,
dblp_doi=doi,
authors=[html.unescape(author['text']) for author in authors],
authors=[(author['@pid'], html.unescape(author['text'])) for author in authors],
)
@classmethod
@ -453,6 +453,7 @@ class Command(BaseCommand):
# Add authors to database
authors: List[Author] = []
for name in result.authors:
# TODO? find author IDs -> not in the XML!
author, created = Author.objects.get_or_create(name=name)
if created:
self.log_success(f"Added author: {author}")

12
sok/management/commands/dblpsearch.py

@ -34,7 +34,7 @@ class Command(BaseCommand):
self.stdout.write("")
self.log_info(result.cite_key)
if 0 < len(result.authors):
self.stdout.write(" " + ", ".join([name for name in result.authors]))
# result.authors holds (pid, name) tuples (they are unpacked that way when the
# authors are stored further down), so unpack here instead of reading `.name`,
# which a tuple does not have.
self.stdout.write(" " + ", ".join(name for _pid, name in result.authors))
self.log_info(" " + result.title, nl=False)
self.stdout.write(f" ({result.year})")
@ -66,8 +66,16 @@ class Command(BaseCommand):
# Store Authors
authors: List[Author] = []
for name in result.authors:
for (pid, name) in result.authors:
author, created = Author.objects.get_or_create(name=name)
if not author.dblp_id and pid != None:
try:
author.dblp_id = pid
author.full_clean()
author.save()
except ValidationError as e:
raise CommandError(f"{author}: {e}")
if created:
self.log_success(f"Added author: {author}")
else:

69
sok/management/commands/repair.py

@ -4,12 +4,14 @@ from tqdm import tqdm
from django.db import transaction
from django.core.exceptions import ValidationError
from django.core.management.base import BaseCommand, CommandError
from django.core.management.base import BaseCommand, CommandError, CommandParser
from typing import List, Optional, Tuple
import sok.management.commands.dblpimport as dblp
from sok.management.commands.snowball import semanticscholar
from sok.models import Publication, PublicationReference, SemanticScholar
from sok.models import Publication, PublicationReference, SemanticScholar, Author, PublicationAuthor
class Command(BaseCommand):
@ -17,6 +19,9 @@ class Command(BaseCommand):
def log_success(self, msg: str):
    """Write ``msg`` to the command's stdout, wrapped in the SUCCESS style."""
    styled = self.style.SUCCESS(msg)
    self.stdout.write(styled)
def log_warn(self, msg: str):
    """Write ``msg`` to the command's stdout, wrapped in the WARNING style."""
    styled = self.style.WARNING(msg)
    self.stdout.write(styled)
def log_info(self, msg: str, nl: bool = True):
    """Write ``msg`` in the HTTP_INFO style and flush stdout immediately.

    ``nl`` selects the line ending: a newline when true, nothing when false,
    so a follow-up write can continue on the same line.
    """
    if nl:
        terminator = '\n'
    else:
        terminator = ''
    out = self.stdout
    out.write(self.style.HTTP_INFO(msg), ending=terminator)
    out.flush()
@ -71,6 +76,7 @@ class Command(BaseCommand):
data = semanticscholar(semantic.paper_id)
if abstract := data['abstract']:
publication.abstract = abstract
if publication.peer_reviewed == None: publication.peer_reviewed = False
try:
publication.full_clean()
publication.save()
@ -93,20 +99,61 @@ class Command(BaseCommand):
publication.first_page = result.first_page
publication.last_page = result.last_page
print(publication.peer_reviewed, result)
# keep authors from semantic scholar for now, even though they might be a little broken?
try:
publication.full_clean()
publication.save()
self.log_success(f"Added DBLP info for: {publication}")
except ValidationError as e:
raise CommandError(f"{publication}: {e}")
# Store authors: make sure every author listed in `result` exists in the
# database and collect them in the order given by `result.authors`.
authors: List[Author] = []
for pid, name in result.authors:
    author, created = Author.objects.get_or_create(name=name)
    if created:
        self.log_success(f"Added author: {author}")
    else:
        self.log_info(f"Author '{author}' already known")
    # Only fill in a missing DBLP id; an id that is already stored is kept.
    if not author.dblp_id and pid is not None:
        try:
            author.dblp_id = pid
            author.full_clean()
            author.save()
            self.log_success(f"Added pid to author: {author}")
        except ValidationError as e:
            raise CommandError(f"{author}: {e}") from e
    authors.append(author)

# Replace authors for publication: drop the current assignments, then
# re-create them from `authors` with their list index as position.
# The existing assignments are fetched once and reused below.
existing = list(PublicationAuthor.objects.filter(publication=publication))
if len(authors) != len(existing):
    # Debug output: show both author lists when their sizes differ.
    print(authors, [pa.author for pa in existing])
# Abort before deleting anything if fewer authors were found than are
# currently assigned to the publication.
assert len(authors) >= len(existing)
for publication_author in existing:
    self.log_warn(f"Will now try to delete {publication_author.author} from {publication}")
    try:
        publication_author.delete()
    except Exception as e:
        # `except e:` referenced an unbound name and the message used the
        # stale `author` from the loop above; report the assignment that
        # actually failed to delete.
        raise CommandError(f"{publication} - {publication_author.author}: {e}") from e

for position, author in enumerate(authors):
    publication_author, created = PublicationAuthor.objects.get_or_create(
        author=author,
        publication=publication,
        position=position,
    )
    if created:
        self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
    else:
        self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")
else: continue # quite definitely not the same publication
elif publication.title == result.title and publication.year == result.year:
print(f"Not quite certain about {result.cite_key} for publication {publication.title}")
self.log_warn(f"Not quite certain about {result.cite_key} for publication {publication.title}")
else: continue # I'd rather look at that in detail for now
# Hands every publication whose cite key does not start with
# dblp.CITE_KEY_PREFIX to search_on_dblp, showing a tqdm progress bar.
# NOTE(review): this span comes from a rendered diff, so the two log_info
# lines below are presumably the removed and the added variant of the same
# statement — confirm against the real file.
def find_secondary_on_dblp(self):
self.log_info("--- Searching for secondary and tertiary sources in the default DBLP dump ---")
self.log_info("--- Searching for snowballed sources on DBLP ---")
for publication in tqdm(Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX), unit="publication"):
# Publications in the 'excluded' stage are skipped.
if publication.stage == 'excluded': continue
self.search_on_dblp(publication)
@ -117,7 +164,7 @@ class Command(BaseCommand):
).values_list('cite_key', flat=True).distinct()
)
n = len(keys)
self.log_info(f"Found {n} publications that still need to be checked")
self.log_info(f"Found {n} publications that still need to be verified")
def fix_dblp(self):
self.log_info("--- Searching for entries not in the default DBLP dump ---")
@ -170,11 +217,19 @@ class Command(BaseCommand):
sleep(2) # Throttle to avoid rate-limiting
# BaseCommand
def add_arguments(self, parser: CommandParser):
    """Register the command's optional boolean switches on ``parser``.

    Every switch is a ``store_true`` option: ``-b/--abstract``,
    ``-d/--dblp``, ``-a/--authors`` and ``-s/--secondary``.
    """
    switches = (
        ('-b', '--abstract'),
        ('-d', '--dblp'),
        ('-a', '--authors'),
        ('-s', '--secondary'),
    )
    for short_flag, long_flag in switches:
        parser.add_argument(short_flag, long_flag, action='store_true')
# Entry point of the command: runs the repair steps below in order; some of
# them only run when the matching command-line flag is set.
# NOTE(review): this span comes from a rendered diff, so a removed line and
# its replacement can both appear (e.g. the unconditional and the flag-guarded
# fix_abstracts call) — confirm against the real file.
def handle(self, *args, **options):
self.fix_references()
self.fix_abstracts()
if options['abstract']: self.fix_abstracts()
# NOTE(review): add_arguments registers --secondary, but options['secondary']
# is never read in the lines shown here.
self.find_secondary_on_dblp()
if options['dblp']:
self.fix_dblp()
self.find_missing_dois()
self.find_semanticscholar_ids()
# NOTE(review): fix_authors is not defined in the visible part of this file —
# confirm it exists.
if options['authors']: self.fix_authors()

18
sok/migrations/0007_author_dblp_id.py

@ -0,0 +1,18 @@
# Generated by Django 5.2.1 on 2025-05-16 08:36
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the optional ``dblp_id`` text column to the ``author`` model."""

    # Applied after the preceding migration of the ``sok`` app.
    dependencies = [('sok', '0006_alter_publication_abstract')]

    operations = [
        migrations.AddField(
            model_name='author',
            name='dblp_id',
            field=models.CharField(max_length=127, blank=True),
        ),
    ]

1
sok/models.py

@ -7,6 +7,7 @@ from django.db.models.query import QuerySet
# A publication author, identified by a unique name.
class Author(models.Model):
# Display name; unique, at most 255 characters.
name = models.CharField(max_length=255, unique=True)
# DBLP person id of the author; optional (blank=True), at most 127
# characters, and not declared unique.
dblp_id = models.CharField(max_length=127, blank=True)
# An author is rendered as its name.
def __str__(self) -> str:
return self.name

Loading…
Cancel
Save