diff --git a/sok/management/commands/dblpimport.py b/sok/management/commands/dblpimport.py
index 2bcd418..7af064a 100644
--- a/sok/management/commands/dblpimport.py
+++ b/sok/management/commands/dblpimport.py
@@ -96,7 +96,7 @@ class PublicationResult:
     year: int
     pages: Optional[Tuple[int, int]]
     dblp_doi: Optional[str] = None
-    authors: List[str] = field(default_factory=list)
+    authors: List[Tuple[str,str]] = field(default_factory=list)  # (dblp pid, name) pairs
     urls: List[str] = field(default_factory=list)
 
     @property
@@ -202,7 +202,7 @@ class PublicationResult:
             year=int(info['year']),
             pages=pages,
             dblp_doi=doi,
-            authors=[html.unescape(author['text']) for author in authors],
+            authors=[(author['@pid'], html.unescape(author['text'])) for author in authors],
         )
 
     @classmethod
@@ -453,6 +453,7 @@ class Command(BaseCommand):
         # Add authors to database
         authors: List[Author] = []
         for name in result.authors:
+            # TODO? find author id's -> not in xml!
             author, created = Author.objects.get_or_create(name=name)
             if created:
                 self.log_success(f"Added author: {author}")
diff --git a/sok/management/commands/dblpsearch.py b/sok/management/commands/dblpsearch.py
index 19aa678..bc5346c 100644
--- a/sok/management/commands/dblpsearch.py
+++ b/sok/management/commands/dblpsearch.py
@@ -34,7 +34,7 @@ class Command(BaseCommand):
         self.stdout.write("")
         self.log_info(result.cite_key)
         if 0 < len(result.authors):
-            self.stdout.write(" " + ", ".join([name for name in result.authors]))
+            self.stdout.write(" " + ", ".join([name for (_pid, name) in result.authors]))  # authors are (pid, name) tuples
         self.log_info(" " + result.title, nl=False)
         self.stdout.write(f" ({result.year})")
 
@@ -66,8 +66,16 @@ class Command(BaseCommand):
 
         # Store Authors
         authors: List[Author] = []
-        for name in result.authors:
+        for (pid, name) in result.authors:
             author, created = Author.objects.get_or_create(name=name)
+            if not author.dblp_id and pid is not None:
+                try:
+                    author.dblp_id = pid
+                    author.full_clean()
+                    author.save()
+                except ValidationError as e:
+                    raise CommandError(f"{author}: {e}")
+
             if created:
                 self.log_success(f"Added author: {author}")
             else:
diff --git a/sok/management/commands/repair.py b/sok/management/commands/repair.py
index df74d06..78fffdf 100644
--- a/sok/management/commands/repair.py
+++ b/sok/management/commands/repair.py
@@ -4,12 +4,14 @@ from tqdm import tqdm
 
 from django.db import transaction
 from django.core.exceptions import ValidationError
-from django.core.management.base import BaseCommand, CommandError
+from django.core.management.base import BaseCommand, CommandError, CommandParser
+
+from typing import List, Optional, Tuple
 
 import sok.management.commands.dblpimport as dblp
 
 from sok.management.commands.snowball import semanticscholar
-from sok.models import Publication, PublicationReference, SemanticScholar
+from sok.models import Publication, PublicationReference, SemanticScholar, Author, PublicationAuthor
 
 
 class Command(BaseCommand):
@@ -17,6 +19,9 @@ class Command(BaseCommand):
     def log_success(self, msg: str):
         self.stdout.write(self.style.SUCCESS(msg))
 
+    def log_warn(self, msg: str):
+        self.stdout.write(self.style.WARNING(msg))
+
     def log_info(self, msg: str, nl: bool = True):
         self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '')
         self.stdout.flush()
@@ -71,6 +76,7 @@ class Command(BaseCommand):
         data = semanticscholar(semantic.paper_id)
         if abstract := data['abstract']:
             publication.abstract = abstract
+            if publication.peer_reviewed is None: publication.peer_reviewed = False
             try:
                 publication.full_clean()
                 publication.save()
@@ -93,20 +99,61 @@ class Command(BaseCommand):
                     publication.first_page = result.first_page
                     publication.last_page = result.last_page
                     print(publication.peer_reviewed, result)
-                    # keep authors from semantic scholar for now, even though they might be a little broken?
                     try:
                         publication.full_clean()
                         publication.save()
                         self.log_success(f"Added DBLP info for: {publication}")
                     except ValidationError as e:
                         raise CommandError(f"{publication}: {e}")
+
+                    # Store Authors
+                    authors: List[Author] = []
+                    for (pid, name) in result.authors:
+                        author, created = Author.objects.get_or_create(name=name)
+
+                        if created:
+                            self.log_success(f"Added author: {author}")
+                        else:
+                            self.log_info(f"Author '{author}' already known")
+                        if not author.dblp_id and pid is not None:
+                            try:
+                                author.dblp_id = pid
+                                author.full_clean()
+                                author.save()
+                                self.log_success(f"Added pid to author: {author}")
+                            except ValidationError as e:
+                                raise CommandError(f"{author}: {e}")
+
+                        authors.append(author)
+
+                    # Replace authors for publication
+                    if len(authors) != len(PublicationAuthor.objects.filter(publication=publication)): print(authors, [pa.author for pa in PublicationAuthor.objects.filter(publication=publication)])
+                    assert len(authors) >= len(PublicationAuthor.objects.filter(publication=publication))
+                    for publication_author in PublicationAuthor.objects.filter(publication=publication):
+                        self.log_warn(f"Will now try to delete {publication_author.author} from {publication}")
+                        try:
+                            publication_author.delete()
+                        except Exception as e:  # "except e:" would itself fail with NameError
+                            raise CommandError(f"{publication} - {publication_author.author}: {e}")
+
+                    for position, author in enumerate(authors):
+                        publication_author, created = PublicationAuthor.objects.get_or_create(
+                            author=author,
+                            publication=publication,
+                            position=position,
+                        )
+                        if created:
+                            self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
+                        else:
+                            self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")
+
                 else: continue # quite definitely not the same publication
             elif publication.title == result.title and publication.year == result.year:
-                print(f"Not quite certain about {result.cite_key} for publication {publication.title}")
+                self.log_warn(f"Not quite certain about {result.cite_key} for publication {publication.title}")
             else: continue # I'd rather look at that in detail for now
 
     def find_secondary_on_dblp(self):
-        self.log_info("--- Searching for secondary and tertiary sources in the default DBLP dump ---")
+        self.log_info("--- Searching for snowballed sources on DBLP ---")
         for publication in tqdm(Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX), unit="publication"):
             if publication.stage == 'excluded': continue
             self.search_on_dblp(publication)
@@ -117,7 +164,7 @@ class Command(BaseCommand):
             ).values_list('cite_key', flat=True).distinct()
         )
         n = len(keys)
-        self.log_info(f"Found {n} publications that still need to be checked")
+        self.log_info(f"Found {n} publications that still need to be verified")
 
     def fix_dblp(self):
         self.log_info("--- Searching for entries not in the default DBLP dump ---")
@@ -170,11 +217,19 @@ class Command(BaseCommand):
                 sleep(2) # Throttle to avoid rate-limiting
 
     # BaseCommand
+    def add_arguments(self, parser: CommandParser):
+        parser.add_argument('-b', '--abstract', action='store_true')
+        parser.add_argument('-d', '--dblp', action='store_true')
+        parser.add_argument('-a', '--authors', action='store_true')
+        parser.add_argument('-s', '--secondary', action='store_true')
 
     def handle(self, *args, **options):
         self.fix_references()
-        self.fix_abstracts()
+        if options['abstract']: self.fix_abstracts()
         self.find_secondary_on_dblp()
-        self.fix_dblp()
-        self.find_missing_dois()
-        self.find_semanticscholar_ids()
+        if options['dblp']:
+            self.fix_dblp()
+            self.find_missing_dois()
+            self.find_semanticscholar_ids()
+        if options['authors']: self.fix_authors()
+
diff --git a/sok/migrations/0007_author_dblp_id.py b/sok/migrations/0007_author_dblp_id.py
new file mode 100644
index 0000000..5590a98
--- /dev/null
+++ b/sok/migrations/0007_author_dblp_id.py
@@ -0,0 +1,18 @@
+# Generated by Django 5.2.1 on 2025-05-16 08:36
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('sok', '0006_alter_publication_abstract'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='author',
+            name='dblp_id',
+            field=models.CharField(blank=True, max_length=127),
+        ),
+    ]
diff --git a/sok/models.py b/sok/models.py
index 3f6b7e6..98b3a67 100644
--- a/sok/models.py
+++ b/sok/models.py
@@ -7,6 +7,7 @@ from django.db.models.query import QuerySet
 
 class Author(models.Model):
     name = models.CharField(max_length=255, unique=True)
+    dblp_id = models.CharField(max_length=127, blank=True)
 
     def __str__(self) -> str:
         return self.name