From 230f9a562712be56414b7a1423ab99c094edacf2 Mon Sep 17 00:00:00 2001 From: Maya Herrscher Date: Thu, 15 May 2025 09:53:34 +0200 Subject: [PATCH] Repair stuff, add abstracts, add snowball import (from PR 3 in Original Repo) --- .editorconfig | 5 + sok/management/commands/dblpimport.py | 7 +- sok/management/commands/dblpsearch.py | 17 +++- sok/management/commands/repair.py | 81 +++++++++++++-- sok/management/commands/snowball.py | 98 ++++++++++++++++++- .../0004_alter_publication_references.py | 18 ++++ sok/migrations/0005_publication_abstract.py | 18 ++++ .../0006_alter_publication_abstract.py | 18 ++++ sok/models.py | 11 +++ sokman/settings.py | 3 + 10 files changed, 261 insertions(+), 15 deletions(-) create mode 100644 .editorconfig create mode 100644 sok/migrations/0004_alter_publication_references.py create mode 100644 sok/migrations/0005_publication_abstract.py create mode 100644 sok/migrations/0006_alter_publication_abstract.py diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..541ad0c --- /dev/null +++ b/.editorconfig @@ -0,0 +1,5 @@ +# top-most EditorConfig file +root = true + +[*.py] +indent_style = tab diff --git a/sok/management/commands/dblpimport.py b/sok/management/commands/dblpimport.py index 67f1ef4..2bcd418 100644 --- a/sok/management/commands/dblpimport.py +++ b/sok/management/commands/dblpimport.py @@ -42,7 +42,7 @@ PUBLICATIONS = { } CITE_KEY_PREFIX = 'DBLP:' -DUMP_PATH = Path('dblp') / 'dblp-2021-03-01.xml' +DUMP_PATH = Path('dblp') / 'dblp.xml' def strip_cite_key_prefix(value: str) -> str: @@ -95,6 +95,7 @@ class PublicationResult: title: str year: int pages: Optional[Tuple[int, int]] + dblp_doi: Optional[str] = None authors: List[str] = field(default_factory=list) urls: List[str] = field(default_factory=list) @@ -104,6 +105,7 @@ class PublicationResult: @property def doi(self) -> Optional[str]: + if self.dblp_doi: return self.dblp_doi for url_str in self.urls: url = urlparse(url_str) if url.hostname is not None and url.hostname.endswith('doi.org'): @@ -192,12 +194,14 @@ class PublicationResult: authors = [authors] # TODO Parse URLs ('ee') + doi = None if not 'doi' in info else info['doi'] return cls( key=info['key'], title=clean_title(html.unescape(info['title'])), year=int(info['year']), pages=pages, + dblp_doi=doi, authors=[html.unescape(author['text']) for author in authors], ) @@ -222,6 +226,7 @@ class PublicationResult: response.raise_for_status search_result = response.json()['result'] hits = search_result['hits'] + if hits['@total'] == '0': return (None, None, 0) results = [cls.from_search_hit(hit) for hit in hits['hit']] total = hits['@total'] diff --git a/sok/management/commands/dblpsearch.py b/sok/management/commands/dblpsearch.py index 700521d..19aa678 100644 --- a/sok/management/commands/dblpsearch.py +++ b/sok/management/commands/dblpsearch.py @@ -61,6 +61,7 @@ class Command(BaseCommand): source: Source, search_term: SearchTerm, paper_id: Optional[str], + abstract: Optional[str], ) -> Publication: # Store Authors @@ -70,7 +71,7 @@ class Command(BaseCommand): if created: self.log_success(f"Added author: {author}") else: - self.log_info(f"Author '{author}' alreay known") + self.log_info(f"Author '{author}' already known") authors.append(author) # Store Publication @@ -81,6 +82,7 @@ class Command(BaseCommand): peer_reviewed=result.is_peer_reviewed, first_page=result.first_page, last_page=result.last_page, + abstract=abstract, ) publication.full_clean() publication.save() @@ -121,6 +123,7 @@ class Command(BaseCommand): if not (0 < limit <= 1000): raise CommandError(f"Invalid value for 'limit': {limit}; allowed range is 1 – 1000") reset_choices: bool = options['reset_choices'] + print(Source.objects.get()) source = Source.objects.get(name='DBLP') path = Path('.choices.dblp.pickle') @@ -135,6 +138,8 @@ class Command(BaseCommand): self.log_info("Querying DBLP... ", nl=False) query, results, total = dblp.PublicationResult.from_search(options['term'], limit) + if total == 0: + raise CommandError(f"No publications found for search term {options['term']}!") self.log_success(f"done, found {len(results)}/{total} publication(s)") # Create search term @@ -149,7 +154,7 @@ class Command(BaseCommand): existing.add(publication.cite_key) self.add_publication_source(publication, source, search_term) - # Promt the user for importing new entries + # Prompt the user for importing new entries for result in results: # Skip existing entries if result.cite_key in existing.union(cache[query]): @@ -157,16 +162,18 @@ class Command(BaseCommand): self.display_result(result) - # TODO Add abstract from semantic scholar + # DONE Add abstract from semantic scholar data: Dict[str, Any] = dict() if doi := result.doi: data = semanticscholar(doi) + abstract = data.get('abstract', None) + while True: choice = input("Import? [y/N], Show abstract? [a]: ").lower() if choice in {'y', 'yes'}: - self.store_result(result, source, search_term, data.get('paperId', None)) + self.store_result(result, source, search_term, data.get('paperId', None), abstract) break elif choice in {'', 'n', 'no'}: # Store choice @@ -175,7 +182,7 @@ class Command(BaseCommand): pickle.dump(cache, f) break elif choice == 'a': - if abstract := data.get('abstract', None): + if abstract: self.stdout.write(abstract) except KeyboardInterrupt: raise CommandError("Aborted.") diff --git a/sok/management/commands/repair.py b/sok/management/commands/repair.py index 843f922..df74d06 100644 --- a/sok/management/commands/repair.py +++ b/sok/management/commands/repair.py @@ -1,5 +1,6 @@ from pprint import pprint from time import sleep +from tqdm import tqdm from django.db import transaction from django.core.exceptions import ValidationError @@ -51,8 +52,73 @@ class Command(BaseCommand): fixed.save() self.log_success(f"Added reference: {publication} -- {fixed.identifier} -> {variant}") except ValidationError as e: + print(orig.publication, origs, variant, fixed.identifier, fixed.reference, fixed.publication) raise CommandError(f"{publication} -- {fixed.identifier} -> {variant}: {e}") + @transaction.atomic + def fix_abstracts(self) -> None: + """ + Add abstracts to those publications that have one on SemanticScholar + + If mulitple variants of a publication exist, only the master variant is + considered. + """ + + self.log_info("--- Searching for publications without abstracts ---") + self.log_info(f"{len(Publication.objects.filter(abstract__isnull=True, variant_of__isnull=True, semanticscholar__isnull=False))} eligible publications found, {len(Publication.objects.filter(abstract__isnull=True))} without abstract") + for publication in tqdm(Publication.objects.filter(abstract__isnull=True, variant_of__isnull=True, semanticscholar__isnull=False),unit="abstract"): + for semantic in publication.semanticscholar_set.all(): + data = semanticscholar(semantic.paper_id) + if abstract := data['abstract']: + publication.abstract = abstract + try: + publication.full_clean() + publication.save() + self.log_success(f"Added abstract for: {publication}") + except ValidationError as e: + raise CommandError(f"{publication}: {e}") + sleep(2) # Throttle to avoid rate-limiting + + @transaction.atomic + def search_on_dblp(self, publication: Publication): + query, results, total = dblp.PublicationResult.from_search(publication.title, 100) + if total == 0: return + for result in results: + if publication.doi and result.doi: + if publication.doi.lower() == result.doi.lower(): + publication.cite_key = result.cite_key + publication.year = result.year + if not result.is_peer_reviewed == None: + publication.peer_reviewed = result.is_peer_reviewed + publication.first_page = result.first_page + publication.last_page = result.last_page + print(publication.peer_reviewed, result) + # keep authors from semantic scholar for now, even though they might be a little broken? + try: + publication.full_clean() + publication.save() + self.log_success(f"Added DBLP info for: {publication}") + except ValidationError as e: + raise CommandError(f"{publication}: {e}") + else: continue # quite definitely not the same publication + elif publication.title == result.title and publication.year == result.year: + print(f"Not quite certain about {result.cite_key} for publication {publication.title}") + else: continue # I'd rather look at that in detail for now + + def find_secondary_on_dblp(self): + self.log_info("--- Searching for secondary and tertiary sources in the default DBLP dump ---") + for publication in tqdm(Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX), unit="publication"): + if publication.stage == 'excluded': continue + self.search_on_dblp(publication) + sleep(2) # Throttle to avoid rate-limiting + keys = set( + Publication.objects.exclude( + cite_key__startswith=dblp.CITE_KEY_PREFIX + ).values_list('cite_key', flat=True).distinct() + ) + n = len(keys) + self.log_info(f"Found {n} publications that still need to be checked") + def fix_dblp(self): self.log_info("--- Searching for entries not in the default DBLP dump ---") keys_in_db = set( @@ -93,12 +159,13 @@ class Command(BaseCommand): ) for publication in publications: data = semanticscholar(publication.doi) - - paper_id = data['paperId'] - obj = SemanticScholar(paper_id=paper_id, publication=publication) - obj.full_clean() - obj.save() - self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}") + if not 'error' in data: + paper_id = data['paperId'] + obj = SemanticScholar(paper_id=paper_id, publication=publication) + obj.full_clean() + obj.save() + self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}") + else: print(publication) sleep(2) # Throttle to avoid rate-limiting @@ -106,6 +173,8 @@ class Command(BaseCommand): def handle(self, *args, **options): self.fix_references() + self.fix_abstracts() + self.find_secondary_on_dblp() self.fix_dblp() self.find_missing_dois() self.find_semanticscholar_ids() diff --git a/sok/management/commands/snowball.py b/sok/management/commands/snowball.py index b6e53c4..58d3698 100644 --- a/sok/management/commands/snowball.py +++ b/sok/management/commands/snowball.py @@ -11,7 +11,17 @@ import requests from django.core.management.base import BaseCommand, CommandParser, CommandError from tqdm import tqdm -from sok.models import Publication, PublicationReference, SemanticScholar +from django.db import transaction + +from sok.models import ( + Author, + Publication, + PublicationAuthor, + PublicationReference, + PublicationSource, + SemanticScholar, + Source, +) def semanticscholar(identifier: str, include_unknown_references: bool = False) -> Dict[str, Any]: @@ -95,6 +105,84 @@ class Command(BaseCommand): hasher.update(raw.encode()) return hasher.hexdigest() + + @transaction.atomic + def add_publ( + self, + paper_id, + base: Publication, + is_reference: bool, + ) -> Publication: + data = semanticscholar(paper_id) + # Add authors to database + authors: List[Author] = [] + first = True + cite_key = '' + for author in data.get('authors', []): + if name := author.get('name', None): + author, created = Author.objects.get_or_create(name=name) + if created: + self.echo(f"Added author: {author}") + else: + self.echo(f"Author '{author}' alreay known") + authors.append(author) + cite_key = '' + if authors: + cite_key = authors[0].name.split(' ')[-1] + cite_key += str(data.get('year')) + + title = data.get('title', '') + cite_key += title.split(' ')[0] + cite_key = cite_key.lower() + try: + for i in range(10): + publication = Publication.objects.get(cite_key=cite_key) + if publication.title == title: + break + else: + cite_key += '_1' + except Publication.DoesNotExist: + publication = None + + # Add publication to database + doi = data.get('doi', None) + if not publication: + self.echo(f"Will create now with cite key {cite_key}") + publication = Publication.objects.create( + cite_key=cite_key, + title=title, + year=data.get('year', 0), + peer_reviewed=None, + doi=doi, + ) + self.echo(f"Added publication: {publication}") + else: + self.echo(f"Publication '{publication}' already known") + + # Assign authors + for position, author in enumerate(authors): + publication_author, created = PublicationAuthor.objects.get_or_create( + author=author, + publication=publication, + position=position, + ) + if created: + self.echo(f"Assigned author '{author}' to publication '{publication}' at position {position}") + else: + self.echo(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'") + + # Add to Semantic Scholar and link publications + new, created = SemanticScholar.objects.get_or_create(paper_id=paper_id, publication=publication) + if created: + new.full_clean() + new.save() + self.echo(f"New Semantic Scholar entry: {paper_id}") + if is_reference: + self.add_reference(base, new.publication) + else: + self.add_reference(new.publication, base, is_reference) + + def handle_objs( self, base: Publication, @@ -137,7 +225,7 @@ class Command(BaseCommand): paper_id = obj.get('paperId', None) while True: - self.echo("Ignore? [Y/n]", nl=False) + self.echo("Ignore? [Y/n]", nl=True) if paper_id is not None: self.echo(", Show abstract [a]", nl=False) self.echo(": ") @@ -154,7 +242,11 @@ class Command(BaseCommand): if abstract := data.get('abstract', None): self.echo(abstract) elif choice in {'', 'n', 'no'}: - # TODO Import? + # TODO Import? copied and adapted from PR + if paper_id is not None: + self.add_publ(paper_id, base, is_reference) + else: + self.echo("Could not add this paper, please do it manually!") break # BaseCommand diff --git a/sok/migrations/0004_alter_publication_references.py b/sok/migrations/0004_alter_publication_references.py new file mode 100644 index 0000000..8f45fbf --- /dev/null +++ b/sok/migrations/0004_alter_publication_references.py @@ -0,0 +1,18 @@ +# Generated by Django 5.2.1 on 2025-05-09 11:30 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('sok', '0003_remove_publication_references_complete'), + ] + + operations = [ + migrations.AlterField( + model_name='publication', + name='references', + field=models.ManyToManyField(related_name='referenced_by', through='sok.PublicationReference', through_fields=('publication', 'reference'), to='sok.publication'), + ), + ] diff --git a/sok/migrations/0005_publication_abstract.py b/sok/migrations/0005_publication_abstract.py new file mode 100644 index 0000000..22994f4 --- /dev/null +++ b/sok/migrations/0005_publication_abstract.py @@ -0,0 +1,18 @@ +# Generated by Django 5.2.1 on 2025-05-14 07:47 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('sok', '0004_alter_publication_references'), + ] + + operations = [ + migrations.AddField( + model_name='publication', + name='abstract', + field=models.CharField(blank=True, default=None, max_length=5000, null=True), + ), + ] diff --git a/sok/migrations/0006_alter_publication_abstract.py b/sok/migrations/0006_alter_publication_abstract.py new file mode 100644 index 0000000..5428206 --- /dev/null +++ b/sok/migrations/0006_alter_publication_abstract.py @@ -0,0 +1,18 @@ +# Generated by Django 5.2.1 on 2025-05-14 08:15 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('sok', '0005_publication_abstract'), + ] + + operations = [ + migrations.AlterField( + model_name='publication', + name='abstract', + field=models.TextField(blank=True, default=None, null=True), + ), + ] diff --git a/sok/models.py b/sok/models.py index ac36d1e..4e83a51 100644 --- a/sok/models.py +++ b/sok/models.py @@ -66,6 +66,7 @@ class Publication(models.Model): first_page = models.PositiveSmallIntegerField(blank=True, null=True, default=None) last_page = models.PositiveSmallIntegerField(blank=True, null=True, default=None) doi = models.CharField(max_length=255, unique=True, blank=True, null=True, default=None) + abstract = models.TextField(unique=False, blank=True, null=True, default=None) variant_of = models.ForeignKey( 'Publication', @@ -121,6 +122,16 @@ class Publication(models.Model): if self.references.filter(exclusion_criteria__isnull=True, sources__isnull=False): return 'tertiary' + if ref_by := self.referenced_by.filter(exclusion_criteria__isnull=True): + stages = set([ref.stage for ref in ref_by]) + if 'secondary' in stages: return 'sec-secondary' + elif 'tertiary' in stages: return 'sec-tertiary' + + if refs := self.references.filter(exclusion_criteria__isnull=True): + stages = set([ref.stage for ref in ref_by]) + if 'secondary' in stages: return 'tert-secondary' + elif 'tertiary' in stages: return 'tert-tertiary' + return None def __str__(self) -> str: diff --git a/sokman/settings.py b/sokman/settings.py index 3264826..f1f31ba 100644 --- a/sokman/settings.py +++ b/sokman/settings.py @@ -132,3 +132,6 @@ USE_TZ = True # https://docs.djangoproject.com/en/3.1/howto/static-files/ STATIC_URL = '/static/' + +# Add this to avoid the warning models.W042 +DEFAULT_AUTO_FIELD='django.db.models.AutoField'