From fa4981693cc5c2b219d730c60af8b4d81f29b808 Mon Sep 17 00:00:00 2001
From: Maya Herrscher
Date: Mon, 19 May 2025 22:04:55 +0200
Subject: [PATCH] repair: apply DBLP metadata only for a single unambiguous
 match; stats: add snowballing and citation statistics

---
 sok/management/commands/repair.py | 118 ++++++++++++++++--------------
 sok/management/commands/stats.py  |  19 +++++
 2 files changed, 82 insertions(+), 55 deletions(-)

diff --git a/sok/management/commands/repair.py b/sok/management/commands/repair.py
index d7352f8..0b5138f 100644
--- a/sok/management/commands/repair.py
+++ b/sok/management/commands/repair.py
@@ -92,68 +92,76 @@ class Command(BaseCommand):
 	def search_on_dblp(self, publication: Publication):
 		query, results, total = dblp.PublicationResult.from_search(publication.title, 100)
 		if total == 0:
 			return
+		plausible = []
 		for result in results:
 			if publication.doi and result.doi:
 				if publication.doi.lower() == result.doi.lower():
-					publication.cite_key = result.cite_key
-					publication.year = result.year
-					if not result.is_peer_reviewed == None:
-						publication.peer_reviewed = result.is_peer_reviewed
-					publication.first_page = result.first_page
-					publication.last_page = result.last_page
-					print(publication.peer_reviewed, result)
-					try:
-						publication.full_clean()
-						publication.save()
-						self.log_success(f"Added DBLP info for: {publication}")
-					except ValidationError as e:
-						raise CommandError(f"{publication}: {e}")
-
-					# Store Authors
-					authors: List[Author] = []
-					for (pid, name) in result.authors:
-						author, created = Author.objects.get_or_create(name=name)
-
-						if created:
-							self.log_success(f"Added author: {author}")
-						else:
-							self.log_info(f"Author '{author}' already known")
-						if not author.dblp_id and pid != None:
-							try:
-								author.dblp_id = pid
-								author.full_clean()
-								author.save()
-								self.log_success(f"Added pid to author: {author}")
-							except ValidationError as e:
-								raise CommandError(f"{author}: {e}")
-
-						authors.append(author)
-
-					# Replace authors for publication
-					if len(authors) != len(PublicationAuthor.objects.filter(publication=publication)): print(authors, [pa.author for pa in PublicationAuthor.objects.filter(publication=publication)])
-					assert len(authors) >= len(PublicationAuthor.objects.filter(publication=publication))
-					for publication_author in PublicationAuthor.objects.filter(publication=publication):
-						self.log_warn(f"Will now try to delete {publication_author.author} from {publication}")
-						try:
-							publication_author.delete()
-						except e:
-							raise CommandError(f"{publication} - {author}: {e}")
-
-					for position, author in enumerate(authors):
-						publication_author, created = PublicationAuthor.objects.get_or_create(
-							author=author,
-							publication=publication,
-							position=position,
-						)
-						if created:
-							self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
-						else:
-							self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")
-
+					plausible = [result]
+					break
 				else:
 					continue  # quite definitely not the same publication
 			elif publication.title == result.title and publication.year == result.year:
+				plausible.append(result)
 				self.log_warn(f"Not quite certain about {result.cite_key} for publication {publication.title}")
 			else:
 				continue  # I'd rather look at that in detail for now
+		if len(plausible) == 1:
+			result = plausible[0]
+			if result.access.lower() == 'withdrawn':
+				self.log_warn(f"Publication {publication.title} was WITHDRAWN!")
+				return
+			publication.cite_key = result.cite_key
+			publication.year = result.year
+			if result.is_peer_reviewed is not None:
+				publication.peer_reviewed = result.is_peer_reviewed
+			publication.first_page = result.first_page
+			publication.last_page = result.last_page
+			print(publication.peer_reviewed, result)
+			try:
+				publication.full_clean()
+				publication.save()
+				self.log_success(f"Added DBLP info for: {publication}")
+			except ValidationError as e:
+				raise CommandError(f"{publication}: {e}")
+
+			# Store Authors
+			authors: List[Author] = []
+			for (pid, name) in result.authors:
+				author, created = Author.objects.get_or_create(name=name)
+
+				if created:
+					self.log_success(f"Added author: {author}")
+				else:
+					self.log_info(f"Author '{author}' already known")
+				if not author.dblp_id and pid is not None:
+					try:
+						author.dblp_id = pid
+						author.full_clean()
+						author.save()
+						self.log_success(f"Added pid to author: {author}")
+					except ValidationError as e:
+						raise CommandError(f"{author}: {e}")
+
+				authors.append(author)
+
+			# Replace authors for publication
+			if len(authors) != len(PublicationAuthor.objects.filter(publication=publication)): print(authors, [pa.author for pa in PublicationAuthor.objects.filter(publication=publication)])
+			assert len(authors) >= len(PublicationAuthor.objects.filter(publication=publication))
+			for publication_author in PublicationAuthor.objects.filter(publication=publication):
+				self.log_warn(f"Will now try to delete {publication_author.author} from {publication}")
+				try:
+					publication_author.delete()
+				except Exception as e:
+					raise CommandError(f"{publication} - {publication_author.author}: {e}")
+
+			for position, author in enumerate(authors):
+				publication_author, created = PublicationAuthor.objects.get_or_create(
+					author=author,
+					publication=publication,
+					position=position,
+				)
+				if created:
+					self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
+				else:
+					self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")
 
 	def find_secondary_on_dblp(self):
 		self.log_info("--- Searching for snowballed sources on DBLP ---")
diff --git a/sok/management/commands/stats.py b/sok/management/commands/stats.py
index f2eb59b..418f283 100644
--- a/sok/management/commands/stats.py
+++ b/sok/management/commands/stats.py
@@ -1,4 +1,8 @@
 from typing import Set
+import pickle
+from django.db.models import Count
+
+from pathlib import Path
 
 from django.core.management.base import BaseCommand
 from django.db.models import Count, Q
@@ -43,7 +47,22 @@ class Command(BaseCommand):
 		).distinct():
 			publications_relevant.add(publication.cite_key)
 
+		self.cache_path = Path('.choices.semanticscholar.pickle')
+		if self.cache_path.exists():
+			self.echo("Loading choices from snowballing...")
+			with self.cache_path.open('rb') as f:
+				self.cache = pickle.load(f)
+			self.echo(f"{len(self.cache)} publications excluded during snowballing")
+		self.echo(f"{len(Publication.objects.filter(variant_of__isnull=True))} publications included in total so far")
+
 		# Output
 		self.echo(f"Total publications: {len(publications_found):4d}", bold=True)
 		self.echo(f"- peer reviewed: {len(publications_peer_reviewed):4d}", bold=True)
 		self.echo(f"- relevant: {len(publications_relevant):4d}", bold=True)
+
+		pubs = Publication.objects.values('title').annotate(count=Count('referenced_by')).values('title', 'year', 'count').order_by('count').reverse()
+		for p in pubs[:15]: print(f"{p['title']} ({p['year']}): {p['count']} citations")
+
+		years = Publication.objects.values('year').annotate(count=Count('year')).values('year', 'count')
+		for y in years: print(f"{y['year']}, {y['count']}")
+
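
For reference, a minimal standalone sketch of the match-selection policy that the repair.py hunk above introduces (not part of the patch itself). The Result dataclass and the helper name pick_unambiguous are illustrative; the field names mirror the dblp.PublicationResult attributes the hunk relies on, and 'withdrawn' is the only access value the hunk actually checks.

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Result:
	cite_key: str
	title: str
	year: int
	doi: Optional[str]
	access: str  # only the value 'withdrawn' is checked above


def pick_unambiguous(doi: Optional[str], title: str, year: int, results: List[Result]) -> Optional[Result]:
	"""Return a DBLP result only if exactly one candidate is plausible and it is not withdrawn."""
	plausible: List[Result] = []
	for result in results:
		if doi and result.doi:
			if doi.lower() == result.doi.lower():
				plausible = [result]  # an exact DOI match settles it
				break
			continue  # both sides have a DOI and they differ: not the same publication
		if title == result.title and year == result.year:
			plausible.append(result)  # same title and year, but no DOI to confirm it
	if len(plausible) != 1:
		return None  # no match, or too ambiguous to apply automatically
	if plausible[0].access.lower() == 'withdrawn':
		return None  # the publication was withdrawn on DBLP; skip it
	return plausible[0]

With this policy a publication whose DOI disagrees with every DBLP candidate is never updated, and title/year-only matches are applied only when they are unique.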