Browse Source

Add some things

master
Maya Herrscher 1 month ago
parent
commit
fa4981693c
  1. 118
      sok/management/commands/repair.py
  2. 19
      sok/management/commands/stats.py

118
sok/management/commands/repair.py

@ -92,68 +92,76 @@ class Command(BaseCommand):
def search_on_dblp(self, publication: Publication):
    """Search DBLP for ``publication`` and sync it on an unambiguous match.

    The publication title is used as the search term. Candidates are
    filtered as follows:

    * If both the stored publication and a result carry a DOI, a
      case-insensitive DOI match selects that result as the only
      candidate and ends the search; a DOI mismatch discards the result.
    * Otherwise a result with identical title and year is kept as a
      plausible candidate (and a warning is logged).

    Only when exactly one candidate remains is the publication updated
    (cite key, year, peer-review flag, pages) and its author list replaced
    by the authors reported in the result. Results whose ``access`` is
    ``withdrawn`` are reported and skipped.

    Raises:
        CommandError: if the publication or an author fails validation,
            if the result lists fewer authors than are currently assigned,
            or if removing an existing author assignment fails.
    """
    _query, results, total = dblp.PublicationResult.from_search(publication.title, 100)
    if total == 0:
        return

    plausible = []
    for result in results:
        if publication.doi and result.doi:
            if publication.doi.lower() == result.doi.lower():
                # A DOI match is authoritative: drop any title-only candidates.
                plausible = [result]
                break
            continue  # quite definitely not the same publication
        if publication.title == result.title and publication.year == result.year:
            plausible.append(result)
            self.log_warn(f"Not quite certain about {result.cite_key} for publication {publication.title}")
        # Anything else: I'd rather look at that in detail for now

    # Zero or several candidates: too ambiguous to touch the database.
    if len(plausible) != 1:
        return

    result = plausible[0]
    # NOTE(review): guards against a missing `access` value; confirm whether
    # the dblp result type can actually yield None here.
    if (result.access or '').lower() == 'withdrawn':
        self.log_warn(f"Publication {publication.title} was WITHDRAWN!")
        return

    publication.cite_key = result.cite_key
    publication.year = result.year
    # Keep the stored flag when the result does not state peer-review status.
    if result.is_peer_reviewed is not None:
        publication.peer_reviewed = result.is_peer_reviewed
    publication.first_page = result.first_page
    publication.last_page = result.last_page
    print(publication.peer_reviewed, result)  # diagnostic output kept from the original
    try:
        publication.full_clean()
        publication.save()
    except ValidationError as e:
        raise CommandError(f"{publication}: {e}") from e
    self.log_success(f"Added DBLP info for: {publication}")

    # Store Authors
    authors: List[Author] = []
    for pid, name in result.authors:
        author, created = Author.objects.get_or_create(name=name)
        if created:
            self.log_success(f"Added author: {author}")
        else:
            self.log_info(f"Author '{author}' already known")
        if not author.dblp_id and pid is not None:
            author.dblp_id = pid
            try:
                author.full_clean()
                author.save()
            except ValidationError as e:
                raise CommandError(f"{author}: {e}") from e
            self.log_success(f"Added pid to author: {author}")
        authors.append(author)

    # Replace authors for publication
    # Fetch the current assignments once instead of re-querying for every check.
    existing = list(PublicationAuthor.objects.filter(publication=publication))
    if len(authors) != len(existing):
        print(authors, [pa.author for pa in existing])  # diagnostic output kept from the original
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would let the deletion below run unguarded.
    if len(authors) < len(existing):
        raise CommandError(
            f"{publication}: DBLP lists {len(authors)} authors but "
            f"{len(existing)} are already assigned; refusing to replace them"
        )
    for publication_author in existing:
        self.log_warn(f"Will now try to delete {publication_author.author} from {publication}")
        try:
            publication_author.delete()
        except Exception as e:
            # Report the assignment that actually failed to delete.
            raise CommandError(f"{publication} - {publication_author.author}: {e}") from e
    for position, author in enumerate(authors):
        publication_author, created = PublicationAuthor.objects.get_or_create(
            author=author,
            publication=publication,
            position=position,
        )
        if created:
            self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}")
        else:
            self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")
def find_secondary_on_dblp(self): def find_secondary_on_dblp(self):
self.log_info("--- Searching for snowballed sources on DBLP ---") self.log_info("--- Searching for snowballed sources on DBLP ---")

19
sok/management/commands/stats.py

@ -1,4 +1,8 @@
from typing import Set from typing import Set
import pickle
from django.db.models import Count
from pathlib import Path
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django.db.models import Count, Q from django.db.models import Count, Q
@ -43,7 +47,22 @@ class Command(BaseCommand):
).distinct(): ).distinct():
publications_relevant.add(publication.cite_key) publications_relevant.add(publication.cite_key)
self.cache_path = Path('.choices.semanticscholar.pickle')
if self.cache_path.exists():
self.echo("Loading choices from snowballing...")
with self.cache_path.open('rb') as f:
self.cache = pickle.load(f)
self.echo(f"{len(self.cache)} publications excluded during snowballing")
self.echo(f"{len(Publication.objects.filter(variant_of__isnull=True))} publications included in total so far")
# Output # Output
self.echo(f"Total publications: {len(publications_found):4d}", bold=True) self.echo(f"Total publications: {len(publications_found):4d}", bold=True)
self.echo(f"- peer reviewed: {len(publications_peer_reviewed):4d}", bold=True) self.echo(f"- peer reviewed: {len(publications_peer_reviewed):4d}", bold=True)
self.echo(f"- relevant: {len(publications_relevant):4d}", bold=True) self.echo(f"- relevant: {len(publications_relevant):4d}", bold=True)
pubs = Publication.objects.values('title').annotate(count=Count('referenced_by')).values('title', 'year', 'count').order_by('count').reverse()
for p in pubs[:15]: print(f"{p['title']} ({p['year']}): {p['count']} citations")
years = Publication.objects.values('year').annotate(count=Count('year')).values('year', 'count')
for y in years: print(f"{y['year']}, {y['count']}")

Loading…
Cancel
Save