Browse Source

Repair stuff, add abstracts, add snowball import (from PR 3 in Original

Repo)
master
Maya Herrscher 2 months ago
parent
commit
230f9a5627
  1. 5
      .editorconfig
  2. 7
      sok/management/commands/dblpimport.py
  3. 17
      sok/management/commands/dblpsearch.py
  4. 81
      sok/management/commands/repair.py
  5. 98
      sok/management/commands/snowball.py
  6. 18
      sok/migrations/0004_alter_publication_references.py
  7. 18
      sok/migrations/0005_publication_abstract.py
  8. 18
      sok/migrations/0006_alter_publication_abstract.py
  9. 11
      sok/models.py
  10. 3
      sokman/settings.py

5
.editorconfig

@ -0,0 +1,5 @@
# top-most EditorConfig file
root = true
[*.py]
indent_style = tab

7
sok/management/commands/dblpimport.py

@ -42,7 +42,7 @@ PUBLICATIONS = {
}
CITE_KEY_PREFIX = 'DBLP:'
DUMP_PATH = Path('dblp') / 'dblp-2021-03-01.xml'
DUMP_PATH = Path('dblp') / 'dblp.xml'
def strip_cite_key_prefix(value: str) -> str:
@ -95,6 +95,7 @@ class PublicationResult:
title: str
year: int
pages: Optional[Tuple[int, int]]
dblp_doi: Optional[str] = None
authors: List[str] = field(default_factory=list)
urls: List[str] = field(default_factory=list)
@ -104,6 +105,7 @@ class PublicationResult:
@property
def doi(self) -> Optional[str]:
if self.dblp_doi: return self.dblp_doi
for url_str in self.urls:
url = urlparse(url_str)
if url.hostname is not None and url.hostname.endswith('doi.org'):
@ -192,12 +194,14 @@ class PublicationResult:
authors = [authors]
# TODO Parse URLs ('ee')
doi = None if not 'doi' in info else info['doi']
return cls(
key=info['key'],
title=clean_title(html.unescape(info['title'])),
year=int(info['year']),
pages=pages,
dblp_doi=doi,
authors=[html.unescape(author['text']) for author in authors],
)
@ -222,6 +226,7 @@ class PublicationResult:
response.raise_for_status
search_result = response.json()['result']
hits = search_result['hits']
if hits['@total'] == '0': return (None, None, 0)
results = [cls.from_search_hit(hit) for hit in hits['hit']]
total = hits['@total']

17
sok/management/commands/dblpsearch.py

@ -61,6 +61,7 @@ class Command(BaseCommand):
source: Source,
search_term: SearchTerm,
paper_id: Optional[str],
abstract: Optional[str],
) -> Publication:
# Store Authors
@ -70,7 +71,7 @@ class Command(BaseCommand):
if created:
self.log_success(f"Added author: {author}")
else:
self.log_info(f"Author '{author}' alreay known")
self.log_info(f"Author '{author}' already known")
authors.append(author)
# Store Publication
@ -81,6 +82,7 @@ class Command(BaseCommand):
peer_reviewed=result.is_peer_reviewed,
first_page=result.first_page,
last_page=result.last_page,
abstract=abstract,
)
publication.full_clean()
publication.save()
@ -121,6 +123,7 @@ class Command(BaseCommand):
if not (0 < limit <= 1000):
raise CommandError(f"Invalid value for 'limit': {limit}; allowed range is 1 – 1000")
reset_choices: bool = options['reset_choices']
print(Source.objects.get())
source = Source.objects.get(name='DBLP')
path = Path('.choices.dblp.pickle')
@ -135,6 +138,8 @@ class Command(BaseCommand):
self.log_info("Querying DBLP... ", nl=False)
query, results, total = dblp.PublicationResult.from_search(options['term'], limit)
if total == 0:
raise CommandError(f"No publications found for search term {options['term']}!")
self.log_success(f"done, found {len(results)}/{total} publication(s)")
# Create search term
@ -149,7 +154,7 @@ class Command(BaseCommand):
existing.add(publication.cite_key)
self.add_publication_source(publication, source, search_term)
# Promt the user for importing new entries
# Prompt the user for importing new entries
for result in results:
# Skip existing entries
if result.cite_key in existing.union(cache[query]):
@ -157,16 +162,18 @@ class Command(BaseCommand):
self.display_result(result)
# TODO Add abstract from semantic scholar
# DONE Add abstract from semantic scholar
data: Dict[str, Any] = dict()
if doi := result.doi:
data = semanticscholar(doi)
abstract = data.get('abstract', None)
while True:
choice = input("Import? [y/N], Show abstract? [a]: ").lower()
if choice in {'y', 'yes'}:
self.store_result(result, source, search_term, data.get('paperId', None))
self.store_result(result, source, search_term, data.get('paperId', None), abstract)
break
elif choice in {'', 'n', 'no'}:
# Store choice
@ -175,7 +182,7 @@ class Command(BaseCommand):
pickle.dump(cache, f)
break
elif choice == 'a':
if abstract := data.get('abstract', None):
if abstract:
self.stdout.write(abstract)
except KeyboardInterrupt:
raise CommandError("Aborted.")

81
sok/management/commands/repair.py

@ -1,5 +1,6 @@
from pprint import pprint
from time import sleep
from tqdm import tqdm
from django.db import transaction
from django.core.exceptions import ValidationError
@ -51,8 +52,73 @@ class Command(BaseCommand):
fixed.save()
self.log_success(f"Added reference: {publication} -- {fixed.identifier} -> {variant}")
except ValidationError as e:
print(orig.publication, origs, variant, fixed.identifier, fixed.reference, fixed.publication)
raise CommandError(f"{publication} -- {fixed.identifier} -> {variant}: {e}")
@transaction.atomic
def fix_abstracts(self) -> None:
    """
    Add abstracts to those publications that have one on SemanticScholar.

    If multiple variants of a publication exist, only the master variant is
    considered. Every Semantic Scholar id linked to a publication is tried
    in turn until one of them yields a non-empty abstract.

    Raises CommandError if a publication fails validation after its
    abstract has been set; because the method is atomic, all abstracts
    stored during this run are rolled back in that case.
    """
    self.log_info("--- Searching for publications without abstracts ---")
    without_abstract = Publication.objects.filter(abstract__isnull=True)
    # The join over the reverse relation yields one row per linked
    # Semantic Scholar entry; distinct() keeps each publication only once.
    eligible = Publication.objects.filter(
        abstract__isnull=True,
        variant_of__isnull=True,
        semanticscholar__isnull=False,
    ).distinct()
    # count() lets the database do the counting instead of loading all rows.
    self.log_info(f"{eligible.count()} eligible publications found, {without_abstract.count()} without abstract")
    for publication in tqdm(eligible, unit="abstract"):
        for semantic in publication.semanticscholar_set.all():
            data = semanticscholar(semantic.paper_id)
            sleep(2)  # Throttle to avoid rate-limiting
            # Error payloads carry no 'abstract' key, so use .get() here.
            abstract = data.get('abstract')
            if not abstract:
                continue
            publication.abstract = abstract
            try:
                publication.full_clean()
                publication.save()
            except ValidationError as e:
                raise CommandError(f"{publication}: {e}") from e
            self.log_success(f"Added abstract for: {publication}")
            # One abstract is enough; skip the remaining ids of this publication.
            break
@transaction.atomic
def search_on_dblp(self, publication: Publication):
    """
    Look up a publication on DBLP by title and adopt the DBLP metadata.

    The publication's title is used as the search term (at most 100 hits).
    A hit is accepted only if both sides have a DOI and the DOIs match
    case-insensitively; cite key, year, peer-review flag and page range are
    then copied from the hit. Hits without a comparable DOI that match on
    title and year are only reported, never applied.

    Raises CommandError if the updated publication fails validation.
    """
    query, results, total = dblp.PublicationResult.from_search(publication.title, 100)
    if total == 0:
        return
    for result in results:
        if publication.doi and result.doi:
            if publication.doi.lower() != result.doi.lower():
                continue  # quite definitely not the same publication
            publication.cite_key = result.cite_key
            publication.year = result.year
            # Only overwrite the flag when DBLP actually provides a value.
            if result.is_peer_reviewed is not None:
                publication.peer_reviewed = result.is_peer_reviewed
            publication.first_page = result.first_page
            publication.last_page = result.last_page
            print(publication.peer_reviewed, result)
            # keep authors from semantic scholar for now, even though they might be a little broken?
            try:
                publication.full_clean()
                publication.save()
                self.log_success(f"Added DBLP info for: {publication}")
            except ValidationError as e:
                raise CommandError(f"{publication}: {e}") from e
        elif publication.title == result.title and publication.year == result.year:
            # No DOI to compare against: report the candidate for manual review.
            print(f"Not quite certain about {result.cite_key} for publication {publication.title}")
def find_secondary_on_dblp(self):
    """
    Try to match every publication without a DBLP cite key against DBLP.

    Publications whose stage is 'excluded' are skipped. Afterwards the
    number of distinct cite keys that still lack the DBLP prefix is logged.
    """
    self.log_info("--- Searching for secondary and tertiary sources in the default DBLP dump ---")
    candidates = Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX)
    for publication in tqdm(candidates, unit="publication"):
        if publication.stage != 'excluded':
            self.search_on_dblp(publication)
            sleep(2)  # Throttle to avoid rate-limiting

    # Query again: matched publications carry a DBLP cite key by now.
    still_unmatched = Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX)
    remaining_keys = set(still_unmatched.values_list('cite_key', flat=True).distinct())
    self.log_info(f"Found {len(remaining_keys)} publications that still need to be checked")
def fix_dblp(self):
self.log_info("--- Searching for entries not in the default DBLP dump ---")
keys_in_db = set(
@ -93,12 +159,13 @@ class Command(BaseCommand):
)
for publication in publications:
data = semanticscholar(publication.doi)
paper_id = data['paperId']
obj = SemanticScholar(paper_id=paper_id, publication=publication)
obj.full_clean()
obj.save()
self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
if not 'error' in data:
paper_id = data['paperId']
obj = SemanticScholar(paper_id=paper_id, publication=publication)
obj.full_clean()
obj.save()
self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
else: print(publication)
sleep(2) # Throttle to avoid rate-limiting
@ -106,6 +173,8 @@ class Command(BaseCommand):
def handle(self, *args, **options):
self.fix_references()
self.fix_abstracts()
self.find_secondary_on_dblp()
self.fix_dblp()
self.find_missing_dois()
self.find_semanticscholar_ids()

98
sok/management/commands/snowball.py

@ -11,7 +11,17 @@ import requests
from django.core.management.base import BaseCommand, CommandParser, CommandError
from tqdm import tqdm
from sok.models import Publication, PublicationReference, SemanticScholar
from django.db import transaction
from sok.models import (
Author,
Publication,
PublicationAuthor,
PublicationReference,
PublicationSource,
SemanticScholar,
Source,
)
def semanticscholar(identifier: str, include_unknown_references: bool = False) -> Dict[str, Any]:
@ -95,6 +105,84 @@ class Command(BaseCommand):
hasher.update(raw.encode())
return hasher.hexdigest()
@transaction.atomic
def add_publ(
    self,
    paper_id,
    base: Publication,
    is_reference: bool,
) -> Publication:
    """
    Import a paper from Semantic Scholar and link it to ``base``.

    Fetches the paper's metadata, stores its authors, finds or creates the
    matching publication under a generated cite key, assigns the authors,
    records the Semantic Scholar id and finally adds the reference between
    ``base`` and the imported publication.

    paper_id: Semantic Scholar identifier of the paper to import.
    base: publication whose references/citations are being processed.
    is_reference: True if the paper is referenced by ``base``; False if
        the paper cites ``base``.

    Returns the publication that was found or created.
    Raises CommandError if no usable cite key is found within 10 attempts.
    """
    data = semanticscholar(paper_id)

    # Add authors to database
    authors: List[Author] = []
    for author_info in data.get('authors') or []:
        name = author_info.get('name', None)
        if not name:
            continue
        author, created = Author.objects.get_or_create(name=name)
        if created:
            self.echo(f"Added author: {author}")
        else:
            self.echo(f"Author '{author}' already known")
        authors.append(author)

    # Cite key scheme: <last name of first author><year><first title word>,
    # lower-cased. Kept exactly as before so existing entries are found again.
    title = data.get('title') or ''
    cite_key = authors[0].name.split(' ')[-1] if authors else ''
    cite_key += str(data.get('year'))
    cite_key += title.split(' ')[0]
    cite_key = cite_key.lower()

    # Resolve cite key collisions: reuse the entry if the title matches,
    # otherwise extend the key and try again.
    publication = None
    for _ in range(10):
        try:
            candidate = Publication.objects.get(cite_key=cite_key)
        except Publication.DoesNotExist:
            break  # cite key is free, a new publication will be created
        if candidate.title == title:
            publication = candidate
            break
        cite_key += '_1'
    else:
        # Every attempt hit an unrelated publication; never reuse one of those.
        raise CommandError(f"Could not find a free cite key for publication '{title}'")

    # Add publication to database
    # NOTE(review): doi is unique on Publication; a paper already stored
    # under another cite key with the same DOI will make create() fail.
    doi = data.get('doi', None)
    if not publication:
        self.echo(f"Will create now with cite key {cite_key}")
        publication = Publication.objects.create(
            cite_key=cite_key,
            title=title,
            year=data.get('year') or 0,
            peer_reviewed=None,
            doi=doi,
        )
        self.echo(f"Added publication: {publication}")
    else:
        self.echo(f"Publication '{publication}' already known")

    # Assign authors
    for position, author in enumerate(authors):
        publication_author, created = PublicationAuthor.objects.get_or_create(
            author=author,
            publication=publication,
            position=position,
        )
        if created:
            self.echo(f"Assigned author '{author}' to publication '{publication}' at position {position}")
        else:
            self.echo(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")

    # Add to Semantic Scholar and link publications
    new, created = SemanticScholar.objects.get_or_create(paper_id=paper_id, publication=publication)
    if created:
        new.full_clean()
        new.save()
        self.echo(f"New Semantic Scholar entry: {paper_id}")

    # NOTE(review): add_reference is defined outside this block; both calls
    # are kept with their original arguments.
    if is_reference:
        self.add_reference(base, new.publication)
    else:
        self.add_reference(new.publication, base, is_reference)

    return publication
def handle_objs(
self,
base: Publication,
@ -137,7 +225,7 @@ class Command(BaseCommand):
paper_id = obj.get('paperId', None)
while True:
self.echo("Ignore? [Y/n]", nl=False)
self.echo("Ignore? [Y/n]", nl=True)
if paper_id is not None:
self.echo(", Show abstract [a]", nl=False)
self.echo(": ")
@ -154,7 +242,11 @@ class Command(BaseCommand):
if abstract := data.get('abstract', None):
self.echo(abstract)
elif choice in {'', 'n', 'no'}:
# TODO Import?
# TODO Import? copied and adapted from PR
if paper_id is not None:
self.add_publ(paper_id, base, is_reference)
else:
self.echo("Could not add this paper, please do it manually!")
break
# BaseCommand

18
sok/migrations/0004_alter_publication_references.py

@ -0,0 +1,18 @@
# Generated by Django 5.2.1 on 2025-05-09 11:30
from django.db import migrations, models
class Migration(migrations.Migration):
    """Alter ``Publication.references`` (many-to-many via ``PublicationReference``)."""

    dependencies = [('sok', '0003_remove_publication_references_complete')]

    operations = [
        migrations.AlterField(
            model_name='publication',
            name='references',
            field=models.ManyToManyField(
                related_name='referenced_by',
                through='sok.PublicationReference',
                # Source side first, target side second.
                through_fields=('publication', 'reference'),
                to='sok.publication',
            ),
        ),
    ]

18
sok/migrations/0005_publication_abstract.py

@ -0,0 +1,18 @@
# Generated by Django 5.2.1 on 2025-05-14 07:47
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the optional ``abstract`` field (up to 5000 characters) to ``Publication``."""

    dependencies = [('sok', '0004_alter_publication_references')]

    operations = [
        migrations.AddField(
            model_name='publication',
            name='abstract',
            field=models.CharField(
                blank=True,
                default=None,
                max_length=5000,
                null=True,
            ),
        ),
    ]

18
sok/migrations/0006_alter_publication_abstract.py

@ -0,0 +1,18 @@
# Generated by Django 5.2.1 on 2025-05-14 08:15
from django.db import migrations, models
class Migration(migrations.Migration):
    """Change ``Publication.abstract`` to a ``TextField`` without a length limit."""

    dependencies = [('sok', '0005_publication_abstract')]

    operations = [
        migrations.AlterField(
            model_name='publication',
            name='abstract',
            field=models.TextField(
                blank=True,
                default=None,
                null=True,
            ),
        ),
    ]

11
sok/models.py

@ -66,6 +66,7 @@ class Publication(models.Model):
first_page = models.PositiveSmallIntegerField(blank=True, null=True, default=None)
last_page = models.PositiveSmallIntegerField(blank=True, null=True, default=None)
doi = models.CharField(max_length=255, unique=True, blank=True, null=True, default=None)
abstract = models.TextField(unique=False, blank=True, null=True, default=None)
variant_of = models.ForeignKey(
'Publication',
@ -121,6 +122,16 @@ class Publication(models.Model):
if self.references.filter(exclusion_criteria__isnull=True, sources__isnull=False):
return 'tertiary'
if ref_by := self.referenced_by.filter(exclusion_criteria__isnull=True):
stages = set([ref.stage for ref in ref_by])
if 'secondary' in stages: return 'sec-secondary'
elif 'tertiary' in stages: return 'sec-tertiary'
if refs := self.references.filter(exclusion_criteria__isnull=True):
stages = set([ref.stage for ref in ref_by])
if 'secondary' in stages: return 'tert-secondary'
elif 'tertiary' in stages: return 'tert-tertiary'
return None
def __str__(self) -> str:

3
sokman/settings.py

@ -132,3 +132,6 @@ USE_TZ = True
# https://docs.djangoproject.com/en/3.1/howto/static-files/
STATIC_URL = '/static/'
# Add this to avoid the warning models.W042
DEFAULT_AUTO_FIELD='django.db.models.AutoField'

Loading…
Cancel
Save