Browse Source

Repair stuff, add abstracts, add snowball import (from PR 3 in Original

Repo)
master
Maya Herrscher 2 months ago
parent
commit
230f9a5627
  1. 5
      .editorconfig
  2. 7
      sok/management/commands/dblpimport.py
  3. 17
      sok/management/commands/dblpsearch.py
  4. 81
      sok/management/commands/repair.py
  5. 98
      sok/management/commands/snowball.py
  6. 18
      sok/migrations/0004_alter_publication_references.py
  7. 18
      sok/migrations/0005_publication_abstract.py
  8. 18
      sok/migrations/0006_alter_publication_abstract.py
  9. 11
      sok/models.py
  10. 3
      sokman/settings.py

5
.editorconfig

@ -0,0 +1,5 @@
# top-most EditorConfig file
root = true
[*.py]
indent_style = tab

7
sok/management/commands/dblpimport.py

@ -42,7 +42,7 @@ PUBLICATIONS = {
} }
CITE_KEY_PREFIX = 'DBLP:' CITE_KEY_PREFIX = 'DBLP:'
DUMP_PATH = Path('dblp') / 'dblp-2021-03-01.xml' DUMP_PATH = Path('dblp') / 'dblp.xml'
def strip_cite_key_prefix(value: str) -> str: def strip_cite_key_prefix(value: str) -> str:
@ -95,6 +95,7 @@ class PublicationResult:
title: str title: str
year: int year: int
pages: Optional[Tuple[int, int]] pages: Optional[Tuple[int, int]]
dblp_doi: Optional[str] = None
authors: List[str] = field(default_factory=list) authors: List[str] = field(default_factory=list)
urls: List[str] = field(default_factory=list) urls: List[str] = field(default_factory=list)
@ -104,6 +105,7 @@ class PublicationResult:
@property @property
def doi(self) -> Optional[str]: def doi(self) -> Optional[str]:
if self.dblp_doi: return self.dblp_doi
for url_str in self.urls: for url_str in self.urls:
url = urlparse(url_str) url = urlparse(url_str)
if url.hostname is not None and url.hostname.endswith('doi.org'): if url.hostname is not None and url.hostname.endswith('doi.org'):
@ -192,12 +194,14 @@ class PublicationResult:
authors = [authors] authors = [authors]
# TODO Parse URLs ('ee') # TODO Parse URLs ('ee')
doi = None if not 'doi' in info else info['doi']
return cls( return cls(
key=info['key'], key=info['key'],
title=clean_title(html.unescape(info['title'])), title=clean_title(html.unescape(info['title'])),
year=int(info['year']), year=int(info['year']),
pages=pages, pages=pages,
dblp_doi=doi,
authors=[html.unescape(author['text']) for author in authors], authors=[html.unescape(author['text']) for author in authors],
) )
@ -222,6 +226,7 @@ class PublicationResult:
response.raise_for_status response.raise_for_status
search_result = response.json()['result'] search_result = response.json()['result']
hits = search_result['hits'] hits = search_result['hits']
if hits['@total'] == '0': return (None, None, 0)
results = [cls.from_search_hit(hit) for hit in hits['hit']] results = [cls.from_search_hit(hit) for hit in hits['hit']]
total = hits['@total'] total = hits['@total']

17
sok/management/commands/dblpsearch.py

@ -61,6 +61,7 @@ class Command(BaseCommand):
source: Source, source: Source,
search_term: SearchTerm, search_term: SearchTerm,
paper_id: Optional[str], paper_id: Optional[str],
abstract: Optional[str],
) -> Publication: ) -> Publication:
# Store Authors # Store Authors
@ -70,7 +71,7 @@ class Command(BaseCommand):
if created: if created:
self.log_success(f"Added author: {author}") self.log_success(f"Added author: {author}")
else: else:
self.log_info(f"Author '{author}' alreay known") self.log_info(f"Author '{author}' already known")
authors.append(author) authors.append(author)
# Store Publication # Store Publication
@ -81,6 +82,7 @@ class Command(BaseCommand):
peer_reviewed=result.is_peer_reviewed, peer_reviewed=result.is_peer_reviewed,
first_page=result.first_page, first_page=result.first_page,
last_page=result.last_page, last_page=result.last_page,
abstract=abstract,
) )
publication.full_clean() publication.full_clean()
publication.save() publication.save()
@ -121,6 +123,7 @@ class Command(BaseCommand):
if not (0 < limit <= 1000): if not (0 < limit <= 1000):
raise CommandError(f"Invalid value for 'limit': {limit}; allowed range is 1 – 1000") raise CommandError(f"Invalid value for 'limit': {limit}; allowed range is 1 – 1000")
reset_choices: bool = options['reset_choices'] reset_choices: bool = options['reset_choices']
print(Source.objects.get())
source = Source.objects.get(name='DBLP') source = Source.objects.get(name='DBLP')
path = Path('.choices.dblp.pickle') path = Path('.choices.dblp.pickle')
@ -135,6 +138,8 @@ class Command(BaseCommand):
self.log_info("Querying DBLP... ", nl=False) self.log_info("Querying DBLP... ", nl=False)
query, results, total = dblp.PublicationResult.from_search(options['term'], limit) query, results, total = dblp.PublicationResult.from_search(options['term'], limit)
if total == 0:
raise CommandError(f"No publications found for search term {options['term']}!")
self.log_success(f"done, found {len(results)}/{total} publication(s)") self.log_success(f"done, found {len(results)}/{total} publication(s)")
# Create search term # Create search term
@ -149,7 +154,7 @@ class Command(BaseCommand):
existing.add(publication.cite_key) existing.add(publication.cite_key)
self.add_publication_source(publication, source, search_term) self.add_publication_source(publication, source, search_term)
# Promt the user for importing new entries # Prompt the user for importing new entries
for result in results: for result in results:
# Skip existing entries # Skip existing entries
if result.cite_key in existing.union(cache[query]): if result.cite_key in existing.union(cache[query]):
@ -157,16 +162,18 @@ class Command(BaseCommand):
self.display_result(result) self.display_result(result)
# TODO Add abstract from semantic scholar # DONE Add abstract from semantic scholar
data: Dict[str, Any] = dict() data: Dict[str, Any] = dict()
if doi := result.doi: if doi := result.doi:
data = semanticscholar(doi) data = semanticscholar(doi)
abstract = data.get('abstract', None)
while True: while True:
choice = input("Import? [y/N], Show abstract? [a]: ").lower() choice = input("Import? [y/N], Show abstract? [a]: ").lower()
if choice in {'y', 'yes'}: if choice in {'y', 'yes'}:
self.store_result(result, source, search_term, data.get('paperId', None)) self.store_result(result, source, search_term, data.get('paperId', None), abstract)
break break
elif choice in {'', 'n', 'no'}: elif choice in {'', 'n', 'no'}:
# Store choice # Store choice
@ -175,7 +182,7 @@ class Command(BaseCommand):
pickle.dump(cache, f) pickle.dump(cache, f)
break break
elif choice == 'a': elif choice == 'a':
if abstract := data.get('abstract', None): if abstract:
self.stdout.write(abstract) self.stdout.write(abstract)
except KeyboardInterrupt: except KeyboardInterrupt:
raise CommandError("Aborted.") raise CommandError("Aborted.")

81
sok/management/commands/repair.py

@ -1,5 +1,6 @@
from pprint import pprint from pprint import pprint
from time import sleep from time import sleep
from tqdm import tqdm
from django.db import transaction from django.db import transaction
from django.core.exceptions import ValidationError from django.core.exceptions import ValidationError
@ -51,8 +52,73 @@ class Command(BaseCommand):
fixed.save() fixed.save()
self.log_success(f"Added reference: {publication} -- {fixed.identifier} -> {variant}") self.log_success(f"Added reference: {publication} -- {fixed.identifier} -> {variant}")
except ValidationError as e: except ValidationError as e:
print(orig.publication, origs, variant, fixed.identifier, fixed.reference, fixed.publication)
raise CommandError(f"{publication} -- {fixed.identifier} -> {variant}: {e}") raise CommandError(f"{publication} -- {fixed.identifier} -> {variant}: {e}")
@transaction.atomic
def fix_abstracts(self) -> None:
    """
    Add abstracts to those publications that have one on Semantic Scholar.

    Only publications without an abstract that are not a variant of another
    publication (i.e. master variants) and that are linked to at least one
    Semantic Scholar entry are considered. The first non-empty abstract
    returned for a publication is stored.

    Raises:
        CommandError: if a publication fails validation after its abstract
            has been set.
    """
    self.log_info("--- Searching for publications without abstracts ---")

    without_abstract = Publication.objects.filter(abstract__isnull=True)
    # Filtering across the reverse relation yields one row per linked
    # Semantic Scholar entry, hence distinct() to visit each publication once.
    eligible = without_abstract.filter(
        variant_of__isnull=True,
        semanticscholar__isnull=False,
    ).distinct()
    # count() lets the database do the counting instead of loading all rows.
    self.log_info(f"{eligible.count()} eligible publications found, {without_abstract.count()} without abstract")

    for publication in tqdm(eligible, unit="abstract"):
        for semantic in publication.semanticscholar_set.all():
            data = semanticscholar(semantic.paper_id)
            sleep(2)  # Throttle to avoid rate-limiting

            # The payload may lack the key (e.g. an error response) or carry
            # an empty value; both mean there is nothing to store.
            abstract = data.get('abstract')
            if not abstract:
                continue

            publication.abstract = abstract
            try:
                publication.full_clean()
                publication.save()
            except ValidationError as e:
                raise CommandError(f"{publication}: {e}") from e
            self.log_success(f"Added abstract for: {publication}")
            break  # one abstract is enough; skip the remaining entries
@transaction.atomic
def search_on_dblp(self, publication: Publication):
    """
    Search DBLP for ``publication`` by title and adopt DBLP metadata on a DOI match.

    For every search result whose DOI equals the publication's DOI
    (case-insensitively), the cite key, year, page range and - when DBLP
    provides one - the peer-review flag are copied onto the publication,
    which is then validated and saved. Results that only match on title and
    year are reported but not applied. Everything else is skipped.

    Raises:
        CommandError: if the updated publication fails validation.
    """
    _query, results, total = dblp.PublicationResult.from_search(publication.title, 100)
    if total == 0 or not results:
        return

    for result in results:
        if publication.doi and result.doi:
            if publication.doi.lower() != result.doi.lower():
                continue  # quite definitely not the same publication

            publication.cite_key = result.cite_key
            publication.year = result.year
            # Only overwrite the flag when DBLP actually knows the answer.
            if result.is_peer_reviewed is not None:
                publication.peer_reviewed = result.is_peer_reviewed
            publication.first_page = result.first_page
            publication.last_page = result.last_page
            print(publication.peer_reviewed, result)
            # keep authors from semantic scholar for now, even though they might be a little broken?
            try:
                publication.full_clean()
                publication.save()
            except ValidationError as e:
                raise CommandError(f"{publication}: {e}") from e
            self.log_success(f"Added DBLP info for: {publication}")
        elif publication.title == result.title and publication.year == result.year:
            # No DOI on at least one side: report the candidate, leave the data untouched.
            print(f"Not quite certain about {result.cite_key} for publication {publication.title}")
        # Anything else is left for manual inspection.
def find_secondary_on_dblp(self):
    """
    Try to match every publication without a DBLP cite key against DBLP.

    Publications whose stage is 'excluded' are skipped. Afterwards the number
    of distinct cite keys that still do not carry the DBLP prefix is logged.
    """
    self.log_info("--- Searching for secondary and tertiary sources in the default DBLP dump ---")

    candidates = Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX)
    for publication in tqdm(candidates, unit="publication"):
        if publication.stage != 'excluded':
            self.search_on_dblp(publication)
            sleep(2)  # Throttle to avoid rate-limiting

    # Query again: the loop above may have assigned DBLP cite keys.
    remaining_keys = Publication.objects.exclude(
        cite_key__startswith=dblp.CITE_KEY_PREFIX
    ).values_list('cite_key', flat=True).distinct()
    remaining = len(set(remaining_keys))
    self.log_info(f"Found {remaining} publications that still need to be checked")
def fix_dblp(self): def fix_dblp(self):
self.log_info("--- Searching for entries not in the default DBLP dump ---") self.log_info("--- Searching for entries not in the default DBLP dump ---")
keys_in_db = set( keys_in_db = set(
@ -93,12 +159,13 @@ class Command(BaseCommand):
) )
for publication in publications: for publication in publications:
data = semanticscholar(publication.doi) data = semanticscholar(publication.doi)
if not 'error' in data:
paper_id = data['paperId'] paper_id = data['paperId']
obj = SemanticScholar(paper_id=paper_id, publication=publication) obj = SemanticScholar(paper_id=paper_id, publication=publication)
obj.full_clean() obj.full_clean()
obj.save() obj.save()
self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}") self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}")
else: print(publication)
sleep(2) # Throttle to avoid rate-limiting sleep(2) # Throttle to avoid rate-limiting
@ -106,6 +173,8 @@ class Command(BaseCommand):
def handle(self, *args, **options): def handle(self, *args, **options):
self.fix_references() self.fix_references()
self.fix_abstracts()
self.find_secondary_on_dblp()
self.fix_dblp() self.fix_dblp()
self.find_missing_dois() self.find_missing_dois()
self.find_semanticscholar_ids() self.find_semanticscholar_ids()

98
sok/management/commands/snowball.py

@ -11,7 +11,17 @@ import requests
from django.core.management.base import BaseCommand, CommandParser, CommandError from django.core.management.base import BaseCommand, CommandParser, CommandError
from tqdm import tqdm from tqdm import tqdm
from sok.models import Publication, PublicationReference, SemanticScholar from django.db import transaction
from sok.models import (
Author,
Publication,
PublicationAuthor,
PublicationReference,
PublicationSource,
SemanticScholar,
Source,
)
def semanticscholar(identifier: str, include_unknown_references: bool = False) -> Dict[str, Any]: def semanticscholar(identifier: str, include_unknown_references: bool = False) -> Dict[str, Any]:
@ -95,6 +105,84 @@ class Command(BaseCommand):
hasher.update(raw.encode()) hasher.update(raw.encode())
return hasher.hexdigest() return hasher.hexdigest()
@transaction.atomic
def add_publ(
    self,
    paper_id,
    base: Publication,
    is_reference: bool,
) -> Publication:
    """
    Import the paper ``paper_id`` from Semantic Scholar and link it to ``base``.

    The paper's authors are created if necessary, a cite key of the form
    ``<last name of first author><year><first word of title>`` (lower case)
    is derived, and a publication with that key is reused if its title
    matches or created otherwise. On a cite key collision with a different
    title, ``_1`` is appended and the lookup is retried (at most ten times).
    Finally the Semantic Scholar entry is stored and the reference between
    ``base`` and the publication is added.

    Args:
        paper_id: Semantic Scholar paper ID of the paper to import.
        base: Publication the imported paper is linked to.
        is_reference: Whether the paper is referenced by ``base`` (True) or
            cites ``base`` (False).

    Returns:
        The publication that was found or created.

    Raises:
        CommandError: if no usable cite key is found within ten attempts.
    """
    data = semanticscholar(paper_id)

    # Add authors to database
    authors: List[Author] = []
    for entry in data.get('authors') or []:
        if name := entry.get('name', None):
            author, created = Author.objects.get_or_create(name=name)
            if created:
                self.echo(f"Added author: {author}")
            else:
                self.echo(f"Author '{author}' already known")
            authors.append(author)

    # Derive the cite key; a null title in the payload is treated as empty.
    title = data.get('title') or ''
    cite_key = authors[0].name.split(' ')[-1] if authors else ''
    cite_key += str(data.get('year'))
    cite_key += title.split(' ')[0]
    cite_key = cite_key.lower()

    # Reuse a publication with the same key and title; on a key collision
    # with a different title, try the next key. Never fall back to an
    # unrelated publication when the attempts are exhausted.
    publication = None
    for _ in range(10):
        try:
            candidate = Publication.objects.get(cite_key=cite_key)
        except Publication.DoesNotExist:
            break
        if candidate.title == title:
            publication = candidate
            break
        cite_key += '_1'
    else:
        raise CommandError(f"Could not find a free cite key for '{title}' (last tried: {cite_key})")

    # Add publication to database
    if publication is None:
        self.echo(f"Will create now with cite key {cite_key}")
        publication = Publication.objects.create(
            cite_key=cite_key,
            title=title,
            year=data.get('year', 0),
            peer_reviewed=None,
            doi=data.get('doi', None),
        )
        self.echo(f"Added publication: {publication}")
    else:
        self.echo(f"Publication '{publication}' already known")

    # Assign authors
    for position, author in enumerate(authors):
        _, created = PublicationAuthor.objects.get_or_create(
            author=author,
            publication=publication,
            position=position,
        )
        if created:
            self.echo(f"Assigned author '{author}' to publication '{publication}' at position {position}")
        else:
            self.echo(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'")

    # Add to Semantic Scholar and link publications
    new, created = SemanticScholar.objects.get_or_create(paper_id=paper_id, publication=publication)
    if created:
        new.full_clean()
        new.save()
        self.echo(f"New Semantic Scholar entry: {paper_id}")

    # NOTE(review): the two calls pass a different number of arguments;
    # confirm against the signature of add_reference.
    if is_reference:
        self.add_reference(base, new.publication)
    else:
        self.add_reference(new.publication, base, is_reference)

    return publication
def handle_objs( def handle_objs(
self, self,
base: Publication, base: Publication,
@ -137,7 +225,7 @@ class Command(BaseCommand):
paper_id = obj.get('paperId', None) paper_id = obj.get('paperId', None)
while True: while True:
self.echo("Ignore? [Y/n]", nl=False) self.echo("Ignore? [Y/n]", nl=True)
if paper_id is not None: if paper_id is not None:
self.echo(", Show abstract [a]", nl=False) self.echo(", Show abstract [a]", nl=False)
self.echo(": ") self.echo(": ")
@ -154,7 +242,11 @@ class Command(BaseCommand):
if abstract := data.get('abstract', None): if abstract := data.get('abstract', None):
self.echo(abstract) self.echo(abstract)
elif choice in {'', 'n', 'no'}: elif choice in {'', 'n', 'no'}:
# TODO Import? # TODO Import? copied and adapted from PR
if paper_id is not None:
self.add_publ(paper_id, base, is_reference)
else:
self.echo("Could not add this paper, please do it manually!")
break break
# BaseCommand # BaseCommand

18
sok/migrations/0004_alter_publication_references.py

@ -0,0 +1,18 @@
# Generated by Django 5.2.1 on 2025-05-09 11:30
from django.db import migrations, models
class Migration(migrations.Migration):
    """Alter ``Publication.references`` (many-to-many through ``PublicationReference``)."""

    dependencies = [
        ('sok', '0003_remove_publication_references_complete'),
    ]

    operations = [
        migrations.AlterField(
            model_name='publication',
            name='references',
            field=models.ManyToManyField(
                related_name='referenced_by',
                through='sok.PublicationReference',
                through_fields=('publication', 'reference'),
                to='sok.publication',
            ),
        ),
    ]

18
sok/migrations/0005_publication_abstract.py

@ -0,0 +1,18 @@
# Generated by Django 5.2.1 on 2025-05-14 07:47
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the optional ``abstract`` field (``CharField``, max. 5000 characters) to ``Publication``."""

    dependencies = [
        ('sok', '0004_alter_publication_references'),
    ]

    operations = [
        migrations.AddField(
            model_name='publication',
            name='abstract',
            field=models.CharField(
                blank=True,
                default=None,
                max_length=5000,
                null=True,
            ),
        ),
    ]

18
sok/migrations/0006_alter_publication_abstract.py

@ -0,0 +1,18 @@
# Generated by Django 5.2.1 on 2025-05-14 08:15
from django.db import migrations, models
class Migration(migrations.Migration):
    """Change ``Publication.abstract`` to an optional ``TextField``."""

    dependencies = [
        ('sok', '0005_publication_abstract'),
    ]

    operations = [
        migrations.AlterField(
            model_name='publication',
            name='abstract',
            field=models.TextField(
                blank=True,
                default=None,
                null=True,
            ),
        ),
    ]

11
sok/models.py

@ -66,6 +66,7 @@ class Publication(models.Model):
first_page = models.PositiveSmallIntegerField(blank=True, null=True, default=None) first_page = models.PositiveSmallIntegerField(blank=True, null=True, default=None)
last_page = models.PositiveSmallIntegerField(blank=True, null=True, default=None) last_page = models.PositiveSmallIntegerField(blank=True, null=True, default=None)
doi = models.CharField(max_length=255, unique=True, blank=True, null=True, default=None) doi = models.CharField(max_length=255, unique=True, blank=True, null=True, default=None)
abstract = models.TextField(unique=False, blank=True, null=True, default=None)
variant_of = models.ForeignKey( variant_of = models.ForeignKey(
'Publication', 'Publication',
@ -121,6 +122,16 @@ class Publication(models.Model):
if self.references.filter(exclusion_criteria__isnull=True, sources__isnull=False): if self.references.filter(exclusion_criteria__isnull=True, sources__isnull=False):
return 'tertiary' return 'tertiary'
if ref_by := self.referenced_by.filter(exclusion_criteria__isnull=True):
stages = set([ref.stage for ref in ref_by])
if 'secondary' in stages: return 'sec-secondary'
elif 'tertiary' in stages: return 'sec-tertiary'
if refs := self.references.filter(exclusion_criteria__isnull=True):
stages = set([ref.stage for ref in ref_by])
if 'secondary' in stages: return 'tert-secondary'
elif 'tertiary' in stages: return 'tert-tertiary'
return None return None
def __str__(self) -> str: def __str__(self) -> str:

3
sokman/settings.py

@ -132,3 +132,6 @@ USE_TZ = True
# https://docs.djangoproject.com/en/3.1/howto/static-files/ # https://docs.djangoproject.com/en/3.1/howto/static-files/
STATIC_URL = '/static/' STATIC_URL = '/static/'
# Add this to avoid the warning models.W042
DEFAULT_AUTO_FIELD='django.db.models.AutoField'

Loading…
Cancel
Save