Compare commits

...

2 Commits

Author SHA1 Message Date
Maya Herrscher 854cdddad5 Further improve snowballing 2 months ago
Maya Herrscher 65a6008c8f Add transitive solution for snowballing stages 2 months ago
  1. 9
      sok/admin.py
  2. 25
      sok/management/commands/snowball.py
  3. 41
      sok/models.py

9
sok/admin.py

@ -64,6 +64,7 @@ class PublicationStageFilter(admin.SimpleListFilter):
return (
('primary', _("primary")),
('secondary', _("secondary")),
('2-secondary', _("2-secondary")),
('tertiary', _("tertiary")),
('excluded', _("excluded")),
('-', _("-")),
@ -95,6 +96,14 @@ class PublicationStageFilter(admin.SimpleListFilter):
referenced_by__sources__isnull=False,
)
if self.value() == '2-secondary':
ids: Set[int] = {
publication.id
for publication in queryset
if publication.stage == '2-secondary'
}
return queryset.filter(id__in=ids)
if self.value() == '-':
ids: Set[int] = {
publication.id

25
sok/management/commands/snowball.py

@ -124,12 +124,12 @@ class Command(BaseCommand):
if created:
self.echo(f"Added author: {author}")
else:
self.echo(f"Author '{author}' alreay known")
self.echo(f"Author '{author}' already known")
authors.append(author)
cite_key = ''
if authors:
cite_key = authors[0].name.split(' ')[-1]
cite_key += str(data.get('year'))
cite_key = ''
if authors:
cite_key = authors[0].name.split(' ')[-1]
cite_key += str(data.get('year'))
title = data.get('title', '')
cite_key += title.split(' ')[0]
@ -147,20 +147,20 @@ class Command(BaseCommand):
# Add publication to database
doi = data.get('doi', None)
if not publication:
self.echo(f"Will create now with cite key {cite_key}")
publication = Publication.objects.create(
cite_key=cite_key,
title=title,
year=data.get('year', 0),
peer_reviewed=None,
doi=doi,
abstract=data.get('abstract', None),
)
self.echo(f"Added publication: {publication}")
else:
self.echo(f"Publication '{publication}' already known")
# Assign authors
for position, author in enumerate(authors):
for position, author in enumerate(list(set(authors))):
publication_author, created = PublicationAuthor.objects.get_or_create(
author=author,
publication=publication,
@ -242,11 +242,11 @@ class Command(BaseCommand):
if abstract := data.get('abstract', None):
self.echo(abstract)
elif choice in {'', 'n', 'no'}:
# TODO Import? copied and adapted from PR
# DONE Import? copied and adapted from PR
if paper_id is not None:
self.add_publ(paper_id, base, is_reference)
else:
self.echo("Could not add this paper, please do it manually!")
self.warn("Could not add this paper, please do it manually!")
break
# BaseCommand
@ -255,11 +255,13 @@ class Command(BaseCommand):
parser.add_argument('--reset-choices', action='store_true')
parser.add_argument('--no-references', action='store_true')
parser.add_argument('--no-citations', action='store_true')
parser.add_argument('-s', '--stage', type=int, default=10000)
def handle(self, *args, **options):
reset_choices: bool = options['reset_choices']
no_citations: bool = options['no_citations']
no_references: bool = options['no_references']
stage: int = options['stage']
self.cache_path = Path('.choices.semanticscholar.pickle')
self.cache: Set[str] = set()
@ -275,9 +277,12 @@ class Command(BaseCommand):
semanticscholar__isnull=False,
exclusion_criteria__isnull=True,
)
if stage < 10000:
publications = [p for p in publications if p.stage_added() == stage]
self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====")
try:
for publication in tqdm(publications, unit="publication"):
self.echo(f"=== Publication {publication} ===")
self.echo(f"=== Publication {publication}: {publication.title} ===")
for semantic in publication.semanticscholar_set.all():
data = semanticscholar(semantic.paper_id)

41
sok/models.py

@ -103,6 +103,24 @@ class Publication(models.Model):
def relevant_referenced_by(self) -> QuerySet:
    """Return the ``referenced_by`` entries that have no exclusion criteria."""
    citing = self.referenced_by
    return citing.filter(exclusion_criteria__isnull=True)
def stage_added(self, _visiting: Optional[frozenset] = None) -> int:
    """Return the snowballing stage in which this publication was added.

    The result is ``0`` for a publication that has sources and ``1`` for one
    that is directly linked (in either direction) to a non-excluded
    publication with sources.  Otherwise it is one more than the lowest
    stage among the non-excluded ``referenced_by`` publications or, when
    there are none, among the non-excluded ``references``.  Values of
    ``10000`` and above mean that no stage could be determined.

    ``_visiting`` is internal: it carries the primary keys of the
    publications on the current recursion path so that citation cycles
    terminate instead of recursing without bound.  Callers should not
    pass it.
    """
    # Same sentinel as the default of the snowball command's ``--stage``.
    no_stage = 10000
    if _visiting is None:
        _visiting = frozenset()
    if self.pk in _visiting:
        # This publication is already being resolved further up the call
        # stack; passing through it again cannot produce a lower stage.
        return no_stage
    if not self.is_relevant:
        return no_stage
    if self.sources.exists():
        return 0
    # ``exists()`` answers the yes/no question without loading every row.
    if (
        self.referenced_by.filter(exclusion_criteria__isnull=True, sources__isnull=False).exists()
        or self.references.filter(exclusion_criteria__isnull=True, sources__isnull=False).exists()
    ):
        return 1
    path = _visiting | {self.pk}
    # ``referenced_by`` takes precedence: ``references`` is only consulted
    # when there is no non-excluded ``referenced_by`` entry at all.
    for linked in (
        self.referenced_by.filter(exclusion_criteria__isnull=True),
        self.references.filter(exclusion_criteria__isnull=True),
    ):
        neighbours = list(linked)
        if neighbours:
            return 1 + min(pub.stage_added(path) for pub in neighbours)
    # Relevant, but without sources and without any non-excluded link.
    return no_stage
@property
def stage(self) -> Optional[str]:
if not self.is_relevant:
@ -113,24 +131,27 @@ class Publication(models.Model):
return 'primary'
# Referenced by primary (backward snowballing)
# TODO make transitive
# DONE make transitive
if self.referenced_by.filter(exclusion_criteria__isnull=True, sources__isnull=False):
return 'secondary'
# References a primary (forward snowballing)
# TODO make transitive
if self.references.filter(exclusion_criteria__isnull=True, sources__isnull=False):
return 'tertiary'
if ref_by := self.referenced_by.filter(exclusion_criteria__isnull=True):
stages = set([ref.stage for ref in ref_by])
if 'secondary' in stages: return 'sec-secondary'
elif 'tertiary' in stages: return 'sec-tertiary'
# prefer secondary over tertiary regardless of stage for now
exclude = False
if self.referenced_by.filter(exclusion_criteria__isnull=True):
stage = self.stage_added()
if stage > 10000: exclude = True
else: return str(stage) + '-secondary'
if refs := self.references.filter(exclusion_criteria__isnull=True):
stages = set([ref.stage for ref in ref_by])
if 'secondary' in stages: return 'tert-secondary'
elif 'tertiary' in stages: return 'tert-tertiary'
if self.references.filter(exclusion_criteria__isnull=True):
stage = self.stage_added()
if stage > 10000: return 'excluded-by-ref'
else: return str(stage) + '-tertiary'
if exclude: return 'excluded-by-ref'
return None

Loading…
Cancel
Save