Compare commits

...

2 Commits

Author SHA1 Message Date
Maya Herrscher 854cdddad5 Further improve snowballing 2 months ago
Maya Herrscher 65a6008c8f Add transitive solution for snowballing stages 2 months ago
  1. 9
      sok/admin.py
  2. 25
      sok/management/commands/snowball.py
  3. 41
      sok/models.py

9
sok/admin.py

@ -64,6 +64,7 @@ class PublicationStageFilter(admin.SimpleListFilter):
return ( return (
('primary', _("primary")), ('primary', _("primary")),
('secondary', _("secondary")), ('secondary', _("secondary")),
('2-secondary', _("2-secondary")),
('tertiary', _("tertiary")), ('tertiary', _("tertiary")),
('excluded', _("excluded")), ('excluded', _("excluded")),
('-', _("-")), ('-', _("-")),
@ -95,6 +96,14 @@ class PublicationStageFilter(admin.SimpleListFilter):
referenced_by__sources__isnull=False, referenced_by__sources__isnull=False,
) )
if self.value() == '2-secondary':
ids: Set[int] = {
publication.id
for publication in queryset
if publication.stage == '2-secondary'
}
return queryset.filter(id__in=ids)
if self.value() == '-': if self.value() == '-':
ids: Set[int] = { ids: Set[int] = {
publication.id publication.id

25
sok/management/commands/snowball.py

@ -124,12 +124,12 @@ class Command(BaseCommand):
if created: if created:
self.echo(f"Added author: {author}") self.echo(f"Added author: {author}")
else: else:
self.echo(f"Author '{author}' alreay known") self.echo(f"Author '{author}' already known")
authors.append(author) authors.append(author)
cite_key = '' cite_key = ''
if authors: if authors:
cite_key = authors[0].name.split(' ')[-1] cite_key = authors[0].name.split(' ')[-1]
cite_key += str(data.get('year')) cite_key += str(data.get('year'))
title = data.get('title', '') title = data.get('title', '')
cite_key += title.split(' ')[0] cite_key += title.split(' ')[0]
@ -147,20 +147,20 @@ class Command(BaseCommand):
# Add publication to database # Add publication to database
doi = data.get('doi', None) doi = data.get('doi', None)
if not publication: if not publication:
self.echo(f"Will create now with cite key {cite_key}")
publication = Publication.objects.create( publication = Publication.objects.create(
cite_key=cite_key, cite_key=cite_key,
title=title, title=title,
year=data.get('year', 0), year=data.get('year', 0),
peer_reviewed=None, peer_reviewed=None,
doi=doi, doi=doi,
abstract=data.get('abstract', None),
) )
self.echo(f"Added publication: {publication}") self.echo(f"Added publication: {publication}")
else: else:
self.echo(f"Publication '{publication}' already known") self.echo(f"Publication '{publication}' already known")
# Assign authors # Assign authors
for position, author in enumerate(authors): for position, author in enumerate(list(set(authors))):
publication_author, created = PublicationAuthor.objects.get_or_create( publication_author, created = PublicationAuthor.objects.get_or_create(
author=author, author=author,
publication=publication, publication=publication,
@ -242,11 +242,11 @@ class Command(BaseCommand):
if abstract := data.get('abstract', None): if abstract := data.get('abstract', None):
self.echo(abstract) self.echo(abstract)
elif choice in {'', 'n', 'no'}: elif choice in {'', 'n', 'no'}:
# TODO Import? copied and adapted from PR # DONE Import? copied and adapted from PR
if paper_id is not None: if paper_id is not None:
self.add_publ(paper_id, base, is_reference) self.add_publ(paper_id, base, is_reference)
else: else:
self.echo("Could not add this paper, please do it manually!") self.warn("Could not add this paper, please do it manually!")
break break
# BaseCommand # BaseCommand
@ -255,11 +255,13 @@ class Command(BaseCommand):
parser.add_argument('--reset-choices', action='store_true') parser.add_argument('--reset-choices', action='store_true')
parser.add_argument('--no-references', action='store_true') parser.add_argument('--no-references', action='store_true')
parser.add_argument('--no-citations', action='store_true') parser.add_argument('--no-citations', action='store_true')
parser.add_argument('-s', '--stage', type=int, default=10000)
def handle(self, *args, **options): def handle(self, *args, **options):
reset_choices: bool = options['reset_choices'] reset_choices: bool = options['reset_choices']
no_citations: bool = options['no_citations'] no_citations: bool = options['no_citations']
no_references: bool = options['no_references'] no_references: bool = options['no_references']
stage: int = options['stage']
self.cache_path = Path('.choices.semanticscholar.pickle') self.cache_path = Path('.choices.semanticscholar.pickle')
self.cache: Set[str] = set() self.cache: Set[str] = set()
@ -275,9 +277,12 @@ class Command(BaseCommand):
semanticscholar__isnull=False, semanticscholar__isnull=False,
exclusion_criteria__isnull=True, exclusion_criteria__isnull=True,
) )
if stage < 10000:
publications = [p for p in publications if p.stage_added() == stage]
self.echo(f"==== {len(publications)} publications from stage {stage} will be shown ====")
try: try:
for publication in tqdm(publications, unit="publication"): for publication in tqdm(publications, unit="publication"):
self.echo(f"=== Publication {publication} ===") self.echo(f"=== Publication {publication}: {publication.title} ===")
for semantic in publication.semanticscholar_set.all(): for semantic in publication.semanticscholar_set.all():
data = semanticscholar(semantic.paper_id) data = semanticscholar(semantic.paper_id)

41
sok/models.py

@ -103,6 +103,24 @@ class Publication(models.Model):
def relevant_referenced_by(self) -> QuerySet: def relevant_referenced_by(self) -> QuerySet:
return self.referenced_by.filter(exclusion_criteria__isnull=True) return self.referenced_by.filter(exclusion_criteria__isnull=True)
def stage_added(self, _seen: Optional[frozenset] = None) -> int:
    """Return the snowballing stage at which this publication was added.

    Stage 0 is a publication that has its own sources, stage 1 is a
    publication directly linked (in either direction) to a non-excluded
    publication with sources, and every further stage is one more than
    the lowest stage among the non-excluded linked publications.
    Publications citing this one (``referenced_by``) are preferred over
    the ones it cites (``references``); the latter are only consulted
    when the former yield no candidate.

    Args:
        _seen: Internal recursion guard holding the primary keys of the
            publications currently being resolved. Callers should leave
            it at its default.
            NOTE(review): assumes ``self.pk`` uniquely identifies a saved
            publication -- confirm no unsaved instances reach this method.

    Returns:
        The stage as an integer. ``10000`` is the sentinel for "no stage":
        it is returned for irrelevant publications and for publications
        with no usable link towards a stage-0 publication. A value above
        ``10000`` therefore means the chain of links ended without ever
        reaching a publication that has sources.
    """
    if not self.is_relevant:
        return 10000
    if self.sources.exists():
        return 0

    relevant = {'exclusion_criteria__isnull': True}
    # .exists() asks the database for a single row instead of loading the
    # whole queryset just to test its truthiness.
    if (
        self.referenced_by.filter(sources__isnull=False, **relevant).exists()
        or self.references.filter(sources__isnull=False, **relevant).exists()
    ):
        return 1

    # referenced_by and references mirror each other, so without this guard
    # the recursion bounces between two linked publications forever when
    # neither of them can reach a stage-0 publication.
    seen = (_seen or frozenset()) | {self.pk}
    for linked in (self.referenced_by, self.references):
        stages = [
            ref.stage_added(seen)
            for ref in linked.filter(**relevant)
            if ref.pk not in seen
        ]
        if stages:
            return 1 + min(stages)

    # Nothing left to follow: report the sentinel rather than an implicit
    # None, which would contradict the declared return type.
    return 10000
@property @property
def stage(self) -> Optional[str]: def stage(self) -> Optional[str]:
if not self.is_relevant: if not self.is_relevant:
@ -113,24 +131,27 @@ class Publication(models.Model):
return 'primary' return 'primary'
# Referenced by primary (backward snowballing) # Referenced by primary (backward snowballing)
# TODO make transitive # DONE make transitive
if self.referenced_by.filter(exclusion_criteria__isnull=True, sources__isnull=False): if self.referenced_by.filter(exclusion_criteria__isnull=True, sources__isnull=False):
return 'secondary' return 'secondary'
# References a primary (forward snowballing) # References a primary (forward snowballing)
# TODO make transitive
if self.references.filter(exclusion_criteria__isnull=True, sources__isnull=False): if self.references.filter(exclusion_criteria__isnull=True, sources__isnull=False):
return 'tertiary' return 'tertiary'
if ref_by := self.referenced_by.filter(exclusion_criteria__isnull=True): # prefer secondary over tertiary regardless of stage for now
stages = set([ref.stage for ref in ref_by]) exclude = False
if 'secondary' in stages: return 'sec-secondary' if self.referenced_by.filter(exclusion_criteria__isnull=True):
elif 'tertiary' in stages: return 'sec-tertiary' stage = self.stage_added()
if stage > 10000: exclude = True
else: return str(stage) + '-secondary'
if refs := self.references.filter(exclusion_criteria__isnull=True): if self.references.filter(exclusion_criteria__isnull=True):
stages = set([ref.stage for ref in ref_by]) stage = self.stage_added()
if 'secondary' in stages: return 'tert-secondary' if stage > 10000: return 'excluded-by-ref'
elif 'tertiary' in stages: return 'tert-tertiary' else: return str(stage) + '-tertiary'
if exclude: return 'excluded-by-ref'
return None return None

Loading…
Cancel
Save