From 1780dc40a3d57142b3ace596d93fe4cdb30b6c52 Mon Sep 17 00:00:00 2001 From: Maximilian Blochberger Date: Wed, 10 Mar 2021 14:54:41 +0100 Subject: [PATCH] Initial version --- .gitignore | 10 + LICENSE | 15 + README.md | 132 +++++++ dblp/.gitkeep | 0 manage.py | 22 ++ requirements.txt | 3 + sok/__init__.py | 0 sok/admin.py | 367 +++++++++++++++++++ sok/apps.py | 5 + sok/management/commands/citations.py | 60 ++++ sok/management/commands/dblpimport.py | 497 ++++++++++++++++++++++++++ sok/management/commands/dblpsearch.py | 181 ++++++++++ sok/management/commands/dblptex.py | 20 ++ sok/management/commands/repair.py | 111 ++++++ sok/management/commands/snowball.py | 202 +++++++++++ sok/management/commands/stats.py | 49 +++ sok/management/commands/tagdag.py | 38 ++ sok/migrations/0001_initial.py | 161 +++++++++ sok/migrations/__init__.py | 0 sok/models.py | 189 ++++++++++ sok/tests.py | 3 + sok/views.py | 3 + sokman/__init__.py | 0 sokman/asgi.py | 16 + sokman/settings.py | 134 +++++++ sokman/urls.py | 21 ++ sokman/wsgi.py | 16 + 27 files changed, 2255 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 dblp/.gitkeep create mode 100755 manage.py create mode 100644 requirements.txt create mode 100644 sok/__init__.py create mode 100644 sok/admin.py create mode 100644 sok/apps.py create mode 100644 sok/management/commands/citations.py create mode 100644 sok/management/commands/dblpimport.py create mode 100644 sok/management/commands/dblpsearch.py create mode 100644 sok/management/commands/dblptex.py create mode 100644 sok/management/commands/repair.py create mode 100644 sok/management/commands/snowball.py create mode 100644 sok/management/commands/stats.py create mode 100644 sok/management/commands/tagdag.py create mode 100644 sok/migrations/0001_initial.py create mode 100644 sok/migrations/__init__.py create mode 100644 sok/models.py create mode 100644 sok/tests.py create mode 100644 sok/views.py create mode 
100644 sokman/__init__.py create mode 100644 sokman/asgi.py create mode 100644 sokman/settings.py create mode 100644 sokman/urls.py create mode 100644 sokman/wsgi.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ec40429 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +*.sqlite3 +*.secret +*.pickle +dblp/*.dtd +dblp/*.xml +dblp/*.gz +dblp/*.md5 + +# Python +__pycache__ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c519744 --- /dev/null +++ b/LICENSE @@ -0,0 +1,15 @@ +ISC License + +Copyright (c) 2021, Maximilian Blochberger + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..4dbe023 --- /dev/null +++ b/README.md @@ -0,0 +1,132 @@ +# `sokman` – Manages your SoKs + +`sokman` aids in creating reproducible systematic mappings or systematic literature reviews and tries to automate some of the tedious tasks, such as searching, snowballing, and exporting a visual graph representation of the created systematization of knowledge (SoK). + +`sokman` is a Django project. It basically uses Django's admin interface and some management commands, so it is designed to run locally – you have been warned. 
+ +Data sources: +- [DBLP](https://dblp.org) is used for searching publications, importing publications, and exporting Bibtex entries +- [Semantic Scholar](https://semanticscholar.org) is used for snowballing. + +## Systematic Mapping + +`sokman` helps keeping track of a systematic mapping process, as described by Petersen [1]. The steps can be performed as described below. + +### Definition of Research Question + +Sorry, not automated, yet. + +### Conduct Search + +Use pre-defined search terms, such as `systematic mapping` and then you can search DBLP. + +```sh +./manage.py dblpsearch 'systematic mapping' +``` + +Identified publications can be imported directly or they can be excluded. Known publications will be skipped automatically and excluded publications are memoized and not further asked in subsequent searches but can be reset with `--reset-choices`. +Initial screening can be performed by showing the publication's abstract. Note that excluded publications will not be added to the database, and an exclusion criterion is therefore not documented. + +Note that DBLP only performs matching based on the title, author, and venue and does not perform full-text search of the publication. Other sources should be considered, but for now only DBLP is supported. + +Publications can be marked as duplicate or variant of other publications in the admin interface. + +#### Snowballing + +In order to identify additional publications, snowballing should be performed by backward and forward searching references and citations. + +```sh +./manage.py snowball +``` +If only a backward or forward search should be performed `--no-citations` and `--no-references` can be specified respectively. + +Note that importing publications identified through snowballing is not yet possible automatically. Publications have to be added manually through the admin interface. 
If the publication is on DBLP, it can simply be imported by using the following command, where the cite key is for example `DBLP:conf/ease/PetersenFMM08`: + +```sh +./manage.py dblpimport 'DBLP:conf/ease/PetersenFMM08' +``` + +Detected references and citations will be automatically added to existing publications. Known publications will be skipped and excluded publications will be memoized and can be reset with `--reset-choices`. + +You might need to run this command until no further references are found. You might need to manually add Semantic Scholar paper IDs to publications. Publications that have a DOI can be identified automatically by running: + +```sh +./manage.py repair +``` + +Repair will also add synthetic references and citations if another version or duplicate is identified. + +### Screening + +Coarse screening based on the abstract of a publication can be performed during searching or snowballing, as the abstract can be shown there and the publication does not need to be included. Note that no exclusion criterion will be assigned to the publication in that case, as the publications will not be added to the database. +Detailed screening can be performed using the admin interface, where relevant publications can be identified and others can be excluded by adding custom exclusion criteria. + +#### Publication Stages + +Publications are assigned a stage: +- *primary* – Relevant publications that are identified by a search term directly. +- *secondary* – Relevant publications that are cited by a primary publication. +- *tertiary* - Relevant publications that cite a primary publication. +- *excluded* - Irrelevant publications that were excluded with an exclusion criterion. +- *-* – All other publications, including transitive citations/cites. + +### Keywording using Abstracts + +Keywording can be performed using the admin interface by assigning tags to the publication. 
For each tag, specific criteria can be added that specify when a tag should be applied to a publication. + +### Data Extraction and Mapping Process + +Mapping can be performed by setting tags into relation, e. g., by grouping. This can be done by editing tags in the admin interface. Tags can imply other tags. + +Tags and citations can be exported in DOT format, which can be rendered using Graphviz. + +```sh +# Render relevant publications that have at least ten citations +./manage.py citations --min-citations 10 > citations.dot && dot -Tsvg -ocitations.svg citations.dot + +# Render tag DAG +./manage.py tagdag --root 'Literature on Systematic Mapping' > sysmap.dot && dot -Tsvg -osysmap.svg sysmap.dot + +# Render with TIKZ +pip install dot2tex +dot2tex -f tikz --usepdflatex sysmap.dot > sysmap.tex +``` + +If you want to get the BIB entry from DBLP: + +```sh +./manage.py dblptex 'DBLP:conf/ease/PetersenFMM08' >> references.bib +``` + +## Installation + +```sh +# Check out repository +git clone https://github.com/blochberger/sokman.git +cd sokman + +# Set up virtual environment +python -m venv .env +. .env/bin/activate +pip install --upgrade pip setuptools wheel +pip install -r requirements.txt + +# Loading DBLP dumps +cd dblp +curl -O 'https://dblp.org/xml/release/dblp-2019-11-22.dtd' +curl -O 'https://dblp.org/xml/release/dblp-2021-03-01.xml.gz' +gunzip 'dblp-2021-03-01.xml.gz' +cd .. + +# Set up +./manage.py makemigrations +./manage.py migrate +./manage.py createsuperuser + +# Run web server +./manage.py runserver +``` + +--- + +1. 
Kai Petersen, Robert Feldt, Shahid Mujtaba, Michael Mattsson: [**Systematic Mapping Studies in Software Engineering**](http://ewic.bcs.org/content/ConWebDoc/19543), EASE 2008 diff --git a/dblp/.gitkeep b/dblp/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/manage.py b/manage.py new file mode 100755 index 0000000..9a22207 --- /dev/null +++ b/manage.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +"""Django's command-line utility for administrative tasks.""" +import os +import sys + + +def main(): + """Run administrative tasks.""" + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'sokman.settings') + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?" + ) from exc + execute_from_command_line(sys.argv) + + +if __name__ == '__main__': + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6be6a1f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +django +requests +tqdm diff --git a/sok/__init__.py b/sok/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sok/admin.py b/sok/admin.py new file mode 100644 index 0000000..67e001e --- /dev/null +++ b/sok/admin.py @@ -0,0 +1,367 @@ +from typing import Optional, Set, Tuple + +from django.contrib import admin, messages +from django.db.models import Count, F, Q +from django.db.models.query import QuerySet +from django.http import HttpRequest +from django.utils.translation import gettext_lazy as _ + +from .models import ( + Author, + ExclusionCriterion, + Publication, + SearchTerm, + SemanticScholar, + Source, + Tag, +) + + +# Filters + + +class PublicationVariantFilter(admin.SimpleListFilter): + title = _("is variant") + parameter_name = 'variant' + + def lookups(self, request: HttpRequest, model_admin) -> Tuple[Tuple[str, 
str], ...]: + return ( + ('yes', _("yes")), + ('no', _("no")), + ) + + def queryset(self, request: HttpRequest, queryset: QuerySet) -> QuerySet: + if self.value() == 'yes': + return queryset.filter(variant_of__isnull=False) + if self.value() == 'no': + return queryset.filter(variant_of__isnull=True) + return queryset + + +class PublicationRelevanceFilter(admin.SimpleListFilter): + title = _("is relevant") + parameter_name = 'is_relevant' + + def lookups(self, request: HttpRequest, model_admin) -> Tuple[Tuple[str, str], ...]: + return ( + ('yes', _("yes")), + ('no', _("no")), + ) + + def queryset(self, request: HttpRequest, queryset: QuerySet) -> QuerySet: + if self.value() == 'yes': + return queryset.filter(exclusion_criteria__isnull=True) + if self.value() == 'no': + return queryset.filter(exclusion_criteria__isnull=False) + return queryset + + +class PublicationStageFilter(admin.SimpleListFilter): + title = _("stage") + parameter_name = 'stage' + + def lookups(self, request: HttpRequest, model_admin) -> Tuple[Tuple[str, str], ...]: + return ( + ('primary', _("primary")), + ('secondary', _("secondary")), + ('tertiary', _("tertiary")), + ('excluded', _("excluded")), + ('-', _("-")), + ) + + def queryset(self, request: HttpRequest, queryset: QuerySet) -> QuerySet: + if self.value() == 'excluded': + return queryset.filter(exclusion_criteria__isnull=False) + + relevant = queryset.filter(exclusion_criteria__isnull=True) + + if self.value() == 'primary': + return relevant.filter(sources__isnull=False) + + if self.value() == 'secondary': + return relevant.filter( + referenced_by__exclusion_criteria__isnull=True, + referenced_by__sources__isnull=False, + sources__isnull=True, + ) + + if self.value() == 'tertiary': + return relevant.filter( + references__exclusion_criteria__isnull=True, + references__sources__isnull=False, + sources__isnull=True, + ).exclude( + referenced_by__exclusion_criteria__isnull=True, + referenced_by__sources__isnull=False, + ) + + if self.value() 
== '-': + ids: Set[int] = { + publication.id + for publication in queryset + if publication.stage is None + } + return queryset.filter(id__in=ids) + + return queryset + + +class TagCategoryFilter(admin.SimpleListFilter): + title = _("category") + parameter_name = 'category' + + def lookups(self, request: HttpRequest, model_admin) -> Tuple[Tuple[str, str], ...]: + return ( + (str(tag.pk), tag.name) + for tag in Tag.objects.filter(implies__isnull=True) + ) + + def queryset(self, request: HttpRequest, queryset: QuerySet) -> QuerySet: + if value := self.value(): + pk = int(value) + category = Tag.objects.get(pk=pk) + # TODO Make transitive? + return queryset.filter(implies=category) + return queryset + + +# Inlines + + +class AuthorPublicationsInline(admin.TabularInline): + model = Author.publications.through + extra = 0 + ordering = ['position'] + autocomplete_fields = ('publication',) + + +class ExclusionCriterionPublications(admin.TabularInline): + model = ExclusionCriterion.publications.through + extra = 0 + autocomplete_fields = ('publication',) + + +class PublicationAuthorsInline(admin.TabularInline): + model = Publication.authors.through + extra = 0 + ordering = ['position'] + autocomplete_fields = ('author',) + + +class PublicationSourcesInline(admin.TabularInline): + model = Publication.sources.through + extra = 0 + autocomplete_fields = ('source', 'search_term') + + +class PublicationCitationsInline(admin.TabularInline): + verbose_name = "citation" + model = Publication.referenced_by.through + fk_name = 'reference' + extra = 0 + ordering = ['identifier'] + autocomplete_fields = ('publication',) + + +class PublicationReferencesInline(admin.TabularInline): + verbose_name = "reference" + model = Publication.references.through + fk_name = 'publication' + extra = 0 + ordering = ['identifier'] + autocomplete_fields = ('reference',) + + +class PublicationTagsInline(admin.TabularInline): + model = Publication.tags.through + extra = 0 + autocomplete_fields = ('tag',) 
+ + +class TagPublicationsInline(admin.TabularInline): + model = Tag.publications.through + extra = 0 + autocomplete_fields = ('publication',) + + +# Models + + +@admin.register(Author) +class AuthorAdmin(admin.ModelAdmin): + list_display = ('name', 'publication_count', 'relevant_publication_count') + search_fields = ('name',) + inlines = (AuthorPublicationsInline,) + + def get_queryset(self, request: HttpRequest) -> QuerySet: + return Author.objects.annotate( + publication_count=Count('publications', distinct=True), + relevant_publication_count=Count( + 'publications', + filter=Q(publications__exclusion_criteria__isnull=True), + distinct=True, + ), + ) + + def publication_count(self, obj: Author) -> int: + return obj.publication_count + + def relevant_publication_count(self, obj: Author) -> int: + return obj.relevant_publication_count + + publication_count.short_description = "publications" + publication_count.admin_order_field = 'publication_count' + relevant_publication_count.short_description = "rel. 
publications" + relevant_publication_count.admin_order_field = 'relevant_publication_count' + + +@admin.register(ExclusionCriterion) +class ExclusionCriteriaAdmin(admin.ModelAdmin): + list_display = ('name', 'publication_count') + search_fields = ('name',) + inlines = (ExclusionCriterionPublications,) + + def get_queryset(self, request: HttpRequest) -> QuerySet: + return ExclusionCriterion.objects.annotate(publication_count=Count('publications')) + + def publication_count(self, obj: ExclusionCriterion) -> int: + return obj.publication_count + + publication_count.short_description = "publications" + publication_count.admin_order_field = 'publication_count' + + +@admin.register(Tag) +class TagAdmin(admin.ModelAdmin): + list_display = ('name', 'publication_count', 'total_publications') + list_filter = (TagCategoryFilter,) + search_fields = ('name',) + autocomplete_fields = ('implies',) + inlines = (TagPublicationsInline,) + + def get_queryset(self, request: HttpRequest) -> QuerySet: + return Tag.objects.annotate(publication_count=Count('publications')) + + #def _implied_by(self, obj: Tag) -> str: + # return ", ".join(map(str, obj.implied_by.order_by('name'))) + + def publication_count(self, obj: Tag) -> int: + return obj.publication_count + + publication_count.short_description = "publications" + publication_count.admin_order_field = "publication_count" + + +@admin.register(SemanticScholar) +class SemanticScholarAdmin(admin.ModelAdmin): + list_display = ('paper_id', 'publication') + search_fields = ('paper_id', 'publication') + autocomplete_fields = ('publication',) + + +@admin.register(SearchTerm) +class SearchTermAdmin(admin.ModelAdmin): + list_display = ('name', 'publication_count') + search_fields = ('name',) + + def get_queryset(self, request: HttpRequest) -> QuerySet: + return SearchTerm.objects.annotate( + publication_count=Count( + 'publicationsource__publication', + filter=Q(publicationsource__publication__exclusion_criteria__isnull=True), + distinct=True, + 
), + ) + + def publication_count(self, obj: SearchTerm) -> int: + return obj.publication_count + + publication_count.short_description = "publications" + publication_count.admin_order_field = 'publication_count' + + +@admin.register(Source) +class SourceAdmin(admin.ModelAdmin): + list_display = ('name', 'publication_count') + search_fields = ('name',) + + def get_queryset(self, request: HttpRequest) -> QuerySet: + return Source.objects.annotate( + publication_count=Count('publications', distinct=True), + ) + + def publication_count(self, obj: Source) -> int: + return obj.publication_count + + publication_count.short_description = "publications" + publication_count.admin_order_field = 'publication_count' + + +@admin.register(Publication) +class PublicationAdmin(admin.ModelAdmin): + search_fields = ('cite_key', 'doi', 'title') + list_display = ( + #'cite_key', + 'title', + 'year', + 'citation_count', + 'references_count', + 'page_count', + 'stage', + 'classified', + ) + list_filter = ( + PublicationStageFilter, + PublicationRelevanceFilter, + 'classified', + 'references_complete', + 'peer_reviewed', + PublicationVariantFilter, + #'year', + #'sources', + ) + inlines = ( + PublicationAuthorsInline, + PublicationReferencesInline, + PublicationCitationsInline, + PublicationSourcesInline, + PublicationTagsInline, + ) + autocomplete_fields = ('exclusion_criteria', 'variant_of') + actions = ('cite',) + + def get_queryset(self, request: HttpRequest) -> QuerySet: + return Publication.objects.annotate( + citation_count=Count( + 'referenced_by', + filter=Q(exclusion_criteria__isnull=True), + distinct=True, + ), + references_count=Count( + 'references', + filter=Q(exclusion_criteria__isnull=True), + distinct=True, + ), + page_count=1 + F('last_page') - F('first_page'), + ) + + def citation_count(self, obj: Publication) -> int: + return obj.citation_count + + def references_count(self, obj: Publication) -> int: + return obj.references_count + + def page_count(self, obj: 
Publication) -> int: + return obj.page_count + + def cite(self, request: HttpRequest, queryset: QuerySet): + cite_keys = queryset.order_by('cite_key').values_list('cite_key', flat=True).distinct() + cite_str = ", ".join(list(cite_keys)) + self.message_user(request, f"\\cite{{{cite_str}}}", level=messages.SUCCESS) + + citation_count.short_description = "citations" + citation_count.admin_order_field = 'citation_count' + references_count.short_description = "references" + references_count.admin_order_field = 'references_count' + page_count.short_description = "pages" + page_count.admin_order_field = 'page_count' diff --git a/sok/apps.py b/sok/apps.py new file mode 100644 index 0000000..dc3a91a --- /dev/null +++ b/sok/apps.py @@ -0,0 +1,5 @@ +from django.apps import AppConfig + + +class SoKConfig(AppConfig): + name = 'sok' diff --git a/sok/management/commands/citations.py b/sok/management/commands/citations.py new file mode 100644 index 0000000..9a981a5 --- /dev/null +++ b/sok/management/commands/citations.py @@ -0,0 +1,60 @@ +from typing import Set, Tuple + +from django.core.management.base import BaseCommand, CommandParser +from django.db.models import Count, Q + +from sok.models import Publication + + +class Command(BaseCommand): + + def echo(self, msg: str): + self.stdout.write(msg) + + # BaseCommand + + def add_arguments(self, parser: CommandParser): + parser.add_argument('--min-citations', type=int, default=0) + parser.add_argument('--pk', action='store_true') + + def graphviz(self, pk: bool, min_citations: int) -> None: + publications = Publication.objects.filter(exclusion_criteria__isnull=True).annotate( + citation_count=Count( + 'referenced_by', + filter=Q(exclusion_criteria__isnull=True), + distinct=True, + ), + ).filter(citation_count__gte=min_citations) + + self.echo("digraph G {") + self.echo("\trankdir = RL;") + + graph: Set[Tuple[int, int]] = set() + + for publication in publications: + + if publication.stage != 'primary': + continue + + for reference in 
publication.relevant_references: + + if reference not in publications: + continue + + graph.add((publication.pk, reference.pk)) + if (reference.pk, publication.pk) in graph: + self.stderr.write(self.style.ERROR( + f"CYCLE: {publication.cite_key} <-> {reference.cite_key}" + )) + + if pk: + self.echo(f'\t"{publication.pk}" -> "{reference.pk}";') + else: + self.echo(f'\t"{publication.cite_key}" -> "{reference.cite_key}";') + + self.echo("}") + + def handle(self, *args, **options) -> None: + min_citations: int = options['min_citations'] + pk: bool = options['pk'] + self.graphviz(pk, min_citations) diff --git a/sok/management/commands/dblpimport.py b/sok/management/commands/dblpimport.py new file mode 100644 index 0000000..67f1ef4 --- /dev/null +++ b/sok/management/commands/dblpimport.py @@ -0,0 +1,497 @@ +import html +import io +import pickle +import string +import xml.sax + +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional, Set, Tuple, Union +from urllib.parse import urlparse + +import requests + +from django.db import transaction +from django.core.management.base import BaseCommand, CommandParser, CommandError + +from sok.models import ( + Author, + Publication, + PublicationAuthor, + PublicationSource, + SearchTerm, + Source, +) + + +Attributes = xml.sax.xmlreader.AttributesImpl + + +PUBLICATIONS = { + 'article', + 'inproceedings', + 'proceedings', + 'book', + 'incollection', + 'phdthesis', + 'mastersthesis', + 'www', + 'person', + 'data', +} + +CITE_KEY_PREFIX = 'DBLP:' +DUMP_PATH = Path('dblp') / 'dblp-2021-03-01.xml' + + +def strip_cite_key_prefix(value: str) -> str: + if value.startswith(CITE_KEY_PREFIX): + return value[len(CITE_KEY_PREFIX):] + return value + + +def strip_issue_from_page(value: str) -> int: + return int(''.join(c for c in value.split(':')[-1] if c in string.digits)) + + +def clean_title(value: str) -> str: + if value.endswith('.'): + return value[:-1] 
+ return value + + +def parse_pages(raw: str) -> Tuple[int, int]: + # Observed: + # - '1-10' + # - '1' + # - '16:1-16:10' + # - 'I-X, 1-66' + # - '186-' + + pages = raw.split(', ')[-1].split('-') + + if 2 == len(pages): + first, last = pages + if last != '': + return (strip_issue_from_page(first), strip_issue_from_page(last)) + else: + pages = [first] + + if 1 == len(pages): + page = strip_issue_from_page(pages[0]) + return (page, page) + + raise NotImplementedError(f"Unexpected value for : {raw}") + + +class FinishedParsing(Exception): + pass + + +@dataclass(frozen=True) +class PublicationResult: + key: str + title: str + year: int + pages: Optional[Tuple[int, int]] + authors: List[str] = field(default_factory=list) + urls: List[str] = field(default_factory=list) + + @property + def cite_key(self) -> str: + return CITE_KEY_PREFIX + self.key + + @property + def doi(self) -> Optional[str]: + for url_str in self.urls: + url = urlparse(url_str) + if url.hostname is not None and url.hostname.endswith('doi.org'): + return url.path[1:] # Strip leading '/' + return None + + @property + def is_peer_reviewed(self) -> Optional[bool]: + """ + Heuristically determine whether a publication is peer reviewed. + """ + + # Preprint on arXiv.org + if self.key.startswith('journals/corr/abs-'): + return False + + # Consider conference proceedings, journal articles, and dissertations + # as peer reviewed. 
+ if any([ + self.key.startswith('phd/'), + self.key.startswith('conf/'), + self.key.startswith('journals/'), + ]): + return True + + return None + + @property + def first_page(self) -> Optional[int]: + if self.pages is None: + return None + return self.pages[0] + + @property + def last_page(self) -> Optional[int]: + if self.pages is None: + return None + return self.pages[1] + + @classmethod + def from_dump(cls, path: Path, keys: Set[str]) -> List['PublicationResult']: + parser = xml.sax.make_parser() + + # Enable DTD parsing + parser.setFeature(xml.sax.handler.feature_external_ges, True) + + handler = DBLPHandler(keys) + parser.setContentHandler(handler) + try: + parser.parse(path) + except FinishedParsing: + pass # Just a workaround to abort SAX parsing if all entries were found + + return handler.publications + + @classmethod + def from_api(cls, key: str) -> 'PublicationResult': + + url = f"https://dblp.uni-trier.de/rec/{key}.xml" + response = requests.get(url) + response.raise_for_status()  # actually call it, so HTTP errors raise + + parser = xml.sax.make_parser() + handler = DBLPHandler({key}) + parser.setContentHandler(handler) + try: + parser.parse(io.BytesIO(response.content)) + except FinishedParsing: + pass + + assert 1 == len(handler.publications) + + return handler.publications[0] + + @classmethod + def from_search_hit(cls, hit: Dict[str, Any]) -> 'PublicationResult': + info = hit['info'] + + pages: Optional[Tuple[int, int]] = None + if raw_pages := info.get('pages', None): + pages = parse_pages(raw_pages) + + # A single author is not a list, d'oh. 
+ authors = info.get('authors', dict()).get('author', []) + if type(authors) is not list: + authors = [authors] + + # TODO Parse URLs ('ee') + + return cls( + key=info['key'], + title=clean_title(html.unescape(info['title'])), + year=int(info['year']), + pages=pages, + authors=[html.unescape(author['text']) for author in authors], + ) + + @classmethod + def from_search( + cls, + search_term: str, + limit: int = 1000, + ) -> Tuple[str, List['PublicationResult'], int]: + # see https://dblp.uni-trier.de/faq/13501473.html + url = 'http://dblp.org/search/publ/api' + response = requests.get( + url, + params={ + 'q': search_term, + 'f': 0, + 'h': limit, + 'c': 0, + 'format': 'json', + }, + ) + response.raise_for_status()  # actually call it, so HTTP errors raise + search_result = response.json()['result'] + hits = search_result['hits'] + results = [cls.from_search_hit(hit) for hit in hits['hit']] + + total = hits['@total'] + # TODO re-request if len(results) < hits_total + + return (search_result['query'], results, total) + + +@dataclass +class DBLPFullHandler(xml.sax.handler.ContentHandler): + entries: Dict[str, str] = field(default_factory=dict) + + # ContentHandler + + def startElement(self, name: str, attributes: Attributes): + if name in PUBLICATIONS: + key = attributes.getValue('key') + self.entries[key] = name + + +def get_all_cite_keys(path: Path) -> Set[str]: + cache_path = path.with_suffix('.pickle') + cache: Dict[str, str] = dict() + if cache_path.exists(): + with cache_path.open('rb') as f: + cache = pickle.load(f) + else: + parser = xml.sax.make_parser() + + # Enable DTD parsing + parser.setFeature(xml.sax.handler.feature_external_ges, True) + + handler = DBLPFullHandler() + parser.setContentHandler(handler) + parser.parse(path) + + cache = handler.entries + with cache_path.open('wb') as f: + pickle.dump(cache, f) + + return {CITE_KEY_PREFIX + key for key in cache.keys()} + + +@dataclass +class DBLPHandler(xml.sax.handler.ContentHandler): + key_queue: Set[str] + tag_stack: List[str] = 
field(default_factory=list) + publications: List[PublicationResult] = field(default_factory=list) + key: Optional[str] = None + author: Optional[str] = None + authors: List[Author] = field(default_factory=list) + title: Optional[str] = None + year: Optional[int] = None + pages: Optional[Tuple[int, int]] = None + urls: List[str] = field(default_factory=list) + + @property + def current_tag(self) -> str: + assert 0 < len(self.tag_stack) + return self.tag_stack[-1] + + @property + def is_handling_publication(self) -> bool: + return self.key is not None + + def startElement(self, name: str, attributes: Attributes): + self.tag_stack.append(name) + + if name in PUBLICATIONS: + self.startPublication(name, attributes) + + if not self.is_handling_publication: + return + + if name == 'author': + self.author = '' + if name == 'title': + self.title = '' + + def endElement(self, name: str): + self.tag_stack.pop(-1) + + if self.is_handling_publication and name in PUBLICATIONS: + self.endPublication() + + if name == 'author' and self.author is not None: + self.authors.append(self.author) + self.author = None + + def characters(self, content: Union[bytes, str]): + assert isinstance(content, str) # TODO Handle bytes? 
+ + if not self.is_handling_publication: + return + + if 'author' in self.tag_stack: + assert self.author is not None + self.author += content + + if 'title' in self.tag_stack: + assert self.title is not None + self.title += content + + if self.current_tag == 'ee': + self.urls.append(content) + + if self.current_tag == 'year': + assert self.year is None + self.year = int(content) + + if self.current_tag == 'pages': + assert self.pages is None + self.pages = parse_pages(content) + + def startPublication(self, name: str, attributes: Attributes): + assert name in PUBLICATIONS + assert not self.is_handling_publication + assert 'key' in attributes + assert self.author is None + assert 0 == len(self.authors) + assert self.title is None + assert self.year is None + assert self.pages is None + assert 0 == len(self.urls) + + key = attributes.getValue('key') + if key not in self.key_queue: + return # This is not the publication you are looking for. + + self.key_queue.remove(key) + self.key = key + + def endPublication(self): + assert self.is_handling_publication + assert self.author is None + assert self.key is not None + assert self.title is not None + assert self.year is not None + assert 0 < len(self.authors) + + title = self.title + if title.endswith('.'): + title = title[:-1] + + publication = PublicationResult( + key=self.key, + title=title, + year=self.year, + authors=self.authors, + pages=self.pages, + urls=self.urls, + ) + self.publications.append(publication) + + self.key = None + self.authors = [] + self.title = None + self.year = None + self.pages = None + self.urls = [] + + if 0 == len(self.key_queue): + raise FinishedParsing + + +class Command(BaseCommand): + + def log_success(self, msg: str): + self.stdout.write(self.style.SUCCESS(msg)) + + def log_info(self, msg: str, nl: bool = True): + self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '') + self.stdout.flush() + + # BaseCommand + + def add_arguments(self, parser: CommandParser): + 
parser.add_argument('--use-api', action='store_true', default=False) + parser.add_argument('--search-term', default=None) + parser.add_argument('keys', nargs='+') + + @transaction.atomic + def handle(self, *args, **options): + use_api = options['use_api'] + source = Source.objects.get(name='DBLP') + + search_term: Optional[SearchTerm] = None + if name := options['search_term']: + search_term, created = SearchTerm.objects.get_or_create(name=name) + if created: + self.log_success(f"Created search term: {search_term}") + + cite_keys: Set[str] = set() + publications: List[Publication] = [] + for key in set(options['keys']): + try: + publication = Publication.objects.get(cite_key=key) + publications.append(publication) + except Publication.DoesNotExist: + if not key.startswith(CITE_KEY_PREFIX): + raise CommandError(f"Invalid cite key: {key}") + cite_keys.add(strip_cite_key_prefix(key)) + + if 0 < len(cite_keys): + + if use_api: + self.log_info("Querying DBLP... ", nl=False) + else: + self.log_info(f"Parsing DBLP dump '{DUMP_PATH}'... 
", nl=False) + start = datetime.now() + if use_api: + results: List[PublicationResult] = [] + for key in cite_keys: + result = PublicationResult.from_api(key) + results.append(result) + else: + results = PublicationResult.from_dump(DUMP_PATH, cite_keys) + end = datetime.now() + duration = end - start + self.log_success(f"done ({duration}).") + + for result in results: + + # Add authors to database + authors: List[Author] = [] + for name in result.authors: + author, created = Author.objects.get_or_create(name=name) + if created: + self.log_success(f"Added author: {author}") + else: + self.log_info(f"Author '{author}' alreay known") + authors.append(author) + + # Add publication to database + publication, created = Publication.objects.get_or_create( + cite_key=result.cite_key, + title=result.title, + year=result.year, + peer_reviewed=result.is_peer_reviewed, + first_page=result.first_page, + last_page=result.last_page, + doi=result.doi, + ) + if created: + self.log_success(f"Added publication: {publication}") + else: + self.log_info(f"Publication '{publication}' already known") + publications.append(publication) + + # Assign authors + for position, author in enumerate(authors): + publication_author, created = PublicationAuthor.objects.get_or_create( + author=author, + publication=publication, + position=position, + ) + if created: + self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}") + else: + self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'") + + # Assign sources + if search_term is not None: + for publication in publications: + publication_source, created = PublicationSource.objects.get_or_create( + source=source, + publication=publication, + search_term=search_term, + ) + if created: + self.log_success(f"Assigned source '{source}' to publication '{publication}' with search term '{search_term}'") + else: + self.log_info(f"Source '{source}' already assigned to 
publication '{publication}' with search term '{search_term}'") diff --git a/sok/management/commands/dblpsearch.py b/sok/management/commands/dblpsearch.py new file mode 100644 index 0000000..700521d --- /dev/null +++ b/sok/management/commands/dblpsearch.py @@ -0,0 +1,181 @@ +import pickle + +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, Optional, Set + +from django.core.management.base import BaseCommand, CommandError, CommandParser +from django.db import transaction + +import sok.management.commands.dblpimport as dblp + +from sok.management.commands.snowball import semanticscholar +from sok.models import ( + Author, + Publication, + PublicationAuthor, + PublicationSource, + SearchTerm, + SemanticScholar, + Source, +) + + +class Command(BaseCommand): + + def log_success(self, msg: str): + self.stdout.write(self.style.SUCCESS(msg)) + + def log_info(self, msg: str, nl: bool = True): + self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else '') + self.stdout.flush() + + def display_result(self, result: dblp.PublicationResult): + self.stdout.write("") + self.log_info(result.cite_key) + if 0 < len(result.authors): + self.stdout.write(" " + ", ".join([name for name in result.authors])) + self.log_info(" " + result.title, nl=False) + self.stdout.write(f" ({result.year})") + + def add_publication_source( + self, + publication: Publication, + source: Source, + search_term: SearchTerm, + ): + publication_source, created = PublicationSource.objects.get_or_create( + source=source, + publication=publication, + search_term=search_term, + ) + if created: + self.log_success(f"Assigned source '{source}' to publication '{publication}' with search term '{search_term}'") + else: + self.log_info(f"Source '{source}' already assigned to publication '{publication}' with search term '{search_term}'") + + @transaction.atomic + def store_result( + self, + result: dblp.PublicationResult, + source: Source, + search_term: 
SearchTerm, + paper_id: Optional[str], + ) -> Publication: + + # Store Authors + authors: List[Author] = [] + for name in result.authors: + author, created = Author.objects.get_or_create(name=name) + if created: + self.log_success(f"Added author: {author}") + else: + self.log_info(f"Author '{author}' already known") + authors.append(author) + + # Store Publication + publication = Publication( + cite_key=result.cite_key, + title=result.title, + year=result.year, + peer_reviewed=result.is_peer_reviewed, + first_page=result.first_page, + last_page=result.last_page, + ) + publication.full_clean() + publication.save() + self.log_success(f"Added publication: {publication}") + + # Assign authors to publication + for position, author in enumerate(authors): + publication_author, created = PublicationAuthor.objects.get_or_create( + author=author, + publication=publication, + position=position, + ) + if created: + self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}") + else: + self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'") + + if paper_id is not None: + s, created = SemanticScholar.objects.get_or_create(paper_id=paper_id, publication=publication) + if created: + self.log_success(f"Added Semantic Scholar '{paper_id}' to publication '{publication}'") + else: + self.log_info(f"Semantic Scholar '{paper_id}' for publication '{publication}' is already known") + + self.add_publication_source(publication, source, search_term) + return publication + + # BaseCommand + + def add_arguments(self, parser: CommandParser): + parser.add_argument('--reset-choices', action='store_true') + parser.add_argument('--limit', type=int, default=1000, help="1 – 1000 (default: 1000)") + parser.add_argument('term') + + def handle(self, *args, **options): + try: + limit: int = options['limit'] + if not (0 < limit <= 1000): + raise CommandError(f"Invalid value for 'limit': {limit}; allowed range is 1
– 1000") + reset_choices: bool = options['reset_choices'] + source = Source.objects.get(name='DBLP') + + path = Path('.choices.dblp.pickle') + cache: Dict[str, Set[str]] = defaultdict(set) + if reset_choices: + path.unlink(missing_ok=True) + elif path.exists(): + self.log_info("Loading previous choices (reset with --reset-choices)...", nl=False) + with path.open('rb') as f: + cache = pickle.load(f) + self.log_success("done") + + self.log_info("Querying DBLP... ", nl=False) + query, results, total = dblp.PublicationResult.from_search(options['term'], limit) + self.log_success(f"done, found {len(results)}/{total} publication(s)") + + # Create search term + search_term, created = SearchTerm.objects.get_or_create(name=query) + if created: + self.log_success(f"Created search term: {search_term}") + + # Add search term to existing entries + cite_keys = {result.cite_key for result in results} + existing: Set[str] = set() + for publication in Publication.objects.filter(cite_key__in=cite_keys): + existing.add(publication.cite_key) + self.add_publication_source(publication, source, search_term) + + # Promt the user for importing new entries + for result in results: + # Skip existing entries + if result.cite_key in existing.union(cache[query]): + continue + + self.display_result(result) + + # TODO Add abstract from semantic scholar + + data: Dict[str, Any] = dict() + if doi := result.doi: + data = semanticscholar(doi) + + while True: + choice = input("Import? [y/N], Show abstract? 
[a]: ").lower() + if choice in {'y', 'yes'}: + self.store_result(result, source, search_term, data.get('paperId', None)) + break + elif choice in {'', 'n', 'no'}: + # Store choice + cache[query].add(result.cite_key) + with path.open('wb') as f: + pickle.dump(cache, f) + break + elif choice == 'a': + if abstract := data.get('abstract', None): + self.stdout.write(abstract) + except KeyboardInterrupt: + raise CommandError("Aborted.") diff --git a/sok/management/commands/dblptex.py b/sok/management/commands/dblptex.py new file mode 100644 index 0000000..ed7c7b8 --- /dev/null +++ b/sok/management/commands/dblptex.py @@ -0,0 +1,20 @@ +import requests + +from django.core.management.base import BaseCommand, CommandParser + +import sok.management.commands.dblpimport as dblp + + +class Command(BaseCommand): + + # BaseCommand + + def add_arguments(self, parser: CommandParser): + parser.add_argument('key') + + def handle(self, *args, **options): + key = dblp.strip_cite_key_prefix(options['key']) + url = f'https://dblp.uni-trier.de/rec/{key}.bib?param=0' + response = requests.get(url) + response.raise_for_status + self.stdout.write(response.content.decode()) diff --git a/sok/management/commands/repair.py b/sok/management/commands/repair.py new file mode 100644 index 0000000..843f922 --- /dev/null +++ b/sok/management/commands/repair.py @@ -0,0 +1,111 @@ +from pprint import pprint +from time import sleep + +from django.db import transaction +from django.core.exceptions import ValidationError +from django.core.management.base import BaseCommand, CommandError + +import sok.management.commands.dblpimport as dblp + +from sok.management.commands.snowball import semanticscholar +from sok.models import Publication, PublicationReference, SemanticScholar + + +class Command(BaseCommand): + + def log_success(self, msg: str): + self.stdout.write(self.style.SUCCESS(msg)) + + def log_info(self, msg: str, nl: bool = True): + self.stdout.write(self.style.HTTP_INFO(msg), ending='\n' if nl else 
'') + self.stdout.flush() + + @transaction.atomic + def fix_references(self) -> None: + """ + Create relevant references to masters of referenced variants. + + If mulitple variants of a publication exist, only the master variant is + considered. However, relevant publications might reference a non-master + master-variant, e. g., a preprint. + + This command adds references to the master-variant, even though this + reference is not actually present in the publication. The reference + identifier is marked with a star, e. g., '[1]*'. + """ + + self.log_info("--- Searching for references to variants ---") + for publication in Publication.objects.filter(variant_of__isnull=False): + variant = publication.variant_of + origs = PublicationReference.objects.filter(reference=publication) + for orig in origs: + if PublicationReference.objects.filter(reference=variant, publication=orig.publication).exists(): + continue + fixed = PublicationReference( + reference=variant, + publication=orig.publication, + identifier=('' if orig.identifier is None else orig.identifier) + "*", + ) + try: + fixed.full_clean() + fixed.save() + self.log_success(f"Added reference: {publication} -- {fixed.identifier} -> {variant}") + except ValidationError as e: + raise CommandError(f"{publication} -- {fixed.identifier} -> {variant}: {e}") + + def fix_dblp(self): + self.log_info("--- Searching for entries not in the default DBLP dump ---") + keys_in_db = set( + Publication.objects.filter( + cite_key__startswith=dblp.CITE_KEY_PREFIX + ).values_list('cite_key', flat=True).distinct() + ) + keys_in_dump = dblp.get_all_cite_keys(dblp.DUMP_PATH) + + self.stdout.write(f"DB: {len(keys_in_db):8d}") + self.stdout.write(f"DBLP: {len(keys_in_dump):8d}") + pprint(keys_in_db - keys_in_dump) + + def find_missing_dois(self): + self.log_info("--- Searching for missing DOIs ---") + publications = Publication.objects.filter(doi__isnull=True) + keys = { + dblp.strip_cite_key_prefix(cite_key) + for cite_key in 
publications.values_list('cite_key', flat=True) + } + self.log_info("Parsing DBLP dump...") + results = dblp.PublicationResult.from_dump(dblp.DUMP_PATH, keys) + self.log_info("done") + + for result in results: + if doi := result.doi: + publication = publications.get(cite_key=result.cite_key) + publication.doi = doi + publication.full_clean() + publication.save() + self.log_success(f"Added DOI '{doi}' to publication: {publication}") + + def find_semanticscholar_ids(self): + self.log_info("--- Searching for paper IDs on Semantic Scholar ---") + publications = Publication.objects.filter( + doi__isnull=False, + semanticscholar__isnull=True, + ) + for publication in publications: + data = semanticscholar(publication.doi) + + paper_id = data['paperId'] + obj = SemanticScholar(paper_id=paper_id, publication=publication) + obj.full_clean() + obj.save() + self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}") + + sleep(2) # Throttle to avoid rate-limiting + + # BaseCommand + + def handle(self, *args, **options): + self.fix_references() + self.fix_dblp() + self.find_missing_dois() + self.find_semanticscholar_ids() diff --git a/sok/management/commands/snowball.py b/sok/management/commands/snowball.py new file mode 100644 index 0000000..b6e53c4 --- /dev/null +++ b/sok/management/commands/snowball.py @@ -0,0 +1,202 @@ +import hashlib +import json +import pickle + +from pathlib import Path +from time import sleep +from typing import Any, Dict, List, Set + +import requests + +from django.core.management.base import BaseCommand, CommandParser, CommandError +from tqdm import tqdm + +from sok.models import Publication, PublicationReference, SemanticScholar + + +def semanticscholar(identifier: str, include_unknown_references: bool = False) -> Dict[str, Any]: + """ + Retrieve information from the Semantic Scholar API. + + The identifier can be a DOI or the Semantic Scholar paper ID. 
+ + See: https://api.semanticscholar.org + """ + + url = f'https://api.semanticscholar.org/v1/paper/{identifier}' + params: Dict[str, Any] = dict() + if include_unknown_references: + params['include_unknown_references'] = 'true' + response = requests.get(url, params=params) + response.raise_for_status()  # abort on HTTP errors (4xx/5xx); bare attribute access was a no-op + return response.json() + + +class Command(BaseCommand): + + def echo(self, msg: str, bold: bool = False, nl: bool = True): + if bold: + msg = self.style.HTTP_INFO(msg) + tqdm.write(msg, end='\n' if nl else '') + #self.stdout.write(msg, ending='\n' if nl else '') + + def warn(self, msg: str): + self.echo(self.style.WARNING(msg)) + + def add_reference( + self, + publication: Publication, + reference: Publication, + is_reference: bool = True, + ): + try: + rel = PublicationReference.objects.get( + publication=publication, + reference=reference, + ) + if is_reference: + self.echo(f"Reference already known: {rel.identifier} {reference}") + else: + self.echo(f"Citation already known: {rel.identifier} {publication}") + except PublicationReference.DoesNotExist: + rel = PublicationReference( + publication=publication, + reference=reference, + ) + rel.full_clean() + rel.save() + if is_reference: + self.echo(f"Added reference: {reference}") + else: + self.echo(f"Added citation: {publication}") + + def display(self, obj: Dict[str, Any]): + self.echo("") + authors = [author['name'] for author in obj['authors']] + title = obj['title'] + self.echo(" " + ", ".join(authors)) + self.echo(f" {title}", bold=True, nl=False) + if year := obj.get('year', None): + self.echo(f" ({year})") + else: + self.echo("") + if venue := obj.get('venue', None): + self.echo(f" {venue}") + if doi := obj.get('doi', None): + self.echo(f" {doi}") + if paper_id := obj.get('paperId', None): + self.echo(f" {paper_id}") + + def get_identifier(self, obj: Dict[str, Any]) -> str: + if paper_id := obj.get('paperId', None): + return paper_id + raw = json.dumps(obj, sort_keys=True) + hasher = hashlib.blake2b() + 
hasher.update(raw.encode()) + return hasher.hexdigest() + + def handle_objs( + self, + base: Publication, + objs: List[Dict[str, Any]], + is_reference: bool, + ): + title = "Reference" if is_reference else "Citation" + if 0 < len(objs): + self.echo(f"--- {title}s ---") + for obj in tqdm(objs, unit=title.lower()): + if paper_id := obj.get('paperId', None): + try: + existing = SemanticScholar.objects.get(paper_id=paper_id) + if is_reference: + self.add_reference(base, existing.publication) + else: + self.add_reference(existing.publication, base, is_reference) + continue + except SemanticScholar.DoesNotExist: + if doi := obj.get('doi', None): + try: + publication = Publication.objects.get(doi=doi) + new = SemanticScholar(paper_id=paper_id, publication=publication) + new.full_clean() + new.save() + self.echo(f"New Semantic Scholar entry: {paper_id}") + if is_reference: + self.add_reference(base, new.publication) + else: + self.add_reference(new.publication, base, is_reference) + continue + except Publication.DoesNotExist: + pass + + identifier = self.get_identifier(obj) + if identifier in self.cache: + continue + + self.display(obj) + + paper_id = obj.get('paperId', None) + while True: + self.echo("Ignore? [Y/n]", nl=False) + if paper_id is not None: + self.echo(", Show abstract [a]", nl=False) + self.echo(": ") + choice = input().lower() + if choice in {'', 'y', 'yes'}: + # Store choice + self.cache.add(identifier) + with self.cache_path.open('wb') as f: + pickle.dump(self.cache, f) + break + elif choice in {'a'}: + assert paper_id is not None + data = semanticscholar(paper_id) + if abstract := data.get('abstract', None): + self.echo(abstract) + elif choice in {'', 'n', 'no'}: + # TODO Import? 
+ break + + # BaseCommand + + def add_arguments(self, parser: CommandParser): + parser.add_argument('--reset-choices', action='store_true') + parser.add_argument('--no-references', action='store_true') + parser.add_argument('--no-citations', action='store_true') + + def handle(self, *args, **options): + reset_choices: bool = options['reset_choices'] + no_citations: bool = options['no_citations'] + no_references: bool = options['no_references'] + + self.cache_path = Path('.choices.semanticscholar.pickle') + self.cache: Set[str] = set() + if reset_choices: + self.cache_path.unlink(missing_ok=True) + elif self.cache_path.exists(): + self.echo("Loading previous choices (reset with --reset-choices)...", nl=False) + with self.cache_path.open('rb') as f: + self.cache = pickle.load(f) + self.echo("done", bold=True) + + publications = Publication.objects.filter( + semanticscholar__isnull=False, + exclusion_criteria__isnull=True, + ) + try: + for publication in tqdm(publications, unit="publication"): + self.echo(f"=== Publication {publication} ===") + for semantic in publication.semanticscholar_set.all(): + data = semanticscholar(semantic.paper_id) + + if not no_references: + references: List[Dict[str, Any]] = data['references'] + self.handle_objs(publication, references, is_reference=True) + + if not no_citations: + citations: List[Dict[str, Any]] = data['citations'] + self.handle_objs(publication, citations, is_reference=False) + + sleep(2) # Throttle + except KeyboardInterrupt: + raise CommandError("Aborted.") diff --git a/sok/management/commands/stats.py b/sok/management/commands/stats.py new file mode 100644 index 0000000..f2eb59b --- /dev/null +++ b/sok/management/commands/stats.py @@ -0,0 +1,49 @@ +from typing import Set + +from django.core.management.base import BaseCommand +from django.db.models import Count, Q + +import sok.management.commands.dblpimport as dblp + +from sok.models import Publication, SearchTerm + + +class Command(BaseCommand): + + def echo(self, 
msg: str, bold=True): + if bold: + msg = self.style.HTTP_INFO(msg) + self.stdout.write(msg) + + # BaseCommand + + def handle(self, *args, **options): + publications_found: Set[str] = set() + publications_peer_reviewed: Set[str] = set() + publications_relevant: Set[str] = set() + + self.echo("Loading DBLP dump...") + all_cite_keys = dblp.get_all_cite_keys(dblp.DUMP_PATH) + + for search_term in SearchTerm.objects.all(): + # DBLP search result + self.echo(f"Searching DBLP for '{search_term}'") + query, results, total = dblp.PublicationResult.from_search(search_term.name, 1000) + for result in results: + if result.cite_key not in all_cite_keys: + continue + publications_found.add(result.cite_key) + if result.is_peer_reviewed: + publications_peer_reviewed.add(result.cite_key) + + # Relevant publications + for publication in Publication.objects.filter( + publicationsource__search_term=search_term, + exclusion_criteria__isnull=True, + ).distinct(): + publications_relevant.add(publication.cite_key) + + # Output + self.echo(f"Total publications: {len(publications_found):4d}", bold=True) + self.echo(f"- peer reviewed: {len(publications_peer_reviewed):4d}", bold=True) + self.echo(f"- relevant: {len(publications_relevant):4d}", bold=True) diff --git a/sok/management/commands/tagdag.py b/sok/management/commands/tagdag.py new file mode 100644 index 0000000..b5b9e3a --- /dev/null +++ b/sok/management/commands/tagdag.py @@ -0,0 +1,38 @@ +from typing import Set, Tuple + +from django.core.management.base import BaseCommand, CommandParser + +from sok.models import Tag + + +class Command(BaseCommand): + + def echo(self, msg: str): + self.stdout.write(msg) + + # BaseCommand + + def add_arguments(self, parser: CommandParser): + parser.add_argument('--root', default='CAPI Misuse') + + def _graphviz(self, root: Tag) -> None: + for tag in root.implied_by.all(): + edge = (tag.pk, root.pk) + if edge in self.graph: + continue + if edge[::-1] in self.graph: + 
self.stderr.write(self.style.ERROR(f"CYCLE: '{root}' <-> '{tag}'")) + self.graph.add(edge) + self._graphviz(tag) + self.echo(f'\t"{tag}" -> "{root}";') + + def graphviz(self, root: Tag) -> None: + self.echo("digraph G {") + self.echo("\trankdir = RL;") + self._graphviz(root) + self.echo("}") + + def handle(self, *args, **options) -> None: + self.graph: Set[Tuple[int, int]] = set() + root = Tag.objects.get(name=options['root']) + self.graphviz(root) diff --git a/sok/migrations/0001_initial.py b/sok/migrations/0001_initial.py new file mode 100644 index 0000000..90a99dd --- /dev/null +++ b/sok/migrations/0001_initial.py @@ -0,0 +1,161 @@ +# Generated by Django 3.1.7 on 2021-03-10 13:42 + +import django.core.validators +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='Author', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ], + ), + migrations.CreateModel( + name='ExclusionCriterion', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ('description', models.TextField(blank=True, default='')), + ], + options={ + 'verbose_name_plural': 'exclusion criteria', + }, + ), + migrations.CreateModel( + name='Publication', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('cite_key', models.CharField(blank=True, max_length=255, null=True, unique=True)), + ('title', models.CharField(max_length=255)), + ('year', models.PositiveSmallIntegerField()), + ('references_complete', models.BooleanField(default=False)), + ('peer_reviewed', models.BooleanField(default=None, null=True)), + ('classified', 
models.BooleanField(default=False)), + ('first_page', models.PositiveSmallIntegerField(blank=True, default=None, null=True)), + ('last_page', models.PositiveSmallIntegerField(blank=True, default=None, null=True)), + ('doi', models.CharField(blank=True, default=None, max_length=255, null=True, unique=True)), + ], + ), + migrations.CreateModel( + name='SearchTerm', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ], + ), + migrations.CreateModel( + name='Source', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ], + ), + migrations.CreateModel( + name='Tag', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ('criteria', models.TextField(blank=True)), + ('implies', models.ManyToManyField(blank=True, related_name='implied_by', to='sok.Tag')), + ], + ), + migrations.CreateModel( + name='SemanticScholar', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('paper_id', models.CharField(max_length=40, unique=True, validators=[django.core.validators.RegexValidator('^[a-f0-9]{40}$')])), + ('publication', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='sok.publication')), + ], + options={ + 'verbose_name_plural': 'semantic scholar', + }, + ), + migrations.CreateModel( + name='PublicationTag', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('comment', models.CharField(blank=True, max_length=255, null=True)), + ('publication', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='sok.publication')), + ('tag', 
models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='sok.tag')), + ], + options={ + 'unique_together': {('publication', 'tag')}, + }, + ), + migrations.CreateModel( + name='PublicationSource', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('publication', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='sok.publication')), + ('search_term', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='sok.searchterm')), + ('source', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='sok.source')), + ], + options={ + 'unique_together': {('publication', 'source', 'search_term')}, + }, + ), + migrations.CreateModel( + name='PublicationReference', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('identifier', models.CharField(blank=True, default=None, max_length=255, null=True)), + ('publication', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='sok.publication')), + ('reference', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='cited_by', to='sok.publication')), + ], + options={ + 'unique_together': {('publication', 'reference'), ('publication', 'identifier')}, + }, + ), + migrations.CreateModel( + name='PublicationAuthor', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('position', models.PositiveSmallIntegerField()), + ('author', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='sok.author')), + ('publication', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='sok.publication')), + ], + options={ + 'unique_together': {('publication', 'author'), ('publication', 'position')}, + }, + ), + migrations.AddField( + model_name='publication', + name='authors', + field=models.ManyToManyField(related_name='publications', 
through='sok.PublicationAuthor', to='sok.Author'), + ), + migrations.AddField( + model_name='publication', + name='exclusion_criteria', + field=models.ManyToManyField(blank=True, related_name='publications', to='sok.ExclusionCriterion'), + ), + migrations.AddField( + model_name='publication', + name='references', + field=models.ManyToManyField(related_name='referenced_by', through='sok.PublicationReference', to='sok.Publication'), + ), + migrations.AddField( + model_name='publication', + name='sources', + field=models.ManyToManyField(related_name='publications', through='sok.PublicationSource', to='sok.Source'), + ), + migrations.AddField( + model_name='publication', + name='tags', + field=models.ManyToManyField(related_name='publications', through='sok.PublicationTag', to='sok.Tag'), + ), + migrations.AddField( + model_name='publication', + name='variant_of', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='variants', to='sok.publication'), + ), + ] diff --git a/sok/migrations/__init__.py b/sok/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sok/models.py b/sok/models.py new file mode 100644 index 0000000..db8f558 --- /dev/null +++ b/sok/models.py @@ -0,0 +1,189 @@ +from typing import Optional, Set + +from django.core.validators import RegexValidator +from django.db import models +from django.db.models.query import QuerySet + + +class Author(models.Model): + name = models.CharField(max_length=255, unique=True) + + def __str__(self) -> str: + return self.name + + +class Tag(models.Model): + name = models.CharField(max_length=255, unique=True) + criteria = models.TextField(blank=True) + implies = models.ManyToManyField('Tag', related_name='implied_by', blank=True) + + @property + def transitive_publications(self) -> Set['Publication']: + publications: Set[Publication] = set(self.publications.filter(exclusion_criteria__isnull=True)) + for implied in self.implied_by.all(): + 
publications = publications.union(implied.transitive_publications) + return publications + + @property + def total_publications(self) -> int: + return len(self.transitive_publications) + + def __str__(self) -> str: + return self.name + + +class ExclusionCriterion(models.Model): + name = models.CharField(max_length=255, unique=True) + description = models.TextField(blank=True, default='') + + def __str__(self) -> str: + return self.name + + class Meta: + verbose_name_plural = "exclusion criteria" + + +class Source(models.Model): + name = models.CharField(max_length=255, unique=True) + + def __str__(self) -> str: + return self.name + + +class SearchTerm(models.Model): + name = models.CharField(max_length=255, unique=True) + + def __str__(self) -> str: + return self.name + + +class Publication(models.Model): + cite_key = models.CharField( + max_length=255, + unique=True, + blank=True, # TODO REMOVE + null=True, # TODO REMOVE + ) + title = models.CharField(max_length=255) + year = models.PositiveSmallIntegerField() + references_complete = models.BooleanField(default=False) + peer_reviewed = models.BooleanField(null=True, default=None) + classified = models.BooleanField(default=False) + first_page = models.PositiveSmallIntegerField(blank=True, null=True, default=None) + last_page = models.PositiveSmallIntegerField(blank=True, null=True, default=None) + doi = models.CharField(max_length=255, unique=True, blank=True, null=True, default=None) + + variant_of = models.ForeignKey( + 'Publication', + on_delete=models.CASCADE, + related_name='variants', + blank=True, + null=True, + ) + + authors = models.ManyToManyField(Author, related_name='publications', through='PublicationAuthor') + sources = models.ManyToManyField(Source, related_name='publications', through='PublicationSource') + references = models.ManyToManyField('Publication', related_name='referenced_by', through='PublicationReference', through_fields=('publication', 'reference')) + exclusion_criteria = 
models.ManyToManyField(ExclusionCriterion, related_name='publications', blank=True) + tags = models.ManyToManyField(Tag, related_name='publications', through='PublicationTag') + + @property + def is_peer_reviewed_or_cited_by_peer_reviewed(self) -> bool: + if self.peer_reviewed: + return True + for referenced_by in self.referenced_by.filter(): + if referenced_by.is_peer_reviewed_or_cited_by_peer_reviewed: + return True + return False + + @property + def is_relevant(self) -> bool: + return not self.exclusion_criteria.exists() + + @property + def relevant_references(self) -> QuerySet: + return self.references.filter(exclusion_criteria__isnull=True) + + @property + def relevant_referenced_by(self) -> QuerySet: + return self.referenced_by.filter(exclusion_criteria__isnull=True) + + @property + def stage(self) -> Optional[str]: + if not self.is_relevant: + return 'excluded' + + # Directly found by search term + if self.sources.exists(): + return 'primary' + + # Referenced by primary (backward snowballing) + # TODO make transitive + if self.referenced_by.filter(exclusion_criteria__isnull=True, sources__isnull=False): + return 'secondary' + + # References a primary (forward snowballing) + # TODO make transitive + if self.references.filter(exclusion_criteria__isnull=True, sources__isnull=False): + return 'tertiary' + + return None + + def __str__(self) -> str: + return self.cite_key + + +class SemanticScholar(models.Model): + paper_id = models.CharField( + max_length=40, + unique=True, + validators=[ + RegexValidator(r'^[a-f0-9]{40}$'), + ], + ) + publication = models.ForeignKey(Publication, on_delete=models.CASCADE) + + def __str__(self) -> str: + return self.paper_id + + class Meta: + verbose_name_plural = "semantic scholar" + + +# M:N Relationships + + +class PublicationAuthor(models.Model): + publication = models.ForeignKey(Publication, on_delete=models.CASCADE) + author = models.ForeignKey(Author, on_delete=models.CASCADE) + position = 
        # (continuation of PublicationAuthor.position) — zero-or-one-based
        # ordering index of the author in the publication's author list.
        models.PositiveSmallIntegerField()

    class Meta:
        # Each author appears at most once per publication, and each
        # position within a publication is taken at most once.
        unique_together = (('publication', 'author'), ('publication', 'position'))


class PublicationTag(models.Model):
    """Through model attaching a tag to a publication, with an optional
    comment explaining why the tag applies."""

    publication = models.ForeignKey(Publication, on_delete=models.CASCADE)
    tag = models.ForeignKey(Tag, on_delete=models.CASCADE)
    comment = models.CharField(max_length=255, blank=True, null=True)

    class Meta:
        unique_together = (('publication', 'tag'),)


class PublicationSource(models.Model):
    """Records that a publication was found in ``source`` via ``search_term``."""

    publication = models.ForeignKey(Publication, on_delete=models.CASCADE)
    source = models.ForeignKey(Source, on_delete=models.CASCADE)
    search_term = models.ForeignKey(SearchTerm, on_delete=models.CASCADE)

    class Meta:
        unique_together = (('publication', 'source', 'search_term'),)


class PublicationReference(models.Model):
    """Directed citation edge: ``publication`` cites ``reference``.

    ``identifier`` is presumably the reference label (e.g. "[12]") inside
    the citing publication — TODO confirm against importer code.
    """

    publication = models.ForeignKey(Publication, on_delete=models.CASCADE)
    reference = models.ForeignKey(Publication, on_delete=models.CASCADE, related_name='cited_by')
    identifier = models.CharField(max_length=255, blank=True, null=True, default=None)

    class Meta:
        # NOTE(review): SQL treats NULLs as distinct in unique constraints,
        # so ('publication', 'identifier') only constrains rows where an
        # identifier is set — presumably intended; verify.
        unique_together = (('publication', 'reference'), ('publication', 'identifier'))

# --- new file: sok/tests.py ---
from django.test import TestCase

# Create your tests here.

# --- new file: sok/views.py ---
from django.shortcuts import render

# Create your views here.

# --- new file: sokman/__init__.py (empty) ---

# --- new file: sokman/asgi.py ---
"""
ASGI config for sokman project.

It exposes the ASGI callable as a module-level variable named ``application``.
+ +For more information on this file, see +https://docs.djangoproject.com/en/3.1/howto/deployment/asgi/ +""" + +import os + +from django.core.asgi import get_asgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'sokman.settings') + +application = get_asgi_application() diff --git a/sokman/settings.py b/sokman/settings.py new file mode 100644 index 0000000..3264826 --- /dev/null +++ b/sokman/settings.py @@ -0,0 +1,134 @@ +""" +Django settings for sokman project. + +Generated by 'django-admin startproject' using Django 3.1.6. + +For more information on this file, see +https://docs.djangoproject.com/en/3.1/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/3.1/ref/settings/ +""" + +from pathlib import Path + +from django.core.management.utils import get_random_secret_key + +# Build paths inside the project like this: BASE_DIR / 'subdir'. +BASE_DIR = Path(__file__).resolve().parent.parent + + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/3.1/howto/deployment/checklist/ + +def get_or_generate_key() -> str: + path = Path(__file__ + '.secret') + + # Create secret key + if not path.exists(): + sk = get_random_secret_key() + path.write_text(sk) + + assert path.exists() + return path.read_text() + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = get_or_generate_key() + +# SECURITY WARNING: don't run with debug turned on in production! 
# Development default; must be False in any deployed environment.
DEBUG = True

ALLOWED_HOSTS = []


# Application definition

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    # Project app containing the systematization-of-knowledge models.
    'sok',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'sokman.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'sokman.wsgi.application'


# Database
# https://docs.djangoproject.com/en/3.1/ref/settings/#databases

# SQLite file in the project root; suitable for this single-user tool.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': BASE_DIR / 'db.sqlite3',
    }
}


# Password validation
# https://docs.djangoproject.com/en/3.1/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]


# Internationalization
# https://docs.djangoproject.com/en/3.1/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True
USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/3.1/howto/static-files/

STATIC_URL = '/static/'

# --- new file: sokman/urls.py ---
"""sokman URL Configuration

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/3.1/topics/http/urls/
Examples:
Function views
    1. Add an import: from my_app import views
    2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
    1. Add an import: from other_app.views import Home
    2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
    1. Import the include() function: from django.urls import include, path
    2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path

# The admin site is the only web UI; all other interaction happens through
# the management commands listed in the patch header.
urlpatterns = [
    path('admin/', admin.site.urls),
]

# --- new file: sokman/wsgi.py ---
"""
WSGI config for sokman project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/3.1/howto/deployment/wsgi/
"""

import os

from django.core.wsgi import get_wsgi_application

# The settings module must be configured before the application is built.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'sokman.settings')

application = get_wsgi_application()