|
@ -4,12 +4,14 @@ from tqdm import tqdm |
|
|
|
|
|
|
|
|
from django.db import transaction |
|
|
from django.db import transaction |
|
|
from django.core.exceptions import ValidationError |
|
|
from django.core.exceptions import ValidationError |
|
|
from django.core.management.base import BaseCommand, CommandError |
|
|
from django.core.management.base import BaseCommand, CommandError, CommandParser |
|
|
|
|
|
|
|
|
|
|
|
from typing import List, Optional, Tuple |
|
|
|
|
|
|
|
|
import sok.management.commands.dblpimport as dblp |
|
|
import sok.management.commands.dblpimport as dblp |
|
|
|
|
|
|
|
|
from sok.management.commands.snowball import semanticscholar |
|
|
from sok.management.commands.snowball import semanticscholar |
|
|
from sok.models import Publication, PublicationReference, SemanticScholar |
|
|
from sok.models import Publication, PublicationReference, SemanticScholar, Author, PublicationAuthor |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Command(BaseCommand): |
|
|
class Command(BaseCommand): |
|
@ -17,6 +19,9 @@ class Command(BaseCommand): |
|
|
def log_success(self, msg: str):
    """Write ``msg`` to the command's stdout, wrapped in the SUCCESS style."""
    styled = self.style.SUCCESS(msg)
    self.stdout.write(styled)
|
|
|
|
|
|
|
|
|
|
|
def log_warn(self, msg: str):
    """Write ``msg`` to the command's stdout, wrapped in the WARNING style."""
    styled = self.style.WARNING(msg)
    self.stdout.write(styled)
|
|
|
|
|
|
|
|
def log_info(self, msg: str, nl: bool = True):
    """Write ``msg`` to stdout in the HTTP_INFO style and flush immediately.

    When ``nl`` is false the line ending is suppressed (``ending=''``),
    so a later write continues on the same line.
    """
    ending = '\n' if nl else ''
    self.stdout.write(self.style.HTTP_INFO(msg), ending=ending)
    # Flush right away so a message written without a newline still shows up.
    self.stdout.flush()
|
@ -71,6 +76,7 @@ class Command(BaseCommand): |
|
|
data = semanticscholar(semantic.paper_id) |
|
|
data = semanticscholar(semantic.paper_id) |
|
|
if abstract := data['abstract']: |
|
|
if abstract := data['abstract']: |
|
|
publication.abstract = abstract |
|
|
publication.abstract = abstract |
|
|
|
|
|
if publication.peer_reviewed == None: publication.peer_reviewed = False |
|
|
try: |
|
|
try: |
|
|
publication.full_clean() |
|
|
publication.full_clean() |
|
|
publication.save() |
|
|
publication.save() |
|
@ -93,20 +99,61 @@ class Command(BaseCommand): |
|
|
publication.first_page = result.first_page |
|
|
publication.first_page = result.first_page |
|
|
publication.last_page = result.last_page |
|
|
publication.last_page = result.last_page |
|
|
print(publication.peer_reviewed, result) |
|
|
print(publication.peer_reviewed, result) |
|
|
# keep authors from semantic scholar for now, even though they might be a little broken? |
|
|
|
|
|
try: |
|
|
try: |
|
|
publication.full_clean() |
|
|
publication.full_clean() |
|
|
publication.save() |
|
|
publication.save() |
|
|
self.log_success(f"Added DBLP info for: {publication}") |
|
|
self.log_success(f"Added DBLP info for: {publication}") |
|
|
except ValidationError as e: |
|
|
except ValidationError as e: |
|
|
raise CommandError(f"{publication}: {e}") |
|
|
raise CommandError(f"{publication}: {e}") |
|
|
|
|
|
|
|
|
|
|
|
# Store Authors |
|
|
|
|
|
authors: List[Author] = [] |
|
|
|
|
|
for (pid, name) in result.authors: |
|
|
|
|
|
author, created = Author.objects.get_or_create(name=name) |
|
|
|
|
|
|
|
|
|
|
|
if created: |
|
|
|
|
|
self.log_success(f"Added author: {author}") |
|
|
|
|
|
else: |
|
|
|
|
|
self.log_info(f"Author '{author}' already known") |
|
|
|
|
|
if not author.dblp_id and pid != None: |
|
|
|
|
|
try: |
|
|
|
|
|
author.dblp_id = pid |
|
|
|
|
|
author.full_clean() |
|
|
|
|
|
author.save() |
|
|
|
|
|
self.log_success(f"Added pid to author: {author}") |
|
|
|
|
|
except ValidationError as e: |
|
|
|
|
|
raise CommandError(f"{author}: {e}") |
|
|
|
|
|
|
|
|
|
|
|
authors.append(author) |
|
|
|
|
|
|
|
|
|
|
|
# Replace authors for publication |
|
|
|
|
|
if len(authors) != len(PublicationAuthor.objects.filter(publication=publication)): print(authors, [pa.author for pa in PublicationAuthor.objects.filter(publication=publication)]) |
|
|
|
|
|
assert len(authors) >= len(PublicationAuthor.objects.filter(publication=publication)) |
|
|
|
|
|
for publication_author in PublicationAuthor.objects.filter(publication=publication): |
|
|
|
|
|
self.log_warn(f"Will now try to delete {publication_author.author} from {publication}") |
|
|
|
|
|
try: |
|
|
|
|
|
publication_author.delete() |
|
|
|
|
|
except e: |
|
|
|
|
|
raise CommandError(f"{publication} - {author}: {e}") |
|
|
|
|
|
|
|
|
|
|
|
for position, author in enumerate(authors): |
|
|
|
|
|
publication_author, created = PublicationAuthor.objects.get_or_create( |
|
|
|
|
|
author=author, |
|
|
|
|
|
publication=publication, |
|
|
|
|
|
position=position, |
|
|
|
|
|
) |
|
|
|
|
|
if created: |
|
|
|
|
|
self.log_success(f"Assigned author '{author}' to publication '{publication}' at position {position}") |
|
|
|
|
|
else: |
|
|
|
|
|
self.log_info(f"Author '{author}' already assigned to publication '{publication}' at position '{position}'") |
|
|
|
|
|
|
|
|
else: continue # quite definitely not the same publication |
|
|
else: continue # quite definitely not the same publication |
|
|
elif publication.title == result.title and publication.year == result.year: |
|
|
elif publication.title == result.title and publication.year == result.year: |
|
|
print(f"Not quite certain about {result.cite_key} for publication {publication.title}") |
|
|
self.log_warn(f"Not quite certain about {result.cite_key} for publication {publication.title}") |
|
|
else: continue # I'd rather look at that in detail for now |
|
|
else: continue # I'd rather look at that in detail for now |
|
|
|
|
|
|
|
|
def find_secondary_on_dblp(self):
    """Call ``search_on_dblp`` for every non-excluded publication lacking a DBLP cite key.

    Publications whose ``cite_key`` already starts with
    ``dblp.CITE_KEY_PREFIX`` are left out of the query; publications whose
    ``stage`` equals ``'excluded'`` are skipped inside the loop.
    """
    self.log_info("--- Searching for snowballed sources on DBLP ---")
    candidates = Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX)
    for publication in tqdm(candidates, unit="publication"):
        if publication.stage == 'excluded':
            continue
        self.search_on_dblp(publication)
|
@ -117,7 +164,7 @@ class Command(BaseCommand): |
|
|
).values_list('cite_key', flat=True).distinct() |
|
|
).values_list('cite_key', flat=True).distinct() |
|
|
) |
|
|
) |
|
|
n = len(keys) |
|
|
n = len(keys) |
|
|
self.log_info(f"Found {n} publications that still need to be checked") |
|
|
self.log_info(f"Found {n} publications that still need to be verified") |
|
|
|
|
|
|
|
|
def fix_dblp(self): |
|
|
def fix_dblp(self): |
|
|
self.log_info("--- Searching for entries not in the default DBLP dump ---") |
|
|
self.log_info("--- Searching for entries not in the default DBLP dump ---") |
|
@ -170,11 +217,19 @@ class Command(BaseCommand): |
|
|
sleep(2) # Throttle to avoid rate-limiting |
|
|
sleep(2) # Throttle to avoid rate-limiting |
|
|
|
|
|
|
|
|
# BaseCommand |
|
|
# BaseCommand |
|
|
|
|
|
def add_arguments(self, parser: CommandParser):
    """Register the command's boolean switches on ``parser``.

    Every switch is a ``store_true`` flag, so it defaults to ``False``:

    * ``-b`` / ``--abstract``
    * ``-d`` / ``--dblp``
    * ``-a`` / ``--authors``
    * ``-s`` / ``--secondary``

    NOTE(review): ``handle`` as visible in this file reads ``abstract``,
    ``dblp`` and ``authors`` but never ``secondary`` -- confirm whether
    that flag is consumed somewhere else.
    """
    switches = (
        ('-b', '--abstract'),
        ('-d', '--dblp'),
        ('-a', '--authors'),
        ('-s', '--secondary'),
    )
    for short_flag, long_flag in switches:
        parser.add_argument(short_flag, long_flag, action='store_true')
|
|
|
|
|
|
|
|
def handle(self, *args, **options):
    """Run the repair steps in order, gated by the parsed command-line flags.

    ``fix_references`` and ``find_secondary_on_dblp`` always run.
    ``fix_abstracts`` runs only when ``options['abstract']`` is truthy,
    the three DBLP-related steps only when ``options['dblp']`` is truthy,
    and ``fix_authors`` only when ``options['authors']`` is truthy.
    """
    self.fix_references()
    if options['abstract']:
        self.fix_abstracts()
    # NOTE(review): this step runs unconditionally; the ``secondary`` option
    # is not consulted here -- confirm that is intended.
    self.find_secondary_on_dblp()
    if options['dblp']:
        # NOTE(review): grouping all three calls under this flag is
        # reconstructed from a diff whose indentation was lost -- confirm
        # against the real file.
        self.fix_dblp()
        self.find_missing_dois()
        self.find_semanticscholar_ids()
    if options['authors']:
        self.fix_authors()
|
|
|
|
|
|
|
|