|
@ -1,5 +1,6 @@ |
|
|
from pprint import pprint |
|
|
from pprint import pprint |
|
|
from time import sleep |
|
|
from time import sleep |
|
|
|
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
from django.db import transaction |
|
|
from django.db import transaction |
|
|
from django.core.exceptions import ValidationError |
|
|
from django.core.exceptions import ValidationError |
|
@ -51,8 +52,73 @@ class Command(BaseCommand): |
|
|
fixed.save() |
|
|
fixed.save() |
|
|
self.log_success(f"Added reference: {publication} -- {fixed.identifier} -> {variant}") |
|
|
self.log_success(f"Added reference: {publication} -- {fixed.identifier} -> {variant}") |
|
|
except ValidationError as e: |
|
|
except ValidationError as e: |
|
|
|
|
|
print(orig.publication, origs, variant, fixed.identifier, fixed.reference, fixed.publication) |
|
|
raise CommandError(f"{publication} -- {fixed.identifier} -> {variant}: {e}") |
|
|
raise CommandError(f"{publication} -- {fixed.identifier} -> {variant}: {e}") |
|
|
|
|
|
|
|
|
|
|
|
@transaction.atomic
def fix_abstracts(self) -> None:
    """
    Add abstracts to those publications that have one on SemanticScholar.

    If multiple variants of a publication exist, only the master variant is
    considered.

    Raises:
        CommandError: If a publication no longer validates after its
            abstract has been set.
    """

    self.log_info("--- Searching for publications without abstracts ---")

    # The filter on the reverse `semanticscholar` relation joins one row per
    # related entry, so distinct() keeps each publication from showing up
    # (and being processed) more than once.
    eligible = Publication.objects.filter(
        abstract__isnull=True,
        variant_of__isnull=True,
        semanticscholar__isnull=False,
    ).distinct()
    without_abstract = Publication.objects.filter(abstract__isnull=True)

    # count() lets the database do the counting instead of loading every row.
    self.log_info(
        f"{eligible.count()} eligible publications found, "
        f"{without_abstract.count()} without abstract"
    )

    for publication in tqdm(eligible, unit="abstract"):
        for semantic in publication.semanticscholar_set.all():
            data = semanticscholar(semantic.paper_id)

            # .get() instead of ['abstract']: a payload without that key
            # (presumably an error response, cf. the 'error' check used
            # elsewhere in this command -- TODO confirm) must not abort the
            # whole run with a KeyError.
            if abstract := data.get('abstract'):
                publication.abstract = abstract
                try:
                    publication.full_clean()
                    publication.save()
                except ValidationError as e:
                    raise CommandError(f"{publication}: {e}") from e
                self.log_success(f"Added abstract for: {publication}")

            sleep(2)  # Throttle to avoid rate-limiting
|
|
|
|
|
|
|
|
|
|
|
@transaction.atomic
def search_on_dblp(self, publication: Publication) -> None:
    """
    Search DBLP for `publication` by title and adopt DBLP's metadata on a
    DOI match.

    For every search result whose DOI equals the publication's DOI
    (compared case-insensitively), the cite key, year, page range and, if
    DBLP provides it, the peer-review flag are copied onto the publication,
    which is then validated and saved. Results that only match on title and
    year are reported on stdout but not applied.

    Args:
        publication: The publication to look up; modified and saved in place
            when a DOI match is found.

    Raises:
        CommandError: If the updated publication fails validation.
    """

    # NOTE(review): the second argument (100) is presumably the maximum
    # number of results requested -- confirm against
    # dblp.PublicationResult.from_search.
    _, results, total = dblp.PublicationResult.from_search(publication.title, 100)
    if total == 0:
        return

    for result in results:
        if publication.doi and result.doi:
            if publication.doi.lower() != result.doi.lower():
                continue  # quite definitely not the same publication

            publication.cite_key = result.cite_key
            publication.year = result.year
            # Only overwrite the flag when DBLP actually provides a value.
            if result.is_peer_reviewed is not None:
                publication.peer_reviewed = result.is_peer_reviewed
            publication.first_page = result.first_page
            publication.last_page = result.last_page
            print(publication.peer_reviewed, result)

            # keep authors from semantic scholar for now, even though they might be a little broken?
            try:
                publication.full_clean()
                publication.save()
            except ValidationError as e:
                raise CommandError(f"{publication}: {e}") from e
            self.log_success(f"Added DBLP info for: {publication}")

        elif publication.title == result.title and publication.year == result.year:
            # No DOI to compare on at least one side: report, but do not
            # apply. I'd rather look at that in detail for now.
            print(f"Not quite certain about {result.cite_key} for publication {publication.title}")
|
|
|
|
|
|
|
|
|
|
|
def find_secondary_on_dblp(self):
    """
    Run a DBLP search for every publication whose cite key does not start
    with the DBLP prefix, then report how many such cite keys are left.

    Publications in stage 'excluded' are skipped. After each search the
    command pauses for two seconds.
    """

    self.log_info("--- Searching for secondary and tertiary sources in the default DBLP dump ---")

    candidates = Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX)
    for candidate in tqdm(candidates, unit="publication"):
        if candidate.stage != 'excluded':
            self.search_on_dblp(candidate)
            sleep(2)  # Throttle to avoid rate-limiting

    # Query again after the loop: the searches above may have assigned DBLP
    # cite keys in the meantime.
    leftover = Publication.objects.exclude(cite_key__startswith=dblp.CITE_KEY_PREFIX)
    distinct_keys = leftover.values_list('cite_key', flat=True).distinct()
    n = len(set(distinct_keys))
    self.log_info(f"Found {n} publications that still need to be checked")
|
|
|
|
|
|
|
|
def fix_dblp(self): |
|
|
def fix_dblp(self): |
|
|
self.log_info("--- Searching for entries not in the default DBLP dump ---") |
|
|
self.log_info("--- Searching for entries not in the default DBLP dump ---") |
|
|
keys_in_db = set( |
|
|
keys_in_db = set( |
|
@ -93,12 +159,13 @@ class Command(BaseCommand): |
|
|
) |
|
|
) |
|
|
for publication in publications: |
|
|
for publication in publications: |
|
|
data = semanticscholar(publication.doi) |
|
|
data = semanticscholar(publication.doi) |
|
|
|
|
|
if not 'error' in data: |
|
|
paper_id = data['paperId'] |
|
|
paper_id = data['paperId'] |
|
|
obj = SemanticScholar(paper_id=paper_id, publication=publication) |
|
|
obj = SemanticScholar(paper_id=paper_id, publication=publication) |
|
|
obj.full_clean() |
|
|
obj.full_clean() |
|
|
obj.save() |
|
|
obj.save() |
|
|
self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}") |
|
|
self.log_success(f"Set semanticscholar ID for publication '{publication}': {paper_id}") |
|
|
|
|
|
else: print(publication) |
|
|
|
|
|
|
|
|
sleep(2) # Throttle to avoid rate-limiting |
|
|
sleep(2) # Throttle to avoid rate-limiting |
|
|
|
|
|
|
|
@ -106,6 +173,8 @@ class Command(BaseCommand): |
|
|
|
|
|
|
|
|
def handle(self, *args, **options): |
|
|
def handle(self, *args, **options): |
|
|
self.fix_references() |
|
|
self.fix_references() |
|
|
|
|
|
self.fix_abstracts() |
|
|
|
|
|
self.find_secondary_on_dblp() |
|
|
self.fix_dblp() |
|
|
self.fix_dblp() |
|
|
self.find_missing_dois() |
|
|
self.find_missing_dois() |
|
|
self.find_semanticscholar_ids() |
|
|
self.find_semanticscholar_ids() |
|
|