Skip to content

Commit 4e354bb

Browse files
Merge pull request #81 from pitangainnovare/add-support-to-dataverse
Adiciona suporte à contagem de acessos Dataverse e adequa models
2 parents 8b4a3f5 + 8c039e0 commit 4e354bb

14 files changed

Lines changed: 437 additions & 167 deletions

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.5.1
1+
1.6.0
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Generated by Django 5.0.7 on 2025-04-01 01:09
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Replace Article.pdfs with a generic ``files`` JSON field and add ``pid_generic``."""

    dependencies = [
        ("article", "0001_initial"),
        ("collection", "0001_initial"),
    ]

    operations = [
        # JSON container for associated files; supersedes the removed "pdfs" field.
        migrations.AddField(
            model_name="article",
            name="files",
            field=models.JSONField(
                blank=True, default=dict, null=True, verbose_name="Files"
            ),
        ),
        # Generic identifier for records without a SciELO PID (e.g. preprints, datasets).
        migrations.AddField(
            model_name="article",
            name="pid_generic",
            field=models.CharField(
                blank=True,
                db_index=True,
                max_length=50,
                null=True,
                verbose_name="PID Generic",
            ),
        ),
        migrations.RemoveField(
            model_name="article",
            name="pdfs",
        ),
        # pid_generic becomes part of the natural-key uniqueness constraint.
        migrations.AlterUniqueTogether(
            name="article",
            unique_together={
                ("collection", "scielo_issn", "pid_v2", "pid_v3", "pid_generic")
            },
        ),
    ]

article/models.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,16 @@ class Article(CommonControlField):
3939
db_index=True,
4040
)
4141

42-
pdfs = models.JSONField(
43-
verbose_name=_('Format with Language'),
42+
pid_generic = models.CharField(
43+
verbose_name=_('PID Generic'),
44+
max_length=50,
45+
blank=True,
46+
null=True,
47+
db_index=True,
48+
)
49+
50+
files = models.JSONField(
51+
verbose_name=_('Files'),
4452
null=True,
4553
blank=True,
4654
default=dict,
@@ -92,9 +100,10 @@ def metadata(cls, collection=None):
92100
yield {
93101
'collection': a.collection.acron3,
94102
'default_lang': a.default_lang,
95-
'pdfs': a.pdfs,
103+
'files': a.files,
96104
'pid_v2': a.pid_v2,
97105
'pid_v3': a.pid_v3,
106+
'pid_generic': a.pid_generic,
98107
'processing_date': a.processing_date,
99108
'publication_date': a.publication_date,
100109
'publication_year': a.publication_year,
@@ -110,4 +119,5 @@ class Meta:
110119
'scielo_issn',
111120
'pid_v2',
112121
'pid_v3',
122+
'pid_generic',
113123
)

article/tasks.py

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from collection.models import Collection
88
from config import celery_app
9-
from core.utils import date_utils
9+
from core.utils import date_utils, standardizer
1010
from core.utils.utils import _get_user
1111

1212
from journal.models import Journal
@@ -54,7 +54,7 @@ def task_load_article_from_article_meta(self, from_date=None, until_date=None, d
5454

5555
article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=jou.scielo_issn, pid_v2=obj.get('code'))
5656
if created or force_update:
57-
article.pdfs = obj.get('pdfs') or {}
57+
article.files = obj.get('files') or {}
5858
article.processing_date = obj.get('processing_date') or ''
5959
article.publication_date = obj.get('publication_date') or ''
6060
article.publication_year = obj.get('publication_year') or ''
@@ -119,12 +119,12 @@ def task_load_article_from_opac(self, collection='scl', from_date=None, until_da
119119
return True
120120

121121

122-
@celery_app.task(bind=True, name=_('Load preprint data from Preprints Server'), timelimit=-1)
122+
@celery_app.task(bind=True, name=_('Load preprint data from SciELO Preprints'), timelimit=-1)
123123
def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None):
124124
user = _get_user(self.request, username=username, user_id=user_id)
125125

126126
from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
127-
logging.info(f'Loading preprints from Preprints Server. From: {from_date}, Until: {until_date}')
127+
logging.info(f'Loading preprints from SciELO Preprints. From: {from_date}, Until: {until_date}')
128128

129129
col_obj = Collection.objects.get(acron3='preprints')
130130
if not col_obj:
@@ -134,12 +134,11 @@ def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None
134134
for record in utils.fetch_preprint_oai_pmh(from_date, until_date):
135135
data = utils.extract_preprint_data(record)
136136

137-
if not data.get('pid_v2'):
138-
logging.error(f'PIDv2 not found in record: {record}')
137+
if not data.get('pid_generic'):
138+
logging.error(f'Preprint ID not found in record: {record}')
139139
continue
140140

141-
# Currently, we are using the record.header.identifier as the PIDv2
142-
article, created = models.Article.objects.get_or_create(collection=col_obj, pid_v2=data['pid_v2'])
141+
article, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=data['pid_generic'])
143142
if created or force_update:
144143
article.text_langs = data.get('text_langs')
145144
article.default_lang = data.get('default_language')
@@ -151,3 +150,39 @@ def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None
151150

152151
article.save()
153152
logging.debug(f'Article {"created" if created else "updated"}: {article}')
153+
154+
155+
@celery_app.task(bind=True, name=_('Load dataset metadata from Dataverse'), timelimit=-1)
def task_load_dataset_metadata_from_dataverse(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None):
    """Collect dataset metadata from SciELO Data (Dataverse) and upsert Article rows.

    Each record yielded by ``utils.fetch_dataverse_metadata`` describes one file of a
    dataset; datasets are keyed by their DOI stored in ``pid_generic``.

    Returns True on completion, False when the 'data' collection is missing.
    """
    user = _get_user(self.request, username=username, user_id=user_id)

    from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
    logging.info(f'Loading dataset metadata from SciELO Data. From: {from_date}, Until: {until_date}')

    # BUGFIX: .get() raises DoesNotExist instead of returning a falsy value, so the
    # original "if not col_obj" branch was unreachable; .filter().first() makes it work.
    col_obj = Collection.objects.filter(acron3='data').first()
    if not col_obj:
        logging.error(f'Collection not found: data')
        return False

    for record in utils.fetch_dataverse_metadata(from_date, until_date):
        dataset_doi = record.get('dataset_doi')
        if not dataset_doi:
            logging.error(f'Dataset DOI not found in record: {record}')
            continue

        dataset, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=dataset_doi)
        if created or force_update:
            dataset.publication_date = record.get('dataset_published')

            file_persistent_id = record.get('file_persistent_id')
            file_id = record.get('file_id')
            file_name = record.get('file_name')
            file_url = record.get('file_url')

            if file_id:
                # files is null=True, so rows may hold None instead of a dict.
                if not dataset.files:
                    dataset.files = {}
                # BUGFIX: key was misspelled 'file_persisent_id' in the original.
                dataset.files[file_id] = {'name': file_name, 'url': file_url, 'file_persistent_id': file_persistent_id}

            dataset.save()
            logging.debug(f'Dataset {"created" if created else "updated"}: {dataset}')

    return True

article/utils.py

Lines changed: 85 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,30 @@
1-
import langcodes
21
import logging
32
import requests
43
import os
54

65
from sickle import Sickle
76
from time import sleep
87

8+
from core.utils import standardizer
9+
910

1011
ARTICLEMETA_ENDPOINT = os.environ.get('ARTICLEMETA_COLLECT_URL', 'http://articlemeta.scielo.org/api/v1/article/counter_dict')
11-
ARTICLEMETA_MAX_RETRIES = int(os.environ.get('ARTICLEMETA_COLLECT_MAX_RETRIES', 5))
12-
ARTICLEMETA_SLEEP_TIME = int(os.environ.get('ARTICLEMETA_COLLECT_URL_SLEEP_TIME', 30))
12+
ARTICLEMETA_MAX_RETRIES = int(os.environ.get('ARTICLEMETA_MAX_RETRIES', 5))
13+
ARTICLEMETA_SLEEP_TIME = int(os.environ.get('ARTICLEMETA_SLEEP_TIME', 30))
1314

1415
OPAC_ENDPOINT = os.environ.get('OPAC_ENDPOINT', 'https://www.scielo.br/api/v1/counter_dict')
15-
OPAC_MAX_RETRIES = int(os.environ.get('OPAC_COLLECT_MAX_RETRIES', 5))
16-
OPAC_SLEEP_TIME = int(os.environ.get('OPAC_COLLECT_URL_SLEEP_TIME', 30))
16+
OPAC_MAX_RETRIES = int(os.environ.get('OPAC_MAX_RETRIES', 5))
17+
OPAC_SLEEP_TIME = int(os.environ.get('OPAC_SLEEP_TIME', 30))
1718

1819
OAI_PMH_PREPRINT_ENDPOINT = os.environ.get('OAI_PMH_PREPRINT_ENDPOINT', 'https://preprints.scielo.org/index.php/scielo/oai')
1920
OAI_METADATA_PREFIX = os.environ.get('OAI_METADATA_PREFIX', 'oai_dc')
2021
OAI_PMH_MAX_RETRIES = int(os.environ.get('OAI_PMH_MAX_RETRIES', 5))
2122

23+
DATAVERSE_ENDPOINT = os.environ.get('DATAVERSE_ENDPOINT', 'https://data.scielo.org/api')
24+
DATAVERSE_ROOT_COLLECTION = os.environ.get('DATAVERSE_ROOT_COLLECTION', 'scielodata')
25+
DATAVERSE_MAX_RETRIES = int(os.environ.get('DATAVERSE_MAX_RETRIES', 5))
26+
DATAVERSE_SLEEP_TIME = int(os.environ.get('DATAVERSE_SLEEP_TIME', 30))
27+
2228

2329
def fetch_article_meta_dict(from_date, until_date, offset=0, limit=1000, collection=None, issn=None):
2430
for t in range(1, ARTICLEMETA_MAX_RETRIES + 1):
@@ -91,14 +97,14 @@ def fetch_preprint_oai_pmh(from_date, until_date):
9197

9298

9399
def extract_preprint_data(record):
94-
pid_v2 = _extract_preprint_compatible_identifer(record.header.identifier)
95-
text_langs = [_standardize_langcode(l) for l in record.metadata.get('language', [])]
100+
pid_generic = _extract_preprint_compatible_identifer(record.header.identifier)
101+
text_langs = [standardizer.standardize_language_code(l) for l in record.metadata.get('language', [])]
96102
publication_date = record.metadata.get('date', [''])[0]
97103
default_language = text_langs[0] if text_langs else ''
98104
publication_year = _extract_preprint_publication_year_from_date(publication_date)
99105

100106
data = {
101-
'pid_v2': pid_v2,
107+
'pid_generic': pid_generic,
102108
'text_langs': text_langs,
103109
'publication_date': publication_date,
104110
'default_language': default_language,
@@ -124,12 +130,75 @@ def _extract_preprint_publication_year_from_date(date_str):
124130
return ''
125131

126132

127-
def _standardize_langcode(language):
128-
if langcodes.tag_is_valid(language):
129-
return langcodes.standardize_tag(language)
133+
def fetch_dataverse_metadata(from_date=None, until_date=None):
    """Yield one dict per file of each published dataset under the SciELO Data root.

    Walks root collection -> subdataverses -> datasets -> files using the Dataverse
    native API. ``from_date``/``until_date`` are compared as ISO "YYYY-MM-DD" strings,
    which matches chronological order.

    Yields dicts with keys: title, dataset_doi, dataset_published, file_id,
    file_name, file_url, file_persistent_id.
    """

    def get_json_data(url):
        # Shared HTTP helper replacing three near-identical inner functions.
        # Retries up to DATAVERSE_MAX_RETRIES times (the constant was previously
        # defined but unused), mirroring the other fetchers in this module.
        for attempt in range(1, DATAVERSE_MAX_RETRIES + 1):
            try:
                response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME)
                response.raise_for_status()
                return response.json().get("data", [])
            except requests.exceptions.RequestException as e:
                logging.error(f"Error fetching {url} (attempt {attempt}/{DATAVERSE_MAX_RETRIES}): {e}")
                if attempt < DATAVERSE_MAX_RETRIES:
                    sleep(DATAVERSE_SLEEP_TIME)
        return []

    subdataverses = get_json_data(f"{DATAVERSE_ENDPOINT}/dataverses/{DATAVERSE_ROOT_COLLECTION}/contents")

    for subdataverse in subdataverses:
        if subdataverse["type"] != "dataverse":
            continue

        subdataverse_id = subdataverse["id"]
        subdataverse_title = subdataverse["title"]
        datasets = get_json_data(f"{DATAVERSE_ENDPOINT}/dataverses/{subdataverse_id}/contents")

        for dataset in datasets:
            if dataset["type"] != "dataset":
                continue

            dataset_id = dataset["id"]
            doi = standardizer.standardize_doi(dataset.get("persistentUrl"))
            if not doi:
                logging.warning(f"Dataset {dataset_id} does not have a DOI.")
                continue

            publication_date = dataset.get("publicationDate", None)

            # Datasets without a publicationDate pass through the date filter.
            if publication_date:
                if (from_date and publication_date < from_date) or (until_date and publication_date > until_date):
                    continue

            files = get_json_data(f"{DATAVERSE_ENDPOINT}/datasets/{dataset_id}/versions/:latest/files")

            for file in files:
                file_persistent_id = file["dataFile"].get("persistentId", None)
                file_persistent_id_stz = standardizer.standardize_pid_generic(file_persistent_id) if file_persistent_id else None

                yield {
                    "title": subdataverse_title,
                    "dataset_doi": doi,
                    "dataset_published": publication_date,
                    "file_id": file["dataFile"]["id"],
                    "file_name": file["label"],
                    "file_url": f"{DATAVERSE_ENDPOINT}/access/datafile/{file['dataFile']['id']}",
                    "file_persistent_id": file_persistent_id_stz,
                }

article/wagtail_hooks.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,9 @@ class ArticleSnippetViewSet(SnippetViewSet):
2020
"scielo_issn",
2121
"pid_v2",
2222
"pid_v3",
23-
"pdfs",
24-
"default_lang",
25-
"text_langs",
26-
"processing_date",
23+
"pid_generic",
24+
"files",
2725
"publication_date",
28-
"publication_year",
2926
)
3027
list_filter = (
3128
"collection",
@@ -36,6 +33,7 @@ class ArticleSnippetViewSet(SnippetViewSet):
3633
"scielo_issn",
3734
"pid_v2",
3835
"pid_v3",
36+
"pid_generic",
3937
)
4038

4139
register_snippet(ArticleSnippetViewSet)

0 commit comments

Comments (0)