1- import langcodes
21import logging
32import requests
43import os
54
65from sickle import Sickle
76from time import sleep
87
8+ from core .utils import standardizer
9+
910
# Endpoints and retry/back-off settings for the external collection services.
# Every value can be overridden through the environment; the literals below
# are the production defaults.
# Fix: the env-var key strings previously contained a trailing space
# (e.g. 'ARTICLEMETA_MAX_RETRIES '), so os.environ.get could never find the
# override and the default was always used.
ARTICLEMETA_ENDPOINT = os.environ.get('ARTICLEMETA_COLLECT_URL', 'http://articlemeta.scielo.org/api/v1/article/counter_dict')
ARTICLEMETA_MAX_RETRIES = int(os.environ.get('ARTICLEMETA_MAX_RETRIES', 5))
ARTICLEMETA_SLEEP_TIME = int(os.environ.get('ARTICLEMETA_SLEEP_TIME', 30))

OPAC_ENDPOINT = os.environ.get('OPAC_ENDPOINT', 'https://www.scielo.br/api/v1/counter_dict')
OPAC_MAX_RETRIES = int(os.environ.get('OPAC_MAX_RETRIES', 5))
OPAC_SLEEP_TIME = int(os.environ.get('OPAC_SLEEP_TIME', 30))

OAI_PMH_PREPRINT_ENDPOINT = os.environ.get('OAI_PMH_PREPRINT_ENDPOINT', 'https://preprints.scielo.org/index.php/scielo/oai')
OAI_METADATA_PREFIX = os.environ.get('OAI_METADATA_PREFIX', 'oai_dc')
OAI_PMH_MAX_RETRIES = int(os.environ.get('OAI_PMH_MAX_RETRIES', 5))

DATAVERSE_ENDPOINT = os.environ.get('DATAVERSE_ENDPOINT', 'https://data.scielo.org/api')
DATAVERSE_ROOT_COLLECTION = os.environ.get('DATAVERSE_ROOT_COLLECTION', 'scielodata')
DATAVERSE_MAX_RETRIES = int(os.environ.get('DATAVERSE_MAX_RETRIES', 5))
DATAVERSE_SLEEP_TIME = int(os.environ.get('DATAVERSE_SLEEP_TIME', 30))
2228
2329def fetch_article_meta_dict (from_date , until_date , offset = 0 , limit = 1000 , collection = None , issn = None ):
2430 for t in range (1 , ARTICLEMETA_MAX_RETRIES + 1 ):
@@ -91,14 +97,14 @@ def fetch_preprint_oai_pmh(from_date, until_date):
9197
9298
9399def extract_preprint_data (record ):
94- pid_v2 = _extract_preprint_compatible_identifer (record .header .identifier )
95- text_langs = [_standardize_langcode (l ) for l in record .metadata .get ('language' , [])]
100+ pid_generic = _extract_preprint_compatible_identifer (record .header .identifier )
101+ text_langs = [standardizer . standardize_language_code (l ) for l in record .metadata .get ('language' , [])]
96102 publication_date = record .metadata .get ('date' , ['' ])[0 ]
97103 default_language = text_langs [0 ] if text_langs else ''
98104 publication_year = _extract_preprint_publication_year_from_date (publication_date )
99105
100106 data = {
101- 'pid_v2 ' : pid_v2 ,
107+ 'pid_generic ' : pid_generic ,
102108 'text_langs' : text_langs ,
103109 'publication_date' : publication_date ,
104110 'default_language' : default_language ,
@@ -124,12 +130,75 @@ def _extract_preprint_publication_year_from_date(date_str):
124130 return ''
125131
126132
127- def _standardize_langcode (language ):
128- if langcodes .tag_is_valid (language ):
129- return langcodes .standardize_tag (language )
def fetch_dataverse_metadata(from_date=None, until_date=None):
    """Yield file-level metadata for every dataset in the SciELO Dataverse.

    Walks the Dataverse native API tree: root collection -> sub-dataverses
    -> datasets -> files of the latest dataset version. Datasets without a
    standardizable DOI are skipped with a warning; datasets whose
    ``publicationDate`` falls outside the optional ``[from_date, until_date]``
    window are skipped (plain string comparison — assumes ISO-8601 dates,
    TODO confirm against the API). Datasets with no publication date are
    never filtered out.

    Args:
        from_date: inclusive lower bound for the dataset publication date,
            or None for no lower bound.
        until_date: inclusive upper bound for the dataset publication date,
            or None for no upper bound.

    Yields:
        dict with keys ``title`` (sub-dataverse title), ``dataset_doi``,
        ``dataset_published``, ``file_id``, ``file_name``, ``file_url`` and
        ``file_persistent_id`` (standardized, or None when absent).

    NOTE(review): DATAVERSE_MAX_RETRIES is defined at module level but no
    retry loop uses it here, and DATAVERSE_SLEEP_TIME doubles as the HTTP
    timeout — confirm both are intentional.
    """
    def _get_json_data(url, description):
        # Shared GET wrapper for the three API levels: returns the "data"
        # payload, or [] on any request error (best-effort collection).
        try:
            response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME)
            response.raise_for_status()
            return response.json().get("data", [])
        except requests.exceptions.RequestException as e:
            logging.error(f"Error fetching {description}: {e}")
            return []

    subdataverses = _get_json_data(
        f"{DATAVERSE_ENDPOINT}/dataverses/{DATAVERSE_ROOT_COLLECTION}/contents",
        "subdataverses",
    )

    for subdataverse in subdataverses:
        # The root collection may also list datasets directly; only descend
        # into entries that are themselves dataverses.
        if subdataverse["type"] != "dataverse":
            continue

        subdataverse_id = subdataverse["id"]
        subdataverse_title = subdataverse["title"]
        datasets = _get_json_data(
            f"{DATAVERSE_ENDPOINT}/dataverses/{subdataverse_id}/contents",
            f"datasets for subdataverse {subdataverse_id}",
        )

        for dataset in datasets:
            if dataset["type"] != "dataset":
                continue

            dataset_id = dataset["id"]
            doi = standardizer.standardize_doi(dataset.get("persistentUrl"))
            if not doi:
                logging.warning(f"Dataset {dataset_id} does not have a DOI.")
                continue

            publication_date = dataset.get("publicationDate", None)

            # Date filtering only applies when the dataset reports a date;
            # undated datasets pass through unfiltered.
            if publication_date:
                if (from_date and publication_date < from_date) or (until_date and publication_date > until_date):
                    continue

            files = _get_json_data(
                f"{DATAVERSE_ENDPOINT}/datasets/{dataset_id}/versions/:latest/files",
                f"files for dataset {dataset_id}",
            )

            for file in files:
                file_persistent_id = file["dataFile"].get("persistentId", None)
                file_persistent_id_stz = (
                    standardizer.standardize_pid_generic(file_persistent_id)
                    if file_persistent_id
                    else None
                )

                yield {
                    "title": subdataverse_title,
                    "dataset_doi": doi,
                    "dataset_published": publication_date,
                    "file_id": file["dataFile"]["id"],
                    "file_name": file["label"],
                    "file_url": f"{DATAVERSE_ENDPOINT}/access/datafile/{file['dataFile']['id']}",
                    "file_persistent_id": file_persistent_id_stz,
                }
0 commit comments