Skip to content

Commit cf0f343

Browse files
authored
Allow publishing datasets on data.gouv.fr (#1202)
2 parents d4b676f + d14c586 commit cf0f343

13 files changed

Lines changed: 417 additions & 19 deletions

File tree

.env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
OTA_ENGINE_GITHUB_TOKEN=your_github_token_here
55
OTA_ENGINE_GITLAB_TOKEN=your_gitlab_token_here
66
OTA_ENGINE_GITLAB_RELEASES_TOKEN=your_gitlab_releases_token_here
7+
OTA_ENGINE_DATAGOUV_API_KEY=your_datagouv_api_key_here
78
OTA_ENGINE_SENDINBLUE_API_KEY=your_sendinblue_api_key_here
89
OTA_ENGINE_SMTP_PASSWORD=your_smtp_password_here
910

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,15 @@
22

33
All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
44

5+
## Unreleased [minor]
6+
7+
> Development of this release was supported by the [French Ministry for Foreign Affairs](https://www.diplomatie.gouv.fr/fr/politique-etrangere-de-la-france/diplomatie-numerique/) through its ministerial [State Startups incubator](https://beta.gouv.fr/startups/open-terms-archive.html) under the aegis of the Ambassador for Digital Affairs.
8+
9+
### Added
10+
11+
- Add support for publishing datasets to data.gouv.fr; configure `dataset.datagouv.datasetId` or `dataset.datagouv.organizationIdOrSlug` in configuration file and set `OTA_ENGINE_DATAGOUV_API_KEY` environment variable
12+
- Add ability to publish datasets to multiple platforms simultaneously; datasets can now be published to GitHub (or GitLab) and data.gouv.fr in parallel
13+
514
## 10.0.1 - 2025-11-24
615

716
_Full changeset and discussions: [#1208](https://github.com/OpenTermsArchive/engine/pull/1208)._

bin/ota-dataset.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ import logger from '../src/logger/index.js';
1111

1212
program
1313
.name('ota dataset')
14-
.description('Export the versions dataset into a ZIP file and optionally publish it to GitHub releases')
14+
.description('Export the versions dataset into a ZIP file and optionally publish it to GitHub releases, GitLab releases, or data.gouv.fr')
1515
.option('-f, --file <filename>', 'file name of the generated dataset')
16-
.option('-p, --publish', 'publish dataset to GitHub releases on versions repository. Mandatory authentication to GitHub is provided through the `OTA_ENGINE_GITHUB_TOKEN` environment variable')
16+
.option('-p, --publish', 'publish dataset. Supports GitHub releases (OTA_ENGINE_GITHUB_TOKEN), GitLab releases (OTA_ENGINE_GITLAB_TOKEN), or data.gouv.fr (OTA_ENGINE_DATAGOUV_API_KEY + config)')
1717
.option('-r, --remove-local-copy', 'remove local copy of dataset after publishing. Works only in combination with --publish option')
1818
.option('--schedule', 'schedule automatic dataset generation');
1919

config/default.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
}
5757
},
5858
"dataset": {
59-
"title": "sandbox",
59+
"title": "Sandbox collection dataset",
6060
"versionsRepositoryURL": "https://github.com/OpenTermsArchive/sandbox-declarations",
6161
"publishingSchedule": "30 8 * * MON"
6262
}

scripts/dataset/assets/README.template.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export function title({ releaseDate }) {
1414

1515
const title = config.get('@opentermsarchive/engine.dataset.title');
1616

17-
return `${title}${releaseDate} dataset`;
17+
return `${title}${releaseDate}`;
1818
}
1919

2020
export function body({ servicesCount, firstVersionDate, lastVersionDate }) {

scripts/dataset/export/test/fixtures/dataset/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Open Terms Archive — sandbox — January 1, 2022 dataset
1+
# Open Terms Archive — sandbox — January 1, 2022
22

33
This dataset consolidates the contractual documents of 2 service providers, in all their versions that were accessible online between January 1, 2021 and January 6, 2022.
44

scripts/dataset/index.js

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import publishRelease from './publish/index.js';
99

1010
export async function release({ shouldPublish, shouldRemoveLocalCopy, fileName }) {
1111
const releaseDate = new Date();
12-
const archiveName = fileName || `dataset-${config.get('@opentermsarchive/engine.dataset.title')}-${releaseDate.toISOString().replace(/T.*/, '')}`;
12+
const archiveName = fileName || `${config.get('@opentermsarchive/engine.dataset.title').toLowerCase().replace(/[^a-zA-Z0-9.\-_]/g, '-')}-${releaseDate.toISOString().replace(/T.*/, '')}`;
1313
const archivePath = `${path.basename(archiveName, '.zip')}.zip`; // allow to pass filename or filename.zip as the archive name and have filename.zip as the result name
1414

1515
logger.info('Start exporting dataset…');
@@ -24,13 +24,18 @@ export async function release({ shouldPublish, shouldRemoveLocalCopy, fileName }
2424

2525
logger.info('Start publishing dataset…');
2626

27-
const releaseUrl = await publishRelease({
27+
const results = await publishRelease({
2828
archivePath,
2929
releaseDate,
3030
stats,
3131
});
3232

33-
logger.info(`Dataset published to ${releaseUrl}`);
33+
if (results.length > 0) {
34+
logger.info('Dataset published to following platforms:');
35+
results.forEach(result => {
36+
logger.info(` - ${result.platform}: ${result.url}`);
37+
});
38+
}
3439

3540
if (!shouldRemoveLocalCopy) {
3641
return;

scripts/dataset/logger/index.js

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,35 @@ const { combine, timestamp, printf, colorize } = winston.format;
88
logger.format = combine(
  colorize(),
  timestamp({ format: 'YYYY-MM-DDTHH:mm:ssZ' }),
  printf(({ level, message, counter, hash, timestamp, module }) => {
    // Tracking prefix: change counter + content hash, shown when both are available.
    let prefix = counter && hash ? `${counter.toString().padEnd(6)} ${hash.padEnd(40)}` : '';

    // Optional ISO timestamp prefix, controlled by configuration; note it carries its own trailing space.
    const timestampPrefix = config.get('@opentermsarchive/engine.logger.timestampPrefix') ? `${timestamp} ` : '';

    prefix = module ? `${module} ${prefix}` : prefix;

    // NOTE(review): colorize() above already colors `level`; the manual ANSI codes below
    // additionally color the whole warn/error line — confirm the double styling is intended.
    const levelStr = level.padEnd(15);
    let coloredLevel = levelStr;
    let coloredMessage = message;

    if (level.includes('warn')) {
      coloredLevel = `\x1b[33m${levelStr}\x1b[0m`;
      coloredMessage = `\x1b[33m${message}\x1b[0m`;
    } else if (level.includes('error')) {
      coloredLevel = `\x1b[31m${levelStr}\x1b[0m`;
      coloredMessage = `\x1b[31m${message}\x1b[0m`;
    }

    // No separator before the level: `timestampPrefix` is either empty or already ends with a
    // space, so inserting one here would produce a leading space (or a double space).
    return `${timestampPrefix}${coloredLevel} ${prefix.padEnd(50)} ${coloredMessage}`;
  }),
);
1933

34+
/**
 * Creates a lightweight logger facade whose entries are tagged with the
 * emitting module's name (surfaced by the shared format's `module` field).
 * @param {string} moduleName - Label attached to every entry, e.g. 'datagouv'.
 * @returns {{info: Function, warn: Function, error: Function}} Tagged log functions.
 */
export function createModuleLogger(moduleName) {
  // Fresh metadata object per call, so downstream formatters may mutate it safely.
  const tagged = level => message => logger[level](message, { module: moduleName });

  return {
    info: tagged('info'),
    warn: tagged('warn'),
    error: tagged('error'),
  };
}
41+
2042
export default logger;
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
import fsApi from 'fs';
2+
import path from 'path';
3+
4+
import FormData from 'form-data';
5+
import nodeFetch from 'node-fetch';
6+
7+
import { createModuleLogger } from '../../logger/index.js';
8+
9+
// Logger tagging every entry from this module with the 'datagouv' label.
const logger = createModuleLogger('datagouv');

// License applied to datasets published on data.gouv.fr (Open Data Commons ODbL identifier).
const DATASET_LICENSE = 'odc-odbl';
// Default description attached to the uploaded archive resource.
const DEFAULT_RESOURCE_DESCRIPTION = 'See README.md inside the archive for dataset structure and usage information.';
13+
14+
/**
 * Builders for the data.gouv.fr API endpoint URLs used by this module.
 * Each builder receives the API base URL plus the relevant identifiers and
 * returns the full, trailing-slash-terminated endpoint URL.
 */
const routes = {
  dataset: (apiBaseUrl, datasetId) => [apiBaseUrl, 'datasets', datasetId, ''].join('/'),
  datasets: apiBaseUrl => [apiBaseUrl, 'datasets', ''].join('/'),
  datasetUpload: (apiBaseUrl, datasetId) => [apiBaseUrl, 'datasets', datasetId, 'upload', ''].join('/'),
  resource: (apiBaseUrl, datasetId, resourceId) => [apiBaseUrl, 'datasets', datasetId, 'resources', resourceId, ''].join('/'),
  resourceUpload: (apiBaseUrl, datasetId, resourceId) => [apiBaseUrl, 'datasets', datasetId, 'resources', resourceId, 'upload', ''].join('/'),
  organization: (apiBaseUrl, organizationIdOrSlug) => [apiBaseUrl, 'organizations', organizationIdOrSlug, ''].join('/'),
  // Single page of up to 100 datasets; larger collections would be truncated.
  organizationDatasets: (apiBaseUrl, organizationId) => `${[apiBaseUrl, 'organizations', organizationId, 'datasets', ''].join('/')}?page_size=100`,
};
23+
24+
/**
 * Retrieves an organization from the data.gouv.fr API.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Request headers (presumably carrying the API key — set by the caller).
 * @param {string} params.organizationIdOrSlug - Organization identifier or URL slug.
 * @returns {Promise<Object>} The organization object returned by the API.
 * @throws {Error} When the API responds with a non-OK status.
 */
export async function getOrganization({ apiBaseUrl, headers, organizationIdOrSlug }) {
  logger.info(`Fetching organization: ${organizationIdOrSlug}…`);

  const url = routes.organization(apiBaseUrl, organizationIdOrSlug);
  const response = await nodeFetch(url, { headers });

  if (!response.ok) {
    throw new Error(`Failed to retrieve organization: ${response.status} ${response.statusText} - ${await response.text()}`);
  }

  const organization = await response.json();

  logger.info(`Found organization: ${organization.name} (ID: ${organization.id})`);

  return organization;
}
41+
42+
/**
 * Retrieves a dataset by ID from the data.gouv.fr API.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Request headers forwarded to the API.
 * @param {string} params.datasetId - Identifier of the dataset to fetch.
 * @returns {Promise<Object>} The dataset object returned by the API.
 * @throws {Error} With a `statusCode` property when the API responds with a non-OK status,
 *   so callers can distinguish e.g. a 404 from other failures.
 */
export async function getDataset({ apiBaseUrl, headers, datasetId }) {
  const response = await nodeFetch(routes.dataset(apiBaseUrl, datasetId), { headers });

  if (response.ok) {
    return response.json();
  }

  const body = await response.text();
  const error = new Error(`Failed to retrieve dataset: ${response.status} ${response.statusText} - ${body}`);

  error.statusCode = response.status;
  throw error;
}
57+
58+
/**
 * Looks for a dataset with an exact title match among an organization's datasets.
 * Only the first page (up to 100 datasets) is inspected — see `routes.organizationDatasets`.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Request headers forwarded to the API.
 * @param {string} params.organizationId - Identifier of the owning organization.
 * @param {string} params.title - Exact dataset title to match.
 * @returns {Promise<Object|null>} The matching dataset, or null when none is found.
 * @throws {Error} When the dataset listing request fails.
 */
export async function findDatasetByTitle({ apiBaseUrl, headers, organizationId, title }) {
  logger.info(`Searching for dataset with title "${title}" in organization…`);

  const response = await nodeFetch(routes.organizationDatasets(apiBaseUrl, organizationId), { headers });

  if (!response.ok) {
    throw new Error(`Failed to search for datasets: ${response.status} ${response.statusText} - ${await response.text()}`);
  }

  const { data } = await response.json();
  const match = data.find(candidate => candidate.title === title);

  if (!match) {
    logger.info('No existing dataset found with this title');

    return null;
  }

  logger.info(`Found existing dataset: ${match.title} (ID: ${match.id})`);

  return match;
}
83+
84+
/**
 * Creates a new dataset on data.gouv.fr under the given organization.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Request headers forwarded to the API.
 * @param {string} params.organizationId - Identifier of the owning organization.
 * @param {string} params.title - Dataset title.
 * @param {string} params.description - Dataset description.
 * @param {string} params.license - License identifier for the dataset.
 * @param {string} params.frequency - Update frequency identifier.
 * @returns {Promise<Object>} The created dataset object returned by the API.
 * @throws {Error} When the creation request fails.
 */
export async function createDataset({ apiBaseUrl, headers, organizationId, title, description, license, frequency }) {
  logger.info(`Creating new dataset: ${title}…`);

  const payload = {
    title,
    description,
    organization: organizationId,
    license,
    frequency,
  };

  const response = await nodeFetch(routes.datasets(apiBaseUrl), {
    method: 'POST',
    headers: { ...headers, 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`Failed to create dataset: ${response.status} ${response.statusText} - ${await response.text()}`);
  }

  const dataset = await response.json();

  logger.info(`Dataset created successfully: ${dataset.title} (ID: ${dataset.id})`);

  return dataset;
}
114+
115+
/**
 * Updates a dataset's metadata (title, description, license, frequency, and —
 * when version dates are available in `stats` — temporal coverage).
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Request headers forwarded to the API.
 * @param {string} params.datasetId - Identifier of the dataset to update.
 * @param {string} params.title - New dataset title.
 * @param {string} params.description - New dataset description.
 * @param {Object} [params.stats] - Optional stats with `firstVersionDate`/`lastVersionDate` Dates.
 * @param {string} params.frequency - Update frequency identifier.
 * @throws {Error} With a `statusCode` property when the API responds with a non-OK status.
 */
export async function updateDatasetMetadata({ apiBaseUrl, headers, datasetId, title, description, stats, frequency }) {
  const hasTemporalCoverage = Boolean(stats?.firstVersionDate && stats?.lastVersionDate);

  const payload = {
    title,
    description,
    license: DATASET_LICENSE, // publishing license is fixed for all datasets from this module
    frequency,
    ...hasTemporalCoverage && {
      temporal_coverage: {
        start: stats.firstVersionDate.toISOString(),
        end: stats.lastVersionDate.toISOString(),
      },
    },
  };

  const response = await nodeFetch(routes.dataset(apiBaseUrl, datasetId), {
    method: 'PUT',
    headers: { ...headers, 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const error = new Error(`Failed to update dataset metadata: ${response.status} ${response.statusText} - ${await response.text()}`);

    error.statusCode = response.status;
    throw error;
  }

  logger.info('Dataset metadata updated successfully');
}
149+
150+
/**
 * Uploads the dataset archive as a new resource on an existing dataset.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Request headers forwarded to the API.
 * @param {string} params.datasetId - Identifier of the target dataset.
 * @param {string} params.archivePath - Local path to the ZIP archive to upload.
 * @returns {Promise<{resourceId: string, fileName: string}>} The created resource ID and uploaded file name.
 * @throws {Error} When the upload request fails.
 */
export async function uploadResource({ apiBaseUrl, headers, datasetId, archivePath }) {
  logger.info('Uploading dataset archive…');

  const { formData, fileName } = createFormDataForFile(archivePath);

  const response = await nodeFetch(routes.datasetUpload(apiBaseUrl, datasetId), {
    method: 'POST',
    // multipart boundary headers must come from the form data itself
    headers: { ...formData.getHeaders(), ...headers },
    body: formData,
  });

  if (!response.ok) {
    throw new Error(`Failed to upload dataset file: ${response.status} ${response.statusText} - ${await response.text()}`);
  }

  const { id: resourceId } = await response.json();

  logger.info(`Dataset file uploaded successfully with resource ID: ${resourceId}`);

  return { resourceId, fileName };
}
173+
174+
/**
 * Replaces the file behind an existing dataset resource with a new archive.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Request headers forwarded to the API.
 * @param {string} params.datasetId - Identifier of the dataset owning the resource.
 * @param {string} params.resourceId - Identifier of the resource whose file is replaced.
 * @param {string} params.archivePath - Local path to the ZIP archive to upload.
 * @returns {Promise<{resourceId: string, fileName: string}>} The resource ID reported by the API and the uploaded file name.
 * @throws {Error} When the replacement request fails.
 */
export async function replaceResourceFile({ apiBaseUrl, headers, datasetId, resourceId, archivePath }) {
  logger.info(`Replacing file for existing resource ID: ${resourceId}…`);

  const { formData, fileName } = createFormDataForFile(archivePath);

  const response = await nodeFetch(routes.resourceUpload(apiBaseUrl, datasetId, resourceId), {
    method: 'POST',
    // multipart boundary headers must come from the form data itself
    headers: { ...formData.getHeaders(), ...headers },
    body: formData,
  });

  if (!response.ok) {
    throw new Error(`Failed to replace resource file: ${response.status} ${response.statusText} - ${await response.text()}`);
  }

  const replacement = await response.json();

  logger.info('Resource file replaced successfully');

  return { resourceId: replacement.id, fileName };
}
197+
198+
/**
 * Updates a resource's metadata (title, description, file type/format/MIME).
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Request headers forwarded to the API.
 * @param {string} params.datasetId - Identifier of the dataset owning the resource.
 * @param {string} params.resourceId - Identifier of the resource to update.
 * @param {string} params.fileName - Uploaded archive file name, used as the resource title.
 * @throws {Error} When the metadata update request fails.
 */
export async function updateResourceMetadata({ apiBaseUrl, headers, datasetId, resourceId, fileName }) {
  logger.info('Updating resource metadata…');

  const metadata = {
    title: fileName,
    description: DEFAULT_RESOURCE_DESCRIPTION,
    filetype: 'file',
    format: 'zip',
    mime: 'application/zip',
  };

  const response = await nodeFetch(routes.resource(apiBaseUrl, datasetId, resourceId), {
    method: 'PUT',
    headers: { ...headers, 'Content-Type': 'application/json' },
    body: JSON.stringify(metadata),
  });

  if (!response.ok) {
    throw new Error(`Failed to update resource metadata: ${response.status} ${response.statusText} - ${await response.text()}`);
  }

  logger.info('Resource metadata updated successfully');
}
221+
222+
/**
 * Builds a multipart form containing the archive file, streamed from disk.
 * @param {string} archivePath - Local path to the ZIP archive.
 * @returns {{formData: FormData, fileName: string}} The populated form and the archive's base name.
 */
function createFormDataForFile(archivePath) {
  const fileName = path.basename(archivePath);
  const { size } = fsApi.statSync(archivePath); // known length lets form-data emit a Content-Length header
  const formData = new FormData();

  formData.append('file', fsApi.createReadStream(archivePath), {
    filename: fileName,
    contentType: 'application/zip',
    knownLength: size,
  });

  return { formData, fileName };
}

0 commit comments

Comments
 (0)