-
Notifications
You must be signed in to change notification settings - Fork 141
Expand file tree
/
Copy pathdc_manifest.proto
More file actions
369 lines (287 loc) · 11.8 KB
/
dc_manifest.proto
File metadata and controls
369 lines (287 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
syntax = "proto3";
// protoc --proto_path=. --python_out=. dc_manifest.proto
// IMPORTANT NOTE: Do not change the existing field names in
// DataCommonsManifest::Import and ExternalTable messages, since these messages
// are shared with the Template Generator UI application in github.
package datacommons.proto;
// Describes a TMCF/CSV table input: the template MCF, the CSV data files,
// the CSV column names, and an optional download configuration.
message ExternalTable {
  // Name of the table.
  string table_name = 1;

  // Path to the template MCF (previously called the "schema mapping file").
  // TODO(shanth): Rename to template_mcf_path.
  string mapping_path = 2;

  // Path to one or more data CSV file patterns.
  // NOTE: All files must be present in the same directory if DownloadConfig is
  // specified.
  // TODO(shanth): Change to csv_paths after textprotos in CNS have been updated
  repeated string csv_path = 3;

  // Ordered list of column names that the schema mapping file will refer to.
  //
  // NOTE: This need not match the raw CSV column names.
  repeated string column_names = 4;

  // Delimiter for CSV.
  // NOTE: expected to be a single character.
  string field_delim = 5;

  // Download info for TMCF/CSV files.
  DownloadConfig download_config = 6;
}
// Arguments relevant for the resolution of an import.
message ResolutionInfo {
  // Set if we need any DCID generation or resolution.
  bool uses_id_resolver = 1;

  // List of ID properties this dataset depends on for resolution.
  repeated string requires_ids = 2;

  // List of ID properties this dataset provides for downstream resolution.
  repeated string provides_ids = 3;

  // Info about new geos introduced by this dataset.
  message GeoInfo {
    // List the ID properties that DCIDs are based on.
    repeated string dcid_sources = 1;
  }
  // REQUIRED: Must be set if new geos are introduced.
  GeoInfo new_geo = 4;

  // When "uses_id_resolver" is true, this indicates whether the resolution was
  // done against the KG, and if so which type.
  enum KGType {
    // Default: no KG-based resolution.
    KG_NONE = 0;
    KG_BQ = 1;
    KG_MCF = 2;
  }
  KGType kg_for_resolution = 5;

  // There are scenarios where additional MCF nodes are necessary purely for
  // resolution. Examples include StatVar MCFs for table flow and geo nodes
  // used to resolve by other IDs (like wikidataId).
  repeated string resolution_helper_mcf_paths = 6;

  // This must be present if |automated_mcf_generation_by| is set and |table| is
  // empty in DataCommonsManifest::Import. One exception is for the "resolved
  // MCF --> cache" mode in the controller, in that case, this field is not
  // needed.
  repeated string unresolved_mcf_urls = 7;
}
// Configuration used to download files into CNS as the first step.
//
// The source directory is expected to contain CSV/TMCF/MCF files whose names
// exactly match those referenced in DataCommonsManifest::Import.
//
// If DownloadConfig is specified for a CSV/TMCF import (i.e., in
// ExternalTable.download_config), the CSV file is required and the TMCF file
// is optional. If DownloadConfig is specified for a node MCF import (i.e.,
// Import.mcf_download_config), the source directory must contain all files
// equivalent to "mcf_url".
//
// Example (TMCF/CSV): with a TMCF path of /cns/path/to/file/data.tmcf, a CSV
// path of /cns/path/to/file/data.csv and "src_prefix_dir" of
// /bigstore/bucket/foo, then /bigstore/bucket/foo/ should hold data.csv
// (required) and data.tmcf (optional).
//
// Example (MCF): with mcf_url of /cns/path/to/files/*.mcf matching three
// resolved MCF files (1.mcf, 2.mcf, 3.mcf), a "src_prefix_dir" of
// /bigstore/bucket/foo must contain unresolved MCF files with those same
// names.
message DownloadConfig {
  // Exactly one way of locating the source directory.
  oneof prefix {
    // Source prefix directory.
    string src_prefix_dir = 1;

    // Path to a file containing the latest version directory name. If it is
    // an absolute path, then it is used as the "src_prefix_dir". If not, then
    // "src_prefix_dir" is constructed as:
    //   join(dirname(src_latest_version_file), <VERSION>)
    string src_latest_version_file = 2;
  }

  // A source/destination file pair.
  message PathPair {
    string src_file = 1;
    string dst_file = 2;
  }

  // A single file operation: either a copy or a delete.
  message FileOp {
    oneof op {
      // Copy from src_file to dst_file.
      PathPair cp = 1;
      // Delete the given file.
      string del = 2;
    }
  }

  // Field numbers of previously-deleted fields; do not reuse.
  reserved 3, 4;
}
// Configuration used in selecting nodes for golden triples generation.
message GoldenTripleSelection {
  // For specified number of svObs nodes, pick one into golden triples.
  int32 svobs_nodes_pick_one_in = 3;

  // Field numbers of previously-deleted fields; do not reuse.
  reserved 1, 2;
}
// A manifest is a collection of imports each having a name, provenance URL,
// provenance description, and list of CNS path patterns or external table info.
message DataCommonsManifest {
  // Categories of imports.
  // LINT.IfChange
  enum ImportCategory {
    IMPORT_CATEGORY_UNKNOWN = 0;
    // Import with Graph Schema nodes.
    SCHEMA = 1;
    // Import with place nodes.
    PLACE = 2;
    // Import with StatVar nodes having curated DCIDs.
    CURATED_STATVAR = 3;
    // Import with StatVar nodes having auto-generated DCIDs.
    GENERATED_STATVAR = 4;
    // Aggregated stats.
    AGGREGATED_STATS = 7;
    // Stats, which contain observations.
    STATS = 8;
    // Import with entity nodes that are not Schema/StatVars, observations or
    // places.
    ENTITY = 9;
    // Imputed stats.
    IMPUTED_STATS = 10;
    // Intermediate stats that are used as inputs to derived graph computations,
    // but are not built into cache.
    // This could be either a direct import from source, or some kind of
    // aggregation that doesn't get built into cache.
    INTERMEDIATE_STATS = 11;
    // Schema import from custom DC.
    CUSTOM_SCHEMA = 12;

    // Values of previously-deleted enum entries; do not reuse.
    reserved 5, 6;
  }
  // LINT.ThenChange(//depot/google3/datacommons/import/mcf_vocab.h)

  // A single import: its identity, inputs (node MCF or TMCF/CSV tables),
  // resolution settings, and refresh/release metadata.
  // Next ID: 32.
  message Import {
    string import_name = 1;
    string provenance_url = 2;
    string provenance_description = 6;
    ImportCategory category = 20;

    // The list of Import Groups that this import is part of. Must refer to a
    // valid ImportGroup.name. If empty, then this import is part of all groups.
    repeated string import_groups = 23;

    // When input is node MCF, "mcf_url" is set.
    repeated string mcf_url = 3;

    // The URL of the optimized MCF in proto (TFRecord) format.
    repeated string mcf_proto_url = 25;

    // GCS path prefix for resolved MCFs.
    string mcf_gcs_path_prefix = 31;

    // In cases like Core Geo MCFs, the MCFs are versioned in piper but that
    // version is not directly accessed by the binary (owing to memory
    // limitations). Instead, we keep a copy of these files in CNS and provide
    // that path in "mcf_url". Set this bool to true to mark that situation.
    //
    // If set, there exist tests (manifest_checker_test.cc) to ensure the MCFs
    // are in sync and tooling (sync_replicated_mcfs.cc) to update the CNS copy.
    bool is_mcf_replicated_from_piper = 18;

    // When input comes from TMCF/CSV, this is set.
    repeated ExternalTable table = 4;

    string curator_email = 5;

    // Information about resolution.
    ResolutionInfo resolution_info = 9;

    // Dates associated with stats. All dates are in ISO8601 format.
    //
    // NOTE: For datasets that are released more frequently than a month, please
    // skip |end_date_in_kg|, |end_date_in_source| and |next_release_date|.
    //
    // Earliest value of observationDate in the KG.
    string start_date_in_kg = 10;

    // Latest value of observationDate in the KG.
    string end_date_in_kg = 11;

    // Latest value of observationDate that is available at the source.
    // If end_date_in_kg < end_date_in_source, the dataset needs refresh.
    string end_date_in_source = 21;

    // Data Release Schedule
    //
    // Release frequency of the import source.
    string release_frequency = 12;

    // Next date when the data source will be released. If this date is in the
    // past, the dataset needs refresh.
    string next_release_date = 13;

    // The URLs from where the data can be accessed/downloaded.
    repeated string data_download_url = 15;

    // As part of which import-group do we regenerate the MCFs?
    //
    // The branch controllers can convert table to MCF or resolve MCF if this
    // field is set to the appropriate import group. Not setting this field
    // disables any such work, instead the resolved MCF (in "mcf_url") is used
    // as is.
    string automated_mcf_generation_by = 24;

    // Configuration for generating golden triples.
    GoldenTripleSelection golden_triple_selection = 14;

    // Whether unresolved MCFs are missing.
    // These are all cases that the resolved MCFs are PopObs instead of SVObs.
    // For SVObs, the Flume pipeline will generate unresolved MCFs from CSV+TMCF
    // automatically.
    bool is_unresolved_mcf_missing = 22;

    // Indicates which MCF files to copy from GCS or other source before
    // resolving.
    // REQUIRES: this is an MCF-only import with table being empty.
    DownloadConfig mcf_download_config = 17;

    // DatasetInfo that this import corresponds to. Must refer to a valid
    // DatasetInfo.name.
    string dataset_name = 26;

    // Metadata files that contain curated resolved MCF for the import. These
    // MCFs provide a way for extending Provenance, Dataset and Source nodes
    // with new properties, without requiring any manifest proto and pipeline
    // changes.
    repeated string metadata_mcf_url = 30;

    // Field numbers of previously-deleted fields; do not reuse.
    reserved 7, 8, 16, 19, 27, 28, 29;
  }

  // NOTE(review): "import" is a keyword in several target languages and
  // generates awkward accessors; kept as-is since renaming a published field
  // is source-breaking.
  repeated Import import = 1;

  // Import Group is a collection of imports that are built together into a
  // single cache for serving.
  message ImportGroup {
    // Name of the import group.
    string name = 1;

    // Description.
    string description = 2;

    bool is_custom_dc = 5;

    // Field numbers of previously-deleted fields; do not reuse.
    reserved 3, 4;
  }
  repeated ImportGroup import_groups = 2;

  // Represents a dataset from which one or more imports come from. This is used
  // to display on the Data Sources page
  // (https://docs.datacommons.org/datasets/).
  message DatasetInfo {
    // Name of dataset
    string name = 1;

    // Url of dataset to link to
    string url = 2;

    // Markdown description of data source
    oneof description {
      // Inline markdown string containing description
      string description_md = 3;

      // Name of markdown file containing description, if used instead of [3]
      string description_file = 4;
    }

    // Verticals that should include this data source
    repeated string verticals = 5;
  }

  // Represents a source which contains zero or more datasets. For sources with
  // zero distinct child datasets (i.e. the source directly corresponds to a
  // dataset), the dataset details will be contained in the source. This is used
  // to display on the Data Sources page
  // (https://docs.datacommons.org/datasets/).
  message DatasetSource {
    // Name of source
    string name = 1;

    // Url of source to link to
    string url = 2;

    // Markdown description of source
    string header_md = 3;

    // Markdown description of any terms of service
    string footer_md = 4;

    // Datasets associated with this source
    repeated DatasetInfo datasets = 5;
  }
  repeated DatasetSource dataset_source = 4;

  // Paths to curated import MCF. These MCFs provide a way for extending
  // Provenance, Dataset and Source nodes with new properties, without requiring
  // any manifest proto and pipeline changes, as well as source-specific
  // metadata about StatisticalVariables.
  repeated string import_metadata_mcf = 5;

  // Field number of a previously-deleted field; do not reuse.
  reserved 3;
}
// A snapshot of files in a single import. Each file carries its length and
// mtime, which serve as cheap stand-ins for a full content checksum. This
// snapshot is generated and validated programmatically.
message ImportSnapshot {
  // Per-file stat record.
  message FileStat {
    // Path of the file.
    string path = 1;
    // File size in bytes.
    int64 length = 2;
    // Modification time, in seconds (fractional) since the epoch.
    double mtime_secs = 3;
  }

  // Name of the import this snapshot belongs to.
  string import_name = 1;

  // One entry per file in the import.
  repeated FileStat stat = 2;
}