Skip to content

Commit 9130e42

Browse files
committed
Add helper scripts for import automation
- Retry failed ingestion job - Generate provisional nodes - Convert DC manifest files to MCF
1 parent dc738c8 commit 9130e42

5 files changed

Lines changed: 1188 additions & 0 deletions

File tree

Lines changed: 369 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,369 @@
syntax = "proto3";

// To regenerate Python bindings:
//   protoc --proto_path=. --python_out=. dc_manifest.proto

// IMPORTANT NOTE: Do not change the existing field names in
// DataCommonsManifest::Import and ExternalTable messages, since these messages
// are shared with the Template Generator UI application in github.

package datacommons.proto;

// A single TMCF/CSV table input of an import.
// NOTE: Field names here are shared with the Template Generator UI
// application (see file header); do not rename existing fields.
message ExternalTable {
  // Name of the table.
  string table_name = 1;

  // Path to the template MCF (previously called the "schema mapping file").
  // TODO(shanth): Rename to template_mcf_path.
  string mapping_path = 2;

  // Path to one or more data CSV file patterns.
  // NOTE: All files must be present in the same directory if DownloadConfig is
  // specified.
  // TODO(shanth): Change to csv_paths after textprotos in CNS have been updated
  repeated string csv_path = 3;

  // Ordered list of column names that the schema mapping file will refer to.
  //
  // NOTE: This need not match the raw CSV column names.
  repeated string column_names = 4;

  // Delimiter for CSV.
  // NOTE: expected to be a single character.
  string field_delim = 5;

  // Download info for TMCF/CSV files.
  DownloadConfig download_config = 6;
}

// Arguments relevant for the resolution of an import.
message ResolutionInfo {
  // Set if we need any DCID generation or resolution.
  bool uses_id_resolver = 1;

  // List of ID properties this dataset depends on for resolution.
  repeated string requires_ids = 2;

  // List of ID properties this dataset provides for downstream resolution.
  repeated string provides_ids = 3;

  // Info about new geos introduced by this dataset.
  message GeoInfo {
    // List the ID properties that DCIDs are based on.
    repeated string dcid_sources = 1;
  }

  // REQUIRED: Must be set if new geos are introduced.
  GeoInfo new_geo = 4;

  // When "uses_id_resolver" is true, this indicates whether the resolution was
  // done against the KG, and if so which type.
  enum KGType {
    KG_NONE = 0;
    KG_BQ = 1;
    KG_MCF = 2;
  }

  KGType kg_for_resolution = 5;

  // There are scenarios where additional MCF nodes are necessary purely for
  // resolution. Examples include StatVar MCFs for table flow and geo nodes
  // used to resolve by other IDs (like wikidataId).
  repeated string resolution_helper_mcf_paths = 6;

  // This must be present if |automated_mcf_generation_by| is set and |table| is
  // empty in DataCommonsManifest::Import. One exception is for the "resolved
  // MCF --> cache" mode in the controller, in that case, this field is not
  // needed.
  repeated string unresolved_mcf_urls = 7;
}

// Configuration used to download files into CNS as the first step.
message DownloadConfig {
  // The src directory is expected to contain CSV/TMCF/MCF files with the exact
  // names as those in DataCommonsManifest::Import.
  //
  // If DownloadConfig is specified for a CSV/TMCF import (i.e., in
  // ExternalTable.download_config), then the CSV file is required and the TMCF
  // file is optional. If DownloadConfig is specified for a node MCF import
  // (i.e., Import.mcf_download_config), then the source directory must contain
  // all the files equivalent to "mcf_url".
  //
  // For example, suppose that the import specifies a TMCF path of
  // /cns/path/to/file/data.tmcf, CSV path of /cns/path/to/file/data.csv and the
  // "src_prefix_dir" is /bigstore/bucket/foo. Then /bigstore/bucket/foo/
  // should have files named data.csv (required) and data.tmcf (optional).
  //
  // As an MCF example, suppose the mcf_url is /cns/path/to/files/*.mcf and
  // there are three resolved MCF files (1.mcf, 2.mcf, 3.mcf) that match the
  // pattern. If "src_prefix_dir" is /bigstore/bucket/foo, then it must contain
  // unresolved MCF files with those same names.
  oneof prefix {
    // Source prefix directory.
    string src_prefix_dir = 1;

    // Path to a file containing the latest version directory name. If it is an
    // absolute path, then it is used as the "src_prefix_dir". If not, then
    // "src_prefix_dir" is constructed as:
    //   join(dirname(src_latest_version_file), <VERSION>)
    string src_latest_version_file = 2;
  }

  // A source-to-destination file path mapping used by FileOp.cp.
  message PathPair {
    string src_file = 1;
    string dst_file = 2;
  }

  // A single file operation: either a copy or a delete.
  message FileOp {
    oneof op {
      // Copy from src_file to dst_file.
      PathPair cp = 1;

      // Delete the given file.
      string del = 2;
    }
  }

  reserved 3, 4;
}

// Configuration used in selecting nodes for golden triples generation.
message GoldenTripleSelection {
  // For specified number of svObs nodes, pick one into golden triples.
  int32 svobs_nodes_pick_one_in = 3;

  reserved 1, 2;
}

// A manifest is a collection of imports each having a name, provenance URL,
// provenance description, and list of CNS path patterns or external table info.
message DataCommonsManifest {
  // Categories of imports.
  // LINT.IfChange
  enum ImportCategory {
    IMPORT_CATEGORY_UNKNOWN = 0;

    // Import with Graph Schema nodes.
    SCHEMA = 1;

    // Import with place nodes.
    PLACE = 2;

    // Import with StatVar nodes having curated DCIDs.
    CURATED_STATVAR = 3;

    // Import with StatVar nodes having auto-generated DCIDs.
    GENERATED_STATVAR = 4;

    // Aggregated stats.
    AGGREGATED_STATS = 7;

    // Stats, which contain observations.
    STATS = 8;

    // Import with entity nodes that are not Schema/StatVars, observations or
    // places.
    ENTITY = 9;

    // Imputed stats.
    IMPUTED_STATS = 10;

    // Intermediate stats that are used as inputs to derived graph computations,
    // but are not built into cache.
    // This could be either a direct import from source, or some kind of
    // aggregation that doesn't get built into cache.
    INTERMEDIATE_STATS = 11;

    // Schema import from custom DC.
    CUSTOM_SCHEMA = 12;

    reserved 5, 6;
  }
  // LINT.ThenChange(//depot/google3/datacommons/import/mcf_vocab.h)

  // A single import: one logical dataset's inputs, resolution settings and
  // release metadata.
  // NOTE: Field names here are shared with the Template Generator UI
  // application (see file header); do not rename existing fields.
  //
  // Next ID: 32.
  message Import {
    string import_name = 1;
    string provenance_url = 2;
    string provenance_description = 6;
    ImportCategory category = 20;

    // The list of Import Groups that this import is part of. Must refer to a
    // valid ImportGroup.name. If empty, then this import is part of all groups.
    repeated string import_groups = 23;

    // When input is node MCF, "mcf_url" is set.
    repeated string mcf_url = 3;

    // The URL of the optimized MCF in proto (TFRecord) format.
    repeated string mcf_proto_url = 25;

    // GCS path prefix for resolved MCFs.
    string mcf_gcs_path_prefix = 31;

    // In cases like Core Geo MCFs, the MCFs are versioned in piper but that
    // version is not directly accessed by the binary (owing to memory
    // limitations). Instead, we keep a copy of these files in CNS and provide
    // that path in "mcf_url". Set this bool to true to mark that situation.
    //
    // If set, there exist tests (manifest_checker_test.cc) to ensure the MCFs
    // are in sync and tooling (sync_replicated_mcfs.cc) to update the CNS copy.
    bool is_mcf_replicated_from_piper = 18;

    // When input comes from TMCF/CSV, this is set.
    repeated ExternalTable table = 4;

    string curator_email = 5;

    // Information about resolution.
    ResolutionInfo resolution_info = 9;

    // Dates associated with stats. All dates are in ISO8601 format.
    //
    // NOTE: For datasets that are released more frequently than a month, please
    // skip |end_date_in_kg|, |end_date_in_source| and |next_release_date|.
    //
    // Earliest value of observationDate in the KG.
    string start_date_in_kg = 10;

    // Latest value of observationDate in the KG.
    string end_date_in_kg = 11;

    // Latest value of observationDate that is available at the source.
    // If end_date_in_kg < end_date_in_source, the dataset needs refresh.
    string end_date_in_source = 21;

    // Data Release Schedule
    //
    // Release frequency of the import source.
    string release_frequency = 12;

    // Next date when the data source will be released. If this date is in the
    // past, the dataset needs refresh.
    string next_release_date = 13;

    // The URLs from where the data can be accessed/downloaded.
    repeated string data_download_url = 15;

    // As part of which import-group do we regenerate the MCFs?
    //
    // The branch controllers can convert table to MCF or resolve MCF if this
    // field is set to the appropriate import group. Not setting this field
    // disables any such work, instead the resolved MCF (in "mcf_url") is used
    // as is.
    string automated_mcf_generation_by = 24;

    // Configuration for generating golden triples.
    GoldenTripleSelection golden_triple_selection = 14;

    // Whether unresolved MCFs are missing.
    // These are all cases that the resolved MCFs are PopObs instead of SVObs.
    // For SVObs, the Flume pipeline will generate unresolved MCFs from CSV+TMCF
    // automatically.
    bool is_unresolved_mcf_missing = 22;

    // Indicates which MCF files to copy from GCS or other source before
    // resolving.
    // REQUIRES: this is an MCF-only import with table being empty.
    DownloadConfig mcf_download_config = 17;

    // DatasetInfo that this import corresponds to. Must refer to a valid
    // DatasetInfo.name.
    string dataset_name = 26;

    // Metadata files that contain curated resolved MCF for the import. These
    // MCFs provide a way for extending Provenance, Dataset and Source nodes
    // with new properties, without requiring any manifest proto and pipeline
    // changes.
    repeated string metadata_mcf_url = 30;

    reserved 7, 8, 16, 19, 27, 28, 29;
  }

  repeated Import import = 1;

  // Import Group is a collection of imports that are built together into a
  // single cache for serving.
  message ImportGroup {
    // Name of the import group.
    string name = 1;

    // Description.
    string description = 2;

    bool is_custom_dc = 5;

    reserved 3, 4;
  }

  repeated ImportGroup import_groups = 2;

  // Represents a dataset from which one or more imports come from. This is used
  // to display on the Data Sources page
  // (https://docs.datacommons.org/datasets/).
  message DatasetInfo {
    // Name of dataset
    string name = 1;

    // Url of dataset to link to
    string url = 2;

    // Markdown description of data source
    oneof description {
      // Inline markdown string containing description
      string description_md = 3;

      // Name of markdown file containing description, if used instead of [3]
      string description_file = 4;
    }

    // Verticals that should include this data source
    repeated string verticals = 5;
  }

  // Represents a source which contains zero or more datasets. For sources with
  // zero distinct child datasets (i.e. the source directly corresponds to a
  // dataset), the dataset details will be contained in the source. This is used
  // to display on the Data Sources page
  // (https://docs.datacommons.org/datasets/).
  message DatasetSource {
    // Name of source
    string name = 1;

    // Url of source to link to
    string url = 2;

    // Markdown description of source
    string header_md = 3;

    // Markdown description of any terms of service
    string footer_md = 4;

    // Datasets associated with this source
    repeated DatasetInfo datasets = 5;
  }

  repeated DatasetSource dataset_source = 4;

  // Paths to curated import MCF. These MCFs provide a way for extending
  // Provenance, Dataset and Source nodes with new properties, without requiring
  // any manifest proto and pipeline changes, as well as source-specific
  // metadata about StatisticalVariables.
  repeated string import_metadata_mcf = 5;

  reserved 3;
}

// A snapshot of files in a single import. Each file has length and mtime, that
// act as cheap alternatives to a full content checksum. This snapshot gets
// programmatically generated and validated.
message ImportSnapshot {
  // Cheap fingerprint of one file: path plus size and modification time.
  message FileStat {
    string path = 1;
    // File size in bytes.
    int64 length = 2;
    // Modification time, in seconds since the epoch.
    double mtime_secs = 3;
  }

  // Must match a DataCommonsManifest.Import.import_name.
  string import_name = 1;
  repeated FileStat stat = 2;
}

0 commit comments

Comments
 (0)