syntax = "proto3";

// protoc --proto_path=. --python_out=. dc_manifest.proto

// IMPORTANT NOTE: Do not change the existing field names in
// DataCommonsManifest::Import and ExternalTable messages, since these messages
// are shared with the Template Generator UI application on GitHub.

package datacommons.proto;

message ExternalTable {
  // Name of the table.
  string table_name = 1;

  // Path to the template MCF (previously called the "schema mapping file").
  // TODO(shanth): Rename to template_mcf_path.
  string mapping_path = 2;

  // Path to one or more data CSV file patterns.
  // NOTE: All files must be present in the same directory if DownloadConfig is
  // specified.
  // TODO(shanth): Change to csv_paths after textprotos in CNS have been
  // updated.
  repeated string csv_path = 3;

  // Ordered list of column names that the schema mapping file will refer to.
  //
  // NOTE: This need not match the raw CSV column names.
  repeated string column_names = 4;

  // Delimiter for CSV.
  // NOTE: Expected to be a single character.
  string field_delim = 5;

  // Download info for TMCF/CSV files.
  DownloadConfig download_config = 6;
}
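
// A textproto sketch of an ExternalTable as it would appear in an Import's
// "table" field. The field names come from this message; every value below
// is hypothetical.
//
//   table {
//     table_name: "CityStats"
//     mapping_path: "/cns/path/to/city_stats.tmcf"
//     csv_path: "/cns/path/to/city_stats_*.csv"
//     column_names: "place"
//     column_names: "date"
//     column_names: "value"
//     field_delim: ","
//   }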

// Arguments relevant for the resolution of an import.
message ResolutionInfo {
  // Set if we need any DCID generation or resolution.
  bool uses_id_resolver = 1;

  // List of ID properties this dataset depends on for resolution.
  repeated string requires_ids = 2;

  // List of ID properties this dataset provides for downstream resolution.
  repeated string provides_ids = 3;

  // Info about new geos introduced by this dataset.
  message GeoInfo {
    // List the ID properties that DCIDs are based on.
    repeated string dcid_sources = 1;
  }

  // REQUIRED: Must be set if new geos are introduced.
  GeoInfo new_geo = 4;

  // When "uses_id_resolver" is true, this indicates whether the resolution was
  // done against the KG, and if so, which type.
  enum KGType {
    KG_NONE = 0;
    KG_BQ = 1;
    KG_MCF = 2;
  }

  KGType kg_for_resolution = 5;

  // There are scenarios where additional MCF nodes are necessary purely for
  // resolution. Examples include StatVar MCFs for table flow and geo nodes
  // used to resolve by other IDs (like wikidataId).
  repeated string resolution_helper_mcf_paths = 6;

  // This must be present if |automated_mcf_generation_by| is set and |table|
  // is empty in DataCommonsManifest::Import. One exception is the "resolved
  // MCF --> cache" mode in the controller; in that case, this field is not
  // needed.
  repeated string unresolved_mcf_urls = 7;
}
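
// A textproto sketch (property names and values are hypothetical) of a
// dataset that resolves entities by wikidataId and introduces new geos:
//
//   resolution_info {
//     uses_id_resolver: true
//     requires_ids: "wikidataId"
//     new_geo {
//       dcid_sources: "wikidataId"
//     }
//     kg_for_resolution: KG_BQ
//   }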

// Configuration used to download files into CNS as the first step.
message DownloadConfig {
  // The src directory is expected to contain CSV/TMCF/MCF files with the same
  // names as those in DataCommonsManifest::Import.
  //
  // If DownloadConfig is specified for a CSV/TMCF import (i.e., in
  // ExternalTable.download_config), then the CSV file is required and the TMCF
  // file is optional. If DownloadConfig is specified for a node MCF import
  // (i.e., Import.mcf_download_config), then the source directory must contain
  // all the files equivalent to "mcf_url".
  //
  // For example, suppose that the import specifies a TMCF path of
  // /cns/path/to/file/data.tmcf, a CSV path of /cns/path/to/file/data.csv, and
  // "src_prefix_dir" is /bigstore/bucket/foo. Then /bigstore/bucket/foo/
  // should have files named data.csv (required) and data.tmcf (optional).
  //
  // As an MCF example, suppose the mcf_url is /cns/path/to/files/*.mcf and
  // there are three resolved MCF files (1.mcf, 2.mcf, 3.mcf) that match the
  // pattern. If "src_prefix_dir" is /bigstore/bucket/foo, then it must contain
  // unresolved MCF files with those same names.
  oneof prefix {
    // Source prefix directory.
    string src_prefix_dir = 1;

    // Path to a file containing the latest version directory name. If it is an
    // absolute path, then it is used as the "src_prefix_dir". If not, then
    // "src_prefix_dir" is constructed as:
    //   join(dirname(src_latest_version_file), <VERSION>)
    // For example, if this field is /bigstore/bucket/latest.txt and that file
    // contains "2024-01-01", then "src_prefix_dir" becomes
    // /bigstore/bucket/2024-01-01.
    string src_latest_version_file = 2;
  }

  message PathPair {
    string src_file = 1;
    string dst_file = 2;
  }

  message FileOp {
    oneof op {
      // Copy from src_file to dst_file.
      PathPair cp = 1;

      // Delete the given file.
      string del = 2;
    }
  }

  reserved 3, 4;
}
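
// The CSV/TMCF example from the comment above, written as textproto (the
// path is the same illustrative one used there):
//
//   download_config {
//     src_prefix_dir: "/bigstore/bucket/foo"
//   }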

// Configuration used in selecting nodes for golden triples generation.
message GoldenTripleSelection {
  // Pick one of every this many SVObs nodes into golden triples.
  int32 svobs_nodes_pick_one_in = 3;

  reserved 1, 2;
}
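
// Example (hypothetical value): sample roughly one of every 1000 SVObs nodes
// into golden triples.
//
//   golden_triple_selection {
//     svobs_nodes_pick_one_in: 1000
//   }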

// A manifest is a collection of imports, each having a name, provenance URL,
// provenance description, and a list of CNS path patterns or external table
// info.
message DataCommonsManifest {
  // Categories of imports.
  // LINT.IfChange
  enum ImportCategory {
    IMPORT_CATEGORY_UNKNOWN = 0;

    // Import with Graph Schema nodes.
    SCHEMA = 1;

    // Import with place nodes.
    PLACE = 2;

    // Import with StatVar nodes having curated DCIDs.
    CURATED_STATVAR = 3;

    // Import with StatVar nodes having auto-generated DCIDs.
    GENERATED_STATVAR = 4;

    // Aggregated stats.
    AGGREGATED_STATS = 7;

    // Stats, which contain observations.
    STATS = 8;

    // Import with entity nodes that are not Schema/StatVars, observations, or
    // places.
    ENTITY = 9;

    // Imputed stats.
    IMPUTED_STATS = 10;

    // Intermediate stats that are used as inputs to derived graph
    // computations, but are not built into the cache.
    // This could be either a direct import from source, or some kind of
    // aggregation that doesn't get built into the cache.
    INTERMEDIATE_STATS = 11;

    // Schema import from custom DC.
    CUSTOM_SCHEMA = 12;

    reserved 5, 6;
  }
  // LINT.ThenChange(//depot/google3/datacommons/import/mcf_vocab.h)

  // Next ID: 32.
  message Import {
    string import_name = 1;
    string provenance_url = 2;
    string provenance_description = 6;
    ImportCategory category = 20;

    // The list of Import Groups that this import is part of. Each entry must
    // refer to a valid ImportGroup.name. If empty, then this import is part
    // of all groups.
    repeated string import_groups = 23;

    // When input is node MCF, "mcf_url" is set.
    repeated string mcf_url = 3;

    // The URL of the optimized MCF in proto (TFRecord) format.
    repeated string mcf_proto_url = 25;

    // GCS path prefix for resolved MCFs.
    string mcf_gcs_path_prefix = 31;

    // In cases like Core Geo MCFs, the MCFs are versioned in Piper, but that
    // version is not directly accessed by the binary (owing to memory
    // limitations). Instead, we keep a copy of these files in CNS and provide
    // that path in "mcf_url". Set this bool to true to mark that situation.
    //
    // If set, there exist tests (manifest_checker_test.cc) to ensure the MCFs
    // are in sync and tooling (sync_replicated_mcfs.cc) to update the CNS
    // copy.
    bool is_mcf_replicated_from_piper = 18;

    // When input comes from TMCF/CSV, this is set.
    repeated ExternalTable table = 4;
    string curator_email = 5;

    // Information about resolution.
    ResolutionInfo resolution_info = 9;

    // Dates associated with stats. All dates are in ISO8601 format.
    //
    // NOTE: For datasets that are released more frequently than a month,
    // please skip |end_date_in_kg|, |end_date_in_source|, and
    // |next_release_date|.
    //
    // Earliest value of observationDate in the KG.
    string start_date_in_kg = 10;

    // Latest value of observationDate in the KG.
    string end_date_in_kg = 11;

    // Latest value of observationDate that is available at the source.
    // If end_date_in_kg < end_date_in_source, the dataset needs refresh.
    string end_date_in_source = 21;

    // Data Release Schedule
    //
    // Release frequency of the import source.
    string release_frequency = 12;

    // Next date when the data source will be released. If this date is in the
    // past, the dataset needs refresh.
    string next_release_date = 13;

    // The URLs from where the data can be accessed/downloaded.
    repeated string data_download_url = 15;

    // As part of which import group do we regenerate the MCFs?
    //
    // The branch controllers can convert table to MCF or resolve MCF if this
    // field is set to the appropriate import group. Not setting this field
    // disables any such work; instead, the resolved MCF (in "mcf_url") is
    // used as is.
    string automated_mcf_generation_by = 24;

    // Configuration for generating golden triples.
    GoldenTripleSelection golden_triple_selection = 14;

    // Whether unresolved MCFs are missing.
    // These are all cases where the resolved MCFs are PopObs instead of SVObs.
    // For SVObs, the Flume pipeline will generate unresolved MCFs from
    // CSV+TMCF automatically.
    bool is_unresolved_mcf_missing = 22;

    // Indicates which MCF files to copy from GCS or other source before
    // resolving.
    // REQUIRES: This is an MCF-only import with |table| being empty.
    DownloadConfig mcf_download_config = 17;

    // DatasetInfo that this import corresponds to. Must refer to a valid
    // DatasetInfo.name.
    string dataset_name = 26;

    // Metadata files that contain curated resolved MCF for the import. These
    // MCFs provide a way for extending Provenance, Dataset and Source nodes
    // with new properties, without requiring any manifest proto and pipeline
    // changes.
    repeated string metadata_mcf_url = 30;

    reserved 7, 8, 16, 19, 27, 28, 29;
  }
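
  // A minimal textproto sketch of a TMCF/CSV import. Only the field names
  // come from this message; every value below is hypothetical.
  //
  //   import {
  //     import_name: "ExampleCityStats"
  //     provenance_url: "https://example.org/data"
  //     provenance_description: "Example city statistics."
  //     category: STATS
  //     curator_email: "someone@example.com"
  //     table {
  //       table_name: "CityStats"
  //       mapping_path: "/cns/path/to/city_stats.tmcf"
  //       csv_path: "/cns/path/to/city_stats.csv"
  //     }
  //     resolution_info {
  //       uses_id_resolver: true
  //     }
  //   }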

  repeated Import import = 1;

  // Import Group is a collection of imports that are built together into a
  // single cache for serving.
  message ImportGroup {
    // Name of the import group.
    string name = 1;

    // Description.
    string description = 2;
    bool is_custom_dc = 5;

    reserved 3, 4;
  }

  repeated ImportGroup import_groups = 2;
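
  // Example wiring (names are hypothetical): an import opts into a group by
  // listing the group's name in Import.import_groups.
  //
  //   import {
  //     import_name: "ExampleCityStats"
  //     import_groups: "frequent"
  //   }
  //   import_groups {
  //     name: "frequent"
  //     description: "Imports rebuilt on a frequent schedule."
  //   }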

  // Represents a dataset that one or more imports come from. This is used
  // to display on the Data Sources page
  // (https://docs.datacommons.org/datasets/).
  message DatasetInfo {
    // Name of the dataset.
    string name = 1;

    // URL of the dataset to link to.
    string url = 2;

    // Markdown description of the data source.
    oneof description {
      // Inline markdown string containing the description.
      string description_md = 3;

      // Name of a markdown file containing the description, if used instead
      // of |description_md|.
      string description_file = 4;
    }

    // Verticals that should include this data source.
    repeated string verticals = 5;
  }

  // Represents a source which contains zero or more datasets. For sources with
  // zero distinct child datasets (i.e., the source directly corresponds to a
  // dataset), the dataset details will be contained in the source. This is
  // used to display on the Data Sources page
  // (https://docs.datacommons.org/datasets/).
  message DatasetSource {
    // Name of the source.
    string name = 1;

    // URL of the source to link to.
    string url = 2;

    // Markdown description of the source.
    string header_md = 3;

    // Markdown description of any terms of service.
    string footer_md = 4;

    // Datasets associated with this source.
    repeated DatasetInfo datasets = 5;
  }
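
  // A textproto sketch (all values hypothetical) of a source with one dataset:
  //
  //   dataset_source {
  //     name: "Example Agency"
  //     url: "https://example.org"
  //     header_md: "Data published by the Example Agency."
  //     datasets {
  //       name: "Example Survey"
  //       url: "https://example.org/survey"
  //       description_md: "Annual survey statistics."
  //     }
  //   }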

  repeated DatasetSource dataset_source = 4;

  // Paths to curated import MCF. These MCFs provide a way for extending
  // Provenance, Dataset and Source nodes with new properties, without
  // requiring any manifest proto and pipeline changes, as well as
  // source-specific metadata about StatisticalVariables.
  repeated string import_metadata_mcf = 5;

  reserved 3;
}

// A snapshot of files in a single import. Each file has a length and mtime,
// which act as cheap alternatives to a full content checksum. This snapshot
// gets programmatically generated and validated.
message ImportSnapshot {
  message FileStat {
    string path = 1;
    int64 length = 2;
    double mtime_secs = 3;
  }

  string import_name = 1;
  repeated FileStat stat = 2;
}
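
// A textproto sketch of an ImportSnapshot (hypothetical values):
//
//   import_name: "ExampleCityStats"
//   stat {
//     path: "/cns/path/to/city_stats.csv"
//     length: 123456
//     mtime_secs: 1700000000.0
//   }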