33import os
44import pandas as pd
55import re
6- import shutil
76
87from absl import logging
98from google .cloud import storage
10- from googleapiclient .discovery import build
119
1210
13- def load_mcf_file (file : str ) -> pd . DataFrame :
14- """ Reads an MCF text file and returns it as a dataframe ."""
11+ def load_mcf_file (file : str ):
12+ """ Reads an MCF text file and returns mcf nodes ."""
1513 mcf_file = open (file , 'r' , encoding = 'utf-8' )
1614 mcf_contents = mcf_file .read ()
1715 mcf_file .close ()
@@ -27,25 +25,22 @@ def load_mcf_file(file: str) -> pd.DataFrame:
2725 if parsed_line is not None :
2826 current_mcf_node [parsed_line .group (1 )] = parsed_line .group (2 )
2927 if current_mcf_node :
30- if current_mcf_node ['typeOf' ] == 'dcid:StatVarObservation' :
31- mcf_nodes .append (current_mcf_node )
32- else :
33- logging .warning (
34- f'Ignoring node of type:{ current_mcf_node ["typeOf" ]} ' )
35- df = pd .DataFrame (mcf_nodes )
36- return df
28+ mcf_nodes .append (current_mcf_node )
29+
30+ logging .info (f'Loaded { len (mcf_nodes )} nodes from file { file } ' )
31+ return mcf_nodes
3732
3833
def load_mcf_files(path: str) -> list:
    """Loads all sharded MCF files matching the given path pattern and
    returns a single combined list of MCF nodes.

    Args:
        path: local file path or glob wildcard pattern matching MCF files.

    Returns:
        Combined list of MCF nodes (dicts) from all matched files.
        (Annotation fixed: the function returns a list, not a DataFrame.)
    """
    node_list = []
    filenames = glob.glob(path)
    logging.info(f'Loading {len(filenames)} files from path {path}')
    for filename in filenames:
        nodes = load_mcf_file(filename)
        node_list.extend(nodes)
    return node_list
4944
5045
5146def load_csv_data (path : str , tmp_dir : str ) -> pd .DataFrame :
@@ -66,72 +61,33 @@ def load_csv_data(path: str, tmp_dir: str) -> pd.DataFrame:
6661
def write_csv_data(df: pd.DataFrame, dest: str, file: str, tmp_dir: str):
    """Writes a dataframe to a CSV file with the given path.

    For a GCS destination (gs://...) the CSV is first staged under
    tmp_dir and then uploaded; otherwise it is written directly into
    the dest directory.
    """
    is_gcs = dest.startswith('gs://')
    # Stage in tmp_dir for GCS uploads; write in place for local dests.
    base_dir = tmp_dir if is_gcs else dest
    path = os.path.join(base_dir, file)
    with open(path, mode='w', encoding='utf-8') as out_file:
        df.to_csv(out_file, index=False, mode='w', header=True)
    if is_gcs:
        upload_output_data(path, dest)
11072
11173
def upload_output_data(src: str, dest: str):
    """Uploads all local files matching src to the GCS destination.

    Args:
        src: local file path or glob pattern of files to upload.
        dest: GCS destination prefix of the form gs://bucket/path.
    """
    gcs_client = storage.Client()
    bucket_name = dest.split('/')[2]
    target_bucket = gcs_client.get_bucket(bucket_name)
    # Strip the gs://<bucket>/ prefix once; blobs are named relative to it.
    prefix_len = len('gs://' + bucket_name + '/')
    for filepath in glob.iglob(src):
        filename = os.path.basename(filepath)
        logging.info('Uploading %s to %s', filename, dest)
        blob = target_bucket.blob(dest[prefix_len:] + '/' + filename)
        blob.upload_from_filename(filepath)
12884
12985
130- def get_gcs_data (uri : str , tmp_dir : str ) -> str :
86+ def get_gcs_data (uri : str , dest_dir : str ) -> str :
13187 """ Downloads files from GCS and copies them to local.
13288 Args:
13389 uri: single file path or wildcard format
134- tmp_dir : destination folder
90+ dest_dir : destination folder
13591 Returns:
13692 path to the output file/folder
13793 """
@@ -141,20 +97,23 @@ def get_gcs_data(uri: str, tmp_dir: str) -> str:
14197 dirname = os .path .dirname (file_pat )
14298 for blob in bucket .list_blobs (prefix = dirname ):
14399 if fnmatch .fnmatch (blob .name , file_pat ):
144- path = blob .name .replace ('/' , '_' )
145- blob .download_to_filename (os .path .join (tmp_dir , path ))
146- return os .path .join (tmp_dir , file_pat .replace ('/' , '_' ))
100+ dest_file = os .path .join (dest_dir , blob .name )
101+ os .makedirs (os .path .dirname (dest_file ), exist_ok = True )
102+ blob .download_to_filename (dest_file )
103+ return os .path .join (dest_dir , file_pat )
147104
148105
def load_data(path: str, tmp_dir: str) -> list:
    """Loads MCF data from the given path.

    Args:
        path: local or GCS path (single file or wildcard format).
        tmp_dir: temporary folder used to stage files downloaded from GCS.

    Returns:
        Combined list of MCF nodes from all matched files.
        (Docstring fixed: the function returns a node list, not a dataframe.)
    """
    if path.startswith('gs://'):
        os.makedirs(tmp_dir, exist_ok=True)
        # Rewrite path to the local staging copy before loading.
        path = get_gcs_data(path, tmp_dir)

    mcf_nodes = load_mcf_files(path)
    return mcf_nodes
0 commit comments