Skip to content

Commit f93511b

Browse files
authored
Differ updates (datacommonsorg#1513)
* Support for entity nodes
* Use flex template for dataflow job
* Remove local dataflow mode
* Remove series analysis
* Other fixes and clean up
1 parent 15cb39d commit f93511b

28 files changed

Lines changed: 971 additions & 460 deletions

import-automation/executor/app/configs.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,6 @@ class ExecutorConfig:
117117
local_repo_dir: str = '/data'
118118
# Location of the import tool jar.
119119
import_tool_path: str = '/import-tool.jar'
120-
# Location of the differ tool jar.
121-
differ_tool_path: str = '/differ-tool.jar'
122120
# Cloud workflow id.
123121
cloud_workflow_id: str = 'import-automation-workflow'
124122
# Maximum time a user script can run for in seconds.

import-automation/executor/app/executor/import_executor.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,7 @@ def _invoke_import_validation(self, repo_dir: str, relative_import_dir: str,
452452
validation_output_file = os.path.join(validation_output_path,
453453
'validation_output.csv')
454454
differ_output = os.path.join(validation_output_path,
455-
'point_analysis_summary.csv')
455+
'obs_diff_summary.csv')
456456

457457
# Invoke differ and validation scripts.
458458
differ_output_file = ''
@@ -462,11 +462,10 @@ def _invoke_import_validation(self, repo_dir: str, relative_import_dir: str,
462462
differ = ImportDiffer(current_data=current_data_path,
463463
previous_data=previous_data_path,
464464
output_location=validation_output_path,
465-
differ_tool='',
466465
project_id=self.config.gcp_project_id,
467466
job_name=differ_job_name,
468467
file_format='mcf',
469-
runner_mode='native')
468+
runner_mode='local')
470469
differ.run_differ()
471470
differ_output_file = differ_output
472471
else:

import-automation/executor/config_override_test.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
"user_script_timeout": 3600,
77
"disable_email_notifications": true,
88
"import_tool_path" : "/tmp/import-tool/import-tool.jar",
9-
"differ_tool_path" : "/tmp/import-tool/differ-tool.jar",
109
"gcp_project_id": "datcom-ci",
1110
"gcs_volume_mount_dir": "/tmp",
1211
"storage_prod_bucket_name": "datcom-import-test"

import-automation/executor/run_import.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -279,8 +279,6 @@ function run_import_executor {
279279
mkdir -p $TMP_DIR/import-tool
280280
run_cmd wget "https://storage.googleapis.com/datacommons_public/import_tools/import-tool.jar" \
281281
-O $TMP_DIR/import-tool/import-tool.jar
282-
run_cmd wget "https://storage.googleapis.com/datacommons_public/import_tools/differ-tool.jar" \
283-
-O $TMP_DIR/import-tool/differ-tool.jar
284282
fi
285283

286284
run_cmd $SCRIPT_DIR/run_local_executor.sh \

tools/import_differ/README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
# Import Differ
22

3-
This utility generates a diff (point and series analysis) of two versions of the same dataset for import analysis.
3+
This utility generates a diff of two versions of a dataset for import analysis.
44

55
**Usage**
66

77
***Prerequisites***
88
- Python/Pandas is installed for local runner mode.
9-
- Java is installed for local runner mode.
109
- gcloud ADC is configured for cloud runner mode.
1110

1211
```
@@ -18,7 +17,7 @@ python import_differ.py --current_data=<path> --previous_data=<path> --output_lo
1817
- previous\_data: Path to the previous data (wildcard on local/GCS supported).
1918
- output\_location: Path to the output data folder (local/GCS).
2019
- file\_format: Format of the input data (mcf,tfrecord).
21-
- runner\_mode: Runner mode: native (Python) / local (Dataflow in local mode) / cloud (Dataflow in Cloud).
20+
- runner\_mode: Runner mode: local (Python) / cloud (Dataflow in Cloud).
2221
- project\_id: GCP project Id for the dataflow job.
2322
- job\_name: Name of the differ dataflow job.
2423

@@ -35,7 +34,8 @@ Summary output generated is of the form below showing counts of differences for
3534
|3|dcid:var4|0|2|0|
3635

3736
Detailed diff output is written to files for further analysis. Sample result files can be found under folder 'test/results'.
38-
- point\_analysis\_summary.csv: diff summry for point analysis
39-
- point\_analysis\_results.csv: detailed results for point analysis
40-
- series\_analysis\_summary.csv: diff summry for series analysis
41-
- series\_analysis\_results.csv: detailed results for series analysis
37+
- obs\_diff\_summary.csv: diff summary for observation analysis
38+
- obs\_diff\_samples.csv: sample diff for observation analysis
39+
- obs\_diff\_log.csv: diff log for observations
40+
- schema\_diff\_summary.csv: diff summary for schema analysis
41+
- schema\_diff\_log.csv: diff log for schema nodes

tools/import_differ/differ_utils.py

Lines changed: 41 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,13 @@
33
import os
44
import pandas as pd
55
import re
6-
import shutil
76

87
from absl import logging
98
from google.cloud import storage
10-
from googleapiclient.discovery import build
119

1210

13-
def load_mcf_file(file: str) -> pd.DataFrame:
14-
""" Reads an MCF text file and returns it as a dataframe."""
11+
def load_mcf_file(file: str):
12+
""" Reads an MCF text file and returns mcf nodes."""
1513
mcf_file = open(file, 'r', encoding='utf-8')
1614
mcf_contents = mcf_file.read()
1715
mcf_file.close()
@@ -27,25 +25,22 @@ def load_mcf_file(file: str) -> pd.DataFrame:
2725
if parsed_line is not None:
2826
current_mcf_node[parsed_line.group(1)] = parsed_line.group(2)
2927
if current_mcf_node:
30-
if current_mcf_node['typeOf'] == 'dcid:StatVarObservation':
31-
mcf_nodes.append(current_mcf_node)
32-
else:
33-
logging.warning(
34-
f'Ignoring node of type:{current_mcf_node["typeOf"]}')
35-
df = pd.DataFrame(mcf_nodes)
36-
return df
28+
mcf_nodes.append(current_mcf_node)
29+
30+
logging.info(f'Loaded {len(mcf_nodes)} nodes from file {file}')
31+
return mcf_nodes
3732

3833

3934
def load_mcf_files(path: str) -> pd.DataFrame:
4035
""" Loads all sharded mcf files in the given directory and
41-
returns a single combined dataframe."""
42-
df_list = []
36+
returns a combined MCF node list."""
37+
node_list = []
4338
filenames = glob.glob(path)
39+
logging.info(f'Loading {len(filenames)} files from path {path}')
4440
for filename in filenames:
45-
df = load_mcf_file(filename)
46-
df_list.append(df)
47-
result = pd.concat(df_list, ignore_index=True)
48-
return result
41+
nodes = load_mcf_file(filename)
42+
node_list.extend(nodes)
43+
return node_list
4944

5045

5146
def load_csv_data(path: str, tmp_dir: str) -> pd.DataFrame:
@@ -66,72 +61,33 @@ def load_csv_data(path: str, tmp_dir: str) -> pd.DataFrame:
6661

6762
def write_csv_data(df: pd.DataFrame, dest: str, file: str, tmp_dir: str):
6863
""" Writes a dataframe to a CSV file with the given path."""
69-
tmp_file = os.path.join(tmp_dir, file)
70-
with open(tmp_file, mode='w', encoding='utf-8') as out_file:
71-
df.to_csv(out_file, index=False, mode='w', header=True)
72-
upload_output_data(tmp_file, dest)
73-
74-
75-
def launch_dataflow_job(project: str, job: str, current_data: str,
76-
previous_data: str, file_format: str,
77-
output_location: str) -> str:
78-
parameters = {
79-
'currentData': current_data,
80-
'previousData': previous_data,
81-
'outputLocation': output_location + '/diff',
82-
}
83-
if file_format == 'mcf':
84-
logging.info('Using mcf file format')
85-
template = 'gs://datcom-dataflow/templates/differ-mcf'
64+
if dest.startswith('gs://'):
65+
path = os.path.join(tmp_dir, file)
8666
else:
87-
logging.info('Using tfrecord file format')
88-
template = 'gs://datcom-dataflow/templates/differ-tfr'
89-
parameters['useOptimizedGraphFormat'] = 'true'
90-
91-
dataflow = build("dataflow", "v1b3")
92-
request = (dataflow.projects().templates().launch(
93-
projectId=project,
94-
gcsPath=template,
95-
body={
96-
"jobName": job,
97-
"parameters": parameters,
98-
},
99-
))
100-
response = request.execute()
101-
job_id = response['job']['id']
102-
return f'https://pantheon.corp.google.com/dataflow/jobs/{job_id}?project={project}'
103-
104-
105-
def get_job_status(project: str, job: str) -> str:
106-
dataflow = build("dataflow", "v1b3")
107-
request = (dataflow.projects().jobs().list(projectId=project, name=job))
108-
response = request.execute()
109-
return response['jobs'][0]['currentState']
67+
path = os.path.join(dest, file)
68+
with open(path, mode='w', encoding='utf-8') as out_file:
69+
df.to_csv(out_file, index=False, mode='w', header=True)
70+
if dest.startswith('gs://'):
71+
upload_output_data(path, dest)
11072

11173

11274
def upload_output_data(src: str, dest: str):
113-
if dest.startswith('gs://'):
114-
client = storage.Client()
115-
bucket_name = dest.split('/')[2]
116-
bucket = client.get_bucket(bucket_name)
117-
for filepath in glob.iglob(src):
118-
filename = os.path.basename(filepath)
119-
logging.info('Uploading %s to %s', filename, dest)
120-
blobname = dest[len('gs://' + bucket_name + '/'):] + '/' + filename
121-
blob = bucket.blob(blobname)
122-
blob.upload_from_filename(filepath)
123-
else:
124-
os.makedirs(dest, exist_ok=True)
125-
for filepath in glob.iglob(src):
126-
shutil.copyfile(filepath,
127-
os.path.join(dest, os.path.basename(filepath)))
75+
client = storage.Client()
76+
bucket_name = dest.split('/')[2]
77+
bucket = client.get_bucket(bucket_name)
78+
for filepath in glob.iglob(src):
79+
filename = os.path.basename(filepath)
80+
logging.info('Uploading %s to %s', filename, dest)
81+
blobname = dest[len('gs://' + bucket_name + '/'):] + '/' + filename
82+
blob = bucket.blob(blobname)
83+
blob.upload_from_filename(filepath)
12884

12985

130-
def get_gcs_data(uri: str, tmp_dir: str) -> str:
86+
def get_gcs_data(uri: str, dest_dir: str) -> str:
13187
""" Downloads files from GCS and copies them to local.
13288
Args:
13389
uri: single file path or wildcard format
134-
tmp_dir: destination folder
90+
dest_dir: destination folder
13591
Returns:
13692
path to the output file/folder
13793
"""
@@ -141,20 +97,23 @@ def get_gcs_data(uri: str, tmp_dir: str) -> str:
14197
dirname = os.path.dirname(file_pat)
14298
for blob in bucket.list_blobs(prefix=dirname):
14399
if fnmatch.fnmatch(blob.name, file_pat):
144-
path = blob.name.replace('/', '_')
145-
blob.download_to_filename(os.path.join(tmp_dir, path))
146-
return os.path.join(tmp_dir, file_pat.replace('/', '_'))
100+
dest_file = os.path.join(dest_dir, blob.name)
101+
os.makedirs(os.path.dirname(dest_file), exist_ok=True)
102+
blob.download_to_filename(dest_file)
103+
return os.path.join(dest_dir, file_pat)
147104

148105

149-
def load_data(path: str, tmp_dir: str) -> pd.DataFrame:
150-
""" Loads data from the given path and returns as a dataframe.
106+
def load_data(path: str, tmp_dir: str) -> list:
107+
""" Loads data from the given path and returns dataframe.
151108
Args:
152109
path: local or gcs path (single file or wildcard format)
153-
tmp_dir: destination folder
110+
tmp_dir: temporary folder
154111
Returns:
155-
dataframe with the input data
112+
combined list of mcf nodes
156113
"""
157114
if path.startswith('gs://'):
158115
os.makedirs(tmp_dir, exist_ok=True)
159116
path = get_gcs_data(path, tmp_dir)
160-
return load_mcf_files(path)
117+
118+
mcf_nodes = load_mcf_files(path)
119+
return mcf_nodes
1.19 MB
Binary file not shown.

0 commit comments

Comments (0)