-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathetl.py
More file actions
29 lines (21 loc) · 827 Bytes
/
etl.py
File metadata and controls
29 lines (21 loc) · 827 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import json
def extract_ids_from_ndjson(input_file, output_file, input_file_edge=None):
    """Collect the ``_id`` of every NDJSON record and write them as a JSON array.

    Each non-blank line of the input file(s) is parsed as one JSON object and
    its ``_id`` value is collected, in file order. The IDs are written to
    ``output_file`` as a single compact JSON array of strings, e.g.
    ``["a","b"]``.

    Args:
        input_file: Path of the first NDJSON file to read.
        output_file: Path of the file to (over)write with the JSON array.
        input_file_edge: Optional path of a second NDJSON file whose IDs are
            appended after those of ``input_file``. When omitted, a
            module-level global named ``input_file_edge`` is used if one
            exists (the way this function was originally wired up);
            otherwise only ``input_file`` is read.

    Raises:
        OSError: If an input file cannot be opened or the output file cannot
            be written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``_id`` key.
    """
    if input_file_edge is None:
        # Backward compatibility: existing callers pass only two arguments and
        # rely on a module-level ``input_file_edge`` being picked up here.
        input_file_edge = globals().get('input_file_edge')

    sources = [input_file]
    if input_file_edge is not None:
        sources.append(input_file_edge)

    ids = []
    for path in sources:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing empty line), which
                    # would otherwise make json.loads fail.
                    continue
                ids.append(json.loads(line)['_id'])

    # Every ID is emitted as a string, as before, but json.dump does the
    # quoting so IDs containing quotes or backslashes still yield valid JSON.
    # ensure_ascii=False and the compact separators keep the output for plain
    # IDs identical to the previous hand-built '["a","b"]' format.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(
            [str(record_id) for record_id in ids],
            f,
            ensure_ascii=False,
            separators=(',', ':'),
        )
# File locations used by this script (relative to the working directory).
input_file = 'OUT/Observation.vertex.json'
# Not passed in the call below: extract_ids_from_ndjson reads this
# module-level name directly, so it must stay defined under this exact name.
input_file_edge = 'OUT/Observation.in.edge.json'
output_file = 'output.json'

# Run the extraction: gather the IDs and write them to output_file.
extract_ids_from_ndjson(input_file=input_file, output_file=output_file)