22
33import com .google .cloud .ByteArray ;
44import com .google .cloud .spanner .Mutation ;
5+ import com .google .cloud .storage .Blob ;
6+ import com .google .cloud .storage .BlobId ;
7+ import com .google .cloud .storage .Storage ;
8+ import com .google .cloud .storage .StorageOptions ;
59import java .io .Serializable ;
10+ import java .nio .charset .StandardCharsets ;
611import java .util .ArrayList ;
712import java .util .Collections ;
813import java .util .List ;
914import java .util .Map ;
1015import org .apache .beam .sdk .Pipeline ;
1116import org .apache .beam .sdk .metrics .Counter ;
17+ import org .apache .beam .sdk .transforms .Create ;
1218import org .apache .beam .sdk .transforms .DoFn ;
1319import org .apache .beam .sdk .transforms .Flatten ;
1420import org .apache .beam .sdk .transforms .ParDo ;
1521import org .apache .beam .sdk .values .KV ;
1622import org .apache .beam .sdk .values .PCollection ;
1723import org .apache .beam .sdk .values .PCollectionList ;
24+ import org .apache .beam .sdk .values .TypeDescriptor ;
1825import org .datacommons .Storage .Observations ;
1926import org .datacommons .ingestion .spanner .SpannerClient ;
2027import org .datacommons .pipeline .util .PipelineUtils ;
2532import org .datacommons .proto .Mcf .McfStatVarObsSeries .StatVarObs ;
2633import org .datacommons .proto .Mcf .ValueType ;
2734import org .datacommons .util .GraphUtils ;
35+ import org .datacommons .util .McfUtil ;
2836import org .slf4j .Logger ;
2937import org .slf4j .LoggerFactory ;
3038
@@ -42,8 +50,8 @@ public static List<Node> graphToNodes(McfGraph graph, Counter mcfNodesWithoutTyp
4250 // Generate corresponding node
4351 Map <String , McfGraph .Values > pv = pvs .getPvsMap ();
4452 Node .Builder node = Node .builder ();
45- node .subjectId (nodeEntry .getKey ());
46- node .value (nodeEntry .getKey ());
53+ node .subjectId (McfUtil . stripNamespace ( nodeEntry .getKey () ));
54+ node .value (McfUtil . stripNamespace ( nodeEntry .getKey () ));
4755 node .name (GraphUtils .getPropertyValue (pv , "name" ));
4856 List <String > types = GraphUtils .getPropertyValues (pv , "typeOf" );
4957 if (types .isEmpty ()) {
@@ -74,14 +82,42 @@ public static List<Node> graphToNodes(McfGraph graph, Counter mcfNodesWithoutTyp
7482 return nodes ;
7583 }
7684
85+ public static PCollection <McfGraph > getProvenance (
86+ String bucketName ,
87+ String importName ,
88+ String provenanceFile ,
89+ String meatadataFile ,
90+ Pipeline p ) {
91+ LOGGER .info ("Reading provenance mcf from {} {} {}" , bucketName , provenanceFile , meatadataFile );
92+ Storage storage = StorageOptions .getDefaultInstance ().getService ();
93+ Blob blob = storage .get (BlobId .of (bucketName , meatadataFile ));
94+ List <McfGraph > mcfList = new ArrayList <>();
95+ if (blob != null && blob .exists ()) {
96+ String s = new String (blob .getContent (), StandardCharsets .UTF_8 );
97+ mcfList .add (GraphUtils .convertToGraph (s ));
98+ }
99+ blob = storage .get (BlobId .of (bucketName , provenanceFile ));
100+ if (blob != null && blob .exists ()) {
101+ String s = new String (blob .getContent (), StandardCharsets .UTF_8 );
102+ mcfList .add (GraphUtils .convertToGraph (s ));
103+ }
104+ if (mcfList .isEmpty ()) {
105+ String defaultProvenance =
106+ "Node: dcid:dc/base/" + importName + "\n " + "typeOf: dcid:Provenance\n " ;
107+ mcfList .add (GraphUtils .convertToGraph (defaultProvenance ));
108+ }
109+ return p .apply (Create .of (mcfList ).withType (TypeDescriptor .of (McfGraph .class )));
110+ }
111+
77112 public static List <Edge > graphToEdges (McfGraph graph , String provenance ) {
78113 List <Edge > edges = new ArrayList <>();
79114 for (Map .Entry <String , PropertyValues > nodeEntry : graph .getNodesMap ().entrySet ()) {
80115 PropertyValues pvs = nodeEntry .getValue ();
81116 if (!GraphUtils .isObservation (pvs )) {
82117 Map <String , McfGraph .Values > pv = pvs .getPvsMap ();
83118 // String provenance = GraphUtils.getPropertyValue(pv, "provenance");
84- String subjectId = nodeEntry .getKey (); // Use the map key as the subjectId
119+ String subjectId =
120+ McfUtil .stripNamespace (nodeEntry .getKey ()); // Use the map key as the subjectId
85121 for (Map .Entry <String , McfGraph .Values > entry : pv .entrySet ()) { // Iterate over properties
86122 for (TypedValue val : entry .getValue ().getTypedValuesList ()) {
87123 Edge .Builder edge = Edge .builder ();
@@ -177,6 +213,9 @@ public static PCollection<KV<String, Mutation>> graphToNodes(
177213 public void processElement (
178214 @ Element McfGraph element , OutputReceiver <KV <String , Mutation >> receiver ) {
179215 List <Node > nodes = graphToNodes (element , mcfNodesWithoutTypeCounter );
216+ // for (Node node : nodes) {
217+ // LOGGER.info("Processing node: {}", node);
218+ // }
180219 List <KV <String , Mutation >> mutations =
181220 spannerClient .toGraphKVMutations (nodes , Collections .emptyList ());
182221 mutations .stream ()
@@ -202,6 +241,9 @@ public static PCollection<KV<String, Mutation>> graphToEdges(
202241 public void processElement (
203242 @ Element McfGraph element , OutputReceiver <KV <String , Mutation >> receiver ) {
204243 List <Edge > edges = graphToEdges (element , provenance );
244+ // for (Edge edge : edges) {
245+ // LOGGER.info("Processing Edge: {}", edge);
246+ // }
205247 List <KV <String , Mutation >> mutations =
206248 spannerClient .toGraphKVMutations (Collections .emptyList (), edges );
207249 mutations .stream ()
0 commit comments