Skip to content

Commit 2f58a00

Browse files
committed
Improved file reading
1 parent 3647fad commit 2f58a00

2 files changed

Lines changed: 136 additions & 4 deletions

File tree

nodes/src/main/java/org/nodes/data/RDF.java

Lines changed: 134 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,95 @@
22

33
import static org.nodes.data.RDF.simplify;
44

5+
import java.io.BufferedInputStream;
56
import java.io.File;
7+
import java.io.FileInputStream;
8+
import java.io.FileNotFoundException;
9+
import java.io.IOException;
10+
import java.io.InputStream;
611
import java.util.ArrayList;
712
import java.util.HashMap;
13+
import java.util.LinkedHashMap;
814
import java.util.List;
915
import java.util.Map;
1016
import java.util.regex.Pattern;
1117

1218
import org.nodes.DTGraph;
1319
import org.nodes.DTLink;
1420
import org.nodes.DTNode;
21+
import org.nodes.LightUGraph;
1522
import org.nodes.MapDTGraph;
1623
import org.nodes.Node;
24+
import org.nodes.UGraph;
25+
import org.nodes.UNode;
1726
import org.openrdf.model.Statement;
1827
import org.openrdf.rio.RDFFormat;
28+
import org.openrdf.rio.RDFParseException;
29+
import org.openrdf.rio.RDFParser;
30+
import org.openrdf.rio.Rio;
31+
import org.openrdf.rio.helpers.RDFHandlerBase;
32+
import org.openrdf.rio.turtle.TurtleParser;
33+
import org.rdfhdt.hdt.exceptions.NotFoundException;
34+
import org.rdfhdt.hdt.hdt.HDT;
35+
import org.rdfhdt.hdt.hdt.HDTManager;
36+
import org.rdfhdt.hdt.triples.IteratorTripleString;
37+
import org.rdfhdt.hdt.triples.TripleString;
1938

2039
import nl.peterbloem.kit.Functions;
2140
import nl.peterbloem.kit.Global;
2241

2342
public class RDF
2443
{
44+
public static MapDTGraph<String, String> readHDT(File file)
45+
throws FileNotFoundException, IOException
46+
{
47+
MapDTGraph<String, String> graph = new MapDTGraph<String, String>();
48+
49+
// Load HDT file. NOTE: Use loadHDT() if you don't need ?P?, ?PO or ??O queries
50+
HDT hdt = HDTManager.loadHDT(
51+
new BufferedInputStream(new FileInputStream(file)), null);
52+
53+
int i = 0;
54+
try {
55+
// Search pattern: Empty string means "any"
56+
IteratorTripleString it = hdt.search("", "", "");
57+
DTNode<String, String> node1, node2;
58+
59+
while(it.hasNext()) {
60+
TripleString ts = it.next();
61+
62+
String subject = ts.getSubject().toString(),
63+
predicate = ts.getPredicate().toString(),
64+
object = ts.getObject().toString();
65+
66+
node1 = graph.node(subject);
67+
node2 = graph.node(object);
68+
69+
if (node1 == null)
70+
node1 = graph.add(subject);
71+
72+
73+
if (node2 == null)
74+
node2 = graph.add(object);
75+
76+
node1.connect(node2, predicate);
77+
78+
Functions.dot(i, (int)it.estimatedNumResults());
79+
i++;
80+
}
81+
} catch (NotFoundException e)
82+
{
83+
// File must be empty, return empty graph
84+
} finally
85+
{
86+
// IMPORTANT: Free resources
87+
hdt.close();
88+
}
89+
90+
return graph;
91+
}
92+
93+
2594
/**
2695
* Reads the given file into a graph.
2796
*
@@ -30,12 +99,22 @@ public class RDF
3099
*/
31100
public static MapDTGraph<String, String> read(File file)
32101
{
33-
return read(file, null);
102+
return read(file, RDFFormat.RDFXML);
103+
}
104+
105+
public static MapDTGraph<String, String> read(File file, RDFFormat format)
106+
{
107+
return read(file, null, format);
34108
}
35109

36110
public static MapDTGraph<String, String> read(File file, List<String> linkWhitelist)
37111
{
38-
RDFDataSet testSet = new RDFFileDataSet(file, RDFFormat.RDFXML);
112+
return read(file, null, RDFFormat.RDFXML);
113+
}
114+
115+
public static MapDTGraph<String, String> read(File file, List<String> linkWhitelist, RDFFormat format)
116+
{
117+
RDFDataSet testSet = new RDFFileDataSet(file, format);
39118

40119
List<Statement> triples = testSet.getStatements(null, null, null, false);
41120

@@ -82,8 +161,9 @@ public static MapDTGraph<String, String> createDirectedGraph(
82161
MapDTGraph<String, String> graph = new MapDTGraph<String, String>();
83162
DTNode<String, String> node1, node2;
84163

85-
Global.log().info("Constructing graph");
164+
Global.log().info("Constructing graph (size: "+sesameGraph.size()+")");
86165

166+
int i = 0;
87167
for (Statement statement : sesameGraph)
88168
{
89169

@@ -119,6 +199,9 @@ public static MapDTGraph<String, String> createDirectedGraph(
119199
node2 = graph.add(object);
120200

121201
node1.connect(node2, predicate);
202+
203+
Functions.dot(i, sesameGraph.size());
204+
i++;
122205
}
123206

124207
return graph;
@@ -180,4 +263,52 @@ public static DTGraph<String, String> simplify(DTGraph<String, String> graph)
180263

181264
return out;
182265
}
266+
267+
/**
268+
* Reads a simple graph: no self-loops, no multiple edges. Two resources
269+
* have an edge if they are connected in either direction by one or more predicates
270+
*
271+
* @param file
272+
* @return
273+
*/
274+
public static UGraph<String> readSimple(File file)
275+
throws IOException
276+
{
277+
RDFFormat format = RDFFormat.forFileName(file.getName());
278+
279+
InputStream in = new BufferedInputStream(new FileInputStream(file));
280+
RDFParser parser = Rio.createParser(format);
281+
282+
final UGraph<String> graph = new LightUGraph<String>();
283+
final Map<String, UNode<String>> nodes = new HashMap<String, UNode<String>>();
284+
285+
parser.setRDFHandler(new RDFHandlerBase()
286+
{
287+
@Override
288+
public void handleStatement(Statement statement)
289+
{
290+
String subject = statement.getSubject().toString();
291+
String object = statement.getObject().toString();
292+
293+
if(! nodes.containsKey(subject))
294+
nodes.put(subject, graph.add(subject));
295+
if(! nodes.containsKey(object))
296+
nodes.put(object, graph.add(object));
297+
298+
UNode<String> subNode = nodes.get(subject);
299+
UNode<String> obNode = nodes.get(object);
300+
301+
if( (!subNode.connected(obNode)) && subNode.index() != obNode.index() )
302+
subNode.connect(obNode);
303+
}
304+
});
305+
306+
try {
307+
parser.parse(in, "local://");
308+
} catch (Exception e)
309+
{
310+
throw new RuntimeException("Error parsing file ("+file.getAbsolutePath()+").", e);
311+
}
312+
return graph;
313+
}
183314
}

nodes/src/main/java/org/nodes/data/RDFDataSet.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ public List<Statement> getStatements(Resource subject, URI predicate, Value obje
5050
/**
5151
* Wrapper for the Sesame connection getStatements, to avoid try-catch statements.
5252
*/
53-
public List<Statement> getStatements(Resource subject, URI predicate, Value object, boolean allowInference) {
53+
public List<Statement> getStatements(Resource subject, URI predicate, Value object, boolean allowInference)
54+
{
5455
List<Statement> resGraph = new ArrayList<Statement>();
5556

5657
try {

0 commit comments

Comments
 (0)