Skip to content

Commit 71e8d72

Browse files
committed
Adding pre-process codes and Generator
1 parent 86ef055 commit 71e8d72

5 files changed

Lines changed: 489 additions & 3 deletions

File tree

DGraph/distributed/RankLocalOps.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import torch
1919

2020
try:
21-
from torch_local import local_masked_gather, local_masked_scatter
21+
from DGraph.torch_local import local_masked_gather, local_masked_scatter
2222

2323
_LOCAL_OPT_KERNELS_AVAILABLE = True
2424
except ImportError:
@@ -69,8 +69,8 @@ def OptimizedRankLocalMaskedGather(
6969
num_features = src.shape[-1]
7070
local_masked_gather(
7171
src,
72-
indices,
73-
rank_mapping,
72+
indices.cuda(),
73+
rank_mapping.cuda(),
7474
output,
7575
bs,
7676
num_src_rows,
Lines changed: 290 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
#include <algorithm>
#include <chrono>
#include <fstream>
#include <iostream>
#include <random>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
12+
// using adj_list = std::unordered_map<unsigned long, std::set<unsigned long>>;
13+
// Function to generate a random connected graph with a maximum degree constraint
14+
// and return it as an adjacency list stored in a vector of vectors (single-threaded).
15+
16+
// Report whether `vertex` occurs anywhere in the neighbor row `adj_list`.
// The row is scanned linearly from front to back.
bool find_in_adj_list(const std::vector<unsigned long> &adj_list, unsigned long vertex)
{
    for (const unsigned long neighbor : adj_list)
    {
        if (neighbor == vertex)
        {
            return true;
        }
    }
    return false;
}
20+
21+
// Adjacency storage: row u holds the neighbors of vertex u.
using adj_list = std::vector<std::vector<unsigned long>>;

// Generate a random undirected simple graph in which no vertex exceeds
// `max_degree` neighbors.
//
// Layout of the result: every row has exactly `max_degree` slots. The first
// degree(u) slots of row u hold its neighbors (0-based ids); the remaining
// slots hold the sentinel value `num_vertices`, which is not a valid id.
//
// Construction:
//   1. A random spanning tree is grown from vertex 0 by attaching each
//      remaining vertex (in shuffled order) to a uniformly chosen vertex that
//      is already in the tree and still has a free slot.
//   2. A random number of additional edge attempts (between half and twice
//      the tree's edge count) is made between uniformly chosen vertex pairs;
//      self-loops, duplicate edges and edges that would exceed `max_degree`
//      are rejected.
//
// Returns the adjacency structure together with the number of edges created.
// If `num_vertices` is 0 or `max_degree` <= 0, a message is printed to stderr
// and an empty graph with 0 edges is returned. If the degree cap makes it
// impossible to attach every vertex (e.g. max_degree == 1 with more than two
// vertices), a warning is printed and the graph is returned disconnected.
std::tuple<adj_list, unsigned long> generate_graph(
    const unsigned long &num_vertices,
    const int &max_degree)
{
    adj_list adjacency_map;

    if (num_vertices == 0)
    {
        std::cerr << "Warning: Number of vertices is 0. Returning empty map." << std::endl;
        return std::tuple<adj_list, unsigned long>(std::move(adjacency_map), 0);
    }

    if (max_degree <= 0)
    {
        std::cerr << "Error: Maximum degree must be greater than 0." << std::endl;
        return std::tuple<adj_list, unsigned long>(std::move(adjacency_map), 0);
    }

    const unsigned long row_width = static_cast<unsigned long>(max_degree);

    // Current degree of each vertex; also the index of its next free slot.
    std::vector<int> degree(num_vertices, 0);

    // Pre-fill every slot with the sentinel `num_vertices` ("no neighbor").
    adjacency_map.assign(num_vertices, std::vector<unsigned long>(row_width, num_vertices));

    std::mt19937_64 rng(std::random_device{}());
    std::uniform_int_distribution<unsigned long> dist_vertices(0, num_vertices - 1);

    unsigned long num_edges = 0;
    unsigned long num_connected = 1; // vertex 0 seeds the tree

    // 1. Build a random spanning tree to ensure connectivity.
    //    The helper vectors live in their own scope so their memory is
    //    released before the second phase starts.
    {
        // Tree vertices that can still accept an edge. Sampling from this
        // list is equivalent to rejection-sampling all tree vertices until an
        // unsaturated one is hit, but it cannot spin forever when none is left.
        std::vector<unsigned long> open;
        open.reserve(num_vertices);
        open.push_back(0);

        std::vector<unsigned long> pending;
        pending.reserve(num_vertices - 1);
        for (unsigned long i = 1; i < num_vertices; ++i)
        {
            pending.push_back(i);
        }
        std::shuffle(pending.begin(), pending.end(), rng);

        std::cout << "Building spanning tree..." << std::endl;

        for (const unsigned long u : pending)
        {
            if (open.empty())
            {
                // Every tree vertex is saturated: no further vertex can join.
                break;
            }

            std::uniform_int_distribution<unsigned long> pick(0, open.size() - 1);
            const unsigned long slot = pick(rng);
            const unsigned long v = open[slot];

            adjacency_map[u][degree[u]] = v;
            adjacency_map[v][degree[v]] = u;
            ++degree[u];
            ++degree[v];
            ++num_edges;
            ++num_connected;

            if (degree[v] >= max_degree)
            {
                // Swap-remove v: order inside `open` is irrelevant.
                open[slot] = open.back();
                open.pop_back();
            }
            if (degree[u] < max_degree)
            {
                open.push_back(u);
            }
        }
    }

    std::cout << "Number of edges in the spanning tree: " << num_edges << std::endl;
    if (num_connected != num_vertices)
    {
        std::cerr << "Warning: Could not connect all vertices while building spanning tree (possibly due to low max_degree). Graph might be disconnected." << std::endl;
    }

    // 2. Add extra random edges without violating the degree cap.
    //    Heuristic: the number of attempts is drawn between half and twice
    //    the number of tree edges.
    const unsigned long min_new_edges = num_edges / 2;
    const unsigned long max_new_edges = num_edges * 2;
    std::uniform_int_distribution<unsigned long> dist_edges(min_new_edges, max_new_edges);
    const unsigned long num_edges_to_add = dist_edges(rng);

    // Exact upper bound on the edge count of a simple graph whose vertices
    // have at most min(max_degree, num_vertices - 1) neighbors. (The original
    // source declared this bound twice with conflicting formulas.)
    const unsigned long degree_cap = std::min(row_width, num_vertices - 1);
    const unsigned long max_possible_edges = num_vertices * degree_cap / 2;

    unsigned long current_edges_count = num_edges;

    std::cout << "Total edge attempts: " << num_edges_to_add << std::endl;
    for (unsigned long attempts = 0;
         attempts < num_edges_to_add && current_edges_count < max_possible_edges;
         ++attempts) // an attempt counts even if it is rejected
    {
        const unsigned long u = dist_vertices(rng);
        const unsigned long v = dist_vertices(rng);

        if (u == v)
        {
            continue; // No self-loops
        }
        if (degree[u] >= max_degree || degree[v] >= max_degree)
        {
            continue; // Would exceed the degree cap
        }

        // Edges are stored symmetrically, so checking u's filled slots suffices.
        const auto &row = adjacency_map[u];
        const auto row_end = row.begin() + degree[u];
        if (std::find(row.begin(), row_end, v) != row_end)
        {
            continue; // Edge already exists
        }

        adjacency_map[u][degree[u]] = v;
        adjacency_map[v][degree[v]] = u; // Add the edge in both directions
        ++degree[u];
        ++degree[v];
        ++current_edges_count;
    }

    // Move instead of copy: the adjacency structure can be very large.
    return std::tuple<adj_list, unsigned long>(std::move(adjacency_map), current_edges_count);
}
174+
175+
// Write `graph` to `filename` as text.
//
// The first line is "<num_vertices> <num_edges>". It is followed by one line
// per vertex (rows 0 .. num_vertices-1 of `graph`) listing that vertex's
// neighbors separated by single spaces, with ids shifted to be 1-based.
// Entries >= num_vertices are treated as "empty slot" markers and skipped, so
// a vertex without neighbors produces an empty line.
//
// `graph` has the same type as the file's `adj_list` alias. If the file cannot
// be opened or a write fails, an error is printed to stderr and the function
// returns. `graph.at(i)` throws std::out_of_range if `graph` has fewer than
// `num_vertices` rows.
void write_graph_file(
    const std::vector<std::vector<unsigned long>> &graph,
    const std::string &filename,
    const unsigned long &num_vertices,
    const unsigned long &num_edges)
{
    std::cout << "Write_graph_file called for file " << filename << std::endl;
    std::ofstream outfile(filename);

    if (!outfile.is_open())
    {
        std::cerr << "Error opening file " << filename << " for writing." << std::endl;
        return;
    }

    outfile << num_vertices << " " << num_edges << '\n';
    for (unsigned long i = 0; i < num_vertices; ++i)
    {
        const auto &neighbors = graph.at(i);
        bool first_on_line = true;
        for (const unsigned long neighbor : neighbors)
        {
            if (neighbor >= num_vertices)
            {
                continue; // unused slot, not a real neighbor
            }
            if (!first_on_line)
            {
                outfile << ' ';
            }
            outfile << neighbor + 1; // output ids are 1-based
            first_on_line = false;
        }
        outfile << '\n';
    }

    outfile.close();
    if (outfile.fail())
    {
        // e.g. disk full: do not let a truncated file go unnoticed
        std::cerr << "Error writing file " << filename << "." << std::endl;
    }
}
208+
209+
int main(int argc, char *argv[])
210+
{
211+
// Check if the correct number of command-line arguments is provided
212+
if (argc != 4)
213+
{
214+
std::cerr << "Usage: " << argv[0] << " <num_vertices> <max_degree> <output_filename>" << std::endl;
215+
return 1; // Indicate an error
216+
}
217+
218+
unsigned long num_vertices;
219+
unsigned int max_degree_uint;
220+
std::string output_filename;
221+
222+
// Parse the number of vertices
223+
try
224+
{
225+
// Use std::stoul for unsigned long
226+
num_vertices = std::stoul(argv[1]);
227+
// Optional: Add a check for extremely large values if needed
228+
if (num_vertices == 0)
229+
{
230+
std::cerr << "Error: Number of vertices must be greater than 0." << std::endl;
231+
return 1;
232+
}
233+
}
234+
catch (const std::invalid_argument &ia)
235+
{
236+
std::cerr << "Error: Invalid number of vertices provided: " << ia.what() << std::endl;
237+
return 1;
238+
}
239+
catch (const std::out_of_range &oor)
240+
{
241+
std::cerr << "Error: Number of vertices out of range: " << oor.what() << std::endl;
242+
return 1;
243+
}
244+
245+
// Parse the maximum degree
246+
try
247+
{
248+
// Use std::stoi for int, then cast to unsigned int
249+
int max_degree_int = std::stoi(argv[2]);
250+
if (max_degree_int < 1)
251+
{
252+
std::cerr << "Error: Maximum degree must be at least 1." << std::endl;
253+
return 1;
254+
}
255+
// Check if the integer value fits into an unsigned int
256+
if (max_degree_int > 10)
257+
{
258+
std::cerr << "Error: Maximum degree value too large for unsigned int." << std::endl;
259+
return 1;
260+
}
261+
max_degree_uint = static_cast<unsigned int>(max_degree_int);
262+
}
263+
catch (const std::invalid_argument &ia)
264+
{
265+
std::cerr << "Error: Invalid maximum degree provided: " << ia.what() << std::endl;
266+
return 1;
267+
}
268+
catch (const std::out_of_range &oor)
269+
{
270+
std::cerr << "Error: Maximum degree out of range: " << oor.what() << std::endl;
271+
return 1;
272+
}
273+
274+
// The third argument is the filename
275+
output_filename = argv[3];
276+
277+
// Cast max_degree_uint to int for the generate_graph_map function signature
278+
int max_degree_int_for_func = static_cast<int>(max_degree_uint);
279+
280+
const auto graph_adj_list = generate_graph(num_vertices, max_degree_int_for_func);
281+
282+
// Print the generated adjacency list
283+
std::cout << "Generated Graph Adjacency List:" << std::endl;
284+
// Iterate through vertices 0 to num_vertices-1 for consistent output order
285+
286+
const auto &adj_list = std::get<0>(graph_adj_list);
287+
const auto &num_edges = std::get<1>(graph_adj_list);
288+
write_graph_file(adj_list, output_filename, num_vertices, num_edges);
289+
return 0;
290+
}

experiments/Synthetic-Billion/Generator/Makefile

Whitespace-only changes.
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
## Billion Vertex Graphs
2+
3+
Because DGraph allows us to learn on extremely large graphs, we push graph sizes further using full-graph GNN training. We generate a synthetic graph with 1 billion vertices.
4+
5+
## Data Generation
6+
7+
### Building the Graph Generator
8+
We provide a fast graph generator for large graphs. The generator produces a graph with a given number of vertices and a maximum degree. It only requires `GCC > 10.3`. Build the generator in the `Generator` directory:
9+
```bash
10+
cd Generator
11+
make
12+
```
13+
14+
### Generating the Graph
15+
The generator takes the number of vertices and the maximum degree as input, and outputs a text file in the METIS graph format. Run the following command to generate a graph with 1 billion vertices with a maximum degree of 5:
16+
17+
```bash
18+
./Generator/graph_generator 1000000000 5 1B5D.graph
19+
```
20+
21+
This will generate an undirected graph with 1 billion vertices and a maximum degree of 5. The graph will be saved in the file `1B5D.graph`. The generator will take a few minutes to run and require `~150GB` of memory.
22+
23+
The graph will be generated in the METIS format, which is a simple text format that describes the graph. The first line of the file contains the number of vertices and the number of edges. Each following line lists the neighbors of one vertex, in vertex order (the line after the header belongs to the first vertex), using 1-based vertex ids.
24+
25+
### Partition the graph
26+
27+
We assume there is a working `METIS` installation built with the flags `i64=1` and `r64=1`. `ParMETIS` may be useful as well.
28+
29+
To partition the graph into `<num_partitions>` partitions, run the following command:
30+
```bash
31+
gpmetis 1B5D.graph <num_partitions>
32+
```
33+
This will generate a file `1B5D.graph.part.<num_partitions>` which contains the partitioning of the graph. The i-th line of the file contains the partition id of the i-th vertex. The partition ids are 0-indexed. This also requires `~150GB` of memory (with the flag `-ondisk`).
34+
35+
### Preprocess for DGraph
36+
37+
To finish the graph generation and make the data ready for DGraph, we take the graph file and partition file and run the following command:
38+
```bash
39+
python preprocess.py --g <graph_file> --p <partition_file> --np <num_partitions>
40+
```
41+
42+
The script will generate the necessary files for DGraph to run a distributed training partitioned in `<num_partitions>` partitions.
43+

0 commit comments

Comments
 (0)