-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathconv_meta_block.cpp
More file actions
104 lines (91 loc) · 3.26 KB
/
conv_meta_block.cpp
File metadata and controls
104 lines (91 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
// Script to convert Meta's block storage traces to libCacheSim csv format
//
// Usage:
// g++ conv_meta_block.cpp -o conv_meta_block
// ./conv_meta_block block_traces_1_conved.csv block_traces_1.csv
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <random>
#include <sstream>
#include <string>
#include <unordered_set>
#include <vector>
constexpr size_t CHUNK_SIZE = 128 * 1024; // cache-object granularity: 128 KiB per chunk
constexpr double SAMPLE_RATIO = 0.1; // sample SAMPLE_RATIO fraction of blocks
constexpr size_t SAMPLE_SEED = 42; // fixed PRNG seed so the block sample is reproducible across runs
// Converts Meta block-storage trace rows to libCacheSim csv rows.
//
// argv[1] is the output csv path; argv[2..] are input trace csv paths.
// Each storage block (block_id + rs_shard_id) is sampled exactly once
// (kept with probability SAMPLE_RATIO; the verdict is cached), and every
// kept I/O is expanded into one output row per CHUNK_SIZE chunk touched
// by [io_offset, io_offset + io_size).
//
// Returns EXIT_FAILURE on missing arguments or any file that cannot be
// opened; EXIT_SUCCESS otherwise. Malformed trace rows are skipped with a
// warning rather than aborting the whole conversion.
int main(int argc, char* argv[]) {
  if (argc < 3) {
    std::cerr << "Usage: " << argv[0]
              << " <output_csv> <input_csv1> [<input_csv2> ...]\n";
    return EXIT_FAILURE;
  }
  // Open the output first so we fail fast before reading any trace data.
  std::ofstream output_csv{argv[1]};
  if (!output_csv) {
    std::cerr << "Error: Could not open file " << argv[1] << std::endl;
    return EXIT_FAILURE;
  }
  std::vector<std::ifstream> input_csvs{};
  for (int i = 2; i < argc; ++i) {  // int index: matches argc's type
    input_csvs.emplace_back(argv[i]);
    if (!input_csvs.back()) {
      // Bug fix: the original only warned and then kept reading the bad
      // stream, silently emitting nothing for that file. Fail instead.
      std::cerr << "Error: Could not open file " << argv[i] << std::endl;
      return EXIT_FAILURE;
    }
  }
  // Per-block sampling verdicts are cached so repeated accesses to the
  // same block are consistently kept or dropped.
  std::unordered_set<std::string> keep{};
  std::unordered_set<std::string> dont_keep{};
  std::mt19937 gen{SAMPLE_SEED};
  std::uniform_real_distribution<> dis(0.0, 1.0);
  // Header of the output csv, following the libCacheSim csv format.
  output_csv << "timestamp,obj_id,obj_size\n";
  // Columns of Meta's block-storage trace format we consume:
  //   0: op_time, 1: block_id, 3: io_size, 4: io_offset, 10: rs_shard_id
  // Buffers hoisted out of the loop and reused (clear() keeps capacity).
  std::string line;
  std::string cell;
  std::vector<std::string> cells{};
  for (auto& input_csv : input_csvs) {
    getline(input_csv, line);  // skip the header row
    while (getline(input_csv, line)) {
      cells.clear();
      std::stringstream row(line);
      while (getline(row, cell, ',')) {
        cells.push_back(cell);
      }
      // Bug fix: guard row width before indexing; the original read
      // cells[10] unconditionally, which is UB on short rows.
      if (cells.size() < 11) {
        std::cerr << "Warning: skipping malformed row: " << line << '\n';
        continue;
      }
      size_t op_time;
      size_t io_size;
      size_t io_offset;
      try {
        op_time = std::stoull(cells[0]);
        io_size = std::stoull(cells[3]);
        io_offset = std::stoull(cells[4]);
      } catch (const std::exception&) {
        // Bug fix: std::stoull throws on non-numeric/out-of-range fields;
        // the original let that exception terminate the program.
        std::cerr << "Warning: skipping non-numeric row: " << line << '\n';
        continue;
      }
      const std::string& block_id = cells[1];
      const std::string& rs_shard_id = cells[10];
      // Reference for the conversion:
      // https://github.com/facebook/CacheLib/commit/23a888e54e4fed22f81c114e3ca9af95d7f0787c
      // Each storage node block is uniquely identified via block_id +
      // rs_shard_id. It is in turn broken into chunks to be cached; we
      // represent a chunk of a storage node block as the cache object.
      std::string real_block_id = block_id + "_" + rs_shard_id;
      if (keep.count(real_block_id) == 0 &&
          dont_keep.count(real_block_id) == 0) {
        // First sighting of this block: draw once and cache the verdict.
        if (dis(gen) > SAMPLE_RATIO) {
          dont_keep.insert(real_block_id);
        } else {
          keep.insert(real_block_id);
        }
      }
      if (dont_keep.count(real_block_id) > 0) {
        continue;
      }
      // Bug fix: a zero-length I/O made `io_offset + io_size - 1`
      // underflow when io_offset == 0 (end_chunk ~= 2^51), flooding the
      // output. Zero-length I/Os touch no chunk at all.
      if (io_size == 0) {
        continue;
      }
      size_t start_chunk = io_offset / CHUNK_SIZE;
      size_t end_chunk = (io_offset + io_size - 1) / CHUNK_SIZE;
      for (size_t chunk_id = start_chunk; chunk_id <= end_chunk; ++chunk_id) {
        std::string obj_id = real_block_id + "_" + std::to_string(chunk_id);
        output_csv << op_time << ',' << obj_id << ',' << CHUNK_SIZE << '\n';
      }
    }
  }
  return EXIT_SUCCESS;
}