commbench.cpp
#include "net.h"
#include <algorithm>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "benchmark.h"
#include "cpu.h"
#include "gpu.h"
#include <stdio.h>
#include <vector>
#include <mpi.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <chrono>
#include <iostream>
#include <sstream>
static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
static ncnn::PoolAllocator g_workspace_pool_allocator;
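// commbench: measures MPI point-to-point (Isend/Irecv) and MPI_Allreduce bandwidth
// for an ncnn::Mat buffer whose dimensions are given on the command line.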
int main(int argc, char** argv)
{
int provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
if (provided < MPI_THREAD_MULTIPLE) {
fprintf(stderr, "xxx MPI does not provide needed thread support!\n");
return -1;
// Error - MPI does not provide needed threading level
}
// Check for at least one argument
if (argc < 2) {
std::cerr << "Usage: " << argv[0] << " [int1] [int2] [int3]\n";
return -1;
}
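// Example launch (illustrative; binary name and dimensions are placeholders):
//   mpirun -np 2 ./commbench 224 224 3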
ncnn::Mat mat, output;
int dim1, dim2, dim3;
const int warmup_trials = 20;
const int measured_trials = 100;
double send_bandwidth = 0;
double recv_bandwidth = 0;
double total_bandwidth = 0;
switch(argc) {
case 2: // 1D
std::stringstream(argv[1]) >> dim1;
mat = ncnn::Mat(dim1);
output = ncnn::Mat(dim1);
break;
case 3: // 2D
std::stringstream(argv[1]) >> dim1;
std::stringstream(argv[2]) >> dim2;
mat = ncnn::Mat(dim1, dim2);
output = ncnn::Mat(dim1, dim2);
break;
case 4: // 3D
std::stringstream(argv[1]) >> dim1;
std::stringstream(argv[2]) >> dim2;
std::stringstream(argv[3]) >> dim3;
mat = ncnn::Mat(dim1, dim2, dim3);
output = ncnn::Mat(dim1, dim2, dim3);
break;
default:
std::cerr << "Invalid number of dimensions. Please provide 1 to 3 dimensions.\n";
return -1;
}
// Debug output
// std::cout << "Mat constructed successfully." << mat.total() << std::endl;
double data_in_MB = (sizeof(float) * mat.total()) / (1024.0 * 1024.0); // Amount of data transferred in MB
// Get the number of processes
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Get the rank of the process
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Request requests[2];
MPI_Status statuses[2];
if (world_size < 2) {
std::cerr << "World size must be greater than 1\n";
MPI_Abort(MPI_COMM_WORLD, 1);
}
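// Point-to-point benchmark: rank 0 sends the buffer to rank 1 via MPI_Isend/MPI_Irecv,
// and each rank times its own MPI_Wait. The first warmup_trials iterations are
// excluded from the averaged bandwidth.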
for (int t = 0; t < warmup_trials + measured_trials; ++t) {
if (world_rank == 0) {
// Process 0 sends the buffer to process 1
auto start = std::chrono::high_resolution_clock::now();
MPI_Isend((float*)mat, mat.total(), MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &requests[0]);
// Wait for the send to complete
MPI_Wait(&requests[0], &statuses[0]);
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = finish - start;
// std::cout << "Send time: " << elapsed.count() << " s\n";
// Skip the warmup trials
if (t >= warmup_trials) {
double bandwidth = data_in_MB / elapsed.count(); // MB/s
send_bandwidth += bandwidth;
}
} else if (world_rank == 1) {
// Process 1 receives the buffer from process 0
auto start = std::chrono::high_resolution_clock::now();
MPI_Irecv((float*)mat, mat.total(), MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &requests[1]);
// Wait for the receive to complete
MPI_Wait(&requests[1], &statuses[1]);
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = finish - start;
// std::cout << "Receive time: " << elapsed.count() << " s\n";
// Skip the warmup trials
if (t >= warmup_trials) {
double bandwidth = data_in_MB / elapsed.count(); // MB/s
recv_bandwidth += bandwidth;
}
}
}
// Calculate average bandwidth
if (world_rank == 0) {
double average_bandwidth = send_bandwidth / measured_trials;
double average_time = data_in_MB / average_bandwidth; // Average time in seconds
std::cout << "Average send bandwidth: " << average_bandwidth << " MB/s"<< "; Average send time: " << average_time *1000.0 << " ms\n";
} else if (world_rank == 1) {
double average_bandwidth = recv_bandwidth / measured_trials;
double average_time = data_in_MB / average_bandwidth; // Average time in seconds
std::cout << "Average recv bandwidth: " << average_bandwidth << " MB/s"<< "; Average recv time: " << average_time*1000.0 << " ms\n";
}
// Allreduce benchmark: every rank sums the buffer across MPI_COMM_WORLD and times the collective.
total_bandwidth = 0;
for (int t = 0; t < warmup_trials + measured_trials; ++t) {
auto start = std::chrono::high_resolution_clock::now();
// Perform the allreduce operation
MPI_Allreduce((float*)mat, (float*)output, mat.total(), MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = finish - start;
// Skip the warmup trials
if (t >= warmup_trials) {
double bandwidth = data_in_MB / elapsed.count(); // MB/s
total_bandwidth += bandwidth;
}
}
// Calculate average bandwidth and average time
double average_bandwidth = total_bandwidth / measured_trials;
double average_time = data_in_MB / average_bandwidth; // Average time in seconds
std::cout << "Average allreduce bandwidth: " << average_bandwidth << " MB/s" << "; Average allreduce time: " << average_time*1000.0 << " ms\n";
g_blob_pool_allocator.set_size_compare_ratio(0.0f);
g_workspace_pool_allocator.set_size_compare_ratio(0.5f);
std::vector<float> cls_scores;
// Finalize the MPI environment.
MPI_Finalize();
return 0;
}