Skip to content

Commit 29bb80a

Browse files
committed
Update
1 parent c0045bc commit 29bb80a

5 files changed

Lines changed: 126 additions & 111 deletions

File tree

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ NVCC=nvcc
44
CXX=g++
55

66
#Uncomment for a GPU enabled library
7-
#CUDA_ENABLED=-DCUDA_ENABLED
7+
CUDA_ENABLED=-DCUDA_ENABLED
88

99
#Uncomment to compile in double precision mode
1010
#DOUBLE_PRECISION=-DDOUBLE_PRECISION

example.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@ g++ -std=c++14 -Iinclude example.cpp -llapacke -lcblas
1919
// precision as the library.
2020
using real = lanczos::real;
2121
//A functor that will return the result of multiplying a certain matrix times a given vector
22-
struct MatrixDot{
22+
struct DiagonalMatrix: public lanczos::MatrixDot{
2323
int size;
24-
MatrixDot(int size): size(size){}
24+
DiagonalMatrix(int size): size(size){}
2525

26-
void operator()(real* v, real* Mv){
26+
virtual void operator()(real* v, real* Mv) override{
2727
//an example diagonal matrix
2828
for(int i=0; i<size; i++){
2929
Mv[i] = (2+i/10.0)*v[i];
@@ -46,7 +46,7 @@ int main(){
4646
//A vector to store the result of sqrt(M)*v
4747
std::vector<real> result(size);
4848
//A functor that multiplies by a diagonal matrix
49-
MatrixDot dot(size);
49+
DiagonalMatrix dot(size);
5050
//Call the solver
5151
int numberIterations = lanczos.solve(dot, result.data(), v.data(), size);
5252
std::cout<<"Solved after "<<numberIterations<< " iterations"<<std::endl;

example.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ References:
1919
using real = lanczos::real;
2020

2121
//A functor that will return the result of multiplying a certain matrix times a given vector
22-
struct MatrixDot{
22+
struct DiagonalMatrix: public lanczos::MatrixDot{
2323
int size;
24-
MatrixDot(int size): size(size){}
24+
DiagonalMatrix(int size): size(size){}
2525

2626
void operator()(real* v, real* Mv){
2727
//An example diagonal matrix
@@ -46,7 +46,7 @@ int main(){
4646
//A vector to store the result of sqrt(M)*v
4747
lanczos::device_container<real> result(size);
4848
//A functor that multiplies by ta diagonal matrix
49-
MatrixDot dot(size);
49+
DiagonalMatrix dot(size);
5050
//Call the solver
5151
real* d_result = lanczos::detail::getRawPointer(result);
5252
real* d_v = lanczos::detail::getRawPointer(v);

include/LanczosAlgorithm.cu

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,102 @@ namespace lanczos{
4545
this->max_iter += inc;
4646
}
4747

48+
int Solver::solve(MatrixDot *dot, real *Bz, real*z, int N){
49+
//Handles the case of the number of elements changing since last call
50+
if(N != this->N){
51+
real * d_V = detail::getRawPointer(V);
52+
if(z == d_V){
53+
throw std::runtime_error("[Lanczos] Size mismatch in input");
54+
}
55+
numElementsChanged(N);
56+
}
57+
/*See algorithm I in [1]*/
58+
/************v[0] = z/||z||_2*****/
59+
/*If z is not the array provided by getV*/
60+
real* d_V = detail::getRawPointer(V);
61+
if(z != d_V){
62+
detail::device_copy(z, z+N, V.begin());
63+
}
64+
/*1/norm(z)*/
65+
real invz2 = 1.0/computeNorm(d_V, N);
66+
/*v[0] = v[0]*1/norm(z)*/
67+
device_scal(N, &invz2, d_V, 1);
68+
/*Lanczos iterations for Krylov decomposition*/
69+
/*Will perform iterations until Error<=tolerance*/
70+
int i = -1;
71+
real normResult_prev = 1.0; //For error estimation, see eq 27 in [1]
72+
while(true){
73+
i++;
74+
/*Allocate more space if needed*/
75+
if(i == max_iter-1){
76+
#ifdef CUDA_ENABLED
77+
CudaSafeCall(cudaDeviceSynchronize());
78+
#endif
79+
this->incrementMaxIterations(2);
80+
}
81+
computeIteration(dot, i, invz2);
82+
/*Check convergence if needed*/
83+
if(i >= check_convergence_steps){ //Miminum of 3 iterations, will be auto tuned
84+
/*Compute Bz using h and z*/
85+
/**** y = ||z||_2 * Vm · H^1/2 · e_1 *****/
86+
this->computeCurrentResultEstimation(i, Bz, 1.0/invz2);
87+
/*The first time the result is computed it is only stored as oldBz*/
88+
if(i>check_convergence_steps){
89+
if(checkConvergence(i, Bz, normResult_prev)){
90+
return i;
91+
}
92+
}
93+
/*Always save the current result as oldBz*/
94+
detail::device_copy(Bz, Bz+N, oldBz.begin());
95+
/*Store the norm of the result*/
96+
real * d_oldBz = detail::getRawPointer(oldBz);
97+
device_nrm2(N, d_oldBz, 1, &normResult_prev);
98+
}
99+
}
100+
}
101+
102+
void Solver::computeIteration(MatrixDot *dot, int i, real invz2){
103+
real* d_V = detail::getRawPointer(V);
104+
real * d_w = detail::getRawPointer(w);
105+
/*w = D·vi*/
106+
dot->operator()(d_V+N*i, d_w);
107+
if(i>0){
108+
/*w = w-h[i-1][i]·vi*/
109+
real alpha = -hsup[i-1];
110+
device_axpy(N,
111+
&alpha,
112+
d_V+N*(i-1), 1,
113+
d_w, 1);
114+
}
115+
/*h[i][i] = dot(w, vi)*/
116+
device_dot(N,
117+
d_w, 1,
118+
d_V+N*i, 1,
119+
&(hdiag[i]));
120+
if(i<max_iter-1){
121+
/*w = w-h[i][i]·vi*/
122+
real alpha = -hdiag[i];
123+
device_axpy(N,
124+
&alpha,
125+
d_V+N*i, 1,
126+
d_w, 1);
127+
/*h[i+1][i] = h[i][i+1] = norm(w)*/
128+
device_nrm2(N, (real*)d_w, 1, &(hsup[i]));
129+
/*v_(i+1) = w·1/ norm(w)*/
130+
real tol = 1e-3*hdiag[i]*invz2;
131+
if(hsup[i]<tol) hsup[i] = real(0.0);
132+
if(hsup[i]>real(0.0)){
133+
real invw2 = 1.0/hsup[i];
134+
device_scal(N, &invw2, d_w, 1);
135+
}
136+
else{/*If norm(w) = 0 that means all elements of w are zero, so set w = e1*/
137+
detail::device_fill(w.begin(), w.end(), real());
138+
w[0] = 1;
139+
}
140+
detail::device_copy(w.begin(), w.begin()+N, V.begin() + N*(i+1));
141+
}
142+
}
143+
48144
//Computes the current result guess sqrt(M)·v, stores in BdW
49145
void Solver::computeCurrentResultEstimation(int iter, real * BdW, real z2){
50146
iter++;

include/LanczosAlgorithm.h

Lines changed: 22 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ Some notes:
2929
#include"utils/debugTools.h"
3030
#endif
3131
namespace lanczos{
32+
33+
struct MatrixDot{
34+
35+
virtual void operator()(real* Mv, real*v) = 0;
36+
37+
};
38+
3239
struct Solver{
3340
Solver(real tolerance = 1e-3);
3441

@@ -40,8 +47,20 @@ namespace lanczos{
4047

4148
//Given a Dotctor that computes a product M·v (where M is handled by Dotctor ), computes Bv = sqrt(M)·v
4249
//Returns the number of iterations performed
43-
template<class Dotctor> //B = sqrt(M)
44-
int solve(Dotctor &dot, real *Bv, real* v, int N);
50+
//B = sqrt(M)
51+
int solve(MatrixDot *dot, real *Bv, real* v, int N);
52+
53+
//Overload for a shared_ptr
54+
int solve(std::shared_ptr<MatrixDot> dot, real *Bv, real* v, int N){
55+
return this->solve(dot.get(), Bv, v, N);
56+
}
57+
58+
//Overload for an instance
59+
template<class SomeDot>
60+
int solve(SomeDot &dot, real *Bv, real* v, int N){
61+
MatrixDot* ptr = static_cast<MatrixDot*>(&dot);
62+
return this->solve(ptr, Bv, v, N);
63+
}
4564

4665
//You can use this array as input to the solve operation, which will save some memory
4766
real * getV(int N){
@@ -70,8 +89,7 @@ namespace lanczos{
7089
bool checkConvergence(int current_iterations, real *Bz, real normNoise_prev);
7190
real computeError(real* Bz, real normNoise_prev);
7291
real computeNorm(real *v, int numberElements);
73-
template<class Dotctor>
74-
void computeIteration(Dotctor &dot, int i, real invz2);
92+
void computeIteration(MatrixDot *dot, int i, real invz2);
7593
void registerRequiredStepsForConverge(int steps_needed);
7694
void resizeIfNeeded(real*z, int N);
7795
int N;
@@ -94,105 +112,6 @@ namespace lanczos{
94112
int check_convergence_steps;
95113
real tolerance;
96114
};
97-
98-
template<class Dotctor>
99-
int Solver::solve(Dotctor &dot, real *Bz, real*z, int N){
100-
//Handles the case of the number of elements changing since last call
101-
if(N != this->N){
102-
real * d_V = detail::getRawPointer(V);
103-
if(z == d_V){
104-
throw std::runtime_error("[Lanczos] Size mismatch in input");
105-
}
106-
numElementsChanged(N);
107-
}
108-
/*See algorithm I in [1]*/
109-
/************v[0] = z/||z||_2*****/
110-
/*If z is not the array provided by getV*/
111-
real* d_V = detail::getRawPointer(V);
112-
if(z != d_V){
113-
detail::device_copy(z, z+N, V.begin());
114-
}
115-
/*1/norm(z)*/
116-
real invz2 = 1.0/computeNorm(d_V, N);
117-
/*v[0] = v[0]*1/norm(z)*/
118-
device_scal(N, &invz2, d_V, 1);
119-
/*Lanczos iterations for Krylov decomposition*/
120-
/*Will perform iterations until Error<=tolerance*/
121-
int i = -1;
122-
real normResult_prev = 1.0; //For error estimation, see eq 27 in [1]
123-
while(true){
124-
i++;
125-
/*Allocate more space if needed*/
126-
if(i == max_iter-1){
127-
#ifdef CUDA_ENABLED
128-
CudaSafeCall(cudaDeviceSynchronize());
129-
#endif
130-
this->incrementMaxIterations(2);
131-
}
132-
computeIteration(dot, i, invz2);
133-
/*Check convergence if needed*/
134-
if(i >= check_convergence_steps){ //Miminum of 3 iterations, will be auto tuned
135-
/*Compute Bz using h and z*/
136-
/**** y = ||z||_2 * Vm · H^1/2 · e_1 *****/
137-
this->computeCurrentResultEstimation(i, Bz, 1.0/invz2);
138-
/*The first time the result is computed it is only stored as oldBz*/
139-
if(i>check_convergence_steps){
140-
if(checkConvergence(i, Bz, normResult_prev)){
141-
return i;
142-
}
143-
}
144-
/*Always save the current result as oldBz*/
145-
detail::device_copy(Bz, Bz+N, oldBz.begin());
146-
/*Store the norm of the result*/
147-
real * d_oldBz = detail::getRawPointer(oldBz);
148-
device_nrm2(N, d_oldBz, 1, &normResult_prev);
149-
}
150-
}
151-
}
152-
153-
154-
template<class Dotctor>
155-
void Solver::computeIteration(Dotctor &dot, int i, real invz2){
156-
real* d_V = detail::getRawPointer(V);
157-
real * d_w = detail::getRawPointer(w);
158-
/*w = D·vi*/
159-
dot(d_V+N*i, d_w);
160-
if(i>0){
161-
/*w = w-h[i-1][i]·vi*/
162-
real alpha = -hsup[i-1];
163-
device_axpy(N,
164-
&alpha,
165-
d_V+N*(i-1), 1,
166-
d_w, 1);
167-
}
168-
/*h[i][i] = dot(w, vi)*/
169-
device_dot(N,
170-
d_w, 1,
171-
d_V+N*i, 1,
172-
&(hdiag[i]));
173-
if(i<max_iter-1){
174-
/*w = w-h[i][i]·vi*/
175-
real alpha = -hdiag[i];
176-
device_axpy(N,
177-
&alpha,
178-
d_V+N*i, 1,
179-
d_w, 1);
180-
/*h[i+1][i] = h[i][i+1] = norm(w)*/
181-
device_nrm2(N, (real*)d_w, 1, &(hsup[i]));
182-
/*v_(i+1) = w·1/ norm(w)*/
183-
real tol = 1e-3*hdiag[i]*invz2;
184-
if(hsup[i]<tol) hsup[i] = real(0.0);
185-
if(hsup[i]>real(0.0)){
186-
real invw2 = 1.0/hsup[i];
187-
device_scal(N, &invw2, d_w, 1);
188-
}
189-
else{/*If norm(w) = 0 that means all elements of w are zero, so set w = e1*/
190-
detail::device_fill(w.begin(), w.end(), real());
191-
w[0] = 1;
192-
}
193-
detail::device_copy(w.begin(), w.begin()+N, V.begin() + N*(i+1));
194-
}
195-
}
196115
}
197116

198117
#ifndef SHARED_LIBRARY_COMPILATION

0 commit comments

Comments
 (0)