From 9bdfdee0538ba40127c0aae1071d8f2c1071553f Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Tue, 7 Jan 2025 15:57:57 -0800 Subject: [PATCH 01/11] Drop in Lijun's pycuampcor-v2-devel changes verbatim --- cxx/isce3/cuda/Sources.cmake | 16 +- .../{GDALImage.cu => GDALImage.cpp} | 1 + .../cuda/matchtemplate/pycuampcor/GDALImage.h | 1 + .../matchtemplate/pycuampcor/PyCuAmpcor.cpp | 106 ++++ .../pycuampcor/cuAmpcorChunk.cpp | 578 +++++++++++++++++ .../matchtemplate/pycuampcor/cuAmpcorChunk.cu | 583 ------------------ .../matchtemplate/pycuampcor/cuAmpcorChunk.h | 49 +- ...orController.cu => cuAmpcorController.cpp} | 69 ++- .../pycuampcor/cuAmpcorController.h | 7 +- ...pcorParameter.cu => cuAmpcorParameter.cpp} | 218 ++++--- .../pycuampcor/cuAmpcorParameter.h | 67 +- .../matchtemplate/pycuampcor/cuAmpcorUtil.h | 75 ++- .../pycuampcor/{cuArrays.cu => cuArrays.cpp} | 93 ++- .../cuda/matchtemplate/pycuampcor/cuArrays.h | 7 +- .../matchtemplate/pycuampcor/cuArraysCopy.cu | 230 ++++--- .../pycuampcor/cuArraysPadding.cu | 179 +++--- .../pycuampcor/cuCorrFrequency.cu | 45 +- .../pycuampcor/cuCorrFrequency.h | 13 +- .../pycuampcor/cuCorrNormalization.cu | 135 ++-- .../pycuampcor/cuCorrNormalizationSAT.cu | 131 +++- ...CorrNormalizer.cu => cuCorrNormalizer.cpp} | 45 +- .../pycuampcor/cuCorrNormalizer.h | 34 +- .../pycuampcor/cuCorrTimeDomain.cu | 38 +- .../cuda/matchtemplate/pycuampcor/cuDeramp.cu | 50 +- .../pycuampcor/cuEstimateStats.cu | 60 +- .../cuda/matchtemplate/pycuampcor/cuOffset.cu | 152 +++-- .../{cuOverSampler.cu => cuOverSampler.cpp} | 67 +- .../matchtemplate/pycuampcor/cuOverSampler.h | 15 +- .../pycuampcor/cuSincOverSampler.cu | 6 +- .../pycuampcor/cuSincOverSampler.h | 2 +- .../matchtemplate/pycuampcor/cudaError.cpp | 44 ++ .../cuda/matchtemplate/pycuampcor/cudaError.h | 42 +- .../matchtemplate/pycuampcor/cudaUtil.cpp | 58 ++ .../cuda/matchtemplate/pycuampcor/cudaUtil.h | 65 +- .../matchtemplate/pycuampcor/data_types.h | 46 ++ .../cuda/matchtemplate/pycuampcor/float2.h | 126 +++- .../cuda/matchtemplate/pycuampcor.cpp | 3 + 37 files changed, 2009 insertions(+), 1447 deletions(-) rename cxx/isce3/cuda/matchtemplate/pycuampcor/{GDALImage.cu => GDALImage.cpp} (99%) create mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp create mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cpp delete mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu rename cxx/isce3/cuda/matchtemplate/pycuampcor/{cuAmpcorController.cu => cuAmpcorController.cpp} (71%) rename cxx/isce3/cuda/matchtemplate/pycuampcor/{cuAmpcorParameter.cu => cuAmpcorParameter.cpp} (58%) rename cxx/isce3/cuda/matchtemplate/pycuampcor/{cuArrays.cu => cuArrays.cpp} (60%) rename cxx/isce3/cuda/matchtemplate/pycuampcor/{cuCorrNormalizer.cu => cuCorrNormalizer.cpp} (51%) rename cxx/isce3/cuda/matchtemplate/pycuampcor/{cuOverSampler.cu => cuOverSampler.cpp} (68%) create mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp create mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.cpp create mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/data_types.h diff --git a/cxx/isce3/cuda/Sources.cmake b/cxx/isce3/cuda/Sources.cmake index 64f129308..0147fbe14 100644 --- a/cxx/isce3/cuda/Sources.cmake +++ b/cxx/isce3/cuda/Sources.cmake @@ -28,23 +28,25 @@ geometry/Topo.cpp geometry/utilities.cu image/gpuResampSlc.cu image/ResampSlc.cpp -matchtemplate/pycuampcor/cuAmpcorChunk.cu -matchtemplate/pycuampcor/cuAmpcorController.cu -matchtemplate/pycuampcor/cuAmpcorParameter.cu -matchtemplate/pycuampcor/cuArrays.cu +matchtemplate/pycuampcor/cuAmpcorChunk.cpp +matchtemplate/pycuampcor/cuAmpcorController.cpp +matchtemplate/pycuampcor/cuAmpcorParameter.cpp +matchtemplate/pycuampcor/cuArrays.cpp matchtemplate/pycuampcor/cuArraysCopy.cu matchtemplate/pycuampcor/cuArraysPadding.cu matchtemplate/pycuampcor/cuCorrFrequency.cu matchtemplate/pycuampcor/cuCorrNormalization.cu matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu -matchtemplate/pycuampcor/cuCorrNormalizer.cu +matchtemplate/pycuampcor/cuCorrNormalizer.cpp matchtemplate/pycuampcor/cuCorrTimeDomain.cu matchtemplate/pycuampcor/cuDeramp.cu matchtemplate/pycuampcor/cuEstimateStats.cu matchtemplate/pycuampcor/cuOffset.cu -matchtemplate/pycuampcor/cuOverSampler.cu +matchtemplate/pycuampcor/cuOverSampler.cpp matchtemplate/pycuampcor/cuSincOverSampler.cu -matchtemplate/pycuampcor/GDALImage.cu +matchtemplate/pycuampcor/cudaError.cpp +matchtemplate/pycuampcor/cudaUtil.cpp +matchtemplate/pycuampcor/GDALImage.cpp product/SubSwaths.cu signal/gpuAzimuthFilter.cu signal/gpuCrossMul.cu diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cpp similarity index 99% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cpp index ef54f0386..42b864799 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cpp @@ -8,6 +8,7 @@ #include "GDALImage.h" // dependencies +#include #include #include "cudaError.h" diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h index 91c3abead..6ce0a2d6e 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h @@ -12,6 +12,7 @@ #define __GDALIMAGE_H // dependencies +#include #include #include #include diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp new file mode 100644 index 000000000..2d2b61809 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp @@ -0,0 +1,106 @@ +#include +#include + +#include "cuAmpcorController.h" +#include "cuAmpcorParameter.h" + +PYBIND11_MODULE(PyCuAmpcor, m) +{ + m.doc() = "Python module controller for underlying CUDA-Ampcor code"; + + using str = std::string; + using cls = cuAmpcorController; + + pybind11::class_(m, "PyCuAmpcor") + .def(pybind11::init<>()) + + // define a trivial binding for a controller method +#define DEF_METHOD(name) def(#name, &cls::name) + + // define a trivial getter/setter for a controller parameter +#define DEF_PARAM_RENAME(T, pyname, cppname) \ + def_property(#pyname, [](const cls& self) -> T { \ + return self.param->cppname; \ + }, [](cls& self, const T i) { \ + self.param->cppname = i; \ + }) + + // same as above, for even more trivial cases where pyname == cppname +#define DEF_PARAM(T, name) DEF_PARAM_RENAME(T, name, name) + + .DEF_PARAM(int, algorithm) + .DEF_PARAM(int, deviceID) + .DEF_PARAM(int, nStreams) + .DEF_PARAM(int, derampMethod) + + .DEF_PARAM(str, referenceImageName) + .DEF_PARAM(int, referenceImageHeight) + .DEF_PARAM(int, referenceImageWidth) + .DEF_PARAM(str, secondaryImageName) + .DEF_PARAM(int, secondaryImageHeight) + .DEF_PARAM(int, secondaryImageWidth) + + .DEF_PARAM(int, numberWindowDown) + .DEF_PARAM(int, numberWindowAcross) + + .DEF_PARAM_RENAME(int, windowSizeHeight, windowSizeHeightRaw) + .DEF_PARAM_RENAME(int, windowSizeWidth, windowSizeWidthRaw) + + .DEF_PARAM(str, offsetImageName) + .DEF_PARAM(str, grossOffsetImageName) + .DEF_PARAM(int, mergeGrossOffset) + .DEF_PARAM(str, snrImageName) + .DEF_PARAM(str, covImageName) + .DEF_PARAM(str, peakValueImageName) + + .DEF_PARAM(int, rawDataOversamplingFactor) + .DEF_PARAM(int, corrStatWindowSize) + + .DEF_PARAM(int, numberWindowDownInChunk) + .DEF_PARAM(int, numberWindowAcrossInChunk) + + .DEF_PARAM(int, useMmap) + + .DEF_PARAM_RENAME(int, halfSearchRangeAcross, halfSearchRangeAcrossRaw) + .DEF_PARAM_RENAME(int, halfSearchRangeDown, halfSearchRangeDownRaw) + + .DEF_PARAM_RENAME(int, referenceStartPixelAcrossStatic, referenceStartPixelAcross0) + .DEF_PARAM_RENAME(int, referenceStartPixelDownStatic, referenceStartPixelDown0) + + .DEF_PARAM_RENAME(int, corrSurfaceOverSamplingMethod, oversamplingMethod) + .DEF_PARAM_RENAME(int, corrSurfaceOverSamplingFactor, oversamplingFactor) + + .DEF_PARAM_RENAME(int, mmapSize, mmapSizeInGB) + + .DEF_PARAM_RENAME(int, skipSampleDown, skipSampleDownRaw) + .DEF_PARAM_RENAME(int, skipSampleAcross, skipSampleAcrossRaw) + .DEF_PARAM_RENAME(int, corrSurfaceZoomInWindow, zoomWindowSize) + + .DEF_METHOD(runAmpcor) + + .DEF_METHOD(isDoublePrecision) + + .def("checkPixelInImageRange", [](const cls& self) { + self.param->checkPixelInImageRange(); + }) + + .def("setupParams", [](cls& self) { + self.param->setupParameters(); + }) + + .def("setConstantGrossOffset", [](cls& self, const int goDown, + const int goAcross) { + self.param->setStartPixels( + self.param->referenceStartPixelDown0, + self.param->referenceStartPixelAcross0, + goDown, goAcross); + }) + .def("setVaryingGrossOffset", [](cls& self, std::vector vD, + std::vector vA) { + self.param->setStartPixels( + self.param->referenceStartPixelDown0, + self.param->referenceStartPixelAcross0, + vD.data(), vA.data()); + }) + ; +} diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cpp new file mode 100644 index 000000000..2a59d81bc --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cpp @@ -0,0 +1,578 @@ +#include "cuAmpcorChunk.h" + +#include "cuAmpcorUtil.h" +#include +#include + +/** + * Run ampcor process for a batch of images (a chunk) + * @param[in] idxDown_ index of the chunk along Down/Azimuth direction + * @param[in] idxAcross_ index of the chunk along Across/Range direction + */ +void cuAmpcorChunk::run(int idxDown_, int idxAcross_) +{ + // set chunk index + setIndex(idxDown_, idxAcross_); + + // load reference image chunk + loadReferenceChunk(); + // oversample reference + // (deramping included in oversampler) + referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod); + // c_referenceBatchRaw and c_referenceBatchOverSampled now have enlarged size + + int2 offset = make_int2((c_referenceBatchOverSampled->height - r_referenceBatchOverSampled->height)/2, + (c_referenceBatchOverSampled->width - r_referenceBatchOverSampled->width)/2); + // extract and take amplitudes + cuArraysCopyExtractAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled, offset, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the raw reference image(s) + c_referenceBatchRaw->outputToFile("c_referenceBatchRaw", stream); + // dump the oversampled reference image(s) + c_referenceBatchOverSampled->outputToFile("c_referenceBatchOverSampled", stream); + r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampled", stream); +#endif + + // compute and subtract the mean value + cuArraysSubtractMean(r_referenceBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled reference image(s) with mean subtracted + r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean",stream); +#endif + + // load secondary image chunk to c_secondaryBatchRaw + loadSecondaryChunk(); + // oversampling the secondary image(s) + secondaryBatchOverSampler->execute(c_secondaryBatchRaw, c_secondaryBatchOverSampled, param->derampMethod); + // take amplitudes + cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the raw secondary image + c_secondaryBatchRaw->outputToFile("c_secondaryBatchRaw", stream); + // dump the oversampled secondary image(s) + c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled", stream); + r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled", stream); +#endif + + // correlate oversampled images + if(param->algorithm == 0) { + cuCorrFreqDomain_OverSampled->execute(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatch); + } + else { + cuCorrTimeDomain(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatch, stream); + } + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface (un-normalized) + r_corrBatch->outputToFile("r_corrBatch", stream); +#endif + + // normalize the correlation surface + corrNormalizerOverSampled->execute(r_corrBatch, r_referenceBatchOverSampled, r_secondaryBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface (normalized) + r_corrBatch->outputToFile("r_corrBatchNormed", stream); +#endif + + // find the maximum location of the correlation surface, in a rectangle area {range} from {start} + int extraPadSize = param->halfZoomWindowSizeRaw*param->rawDataOversamplingFactor; + int2 start = make_int2(extraPadSize, extraPadSize); + int2 range = make_int2(r_corrBatch->height-extraPadSize, r_corrBatch->width-extraPadSize); + cuArraysMaxloc2D(r_corrBatch, start, range, offsetInit, r_maxval, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the max location and value + offsetInit->outputToFile("i_offsetInit", stream); + r_maxval->outputToFile("r_maxvalInit", stream); +#endif + + // extract a smaller chip around the peak {offsetInit} + // with extra pads, i_corrBatchValidCount is no longer needed + cuArraysCopyExtractCorr(r_corrBatch, r_corrBatchZoomIn, offsetInit, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the extracted correlation Surface + r_corrBatchZoomIn->outputToFile("r_corrBatchZoomIn", stream); +#endif + + // statistics of correlation surface + // estimate variance on r_corrBatch + cuEstimateVariance(r_corrBatch, offsetInit, r_maxval, r_referenceBatchOverSampled->size, r_covValue, stream); + + // snr on the extracted surface r_corrBatchZoomIn + cuArraysSumSquare(r_corrBatchZoomIn, r_corrBatchSum, stream); + int corrSurfaceSize = r_corrBatch->height*r_corrBatch->width; + cuEstimateSnr(r_corrBatchSum, r_maxval, r_snrValue, corrSurfaceSize, stream); + +#ifdef CUAMPCOR_DEBUG + r_snrValue->outputToFile("r_snrValue", stream); + r_covValue->outputToFile("r_covValue", stream); +#endif + + // oversample the correlation surface + if(param->oversamplingMethod) { + // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversamplingfactor + // we need the max loc as the center if shifted + std::cout << "Sinc oversampler does not work at this moment\n"; + exit(1); + // corrSincOverSampler->execute(r_corrBatchZoomIn, r_corrBatchZoomInOverSampled, + // maxLocShift, param->oversamplingFactor*param->rawDataOversamplingFactor + // ); + + } + else { + corrOverSampler->execute(r_corrBatchZoomIn, r_corrBatchZoomInOverSampled); + } + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface + r_corrBatchZoomInOverSampled->outputToFile("r_corrBatchZoomInOverSampled", stream); +#endif + + //find the max again + cuArraysMaxloc2D(r_corrBatchZoomInOverSampled, offsetZoomIn, corrMaxValue, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the max location on oversampled correlation surface + offsetZoomIn->outputToFile("i_offsetZoomIn", stream); + corrMaxValue->outputToFile("r_maxvalZoomInOversampled", stream); +#endif + + // determine the final offset from initial (pixel/2) and oversampled (sub-pixel) + cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal, + make_int2(param->corrWindowSize.x/2, param->corrWindowSize.y/2), // init offset origin + param->rawDataOversamplingFactor, // init offset factor + make_int2(param->corrZoomInSize.x/2*param->oversamplingFactor, param->corrZoomInSize.y/2*param->oversamplingFactor), + param->rawDataOversamplingFactor*param->oversamplingFactor, + stream); + +#ifdef CUAMPCOR_DEBUG + // dump the final offset + offsetFinal->outputToFile("i_offsetFinal", stream); +#endif + + // Insert the chunk results to final images + cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // snr + cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // Variance. + cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // peak value. + cuArraysCopyInsert(r_maxval, peakValueImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // all done +} + +/// set chunk index +void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_) +{ + idxChunkDown = idxDown_; + idxChunkAcross = idxAcross_; + idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross; + + if(idxChunkDown == param->numberChunkDown -1) { + nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1); + } + else { + nWindowsDown = param->numberWindowDownInChunk; + } + + if(idxChunkAcross == param->numberChunkAcross -1) { + nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1); + } + else { + nWindowsAcross = param->numberWindowAcrossInChunk; + } +} + +/// obtain the starting pixels for each chip +/// @param[in] oStartPixel start pixel locations for all chips +/// @param[out] rstartPixel start pixel locations for chips within the chunk +void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff) +{ + for(int i=0; inumberWindowDownInChunk; ++i) { + int iDown = i; + if(i>=nWindowsDown) iDown = nWindowsDown-1; + for(int j=0; jnumberWindowAcrossInChunk; ++j){ + int iAcross = j; + if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; + int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; + int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross + + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; + rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff; + } + } +} + +void cuAmpcorChunk::loadReferenceChunk() +{ + + // we first load the whole chunk of image from cpu to a gpu buffer c(r)_referenceChunkRaw + // then copy to a batch of windows with (nImages, height, width) (leading dimension on the right) + + // get the chunk size to be loaded to gpu + int startD = param->referenceChunkStartPixelDown[idxChunk]; //start pixel down (along height) + int startA = param->referenceChunkStartPixelAcross[idxChunk]; // start pixel across (along width) + int height = param->referenceChunkHeight[idxChunk]; // number of pixels along height + int width = param->referenceChunkWidth[idxChunk]; // number of pixels along width + +#ifdef CUAMPCOR_DEBUG + std::cout << "loading reference chunk ...\n " + << " index: " << idxChunk << " " + << "starting pixel: (" << startD << ", " << startA << ") " + << "size : (" << height << ", " << width << ")" + << "\n"; +#endif + + + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) + { + // yes, simply set the image to 0 + c_referenceBatchRaw->setZero(stream); + } + else + { + // use cpu to compute the starting positions for each window + getRelativeOffset(ChunkOffsetDown->hostData, param->referenceStartPixelDown, param->referenceChunkStartPixelDown[idxChunk]); + // copy the positions to gpu + ChunkOffsetDown->copyToDevice(stream); + // same for the across direction + getRelativeOffset(ChunkOffsetAcross->hostData, param->referenceStartPixelAcross, param->referenceChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + +#ifdef CUAMPCOR_DEBUG + std::cout << "loading reference windows from chunk debug ... \n"; + auto * startPixelDownToChunk = ChunkOffsetDown->hostData; + auto * startPixelAcrossToChunk = ChunkOffsetAcross->hostData; + + for(int i=0; inumberWindowDownInChunk; ++i) { + int iDown = i; + if(i>=nWindowsDown) iDown = nWindowsDown-1; + for(int j=0; jnumberWindowAcrossInChunk; ++j){ + int iAcross = j; + if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; + int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; + int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross + + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; + std::cout << "Window index in chuck: (" << iDown << ", " << iAcross << ") \n"; + std::cout << " Staring pixel location from raw: (" << param->referenceStartPixelDown[idxInAll] << ", " + << param->referenceStartPixelAcross[idxInAll] <<")\n"; + std::cout << " Staring pixel location from chunk: (" << startPixelDownToChunk[idxInChunk] << ", " + << startPixelAcrossToChunk[idxInChunk] <<")\n"; + + } + } + +#endif + + + // check whether the image is complex (e.g., SLC) or real( e.g. TIFF) + if(referenceImage->isComplex()) + { + // allocate a gpu buffer to load data from cpu/file + // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 + c_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + c_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)c_referenceChunkRaw->devData, startD, startA, height, width, stream); + + //copy the chunk to a batch format (nImages, height, width) + // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + // deallocate the gpu buffer + delete c_referenceChunkRaw; + } + // if the image is real + else { + r_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + r_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)r_referenceChunkRaw->devData, startD, startA, height, width, stream); + + // copy the chunk (real) to a batch format (complex) + cuArraysCopyToBatchWithOffsetR2C(r_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + // deallocate the gpu buffer + delete r_referenceChunkRaw; + } // end of if complex + } // end of if all pixels out of range +} + +void cuAmpcorChunk::loadSecondaryChunk() +{ + // get the chunk size to be loaded to gpu + int height = param->secondaryChunkHeight[idxChunk]; // number of pixels along height + int width = param->secondaryChunkWidth[idxChunk]; // number of pixels along width + + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) + { + // yes, simply set the image to 0 + c_secondaryBatchRaw->setZero(stream); + } + else + { + //copy to a batch format (nImages, height, width) + getRelativeOffset(ChunkOffsetDown->hostData, param->secondaryStartPixelDown, param->secondaryChunkStartPixelDown[idxChunk]); + ChunkOffsetDown->copyToDevice(stream); + getRelativeOffset(ChunkOffsetAcross->hostData, param->secondaryStartPixelAcross, param->secondaryChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + + if(secondaryImage->isComplex()) + { + c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + c_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + secondaryImage->loadToDevice(c_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + delete c_secondaryChunkRaw; + } + else { //real image + //allocate the gpu buffer + r_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + r_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + secondaryImage->loadToDevice(r_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + // convert to the batch format + cuArraysCopyToBatchWithOffsetR2C(r_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + delete r_secondaryChunkRaw; + } + } +} + +/// constructor +cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, GDALImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_) + +{ + param = param_; + referenceImage = reference_; + secondaryImage = secondary_; + offsetImage = offsetImage_; + snrImage = snrImage_; + covImage = covImage_; + peakValueImage = peakValueImage_; + + stream = stream_; + + ChunkOffsetDown = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + ChunkOffsetDown->allocate(); + ChunkOffsetDown->allocateHost(); + ChunkOffsetAcross = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + ChunkOffsetAcross->allocate(); + ChunkOffsetAcross->allocateHost(); + + c_referenceBatchRaw = new cuArrays ( + param->windowSizeHeightRawEnlarged, param->windowSizeWidthRawEnlarged, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_referenceBatchRaw->allocate(); + + c_secondaryBatchRaw = new cuArrays ( + param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_secondaryBatchRaw->allocate(); + + c_referenceBatchOverSampled = new cuArrays ( + param->windowSizeHeightEnlarged, param->windowSizeWidthEnlarged, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_referenceBatchOverSampled->allocate(); + + c_secondaryBatchOverSampled = new cuArrays ( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_secondaryBatchOverSampled->allocate(); + + r_referenceBatchOverSampled = new cuArrays ( + param->windowSizeHeight, param->windowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_referenceBatchOverSampled->allocate(); + + r_secondaryBatchOverSampled = new cuArrays ( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_secondaryBatchOverSampled->allocate(); + + referenceBatchOverSampler = new cuOverSamplerC2C( + c_referenceBatchRaw->height, c_referenceBatchRaw->width, //orignal size + c_referenceBatchOverSampled->height, c_referenceBatchOverSampled->width, //oversampled size + c_referenceBatchRaw->count, stream); + + secondaryBatchOverSampler = new cuOverSamplerC2C( + c_secondaryBatchRaw->height, c_secondaryBatchRaw->width, + c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, + c_secondaryBatchRaw->count, stream); + + r_corrBatch = new cuArrays ( + param->corrWindowSize.x, + param->corrWindowSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatch->allocate(); + + r_corrBatchZoomIn = new cuArrays ( + param->corrZoomInSize.x, + param->corrZoomInSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchZoomIn->allocate(); + + r_corrBatchZoomInOverSampled = new cuArrays ( + param->corrZoomInOversampledSize.x, + param->corrZoomInOversampledSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchZoomInOverSampled->allocate(); + + offsetInit = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetInit->allocate(); + + offsetZoomIn = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetZoomIn->allocate(); + + offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetFinal->allocate(); + + maxLocShift = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + maxLocShift->allocate(); + + corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + corrMaxValue->allocate(); + + r_corrBatchSum = new cuArrays ( + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchSum->allocate(); + + i_maxloc = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + i_maxloc->allocate(); + + r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_maxval->allocate(); + + r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_snrValue->allocate(); + + r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_covValue->allocate(); + + // end of new arrays + + if(param->oversamplingMethod) { + corrSincOverSampler = new cuSincOverSamplerR2R(param->oversamplingFactor, stream); + } + else { + corrOverSampler= new cuOverSamplerR2R( + r_corrBatchZoomIn->height, r_corrBatchZoomIn->width, + r_corrBatchZoomInOverSampled->height, r_corrBatchZoomInOverSampled->width, + param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, + stream); + } + if(param->algorithm == 0) { + cuCorrFreqDomain_OverSampled = new cuFreqCorrelator( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk, + stream); + } + + + corrNormalizerOverSampled = + std::unique_ptr(newCuNormalizer( + param->searchWindowSizeHeight, + param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + )); + + +#ifdef CUAMPCOR_DEBUG + std::cout << "all objects in chunk are created ...\n"; +#endif +} + +// destructor +cuAmpcorChunk::~cuAmpcorChunk() +{ + corrNormalizerOverSampled.release(); + + if(param->oversamplingMethod) { + delete corrSincOverSampler; + } + else { + delete corrOverSampler; + } + if(param->algorithm == 0) { + delete cuCorrFreqDomain_OverSampled; + } + + delete ChunkOffsetDown ; + delete ChunkOffsetAcross ; + delete c_referenceBatchRaw; + delete c_secondaryBatchRaw; + delete c_referenceBatchOverSampled; + delete c_secondaryBatchOverSampled; + delete r_referenceBatchOverSampled; + delete r_secondaryBatchOverSampled; + delete referenceBatchOverSampler; + delete secondaryBatchOverSampler; + + delete r_corrBatch; + delete r_corrBatchZoomIn; + delete r_corrBatchZoomInOverSampled; + delete offsetInit; + delete offsetZoomIn; + delete offsetFinal; + delete maxLocShift; + delete corrMaxValue; + + delete r_corrBatchSum; + delete i_maxloc; + delete r_maxval; + delete r_snrValue; + delete r_covValue; + + // end of deletions + +} + +// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu deleted file mode 100644 index 144062b1c..000000000 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu +++ /dev/null @@ -1,583 +0,0 @@ -#include "cuAmpcorChunk.h" -#include "cuAmpcorUtil.h" - -/** - * Run ampcor process for a batch of images (a chunk) - * @param[in] idxDown_ index of the chunk along Down/Azimuth direction - * @param[in] idxAcross_ index of the chunk along Across/Range direction - */ -void cuAmpcorChunk::run(int idxDown_, int idxAcross_) -{ - // set chunk index - setIndex(idxDown_, idxAcross_); - - // load reference image chunk - loadReferenceChunk(); - // take amplitudes - cuArraysAbs(c_referenceBatchRaw, r_referenceBatchRaw, stream); - -#ifdef CUAMPCOR_DEBUG - // dump the raw reference image(s) - c_referenceBatchRaw->outputToFile("c_referenceBatchRaw", stream); - r_referenceBatchRaw->outputToFile("r_referenceBatchRaw", stream); -#endif - - // compute and subtract mean values (for normalized) - cuArraysSubtractMean(r_referenceBatchRaw, stream); - -#ifdef CUAMPCOR_DEBUG - // dump the raw reference image(s) - r_referenceBatchRaw->outputToFile("r_referenceBatchRawSubMean", stream); -#endif - - // load secondary image chunk - loadSecondaryChunk(); - // take amplitudes - cuArraysAbs(c_secondaryBatchRaw, r_secondaryBatchRaw, stream); - -#ifdef CUAMPCOR_DEBUG - // dump the raw secondary image(s) - c_secondaryBatchRaw->outputToFile("c_secondaryBatchRaw", stream); - r_secondaryBatchRaw->outputToFile("r_secondaryBatchRaw", stream); -#endif - - //cross correlation for un-oversampled data - if(param->algorithm == 0) { - cuCorrFreqDomain->execute(r_referenceBatchRaw, r_secondaryBatchRaw, r_corrBatchRaw); - } else { - cuCorrTimeDomain(r_referenceBatchRaw, r_secondaryBatchRaw, r_corrBatchRaw, stream); //time domain cross correlation - } - -#ifdef CUAMPCOR_DEBUG - // dump the un-normalized correlation surface - r_corrBatchRaw->outputToFile("r_corrBatchRawUnNorm", stream); -#endif - - // normalize the correlation surface - corrNormalizerRaw->execute(r_corrBatchRaw, r_referenceBatchRaw, r_secondaryBatchRaw, stream); - -#ifdef CUAMPCOR_DEBUG - // dump the normalized correlation surface - r_corrBatchRaw->outputToFile("r_corrBatchRaw", stream); -#endif - - // find the maximum location of none-oversampled correlation - // 41 x 41, if halfsearchrange=20 - cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, r_maxval, stream); - - // estimate variance - cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_referenceBatchRaw->size, r_covValue, stream); - - // estimate SNR - // step1: extraction of correlation surface around the peak - cuArraysCopyExtractCorr(r_corrBatchRaw, r_corrBatchRawZoomIn, i_corrBatchZoomInValid, offsetInit, stream); - - // step2: summation of correlation and data point values - cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream); - -#ifdef CUAMPCOR_DEBUG - r_maxval->outputToFile("r_maxval", stream); - r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn", stream); - i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid", stream); - r_corrBatchSum->outputToFile("r_corrBatchSum", stream); - i_corrBatchValidCount->outputToFile("i_corrBatchValidCount", stream); -#endif - - // step3: divide the peak value by the mean of surrounding values - cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream); - -#ifdef CUAMPCOR_DEBUG - offsetInit->outputToFile("i_offsetInit", stream); - r_snrValue->outputToFile("r_snrValue", stream); - r_covValue->outputToFile("r_covValue", stream); -#endif - - // Using the approximate estimation to adjust secondary image (half search window size becomes only 4 pixels) - // determine the starting pixel to extract secondary images around the max location - cuDetermineSecondaryExtractOffset(offsetInit, - maxLocShift, - param->halfSearchRangeDownRaw, // old range - param->halfSearchRangeAcrossRaw, - param->halfZoomWindowSizeRaw, // new range - param->halfZoomWindowSizeRaw, - stream); - -#ifdef CUAMPCOR_DEBUG - offsetInit->outputToFile("i_offsetInitAdjusted", stream); - maxLocShift->outputToFile("i_maxLocShift", stream); -#endif - - // oversample reference - // (deramping included in oversampler) - referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod); - // take amplitudes - cuArraysAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled, stream); - -#ifdef CUAMPCOR_DEBUG - // dump the oversampled reference image(s) - c_referenceBatchOverSampled->outputToFile("c_referenceBatchOverSampled", stream); - r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampled", stream); -#endif - - // compute and subtract the mean value - cuArraysSubtractMean(r_referenceBatchOverSampled, stream); - -#ifdef CUAMPCOR_DEBUG - // dump the oversampled reference image(s) with mean subtracted - r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean",stream); -#endif - - // extract secondary and oversample - cuArraysCopyExtract(c_secondaryBatchRaw, c_secondaryBatchZoomIn, offsetInit, stream); - secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled, param->derampMethod); - // take amplitudes - cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled, stream); - -#ifdef CUAMPCOR_DEBUG - // dump the extracted raw secondary image - c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomIn", stream); - // dump the oversampled secondary image(s) - c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled", stream); - r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled", stream); -#endif - - // correlate oversampled images - if(param->algorithm == 0) { - cuCorrFreqDomain_OverSampled->execute(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatchZoomIn); - } - else { - cuCorrTimeDomain(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatchZoomIn, stream); - } - -#ifdef CUAMPCOR_DEBUG - // dump the oversampled correlation surface (un-normalized) - r_corrBatchZoomIn->outputToFile("r_corrBatchZoomInUnNorm", stream); -#endif - - // normalize the correlation surface - corrNormalizerOverSampled->execute(r_corrBatchZoomIn, r_referenceBatchOverSampled, r_secondaryBatchOverSampled, stream); - -#ifdef CUAMPCOR_DEBUG - // dump the oversampled correlation surface (normalized) - r_corrBatchZoomIn->outputToFile("r_corrBatchZoomIn", stream); -#endif - - // remove the last row and col to get even sequences - cuArraysCopyExtract(r_corrBatchZoomIn, r_corrBatchZoomInAdjust, make_int2(0,0), stream); - -#ifdef CUAMPCOR_DEBUG - // dump the adjusted correlation Surface - r_corrBatchZoomInAdjust->outputToFile("r_corrBatchZoomInAdjust", stream); -#endif - - // oversample the correlation surface - if(param->oversamplingMethod) { - // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversamplingfactor - // we need the max loc as the center if shifted - corrSincOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled, - maxLocShift, param->oversamplingFactor*param->rawDataOversamplingFactor - ); - } - else { - corrOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled); - } - -#ifdef CUAMPCOR_DEBUG - // dump the oversampled correlation surface - r_corrBatchZoomInOverSampled->outputToFile("r_corrBatchZoomInOverSampled", stream); -#endif - - //find the max again - cuArraysMaxloc2D(r_corrBatchZoomInOverSampled, offsetZoomIn, corrMaxValue, stream); - -#ifdef CUAMPCOR_DEBUG - // dump the max location on oversampled correlation surface - offsetZoomIn->outputToFile("i_offsetZoomIn", stream); - corrMaxValue->outputToFile("r_maxvalZoomInOversampled", stream); -#endif - - // determine the final offset from non-oversampled (pixel) and oversampled (sub-pixel) - // = (Init-HalfsearchRange) + ZoomIn/(2*ovs) - cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal, - param->oversamplingFactor, param->rawDataOversamplingFactor, - param->halfSearchRangeDownRaw, param->halfSearchRangeAcrossRaw, - stream); - - // Insert the chunk results to final images - cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); - // snr - cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); - // Variance. - cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); - // Cross-correlation peak - cuArraysCopyInsert(corrMaxValue, corrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); - // all done - -} - -/// set chunk index -void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_) -{ - idxChunkDown = idxDown_; - idxChunkAcross = idxAcross_; - idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross; - - if(idxChunkDown == param->numberChunkDown -1) { - nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1); - } - else { - nWindowsDown = param->numberWindowDownInChunk; - } - - if(idxChunkAcross == param->numberChunkAcross -1) { - nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1); - } - else { - nWindowsAcross = param->numberWindowAcrossInChunk; - } -} - -/// obtain the starting pixels for each chip -/// @param[in] oStartPixel start pixel locations for all chips -/// @param[out] rstartPixel start pixel locations for chips within the chunk -void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff) -{ - for(int i=0; inumberWindowDownInChunk; ++i) { - int iDown = i; - if(i>=nWindowsDown) iDown = nWindowsDown-1; - for(int j=0; jnumberWindowAcrossInChunk; ++j){ - int iAcross = j; - if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; - int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; - int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross - + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; - rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff; - } - } -} - -void cuAmpcorChunk::loadReferenceChunk() -{ - - // we first load the whole chunk of image from cpu to a gpu buffer c(r)_referenceChunkRaw - // then copy to a batch of windows with (nImages, height, width) (leading dimension on the right) - - // get the chunk size to be loaded to gpu - int startD = param->referenceChunkStartPixelDown[idxChunk]; //start pixel down (along height) - int startA = param->referenceChunkStartPixelAcross[idxChunk]; // start pixel across (along width) - int height = param->referenceChunkHeight[idxChunk]; // number of pixels along height - int width = param->referenceChunkWidth[idxChunk]; // number of pixels along width - - //use cpu to compute the starting positions for each window - getRelativeOffset(ChunkOffsetDown->hostData, ¶m->referenceStartPixelDown[0], param->referenceChunkStartPixelDown[idxChunk]); - // copy the positions to gpu - ChunkOffsetDown->copyToDevice(stream); - // same for the across direction - getRelativeOffset(ChunkOffsetAcross->hostData, ¶m->referenceStartPixelAcross[0], param->referenceChunkStartPixelAcross[idxChunk]); - ChunkOffsetAcross->copyToDevice(stream); - - // check whether the image is complex (e.g., SLC) or real( e.g. TIFF) - if(referenceImage->isComplex()) - { - // allocate a gpu buffer to load data from cpu/file - // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 - c_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); - c_referenceChunkRaw->allocate(); - - // load the data from cpu - referenceImage->loadToDevice((void *)c_referenceChunkRaw->devData, startD, startA, height, width, stream); - - //copy the chunk to a batch format (nImages, height, width) - // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data - if(param->derampMethod == 0) { - cuArraysCopyToBatchAbsWithOffset(c_referenceChunkRaw, param->referenceChunkWidth[idxChunk], - c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - } - else { - cuArraysCopyToBatchWithOffset(c_referenceChunkRaw, param->referenceChunkWidth[idxChunk], - c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - } - // deallocate the gpu buffer - delete c_referenceChunkRaw; - } - // if the image is real - else { - r_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); - r_referenceChunkRaw->allocate(); - - // load the data from cpu - referenceImage->loadToDevice((void *)r_referenceChunkRaw->devData, startD, startA, height, width, stream); - - // copy the chunk (real) to a batch format (complex) - cuArraysCopyToBatchWithOffsetR2C(r_referenceChunkRaw, param->referenceChunkWidth[idxChunk], - c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - // deallocate the gpu buffer - delete r_referenceChunkRaw; - } - - -} - -void cuAmpcorChunk::loadSecondaryChunk() -{ - - //copy to a batch format (nImages, height, width) - getRelativeOffset(ChunkOffsetDown->hostData, ¶m->secondaryStartPixelDown[0], param->secondaryChunkStartPixelDown[idxChunk]); - ChunkOffsetDown->copyToDevice(stream); - getRelativeOffset(ChunkOffsetAcross->hostData, ¶m->secondaryStartPixelAcross[0], param->secondaryChunkStartPixelAcross[idxChunk]); - ChunkOffsetAcross->copyToDevice(stream); - - if(secondaryImage->isComplex()) - { - c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); - c_secondaryChunkRaw->allocate(); - - //load a chunk from mmap to gpu - secondaryImage->loadToDevice(c_secondaryChunkRaw->devData, - param->secondaryChunkStartPixelDown[idxChunk], - param->secondaryChunkStartPixelAcross[idxChunk], - param->secondaryChunkHeight[idxChunk], - param->secondaryChunkWidth[idxChunk], - stream); - - if(param->derampMethod == 0) { - cuArraysCopyToBatchAbsWithOffset(c_secondaryChunkRaw, param->secondaryChunkWidth[idxChunk], - c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - } - else { - cuArraysCopyToBatchWithOffset(c_secondaryChunkRaw, param->secondaryChunkWidth[idxChunk], - c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - } - delete c_secondaryChunkRaw; - } - else { //real image - //allocate the gpu buffer - r_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); - r_secondaryChunkRaw->allocate(); - - //load a chunk from mmap to gpu - secondaryImage->loadToDevice(r_secondaryChunkRaw->devData, - param->secondaryChunkStartPixelDown[idxChunk], - param->secondaryChunkStartPixelAcross[idxChunk], - param->secondaryChunkHeight[idxChunk], - param->secondaryChunkWidth[idxChunk], - stream); - - // convert to the batch format - cuArraysCopyToBatchWithOffsetR2C(r_secondaryChunkRaw, param->secondaryChunkWidth[idxChunk], - c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); - delete r_secondaryChunkRaw; - } -} - -/// constructor -cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, GDALImage *secondary_, - cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, - cuArrays *corrImage_, - cudaStream_t stream_) - -{ - param = param_; - referenceImage = reference_; - secondaryImage = secondary_; - offsetImage = offsetImage_; - snrImage = snrImage_; - covImage = covImage_; - corrImage = corrImage_; - - stream = stream_; - - ChunkOffsetDown = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - ChunkOffsetDown->allocate(); - ChunkOffsetDown->allocateHost(); - ChunkOffsetAcross = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - ChunkOffsetAcross->allocate(); - ChunkOffsetAcross->allocateHost(); - - c_referenceBatchRaw = new cuArrays ( - param->windowSizeHeightRaw, param->windowSizeWidthRaw, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - c_referenceBatchRaw->allocate(); - - c_secondaryBatchRaw = new cuArrays ( - param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - c_secondaryBatchRaw->allocate(); - - r_referenceBatchRaw = new cuArrays ( - param->windowSizeHeightRaw, param->windowSizeWidthRaw, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - r_referenceBatchRaw->allocate(); - - r_secondaryBatchRaw = new cuArrays ( - param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - r_secondaryBatchRaw->allocate(); - - c_secondaryBatchZoomIn = new cuArrays ( - param->searchWindowSizeHeightRawZoomIn, param->searchWindowSizeWidthRawZoomIn, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - c_secondaryBatchZoomIn->allocate(); - - c_referenceBatchOverSampled = new cuArrays ( - param->windowSizeHeight, param->windowSizeWidth, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - c_referenceBatchOverSampled->allocate(); - - c_secondaryBatchOverSampled = new cuArrays ( - param->searchWindowSizeHeight, param->searchWindowSizeWidth, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - c_secondaryBatchOverSampled->allocate(); - - r_referenceBatchOverSampled = new cuArrays ( - param->windowSizeHeight, param->windowSizeWidth, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - r_referenceBatchOverSampled->allocate(); - - r_secondaryBatchOverSampled = new cuArrays ( - param->searchWindowSizeHeight, param->searchWindowSizeWidth, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - r_secondaryBatchOverSampled->allocate(); - - referenceBatchOverSampler = new cuOverSamplerC2C( - c_referenceBatchRaw->height, c_referenceBatchRaw->width, //orignal size - c_referenceBatchOverSampled->height, c_referenceBatchOverSampled->width, //oversampled size - c_referenceBatchRaw->count, stream); - - secondaryBatchOverSampler = new cuOverSamplerC2C(c_secondaryBatchZoomIn->height, c_secondaryBatchZoomIn->width, - c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, c_secondaryBatchRaw->count, stream); - - r_corrBatchRaw = new cuArrays ( - param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1, - param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - r_corrBatchRaw->allocate(); - - r_corrBatchZoomIn = new cuArrays ( - param->searchWindowSizeHeight - param->windowSizeHeight+1, - param->searchWindowSizeWidth - param->windowSizeWidth+1, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - r_corrBatchZoomIn->allocate(); - - r_corrBatchZoomInAdjust = new cuArrays ( - param->searchWindowSizeHeight - param->windowSizeHeight, - param->searchWindowSizeWidth - param->windowSizeWidth, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - r_corrBatchZoomInAdjust->allocate(); - - - r_corrBatchZoomInOverSampled = new cuArrays ( - param->zoomWindowSize * param->oversamplingFactor, - param->zoomWindowSize * param->oversamplingFactor, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - r_corrBatchZoomInOverSampled->allocate(); - - offsetInit = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - offsetInit->allocate(); - - offsetZoomIn = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - offsetZoomIn->allocate(); - - offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - offsetFinal->allocate(); - - maxLocShift = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - maxLocShift->allocate(); - - corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - corrMaxValue->allocate(); - - - // new arrays due to snr estimation - r_corrBatchRawZoomIn = new cuArrays ( - param->corrRawZoomInHeight, - param->corrRawZoomInWidth, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - r_corrBatchRawZoomIn->allocate(); - - i_corrBatchZoomInValid = new cuArrays ( - param->corrRawZoomInHeight, - param->corrRawZoomInWidth, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - i_corrBatchZoomInValid->allocate(); - - - r_corrBatchSum = new cuArrays ( - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - r_corrBatchSum->allocate(); - - i_corrBatchValidCount = new cuArrays ( - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - i_corrBatchValidCount->allocate(); - - i_maxloc = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - - i_maxloc->allocate(); - - r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - - r_maxval->allocate(); - - r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - - r_snrValue->allocate(); - - r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - - r_covValue->allocate(); - - // end of new arrays - - if(param->oversamplingMethod) { - corrSincOverSampler = new cuSincOverSamplerR2R(param->oversamplingFactor, stream); - } - else { - corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize, - (param->zoomWindowSize)*param->oversamplingFactor, - (param->zoomWindowSize)*param->oversamplingFactor, - param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, - stream); - } - if(param->algorithm == 0) { - cuCorrFreqDomain = new cuFreqCorrelator( - param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, - param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, - stream); - cuCorrFreqDomain_OverSampled = new cuFreqCorrelator( - param->searchWindowSizeHeight, param->searchWindowSizeWidth, - param->numberWindowDownInChunk * param->numberWindowAcrossInChunk, - stream); - } - - corrNormalizerRaw = new cuNormalizer( - param->searchWindowSizeHeightRaw, - param->searchWindowSizeWidthRaw, - param->numberWindowDownInChunk * param->numberWindowAcrossInChunk - ); - - corrNormalizerOverSampled = new cuNormalizer( - param->searchWindowSizeHeight, - param->searchWindowSizeWidth, - param->numberWindowDownInChunk * param->numberWindowAcrossInChunk - ); - - -#ifdef CUAMPCOR_DEBUG - std::cout << "all objects in chunk are created ...\n"; -#endif -} - -// destructor -cuAmpcorChunk::~cuAmpcorChunk() -{ -} - -// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h index 5c990e233..beb127097 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h @@ -9,6 +9,7 @@ #define __CUAMPCORCHUNK_H #include "GDALImage.h" +#include "data_types.h" #include "cuArrays.h" #include "cuAmpcorParameter.h" #include "cuOverSampler.h" @@ -34,24 +35,26 @@ class cuAmpcorChunk{ GDALImage *referenceImage; ///< reference image object GDALImage *secondaryImage; ///< secondary image object cuAmpcorParameter *param; ///< reference to the (global) parameters - cuArrays *offsetImage; ///< output offsets image - cuArrays *snrImage; ///< snr image - cuArrays *covImage; ///< cov image - cuArrays *corrImage; ///< corr image + cuArrays *offsetImage; ///< output offsets image + cuArrays *snrImage; ///< snr image + cuArrays *covImage; ///< cov image + cuArrays *peakValueImage; ///< peak value image // local variables and workers // gpu buffer to load images from file - cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; - cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; + // image_complex_type uses original image type, + // convert to complex_type when copied to c_referenceBatchRaw + cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; + cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; // windows raw (not oversampled) data, complex and real - cuArrays * c_referenceBatchRaw, * c_secondaryBatchRaw, * c_secondaryBatchZoomIn; - cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; + cuArrays * c_referenceBatchRaw, * c_secondaryBatchRaw; + cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; // windows oversampled data - cuArrays * c_referenceBatchOverSampled, * c_secondaryBatchOverSampled; - cuArrays * r_referenceBatchOverSampled, * r_secondaryBatchOverSampled; - cuArrays * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust; + cuArrays * c_referenceBatchOverSampled, * c_secondaryBatchOverSampled; + cuArrays * r_referenceBatchOverSampled, * r_secondaryBatchOverSampled; + cuArrays * r_corrBatch, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled; // offset data cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; @@ -67,33 +70,33 @@ class cuAmpcorChunk{ cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; // correlation surface normalizer - cuNormalizer *corrNormalizerRaw; - cuNormalizer *corrNormalizerOverSampled; + std::unique_ptr corrNormalizerRaw; + std::unique_ptr corrNormalizerOverSampled; // save offset results in different stages cuArrays *offsetInit; cuArrays *offsetZoomIn; - cuArrays *offsetFinal; + cuArrays *offsetFinal; cuArrays *maxLocShift; // record the maxloc from the extract center - cuArrays *corrMaxValue; + cuArrays *corrMaxValue; cuArrays *i_maxloc; - cuArrays *r_maxval; + cuArrays *r_maxval; // SNR estimation - cuArrays *r_corrBatchRawZoomIn; - cuArrays *r_corrBatchSum; - cuArrays *i_corrBatchZoomInValid, *i_corrBatchValidCount; - cuArrays *r_snrValue; + cuArrays *r_corrBatchRawZoomIn; + cuArrays *r_corrBatchSum; + cuArrays *r_snrValue; // Variance estimation - cuArrays *r_covValue; + cuArrays *r_covValue; public: // constructor cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, GDALImage *secondary_, - cuArrays *offsetImage_, cuArrays *snrImage_, - cuArrays *covImage_, cuArrays* corrImage_, cudaStream_t stream_); + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); // destructor ~cuAmpcorChunk(); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp similarity index 71% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp index e906ef340..5658a9550 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp @@ -12,16 +12,30 @@ #include "cudaUtil.h" #include "cuAmpcorChunk.h" #include "cuAmpcorUtil.h" +#include #include // constructor cuAmpcorController::cuAmpcorController() { // create a new set of parameters - param.reset(new cuAmpcorParameter()); + param = new cuAmpcorParameter(); } +// destructor +cuAmpcorController::~cuAmpcorController() +{ + delete param; +} +bool cuAmpcorController::isDoublePrecision() +{ +#ifdef CUAMPCOR_DOUBLE + return true; +#else + return false; +#endif +} /** * Run ampcor * @@ -40,43 +54,45 @@ void cuAmpcorController::runAmpcor() std::cout << "Opening secondary image " << param->secondaryImageName << "...\n"; GDALImage *secondaryImage = new GDALImage(param->secondaryImageName, 1, param->mmapSizeInGB); - cuArrays *offsetImage, *offsetImageRun; - cuArrays *snrImage, *snrImageRun; - cuArrays *covImage, *covImageRun; - cuArrays *corrImage, *corrImageRun; + cuArrays *offsetImage, *offsetImageRun; + cuArrays *snrImage, *snrImageRun; + cuArrays *covImage, *covImageRun; + cuArrays *peakValueImage, *peakValueImageRun; // nWindowsDownRun is defined as numberChunk * numberWindowInChunk // It may be bigger than the actual number of windows int nWindowsDownRun = param->numberChunkDown * param->numberWindowDownInChunk; int nWindowsAcrossRun = param->numberChunkAcross * param->numberWindowAcrossInChunk; - offsetImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + offsetImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); offsetImageRun->allocate(); - snrImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + snrImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); snrImageRun->allocate(); - covImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + covImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); covImageRun->allocate(); - corrImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); - corrImageRun->allocate(); + peakValueImageRun = new cuArrays(nWindowsDownRun, nWindowsAcrossRun); + peakValueImageRun->allocate(); // Offset fields. - offsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + offsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); offsetImage->allocate(); // SNR. - snrImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + snrImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); snrImage->allocate(); // Variance. - covImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + covImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); covImage->allocate(); - // Cross-correlation peak - corrImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); - corrImage->allocate(); + // Correlation surface peak value + peakValueImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + peakValueImage->allocate(); + + // set up the cuda streams cudaStream_t streams[param->nStreams]; @@ -87,8 +103,8 @@ void cuAmpcorController::runAmpcor() // create each stream checkCudaErrors(cudaStreamCreate(&streams[ist])); // create the chunk processor for each stream - chunk[ist]= new cuAmpcorChunk(param.get(), referenceImage, secondaryImage, - offsetImageRun, snrImageRun, covImageRun, corrImageRun, + chunk[ist]= new cuAmpcorChunk(param, referenceImage, secondaryImage, + offsetImageRun, snrImageRun, covImageRun, peakValueImageRun, streams[ist]); } @@ -131,17 +147,17 @@ void cuAmpcorController::runAmpcor() cuArraysCopyExtract(offsetImageRun, offsetImage, make_int2(0,0), streams[0]); cuArraysCopyExtract(snrImageRun, snrImage, make_int2(0,0), streams[0]); cuArraysCopyExtract(covImageRun, covImage, make_int2(0,0), streams[0]); - cuArraysCopyExtract(corrImageRun, corrImage, make_int2(0,0), streams[0]); + cuArraysCopyExtract(peakValueImageRun, peakValueImage, make_int2(0,0), streams[0]); /* save the offsets and gross offsets */ // copy the offset to host offsetImage->allocateHost(); offsetImage->copyToHost(streams[0]); // construct the gross offset - cuArrays *grossOffsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); + cuArrays *grossOffsetImage = new cuArrays(param->numberWindowDown, param->numberWindowAcross); grossOffsetImage->allocateHost(); for(int i=0; i< param->numberWindows; i++) - grossOffsetImage->hostData[i] = make_float2(param->grossOffsetDown[i], param->grossOffsetAcross[i]); + grossOffsetImage->hostData[i] = make_real2(param->grossOffsetDown[i], param->grossOffsetAcross[i]); // check whether to merge gross offset if (param->mergeGrossOffset) @@ -158,25 +174,24 @@ void cuAmpcorController::runAmpcor() // save the snr/cov images snrImage->outputToFile(param->snrImageName, streams[0]); covImage->outputToFile(param->covImageName, streams[0]); - - // save the cross-correlation peak - corrImage->outputToFile(param->corrImageName, streams[0]); + peakValueImage->outputToFile(param->peakValueImageName, streams[0]); // Delete arrays. delete offsetImage; delete snrImage; delete covImage; - delete corrImage; + delete peakValueImage; delete offsetImageRun; delete snrImageRun; delete covImageRun; - delete corrImageRun; + delete peakValueImageRun; for (int ist=0; istnStreams; ist++) { - checkCudaErrors(cudaStreamDestroy(streams[ist])); + // cufftplan etc are stream dependent, need to be deleted before stream is destroyed delete chunk[ist]; + checkCudaErrors(cudaStreamDestroy(streams[ist])); } delete referenceImage; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h index e807cce32..466fd6859 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.h @@ -15,18 +15,19 @@ #ifndef CU_AMPCOR_CONTROLLER_H #define CU_AMPCOR_CONTROLLER_H -#include - // dependencies #include "cuAmpcorParameter.h" class cuAmpcorController { public: - std::unique_ptr param; ///< the parameter set + cuAmpcorParameter *param; ///< the parameter set // constructor cuAmpcorController(); + // destructor + ~cuAmpcorController(); // run interface void runAmpcor(); + bool isDoublePrecision(); }; #endif diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp similarity index 58% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp index 3a1e72e66..2edca32ee 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp @@ -6,6 +6,7 @@ #include "cuAmpcorParameter.h" #include + #ifndef IDIVUP #define IDIVUP(i,j) ((i+j-1)/j) #endif @@ -31,9 +32,9 @@ cuAmpcorParameter::cuAmpcorParameter() skipSampleAcrossRaw = 64; skipSampleDownRaw = 64; - rawDataOversamplingFactor = 2; - zoomWindowSize = 16; - oversamplingFactor = 16; + rawDataOversamplingFactor = 2; //antialiasing + zoomWindowSize = 16; // correlation surface oversampling zoom + oversamplingFactor = 16; // correlation surface oversampling oversamplingMethod = 0; referenceImageName = "reference.slc"; @@ -42,10 +43,11 @@ cuAmpcorParameter::cuAmpcorParameter() secondaryImageName = "secondary.slc"; secondaryImageWidth = 1000; secondaryImageHeight = 1000; - offsetImageName = "DenseOffset.off"; - grossOffsetImageName = "GrossOffset.off"; - snrImageName = "snr.snr"; - covImageName = "cov.cov"; + offsetImageName = "DenseOffset.bip"; + grossOffsetImageName = "GrossOffset.bip"; + snrImageName = "snr.bip"; + covImageName = "cov.bip"; + peakValueImageName = "peakValue.bip"; numberWindowDown = 1; numberWindowAcross = 1; numberWindowDownInChunk = 1; @@ -54,7 +56,7 @@ cuAmpcorParameter::cuAmpcorParameter() referenceStartPixelDown0 = 0; referenceStartPixelAcross0 = 0; - corrStatWindowSize = 21; // 10*2+1 as in RIOPAC + // corrStatWindowSize = 21; // 10*2+1 as in RIOPAC useMmap = 1; // use mmap mmapSizeInGB = 1; @@ -70,32 +72,61 @@ cuAmpcorParameter::cuAmpcorParameter() void cuAmpcorParameter::setupParameters() { // Size to extract the raw correlation surface for snr/cov - corrRawZoomInHeight = std::min(corrStatWindowSize, 2*halfSearchRangeDownRaw+1); - corrRawZoomInWidth = std::min(corrStatWindowSize, 2*halfSearchRangeAcrossRaw+1); + // corrRawZoomInHeight = std::min(corrStatWindowSize, 2*halfSearchRangeDownRaw+1); + // corrRawZoomInWidth = std::min(corrStatWindowSize, 2*halfSearchRangeAcrossRaw+1); // Size to extract the resampled correlation surface for oversampling // users should use 16 for zoomWindowSize, no need to multiply by 2 // zoomWindowSize *= rawDataOversamplingFactor; //8 * 2 // to check the search range - int corrSurfaceActualSize = - std::min(halfSearchRangeAcrossRaw, halfSearchRangeDownRaw)* - 2*rawDataOversamplingFactor; - zoomWindowSize = std::min(zoomWindowSize, corrSurfaceActualSize); + // int corrSurfaceActualSize = + // std::min(halfSearchRangeAcrossRaw, halfSearchRangeDownRaw)* + // 2*rawDataOversamplingFactor; + // zoomWindowSize = std::min(zoomWindowSize, corrSurfaceActualSize); - halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); // 8*2/(2*2) = 4 + // halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); // 8*2/(2*2) = 4 + // template window size (after antialiasing oversampling) windowSizeWidth = windowSizeWidthRaw*rawDataOversamplingFactor; // windowSizeHeight = windowSizeHeightRaw*rawDataOversamplingFactor; - searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeDownRaw; - searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeAcrossRaw; + // serve as extra margin for search range + halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); - searchWindowSizeWidthRawZoomIn = windowSizeWidthRaw + 2*halfZoomWindowSizeRaw; - searchWindowSizeHeightRawZoomIn = windowSizeHeightRaw + 2*halfZoomWindowSizeRaw; + // add to the search range + halfSearchRangeDownRaw += halfZoomWindowSizeRaw; + halfSearchRangeAcrossRaw += halfZoomWindowSizeRaw; - searchWindowSizeWidth = searchWindowSizeWidthRawZoomIn*rawDataOversamplingFactor; - searchWindowSizeHeight = searchWindowSizeHeightRawZoomIn*rawDataOversamplingFactor; + // add extra search range to ensure enough area to oversample correlation surface + searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeDownRaw; + searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeAcrossRaw; + windowSizeHeightRawEnlarged = searchWindowSizeHeightRaw; + windowSizeWidthRawEnlarged = searchWindowSizeWidthRaw; + + // search window size (after antialiasing oversampling) + searchWindowSizeWidth = searchWindowSizeWidthRaw*rawDataOversamplingFactor; + searchWindowSizeHeight = searchWindowSizeHeightRaw*rawDataOversamplingFactor; + + windowSizeHeightEnlarged = searchWindowSizeHeight; + windowSizeWidthEnlarged = searchWindowSizeWidth; + + // correlation surface size + corrWindowSize = make_int2(searchWindowSizeHeight - windowSizeHeight + 1, + searchWindowSizeWidth - windowSizeWidth +1); + // check zoom in window size, if larger, issue a warning + if(zoomWindowSize >= corrWindowSize.x || zoomWindowSize >= corrWindowSize.y) + fprintf(stderr, "Warning: zoomWindowSize %d is bigger than the original correlation surface size (%d, %d)!\n", + zoomWindowSize, corrWindowSize.x, corrWindowSize.y ); + // use the smaller values + corrZoomInSize = make_int2(std::min(zoomWindowSize+1, corrWindowSize.x), + std::min(zoomWindowSize+1, corrWindowSize.y)); + + // oversampled correlation surface size + corrZoomInOversampledSize = make_int2(corrZoomInSize.x * oversamplingFactor, + corrZoomInSize.y * oversamplingFactor); + fprintf(stderr, "zoomWindowSize is (%d, %d)!\n", + corrZoomInSize.x, corrZoomInSize.y ); numberWindows = numberWindowDown*numberWindowAcross; if(numberWindows <=0) { fprintf(stderr, "Incorrect number of windows! (%d, %d)\n", numberWindowDown, numberWindowAcross); @@ -111,25 +142,48 @@ void cuAmpcorParameter::setupParameters() void cuAmpcorParameter::allocateArrays() { - int arraySize = numberWindows; - grossOffsetDown.resize(arraySize); - grossOffsetAcross.resize(arraySize); - referenceStartPixelDown.resize(arraySize); - referenceStartPixelAcross.resize(arraySize); - secondaryStartPixelDown.resize(arraySize); - secondaryStartPixelAcross.resize(arraySize); - - int arraySizeChunk = numberChunks; - referenceChunkStartPixelDown.resize(arraySizeChunk); - referenceChunkStartPixelAcross.resize(arraySizeChunk); - secondaryChunkStartPixelDown.resize(arraySizeChunk); - secondaryChunkStartPixelAcross.resize(arraySizeChunk); - referenceChunkHeight.resize(arraySizeChunk); - referenceChunkWidth.resize(arraySizeChunk); - secondaryChunkHeight.resize(arraySizeChunk); - secondaryChunkWidth.resize(arraySizeChunk); + int arraySize = numberWindows*sizeof(int); + grossOffsetDown = (int *)malloc(arraySize); + grossOffsetAcross = (int *)malloc(arraySize); + referenceStartPixelDown = (int *)malloc(arraySize); + referenceStartPixelAcross = (int *)malloc(arraySize); + secondaryStartPixelDown = (int *)malloc(arraySize); + secondaryStartPixelAcross = (int *)malloc(arraySize); + + int arraySizeChunk = numberChunks*sizeof(int); + referenceChunkStartPixelDown = (int *)malloc(arraySizeChunk); + referenceChunkStartPixelAcross = (int *)malloc(arraySizeChunk); + secondaryChunkStartPixelDown = (int *)malloc(arraySizeChunk); + secondaryChunkStartPixelAcross = (int *)malloc(arraySizeChunk); + referenceChunkHeight = (int *)malloc(arraySizeChunk); + referenceChunkWidth = (int *)malloc(arraySizeChunk); + secondaryChunkHeight = (int *)malloc(arraySizeChunk); + secondaryChunkWidth = (int *)malloc(arraySizeChunk); +} + +void cuAmpcorParameter::deallocateArrays() +{ + free(grossOffsetDown); + free(grossOffsetAcross); + free(referenceStartPixelDown); + free(referenceStartPixelAcross); + free(secondaryStartPixelDown); + free(secondaryStartPixelAcross); + free(referenceChunkStartPixelDown); + free(referenceChunkStartPixelAcross); + free(secondaryChunkStartPixelDown); + free(secondaryChunkStartPixelAcross); + free(referenceChunkHeight); + free(referenceChunkWidth); + free(secondaryChunkHeight); + free(secondaryChunkWidth); } + +// **************** +// make reference window the same as secondary for oversampling +// **************** + /// Set starting pixels for reference and secondary windows from arrays /// set also gross offsets between reference and secondary windows /// @@ -137,12 +191,13 @@ void cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD { for(int i=0; i referenceImageHeight) mChunkED = referenceImageHeight; + mChunkEA += windowSizeWidthRawEnlarged; + if (mChunkEA > referenceImageWidth) mChunkED = referenceImageWidth; + sChunkED += searchWindowSizeHeightRaw; + if (sChunkED > secondaryImageHeight) sChunkED = secondaryImageHeight; + sChunkEA += searchWindowSizeWidthRaw; + if (sChunkEA > secondaryImageWidth) sChunkED = secondaryImageWidth; + // set the starting pixel and size of the chunk referenceChunkStartPixelDown[idxChunk] = mChunkSD; referenceChunkStartPixelAcross[idxChunk] = mChunkSA; secondaryChunkStartPixelDown[idxChunk] = sChunkSD; secondaryChunkStartPixelAcross[idxChunk] = sChunkSA; - referenceChunkHeight[idxChunk] = mChunkED - mChunkSD + windowSizeHeightRaw; - referenceChunkWidth[idxChunk] = mChunkEA - mChunkSA + windowSizeWidthRaw; - secondaryChunkHeight[idxChunk] = sChunkED - sChunkSD + searchWindowSizeHeightRaw; - secondaryChunkWidth[idxChunk] = sChunkEA - sChunkSA + searchWindowSizeWidthRaw; + referenceChunkHeight[idxChunk] = mChunkED - mChunkSD; + referenceChunkWidth[idxChunk] = mChunkEA - mChunkSA; + secondaryChunkHeight[idxChunk] = sChunkED - sChunkSD; + secondaryChunkWidth[idxChunk] = sChunkEA - sChunkSA; + // search the max chunk size, used to determine the allocated read buffer size if(maxReferenceChunkHeight < referenceChunkHeight[idxChunk]) maxReferenceChunkHeight = referenceChunkHeight[idxChunk]; if(maxReferenceChunkWidth < referenceChunkWidth[idxChunk] ) maxReferenceChunkWidth = referenceChunkWidth[idxChunk]; if(maxSecondaryChunkHeight < secondaryChunkHeight[idxChunk]) maxSecondaryChunkHeight = secondaryChunkHeight[idxChunk]; @@ -253,8 +328,11 @@ void cuAmpcorParameter::setChunkStartPixels() } /// check whether reference and secondary windows are within the image range +// now issue warning rather than errors void cuAmpcorParameter::checkPixelInImageRange() { +// check range is no longer required, but offered as an option in DEBUG +#ifdef CUAMPCOR_DEBUG int endPixel; for(int row=0; row= referenceImageHeight) { - fprintf(stderr, "Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Warning: Reference Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); } endPixel = referenceStartPixelAcross[i] + windowSizeWidthRaw; if(endPixel >= referenceImageWidth) { - fprintf(stderr, "Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Reference Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); } //secondary if(secondaryStartPixelDown[i] <0) { - fprintf(stderr, "Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window start pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelDown[i]); } if(secondaryStartPixelAcross[i] <0) { - fprintf(stderr, "Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window start pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, secondaryStartPixelAcross[i]); } endPixel = secondaryStartPixelDown[i] + searchWindowSizeHeightRaw; if(endPixel >= secondaryImageHeight) { - fprintf(stderr, "Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window end pixel out ot range in Down, window (%d,%d), pixel %d\n", row, col, endPixel); } endPixel = secondaryStartPixelAcross[i] + searchWindowSizeWidthRaw; if(endPixel >= secondaryImageWidth) { - fprintf(stderr, "Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); - exit(EXIT_FAILURE); + printf("Warning: Secondary Window end pixel out ot range in Across, window (%d,%d), pixel %d\n", row, col, endPixel); } } } +#endif // CUAMPCOR_DEBUG } -cuAmpcorParameter::~cuAmpcorParameter() {} +cuAmpcorParameter::~cuAmpcorParameter() +{ + deallocateArrays(); +} // end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h index 1a0f86031..a02bdead6 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h @@ -10,7 +10,7 @@ #define __CUAMPCORPARAMETER_H #include -#include +#include // for int2 /// Class container for all parameters /// @@ -32,14 +32,6 @@ class cuAmpcorParameter{ public: - - // TODO this is to avoid memory issues due to copying the non-owning pointers - // we should use RAII ownership so these can be trivially defaulted - cuAmpcorParameter(const cuAmpcorParameter&) = delete; - cuAmpcorParameter& operator=(const cuAmpcorParameter&) = delete; - cuAmpcorParameter(cuAmpcorParameter&&) = delete; - cuAmpcorParameter& operator=(cuAmpcorParameter&&) = delete; - int algorithm; ///< Cross-correlation algorithm: 0=freq domain (default) 1=time domain int deviceID; ///< Targeted GPU device ID: use -1 to auto select int nStreams; ///< Number of streams to asynchonize data transfers and compute kernels @@ -48,24 +40,25 @@ class cuAmpcorParameter{ // chip or window size for raw data int windowSizeHeightRaw; ///< Template window height (original size) int windowSizeWidthRaw; ///< Template window width (original size) + + int windowSizeHeightRawEnlarged; ///< Template window height Enlarged to search window size for oversampling + int windowSizeWidthRawEnlarged; ///< Template window width Enlarged to search window size for oversampling + int searchWindowSizeHeightRaw; ///< Search window height (original size) int searchWindowSizeWidthRaw; ///< Search window width (orignal size) int halfSearchRangeDownRaw; ///< (searchWindowSizeHeightRaw-windowSizeHeightRaw)/2 int halfSearchRangeAcrossRaw; ///< (searchWindowSizeWidthRaw-windowSizeWidthRaw)/2 // search range is (-halfSearchRangeRaw, halfSearchRangeRaw) - - int searchWindowSizeHeightRawZoomIn; ///< search window height used for zoom in - int searchWindowSizeWidthRawZoomIn; ///< search window width used for zoom in - - int corrStatWindowSize; ///< correlation surface size used to estimate snr - int corrRawZoomInHeight; ///< correlation surface height used for oversampling - int corrRawZoomInWidth; ///< correlation surface width used for oversampling + // note the search range now includes extra margin for the correlation surface extraction // chip or window size after oversampling - int rawDataOversamplingFactor; ///< Raw data overampling factor (from original size to oversampled size) + int rawDataOversamplingFactor; ///< Raw data oversampling factor (from original size to oversampled size) int windowSizeHeight; ///< Template window length (oversampled size) int windowSizeWidth; ///< Template window width (original size) + int windowSizeHeightEnlarged; ///< Template window length enlarged (oversampled size) + int windowSizeWidthEnlarged; ///< Template window width enlarged (original size) + int searchWindowSizeHeight; ///< Search window height (oversampled size) int searchWindowSizeWidth; ///< Search window width (oversampled size) @@ -80,7 +73,14 @@ class cuAmpcorParameter{ int oversamplingFactor; ///< Oversampling factor for interpolating correlation surface int oversamplingMethod; ///< correlation surface oversampling method 0 = fft (default) 1 = sinc + // correlation surface + int2 corrWindowSize; // 2*halfSearchRange + 1 + int2 corrZoomInSize; // zoomWindowSize+1 + int2 corrZoomInOversampledSize; // corrZoomInSize * oversamplingFactor + int corrStatWindowSize; ///< correlation surface size used to estimate snr + // int corrRawZoomInHeight; ///< correlation surface height used for oversampling + // int corrRawZoomInWidth; ///< correlation surface width used for oversampling float thresholdSNR; ///< Threshold of Signal noise ratio to remove noisy data //reference image @@ -113,24 +113,24 @@ class cuAmpcorParameter{ int referenceStartPixelDown0; ///< first starting pixel in reference image (down) int referenceStartPixelAcross0; ///< first starting pixel in reference image (across) - std::vector referenceStartPixelDown; ///< reference starting pixels for each window (down) - std::vector referenceStartPixelAcross; ///< reference starting pixels for each window (across) - std::vector secondaryStartPixelDown; ///< secondary starting pixels for each window (down) - std::vector secondaryStartPixelAcross; ///< secondary starting pixels for each window (across) + int *referenceStartPixelDown; ///< reference starting pixels for each window (down) + int *referenceStartPixelAcross; ///< reference starting pixels for each window (across) + int *secondaryStartPixelDown; ///< secondary starting pixels for each window (down) + int *secondaryStartPixelAcross; ///< secondary starting pixels for each window (across) int grossOffsetDown0; ///< gross offset static component (down) int grossOffsetAcross0; ///< gross offset static component (across) - std::vector grossOffsetDown; ///< Gross offsets between reference and secondary windows (down) - std::vector grossOffsetAcross; ///< Gross offsets between reference and secondary windows (across) + int *grossOffsetDown; ///< Gross offsets between reference and secondary windows (down) + int *grossOffsetAcross; ///< Gross offsets between reference and secondary windows (across) int mergeGrossOffset; ///< whether to merge gross offsets into the final offsets - std::vector referenceChunkStartPixelDown; ///< reference starting pixels for each chunk (down) - std::vector referenceChunkStartPixelAcross; ///< reference starting pixels for each chunk (across) - std::vector secondaryChunkStartPixelDown; ///< secondary starting pixels for each chunk (down) - std::vector secondaryChunkStartPixelAcross; ///< secondary starting pixels for each chunk (across) - std::vector referenceChunkHeight; ///< reference chunk height - std::vector referenceChunkWidth; ///< reference chunk width - std::vector secondaryChunkHeight; ///< secondary chunk height - std::vector secondaryChunkWidth; ///< secondary chunk width + int *referenceChunkStartPixelDown; ///< reference starting pixels for each chunk (down) + int *referenceChunkStartPixelAcross; ///< reference starting pixels for each chunk (across) + int *secondaryChunkStartPixelDown; ///< secondary starting pixels for each chunk (down) + int *secondaryChunkStartPixelAcross; ///< secondary starting pixels for each chunk (across) + int *referenceChunkHeight; ///< reference chunk height + int *referenceChunkWidth; ///< reference chunk width + int *secondaryChunkHeight; ///< secondary chunk height + int *secondaryChunkWidth; ///< secondary chunk width int maxReferenceChunkHeight, maxReferenceChunkWidth; ///< max reference chunk size int maxSecondaryChunkHeight, maxSecondaryChunkWidth; ///< max secondary chunk size @@ -138,7 +138,7 @@ class cuAmpcorParameter{ std::string offsetImageName; ///< Offset fields output filename std::string snrImageName; ///< Output SNR filename std::string covImageName; ///< Output variance filename - std::string corrImageName; ///< Output cross-correlation peak filename + std::string peakValueImageName; ///< Output correlation surface peak value filename // Class constructor and default parameters setter cuAmpcorParameter(); @@ -147,6 +147,9 @@ class cuAmpcorParameter{ // Allocate various arrays after the number of Windows is given void allocateArrays(); + // Deallocate arrays on exit + void deallocateArrays(); + // Three methods to set reference/secondary starting pixels and gross offsets from input reference start pixel(s) and gross offset(s) // 1 (int *, int *, int *, int *): varying reference start pixels and gross offsets diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h index 5685863e9..55dde31fe 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h @@ -9,6 +9,7 @@ #ifndef __CUAMPCORUTIL_H #define __CUAMPCORUTIL_H +#include "data_types.h" #include "cuArrays.h" #include "cuAmpcorParameter.h" #include "cudaError.h" @@ -18,14 +19,14 @@ //in cuArraysCopy.cu: various utilities for copy images file in gpu memory -void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); -void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); -void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); -void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream); -void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); +void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); +void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream); +void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream); +void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream); +void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream); // same routine name overloaded for different data type // extract data from a large image @@ -33,67 +34,73 @@ template void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *offset, cudaStream_t); template void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t); +void cuArraysCopyExtractAbs(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream); template void cuArraysCopyInsert(cuArrays *in, cuArrays *out, int offsetX, int offsetY, cudaStream_t); template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut,cudaStream_t stream); -void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t stream); +void cuArraysSetConstant(cuArrays *imageIn, real_type value, cudaStream_t stream); -void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream); // cuDeramp.cu: deramping phase -void cuDeramp(int method, cuArrays *images, cudaStream_t stream); -void cuDerampMethod1(cuArrays *images, cudaStream_t stream); +void cuDeramp(int method, cuArrays *images, cudaStream_t stream); +void cuDerampMethod1(cuArrays *images, cudaStream_t stream); // cuArraysPadding.cu: various utilities for oversampling padding -void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysFFTPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream); //in cuCorrNormalization.cu: utilities to normalize the cross correlation function -void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream); -void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream); +void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuCorrNormalize64(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize128(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize256(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize512(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalize1024(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); + +// in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); template -void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); +void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream); // in cuCorrNormalizationSAT.cu: to normalize the cross correlation function with sum area table -void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, - cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySAT, cuArrays *secondarySAT2, cudaStream_t stream); //in cuOffset.cu: utitilies for determining the max locaiton of cross correlations or the offset -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cudaStream_t stream); -void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetZoomIn, cuArrays *offsetFinal, - int OverSampleRatioZoomin, int OverSampleRatioRaw, - int xHalfRangeInit, int yHalfRangeInit, - cudaStream_t stream); +void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); +void cuArraysMaxloc2D(cuArrays *images, const int2 start, const int2 range, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); +void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetRoomIn, cuArrays *offsetFinal, const int2 initOrigin, const int initFactor, const int2 zoomInOrigin, const int zoomInFactor, cudaStream_t stream); void cuDetermineSecondaryExtractOffset(cuArrays *maxLoc, cuArrays *maxLocShift, int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream); //in cuCorrTimeDomain.cu: cross correlation in time domain -void cuCorrTimeDomain(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); +void cuCorrTimeDomain(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); //in cuCorrFrequency.cu: cross correlation in freq domain, also include fft correlatior class -void cuArraysElementMultiply(cuArrays *image1, cuArrays *image2, cudaStream_t stream); -void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, float coef, cudaStream_t stream); +void cuArraysElementMultiply(cuArrays *image1, cuArrays *image2, cudaStream_t stream); +void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, real_type coef, cudaStream_t stream); // For SNR estimation on Correlation surface (Minyan Zhong) // implemented in cuArraysCopy.cu -void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream); +void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *maxloc, cudaStream_t stream); // implemented in cuCorrNormalization.cu -void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, cuArrays *imagesValidCount, cudaStream_t stream); +void cuArraysSumSquare(cuArrays *images, cuArrays *imagesSum, cudaStream_t stream); // implemented in cuEstimateStats.cu -void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream); +void cuEstimateSnr(cuArrays *corrSum, cuArrays *maxval, cuArrays *snrValue, const int size, cudaStream_t stream); // implemented in cuEstimateStats.cu -void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, int templateSize, cuArrays *covValue, cudaStream_t stream); +void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, int templateSize, cuArrays *covValue, cudaStream_t stream); #endif diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp similarity index 60% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp index f7cc88837..642c29c52 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp @@ -7,6 +7,11 @@ // dependencies #include "cuArrays.h" #include "cudaError.h" +#include "float2.h" +#include "data_types.h" +#include +#include +#include // allocate arrays in device memory template @@ -62,6 +67,34 @@ void cuArrays::setZero(cudaStream_t stream) checkCudaErrors(cudaMemsetAsync(devData, 0, getByteSize(), stream)); } + +// Overloaded << operator for composite type +std::ostream& operator<<(std::ostream& os, const float2& p) { + + return os << "(" << p.x << ", " << p.y << ")"; +} + +std::ostream& operator<<(std::ostream& os, const float3& p) { + + return os << "(" << p.x << ", " << p.y << ", " << p.z << ")"; +} + +std::ostream& operator<<(std::ostream& os, const double2& p) { + + return os << "(" << p.x << ", " << p.y << ")"; +} + +std::ostream& operator<<(std::ostream& os, const double3& p) { + + return os << "(" << p.x << ", " << p.y << ", " << p.z << ")"; +} + +std::ostream& operator<<(std::ostream& os, const int2& p) { + + return os << "(" << p.x << ", " << p.y << ")"; +} + + // output (partial) data when debugging template void cuArrays::debuginfo(cudaStream_t stream) { @@ -87,63 +120,6 @@ void cuArrays::debuginfo(cudaStream_t stream) { } } -// need specializations for x,y components -template<> -void cuArrays::debuginfo(cudaStream_t stream) { - std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl; - if( !is_allocatedHost) - allocateHost(); - copyToHost(stream); - - int range = std::min(10, size*count); - - for(int i=0; irange) { - for(int i=size*count-range; i -void cuArrays::debuginfo(cudaStream_t stream) { - std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl; - if( !is_allocatedHost) - allocateHost(); - copyToHost(stream); - - int range = std::min(10, size*count); - - for(int i=0; irange) { - for(int i=size*count-range; i -void cuArrays::debuginfo(cudaStream_t stream) { - std::cout << "Image height,width,count: " << height << "," << width << "," << count << std::endl; - if( !is_allocatedHost) - allocateHost(); - copyToHost(stream); - - int range = std::min(10, size*count); - - for(int i=0; irange) { - for(int i=size*count-range; i @@ -169,6 +145,9 @@ void cuArrays::outputHostToFile(std::string fn) template class cuArrays; template class cuArrays; template class cuArrays; +template class cuArrays; +template class cuArrays; +template class cuArrays; template class cuArrays; template class cuArrays; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h index b823a6b9d..7e92acd2b 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h @@ -12,14 +12,9 @@ #define __CUARRAYS_H // cuda dependencies -#include #include -#include -#include -#include -#include - +#include template class cuArrays{ diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu index 20e749316..bd23b41ec 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu @@ -25,10 +25,11 @@ #include "cudaUtil.h" #include "cudaError.h" #include "float2.h" +#include "data_types.h" // cuda kernel for cuArraysCopyToBatch -__global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX, const int inNY, - float2 *imageOut, const int outNX, const int outNY, +__global__ void cuArraysCopyToBatch_kernel(const image_complex_type *imageIn, const int inNX, const int inNY, + complex_type *imageOut, const int outNX, const int outNY, const int nImagesX, const int nImagesY, const int strideX, const int strideY) { @@ -40,7 +41,7 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX int idxImageX = idxImage/nImagesY; int idxImageY = idxImage%nImagesY; int idxIn = (idxImageX*strideX+outx)*inNY + idxImageY*strideY+outy; - imageOut[idxOut] = imageIn[idxIn]; + imageOut[idxOut] = make_complex_type(imageIn[idxIn].x, imageIn[idxIn].y); } /** @@ -52,7 +53,7 @@ __global__ void cuArraysCopyToBatch_kernel(const float2 *imageIn, const int inNX * @param strideW stride along width to extract chips * @param stream cudaStream */ -void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, +void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream) { const int nthreads = NTHREADS2D; @@ -68,54 +69,113 @@ void cuArraysCopyToBatch(cuArrays *image1, cuArrays *image2, // kernel for cuArraysCopyToBatchWithOffset template -__global__ void cuArraysCopyToBatchWithOffset_kernel(const T_in *imageIn, const int inNY, +__global__ void cuArraysCopyToBatchWithOffset_kernel(const T_in *imageIn, const int inNX, const int inNY, T_out *imageOut, const int outNX, const int outNY, const int nImages, const int *offsetX, const int *offsetY) { + // get image index int idxImage = blockIdx.z; + // check the image index within range + if(idxImage>=nImages ) return; + // get the output pixel location int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; + // check the output location within range (due to cuda threads) + if(outx >= outNX || outy >= outNY) return; + // flatten the output location to 1d int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; - imageOut[idxOut] = T_out{imageIn[idxIn]}; + // find the input pixel location + int inx = offsetX[idxImage] + outx; + int iny = offsetY[idxImage] + outy; + // check whether the location is within the input image range + if(inx>=0 && inx=0 && iny +__global__ void cuArraysCopyToBatchWithOffset_kernel(const image_complex_type *imageIn, const int inNX, const int inNY, + complex_type *imageOut, const int outNX, const int outNY, const int nImages, + const int *offsetX, const int *offsetY) +{ + // get image index + int idxImage = blockIdx.z; + // check the image index within range + if(idxImage>=nImages ) return; + // get the output pixel location + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + // check the output location within range (due to cuda threads) + if(outx >= outNX || outy >= outNY) return; + // flatten the output location to 1d + int idxOut = idxImage*outNX*outNY + outx*outNY + outy; + // find the input pixel location + int inx = offsetX[idxImage] + outx; + int iny = offsetY[idxImage] + outy; + // check whether the location is within the input image range + if(inx>=0 && inx=0 && iny *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) +void cuArraysCopyToBatchWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream) { const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); cuArraysCopyToBatchWithOffset_kernel<<>> ( - image1->devData, lda1, + image1->devData, inNX, inNY, image2->devData, image2->height, image2->width, image2->count, offsetH, offsetW); getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); } // same as above, but from complex to real(take amplitudes) -__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const float2 *imageIn, const int inNY, - float2 *imageOut, const int outNX, const int outNY, const int nImages, +__global__ void cuArraysCopyToBatchAbsWithOffset_kernel(const image_complex_type *imageIn, const int inNX, const int inNY, + complex_type *imageOut, const int outNX, const int outNY, const int nImages, const int *offsetX, const int *offsetY) { + + // get image index int idxImage = blockIdx.z; + // check the image index within range + if(idxImage>=nImages ) return; + // get the output pixel location int outx = threadIdx.x + blockDim.x*blockIdx.x; int outy = threadIdx.y + blockDim.y*blockIdx.y; - if(idxImage>=nImages || outx >= outNX || outy >= outNY) return; + // check the output location within range (due to cuda threads) + if(outx >= outNX || outy >= outNY) return; + // flatten the output location to 1d int idxOut = idxImage*outNX*outNY + outx*outNY + outy; - int idxIn = (offsetX[idxImage]+outx)*inNY + offsetY[idxImage] + outy; - imageOut[idxOut] = make_float2(complexAbs(imageIn[idxIn]), 0.0); + // find the input pixel location + int inx = offsetX[idxImage] + outx; + int iny = offsetY[idxImage] + outy; + // check whether the location is within the input image range + if(inx>=0 && inx=0 && iny *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) +void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream) { const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); cuArraysCopyToBatchAbsWithOffset_kernel<<>> ( - image1->devData, lda1, + image1->devData, inNX, inNY, image2->devData, image2->height, image2->width, image2->count, offsetH, offsetW); getLastCudaError("cuArraysCopyToBatchAbsWithOffset_kernel"); @@ -151,24 +211,25 @@ void cuArraysCopyToBatchAbsWithOffset(cuArrays *image1, const int lda1, * @param strideW (varying) offsets along width to extract chips * @param stream cudaStream */ -void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int lda1, cuArrays *image2, - const int *offsetH, const int* offsetW, cudaStream_t stream) +void cuArraysCopyToBatchWithOffsetR2C(cuArrays *image1, const int inNX, const int inNY, + cuArrays *image2, const int *offsetH, const int* offsetW, cudaStream_t stream) { const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); + cuArraysCopyToBatchWithOffset_kernel<<>> ( - image1->devData, lda1, + image1->devData, inNX, inNY, image2->devData, image2->height, image2->width, image2->count, offsetH, offsetW); getLastCudaError("cuArraysCopyToBatchWithOffsetR2C_kernel"); } //copy a chunk into a series of chips, from complex to real -__global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, +__global__ void cuArraysCopyC2R_kernel(const complex_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImagesX, const int nImagesY, - const int strideX, const int strideY, const float factor) + const int strideX, const int strideY, const real_type factor) { int idxImage = blockIdx.z; int outx = threadIdx.x + blockDim.x*blockIdx.x; @@ -190,13 +251,13 @@ __global__ void cuArraysCopyC2R_kernel(const float2 *imageIn, const int inNX, co * @param strideW offsets along width to extract chips * @param stream cudaStream */ -void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, +void cuArraysCopyC2R(cuArrays *image1, cuArrays *image2, int strideH, int strideW, cudaStream_t stream) { const int nthreads = 16; dim3 blockSize(nthreads, nthreads, 1); dim3 gridSize(IDIVUP(image2->height,nthreads), IDIVUP(image2->width,nthreads), image2->count); - float factor = 1.0f/image1->size; //the FFT factor + real_type factor = 1.0f/image1->size; //the FFT factor cuArraysCopyC2R_kernel<<>> ( image1->devData, image1->height, image1->width, image2->devData, image2->height, image2->width, @@ -242,12 +303,12 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays } // instantiate the above template for the data types we need -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, cuArrays *offsets, cudaStream_t); // correlation surface extraction (Minyan Zhong) -__global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages, +__global__ void cuArraysCopyExtractVaryingOffsetCorr(const real_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImages, const int2 *maxloc) { @@ -266,21 +327,9 @@ __global__ void cuArraysCopyExtractVaryingOffsetCorr(const float *imageIn, const int iny = outy + maxloc[idxImage].y - outNY/2; // Find the location in flattened array. - int idxOut = ( blockIdx.z * outNX + outx ) * outNY + outy; - int idxIn = ( blockIdx.z * inNX + inx ) * inNY + iny; - - // check whether inside of the input image - if (inx>=0 && iny>=0 && inx *imagesIn, cuArrays *imagesOut, cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream) +void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *maxloc, cudaStream_t stream) { //assert(imagesIn->height >= imagesOut && inNY >= outNY); const int nthreads = 16; @@ -298,8 +347,10 @@ void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesO dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); - cuArraysCopyExtractVaryingOffsetCorr<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, - imagesOut->devData, imagesOut->height, imagesOut->width, imagesValid->devData, imagesOut->count, maxloc->devData); + cuArraysCopyExtractVaryingOffsetCorr<<>>( + imagesIn->devData, imagesIn->height, imagesIn->width, + imagesOut->devData, imagesOut->height, imagesOut->width, + imagesOut->count, maxloc->devData); getLastCudaError("cuArraysCopyExtract error"); } @@ -323,8 +374,8 @@ __global__ void cuArraysCopyExtractFixedOffset(const T *imageIn, const int inNX, } } -__global__ void cuArraysCopyExtractFixedOffset(const float2 *imageIn, const int inNX, const int inNY, - float *imageOut, const int outNX, const int outNY, const int nImages, +__global__ void cuArraysCopyExtractFixedOffset(const complex_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImages, const int offsetX, const int offsetY) { int outx = threadIdx.x + blockDim.x*blockIdx.x; @@ -356,10 +407,43 @@ void cuArraysCopyExtract(cuArrays *imagesIn, cuArrays *imagesOut, i } // instantiate the above template for the data types we need -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); -template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); +template void cuArraysCopyExtract(cuArrays *in, cuArrays *out, int2 offset, cudaStream_t); + + +__global__ void cuArraysCopyExtractAbsFixedOffset(const complex_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, const int nImages, + const int offsetX, const int offsetY) +{ + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + + if(outx < outNX && outy < outNY) + { + int idxOut = (blockIdx.z * outNX + outx)*outNY+outy; + int idxIn = (blockIdx.z*inNX + outx + offsetX)*inNY + outy + offsetY; + imageOut[idxOut] = complexAbs(imageIn[idxIn]); + } +} +/** + * copy/extract images from a large size to + * a smaller size from the location (offsetX, offsetY), take amplitude + */ +void cuArraysCopyExtractAbs(cuArrays *imagesIn, cuArrays *imagesOut, int2 offset, cudaStream_t stream) +{ + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = NTHREADS2D; + dim3 threadsperblock(nthreads, nthreads,1); + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + cuArraysCopyExtractAbsFixedOffset<<>> + (imagesIn->devData, imagesIn->height, imagesIn->width, + imagesOut->devData, imagesOut->height, imagesOut->width, imagesOut->count, offset.x, offset.y); + getLastCudaError("cuArraysCopyExtractAbs error"); +} + + template __global__ void cuArraysCopyInsert_kernel(const T* imageIn, const int inNX, const int inNY, @@ -389,9 +473,9 @@ void cuArraysCopyInsert(cuArrays *imageIn, cuArrays *imageOut, int offsetX } // instantiate the above template for the data types we need -template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); -template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); -template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); +template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); template void cuArraysCopyInsert(cuArrays* in, cuArrays* out, int offX, int offY, cudaStream_t); template @@ -430,12 +514,12 @@ void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cuda } // instantiate the above template for the data types we need -template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); -template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); -template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); +template void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut, cudaStream_t); // cuda kernel for setting a constant value -__global__ void cuArraysSetConstant_kernel(float *image, int size, float value) +__global__ void cuArraysSetConstant_kernel(real_type *image, int size, real_type value) { int idx = threadIdx.x + blockDim.x*blockIdx.x; @@ -449,7 +533,7 @@ __global__ void cuArraysSetConstant_kernel(float *image, int size, float value) * Set real images to a constant value * @note use setZero if value=0 because cudaMemset is faster */ -void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t stream) +void cuArraysSetConstant(cuArrays *imageIn, real_type value, cudaStream_t stream) { const int nthreads = 256; int size = imageIn->getSize(); @@ -460,14 +544,14 @@ void cuArraysSetConstant(cuArrays *imageIn, float value, cudaStream_t str } -// convert float to float2(complex) -__global__ void cuArraysR2C_kernel(float *image1, float2 *image2, int size) +// convert real_type to complex_type(complex) +__global__ void cuArraysR2C_kernel(real_type *image1, complex_type *image2, int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if(idx < size) { image2[idx].x = image1[idx]; - image2[idx].y = 0.0f; + image2[idx].y = 0.0; } } @@ -476,7 +560,7 @@ __global__ void cuArraysR2C_kernel(float *image1, float2 *image2, int size) * @param[in] image1 input images * @param[out] image2 output images */ -void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int size = image1->getSize(); cuArraysR2C_kernel<<>>(image1->devData, image2->devData, size); @@ -484,8 +568,8 @@ void cuArraysR2C(cuArrays *image1, cuArrays *image2, cudaStream_t } -// take real part of float2 to float -__global__ void cuArraysC2R_kernel(float2 *image1, float *image2, int size) +// take real part of complex_type to real_type +__global__ void cuArraysC2R_kernel(complex_type *image1, real_type *image2, int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if(idx < size) @@ -499,7 +583,7 @@ __global__ void cuArraysC2R_kernel(float2 *image1, float *image2, int size) * @param[in] image1 input images * @param[out] image2 output images */ -void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int size = image1->getSize(); cuArraysC2R_kernel<<>>(image1->devData, image2->devData, size); @@ -507,7 +591,7 @@ void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t } // cuda kernel for cuArraysAbs -__global__ void cuArraysAbs_kernel(float2 *image1, float *image2, int size) +__global__ void cuArraysAbs_kernel(complex_type *image1, real_type *image2, int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if(idx < size) @@ -521,7 +605,7 @@ __global__ void cuArraysAbs_kernel(float2 *image1, float *image2, int size) * @param[in] image1 input images * @param[out] image2 output images */ -void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int size = image1->getSize(); cuArraysAbs_kernel<<>>(image1->devData, image2->devData, size); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu index af3927056..c3feacf30 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysPadding.cu @@ -1,83 +1,121 @@ /* * @file cuArraysPadding.cu - * @brief Utilities for padding zeros to cuArrays + * @brief Utilities for padding zeros to cuArrays for FFT (zero padding in the middle) */ #include "cuAmpcorUtil.h" #include "float2.h" -// cuda kernel for cuArraysPadding -__global__ void cuArraysPadding_kernel( - const float2 *image1, const int height1, const int width1, - float2 *image2, const int height2, const int width2) -{ - int tx = threadIdx.x + blockDim.x*blockIdx.x; - int ty = threadIdx.y + blockDim.y*blockIdx.y; - if(tx < height1/2 && ty < width1/2) - { - int tx1 = height1 - 1 - tx; - int ty1 = width1 -1 -ty; - int tx2 = height2 -1 -tx; - int ty2 = width2 -1 -ty; - - image2[IDX2R(tx, ty, width2)] = image1[IDX2R(tx, ty, width1)]; - image2[IDX2R(tx2, ty, width2)] = image1[IDX2R(tx1, ty, width1)]; - image2[IDX2R(tx, ty2, width2)] = image1[IDX2R(tx, ty1, width1)]; - image2[IDX2R(tx2, ty2, width2)] = image1[IDX2R(tx1, ty1, width1)]; - - } -} - -/** - * Padding zeros in the middle, move quads to corners - * @param[in] image1 input images - * @param[out] image2 output images - * @note This routine is for a single image, no longer used - */ -void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream) -{ - int ThreadsPerBlock = NTHREADS2D; - int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock); - int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock); - dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock); - dim3 dimGrid(BlockPerGridx, BlockPerGridy); - // set output image to 0 - checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream)); - // copy the quads of input images to four corners of the output images - cuArraysPadding_kernel<<>>( - image1->devData, image1->height, image1->width, - image2->devData, image2->height, image2->width); - getLastCudaError("cuArraysPadding_kernel"); -} -inline __device__ float2 cmplxMul(float2 c, float a) +// cuda kernel for zero padding for FFT oversampling, +// for both even and odd sequences +// @param[in] image1 input images +// @param[inout] image2 output images - memset to 0 in prior +// @note siding Nyquist frequency for even length with negative frequency +// for even N - positive f[0, ..., N/2-1], +// zeros 0...0, +// negative f[N/2], f[N/2+1, ..., N-1] +// for odd N - positive f[0, ..., (N-1)/2], +// zeros 0...0, +// negative f [(N+1)/2, ..., N-1] +__global__ void cuArraysPaddingMany_kernel( + const complex_type *image1, const int height1, const int width1, const int size1, + complex_type *image2, const int height2, const int width2, const int size2, const real_type factor ) { - return make_float2(c.x*a, c.y*a); + // thread indices are for input image1 + int x1 = threadIdx.x + blockDim.x*blockIdx.x; + int y1 = threadIdx.y + blockDim.y*blockIdx.y; + int imageIdx = blockIdx.z; + + // ensure threads are in the range of image1 + if (x1 >= height1 || y1 >= width1) + return; + + // determine the quadrants + // divup the length to be consistent with both even and odd lengths + int x2 = (x1 < (height1+1)/2) ? x1 : height2 - height1 + x1; + int y2 = (y1 < (width1+1)/2) ? y1 : width2 - width1 + y1; + image2[IDX2R(x2, y2, width2)+imageIdx*size2] + = image1[IDX2R(x1, y1, width1)+imageIdx*size1]*factor; + return; } -// cuda kernel for -__global__ void cuArraysPaddingMany_kernel( - const float2 *image1, const int height1, const int width1, const int size1, - float2 *image2, const int height2, const int width2, const int size2, const float factor ) +// cuda kernel for zero padding for FFT oversampling, +// for both even and odd sequences +// @param[in] image1 input images +// @param[inout] image2 output images - memset to 0 in prior +// @note for even length (height, width or both), +// the Nyquist frequency (N/2) component is split between positive and negative frequencies. +__global__ void cuArraysPaddingMany_split_Nyquist_kernel( + const complex_type *image1, const int height1, const int width1, const int size1, + complex_type *image2, const int height2, const int width2, const int size2, const real_type factor ) { - int tx = threadIdx.x + blockDim.x*blockIdx.x; - int ty = threadIdx.y + blockDim.y*blockIdx.y; - if(tx < height1/2 && ty < width1/2) + // thread indices are for input image1 + int x1 = threadIdx.x + blockDim.x*blockIdx.x; + int y1 = threadIdx.y + blockDim.y*blockIdx.y; + int imageIdx = blockIdx.z; + + // ensure threads are in the range of image1 + if (x1 >= height1 || y1 >= width1) + return; + + int xcenter1 = (height1+1)/2; // divup + int ycenter1 = (width1+1)/2; + + // for even N - positive f[0, ..., N/2-1], f[N/2]/2, + // zeros 0...0, + // negative f[N/2]/2, f[N/2+1, ..., N-1] + // f[N/2] is split to N/2 and M-N/2 + // for odd N - positive f[0, ..., (N-1)/2], + // zeros 0...0, + // negative f [(N+1)/2, ..., N-1] + + // split spectrum for even height1 at the center + if (height1 % 2 ==0 && x1 == xcenter1) { - - int tx1 = height1 - 1 - tx; - int ty1 = width1 -1 -ty; - int tx2 = height2 -1 -tx; - int ty2 = width2 -1 -ty; - - int stride1 = blockIdx.z*size1; - int stride2 = blockIdx.z*size2; - - image2[IDX2R(tx, ty, width2)+stride2] = image1[IDX2R(tx, ty, width1)+stride1]*factor; - image2[IDX2R(tx2, ty, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty, width1)+stride1], factor); - image2[IDX2R(tx, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx, ty1, width1)+stride1], factor); - image2[IDX2R(tx2, ty2, width2)+stride2] = cmplxMul(image1[IDX2R(tx1, ty1, width1)+stride1], factor); + // if also for the width + if (width1 % 2 ==0 && y1 == ycenter1) + { + // split into 4 + complex_type input = image1[IDX2R(x1, y1, width1)+imageIdx*size1]; + input *= 0.25f*factor; + image2[IDX2R(x1, y1, width2)+imageIdx*size2] = input; + int x2 = x1 + height2 - height1; + int y2 = y1 + width2 - width1; + image2[IDX2R(x2, y1, width2)+imageIdx*size2] = input; + image2[IDX2R(x1, y2, width2)+imageIdx*size2] = input; + image2[IDX2R(x2, y2, width2)+imageIdx*size2] = input; + // all done for this pixel, so return + return; + } + // odd width or not at the center + complex_type input = image1[IDX2R(x1, y1, width1)+imageIdx*size1]; + input *= 0.5f*factor; + int x2 = x1 + height2 - height1; + image2[IDX2R(x1, y1, width2)+imageIdx*size2] = input; + image2[IDX2R(x2, y1, width2)+imageIdx*size2] = input; + return; } + // split spectrum for even width1 + if (width1 % 2 ==0 && y1 == ycenter1) + { + // even height even and at the center has been considered earlier + // so just odd height or not at the center + complex_type input = image1[IDX2R(x1, y1, width1)+imageIdx*size1]; + input *= 0.5f*factor; + int y2 = y1 + width2 - width1; + image2[IDX2R(x1, y1, width2)+imageIdx*size2] = input; + image2[IDX2R(x1, y2, width2)+imageIdx*size2] = input; + return; + } + + // all other cases, simply copy + // determine the quadrants + int x2 = (x1 < xcenter1) ? x1 : height2 - height1 + x1; + int y2 = (y1 < ycenter1) ? y1 : width2 - width1 + y1; + image2[IDX2R(x2, y2, width2)+imageIdx*size2] + = image1[IDX2R(x1, y1, width1)+imageIdx*size1]*factor; + return; } /** @@ -86,16 +124,17 @@ __global__ void cuArraysPaddingMany_kernel( * @param[out] image2 output images * @note To keep the band center at (0,0), move quads to corners and pad zeros in the middle */ -void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream) +void cuArraysFFTPaddingMany(cuArrays *image1, cuArrays *image2, cudaStream_t stream) { int ThreadsPerBlock = NTHREADS2D; - int BlockPerGridx = IDIVUP (image1->height/2, ThreadsPerBlock); - int BlockPerGridy = IDIVUP (image1->width/2, ThreadsPerBlock); + // up to IDIVUP(dim, 2) for odd-length sequences + int BlockPerGridx = IDIVUP (image1->height, ThreadsPerBlock); + int BlockPerGridy = IDIVUP (image1->width, ThreadsPerBlock); dim3 dimBlock(ThreadsPerBlock, ThreadsPerBlock, 1); dim3 dimGrid(BlockPerGridx, BlockPerGridy, image1->count); checkCudaErrors(cudaMemsetAsync(image2->devData, 0, image2->getByteSize(),stream)); - float factor = 1.0f/image1->size; + real_type factor = 1.0f/image1->size; cuArraysPaddingMany_kernel<<>>( image1->devData, image1->height, image1->width, image1->size, image2->devData, image2->height, image2->width, image2->size, factor); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu index 094cd97b4..e0f9b002b 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.cu @@ -21,6 +21,16 @@ cuFreqCorrelator::cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaSt int n[NRANK] ={imageNX, imageNY}; // set up fft plans +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, + NULL, 1, imageSize, + NULL, 1, fImageSize, + CUFFT_D2Z, nImages)); + cufft_Error(cufftPlanMany(&backwardPlan, NRANK, n, + NULL, 1, fImageSize, + NULL, 1, imageSize, + CUFFT_Z2D, nImages)); +#else cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, @@ -29,16 +39,17 @@ cuFreqCorrelator::cuFreqCorrelator(int imageNX, int imageNY, int nImages, cudaSt NULL, 1, fImageSize, NULL, 1, imageSize, CUFFT_C2R, nImages)); +#endif stream = stream_; cufftSetStream(forwardPlan, stream); cufftSetStream(backwardPlan, stream); // set up work arrays - workFM = new cuArrays(imageNX, (imageNY/2+1), nImages); + workFM = new cuArrays(imageNX, (imageNY/2+1), nImages); workFM->allocate(); - workFS = new cuArrays(imageNX, (imageNY/2+1), nImages); + workFS = new cuArrays(imageNX, (imageNY/2+1), nImages); workFS->allocate(); - workT = new cuArrays (imageNX, imageNY, nImages); + workT = new cuArrays (imageNX, imageNY, nImages); workT->allocate(); } @@ -60,38 +71,46 @@ cuFreqCorrelator::~cuFreqCorrelator() * @param[out] results the correlation surfaces */ -void cuFreqCorrelator::execute(cuArrays *templates, cuArrays *images, cuArrays *results) +void cuFreqCorrelator::execute(cuArrays *templates, cuArrays *images, cuArrays *results) { // pad the reference windows to the the size of search windows cuArraysCopyPadded(templates, workT, stream); // forward fft to frequency domain +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecD2Z(forwardPlan, workT->devData, workFM->devData)); + cufft_Error(cufftExecD2Z(forwardPlan, images->devData, workFS->devData)); +#else cufft_Error(cufftExecR2C(forwardPlan, workT->devData, workFM->devData)); cufft_Error(cufftExecR2C(forwardPlan, images->devData, workFS->devData)); +#endif + // cufft doesn't normalize, so manually get the image size for normalization - float coef = 1.0/(images->size); + real_type coef = 1.0/(images->size); // multiply reference with secondary windows in frequency domain cuArraysElementMultiplyConjugate(workFM, workFS, coef, stream); // backward fft to get correlation surface in time domain +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2D(backwardPlan, workFM->devData, workT->devData)); +#else cufft_Error(cufftExecC2R(backwardPlan, workFM->devData, workT->devData)); +#endif // extract to get proper size of correlation surface cuArraysCopyExtract(workT, results, make_int2(0, 0), stream); // all done } // a = a^* * b -inline __device__ float2 cuMulConj(float2 a, float2 b) +inline __device__ complex_type cuMulConj(complex_type a, complex_type b, real_type f) { - return make_float2(a.x*b.x + a.y*b.y, -a.y*b.x + a.x*b.y); + return make_complex_type(f*(a.x*b.x + a.y*b.y), f*(-a.y*b.x + a.x*b.y)); } // cuda kernel for cuArraysElementMultiplyConjugate -__global__ void cudaKernel_elementMulConjugate(float2 *ainout, float2 *bin, int size, float coef) +__global__ void cudaKernel_elementMulConjugate(complex_type *ainout, complex_type *bin, int size, real_type coef) { int idx = threadIdx.x + blockIdx.x*blockDim.x; if(idx < size) { - cuComplex prod; - prod = cuMulConj(ainout[idx], bin[idx]); - ainout [idx] = prod*coef; + ainout [idx] = cuMulConj(ainout[idx], bin[idx], coef); } } @@ -101,7 +120,7 @@ __global__ void cudaKernel_elementMulConjugate(float2 *ainout, float2 *bin, int * @param[in] image2, the secondary image * @param[in] coef, usually the normalization factor */ -void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, float coef, cudaStream_t stream) +void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *image2, real_type coef, cudaStream_t stream) { int size = image1->getSize(); int threadsperblock = NTHREADS; @@ -109,4 +128,4 @@ void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays cudaKernel_elementMulConjugate<<>>(image1->devData, image2->devData, size, coef ); getLastCudaError("cuArraysElementMultiply error\n"); } -//end of file \ No newline at end of file +//end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h index ef101e686..d05a58860 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrFrequency.h @@ -8,8 +8,9 @@ #define __CUCORRFREQUENCY_H // dependencies -#include "cudaUtil.h" #include "cuArrays.h" +#include "data_types.h" +#include class cuFreqCorrelator { @@ -18,9 +19,9 @@ class cuFreqCorrelator cufftHandle forwardPlan; cufftHandle backwardPlan; // work data - cuArrays *workFM; - cuArrays *workFS; - cuArrays *workT; + cuArrays *workFM; + cuArrays *workFS; + cuArrays *workT; // cuda stream cudaStream_t stream; @@ -30,8 +31,8 @@ class cuFreqCorrelator // destructor ~cuFreqCorrelator(); // executor - void execute(cuArrays *templates, cuArrays *images, cuArrays *results); + void execute(cuArrays *templates, cuArrays *images, cuArrays *results); }; #endif //__CUCORRFREQUENCY_H -// end of file \ No newline at end of file +// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu index b6796d279..43aabd3ca 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu @@ -20,14 +20,13 @@ */ #include "cuAmpcorUtil.h" -#include #include // sum reduction within a block // the following implementation is compatible for sm_20 and above // newer architectures may support faster implementations, such as warp shuffle, cooperative groups template -__device__ float sumReduceBlock(float sum, volatile float *shmem) +__device__ real_type sumReduceBlock(real_type sum, volatile real_type *shmem) { const int tid = threadIdx.x; shmem[tid] = sum; @@ -53,9 +52,9 @@ __device__ float sumReduceBlock(float sum, volatile float *shmem) // cuda kernel to subtract mean value from the images template -__global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSize, float invSize, int nImages) +__global__ void cuArraysMean_kernel(real_type *images, real_type *image_sum, int imageSize, real_type invSize, int nImages) { - __shared__ float shmem[Nthreads]; + __shared__ real_type shmem[Nthreads]; const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -64,9 +63,9 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi const int imageIdx = bid; const int imageOffset = imageIdx * imageSize; - float *imageD = images + imageOffset; + real_type *imageD = images + imageOffset; - float sum = 0.0f; + real_type sum = 0.0; // perform the reduction beyond one block // save the results for each thread in block for (int i = tid; i < imageSize; i += Nthreads) @@ -74,7 +73,7 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi // reduction within the block sum = sumReduceBlock(sum, shmem); - const float mean = sum * invSize; + const real_type mean = sum * invSize; if(tid ==0) image_sum[bid] = mean; } @@ -84,11 +83,11 @@ __global__ void cuArraysMean_kernel(float *images, float *image_sum, int imageSi * @param[out] mean Output mean values * @param[in] stream cudaStream */ -void cuArraysMeanValue(cuArrays *images, cuArrays *mean, cudaStream_t stream) +void cuArraysMeanValue(cuArrays *images, cuArrays *mean, cudaStream_t stream) { const dim3 grid(images->count, 1, 1); const int imageSize = images->width*images->height; - const float invSize = 1.0f/imageSize; + const real_type invSize = 1.0f/imageSize; cuArraysMean_kernel <<>>(images->devData, mean->devData, imageSize, invSize, images->count); getLastCudaError("cuArraysMeanValue kernel error\n"); @@ -96,9 +95,9 @@ void cuArraysMeanValue(cuArrays *images, cuArrays *mean, cudaStrea // cuda kernel to compute and subtracts mean value from the images template -__global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float invSize, int nImages) +__global__ void cuArraysSubtractMean_kernel(real_type *images, int imageSize, real_type invSize, int nImages) { - __shared__ float shmem[Nthreads]; + __shared__ real_type shmem[Nthreads]; const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -107,16 +106,16 @@ __global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float const int imageIdx = bid; const int imageOffset = imageIdx * imageSize; - float *imageD = images + imageOffset; + real_type *imageD = images + imageOffset; // compute the sum - float sum = 0.0f; + real_type sum = 0.0; for (int i = tid; i < imageSize; i += Nthreads) sum += imageD[i]; sum = sumReduceBlock(sum, shmem); // compute the mean - const float mean = sum * invSize; + const real_type mean = sum * invSize; // subtract the mean from each pixel for (int i = tid; i < imageSize; i += Nthreads) imageD[i] -= mean; @@ -128,11 +127,11 @@ __global__ void cuArraysSubtractMean_kernel(float *images, int imageSize, float * @param[out] mean Output mean values * @param[in] stream cudaStream */ -void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream) +void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream) { const dim3 grid(images->count, 1, 1); const int imageSize = images->width*images->height; - const float invSize = 1.0f/imageSize; + const real_type invSize = 1.0f/imageSize; cuArraysSubtractMean_kernel <<>>(images->devData, imageSize, invSize, images->count); getLastCudaError("cuArraysSubtractMean kernel error\n"); @@ -141,9 +140,9 @@ void cuArraysSubtractMean(cuArrays *images, cudaStream_t stream) // cuda kernel to compute summation on extracted correlation surface (Minyan) template -__global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *imagesSum, int *imagesValidCount, int imageSize, int nImages) +__global__ void cuArraysSum_kernel(real_type *images, real_type *imagesSum, int imageSize, int nImages) { - __shared__ float shmem[Nthreads]; + __shared__ real_type shmem[Nthreads]; const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -152,23 +151,18 @@ __global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *i const int imageIdx = bid; const int imageOffset = imageIdx * imageSize; - float* imageD = images + imageOffset; - int* imageValidD = imagesValid + imageOffset; + real_type* imageD = images + imageOffset; - float sum = 0.0f; - int count = 0; + real_type sum = 0.0; for (int i = tid; i < imageSize; i += Nthreads) { sum += imageD[i] * imageD[i]; - count += imageValidD[i]; } sum = sumReduceBlock(sum, shmem); - count = sumReduceBlock(count, shmem); if(tid ==0) { imagesSum[bid] = sum; - imagesValidCount[bid] = count; } } @@ -180,20 +174,19 @@ __global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *i * @param[out] imagesValidCount count of total valid pixels * @param[in] stream cudaStream */ -void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, - cuArrays *imagesValidCount, cudaStream_t stream) +void cuArraysSumSquare(cuArrays *images, cuArrays *imagesSum, cudaStream_t stream) { const dim3 grid(images->count, 1, 1); const int imageSize = images->width*images->height; - cuArraysSumCorr_kernel <<>>(images->devData, imagesValid->devData, - imagesSum->devData, imagesValidCount->devData, imageSize, images->count); - getLastCudaError("cuArraysSumValueCorr kernel error\n"); + cuArraysSum_kernel <<>>(images->devData, + imagesSum->devData, imageSize, images->count); + getLastCudaError("cuArraysSumValue kernel error\n"); } // intra-block inclusive prefix sum template -__device__ void inclusive_prefix_sum(float sum, volatile float *shmem) +__device__ void inclusive_prefix_sum(real_type sum, volatile real_type *shmem) { const int tid = threadIdx.x; shmem[tid] = sum; @@ -212,31 +205,31 @@ __device__ void inclusive_prefix_sum(float sum, volatile float *shmem) // prefix sum of pixel value and pixel value^2 template -__device__ float2 partialSums(const float v, volatile float* shmem, const int stride) +__device__ real2_type partialSums(const real_type v, volatile real_type* shmem, const int stride) { const int tid = threadIdx.x; - volatile float *shMem = shmem + 1; - volatile float *shMem2 = shMem + 1 + (1 << Nthreads2); + volatile real_type *shMem = shmem + 1; + volatile real_type *shMem2 = shMem + 1 + (1 << Nthreads2); inclusive_prefix_sum(v, shMem); inclusive_prefix_sum(v*v, shMem2); - const float Sum = shMem [tid-1 + stride] - shMem [tid-1]; - const float Sum2 = shMem2[tid-1 + stride] - shMem2[tid-1]; - return make_float2(Sum, Sum2); + const real_type Sum = shMem [tid-1 + stride] - shMem [tid-1]; + const real_type Sum2 = shMem2[tid-1 + stride] - shMem2[tid-1]; + return make_real2(Sum, Sum2); } // cuda kernel for cuCorrNormalize template __global__ void cuCorrNormalize_kernel( int nImages, - const float *templateIn, int templateNX, int templateNY, int templateSize, - const float *imageIn, int imageNX, int imageNY, int imageSize, - float *resultOut, int resultNX, int resultNY, int resultSize, - float templateCoeff) + const real_type *templateIn, int templateNX, int templateNY, int templateSize, + const real_type *imageIn, int imageNX, int imageNY, int imageSize, + real_type *resultOut, int resultNX, int resultNY, int resultSize, + real_type templateCoeff) { const int Nthreads = 1<(templateSum2, shmem); __syncthreads(); // reset shared memory value - shmem[tid] = shmem[tid + Nthreads] = shmem[tid + 2*Nthreads] = 0.0f; + shmem[tid] = shmem[tid + Nthreads] = shmem[tid + 2*Nthreads] = 0.0; __syncthreads(); // perform the prefix sum and sum^2 for secondary window // see notes above - float imageSum = 0.0f; - float imageSum2 = 0.0f; + real_type imageSum = 0.0; + real_type imageSum2 = 0.0; int iaddr = 0; const int windowSize = templateNX*imageNY; // iterative till reaching the templateNX row of the secondary window @@ -275,7 +268,7 @@ __global__ void cuCorrNormalize_kernel( while (iaddr < windowSize) { // cum sum for each row with a width=templateNY - const float2 res = partialSums(imageD[iaddr + tid], shmem, templateNY); + const real2_type res = partialSums(imageD[iaddr + tid], shmem, templateNY); // add to the total, which keeps track of the sum of area for each window imageSum += res.x; imageSum2 += res.y; @@ -287,17 +280,17 @@ __global__ void cuCorrNormalize_kernel( if (tid < resultNY) { // normalizing factor - const float norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; + const real_type norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; // normalize the correlation surface - resultD[tid] *= rsqrtf(norm2 + FLT_EPSILON); + resultD[tid] *= rsqrt(norm2 + EPSILON); } // iterative over the rest rows while (iaddr < imageSize) { // the prefix sum of the row removed is recomputed, to be subtracted - const float2 res1 = partialSums(imageD[iaddr-windowSize + tid], shmem, templateNY); + const real2_type res1 = partialSums(imageD[iaddr-windowSize + tid], shmem, templateNY); // the prefix sum of the new row, to be added - const float2 res2 = partialSums(imageD[iaddr + tid], shmem, templateNY); + const real2_type res2 = partialSums(imageD[iaddr + tid], shmem, templateNY); imageSum += res2.x - res1.x; imageSum2 += res2.y - res1.y; // move to next row @@ -307,8 +300,8 @@ __global__ void cuCorrNormalize_kernel( { const int ix = iaddr/imageNY; // get row index const int addr = (ix-templateNX)*resultNY; // get the correlation surface row index - const float norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; - resultD[addr + tid] *= rsqrtf(norm2 + FLT_EPSILON); + const real_type norm2 = (imageSum2 - imageSum*imageSum*templateCoeff)*templateSum2; + resultD[addr + tid] *= rsqrt(norm2 + EPSILON); } } } @@ -322,12 +315,12 @@ __global__ void cuCorrNormalize_kernel( * @warning The current implementation uses one thread for one column, therefore, * the secondary window width is limited to <=1024, the max threads in a block. */ -void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream) +void cuCorrNormalize(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream) { const int nImages = images->count; const int imageNY = images->width; const dim3 grid(1, 1, nImages); - const float invTemplateSize = 1.0f/templates->size; + const real_type invTemplateSize = 1.0f/templates->size; if (imageNY <= 64) { cuCorrNormalize_kernel< 6><<>>(nImages, @@ -385,11 +378,11 @@ template<> struct Log2<512> { static const int value = 9; }; template<> struct Log2<1024> { static const int value = 10; }; template -void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) { const int nImages = correlation->count; const dim3 grid(1, 1, nImages); - const float invReferenceSize = 1.0f/reference->size; + const real_type invReferenceSize = 1.0f/reference->size; cuCorrNormalize_kernel::value><<>>(nImages, reference->devData, reference->height, reference->width, reference->size, secondary->devData, secondary->height, secondary->width, secondary->size, @@ -398,20 +391,20 @@ void cuCorrNormalizeFixed(cuArrays *correlation, cuArrays *referen getLastCudaError("cuCorrNormalize kernel error"); } -template void cuCorrNormalizeFixed<64>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<64>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<128>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<128>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<256>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<256>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<512>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<512>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); -template void cuCorrNormalizeFixed<1024>(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, +template void cuCorrNormalizeFixed<1024>(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream); // end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu index 97b4e064b..656e5ebcb 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizationSAT.cu @@ -5,11 +5,14 @@ */ #include + +#if __CUDACC_VER_MAJOR__ >= 11 #include +#endif + // my declarations #include "cuAmpcorUtil.h" -// for FLT_EPSILON -#include + // alias for cuda cooperative groups namespace cg = cooperative_groups; @@ -25,16 +28,19 @@ namespace cg = cooperative_groups; * @note use one thread block for each image, blockIdx.x is image index **/ -__global__ void sum_square_kernel(float *sum2, const float *images, int n, int batch) + +#if __CUDACC_VER_MAJOR__ >= 11 +// use cg::reduce for NVCC 11 and above +__global__ void sum_square_kernel(real_type *sum2, const real_type *images, int n, int batch) { // get block id for each image int imageid = blockIdx.x; - const float *image = images + imageid*n; + const real_type *image = images + imageid*n; // get the thread block cg::thread_block cta = cg::this_thread_block(); // get the shared memory - extern float __shared__ sdata[]; + extern real_type __shared__ sdata[]; // get the current thread int tid = cta.thread_rank(); @@ -53,12 +59,12 @@ __global__ void sum_square_kernel(float *sum2, const float *images, int n, int b cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); // reduce in each tile with warp - sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus()); + sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus()); cg::sync(cta); // reduce all tiles with thread 0 if(tid == 0) { - float sum = 0.0; + real_type sum = 0.0; for (int i = 0; i < cta.size(); i += tile32.size()) sum += sdata[i]; // assign the value to results @@ -66,6 +72,69 @@ __global__ void sum_square_kernel(float *sum2, const float *images, int n, int b } } +#else +// use warp-shuffle reduction for NVCC 9 & 10 +__global__ void sum_square_kernel(real_type *sum2, const real_type *images, int n, int batch) +{ + // get block id for each image + int imageid = blockIdx.x; + const real_type *image = images + imageid*n; + + // get the thread block + cg::thread_block cta = cg::this_thread_block(); + // get the shared memory + extern real_type __shared__ sdata[]; + + // get the current thread + unsigned int tid = cta.thread_rank(); + unsigned int blockSize = cta.size(); + + // stride over grid and add the values to the shared memory + real_type sum = 0; + + for(int i = tid; i < n; i += blockSize ) { + auto value = image[i]; + sum += value*value; + } + sdata[tid] = sum; + cg::sync(cta); + + // do reduction in shared memory in log2 steps + if ((blockSize >= 512) && (tid < 256)) { + sdata[tid] = sum = sum + sdata[tid + 256]; + } + cg::sync(cta); + + if ((blockSize >= 256) && (tid < 128)) { + sdata[tid] = sum = sum + sdata[tid + 128]; + } + cg::sync(cta); + + if ((blockSize >= 128) && (tid < 64)) { + sdata[tid] = sum = sum + sdata[tid + 64]; + } + cg::sync(cta); + + // partition thread block into tiles in size 32 (warp) + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // reduce within warp + if(tid < 32) { + if(blockSize >=64) sum += sdata[tid + 32]; + for (int offset = tile32.size()/2; offset >0; offset /=2) { + sum += tile32.shfl_down(sum, offset); + } + } + + // return results with thread 0 + if(tid == 0) { + // assign the value to results + sum2[imageid] = sum; + } +} +#endif // __CUDACC_VER_MAJOR check + + /** * cuda kernel for 2d sum area table * Compute the (inclusive) sum area table of the value and value^2 of a batch of 2d images. @@ -77,7 +146,7 @@ __global__ void sum_square_kernel(float *sum2, const float *images, int n, int b * @param[in] batch number of images **/ -__global__ void sat2d_kernel(float *sat, float * sat2, const float *data, int nx, int ny, int batch) +__global__ void sat2d_kernel(real_type *sat, real_type * sat2, const real_type *data, int nx, int ny, int batch) { // get block id for each image int imageid = blockIdx.x; @@ -89,13 +158,13 @@ __global__ void sat2d_kernel(float *sat, float * sat2, const float *data, int nx // the number of rows may be bigger than the number of threads, iterate for (int row = tid; row < nx; row += blockDim.x) { // running sum for value and value^2 - float sum = 0.0f; - float sum2 = 0.0f; + real_type sum = 0.0; + real_type sum2 = 0.0; // starting position for this row int index = (imageid*nx+row)*ny; // iterative over column for (int i=0; i 0 && ty > 0) ? sat[(tx-1)*secondaryNY+(ty-1)] : 0.0; - float topright = (tx > 0 ) ? sat[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; - float bottomleft = (ty > 0) ? sat[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; - float bottomright = sat[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; + real_type topleft = (tx > 0 && ty > 0) ? sat[(tx-1)*secondaryNY+(ty-1)] : 0.0; + real_type topright = (tx > 0 ) ? sat[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; + real_type bottomleft = (ty > 0) ? sat[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; + real_type bottomright = sat[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; // get the sum - float secondarySum = bottomright + topleft - topright - bottomleft; + real_type secondarySum = bottomright + topleft - topright - bottomleft; // sum of value^2 - const float *sat2 = secondarySat2 + imageid*secondaryNX*secondaryNY; + const real_type *sat2 = secondarySat2 + imageid*secondaryNX*secondaryNY; // get sat2 values for four corners topleft = (tx > 0 && ty > 0) ? sat2[(tx-1)*secondaryNY+(ty-1)] : 0.0; topright = (tx > 0 ) ? sat2[(tx-1)*secondaryNY+(ty+referenceNY-1)] : 0.0; bottomleft = (ty > 0) ? sat2[(tx+referenceNX-1)*secondaryNY+(ty-1)] : 0.0; bottomright = sat2[(tx+referenceNX-1)*secondaryNY+(ty+referenceNY-1)]; - float secondarySum2 = bottomright + topleft - topright - bottomleft; + real_type secondarySum2 = bottomright + topleft - topright - bottomleft; // compute the normalization - float norm2 = (secondarySum2-secondarySum*secondarySum/(referenceNX*referenceNY))*refSum2; + real_type norm2 = (secondarySum2-secondarySum*secondarySum/(referenceNX*referenceNY))*refSum2; // normalize the correlation surface - correlation[(imageid*corNX+tx)*corNY+ty] *= rsqrtf(norm2 + FLT_EPSILON); + correlation[(imageid*corNX+tx)*corNY+ty] *= rsqrt(norm2 + EPSILON); } } -void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, - cuArrays * referenceSum2, cuArrays *secondarySat, cuArrays *secondarySat2, cudaStream_t stream) +void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, + cuArrays * referenceSum2, cuArrays *secondarySat, cuArrays *secondarySat2, cudaStream_t stream) { // compute the std of reference image // note that the mean is already subtracted int nthreads = 256; - int sMemSize = nthreads*sizeof(float); + int sMemSize = nthreads*sizeof(real_type); int nblocks = reference->count; sum_square_kernel<<>>(referenceSum2->devData, reference->devData, reference->width * reference->height, reference->count); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cpp similarity index 51% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cpp index 783080d90..5c157f16a 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.cpp @@ -7,56 +7,41 @@ #include "cuCorrNormalizer.h" #include "cuAmpcorUtil.h" -cuNormalizer::cuNormalizer(int secondaryNX, int secondaryNY, int count) +cuNormalizeProcessor* +newCuNormalizer(int secondaryNX, int secondaryNY, int count) { // depending on NY, choose different processor if(secondaryNY <= 64) { - processor = new cuNormalizeFixed<64>(); + return new cuNormalizeFixed<64>(); } else if (secondaryNY <= 128) { - processor = new cuNormalizeFixed<128>(); + return new cuNormalizeFixed<128>(); } else if (secondaryNY <= 256) { - processor = new cuNormalizeFixed<256>(); + return new cuNormalizeFixed<256>(); } else if (secondaryNY <= 512) { - processor = new cuNormalizeFixed<512>(); + return new cuNormalizeFixed<512>(); } else if (secondaryNY <= 1024) { - processor = new cuNormalizeFixed<1024>(); + return new cuNormalizeFixed<1024>(); } else { - processor = new cuNormalizeSAT(secondaryNX, secondaryNY, count); + return new cuNormalizeSAT(secondaryNX, secondaryNY, count); } } -cuNormalizer::~cuNormalizer() -{ - delete processor; -} - -void cuNormalizer::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) -{ - processor->execute(correlation, reference, secondary, stream); -} - -/** - * - * - **/ - cuNormalizeSAT::cuNormalizeSAT(int secondaryNX, int secondaryNY, int count) { // allocate the work array // reference sum square - referenceSum2 = new cuArrays(1, 1, count); + referenceSum2 = new cuArrays(1, 1, count); referenceSum2->allocate(); // secondary sum and sum square - secondarySAT = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT = new cuArrays(secondaryNX, secondaryNY, count); secondarySAT->allocate(); - secondarySAT2 = new cuArrays(secondaryNX, secondaryNY, count); + secondarySAT2 = new cuArrays(secondaryNX, secondaryNY, count); secondarySAT2->allocate(); }; @@ -67,16 +52,16 @@ cuNormalizeSAT::~cuNormalizeSAT() delete secondarySAT2; } -void cuNormalizeSAT::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +void cuNormalizeSAT::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) { cuCorrNormalizeSAT(correlation, reference, secondary, referenceSum2, secondarySAT, secondarySAT2, stream); } template -void cuNormalizeFixed::execute(cuArrays *correlation, - cuArrays *reference, cuArrays *secondary, cudaStream_t stream) +void cuNormalizeFixed::execute(cuArrays *correlation, + cuArrays *reference, cuArrays *secondary, cudaStream_t stream) { cuCorrNormalizeFixed(correlation, reference, secondary, stream); } diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h index 043accd3a..1b6a33b68 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalizer.h @@ -13,7 +13,7 @@ #define __CUNORMALIZER_H #include "cuArrays.h" -#include "cudaUtil.h" +#include "data_types.h" /** * Abstract class interface for correlation surface normalization processor @@ -22,46 +22,34 @@ class cuNormalizeProcessor { public: // default constructor and destructor - cuNormalizeProcessor() {} - ~cuNormalizeProcessor() {} + cuNormalizeProcessor() = default; + virtual ~cuNormalizeProcessor() = default; // execute interface - virtual void execute(cuArrays * correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) = 0; + virtual void execute(cuArrays * correlation, cuArrays *reference, cuArrays *secondary, cudaStream_t stream) = 0; }; -class cuNormalizer { -private: - cuNormalizeProcessor *processor; -public: - // disable the default constructor - cuNormalizer() = delete; - // constructor with the secondary dimension - cuNormalizer(int secondaryNX, int secondaryNY, int count); - // destructor - ~cuNormalizer(); - // execute correlation surface normalization - void execute(cuArrays *correlation, cuArrays *reference, cuArrays *secondary, - cudaStream_t stream); -}; +// factory with the secondary dimension +cuNormalizeProcessor* newCuNormalizer(int NX, int NY, int count); template class cuNormalizeFixed : public cuNormalizeProcessor { public: - void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; }; class cuNormalizeSAT : public cuNormalizeProcessor { private: - cuArrays *referenceSum2; - cuArrays *secondarySAT; - cuArrays *secondarySAT2; + cuArrays *referenceSum2; + cuArrays *secondarySAT; + cuArrays *secondarySAT2; public: cuNormalizeSAT(int secondaryNX, int secondaryNY, int count); ~cuNormalizeSAT(); - void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; + void execute(cuArrays * correlation, cuArrays *reference, cuArrays *search, cudaStream_t stream) override; }; #endif diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu index 116712986..ae166849d 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrTimeDomain.cu @@ -12,11 +12,11 @@ template __global__ void cuArraysCorrTime_kernel( const int nImages, - const float *templateIn, const int templateNX, const int templateNY, const int templateSize, - const float *imageIn, const int imageNX, const int imageNY, const int imageSize, - float *resultOut, const int resultNX, const int resultNY, const int resultSize) + const real_type *templateIn, const int templateNX, const int templateNY, const int templateSize, + const real_type *imageIn, const int imageNX, const int imageNY, const int imageSize, + real_type *resultOut, const int resultNX, const int resultNY, const int resultSize) { - __shared__ float shmem[nthreads*(1+NPT)]; + __shared__ real_type shmem[nthreads*(1+NPT)]; const int tid = threadIdx.x; const int bid = blockIdx.x; const int yc = blockIdx.y*NPT; @@ -26,9 +26,9 @@ __global__ void cuArraysCorrTime_kernel( const int templateOffset = imageIdx * templateSize; const int resultOffset = imageIdx * resultSize; - const float * imageD = imageIn + imageOffset + tid; - const float *templateD = templateIn + templateOffset + tid; - float * resultD = resultOut + resultOffset; + const real_type * imageD = imageIn + imageOffset + tid; + const real_type *templateD = templateIn + templateOffset + tid; + real_type * resultD = resultOut + resultOffset; const int q = min(nthreads/resultNY, 4); const int nt = nthreads/q; @@ -39,18 +39,18 @@ __global__ void cuArraysCorrTime_kernel( const int jbeg = templateNYq * ty; const int jend = ty+1 >= q ? templateNY : templateNYq + jbeg; - float *shTemplate = shmem; - float *shImage = shmem + nthreads; - float *shImage1 = shImage + tx; + real_type *shTemplate = shmem; + real_type *shImage = shmem + nthreads; + real_type *shImage1 = shImage + tx; - float corrCoeff[NPT]; + real_type corrCoeff[NPT]; for (int k = 0; k < NPT; k++) - corrCoeff[k] = 0.0f; + corrCoeff[k] = 0.0; int iaddr = yc*imageNY; - float img[NPT]; + real_type img[NPT]; for (int k = 0; k < NPT-1; k++, iaddr += imageNY) img[k] = imageD[iaddr]; for (int taddr = 0; taddr < templateSize; taddr += templateNY, iaddr += imageNY) @@ -98,15 +98,21 @@ __global__ void cuArraysCorrTime_kernel( * @param[out] results Output correlation surface * @param[in] stream cudaStream */ -void cuCorrTimeDomain(cuArrays *templates, - cuArrays *images, - cuArrays *results, +void cuCorrTimeDomain(cuArrays *templates, + cuArrays *images, + cuArrays *results, cudaStream_t stream) { /* compute correlation matrix */ const int nImages = images->count; const int imageNY = images->width; + +#if defined(CUAMPCOR_DOUBLE) && __CUDA_ARCH__ < 800 + // For GPUs older than A100, static shared memory limit is 48K + const int NPT = 4; +#else const int NPT = 8; +#endif const dim3 grid(nImages, (results->width-1)/NPT+1, 1); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu index ef35fa5b0..8ccf9e05f 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu @@ -13,8 +13,8 @@ */ #include "cuArrays.h" -#include "float2.h" -#include +#include "float2.h" +#include "data_types.h" #include "cudaError.h" #include "cudaUtil.h" #include "cuAmpcorUtil.h" @@ -24,23 +24,23 @@ #include -// cuda does not have a good support on volatile vector struct, e.g. float2 -// have to use regular float type for shared memory (volatile) data -// the following methods are defined to operate float2/complex objects through float -inline static __device__ void copyToShared(volatile float *s, const int i, const float2 x, const int block) +// cuda does not have a good support on volatile vector struct, e.g. real2_type +// have to use regular real_type type for shared memory (volatile) data +// the following methods are defined to operate real2_type/complex objects through real_type +inline static __device__ void copyToShared(volatile real_type *s, const int i, const real2_type x, const int block) { s[i] = x.x; s[i+block] = x.y; } -inline static __device__ void copyFromShared(float2 &x, volatile float *s, const int i, const int block) +inline static __device__ void copyFromShared(real2_type &x, volatile real_type *s, const int i, const int block) { x.x = s[i]; x.y = s[i+block]; } -inline static __device__ void addInShared(volatile float *s, const int i, const int j, const int block) +inline static __device__ void addInShared(volatile real_type *s, const int i, const int j, const int block) { s[i] += s[i+j]; s[i+block] += s[i+j+block];} -// kernel to do sum reduction for float2 within a block +// kernel to do sum reduction for real2_type within a block template -__device__ void complexSumReduceBlock(float2& sum, volatile float *shmem) +__device__ void complexSumReduceBlock(real2_type& sum, volatile real_type *shmem) { const int tid = threadIdx.x; copyToShared(shmem, tid, sum, nthreads); @@ -65,35 +65,35 @@ __device__ void complexSumReduceBlock(float2& sum, volatile float *shmem) // cuda kernel for cuDerampMethod1 template -__global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const imageNY, - const int imageSize, const int nImages, const float normCoef) +__global__ void cuDerampMethod1_kernel(real2_type *images, const int imageNX, int const imageNY, + const int imageSize, const int nImages, const real_type normCoef) { - __shared__ float shmem[2*nthreads]; + __shared__ real_type shmem[2*nthreads]; int pixelIdx, pixelIdxX, pixelIdxY; const int bid = blockIdx.x; if(bid >= nImages) return; - float2 *image = images+ bid*imageSize; + real2_type *image = images+ bid*imageSize; const int tid = threadIdx.x; - float2 phaseDiffY = make_float2(0.0f, 0.0f); + real2_type phaseDiffY = make_real2(0.0, 0.0); for (int i = tid; i < imageSize; i += nthreads) { pixelIdxY = i % imageNY; if(pixelIdxY < imageNY -1) { pixelIdx = i; - float2 cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]); + real2_type cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]); phaseDiffY += cprod; } } complexSumReduceBlock(phaseDiffY, shmem); //phaseDiffY *= normCoef; - float phaseY=atan2f(phaseDiffY.y, phaseDiffY.x); + real_type phaseY=atan2(phaseDiffY.y, phaseDiffY.x); - float2 phaseDiffX = make_float2(0.0f, 0.0f); + real2_type phaseDiffX = make_real2(0.0, 0.0); for (int i = tid; i < imageSize; i += nthreads) { pixelIdxX = i / imageNY; if(pixelIdxX < imageNX -1) { pixelIdx = i; - float2 cprod = complexMulConj(image[i], image[i+imageNY]); + real2_type cprod = complexMulConj(image[i], image[i+imageNY]); phaseDiffX += cprod; } } @@ -101,14 +101,14 @@ __global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int co complexSumReduceBlock(phaseDiffX, shmem); //phaseDiffX *= normCoef; - float phaseX = atan2f(phaseDiffX.y, phaseDiffX.x); //+FLT_EPSILON + real_type phaseX = atan2(phaseDiffX.y, phaseDiffX.x); //+FLT_EPSILON for (int i = tid; i < imageSize; i += nthreads) { pixelIdxX = i%imageNY; pixelIdxY = i/imageNY; - float phase = pixelIdxX*phaseX + pixelIdxY*phaseY; - float2 phase_factor = make_float2(cosf(phase), sinf(phase)); + real_type phase = pixelIdxX*phaseX + pixelIdxY*phaseY; + real2_type phase_factor = make_real2(cosf(phase), sinf(phase)); image[i] *= phase_factor; } } @@ -120,12 +120,12 @@ __global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int co * @param[inout] images input/output complex signals * @param[in] stream cuda stream */ -void cuDerampMethod1(cuArrays *images, cudaStream_t stream) +void cuDerampMethod1(cuArrays *images, cudaStream_t stream) { const dim3 grid(images->count); const int imageSize = images->width*images->height; - const float invSize = 1.0f/imageSize; + const real_type invSize = 1.0f/imageSize; if(imageSize <=64) { cuDerampMethod1_kernel<64> <<>> @@ -147,7 +147,7 @@ void cuDerampMethod1(cuArrays *images, cudaStream_t stream) } -void cuDeramp(int method, cuArrays *images, cudaStream_t stream) +void cuDeramp(int method, cuArrays *images, cudaStream_t stream) { switch(method) { case 1: diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu index 18412bfe1..7a2a9e5b2 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu @@ -7,7 +7,8 @@ #include "cuArrays.h" #include "float2.h" -#include +#include "float.h" +#include "data_types.h" #include "cudaUtil.h" #include "cudaError.h" #include "cuAmpcorUtil.h" @@ -17,16 +18,20 @@ #include // cuda kernel for cuEstimateSnr -__global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrValidCount, const float* maxval, float* snrValue, const int size) - +__global__ void cudaKernel_estimateSnr(const real_type* corrSum, const real_type* maxval, real_type* snrValue, const int size, const int nImages) { int idx = threadIdx.x + blockDim.x*blockIdx.x; - if (idx >= size) return; - - float mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1); + if (idx >= nImages) return; - snrValue[idx] = maxval[idx] * maxval[idx] / mean; + real_type peak = maxval[idx]; + peak *= peak; + real_type mean = (corrSum[idx] - peak) / (size - 1); + snrValue[idx] = peak / mean; +#ifdef CUAMPCOR_DEBUG + if(threadIdx.x==0) + printf("debug snr %g %g %g\n", peak, mean, snrValue[idx]); +#endif } /** @@ -36,18 +41,17 @@ __global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrVali * @param[out] snrValue return snr value * @param[in] stream cuda stream */ -void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream) +void cuEstimateSnr(cuArrays *corrSum, cuArrays *maxval, cuArrays *snrValue, const int size, cudaStream_t stream) { - - int size = corrSum->getSize(); + int nImages = corrSum->getSize(); cudaKernel_estimateSnr<<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>> - (corrSum->devData, corrValidCount->devData, maxval->devData, snrValue->devData, size); + (corrSum->devData, maxval->devData, snrValue->devData, size, nImages); getLastCudaError("cuda kernel estimate stats error\n"); } // cuda kernel for cuEstimateVariance -__global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, const int NY, const int2* maxloc, - const float* maxval, const int templateSize, float3* covValue, const int size) +__global__ void cudaKernel_estimateVar(const real_type* corrBatchRaw, const int NX, const int NY, const int2* maxloc, + const real_type* maxval, const int templateSize, real3_type* covValue, const int size) { // Find image id. @@ -58,12 +62,12 @@ __global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, // Preparation. int px = maxloc[idxImage].x; int py = maxloc[idxImage].y; - float peak = maxval[idxImage]; + real_type peak = maxval[idxImage]; // Check if maxval is on the margin. if (px-1 < 0 || py-1 <0 || px + 1 >=NX || py+1 >=NY) { - covValue[idxImage] = make_float3(99.0, 99.0, 0.0); + covValue[idxImage] = make_real3(99.0, 99.0, 0.0); } else { @@ -79,32 +83,32 @@ __global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, int idx22 = offset + (px + 1) * NY + py + 1; // second-order derivatives - float dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2.0*corrBatchRaw[idx11] ); - float dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2.0*corrBatchRaw[idx11] ) ; - float dxy = ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25; + real_type dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2.0*corrBatchRaw[idx11] ); + real_type dyy = - ( corrBatchRaw[idx12] + corrBatchRaw[idx10] - 2.0*corrBatchRaw[idx11] ) ; + real_type dxy = ( corrBatchRaw[idx22] + corrBatchRaw[idx00] - corrBatchRaw[idx20] - corrBatchRaw[idx02] ) *0.25; - float n2 = fmaxf(1.0 - peak, 0.0); + real_type n2 = max(1.0 - peak, 0.0); dxx = dxx * templateSize; dyy = dyy * templateSize; dxy = dxy * templateSize; - float n4 = n2*n2; + real_type n4 = n2*n2; n2 = n2 * 2; n4 = n4 * 0.5 * templateSize; - float u = dxy * dxy - dxx * dyy; - float u2 = u*u; + real_type u = dxy * dxy - dxx * dyy; + real_type u2 = u*u; // if the Gaussian curvature is too small if (fabsf(u) < 1e-2) { - covValue[idxImage] = make_float3(99.0, 99.0, 0.0); + covValue[idxImage] = make_real3(99.0, 99.0, 0.0); } else { - float cov_xx = (- n2 * u * dyy + n4 * ( dyy*dyy + dxy*dxy) ) / u2; - float cov_yy = (- n2 * u * dxx + n4 * ( dxx*dxx + dxy*dxy) ) / u2; - float cov_xy = ( n2 * u * dxy - n4 * ( dxx + dyy ) * dxy ) / u2; - covValue[idxImage] = make_float3(cov_xx, cov_yy, cov_xy); + real_type cov_xx = (- n2 * u * dyy + n4 * ( dyy*dyy + dxy*dxy) ) / u2; + real_type cov_yy = (- n2 * u * dxx + n4 * ( dxx*dxx + dxy*dxy) ) / u2; + real_type cov_xy = ( n2 * u * dxy - n4 * ( dxx + dyy ) * dxy ) / u2; + covValue[idxImage] = make_real3(cov_xx, cov_yy, cov_xy); } } } @@ -118,7 +122,7 @@ __global__ void cudaKernel_estimateVar(const float* corrBatchRaw, const int NX, * @param[out] covValue variance value * @param[in] stream cuda stream */ -void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, int templateSize, cuArrays *covValue, cudaStream_t stream) +void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, int templateSize, cuArrays *covValue, cudaStream_t stream) { int size = corrBatchRaw->count; // One dimensional launching parameters to loop over every correlation surface. diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu index ac77f131f..a93c9bb05 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu @@ -6,11 +6,9 @@ // my module dependencies #include "cuAmpcorUtil.h" -// for FLT_MAX -#include // find the max between two elements -inline static __device__ void maxPairReduce(volatile float* maxval, volatile int* maxloc, +inline static __device__ void maxPairReduce(volatile real_type* maxval, volatile int* maxloc, size_t gid, size_t strideid) { if(maxval[gid] < maxval[strideid]) { @@ -19,32 +17,38 @@ inline static __device__ void maxPairReduce(volatile float* maxval, volatile int } } -// max reduction kernel +// max reduction kernel for a 2d image +// start from (start.x, start.y) in a rectangle range (range.x, range.y) +// start + range <= imageSize template -__device__ void max_reduction(const float* const images, - const size_t imageSize, - const size_t nImages, - volatile float* shval, +__device__ void max_reduction_2d(const real_type* const image, + const int2 imageSize, + const int2 start, const int2 range, + volatile real_type* shval, volatile int* shloc) { int tid = threadIdx.x; - shval[tid] = -FLT_MAX; - int imageStart = blockIdx.x*imageSize; - int imagePixel; + shval[tid] = -REAL_MAX; // reduction for intra-block elements // i.e., for elements with i, i+BLOCKSIZE, i+2*BLOCKSIZE ... - for(int gid = tid; gid < imageSize; gid+=blockDim.x) + for(int gid = tid; gid < range.x*range.y; gid+=blockDim.x) { - imagePixel = imageStart+gid; - if(shval[tid] < images[imagePixel]) { - shval[tid] = images[imagePixel]; - shloc[tid] = gid; + // gid is the flattened pixel id in the search range + // get the pixel 2d coordinate in whole image + int idx = start.x + gid / range.y; + int idy = start.y + gid % range.y; + // get the flattened 1d coordinate + int pixelId = IDX2R(idx, idy, imageSize.y); + real_type pixelValue = image[pixelId]; + if(shval[tid] < pixelValue) { + shval[tid] = pixelValue; + shloc[tid] = pixelId; } } __syncthreads(); - // reduction within a block + // reduction within a block with shared memory if (BLOCKSIZE >=1024){ if (tid < 512) { maxPairReduce(shval, shloc, tid, tid + 512); } __syncthreads(); } if (BLOCKSIZE >=512) { if (tid < 256) { maxPairReduce(shval, shloc, tid, tid + 256); } __syncthreads(); } if (BLOCKSIZE >=256) { if (tid < 128) { maxPairReduce(shval, shloc, tid, tid + 128); } __syncthreads(); } @@ -65,21 +69,24 @@ __device__ void max_reduction(const float* const images, // kernel for 2D array(image), find max location only template -__global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, float* maxval, - const size_t imageNX, const size_t imageNY, const size_t nImages) +__global__ void cudaKernel_maxloc2D(const real_type* const images, int2* maxloc, real_type* maxval, + const int2 imageSize, const size_t nImages, const int2 start, const int2 range) { - __shared__ float shval[BLOCKSIZE]; + __shared__ real_type shval[BLOCKSIZE]; __shared__ int shloc[BLOCKSIZE]; - int bid = blockIdx.x; - if(bid >= nImages) return; + int imageIdx = blockIdx.x; + if(imageIdx >= nImages) return; - const int imageSize = imageNX * imageNY; - max_reduction(images, imageSize, nImages, shval, shloc); + // get the starting pointer for this image + const real_type *image = images + imageIdx*imageSize.x*imageSize.y; + max_reduction_2d(image, imageSize, start, range, shval, shloc); + + // thread 0 contains the final result, convert it to 2d coordinate if (threadIdx.x == 0) { - maxloc[bid] = make_int2(shloc[0]/imageNY, shloc[0]%imageNY); - maxval[bid] = shval[0]; + maxloc[imageIdx] = make_int2(shloc[0]/imageSize.y, shloc[0]%imageSize.y); + maxval[imageIdx] = shval[0]; } } @@ -88,61 +95,56 @@ __global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, fl * @param[in] images input batch of images * @param[out] maxval arrays to hold the max values * @param[out] maxloc arrays to hold the max locations + * @param[in] start starting search pixel + * @param[in] range search range * @param[in] stream cudaStream * @note This routine is overloaded with the routine without maxval */ -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, - cuArrays *maxval, cudaStream_t stream) +void cuArraysMaxloc2D(cuArrays *images, + const int2 start, const int2 range, + cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream) { cudaKernel_maxloc2D<<count, NTHREADS, 0, stream>>> - (images->devData, maxloc->devData, maxval->devData, images->height, images->width, images->count); + (images->devData, + maxloc->devData, maxval->devData, + make_int2(images->height, images->width), images->count, + start, range + ); getLastCudaError("cudaKernel find max location 2D error\n"); } -//kernel and function for 2D array(image), find max location only, use overload -template -__global__ void cudaKernel_maxloc2D(const float* const images, int2* maxloc, const size_t imageNX, const size_t imageNY, const size_t nImages) -{ - __shared__ float shval[BLOCKSIZE]; - __shared__ int shloc[BLOCKSIZE]; - - int bid = blockIdx.x; - if(bid >= nImages) return; - - const int imageSize = imageNX * imageNY; - max_reduction(images, imageSize, nImages, shval, shloc); - - if (threadIdx.x == 0) { - int xloc = shloc[0]/imageNY; - int yloc = shloc[0]%imageNY; - maxloc[bid] = make_int2(xloc, yloc); - } -} - /** - * Find (only) the maximum location for a batch of 2D images + * Find both the maximum value and the location for a batch of 2D images * @param[in] images input batch of images + * @param[out] maxval arrays to hold the max values * @param[out] maxloc arrays to hold the max locations * @param[in] stream cudaStream - * @note This routine is overloaded with the routine with maxval */ -void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cudaStream_t stream) +void cuArraysMaxloc2D(cuArrays *images, + cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream) { + // if no start and range are provided, use the whole image + int2 start = make_int2(0, 0); + int2 imageSize = make_int2(images->height, images->width); cudaKernel_maxloc2D<<count, NTHREADS, 0, stream>>> - (images->devData, maxloc->devData, images->height, images->width, images->count); + (images->devData, + maxloc->devData, maxval->devData, + imageSize, images->count, + start, imageSize + ); getLastCudaError("cudaKernel find max location 2D error\n"); } // cuda kernel for cuSubPixelOffset __global__ void cuSubPixelOffset_kernel(const int2 *offsetInit, const int2 *offsetZoomIn, - float2 *offsetFinal, - const float OSratio, - const float xoffset, const float yoffset, const int size) + real2_type *offsetFinal, const int size, + const int2 initOrigin, const real_type initRatio, + const int2 zoomInOrigin, const real_type zoomInRatio) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if (idx >= size) return; - offsetFinal[idx].x = OSratio*(offsetZoomIn[idx].x ) + offsetInit[idx].x - xoffset; - offsetFinal[idx].y = OSratio*(offsetZoomIn[idx].y ) + offsetInit[idx].y - yoffset; + offsetFinal[idx].x = initRatio*(offsetInit[idx].x-initOrigin.x) + zoomInRatio*(offsetZoomIn[idx].x - zoomInOrigin.x); + offsetFinal[idx].y = initRatio*(offsetInit[idx].y-initOrigin.y) + zoomInRatio*(offsetZoomIn[idx].y - zoomInOrigin.y); } @@ -152,36 +154,24 @@ __global__ void cuSubPixelOffset_kernel(const int2 *offsetInit, const int2 *offs * the cross-correlation before oversampling, in dimensions of pixel * @param[in] offsetZoomIn max location from the oversampled cross-correlation surface * @param[out] offsetFinal the combined offset value - * @param[in] OversampleRatioZoomIn the correlation surface oversampling factor - * @param[in] OversampleRatioRaw the oversampling factor of reference/secondary windows before cross-correlation - * @param[in] xHalfRangInit the original half search range along x, to be subtracted - * @param[in] yHalfRangInit the original half search range along y, to be subtracted - * - * 1. Cross-correlation is performed at first for the un-oversampled data with a larger search range. - * The secondary window is then extracted to a smaller size (a smaller search range) around the max location. - * The extraction starting location (offsetInit) - original half search range (xHalfRangeInit, yHalfRangeInit) - * = pixel size offset - * 2. Reference/secondary windows are then oversampled by OversampleRatioRaw, and cross-correlated. - * 3. The correlation surface is further oversampled by OversampleRatioZoomIn. - * The overall oversampling factor is OversampleRatioZoomIn*OversampleRatioRaw. - * The max location in oversampled correlation surface (offsetZoomIn) / overall oversampling factor - * = subpixel offset - * Final offset = pixel size offset + subpixel offset */ -void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetZoomIn, - cuArrays *offsetFinal, - int OverSampleRatioZoomin, int OverSampleRatioRaw, - int xHalfRangeInit, int yHalfRangeInit, +void cuSubPixelOffset(cuArrays *offsetInit, + cuArrays *offsetZoomIn, + cuArrays *offsetFinal, + const int2 initOrigin, const int initFactor, + const int2 zoomInOrigin, const int zoomInFactor, cudaStream_t stream) { int size = offsetInit->getSize(); - float OSratio = 1.0f/(float)(OverSampleRatioZoomin*OverSampleRatioRaw); - float xoffset = xHalfRangeInit ; - float yoffset = yHalfRangeInit ; + + // GPU performs multiplication faster + real_type initRatio = 1.0f/(real_type)(initFactor); + real_type zoomInRatio = 1.0f/(real_type)(zoomInFactor); + cuSubPixelOffset_kernel<<>> (offsetInit->devData, offsetZoomIn->devData, - offsetFinal->devData, OSratio, xoffset, yoffset, size); + offsetFinal->devData, size, initOrigin, initRatio, zoomInOrigin, zoomInRatio); getLastCudaError("cuSubPixelOffset_kernel"); } diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cpp similarity index 68% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cpp index 1b6ab6267..9da46c58e 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cpp @@ -36,9 +36,9 @@ cuOverSamplerC2C::cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int */ // set up work arrays - workIn = new cuArrays(inNXp2, inNYp2, nImages); + workIn = new cuArrays(inNXp2, inNYp2, nImages); workIn->allocate(); - workOut = new cuArrays(outNXp2, outNYp2, nImages); + workOut = new cuArrays(outNXp2, outNYp2, nImages); workOut->allocate(); // set up fft plans @@ -47,8 +47,13 @@ cuOverSamplerC2C::cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int int fImageSize = inNXp2*inNYp2; int nOverSample[NRANK] = {outNXp2, outNYp2}; int fImageOverSampleSize = outNXp2*outNYp2; +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_Z2Z, nImages)); + cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nOverSample, NULL, 1, fImageOverSampleSize, NULL, 1, fImageOverSampleSize, CUFFT_Z2Z, nImages)); +#else cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_C2C, nImages)); cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nOverSample, NULL, 1, fImageOverSampleSize, NULL, 1, fImageOverSampleSize, CUFFT_C2C, nImages)); +#endif // set cuda stream setStream(stream_); } @@ -69,12 +74,23 @@ void cuOverSamplerC2C::setStream(cudaStream_t stream_) * @param[out] imagesOut output batch of images * @param[in] method phase deramping method */ -void cuOverSamplerC2C::execute(cuArrays *imagesIn, cuArrays *imagesOut, int method) +void cuOverSamplerC2C::execute(cuArrays *imagesIn, cuArrays *imagesOut, int method) { - cuDeramp(method, imagesIn, stream); - cufft_Error(cufftExecC2C(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_INVERSE )); - cuArraysPaddingMany(workIn, workOut, stream); - cufft_Error(cufftExecC2C(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_FORWARD)); + cuDeramp(method, imagesIn, stream); + + #ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_FORWARD)); + #else + cufft_Error(cufftExecC2C(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_FORWARD)); + #endif + + cuArraysFFTPaddingMany(workIn, workOut, stream); + +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_INVERSE)); +#else + cufft_Error(cufftExecC2C(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_INVERSE)); +#endif } /// destructor @@ -117,12 +133,17 @@ cuOverSamplerR2R::cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int int fImageSize = inNXp2*inNYp2; int nUpSample[NRANK] = {outNXp2, outNYp2}; int fImageUpSampleSize = outNXp2*outNYp2; - workSizeIn = new cuArrays(inNXp2, inNYp2, nImages); + workSizeIn = new cuArrays(inNXp2, inNYp2, nImages); workSizeIn->allocate(); - workSizeOut = new cuArrays(outNXp2, outNYp2, nImages); + workSizeOut = new cuArrays(outNXp2, outNYp2, nImages); workSizeOut->allocate(); +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_Z2Z, nImages)); + cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nUpSample, NULL, 1, fImageUpSampleSize, NULL, 1, outNX*outNY, CUFFT_Z2Z, nImages)); +#else cufft_Error(cufftPlanMany(&forwardPlan, NRANK, n, NULL, 1, imageSize, NULL, 1, fImageSize, CUFFT_C2C, nImages)); cufft_Error(cufftPlanMany(&backwardPlan, NRANK, nUpSample, NULL, 1, fImageUpSampleSize, NULL, 1, outNX*outNY, CUFFT_C2C, nImages)); +#endif setStream(stream); } @@ -138,12 +159,25 @@ void cuOverSamplerR2R::setStream(cudaStream_t stream_) * @param[in] imagesIn input batch of images * @param[out] imagesOut output batch of images */ -void cuOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut) +void cuOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut) { + cuArraysCopyPadded(imagesIn, workSizeIn, stream); - cufft_Error(cufftExecC2C(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_INVERSE)); - cuArraysPaddingMany(workSizeIn, workSizeOut, stream); - cufft_Error(cufftExecC2C(backwardPlan, workSizeOut->devData, workSizeOut->devData,CUFFT_FORWARD )); + +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_FORWARD)); +#else + cufft_Error(cufftExecC2C(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_FORWARD)); +#endif + + cuArraysFFTPaddingMany(workSizeIn, workSizeOut, stream); + +#ifdef CUAMPCOR_DOUBLE + cufft_Error(cufftExecZ2Z(backwardPlan, workSizeOut->devData, workSizeOut->devData, CUFFT_INVERSE)); +#else + cufft_Error(cufftExecC2C(backwardPlan, workSizeOut->devData, workSizeOut->devData, CUFFT_INVERSE)); +#endif + cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0), stream); } @@ -156,9 +190,4 @@ cuOverSamplerR2R::~cuOverSamplerR2R() workSizeOut->deallocate(); } -// end of file - - - - - +// end of file \ No newline at end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h index 9ddce96b2..cfc202047 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h @@ -12,7 +12,8 @@ #define __CUOVERSAMPLER_H #include "cuArrays.h" -#include "cudaUtil.h" +#include "data_types.h" +#include // FFT Oversampler for complex images class cuOverSamplerC2C @@ -21,8 +22,8 @@ class cuOverSamplerC2C cufftHandle forwardPlan; // forward fft handle cufftHandle backwardPlan; // backward fft handle cudaStream_t stream; // cuda stream - cuArrays *workIn; // work array to hold forward fft data - cuArrays *workOut; // work array to hold padded data + cuArrays *workIn; // work array to hold forward fft data + cuArrays *workOut; // work array to hold padded data public: // disable the default constructor cuOverSamplerC2C() = delete; @@ -31,7 +32,7 @@ class cuOverSamplerC2C // set cuda stream void setStream(cudaStream_t stream_); // execute oversampling - void execute(cuArrays *imagesIn, cuArrays *imagesOut, int deramp_method=0); + void execute(cuArrays *imagesIn, cuArrays *imagesOut, int deramp_method=0); // destructor ~cuOverSamplerC2C(); }; @@ -43,14 +44,14 @@ class cuOverSamplerR2R cufftHandle forwardPlan; cufftHandle backwardPlan; cudaStream_t stream; - cuArrays *workSizeIn; - cuArrays *workSizeOut; + cuArrays *workSizeIn; + cuArrays *workSizeOut; public: cuOverSamplerR2R() = delete; cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_); void setStream(cudaStream_t stream_); - void execute(cuArrays *imagesIn, cuArrays *imagesOut); + void execute(cuArrays *imagesIn, cuArrays *imagesOut); ~cuOverSamplerR2R(); }; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu index e851eefdb..03eaffa40 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu @@ -45,7 +45,7 @@ __global__ void cuSetupSincKernel_kernel(float *r_filter_, const int i_filtercoe float r_wgt = (1.0f - r_wgthgt_) + r_wgthgt_*cos(PI*r_wa*r_soff_inverse_); float r_s = r_wa*r_beta_*r_decfactor_inverse_*PI; float r_fct; - if(r_s != 0.0f) { + if(r_s != 0.0) { r_fct = sin(r_s)/r_s; } else { @@ -121,8 +121,8 @@ __global__ void cuSincInterpolation_kernel(const int nImages, int i_yfrac = int(r_yfrac*i_decfactor_); // temp variables - float intpData = 0.0f; // interpolated value - float r_sincwgt = 0.0f; // total filter weight + float intpData = 0.0; // interpolated value + float r_sincwgt = 0.0; // total filter weight float r_sinc_coef; // filter weight // iterate over lines of input image diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h index 3755c2375..b2ddc5849 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h @@ -34,7 +34,7 @@ class cuSincOverSamplerR2R static const int i_sincwindow = 2; ///< the oversampling is only performed within \pm i_sincwindow*i_covs around the peak static const int i_weight = 1; ///< weight for cos() pedestal - const float r_pedestal = 0.0f; ///< height of pedestal + const float r_pedestal = 0.0; ///< height of pedestal const float r_beta = 0.75f; ///< a low-band pass const float r_relfiltlen = 6.0f; ///< relative filter length diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp new file mode 100644 index 000000000..183fa4cc0 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp @@ -0,0 +1,44 @@ +#include "cudaError.h" + +#include +#include +#include +#include + +#ifdef __DRIVER_TYPES_H__ +#ifndef DEVICE_RESET +#define DEVICE_RESET cudaDeviceReset(); +#endif +#else +#ifndef DEVICE_RESET +#define DEVICE_RESET +#endif +#endif + +template +void check(T result, char const *const func, const char *const file, int const line) +{ + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", + file, line, static_cast(result), func); + DEVICE_RESET + // Make sure we call CUDA Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +template void check(cudaError_t, char const *const, const char *const, int const); +template void check(cufftResult_t, char const *const, const char *const, int const); + +void __getLastCudaError(const char *errorMessage, const char *file, const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) + { + fprintf(stderr, "%s(%i) : CUDA error : %s : (%d) %s.\n", + file, line, errorMessage, (int)err, cudaGetErrorString(err)); + DEVICE_RESET + exit(EXIT_FAILURE); + } +} diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h index 5e857cba0..f4f9524be 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.h @@ -10,51 +10,13 @@ #pragma once -#include -#include -#include -#include #include "debug.h" -#include - - -#ifdef __DRIVER_TYPES_H__ -#ifndef DEVICE_RESET -#define DEVICE_RESET cudaDeviceReset(); -#endif -#else -#ifndef DEVICE_RESET -#define DEVICE_RESET -#endif -#endif template -void check(T result, char const *const func, const char *const file, int const line) -{ - if (result) - { - - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", - file, line, static_cast(result), func); - DEVICE_RESET - // Make sure we call CUDA Device Reset before exiting - exit(EXIT_FAILURE); - } -} +void check(T result, char const *const func, const char *const file, int const line); // This will output the proper error string when calling cudaGetLastError -inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) -{ - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) - { - fprintf(stderr, "%s(%i) : CUDA error : %s : (%d) %s.\n", - file, line, errorMessage, (int)err, cudaGetErrorString(err)); - DEVICE_RESET - exit(EXIT_FAILURE); - } -} +void __getLastCudaError(const char *errorMessage, const char *file, const int line); // This will output the proper CUDA error strings in the event that a CUDA host call returns an error #ifdef CUDA_ERROR_CHECK diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.cpp new file mode 100644 index 000000000..c96a12947 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.cpp @@ -0,0 +1,58 @@ +#include "cudaUtil.h" + +#include +#include +#include +#include "cudaError.h" + +int gpuDeviceInit(int devID) +{ + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0 || devID > device_count - 1) { + fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaSetDevice(devID)); + printf("Using CUDA Device %d ...\n", devID); + + return devID; +} + +void gpuDeviceList() +{ + int device_count = 0; + int current_device = 0; + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + fprintf(stderr, "Detecting all CUDA devices ...\n"); + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + while (current_device < device_count) { + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); + if (deviceProp.computeMode == cudaComputeModeProhibited) { + fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: " + "device is running in \n", + current_device, deviceProp.name); + } else if (deviceProp.major < 1) { + fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: " + "device does not support CUDA \n", + current_device, deviceProp.name); + } else { + fprintf(stderr, "CUDA Device [%d]: \"%s\" is available.\n", + current_device, deviceProp.name); + } + current_device++; + } +} diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h index cc174e009..7516bc9ca 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaUtil.h @@ -10,9 +10,6 @@ #ifndef __CUDAUTIL_H #define __CUDAUTIL_H -#include -#include "cudaError.h" - // for 2D FFT #define NRANK 2 @@ -47,12 +44,6 @@ #define MIN(a,b) (a > b ? b: a) #endif -// Float To Int conversion -inline int ftoi(float value) -{ - return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5)); -} - // compute the next integer in power of 2 inline int nextpower2(int value) { @@ -61,63 +52,11 @@ inline int nextpower2(int value) return r; } - // General GPU Device CUDA Initialization -inline int gpuDeviceInit(int devID) -{ - int device_count; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) - { - fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - if (devID < 0 || devID > device_count-1) - { - fprintf(stderr, "gpuDeviceInit() Device %d is not a valid GPU device. \n", devID); - exit(EXIT_FAILURE); - } - - checkCudaErrors(cudaSetDevice(devID)); - printf("Using CUDA Device %d ...\n", devID); - - return devID; -} +int gpuDeviceInit(int devID); // This function lists all available GPUs -inline void gpuDeviceList() -{ - int device_count = 0; - int current_device = 0; - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - fprintf(stderr, "Detecting all CUDA devices ...\n"); - if (device_count == 0) - { - fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - while (current_device < device_count) - { - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); - if (deviceProp.computeMode == cudaComputeModeProhibited) - { - fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: device is running in \n", current_device, deviceProp.name); - } - else if (deviceProp.major < 1) - { - fprintf(stderr, "CUDA Device [%d]: \"%s\" is not available: device does not support CUDA \n", current_device, deviceProp.name); - } - else { - fprintf(stderr, "CUDA Device [%d]: \"%s\" is available.\n", current_device, deviceProp.name); - } - current_device++; - } -} +void gpuDeviceList(); #endif //__CUDAUTIL_H //end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/data_types.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/data_types.h new file mode 100644 index 000000000..134deafdc --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/data_types.h @@ -0,0 +1,46 @@ +/** + * @file debug.h + * @brief Define flags to control the debugging + * + * CUAMPCOR_DEBUG is used to output debugging information and intermediate results, + * disabled when NDEBUG macro is defined. + * CUDA_ERROR_CHECK is always enabled, to check CUDA routine errors + * + */ + +// code guard +#ifndef __CUAMPCOR_DATA_TYPES_H +#define __CUAMPCOR_DATA_TYPES_H + +#include + +// disable this for single precision version +// #define CUAMPCOR_DOUBLE + +#ifdef CUAMPCOR_DOUBLE + #define real_type double + #define complex_type double2 + #define real2_type double2 + #define real3_type double3 + #define make_real2 make_double2 + #define make_real3 make_double3 + #define make_complex_type make_double2 + #define EPSILON DBL_EPSILON + #define REAL_MAX DBL_MAX +#else + #define real_type float + #define complex_type float2 + #define real2_type float2 + #define real3_type float3 + #define make_real2 make_float2 + #define make_real3 make_float3 + #define make_complex_type make_float2 + #define EPSILON FLT_EPSILON + #define REAL_MAX FLT_MAX +#endif + +#define image_complex_type float2 +#define image_real_type float + +#endif //__CUAMPCOR_DATA_TYPES_H +//end of file \ No newline at end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h index e43e8e44a..1879bb91e 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/float2.h @@ -7,9 +7,10 @@ #ifndef __FLOAT2_H #define __FLOAT2_H -#include +#include +#include -inline __host__ __device__ void zero(float2 &a) { a.x = 0.0f; a.y = 0.0f; } +inline __host__ __device__ void zero(float2 &a) { a.x = 0.0; a.y = 0.0; } // negative inline __host__ __device__ float2 operator-(float2 &a) @@ -125,5 +126,124 @@ inline __host__ __device__ float2 complexExp(float arg) return make_float2(cosf(arg), sinf(arg)); } + +// double precision +inline __host__ __device__ void zero(double2 &a) { a.x = 0.0; a.y = 0.0; } + +// negative +inline __host__ __device__ double2 operator-(double2 &a) +{ + return make_double2(-a.x, -a.y); +} + +// complex conjugate +inline __host__ __device__ double2 conjugate(double2 a) +{ + return make_double2(a.x, -a.y); +} + +// addition +inline __host__ __device__ double2 operator+(double2 a, double2 b) +{ + return make_double2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(double2 &a, double2 b) +{ + a.x += b.x; + a.y += b.y; +} + +inline __host__ __device__ double2 operator+(double2 a, double b) +{ + return make_double2(a.x + b, a.y); +} +inline __host__ __device__ void operator+=(double2 &a, double b) +{ + a.x += b; +} + +// subtraction +inline __host__ __device__ double2 operator-(double2 a, double2 b) +{ + return make_double2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(double2 &a, double2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ double2 operator-(double2 a, double b) +{ + return make_double2(a.x - b, a.y); +} +inline __host__ __device__ void operator-=(double2 &a, double b) +{ + a.x -= b; +} + +// multiplication +inline __host__ __device__ double2 operator*(double2 a, double2 b) +{ + return make_double2(a.x*b.x - a.y*b.y, a.y*b.x + a.x*b.y); +} +inline __host__ __device__ void operator*=(double2 &a, double2 b) +{ + a.x = a.x*b.x - a.y*b.y; + a.y = a.y*b.x + a.x*b.y; +} +inline __host__ __device__ double2 operator*(double2 a, double b) +{ + return make_double2(a.x * b, a.y * b); +} +inline __host__ __device__ void operator*=(double2 &a, double b) +{ + a.x *= b; + a.y *= b; +} +inline __host__ __device__ double2 operator*(double2 a, int b) +{ + return make_double2(a.x * b, a.y * b); +} +inline __host__ __device__ void operator*=(double2 &a, int b) +{ + a.x *= b; + a.y *= b; +} +inline __host__ __device__ double2 complexMul(double2 a, double2 b) +{ + return a*b; +} +inline __host__ __device__ double2 complexMulConj(double2 a, double2 b) +{ + return make_double2(a.x*b.x + a.y*b.y, a.y*b.x - a.x*b.y); +} + +inline __host__ __device__ double2 operator/(double2 a, double b) +{ + return make_double2(a.x / b, a.y / b); +} +inline __host__ __device__ void operator/=(double2 &a, double b) +{ + a.x /= b; + a.y /= b; +} + +// abs, arg +inline __host__ __device__ double complexAbs(double2 a) +{ + return sqrt(a.x*a.x+a.y*a.y); +} +inline __host__ __device__ double complexArg(double2 a) +{ + return atan2(a.y, a.x); +} + +// make a complex number from phase +inline __host__ __device__ double2 complexExp(double arg) +{ + return make_double2(cos(arg), sin(arg)); +} + + #endif //__FLOAT2_H -// end of file \ No newline at end of file +// end of file diff --git a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp index 7e884ba45..d2a9615a1 100644 --- a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp +++ b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp @@ -52,6 +52,7 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM(str, snrImageName) .DEF_PARAM(str, covImageName) .DEF_PARAM(str, corrImageName) + .DEF_PARAM(str, peakValueImageName) .DEF_PARAM(int, rawDataOversamplingFactor) .DEF_PARAM(int, corrStatWindowSize) @@ -78,6 +79,8 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_METHOD(runAmpcor) + .DEF_METHOD(isDoublePrecision) + .def("checkPixelInImageRange", [](const cls& self) { self.param->checkPixelInImageRange(); }) From 295eca287143b2438428a9c3d791235f290baff6 Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Tue, 21 Jan 2025 10:46:46 -0800 Subject: [PATCH 02/11] Drop in Lijun's latest v2 changes --- cxx/isce3/cuda/Sources.cmake | 4 +- .../matchtemplate/pycuampcor/GDALImage.cpp | 2 +- .../matchtemplate/pycuampcor/PyCuAmpcor.cpp | 1 + .../pycuampcor/cuAmpcorController.cpp | 12 +- .../pycuampcor/cuAmpcorParameter.cpp | 116 +++- .../pycuampcor/cuAmpcorParameter.h | 17 +- .../pycuampcor/cuAmpcorProcessor.cpp | 80 +++ .../pycuampcor/cuAmpcorProcessor.h | 72 ++ ...orChunk.cpp => cuAmpcorProcessorGrIMP.cpp} | 377 +++++------ ...AmpcorChunk.h => cuAmpcorProcessorGrIMP.h} | 62 +- .../pycuampcor/cuAmpcorProcessorROIPAC.cpp | 619 ++++++++++++++++++ .../pycuampcor/cuAmpcorProcessorROIPAC.h | 88 +++ .../matchtemplate/pycuampcor/cuAmpcorUtil.h | 11 +- .../matchtemplate/pycuampcor/cuArraysCopy.cu | 58 ++ .../pycuampcor/cuCorrNormalization.cu | 53 ++ .../pycuampcor/cuEstimateStats.cu | 53 +- .../cuda/matchtemplate/pycuampcor/cuOffset.cu | 54 ++ .../pycuampcor/cuSincOverSampler.cu | 120 +++- .../pycuampcor/cuSincOverSampler.h | 11 +- 19 files changed, 1460 insertions(+), 350 deletions(-) create mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.cpp create mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h rename cxx/isce3/cuda/matchtemplate/pycuampcor/{cuAmpcorChunk.cpp => cuAmpcorProcessorGrIMP.cpp} (84%) rename cxx/isce3/cuda/matchtemplate/pycuampcor/{cuAmpcorChunk.h => cuAmpcorProcessorGrIMP.h} (59%) create mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.cpp create mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.h diff --git a/cxx/isce3/cuda/Sources.cmake b/cxx/isce3/cuda/Sources.cmake index 0147fbe14..07555053f 100644 --- a/cxx/isce3/cuda/Sources.cmake +++ b/cxx/isce3/cuda/Sources.cmake @@ -28,9 +28,11 @@ geometry/Topo.cpp geometry/utilities.cu image/gpuResampSlc.cu image/ResampSlc.cpp -matchtemplate/pycuampcor/cuAmpcorChunk.cpp matchtemplate/pycuampcor/cuAmpcorController.cpp matchtemplate/pycuampcor/cuAmpcorParameter.cpp +matchtemplate/pycuampcor/cuAmpcorProcessor.cpp +matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.cpp +matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.cpp matchtemplate/pycuampcor/cuArrays.cpp matchtemplate/pycuampcor/cuArraysCopy.cu matchtemplate/pycuampcor/cuArraysPadding.cu diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cpp index 42b864799..aa0c30e92 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cpp @@ -53,7 +53,7 @@ GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useM // determine the pixel size in bytes _pixelSize = GDALGetDataTypeSize(_dataType); - _bufferSize = 1024*1024*cacheSizeInGB; + _bufferSize = 1024*1024*1024*cacheSizeInGB; // checking whether using memory map if(_useMmap) { diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp index 2d2b61809..92334d671 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp @@ -32,6 +32,7 @@ PYBIND11_MODULE(PyCuAmpcor, m) .DEF_PARAM(int, deviceID) .DEF_PARAM(int, nStreams) .DEF_PARAM(int, derampMethod) + .DEF_PARAM(int, workflow) .DEF_PARAM(str, referenceImageName) .DEF_PARAM(int, referenceImageHeight) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp index 5658a9550..57a792259 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp @@ -10,7 +10,7 @@ #include "GDALImage.h" #include "cuArrays.h" #include "cudaUtil.h" -#include "cuAmpcorChunk.h" +#include "cuAmpcorProcessor.h" #include "cuAmpcorUtil.h" #include #include @@ -96,17 +96,15 @@ void cuAmpcorController::runAmpcor() // set up the cuda streams cudaStream_t streams[param->nStreams]; - cuAmpcorChunk *chunk[param->nStreams]; + std::unique_ptr chunk[param->nStreams]; // iterate over cuda streams for(int ist=0; istnStreams; ist++) { // create each stream checkCudaErrors(cudaStreamCreate(&streams[ist])); // create the chunk processor for each stream - chunk[ist]= new cuAmpcorChunk(param, referenceImage, secondaryImage, - offsetImageRun, snrImageRun, covImageRun, peakValueImageRun, - streams[ist]); - + chunk[ist]= cuAmpcorProcessor::create(param->workflow, param, referenceImage, secondaryImage, + offsetImageRun, snrImageRun, covImageRun, peakValueImageRun, streams[ist]); } int nChunksDown = param->numberChunkDown; @@ -190,7 +188,7 @@ void cuAmpcorController::runAmpcor() for (int ist=0; istnStreams; ist++) { // cufftplan etc are stream dependent, need to be deleted before stream is destroyed - delete chunk[ist]; + chunk[ist].release(); checkCudaErrors(cudaStreamDestroy(streams[ist])); } diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp index 2edca32ee..3fbb85222 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp @@ -6,6 +6,8 @@ #include "cuAmpcorParameter.h" #include +#include + #ifndef IDIVUP #define IDIVUP(i,j) ((i+j-1)/j) @@ -24,6 +26,7 @@ cuAmpcorParameter::cuAmpcorParameter() deviceID = 0; nStreams = 1; derampMethod = 1; + workflow = 0; windowSizeWidthRaw = 64; windowSizeHeightRaw = 64; @@ -56,7 +59,7 @@ cuAmpcorParameter::cuAmpcorParameter() referenceStartPixelDown0 = 0; referenceStartPixelAcross0 = 0; - // corrStatWindowSize = 21; // 10*2+1 as in RIOPAC + corrStatWindowSize = 21; // 10*2+1 as in RIOPAC useMmap = 1; // use mmap mmapSizeInGB = 1; @@ -70,21 +73,76 @@ cuAmpcorParameter::cuAmpcorParameter() */ void cuAmpcorParameter::setupParameters() +{ + // workflow specific parameter settings + // + switch (workflow) { + case 0: + _setupParameters_ROIPAC(); + break; + case 1: + _setupParameters_GrIMP(); + break; + default: + throw std::invalid_argument("Unsupported workflow"); + } + + // common parameter settings + numberWindows = numberWindowDown*numberWindowAcross; + if(numberWindows <=0) { + fprintf(stderr, "Incorrect number of windows! (%d, %d)\n", numberWindowDown, numberWindowAcross); + exit(EXIT_FAILURE); + } + + numberChunkDown = IDIVUP(numberWindowDown, numberWindowDownInChunk); + numberChunkAcross = IDIVUP(numberWindowAcross, numberWindowAcrossInChunk); + numberChunks = numberChunkDown*numberChunkAcross; + allocateArrays(); +} + +void cuAmpcorParameter::_setupParameters_ROIPAC() { // Size to extract the raw correlation surface for snr/cov - // corrRawZoomInHeight = std::min(corrStatWindowSize, 2*halfSearchRangeDownRaw+1); - // corrRawZoomInWidth = std::min(corrStatWindowSize, 2*halfSearchRangeAcrossRaw+1); + corrRawZoomInHeight = std::min(corrStatWindowSize, 2*halfSearchRangeDownRaw+1); + corrRawZoomInWidth = std::min(corrStatWindowSize, 2*halfSearchRangeAcrossRaw+1); // Size to extract the resampled correlation surface for oversampling // users should use 16 for zoomWindowSize, no need to multiply by 2 // zoomWindowSize *= rawDataOversamplingFactor; //8 * 2 // to check the search range - // int corrSurfaceActualSize = - // std::min(halfSearchRangeAcrossRaw, halfSearchRangeDownRaw)* - // 2*rawDataOversamplingFactor; - // zoomWindowSize = std::min(zoomWindowSize, corrSurfaceActualSize); + int corrSurfaceActualSize = + std::min(halfSearchRangeAcrossRaw, halfSearchRangeDownRaw)* + 2*rawDataOversamplingFactor; + zoomWindowSize = std::min(zoomWindowSize, corrSurfaceActualSize); - // halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); // 8*2/(2*2) = 4 + halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); // 8*2/(2*2) = 4 + + windowSizeWidth = windowSizeWidthRaw*rawDataOversamplingFactor; // + windowSizeHeight = windowSizeHeightRaw*rawDataOversamplingFactor; + + searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeDownRaw; + searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeAcrossRaw; + + searchWindowSizeWidthRawZoomIn = windowSizeWidthRaw + 2*halfZoomWindowSizeRaw; + searchWindowSizeHeightRawZoomIn = windowSizeHeightRaw + 2*halfZoomWindowSizeRaw; + + searchWindowSizeWidth = searchWindowSizeWidthRawZoomIn*rawDataOversamplingFactor; + searchWindowSizeHeight = searchWindowSizeHeightRawZoomIn*rawDataOversamplingFactor; + + windowSizeWidthRawEnlarged = windowSizeWidthRaw; + windowSizeHeightRawEnlarged = windowSizeHeightRaw; + + // loading offsets + referenceLoadingOffsetDown = 0; + referenceLoadingOffsetAcross = 0; + // secondary loading offset relative to reference + secondaryLoadingOffsetDown = -halfSearchRangeDownRaw; + secondaryLoadingOffsetAcross = -halfSearchRangeAcrossRaw; + +} + +void cuAmpcorParameter::_setupParameters_GrIMP() +{ // template window size (after antialiasing oversampling) windowSizeWidth = windowSizeWidthRaw*rawDataOversamplingFactor; // @@ -111,6 +169,14 @@ void cuAmpcorParameter::setupParameters() windowSizeHeightEnlarged = searchWindowSizeHeight; windowSizeWidthEnlarged = searchWindowSizeWidth; + // loading offsets + // reference size matching the secondary size + referenceLoadingOffsetDown = -halfSearchRangeDownRaw; + referenceLoadingOffsetAcross = -halfSearchRangeAcrossRaw; + // secondary loading offset relative to reference + secondaryLoadingOffsetDown = 0; + secondaryLoadingOffsetAcross = 0; + // correlation surface size corrWindowSize = make_int2(searchWindowSizeHeight - windowSizeHeight + 1, searchWindowSizeWidth - windowSizeWidth +1); @@ -127,16 +193,7 @@ void cuAmpcorParameter::setupParameters() corrZoomInSize.y * oversamplingFactor); fprintf(stderr, "zoomWindowSize is (%d, %d)!\n", corrZoomInSize.x, corrZoomInSize.y ); - numberWindows = numberWindowDown*numberWindowAcross; - if(numberWindows <=0) { - fprintf(stderr, "Incorrect number of windows! (%d, %d)\n", numberWindowDown, numberWindowAcross); - exit(EXIT_FAILURE); - } - numberChunkDown = IDIVUP(numberWindowDown, numberWindowDownInChunk); - numberChunkAcross = IDIVUP(numberWindowAcross, numberWindowAcrossInChunk); - numberChunks = numberChunkDown*numberChunkAcross; - allocateArrays(); } @@ -191,13 +248,12 @@ void cuAmpcorParameter::setStartPixels(int *mStartD, int *mStartA, int *gOffsetD { for(int i=0; i cuAmpcorProcessor::create(int workflow, + cuAmpcorParameter *param_, + GDALImage *reference_, GDALImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_) +{ + if (workflow == 0) { + return std::unique_ptr(new cuAmpcorProcessorROIPAC( + param_, reference_, secondary_, offsetImage_, + snrImage_, covImage_, peakValueImage_, stream_)); + } else if (workflow == 1) { + return std::unique_ptr(new cuAmpcorProcessorGrIMP( + param_, reference_, secondary_, offsetImage_, + snrImage_, covImage_, peakValueImage_, stream_)); + } else { + throw std::invalid_argument("Unsupported workflow"); + } +} + +// constructor +cuAmpcorProcessor::cuAmpcorProcessor(cuAmpcorParameter *param_, + GDALImage *reference_, GDALImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_) + : param(param_), referenceImage(reference_), secondaryImage(secondary_), + offsetImage(offsetImage_), snrImage(snrImage_), covImage(covImage_), + peakValueImage(peakValueImage_), stream(stream_) +{ +} + +/// set chunk index +void cuAmpcorProcessor::setIndex(int idxDown_, int idxAcross_) +{ + idxChunkDown = idxDown_; + idxChunkAcross = idxAcross_; + idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross; + + if(idxChunkDown == param->numberChunkDown -1) { + nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1); + } + else { + nWindowsDown = param->numberWindowDownInChunk; + } + + if(idxChunkAcross == param->numberChunkAcross -1) { + nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1); + } + else { + nWindowsAcross = param->numberWindowAcrossInChunk; + } +} + +/// obtain the starting pixels for each chip +/// @param[in] oStartPixel start pixel locations for all chips +/// @param[out] rstartPixel start pixel locations for chips within the chunk +void cuAmpcorProcessor::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff) +{ + for(int i=0; inumberWindowDownInChunk; ++i) { + int iDown = i; + if(i>=nWindowsDown) iDown = nWindowsDown-1; + for(int j=0; jnumberWindowAcrossInChunk; ++j){ + int iAcross = j; + if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; + int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; + int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross + + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; + rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff; + } + } +} + diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h new file mode 100644 index 000000000..4d07d1443 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h @@ -0,0 +1,72 @@ +/* + * @file cuAmpcorChunk.h + * @brief Ampcor processor for a batch of windows + * + * + */ + +#ifndef __CUAMPCORPROCESSOR_H +#define __CUAMPCORPROCESSOR_H + +#include "GDALImage.h" +#include "data_types.h" +#include "cuArrays.h" +#include "cuAmpcorParameter.h" +#include "cuOverSampler.h" +#include "cuSincOverSampler.h" +#include "cuCorrFrequency.h" +#include "cuCorrNormalizer.h" + + +/** + * cuAmpcor batched processor (virtual class) + */ +class cuAmpcorProcessor{ +// shared variables +protected: + int idxChunkDown; ///< index of the chunk in total batches, down + int idxChunkAcross; ///< index of the chunk in total batches, across + int idxChunk; ///< + int nWindowsDown; ///< number of windows in one chunk, down + int nWindowsAcross; ///< number of windows in one chunk, across + + int devId; ///< GPU device ID to use + cudaStream_t stream; ///< CUDA stream to use + + GDALImage *referenceImage; ///< reference image object + GDALImage *secondaryImage; ///< secondary image object + cuAmpcorParameter *param; ///< reference to the (global) parameters + cuArrays *offsetImage; ///< output offsets image + cuArrays *snrImage; ///< snr image + cuArrays *covImage; ///< cov image + cuArrays *peakValueImage; ///< peak value image + + +public: + // default constructor and destructor + cuAmpcorProcessor(cuAmpcorParameter *param_, + GDALImage *reference_, GDALImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); + virtual ~cuAmpcorProcessor() = default; + + // Factory method (virtual constructor) + static std::unique_ptr create(int workflow, + cuAmpcorParameter *param_, + GDALImage *reference_, GDALImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); + + // workflow specific methods + virtual void run(int, int) = 0; + +protected: + // shared methods + void setIndex(int idxDown_, int idxAcross_); + void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff); + +}; + +#endif //__CUAMPCORPROCESSOR_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.cpp similarity index 84% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cpp rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.cpp index 2a59d81bc..e66111f30 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.cpp @@ -1,4 +1,4 @@ -#include "cuAmpcorChunk.h" +#include "cuAmpcorProcessorGrIMP.h" #include "cuAmpcorUtil.h" #include @@ -9,7 +9,7 @@ * @param[in] idxDown_ index of the chunk along Down/Azimuth direction * @param[in] idxAcross_ index of the chunk along Across/Range direction */ -void cuAmpcorChunk::run(int idxDown_, int idxAcross_) +void cuAmpcorProcessorGrIMP::run(int idxDown_, int idxAcross_) { // set chunk index setIndex(idxDown_, idxAcross_); @@ -101,7 +101,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) // statistics of correlation surface // estimate variance on r_corrBatch - cuEstimateVariance(r_corrBatch, offsetInit, r_maxval, r_referenceBatchOverSampled->size, r_covValue, stream); + cuEstimateVariance(r_corrBatch, offsetInit, r_maxval, r_referenceBatchOverSampled->size, param->oversamplingFactor, r_covValue, stream); // snr on the extracted surface r_corrBatchZoomIn cuArraysSumSquare(r_corrBatchZoomIn, r_corrBatchSum, stream); @@ -117,11 +117,9 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) if(param->oversamplingMethod) { // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversamplingfactor // we need the max loc as the center if shifted - std::cout << "Sinc oversampler does not work at this moment\n"; - exit(1); - // corrSincOverSampler->execute(r_corrBatchZoomIn, r_corrBatchZoomInOverSampled, - // maxLocShift, param->oversamplingFactor*param->rawDataOversamplingFactor - // ); + corrSincOverSampler->execute(r_corrBatchZoomIn, r_corrBatchZoomInOverSampled, + maxLocShift, param->oversamplingFactor*param->rawDataOversamplingFactor + ); } else { @@ -166,48 +164,151 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) // all done } -/// set chunk index -void cuAmpcorChunk::setIndex(int idxDown_, int idxAcross_) + + +/// constructor +cuAmpcorProcessorGrIMP::cuAmpcorProcessorGrIMP(cuAmpcorParameter *param_, GDALImage *reference_, GDALImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_) + : cuAmpcorProcessor(param_, reference_, secondary_, offsetImage_, snrImage_, covImage_, peakValueImage_, stream_) { - idxChunkDown = idxDown_; - idxChunkAcross = idxAcross_; - idxChunk = idxChunkAcross + idxChunkDown*param->numberChunkAcross; - if(idxChunkDown == param->numberChunkDown -1) { - nWindowsDown = param->numberWindowDown - param->numberWindowDownInChunk*(param->numberChunkDown -1); - } - else { - nWindowsDown = param->numberWindowDownInChunk; - } + // allocate the SLC image read buffer + ChunkOffsetDown = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + ChunkOffsetDown->allocate(); + ChunkOffsetDown->allocateHost(); + ChunkOffsetAcross = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + ChunkOffsetAcross->allocate(); + ChunkOffsetAcross->allocateHost(); + + c_referenceBatchRaw = new cuArrays ( + param->windowSizeHeightRawEnlarged, param->windowSizeWidthRawEnlarged, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_referenceBatchRaw->allocate(); + + c_secondaryBatchRaw = new cuArrays ( + param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_secondaryBatchRaw->allocate(); + + c_referenceBatchOverSampled = new cuArrays ( + param->windowSizeHeightEnlarged, param->windowSizeWidthEnlarged, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_referenceBatchOverSampled->allocate(); + + c_secondaryBatchOverSampled = new cuArrays ( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_secondaryBatchOverSampled->allocate(); + + r_referenceBatchOverSampled = new cuArrays ( + param->windowSizeHeight, param->windowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_referenceBatchOverSampled->allocate(); + + r_secondaryBatchOverSampled = new cuArrays ( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_secondaryBatchOverSampled->allocate(); - if(idxChunkAcross == param->numberChunkAcross -1) { - nWindowsAcross = param->numberWindowAcross - param->numberWindowAcrossInChunk*(param->numberChunkAcross -1); + referenceBatchOverSampler = new cuOverSamplerC2C( + c_referenceBatchRaw->height, c_referenceBatchRaw->width, //orignal size + c_referenceBatchOverSampled->height, c_referenceBatchOverSampled->width, //oversampled size + c_referenceBatchRaw->count, stream); + + secondaryBatchOverSampler = new cuOverSamplerC2C( + c_secondaryBatchRaw->height, c_secondaryBatchRaw->width, + c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, + c_secondaryBatchRaw->count, stream); + + r_corrBatch = new cuArrays ( + param->corrWindowSize.x, + param->corrWindowSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatch->allocate(); + + r_corrBatchZoomIn = new cuArrays ( + param->corrZoomInSize.x, + param->corrZoomInSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchZoomIn->allocate(); + + r_corrBatchZoomInOverSampled = new cuArrays ( + param->corrZoomInOversampledSize.x, + param->corrZoomInOversampledSize.y, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchZoomInOverSampled->allocate(); + + offsetInit = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetInit->allocate(); + + offsetZoomIn = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetZoomIn->allocate(); + + offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetFinal->allocate(); + + // the sinc interpolation center is at the correlation surface center + // E.g. (10, 10) for 21x21. + maxLocShift = make_int2(param->corrWindowSize.x/2, param->corrWindowSize.y/2); + + corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + corrMaxValue->allocate(); + + r_corrBatchSum = new cuArrays ( + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchSum->allocate(); + + i_maxloc = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + i_maxloc->allocate(); + + r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_maxval->allocate(); + + r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_snrValue->allocate(); + + r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_covValue->allocate(); + + // end of new arrays + + if(param->oversamplingMethod) { + corrSincOverSampler = new cuSincOverSamplerR2R(param->oversamplingFactor, stream); } else { - nWindowsAcross = param->numberWindowAcrossInChunk; + corrOverSampler= new cuOverSamplerR2R( + r_corrBatchZoomIn->height, r_corrBatchZoomIn->width, + r_corrBatchZoomInOverSampled->height, r_corrBatchZoomInOverSampled->width, + param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, + stream); } -} - -/// obtain the starting pixels for each chip -/// @param[in] oStartPixel start pixel locations for all chips -/// @param[out] rstartPixel start pixel locations for chips within the chunk -void cuAmpcorChunk::getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff) -{ - for(int i=0; inumberWindowDownInChunk; ++i) { - int iDown = i; - if(i>=nWindowsDown) iDown = nWindowsDown-1; - for(int j=0; jnumberWindowAcrossInChunk; ++j){ - int iAcross = j; - if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; - int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; - int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross - + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; - rStartPixel[idxInChunk] = oStartPixel[idxInAll] - diff; - } + if(param->algorithm == 0) { + cuCorrFreqDomain_OverSampled = new cuFreqCorrelator( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk, + stream); } + + + corrNormalizerOverSampled = + std::unique_ptr(newCuNormalizer( + param->searchWindowSizeHeight, + param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + )); + + +#ifdef CUAMPCOR_DEBUG + std::cout << "all objects in chunk are created ...\n"; +#endif } -void cuAmpcorChunk::loadReferenceChunk() +void cuAmpcorProcessorGrIMP::loadReferenceChunk() { // we first load the whole chunk of image from cpu to a gpu buffer c(r)_referenceChunkRaw @@ -245,31 +346,29 @@ void cuAmpcorChunk::loadReferenceChunk() ChunkOffsetAcross->copyToDevice(stream); #ifdef CUAMPCOR_DEBUG - std::cout << "loading reference windows from chunk debug ... \n"; - auto * startPixelDownToChunk = ChunkOffsetDown->hostData; - auto * startPixelAcrossToChunk = ChunkOffsetAcross->hostData; - - for(int i=0; inumberWindowDownInChunk; ++i) { - int iDown = i; - if(i>=nWindowsDown) iDown = nWindowsDown-1; - for(int j=0; jnumberWindowAcrossInChunk; ++j){ - int iAcross = j; - if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; - int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; - int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross - + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; - std::cout << "Window index in chuck: (" << iDown << ", " << iAcross << ") \n"; - std::cout << " Staring pixel location from raw: (" << param->referenceStartPixelDown[idxInAll] << ", " - << param->referenceStartPixelAcross[idxInAll] <<")\n"; - std::cout << " Staring pixel location from chunk: (" << startPixelDownToChunk[idxInChunk] << ", " - << startPixelAcrossToChunk[idxInChunk] <<")\n"; + std::cout << "loading reference windows from chunk debug ... \n"; + auto * startPixelDownToChunk = ChunkOffsetDown->hostData; + auto * startPixelAcrossToChunk = ChunkOffsetAcross->hostData; + + for(int i=0; inumberWindowDownInChunk; ++i) { + int iDown = i; + if(i>=nWindowsDown) iDown = nWindowsDown-1; + for(int j=0; jnumberWindowAcrossInChunk; ++j){ + int iAcross = j; + if(j>=nWindowsAcross) iAcross = nWindowsAcross-1; + int idxInChunk = iDown*param->numberWindowAcrossInChunk+iAcross; + int idxInAll = (iDown+idxChunkDown*param->numberWindowDownInChunk)*param->numberWindowAcross + + idxChunkAcross*param->numberWindowAcrossInChunk+iAcross; + std::cout << "Window index in chuck: (" << iDown << ", " << iAcross << ") \n"; + std::cout << " Staring pixel location from raw: (" << param->referenceStartPixelDown[idxInAll] << ", " + << param->referenceStartPixelAcross[idxInAll] <<")\n"; + std::cout << " Staring pixel location from chunk: (" << startPixelDownToChunk[idxInChunk] << ", " + << startPixelAcrossToChunk[idxInChunk] <<")\n"; + } } - } - #endif - // check whether the image is complex (e.g., SLC) or real( e.g. TIFF) if(referenceImage->isComplex()) { @@ -314,7 +413,7 @@ void cuAmpcorChunk::loadReferenceChunk() } // end of if all pixels out of range } -void cuAmpcorChunk::loadSecondaryChunk() +void cuAmpcorProcessorGrIMP::loadSecondaryChunk() { // get the chunk size to be loaded to gpu int height = param->secondaryChunkHeight[idxChunk]; // number of pixels along height @@ -381,157 +480,8 @@ void cuAmpcorChunk::loadSecondaryChunk() } } -/// constructor -cuAmpcorChunk::cuAmpcorChunk(cuAmpcorParameter *param_, GDALImage *reference_, GDALImage *secondary_, - cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, - cudaStream_t stream_) - -{ - param = param_; - referenceImage = reference_; - secondaryImage = secondary_; - offsetImage = offsetImage_; - snrImage = snrImage_; - covImage = covImage_; - peakValueImage = peakValueImage_; - - stream = stream_; - - ChunkOffsetDown = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - ChunkOffsetDown->allocate(); - ChunkOffsetDown->allocateHost(); - ChunkOffsetAcross = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - ChunkOffsetAcross->allocate(); - ChunkOffsetAcross->allocateHost(); - - c_referenceBatchRaw = new cuArrays ( - param->windowSizeHeightRawEnlarged, param->windowSizeWidthRawEnlarged, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - c_referenceBatchRaw->allocate(); - - c_secondaryBatchRaw = new cuArrays ( - param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - c_secondaryBatchRaw->allocate(); - - c_referenceBatchOverSampled = new cuArrays ( - param->windowSizeHeightEnlarged, param->windowSizeWidthEnlarged, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - c_referenceBatchOverSampled->allocate(); - - c_secondaryBatchOverSampled = new cuArrays ( - param->searchWindowSizeHeight, param->searchWindowSizeWidth, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - c_secondaryBatchOverSampled->allocate(); - - r_referenceBatchOverSampled = new cuArrays ( - param->windowSizeHeight, param->windowSizeWidth, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - r_referenceBatchOverSampled->allocate(); - - r_secondaryBatchOverSampled = new cuArrays ( - param->searchWindowSizeHeight, param->searchWindowSizeWidth, - param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - r_secondaryBatchOverSampled->allocate(); - - referenceBatchOverSampler = new cuOverSamplerC2C( - c_referenceBatchRaw->height, c_referenceBatchRaw->width, //orignal size - c_referenceBatchOverSampled->height, c_referenceBatchOverSampled->width, //oversampled size - c_referenceBatchRaw->count, stream); - - secondaryBatchOverSampler = new cuOverSamplerC2C( - c_secondaryBatchRaw->height, c_secondaryBatchRaw->width, - c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, - c_secondaryBatchRaw->count, stream); - - r_corrBatch = new cuArrays ( - param->corrWindowSize.x, - param->corrWindowSize.y, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - r_corrBatch->allocate(); - - r_corrBatchZoomIn = new cuArrays ( - param->corrZoomInSize.x, - param->corrZoomInSize.y, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - r_corrBatchZoomIn->allocate(); - - r_corrBatchZoomInOverSampled = new cuArrays ( - param->corrZoomInOversampledSize.x, - param->corrZoomInOversampledSize.y, - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - r_corrBatchZoomInOverSampled->allocate(); - - offsetInit = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - offsetInit->allocate(); - - offsetZoomIn = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - offsetZoomIn->allocate(); - - offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - offsetFinal->allocate(); - - maxLocShift = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - maxLocShift->allocate(); - - corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - corrMaxValue->allocate(); - - r_corrBatchSum = new cuArrays ( - param->numberWindowDownInChunk, - param->numberWindowAcrossInChunk); - r_corrBatchSum->allocate(); - - i_maxloc = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - i_maxloc->allocate(); - - r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - r_maxval->allocate(); - - r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - r_snrValue->allocate(); - - r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); - r_covValue->allocate(); - - // end of new arrays - - if(param->oversamplingMethod) { - corrSincOverSampler = new cuSincOverSamplerR2R(param->oversamplingFactor, stream); - } - else { - corrOverSampler= new cuOverSamplerR2R( - r_corrBatchZoomIn->height, r_corrBatchZoomIn->width, - r_corrBatchZoomInOverSampled->height, r_corrBatchZoomInOverSampled->width, - param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, - stream); - } - if(param->algorithm == 0) { - cuCorrFreqDomain_OverSampled = new cuFreqCorrelator( - param->searchWindowSizeHeight, param->searchWindowSizeWidth, - param->numberWindowDownInChunk * param->numberWindowAcrossInChunk, - stream); - } - - - corrNormalizerOverSampled = - std::unique_ptr(newCuNormalizer( - param->searchWindowSizeHeight, - param->searchWindowSizeWidth, - param->numberWindowDownInChunk * param->numberWindowAcrossInChunk - )); - - -#ifdef CUAMPCOR_DEBUG - std::cout << "all objects in chunk are created ...\n"; -#endif -} - // destructor -cuAmpcorChunk::~cuAmpcorChunk() +cuAmpcorProcessorGrIMP::~cuAmpcorProcessorGrIMP() { corrNormalizerOverSampled.release(); @@ -562,7 +512,6 @@ cuAmpcorChunk::~cuAmpcorChunk() delete offsetInit; delete offsetZoomIn; delete offsetFinal; - delete maxLocShift; delete corrMaxValue; delete r_corrBatchSum; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.h similarity index 59% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.h index beb127097..bad19c74d 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.h @@ -1,44 +1,22 @@ /* - * @file cuAmpcorChunk.h - * @brief Ampcor processor for a batch of windows + * @file cuAmpcorProcessorGrIMP.h + * @brief Ampcor processor for a batch of windows with GrIMP workflow * * */ -#ifndef __CUAMPCORCHUNK_H -#define __CUAMPCORCHUNK_H +#ifndef __CUAMPCORPROCESSORGRIMP_H +#define __CUAMPCORPROCESSORGRIMP_H -#include "GDALImage.h" -#include "data_types.h" -#include "cuArrays.h" -#include "cuAmpcorParameter.h" -#include "cuOverSampler.h" -#include "cuSincOverSampler.h" -#include "cuCorrFrequency.h" -#include "cuCorrNormalizer.h" +#include "cuAmpcorProcessor.h" /** * cuAmpcor processor for a chunk (a batch of windows) */ -class cuAmpcorChunk{ +class cuAmpcorProcessorGrIMP : public cuAmpcorProcessor { + private: - int idxChunkDown; ///< index of the chunk in total batches, down - int idxChunkAcross; ///< index of the chunk in total batches, across - int idxChunk; ///< - int nWindowsDown; ///< number of windows in one chunk, down - int nWindowsAcross; ///< number of windows in one chunk, across - - int devId; ///< GPU device ID to use - cudaStream_t stream; ///< CUDA stream to use - - GDALImage *referenceImage; ///< reference image object - GDALImage *secondaryImage; ///< secondary image object - cuAmpcorParameter *param; ///< reference to the (global) parameters - cuArrays *offsetImage; ///< output offsets image - cuArrays *snrImage; ///< snr image - cuArrays *covImage; ///< cov image - cuArrays *peakValueImage; ///< peak value image // local variables and workers // gpu buffer to load images from file @@ -47,18 +25,18 @@ class cuAmpcorChunk{ cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; + // offset data + cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; + // windows raw (not oversampled) data, complex and real cuArrays * c_referenceBatchRaw, * c_secondaryBatchRaw; - cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; + // cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; // windows oversampled data cuArrays * c_referenceBatchOverSampled, * c_secondaryBatchOverSampled; cuArrays * r_referenceBatchOverSampled, * r_secondaryBatchOverSampled; cuArrays * r_corrBatch, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled; - // offset data - cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; - // oversampling processors for complex images cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler; @@ -77,7 +55,7 @@ class cuAmpcorChunk{ cuArrays *offsetInit; cuArrays *offsetZoomIn; cuArrays *offsetFinal; - cuArrays *maxLocShift; // record the maxloc from the extract center + int2 maxLocShift; // record the maxloc from the extract center cuArrays *corrMaxValue; cuArrays *i_maxloc; cuArrays *r_maxval; @@ -92,23 +70,19 @@ class cuAmpcorChunk{ public: // constructor - cuAmpcorChunk(cuAmpcorParameter *param_, + cuAmpcorProcessorGrIMP(cuAmpcorParameter *param_, GDALImage *reference_, GDALImage *secondary_, cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, cudaStream_t stream_); // destructor - ~cuAmpcorChunk(); + ~cuAmpcorProcessorGrIMP() override; + + // run the given chunk + void run(int, int) override; - // local methods - void setIndex(int idxDown_, int idxAcross_); void loadReferenceChunk(); void loadSecondaryChunk(); - void getRelativeOffset(int *rStartPixel, const int *oStartPixel, int diff); - // run the given chunk - void run(int, int); }; - - -#endif +#endif //__CUAMPCORPROCESSORGRIMP_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.cpp new file mode 100644 index 000000000..aea21e964 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.cpp @@ -0,0 +1,619 @@ +#include "cuAmpcorProcessorROIPAC.h" + +#include "cuAmpcorUtil.h" +#include +#include + +/** + * Run ampcor process for a batch of images (a chunk) + * @param[in] idxDown_ index of the chunk along Down/Azimuth direction + * @param[in] idxAcross_ index of the chunk along Across/Range direction + */ +void cuAmpcorProcessorROIPAC::run(int idxDown_, int idxAcross_) +{ + // set chunk index + setIndex(idxDown_, idxAcross_); + + // load reference image chunk + loadReferenceChunk(); + // take amplitudes + cuArraysAbs(c_referenceBatchRaw, r_referenceBatchRaw, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the raw reference image(s) + c_referenceBatchRaw->outputToFile("c_referenceBatchRaw", stream); + r_referenceBatchRaw->outputToFile("r_referenceBatchRaw", stream); +#endif + + // compute and subtract mean values (for normalized) + cuArraysSubtractMean(r_referenceBatchRaw, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the raw reference image(s) + r_referenceBatchRaw->outputToFile("r_referenceBatchRawSubMean", stream); +#endif + + // load secondary image chunk + loadSecondaryChunk(); + // take amplitudes + cuArraysAbs(c_secondaryBatchRaw, r_secondaryBatchRaw, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the raw secondary image(s) + c_secondaryBatchRaw->outputToFile("c_secondaryBatchRaw", stream); + r_secondaryBatchRaw->outputToFile("r_secondaryBatchRaw", stream); +#endif + + //cross correlation for un-oversampled data + if(param->algorithm == 0) { + cuCorrFreqDomain->execute(r_referenceBatchRaw, r_secondaryBatchRaw, r_corrBatchRaw); + } else { + cuCorrTimeDomain(r_referenceBatchRaw, r_secondaryBatchRaw, r_corrBatchRaw, stream); //time domain cross correlation + } + +#ifdef CUAMPCOR_DEBUG + // dump the un-normalized correlation surface + r_corrBatchRaw->outputToFile("r_corrBatchRawUnNorm", stream); +#endif + + // normalize the correlation surface + corrNormalizerRaw->execute(r_corrBatchRaw, r_referenceBatchRaw, r_secondaryBatchRaw, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the normalized correlation surface + r_corrBatchRaw->outputToFile("r_corrBatchRaw", stream); +#endif + + // find the maximum location of none-oversampled correlation + // 41 x 41, if halfsearchrange=20 + cuArraysMaxloc2D(r_corrBatchRaw, offsetInit, r_maxval, stream); + + // estimate variance + cuEstimateVariance(r_corrBatchRaw, offsetInit, r_maxval, r_referenceBatchRaw->size, 1, r_covValue, stream); + + // estimate SNR + // step1: extraction of correlation surface around the peak + cuArraysCopyExtractCorr(r_corrBatchRaw, r_corrBatchRawZoomIn, i_corrBatchZoomInValid, offsetInit, stream); + + // step2: summation of correlation and data point values + cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream); + +#ifdef CUAMPCOR_DEBUG + r_maxval->outputToFile("r_maxval", stream); + r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn", stream); + i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid", stream); + r_corrBatchSum->outputToFile("r_corrBatchSum", stream); + i_corrBatchValidCount->outputToFile("i_corrBatchValidCount", stream); +#endif + + // step3: divide the peak value by the mean of surrounding values + cuEstimateSnr(r_corrBatchSum, i_corrBatchValidCount, r_maxval, r_snrValue, stream); + +#ifdef CUAMPCOR_DEBUG + offsetInit->outputToFile("i_offsetInit", stream); + r_snrValue->outputToFile("r_snrValue", stream); + r_covValue->outputToFile("r_covValue", stream); +#endif + + // Using the approximate estimation to adjust secondary image (half search window size becomes only 4 pixels) + // determine the starting pixel to extract secondary images around the max location + cuDetermineSecondaryExtractOffset(offsetInit, + maxLocShift, + param->halfSearchRangeDownRaw, // old range + param->halfSearchRangeAcrossRaw, + param->halfZoomWindowSizeRaw, // new range + param->halfZoomWindowSizeRaw, + stream); + +#ifdef CUAMPCOR_DEBUG + offsetInit->outputToFile("i_offsetInitAdjusted", stream); + maxLocShift->outputToFile("i_maxLocShift", stream); +#endif + + // oversample reference + // (deramping included in oversampler) + referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod); + // take amplitudes + cuArraysAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled reference image(s) + c_referenceBatchOverSampled->outputToFile("c_referenceBatchOverSampled", stream); + r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampled", stream); +#endif + + // compute and subtract the mean value + cuArraysSubtractMean(r_referenceBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled reference image(s) with mean subtracted + r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean",stream); +#endif + + // extract secondary and oversample + cuArraysCopyExtract(c_secondaryBatchRaw, c_secondaryBatchZoomIn, offsetInit, stream); + secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled, param->derampMethod); + // take amplitudes + cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the extracted raw secondary image + c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomIn", stream); + // dump the oversampled secondary image(s) + c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled", stream); + r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled", stream); +#endif + + // correlate oversampled images + if(param->algorithm == 0) { + cuCorrFreqDomain_OverSampled->execute(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatchZoomIn); + } + else { + cuCorrTimeDomain(r_referenceBatchOverSampled, r_secondaryBatchOverSampled, r_corrBatchZoomIn, stream); + } + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface (un-normalized) + r_corrBatchZoomIn->outputToFile("r_corrBatchZoomInUnNorm", stream); +#endif + + // normalize the correlation surface + corrNormalizerOverSampled->execute(r_corrBatchZoomIn, r_referenceBatchOverSampled, r_secondaryBatchOverSampled, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface (normalized) + r_corrBatchZoomIn->outputToFile("r_corrBatchZoomIn", stream); +#endif + + // remove the last row and col to get even sequences + cuArraysCopyExtract(r_corrBatchZoomIn, r_corrBatchZoomInAdjust, make_int2(0,0), stream); + +#ifdef CUAMPCOR_DEBUG + // dump the adjusted correlation Surface + r_corrBatchZoomInAdjust->outputToFile("r_corrBatchZoomInAdjust", stream); +#endif + + // oversample the correlation surface + if(param->oversamplingMethod) { + // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversamplingfactor + // we need the max loc as the center if shifted + corrSincOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled, + maxLocShift, param->oversamplingFactor*param->rawDataOversamplingFactor + ); + } + else { + corrOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled); + } + +#ifdef CUAMPCOR_DEBUG + // dump the oversampled correlation surface + r_corrBatchZoomInOverSampled->outputToFile("r_corrBatchZoomInOverSampled", stream); +#endif + + //find the max again + cuArraysMaxloc2D(r_corrBatchZoomInOverSampled, offsetZoomIn, corrMaxValue, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the max location on oversampled correlation surface + offsetZoomIn->outputToFile("i_offsetZoomIn", stream); + corrMaxValue->outputToFile("r_maxvalZoomInOversampled", stream); +#endif + + // determine the final offset from non-oversampled (pixel) and oversampled (sub-pixel) + // = (Init-HalfsearchRange) + ZoomIn/(2*ovs) + cuSubPixelOffset2Pass(offsetInit, offsetZoomIn, offsetFinal, + param->oversamplingFactor, param->rawDataOversamplingFactor, + param->halfSearchRangeDownRaw, param->halfSearchRangeAcrossRaw, + stream); + + // Insert the chunk results to final images + cuArraysCopyInsert(offsetFinal, offsetImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // snr + cuArraysCopyInsert(r_snrValue, snrImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // Variance. + cuArraysCopyInsert(r_covValue, covImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // peak value. + cuArraysCopyInsert(r_maxval, peakValueImage, idxDown_*param->numberWindowDownInChunk, idxAcross_*param->numberWindowAcrossInChunk,stream); + // all done + +} + +void cuAmpcorProcessorROIPAC::loadReferenceChunk() +{ + + // we first load the whole chunk of image from cpu to a gpu buffer c(r)_referenceChunkRaw + // then copy to a batch of windows with (nImages, height, width) (leading dimension on the right) + + // get the chunk size to be loaded to gpu + int startD = param->referenceChunkStartPixelDown[idxChunk]; //start pixel down (along height) + int startA = param->referenceChunkStartPixelAcross[idxChunk]; // start pixel across (along width) + int height = param->referenceChunkHeight[idxChunk]; // number of pixels along height + int width = param->referenceChunkWidth[idxChunk]; // number of pixels along width + + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) + { + // yes, simply set the image to 0 + c_referenceBatchRaw->setZero(stream); + } + else + { + // use cpu to compute the starting positions for each window + getRelativeOffset(ChunkOffsetDown->hostData, param->referenceStartPixelDown, param->referenceChunkStartPixelDown[idxChunk]); + // copy the positions to gpu + ChunkOffsetDown->copyToDevice(stream); + // same for the across direction + getRelativeOffset(ChunkOffsetAcross->hostData, param->referenceStartPixelAcross, param->referenceChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + + // check whether the image is complex (e.g., SLC) or real( e.g. TIFF) + if(referenceImage->isComplex()) + { + // allocate a gpu buffer to load data from cpu/file + // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 + + c_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + c_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)c_referenceChunkRaw->devData, startD, startA, height, width, stream); + + //copy the chunk to a batch format (nImages, height, width) + // if derampMethod = 0 (no deramp), take amplitudes; otherwise, copy complex data + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + // deallocate the gpu buffer + delete c_referenceChunkRaw; + } + // if the image is real + else { + r_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + r_referenceChunkRaw->allocate(); + + // load the data from cpu + referenceImage->loadToDevice((void *)r_referenceChunkRaw->devData, startD, startA, height, width, stream); + + // copy the chunk (real) to a batch format (complex) + cuArraysCopyToBatchWithOffsetR2C(r_referenceChunkRaw, + param->referenceChunkHeight[idxChunk], param->referenceChunkWidth[idxChunk], + c_referenceBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + // deallocate the gpu buffer + delete r_referenceChunkRaw; + } // end of if complex + } // end of if all pixels out of range +} + +void cuAmpcorProcessorROIPAC::loadSecondaryChunk() +{ + // get the chunk size to be loaded to gpu + int height = param->secondaryChunkHeight[idxChunk]; // number of pixels along height + int width = param->secondaryChunkWidth[idxChunk]; // number of pixels along width + + // check whether all pixels are outside the original image range + if (height ==0 || width ==0) + { + // yes, simply set the image to 0 + c_secondaryBatchRaw->setZero(stream); + } + else + { + //copy to a batch format (nImages, height, width) + getRelativeOffset(ChunkOffsetDown->hostData, param->secondaryStartPixelDown, param->secondaryChunkStartPixelDown[idxChunk]); + ChunkOffsetDown->copyToDevice(stream); + getRelativeOffset(ChunkOffsetAcross->hostData, param->secondaryStartPixelAcross, param->secondaryChunkStartPixelAcross[idxChunk]); + ChunkOffsetAcross->copyToDevice(stream); + + if(secondaryImage->isComplex()) + { + c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + c_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + secondaryImage->loadToDevice(c_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + if(param->derampMethod == 0) { + cuArraysCopyToBatchAbsWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + else { + cuArraysCopyToBatchWithOffset(c_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + } + delete c_secondaryChunkRaw; + } + else { //real image + //allocate the gpu buffer + r_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + r_secondaryChunkRaw->allocate(); + + //load a chunk from mmap to gpu + secondaryImage->loadToDevice(r_secondaryChunkRaw->devData, + param->secondaryChunkStartPixelDown[idxChunk], + param->secondaryChunkStartPixelAcross[idxChunk], + param->secondaryChunkHeight[idxChunk], + param->secondaryChunkWidth[idxChunk], + stream); + + // convert to the batch format + cuArraysCopyToBatchWithOffsetR2C(r_secondaryChunkRaw, + param->secondaryChunkHeight[idxChunk], param->secondaryChunkWidth[idxChunk], + c_secondaryBatchRaw, ChunkOffsetDown->devData, ChunkOffsetAcross->devData, stream); + delete r_secondaryChunkRaw; + } + } +} + +/// constructor +cuAmpcorProcessorROIPAC::cuAmpcorProcessorROIPAC(cuAmpcorParameter *param_, GDALImage *reference_, GDALImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_) + : cuAmpcorProcessor(param_, reference_, secondary_, offsetImage_, snrImage_, covImage_, peakValueImage_, stream_) +{ + param = param_; + referenceImage = reference_; + secondaryImage = secondary_; + offsetImage = offsetImage_; + snrImage = snrImage_; + covImage = covImage_; + + stream = stream_; + + ChunkOffsetDown = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + ChunkOffsetDown->allocate(); + ChunkOffsetDown->allocateHost(); + ChunkOffsetAcross = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + ChunkOffsetAcross->allocate(); + ChunkOffsetAcross->allocateHost(); + + c_referenceBatchRaw = new cuArrays ( + param->windowSizeHeightRaw, param->windowSizeWidthRaw, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_referenceBatchRaw->allocate(); + + c_secondaryBatchRaw = new cuArrays ( + param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_secondaryBatchRaw->allocate(); + + r_referenceBatchRaw = new cuArrays ( + param->windowSizeHeightRaw, param->windowSizeWidthRaw, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_referenceBatchRaw->allocate(); + + r_secondaryBatchRaw = new cuArrays ( + param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_secondaryBatchRaw->allocate(); + + c_secondaryBatchZoomIn = new cuArrays ( + param->searchWindowSizeHeightRawZoomIn, param->searchWindowSizeWidthRawZoomIn, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_secondaryBatchZoomIn->allocate(); + + c_referenceBatchOverSampled = new cuArrays ( + param->windowSizeHeight, param->windowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_referenceBatchOverSampled->allocate(); + + c_secondaryBatchOverSampled = new cuArrays ( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + c_secondaryBatchOverSampled->allocate(); + + r_referenceBatchOverSampled = new cuArrays ( + param->windowSizeHeight, param->windowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_referenceBatchOverSampled->allocate(); + + r_secondaryBatchOverSampled = new cuArrays ( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + r_secondaryBatchOverSampled->allocate(); + + referenceBatchOverSampler = new cuOverSamplerC2C( + c_referenceBatchRaw->height, c_referenceBatchRaw->width, //orignal size + c_referenceBatchOverSampled->height, c_referenceBatchOverSampled->width, //oversampled size + c_referenceBatchRaw->count, stream); + + secondaryBatchOverSampler = new cuOverSamplerC2C(c_secondaryBatchZoomIn->height, c_secondaryBatchZoomIn->width, + c_secondaryBatchOverSampled->height, c_secondaryBatchOverSampled->width, c_secondaryBatchRaw->count, stream); + + r_corrBatchRaw = new cuArrays ( + param->searchWindowSizeHeightRaw-param->windowSizeHeightRaw+1, + param->searchWindowSizeWidthRaw-param->windowSizeWidthRaw+1, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchRaw->allocate(); + + r_corrBatchZoomIn = new cuArrays ( + param->searchWindowSizeHeight - param->windowSizeHeight+1, + param->searchWindowSizeWidth - param->windowSizeWidth+1, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchZoomIn->allocate(); + + r_corrBatchZoomInAdjust = new cuArrays ( + param->searchWindowSizeHeight - param->windowSizeHeight, + param->searchWindowSizeWidth - param->windowSizeWidth, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchZoomInAdjust->allocate(); + + + r_corrBatchZoomInOverSampled = new cuArrays ( + param->zoomWindowSize * param->oversamplingFactor, + param->zoomWindowSize * param->oversamplingFactor, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchZoomInOverSampled->allocate(); + + offsetInit = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetInit->allocate(); + + offsetZoomIn = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetZoomIn->allocate(); + + offsetFinal = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + offsetFinal->allocate(); + + maxLocShift = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + maxLocShift->allocate(); + + corrMaxValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + corrMaxValue->allocate(); + + + // new arrays due to snr estimation + r_corrBatchRawZoomIn = new cuArrays ( + param->corrRawZoomInHeight, + param->corrRawZoomInWidth, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchRawZoomIn->allocate(); + + i_corrBatchZoomInValid = new cuArrays ( + param->corrRawZoomInHeight, + param->corrRawZoomInWidth, + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + i_corrBatchZoomInValid->allocate(); + + + r_corrBatchSum = new cuArrays ( + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + r_corrBatchSum->allocate(); + + i_corrBatchValidCount = new cuArrays ( + param->numberWindowDownInChunk, + param->numberWindowAcrossInChunk); + i_corrBatchValidCount->allocate(); + + i_maxloc = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + + i_maxloc->allocate(); + + r_maxval = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + + r_maxval->allocate(); + + r_snrValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + + r_snrValue->allocate(); + + r_covValue = new cuArrays (param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); + + r_covValue->allocate(); + + // end of new arrays + + if(param->oversamplingMethod) { + corrSincOverSampler = new cuSincOverSamplerR2R(param->oversamplingFactor, stream); + } + else { + corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize, + (param->zoomWindowSize)*param->oversamplingFactor, + (param->zoomWindowSize)*param->oversamplingFactor, + param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, + stream); + } + if(param->algorithm == 0) { + cuCorrFreqDomain = new cuFreqCorrelator( + param->searchWindowSizeHeightRaw, param->searchWindowSizeWidthRaw, + param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, + stream); + cuCorrFreqDomain_OverSampled = new cuFreqCorrelator( + param->searchWindowSizeHeight, param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk, + stream); + } + + corrNormalizerRaw = std::unique_ptr(newCuNormalizer( + param->searchWindowSizeHeightRaw, + param->searchWindowSizeWidthRaw, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + )); + + corrNormalizerOverSampled = + std::unique_ptr(newCuNormalizer( + param->searchWindowSizeHeight, + param->searchWindowSizeWidth, + param->numberWindowDownInChunk * param->numberWindowAcrossInChunk + )); + + +#ifdef CUAMPCOR_DEBUG + std::cout << "all objects in chunk are created ...\n"; +#endif +} + +// destructor +cuAmpcorProcessorROIPAC::~cuAmpcorProcessorROIPAC() +{ + corrNormalizerOverSampled.release(); + corrNormalizerRaw.release(); + + if(param->oversamplingMethod) { + delete corrSincOverSampler; + } + else { + delete corrOverSampler; + } + if(param->algorithm == 0) { + delete cuCorrFreqDomain; + delete cuCorrFreqDomain_OverSampled; + } + + delete ChunkOffsetDown ; + delete ChunkOffsetAcross ; + delete c_referenceBatchRaw; + delete c_secondaryBatchRaw; + delete r_referenceBatchRaw; + delete r_secondaryBatchRaw; + delete c_secondaryBatchZoomIn; + delete c_referenceBatchOverSampled; + delete c_secondaryBatchOverSampled; + delete r_referenceBatchOverSampled; + delete r_secondaryBatchOverSampled; + delete referenceBatchOverSampler; + delete secondaryBatchOverSampler; + + delete r_corrBatchRaw; + delete r_corrBatchZoomIn; + delete r_corrBatchZoomInAdjust; + delete r_corrBatchZoomInOverSampled; + delete offsetInit; + delete offsetZoomIn; + delete offsetFinal; + delete maxLocShift; + delete corrMaxValue; + + delete r_corrBatchRawZoomIn; + delete i_corrBatchZoomInValid; + delete r_corrBatchSum; + delete i_corrBatchValidCount; + delete i_maxloc; + delete r_maxval; + delete r_snrValue; + delete r_covValue; + + // end of deletions + +} + +// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.h new file mode 100644 index 000000000..43d4151a5 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.h @@ -0,0 +1,88 @@ +/* + * @file cuAmpcorProcessorROIPAC.h + * @brief Ampcor processor for a batch of windows with ROIPAC workflow + * + * + */ + +#ifndef __CUAMPCORPROCESSORROIPAC_H +#define __CUAMPCORPROCESSORROIPAC_H + +#include "cuAmpcorProcessor.h" + + +/** + * cuAmpcor processor for a chunk (a batch of windows) + */ +class cuAmpcorProcessorROIPAC : public cuAmpcorProcessor{ +private: + + // local variables and workers + // gpu buffer to load images from file + cuArrays * c_referenceChunkRaw, * c_secondaryChunkRaw; + cuArrays * r_referenceChunkRaw, * r_secondaryChunkRaw; + + // windows raw (not oversampled) data, complex and real + cuArrays * c_referenceBatchRaw, * c_secondaryBatchRaw, * c_secondaryBatchZoomIn; + cuArrays * r_referenceBatchRaw, * r_secondaryBatchRaw; + + // windows oversampled data + cuArrays * c_referenceBatchOverSampled, * c_secondaryBatchOverSampled; + cuArrays * r_referenceBatchOverSampled, * r_secondaryBatchOverSampled; + cuArrays * r_corrBatchRaw, * r_corrBatchZoomIn, * r_corrBatchZoomInOverSampled, * r_corrBatchZoomInAdjust; + + // offset data + cuArrays *ChunkOffsetDown, *ChunkOffsetAcross; + + // oversampling processors for complex images + cuOverSamplerC2C *referenceBatchOverSampler, *secondaryBatchOverSampler; + + // oversampling processor for correlation surface + cuOverSamplerR2R *corrOverSampler; + cuSincOverSamplerR2R *corrSincOverSampler; + + // cross-correlation processor with frequency domain algorithm + cuFreqCorrelator *cuCorrFreqDomain, *cuCorrFreqDomain_OverSampled; + + // correlation surface normalizer + std::unique_ptr corrNormalizerRaw; + std::unique_ptr corrNormalizerOverSampled; + + // save offset results in different stages + cuArrays *offsetInit; + cuArrays *offsetZoomIn; + cuArrays *offsetFinal; + cuArrays *maxLocShift; // record the maxloc from the extract center + cuArrays *corrMaxValue; + cuArrays *i_maxloc; + cuArrays *r_maxval; + + // SNR estimation + cuArrays *r_corrBatchRawZoomIn; + cuArrays *r_corrBatchSum; + cuArrays *i_corrBatchZoomInValid, *i_corrBatchValidCount; + cuArrays *r_snrValue; + + // Variance estimation + cuArrays *r_covValue; + +public: + // constructor + cuAmpcorProcessorROIPAC(cuAmpcorParameter *param_, + GDALImage *reference_, GDALImage *secondary_, + cuArrays *offsetImage_, cuArrays *snrImage_, + cuArrays *covImage_, cuArrays *peakValueImage_, + cudaStream_t stream_); + // destructor + ~cuAmpcorProcessorROIPAC() override; + + // local methods + void loadReferenceChunk(); + void loadSecondaryChunk(); + // run the given chunk + void run(int, int) override; +}; + + + +#endif //__CUAMPCORPROCESSORROIPAC_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h index 55dde31fe..dac3642f5 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h @@ -78,9 +78,8 @@ void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *r void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); void cuArraysMaxloc2D(cuArrays *images, const int2 start, const int2 range, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetRoomIn, cuArrays *offsetFinal, const int2 initOrigin, const int initFactor, const int2 zoomInOrigin, const int zoomInFactor, cudaStream_t stream); - -void cuDetermineSecondaryExtractOffset(cuArrays *maxLoc, cuArrays *maxLocShift, - int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream); +void cuSubPixelOffset2Pass(cuArrays *offsetInit, cuArrays *offsetZoomIn, cuArrays *offsetFinal, int OverSampleRatioZoomin, int OverSampleRatioRaw, int xHalfRangeInit, int yHalfRangeInit, cudaStream_t stream); +void cuDetermineSecondaryExtractOffset(cuArrays *maxLoc, cuArrays *maxLocShift, int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream); //in cuCorrTimeDomain.cu: cross correlation in time domain void cuCorrTimeDomain(cuArrays *templates, cuArrays *images, cuArrays *results, cudaStream_t stream); @@ -93,14 +92,18 @@ void cuArraysElementMultiplyConjugate(cuArrays *image1, cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *maxloc, cudaStream_t stream); +void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream); // implemented in cuCorrNormalization.cu void cuArraysSumSquare(cuArrays *images, cuArrays *imagesSum, cudaStream_t stream); +void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, cuArrays *imagesValidCount, cudaStream_t stream); + // implemented in cuEstimateStats.cu void cuEstimateSnr(cuArrays *corrSum, cuArrays *maxval, cuArrays *snrValue, const int size, cudaStream_t stream); +void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream); // implemented in cuEstimateStats.cu -void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, int templateSize, cuArrays *covValue, cudaStream_t stream); +void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, const int templateSize, const int distance, cuArrays *covValue, cudaStream_t stream); #endif diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu index bd23b41ec..bfce8a685 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArraysCopy.cu @@ -354,6 +354,64 @@ void cuArraysCopyExtractCorr(cuArrays *imagesIn, cuArrays getLastCudaError("cuArraysCopyExtract error"); } +__global__ void cuArraysCopyExtractVaryingOffsetCorr(const real_type *imageIn, const int inNX, const int inNY, + real_type *imageOut, const int outNX, const int outNY, int *imageValid, const int nImages, + const int2 *maxloc) +{ + + // get the image index + int idxImage = blockIdx.z; + + // One thread per out point. Find the coordinates within the current image. + int outx = threadIdx.x + blockDim.x*blockIdx.x; + int outy = threadIdx.y + blockDim.y*blockIdx.y; + + // check whether thread is within output image range + if (outx < outNX && outy < outNY) + { + // Find the corresponding input. + int inx = outx + maxloc[idxImage].x - outNX/2; + int iny = outy + maxloc[idxImage].y - outNY/2; + + // Find the location in flattened array. + int idxOut = ( blockIdx.z * outNX + outx ) * outNY + outy; + int idxIn = ( blockIdx.z * inNX + inx ) * inNY + iny; + + // check whether inside of the input image + if (inx>=0 && iny>=0 && inx *imagesIn, cuArrays *imagesOut, cuArrays *imagesValid, cuArrays *maxloc, cudaStream_t stream) +{ + //assert(imagesIn->height >= imagesOut && inNY >= outNY); + const int nthreads = 16; + + dim3 threadsperblock(nthreads, nthreads,1); + + dim3 blockspergrid(IDIVUP(imagesOut->height,nthreads), IDIVUP(imagesOut->width,nthreads), imagesOut->count); + + cuArraysCopyExtractVaryingOffsetCorr<<>>(imagesIn->devData, imagesIn->height, imagesIn->width, + imagesOut->devData, imagesOut->height, imagesOut->width, imagesValid->devData, imagesOut->count, maxloc->devData); + getLastCudaError("cuArraysCopyExtract error"); +} + + // end of correlation surface extraction (Minyan Zhong) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu index 43aabd3ca..c38a38dc0 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu @@ -184,6 +184,59 @@ void cuArraysSumSquare(cuArrays *images, cuArrays *imagesS getLastCudaError("cuArraysSumValue kernel error\n"); } +// cuda kernel to compute summation on extracted correlation surface (Minyan) +template +__global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *imagesSum, int *imagesValidCount, int imageSize, int nImages) +{ + __shared__ float shmem[Nthreads]; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + + if (bid >= nImages) return; + + const int imageIdx = bid; + const int imageOffset = imageIdx * imageSize; + float* imageD = images + imageOffset; + int* imageValidD = imagesValid + imageOffset; + + float sum = 0.0f; + int count = 0; + + for (int i = tid; i < imageSize; i += Nthreads) { + sum += imageD[i] * imageD[i]; + count += imageValidD[i]; + } + + sum = sumReduceBlock(sum, shmem); + count = sumReduceBlock(count, shmem); + + if(tid ==0) { + imagesSum[bid] = sum; + imagesValidCount[bid] = count; + } +} + +/** + * Compute the variance of images (for SNR) + * @param[in] images Input images + * @param[in] imagesValid validity flags for each pixel + * @param[out] imagesSum variance + * @param[out] imagesValidCount count of total valid pixels + * @param[in] stream cudaStream + */ +void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, + cuArrays *imagesValidCount, cudaStream_t stream) +{ + const dim3 grid(images->count, 1, 1); + const int imageSize = images->width*images->height; + + cuArraysSumCorr_kernel <<>>(images->devData, imagesValid->devData, + imagesSum->devData, imagesValidCount->devData, imageSize, images->count); + getLastCudaError("cuArraysSumValueCorr kernel error\n"); +} + + // intra-block inclusive prefix sum template __device__ void inclusive_prefix_sum(real_type sum, volatile real_type *shmem) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu index 7a2a9e5b2..cb6013fed 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu @@ -49,9 +49,37 @@ void cuEstimateSnr(cuArrays *corrSum, cuArrays *maxval, cu getLastCudaError("cuda kernel estimate stats error\n"); } +__global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrValidCount, const float* maxval, float* snrValue, const int size) + +{ + int idx = threadIdx.x + blockDim.x*blockIdx.x; + + if (idx >= size) return; + + float mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1); + + snrValue[idx] = maxval[idx] * maxval[idx] / mean; +} + +/** + * Estimate the signal to noise ratio (SNR) of the correlation surface + * @param[in] corrSum the sum of the correlation surface + * @param[in] corrValidCount the number of valid pixels contributing to sum + * @param[out] snrValue return snr value + * @param[in] stream cuda stream + */ +void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream) +{ + + int size = corrSum->getSize(); + cudaKernel_estimateSnr<<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>> + (corrSum->devData, corrValidCount->devData, maxval->devData, snrValue->devData, size); + getLastCudaError("cuda kernel estimate stats error\n"); +} + // cuda kernel for cuEstimateVariance __global__ void cudaKernel_estimateVar(const real_type* corrBatchRaw, const int NX, const int NY, const int2* maxloc, - const real_type* maxval, const int templateSize, real3_type* covValue, const int size) + const real_type* maxval, const int templateSize, const int distance, real3_type* covValue, const int size) { // Find image id. @@ -65,22 +93,22 @@ __global__ void cudaKernel_estimateVar(const real_type* corrBatchRaw, const int real_type peak = maxval[idxImage]; // Check if maxval is on the margin. - if (px-1 < 0 || py-1 <0 || px + 1 >=NX || py+1 >=NY) { + if (px-distance < 0 || py-distance <0 || px + distance >=NX || py+distance >=NY) { covValue[idxImage] = make_real3(99.0, 99.0, 0.0); } else { int offset = NX * NY * idxImage; - int idx00 = offset + (px - 1) * NY + py - 1; - int idx01 = offset + (px - 1) * NY + py ; - int idx02 = offset + (px - 1) * NY + py + 1; - int idx10 = offset + (px ) * NY + py - 1; + int idx00 = offset + (px - distance) * NY + py - distance; + int idx01 = offset + (px - distance) * NY + py ; + int idx02 = offset + (px - distance) * NY + py + distance; + int idx10 = offset + (px ) * NY + py - distance; int idx11 = offset + (px ) * NY + py ; - int idx12 = offset + (px ) * NY + py + 1; - int idx20 = offset + (px + 1) * NY + py - 1; - int idx21 = offset + (px + 1) * NY + py ; - int idx22 = offset + (px + 1) * NY + py + 1; + int idx12 = offset + (px ) * NY + py + distance; + int idx20 = offset + (px + distance) * NY + py - distance; + int idx21 = offset + (px + distance) * NY + py ; + int idx22 = offset + (px + distance) * NY + py + distance; // second-order derivatives real_type dxx = - ( corrBatchRaw[idx21] + corrBatchRaw[idx01] - 2.0*corrBatchRaw[idx11] ); @@ -116,18 +144,19 @@ __global__ void cudaKernel_estimateVar(const real_type* corrBatchRaw, const int /** * Estimate the variance of the correlation surface * @param[in] templateSize size of reference chip + * @param[in] distance distance between a pixel * @param[in] corrBatchRaw correlation surface * @param[in] maxloc maximum location * @param[in] maxval maximum value * @param[out] covValue variance value * @param[in] stream cuda stream */ -void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, int templateSize, cuArrays *covValue, cudaStream_t stream) +void cuEstimateVariance(cuArrays *corrBatchRaw, cuArrays *maxloc, cuArrays *maxval, const int templateSize, const int distance, cuArrays *covValue, cudaStream_t stream) { int size = corrBatchRaw->count; // One dimensional launching parameters to loop over every correlation surface. cudaKernel_estimateVar<<< IDIVUP(size, NTHREADS), NTHREADS, 0, stream>>> - (corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, templateSize, covValue->devData, size); + (corrBatchRaw->devData, corrBatchRaw->height, corrBatchRaw->width, maxloc->devData, maxval->devData, templateSize, distance, covValue->devData, size); getLastCudaError("cudaKernel_estimateVar error\n"); } //end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu index a93c9bb05..b03eaa333 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu @@ -176,6 +176,60 @@ void cuSubPixelOffset(cuArrays *offsetInit, } +// cuda kernel for cuSubPixelOffset +__global__ void cuSubPixelOffset2Pass_kernel(const int2 *offsetInit, const int2 *offsetZoomIn, + float2 *offsetFinal, + const float OSratio, + const float xoffset, const float yoffset, const int size) +{ + int idx = threadIdx.x + blockDim.x*blockIdx.x; + if (idx >= size) return; + offsetFinal[idx].x = OSratio*(offsetZoomIn[idx].x ) + offsetInit[idx].x - xoffset; + offsetFinal[idx].y = OSratio*(offsetZoomIn[idx].y ) + offsetInit[idx].y - yoffset; +} + + +/** + * Determine the final offset value + * @param[in] offsetInit max location (adjusted to the starting location for extraction) determined from + * the cross-correlation before oversampling, in dimensions of pixel + * @param[in] offsetZoomIn max location from the oversampled cross-correlation surface + * @param[out] offsetFinal the combined offset value + * @param[in] OversampleRatioZoomIn the correlation surface oversampling factor + * @param[in] OversampleRatioRaw the oversampling factor of reference/secondary windows before cross-correlation + * @param[in] xHalfRangInit the original half search range along x, to be subtracted + * @param[in] yHalfRangInit the original half search range along y, to be subtracted + * + * 1. Cross-correlation is performed at first for the un-oversampled data with a larger search range. + * The secondary window is then extracted to a smaller size (a smaller search range) around the max location. + * The extraction starting location (offsetInit) - original half search range (xHalfRangeInit, yHalfRangeInit) + * = pixel size offset + * 2. Reference/secondary windows are then oversampled by OversampleRatioRaw, and cross-correlated. + * 3. The correlation surface is further oversampled by OversampleRatioZoomIn. + * The overall oversampling factor is OversampleRatioZoomIn*OversampleRatioRaw. + * The max location in oversampled correlation surface (offsetZoomIn) / overall oversampling factor + * = subpixel offset + * Final offset = pixel size offset + subpixel offset + */ +void cuSubPixelOffset2Pass(cuArrays *offsetInit, cuArrays *offsetZoomIn, + cuArrays *offsetFinal, + int OverSampleRatioZoomin, int OverSampleRatioRaw, + int xHalfRangeInit, int yHalfRangeInit, + cudaStream_t stream) +{ + int size = offsetInit->getSize(); + float OSratio = 1.0f/(float)(OverSampleRatioZoomin*OverSampleRatioRaw); + float xoffset = xHalfRangeInit ; + float yoffset = yHalfRangeInit ; + + cuSubPixelOffset2Pass_kernel<<>> + (offsetInit->devData, offsetZoomIn->devData, + offsetFinal->devData, OSratio, xoffset, yoffset, size); + getLastCudaError("cuSubPixelOffset2Pass_kernel"); + +} + + // cuda device function to compute the shift of center static inline __device__ int2 dev_adjustOffset( const int oldRange, const int newRange, const int maxloc) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu index 03eaffa40..cc32aedb2 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.cu @@ -24,7 +24,7 @@ cuSincOverSamplerR2R::cuSincOverSamplerR2R(const int i_covs_, cudaStream_t strea stream = stream_; i_intplength = int(r_relfiltlen/r_beta+0.5f); i_filtercoef = i_intplength*i_decfactor; - checkCudaErrors(cudaMalloc((void **)&r_filter, (i_filtercoef+1)*sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&r_filter, (i_filtercoef+1)*sizeof(real_type))); cuSetupSincKernel(); } @@ -35,16 +35,16 @@ cuSincOverSamplerR2R::~cuSincOverSamplerR2R() } // cuda kernel for cuSetupSincKernel -__global__ void cuSetupSincKernel_kernel(float *r_filter_, const int i_filtercoef_, - const float r_soff_, const float r_wgthgt_, const int i_weight_, - const float r_soff_inverse_, const float r_beta_, const float r_decfactor_inverse_) +__global__ void cuSetupSincKernel_kernel(real_type *r_filter_, const int i_filtercoef_, + const real_type r_soff_, const real_type r_wgthgt_, const int i_weight_, + const real_type r_soff_inverse_, const real_type r_beta_, const real_type r_decfactor_inverse_) { int i = threadIdx.x + blockDim.x*blockIdx.x; if(i > i_filtercoef_) return; - float r_wa = i - r_soff_; - float r_wgt = (1.0f - r_wgthgt_) + r_wgthgt_*cos(PI*r_wa*r_soff_inverse_); - float r_s = r_wa*r_beta_*r_decfactor_inverse_*PI; - float r_fct; + real_type r_wa = i - r_soff_; + real_type r_wgt = (1.0f - r_wgthgt_) + r_wgthgt_*cos(PI*r_wa*r_soff_inverse_); + real_type r_s = r_wa*r_beta_*r_decfactor_inverse_*PI; + real_type r_fct; if(r_s != 0.0) { r_fct = sin(r_s)/r_s; } @@ -68,10 +68,10 @@ void cuSincOverSamplerR2R::cuSetupSincKernel() const int nblocks = IDIVUP(i_filtercoef+1, nthreads); // compute some commonly used constants at first - float r_wgthgt = (1.0f - r_pedestal)/2.0f; - float r_soff = (i_filtercoef-1.0f)/2.0f; - float r_soff_inverse = 1.0f/r_soff; - float r_decfactor_inverse = 1.0f/i_decfactor; + real_type r_wgthgt = (1.0f - r_pedestal)/2.0f; + real_type r_soff = (i_filtercoef-1.0f)/2.0f; + real_type r_soff_inverse = 1.0f/r_soff; + real_type r_decfactor_inverse = 1.0f/i_decfactor; cuSetupSincKernel_kernel<<>> ( r_filter, i_filtercoef, r_soff, r_wgthgt, i_weight, @@ -81,11 +81,12 @@ void cuSincOverSamplerR2R::cuSetupSincKernel() // cuda kernel for cuSincOverSamplerR2R::execute -__global__ void cuSincInterpolation_kernel(const int nImages, - const float * imagesIn, const int inNX, const int inNY, - float * imagesOut, const int outNX, const int outNY, - int2 *centerShift, int factor, - const float * r_filter_, const int i_covs_, const int i_decfactor_, const int i_intplength_, +// define a device function to overload the cases with constant or varying center shifts +__device__ void cuSincInterpolation_kernel_common(const int nImages, + const real_type * imagesIn, const int inNX, const int inNY, + real_type * imagesOut, const int outNX, const int outNY, + int2 centerShift, int factor, + const real_type * r_filter_, const int i_covs_, const int i_decfactor_, const int i_intplength_, const int i_startX, const int i_startY, const int i_int_size) { // get image index @@ -95,35 +96,33 @@ __global__ void cuSincInterpolation_kernel(const int nImages, int idxY = threadIdx.y + blockDim.y*blockIdx.y; // cuda: to make sure extra allocated threads doing nothing if(idxImage >=nImages || idxX >= i_int_size || idxY >= i_int_size) return; - // decide the center shift - int2 shift = centerShift[idxImage]; // determine the output pixel indices - int outx = idxX + i_startX + shift.x*factor; + int outx = idxX + i_startX + centerShift.x*factor; if (outx >= outNX) outx-=outNX; - int outy = idxY + i_startY + shift.y*factor; + int outy = idxY + i_startY + centerShift.y*factor; if (outy >= outNY) outy-=outNY; // flattened to 1d int idxOut = idxImage*outNX*outNY + outx*outNY + outy; // index in input grids - float r_xout = (float)outx/i_covs_; + real_type r_xout = (real_type)outx/i_covs_; // integer part int i_xout = int(r_xout); // factional part - float r_xfrac = r_xout - i_xout; + real_type r_xfrac = r_xout - i_xout; // fractional part in terms of the interpolation kernel grids int i_xfrac = int(r_xfrac*i_decfactor_); // same procedure for y - float r_yout = (float)outy/i_covs_; + real_type r_yout = (real_type)outy/i_covs_; int i_yout = int(r_yout); - float r_yfrac = r_yout - i_yout; + real_type r_yfrac = r_yout - i_yout; int i_yfrac = int(r_yfrac*i_decfactor_); // temp variables - float intpData = 0.0; // interpolated value - float r_sincwgt = 0.0; // total filter weight - float r_sinc_coef; // filter weight + real_type intpData = 0.0; // interpolated value + real_type r_sincwgt = 0.0; // total filter weight + real_type r_sinc_coef; // filter weight // iterate over lines of input image // i=0 -> -i_intplength/2 @@ -135,7 +134,7 @@ __global__ void cuSincInterpolation_kernel(const int nImages, if(inx < 0) inx+= inNX; if(inx >= inNX) inx-= inNY; - float r_xsinc_coef = r_filter_[i*i_decfactor_+i_xfrac]; + real_type r_xsinc_coef = r_filter_[i*i_decfactor_+i_xfrac]; for(int j=0; j< i_intplength_; j++) { // find the corresponding pixel in input(unsampled) image @@ -143,7 +142,7 @@ __global__ void cuSincInterpolation_kernel(const int nImages, if(iny < 0) iny += inNY; if(iny >= inNY) iny -= inNY; - float r_ysinc_coef = r_filter_[j*i_decfactor_+i_yfrac]; + real_type r_ysinc_coef = r_filter_[j*i_decfactor_+i_yfrac]; // multiply the factors from xy r_sinc_coef = r_xsinc_coef*r_ysinc_coef; // add to total sinc weight @@ -156,6 +155,36 @@ __global__ void cuSincInterpolation_kernel(const int nImages, imagesOut[idxOut] = intpData/r_sincwgt; } +//constant shift +__global__ void cuSincInterpolation_kernel(const int nImages, + const real_type * imagesIn, const int inNX, const int inNY, + real_type * imagesOut, const int outNX, const int outNY, + int2 centerShift, int factor, + const real_type * r_filter_, const int i_covs_, const int i_decfactor_, const int i_intplength_, + const int i_startX, const int i_startY, const int i_int_size) + { + // pass through + cuSincInterpolation_kernel_common(nImages, imagesIn, inNX, inNY, + imagesOut, outNX, outNY, centerShift, factor, + r_filter_, i_covs_, i_decfactor_, i_intplength_, + i_startX, i_startY, i_int_size); + } + +__global__ void cuSincInterpolation_kernel(const int nImages, + const real_type * imagesIn, const int inNX, const int inNY, + real_type * imagesOut, const int outNX, const int outNY, + int2 *centerShift, int factor, + const real_type * r_filter_, const int i_covs_, const int i_decfactor_, const int i_intplength_, + const int i_startX, const int i_startY, const int i_int_size) + { + // get image index + int idxImage = blockIdx.z; + cuSincInterpolation_kernel_common(nImages, imagesIn, inNX, inNY, + imagesOut, outNX, outNY, centerShift[idxImage], factor, + r_filter_, i_covs_, i_decfactor_, i_intplength_, + i_startX, i_startY, i_int_size); + } + /** * Execute sinc interpolation * @param[in] imagesIn input images @@ -165,7 +194,7 @@ __global__ void cuSincInterpolation_kernel(const int nImages, * @note rawOversamplingFactor is for the centerShift, not the signal oversampling factor */ -void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut, +void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *centerShift, int rawOversamplingFactor) { const int nImages = imagesIn->count; @@ -194,4 +223,33 @@ void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *i getLastCudaError("cuSincInterpolation_kernel"); } +void cuSincOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *imagesOut, + int2 centerShift, int rawOversamplingFactor) +{ + const int nImages = imagesIn->count; + const int inNX = imagesIn->height; + const int inNY = imagesIn->width; + const int outNX = imagesOut->height; + const int outNY = imagesOut->width; + + // only compute the overampled signals within a window + const int i_int_range = i_sincwindow * i_covs; + // set the start pixel, will be shifted by centerShift*oversamplingFactor (from raw image) + const int i_int_startX = outNX/2 - i_int_range; + const int i_int_startY = outNY/2 - i_int_range; + const int i_int_size = 2*i_int_range + 1; + // preset all pixels in out image to 0 + imagesOut->setZero(stream); + + static const int nthreads = 16; + dim3 threadsperblock(nthreads, nthreads, 1); + dim3 blockspergrid (IDIVUP(i_int_size, nthreads), IDIVUP(i_int_size, nthreads), nImages); + cuSincInterpolation_kernel<<>>(nImages, + imagesIn->devData, inNX, inNY, + imagesOut->devData, outNX, outNY, + centerShift, rawOversamplingFactor, + r_filter, i_covs, i_decfactor, i_intplength, i_int_startX, i_int_startY, i_int_size); + getLastCudaError("cuSincInterpolation_kernel"); +} + // end of file \ No newline at end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h index b2ddc5849..f0ca5ae81 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h @@ -23,6 +23,7 @@ // dependencites #include "cuArrays.h" #include "cudaUtil.h" +#include "data_types.h" #ifndef PI #define PI 3.14159265359f @@ -34,9 +35,9 @@ class cuSincOverSamplerR2R static const int i_sincwindow = 2; ///< the oversampling is only performed within \pm i_sincwindow*i_covs around the peak static const int i_weight = 1; ///< weight for cos() pedestal - const float r_pedestal = 0.0; ///< height of pedestal - const float r_beta = 0.75f; ///< a low-band pass - const float r_relfiltlen = 6.0f; ///< relative filter length + const real_type r_pedestal = 0.0; ///< height of pedestal + const real_type r_beta = 0.75f; ///< a low-band pass + const real_type r_relfiltlen = 6.0f; ///< relative filter length static const int i_decfactor = 4096; ///< max decimals between original grid to set up the sinc kernel @@ -54,7 +55,9 @@ class cuSincOverSamplerR2R // set up sinc interpolation coefficients void cuSetupSincKernel(); // execute interface - void execute(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *center, int oversamplingFactor); + void execute(cuArrays *imagesIn, cuArrays *imagesOut, cuArrays *center, int oversamplingFactor); + void execute(cuArrays *imagesIn, cuArrays *imagesOut, int2 center, int oversamplingFactor); + // destructor ~cuSincOverSamplerR2R(); }; From 78f80805d4bbf5e34601f2c5fbdb30a4197e55fd Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Tue, 21 Jan 2025 20:24:19 +0000 Subject: [PATCH 03/11] Rename corrImage -> peakValueImage parameter --- .../extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp | 1 - python/packages/nisar/workflows/dense_offsets.py | 2 +- python/packages/nisar/workflows/offsets_product.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp index d2a9615a1..aa9d47077 100644 --- a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp +++ b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp @@ -51,7 +51,6 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM(int, mergeGrossOffset) .DEF_PARAM(str, snrImageName) .DEF_PARAM(str, covImageName) - .DEF_PARAM(str, corrImageName) .DEF_PARAM(str, peakValueImageName) .DEF_PARAM(int, rawDataOversamplingFactor) diff --git a/python/packages/nisar/workflows/dense_offsets.py b/python/packages/nisar/workflows/dense_offsets.py index e897b19cd..33d583b2a 100644 --- a/python/packages/nisar/workflows/dense_offsets.py +++ b/python/packages/nisar/workflows/dense_offsets.py @@ -101,7 +101,7 @@ def run(cfg: dict): ampcor.grossOffsetImageName = str(out_dir / 'gross_offset') ampcor.snrImageName = str(out_dir / 'snr') ampcor.covImageName = str(out_dir / 'covariance') - ampcor.corrImageName = str(out_dir / 'correlation_peak') + ampcor.peakValueImageName = str(out_dir / 'correlation_peak') # Create empty ENVI datasets. PyCuAmpcor will overwrite the # binary files. Note, use gdal to pass interleave option diff --git a/python/packages/nisar/workflows/offsets_product.py b/python/packages/nisar/workflows/offsets_product.py index 7eb293b6c..ac5a845a8 100644 --- a/python/packages/nisar/workflows/offsets_product.py +++ b/python/packages/nisar/workflows/offsets_product.py @@ -158,7 +158,7 @@ def run(cfg: dict, output_hdf5: str = None): layer_scratch_path / 'gross_offset') ampcor.snrImageName = str(layer_scratch_path / 'snr') ampcor.covImageName = str(layer_scratch_path / 'covariance') - ampcor.corrImageName = str(layer_scratch_path/ 'correlation_peak') + ampcor.peakValueImageName = str(layer_scratch_path/ 'correlation_peak') create_empty_dataset(str(layer_scratch_path / 'dense_offsets'), ampcor.numberWindowAcross, From ad755887f58ac367cf0cff1e2d579e93a1fc5d0c Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Tue, 21 Jan 2025 21:03:01 +0000 Subject: [PATCH 04/11] Expose pycuampcor workflow parameter to python --- python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp index aa9d47077..acd55e09b 100644 --- a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp +++ b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp @@ -32,6 +32,7 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM(int, deviceID) .DEF_PARAM(int, nStreams) .DEF_PARAM(int, derampMethod) + .DEF_PARAM(int, workflow) .DEF_PARAM(str, referenceImageName) .DEF_PARAM(int, referenceImageHeight) From 6e87cd7d9c769d2aabf692c596f950ac4ae72601 Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Tue, 21 Jan 2025 22:37:05 +0000 Subject: [PATCH 05/11] Improve error message output for cudaError_t --- .../matchtemplate/pycuampcor/cudaError.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp index 183fa4cc0..01585ab63 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp @@ -15,8 +15,20 @@ #endif #endif -template -void check(T result, char const *const func, const char *const file, int const line) +template<> +void check(cudaError_t result, char const *const func, const char *const file, int const line) +{ + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s: %s) \n", + file, line, static_cast(result), func, cudaGetErrorString(result)); + DEVICE_RESET + // Make sure we call CUDA Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +template<> +void check(cufftResult_t result, char const *const func, const char *const file, int const line) { if (result) { fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", @@ -27,8 +39,6 @@ void check(T result, char const *const func, const char *const file, int const l } } -template void check(cudaError_t, char const *const, const char *const, int const); -template void check(cufftResult_t, char const *const, const char *const, int const); void __getLastCudaError(const char *errorMessage, const char *file, const int line) { From 70a2552f6ef7609ac309e86e3e96db6f3f9555a8 Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Mon, 17 Feb 2025 17:30:39 -0800 Subject: [PATCH 06/11] Include Lijun's latest pycuampcor-v2 changes * Rename roipac/grimp files to onepass/twopass * Use custom mmap image loader --- .../matchtemplate/pycuampcor/GDALImage.cpp | 164 ------------------ .../cuda/matchtemplate/pycuampcor/GDALImage.h | 84 --------- .../matchtemplate/pycuampcor/PyCuAmpcor.cpp | 6 +- .../matchtemplate/pycuampcor/SlcImage.cpp | 95 ++++++++++ .../cuda/matchtemplate/pycuampcor/SlcImage.h | 38 ++++ .../pycuampcor/cuAmpcorController.cpp | 13 +- .../pycuampcor/cuAmpcorParameter.cpp | 20 ++- .../pycuampcor/cuAmpcorParameter.h | 20 +-- .../pycuampcor/cuAmpcorProcessor.cpp | 13 +- .../pycuampcor/cuAmpcorProcessor.h | 11 +- ...GrIMP.cpp => cuAmpcorProcessorOnePass.cpp} | 34 ++-- ...ssorGrIMP.h => cuAmpcorProcessorOnePass.h} | 18 +- ...OIPAC.cpp => cuAmpcorProcessorTwoPass.cpp} | 38 ++-- ...sorROIPAC.h => cuAmpcorProcessorTwoPass.h} | 18 +- .../cuda/matchtemplate/pycuampcor/cuDeramp.cu | 12 +- .../matchtemplate/pycuampcor/cudaError.cpp | 18 +- .../cuda/matchtemplate/pycuampcor.cpp | 2 + 17 files changed, 246 insertions(+), 358 deletions(-) delete mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cpp delete mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h create mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.cpp create mode 100644 cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.h rename cxx/isce3/cuda/matchtemplate/pycuampcor/{cuAmpcorProcessorGrIMP.cpp => cuAmpcorProcessorOnePass.cpp} (94%) rename cxx/isce3/cuda/matchtemplate/pycuampcor/{cuAmpcorProcessorGrIMP.h => cuAmpcorProcessorOnePass.h} (85%) rename cxx/isce3/cuda/matchtemplate/pycuampcor/{cuAmpcorProcessorROIPAC.cpp => cuAmpcorProcessorTwoPass.cpp} (95%) rename cxx/isce3/cuda/matchtemplate/pycuampcor/{cuAmpcorProcessorROIPAC.h => cuAmpcorProcessorTwoPass.h} (85%) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cpp deleted file mode 100644 index aa0c30e92..000000000 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/** - * @file GDALImage.h - * @brief Implementations of GDALImage class - * - */ - -// my declaration -#include "GDALImage.h" - -// dependencies -#include -#include -#include "cudaError.h" - -/** - * Constructor - * @brief Create a GDAL image object - * @param filename a std::string with the raster image file name - * @param band the band number - * @param cacheSizeInGB read buffer size in GigaBytes - * @param useMmap whether to use memory map - */ -GDALImage::GDALImage(std::string filename, int band, int cacheSizeInGB, int useMmap) - : _useMmap(useMmap) -{ - // open the file as dataset - _poDataset = (GDALDataset *) GDALOpen(filename.c_str(), GA_ReadOnly); - // if something is wrong, throw an exception - // GDAL reports the error message - if(!_poDataset) - throw; - - // check the band info - int count = _poDataset->GetRasterCount(); - if(band > count) - { - std::cout << "The desired band " << band << " is greater than " << count << " bands available"; - throw; - } - - // get the desired band - _poBand = _poDataset->GetRasterBand(band); - if(!_poBand) - throw; - - // get the width(x), and height(y) - _width = _poBand->GetXSize(); - _height = _poBand->GetYSize(); - - _dataType = _poBand->GetRasterDataType(); - // determine the image type - _isComplex = GDALDataTypeIsComplex(_dataType); - // determine the pixel size in bytes - _pixelSize = GDALGetDataTypeSize(_dataType); - - _bufferSize = 1024*1024*1024*cacheSizeInGB; - - // checking whether using memory map - if(_useMmap) { - - char **papszOptions = NULL; - // if cacheSizeInGB = 0, use default - // else set the option - if(cacheSizeInGB > 0) - papszOptions = CSLSetNameValue( papszOptions, - "CACHE_SIZE", - std::to_string(_bufferSize).c_str()); - - // space between two lines - GIntBig pnLineSpace; - // set up the virtual mem buffer - _poBandVirtualMem = GDALGetVirtualMemAuto( - static_cast(_poBand), - GF_Read, - &_pixelSize, - &pnLineSpace, - papszOptions); - if(!_poBandVirtualMem) - throw; - - // get the starting pointer - _memPtr = CPLVirtualMemGetAddr(_poBandVirtualMem); - } - else { // use a buffer - checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize)); - } - // make sure memPtr is not Null - if (!_memPtr) - { - std::cout << "unable to locate the memory buffer\n"; - throw; - } - // all done -} - - -/** - * Load a tile of data h_tile x w_tile from CPU to GPU - * @param dArray pointer for array in device memory - * @param h_offset Down/Height offset - * @param w_offset Across/Width offset - * @param h_tile Down/Height tile size - * @param w_tile Across/Width tile size - * @param stream CUDA stream for copying - * @note Need to use size_t type to pass the parameters to cudaMemcpy2D correctly - */ -void GDALImage::loadToDevice(void *dArray, size_t h_offset, size_t w_offset, - size_t h_tile, size_t w_tile, cudaStream_t stream) -{ - - size_t tileStartOffset = (h_offset*_width + w_offset)*_pixelSize; - - char * startPtr = (char *)_memPtr ; - startPtr += tileStartOffset; - - if (_useMmap) { - // direct copy from memory map buffer to device memory - checkCudaErrors(cudaMemcpy2DAsync(dArray, // dst - w_tile*_pixelSize, // dst pitch - startPtr, // src - _width*_pixelSize, // src pitch - w_tile*_pixelSize, // width in Bytes - h_tile, // height - cudaMemcpyHostToDevice,stream)); - } - else { // use a cpu buffer to load image data to gpu - - // get the total tile size in bytes - size_t tileSize = h_tile*w_tile*_pixelSize; - // if the size is bigger than existing buffer, reallocate - if (tileSize > _bufferSize) { - // TODO: fit the pagesize - _bufferSize = tileSize; - checkCudaErrors(cudaFree(_memPtr)); - checkCudaErrors(cudaMallocHost((void **)&_memPtr, _bufferSize)); - } - // copy from file to buffer - CPLErr err = _poBand->RasterIO(GF_Read, //eRWFlag - w_offset, h_offset, //nXOff, nYOff - w_tile, h_tile, // nXSize, nYSize - _memPtr, // pData - w_tile*h_tile, 1, // nBufXSize, nBufYSize - _dataType, //eBufType - 0, 0 //nPixelSpace, nLineSpace in pData - ); - if(err != CE_None) - throw; // throw if reading error occurs; message reported by GDAL - - // copy from buffer to gpu - checkCudaErrors(cudaMemcpyAsync(dArray, _memPtr, tileSize, cudaMemcpyHostToDevice, stream)); - } - // all done -} - -/// destructor -GDALImage::~GDALImage() -{ - // free the virtual memory - CPLVirtualMemFree(_poBandVirtualMem), - // free the GDAL Dataset, close the file - delete _poDataset; -} - -// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h deleted file mode 100644 index 6ce0a2d6e..000000000 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/GDALImage.h +++ /dev/null @@ -1,84 +0,0 @@ -/** - * @file GDALImage.h - * @brief Interface with GDAL vrt driver - * - * To read image file with the GDAL vrt driver, including SLC, GeoTIFF images - * @warning Only single precision images are supported: complex(pixelOffset=8) or real(pixelOffset=4). - * @warning Only single band file is currently supported. - */ - -// code guard -#ifndef __GDALIMAGE_H -#define __GDALIMAGE_H - -// dependencies -#include -#include -#include -#include - - -class GDALImage{ -public: - // specify the types - using size_t = std::size_t; - -private: - int _height; ///< image height - int _width; ///< image width - - void * _memPtr = NULL; ///< pointer to buffer - - int _pixelSize; ///< pixel size in bytes - - int _isComplex; ///< whether the image is complex - - size_t _bufferSize; ///< buffer size - int _useMmap; ///< whether to use memory map - - // GDAL temporary objects - GDALDataType _dataType; - CPLVirtualMem * _poBandVirtualMem = NULL; - GDALDataset * _poDataset = NULL; - GDALRasterBand * _poBand = NULL; - -public: - //disable default constructor - GDALImage() = delete; - // constructor - GDALImage(std::string fn, int band=1, int cacheSizeInGB=0, int useMmap=1); - // destructor - ~GDALImage(); - - // get class properties - void * getmemPtr() - { - return(_memPtr); - } - - int getHeight() { - return (_height); - } - - int getWidth() - { - return (_width); - } - - int getPixelSize() - { - return _pixelSize; - } - - bool isComplex() - { - return _isComplex; - } - - // load data from cpu buffer to gpu - void loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream); - -}; - -#endif //__GDALIMAGE_H -// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp index 92334d671..d9ebfb84f 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/PyCuAmpcor.cpp @@ -37,9 +37,11 @@ PYBIND11_MODULE(PyCuAmpcor, m) .DEF_PARAM(str, referenceImageName) .DEF_PARAM(int, referenceImageHeight) .DEF_PARAM(int, referenceImageWidth) + .DEF_PARAM(int, referenceImageDataType) .DEF_PARAM(str, secondaryImageName) .DEF_PARAM(int, secondaryImageHeight) .DEF_PARAM(int, secondaryImageWidth) + .DEF_PARAM(int, secondaryImageDataType) .DEF_PARAM(int, numberWindowDown) .DEF_PARAM(int, numberWindowAcross) @@ -68,8 +70,8 @@ PYBIND11_MODULE(PyCuAmpcor, m) .DEF_PARAM_RENAME(int, referenceStartPixelAcrossStatic, referenceStartPixelAcross0) .DEF_PARAM_RENAME(int, referenceStartPixelDownStatic, referenceStartPixelDown0) - .DEF_PARAM_RENAME(int, corrSurfaceOverSamplingMethod, oversamplingMethod) - .DEF_PARAM_RENAME(int, corrSurfaceOverSamplingFactor, oversamplingFactor) + .DEF_PARAM(int, corrSurfaceOverSamplingMethod) + .DEF_PARAM(int, corrSurfaceOverSamplingFactor) .DEF_PARAM_RENAME(int, mmapSize, mmapSizeInGB) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.cpp new file mode 100644 index 000000000..ed4bf14f5 --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.cpp @@ -0,0 +1,95 @@ +#include "SlcImage.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cudaError.h" +#include +#include + +SlcImage::SlcImage(const std::string& filepath, size_t img_height, size_t img_width, size_t pixel_size, size_t buffer_size) + : width(img_width), height(img_height), pixel_size(pixel_size), fd(-1), mapped_data(nullptr), + mapped_offset(0), mapped_size(0) +{ + file_size = width * height * pixel_size; + max_map_size = buffer_size*1024*1024*1024; + page_size = sysconf(_SC_PAGE_SIZE); // Get system page size + + // Open the file + fd = open(filepath.c_str(), O_RDONLY); + if (fd == -1) { + throw std::runtime_error("Failed to open file: " + filepath); + } +} + +void SlcImage::remapIfNeeded(size_t required_start, size_t required_end) +{ + + if(required_start < mapped_offset || required_end > mapped_offset + mapped_size) + { + // out of range, remap + // unmap first, if neccesary + if(mapped_data!=nullptr) + munmap(mapped_data, mapped_size); + // align new mapping offset + // round to the page size + mapped_offset = (required_start/page_size)*page_size; + // compute the mapped size + mapped_size = file_size - mapped_offset; + // not to exceed the buffer size + if (mapped_size > max_map_size) { + mapped_size = max_map_size; + } + // remap + mapped_data = mmap(nullptr, mapped_size, PROT_READ, MAP_PRIVATE, fd, mapped_offset); + if (mapped_data == MAP_FAILED) { + throw std::runtime_error("Failed to mmap file at offset " + std::to_string(mapped_offset)); + } + } + // else - in range, do nothing +} + + +/// load a tile of data h_tile x w_tile from CPU (mmap) to GPU +/// @param dArray pointer for array in device memory +/// @param h_offset Down/Height offset +/// @param w_offset Across/Width offset +/// @param h_tile Down/Height tile size +/// @param w_tile Across/Width tile size +/// @param stream CUDA stream for copying +void SlcImage::loadToDevice(void *dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream) +{ + size_t tileStartAddress = (h_offset*width + w_offset)*pixel_size; + size_t tileLastAddress = tileStartAddress + (h_tile*width + w_tile)*pixel_size; + + remapIfNeeded(tileStartAddress, tileLastAddress); + + char *startPtr = (char *)mapped_data ; + startPtr += tileStartAddress - mapped_offset; + + // @note + // We assume down/across directions as rows/cols. Therefore, SLC mmap and device array both use row major. + // cuBlas assumes both source and target arrays are column major. + // To use cublasSetMatrix, we need to switch w_tile/h_tile for rows/cols + // checkCudaErrors(cublasSetMatrixAsync(w_tile, h_tile, pixelsize, startPtr, width, dArray, w_tile, stream)); + + checkCudaErrors(cudaMemcpy2DAsync(dArray, w_tile*pixel_size, startPtr, width*pixel_size, + w_tile*pixel_size, h_tile, cudaMemcpyHostToDevice,stream)); +} + +SlcImage::~SlcImage() +{ + if (mapped_data!=nullptr) { + munmap(mapped_data, mapped_size); + } + if (fd != -1) { + close(fd); + } +} + +// end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.h new file mode 100644 index 000000000..a22650f1e --- /dev/null +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/SlcImage.h @@ -0,0 +1,38 @@ +// -*- c++ -*- +// file slcimage.h +// a mmap loader for slc images + +#ifndef __SLCIMAGE_H +#define __SLCIMAGE_H + +#include +#include + +class SlcImage{ +public: + // disable default constructor + SlcImage()=delete; + // constructor + SlcImage(const std::string& fn, size_t image_height, size_t image_width, size_t pixel_size, size_t buffersize); + // interface + void loadToDevice(void* dArray, size_t h_offset, size_t w_offset, size_t h_tile, size_t w_tile, cudaStream_t stream); + // destructor + ~SlcImage(); + +private: + int fd; + size_t file_size; + size_t pixel_size; + size_t height; + size_t width; + + void* mapped_data; + size_t page_size; + size_t mapped_offset; + size_t mapped_size; + size_t max_map_size; + + void remapIfNeeded(size_t required_start, size_t required_end); +}; + +#endif //__SLCIMAGE_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp index 57a792259..ceb0a6759 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorController.cpp @@ -7,7 +7,7 @@ #include "cuAmpcorController.h" // dependencies -#include "GDALImage.h" +#include "SlcImage.h" #include "cuArrays.h" #include "cudaUtil.h" #include "cuAmpcorProcessor.h" @@ -45,14 +45,15 @@ void cuAmpcorController::runAmpcor() { // set the gpu id param->deviceID = gpuDeviceInit(param->deviceID); - // initialize the gdal driver - GDALAllRegister(); - // reference and secondary images; use band=1 as default + + // reference and secondary images // TODO: selecting band std::cout << "Opening reference image " << param->referenceImageName << "...\n"; - GDALImage *referenceImage = new GDALImage(param->referenceImageName, 1, param->mmapSizeInGB); + SlcImage *referenceImage = new SlcImage(param->referenceImageName, param->referenceImageHeight, param->referenceImageWidth, + param->referenceImageDataType*sizeof(float), param->mmapSizeInGB); std::cout << "Opening secondary image " << param->secondaryImageName << "...\n"; - GDALImage *secondaryImage = new GDALImage(param->secondaryImageName, 1, param->mmapSizeInGB); + SlcImage *secondaryImage = new SlcImage(param->secondaryImageName, param->secondaryImageHeight, param->secondaryImageWidth, + param->secondaryImageDataType*sizeof(float), param->mmapSizeInGB); cuArrays *offsetImage, *offsetImageRun; cuArrays *snrImage, *snrImageRun; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp index 3fbb85222..a06322180 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp @@ -25,7 +25,7 @@ cuAmpcorParameter::cuAmpcorParameter() algorithm = 0; //0 freq; 1 time deviceID = 0; nStreams = 1; - derampMethod = 1; + derampMethod = 0; workflow = 0; windowSizeWidthRaw = 64; @@ -37,15 +37,17 @@ cuAmpcorParameter::cuAmpcorParameter() skipSampleDownRaw = 64; rawDataOversamplingFactor = 2; //antialiasing zoomWindowSize = 16; // correlation surface oversampling zoom - oversamplingFactor = 16; // correlation surface oversampling - oversamplingMethod = 0; + corrSurfaceOverSamplingFactor = 16; // correlation surface oversampling + corrSurfaceOverSamplingMethod = 0; referenceImageName = "reference.slc"; referenceImageWidth = 1000; referenceImageHeight = 1000; + referenceImageDataType = 2; // complex secondaryImageName = "secondary.slc"; secondaryImageWidth = 1000; secondaryImageHeight = 1000; + secondaryImageDataType = 2; // complex offsetImageName = "DenseOffset.bip"; grossOffsetImageName = "GrossOffset.bip"; snrImageName = "snr.bip"; @@ -78,10 +80,10 @@ void cuAmpcorParameter::setupParameters() // switch (workflow) { case 0: - _setupParameters_ROIPAC(); + _setupParameters_TwoPass(); break; case 1: - _setupParameters_GrIMP(); + _setupParameters_OnePass(); break; default: throw std::invalid_argument("Unsupported workflow"); @@ -100,7 +102,7 @@ void cuAmpcorParameter::setupParameters() allocateArrays(); } -void cuAmpcorParameter::_setupParameters_ROIPAC() +void cuAmpcorParameter::_setupParameters_TwoPass() { // Size to extract the raw correlation surface for snr/cov corrRawZoomInHeight = std::min(corrStatWindowSize, 2*halfSearchRangeDownRaw+1); @@ -141,7 +143,7 @@ void cuAmpcorParameter::_setupParameters_ROIPAC() } -void cuAmpcorParameter::_setupParameters_GrIMP() +void cuAmpcorParameter::_setupParameters_OnePass() { // template window size (after antialiasing oversampling) @@ -189,8 +191,8 @@ void cuAmpcorParameter::_setupParameters_GrIMP() std::min(zoomWindowSize+1, corrWindowSize.y)); // oversampled correlation surface size - corrZoomInOversampledSize = make_int2(corrZoomInSize.x * oversamplingFactor, - corrZoomInSize.y * oversamplingFactor); + corrZoomInOversampledSize = make_int2(corrZoomInSize.x * corrSurfaceOverSamplingFactor, + corrZoomInSize.y * corrSurfaceOverSamplingFactor); fprintf(stderr, "zoomWindowSize is (%d, %d)!\n", corrZoomInSize.x, corrZoomInSize.y ); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h index 9dc70f04e..f35ab7606 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h @@ -36,7 +36,7 @@ class cuAmpcorParameter{ int deviceID; ///< Targeted GPU device ID: use -1 to auto select int nStreams; ///< Number of streams to asynchonize data transfers and compute kernels int derampMethod; ///< Method for deramping 0=None, 1=average - int workflow; ///< Workflow 0=ROIPAC, 1=GrIMP + int workflow; ///< Workflow 0: two passes, first pass without antialiasing oversampling, 1: one pass with antialiasing oversampling // chip or window size for raw data int windowSizeHeightRaw; ///< Template window height (original size) @@ -71,8 +71,8 @@ class cuAmpcorParameter{ int zoomWindowSize; ///< Zoom-in window size in correlation surface (same for down and across directions) int halfZoomWindowSizeRaw; ///< half of zoomWindowSize/rawDataOversamplingFactor - int oversamplingFactor; ///< Oversampling factor for interpolating correlation surface - int oversamplingMethod; ///< correlation surface oversampling method 0 = fft (default) 1 = sinc + int corrSurfaceOverSamplingFactor; ///< Oversampling factor for interpolating correlation surface + int corrSurfaceOverSamplingMethod; ///< correlation surface oversampling method 0 = fft (default) 1 = sinc // correlation surface int2 corrWindowSize; // 2*halfSearchRange + 1 @@ -81,9 +81,7 @@ class cuAmpcorParameter{ int corrStatWindowSize; ///< correlation surface size used to estimate snr - float thresholdSNR; ///< Threshold of Signal noise ratio to remove noisy data - - // parameters used in the first pass in ROIPAC workflow + // parameters used in the first pass in two-pass workflow // @TODO move them to workflow specific files int searchWindowSizeHeightRawZoomIn; int searchWindowSizeWidthRawZoomIn; @@ -93,13 +91,13 @@ class cuAmpcorParameter{ //reference image std::string referenceImageName; ///< reference SLC image name - int imageDataType1; ///< reference image data type, 2=cfloat=complex=float2 1=float + int referenceImageDataType; ///< reference image data type, 2=cfloat=complex=float2 1=float int referenceImageHeight; ///< reference image height int referenceImageWidth; ///< reference image width //secondary image std::string secondaryImageName; ///< secondary SLC image name - int imageDataType2; ///< secondary image data type, 2=cfloat=complex=float2 1=float + int secondaryImageDataType; ///< secondary image data type, 2=cfloat=complex=float2 1=float int secondaryImageHeight; ///< secondary image height int secondaryImageWidth; ///< secondary image width @@ -149,7 +147,7 @@ class cuAmpcorParameter{ std::string offsetImageName; ///< Offset fields output filename std::string snrImageName; ///< Output SNR filename std::string covImageName; ///< Output variance filename - std::string peakValueImageName; ///< Output correlation surface peak value filename + std::string peakValueImageName; ///< Output normalized correlation surface peak value filename // Class constructor and default parameters setter cuAmpcorParameter(); @@ -175,8 +173,8 @@ class cuAmpcorParameter{ void checkPixelInImageRange(); // Process other parameters after Python Input void setupParameters(); - void _setupParameters_ROIPAC(); - void _setupParameters_GrIMP(); + void _setupParameters_TwoPass(); + void _setupParameters_OnePass(); }; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.cpp index 26f584305..d96ff8f57 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.cpp @@ -1,23 +1,24 @@ #include "cuAmpcorProcessor.h" -#include "cuAmpcorProcessorROIPAC.h" -#include "cuAmpcorProcessorGrIMP.h" +#include "cuAmpcorProcessorTwoPass.h" +#include "cuAmpcorProcessorOnePass.h" +#include // Factory method implementation // create the batch processor for a given {workflow} std::unique_ptr cuAmpcorProcessor::create(int workflow, cuAmpcorParameter *param_, - GDALImage *reference_, GDALImage *secondary_, + SlcImage *reference_, SlcImage *secondary_, cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, cudaStream_t stream_) { if (workflow == 0) { - return std::unique_ptr(new cuAmpcorProcessorROIPAC( + return std::unique_ptr(new cuAmpcorProcessorTwoPass( param_, reference_, secondary_, offsetImage_, snrImage_, covImage_, peakValueImage_, stream_)); } else if (workflow == 1) { - return std::unique_ptr(new cuAmpcorProcessorGrIMP( + return std::unique_ptr(new cuAmpcorProcessorOnePass( param_, reference_, secondary_, offsetImage_, snrImage_, covImage_, peakValueImage_, stream_)); } else { @@ -27,7 +28,7 @@ std::unique_ptr cuAmpcorProcessor::create(int workflow, // constructor cuAmpcorProcessor::cuAmpcorProcessor(cuAmpcorParameter *param_, - GDALImage *reference_, GDALImage *secondary_, + SlcImage *reference_, SlcImage *secondary_, cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, cudaStream_t stream_) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h index 4d07d1443..2e084a67a 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessor.h @@ -8,7 +8,7 @@ #ifndef __CUAMPCORPROCESSOR_H #define __CUAMPCORPROCESSOR_H -#include "GDALImage.h" +#include "SlcImage.h" #include "data_types.h" #include "cuArrays.h" #include "cuAmpcorParameter.h" @@ -16,6 +16,7 @@ #include "cuSincOverSampler.h" #include "cuCorrFrequency.h" #include "cuCorrNormalizer.h" +#include /** @@ -33,8 +34,8 @@ class cuAmpcorProcessor{ int devId; ///< GPU device ID to use cudaStream_t stream; ///< CUDA stream to use - GDALImage *referenceImage; ///< reference image object - GDALImage *secondaryImage; ///< secondary image object + SlcImage *referenceImage; ///< reference image object + SlcImage *secondaryImage; ///< secondary image object cuAmpcorParameter *param; ///< reference to the (global) parameters cuArrays *offsetImage; ///< output offsets image cuArrays *snrImage; ///< snr image @@ -45,7 +46,7 @@ class cuAmpcorProcessor{ public: // default constructor and destructor cuAmpcorProcessor(cuAmpcorParameter *param_, - GDALImage *reference_, GDALImage *secondary_, + SlcImage *reference_, SlcImage *secondary_, cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, cudaStream_t stream_); @@ -54,7 +55,7 @@ class cuAmpcorProcessor{ // Factory method (virtual constructor) static std::unique_ptr create(int workflow, cuAmpcorParameter *param_, - GDALImage *reference_, GDALImage *secondary_, + SlcImage *reference_, SlcImage *secondary_, cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, cudaStream_t stream_); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp similarity index 94% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.cpp rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp index e66111f30..d7a098e3b 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp @@ -1,4 +1,4 @@ -#include "cuAmpcorProcessorGrIMP.h" +#include "cuAmpcorProcessorOnePass.h" #include "cuAmpcorUtil.h" #include @@ -9,7 +9,7 @@ * @param[in] idxDown_ index of the chunk along Down/Azimuth direction * @param[in] idxAcross_ index of the chunk along Across/Range direction */ -void cuAmpcorProcessorGrIMP::run(int idxDown_, int idxAcross_) +void cuAmpcorProcessorOnePass::run(int idxDown_, int idxAcross_) { // set chunk index setIndex(idxDown_, idxAcross_); @@ -81,7 +81,7 @@ void cuAmpcorProcessorGrIMP::run(int idxDown_, int idxAcross_) // find the maximum location of the correlation surface, in a rectangle area {range} from {start} int extraPadSize = param->halfZoomWindowSizeRaw*param->rawDataOversamplingFactor; int2 start = make_int2(extraPadSize, extraPadSize); - int2 range = make_int2(r_corrBatch->height-extraPadSize, r_corrBatch->width-extraPadSize); + int2 range = make_int2(r_corrBatch->height-2*extraPadSize, r_corrBatch->width-2*extraPadSize); cuArraysMaxloc2D(r_corrBatch, start, range, offsetInit, r_maxval, stream); #ifdef CUAMPCOR_DEBUG @@ -101,7 +101,7 @@ void cuAmpcorProcessorGrIMP::run(int idxDown_, int idxAcross_) // statistics of correlation surface // estimate variance on r_corrBatch - cuEstimateVariance(r_corrBatch, offsetInit, r_maxval, r_referenceBatchOverSampled->size, param->oversamplingFactor, r_covValue, stream); + cuEstimateVariance(r_corrBatch, offsetInit, r_maxval, r_referenceBatchOverSampled->size, param->corrSurfaceOverSamplingFactor, r_covValue, stream); // snr on the extracted surface r_corrBatchZoomIn cuArraysSumSquare(r_corrBatchZoomIn, r_corrBatchSum, stream); @@ -114,11 +114,11 @@ void cuAmpcorProcessorGrIMP::run(int idxDown_, int idxAcross_) #endif // oversample the correlation surface - if(param->oversamplingMethod) { + if(param->corrSurfaceOverSamplingMethod) { // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversamplingfactor // we need the max loc as the center if shifted corrSincOverSampler->execute(r_corrBatchZoomIn, r_corrBatchZoomInOverSampled, - maxLocShift, param->oversamplingFactor*param->rawDataOversamplingFactor + maxLocShift, param->corrSurfaceOverSamplingFactor*param->rawDataOversamplingFactor ); } @@ -144,8 +144,8 @@ void cuAmpcorProcessorGrIMP::run(int idxDown_, int idxAcross_) cuSubPixelOffset(offsetInit, offsetZoomIn, offsetFinal, make_int2(param->corrWindowSize.x/2, param->corrWindowSize.y/2), // init offset origin param->rawDataOversamplingFactor, // init offset factor - make_int2(param->corrZoomInSize.x/2*param->oversamplingFactor, param->corrZoomInSize.y/2*param->oversamplingFactor), - param->rawDataOversamplingFactor*param->oversamplingFactor, + make_int2(param->corrZoomInSize.x/2*param->corrSurfaceOverSamplingFactor, param->corrZoomInSize.y/2*param->corrSurfaceOverSamplingFactor), + param->rawDataOversamplingFactor*param->corrSurfaceOverSamplingFactor, stream); #ifdef CUAMPCOR_DEBUG @@ -167,7 +167,7 @@ void cuAmpcorProcessorGrIMP::run(int idxDown_, int idxAcross_) /// constructor -cuAmpcorProcessorGrIMP::cuAmpcorProcessorGrIMP(cuAmpcorParameter *param_, GDALImage *reference_, GDALImage *secondary_, +cuAmpcorProcessorOnePass::cuAmpcorProcessorOnePass(cuAmpcorParameter *param_, SlcImage *reference_, SlcImage *secondary_, cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, cudaStream_t stream_) : cuAmpcorProcessor(param_, reference_, secondary_, offsetImage_, snrImage_, covImage_, peakValueImage_, stream_) @@ -277,8 +277,8 @@ cuAmpcorProcessorGrIMP::cuAmpcorProcessorGrIMP(cuAmpcorParameter *param_, GDALIm // end of new arrays - if(param->oversamplingMethod) { - corrSincOverSampler = new cuSincOverSamplerR2R(param->oversamplingFactor, stream); + if(param->corrSurfaceOverSamplingMethod) { + corrSincOverSampler = new cuSincOverSamplerR2R(param->corrSurfaceOverSamplingFactor, stream); } else { corrOverSampler= new cuOverSamplerR2R( @@ -308,7 +308,7 @@ cuAmpcorProcessorGrIMP::cuAmpcorProcessorGrIMP(cuAmpcorParameter *param_, GDALIm #endif } -void cuAmpcorProcessorGrIMP::loadReferenceChunk() +void cuAmpcorProcessorOnePass::loadReferenceChunk() { // we first load the whole chunk of image from cpu to a gpu buffer c(r)_referenceChunkRaw @@ -370,7 +370,7 @@ void cuAmpcorProcessorGrIMP::loadReferenceChunk() #endif // check whether the image is complex (e.g., SLC) or real( e.g. TIFF) - if(referenceImage->isComplex()) + if(param->referenceImageDataType==2) { // allocate a gpu buffer to load data from cpu/file // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 @@ -413,7 +413,7 @@ void cuAmpcorProcessorGrIMP::loadReferenceChunk() } // end of if all pixels out of range } -void cuAmpcorProcessorGrIMP::loadSecondaryChunk() +void cuAmpcorProcessorOnePass::loadSecondaryChunk() { // get the chunk size to be loaded to gpu int height = param->secondaryChunkHeight[idxChunk]; // number of pixels along height @@ -433,7 +433,7 @@ void cuAmpcorProcessorGrIMP::loadSecondaryChunk() getRelativeOffset(ChunkOffsetAcross->hostData, param->secondaryStartPixelAcross, param->secondaryChunkStartPixelAcross[idxChunk]); ChunkOffsetAcross->copyToDevice(stream); - if(secondaryImage->isComplex()) + if(param->secondaryImageDataType==2) { c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); c_secondaryChunkRaw->allocate(); @@ -481,11 +481,11 @@ void cuAmpcorProcessorGrIMP::loadSecondaryChunk() } // destructor -cuAmpcorProcessorGrIMP::~cuAmpcorProcessorGrIMP() +cuAmpcorProcessorOnePass::~cuAmpcorProcessorOnePass() { corrNormalizerOverSampled.release(); - if(param->oversamplingMethod) { + if(param->corrSurfaceOverSamplingMethod) { delete corrSincOverSampler; } else { diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.h similarity index 85% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.h rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.h index bad19c74d..fb5f1e8b4 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.h @@ -1,12 +1,12 @@ /* - * @file cuAmpcorProcessorGrIMP.h - * @brief Ampcor processor for a batch of windows with GrIMP workflow + * @file cuAmpcorProcessorOnePass.h + * @brief Ampcor processor for a batch of windows with OnePass workflow * * */ -#ifndef __CUAMPCORPROCESSORGRIMP_H -#define __CUAMPCORPROCESSORGRIMP_H +#ifndef __CUAMPCORPROCESSOROnePass_H +#define __CUAMPCORPROCESSOROnePass_H #include "cuAmpcorProcessor.h" @@ -14,7 +14,7 @@ /** * cuAmpcor processor for a chunk (a batch of windows) */ -class cuAmpcorProcessorGrIMP : public cuAmpcorProcessor { +class cuAmpcorProcessorOnePass : public cuAmpcorProcessor { private: @@ -70,13 +70,13 @@ class cuAmpcorProcessorGrIMP : public cuAmpcorProcessor { public: // constructor - cuAmpcorProcessorGrIMP(cuAmpcorParameter *param_, - GDALImage *reference_, GDALImage *secondary_, + cuAmpcorProcessorOnePass(cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, cudaStream_t stream_); // destructor - ~cuAmpcorProcessorGrIMP() override; + ~cuAmpcorProcessorOnePass() override; // run the given chunk void run(int, int) override; @@ -85,4 +85,4 @@ class cuAmpcorProcessorGrIMP : public cuAmpcorProcessor { void loadSecondaryChunk(); }; -#endif //__CUAMPCORPROCESSORGRIMP_H +#endif //__CUAMPCORPROCESSOROnePass_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp similarity index 95% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.cpp rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp index aea21e964..52a384e41 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp @@ -1,4 +1,4 @@ -#include "cuAmpcorProcessorROIPAC.h" +#include "cuAmpcorProcessorTwoPass.h" #include "cuAmpcorUtil.h" #include @@ -9,7 +9,7 @@ * @param[in] idxDown_ index of the chunk along Down/Azimuth direction * @param[in] idxAcross_ index of the chunk along Across/Range direction */ -void cuAmpcorProcessorROIPAC::run(int idxDown_, int idxAcross_) +void cuAmpcorProcessorTwoPass::run(int idxDown_, int idxAcross_) { // set chunk index setIndex(idxDown_, idxAcross_); @@ -174,11 +174,11 @@ void cuAmpcorProcessorROIPAC::run(int idxDown_, int idxAcross_) #endif // oversample the correlation surface - if(param->oversamplingMethod) { - // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversamplingfactor + if(param->corrSurfaceOverSamplingMethod) { + // sinc interpolator only computes (-i_sincwindow, i_sincwindow)*oversampling factor // we need the max loc as the center if shifted corrSincOverSampler->execute(r_corrBatchZoomInAdjust, r_corrBatchZoomInOverSampled, - maxLocShift, param->oversamplingFactor*param->rawDataOversamplingFactor + maxLocShift, param->corrSurfaceOverSamplingFactor*param->rawDataOversamplingFactor ); } else { @@ -202,7 +202,7 @@ void cuAmpcorProcessorROIPAC::run(int idxDown_, int idxAcross_) // determine the final offset from non-oversampled (pixel) and oversampled (sub-pixel) // = (Init-HalfsearchRange) + ZoomIn/(2*ovs) cuSubPixelOffset2Pass(offsetInit, offsetZoomIn, offsetFinal, - param->oversamplingFactor, param->rawDataOversamplingFactor, + param->corrSurfaceOverSamplingFactor, param->rawDataOversamplingFactor, param->halfSearchRangeDownRaw, param->halfSearchRangeAcrossRaw, stream); @@ -218,7 +218,7 @@ void cuAmpcorProcessorROIPAC::run(int idxDown_, int idxAcross_) } -void cuAmpcorProcessorROIPAC::loadReferenceChunk() +void cuAmpcorProcessorTwoPass::loadReferenceChunk() { // we first load the whole chunk of image from cpu to a gpu buffer c(r)_referenceChunkRaw @@ -247,7 +247,7 @@ void cuAmpcorProcessorROIPAC::loadReferenceChunk() ChunkOffsetAcross->copyToDevice(stream); // check whether the image is complex (e.g., SLC) or real( e.g. TIFF) - if(referenceImage->isComplex()) + if(param->referenceImageDataType==2) { // allocate a gpu buffer to load data from cpu/file // try allocate/deallocate the buffer on the fly to save gpu memory 07/09/19 @@ -291,7 +291,7 @@ void cuAmpcorProcessorROIPAC::loadReferenceChunk() } // end of if all pixels out of range } -void cuAmpcorProcessorROIPAC::loadSecondaryChunk() +void cuAmpcorProcessorTwoPass::loadSecondaryChunk() { // get the chunk size to be loaded to gpu int height = param->secondaryChunkHeight[idxChunk]; // number of pixels along height @@ -311,7 +311,7 @@ void cuAmpcorProcessorROIPAC::loadSecondaryChunk() getRelativeOffset(ChunkOffsetAcross->hostData, param->secondaryStartPixelAcross, param->secondaryChunkStartPixelAcross[idxChunk]); ChunkOffsetAcross->copyToDevice(stream); - if(secondaryImage->isComplex()) + if(param->secondaryImageDataType==2) { c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); c_secondaryChunkRaw->allocate(); @@ -359,7 +359,7 @@ void cuAmpcorProcessorROIPAC::loadSecondaryChunk() } /// constructor -cuAmpcorProcessorROIPAC::cuAmpcorProcessorROIPAC(cuAmpcorParameter *param_, GDALImage *reference_, GDALImage *secondary_, +cuAmpcorProcessorTwoPass::cuAmpcorProcessorTwoPass(cuAmpcorParameter *param_, SlcImage *reference_, SlcImage *secondary_, cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, cudaStream_t stream_) : cuAmpcorProcessor(param_, reference_, secondary_, offsetImage_, snrImage_, covImage_, peakValueImage_, stream_) @@ -456,8 +456,8 @@ cuAmpcorProcessorROIPAC::cuAmpcorProcessorROIPAC(cuAmpcorParameter *param_, GDAL r_corrBatchZoomInOverSampled = new cuArrays ( - param->zoomWindowSize * param->oversamplingFactor, - param->zoomWindowSize * param->oversamplingFactor, + param->zoomWindowSize * param->corrSurfaceOverSamplingFactor, + param->zoomWindowSize * param->corrSurfaceOverSamplingFactor, param->numberWindowDownInChunk, param->numberWindowAcrossInChunk); r_corrBatchZoomInOverSampled->allocate(); @@ -522,13 +522,13 @@ cuAmpcorProcessorROIPAC::cuAmpcorProcessorROIPAC(cuAmpcorParameter *param_, GDAL // end of new arrays - if(param->oversamplingMethod) { - corrSincOverSampler = new cuSincOverSamplerR2R(param->oversamplingFactor, stream); + if(param->corrSurfaceOverSamplingMethod) { + corrSincOverSampler = new cuSincOverSamplerR2R(param->corrSurfaceOverSamplingFactor, stream); } else { corrOverSampler= new cuOverSamplerR2R(param->zoomWindowSize, param->zoomWindowSize, - (param->zoomWindowSize)*param->oversamplingFactor, - (param->zoomWindowSize)*param->oversamplingFactor, + (param->zoomWindowSize)*param->corrSurfaceOverSamplingFactor, + (param->zoomWindowSize)*param->corrSurfaceOverSamplingFactor, param->numberWindowDownInChunk*param->numberWindowAcrossInChunk, stream); } @@ -563,12 +563,12 @@ cuAmpcorProcessorROIPAC::cuAmpcorProcessorROIPAC(cuAmpcorParameter *param_, GDAL } // destructor -cuAmpcorProcessorROIPAC::~cuAmpcorProcessorROIPAC() +cuAmpcorProcessorTwoPass::~cuAmpcorProcessorTwoPass() { corrNormalizerOverSampled.release(); corrNormalizerRaw.release(); - if(param->oversamplingMethod) { + if(param->corrSurfaceOverSamplingMethod) { delete corrSincOverSampler; } else { diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.h similarity index 85% rename from cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.h rename to cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.h index 43d4151a5..de2a29d80 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.h @@ -1,12 +1,12 @@ /* - * @file cuAmpcorProcessorROIPAC.h - * @brief Ampcor processor for a batch of windows with ROIPAC workflow + * @file cuAmpcorProcessorTwoPass.h + * @brief Ampcor processor for a batch of windows with TwoPass workflow * * */ -#ifndef __CUAMPCORPROCESSORROIPAC_H -#define __CUAMPCORPROCESSORROIPAC_H +#ifndef __CUAMPCORPROCESSORTwoPass_H +#define __CUAMPCORPROCESSORTwoPass_H #include "cuAmpcorProcessor.h" @@ -14,7 +14,7 @@ /** * cuAmpcor processor for a chunk (a batch of windows) */ -class cuAmpcorProcessorROIPAC : public cuAmpcorProcessor{ +class cuAmpcorProcessorTwoPass : public cuAmpcorProcessor{ private: // local variables and workers @@ -68,13 +68,13 @@ class cuAmpcorProcessorROIPAC : public cuAmpcorProcessor{ public: // constructor - cuAmpcorProcessorROIPAC(cuAmpcorParameter *param_, - GDALImage *reference_, GDALImage *secondary_, + cuAmpcorProcessorTwoPass(cuAmpcorParameter *param_, + SlcImage *reference_, SlcImage *secondary_, cuArrays *offsetImage_, cuArrays *snrImage_, cuArrays *covImage_, cuArrays *peakValueImage_, cudaStream_t stream_); // destructor - ~cuAmpcorProcessorROIPAC() override; + ~cuAmpcorProcessorTwoPass() override; // local methods void loadReferenceChunk(); @@ -85,4 +85,4 @@ class cuAmpcorProcessorROIPAC : public cuAmpcorProcessor{ -#endif //__CUAMPCORPROCESSORROIPAC_H +#endif //__CUAMPCORPROCESSORTwoPass_H diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu index 8ccf9e05f..46d00a22a 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu @@ -97,12 +97,18 @@ __global__ void cuDerampMethod1_kernel(real2_type *images, const int imageNX, in phaseDiffX += cprod; } } - complexSumReduceBlock(phaseDiffX, shmem); - //phaseDiffX *= normCoef; real_type phaseX = atan2(phaseDiffX.y, phaseDiffX.x); //+FLT_EPSILON - + +#ifdef CUAMPCOR_DEBUG + // output the phase ramp in debug mode + if (threadIdx.x==0) + { + printf("debug deramp az: %g rg: %g\n", phaseX, phaseY); + } +#endif + for (int i = tid; i < imageSize; i += nthreads) { pixelIdxX = i%imageNY; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp index 01585ab63..183fa4cc0 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cudaError.cpp @@ -15,20 +15,8 @@ #endif #endif -template<> -void check(cudaError_t result, char const *const func, const char *const file, int const line) -{ - if (result) { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s: %s) \n", - file, line, static_cast(result), func, cudaGetErrorString(result)); - DEVICE_RESET - // Make sure we call CUDA Device Reset before exiting - exit(EXIT_FAILURE); - } -} - -template<> -void check(cufftResult_t result, char const *const func, const char *const file, int const line) +template +void check(T result, char const *const func, const char *const file, int const line) { if (result) { fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", @@ -39,6 +27,8 @@ void check(cufftResult_t result, char const *const func, const char *const file, } } +template void check(cudaError_t, char const *const, const char *const, int const); +template void check(cufftResult_t, char const *const, const char *const, int const); void __getLastCudaError(const char *errorMessage, const char *file, const int line) { diff --git a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp index acd55e09b..bdbe868af 100644 --- a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp +++ b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp @@ -37,9 +37,11 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM(str, referenceImageName) .DEF_PARAM(int, referenceImageHeight) .DEF_PARAM(int, referenceImageWidth) + .DEF_PARAM(int, referenceImageDataType) .DEF_PARAM(str, secondaryImageName) .DEF_PARAM(int, secondaryImageHeight) .DEF_PARAM(int, secondaryImageWidth) + .DEF_PARAM(int, secondaryImageDataType) .DEF_PARAM(int, numberWindowDown) .DEF_PARAM(int, numberWindowAcross) From c91fe4f8e589631d1a554a1c6a563c73b41fcd6b Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Wed, 19 Feb 2025 18:34:57 +0000 Subject: [PATCH 07/11] Rename correlation surface parameters --- .../extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp index bdbe868af..983b78e14 100644 --- a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp +++ b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp @@ -70,8 +70,8 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM_RENAME(int, referenceStartPixelAcrossStatic, referenceStartPixelAcross0) .DEF_PARAM_RENAME(int, referenceStartPixelDownStatic, referenceStartPixelDown0) - .DEF_PARAM_RENAME(int, corrSurfaceOverSamplingMethod, oversamplingMethod) - .DEF_PARAM_RENAME(int, corrSurfaceOverSamplingFactor, oversamplingFactor) + .DEF_PARAM(int, corrSurfaceOverSamplingMethod) + .DEF_PARAM(int, corrSurfaceOverSamplingFactor) .DEF_PARAM_RENAME(int, mmapSize, mmapSizeInGB) From ce50a22c3947cda63dbcd2791f3a501db52c17e1 Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Wed, 19 Feb 2025 18:35:10 +0000 Subject: [PATCH 08/11] Fix source file names --- cxx/isce3/cuda/Sources.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cxx/isce3/cuda/Sources.cmake b/cxx/isce3/cuda/Sources.cmake index 07555053f..316699c59 100644 --- a/cxx/isce3/cuda/Sources.cmake +++ b/cxx/isce3/cuda/Sources.cmake @@ -31,8 +31,8 @@ image/ResampSlc.cpp matchtemplate/pycuampcor/cuAmpcorController.cpp matchtemplate/pycuampcor/cuAmpcorParameter.cpp matchtemplate/pycuampcor/cuAmpcorProcessor.cpp -matchtemplate/pycuampcor/cuAmpcorProcessorGrIMP.cpp -matchtemplate/pycuampcor/cuAmpcorProcessorROIPAC.cpp +matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp +matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp matchtemplate/pycuampcor/cuArrays.cpp matchtemplate/pycuampcor/cuArraysCopy.cu matchtemplate/pycuampcor/cuArraysPadding.cu @@ -48,7 +48,7 @@ matchtemplate/pycuampcor/cuOverSampler.cpp matchtemplate/pycuampcor/cuSincOverSampler.cu matchtemplate/pycuampcor/cudaError.cpp matchtemplate/pycuampcor/cudaUtil.cpp -matchtemplate/pycuampcor/GDALImage.cpp +matchtemplate/pycuampcor/SlcImage.cpp product/SubSwaths.cu signal/gpuAzimuthFilter.cu signal/gpuCrossMul.cu From 0f234db3494e68700cdfda91365309dff46a86fe Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Fri, 28 Feb 2025 11:56:28 -0800 Subject: [PATCH 09/11] Drop in Lijun's latest v2 bugfixes --- .../matchtemplate/pycuampcor/cuAmpcorParameter.cpp | 14 +++++++++++--- .../matchtemplate/pycuampcor/cuAmpcorParameter.h | 2 ++ .../pycuampcor/cuAmpcorProcessorOnePass.cpp | 4 ++-- .../pycuampcor/cuAmpcorProcessorTwoPass.cpp | 4 ++-- .../cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h | 2 +- .../cuda/matchtemplate/pycuampcor/cuArrays.cpp | 1 + cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h | 8 +++++++- .../pycuampcor/cuCorrNormalization.cu | 14 +++++++------- .../matchtemplate/pycuampcor/cuEstimateStats.cu | 6 +++--- .../cuda/matchtemplate/pycuampcor/cuOffset.cu | 8 ++++---- .../matchtemplate/pycuampcor/cuSincOverSampler.h | 2 +- 11 files changed, 41 insertions(+), 24 deletions(-) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp index a06322180..e3beceb2f 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp @@ -5,7 +5,7 @@ #include "cuAmpcorParameter.h" #include - +#include #include @@ -158,8 +158,8 @@ void cuAmpcorParameter::_setupParameters_OnePass() halfSearchRangeAcrossRaw += halfZoomWindowSizeRaw; // add extra search range to ensure enough area to oversample correlation surface - searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeDownRaw; - searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeAcrossRaw; + searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeAcrossRaw; + searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeDownRaw; windowSizeHeightRawEnlarged = searchWindowSizeHeightRaw; windowSizeWidthRawEnlarged = searchWindowSizeWidthRaw; @@ -193,6 +193,14 @@ void cuAmpcorParameter::_setupParameters_OnePass() // oversampled correlation surface size corrZoomInOversampledSize = make_int2(corrZoomInSize.x * corrSurfaceOverSamplingFactor, corrZoomInSize.y * corrSurfaceOverSamplingFactor); + + int totalOS = rawDataOversamplingFactor*corrSurfaceOverSamplingFactor; + // search start : image center - 1 pixel * totalOS + corrZoomInOversampledSearchStart = make_int2((corrZoomInSize.x-1)*corrSurfaceOverSamplingFactor/2 - totalOS, + (corrZoomInSize.y-1)*corrSurfaceOverSamplingFactor/2 - totalOS); + // search range \pm 1 pixel * totalOS + corrZoomInOversampledSearchRange = make_int2(2*totalOS, 2*totalOS); + fprintf(stderr, "zoomWindowSize is (%d, %d)!\n", corrZoomInSize.x, corrZoomInSize.y ); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h index f35ab7606..e7d9e0503 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h @@ -78,6 +78,8 @@ class cuAmpcorParameter{ int2 corrWindowSize; // 2*halfSearchRange + 1 int2 corrZoomInSize; // zoomWindowSize+1 int2 corrZoomInOversampledSize; // corrZoomInSize * oversamplingFactor + int2 corrZoomInOversampledSearchStart; // search start position in the oversampled correlation surface + int2 corrZoomInOversampledSearchRange; // search range in the oversampled correlation surface int corrStatWindowSize; ///< correlation surface size used to estimate snr diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp index d7a098e3b..4b3b2bedb 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp @@ -131,8 +131,8 @@ void cuAmpcorProcessorOnePass::run(int idxDown_, int idxAcross_) r_corrBatchZoomInOverSampled->outputToFile("r_corrBatchZoomInOverSampled", stream); #endif - //find the max again - cuArraysMaxloc2D(r_corrBatchZoomInOverSampled, offsetZoomIn, corrMaxValue, stream); + //find the max again, within the range of \pm 1 pixel * totalOS + cuArraysMaxloc2D(r_corrBatchZoomInOverSampled, param->corrZoomInOversampledSearchStart, param->corrZoomInOversampledSearchRange, offsetZoomIn, corrMaxValue, stream); #ifdef CUAMPCOR_DEBUG // dump the max location on oversampled correlation surface diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp index 52a384e41..9207bd574 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorTwoPass.cpp @@ -275,7 +275,7 @@ void cuAmpcorProcessorTwoPass::loadReferenceChunk() } // if the image is real else { - r_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); + r_referenceChunkRaw = new cuArrays (param->maxReferenceChunkHeight, param->maxReferenceChunkWidth); r_referenceChunkRaw->allocate(); // load the data from cpu @@ -313,7 +313,7 @@ void cuAmpcorProcessorTwoPass::loadSecondaryChunk() if(param->secondaryImageDataType==2) { - c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); + c_secondaryChunkRaw = new cuArrays (param->maxSecondaryChunkHeight, param->maxSecondaryChunkWidth); c_secondaryChunkRaw->allocate(); //load a chunk from mmap to gpu diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h index dac3642f5..ef682df94 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h @@ -78,7 +78,7 @@ void cuCorrNormalizeSAT(cuArrays *correlation, cuArrays *r void cuArraysMaxloc2D(cuArrays *images, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); void cuArraysMaxloc2D(cuArrays *images, const int2 start, const int2 range, cuArrays *maxloc, cuArrays *maxval, cudaStream_t stream); void cuSubPixelOffset(cuArrays *offsetInit, cuArrays *offsetRoomIn, cuArrays *offsetFinal, const int2 initOrigin, const int initFactor, const int2 zoomInOrigin, const int zoomInFactor, cudaStream_t stream); -void cuSubPixelOffset2Pass(cuArrays *offsetInit, cuArrays *offsetZoomIn, cuArrays *offsetFinal, int OverSampleRatioZoomin, int OverSampleRatioRaw, int xHalfRangeInit, int yHalfRangeInit, cudaStream_t stream); +void cuSubPixelOffset2Pass(cuArrays *offsetInit, cuArrays *offsetZoomIn, cuArrays *offsetFinal, int OverSampleRatioZoomin, int OverSampleRatioRaw, int xHalfRangeInit, int yHalfRangeInit, cudaStream_t stream); void cuDetermineSecondaryExtractOffset(cuArrays *maxLoc, cuArrays *maxLocShift, int xOldRange, int yOldRange, int xNewRange, int yNewRange, cudaStream_t stream); //in cuCorrTimeDomain.cu: cross correlation in time domain diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp index 642c29c52..bb08c3d56 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.cpp @@ -139,6 +139,7 @@ void cuArrays::outputHostToFile(std::string fn) file.open(fn.c_str(), std::ios_base::binary); file.write((char *)hostData, getByteSize()); file.close(); + std::cout << fn << "size: height " << height << " width "<< width << " count " << count << "\n"; } // instantiations, required by python extensions diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h index 7e92acd2b..5fb7878dc 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuArrays.h @@ -13,7 +13,7 @@ // cuda dependencies #include - +#include #include template @@ -104,5 +104,11 @@ class cuArrays{ }; +std::ostream& operator<<(std::ostream& os, const float2& p); +std::ostream& operator<<(std::ostream& os, const float3& p); +std::ostream& operator<<(std::ostream& os, const double2& p); +std::ostream& operator<<(std::ostream& os, const float3& p); +std::ostream& operator<<(std::ostream& os, const int2& p); + #endif //__CUARRAYS_H //end of file diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu index c38a38dc0..0c777dbbe 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuCorrNormalization.cu @@ -186,9 +186,9 @@ void cuArraysSumSquare(cuArrays *images, cuArrays *imagesS // cuda kernel to compute summation on extracted correlation surface (Minyan) template -__global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *imagesSum, int *imagesValidCount, int imageSize, int nImages) +__global__ void cuArraysSumCorr_kernel(real_type *images, int *imagesValid, real_type *imagesSum, int *imagesValidCount, int imageSize, int nImages) { - __shared__ float shmem[Nthreads]; + __shared__ real_type shmem[Nthreads]; const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -197,11 +197,11 @@ __global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *i const int imageIdx = bid; const int imageOffset = imageIdx * imageSize; - float* imageD = images + imageOffset; + real_type* imageD = images + imageOffset; int* imageValidD = imagesValid + imageOffset; - float sum = 0.0f; - int count = 0; + real_type sum = 0.0f; + real_type count = 0; for (int i = tid; i < imageSize; i += Nthreads) { sum += imageD[i] * imageD[i]; @@ -213,7 +213,7 @@ __global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *i if(tid ==0) { imagesSum[bid] = sum; - imagesValidCount[bid] = count; + imagesValidCount[bid] = (int)count; } } @@ -225,7 +225,7 @@ __global__ void cuArraysSumCorr_kernel(float *images, int *imagesValid, float *i * @param[out] imagesValidCount count of total valid pixels * @param[in] stream cudaStream */ -void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, +void cuArraysSumCorr(cuArrays *images, cuArrays *imagesValid, cuArrays *imagesSum, cuArrays *imagesValidCount, cudaStream_t stream) { const dim3 grid(images->count, 1, 1); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu index cb6013fed..e8a92070d 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuEstimateStats.cu @@ -49,14 +49,14 @@ void cuEstimateSnr(cuArrays *corrSum, cuArrays *maxval, cu getLastCudaError("cuda kernel estimate stats error\n"); } -__global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrValidCount, const float* maxval, float* snrValue, const int size) +__global__ void cudaKernel_estimateSnr(const real_type* corrSum, const int* corrValidCount, const real_type* maxval, real_type* snrValue, const int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if (idx >= size) return; - float mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1); + real_type mean = (corrSum[idx] - maxval[idx] * maxval[idx]) / (corrValidCount[idx] - 1); snrValue[idx] = maxval[idx] * maxval[idx] / mean; } @@ -68,7 +68,7 @@ __global__ void cudaKernel_estimateSnr(const float* corrSum, const int* corrVali * @param[out] snrValue return snr value * @param[in] stream cuda stream */ -void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream) +void cuEstimateSnr(cuArrays *corrSum, cuArrays *corrValidCount, cuArrays *maxval, cuArrays *snrValue, cudaStream_t stream) { int size = corrSum->getSize(); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu index b03eaa333..f6bca24da 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOffset.cu @@ -178,9 +178,9 @@ void cuSubPixelOffset(cuArrays *offsetInit, // cuda kernel for cuSubPixelOffset __global__ void cuSubPixelOffset2Pass_kernel(const int2 *offsetInit, const int2 *offsetZoomIn, - float2 *offsetFinal, - const float OSratio, - const float xoffset, const float yoffset, const int size) + real2_type *offsetFinal, + const real_type OSratio, + const real_type xoffset, const real_type yoffset, const int size) { int idx = threadIdx.x + blockDim.x*blockIdx.x; if (idx >= size) return; @@ -212,7 +212,7 @@ __global__ void cuSubPixelOffset2Pass_kernel(const int2 *offsetInit, const int2 * Final offset = pixel size offset + subpixel offset */ void cuSubPixelOffset2Pass(cuArrays *offsetInit, cuArrays *offsetZoomIn, - cuArrays *offsetFinal, + cuArrays *offsetFinal, int OverSampleRatioZoomin, int OverSampleRatioRaw, int xHalfRangeInit, int yHalfRangeInit, cudaStream_t stream) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h index f0ca5ae81..741fb90d8 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuSincOverSampler.h @@ -45,7 +45,7 @@ class cuSincOverSamplerR2R int i_intplength; ///< actual filter length = r_relfiltlen/r_beta int i_filtercoef; //< length of the sinc kernel i_intplength*i_decfactor+1 - float * r_filter; // sinc kernel with size i_filtercoef + real_type * r_filter; // sinc kernel with size i_filtercoef cudaStream_t stream; From bc3ede157e24b246699a497c1799305f1bdcf620 Mon Sep 17 00:00:00 2001 From: Ryan Burns Date: Fri, 28 Feb 2025 12:21:26 -0800 Subject: [PATCH 10/11] Merge Lijun's corrwindow size error fix --- .../cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp index e3beceb2f..6727b683c 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cpp @@ -122,8 +122,8 @@ void cuAmpcorParameter::_setupParameters_TwoPass() windowSizeWidth = windowSizeWidthRaw*rawDataOversamplingFactor; // windowSizeHeight = windowSizeHeightRaw*rawDataOversamplingFactor; - searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeDownRaw; - searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeAcrossRaw; + searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeAcrossRaw; + searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeDownRaw; searchWindowSizeWidthRawZoomIn = windowSizeWidthRaw + 2*halfZoomWindowSizeRaw; searchWindowSizeHeightRawZoomIn = windowSizeHeightRaw + 2*halfZoomWindowSizeRaw; @@ -153,11 +153,11 @@ void cuAmpcorParameter::_setupParameters_OnePass() // serve as extra margin for search range halfZoomWindowSizeRaw = zoomWindowSize/(2*rawDataOversamplingFactor); - // add to the search range + // add extra search range to ensure enough area to oversample correlation surface halfSearchRangeDownRaw += halfZoomWindowSizeRaw; halfSearchRangeAcrossRaw += halfZoomWindowSizeRaw; - // add extra search range to ensure enough area to oversample correlation surface + // search window size before antialiasing oversampling searchWindowSizeWidthRaw = windowSizeWidthRaw + 2*halfSearchRangeAcrossRaw; searchWindowSizeHeightRaw = windowSizeHeightRaw + 2*halfSearchRangeDownRaw; From d759744aab927bb834ab10790f73078043fb956e Mon Sep 17 00:00:00 2001 From: Lijun Zhu Date: Tue, 29 Jul 2025 12:18:31 -0700 Subject: [PATCH 11/11] pycuampcor-v2: fix the distance in variance estimate in one-pass workflow --- .../cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp index 4b3b2bedb..060f6f574 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorProcessorOnePass.cpp @@ -101,7 +101,7 @@ void cuAmpcorProcessorOnePass::run(int idxDown_, int idxAcross_) // statistics of correlation surface // estimate variance on r_corrBatch - cuEstimateVariance(r_corrBatch, offsetInit, r_maxval, r_referenceBatchOverSampled->size, param->corrSurfaceOverSamplingFactor, r_covValue, stream); + cuEstimateVariance(r_corrBatch, offsetInit, r_maxval, r_referenceBatchOverSampled->size, param->rawDataOversamplingFactor, r_covValue, stream); // snr on the extracted surface r_corrBatchZoomIn cuArraysSumSquare(r_corrBatchZoomIn, r_corrBatchSum, stream);