SIMD学习笔记2：高斯卷积计算优化

本文主要是介绍SIMD学习笔记2：高斯卷积计算优化，希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

https://github.com/gredx/simd-parallel-conv
https://zhuanlan.zhihu.com/p/419806079
https://www.cnblogs.com/Imageshop/p/9069650.html
https://zhuanlan.zhihu.com/p/308004749
https://zhuanlan.zhihu.com/p/83694328

SSE图像算法优化系列十八：三次卷积插值的进一步SSE优化。
基于CPU SIMD和winograd的卷积计算加速技术_
如何学习SIMD(单指令多数据流)并应用？
SSE图像算法优化系列九：灵活运用SIMD指令16倍提升Sobel边缘检测的速度（4000*3000的24位图像时间由480ms降低到30ms）。
SSE图像算法优化系列二：高斯模糊算法的全面优化过程分享（一）。
数字图像处理之高斯滤波加速优化

Opencv findcontours函数原理，以及python numpy实现
AVX256加速矩阵乘法

microsoft/ DirectXMath github SIMD

我要实现循环卷积sse，暂时没有找到比较好的写法：

优化前

void gaussianConvolution(Matrix<double>& srcIamge, Matrix<double>& desImage, Matrix<double>& kernel)
{int kernelSize = kernel.numCols();//卷积填充int startOffset = -1 * int(kernelSize / 2);for (int i = 0; i < srcIamge.numRows(); i++){for (int j = 0; j < srcIamge.numCols(); j++){double blurredPixel = 0.0;for (int kx = 0; kx < kernelSize; kx++){for (int ky = 0; ky < kernelSize; ky++){int x = i + startOffset + kx, y = j + startOffset + ky;GetPixelWrapAround(srcIamge, x, y);blurredPixel += kernel.get(kx, ky)* srcIamge.get(x, y);}}desImage.set(i, j, blurredPixel);}}
}void  GetPixelWrapAround(const Matrix<double>& image, int& x, int& y)
{int w = image.numRows();int h = image.numCols();x = (x % w + w) % w;y = (y % h + h) % h;
}

sse优化后：

void greenNoise::gaussianConvolutionSSE(Matrix<double>& srcImage, Matrix<double>& desImage, Matrix<double>& kernel)
{int kernelSize = kernel.numCols();int width = srcImage.numRows();int height = srcImage.numCols();int startOffset = -1 * static_cast<int>(kernelSize / 2);double temp[4];for (int i = 0; i < width; i++){for (int j = 0; j < height; j++){double blurredPixel = 0.0;for (int kx = 0; kx < kernelSize; kx++){int x = (i + startOffset + kx + width) % width;for (int ky = 0; ky < kernelSize-3; ky+=4){//int y = (j + startOffset + ky + height) % height;int y0 = j + startOffset + ky + height;int y1 = (y0 + 1)% height;int y2 = (y0 + 2) % height;int y3 = (y0 + 3) % height;y0 = y0 % height;__m256d srcValues = _mm256_set_pd(srcImage.get(x, y0), srcImage.get(x, y1), srcImage.get(x, y2), srcImage.get(x, y3));__m256d kernelValues = _mm256_set_pd(kernel.get(kx, ky), kernel.get(kx, ky+1), kernel.get(kx, ky+2), kernel.get(kx, ky+3));__m256d resultVec = _mm256_mul_pd(srcValues, kernelValues);_mm256_storeu_pd(temp, resultVec);blurredPixel += temp[0]+ temp[1] + temp[2] + temp[3] ;}// Process the remaining elements (if any) without SSEfor (int ky = kernelSize - kernelSize % 4; ky < kernelSize; ++ky){int y = (j + startOffset + ky + height) % height;blurredPixel += kernel.get(kx, ky) * srcImage.get(x, y);}}desImage.set(i, j, blurredPixel);}}
}

加入多线程：

void greenNoise::parallelGaussianConvolutionSSE(Matrix<double>& srcImage, Matrix<double>& desImage, Matrix<double>& kernel)
{int kernelSize = kernel.numCols();int width = srcImage.numRows();int height = srcImage.numCols();int startOffset = -1 * static_cast<int>(kernelSize / 2);std::vector<std::thread> threads;//std::mutex mutex; // Mutex to control access to the result matrixconst int numThreads = std::thread::hardware_concurrency(); // Number of available threadsconst int rowsPerThread = (width + numThreads - 1) / numThreads; // Rows per threadfor (int t = 0; t < numThreads; ++t){threads.emplace_back([&srcImage, &desImage, &kernel, t, rowsPerThread,kernelSize, width, height, startOffset](){for (int i = t* rowsPerThread; i < std::min(width, (t +1)* rowsPerThread); i++){for (int j = 0; j < height; j++){double temp[4];double blurredPixel = 0.0;for (int kx = 0; kx < kernelSize; kx++){int x = (i + startOffset + kx + width) % width;for (int ky = 0; ky < kernelSize - 3; ky += 4){//int y = (j + startOffset + ky + height) % height;int y0 = j + startOffset + ky + height;int y1 = (y0 + 1) % height;int y2 = (y0 + 2) % height;int y3 = (y0 + 3) % height;y0 = y0 % height;__m256d srcValues = _mm256_set_pd(srcImage.get(x, y0), srcImage.get(x, y1), srcImage.get(x, y2), srcImage.get(x, y3));__m256d kernelValues = _mm256_set_pd(kernel.get(kx, ky), kernel.get(kx, ky + 1), kernel.get(kx, ky + 2), kernel.get(kx, ky + 3));__m256d resultVec = _mm256_mul_pd(srcValues, kernelValues);_mm256_storeu_pd(temp, resultVec);blurredPixel += temp[0] + temp[1] + temp[2] + temp[3];}// Process the remaining elements (if any) without SSEfor (int ky = kernelSize - kernelSize % 4; ky < kernelSize; ++ky){int y = (j + startOffset + ky + height) % height;blurredPixel += kernel.get(kx, ky) * srcImage.get(x, y);}}desImage.set(i, j, blurredPixel);}}});}for (auto& thread : threads){thread.join();}}