我没有发现任何类似的问题。我正在尝试编写高斯-约旦逆矩阵算法。该算法的思想很简单:)
我只想对下三角矩阵求逆。我得到了几乎正确的答案。我在哪里做错了?有人可以帮助我吗?我会很感激的。
Ñ 在一个方向上的矩阵的大小(N%16 = 0)
dim3threadsPerBlock(n / 16,n / 16);
dim3 numBlocks(16,16);
我知道这是一个简单的实现,但起初我需要它才能正常工作:)有人可以帮助我还是给我任何提示?我会很感激的。非常感谢!
有完整的cpu代码:
#include <stdio.h> #include <iostream> #include <fstream> #include <vector> #include <string> #pragma comment(lib, "cuda.lib") #pragma comment(lib, "cudart.lib") #include <cuda.h> #include <math.h> #include <cuda_runtime.h> #include <cuda_runtime_api.h> #include "device_launch_parameters.h" #include <cublas_v2.h> using namespace std; __global__ void gaussjordan(float *A, float *I,int n, int i) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; float P; if(x<n && y<n) if(x>i) if(y>=i){ P=A[x*n+i]/A[i*n+i]; I[x*n+y]-= I[i*n+y]*P; A[x*n+y]-= A[i*n+y]*P; } __syncthreads(); } __global__ void dev(float *d_A, float *dI, int h) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if(x<h && y<h) if(d_A[x*h+x]!=0){ dI[x*h+y] /= d_A[x*h+x]; d_A[x*h+y] /= d_A[x*h+x]; } __syncthreads(); } void savetofile(float *A, string s, int n, int h) { std::ofstream plik; plik.open(s); for(int j=0;j<h;j++){ for(int i=0;i<h;i++){ plik<<A[j*n+i]<<"\t";} plik<<endl;} plik.close(); } int main() { int n = 16; // creating input float *iL = new float [n*n]; float *L = new float [n*n]; for(int i=0;i<n;i++) for(int j=0;j<n;j++) if(i==j || i>j) L[i*n+j] = (i*n+j+1)*(i*n+j+1)*0.007 + (i*n+j+1)*0.01 -(i*n+j+1)*(i*n+j+1)*(i*n+j+1)*0.0005; else L[i*n+j] = 0; savetofile(L,"L.txt",n,n); cout<<"inv\n"; float *d_A, *d_L,*I, *dI; float time; cudaError_t err; cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); int ddsize = n*n*sizeof(float); dim3 threadsPerBlock(n/16,n/16); dim3 numBlocks(16,16); // memory allocation err= cudaMalloc( (void**) &d_A, ddsize); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} err= cudaMalloc( (void**) &dI, ddsize); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} I = new float[n*n]; for(int i=0;i<n;i++){ for(int j=0;j<n;j++){ if(i==j) I[i*n+i]=1.0; else I[i*n+j]=0.0;}} //copy data from GPU to CPU err =cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} err =cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} //timer start cudaEventRecord( start, 0); // L^(-1) for(int i=0;i<n;i++){ gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i); } dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n); err = cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} err = cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &time, start, stop ); cudaEventDestroy( start ); cudaEventDestroy( stop ); std::cout<<"Cuda Time - inverse: "<< time <<"ms\n"; savetofile(iL,"inv.txt",n,n); savetofile(L,"I.txt",n,n); cudaFree(d_A); cudaFree(dI); delete []I; delete []L; delete []iL; system("Pause"); return 0; }
有我的输出:
60.6061 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -34.1233 -2.13675 -0 -0 -0 -0 -0 -0 0 0 0 0 0 0 0 0 -48.5115 1.91603 -0.0799201 -0 -0 -0 -0 -0 0 0 0 0 0 0 0 0 -49.4891 1.8697 0.0748167 -0.0196634 -0 -0 -0 -0 0 0 0 0 0 0 0 0 -49.8332 1.84732 0.0725876 0.018747 -0.00767828 -0 -0 -0 0 0 0 0 0 0 0 0 -50.0073 1.83403 0.071321 0.0182352 0.00739595 -0.00376795 -0 -0 0 0 0 0 0 0 0 0 -50.112 1.82521 0.0705011 0.0179073 0.0072164 0.00365346 -0.00212282 -0 0 0 0 0 0 0 0 0 -50.1818 1.81893 0.0699261 0.0176789 0.00709196 0.00357445 0.00206784 -0.00131234 0 0 0 0 0 0 0 0 -50.2316 1.81423 0.0695003 0.0175105 0.00700059 0.00351662 0.0020277 0.00128271 -0.00086736 -0 -0 -0 -0 -0 -0 -0 -50.2689 1.81057 0.0691722 0.0173813 0.00693062 0.00347244 0.00199711 0.00126017 0.000850006 -0.000602925 -0 -0 -0 -0 -0 -0 -50.2979 1.80765 0.0689115 0.0172789 0.0068753 0.00343758 0.00197301 0.00124245 0.000836382 0.000592093 -0.000435975 -0 -0 -0 -0 -0 -50.321 1.80527 0.0686993 0.0171957 0.00683047 0.00340937 0.00195354 0.00122815 0.000825401 0.000583374 0.000428868 -0.00032541 -0 -0 -0 -0 -50.34 1.80328 0.0685233 0.0171269 0.0067934 0.00338607 0.00193748 0.00121637 0.000816362 0.000576204 0.000423029 0.000320554 -0.000249293 -0 -0 -0 -50.3557 1.80159 0.0683749 0.0170689 0.00676223 0.0033665 0.001924 0.00120649 0.000808792 0.000570204 0.000418147 0.000316498 0.000245864 -0.000195186 -0 -0 -50.369 1.80015 0.0682481 0.0170195 0.00673566 0.00334983 0.00191253 0.00119809 0.000802358 0.000565109 0.000414005 0.000313058 0.000242958 0.000192695 -0.000155673 -0 -50.3805 1.7989 0.0681385 0.0169768 0.00671274 0.00333547 0.00190265 0.00119086 0.000796824 0.000560729 0.000410446 0.000310105 0.000240465 0.000190559 0.00015382 -0.000126146
它应该是:
60,6060606060606 4,44089209850063e-16 4,85722573273506e-17 -3,12250225675825e-17 0 1,73472347597681e-18 -1,08420217248550e-18 -7,58941520739853e-19 4,33680868994202e-19 -5,42101086242752e-19 0 -6,93889390390723e-18 0 -1,38777878078145e-17 0 1,18720137887163e-17 -34,1232841232841 -2,13675213675214 0 8,67361737988404e-18 3,03576608295941e-18 8,67361737988404e-19 -1,73472347597681e-18 1,35525271560688e-18 -8,67361737988404e-19 1,00288700954909e-18 0 0 6,93889390390723e-18 6,93889390390723e-18 -1,38777878078145e-17 3,02221355580334e-18 -17,9130271437964 1,91603268526345 -0,0799200799200800 1,30104260698261e-18 1,95156391047391e-18 -9,75781955236954e-19 1,95156391047391e-18 2,16840434497101e-19 -3,52365706057789e-19 -1,62630325872826e-19 1,38777878078145e-17 -3,46944695195361e-18 0 0 0 -2,72405795836983e-18 -2,86140643299924 0,0760191125748172 0,0748166415934231 -0,0196633632216454 -2,41234983378025e-18 7,99599102208060e-19 3,25260651745651e-19 -4,74338450462408e-19 2,67662411332359e-19 2,91379333855479e-19 -2,16840434497101e-18 -4,33680868994202e-19 1,30104260698261e-18 0 0 6,86096687275983e-20 -1,33482739506506 0,0346053236774996 0,00125734163772674 0,0187469132242915 -0,00767825058738617 5,35324822664718e-19 -2,23616698075135e-19 5,08219768352580e-20 5,92923063078010e-20 1,74488787134386e-19 -4,33680868994202e-19 4,33680868994202e-19 -2,16840434497101e-19 2,16840434497101e-19 0 -1,19008129089229e-19 -0,793561224702690 0,0203250367373064 0,000727127971238783 0,000177630032830862 0,00739591005669882 -0,00376795430225022 4,98055372985529e-19 -3,84552958053452e-19 3,20178454062126e-19 -1,35525271560688e-19 6,50521303491303e-19 -1,08420217248550e-19 1,08420217248550e-19 -2,16840434497101e-19 0 -7,15742840429884e-20 -0,532255026297144 0,0135340901236068 0,000479383336751935 0,000115847127348313 4,51920594555328e-05 0,00365346070706817 -0,00212282675610843 1,37219337455197e-19 -5,14996031930615e-19 3,30342849429177e-19 0 -2,71050543121376e-19 1,08420217248550e-19 0 0 5,08219768352580e-20 -0,384130052448431 0,00972113086608457 0,000342250536212794 8,21235560483452e-05 3,18129608485860e-05 1,56232096436654e-05 0,00206784220009096 -0,00131233595800525 6,39509875176997e-20 -3,37542629480839e-19 -1,08420217248550e-19 2,16840434497101e-19 0 0 0 -8,47032947254300e-22 -0,291692030052418 0,00735419164507677 0,000257375648850429 6,15185225200113e-05 2,37495210052671e-05 1,16038017329438e-05 6,53368676878396e-06 0,00128271813402154 -0,000867362869930264 1,77876918923403e-19 1,62630325872826e-19 -1,89735380184963e-19 1,62630325872826e-19 0 0 -9,07384044746169e-20 -0,229596895430646 0,00578230937666655 0,000201707743336976 4,79768824589291e-05 1,84020572663637e-05 8,96002707181433e-06 5,05525466995835e-06 3,12009781742606e-06 0,000850011219708818 -0,000602925394011745 0 2,71050543121376e-20 -8,13151629364128e-20 5,42101086242752e-20 -5,42101086242752e-20 7,73976355553617e-20 -0,185720949479909 0,00466765632076680 0,000162419592307734 3,85318721641536e-05 1,47407053519860e-05 7,17308297585328e-06 4,02178178072207e-06 2,48428717850195e-06 1,64547815065802e-06 0,000592092919336558 -0,000435974905284452 0 0 8,13151629364128e-20 -1,08420217248550e-19 2,64697796016969e-20 -0,153867987373140 0,00385473267086607 0,000133863548213241 3,17506489004575e-05 1,20962229586152e-05 5,86799087221288e-06 3,28276799988068e-06 2,02338706451671e-06 1,33735029942045e-06 9,34275734555363e-07 0,000428867197061432 -0,000325409609345764 0 2,71050543121376e-20 0 -1,09055491958991e-20 -0,129703518509601 0,00324211947468978 0,000112403568308126 2,65969300905272e-05 1,01402805713936e-05 4,89779294849866e-06 2,73496124917826e-06 1,68586638861081e-06 1,11012300345236e-06 7,73556738632873e-07 5,60933254708493e-07 0,000320553621268105 -0,000249293253625970 5,42101086242752e-20 0 -1,01114558078482e-20 -0,110691345431593 0,00276839969825208 9,59884298624889e-05 2,25961759289096e-05 8,63052307521336e-06 4,15554692230644e-06 2,31688356971108e-06 1,42511604039733e-06 9,39229137057347e-07 6,51934526276135e-07 4,72019315851685e-07 3,53897320062806e-07 0,000245863313382516 -0,000195185934120844 0 -1,24407964127975e-20 -0,0958269169656213 0,00239699666599593 8,28626202960276e-05 1,95227026042985e-05 7,41637441475814e-06 3,57424367962823e-06 1,99334817579930e-06 1,21993241781196e-06 8,05577604288488e-07 5,57554928001086e-07 4,03155267486669e-07 3,01723475812485e-07 2,31838854154289e-07 0,000192695260333710 -0,000155673036807333 -2,34522247271034e-20 -0,0838002301027703 0,00209415237243389 7,23249901251223e-05 1,70229067498473e-05 6,46008752692950e-06 3,11455737751181e-06 1,73159030599080e-06 1,06073213436631e-06 6,96842172109705e-07 4,82764206408816e-07 3,49217230232344e-07 2,60145440758586e-07 2,00286821017368e-07 1,56906945950947e-07 0,000153820426928509 -0,000126146355001072
看来问题出在您的gaussjordan内核中。
gaussjordan
在原始(L)矩阵上进行高斯-约旦消除时,可以仅对枢轴点右侧的行元素进行处理。
L
但是,当您将相同的行操作应用于单位矩阵以创建逆(I)时,有必要将等效的行操作应用于行的 每个成员 ,而不仅仅是枢轴点右侧的成员。
I
因此,如果您这样修改gaussjordan内核:
__global__ void gaussjordan(float *A, float *I,int n, int i) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; float P; if(x<n && y<n) if(x>i){ // this limits operation to rows below the pivot point P=A[x*n+i]/A[i*n+i]; I[x*n+y] -= I[i*n+y]*P; // apply for every row member if(y>=i){ //limits to row members to the right of the pivot A[x*n+y] -= A[i*n+y]*P; // apply only to members right of pivot } } }
相信您会得到更好的结果。通过上述更改,我相信可以在floatvs. 的精度范围内复制您的预期结果double。
float
double