#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
// Extent of the statically sized host arrays declared below.
#define N 100
// File-scope N x N float matrix. Note that eignvOnHost declares a parameter
// that is also called A, which hides this global inside that function.
float A[N][N];
// File-scope loop counters (i, j, it) and an integer input index (n) shared by
// host code. main declares its own block-scoped `i` in its for loops.
int i,j,it,n;
// Elapsed time in seconds; assigned and printed at the end of main.
float t_1;
// Host scratch vectors of length N plus two scalar maxima (zmax, emax).
// NOTE(review): valid indices are 0..N-1; any 1-based loop that runs up to and
// including N steps one element past the end of these arrays.
float x[N],z[N],e[N],zmax,emax;
// Modulus applied to rand() when main fills the input matrix.
#define MAX_RANGE 9999
// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, the failing statement, its source location and the CUDA error
// string are written to stderr (each on its own line) and the ENCLOSING
// function returns -1. It can therefore only be used inside functions whose
// return type accepts -1, such as main.
//
// The argument is parenthesized so that any expression is safe to pass, and the
// temporary has a macro-specific name so it cannot collide with a caller
// variable called `err`.
#define funcCheck(stmt) do { \
    cudaError_t funcCheckErr_ = (stmt); \
    if (funcCheckErr_ != cudaSuccess) { \
        fprintf(stderr, "Failed to run stmt %s at %s:%d\n", #stmt, __FILE__, __LINE__); \
        fprintf(stderr, "Got CUDA error ... %s\n", cudaGetErrorString(funcCheckErr_)); \
        return -1; \
    } \
} while(0)
// Tiled row-sum kernel.
//
// Each thread walks row `Row` of the row-major matrix A (numARows x
// numAColumns) in tiles of 32 columns that are staged through shared memory,
// and accumulates the sum of every element of that row. The sum is stored at
// C[Row][Col] (C is row-major, numCRows x numCColumns), so every column of an
// output row receives the same row sum.
//
// Launch requirements:
//   * blockDim must be (32, 32, 1) so that it matches the shared tile.
//   * The grid must cover numCColumns in x and numCRows in y; threads outside
//     C's bounds still take part in the barriers but store nothing.
// Shared memory: one static 32 x 32 float tile per block.
__global__ void eignvShared(float * A, float * C,
                            int numARows, int numAColumns, int numCRows, int numCColumns)
{
    const int TILE = 32;
    __shared__ float sA[TILE][TILE];

    int Row = blockDim.y*blockIdx.y + threadIdx.y;
    int Col = blockDim.x*blockIdx.x + threadIdx.x;
    float Cvalue = 0.0f;

    // Ceil-div: number of 32-column tiles needed to span one row of A.
    const int numTiles = (numAColumns + TILE - 1) / TILE;

    // The trip count is uniform across the block, so the barriers inside the
    // loop are reached by every thread.
    for (int k = 0; k < numTiles; k++)
    {
        const int tileCol = k*TILE + threadIdx.x;

        // Out-of-range threads store 0 so the tile never holds stale data and
        // the partial last tile contributes nothing to the sum.
        if (Row < numARows && tileCol < numAColumns)
        {
            sA[threadIdx.y][threadIdx.x] = A[(Row*numAColumns) + tileCol];
        }
        else
        {
            sA[threadIdx.y][threadIdx.x] = 0.0f;
        }
        // Make the freshly written tile visible before anyone reads it.
        __syncthreads();

        for (int j = 0; j < TILE; ++j)
        {
            Cvalue += sA[threadIdx.y][j];
        }
        // Every thread must finish reading this tile before the next iteration
        // overwrites it; without this barrier the loop is a data race.
        __syncthreads();
    }

    // Guard against C's own extent and use the regular row-major offset. The
    // previous fixed offset of numAColumns + Col pointed past the end of a
    // single-row C and let threads from different rows hit the same address.
    if (Row < numCRows && Col < numCColumns)
    {
        C[(Row*numCColumns) + Col] = Cvalue;
    }
}
// Host-side power iteration on a square, row-major matrix.
//
// Parameters:
//   A            numARows x numAColumns matrix, row-major (must be square).
//   C            output buffer of numCRows * numCColumns floats. It is zeroed
//                first, then receives the final normalized vector (as many
//                leading elements as fit).
//   numARows, numAColumns   dimensions of A.
//   numCRows, numCColumns   dimensions of C.
//
// Behavior:
//   * Prompts on stdout and reads a column index from stdin; that column of A
//     is the starting vector. An unreadable or out-of-range index falls back
//     to column 0 with a warning on stderr.
//   * Runs a fixed 100 iterations of z = A*x, scaling z by its largest
//     absolute component each time.
//   * File-scope `zmax` receives the last scaling factor and file-scope `emax`
//     the largest per-component change | |z_i| - |x_i| | of the last iteration.
//   * Stops early if A*x is the zero vector, since it cannot be normalized.
void eignvOnHost(float * A, float * C, int numARows,
                 int numAColumns, int numCRows, int numCColumns)
{
    const int maxIterations = 100;
    const long outCount = (long)numCRows * (long)numCColumns;

    // Give the caller defined output on every path, including early returns.
    if (C != NULL)
    {
        for (long k = 0; k < outCount; ++k)
        {
            C[k] = 0.0f;
        }
    }

    if (A == NULL || numARows <= 0 || numARows != numAColumns)
    {
        fprintf(stderr, "eignvOnHost: expected a non-empty square matrix, got %d x %d\n",
                numARows, numAColumns);
        return;
    }
    const int dim = numARows;

    int startCol = 0;
    printf("\nEnter the column vector\n");
    if (scanf("%d", &startCol) != 1 || startCol < 0 || startCol >= dim)
    {
        fprintf(stderr, "eignvOnHost: invalid column index, using column 0\n");
        startCol = 0;
    }

    // Heap buffers sized from the actual matrix instead of fixed-size globals.
    float *cur  = (float *) malloc(sizeof(float) * (size_t)dim);
    float *next = (float *) malloc(sizeof(float) * (size_t)dim);
    if (cur == NULL || next == NULL)
    {
        fprintf(stderr, "eignvOnHost: out of memory\n");
        free(cur);
        free(next);
        return;
    }

    // Starting vector: the selected column of A.
    for (int r = 0; r < dim; ++r)
    {
        cur[r] = A[(size_t)r * dim + startCol];
    }

    zmax = 0.0f;
    emax = 0.0f;
    for (int iter = 0; iter < maxIterations; ++iter)
    {
        // next = A * cur, tracking the largest absolute component as we go.
        float peak = 0.0f;
        for (int r = 0; r < dim; ++r)
        {
            float acc = 0.0f;
            for (int c = 0; c < dim; ++c)
            {
                acc += A[(size_t)r * dim + c] * cur[c];
            }
            next[r] = acc;
            if (fabsf(acc) > peak)
            {
                peak = fabsf(acc);
            }
        }
        zmax = peak;
        if (peak == 0.0f)
        {
            // A*cur vanished; dividing by zero would fill the vector with NaNs.
            break;
        }

        // Normalize, measure the change against the previous iterate, advance.
        float worst = 0.0f;
        for (int r = 0; r < dim; ++r)
        {
            next[r] /= peak;
            const float delta = fabsf(fabsf(next[r]) - fabsf(cur[r]));
            if (delta > worst)
            {
                worst = delta;
            }
            cur[r] = next[r];
        }
        emax = worst;
    }

    if (C != NULL)
    {
        const long copyCount = (outCount < (long)dim) ? outCount : (long)dim;
        for (long k = 0; k < copyCount; ++k)
        {
            C[k] = cur[k];
        }
    }

    free(cur);
    free(next);
    return;
}
// Driver: fills a 512 x 512 matrix with pseudo-random values, runs eignvShared
// on the device and eignvOnHost on the CPU, compares the two output buffers
// within a floating-point tolerance, and prints the elapsed wall-clock time.
// Returns 0 on success and -1 on an allocation or CUDA failure.
int main(int argc, char ** argv) {
    (void)argc;
    (void)argv;

    const int TILE = 32;   // must match the 32 x 32 tile used by eignvShared
    float * hostA = NULL;
    float * hostC = NULL;
    float * hostComputedC = NULL;
    float * deviceA = NULL;
    float * deviceC = NULL;
    int numARows = 512;
    int numAColumns = 512;
    int numCRows = 1;
    int numCColumns = numAColumns;

    // time() yields time_t with one-second resolution; difftime gives seconds.
    time_t c_1 = time(NULL);

    const size_t countA = (size_t)numARows * (size_t)numAColumns;
    const size_t countC = (size_t)numCRows * (size_t)numCColumns;

    hostA = (float *) malloc(sizeof(float) * countA);
    // calloc so both result buffers hold defined values even if a producer
    // does not write every element before the comparison below.
    hostC = (float *) calloc(countC, sizeof(float));
    hostComputedC = (float *) calloc(countC, sizeof(float));
    if (hostA == NULL || hostC == NULL || hostComputedC == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        free(hostA);
        free(hostC);
        free(hostComputedC);
        return -1;
    }

    for (size_t k = 0; k < countA; k++)
    {
        hostA[k] = (rand() % MAX_RANGE) / 2.0f;
    }

    // funcCheck returns from main on failure; allocations still outstanding at
    // that point are reclaimed when the process exits.
    funcCheck(cudaMalloc((void **)&deviceA, sizeof(float) * countA));
    funcCheck(cudaMalloc((void **)&deviceC, sizeof(float) * countC));
    funcCheck(cudaMemcpy(deviceA, hostA, sizeof(float) * countA, cudaMemcpyHostToDevice));

    // Ceil-div so the grid covers C exactly, with no extra all-idle blocks.
    dim3 dimBlock(TILE, TILE, 1);
    dim3 dimGrid((numCColumns + TILE - 1) / TILE, (numCRows + TILE - 1) / TILE, 1);
    eignvShared<<<dimGrid, dimBlock>>>(deviceA, deviceC, numARows, numAColumns, numCRows, numCColumns);
    funcCheck(cudaGetLastError());        // launch-configuration errors
    funcCheck(cudaDeviceSynchronize());   // faults raised while the kernel ran

    funcCheck(cudaMemcpy(hostC, deviceC, sizeof(float) * countC, cudaMemcpyDeviceToHost));

    eignvOnHost(hostA, hostComputedC, numARows, numAColumns, numCRows, numCColumns);

    // NOTE(review): this check is only meaningful if eignvShared and
    // eignvOnHost are meant to produce the same quantity - confirm.
    // Host and device floats are compared with an absolute + relative
    // tolerance; the negated <= also reports NaN as a mismatch.
    for (size_t k = 0; k < countC; k++)
    {
        const float diff = fabsf(hostComputedC[k] - hostC[k]);
        const float tol = 1e-5f + 1e-4f * fabsf(hostC[k]);
        if (!(diff <= tol))
        {
            printf("Mismatch at Col = %d hostComputed[] = %f --device[] %f\n", (int)(k % (size_t)numCColumns), hostComputedC[k], hostC[k]);
            break;
        }
    }

    funcCheck(cudaFree(deviceA));
    funcCheck(cudaFree(deviceC));
    free(hostA);
    free(hostC);
    free(hostComputedC);

    time_t c_2 = time(NULL);
    t_1 = (float)difftime(c_2, c_1);
    printf("Execution time: %f \n",t_1);
    return 0;
}
What I have tried:
How can I run this CUDA program?
I have Linux (Ubuntu)
and Visual Studio 2010.
It computes the eigenvalue and eigenvector.
Please help me.