俺もやってみたらこうなりましたが。 Using device 0: GeForce 9500 GT GPU threads : 512 Processing time GPU: 5.406832 (ms) Processing time CPU: 18.742046 (ms) Test PASSED GPU: 78.0000 87.0000 177.0000 1077.0000 CPU: 78.0000 87.0000 177.0000 1077.0000 Press ENTER to exit...
カーネルはこう。
// Element-wise average of two float arrays:
//     g_odata[i] = (g_idata1[i] + g_idata2[i]) / 2
// for the indices i = n * 1024 * blockDim.x + k + threadIdx.x, where k steps
// from 0 in increments of blockDim.x while k < 1024.
//
// Parameters:
//   g_idata1, g_idata2  input arrays (only read by this kernel)
//   g_odata             output array (only written by this kernel)
//   n                   selects the start offset n * 1024 * blockDim.x
//
// Only threadIdx.x and blockDim.x are used (no blockIdx), so every block of a
// launch touches the same addresses.
// NOTE(review): presumably launched with a single block - confirm against the
// host code, which is not visible here.
// NOTE(review): when blockDim.x does not divide 1024, k + threadIdx.x can
// exceed 1023 in the last step; the caller must size the buffers accordingly.
__global__ void testKernel( float* g_idata1, float* g_idata2, float* g_odata, int n)
{
    // Index of this thread within its block.
    const unsigned int tid = threadIdx.x;
    // Number of threads in this block.
    const unsigned int num_threads = blockDim.x;

    // Kept from the original. This kernel uses no shared memory, so the
    // barrier does not order any shared-memory traffic here.
    __syncthreads();

    // First address handled by this thread (unsigned arithmetic, exactly as
    // in the original expression startaddress + k + tid).
    const unsigned int base = n * 1024 * num_threads + tid;

    // NOTE(review): the loop variable is not part of the address, so each
    // pass recomputes and rewrites the same elements blockDim.x times. If the
    // intent was to cover 1024 * blockDim.x elements, the address probably
    // needs a "pass * 1024" term - confirm before changing, since that alters
    // which memory is touched.
    for (unsigned int pass = 0; pass < num_threads; ++pass) {
        // Unsigned counter avoids the signed/unsigned mix of the original.
        for (unsigned int offset = 0; offset < 1024u; offset += num_threads) {
            const unsigned int idx = base + offset;
            // 0.5f keeps the arithmetic in single precision; the original
            // "/ 2.0" literal promoted the sum to double. Scaling by a power
            // of two is exact, so the stored value is the same.
            g_odata[idx] = (g_idata1[idx] + g_idata2[idx]) * 0.5f;
        }
        // Kept from the original; reached by all threads of the block since
        // the loop bounds depend only on blockDim.x.
        __syncthreads();
    }
}