Running OpenCL kernel on multiple GPUs? -
Right now I have programmed several algorithms that run in parallel on 1 GPU, but all of them have the same problem when I try to execute them on several GPUs (for example, 3). The problem is that the code, when executed on 1 GPU, takes exactly the same amount of time as on 3 GPUs (it is not faster). I tried executing with more data and tried different tasks to be executed, but nothing helped. Finally, I ended up trying to run the easiest task, an element-wise sum, and still got this awful result. That is why I don't believe it is a problem of a particular algorithm, and I feel there is a mistake in my code (or even in my approach to parallelizing code on several GPUs).
Here is the header file for the parallel.cpp class:
#ifndef parallel_h #define parallel_h #define __no_std_vector // use cl::vector , cl::string , #define __no_std_string // not stl versions, more on later #include <cl/cl.h> class parallel { public: parallel(); int executeattachvectorskernel(int*, int*, int*, int); static void getmaxworkgroupsize(int*, int*, int*); virtual ~parallel(); protected: private: char* file_contents(const char*, int*); void getshortinfo(cl_device_id); int init(void); cl_platform_id platform; cl_device_id* devices; cl_uint num_devices; cl_command_queue* queues; int* wgsizes; int* wgnumbers; cl_context context; cl_program program; cl_kernel kernel; cl_mem input1; cl_mem input2; cl_mem output; }; #endif // parallel_h here initialization method init:
int parallel::init() { cl_int err; //connect first platfrom err = clgetplatformids(1, &platform, null); if (err != cl_success) { cerr << "error occured while executing clgetplatformids" << endl; return exit_failure; } //get devices number err = clgetdeviceids(platform, cl_device_type_gpu, 0, null, &num_devices); if (err != cl_success) { cerr << "error: failed create device group:" << endl; return exit_failure; } cout << "num devices =" << num_devices << endl; devices = new cl_device_id[num_devices]; //get gpu devices err = clgetdeviceids(platform, cl_device_type_gpu, num_devices, devices, null); //create 1 context devices context = clcreatecontext(null, num_devices, devices, null, null, &err); if (!context) { cerr << "error: failed create compute context!" << endl; return exit_failure; } queues = new cl_command_queue[num_devices]; wgnumbers = new int[num_devices]; wgsizes = new int[num_devices]; for(int = 0; < num_devices; i++) { //create command queue every device queues[i] = clcreatecommandqueue(context, devices[i], 0, &err); if (!queues[i]) { cerr << "error: failed create command commands!" << endl; return exit_failure; } cl_ulong temp; clgetdeviceinfo(devices[i], cl_device_max_work_group_size, sizeof(temp), &temp, null); wgsizes[i] = (int)temp; clgetdeviceinfo(devices[i], cl_device_max_work_item_sizes, sizeof(temp), &temp, null); wgnumbers[i] = (int)temp; } //translate kernel code chars int pl; size_t program_length; string path = "./kernel/kernel_av.cl"; char* csourcecl = file_contents(path.c_str(), &pl); program_length = (size_t)pl; //create program program = clcreateprogramwithsource(context, 1, (const char **) &csourcecl, &program_length, &err); if (!program) { cerr << "error: failed create compute program!" << endl; return exit_failure; } //create executable err = clbuildprogram(program, 0, null, null, null, null); if (err != cl_success) { size_t len; char buffer[2048]; cerr << "error: failed build program executable!" 
<< endl; exit(1); } // create compute kernel in program kernel = clcreatekernel(program, "calculate2dim", &err); if (err != cl_success) { cerr << "error: failed create compute kernel!" << endl; exit(1); } } the method executes kernel here:
int parallel::executeattachvectorskernel(int* data1, int* data2, int* results, int vectors_num) { cl_int err; size_t global; // global domain size our calculation size_t local; // local domain size our calculation int partition = vectors_num/num_devices; unsigned int count = partition; input1 = clcreatebuffer(context, cl_mem_read_only, sizeof(int) * count, null, null); input2 = clcreatebuffer(context, cl_mem_read_only, sizeof(int) * count, null, null); output = clcreatebuffer(context, cl_mem_write_only, sizeof(int) * count, null, null); if (!input1 || !input2 || !output) { cerr << "error: failed allocate device memory!" << endl; exit(1); } int** data1_apart = new int*[num_devices]; int** data2_apart = new int*[num_devices]; int** results_apart = new int*[num_devices]; for(int = 0; < num_devices; i++) { cout << "executing parallel part on gpu " << + 1 << endl; cout << "partition size = " << partition << endl; data1_apart[i] = new int[partition]; data2_apart[i] = new int[partition]; results_apart[i] = new int[partition]; for(int j = i*partition, k = 0; k < partition; j++, k++) { data1_apart[i][k] = data1[j]; data2_apart[i][k] = data2[j]; } //transfer input vector device memory err = clenqueuewritebuffer(queues[i], input1, cl_true, 0, sizeof(int) * count, data1_apart[i], 0, null, null); err = clenqueuewritebuffer(queues[i], input2, cl_true, 0, sizeof(int) * count, data2_apart[i], 0, null, null); if (err != cl_success) { cerr << "error: failed write source array!" << endl; exit(1); } int parameter4 = count/wgnumbers[i]; //set arguments compute kernel err = 0; err = clsetkernelarg(kernel, 0, sizeof(cl_mem), &input1); err |= clsetkernelarg(kernel, 1, sizeof(cl_mem), &input2); err |= clsetkernelarg(kernel, 2, sizeof(cl_mem), &output); err |= clsetkernelarg(kernel, 3, sizeof(int), ¶meter4); if (err != cl_success) { cerr << "error: failed set kernel arguments! 
" << err << endl; exit(1); } global = wgnumbers[i]; local = wgsizes[i]; if(local > global) { local = global; } cout << "global = " << global << " local = " << local << endl; err = clenqueuendrangekernel(queues[i], kernel, 1, null, &global, &local, 0, null, null); if (err) { cerr << "error: failed execute kernel!" << endl; return exit_failure; } } for(int = 0; < num_devices; i++) { //wait commands complete clfinish(queues[i]); //read results device verify output err = clenqueuereadbuffer(queues[i], output, cl_true, 0, sizeof(int) * count, results_apart[i], 0, null, null ); if (err != cl_success) { cerr << "error: failed read output array! " << err << endl; exit(1); } for(int j = 0; j < partition; j++) { results[i*partition + j] = results_apart[i][j]; } delete [] data1_apart[i]; delete [] data2_apart[i]; delete [] results_apart[i]; } clreleasememobject(input1); clreleasememobject(input2); clreleasememobject(output); delete [] data1_apart; delete [] data2_apart; } before posting question stackoverflow fighting problem 2-3 weeks , need someone's help, highly appreciate thoughts , answers!
Here is what I think is happening. You call clEnqueueNDRangeKernel once for each participating OpenCL device. At this point, none of the kernels have started execution because clFlush has not been called. Next, you call clFinish on each queue. The first clFinish call causes the first queued work group to run, and then it waits for it to finish. Once the first work group completes, clFinish returns control to your app. Your app then calls clFinish on the next queue. This triggers the second work group to run, and waits for it to finish. So the work runs sequentially. The solution may be as simple as calling clFlush after each call to clEnqueueNDRangeKernel. This is how an AMD system behaves. I will post a working example shortly.
Comments
Post a Comment