Android GPU and CPU programming: inconsistent processing time

I am currently doing image tracking: using a camera, I track finger touches interacting with the Android system. The image processing is done on the GPU with OpenCL: I convert the camera output into black-and-white frames to obtain the white spots. The processing time of this method is 65 ms. Because my goal is to make the program smoother, I implemented the same operation on the CPU with OpenCV; its processing time is 115 ms. The problem is that with the OpenCV (CPU) method the program feels more responsive and faster, and I don't understand how its measured processing time can be longer in that case: this seems contradictory. For the measurement, I do this:

start = clock();
// ... processing to be measured ...
finish = clock();
double time = ((double)finish - start)/CLOCKS_PER_SEC;
std::cout << "process time : " << time << std::endl;

This is my code:

static cv::Mat              original_Right,binary_Right;
static cv::Mat              original_Left, binary_Left;
int                 width, height;
clock_t                 start,finish;
double time = 0.0;

width = (int) this->camera_Right.getCapture().get(cv::CAP_PROP_FRAME_WIDTH);
height = (int) this->camera_Right.getCapture().get(cv::CAP_PROP_FRAME_HEIGHT);
original_Right.create(height, width, CV_8UC3);


//--------------------------- Camera 2 ---------------------------------
int width_2 = (int) this->camera_Left.getCapture().get(cv::CAP_PROP_FRAME_WIDTH);
int height_2 = (int) this->camera_Left.getCapture().get(cv::CAP_PROP_FRAME_HEIGHT);
original_Left.create(height_2, width_2, CV_8UC3);


binary_Right.create(height, width, CV_32F); // FOR GPU
binary_Left.create(height_2, width_2, CV_32F); // FOR GPU
//binary_Right.create(height, width, CV_8UC1); // FOR cpu
//binary_Left.create(height_2, width_2, CV_8UC1); // FOR cpu

Core::running_ = true;


//------------------------------------ SET UP THE GPU -----------------------------------------
cl_context              context;
cl_context_properties   properties [3];
cl_kernel               kernel;
cl_command_queue        command_queue;
cl_program              program;
cl_int                  err;
cl_uint                 num_of_platforms=0;
cl_platform_id          platform_id;
cl_device_id            device_id;
cl_uint                 num_of_devices=0;
cl_mem                  input, output;

size_t                  global;

int                     data_size =height*width*3;


//load opencl source
FILE *fp;
char fileName[] = "./helloTedKrissV2.cl";
char *source_str;

 //Load the source code containing the kernel
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
size_t source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);


//retrieve a list of available platforms
if(clGetPlatformIDs(1,&platform_id, &num_of_platforms)!=CL_SUCCESS){
    std::cout<<"unable to get a platform_id"<<std::endl;
}

// get a supported GPU device
if(clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_GPU,1,&device_id, &num_of_devices)!= CL_SUCCESS){
    std::cout<<"unable to get a device_id"<<std::endl;
}

//context properties list - must be terminated with 0
properties[0]=CL_CONTEXT_PLATFORM;
properties[1]=(cl_context_properties) platform_id;
properties[2]=0;

// create a context with the gpu device
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);

//create command queue using the context and device
command_queue = clCreateCommandQueue(context,device_id,0,&err);

//create a program from the kernel source code
program = clCreateProgramWithSource(context,1,(const char **) &source_str, &source_size,&err);

// compile the program
if(clBuildProgram(program,0,NULL,NULL,NULL,NULL)!=CL_SUCCESS){
    size_t length;
    std::cout<<"Error building program"<<std::endl;
    char buffer[4096];
    clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG, sizeof(buffer),buffer,&length);
    std::cout<< buffer <<std::endl;
}

//specify which kernel from the program to execute
kernel = clCreateKernel(program,"imageProcessing",&err);




while (this->isRunning() == true) { 

    start= clock(); //--------------------- START----------------------

    //----------------------FRAME---------------------
    this->camera_Right.readFrame(original_Right);
    if (original_Right.empty() == true ) {
        std::cerr << "[Core/Error] Original  frame is empty." << std::endl;
        break;
    }

    this->camera_Left.readFrame(original_Left);
    if (original_Left.empty() == true ) {
        std::cerr << "[Core/Error] Original 2  frame is empty." << std::endl;
        break;
    }
    //----------------------FRAME---------------------



  //------------------------------------------------IMP GPU ------------------------------------------------------

    input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR  , sizeof(unsigned char)*data_size,NULL,NULL);
    output =clCreateBuffer(context,CL_MEM_READ_WRITE   | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*data_size/3,NULL,NULL);

    if(clEnqueueWriteBuffer(command_queue,input,CL_TRUE,0,sizeof(unsigned char)*data_size, original_Right.data ,0,NULL,NULL )!= CL_SUCCESS){
        std::cerr << "[Core/Error] clEnqueueWriteBuffer failed (right frame)." << std::endl;
    }

    //set the argument list for the kernel command
    clSetKernelArg(kernel,0,sizeof(cl_mem), &input);
    clSetKernelArg(kernel,1,sizeof(cl_mem), &output);
    global = data_size  ;
    //enqueue the kernel command for execution
    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL,0,NULL,NULL);
    clFinish(command_queue);
    //copy the results from out of the  output buffer
    if(clEnqueueReadBuffer(command_queue,output,CL_TRUE ,0,sizeof(float)*data_size/3,binary_Right.data,0,NULL,NULL )!= CL_SUCCESS){
        std::cerr << "[Core/Error] clEnqueueReadBuffer failed (right frame)." << std::endl;
    }

    clReleaseMemObject(input);
    clReleaseMemObject(output);

    //------------------------------------------------IMP GPU ------------------------------------------------------

    input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR  , sizeof(unsigned char)*data_size,NULL,NULL);
    output =clCreateBuffer(context,CL_MEM_READ_WRITE   | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*data_size/3,NULL,NULL);

    if(clEnqueueWriteBuffer(command_queue,input,CL_TRUE,0,sizeof(unsigned char)*data_size, original_Left.data ,0,NULL,NULL )!= CL_SUCCESS){
        std::cerr << "[Core/Error] clEnqueueWriteBuffer failed (left frame)." << std::endl;
    }

    //set the argument list for the kernel command
    clSetKernelArg(kernel,0,sizeof(cl_mem), &input);
    clSetKernelArg(kernel,1,sizeof(cl_mem), &output);
    global = data_size  ;
    //enqueue the kernel command for execution
    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL,0,NULL,NULL);
    clFinish(command_queue);
    //copy the results from out of the  output buffer
    if(clEnqueueReadBuffer(command_queue,output,CL_TRUE ,0,sizeof(float)*data_size/3,binary_Left.data,0,NULL,NULL )!= CL_SUCCESS){
        std::cerr << "[Core/Error] clEnqueueReadBuffer failed (left frame)." << std::endl;
    }

   clReleaseMemObject(input);
   clReleaseMemObject(output);

    //------------------------------------------------IMP GPU ------------------------------------------------------

  // cpu METHOD
  // adok::processing::doImageProcessing(original_Right, binary_Right);
  // adok::processing::doImageProcessing(original_Left, binary_Left);

    //-------------------------------------------------------------- TRACKING ------------------------------------------------------

adok::tracking::doFingerContoursTracking(binary_Right,binary_Left, this->fingerContours, this->perspective_Right,this->perspective_Left, this->distortion_Right,this->distortion_Left, this);

    //------------------------------------------- TRACKING -----------------------------------------

 //------------------------------SEND COORDINATES TO ANDROID BOARD--------------------
if (getSideRight() && !getSideLeft() ) {
        std::cout<<"RIGHT : "<<std::endl;
        this->uart_.sendAll(this->fingerContours, this->perspective_Right.getPerspectiveMatrix(), RIGHT);
    }else if (!getSideRight() && getSideLeft() ){
        std::cout<<"LEFT : "<<std::endl;
        this->uart_.sendAll(this->fingerContours, this->perspective_Left.getPerspectiveMatrix(), LEFT);
    }else if (getSideRight() && getSideLeft() ){
        std::cout<<"RIGHT & LEFT : "<<std::endl;
        this->uart_.sendAll(this->fingerContours, this->perspective_Right.getPerspectiveMatrix(), this->perspective_Left.getPerspectiveMatrix());

    }

this->setSideRight(0);
this->setSideLeft(0);

finish = clock();
time =(double)(finish - start)/CLOCKS_PER_SEC;
std::cout << "Time: " << time << std::endl; // ------------END-----------

}
clReleaseCommandQueue(command_queue);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseContext(context);
this->stop();

}
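For reference, the CPU path calls adok::processing::doImageProcessing, whose implementation is not shown here. Below is a minimal sketch of what such a grayscale-plus-threshold binarisation might look like with OpenCV; it is only an assumption for illustration (including the threshold value of 200), not the actual implementation:

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

// Hypothetical CPU binarisation: convert a BGR frame to grayscale, then
// threshold it so bright spots become white (255) on a black background.
// The threshold value 200 is an assumed placeholder.
static void cpuBinarize(const cv::Mat &originalBGR, cv::Mat &binary)
{
    cv::Mat gray;
    cv::cvtColor(originalBGR, gray, cv::COLOR_BGR2GRAY);
    cv::threshold(gray, binary, 200, 255, cv::THRESH_BINARY); // output is CV_8UC1
}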

It is also strange that grabbing the frames takes about 5 milliseconds when I use the CPU method but about 15 milliseconds when I use the GPU method; I don't know why it increases.

I am working on an Odroid XU4 running Android.

Answer:

A GPU computation can sometimes take more time than the equivalent CPU computation. For the GPU path, the host process first has to send the data to GPU memory, and after the mathematical work is done the GPU has to send the results back to the CPU. That transfer in both directions takes time, and if the buffers involved are large the transfer time can become significant. Libraries such as cuDNN, used together with the GPU, can make such workloads many times faster; if your program does not use such a library, the GPU path may end up slower.
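One way to see how much of the 65 ms goes to these host-device transfers rather than to the kernel itself is OpenCL event profiling: create the command queue with CL_QUEUE_PROFILING_ENABLE and read the start/end timestamps of each command. Here is a minimal sketch, reusing the buffer, kernel, and size names from the question and omitting error handling:

// Queue created with profiling enabled (replaces the plain queue creation above).
command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);

cl_event write_evt, kernel_evt, read_evt;
clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0,
                     sizeof(unsigned char) * data_size, original_Right.data,
                     0, NULL, &write_evt);
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL,
                       0, NULL, &kernel_evt);
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0,
                    sizeof(float) * data_size / 3, binary_Right.data,
                    0, NULL, &read_evt);
clFinish(command_queue);

// Timestamps are reported in nanoseconds.
cl_ulong t_start = 0, t_end = 0;
clGetEventProfilingInfo(kernel_evt, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, NULL);
clGetEventProfilingInfo(kernel_evt, CL_PROFILING_COMMAND_END,   sizeof(t_end),   &t_end,   NULL);
std::cout << "kernel only       : " << (t_end - t_start) / 1e6 << " ms" << std::endl;

clGetEventProfilingInfo(write_evt, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, NULL);
clGetEventProfilingInfo(read_evt,  CL_PROFILING_COMMAND_END,   sizeof(t_end),   &t_end,   NULL);
std::cout << "write+kernel+read : " << (t_end - t_start) / 1e6 << " ms" << std::endl;

clReleaseEvent(write_evt);
clReleaseEvent(kernel_evt);
clReleaseEvent(read_evt);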
