SDAccel Dataflow 那点事儿（一）

dataflow可谓是FPGA性能体现的绝佳表现方式，数据在硬件中能流动起来，靠的就是dataflow!
下面我们将探究SDAccel OpenCL开发方式下两种dataflow的实现方式：pipe传输方式与function传输方式。
以及SDAccel HLS 开发方式下两种dataflow的编程风格：loop 与 stream
SDAccel Dataflow 那点事（一）
SDAccel Dataflow 那点事（二）
SDAccel Dataflow 那点事（三）
SDAccel Dataflow 那点事（四）

dataflow function 方式简介

下图是Dataflow优化的示意图。在不做Dataflow之前，func_C的输入需要依赖func_A和func_B的计算处理，8个时钟周期才能完成一次func_C的计算结果输出。在做完Dataflow之后，func_A和func_C的启动间隔加快，使得5个周期就可以完成一次func_C的计算结果输出。从理论上来说，Dataflow也属于更高层次上的一种Pipeline。

代码实现

下面我们通过一个简单的向量加法的例子来说明dataflow的function层面的实现。

实现示意图

Data Flow based Adder will be implemented as below:
                _____________
                |             |<----- Input Vector from Global Memory
                |  read_input |       __
                |_____________|----->|  |
                 _____________       |  | buffer_in
                |             |<-----|__|
                | compute_add |       __
                |_____________|----->|  |
                 _____________       |  | buffer_out
                |              |<----|__|
                | write_result |
                |______________|-----> Output result to Global Memory

host端代码


//OpenCL utility layer include
#include "xcl2.hpp"
#include <vector>

#define DATA_SIZE 4096
#define INCR_VALUE 10

int main(int argc, char** argv)
{
    //Allocate Memory in Host Memory
    size_t vector_size_bytes = sizeof(int) * DATA_SIZE;
    std::vector<int,aligned_allocator<int>> source_input     (DATA_SIZE);
    std::vector<int,aligned_allocator<int>> source_hw_results(DATA_SIZE);
    std::vector<int,aligned_allocator<int>> source_sw_results(DATA_SIZE);

    // Create the test data and Software Result
    for(int i = 0 ; i < DATA_SIZE ; i++){
        source_input[i] = i;
        source_sw_results[i] = i + INCR_VALUE;
        source_hw_results[i] = 0;
    }

//OPENCL HOST CODE AREA START
    std::vector<cl::Device> devices = xcl::get_xil_devices();
    cl::Device device = devices[0];

    cl::Context context(device);
    cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);
    std::string device_name = device.getInfo<CL_DEVICE_NAME>();

    //Create Program and Kernel
    std::string binaryFile = xcl::find_binary_file(device_name,"adder");
    cl::Program::Binaries bins = xcl::import_binary_file(binaryFile);
    devices.resize(1);
    cl::Program program(context, devices, bins);
    cl::Kernel krnl_adder(program,"adder");

    //Allocate Buffer in Global Memory
    std::vector<cl::Memory> inBufVec, outBufVec;
    cl::Buffer buffer_input (context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
            vector_size_bytes,source_input.data());
    cl::Buffer buffer_output(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,
            vector_size_bytes,source_hw_results.data());
    inBufVec.push_back(buffer_input);
    outBufVec.push_back(buffer_output);

    //Copy input data to device global memory
    q.enqueueMigrateMemObjects(inBufVec,0/* 0 means from host*/);

    int inc = INCR_VALUE;
    int size = DATA_SIZE;
    //Set the Kernel Arguments
    int narg=0;
    krnl_adder.setArg(narg++,buffer_input);
    krnl_adder.setArg(narg++,buffer_output);
    krnl_adder.setArg(narg++,inc);
    krnl_adder.setArg(narg++,size);

    //Launch the Kernel
    q.enqueueTask(krnl_adder);

    //Copy Result from Device Global Memory to Host Local Memory
    q.enqueueMigrateMemObjects(outBufVec,CL_MIGRATE_MEM_OBJECT_HOST);
    q.finish();

//OPENCL HOST CODE AREA END

    // Compare the results of the Device to the simulation
    int match = 0;
    for (int i = 0 ; i < DATA_SIZE ; i++){
        if (source_hw_results[i] != source_sw_results[i]){
            std::cout << "Error: Result mismatch" << std::endl;
            std::cout << "i = " << i << " CPU result = " << source_sw_results[i]
                << " Device result = " << source_hw_results[i] << std::endl;
            match = 1;
            break;
        }
    }

    std::cout << "TEST " << (match ? "FAILED" : "PASSED") << std::endl;
    return (match ? EXIT_FAILURE :  EXIT_SUCCESS);
}

device端代码


#define BUFFER_SIZE 4096
//Includes
// Stage 1: copy the first `size` integers from the global-memory vector `in`
// into `buffer_in`, in ascending index order.
static void read_input(__global int *in, int *buffer_in, int size)
{
    for (int idx = 0; idx < size; ++idx) {
        const int value = in[idx];
        buffer_in[idx] = value;
    }
}

// Stage 2: for each of the first `size` elements, store buffer_in[i] + inc
// into buffer_out[i].
static void compute_add(int *buffer_in, int *buffer_out, int inc, int size)
{
    for (int idx = 0; idx < size; ++idx) {
        const int sum = buffer_in[idx] + inc;
        buffer_out[idx] = sum;
    }
}

// Stage 3: copy the first `size` integers from `buffer_out` to the
// global-memory vector `out`, in ascending index order.
static void write_result(__global int *out, int *buffer_out, int size)
{
    for (int idx = 0; idx < size; ++idx) {
        const int value = buffer_out[idx];
        out[idx] = value;
    }
}

/*
    Vector addition kernel built from three helper stages.

    Adds `inc` to each of the first `size` integers of `in` and stores the
    sums in `out`. The work is split into read_input -> compute_add ->
    write_result, which hand data to each other through two arrays declared
    inside the kernel.

    Arguments:
        in   (input)  --> Input vector in global memory
        out  (output) --> Output vector in global memory
        inc  (input)  --> Value added to every element
        size (input)  --> Number of integers to process

    NOTE(review): both intermediate arrays hold BUFFER_SIZE ints and nothing
    here checks `size` against that bound. Presumably callers must guarantee
    size <= BUFFER_SIZE -- confirm.
   */
__kernel
// The kernel is declared for a work-group of exactly 1x1x1.
__attribute__ ((reqd_work_group_size(1, 1, 1)))
// Requests dataflow treatment of the kernel body; presumably this lets the
// three stages below overlap instead of running back to back -- see the
// tool's attribute reference to confirm the exact semantics.
__attribute__ ((xcl_dataflow))
void adder(__global int *in, __global int *out, int inc, int size)
{
    // Hand-off storage between stage 1 -> stage 2 and stage 2 -> stage 3.
    int buffer_in[BUFFER_SIZE];
    int buffer_out[BUFFER_SIZE];

    // Stage 1: global memory -> buffer_in.
    read_input(in,buffer_in,size);
    // Stage 2: buffer_in + inc -> buffer_out.
    compute_add(buffer_in,buffer_out,inc,size);
    // Stage 3: buffer_out -> global memory.
    write_result(out,buffer_out,size);
}

实验结果

硬件仿真波形图

参考

xilinx github SDAccel_Examples/getting_started/dataflow/
ug1253 SDx Pragma Reference Guide 2017.2
ug1207 SDAccel Environment Optimization Guide