SDAccel Dataflow 那点事儿(一)




dataflow可谓是FPGA性能体现的绝佳表现方式,数据在硬件中能流动起来,靠的就是dataflow!
下面我们将探究SDAccel OpenCL开发方式下两种dataflow的实现方式:pipe传输方式与function传输方式。
以及SDAccel HLS 开发方式下两种dataflow的编程风格:loop 与 stream。
SDAccel Dataflow 那点事(一)
SDAccel Dataflow 那点事(二)
SDAccel Dataflow 那点事(三)
SDAccel Dataflow 那点事(四)

dataflow function 方式简介

下图是Dataflow优化的示意图。在不做Dataflow之前,func_C的输入需要依赖func_A、func_B的计算处理,8个时钟周期才能完成一次func_C的计算结果输出。在做完Dataflow之后,func_A到func_C的启动间隔加快,使得5个周期就可以完成一次func_C的计算结果输出。从理论上来说,Dataflow也属于更高层次上的一种Pipeline。

代码实现

下面我们通过一个简单的向量加法的例子来说明dataflow的function层面的实现。

  • 实现示意图
1
2
3
4
5
6
7
8
9
10
11
12
13
Data Flow based Adder will be implemented as below:
 ______________
|              |<----- Input Vector from Global Memory
|  read_input  |       __
|______________|----->|  |
 ______________       |  | buffer_in
|              |<-----|__|
| compute_add  |       __
|______________|----->|  |
 ______________       |  | buffer_out
|              |<-----|__|
| write_result |
|______________|-----> Output result to Global Memory
  • host端代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83

//OpenCL utility layer include
#include "xcl2.hpp"
#include <vector>

#define DATA_SIZE 4096
#define INCR_VALUE 10

int main(int argc, char** argv)
{
//Allocate Memory in Host Memory
size_t vector_size_bytes = sizeof(int) * DATA_SIZE;
std::vector<int,aligned_allocator<int>> source_input (DATA_SIZE);
std::vector<int,aligned_allocator<int>> source_hw_results(DATA_SIZE);
std::vector<int,aligned_allocator<int>> source_sw_results(DATA_SIZE);

// Create the test data and Software Result
for(int i = 0 ; i < DATA_SIZE ; i++){
source_input[i] = i;
source_sw_results[i] = i + INCR_VALUE;
source_hw_results[i] = 0;
}

//OPENCL HOST CODE AREA START
std::vector<cl::Device> devices = xcl::get_xil_devices();
cl::Device device = devices[0];

cl::Context context(device);
cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);
std::string device_name = device.getInfo<CL_DEVICE_NAME>();

//Create Program and Kernel
std::string binaryFile = xcl::find_binary_file(device_name,"adder");
cl::Program::Binaries bins = xcl::import_binary_file(binaryFile);
devices.resize(1);
cl::Program program(context, devices, bins);
cl::Kernel krnl_adder(program,"adder");

//Allocate Buffer in Global Memory
std::vector<cl::Memory> inBufVec, outBufVec;
cl::Buffer buffer_input (context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
vector_size_bytes,source_input.data());
cl::Buffer buffer_output(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,
vector_size_bytes,source_hw_results.data());
inBufVec.push_back(buffer_input);
outBufVec.push_back(buffer_output);

//Copy input data to device global memory
q.enqueueMigrateMemObjects(inBufVec,0/* 0 means from host*/);

int inc = INCR_VALUE;
int size = DATA_SIZE;
//Set the Kernel Arguments
int narg=0;
krnl_adder.setArg(narg++,buffer_input);
krnl_adder.setArg(narg++,buffer_output);
krnl_adder.setArg(narg++,inc);
krnl_adder.setArg(narg++,size);

//Launch the Kernel
q.enqueueTask(krnl_adder);

//Copy Result from Device Global Memory to Host Local Memory
q.enqueueMigrateMemObjects(outBufVec,CL_MIGRATE_MEM_OBJECT_HOST);
q.finish();

//OPENCL HOST CODE AREA END

// Compare the results of the Device to the simulation
int match = 0;
for (int i = 0 ; i < DATA_SIZE ; i++){
if (source_hw_results[i] != source_sw_results[i]){
std::cout << "Error: Result mismatch" << std::endl;
std::cout << "i = " << i << " CPU result = " << source_sw_results[i]
<< " Device result = " << source_hw_results[i] << std::endl;
match = 1;
break;
}
}

std::cout << "TEST " << (match ? "FAILED" : "PASSED") << std::endl;
return (match ? EXIT_FAILURE : EXIT_SUCCESS);
}
  • device端代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

#define BUFFER_SIZE 4096
//Includes
/*
 * First stage of the adder: copies `size` integers from the global-memory
 * vector `in` into the caller-provided array `buffer_in`, element by element.
 */
static void read_input(__global int *in, int * buffer_in,
                       int size)
{
    for (int idx = 0; idx < size; ++idx) {
        const int word = in[idx];
        buffer_in[idx] = word;
    }
}

/*
 * Second stage of the adder: for each of the first `size` elements, stores
 * buffer_in[k] + inc into buffer_out[k].
 */
static void compute_add(int * buffer_in , int * buffer_out
                        , int inc, int size)
{
    for (int idx = 0; idx < size; ++idx) {
        const int sum = buffer_in[idx] + inc;
        buffer_out[idx] = sum;
    }
}

/*
 * Final stage of the adder: copies the first `size` integers of `buffer_out`
 * into the global-memory vector `out`, element by element.
 */
static void write_result(__global int *out, int* buffer_out,
                         int size)
{
    for (int idx = 0; idx < size; ++idx) {
        const int word = buffer_out[idx];
        out[idx] = word;
    }
}

/*
Vector addition kernel implemented as three chained stages:
read_input -> compute_add -> write_result, connected through the two
local arrays buffer_in and buffer_out.

Arguments:
in (input) --> Input vector in global memory
out (output) --> Output vector in global memory; out[i] = in[i] + inc
inc (input) --> Increment added to every element
size (input) --> Number of integers to process

NOTE: buffer_in and buffer_out hold BUFFER_SIZE integers and the stage
functions index them from 0 to size-1, so callers must pass
size <= BUFFER_SIZE.
*/
__kernel
// Work-group size is fixed at 1x1x1.
__attribute__ ((reqd_work_group_size(1, 1, 1)))
// Requests the Xilinx dataflow optimization for this function (see the SDx
// Pragma Reference Guide for its exact semantics).
__attribute__ ((xcl_dataflow))
void adder(__global int *in, __global int *out, int inc, int size)
{
// Intermediate storage between the stages.
int buffer_in[BUFFER_SIZE];
int buffer_out[BUFFER_SIZE];

// Each stage consumes what the previous stage produced.
read_input(in,buffer_in,size);
compute_add(buffer_in,buffer_out,inc,size);
write_result(out,buffer_out,size);
}

实验结果

  • 硬件仿真波形图

参考

xilinx github SDAccel_Examples/getting_started/dataflow/
ug1253 SDx Pragma Reference Guide 2017.2
ug1207 SDAccel Environment Optimization Guide

-------------本文结束 感谢您的阅读-------------
0%