SDAccel Dataflow 那点事儿(二)




dataflow可谓是FPGA性能体现的绝佳表现方式,数据在硬件中能流动起来,靠的就是dataflow!
下面我们将探究SDAccel OpenCL开发方式下两种dataflow的实现方式:pipe传输方式与function传输方式。
以及SDAccel HLS 开发方式下两种dataflow的编程风格:loop风格与stream风格。
SDAccel Dataflow 那点事儿(一)
SDAccel Dataflow 那点事儿(二)
SDAccel Dataflow 那点事儿(三)
SDAccel Dataflow 那点事儿(四)

dataflow pipe 方式简介

对于OpenCL来说,Dataflow的对象除了function的方式之外,还有Kernel的方式,因此,Dataflow在OpenCL中还有一种在Kernel与Kernel之间的传输方式,也就是通过pipe传输。pipe的传输机制采用的其实是FIFO的方式,因此在使用pipe传输的过程中需要设置FIFO的深度,也就是在传输过程中有多少的缓存深度。

代码实现

  • 实现示意图
1
2
3
4
5
6
7
8
9
10
11
12
13
14

PIPE Memory based Adder will be implemented as below:
 _____________
|             |<----- Input Vector from Global Memory
|  read_input |       __
|_____________|----->|  |
 _____________       |  | p0
|             |<-----|__|
| compute_add |       __
|_____________|----->|  |
 ______________      |  | p1
|              |<----|__|
| write_result |
|______________|-----> Output result to Global Memory
  • host端代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

/*******************************************************************************
Description: SDx Vector Addition using Blocking Pipes Operation
*******************************************************************************/

#define INCR_VALUE 10

#include <iostream>
#include <cstring>
#include <stdio.h>
#include <vector>

//OpenCL utility layer include
#include "xcl.h"
#include "oclHelper.h"


/**
 * Evaluate an OpenCL API call and abort the program if it fails.
 *
 * `call` is evaluated exactly once. When its result is not CL_SUCCESS the
 * stringified call and the text returned by oclErrorCode() are printed and the
 * process exits with EXIT_FAILURE.
 *
 * The macro expands to a single statement with no trailing semicolon, so it
 * must be followed by `;` at the call site and can be used safely as the body
 * of an un-braced if/else.
 */
#define OCL_CHECK(call) \
do { \
    /* Unusual local name so it cannot shadow a variable used inside `call`. */ \
    const cl_int ocl_check_err_ = (call); \
    if (ocl_check_err_ != CL_SUCCESS) { \
        printf("Error calling " #call ", error: %s\n", oclErrorCode(ocl_check_err_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

/**
 * Minimal standard-library allocator that returns 4096-byte aligned storage.
 *
 * The host vectors in this file are created with this allocator; the input
 * vector's storage is later handed to clCreateBuffer with
 * CL_MEM_USE_HOST_PTR.
 *
 * The allocator is stateless: every instance can free memory obtained from
 * any other instance, so all instances compare equal.
 */
template <typename T>
struct aligned_allocator
{
    using value_type = T;

    aligned_allocator() noexcept = default;

    /// Converting constructor required so containers can rebind the allocator.
    template <typename U>
    aligned_allocator(const aligned_allocator<U>&) noexcept {}

    /**
     * Allocate uninitialised storage for `num` objects of type T.
     *
     * @param num number of elements requested.
     * @return pointer to 4096-byte aligned storage.
     * @throws std::bad_alloc if the byte count would overflow std::size_t or
     *         posix_memalign reports failure.
     */
    T* allocate(std::size_t num)
    {
        // Reject element counts whose byte size would wrap around; otherwise a
        // huge request could silently turn into a tiny allocation.
        if (num > static_cast<std::size_t>(-1) / sizeof(T))
            throw std::bad_alloc();

        void* ptr = nullptr;
        if (posix_memalign(&ptr, 4096, num * sizeof(T)))
            throw std::bad_alloc();
        return reinterpret_cast<T*>(ptr);
    }

    /// Release storage previously returned by allocate(); the count is unused.
    void deallocate(T* p, std::size_t /*num*/) noexcept
    {
        free(p);
    }
};

/// All aligned_allocator instances are interchangeable.
template <typename T, typename U>
bool operator==(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
    return true;
}

template <typename T, typename U>
bool operator!=(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
    return false;
}

int main(int argc, char** argv)
{
size_t data_size = 1024*1024;

/* Reducing the data size for emulation mode * /
char * xcl_mode = getenv("XCL_EMULATION_MODE");
if (xcl_mode != NULL){
data_size = 1024;
}

//Allocate Memory in Host Memory
size_t vector_size_bytes = sizeof(int) * data_size;
std::vector<int,aligned_allocator<int>> source_input (data_size);
std::vector<int,aligned_allocator<int>> source_hw_results(data_size);
std::vector<int,aligned_allocator<int>> source_sw_results(data_size);

// Create the test data and Software Result
for(size_t i = 0 ; i < data_size; i++){
source_input[i] = i;
source_sw_results[i] = i + INCR_VALUE;
source_hw_results[i] = 0;
}

//OPENCL HOST CODE AREA START
//Create Program and Kernels.
xcl_world world = xcl_world_single();
cl_program program = xcl_import_binary(world,"adder");
cl_kernel krnl_adder_stage = xcl_get_kernel(program, "adder_stage");
//Creating additional Kernels
cl_kernel krnl_input_stage = xcl_get_kernel(program, "input_stage");
cl_kernel krnl_output_stage = xcl_get_kernel(program, "output_stage");


// By-default xcl_world_single create command queues with sequential command.
// For this example, user to replace command queue with out of order command queue
clReleaseCommandQueue(world.command_queue);
int err;
world.command_queue = clCreateCommandQueue(world.context, world.device_id,
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE,
&err);
if (err != CL_SUCCESS){
std::cout << "Error: Failed to create a command queue!" << std::endl;
std::cout << "Test failed" << std::endl;
return EXIT_FAILURE;
}


//Allocate Buffer in Global Memory
cl_mem buffer_output = xcl_malloc(world, CL_MEM_WRITE_ONLY, vector_size_bytes);
cl_mem buffer_input = clCreateBuffer(world.context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
vector_size_bytes, source_input.data(), NULL);

cl_event write_event;
// Using clEnqueueMigrateMemObjects() instead of clEnqueueWriteBuffer() to avoid
// deadlock in real hardware which can be noticed only for large dataset.
// Rootcause: design leads to a deadlock when host->DDR and
// output_stage->DDR causes a contention and deadlock. In small dataset, the
// data gets transferred from host-> DDR in 1 burst and hence no deadlock.
// Solution: Start output_stage when host->DDR data transfer is completed.
// clEnqueueMigrateMemObject() event is used for all three kernels to avoid deadlock.

//Copy input data to device global memory
OCL_CHECK(clEnqueueMigrateMemObjects(world.command_queue,1, &buffer_input,
0 /* flags, 0 means from host*/,0, NULL,&write_event));

//Wait
clFinish(world.command_queue);

int inc = INCR_VALUE;
int size = data_size;
//Set the Kernel Arguments
xcl_set_kernel_arg(krnl_input_stage,0,sizeof(cl_mem),&buffer_input);
xcl_set_kernel_arg(krnl_input_stage,1,sizeof(int),&size);
xcl_set_kernel_arg(krnl_adder_stage,0,sizeof(int),&inc);
xcl_set_kernel_arg(krnl_adder_stage,1,sizeof(int),&size);
xcl_set_kernel_arg(krnl_output_stage,0,sizeof(cl_mem),&buffer_output);
xcl_set_kernel_arg(krnl_output_stage,1,sizeof(int),&size);

//Launch the Kernel
OCL_CHECK(clEnqueueTask(world.command_queue,krnl_input_stage, 1, &write_event, NULL));
OCL_CHECK(clEnqueueTask(world.command_queue,krnl_adder_stage, 1, &write_event, NULL));
OCL_CHECK(clEnqueueTask(world.command_queue,krnl_output_stage,1, &write_event, NULL));

//wait for all kernels to finish their operations
clFinish(world.command_queue);

//Copy Result from Device Global Memory to Host Local Memory
xcl_memcpy_from_device(world, source_hw_results.data(), buffer_output,vector_size_bytes);

//Release Device Memories and Kernels
clReleaseMemObject(buffer_input);
clReleaseMemObject(buffer_output);
clReleaseKernel(krnl_input_stage);
clReleaseKernel(krnl_adder_stage);
clReleaseKernel(krnl_output_stage);
clReleaseProgram(program);
xcl_release_world(world);
//OPENCL HOST CODE AREA END

// Compare the results of the Device to the simulation
int match = 0;
for (size_t i = 0 ; i < data_size; i++){
if (source_hw_results[i] != source_sw_results[i]){
std::cout << "Error: Result mismatch" << std::endl;
std::cout << "i = " << i << " CPU result = " << source_sw_results[i]
<< " Device result = " << source_hw_results[i] << std::endl;
match = 1;
break;
}
}

std::cout << "TEST " << (match ? "FAILED" : "PASSED") << std::endl;
return (match ? EXIT_FAILURE : EXIT_SUCCESS);
}
  • device端代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58


// Program-scope pipes connecting the three kernels below, each declared with a
// required depth of 32 elements:
//   p0: written by input_stage, read by adder_stage
//   p1: written by adder_stage, read by output_stage
pipe int p0 __attribute__((xcl_reqd_pipe_depth(32)));
pipe int p1 __attribute__((xcl_reqd_pipe_depth(32)));
// In Kernel code, the xcl_reqd_pipe_depth attribute is used to set the PIPE
// memory depth to 32. Depth 32 means that the PIPE memory can hold at most 32
// elements at a given time. If the PIPE memory is full, any blocking write
// command goes into a wait state until some other kernel reads an element from
// the PIPE memory. Similarly, if the PIPE memory is empty (no element in
// memory), any blocking read command on it goes into a wait state until some
// other kernel writes elements to the PIPE memory.
// This blocking read and write behaviour lets the designer synchronize data
// across multiple kernels.


// Input stage: streams `size` ints from global memory into pipe p0.
//   input - source buffer in global memory
//   size  - number of elements to forward
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void input_stage(__global int *input, int size)
{
    __attribute__((xcl_pipeline_loop))
    mem_rd: for (int idx = 0; idx < size; ++idx)
    {
        // Blocking write of element idx into p0.
        write_pipe_block(p0, input + idx);
    }
}

// Adder stage: takes `size` ints from pipe p0, adds `inc` to each one and
// pushes the sums into pipe p1.
//   inc  - value added to every element
//   size - number of elements to process
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void adder_stage(int inc, int size)
{
    __attribute__((xcl_pipeline_loop))
    execute: for (int idx = 0; idx < size; ++idx)
    {
        int word;
        // Blocking read of the next element from p0.
        read_pipe_block(p0, &word);
        word += inc;
        // Blocking write of the incremented element to p1.
        write_pipe_block(p1, &word);
    }
}


// Output stage: drains `size` ints from pipe p1 into global memory.
//   output - destination buffer in global memory
//   size   - number of elements to store
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void output_stage(__global int *output, int size)
{
    __attribute__((xcl_pipeline_loop))
    mem_wr: for (int idx = 0; idx < size; ++idx)
    {
        // Blocking read from p1 straight into element idx.
        read_pipe_block(p1, output + idx);
    }
}

关于Pipe不支持结构体的解决办法

当前针对SDAccel 2017.2 版本测试发现,Pipe的Dataflow方式不支持结构体类型,因此对于需要向量化的数据来说,结构体类型的传输会有一定的困难。基于此问题,我们采用Python脚本加宏定义的方式进行代码生成,进而实现结构体的Pipe传输。

  • 结构体类型的向量加法
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

#include "pipe.cl"

#define VEC_SIZE 4
#define LANE 4
// Input stage (struct variant): forwards `size` channel_vec records from
// global memory into the data_ch0 pipe group via the generated macro.
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void input_stage(__global channel_vec *input, int size) {
    __attribute__((xcl_pipeline_loop))
    for (int idx = 0; idx < size; ++idx) {
        data_ch0_write_pipe_block(input[idx]);
    }
}

// Add stage (struct variant): reads `size` channel_vec records from the
// data_ch0 pipe group, adds `inc` to every float of every lane, and writes
// the results to the data_ch2 pipe group.
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void add_stage(int inc, int size) {
    channel_vec in_vec, out_vec;
    __attribute__((xcl_pipeline_loop))
    for (int idx = 0; idx < size; ++idx) {
        data_ch0_read_pipe_block(in_vec);
        for (int lane_idx = 0; lane_idx < LANE; ++lane_idx) {
            for (int vec_idx = 0; vec_idx < VEC_SIZE; ++vec_idx) {
                out_vec.lane[lane_idx].data[vec_idx] =
                    in_vec.lane[lane_idx].data[vec_idx] + inc;
            }
        }
        data_ch2_write_pipe_block(out_vec);
    }
}

// Output stage (struct variant): drains `size` channel_vec records from the
// data_ch2 pipe group into global memory via the generated macro.
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void output_stage(__global channel_vec *output, int size) {
    __attribute__((xcl_pipeline_loop))
    for (int idx = 0; idx < size; ++idx) {
        data_ch2_read_pipe_block(output[idx]);
    }
}
  • 宏定义模式
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256

#ifndef _PIPE_H
#define _PIPE_H
/* One lane: a vector of 4 floats. */
typedef struct {
float data[4];
} lane_data;
/* One channel record: 4 lanes, i.e. 16 floats in total. The pipe macros below
   move one such record by sending each float through its own pipe. */
typedef struct {
lane_data lane[4];
} channel_vec;
/* Scalar float pipes, each with a required depth of 32 elements.
   Naming: data_ch<C>_<I>, where C (0..2) is the channel group and I (0..15)
   is the flattened element index used by the macros below (lane * 4 + data
   index). There are 3 groups of 16 pipes, 48 in total. */
pipe float data_ch0_0 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_0 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_0 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_1 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_1 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_1 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_2 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_2 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_2 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_3 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_3 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_3 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_4 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_4 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_4 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_5 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_5 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_5 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_6 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_6 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_6 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_7 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_7 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_7 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_8 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_8 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_8 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_9 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_9 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_9 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_10 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_10 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_10 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_11 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_11 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_11 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_12 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_12 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_12 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_13 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_13 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_13 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_14 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_14 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_14 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_15 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_15 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_15 __attribute__((xcl_reqd_pipe_depth(32)));
/* Blocking write of one channel_vec to channel group 0.
   Element input_data.lane[l].data[v] is copied to a local float and written
   to pipe data_ch0_<l*4+v> with write_pipe_block, in index order 0..15.
   Expands to a brace-enclosed block that declares a local `temp`; the
   argument is substituted textually 16 times, so pass an lvalue expression
   without side effects. */
#define data_ch0_write_pipe_block(input_data) {float temp;\
temp = input_data.lane[0].data[0]; \
write_pipe_block(data_ch0_0, &temp);\
temp = input_data.lane[0].data[1]; \
write_pipe_block(data_ch0_1, &temp);\
temp = input_data.lane[0].data[2]; \
write_pipe_block(data_ch0_2, &temp);\
temp = input_data.lane[0].data[3]; \
write_pipe_block(data_ch0_3, &temp);\
temp = input_data.lane[1].data[0]; \
write_pipe_block(data_ch0_4, &temp);\
temp = input_data.lane[1].data[1]; \
write_pipe_block(data_ch0_5, &temp);\
temp = input_data.lane[1].data[2]; \
write_pipe_block(data_ch0_6, &temp);\
temp = input_data.lane[1].data[3]; \
write_pipe_block(data_ch0_7, &temp);\
temp = input_data.lane[2].data[0]; \
write_pipe_block(data_ch0_8, &temp);\
temp = input_data.lane[2].data[1]; \
write_pipe_block(data_ch0_9, &temp);\
temp = input_data.lane[2].data[2]; \
write_pipe_block(data_ch0_10, &temp);\
temp = input_data.lane[2].data[3]; \
write_pipe_block(data_ch0_11, &temp);\
temp = input_data.lane[3].data[0]; \
write_pipe_block(data_ch0_12, &temp);\
temp = input_data.lane[3].data[1]; \
write_pipe_block(data_ch0_13, &temp);\
temp = input_data.lane[3].data[2]; \
write_pipe_block(data_ch0_14, &temp);\
temp = input_data.lane[3].data[3]; \
write_pipe_block(data_ch0_15, &temp);}
/* Blocking read of one channel_vec from channel group 0.
   For each index 0..15 a float is read from pipe data_ch0_<l*4+v> with
   read_pipe_block and stored into input_data.lane[l].data[v].
   Expands to a brace-enclosed block that declares a local `temp`; the
   argument is substituted textually 16 times and must be an assignable
   channel_vec expression. */
#define data_ch0_read_pipe_block(input_data) {float temp;\
read_pipe_block(data_ch0_0, &temp);\
input_data.lane[0].data[0] = temp; \
read_pipe_block(data_ch0_1, &temp);\
input_data.lane[0].data[1] = temp; \
read_pipe_block(data_ch0_2, &temp);\
input_data.lane[0].data[2] = temp; \
read_pipe_block(data_ch0_3, &temp);\
input_data.lane[0].data[3] = temp; \
read_pipe_block(data_ch0_4, &temp);\
input_data.lane[1].data[0] = temp; \
read_pipe_block(data_ch0_5, &temp);\
input_data.lane[1].data[1] = temp; \
read_pipe_block(data_ch0_6, &temp);\
input_data.lane[1].data[2] = temp; \
read_pipe_block(data_ch0_7, &temp);\
input_data.lane[1].data[3] = temp; \
read_pipe_block(data_ch0_8, &temp);\
input_data.lane[2].data[0] = temp; \
read_pipe_block(data_ch0_9, &temp);\
input_data.lane[2].data[1] = temp; \
read_pipe_block(data_ch0_10, &temp);\
input_data.lane[2].data[2] = temp; \
read_pipe_block(data_ch0_11, &temp);\
input_data.lane[2].data[3] = temp; \
read_pipe_block(data_ch0_12, &temp);\
input_data.lane[3].data[0] = temp; \
read_pipe_block(data_ch0_13, &temp);\
input_data.lane[3].data[1] = temp; \
read_pipe_block(data_ch0_14, &temp);\
input_data.lane[3].data[2] = temp; \
read_pipe_block(data_ch0_15, &temp);\
input_data.lane[3].data[3] = temp;}
/* Blocking write of one channel_vec to channel group 1.
   Element input_data.lane[l].data[v] is copied to a local float and written
   to pipe data_ch1_<l*4+v> with write_pipe_block, in index order 0..15.
   Expands to a brace-enclosed block that declares a local `temp`; the
   argument is substituted textually 16 times, so pass an lvalue expression
   without side effects. */
#define data_ch1_write_pipe_block(input_data) {float temp;\
temp = input_data.lane[0].data[0]; \
write_pipe_block(data_ch1_0, &temp);\
temp = input_data.lane[0].data[1]; \
write_pipe_block(data_ch1_1, &temp);\
temp = input_data.lane[0].data[2]; \
write_pipe_block(data_ch1_2, &temp);\
temp = input_data.lane[0].data[3]; \
write_pipe_block(data_ch1_3, &temp);\
temp = input_data.lane[1].data[0]; \
write_pipe_block(data_ch1_4, &temp);\
temp = input_data.lane[1].data[1]; \
write_pipe_block(data_ch1_5, &temp);\
temp = input_data.lane[1].data[2]; \
write_pipe_block(data_ch1_6, &temp);\
temp = input_data.lane[1].data[3]; \
write_pipe_block(data_ch1_7, &temp);\
temp = input_data.lane[2].data[0]; \
write_pipe_block(data_ch1_8, &temp);\
temp = input_data.lane[2].data[1]; \
write_pipe_block(data_ch1_9, &temp);\
temp = input_data.lane[2].data[2]; \
write_pipe_block(data_ch1_10, &temp);\
temp = input_data.lane[2].data[3]; \
write_pipe_block(data_ch1_11, &temp);\
temp = input_data.lane[3].data[0]; \
write_pipe_block(data_ch1_12, &temp);\
temp = input_data.lane[3].data[1]; \
write_pipe_block(data_ch1_13, &temp);\
temp = input_data.lane[3].data[2]; \
write_pipe_block(data_ch1_14, &temp);\
temp = input_data.lane[3].data[3]; \
write_pipe_block(data_ch1_15, &temp);}
/* Blocking read of one channel_vec from channel group 1.
   For each index 0..15 a float is read from pipe data_ch1_<l*4+v> with
   read_pipe_block and stored into input_data.lane[l].data[v].
   Expands to a brace-enclosed block that declares a local `temp`; the
   argument is substituted textually 16 times and must be an assignable
   channel_vec expression. */
#define data_ch1_read_pipe_block(input_data) {float temp;\
read_pipe_block(data_ch1_0, &temp);\
input_data.lane[0].data[0] = temp; \
read_pipe_block(data_ch1_1, &temp);\
input_data.lane[0].data[1] = temp; \
read_pipe_block(data_ch1_2, &temp);\
input_data.lane[0].data[2] = temp; \
read_pipe_block(data_ch1_3, &temp);\
input_data.lane[0].data[3] = temp; \
read_pipe_block(data_ch1_4, &temp);\
input_data.lane[1].data[0] = temp; \
read_pipe_block(data_ch1_5, &temp);\
input_data.lane[1].data[1] = temp; \
read_pipe_block(data_ch1_6, &temp);\
input_data.lane[1].data[2] = temp; \
read_pipe_block(data_ch1_7, &temp);\
input_data.lane[1].data[3] = temp; \
read_pipe_block(data_ch1_8, &temp);\
input_data.lane[2].data[0] = temp; \
read_pipe_block(data_ch1_9, &temp);\
input_data.lane[2].data[1] = temp; \
read_pipe_block(data_ch1_10, &temp);\
input_data.lane[2].data[2] = temp; \
read_pipe_block(data_ch1_11, &temp);\
input_data.lane[2].data[3] = temp; \
read_pipe_block(data_ch1_12, &temp);\
input_data.lane[3].data[0] = temp; \
read_pipe_block(data_ch1_13, &temp);\
input_data.lane[3].data[1] = temp; \
read_pipe_block(data_ch1_14, &temp);\
input_data.lane[3].data[2] = temp; \
read_pipe_block(data_ch1_15, &temp);\
input_data.lane[3].data[3] = temp;}
/* Blocking write of one channel_vec to channel group 2.
   Element input_data.lane[l].data[v] is copied to a local float and written
   to pipe data_ch2_<l*4+v> with write_pipe_block, in index order 0..15.
   Expands to a brace-enclosed block that declares a local `temp`; the
   argument is substituted textually 16 times, so pass an lvalue expression
   without side effects. */
#define data_ch2_write_pipe_block(input_data) {float temp;\
temp = input_data.lane[0].data[0]; \
write_pipe_block(data_ch2_0, &temp);\
temp = input_data.lane[0].data[1]; \
write_pipe_block(data_ch2_1, &temp);\
temp = input_data.lane[0].data[2]; \
write_pipe_block(data_ch2_2, &temp);\
temp = input_data.lane[0].data[3]; \
write_pipe_block(data_ch2_3, &temp);\
temp = input_data.lane[1].data[0]; \
write_pipe_block(data_ch2_4, &temp);\
temp = input_data.lane[1].data[1]; \
write_pipe_block(data_ch2_5, &temp);\
temp = input_data.lane[1].data[2]; \
write_pipe_block(data_ch2_6, &temp);\
temp = input_data.lane[1].data[3]; \
write_pipe_block(data_ch2_7, &temp);\
temp = input_data.lane[2].data[0]; \
write_pipe_block(data_ch2_8, &temp);\
temp = input_data.lane[2].data[1]; \
write_pipe_block(data_ch2_9, &temp);\
temp = input_data.lane[2].data[2]; \
write_pipe_block(data_ch2_10, &temp);\
temp = input_data.lane[2].data[3]; \
write_pipe_block(data_ch2_11, &temp);\
temp = input_data.lane[3].data[0]; \
write_pipe_block(data_ch2_12, &temp);\
temp = input_data.lane[3].data[1]; \
write_pipe_block(data_ch2_13, &temp);\
temp = input_data.lane[3].data[2]; \
write_pipe_block(data_ch2_14, &temp);\
temp = input_data.lane[3].data[3]; \
write_pipe_block(data_ch2_15, &temp);}
/* Blocking read of one channel_vec from channel group 2.
   For each index 0..15 a float is read from pipe data_ch2_<l*4+v> with
   read_pipe_block and stored into input_data.lane[l].data[v].
   Expands to a brace-enclosed block that declares a local `temp`; the
   argument is substituted textually 16 times and must be an assignable
   channel_vec expression. */
#define data_ch2_read_pipe_block(input_data) {float temp;\
read_pipe_block(data_ch2_0, &temp);\
input_data.lane[0].data[0] = temp; \
read_pipe_block(data_ch2_1, &temp);\
input_data.lane[0].data[1] = temp; \
read_pipe_block(data_ch2_2, &temp);\
input_data.lane[0].data[2] = temp; \
read_pipe_block(data_ch2_3, &temp);\
input_data.lane[0].data[3] = temp; \
read_pipe_block(data_ch2_4, &temp);\
input_data.lane[1].data[0] = temp; \
read_pipe_block(data_ch2_5, &temp);\
input_data.lane[1].data[1] = temp; \
read_pipe_block(data_ch2_6, &temp);\
input_data.lane[1].data[2] = temp; \
read_pipe_block(data_ch2_7, &temp);\
input_data.lane[1].data[3] = temp; \
read_pipe_block(data_ch2_8, &temp);\
input_data.lane[2].data[0] = temp; \
read_pipe_block(data_ch2_9, &temp);\
input_data.lane[2].data[1] = temp; \
read_pipe_block(data_ch2_10, &temp);\
input_data.lane[2].data[2] = temp; \
read_pipe_block(data_ch2_11, &temp);\
input_data.lane[2].data[3] = temp; \
read_pipe_block(data_ch2_12, &temp);\
input_data.lane[3].data[0] = temp; \
read_pipe_block(data_ch2_13, &temp);\
input_data.lane[3].data[1] = temp; \
read_pipe_block(data_ch2_14, &temp);\
input_data.lane[3].data[2] = temp; \
read_pipe_block(data_ch2_15, &temp);\
input_data.lane[3].data[3] = temp;}
#endif
  • Python 脚本
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109

import sys
def _pipe_name(channel, index):
    """Return the identifier of pipe number `index` in channel group `channel`."""
    return 'data_ch' + str(channel) + '_' + str(index)


def _write_macro(channel, lane, vec_num):
    """Return the #define that writes a channel_vec into one pipe group.

    Each element lane[i].data[j] is copied to a temporary and written to pipe
    index i * vec_num + j; the last statement closes the macro block.
    """
    last = lane * vec_num - 1
    parts = ['#define data_ch' + str(channel) + '_write_pipe_block(input_data) ',
             '{float temp;\\\n']
    for i in range(lane):
        for j in range(vec_num):
            index = i * vec_num + j
            tail = ', &temp);}\n' if index == last else ', &temp);\\\n'
            parts.append(' temp = input_data.lane[' + str(i) + '].data[' + str(j) + ']; \\\n')
            parts.append(' write_pipe_block(' + _pipe_name(channel, index) + tail)
    return ''.join(parts)


def _read_macro(channel, lane, vec_num):
    """Return the #define that reads a channel_vec out of one pipe group.

    Pipe index i * vec_num + j is read into a temporary and stored in
    lane[i].data[j]; the last statement closes the macro block.
    """
    last = lane * vec_num - 1
    parts = ['#define data_ch' + str(channel) + '_read_pipe_block(input_data) ',
             '{float temp;\\\n']
    for i in range(lane):
        for j in range(vec_num):
            index = i * vec_num + j
            tail = '] = temp;} \n' if index == last else '] = temp; \\\n'
            parts.append(' read_pipe_block(' + _pipe_name(channel, index) + ', &temp);\\\n')
            parts.append(' input_data.lane[' + str(i) + '].data[' + str(j) + tail)
    return ''.join(parts)


def generate_pipe_header(lane, vec_num, channels=3, depth=32):
    """Build the text of the generated pipe header.

    The header contains the lane_data / channel_vec typedefs, one scalar float
    pipe per (channel, element) pair and a write/read macro pair per channel.

    Args:
        lane: number of lanes in a channel_vec (must be >= 1).
        vec_num: number of floats per lane (must be >= 1).
        channels: number of channel groups to generate (default 3).
        depth: value emitted for xcl_reqd_pipe_depth (default 32).

    Returns:
        The complete header as a string.

    Raises:
        ValueError: if lane or vec_num is smaller than 1. A zero-sized
            configuration would otherwise produce unterminated macros.
    """
    if lane < 1 or vec_num < 1:
        raise ValueError('lane_num and vec_size must both be >= 1')

    total = lane * vec_num
    parts = ['#ifndef _PIPE_H\n', '#define _PIPE_H\n']
    parts.append('typedef struct {\n' +
                 ' float data[' + str(vec_num) + '];\n' +
                 '} lane_data;\n')
    parts.append('typedef struct {\n' +
                 ' lane_data lane[' + str(lane) + '];\n' +
                 '} channel_vec;\n')

    # Pipes are emitted element by element, with all channels of one element
    # grouped together.
    for index in range(total):
        for channel in range(channels):
            parts.append('pipe float ' + _pipe_name(channel, index) +
                         ' __attribute__((xcl_reqd_pipe_depth(' + str(depth) + ')));\n')

    for channel in range(channels):
        parts.append(_write_macro(channel, lane, vec_num))
        parts.append(_read_macro(channel, lane, vec_num))

    parts.append('#endif\n')
    return ''.join(parts)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        # Parenthesised single-argument print works on Python 2 and 3.
        print('Usage: python [lane_num] [vec_size]')
        sys.exit(1)
    try:
        header_text = generate_pipe_header(int(sys.argv[1]), int(sys.argv[2]))
    except ValueError as exc:
        print('Error: ' + str(exc))
        sys.exit(1)
    # The context manager closes the file even if the write fails.
    with open('pipe.cl', 'w') as fd:
        fd.write(header_text)

实验结果

  • 非结构体类型Pipe硬件仿真波形图

  • 结构体类型Pipe硬件仿真波形图

参考

xilinx github SDAccel_Examples/getting_started/dataflow/
ug1253 SDx Pragma Reference Guide 2017.2
ug1207 SDAccel Environment Optimization Guide

-------------本文结束 感谢您的阅读-------------
0%