循环优化之最佳循环(Perfect Loop)

前言

前面我们介绍过了循环优化之融合篇现在我们再从另一个角度进行loop编程风格的优化。在SDAccel中,我们推荐的loop嵌套的形式是perfect loop(下图列出不同的loop循环结构)。对于非perfect loop的形式,我们可以采用一些方法将其优化为 perfect loop 的形式。同样以最近邻的程序,我们进行进一步的探讨。

nearest_bad

源码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121

/*******************************************************************************
SDx Key Concept :

This is a nearest neighbor of a point example showcases how making a loop
nest perfect or semi-perfect can help improve performance.
*******************************************************************************/

/*
Kernel Description : [Good case]

Finding the nearest neighbor of a point from a given set of points (of up to
MAX_SIZE points). This is the bad version of the kernel where the loop nest
is neither perfect nor semi-perfect. Hence automatic flattening of the loop
nest does not happen and performance drops.

Note : This is the bad version of the kernel. The good version is present in
nearest_good.cl

Arguments :

int *in (input ) --> Input Points Array - represented as integer
int *point (input ) --> Current Point for which the nearest neighbor
is found
int *out (output) --> Output Point
int size (input ) --> Size of the input array
int dim (input ) --> #Dimensions of the points
Kernel Configuration :

MAX_DIM - #Dimensions of the input points can be up to MAX_DIM
MAX_SIZE - Size of the input array can be up to MAX_SIZE
*/

// Compute distances using unsigned long
// and to avoid square root operation.
// Maximum possible distance between two points
#define INF_DIST ULONG_MAX

#define SQUARE(x) ((x)*(x))

// Maximum #Dimensions for a point
#define MAX_DIM 16

// Maximum size of point array
#define MAX_SIZE 16384

__kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void nearest_bad(
const __global int *in, // Input Points Array
const __global int *point, // Current Point
__global int *out, // Output Point
int size, // Size of the input array
int dim // #Dimensions of the points
)
{
// Local memory to store input and output matrices
// Local memory is implemented as BRAM memory blocks

// Holds the input array of points
int in_local[MAX_SIZE][MAX_DIM] __attribute__((xcl_array_partition(complete, 2)));

// Holds the point for which the nearest neighbor is to be found
int point_local[MAX_DIM] __attribute__((xcl_array_partition(complete, 1)));

// Holds the current nearest point
int point_nearest[MAX_DIM] __attribute__((xcl_array_partition(complete, 1)));

// min_dist holds the minimum distance till now
unsigned long min_dist = INF_DIST;

// curr_dist holds the value of distance between point_local and
// the current point
unsigned long curr_dist;

// Burst reads on input from global memory, Points are read as
// an array of integers and saved to in_local.
__attribute__((xcl_pipeline_loop))
readInput: for(int itr = 0, i = 0, j = 0; itr < size*dim; itr++, j++){
if(j == dim) { j = 0; i++;}
in_local[i][j] = in[itr];
}

// Burst reads the point for which nearest neighbor is to be found
__attribute__((xcl_pipeline_loop))
readCurrPt: for(int i = 0; i < dim; i++){
point_local[i] = point[i];
}

// Find the nearest neighbor

// nearest1 loop goes over all the points
// nearest2 loop finds the distance between point_local and the current
// point. Based on this the minimum distance and the closest neighbor
// are updated.

// In nearest2 loop, there are specific conditions like if(j==0).
// This is for enabling loop flatten to improve performance.
nearest1: for(int i = 0; i < size; i++) {
curr_dist = 0;

__attribute__((xcl_pipeline_loop))
nearest2: for(int j = 0; j < dim; j++) {
curr_dist += SQUARE(point_local[j] - in_local[i][j]);
}

if(curr_dist < min_dist) {
min_dist = curr_dist;

__attribute__((opencl_unroll_hint))
nearest3: for(int k = 0; k < MAX_DIM; k++) {
point_nearest[k] = in_local[i][k];
}
}
}

// Burst writes the nearest neighbor to out
__attribute__((xcl_pipeline_loop))
wirteOuput: for(int i = 0; i < dim; i++) {
out[i] = point_nearest[i];
}
}

nearest_good

源码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121


/*******************************************************************************
SDx Key Concept :

This is a nearest neighbor of a point example showcases how making a loop
nest perfect or semi-perfect can help improve performance.
*******************************************************************************/

/*
Kernel Description : [Good case]

Finding the nearest neighbor of a point from a given set of points (of up to
MAX_SIZE points). This example showcases how making a loop nest perfect or
semi perfect can help improve performance.

Note : This is the good version of the kernel. The bad version is present in
nearest_bad.cl

Arguments :

int *in (input ) --> Input Points Array - represented as integer
int *point (input ) --> Current Point for which the nearest neighbor
is found
int *out (output) --> Output Point
int size (input ) --> Size of the input array
int dim (input ) --> #Dimensions of the points
Kernel Configuration :

MAX_DIM - #Dimensions of the input points can be up to MAX_DIM
MAX_SIZE - Size of the input array can be up to MAX_SIZE
*/

#include <limits.h>

// Compute distances using unsigned long
// and to avoid square root operation.
// Maximum possible distance between two points
#define INF_DIST ULONG_MAX

#define SQUARE(x) ((x)*(x))

// Maximum #Dimensions for a point
#define MAX_DIM 16

// Maximum size of point array
#define MAX_SIZE 16384

__kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void nearest_good(
const __global int *in, // Input Points Array
const __global int *point, // Current Point
__global int *out, // Output Point
int size, // Size of the input array
int dim // #Dimensions of the points
)
{
// Local memory to store input and output matrices
// Local memory is implemented as BRAM memory blocks

// Holds the input array of points
int in_local[MAX_SIZE][MAX_DIM] __attribute__((xcl_array_partition(complete, 2)));

// Holds the point for which the nearest neighbor is to be found
int point_local[MAX_DIM] __attribute__((xcl_array_partition(complete, 1)));

// Holds the current nearest point
int point_nearest[MAX_DIM] __attribute__((xcl_array_partition(complete, 1)));

// min_dist holds the minimum distance till now
unsigned long min_dist = INF_DIST;

// curr_dist holds the value of distance between point_local and
// the current point
unsigned long curr_dist;

// Burst reads on input from global memory, Points are read as
// an array of integers and saved to in_local.
__attribute__((xcl_pipeline_loop))
readInput: for(int itr = 0, i = 0, j = 0; itr < size*dim; itr++, j++){
if(j == dim) { j = 0; i++;}
in_local[i][j] = in[itr];
}

// Burst reads the point for which nearest neighbor is to be found
__attribute__((xcl_pipeline_loop))
readCurrPt: for(int i = 0; i < dim; i++){
point_local[i] = point[i];
}

// Find the nearest neighbor

// nearest1 loop goes over all the points
// nearest2 loop finds the distance between point_local and the current
// point. Based on this the minimum distance and the closest neighbor
// are updated.

// In nearest2 loop, there are specific conditions like if(j==0).
// This is for enabling loop flatten to improve performance.
nearest1: for(int i = 0; i < size; i++) {
__attribute__((xcl_pipeline_loop))
nearest2: for(int j = 0; j < dim; j++) {
if(j == 0) curr_dist = 0;

curr_dist += SQUARE(point_local[j] - in_local[i][j]);

if(j == dim-1 && curr_dist < min_dist) {
min_dist = curr_dist;
nearest3: for(int k = 0; k < MAX_DIM; k++) {
point_nearest[k] = in_local[i][k];
}
}
}
}

// Burst writes the nearest neighbor to out
__attribute__((xcl_pipeline_loop))
wirteOuput: for(int i = 0; i < dim; i++) {
out[i] = point_nearest[i];
}
}

对比分析

硬件仿真结果

performence分析

  • nearest_bad

  • nearest_good

资源占用分析

  • nearest_bad

  • nearest_good

参考代码风格

下图基于此例子总结展示了如何将并列的for循环格式转换成为最佳的嵌套for循环。

参考

xilinx github SDAccel_Examples/getting_started

HLS视频教程18:FOR循环优化 — 嵌套的FOR循环

-------------本文结束 感谢您的阅读-------------
0%