循环优化之融合篇（Fusion Loop）

前言

通过最近邻算法的实现，展示在实际的优化中，如何巧妙的将嵌套的for循环进行融合。

嵌套for循环


#define MAX_DIMS 5


 // This is a simple implementation of a linear search algorithm. We use two
 // loops. The outer loop cycles through each of the search space and the inner
 // loop calculates the distance to a particular point.
 // len 指的是有多少已知点
 // dim 指的是每一个点的维度是多少
 // search_point 指的是确定的用于寻找最近邻的固定点（1个点，dim维）
 // point是随机的样本点（len个点，dim维）
 // out是search_point的最近邻点
kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
nearest_neighbor(global int *out, global const int *points,
                      global const int *search_point, const int len,
                      const int dim) {
    int best_i = 0;
    int best_dist = INT_MAX;
    int s_point[MAX_DIMS];

    // 从global memory 搬移到 local memory
    __attribute__((xcl_pipeline_loop))
    for (int d = 0; d < dim; ++d) {
        s_point[d] = search_point[d];
    }

    // 遍历所有点与s_point的最近距离，记录最近距离以及第几个点
    find_best:
    for (int p = 0; p < len; ++p) {
        int dist = 0;

        // Calculate the distance in a n-dimensional space
        __attribute__((xcl_pipeline_loop))
        dist_calc:
        for (int c = 0; c < dim; ++c) {
            int dx = points[dim * p + c] - s_point[c];
            dist += dx * dx;
        }

        if (dist < best_dist) {
            best_i = p;
            best_dist = dist;
        }
    }

    //将最近的距离点写回global memory
    __attribute__((xcl_pipeline_loop))
    write_best:
    for (int c = 0; c < dim; ++c) {
        out[c] = points[best_i * dim + c];
    }
}

嵌套for循环的融合


#define MAX_DIMS 5

// This implementation fuses the distance calculation and the iteration through
// the search space into one loop.
// len 指的是有多少已知点
// dim 指的是每一个点的维度是多少
// search_point 指的是确定的用于寻找最近邻的固定点（1个点，dim维）
// point是随机的样本点（len个点，dim维）
// out是search_point的最近邻点
kernel __attribute__((reqd_work_group_size(1, 1, 1))) void
nearest_neighbor_loop_fusion(global int *out, global const int *points,
                       global const int *search_point, const int len,
                       const int dim) {
    int best_i = 0;
    int best_dist = INT_MAX;
    int s_point[MAX_DIMS];

    __attribute__((xcl_pipeline_loop))
    for (int d = 0; d < dim; ++d) {
        s_point[d] = search_point[d];
    }

    int dist = 0;
    int iterations = len * dim;

    // This loop iterates through each point and through each of the dimension.
    // The combined loop performs the same number of iterations as the previous
    // implementation but this approach give the compiler more opportunity to
    // optimize the operations.
    __attribute__((xcl_pipeline_loop))
    find_best:
    for (int p = 0, c = 0, itr = 0; itr < iterations; itr++) {
        int dx = points[dim * p + c] - s_point[c];
        dist += dx * dx;
        // Defines the end of the dimension calculation(The inner loop in the
        // previous example)
        if (c == dim - 1) {
            if (dist < best_dist) {
                best_i = p;
                best_dist = dist;
            }
            c = 0;
            dist = 0;
            p++;
        } else {
            c++;
        }
    }
    __attribute__((xcl_pipeline_loop))
    write_best:
    for (int c = 0; c < dim; ++c) {
        out[c] = points[best_i * dim + c];
    }
}

实验结果

软件仿真结果

performence分析

nearest_neighbor

nearest_neighbor_loop_fusion

资源占用分析

nearest_neighbor

nearest_neighbor_loop_fusion

总结

对于嵌套的for循环，若是内层的for循环无法进行循环展开，那么我们只能将内层循环进行pipeline，而不能兼顾到外层循环的pipeline状况。编译器对于内层for循环进行pipeline后，会默认的去尝试能否对外层的for循环进行flatten，如果是perfect loop 或者 semi-perfect loop能够成功的flatten，但是对于imperfect loop 则不能。因此，为了避免for循环的嵌套带来的优化问题，一种很好的解决方式就是将for循环进行融合，直接对最外层的for循环进行pipeline处理。这样既能减少Iteration Latency。还能减少FF与LUT资源的利用率。

参考

xilinx github SDAccel_Examples/getting_started