经典设计结构之移位寄存器OpenCL实现

fir滤波器与移位寄存器的结构匹配度100%

fir滤波器原理

FIR(Finite Impulse Response)滤波器：有限长单位冲激响应滤波器，又称为非递归型滤波器，是数字信号处理系统中最基本的元件，它可以在保证任意幅频特性的同时具有严格的线性相频特性，同时其单位抽样响应是有限长的，因而滤波器是稳定的系统。因此，FIR滤波器在通信、图像处理、模式识别等领域都有着广泛的应用。
在这里我们不过多讨论FIR的原理知识，而是面向计算的优化：我们预先确定一组系数，来实现一个11抽头（即10阶）的FIR滤波器。
计算流程如下（其中 $h[k]$ 为滤波器系数，$x[n]$ 为输入信号，$n-k<0$ 的项按 0 处理）：

$$y[n] = \sum_{k=0}^{10} h[k]\,x[n-k]$$

fir简单实现方式


#define N_COEFF 11

// A naive implementation of the Finite Impulse Response filter.
//
// For every sample index j in [0, signal_length):
//   output[j] = sum over i in [0, min(j, N_COEFF-1)] of signal[j-i] * coeff[i]
// i.e. taps that would reach before the start of the signal are skipped.
// Each output re-reads up to N_COEFF input samples from `signal`.
//
// Arguments:
//   output        - receives signal_length filtered samples
//   signal        - signal_length input samples
//   coeff         - N_COEFF filter coefficients
//   signal_length - number of samples to process
__kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void fir_naive(__global int* restrict output,
               __global int* restrict signal,
               __global int* restrict coeff,
               long signal_length) {

    // Private copy of the coefficients so the inner loop does not index
    // the __global coeff buffer.
    int coeff_reg[N_COEFF];
    read_coef:
    for (int i = 0; i < N_COEFF; i++) {
        coeff_reg[i] = coeff[i];
    }

    // The sample index is `long` to match signal_length: with an `int`
    // counter, a length above INT_MAX would overflow the counter (undefined
    // behaviour) before the loop condition could become false.
    outer_loop:
    for (long j = 0; j < signal_length; j++) {
        int acc = 0;

        // For the first N_COEFF-1 samples only taps 0..j have input data.
        // Explicit clamp instead of min() so no mixed long/int builtin call
        // is needed.
        const int last_tap = (j < N_COEFF - 1) ? (int)j : (N_COEFF - 1);

        shift_loop:
        __attribute__((xcl_pipeline_loop))
        for (int i = last_tap; i >= 0; i--) {
            acc += signal[j - i] * coeff_reg[i];
        }
        output[j] = acc;
    }
}

fir移位寄存器实现方式


// FIR using shift register.
//
// Computes the same result as a direct-form FIR with zero history:
//   output[j] = sum over i in [0, N_COEFF) of x[j-i] * coeff[i],
// where x[k] = signal[k] for k >= 0 and 0 otherwise (shift_reg starts zeroed).
// Each input sample is read from `signal` once; the previous N_COEFF-1
// samples are kept in shift_reg.
//
// Arguments:
//   output        - receives signal_length filtered samples
//   signal        - signal_length input samples
//   coeff         - N_COEFF filter coefficients
//   signal_length - number of samples to process
__kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void fir_shift_register(__global int* restrict output,
                        __global int* restrict signal,
                        __global int* restrict coeff,
                        long signal_length) {
    int coeff_reg[N_COEFF];

    // Partitioning of this array is required because the shift register
    // operation will need access to each of the values of the array in
    // the same clock. Without partitioning the operation will need to
    // be performed over multiple cycles because of the limited memory
    // ports available to the array.
    int shift_reg[N_COEFF] __attribute__((xcl_array_partition(complete, 0)));

    // Zero the sample history and take a private copy of the coefficients.
    init_loop:
    for (int i = 0; i < N_COEFF; i++) {
        shift_reg[i] = 0;
        coeff_reg[i] = coeff[i];
    }

    // The sample index is `long` to match signal_length: with an `int`
    // counter, a length above INT_MAX would overflow the counter (undefined
    // behaviour) before the loop condition could become false.
    outer_loop:
    for (long j = 0; j < signal_length; j++) {
        int acc = 0;
        int x = signal[j];

        // This is the shift register operation. The N_COEFF variable is defined
        // at compile time so the compiler knows the number of operations
        // performed by the loop. This loop does not require the unroll
        // attribute because the outer loop will be automatically pipelined so
        // the compiler will unroll this loop in the process.
        //
        // Iterating from the oldest slot down to slot 0 lets each slot take
        // its neighbour's value before that neighbour is overwritten.
        shift_loop:
        for (int i = N_COEFF-1; i >= 0; i--) {
            if (i == 0) {
                // Newest sample enters at slot 0.
                acc += x * coeff_reg[0];
                shift_reg[0] = x;
            } else {
                // Age the sample by one slot, then accumulate its tap.
                shift_reg[i] = shift_reg[i-1];
                acc += shift_reg[i] * coeff_reg[i];
            }
        }
        output[j] = acc;
    }
}

移位寄存器应用总结

在该例程中，巧妙地利用了移位寄存器的方法，对处理过的数据进行存储并移位，大幅度地减少了与global memory接口的频繁交互。在实现移位寄存器的过程中需要注意：由于涉及数组元素之间的移位，同一个时钟周期内需要访问数组的所有元素，因此一定要将数组完全切分（partition）成寄存器的形式；否则受存储器端口数量的限制，会产生循环间依赖（loop-carried dependency），使得循环的II值变大。

参考

xilinx github SDAccel_Examples/getting_started