SDAccel Dataflow 那点事儿（二）

dataflow可谓是FPGA性能体现的绝佳表现方式，数据在硬件中能流动起来，靠的就是dataflow!
下面我们将探究SDAccel OpenCL开发方式下dataflow的两种实现方式：pipe传输方式与function传输方式，
以及SDAccel HLS开发方式下dataflow的两种编程风格：loop与stream。
SDAccel Dataflow 那点事儿（一）
SDAccel Dataflow 那点事儿（二）
SDAccel Dataflow 那点事儿（三）
SDAccel Dataflow 那点事儿（四）

dataflow pipe 方式简介

对于OpenCL来说，Dataflow除了可以在function与function之间进行之外，还可以在Kernel与Kernel之间进行，后者就是通过pipe来传输数据。pipe的传输机制本质上是FIFO，因此在使用pipe时需要设置FIFO的深度，也就是传输过程中最多可以缓存多少个数据元素。

代码实现

实现示意图


PIPE Memory based Adder will be implemented as below:
                 _____________
                |             |<----- Input Vector from Global Memory
                |  read_input |       __
                |_____________|----->|  |
                 _____________       |  | p0
                |             |<-----|__|
                | compute_add |       __
                |_____________|----->|  |
                 ______________      |  | p1
                |              |<----|__|
                | write_result |
                |______________|-----> Output result to Global Memory

host端代码


/*******************************************************************************
Description: SDx Vector Addition using Blocking Pipes Operation
*******************************************************************************/

// Constant increment: the host adds it to every input element to build the
// software reference result and passes it to the adder_stage kernel as `inc`.
#define INCR_VALUE 10

#include <stdio.h>

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <new>
#include <vector>

//OpenCL utility layer include
#include "xcl.h"
#include "oclHelper.h"


// Evaluates an OpenCL call once; if it does not return CL_SUCCESS, prints the
// failing expression together with its error string and terminates the
// process with EXIT_FAILURE.
//
// The expansion deliberately has no trailing semicolon so that
// `OCL_CHECK(x);` behaves like a single statement (safe inside an unbraced
// if/else). The status variable has a macro-private name so that an argument
// mentioning a caller variable called `err` is not shadowed.
#define OCL_CHECK(call)                                                        \
  do {                                                                         \
    cl_int ocl_check_status_ = (call);                                         \
    if (ocl_check_status_ != CL_SUCCESS) {                                     \
      printf("Error calling " #call ", error: %s\n",                           \
             oclErrorCode(ocl_check_status_));                                 \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

/**
 * Minimal standard-library allocator that hands out storage aligned to a
 * 4096-byte boundary via posix_memalign().
 *
 * NOTE(review): the 4096-byte alignment is presumably chosen so the vectors
 * can back CL_MEM_USE_HOST_PTR buffers without an extra copy -- confirm
 * against the platform documentation.
 *
 * The allocator is stateless: any two instances compare equal and memory
 * obtained from one may be released through another.
 */
template <typename T>
struct aligned_allocator
{
  using value_type = T;

  aligned_allocator() noexcept = default;

  // Converting constructor required so containers can rebind the allocator
  // to another element type.
  template <typename U>
  aligned_allocator(const aligned_allocator<U>&) noexcept {}

  /**
   * Allocates uninitialised storage for `num` objects of type T.
   * @param num number of elements requested
   * @return pointer to 4096-byte aligned storage
   * @throws std::bad_alloc if the byte count would overflow std::size_t or
   *         posix_memalign() fails
   */
  T* allocate(std::size_t num)
  {
    // Guard the multiplication below: a wrapped-around byte count would
    // silently yield a buffer that is far too small.
    if (num > static_cast<std::size_t>(-1) / sizeof(T))
      throw std::bad_alloc();
    void* ptr = nullptr;
    if (posix_memalign(&ptr,4096,num*sizeof(T)))
      throw std::bad_alloc();
    return static_cast<T*>(ptr);
  }

  /**
   * Releases storage previously returned by allocate().
   * @param p pointer obtained from allocate(); the element count is unused
   */
  void deallocate(T* p, std::size_t /*num*/) noexcept
  {
    free(p);
  }
};

// Stateless allocators are always interchangeable.
template <typename T, typename U>
bool operator==(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
  return true;
}

template <typename T, typename U>
bool operator!=(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
  return false;
}

int main(int argc, char** argv)
{
    size_t data_size = 1024*1024;

    /* Reducing the data size for emulation mode * /
    char * xcl_mode = getenv("XCL_EMULATION_MODE");
    if (xcl_mode != NULL){
        data_size = 1024;
    }

    //Allocate Memory in Host Memory
    size_t vector_size_bytes = sizeof(int) * data_size;
    std::vector<int,aligned_allocator<int>> source_input     (data_size);
    std::vector<int,aligned_allocator<int>> source_hw_results(data_size);
    std::vector<int,aligned_allocator<int>> source_sw_results(data_size);

    // Create the test data and Software Result
    for(size_t i = 0 ; i < data_size; i++){
        source_input[i] = i;
        source_sw_results[i] = i + INCR_VALUE;
        source_hw_results[i] = 0;
    }

//OPENCL HOST CODE AREA START
    //Create Program and Kernels.
    xcl_world world = xcl_world_single();
    cl_program program = xcl_import_binary(world,"adder");
    cl_kernel krnl_adder_stage   = xcl_get_kernel(program, "adder_stage");
    //Creating additional Kernels
    cl_kernel krnl_input_stage   = xcl_get_kernel(program, "input_stage");
    cl_kernel krnl_output_stage  = xcl_get_kernel(program, "output_stage");


    // By-default xcl_world_single create command queues with sequential command.
    // For this example, user to replace command queue with out of order command queue
    clReleaseCommandQueue(world.command_queue);
    int err;
    world.command_queue = clCreateCommandQueue(world.context, world.device_id,
            CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE,
            &err);
    if (err != CL_SUCCESS){
        std::cout << "Error: Failed to create a command queue!" << std::endl;
        std::cout << "Test failed" << std::endl;
        return EXIT_FAILURE;
    }


    //Allocate Buffer in Global Memory
    cl_mem buffer_output = xcl_malloc(world, CL_MEM_WRITE_ONLY, vector_size_bytes);
    cl_mem buffer_input = clCreateBuffer(world.context,
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
            vector_size_bytes, source_input.data(), NULL);

    cl_event write_event;
    // Using clEnqueueMigrateMemObjects() instead of clEnqueueWriteBuffer() to avoid
    // deadlock in real hardware which can be noticed only for large dataset.
    // Rootcause: design leads to a deadlock when host->DDR and
    // output_stage->DDR causes a contention and deadlock. In small dataset, the
    // data gets transferred from host-> DDR in 1 burst and hence no deadlock.
    // Solution: Start output_stage when host->DDR data transfer is completed.
    // clEnqueueMigrateMemObject() event is used for all three kernels to avoid deadlock.

    //Copy input data to device global memory
    OCL_CHECK(clEnqueueMigrateMemObjects(world.command_queue,1, &buffer_input,
                0 /* flags, 0 means from host*/,0, NULL,&write_event));

    //Wait
    clFinish(world.command_queue);

    int inc = INCR_VALUE;
    int size = data_size;
    //Set the Kernel Arguments
    xcl_set_kernel_arg(krnl_input_stage,0,sizeof(cl_mem),&buffer_input);
    xcl_set_kernel_arg(krnl_input_stage,1,sizeof(int),&size);
    xcl_set_kernel_arg(krnl_adder_stage,0,sizeof(int),&inc);
    xcl_set_kernel_arg(krnl_adder_stage,1,sizeof(int),&size);
    xcl_set_kernel_arg(krnl_output_stage,0,sizeof(cl_mem),&buffer_output);
    xcl_set_kernel_arg(krnl_output_stage,1,sizeof(int),&size);

    //Launch the Kernel
    OCL_CHECK(clEnqueueTask(world.command_queue,krnl_input_stage, 1, &write_event, NULL));
    OCL_CHECK(clEnqueueTask(world.command_queue,krnl_adder_stage, 1, &write_event, NULL));
    OCL_CHECK(clEnqueueTask(world.command_queue,krnl_output_stage,1, &write_event, NULL));

    //wait for all kernels to finish their operations
    clFinish(world.command_queue);

    //Copy Result from Device Global Memory to Host Local Memory
    xcl_memcpy_from_device(world, source_hw_results.data(), buffer_output,vector_size_bytes);

    //Release Device Memories and Kernels
    clReleaseMemObject(buffer_input);
    clReleaseMemObject(buffer_output);
    clReleaseKernel(krnl_input_stage);
    clReleaseKernel(krnl_adder_stage);
    clReleaseKernel(krnl_output_stage);
    clReleaseProgram(program);
    xcl_release_world(world);
//OPENCL HOST CODE AREA END

    // Compare the results of the Device to the simulation
    int match = 0;
    for (size_t i = 0 ; i < data_size; i++){
        if (source_hw_results[i] != source_sw_results[i]){
            std::cout << "Error: Result mismatch" << std::endl;
            std::cout << "i = " << i << " CPU result = " << source_sw_results[i]
                << " Device result = " << source_hw_results[i] << std::endl;
            match = 1;
            break;
        }
    }

    std::cout << "TEST " << (match ? "FAILED" : "PASSED") << std::endl;
    return (match ? EXIT_FAILURE :  EXIT_SUCCESS);
}

device端代码



// Two pipes connect the three kernels of this file:
//   p0 is written by input_stage and read by adder_stage,
//   p1 is written by adder_stage and read by output_stage.
//Declaring PIPE memory with Depth 32
pipe int p0 __attribute__((xcl_reqd_pipe_depth(32)));
pipe int p1 __attribute__((xcl_reqd_pipe_depth(32)));
//  In Kernel code, the xcl_reqd_pipe_depth attribute is used to set the PIPE
//  memory depth to 32. Depth 32 means that the PIPE memory can hold at most 32
//  elements at a given time. If the PIPE memory is full, any blocking write
//  command waits until some other kernel reads an element from the PIPE
//  memory. Similarly, if the PIPE memory is empty (no element in memory), any
//  blocking read command on it waits until some other kernel writes an
//  element to the PIPE memory.
//  This blocking read and write behaviour lets the designer synchronize the
//  data across multiple kernels.


// Input Stage Kernel : Read Data from Global Memory and write into Pipe P0
//
//   input : global-memory buffer the values are taken from
//   size  : number of int elements forwarded (input[0] .. input[size-1])
//
// Declared with reqd_work_group_size(1, 1, 1).
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void input_stage(__global int *input, int size)
{
    // The xcl_pipeline_loop attribute is attached to the copy loop below.
    __attribute__((xcl_pipeline_loop))
    mem_rd: for (int i = 0 ; i < size ; i++)
    {
        //blocking Write command to pipe P0
        write_pipe_block(p0, &input[i]);
    }
}

// Adder Stage Kernel: Read Input data from Pipe P0 and write the result
// into Pipe P1
//
//   inc  : value added to every element
//   size : number of elements to process; one element is read from P0 and one
//          is written to P1 per loop iteration
//
// Declared with reqd_work_group_size(1, 1, 1).
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void adder_stage(int inc, int size)
{
    // The xcl_pipeline_loop attribute is attached to the processing loop below.
    __attribute__((xcl_pipeline_loop))
    execute: for(int i = 0 ; i < size ;  i++)
    {
        int input_data, output_data;
        //blocking read command to Pipe P0
        read_pipe_block(p0, &input_data);
        output_data = input_data + inc;
        //blocking write command to Pipe P1
        write_pipe_block(p1, &output_data);
    }
}


// Output Stage Kernel: Read result from Pipe P1 and write the result to Global
// Memory
//
//   output : global-memory buffer that receives the results
//   size   : number of int elements stored (output[0] .. output[size-1])
//
// Declared with reqd_work_group_size(1, 1, 1).
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void output_stage(__global int *output, int size)
{
    // The xcl_pipeline_loop attribute is attached to the store loop below.
    __attribute__((xcl_pipeline_loop))
    mem_wr: for (int i = 0 ; i < size ; i++)
    {
        //blocking read command to Pipe P1
        read_pipe_block(p1, &output[i]);
    }
}

关于Pipe不支持结构体的解决办法

在SDAccel 2017.2版本上测试发现，Pipe方式的Dataflow不支持结构体类型，这给需要以结构体形式传输向量化数据的场景带来了一定的困难。针对这一问题，我们采用Python脚本生成宏定义代码的方式，把结构体的每个成员拆分到单独的标量pipe中传输，从而间接实现结构体的Pipe传输。

结构体类型的向量加法


#include "pipe.cl"

// Loop bounds used by add_stage: LANE lanes with VEC_SIZE values each.
// NOTE(review): these must agree with the array sizes of channel_vec and
// lane_data declared in pipe.cl -- keep them in sync when regenerating it.
#define VEC_SIZE       4
#define LANE		   4
//Input Stage Kernel : Read Data (channel_vec Type) from Global Memory and write into Pipe
//
//   input : global-memory buffer of channel_vec elements
//   size  : number of channel_vec elements forwarded
//
// Each element is pushed through the data_ch0_write_pipe_block macro, which
// comes from pipe.cl.
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void input_stage(__global channel_vec *input, int size) {
	__attribute__((xcl_pipeline_loop))
	for (int k = 0; k < size; k++) {
	    data_ch0_write_pipe_block(input[k]);
	}
}

// Adder Stage Kernel: reads one channel_vec per iteration through the
// data_ch0 macros, adds `inc` to each of its LANE x VEC_SIZE values and sends
// the result on through the data_ch2 macros.
//
//   inc  : value added to every element
//   size : number of channel_vec elements to process
//
// The data_ch1 macros are not used by this kernel.
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void add_stage(int inc, int size) {
	channel_vec input,output;
	__attribute__((xcl_pipeline_loop))
	for(int i = 0; i < size; i++){
		// Blocking read of a whole channel_vec from channel 0.
		data_ch0_read_pipe_block(input);
		for(unsigned char ll=0; ll<LANE; ll++){
			for(unsigned char vv=0; vv<VEC_SIZE; vv++){
				output.lane[ll].data[vv] = input.lane[ll].data[vv] + inc;
			}
		}
		// Blocking write of the whole channel_vec to channel 2.
		data_ch2_write_pipe_block(output);
	}
}

// Output Stage Kernel: reads channel_vec elements through the data_ch2 macros
// and stores them to global memory.
//
//   output : global-memory buffer of channel_vec elements that receives the results
//   size   : number of channel_vec elements stored
kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void output_stage(__global channel_vec *output, int size) {
	__attribute__((xcl_pipeline_loop))
	for(int i = 0; i < size; i++){
		data_ch2_read_pipe_block(output[i]);
	}
}

宏定义模式


// Struct types carried over the scalar pipes declared below. The include
// guard opened here is closed by the #endif at the end of this header.
#ifndef _PIPE_H
#define _PIPE_H
// One lane: 4 float values.
typedef struct {
	float data[4];
} lane_data;
// One transfer unit: 4 lanes, i.e. 16 float values in total.
typedef struct {
	lane_data lane[4];
} channel_vec;
// Scalar float pipes, each with a requested depth of 32. For every element
// index 0..15 of a channel_vec there is one pipe in each of the three channels
// data_ch0, data_ch1 and data_ch2 (named data_ch<channel>_<index>).
pipe float data_ch0_0 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_0 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_0 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_1 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_1 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_1 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_2 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_2 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_2 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_3 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_3 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_3 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_4 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_4 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_4 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_5 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_5 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_5 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_6 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_6 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_6 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_7 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_7 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_7 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_8 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_8 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_8 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_9 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_9 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_9 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_10 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_10 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_10 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_11 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_11 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_11 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_12 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_12 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_12 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_13 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_13 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_13 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_14 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_14 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_14 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch0_15 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch1_15 __attribute__((xcl_reqd_pipe_depth(32)));
pipe float data_ch2_15 __attribute__((xcl_reqd_pipe_depth(32)));
// Sends one channel_vec over channel 0: element lane[l].data[v] is copied to a
// float temporary and written with a blocking write to pipe data_ch0_<l*4+v>.
// NOTE(review): the argument is expanded 16 times and the expansion is a bare
// brace block, so pass a side-effect-free lvalue and avoid using the macro as
// the body of an unbraced if/else.
#define data_ch0_write_pipe_block(input_data)  {float temp;\
                                           temp = input_data.lane[0].data[0]; \
                                           write_pipe_block(data_ch0_0, &temp);\
                                           temp = input_data.lane[0].data[1]; \
                                           write_pipe_block(data_ch0_1, &temp);\
                                           temp = input_data.lane[0].data[2]; \
                                           write_pipe_block(data_ch0_2, &temp);\
                                           temp = input_data.lane[0].data[3]; \
                                           write_pipe_block(data_ch0_3, &temp);\
                                           temp = input_data.lane[1].data[0]; \
                                           write_pipe_block(data_ch0_4, &temp);\
                                           temp = input_data.lane[1].data[1]; \
                                           write_pipe_block(data_ch0_5, &temp);\
                                           temp = input_data.lane[1].data[2]; \
                                           write_pipe_block(data_ch0_6, &temp);\
                                           temp = input_data.lane[1].data[3]; \
                                           write_pipe_block(data_ch0_7, &temp);\
                                           temp = input_data.lane[2].data[0]; \
                                           write_pipe_block(data_ch0_8, &temp);\
                                           temp = input_data.lane[2].data[1]; \
                                           write_pipe_block(data_ch0_9, &temp);\
                                           temp = input_data.lane[2].data[2]; \
                                           write_pipe_block(data_ch0_10, &temp);\
                                           temp = input_data.lane[2].data[3]; \
                                           write_pipe_block(data_ch0_11, &temp);\
                                           temp = input_data.lane[3].data[0]; \
                                           write_pipe_block(data_ch0_12, &temp);\
                                           temp = input_data.lane[3].data[1]; \
                                           write_pipe_block(data_ch0_13, &temp);\
                                           temp = input_data.lane[3].data[2]; \
                                           write_pipe_block(data_ch0_14, &temp);\
                                           temp = input_data.lane[3].data[3]; \
                                           write_pipe_block(data_ch0_15, &temp);}
// Receives one channel_vec from channel 0: a blocking read from pipe
// data_ch0_<l*4+v> fills a float temporary that is then stored into
// lane[l].data[v] of the argument.
// NOTE(review): the argument is expanded 16 times and the expansion is a bare
// brace block, so pass a side-effect-free lvalue and avoid using the macro as
// the body of an unbraced if/else.
#define data_ch0_read_pipe_block(input_data)  {float temp;\
                                           read_pipe_block(data_ch0_0, &temp);\
                                           input_data.lane[0].data[0] = temp; \
                                           read_pipe_block(data_ch0_1, &temp);\
                                           input_data.lane[0].data[1] = temp; \
                                           read_pipe_block(data_ch0_2, &temp);\
                                           input_data.lane[0].data[2] = temp; \
                                           read_pipe_block(data_ch0_3, &temp);\
                                           input_data.lane[0].data[3] = temp; \
                                           read_pipe_block(data_ch0_4, &temp);\
                                           input_data.lane[1].data[0] = temp; \
                                           read_pipe_block(data_ch0_5, &temp);\
                                           input_data.lane[1].data[1] = temp; \
                                           read_pipe_block(data_ch0_6, &temp);\
                                           input_data.lane[1].data[2] = temp; \
                                           read_pipe_block(data_ch0_7, &temp);\
                                           input_data.lane[1].data[3] = temp; \
                                           read_pipe_block(data_ch0_8, &temp);\
                                           input_data.lane[2].data[0] = temp; \
                                           read_pipe_block(data_ch0_9, &temp);\
                                           input_data.lane[2].data[1] = temp; \
                                           read_pipe_block(data_ch0_10, &temp);\
                                           input_data.lane[2].data[2] = temp; \
                                           read_pipe_block(data_ch0_11, &temp);\
                                           input_data.lane[2].data[3] = temp; \
                                           read_pipe_block(data_ch0_12, &temp);\
                                           input_data.lane[3].data[0] = temp; \
                                           read_pipe_block(data_ch0_13, &temp);\
                                           input_data.lane[3].data[1] = temp; \
                                           read_pipe_block(data_ch0_14, &temp);\
                                           input_data.lane[3].data[2] = temp; \
                                           read_pipe_block(data_ch0_15, &temp);\
                                           input_data.lane[3].data[3] = temp;}
// Sends one channel_vec over channel 1: element lane[l].data[v] is copied to a
// float temporary and written with a blocking write to pipe data_ch1_<l*4+v>.
// NOTE(review): the argument is expanded 16 times and the expansion is a bare
// brace block, so pass a side-effect-free lvalue and avoid using the macro as
// the body of an unbraced if/else.
#define data_ch1_write_pipe_block(input_data)  {float temp;\
                                           temp = input_data.lane[0].data[0]; \
                                           write_pipe_block(data_ch1_0, &temp);\
                                           temp = input_data.lane[0].data[1]; \
                                           write_pipe_block(data_ch1_1, &temp);\
                                           temp = input_data.lane[0].data[2]; \
                                           write_pipe_block(data_ch1_2, &temp);\
                                           temp = input_data.lane[0].data[3]; \
                                           write_pipe_block(data_ch1_3, &temp);\
                                           temp = input_data.lane[1].data[0]; \
                                           write_pipe_block(data_ch1_4, &temp);\
                                           temp = input_data.lane[1].data[1]; \
                                           write_pipe_block(data_ch1_5, &temp);\
                                           temp = input_data.lane[1].data[2]; \
                                           write_pipe_block(data_ch1_6, &temp);\
                                           temp = input_data.lane[1].data[3]; \
                                           write_pipe_block(data_ch1_7, &temp);\
                                           temp = input_data.lane[2].data[0]; \
                                           write_pipe_block(data_ch1_8, &temp);\
                                           temp = input_data.lane[2].data[1]; \
                                           write_pipe_block(data_ch1_9, &temp);\
                                           temp = input_data.lane[2].data[2]; \
                                           write_pipe_block(data_ch1_10, &temp);\
                                           temp = input_data.lane[2].data[3]; \
                                           write_pipe_block(data_ch1_11, &temp);\
                                           temp = input_data.lane[3].data[0]; \
                                           write_pipe_block(data_ch1_12, &temp);\
                                           temp = input_data.lane[3].data[1]; \
                                           write_pipe_block(data_ch1_13, &temp);\
                                           temp = input_data.lane[3].data[2]; \
                                           write_pipe_block(data_ch1_14, &temp);\
                                           temp = input_data.lane[3].data[3]; \
                                           write_pipe_block(data_ch1_15, &temp);}
// Receives one channel_vec from channel 1: a blocking read from pipe
// data_ch1_<l*4+v> fills a float temporary that is then stored into
// lane[l].data[v] of the argument.
// NOTE(review): the argument is expanded 16 times and the expansion is a bare
// brace block, so pass a side-effect-free lvalue and avoid using the macro as
// the body of an unbraced if/else.
#define data_ch1_read_pipe_block(input_data)  {float temp;\
                                           read_pipe_block(data_ch1_0, &temp);\
                                           input_data.lane[0].data[0] = temp; \
                                           read_pipe_block(data_ch1_1, &temp);\
                                           input_data.lane[0].data[1] = temp; \
                                           read_pipe_block(data_ch1_2, &temp);\
                                           input_data.lane[0].data[2] = temp; \
                                           read_pipe_block(data_ch1_3, &temp);\
                                           input_data.lane[0].data[3] = temp; \
                                           read_pipe_block(data_ch1_4, &temp);\
                                           input_data.lane[1].data[0] = temp; \
                                           read_pipe_block(data_ch1_5, &temp);\
                                           input_data.lane[1].data[1] = temp; \
                                           read_pipe_block(data_ch1_6, &temp);\
                                           input_data.lane[1].data[2] = temp; \
                                           read_pipe_block(data_ch1_7, &temp);\
                                           input_data.lane[1].data[3] = temp; \
                                           read_pipe_block(data_ch1_8, &temp);\
                                           input_data.lane[2].data[0] = temp; \
                                           read_pipe_block(data_ch1_9, &temp);\
                                           input_data.lane[2].data[1] = temp; \
                                           read_pipe_block(data_ch1_10, &temp);\
                                           input_data.lane[2].data[2] = temp; \
                                           read_pipe_block(data_ch1_11, &temp);\
                                           input_data.lane[2].data[3] = temp; \
                                           read_pipe_block(data_ch1_12, &temp);\
                                           input_data.lane[3].data[0] = temp; \
                                           read_pipe_block(data_ch1_13, &temp);\
                                           input_data.lane[3].data[1] = temp; \
                                           read_pipe_block(data_ch1_14, &temp);\
                                           input_data.lane[3].data[2] = temp; \
                                           read_pipe_block(data_ch1_15, &temp);\
                                           input_data.lane[3].data[3] = temp;}
// Sends one channel_vec over channel 2: element lane[l].data[v] is copied to a
// float temporary and written with a blocking write to pipe data_ch2_<l*4+v>.
// NOTE(review): the argument is expanded 16 times and the expansion is a bare
// brace block, so pass a side-effect-free lvalue and avoid using the macro as
// the body of an unbraced if/else.
#define data_ch2_write_pipe_block(input_data)  {float temp;\
                                           temp = input_data.lane[0].data[0]; \
                                           write_pipe_block(data_ch2_0, &temp);\
                                           temp = input_data.lane[0].data[1]; \
                                           write_pipe_block(data_ch2_1, &temp);\
                                           temp = input_data.lane[0].data[2]; \
                                           write_pipe_block(data_ch2_2, &temp);\
                                           temp = input_data.lane[0].data[3]; \
                                           write_pipe_block(data_ch2_3, &temp);\
                                           temp = input_data.lane[1].data[0]; \
                                           write_pipe_block(data_ch2_4, &temp);\
                                           temp = input_data.lane[1].data[1]; \
                                           write_pipe_block(data_ch2_5, &temp);\
                                           temp = input_data.lane[1].data[2]; \
                                           write_pipe_block(data_ch2_6, &temp);\
                                           temp = input_data.lane[1].data[3]; \
                                           write_pipe_block(data_ch2_7, &temp);\
                                           temp = input_data.lane[2].data[0]; \
                                           write_pipe_block(data_ch2_8, &temp);\
                                           temp = input_data.lane[2].data[1]; \
                                           write_pipe_block(data_ch2_9, &temp);\
                                           temp = input_data.lane[2].data[2]; \
                                           write_pipe_block(data_ch2_10, &temp);\
                                           temp = input_data.lane[2].data[3]; \
                                           write_pipe_block(data_ch2_11, &temp);\
                                           temp = input_data.lane[3].data[0]; \
                                           write_pipe_block(data_ch2_12, &temp);\
                                           temp = input_data.lane[3].data[1]; \
                                           write_pipe_block(data_ch2_13, &temp);\
                                           temp = input_data.lane[3].data[2]; \
                                           write_pipe_block(data_ch2_14, &temp);\
                                           temp = input_data.lane[3].data[3]; \
                                           write_pipe_block(data_ch2_15, &temp);}
// Receives one channel_vec from channel 2: a blocking read from pipe
// data_ch2_<l*4+v> fills a float temporary that is then stored into
// lane[l].data[v] of the argument.
// NOTE(review): the argument is expanded 16 times and the expansion is a bare
// brace block, so pass a side-effect-free lvalue and avoid using the macro as
// the body of an unbraced if/else.
#define data_ch2_read_pipe_block(input_data)  {float temp;\
                                           read_pipe_block(data_ch2_0, &temp);\
                                           input_data.lane[0].data[0] = temp; \
                                           read_pipe_block(data_ch2_1, &temp);\
                                           input_data.lane[0].data[1] = temp; \
                                           read_pipe_block(data_ch2_2, &temp);\
                                           input_data.lane[0].data[2] = temp; \
                                           read_pipe_block(data_ch2_3, &temp);\
                                           input_data.lane[0].data[3] = temp; \
                                           read_pipe_block(data_ch2_4, &temp);\
                                           input_data.lane[1].data[0] = temp; \
                                           read_pipe_block(data_ch2_5, &temp);\
                                           input_data.lane[1].data[1] = temp; \
                                           read_pipe_block(data_ch2_6, &temp);\
                                           input_data.lane[1].data[2] = temp; \
                                           read_pipe_block(data_ch2_7, &temp);\
                                           input_data.lane[1].data[3] = temp; \
                                           read_pipe_block(data_ch2_8, &temp);\
                                           input_data.lane[2].data[0] = temp; \
                                           read_pipe_block(data_ch2_9, &temp);\
                                           input_data.lane[2].data[1] = temp; \
                                           read_pipe_block(data_ch2_10, &temp);\
                                           input_data.lane[2].data[2] = temp; \
                                           read_pipe_block(data_ch2_11, &temp);\
                                           input_data.lane[2].data[3] = temp; \
                                           read_pipe_block(data_ch2_12, &temp);\
                                           input_data.lane[3].data[0] = temp; \
                                           read_pipe_block(data_ch2_13, &temp);\
                                           input_data.lane[3].data[1] = temp; \
                                           read_pipe_block(data_ch2_14, &temp);\
                                           input_data.lane[3].data[2] = temp; \
                                           read_pipe_block(data_ch2_15, &temp);\
                                           input_data.lane[3].data[3] = temp;}
// Closes the _PIPE_H include guard.
#endif

Python 脚本


import sys

# Depth requested for every generated pipe via xcl_reqd_pipe_depth.
PIPE_DEPTH = 32
# Channel numbers: pipes and macros are emitted for data_ch0, data_ch1, data_ch2.
CHANNELS = (0, 1, 2)
# Whitespace that prefixes every continuation line of a generated macro.
MACRO_INDENT = ' ' * 43
# Backslash + newline: continues a C macro definition onto the next line.
CONTINUE = '\\\n'


def _struct_definitions(lane, vec_num):
    """Return the lane_data / channel_vec typedefs for the given dimensions."""
    return ('typedef struct {\n'
            '\tfloat data[' + str(vec_num) + '];\n'
            '} lane_data;\n'
            'typedef struct {\n'
            '\tlane_data lane[' + str(lane) + '];\n'
            '} channel_vec;\n')


def _pipe_declarations(total):
    """Return one pipe declaration per element index and channel.

    For each index the channels are emitted in the order ch0, ch1, ch2.
    """
    template = 'pipe float data_ch%d_%d __attribute__((xcl_reqd_pipe_depth(%d)));\n'
    return ''.join(template % (channel, index, PIPE_DEPTH)
                   for index in range(total)
                   for channel in CHANNELS)


def _write_macro(channel, lane, vec_num):
    """Return the data_ch<channel>_write_pipe_block macro definition."""
    steps = []
    for i in range(lane):
        for j in range(vec_num):
            steps.append(
                MACRO_INDENT + 'temp = input_data.lane[%d].data[%d]; ' % (i, j) + CONTINUE +
                MACRO_INDENT + 'write_pipe_block(data_ch%d_%d, &temp);' % (channel, i * vec_num + j))
    # Every step but the last ends with a line continuation; the last one
    # closes the brace block of the macro.
    return ('#define data_ch%d_write_pipe_block(input_data)  {float temp;' % channel + CONTINUE +
            CONTINUE.join(steps) + '}\n')


def _read_macro(channel, lane, vec_num):
    """Return the data_ch<channel>_read_pipe_block macro definition."""
    steps = []
    for i in range(lane):
        for j in range(vec_num):
            steps.append(
                MACRO_INDENT + 'read_pipe_block(data_ch%d_%d, &temp);' % (channel, i * vec_num + j) + CONTINUE +
                MACRO_INDENT + 'input_data.lane[%d].data[%d] = temp;' % (i, j))
    # The read macros keep a space before the continuation backslash and after
    # the closing brace, exactly as in the previously generated header.
    return ('#define data_ch%d_read_pipe_block(input_data)  {float temp;' % channel + CONTINUE +
            (' ' + CONTINUE).join(steps) + '} \n')


def generate_pipe_header(lane, vec_num):
    """Build the text of pipe.cl for `lane` lanes of `vec_num` floats each.

    The header contains the struct typedefs, lane * vec_num scalar pipes per
    channel and a blocking read and write macro per channel that move a whole
    channel_vec element by element.

    Raises ValueError if either dimension is not a positive integer, because
    the generated macros would otherwise be left unterminated.
    """
    if lane <= 0 or vec_num <= 0:
        raise ValueError('lane_num and vec_size must be positive integers')
    parts = ['#ifndef _PIPE_H\n' + '#define _PIPE_H\n',
             _struct_definitions(lane, vec_num),
             _pipe_declarations(lane * vec_num)]
    for channel in CHANNELS:
        parts.append(_write_macro(channel, lane, vec_num))
        parts.append(_read_macro(channel, lane, vec_num))
    parts.append('#endif\n')
    return ''.join(parts)


def main(argv):
    """Command-line entry point; writes pipe.cl and returns the exit status."""
    if len(argv) != 3:
        print('Usage: python [lane_num] [vec_size]')
        return 1
    try:
        code_str = generate_pipe_header(int(argv[1]), int(argv[2]))
    except ValueError as exc:
        print('Error: %s' % exc)
        return 1
    # The context manager closes the file even if the write fails.
    with open('pipe.cl', 'w') as fd:
        fd.write(code_str)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))