6. TopsGraph 端到端示例

以下提供了一个端到端的 demo,完整地描述了从安装 SDK,构图,编译到执行的全部流程。

6.1. 安装 tops-sdk

# First, obtain and install the released SDK package, for example:
dpkg -i tops-sdk_***.deb
# Once the SDK is installed, the HLIR builder header files are located at:
tree /usr/include/gcu/hlir/builder/
# |-- hlir_builder.h
# |-- hlir_builder_client_ops.h
# |-- hlir_builder_common.h
# |-- hlir_builder_ops.h
# |-- hlir_builder_structs.h
# The shared libraries the demo links against are located at:
ls -l /usr/lib/libdtu_sdk.so
# /usr/lib/libdtu_sdk.so -> libdtu_sdk.so.3
ls -l /opt/tops/lib/libtopsrt.so
# /opt/tops/lib/libtopsrt.so -> libtopsrt.so.1

6.2. CMakeLists 文件

# add_compile_definitions() below was introduced in CMake 3.12, so that is the
# lowest version this script can actually be configured with.
cmake_minimum_required(VERSION 3.12)
project(hlir_builder_demo)
set(CMAKE_CXX_STANDARD 14)

# The demo must be built with the same libstdc++ ABI as libdtu_sdk.so.
# If libdtu_sdk.so is compiled with ABI=0 (default), keep the ABI=0 definition
# enabled and leave the ABI=1 definition commented out.
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0)
# If libdtu_sdk.so is compiled with ABI=1, comment out the ABI=0 definition
# above and uncomment the ABI=1 definition below.
#add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=1)

# Header search paths: HLIR builder / graph compiler headers and the TopsRuntime headers.
include_directories(/usr/include/gcu)
include_directories(/opt/tops/include)
# Collect every source file under ./src into demo_src.
aux_source_directory(${CMAKE_CURRENT_LIST_DIR}/src demo_src)
# Library search paths for libdtu_sdk.so and libtopsrt.so.
link_directories(/usr/lib)
link_directories(/opt/tops/lib)
add_executable(${PROJECT_NAME} ${demo_src})
target_link_libraries(${PROJECT_NAME} -ldtu_sdk -ltopsrt)

6.3. demo 源码

以下 C++ 代码实现了 matmul 功能,构图使用了 TopsGraphBuilder API, 编译使用了 TopsGraphCompiler API, 执行使用了 TopsRuntime API:

#include <cstdint>
#include <iostream>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>

#include "hlir/builder/hlir_builder.h"
#include "tops_graph_compiler/tops_graph_compiler.h"
#include "tops/tops_runtime.h"
#include "tops/tops_ext.h"

int main() {
  // stage 1: build the HLIR Module
  auto builder = std::make_shared<builder::Builder>();
  builder->SetShapeInference(true);
  auto ptype = builder::PrimitiveType::F32();
  std::vector<int64_t> shape = {2, 2};
  builder::Type type(shape, ptype);
  auto arg0 = builder->CreateInput(type);
  auto arg1 = builder->CreateInput(type);
  builder::DotDimensionNumbers dims_attr({}, {}, {1}, {0});
  auto res = builder::DotGeneral(arg0, arg1, dims_attr);
  res.SetAttribute("op_name", builder::Attribute("MatMul"));
  builder->SetOutput({res});
  builder->Dump();
  auto hlir_module = builder->GetModule();

  // stage 2: compile
  topsgraphProgram program;
  auto ret = topsgraphCreateProgramFromModule(&program, hlir_module.get());
  const char * options[] = {
      "-arch=gcu300",
      "-resource=1c12s",
      "-hlir=tops-hlir-pipeline{}"};
  topsgraphCompileProgram(program, 3, options);
  size_t binary_size = 0;
  topsgraphGetBinSize(program, &binary_size);
  char* binary = new char[binary_size];
  ret = topsgraphGetBin(program, binary);

  // stage 3: run
  topsInit(0);
  int device_id = 0;
  topsSetDevice(device_id);
  topsExecutable_t exec;
  topsCreateExecutable(&exec, binary, binary_size);
  delete [] binary;
  topsgraphDestroyProgram(&program);
  topsResource_t resource;
  topsCreateResourceForExecutable(&resource, exec);
  topsStream_t stream;
  topsStreamCreate(&stream);
  std::vector<int *> dev_inputs;
  std::vector<int *> dev_outputs;
  std::vector<float> lhs{0, 1, 2, 3};
  std::vector<float> rhs{1, 2, 3, 4};
  std::vector<void*> data_ptrs;
  data_ptrs.emplace_back(static_cast<void*>(lhs.data()));
  data_ptrs.emplace_back(static_cast<void*>(rhs.data()));
  uint64_t input_count = 0;
  topsExecutableQueryInfo(exec, topsExecutableInfoInputCount, &input_count);
  uint64_t *input_size_list = (uint64_t *)malloc(sizeof(uint64_t)* input_count);
  topsExecutableQueryInfo(exec, topsExecutableInfoInputSizeList,
                          input_size_list);
  for (size_t index = 0; index < input_count; index++) {
    auto input_size = (size_t)input_size_list[index];
    int *input = nullptr;
    topsMallocForResource((void**)&input, input_size, resource);
    topsMemcpyAsync(input, data_ptrs[index], input_size_list[index],
                    topsMemcpyHostToDevice, stream);
    dev_inputs.push_back(input);
  }
  uint64_t output_count = 0;
  topsExecutableQueryInfo(exec, topsExecutableInfoOutputCount, &output_count);
  auto output_size_list = (uint64_t *)malloc(sizeof(uint64_t)* output_count);
  topsExecutableQueryInfo(exec, topsExecutableInfoOutputSizeList,
                          output_size_list);
  for (size_t i = 0; i < output_count; i++) {
    uint64_t output_size = output_size_list[i];
    int *output = nullptr;
    topsMallocForResource((void**)&output, output_size, resource);
    dev_outputs.push_back(output);
  }
  topsLaunchExecutableV2(exec, resource,
                        (void**)dev_inputs.data(), dev_inputs.size(),
                        nullptr, nullptr,
                        (void**)dev_outputs.data(), dev_outputs.size(),
                        stream);
  auto output_rank_list = (uint64_t *)malloc(sizeof(uint64_t)* output_count);
  topsExecutableQueryInfo(exec, topsExecutableInfoOutputRank, output_rank_list);
  uint64_t output_dims_size =
          std::accumulate(output_rank_list, output_rank_list + output_count, 0);
  uint64_t *output_dim_list =
          (uint64_t *)malloc(sizeof(uint64_t) * output_dims_size);
  topsExecutableQueryInfo(exec, topsExecutableInfoOutputDimsList,
                          output_dim_list);
  uint64_t dim_index = 0;
  for (size_t i = 0; i < output_count; i++) {
    uint64_t output_size = output_size_list[i];
    std::vector<uint64_t> shape_v;
    for(size_t j =0; j < output_rank_list[i]; j++) {
      shape_v.push_back(output_dim_list[dim_index++]);
    }
    void *host_output = malloc(output_size);
    topsMemcpyAsync(host_output, dev_outputs[i], output_size,
                    topsMemcpyDeviceToHost, stream);
    topsStreamSynchronize(stream);
    float* output_data = static_cast<float*>(host_output);
    std::cout << "output data: ";
    for (int j = 0; j < 4; ++j) {
      std::cout << output_data[j] << ", ";
    }
    std::cout << std::endl;
    free(host_output);
  }
  for (auto dev_input : dev_inputs) {
    topsFree(dev_input);
  }
  for (auto dev_output : dev_outputs) {
    topsFree(dev_output);
  }
  topsStreamDestroy(stream);
  topsDestroyResource(resource);
  topsDestroyExecutable(exec);
  return 0;
}

6.4. 源码编译

假设源码目录结构如下所示:

demo
|-- CMakeLists.txt
`-- src
    `-- demo.cpp

执行以下命令进行编译:

# Run from the demo directory (the one that contains CMakeLists.txt).
# Create a separate build directory so generated files stay out of the sources.
mkdir build
cd build
# Configure using the CMakeLists.txt in the parent directory, then build.
cmake ..
make

编译过程输出如下:

root@c76cafeb287f:/home/develop/hlir_builder/demo/build# cmake ..
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Configuring done
-- Generating done
-- Build files have been written to: /home/develop/hlir_builder/demo/build
root@c76cafeb287f:/home/develop/hlir_builder/demo/build# make
[ 50%] Building CXX object CMakeFiles/hlir_builder_demo.dir/src/demo.cpp.o
[100%] Linking CXX executable hlir_builder_demo
[100%] Built target hlir_builder_demo

编译完成之后,执行生成的 hlir_builder_demo 可执行文件,会输出对应的 HLIR Module 和 TopsRuntime 执行结果:

root@c76cafeb287f:/home/develop/hlir_builder/demo/build# ./hlir_builder_demo
# dumped ir
module @hlir_module {
  func @main(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> {
    %0 = "dtu_hlir.dot_general"(%arg0, %arg1) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[]> : tensor<0xi64>, lhs_contracting_dimensions = dense<1> : tensor<1xi64>, rhs_batching_dimensions = dense<[]> : tensor<0xi64>, rhs_contracting_dimensions = dense<0> : tensor<1xi64>}, op_name = "MatMul", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
    return %0 : tensor<2x2xf32>
  }
}
# calculation result
output data: 3, 4, 11, 16,