1. Kernel Library API

struct tops::bfloat
#include <bfloat.h>

Declaration

The bfloat16 floating-point format is supported in tops namespace. You can declare a bfloat16 number on both host and device side by using tops::bfloat.

#include <tops/bfloat.h>
tops::bfloat a(12);

You can initialize a tops::bfloat variable with another type of variable. Type double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.

#include <tops/bfloat.h>
short a = 12;
tops::bfloat b(a);
tops::bfloat c = (tops::bfloat)a; // explicit conversion required
tops::bfloat d{a};
tops::bfloat e(42);

Type Conversion

The tops::bfloat also supports type conversions. You can convert a tops::bfloat variable to a certain type. Type double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.

#include <tops/bfloat.h>
tops::bfloat a(12.5);
float b = a;  // b equals to 12.500
int c = a;    // c equals to 12
short d = a;  // d equals to 12

Compound Assignment Operators

The tops::bfloat also supports Compound Assignment Operators. Operators +=, -=, *= and /= are supported.

#include <tops/bfloat.h>
tops::bfloat a(12);
tops::bfloat b(4);
a += b;   // a equals to 16.0
a -= b;   // a equals to 12.0
a *= b;   // a equals to 48.0
a /= b;   // a equals to 12.0

Comparison Operators

The tops::bfloat also supports Comparison Operators. Operators ==, !=, >, <, >= and <= are supported.

#include <tops/bfloat.h>
tops::bfloat a(12);
tops::bfloat b(4);
bool c;
c = (a == b);  // c equals to False
c = (a != b);  // c equals to True
c = (a > b);   // c equals to True
c = (a >= b);  // c equals to True
c = (a < b);   // c equals to False
c = (a <= b);  // c equals to False

Four Arithmetic Operations

The tops::bfloat supports Four Arithmetic Operations. Operators +, -, * and / are supported.

#include <tops/bfloat.h>
tops::bfloat a(12);
tops::bfloat b(4);
tops::bfloat c;
c = a + b;    // c equals to 16.0
c = a - b;    // c equals to 8.0
c = a * b;    // c equals to 48.0
c = a / b;    // c equals to 3.0

Private Functions

inline __forceinline__ __host__ __device__ uint16_t from_float (float v) const
inline __forceinline__ __host__ __device__ float to_float (uint16_t v) const
__forceinline__ __host__ __device__ bfloat() = default

Private Members

uint16_t value

tops::bfloat in bit representation

template<typename ...Args>
struct tops::dte_chain

Public Functions

inline __device__ __forceinline__ dte_chain(Args... args)
inline __device__ __forceinline__ void connect()
inline __device__ __forceinline__ void trigger()
inline __device__ __forceinline__ void wait()
inline __device__ __forceinline__ void trigger_and_wait()

Private Members

tops_dte_ctx_t *dtes[sizeof...(Args)]

DTE contexts in current dte_chain.

tops::event evs[sizeof...(Args)]

tops::event corresponding to each DTE context in dte_chain.

int ndte

The number of DTE contexts in dte_chain.

struct tops::half
#include <half.h>

Declaration

Half-precision floating-point data type tops::half is supported in tops namespace. You can declare a half number on both host and device side by using tops::half.

#include <tops/half.h>
tops::half a(12);

You can initialize a variable of type tops::half with different types of variables. Type double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.

#include <tops/half.h> 
short a = 12;
tops::half b(a);
tops::half c = (tops::half)a; // explicit conversion required
tops::half d{a};
tops::half e(42);

Type Conversion

The tops::half supports type conversions. You can convert a tops::half variable to a certain type. Type double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.

#include <tops/half.h> 
tops::half a(12.5);
float b = a;  // b equals to 12.500
int c = a;    // c equals to 12
short d = a;  // d equals to 12

Compound Assignment Operators

The tops::half supports Compound Assignment Operators. Operators +=, -=, *= and /= are supported.

#include <tops/half.h> 
tops::half a(12);
tops::half b(4);
a += b;   // a equals to 16.0
a -= b;   // a equals to 12.0
a *= b;   // a equals to 48.0
a /= b;   // a equals to 12.0

Comparison Operators

The tops::half supports Comparison Operators. Operators ==, !=, >, <, >= and <= are supported.

#include <tops/half.h> 
tops::half a(12);
tops::half b(4);
bool c;
c = (a == b);  // c equals to False
c = (a != b);  // c equals to True
c = (a > b);   // c equals to True
c = (a >= b);  // c equals to True
c = (a < b);   // c equals to False
c = (a <= b);  // c equals to False

Four Arithmetic Operations

The tops::half supports Four Arithmetic Operations. Operators +, -, * and / are supported.

#include <tops/half.h> 
tops::half a(12);
tops::half b(4);
tops::half c;
c = a + b;   // c equals to 16.0
c = a - b;   // c equals to 8.0
c = a * b;   // c equals to 48.0
c = a / b;   // c equals to 3.0

Private Functions

inline __forceinline__ __host__ __device__ uint16_t from_float (float v) const
inline __forceinline__ __host__ __device__ float to_float (uint16_t v) const
__forceinline__ __host__ __device__ half() = default

Private Members

uint16_t value
struct tops::InputType

Public Functions

inline __device__ __forceinline__ InputType(int id)
inline __device__ __forceinline__ InputType operator+(int i)

Public Members

int mc_id
struct tops::InputTypeG

Public Functions

inline __device__ __forceinline__ InputType operator()(int mc_id) const
inline __device__ __forceinline__ operator InputType() const
inline __device__ __forceinline__ InputType operator+(int i)
template<typename T>
struct is_input_io
template<>
struct tops::is_input_io<InputType>

Public Static Attributes

static const bool value = true
template<>
struct tops::is_input_io<OutputType>

Public Static Attributes

static const bool value = false
template<typename T>
struct tops::MaxValue

Public Static Attributes

static constexpr T value = std::numeric_limits<T>::max()
template<>
struct tops::MaxValue<tops::bfloat>

Public Static Attributes

static constexpr float value = std::numeric_limits<float>::max()
template<>
struct tops::MaxValue<tops::half>

Public Static Attributes

static constexpr float value = std::numeric_limits<float>::max()
template<typename T>
struct tops::MinValue

Public Static Attributes

static constexpr T value = std::numeric_limits<T>::lowest()
template<>
struct tops::MinValue<tops::bfloat>

Public Static Attributes

static constexpr float value = std::numeric_limits<float>::lowest()
template<>
struct tops::MinValue<tops::half>

Public Static Attributes

static constexpr float value = std::numeric_limits<float>::lowest()
struct tops::OutputType

Public Functions

inline __device__ __forceinline__ OutputType(int id)
inline __device__ __forceinline__ OutputType operator+(int i)

Public Members

int mc_id
struct tops::OutputTypeG

Public Functions

inline __device__ __forceinline__ OutputType operator()(int mc_id) const
inline __device__ __forceinline__ operator OutputType() const
inline __device__ __forceinline__ OutputType operator+(int i)
template<typename T>
struct scalar2vector
template<>
struct tops::scalar2vector<bfloat>

Public Types

typedef vbfloat type
template<>
struct tops::scalar2vector<bool>

Public Types

typedef vuchar type
template<>
struct tops::scalar2vector<char>

Public Types

typedef vchar type
template<>
struct tops::scalar2vector<float>

Public Types

typedef vfloat type
template<>
struct tops::scalar2vector<half>

Public Types

typedef vhalf type
template<>
struct tops::scalar2vector<int>

Public Types

typedef vint type
template<>
struct tops::scalar2vector<short>

Public Types

typedef vshort type
template<>
struct tops::scalar2vector<signed char>

Public Types

typedef vchar type
template<>
struct tops::scalar2vector<unsigned char>

Public Types

typedef vuchar type
template<>
struct tops::scalar2vector<unsigned int>

Public Types

typedef vuint type
template<>
struct tops::scalar2vector<unsigned short>

Public Types

typedef vushort type
namespace tops

Typedefs

typedef bfloat bfloat16

Both tops::bfloat and tops::bfloat16 represent the bfloat16 floating-point format.

Functions

template<typename VT>
__device__ __forceinline__ VT vbroadcast(const bfloat &e, bool dummy = true)

A tops::bfloat can be broadcast to a vbfloat vector, which includes 64 elements of tops::bfloat.

template<typename T>
__device__ __forceinline__ T read_local(const T &from)
template<typename T>
__device__ __forceinline__ void write_local(const T &val, T &to)
template<typename T>
__device__ __forceinline__ T read_shared(const T &from)
template<typename T>
__device__ __forceinline__ void write_shared(const T &val, T &to)
template<typename T>
__device__ __forceinline__ T read_global(const T &from)
template<typename T>
__device__ __forceinline__ void write_global(const T &val, T &to)
template<typename ...Args>
__device__ __forceinline__ dte_chain<Args...> make_dte_chain(Args&&... args)
template<int tile_size, bool vectorized = true, bool async = true, bool is_rem = false, int input_count = 0, int output_count = 0, int input_ev_count = 0, int output_ev_count = 0, typename T, typename IO, typename std::enable_if<input_count + output_count == 0, bool>::type = true, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_tiles(const FT &f, int size, IO io, T *addr, Args&&... args)
template<int tile_size, bool vectorized = true, bool async = true, bool is_rem = false, int input_count = 0, int output_count = 0, int input_ev_count = 0, int output_ev_count = 0, typename T, typename FT>
__device__ __forceinline__ void elemwise_tiles(const FT &f, int size)
template<int tile_size = 0, bool vectorized = true, bool async = true, typename IO, typename T, typename FT, typename std::enable_if<std::is_same<IO, InputType>::value || std::is_same<IO, OutputType>::value, bool>::type = true, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, IO io, T *addr, Args&&... args)

tops::elemwise_kernel is used to perform element-wise operations.

Parameters
  • f – Anonymous function for data computation on L1 memory.

  • total_size – The total size of elements in the input/output data.

  • io – tops::Input or tops::Output.

  • addr – pointer to the input/output data on device memory(L3).

  • args – Pairs of type (tops::Input/tops::Output) and address.

template<int tile_size = 0, bool vectorized = true, bool async = true, typename T, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, InputTypeG io, T *addr, Args&&... args)
template<int tile_size = 0, bool vectorized = true, bool async = true, typename T, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, OutputTypeG io, T *addr, Args&&... args)
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename IO, typename VFT, typename SFT, typename std::enable_if<input_count + output_count == 0, bool>::type = true, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const VFT &vf, const SFT &sf, int size, IO io, T *addr, ARGS&&... args)
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename IO, typename FT, typename std::enable_if<input_count + output_count != 0, bool>::type = true, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const FT &f, int size, IO io, T *addr, ARGS&&... args)
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename FT>
__device__ __forceinline__ void elemwise_local(const FT &f, int size)
template<int LATENCY = 10, typename T, typename IO, typename FT, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const FT &f, int size, IO io, T *addr, ARGS&&... args)
template<typename VT>
__device__ __forceinline__ VT vbroadcast(const half &e, bool dummy = true)

A tops::half can be broadcast to a vhalf vector, which includes 64 elements of tops::half.

__device__ __attribute__ ((weak)) InputTypeG Input
template<int tile_size0, int tile_size1, int tile_size2, typename IT, typename OT, typename FT>
__device__ __forceinline__ void reduce_tiles(const FT &f, OT *output_addr, int (&output_shape)[2], IT *input_addr, int (&input_shape)[3], int (&offsets)[3], int (&sizes)[3], OT identity_value, int output_mc_id = 0, int input_mc_id = 0, bool cross_thread = false)
template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename IT, typename OT, typename FT>
__device__ __forceinline__ void reduction_kernel(const FT &f, OT *output_addr, int (&output_shape)[2], IT *input_addr, int (&input_shape)[3], OT identity_value, int output_mc_id = 0, int input_mc_id = 0)

tops::reduction_kernel is used to perform reduction operations on the 1st dimension (i.e., the middle dimension) of a 3D tensor.

Parameters
  • f – Anonymous function for data computation on L1 memory.

  • output_addr[out] Pointer to output data on device memory(L3).

  • output_shape – An int array describing the shape of the output.

  • input_addr[in] Pointer to input data on device memory(L3).

  • input_shape – An int array describing the shape of the input.

  • identity_value – Identity element of reduction op.

  • output_mc_id – Memory control id for output memory.

  • input_mc_id – Memory control id for input memory.

template<typename T, typename VFT, typename SFT>
__device__ __forceinline__ void reduction_local(const VFT &vf, const SFT &sf, mdspan &output, const mdspan &input, int reduce_dim = 0)

Variables

__shared__ unsigned char __tops_all_smem []
namespace tops::nn

Enums

enum DotForwardMode

Values:

enumerator SplitCoMode0
enumerator SplitCoMode1
enumerator SplitN
enumerator SplitH
enumerator ErrorNoMatchMode

Functions

template<typename T, std::size_t N>
__device__ void get_l3_buf(T *buf_l3, T *buf_l1)
template<std::size_t N> __global__ void transpose_l3tol3 (float *from, float *to, int *from_sp, int *to_sp, int *offset_sp)
inline DotForwardMode getDotForwardMode(const int &n, const int &ci, const int &co)
template<std::size_t RANK>
inline void transposeWrapper(float *src_d, float *reshape_d, int *src_shape_h, int *dst_shape_h, int *layout_h)
inline topsError_t dot_(float *lhs, float *rhs, float *out, int n, int k, int m, int gridsz = 1, int blocksz = 1, topsStream_t stream = nullptr)
inline topsError_t dot (float *lhs, float *rhs, float *out, const int &n, const int &k, const int &m) __attribute__((diagnose_if(1
if (mode==SplitCoMode0)
template<int tile_size = 0, typename T, typename FT>
__device__ __forceinline__ void select_kernel(FT f, T *output, T *lhs, T *rhs, int size, int output_mc = 0, int lhs_mc = 0, int rhs_mc = 0)

Adds corresponding elements of two arrays and stores the result in the output array.

\( output[i] = lhs[i] + rhs[i]. \)

Subtracts corresponding elements of two arrays and stores the result in the output array.

\( output[i] = lhs[i] - rhs[i]. \)

Multiplies corresponding elements of two arrays and stores the result in the output array.

\( output[i] = lhs[i] * rhs[i]. \)

Divides corresponding elements of two arrays and stores the result in the output array.

\( output[i] = lhs[i] / rhs[i]. \)

Computes the element-wise modulus of two arrays and stores the result in the output array.

\( output[i] = lhs[i]\mod rhs[i]. \)

Computes the element-wise maximum of two arrays and stores the result in the output array.

\( output[i] = \max(lhs[i], rhs[i]). \)

Computes the element-wise minimum of two arrays and stores the result in the output array.

\( output[i] = \min(lhs[i], rhs[i]). \)

Note

Supported vector types: vchar/vuchar/vshort/vushort/vint/vuint.

Parameters
  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

template<int tile_size = 0, typename T, typename FT> __global__ void select (FT f, T *output, T *lhs, T *rhs, int size, int output_mc=0, int lhs_mc=0, int rhs_mc=0)

Select from two arrays by condition.

\( output[i] = cond[i] ? lhs[i] : rhs[i]. \)

Parameters
  • f – Conditional expressions on rhs and lhs.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

template<typename T>
__device__ __forceinline__ void broadcast_in_dim_kernel(T *output, T *input, int dim0_size, int dim1_size, int broadcast_dim, int broadcast_dim_size)
template<typename T> __global__ void broadcast_in_dim (T *output, T *input, int dim0_size, int dim1_size, int broadcast_dim, int broadcast_dim_size)

Broadcasts a two-dimensional array to a three-dimensional array with customizable broadcast dimension and size.

Parameters
  • output – Pointer to the output array on device memory(L3).

  • input – Pointer to the input array on device memory(L3).

  • dim0_size – The size of dimension 0 in the input array.

  • dim1_size – The size of dimension 1 in the input array.

  • broadcast_dim – The desired broadcast dimension in the output array, should be 0, 1 or 2.

  • broadcast_dim_size – The size of the broadcast dimension in the output array.

template<typename T>
__device__ __forceinline__ T __reduction_add(T lhs, T rhs)
template<typename T>
__device__ __forceinline__ T __reduction_max(T lhs, T rhs)
template<typename T>
__device__ __forceinline__ T __reduction_min(T lhs, T rhs)
template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void sum (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)

Performs a ReduceSum operation on the first (i.e., middle) dimension of a 3D tensor.

Template Parameters
  • tile_size0 – tile size on the 0th dimension.

  • tile_size1 – tile size on the 1st dimension.

  • tile_size2 – tile size on the 2nd dimension.

Parameters
  • output_addr – Pointer to output data on device memory(L3).

  • input_addr – Pointer to input data on device memory(L3).

  • dim0 – The size of the 0th dimension of the input 3D tensor.

  • dim1 – The size of the 1st dimension of the input 3D tensor.

  • dim2 – The size of the 2nd dimension of the input 3D tensor.

  • output_mc_id – Memory control id for output memory.

  • input_mc_id – Memory control id for input memory.

template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void max (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)

Performs a ReduceMax operation on a 3D tensor along the first (i.e. middle) dimension.

Template Parameters
  • tile_size0 – tile size on the 0th dimension.

  • tile_size1 – tile size on the 1st dimension.

  • tile_size2 – tile size on the 2nd dimension.

Parameters
  • output_addr – Pointer to output data on device memory(L3).

  • input_addr – Pointer to input data on device memory(L3).

  • dim0 – The size of the 0th dimension of the input 3D tensor.

  • dim1 – The size of the 1st dimension of the input 3D tensor.

  • dim2 – The size of the 2nd dimension of the input 3D tensor.

  • output_mc_id – Memory control id for output memory.

  • input_mc_id – Memory control id for input memory.

template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void min (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)

Performs a ReduceMin operation on a 3D tensor along the first (i.e. middle) dimension.

Template Parameters
  • tile_size0 – tile size on the 0th dimension.

  • tile_size1 – tile size on the 1st dimension.

  • tile_size2 – tile size on the 2nd dimension.

Parameters
  • output_addr – Pointer to output data on device memory(L3).

  • input_addr – Pointer to input data on device memory(L3).

  • dim0 – The size of the 0th dimension of the input 3D tensor.

  • dim1 – The size of the 1st dimension of the input 3D tensor.

  • dim2 – The size of the 2nd dimension of the input 3D tensor.

  • output_mc_id – Memory control id for output memory.

  • input_mc_id – Memory control id for input memory.

Variables

topsError_t tops::nn::dot is used for demo For performance
topsError_t tops::nn::dot is used for demo For pls use topsDNN
topsError_t tops::nn::dot is used for demo For pls use warning  {auto mode = getDotForwardMode(n, k, m)
float *lhs_reshape_d
float *rhs_reshape_d
topsError_t error = topsMalloc(&lhs_reshape_d, n * k * sizeof(float))
return topsSuccess
file bfloat.h
#include <>
#include <>
#include <>

A builtin float type, tops::bfloat.

Defines

__TOPS_BFLOAT_CTR(t)
__TOPS_BFLOAT_CONV(t)
__TOPS_BFLOAT_ASSIGN(op)
__TOPS_BFLOAT_CMP(op)
__TOPS_BFLOAT_BIN(op)
file debug.h
#include <>
file dte_chain.h
#include <>
#include <>
file elemwise.h
#include <>
#include "kernel_common.h"
#include <>
#include <>
#include <>

Defines

__TOPS_ELEMWISE_TILE_SIZE_ALIGN
__TOPS_ELEMWISE_MAX_CDTE_VC
__TOPS_SHARED_MEM_STRIDE_BY_T
__TOPS_ASSERT_L1_SIZE
__TOPS_SMEM_INPUT_ADDR(pp)
__TOPS_SMEM_OUTPUT_ADDR(pp)
__TOPS_ELEMWISE_INPUT_FLOW
__TOPS_ELEMWISE_OUTPUT_FLOW
__TOPS_ELEMWISE_INNER_INPUT_FLOW
__TOPS_ELEMWISE_INNER_OUTPUT_FLOW
__TOPS_ELEMWISE_OUTPUT_FINI_FLOW
file elemwise_local.h
#include <>
#include <>
#include "kernel_common.h"
#include <>
#include <>

Defines

__TOPS_ELEMWISE_LOCAL_EXT32(hi, lo)
file half.h
#include <>
#include <>
#include <>

A builtin float type, tops::half.

Defines

__TOPS_HALF_CTR(t)
__TOPS_HALF_CONV(t)
__TOPS_HALF_ASSIGN(op)
__TOPS_HALF_CMP(op)
__TOPS_HALF_BIN(op)

Functions

__device__ __forceinline__ uint16_t __float2half_rn(float x)

Convert a float x to tops::half y and return the binary of y, which is represented by uint16 type.

float x = 42;
uint16_t y = __float2half_rn(x); // y equals to 20800
                                 // 0 10100 0101000000

__device__ __forceinline__ float __half2float(uint16_t x)

Convert a tops::half x to float y and return y. x is represented by uint16 type.

uint16_t x = 20800;         // 20800 = 0b0101000101000000
float a = __half2float(x);  // a equals to 42.000

file kernel_common.h
#include <>
#include <tops/bfloat.h>
#include <tops/half.h>
#include <>
#include <>

Defines

__TOPS_VECTOR_LENGTH
__TOPS_MAX_L1_SIZE
__TOPS_SHARED_MEM_STRIDE
__TOPS_COMPUTE_TILE_SIZE(tile_size, tt, nargs)
file nn.h
#include <tops/nn/math.h>
file dot.h
#include <>
#include <>
#include <>
#include <>

Defines

__TOPS_SUB_CI_FP32
__TOPS_SUB_CO_FP32
file math.h
#include <>
#include <tops/elemwise.h>
#include <tops/nn/utils.h>

Defines

__TOPS_UNARY_KERNEL_OP(op)
__TOPS_UNARY_OP(op)
file math.h
#include "device_math.h"
#include "host_math.h"
file primitive.h
#include <>
#include <tops/elemwise.h>
#include <tops/nn/utils.h>
#include <assert.h>

Defines

TOPSKernel(...)
__TOPS_BINARY_KERNEL_OP(op)
__TOPS_BINARY_OP(op)
file reduction.h
#include <>
#include <tops/reduction.h>
#include <tops/nn/utils.h>

Defines

__TOPS_REDUCTION_KERNEL_OP(n, op)
file reduction.h
#include <>
#include "kernel_common.h"

Defines

__TOPS_REDUCTION_IS_ASYNC
__TOPS_REDUCE_DEFAULT_TILE_SIZE
__TOPS_ASSERT_L1_SIZE
__TOPS_REDUCTION_SMEM_INPUT_ADDR(pp)
__TOPS_REDUCTION_SMEM_OUTPUT_ADDR
__TOPS_REDUCTION_INPUT_FLOW
__TOPS_REDUCTION_INNER_INPUT_FLOW
__TOPS_REDUCTION_INNER_OUTPUT_FLOW
__TOPS_REDUCTION_OUTPUT_FLOW
__TOPS_VECTOR_MEM_LOCATION
file utils.h
#include <>
file reduction_local.h
#include <>
#include <assert.h>
file assert.h
#include <>
file device_math.h
#include <>
#include <>
#include <>
#include <assert.h>

Defines

__TOPS_KRT_MATH_WRAPPER1_P(op)
__TOPS_KRT_MATH_WRAPPER1(op)
__TOPS_KRT_MATH_WRAPPER2_P(op)
__TOPS_KRT_MATH_WRAPPER2(op)
__TOPS_MATH_BUILTIN(mf, md)

doxygen group Scalar

__TOPS_MATH_BUILTIN2(mf, md)
__TOPS_MATH_BUILTIN3(mf, md)
__TOPS_MATH_BUILTIN_D(mf, md)
__TOPS_MATH_BUILTIN2_D(mf, md)
__TOPS_MATH_BUILTIN3_D(mf, md)

Functions

__device__ __forceinline__ float __powf(float a, float b)
__device__ __forceinline__ float fmaxf(float a, float b)
__device__ __forceinline__ float __fmaxf(float a, float b)
__device__ __forceinline__ float fminf(float a, float b)
__device__ __forceinline__ float __fminf(float a, float b)
__device__ __forceinline__ float fabsf(float a)
__device__ __forceinline__ float __fabsf(float a)
__device__ unsigned int __popcountsi2(unsigned int)
__device__ __forceinline__ unsigned int __popc(unsigned int val)
__device__ unsigned long long int __popcountdi2(unsigned long long int)
__device__ __forceinline__ unsigned long long int __popcll(unsigned long long int val)
__device__ unsigned long long int __ffsdi2(unsigned long long int)
__device__ __forceinline__ unsigned long long int __ffs(unsigned long long int val)
template<typename T>
__device__ __forceinline__ T __mul24(T, T)
file host_math.h

Functions

static inline float sign(float s)
static inline float sigmoid(float s)
static inline float rsqrt(float s)
static inline float gelu(float s)
static inline float softplus(float s)
file stdio.h
file wchar.h
group bfloat

This section describes one of the builtin types tops::bfloat.

Defines

__TOPS_BFLOAT_CMP(op)
__TOPS_BFLOAT_BIN(op)

Typedefs

typedef bfloat bfloat16

Both tops::bfloat and tops::bfloat16 represent the bfloat16 floating-point format.

group Elementwise

Defines

__TOPS_ELEMWISE_TILE_SIZE_ALIGN
__TOPS_ELEMWISE_MAX_CDTE_VC
__TOPS_SHARED_MEM_STRIDE_BY_T
__TOPS_ASSERT_L1_SIZE
__TOPS_SMEM_INPUT_ADDR(pp)
__TOPS_SMEM_OUTPUT_ADDR(pp)
__TOPS_ELEMWISE_INPUT_FLOW
__TOPS_ELEMWISE_OUTPUT_FLOW
__TOPS_ELEMWISE_INNER_INPUT_FLOW
__TOPS_ELEMWISE_INNER_OUTPUT_FLOW
__TOPS_ELEMWISE_OUTPUT_FINI_FLOW
group half

This section describes one of the builtin types tops::half.

Defines

__TOPS_HALF_CMP(op)
__TOPS_HALF_BIN(op)
group nn

Defines

__TOPS_UNARY_KERNEL_OP(op)
__TOPS_UNARY_OP(op)
__TOPS_BINARY_KERNEL_OP(op)
__TOPS_BINARY_OP(op)
__TOPS_REDUCTION_KERNEL_OP(n, op)
group Reduction

Defines

__TOPS_REDUCTION_IS_ASYNC
__TOPS_REDUCE_DEFAULT_TILE_SIZE
__TOPS_ASSERT_L1_SIZE
__TOPS_REDUCTION_SMEM_INPUT_ADDR(pp)
__TOPS_REDUCTION_SMEM_OUTPUT_ADDR
__TOPS_REDUCTION_INPUT_FLOW
__TOPS_REDUCTION_INNER_INPUT_FLOW
__TOPS_REDUCTION_INNER_OUTPUT_FLOW
__TOPS_REDUCTION_OUTPUT_FLOW
__TOPS_VECTOR_MEM_LOCATION
group Scalar
dir include
dir include/tops/nn
dir include/tops
dir include/tops_wrappers