1. Kernel Library API

struct tops::bfloat
#include <bfloat.h>

Declaration

The bfloat16 floating-point format is supported in tops namespace. You can declare a bfloat16 number on both host and device side by using tops::bfloat.

#include <tops/bfloat.h>
tops::bfloat a(12);

You can initialize a tops::bfloat variable with another type of variable. Type double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.

#include <tops/bfloat.h>
short a = 12;
tops::bfloat b(a);
tops::bfloat c = (tops::bfloat)a; // explicit conversion required
tops::bfloat d{a};
tops::bfloat e(42);

Type Conversion

The tops::bfloat also supports type conversions. You can convert a tops::bfloat variable to a certain type. Type double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.

#include <tops/bfloat.h>
tops::bfloat a(12.5);
float b = a;  // b equals to 12.500
int c = a;    // c equals to 12
short d = a;  // d equals to 12

Compound Assignment Operators

The tops::bfloat also supports Compound Assignment Operators. Operators +=, -=, *= and /= are supported.

#include <tops/bfloat.h>
tops::bfloat a(12);
tops::bfloat b(4);
a += b;   // a equals to 16.0
a -= b;   // a equals to 12.0
a *= b;   // a equals to 48.0
a /= b;   // a equals to 12.0

Comparison Operators

The tops::bfloat also supports Comparison Operators. Operators ==, !=, >, <, >= and <= are supported.

#include <tops/bfloat.h>
tops::bfloat a(12);
tops::bfloat b(4);
bool c;
c = (a == b);  // c equals to False
c = (a != b);  // c equals to True
c = (a > b);   // c equals to True
c = (a >= b);  // c equals to True
c = (a < b);   // c equals to False
c = (a <= b);  // c equals to False

Four Arithmetic Operations

The tops::bfloat supports Four Arithmetic Operations. Operators +, -, * and / are supported.

#include <tops/bfloat.h>
tops::bfloat a(12);
tops::bfloat b(4);
tops::bfloat c;
c = a + b;    // c equals to 16.0
c = a - b;    // c equals to 8.0
c = a * b;    // c equals to 48.0
c = a / b;    // c equals to 3.0

Private Functions

inline __forceinline__ __host__ __device__ uint16_t from_float (float v) const
inline __forceinline__ __host__ __device__ float to_float (uint16_t v) const
__forceinline__ __host__ __device__ bfloat() = default

Private Members

uint16_t value

tops::bfloat in bit representation

template<typename ...Args>
struct tops::dte_chain

Public Functions

inline __device__ __forceinline__ dte_chain(Args... args)
inline __device__ __forceinline__ void connect()
inline __device__ __forceinline__ void trigger()
inline __device__ __forceinline__ void wait()
inline __device__ __forceinline__ void trigger_and_wait()

Private Members

tops_dte_ctx_t *dtes[sizeof...(Args)]

DTE contexts in current dte_chain.

tops::event evs[sizeof...(Args)]

tops::event corresponding to each DTE context in dte_chain.

int ndte

The number of DTE contexts in dte_chain.

struct tops::half
#include <half.h>

Declaration

Half-precision floating-point data type tops::half is supported in tops namespace. You can declare a half number on both host and device side by using tops::half.

#include <tops/half.h>
tops::half a(12);

You can initialize a variable of type tops::half with different types of variables. Type double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.

#include <tops/half.h> 
short a = 12;
tops::half b(a);
tops::half c = (tops::half)a; // explicit conversion required
tops::half d{a};
tops::half e(42);

Type Conversion

The tops::half supports type conversions. You can convert a tops::half variable to a certain type. Type double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.

#include <tops/half.h> 
tops::half a(12.5);
float b = a;  // b equals to 12.500
int c = a;    // c equals to 12
short d = a;  // d equals to 12

Compound Assignment Operators

The tops::half supports Compound Assignment Operators. Operators +=, -=, *= and /= are supported.

#include <tops/half.h> 
tops::half a(12);
tops::half b(4);
a += b;   // a equals to 16.0
a -= b;   // a equals to 12.0
a *= b;   // a equals to 48.0
a /= b;   // a equals to 12.0

Comparison Operators

The tops::half supports Comparison Operators. Operators ==, !=, >, <, >= and <= are supported.

#include <tops/half.h> 
tops::half a(12);
tops::half b(4);
bool c;
c = (a == b);  // c equals to False
c = (a != b);  // c equals to True
c = (a > b);   // c equals to True
c = (a >= b);  // c equals to True
c = (a < b);   // c equals to False
c = (a <= b);  // c equals to False

Four Arithmetic Operations

The tops::half supports Four Arithmetic Operations. Operators +, -, * and / are supported.

#include <tops/half.h> 
tops::half a(12);
tops::half b(4);
tops::half c;
c = a + b;   // c equals to 16.0
c = a - b;   // c equals to 8.0
c = a * b;   // c equals to 48.0
c = a / b;   // c equals to 3.0

Private Functions

inline __forceinline__ __host__ __device__ uint16_t from_float (float v) const
inline __forceinline__ __host__ __device__ float to_float (uint16_t v) const
__forceinline__ __host__ __device__ half() = default

Private Members

uint16_t value
struct tops::InputType

Public Functions

inline __device__ __forceinline__ InputType(int id)
inline __device__ __forceinline__ InputType operator+(int i)

Public Members

int mc_id
struct tops::InputTypeG

Public Functions

inline __device__ __forceinline__ InputType operator()(int mc_id) const
inline __device__ __forceinline__ operator InputType() const
inline __device__ __forceinline__ InputType operator+(int i)
template<typename T>
struct is_input_io
template<>
struct tops::is_input_io<InputType>

Public Static Attributes

static const bool value = true
template<>
struct tops::is_input_io<OutputType>

Public Static Attributes

static const bool value = false
template<typename T>
struct tops::MaxValue

Public Static Attributes

static constexpr T value = std::numeric_limits<T>::max()
template<>
struct tops::MaxValue<tops::bfloat>

Public Static Attributes

static constexpr float value = std::numeric_limits<float>::max()
template<>
struct tops::MaxValue<tops::half>

Public Static Attributes

static constexpr float value = std::numeric_limits<float>::max()
template<typename T>
struct tops::MinValue

Public Static Attributes

static constexpr T value = std::numeric_limits<T>::lowest()
template<>
struct tops::MinValue<tops::bfloat>

Public Static Attributes

static constexpr float value = std::numeric_limits<float>::lowest()
template<>
struct tops::MinValue<tops::half>

Public Static Attributes

static constexpr float value = std::numeric_limits<float>::lowest()
struct tops::OutputType

Public Functions

inline __device__ __forceinline__ OutputType(int id)
inline __device__ __forceinline__ OutputType operator+(int i)

Public Members

int mc_id
struct tops::OutputTypeG

Public Functions

inline __device__ __forceinline__ OutputType operator()(int mc_id) const
inline __device__ __forceinline__ operator OutputType() const
inline __device__ __forceinline__ OutputType operator+(int i)
template<typename T>
struct scalar2vector
template<>
struct tops::scalar2vector<bfloat>

Public Types

typedef vbfloat type
template<>
struct tops::scalar2vector<bool>

Public Types

typedef vuchar type
template<>
struct tops::scalar2vector<char>

Public Types

typedef vchar type
template<>
struct tops::scalar2vector<float>

Public Types

typedef vfloat type
template<>
struct tops::scalar2vector<half>

Public Types

typedef vhalf type
template<>
struct tops::scalar2vector<int>

Public Types

typedef vint type
template<>
struct tops::scalar2vector<short>

Public Types

typedef vshort type
template<>
struct tops::scalar2vector<signed char>

Public Types

typedef vchar type
template<>
struct tops::scalar2vector<unsigned char>

Public Types

typedef vuchar type
template<>
struct tops::scalar2vector<unsigned int>

Public Types

typedef vuint type
template<>
struct tops::scalar2vector<unsigned short>

Public Types

typedef vushort type
namespace tops

Typedefs

typedef bfloat bfloat16

Both tops::bfloat and tops::bfloat16 represent the bfloat16 floating-point format.

Functions

template<typename VT>
__device__ __forceinline__ VT vbroadcast(const bfloat &e, bool dummy = true)

A tops::bfloat can be broadcast to a vbfloat vector, which includes 64 elements of tops::bfloat.

template<typename T>
__device__ __forceinline__ T read_local(const T &from)
template<typename T>
__device__ __forceinline__ void write_local(const T &val, T &to)
template<typename T>
__device__ __forceinline__ T read_shared(const T &from)
template<typename T>
__device__ __forceinline__ void write_shared(const T &val, T &to)
template<typename T>
__device__ __forceinline__ T read_global(const T &from)
template<typename T>
__device__ __forceinline__ void write_global(const T &val, T &to)
template<typename ...Args>
__device__ __forceinline__ dte_chain<Args...> make_dte_chain(Args&&... args)
template<int tile_size, bool vectorized = true, bool async = true, bool is_rem = false, int input_count = 0, int output_count = 0, int input_ev_count = 0, int output_ev_count = 0, typename T, typename IO, typename std::enable_if<input_count + output_count == 0, bool>::type = true, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_tiles(const FT &f, int size, IO io, T *addr, Args&&... args)
template<int tile_size, bool vectorized = true, bool async = true, bool is_rem = false, int input_count = 0, int output_count = 0, int input_ev_count = 0, int output_ev_count = 0, typename T, typename FT>
__device__ __forceinline__ void elemwise_tiles(const FT &f, int size)
template<int tile_size = 0, bool vectorized = true, bool async = true, typename IO, typename T, typename FT, typename std::enable_if<std::is_same<IO, InputType>::value || std::is_same<IO, OutputType>::value, bool>::type = true, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, IO io, T *addr, Args&&... args)

tops::elemwise_kernel is used to perform element-wise operations.

Parameters
  • f – Anonymous function for data computation on L1 memory.

  • total_size – The total size of elements in the input/output data.

  • io – tops::Input or tops::Output.

  • addr – pointer to the input/output data on device memory(L3).

  • args – Pairs of type (tops::Input/tops::Output) and address.

template<int tile_size = 0, bool vectorized = true, bool async = true, typename T, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, InputTypeG io, T *addr, Args&&... args)
template<int tile_size = 0, bool vectorized = true, bool async = true, typename T, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, OutputTypeG io, T *addr, Args&&... args)
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename IO, typename VFT, typename SFT, typename std::enable_if<input_count + output_count == 0, bool>::type = true, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const VFT &vf, const SFT &sf, int size, IO io, T *addr, ARGS&&... args)
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename IO, typename FT, typename std::enable_if<input_count + output_count != 0, bool>::type = true, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const FT &f, int size, IO io, T *addr, ARGS&&... args)
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename FT>
__device__ __forceinline__ void elemwise_local(const FT &f, int size)
template<int LATENCY = 10, typename T, typename IO, typename FT, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const FT &f, int size, IO io, T *addr, ARGS&&... args)
template<typename VT>
__device__ __forceinline__ VT vbroadcast(const half &e, bool dummy = true)

A tops::half can be broadcast to a vhalf vector, which includes 64 elements of tops::half.

__device__ __attribute__ ((weak)) InputTypeG Input
template<int tile_size0, int tile_size1, int tile_size2, typename IT, typename OT, typename FT>
__device__ __forceinline__ void reduce_tiles(const FT &f, OT *output_addr, int (&output_shape)[2], IT *input_addr, int (&input_shape)[3], int (&offsets)[3], int (&sizes)[3], OT identity_value, int output_mc_id = 0, int input_mc_id = 0, bool cross_thread = false)
template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename IT, typename OT, typename FT>
__device__ __forceinline__ void reduction_kernel(const FT &f, OT *output_addr, int (&output_shape)[2], IT *input_addr, int (&input_shape)[3], OT identity_value, int output_mc_id = 0, int input_mc_id = 0)

tops::reduction_kernel is used to perform reduction operations on the 1st dimension (i.e., the middle dimension) of a 3D tensor.

Parameters
  • f – Anonymous function for data computation on L1 memory.

  • output_addr[out] Pointer to output data on device memory(L3).

  • output_shape – An int array describing the shape of the output.

  • input_addr[in] Pointer to input data on device memory(L3).

  • input_shape – An int array describing the shape of the input.

  • identity_value – Identity element of reduction op.

  • output_mc_id – Memory control id for output memory.

  • input_mc_id – Memory control id for input memory.

template<typename T, typename VFT, typename SFT>
__device__ __forceinline__ void reduction_local(const VFT &vf, const SFT &sf, mdspan &output, const mdspan &input, int reduce_dim = 0)

Variables

__shared__ unsigned char __tops_all_smem []
namespace tops::nn

Enums

enum DotForwardMode

Values:

enumerator SplitCoMode0
enumerator SplitCoMode1
enumerator SplitN
enumerator SplitH
enumerator ErrorNoMatchMode

Functions

template<typename T, std::size_t N>
__device__ void get_l3_buf(T *buf_l3, T *buf_l1)
template<std::size_t N> __global__ void transpose_l3tol3 (float *from, float *to, int *from_sp, int *to_sp, int *offset_sp)
inline DotForwardMode getDotForwardMode(const int &n, const int &ci, const int &co)
template<std::size_t RANK>
inline void transposeWrapper(float *src_d, float *reshape_d, int *src_shape_h, int *dst_shape_h, int *layout_h)
inline topsError_t dot_(float *lhs, float *rhs, float *out, int n, int k, int m, int gridsz = 1, int blocksz = 1, topsStream_t stream = nullptr)
inline topsError_t dot (float *lhs, float *rhs, float *out, const int &n, const int &k, const int &m) __attribute__((diagnose_if(1
if (mode==SplitCoMode0)
template<int tile_size = 0, typename T, typename FT>
__device__ __forceinline__ void select_kernel(FT f, T *output, T *lhs, T *rhs, int size, int output_mc = 0, int lhs_mc = 0, int rhs_mc = 0)

Adds corresponding elements of two arrays and stores the result in the output array.

\( output[i] = lhs[i] + rhs[i]. \)

Subtracts corresponding elements of two arrays and stores the result in the output array.

\( output[i] = lhs[i] - rhs[i]. \)

Multiplies corresponding elements of two arrays and stores the result in the output array.

\( output[i] = lhs[i] * rhs[i]. \)

Divides corresponding elements of two arrays and stores the result in the output array.

\( output[i] = lhs[i] / rhs[i]. \)

Computes the element-wise modulus of two arrays and stores the result in the output array.

\( output[i] = lhs[i]\mod rhs[i]. \)

Computes the element-wise maximum of two arrays and stores the result in the output array.

\( output[i] = \max(lhs[i], rhs[i]). \)

Computes the element-wise minimum of two arrays and stores the result in the output array.

\( output[i] = \min(lhs[i], rhs[i]). \)

Note

Supported vector types: vchar/vuchar/vshort/vushort/vint/vuint.

Parameters
  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

template<int tile_size = 0, typename T, typename FT> __global__ void select (FT f, T *output, T *lhs, T *rhs, int size, int output_mc=0, int lhs_mc=0, int rhs_mc=0)

Select from two arrays by condition.

\( output[i] = cond[i] ? lhs[i] : rhs[i]. \)

Parameters
  • f – Conditional expressions on rhs and lhs.

  • output – Pointer to the output array on the device (L3 memory).

  • lhs – Pointer to the first input array on the device (L3 memory).

  • rhs – Pointer to the second input array on the device (L3 memory).

  • size – The total number of elements in the array.

  • output_mc – Memory control id for output memory.

  • lhs_mc – Memory control id for lhs input memory.

  • rhs_mc – Memory control id for rhs input memory.

template<typename T>
__device__ __forceinline__ void broadcast_in_dim_kernel(T *output, T *input, int dim0_size, int dim1_size, int broadcast_dim, int broadcast_dim_size)
template<typename T> __global__ void broadcast_in_dim (T *output, T *input, int dim0_size, int dim1_size, int broadcast_dim, int broadcast_dim_size)

Broadcasts a two-dimensional array to a three-dimensional array with customizable broadcast dimension and size.

Parameters
  • output – Pointer to the output array on device memory(L3).

  • input – Pointer to the input array on device memory(L3).

  • dim0_size – The size of dimension 0 in the input array.

  • dim1_size – The size of dimension 1 in the input array.

  • broadcast_dim – The desired broadcast dimension in the output array, should be 0, 1 or 2.

  • broadcast_dim_size – The size of the broadcast dimension in the output array.

template<typename T>
__device__ __forceinline__ T __reduction_add(T lhs, T rhs)
template<typename T>
__device__ __forceinline__ T __reduction_max(T lhs, T rhs)
template<typename T>
__device__ __forceinline__ T __reduction_min(T lhs, T rhs)
template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void sum (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)

Performs a ReduceSum operation on the first (i.e., middle) dimension of a 3D tensor.

Template Parameters
  • tile_size0 – tile size on the 0th dimension.

  • tile_size1 – tile size on the 1st dimension.

  • tile_size2 – tile size on the 2nd dimension.

Parameters
  • output_addr – Pointer to output data on device memory(L3).

  • input_addr – Pointer to input data on device memory(L3).

  • dim0 – The size of the 0th dimension of the input 3D tensor.

  • dim1 – The size of the 1st dimension of the input 3D tensor.

  • dim2 – The size of the 2nd dimension of the input 3D tensor.

  • output_mc_id – Memory control id for output memory.

  • input_mc_id – Memory control id for input memory.

template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void max (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)

Performs a ReduceMax operation on a 3D tensor along the first (i.e. middle) dimension.

Template Parameters
  • tile_size0 – tile size on the 0th dimension.

  • tile_size1 – tile size on the 1st dimension.

  • tile_size2 – tile size on the 2nd dimension.

Parameters
  • output_addr – Pointer to output data on device memory(L3).

  • input_addr – Pointer to input data on device memory(L3).

  • dim0 – The size of the 0th dimension of the input 3D tensor.

  • dim1 – The size of the 1st dimension of the input 3D tensor.

  • dim2 – The size of the 2nd dimension of the input 3D tensor.

  • output_mc_id – Memory control id for output memory.

  • input_mc_id – Memory control id for input memory.

template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void min (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)

Performs a ReduceMin operation on a 3D tensor along the first (i.e. middle) dimension.

Template Parameters
  • tile_size0 – tile size on the 0th dimension.

  • tile_size1 – tile size on the 1st dimension.

  • tile_size2 – tile size on the 2nd dimension.

Parameters
  • output_addr – Pointer to output data on device memory(L3).

  • input_addr – Pointer to input data on device memory(L3).

  • dim0 – The size of the 0th dimension of the input 3D tensor.

  • dim1 – The size of the 1st dimension of the input 3D tensor.

  • dim2 – The size of the 2nd dimension of the input 3D tensor.

  • output_mc_id – Memory control id for output memory.

  • input_mc_id – Memory control id for input memory.

Variables

topsError_t tops::nn::dot is used for demo For performance
topsError_t tops::nn::dot is used for demo For pls use topsDNN
topsError_t tops::nn::dot is used for demo For pls use warning  {auto mode = getDotForwardMode(n, k, m)
float *lhs_reshape_d
float *rhs_reshape_d
topsError_t error = topsMalloc(&lhs_reshape_d, n * k * sizeof(float))
return topsSuccess
file bfloat.h
#include <>
#include <>
#include <>

A builtin float type, tops::bfloat.

Defines

__TOPS_BFLOAT_CTR(t)
__TOPS_BFLOAT_CONV(t)
__TOPS_BFLOAT_ASSIGN(op)
__TOPS_BFLOAT_CMP(op)
__TOPS_BFLOAT_BIN(op)
file debug.h
#include <>
file dte_chain.h
#include <>
#include <>
file elemwise.h
#include <>
#include "kernel_common.h"
#include <>
#include <>
#include <>

Defines

__TOPS_ELEMWISE_TILE_SIZE_ALIGN
__TOPS_ELEMWISE_MAX_CDTE_VC
__TOPS_SHARED_MEM_STRIDE_BY_T
__TOPS_ASSERT_L1_SIZE
__TOPS_SMEM_INPUT_ADDR(pp)
__TOPS_SMEM_OUTPUT_ADDR(pp)
__TOPS_ELEMWISE_INPUT_FLOW
__TOPS_ELEMWISE_OUTPUT_FLOW
__TOPS_ELEMWISE_INNER_INPUT_FLOW
__TOPS_ELEMWISE_INNER_OUTPUT_FLOW
__TOPS_ELEMWISE_OUTPUT_FINI_FLOW
file elemwise_local.h
#include <>
#include <>
#include "kernel_common.h"
#include <>
#include <>

Defines

__TOPS_ELEMWISE_LOCAL_EXT32(hi, lo)
file half.h
#include <>
#include <>
#include <>

A builtin float type, tops::half.

Defines

__TOPS_HALF_CTR(t)
__TOPS_HALF_CONV(t)
__TOPS_HALF_ASSIGN(op)
__TOPS_HALF_CMP(op)
__TOPS_HALF_BIN(op)

Functions

__device__ __forceinline__ uint16_t __float2half_rn(float x)

Convert a float x to tops::half y and return the binary of y, which is represented by uint16 type.

float x = 42;
uint16_t y = __float2half_rn(x); // y equals to 20800
                                 // 0 10100 0101000000

__device__ __forceinline__ float __half2float(uint16_t x)

Convert a tops::half x to float y and return y. x is represented by uint16 type.

uint16_t x = 20800;         // 20800 = 0b0101000101000000
float a = __half2float(x);  // a equals to 42.000

file kernel_common.h
#include <>
#include <tops/bfloat.h>
#include <tops/half.h>
#include <>
#include <>

Defines

__TOPS_VECTOR_LENGTH
__TOPS_MAX_L1_SIZE
__TOPS_SHARED_MEM_STRIDE
__TOPS_COMPUTE_TILE_SIZE(tile_size, tt, nargs)
file nn.h
#include <tops/nn/math.h>
file dot.h
#include <>
#include <>
#include <>
#include <>

Defines

__TOPS_SUB_CI_FP32
__TOPS_SUB_CO_FP32
file math.h
#include <>
#include <tops/elemwise.h>
#include <tops/nn/utils.h>

Defines

__TOPS_UNARY_KERNEL_OP(op)
__TOPS_UNARY_OP(op)
file math.h
#include "device_math.h"
#include "host_math.h"
file primitive.h
#include <>
#include <tops/elemwise.h>
#include <tops/nn/utils.h>
#include <assert.h>

Defines

TOPSKernel(...)
__TOPS_BINARY_KERNEL_OP(op)
__TOPS_BINARY_OP(op)
file reduction.h
#include <>
#include <tops/reduction.h>
#include <tops/nn/utils.h>

Defines

__TOPS_REDUCTION_KERNEL_OP(n, op)
file reduction.h
#include <>
#include "kernel_common.h"

Defines

__TOPS_REDUCTION_IS_ASYNC
__TOPS_REDUCE_DEFAULT_TILE_SIZE
__TOPS_ASSERT_L1_SIZE
__TOPS_REDUCTION_SMEM_INPUT_ADDR(pp)
__TOPS_REDUCTION_SMEM_OUTPUT_ADDR
__TOPS_REDUCTION_INPUT_FLOW
__TOPS_REDUCTION_INNER_INPUT_FLOW
__TOPS_REDUCTION_INNER_OUTPUT_FLOW
__TOPS_REDUCTION_OUTPUT_FLOW
__TOPS_VECTOR_MEM_LOCATION
file utils.h
#include <>
file reduction_local.h
#include <>
#include <assert.h>
file assert.h
#include <>
file device_math.h
#include <>
#include <>
#include <>
#include <assert.h>

Defines

__TOPS_KRT_MATH_WRAPPER1_P(op)
__TOPS_KRT_MATH_WRAPPER1(op)
__TOPS_KRT_MATH_WRAPPER2_P(op)
__TOPS_KRT_MATH_WRAPPER2(op)
__TOPS_MATH_BUILTIN(mf, md)

doxygen group Scalar

__TOPS_MATH_BUILTIN2(mf, md)
__TOPS_MATH_BUILTIN3(mf, md)
__TOPS_MATH_BUILTIN_D(mf, md)
__TOPS_MATH_BUILTIN2_D(mf, md)
__TOPS_MATH_BUILTIN3_D(mf, md)

Functions

__device__ __forceinline__ float __powf(float a, float b)
__device__ __forceinline__ float fmaxf(float a, float b)
__device__ __forceinline__ float __fmaxf(float a, float b)
__device__ __forceinline__ float fminf(float a, float b)
__device__ __forceinline__ float __fminf(float a, float b)
__device__ __forceinline__ float fabsf(float a)
__device__ __forceinline__ float __fabsf(float a)
__device__ unsigned int __popcountsi2(unsigned int)
__device__ __forceinline__ unsigned int __popc(unsigned int val)
__device__ unsigned long long int __popcountdi2(unsigned long long int)
__device__ __forceinline__ unsigned long long int __popcll(unsigned long long int val)
__device__ unsigned long long int __ffsdi2(unsigned long long int)
__device__ __forceinline__ unsigned long long int __ffs(unsigned long long int val)
template<typename T>
__device__ __forceinline__ T __mul24(T, T)
file host_math.h

Functions

static inline float sign(float s)
static inline float sigmoid(float s)
static inline float rsqrt(float s)
static inline float gelu(float s)
static inline float softplus(float s)
file stdio.h
file wchar.h
group bfloat

This section describes one of the builtin types tops::bfloat.

Defines

__TOPS_BFLOAT_CMP(op)
__TOPS_BFLOAT_BIN(op)

Typedefs

typedef bfloat bfloat16

Both tops::bfloat and tops::bfloat16 represent the bfloat16 floating-point format.

group Elementwise

Defines

__TOPS_ELEMWISE_TILE_SIZE_ALIGN
__TOPS_ELEMWISE_MAX_CDTE_VC
__TOPS_SHARED_MEM_STRIDE_BY_T
__TOPS_ASSERT_L1_SIZE
__TOPS_SMEM_INPUT_ADDR(pp)
__TOPS_SMEM_OUTPUT_ADDR(pp)
__TOPS_ELEMWISE_INPUT_FLOW
__TOPS_ELEMWISE_OUTPUT_FLOW
__TOPS_ELEMWISE_INNER_INPUT_FLOW
__TOPS_ELEMWISE_INNER_OUTPUT_FLOW
__TOPS_ELEMWISE_OUTPUT_FINI_FLOW
group half

This section describes one of the builtin types tops::half.

Defines

__TOPS_HALF_CMP(op)
__TOPS_HALF_BIN(op)
group nn

Defines

__TOPS_UNARY_KERNEL_OP(op)
__TOPS_UNARY_OP(op)
__TOPS_BINARY_KERNEL_OP(op)
__TOPS_BINARY_OP(op)
__TOPS_REDUCTION_KERNEL_OP(n, op)
group Reduction

Defines

__TOPS_REDUCTION_IS_ASYNC
__TOPS_REDUCE_DEFAULT_TILE_SIZE
__TOPS_ASSERT_L1_SIZE
__TOPS_REDUCTION_SMEM_INPUT_ADDR(pp)
__TOPS_REDUCTION_SMEM_OUTPUT_ADDR
__TOPS_REDUCTION_INPUT_FLOW
__TOPS_REDUCTION_INNER_INPUT_FLOW
__TOPS_REDUCTION_INNER_OUTPUT_FLOW
__TOPS_REDUCTION_OUTPUT_FLOW
__TOPS_VECTOR_MEM_LOCATION
group Scalar
dir include
dir include/tops/nn
dir include/tops
dir include/tops_wrappers