1. Kernel Library API¶
-
struct tops::bfloat¶
- #include <bfloat.h>
Declaration
The bfloat16 floating-point format is supported in the tops namespace. You can declare a bfloat16 number on both the host and device side by using tops::bfloat.
#include <tops/bfloat.h>

tops::bfloat a(12);
You can initialize a tops::bfloat variable with a variable of another type. The types double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.
#include <tops/bfloat.h>

short a = 12;
tops::bfloat b(a);
tops::bfloat c = (tops::bfloat)a; // explicit conversion
tops::bfloat d{a};
tops::bfloat e(42);
Type Conversion
The tops::bfloat also supports type conversions. You can convert a tops::bfloat variable to another type. The types double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.
#include <tops/bfloat.h>

tops::bfloat a(12.5);
float b = a; // b equals 12.5
int c = a;   // c equals 12
short d = a; // d equals 12
Compound Assignment Operators
The tops::bfloat also supports compound assignment operators. The operators +=, -=, *=, and /= are supported.
#include <tops/bfloat.h>

tops::bfloat a(12);
tops::bfloat b(4);
a += b; // a equals 16.0
a -= b; // a equals 12.0
a *= b; // a equals 48.0
a /= b; // a equals 12.0
Comparison Operators
The tops::bfloat also supports comparison operators. The operators ==, !=, >, <, >=, and <= are supported.
#include <tops/bfloat.h>

tops::bfloat a(12);
tops::bfloat b(4);
bool c;
c = (a == b); // c equals false
c = (a != b); // c equals true
c = (a > b);  // c equals true
c = (a >= b); // c equals true
c = (a < b);  // c equals false
c = (a <= b); // c equals false
Four Arithmetic Operations
The tops::bfloat supports the four arithmetic operations. The operators +, -, *, and / are supported.
#include <tops/bfloat.h>

tops::bfloat a(12);
tops::bfloat b(4);
tops::bfloat c;
c = a + b; // c equals 16.0
c = a - b; // c equals 8.0
c = a * b; // c equals 48.0
c = a / b; // c equals 3.0
Private Functions
- inline __forceinline__ __host__ __device__ uint16_t from_float (float v) const
- inline __forceinline__ __host__ __device__ float to_float (uint16_t v) const
-
__forceinline__ __host__ __device__ bfloat() = default¶
Private Members
-
uint16_t value¶
The tops::bfloat value in bit representation.
-
struct tops::half¶
- #include <half.h>
Declaration
The half-precision floating-point data type tops::half is supported in the tops namespace. You can declare a half number on both the host and device side by using tops::half.
#include <tops/half.h>

tops::half a(12);
You can initialize a variable of type tops::half with a variable of another type. The types double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.
#include <tops/half.h>

short a = 12;
tops::half b(a);
tops::half c = (tops::half)a; // explicit conversion
tops::half d{a};
tops::half e(42);
Type Conversion
The tops::half supports type conversions. You can convert a tops::half variable to another type. The types double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.
#include <tops/half.h>

tops::half a(12.5);
float b = a; // b equals 12.5
int c = a;   // c equals 12
short d = a; // d equals 12
Compound Assignment Operators
The tops::half supports compound assignment operators. The operators +=, -=, *=, and /= are supported.
#include <tops/half.h>

tops::half a(12);
tops::half b(4);
a += b; // a equals 16.0
a -= b; // a equals 12.0
a *= b; // a equals 48.0
a /= b; // a equals 12.0
Comparison Operators
The tops::half supports comparison operators. The operators ==, !=, >, <, >=, and <= are supported.
#include <tops/half.h>

tops::half a(12);
tops::half b(4);
bool c;
c = (a == b); // c equals false
c = (a != b); // c equals true
c = (a > b);  // c equals true
c = (a >= b); // c equals true
c = (a < b);  // c equals false
c = (a <= b); // c equals false
Four Arithmetic Operations
The tops::half supports the four arithmetic operations. The operators +, -, *, and / are supported.
#include <tops/half.h>

tops::half a(12);
tops::half b(4);
tops::half c;
c = a + b; // c equals 16.0
c = a - b; // c equals 8.0
c = a * b; // c equals 48.0
c = a / b; // c equals 3.0
Private Functions
- inline __forceinline__ __host__ __device__ uint16_t from_float (float v) const
- inline __forceinline__ __host__ __device__ float to_float (uint16_t v) const
-
__forceinline__ __host__ __device__ half() = default¶
Private Members
-
uint16_t value¶
-
struct tops::InputType¶
Public Functions
-
inline __device__ __forceinline__ InputType(int id)¶
Public Members
-
int mc_id¶
-
template<typename T>
struct is_input_io¶
-
template<>
struct tops::is_input_io<InputType>¶ Public Static Attributes
-
static const bool value = true¶
-
template<>
struct tops::is_input_io<OutputType>¶ Public Static Attributes
-
static const bool value = false¶
-
template<>
struct tops::MaxValue<tops::bfloat>¶ Public Static Attributes
-
static constexpr float value = std::numeric_limits<float>::max()¶
-
template<>
struct tops::MaxValue<tops::half>¶ Public Static Attributes
-
static constexpr float value = std::numeric_limits<float>::max()¶
-
template<>
struct tops::MinValue<tops::bfloat>¶ Public Static Attributes
-
static constexpr float value = std::numeric_limits<float>::lowest()¶
-
template<>
struct tops::MinValue<tops::half>¶ Public Static Attributes
-
static constexpr float value = std::numeric_limits<float>::lowest()¶
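These specializations expose a plain float extreme for the reduced-precision types, which is convenient, for example, as a candidate identity value for reductions. A minimal sketch of reading them (the header that declares MaxValue/MinValue is not listed on this page, so the include is omitted):
float hi = tops::MaxValue<tops::half>::value;   // std::numeric_limits<float>::max()
float lo = tops::MinValue<tops::bfloat>::value; // std::numeric_limits<float>::lowest()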
-
struct tops::OutputType¶
Public Functions
-
inline __device__ __forceinline__ OutputType(int id)¶
-
inline __device__ __forceinline__ OutputType operator+(int i)¶
Public Members
-
int mc_id¶
-
struct tops::OutputTypeG¶
Public Functions
-
inline __device__ __forceinline__ OutputType operator()(int mc_id) const¶
-
inline __device__ __forceinline__ operator OutputType() const¶
-
inline __device__ __forceinline__ OutputType operator+(int i)¶
-
template<typename T>
struct scalar2vector¶
-
namespace tops¶
Typedefs
-
typedef bfloat bfloat16¶
Both tops::bfloat and tops::bfloat16 represent the bfloat16 floating-point format.
Functions
-
template<typename VT>
__device__ __forceinline__ VT vbroadcast(const bfloat &e, bool dummy = true)¶
A tops::bfloat can be broadcast to a vbfloat vector, which includes 64 elements of tops::bfloat.
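A minimal device-side sketch of the broadcast (the explicit VT template argument follows the signature above; the surrounding function is illustrative):
#include <tops/bfloat.h>

__device__ void splat_bfloat_example() {
  tops::bfloat s(1.5f);
  vbfloat v = tops::vbroadcast<vbfloat>(s); // every lane of v holds 1.5
  (void)v;
}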
-
template<typename ...Args>
__device__ __forceinline__ dte_chain<Args...> make_dte_chain(Args&&... args)¶
-
template<int tile_size, bool vectorized = true, bool async = true, bool is_rem = false, int input_count = 0, int output_count = 0, int input_ev_count = 0, int output_ev_count = 0, typename T, typename IO, typename std::enable_if<input_count + output_count == 0, bool>::type = true, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_tiles(const FT &f, int size, IO io, T *addr, Args&&... args)¶
-
template<int tile_size, bool vectorized = true, bool async = true, bool is_rem = false, int input_count = 0, int output_count = 0, int input_ev_count = 0, int output_ev_count = 0, typename T, typename FT>
__device__ __forceinline__ void elemwise_tiles(const FT &f, int size)¶
-
template<int tile_size = 0, bool vectorized = true, bool async = true, typename IO, typename T, typename FT, typename std::enable_if<std::is_same<IO, InputType>::value || std::is_same<IO, OutputType>::value, bool>::type = true, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, IO io, T *addr, Args&&... args)¶
tops::elemwise_kernel is used to perform element-wise operations.
- Parameters
f – Anonymous function for data computation on L1 memory.
total_size – The total number of elements in the input/output data.
io – tops::Input or tops::Output.
addr – Pointer to the input/output data on device memory (L3).
args – Pairs of type (tops::Input/tops::Output) and address.
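A minimal sketch of a copy kernel built on this helper. The generic-lambda form of the tile function and the ordering of the (io, addr) pairs are assumptions based on the parameter list above, not verified behavior:
#include <tops/elemwise.h>

// Hypothetical kernel: out[i] = in[i] for a flat float array of n elements.
__global__ void copy_kernel(float *out, float *in, int n) {
  tops::elemwise_kernel(
      [](auto &o, const auto &i) { o = i; }, // per-tile computation staged in L1 memory
      n,                                     // total number of elements
      tops::OutputType(0), out,              // (io, addr) pair: output on device memory (L3)
      tops::InputType(0), in);               // (io, addr) pair: input on device memory (L3)
}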
-
template<int tile_size = 0, bool vectorized = true, bool async = true, typename T, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, InputTypeG io, T *addr, Args&&... args)¶
-
template<int tile_size = 0, bool vectorized = true, bool async = true, typename T, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, OutputTypeG io, T *addr, Args&&... args)¶
-
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename IO, typename VFT, typename SFT, typename std::enable_if<input_count + output_count == 0, bool>::type = true, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const VFT &vf, const SFT &sf, int size, IO io, T *addr, ARGS&&... args)¶
-
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename IO, typename FT, typename std::enable_if<input_count + output_count != 0, bool>::type = true, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const FT &f, int size, IO io, T *addr, ARGS&&... args)¶
-
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename FT>
__device__ __forceinline__ void elemwise_local(const FT &f, int size)¶
-
template<int LATENCY = 10, typename T, typename IO, typename FT, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const FT &f, int size, IO io, T *addr, ARGS&&... args)¶
-
template<typename VT>
__device__ __forceinline__ VT vbroadcast(const half &e, bool dummy = true)¶
A tops::half can be broadcast to a vhalf vector, which includes 64 elements of tops::half.
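Usage mirrors the tops::bfloat overload; a minimal device-side sketch:
#include <tops/half.h>

__device__ void splat_half_example() {
  tops::half s(0.25f);
  vhalf v = tops::vbroadcast<vhalf>(s); // every lane of v holds 0.25
  (void)v;
}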
- __device__ __attribute__ ((weak)) InputTypeG Input
-
template<int tile_size0, int tile_size1, int tile_size2, typename IT, typename OT, typename FT>
__device__ __forceinline__ void reduce_tiles(const FT &f, OT *output_addr, int (&output_shape)[2], IT *input_addr, int (&input_shape)[3], int (&offsets)[3], int (&sizes)[3], OT identity_value, int output_mc_id = 0, int input_mc_id = 0, bool cross_thread = false)¶
-
template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename IT, typename OT, typename FT>
__device__ __forceinline__ void reduction_kernel(const FT &f, OT *output_addr, int (&output_shape)[2], IT *input_addr, int (&input_shape)[3], OT identity_value, int output_mc_id = 0, int input_mc_id = 0)¶
tops::reduction_kernel is used to perform reduction operations on the 1st dimension (i.e., the middle dimension) of a 3D tensor.
- Parameters
f – Anonymous function for data computation on L1 memory.
output_addr – [out] Pointer to output data on device memory (L3).
output_shape – An int array describing the shape of the output.
input_addr – [in] Pointer to input data on device memory (L3).
input_shape – An int array describing the shape of the input.
identity_value – Identity element of the reduction op.
output_mc_id – Memory control id for output memory.
input_mc_id – Memory control id for input memory.
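A minimal sketch of a sum reduction over the middle dimension of a [d0, d1, d2] tensor. The [d0, d2] output shape and the generic-lambda form of the combining function are assumptions drawn from the description above:
#include <tops/reduction.h>

// Hypothetical kernel: out[i][k] = sum over j of in[i][j][k].
__global__ void reduce_sum_dim1(float *out, float *in, int d0, int d1, int d2) {
  int output_shape[2] = {d0, d2};     // assumed output layout after reducing dim 1
  int input_shape[3]  = {d0, d1, d2};
  tops::reduction_kernel(
      [](auto &acc, const auto &x) { acc = acc + x; }, // assumed combiner form
      out, output_shape, in, input_shape,
      /*identity_value=*/0.0f);
}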
Variables
- __shared__ unsigned char __tops_all_smem []
-
namespace tops::nn¶
Enums
Functions
- template<std::size_t N> __global__ void transpose_l3tol3 (float *from, float *to, int *from_sp, int *to_sp, int *offset_sp)
-
inline DotForwardMode getDotForwardMode(const int &n, const int &ci, const int &co)¶
-
template<std::size_t RANK>
inline void transposeWrapper(float *src_d, float *reshape_d, int *src_shape_h, int *dst_shape_h, int *layout_h)¶
-
inline topsError_t dot_(float *lhs, float *rhs, float *out, int n, int k, int m, int gridsz = 1, int blocksz = 1, topsStream_t stream = nullptr)¶
- inline topsError_t dot (float *lhs, float *rhs, float *out, const int &n, const int &k, const int &m) __attribute__((diagnose_if(1, "tops::nn::dot is used for demo. For performance, pls use topsDNN", "warning")))
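A minimal host-side sketch of calling the demo GEMM helper on device buffers; the [n, k] x [k, m] -> [n, m] interpretation of n, k, m, the buffer sizes, and the <tops/nn.h> header path are assumptions:
#include <tops/nn.h> // assumed umbrella header for tops::nn

// Hypothetical host code: d_lhs holds n*k floats, d_rhs holds k*m floats, and
// d_out holds n*m floats, all already allocated on the device (e.g. via topsMalloc).
void demo_dot(float *d_lhs, float *d_rhs, float *d_out) {
  const int n = 32, k = 64, m = 16;
  topsError_t err = tops::nn::dot_(d_lhs, d_rhs, d_out, n, k, m);
  (void)err; // error handling omitted in this sketch
}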
-
template<int tile_size = 0, typename T, typename FT>
__device__ __forceinline__ void select_kernel(FT f, T *output, T *lhs, T *rhs, int size, int output_mc = 0, int lhs_mc = 0, int rhs_mc = 0)¶
Adds corresponding elements of two arrays and stores the result in the output array.
\( output[i] = lhs[i] + rhs[i]. \)
Subtracts corresponding elements of two arrays and stores the result in the output array.
\( output[i] = lhs[i] - rhs[i]. \)
Multiplies corresponding elements of two arrays and stores the result in the output array.
\( output[i] = lhs[i] * rhs[i]. \)
Divides corresponding elements of two arrays and stores the result in the output array.
\( output[i] = lhs[i] / rhs[i]. \)
Computes the element-wise modulus of two arrays and stores the result in the output array.
\( output[i] = lhs[i]\mod rhs[i]. \)
Computes the element-wise maximum of two arrays and stores the result in the output array.
\( output[i] = \max(lhs[i], rhs[i]). \)
Computes the element-wise minimum of two arrays and stores the result in the output array.
\( output[i] = \min(lhs[i], rhs[i]). \)
Note
Supported vector types: vchar/vuchar/vshort/vushort/vint/vuint.
- Parameters
output – Pointer to the output array on the device (L3 memory).
lhs – Pointer to the first input array on the device (L3 memory).
rhs – Pointer to the second input array on the device (L3 memory).
size – The total number of elements in the array.
output_mc – Memory control id for output memory.
lhs_mc – Memory control id for lhs input memory.
rhs_mc – Memory control id for rhs input memory.
- template<int tile_size = 0, typename T, typename FT> __global__ void select (FT f, T *output, T *lhs, T *rhs, int size, int output_mc=0, int lhs_mc=0, int rhs_mc=0)
Select from two arrays by condition.
\( output[i] = cond[i] ? lhs[i] : rhs[i]. \)
- Parameters
f – Conditional expressions on rhs and lhs.
output – Pointer to the output array on the device (L3 memory).
lhs – Pointer to the first input array on the device (L3 memory).
rhs – Pointer to the second input array on the device (L3 memory).
size – The total number of elements in the array.
output_mc – Memory control id for output memory.
lhs_mc – Memory control id for lhs input memory.
rhs_mc – Memory control id for rhs input memory.
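A minimal device-side sketch that builds an element-wise "pick the larger" operation from the select primitive; the lane-wise comparison form of the predicate and the <tops/nn.h> header path are assumptions:
#include <tops/nn.h> // assumed header path for tops::nn

// Hypothetical wrapper: output[i] = (lhs[i] > rhs[i]) ? lhs[i] : rhs[i].
__global__ void select_greater(int *output, int *lhs, int *rhs, int size) {
  tops::nn::select_kernel(
      [](vint a, vint b) { return a > b; }, // condition evaluated on lhs/rhs tiles
      output, lhs, rhs, size);
}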
-
template<typename T>
__device__ __forceinline__ void broadcast_in_dim_kernel(T *output, T *input, int dim0_size, int dim1_size, int broadcast_dim, int broadcast_dim_size)¶
- template<typename T> __global__ void broadcast_in_dim (T *output, T *input, int dim0_size, int dim1_size, int broadcast_dim, int broadcast_dim_size)
Broadcasts a two-dimensional array to a three-dimensional array with customizable broadcast dimension and size.
- Parameters
output – Pointer to the output array on device memory(L3).
input – Pointer to the input array on device memory(L3).
dim0_size – The size of dimension 0 in the input array.
dim1_size – The size of dimension 1 in the input array.
broadcast_dim – The desired broadcast dimension in the output array, should be 0, 1 or 2.
broadcast_dim_size – The size of the broadcast dimension in the output array.
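A host-side sketch of broadcasting a [4, 8] input to a [4, 3, 8] output by inserting a broadcast dimension of size 3 at position 1; the launch configuration and the <tops/nn.h> header path are assumptions:
#include <tops/nn.h> // assumed header path for tops::nn

// Hypothetical host code; d_out and d_in are device (L3) buffers of matching size.
void broadcast_demo(float *d_out, float *d_in) {
  tops::nn::broadcast_in_dim<<<1, 1>>>(d_out, d_in,
                                       /*dim0_size=*/4, /*dim1_size=*/8,
                                       /*broadcast_dim=*/1,
                                       /*broadcast_dim_size=*/3);
}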
- template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void sum (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)
Performs a ReduceSum operation on the first (i.e., middle) dimension of a 3D tensor.
- Template Parameters
tile_size0 – tile size on the 0th dimension.
tile_size1 – tile size on the 1st dimension.
tile_size2 – tile size on the 2nd dimension.
- Parameters
output_addr – Pointer to output data on device memory(L3).
input_addr – Pointer to input data on device memory(L3).
dim0 – The size of the 0th dimension of the input 3D tensor.
dim1 – The size of the 1st dimension of the input 3D tensor.
dim2 – The size of the 2nd dimension of the input 3D tensor.
output_mc_id – Memory control id for output memory.
input_mc_id – Memory control id for input memory.
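A host-side sketch of the ReduceSum launch for a [2, 16, 64] float tensor producing a [2, 64] result; the launch configuration, default tile sizes, and <tops/nn.h> header path are assumptions. tops::nn::max and tops::nn::min below take the same arguments.
#include <tops/nn.h> // assumed header path for tops::nn

// Hypothetical host code; d_out holds 2*64 floats, d_in holds 2*16*64 floats on L3.
void reduce_sum_demo(float *d_out, float *d_in) {
  tops::nn::sum<<<1, 1>>>(d_out, d_in, /*dim0=*/2, /*dim1=*/16, /*dim2=*/64);
}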
- template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void max (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)
Performs a ReduceMax operation on a 3D tensor along the first (i.e. middle) dimension.
- Template Parameters
tile_size0 – tile size on the 0th dimension.
tile_size1 – tile size on the 1st dimension.
tile_size2 – tile size on the 2nd dimension.
- Parameters
output_addr – Pointer to output data on device memory(L3).
input_addr – Pointer to input data on device memory(L3).
dim0 – The size of the 0th dimension of the input 3D tensor.
dim1 – The size of the 1st dimension of the input 3D tensor.
dim2 – The size of the 2nd dimension of the input 3D tensor.
output_mc_id – Memory control id for output memory.
input_mc_id – Memory control id for input memory.
- template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void min (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)
Performs a ReduceMin operation on a 3D tensor along the first (i.e. middle) dimension.
- Template Parameters
tile_size0 – tile size on the 0th dimension.
tile_size1 – tile size on the 1st dimension.
tile_size2 – tile size on the 2nd dimension.
- Parameters
output_addr – Pointer to output data on device memory(L3).
input_addr – Pointer to input data on device memory(L3).
dim0 – The size of the 0th dimension of the input 3D tensor.
dim1 – The size of the 1st dimension of the input 3D tensor.
dim2 – The size of the 2nd dimension of the input 3D tensor.
output_mc_id – Memory control id for output memory.
input_mc_id – Memory control id for input memory.
Variables
-
float *lhs_reshape_d¶
-
float *rhs_reshape_d¶
-
topsError_t error = topsMalloc(&lhs_reshape_d, n * k * sizeof(float))¶
- file bfloat.h
- #include <>#include <>#include <>
A builtin float type, tops::bfloat.
- file debug.h
- #include <>
- file dte_chain.h
- #include <>#include <>
- file elemwise.h
- #include <> #include "kernel_common.h" #include <> #include <> #include <>
Defines
-
__TOPS_ELEMWISE_TILE_SIZE_ALIGN¶
-
__TOPS_ELEMWISE_MAX_CDTE_VC¶
-
__TOPS_SHARED_MEM_STRIDE_BY_T¶
-
__TOPS_ASSERT_L1_SIZE¶
-
__TOPS_SMEM_INPUT_ADDR(pp)¶
-
__TOPS_SMEM_OUTPUT_ADDR(pp)¶
-
__TOPS_ELEMWISE_INPUT_FLOW¶
-
__TOPS_ELEMWISE_OUTPUT_FLOW¶
-
__TOPS_ELEMWISE_INNER_INPUT_FLOW¶
-
__TOPS_ELEMWISE_INNER_OUTPUT_FLOW¶
-
__TOPS_ELEMWISE_OUTPUT_FINI_FLOW¶
- file elemwise_local.h
- #include <> #include <> #include "kernel_common.h" #include <> #include <>
Defines
-
__TOPS_ELEMWISE_LOCAL_EXT32(hi, lo)¶
- file half.h
- #include <>#include <>#include <>
A builtin float type, tops::half.
Defines
-
__TOPS_HALF_CTR(t)¶
-
__TOPS_HALF_CONV(t)¶
-
__TOPS_HALF_ASSIGN(op)¶
-
__TOPS_HALF_CMP(op)¶
-
__TOPS_HALF_BIN(op)¶
Functions
-
__device__ __forceinline__ uint16_t __float2half_rn(float x)¶
Converts a float x to a tops::half and returns its binary representation as a uint16_t.
float x = 42;
uint16_t y = __float2half_rn(x); // y equals 20800, i.e. 0b0101000101000000 (sign 0, exponent 10100, mantissa 0101000000)
-
__device__ __forceinline__ float __half2float(uint16_t x)¶
Converts a tops::half x, represented as a uint16_t, to a float and returns it.
uint16_t x = 20800;        // 20800 = 0b0101000101000000
float a = __half2float(x); // a equals 42.0
- file kernel_common.h
- #include <> #include <tops/bfloat.h> #include <tops/half.h> #include <> #include <>
- file nn.h
- #include <tops/nn/primitive.h> #include <tops/nn/math.h> #include <tops/nn/reduction.h>
- file dot.h
- #include <>#include <>#include <>#include <>
- file math.h
- #include <> #include <tops/elemwise.h> #include <tops/elemwise_local.h> #include <tops/nn/utils.h>
- file math.h
- #include "device_math.h" #include "host_math.h"
- file primitive.h
- #include <> #include <tops/elemwise.h> #include <tops/elemwise_local.h> #include <tops/nn/utils.h> #include <assert.h>
- file reduction.h
- #include <> #include <tops/reduction.h> #include <tops/nn/utils.h> #include <tops/elemwise_local.h> #include <tops/reduction_local.h>
Defines
-
__TOPS_REDUCTION_KERNEL_OP(n, op)¶
- file reduction.h
- #include <> #include "kernel_common.h"
Defines
-
__TOPS_REDUCTION_IS_ASYNC¶
-
__TOPS_REDUCE_DEFAULT_TILE_SIZE¶
-
__TOPS_ASSERT_L1_SIZE
-
__TOPS_REDUCTION_SMEM_INPUT_ADDR(pp)¶
-
__TOPS_REDUCTION_SMEM_OUTPUT_ADDR¶
-
__TOPS_REDUCTION_INPUT_FLOW¶
-
__TOPS_REDUCTION_INNER_INPUT_FLOW¶
-
__TOPS_REDUCTION_INNER_OUTPUT_FLOW¶
-
__TOPS_REDUCTION_OUTPUT_FLOW¶
-
__TOPS_VECTOR_MEM_LOCATION¶
- file utils.h
- #include <> #include <tops/kernel_common.h>
- file reduction_local.h
- #include <> #include <tops/elemwise_local.h> #include <assert.h>
- file assert.h
- #include <>
- file device_math.h
- #include <> #include <> #include <> #include <assert.h>
Defines
-
__TOPS_KRT_MATH_WRAPPER1_P(op)¶
-
__TOPS_KRT_MATH_WRAPPER1(op)¶
-
__TOPS_KRT_MATH_WRAPPER2_P(op)¶
-
__TOPS_KRT_MATH_WRAPPER2(op)¶
-
__TOPS_MATH_BUILTIN(mf, md)¶
-
__TOPS_MATH_BUILTIN2(mf, md)¶
-
__TOPS_MATH_BUILTIN3(mf, md)¶
-
__TOPS_MATH_BUILTIN_D(mf, md)¶
-
__TOPS_MATH_BUILTIN2_D(mf, md)¶
-
__TOPS_MATH_BUILTIN3_D(mf, md)¶
Functions
-
__device__ __forceinline__ float __powf(float a, float b)¶
-
__device__ __forceinline__ float fmaxf(float a, float b)¶
-
__device__ __forceinline__ float __fmaxf(float a, float b)¶
-
__device__ __forceinline__ float fminf(float a, float b)¶
-
__device__ __forceinline__ float __fminf(float a, float b)¶
-
__device__ __forceinline__ float fabsf(float a)¶
-
__device__ __forceinline__ float __fabsf(float a)¶
-
__device__ unsigned int __popcountsi2(unsigned int)¶
-
__device__ __forceinline__ unsigned int __popc(unsigned int val)¶
-
__device__ unsigned long long int __popcountdi2(unsigned long long int)¶
-
__device__ __forceinline__ unsigned long long int __popcll(unsigned long long int val)¶
-
__device__ unsigned long long int __ffsdi2(unsigned long long int)¶
-
__device__ __forceinline__ unsigned long long int __ffs(unsigned long long int val)¶
- file host_math.h
- file stdio.h
- file wchar.h
- group bfloat
This section describes one of the builtin types, tops::bfloat.
Defines
-
__TOPS_BFLOAT_CMP(op)
-
__TOPS_BFLOAT_BIN(op)
Typedefs
-
typedef bfloat bfloat16¶
Both tops::bfloat and tops::bfloat16 represent the bfloat16 floating-point format.
- group Elementwise
Defines
-
__TOPS_ELEMWISE_TILE_SIZE_ALIGN
-
__TOPS_ELEMWISE_MAX_CDTE_VC
-
__TOPS_SHARED_MEM_STRIDE_BY_T
-
__TOPS_ASSERT_L1_SIZE
-
__TOPS_SMEM_INPUT_ADDR(pp)
-
__TOPS_SMEM_OUTPUT_ADDR(pp)
-
__TOPS_ELEMWISE_INPUT_FLOW
-
__TOPS_ELEMWISE_OUTPUT_FLOW
-
__TOPS_ELEMWISE_INNER_INPUT_FLOW
-
__TOPS_ELEMWISE_INNER_OUTPUT_FLOW
-
__TOPS_ELEMWISE_OUTPUT_FINI_FLOW
- group half
This section describes one of the builtin types, tops::half.
Defines
-
__TOPS_HALF_CMP(op)
-
__TOPS_HALF_BIN(op)
- group nn
Defines
-
__TOPS_UNARY_KERNEL_OP(op)
-
__TOPS_UNARY_OP(op)
-
__TOPS_BINARY_KERNEL_OP(op)
-
__TOPS_BINARY_OP(op)
-
__TOPS_REDUCTION_KERNEL_OP(n, op)
- group Reduction
Defines
-
__TOPS_REDUCTION_IS_ASYNC
-
__TOPS_REDUCE_DEFAULT_TILE_SIZE
-
__TOPS_ASSERT_L1_SIZE
-
__TOPS_REDUCTION_SMEM_INPUT_ADDR(pp)
-
__TOPS_REDUCTION_SMEM_OUTPUT_ADDR
-
__TOPS_REDUCTION_INPUT_FLOW
-
__TOPS_REDUCTION_INNER_INPUT_FLOW
-
__TOPS_REDUCTION_INNER_OUTPUT_FLOW
-
__TOPS_REDUCTION_OUTPUT_FLOW
-
__TOPS_VECTOR_MEM_LOCATION
- group Scalar
- dir include
- dir include/tops/nn
- dir include/tops
- dir include/tops_wrappers