1. Kernel Library API¶
-
struct tops::bfloat¶
- #include <bfloat.h>
Declaration
The bfloat16 floating-point format is supported in the tops namespace. You can declare a bfloat16 number on both the host and the device side by using tops::bfloat.
#include <tops/bfloat.h>

tops::bfloat a(12);
You can initialize a tops::bfloat variable from a variable of another type. The types double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.
#include <tops/bfloat.h>

short a = 12;
tops::bfloat b(a);
tops::bfloat c = (tops::bfloat)a;  // explicit conversion required
tops::bfloat d{a};
tops::bfloat e(42);
Type Conversion
The tops::bfloat type also supports type conversions. You can convert a tops::bfloat variable to the types double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool.
#include <tops/bfloat.h>

tops::bfloat a(12.5);
float b = a;  // b equals 12.500
int c = a;    // c equals 12
short d = a;  // d equals 12
Compound Assignment Operators
The tops::bfloat type also supports the compound assignment operators +=, -=, *= and /=.
#include <tops/bfloat.h>

tops::bfloat a(12);
tops::bfloat b(4);
a += b;  // a equals 16.0
a -= b;  // a equals 12.0
a *= b;  // a equals 48.0
a /= b;  // a equals 12.0
Comparison Operators
The tops::bfloat type also supports the comparison operators ==, !=, >, <, >= and <=.
#include <tops/bfloat.h>

tops::bfloat a(12);
tops::bfloat b(4);
bool c;
c = (a == b);  // c equals false
c = (a != b);  // c equals true
c = (a > b);   // c equals true
c = (a >= b);  // c equals true
c = (a < b);   // c equals false
c = (a <= b);  // c equals false
Four Arithmetic Operations
The tops::bfloat type supports the four arithmetic operators +, -, * and /.
#include <tops/bfloat.h>

tops::bfloat a(12);
tops::bfloat b(4);
tops::bfloat c;
c = a + b;  // c equals 16.0
c = a - b;  // c equals 8.0
c = a * b;  // c equals 48.0
c = a / b;  // c equals 3.0
Private Functions
- inline __forceinline__ __host__ __device__ uint16_t from_float (float v) const
- inline __forceinline__ __host__ __device__ float to_float (uint16_t v) const
-
__forceinline__ __host__ __device__ bfloat() = default¶
Private Members
-
uint16_t value¶
The tops::bfloat value in bit representation.
-
struct tops::half¶
- #include <half.h>
Declaration
The half-precision floating-point data type tops::half is supported in the tops namespace. You can declare a half number on both the host and the device side by using tops::half.
#include <tops/half.h>

tops::half a(12);
You can initialize a tops::half variable from a variable of another type. The types double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool are supported.
#include <tops/half.h>

short a = 12;
tops::half b(a);
tops::half c = (tops::half)a;  // explicit conversion required
tops::half d{a};
tops::half e(42);
Type Conversion
The tops::half type supports type conversions. You can convert a tops::half variable to the types double, float, long, unsigned long, int, unsigned int, short, unsigned short, char, signed char, unsigned char, and bool.
#include <tops/half.h>

tops::half a(12.5);
float b = a;  // b equals 12.500
int c = a;    // c equals 12
short d = a;  // d equals 12
Compound Assignment Operators
The tops::half type supports the compound assignment operators +=, -=, *= and /=.
#include <tops/half.h>

tops::half a(12);
tops::half b(4);
a += b;  // a equals 16.0
a -= b;  // a equals 12.0
a *= b;  // a equals 48.0
a /= b;  // a equals 12.0
Comparison Operators
The tops::half type supports the comparison operators ==, !=, >, <, >= and <=.
#include <tops/half.h>

tops::half a(12);
tops::half b(4);
bool c;
c = (a == b);  // c equals false
c = (a != b);  // c equals true
c = (a > b);   // c equals true
c = (a >= b);  // c equals true
c = (a < b);   // c equals false
c = (a <= b);  // c equals false
Four Arithmetic Operations
The tops::half type supports the four arithmetic operators +, -, * and /.
#include <tops/half.h>

tops::half a(12);
tops::half b(4);
tops::half c;
c = a + b;  // c equals 16.0
c = a - b;  // c equals 8.0
c = a * b;  // c equals 48.0
c = a / b;  // c equals 3.0
Private Functions
- inline __forceinline__ __host__ __device__ uint16_t from_float (float v) const
- inline __forceinline__ __host__ __device__ float to_float (uint16_t v) const
-
__forceinline__ __host__ __device__ half() = default¶
Private Members
-
uint16_t value¶
-
struct tops::InputType¶
Public Functions
-
inline __device__ __forceinline__ InputType(int id)¶
Public Members
-
int mc_id¶
-
template<typename T>
struct is_input_io¶
-
template<>
struct tops::is_input_io<InputType>¶
Public Static Attributes
-
static const bool value = true¶
-
template<>
struct tops::is_input_io<OutputType>¶
Public Static Attributes
-
static const bool value = false¶
-
template<>
struct tops::MaxValue<tops::bfloat>¶
Public Static Attributes
-
static constexpr float value = std::numeric_limits<float>::max()¶
-
template<>
struct tops::MaxValue<tops::half>¶
Public Static Attributes
-
static constexpr float value = std::numeric_limits<float>::max()¶
-
template<>
struct tops::MinValue<tops::bfloat>¶
Public Static Attributes
-
static constexpr float value = std::numeric_limits<float>::lowest()¶
-
template<>
struct tops::MinValue<tops::half>¶
Public Static Attributes
-
static constexpr float value = std::numeric_limits<float>::lowest()¶
-
struct tops::OutputType¶
Public Functions
-
inline __device__ __forceinline__ OutputType(int id)¶
-
inline __device__ __forceinline__ OutputType operator+(int i)¶
Public Members
-
int mc_id¶
-
struct tops::OutputTypeG¶
Public Functions
-
inline __device__ __forceinline__ OutputType operator()(int mc_id) const¶
-
inline __device__ __forceinline__ operator OutputType() const¶
-
inline __device__ __forceinline__ OutputType operator+(int i)¶
-
template<typename T>
struct scalar2vector¶
-
namespace tops¶
Typedefs
-
typedef bfloat bfloat16¶
Both tops::bfloat and tops::bfloat16 represent the bfloat16 floating-point format.
Functions
-
template<typename VT>
__device__ __forceinline__ VT vbroadcast(const bfloat &e, bool dummy = true)¶ A
tops::bfloat can be broadcast to a vbfloat vector, which holds 64 elements of tops::bfloat.
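For illustration only, a minimal device-code sketch of broadcasting a scalar; it assumes vbfloat is the 64-lane vector type named above and is visible through the usual kernel headers.
#include <tops/bfloat.h>

__device__ void broadcast_bfloat_example() {
  tops::bfloat s(1.5f);                      // scalar bfloat16 value
  vbfloat v = tops::vbroadcast<vbfloat>(s);  // every lane of v now holds 1.5
}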
-
template<typename ...Args>
__device__ __forceinline__ dte_chain<Args...> make_dte_chain(Args&&... args)¶
-
template<int tile_size, bool vectorized = true, bool async = true, bool is_rem = false, int input_count = 0, int output_count = 0, int input_ev_count = 0, int output_ev_count = 0, typename T, typename IO, typename std::enable_if<input_count + output_count == 0, bool>::type = true, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_tiles(const FT &f, int size, IO io, T *addr, Args&&... args)¶
-
template<int tile_size, bool vectorized = true, bool async = true, bool is_rem = false, int input_count = 0, int output_count = 0, int input_ev_count = 0, int output_ev_count = 0, typename T, typename FT>
__device__ __forceinline__ void elemwise_tiles(const FT &f, int size)¶
-
template<int tile_size = 0, bool vectorized = true, bool async = true, typename IO, typename T, typename FT, typename std::enable_if<std::is_same<IO, InputType>::value || std::is_same<IO, OutputType>::value, bool>::type = true, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, IO io, T *addr, Args&&... args)¶
tops::elemwise_kernel is used to perform element-wise operations (see the sketch after the parameter list).
- Parameters
f – Anonymous function for data computation on L1 memory.
total_size – The total number of elements in the input/output data.
io – tops::Input or tops::Output.
addr – Pointer to the input/output data on device memory (L3).
args – Additional pairs of type (tops::Input/tops::Output) and address.
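For illustration only, a minimal sketch of an element-wise add built on tops::elemwise_kernel. The functor shape (an output reference followed by the input values) is an assumption, not taken from this reference; elemwise_kernel is assumed to come from <tops/elemwise.h>, which is listed among the library headers below.
#include <tops/elemwise.h>

// Sketch (assumed functor shape): c[i] = a[i] + b[i] over n elements.
__global__ void add_f32(float *c, float *a, float *b, int n) {
  tops::elemwise_kernel(
      [](float &o, float x, float y) { o = x + y; },  // computation on L1 tiles
      n,
      tops::Output, c,   // first io/addr pair
      tops::Input, a,    // additional pairs passed via args...
      tops::Input, b);
}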
-
template<int tile_size = 0, bool vectorized = true, bool async = true, typename T, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, InputTypeG io, T *addr, Args&&... args)¶
-
template<int tile_size = 0, bool vectorized = true, bool async = true, typename T, typename FT, typename ...Args>
__device__ __forceinline__ void elemwise_kernel(const FT &f, int total_size, OutputTypeG io, T *addr, Args&&... args)¶
-
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename IO, typename VFT, typename SFT, typename std::enable_if<input_count + output_count == 0, bool>::type = true, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const VFT &vf, const SFT &sf, int size, IO io, T *addr, ARGS&&... args)¶
-
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename IO, typename FT, typename std::enable_if<input_count + output_count != 0, bool>::type = true, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const FT &f, int size, IO io, T *addr, ARGS&&... args)¶
-
template<int LATENCY = 10, typename T, int input_count = 0, int output_count = 0, typename FT>
__device__ __forceinline__ void elemwise_local(const FT &f, int size)¶
-
template<int LATENCY = 10, typename T, typename IO, typename FT, typename ...ARGS>
__device__ __forceinline__ void elemwise_local(const FT &f, int size, IO io, T *addr, ARGS&&... args)¶
-
template<typename VT>
__device__ __forceinline__ VT vbroadcast(const half &e, bool dummy = true)¶ A
tops::half can be broadcast to a vhalf vector, which holds 64 elements of tops::half.
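A matching sketch for the half-precision overload, under the same assumptions, with vhalf as the 64-lane vector type named above.
#include <tops/half.h>

__device__ void broadcast_half_example() {
  tops::half s(2.0f);
  vhalf v = tops::vbroadcast<vhalf>(s);  // every lane of v now holds 2.0
}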
- __device__ __attribute__ ((weak)) InputTypeG Input
-
template<int tile_size0, int tile_size1, int tile_size2, typename IT, typename OT, typename FT>
__device__ __forceinline__ void reduce_tiles(const FT &f, OT *output_addr, int (&output_shape)[2], IT *input_addr, int (&input_shape)[3], int (&offsets)[3], int (&sizes)[3], OT identity_value, int output_mc_id = 0, int input_mc_id = 0, bool cross_thread = false)¶
-
template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename IT, typename OT, typename FT>
__device__ __forceinline__ void reduction_kernel(const FT &f, OT *output_addr, int (&output_shape)[2], IT *input_addr, int (&input_shape)[3], OT identity_value, int output_mc_id = 0, int input_mc_id = 0)¶
tops::reduction_kernel is used to perform reduction operations on the 1st dimension (i.e., the middle dimension) of a 3D tensor (see the sketch after the parameter list).
- Parameters
f – Anonymous function for data computation on L1 memory.
output_addr – [out] Pointer to output data on device memory(L3).
output_shape – An int array describing the shape of the output.
input_addr – [in] Pointer to input data on device memory(L3).
input_shape – An int array describing the shape of the input.
identity_value – Identity element of reduction op.
output_mc_id – Memory control id for output memory.
input_mc_id – Memory control id for input memory.
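For illustration only, a minimal sketch of a sum reduction over the middle dimension, turning a [4, 64, 128] tensor into a [4, 128] output. The reducer shape (a binary combine returning the accumulated value) and the <tops/reduction.h> include are assumptions based on the headers listed later in this reference.
#include <tops/reduction.h>

// Sketch (assumed reducer shape): ReduceSum over dim 1 of a [4, 64, 128] float tensor.
__global__ void reduce_sum_dim1(float *out, float *in) {
  int input_shape[3]  = {4, 64, 128};
  int output_shape[2] = {4, 128};
  tops::reduction_kernel(
      [](float acc, float v) { return acc + v; },  // assumed combine signature
      out, output_shape,
      in, input_shape,
      0.0f /* identity_value for a sum */);
}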
Variables
- __shared__ unsigned char __tops_all_smem []
-
namespace tops::nn¶
Enums
Functions
- template<std::size_t N> __global__ void transpose_l3tol3 (float *from, float *to, int *from_sp, int *to_sp, int *offset_sp)
-
inline DotForwardMode getDotForwardMode(const int &n, const int &ci, const int &co)¶
-
template<std::size_t RANK>
inline void transposeWrapper(float *src_d, float *reshape_d, int *src_shape_h, int *dst_shape_h, int *layout_h)¶
-
inline topsError_t dot_(float *lhs, float *rhs, float *out, int n, int k, int m, int gridsz = 1, int blocksz = 1, topsStream_t stream = nullptr)¶
- inline topsError_t dot (float *lhs, float *rhs, float *out, const int &n, const int &k, const int &m)
-
template<int tile_size = 0, typename T, typename FT>
__device__ __forceinline__ void select_kernel(FT f, T *output, T *lhs, T *rhs, int size, int output_mc = 0, int lhs_mc = 0, int rhs_mc = 0)¶
Adds corresponding elements of two arrays and stores the result in the output array.
Subtracts corresponding elements of two arrays and stores the result in the output array.
Multiplies corresponding elements of two arrays and stores the result in the output array.
Divides corresponding elements of two arrays and stores the result in the output array.
Computes the element-wise modulus of two arrays and stores the result in the output array.
Computes the element-wise maximum of two arrays and stores the result in the output array.
Computes the element-wise minimum of two arrays and stores the result in the output array.
Note
Supported vector types: vchar/vuchar/vshort/vushort/vint/vuint.
- Parameters
output – Pointer to the output array on the device (L3 memory).
lhs – Pointer to the first input array on the device (L3 memory).
rhs – Pointer to the second input array on the device (L3 memory).
size – The total number of elements in the array.
output_mc – Memory control id for output memory.
lhs_mc – Memory control id for lhs input memory.
rhs_mc – Memory control id for rhs input memory.
- template<int tile_size = 0, typename T, typename FT> __global__ void select (FT f, T *output, T *lhs, T *rhs, int size, int output_mc=0, int lhs_mc=0, int rhs_mc=0)
Select from two arrays by condition.
- Parameters
f – Conditional expression applied to the corresponding lhs and rhs elements.
output – Pointer to the output array on the device (L3 memory).
lhs – Pointer to the first input array on the device (L3 memory).
rhs – Pointer to the second input array on the device (L3 memory).
size – The total number of elements in the array.
output_mc – Memory control id for output memory.
lhs_mc – Memory control id for lhs input memory.
rhs_mc – Memory control id for rhs input memory.
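For illustration only, a host-side launch sketch that keeps the larger of the two inputs element-wise, assuming the semantics out[i] = f(lhs[i], rhs[i]) ? lhs[i] : rhs[i]. The predicate shape, the <tops/nn.h> include, and the <<<1, 1>>> launch configuration are assumptions.
#include <tops/nn.h>  // assumed header for the tops::nn kernels

struct GreaterThan {
  __device__ __forceinline__ bool operator()(float l, float r) const { return l > r; }
};

// Sketch: out_d[i] = (lhs_d[i] > rhs_d[i]) ? lhs_d[i] : rhs_d[i] (assumed semantics).
void launch_select(float *out_d, float *lhs_d, float *rhs_d, int size) {
  tops::nn::select<<<1, 1>>>(GreaterThan{}, out_d, lhs_d, rhs_d, size);
}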
-
template<typename T>
__device__ __forceinline__ void broadcast_in_dim_kernel(T *output, T *input, int dim0_size, int dim1_size, int broadcast_dim, int broadcast_dim_size)¶
- template<typename T> __global__ void broadcast_in_dim (T *output, T *input, int dim0_size, int dim1_size, int broadcast_dim, int broadcast_dim_size)
Broadcasts a two-dimensional array to a three-dimensional array with customizable broadcast dimension and size.
- Parameters
output – Pointer to the output array on device memory(L3).
input – Pointer to the input array on device memory(L3).
dim0_size – The size of dimension 0 in the input array.
dim1_size – The size of dimension 1 in the input array.
broadcast_dim – The desired broadcast dimension in the output array, should be 0, 1 or 2.
broadcast_dim_size – The size of the broadcast dimension in the output array.
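For illustration only, a launch sketch with concrete shapes; the assumed output shape [4, 3, 8], the <tops/nn.h> include, and the <<<1, 1>>> launch configuration are assumptions.
#include <tops/nn.h>  // assumed header for the tops::nn kernels

// Sketch: broadcast a [4, 8] input into a 3D output whose broadcast
// dimension is dim 1 with size 3 (assumed to produce shape [4, 3, 8]).
void launch_broadcast(float *out_d, float *in_d) {
  tops::nn::broadcast_in_dim<<<1, 1>>>(out_d, in_d,
                                       /*dim0_size=*/4, /*dim1_size=*/8,
                                       /*broadcast_dim=*/1,
                                       /*broadcast_dim_size=*/3);
}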
- template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void sum (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)
Performs a ReduceSum operation on the first (i.e., middle) dimension of a 3D tensor.
- Template Parameters
tile_size0 – tile size on the 0th dimension.
tile_size1 – tile size on the 1st dimension.
tile_size2 – tile size on the 2nd dimension.
- Parameters
output_addr – Pointer to output data on device memory(L3).
input_addr – Pointer to input data on device memory(L3).
dim0 – The size of the 0th dimension of the input 3D tensor.
dim1 – The size of the 1st dimension of the input 3D tensor.
dim2 – The size of the 2nd dimension of the input 3D tensor.
output_mc_id – Memory control id for output memory.
input_mc_id – Memory control id for input memory.
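For illustration only, a launch sketch that sum-reduces the middle dimension of a [2, 64, 128] float tensor into a [2, 128] output; the <tops/nn.h> include, the default tile sizes, and the <<<1, 1>>> launch configuration are assumptions.
#include <tops/nn.h>  // assumed header for the tops::nn kernels

// Sketch: ReduceSum over dim 1, [2, 64, 128] -> [2, 128].
void launch_sum(float *out_d, float *in_d) {
  tops::nn::sum<<<1, 1>>>(out_d, in_d, /*dim0=*/2, /*dim1=*/64, /*dim2=*/128);
}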
- template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void max (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)
Performs a ReduceMax operation on a 3D tensor along the first (i.e. middle) dimension.
- Template Parameters
tile_size0 – tile size on the 0th dimension.
tile_size1 – tile size on the 1st dimension.
tile_size2 – tile size on the 2nd dimension.
- Parameters
output_addr – Pointer to output data on device memory(L3).
input_addr – Pointer to input data on device memory(L3).
dim0 – The size of the 0th dimension of the input 3D tensor.
dim1 – The size of the 1st dimension of the input 3D tensor.
dim2 – The size of the 2nd dimension of the input 3D tensor.
output_mc_id – Memory control id for output memory.
input_mc_id – Memory control id for input memory.
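A matching launch sketch for the max reduction, under the same assumptions as the sum example above.
// Sketch: ReduceMax over dim 1, [8, 32, 16] -> [8, 16].
void launch_max(float *out_d, float *in_d) {
  tops::nn::max<<<1, 1>>>(out_d, in_d, /*dim0=*/8, /*dim1=*/32, /*dim2=*/16);
}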
- template<int tile_size0 = 0, int tile_size1 = 0, int tile_size2 = 0, typename T> __global__ void min (T *output_addr, T *input_addr, int dim0, int dim1, int dim2, int output_mc_id=0, int input_mc_id=0)
Performs a ReduceMin operation on a 3D tensor along the first (i.e. middle) dimension.
- Template Parameters
tile_size0 – tile size on the 0th dimension.
tile_size1 – tile size on the 1st dimension.
tile_size2 – tile size on the 2nd dimension.
- Parameters
output_addr – Pointer to output data on device memory(L3).
input_addr – Pointer to input data on device memory(L3).
dim0 – The size of the 0th dimension of the input 3D tensor.
dim1 – The size of the 1st dimension of the input 3D tensor.
dim2 – The size of the 2nd dimension of the input 3D tensor.
output_mc_id – Memory control id for output memory.
input_mc_id – Memory control id for input memory.
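A matching launch sketch for the min reduction, again under the same assumptions; here with int elements to show that the element type is a template parameter.
// Sketch: ReduceMin over dim 1, [8, 32, 16] -> [8, 16], int elements.
void launch_min(int *out_d, int *in_d) {
  tops::nn::min<<<1, 1>>>(out_d, in_d, /*dim0=*/8, /*dim1=*/32, /*dim2=*/16);
}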
Variables
- topsError_t tops::nn::dot is provided for demonstration only; for performance, please use topsDNN.
-
float *lhs_reshape_d¶
-
float *rhs_reshape_d¶
-
topsError_t error = topsMalloc(&lhs_reshape_d, n * k * sizeof(float))¶
- file bfloat.h
A builtin float type, tops::bfloat.
- file debug.h
- file dte_chain.h
- file elemwise.h
- #include "kernel_common.h"
Defines
-
__TOPS_ELEMWISE_TILE_SIZE_ALIGN¶
-
__TOPS_ELEMWISE_MAX_CDTE_VC¶
-
__TOPS_SHARED_MEM_STRIDE_BY_T¶
-
__TOPS_ASSERT_L1_SIZE¶
-
__TOPS_SMEM_INPUT_ADDR(pp)¶
-
__TOPS_SMEM_OUTPUT_ADDR(pp)¶
-
__TOPS_ELEMWISE_INPUT_FLOW¶
-
__TOPS_ELEMWISE_OUTPUT_FLOW¶
-
__TOPS_ELEMWISE_INNER_INPUT_FLOW¶
-
__TOPS_ELEMWISE_INNER_OUTPUT_FLOW¶
-
__TOPS_ELEMWISE_OUTPUT_FINI_FLOW¶
- file elemwise_local.h
- #include "kernel_common.h"
Defines
-
__TOPS_ELEMWISE_LOCAL_EXT32(hi, lo)¶
- file half.h
A builtin float type, tops::half.
Defines
-
__TOPS_HALF_CTR(t)¶
-
__TOPS_HALF_CONV(t)¶
-
__TOPS_HALF_ASSIGN(op)¶
-
__TOPS_HALF_CMP(op)¶
-
__TOPS_HALF_BIN(op)¶
Functions
-
__device__ __forceinline__ uint16_t __float2half_rn(float x)¶
Converts a float x to a tops::half and returns the binary representation of the result as a uint16_t.
float x = 42;
uint16_t y = __float2half_rn(x);  // y equals 20800, i.e. 0b0101000101000000 (0 10100 0101000000)
-
__device__ __forceinline__ float __half2float(uint16_t x)¶
Converts a tops::half x, given as its uint16_t binary representation, to a float y and returns y.
uint16_t x = 20800;         // 20800 == 0b0101000101000000
float a = __half2float(x);  // a equals 42.000
- file kernel_common.h
- #include <tops/bfloat.h> #include <tops/half.h>
- file nn.h
- #include <tops/nn/primitive.h> #include <tops/nn/math.h> #include <tops/nn/reduction.h>
- file dot.h
- file math.h
- #include <tops/elemwise.h> #include <tops/elemwise_local.h> #include <tops/nn/utils.h>
- file math.h
- #include "device_math.h" #include "host_math.h"
- file primitive.h
- #include <tops/elemwise.h> #include <tops/elemwise_local.h> #include <tops/nn/utils.h> #include <assert.h>
- file reduction.h
- #include <tops/reduction.h> #include <tops/nn/utils.h> #include <tops/elemwise_local.h> #include <tops/reduction_local.h>
Defines
-
__TOPS_REDUCTION_KERNEL_OP(n, op)¶
- file reduction.h
- #include "kernel_common.h"
Defines
-
__TOPS_REDUCTION_IS_ASYNC¶
-
__TOPS_REDUCE_DEFAULT_TILE_SIZE¶
-
__TOPS_ASSERT_L1_SIZE
-
__TOPS_REDUCTION_SMEM_INPUT_ADDR(pp)¶
-
__TOPS_REDUCTION_SMEM_OUTPUT_ADDR¶
-
__TOPS_REDUCTION_INPUT_FLOW¶
-
__TOPS_REDUCTION_INNER_INPUT_FLOW¶
-
__TOPS_REDUCTION_INNER_OUTPUT_FLOW¶
-
__TOPS_REDUCTION_OUTPUT_FLOW¶
-
__TOPS_VECTOR_MEM_LOCATION¶
- file utils.h
- #include <tops/kernel_common.h>
- file reduction_local.h
- #include <tops/elemwise_local.h> #include <assert.h>
- file assert.h
- file device_math.h
- #include <assert.h>
Defines
-
__TOPS_KRT_MATH_WRAPPER1_P(op)¶
-
__TOPS_KRT_MATH_WRAPPER1(op)¶
-
__TOPS_KRT_MATH_WRAPPER2_P(op)¶
-
__TOPS_KRT_MATH_WRAPPER2(op)¶
-
__TOPS_MATH_BUILTIN(mf, md)¶
-
__TOPS_MATH_BUILTIN2(mf, md)¶
-
__TOPS_MATH_BUILTIN3(mf, md)¶
-
__TOPS_MATH_BUILTIN_D(mf, md)¶
-
__TOPS_MATH_BUILTIN2_D(mf, md)¶
-
__TOPS_MATH_BUILTIN3_D(mf, md)¶
Functions
-
__device__ __forceinline__ float __powf(float a, float b)¶
-
__device__ __forceinline__ float fmaxf(float a, float b)¶
-
__device__ __forceinline__ float __fmaxf(float a, float b)¶
-
__device__ __forceinline__ float fminf(float a, float b)¶
-
__device__ __forceinline__ float __fminf(float a, float b)¶
-
__device__ __forceinline__ float fabsf(float a)¶
-
__device__ __forceinline__ float __fabsf(float a)¶
-
__device__ unsigned int __popcountsi2(unsigned int)¶
-
__device__ __forceinline__ unsigned int __popc(unsigned int val)¶
-
__device__ unsigned long long int __popcountdi2(unsigned long long int)¶
-
__device__ __forceinline__ unsigned long long int __popcll(unsigned long long int val)¶
-
__device__ unsigned long long int __ffsdi2(unsigned long long int)¶
-
__device__ __forceinline__ unsigned long long int __ffs(unsigned long long int val)¶
- file host_math.h
- file stdio.h
- file wchar.h
- group bfloat
This section describes one of the builtin types, tops::bfloat.
Defines
-
__TOPS_BFLOAT_CMP(op)
-
__TOPS_BFLOAT_BIN(op)
Typedefs
-
typedef bfloat bfloat16¶
Both tops::bfloat and tops::bfloat16 represent the bfloat16 floating-point format.
- group Elementwise
Defines
-
__TOPS_ELEMWISE_TILE_SIZE_ALIGN
-
__TOPS_ELEMWISE_MAX_CDTE_VC
-
__TOPS_SHARED_MEM_STRIDE_BY_T
-
__TOPS_ASSERT_L1_SIZE
-
__TOPS_SMEM_INPUT_ADDR(pp)
-
__TOPS_SMEM_OUTPUT_ADDR(pp)
-
__TOPS_ELEMWISE_INPUT_FLOW
-
__TOPS_ELEMWISE_OUTPUT_FLOW
-
__TOPS_ELEMWISE_INNER_INPUT_FLOW
-
__TOPS_ELEMWISE_INNER_OUTPUT_FLOW
-
__TOPS_ELEMWISE_OUTPUT_FINI_FLOW
- group half
This section describes one of the builtin types, tops::half.
Defines
-
__TOPS_HALF_CMP(op)
-
__TOPS_HALF_BIN(op)
- group nn
Defines
-
__TOPS_UNARY_KERNEL_OP(op)
-
__TOPS_UNARY_OP(op)
-
__TOPS_BINARY_KERNEL_OP(op)
-
__TOPS_BINARY_OP(op)
-
__TOPS_REDUCTION_KERNEL_OP(n, op)
- group Reduction
Defines
-
__TOPS_REDUCTION_IS_ASYNC
-
__TOPS_REDUCE_DEFAULT_TILE_SIZE
-
__TOPS_ASSERT_L1_SIZE
-
__TOPS_REDUCTION_SMEM_INPUT_ADDR(pp)
-
__TOPS_REDUCTION_SMEM_OUTPUT_ADDR
-
__TOPS_REDUCTION_INPUT_FLOW
-
__TOPS_REDUCTION_INNER_INPUT_FLOW
-
__TOPS_REDUCTION_INNER_OUTPUT_FLOW
-
__TOPS_REDUCTION_OUTPUT_FLOW
-
__TOPS_VECTOR_MEM_LOCATION
- group Scalar
- dir include
- dir include/tops/nn
- dir include/tops
- dir include/tops_wrappers