diff --git a/nanovdb/nanovdb/NanoVDB.h b/nanovdb/nanovdb/NanoVDB.h index fde5c47..cff460a 100644 --- a/nanovdb/nanovdb/NanoVDB.h +++ b/nanovdb/nanovdb/NanoVDB.h @@ -140,7 +140,27 @@ #define NANOVDB_ALIGN(n) alignas(n) #endif // !defined(NANOVDB_ALIGN) -#ifdef __CUDACC_RTC__ +#ifdef __KERNEL_METAL__ + +using namespace metal; +#define std metal +#define double uint64_t +#define __global__ device +#define __local__ thread +#define __constant__ constant +#define sqrtf sqrt +#define rintf rint +#define fminf fmin +#define fmaxf fmax +#define floorf floor +#define ceilf ceil +#define fabs abs +#define fmaf fma +#define tanf tan + +#define NANOVDB_ASSERT(x) + +#elif defined(__CUDACC_RTC__) typedef signed char int8_t; typedef short int16_t; @@ -157,6 +177,10 @@ typedef unsigned long long uint64_t; #else // !__CUDACC_RTC__ +#define __constant__ const +#define __global__ +#define __local__ + #include // for abs in clang7 #include // for types like int32_t etc #include // for size_t type @@ -262,7 +286,7 @@ enum class GridType : uint32_t { Unknown = 0, Index = 19,// index into an external array of values End = 20 }; -#ifndef __CUDACC_RTC__ +#if !defined(__CUDACC_RTC__) && !defined(__KERNEL_METAL__) /// @brief Retuns a c-string used to describe a GridType inline const char* toStr(GridType gridType) { @@ -289,7 +313,7 @@ enum class GridClass : uint32_t { Unknown = 0, IndexGrid = 8,// grid whose values are offsets, e.g. 
into an external array End = 9 }; -#ifndef __CUDACC_RTC__ +#if !defined(__CUDACC_RTC__) && !defined(__KERNEL_METAL__) /// @brief Retuns a c-string used to describe a GridClass inline const char* toStr(GridClass gridClass) { @@ -313,7 +337,7 @@ enum class GridFlags : uint32_t { End = 1 << 6, }; -#ifndef __CUDACC_RTC__ +#if !defined(__CUDACC_RTC__) && !defined(__KERNEL_METAL__) /// @brief Retuns a c-string used to describe a GridFlags inline const char* toStr(GridFlags gridFlags) { @@ -355,13 +379,13 @@ enum class GridBlindDataSemantic : uint32_t { Unknown = 0, template struct is_same { - static constexpr bool value = false; + static __constant__ constexpr bool value = false; }; template struct is_same { - static constexpr bool value = true; + static __constant__ constexpr bool value = true; }; // --------------------------> enable_if <------------------------------------ @@ -383,13 +407,13 @@ struct enable_if template struct is_const { - static constexpr bool value = false; + static __constant__ constexpr bool value = false; }; template struct is_const { - static constexpr bool value = true; + static __constant__ constexpr bool value = true; }; // --------------------------> remove_const <------------------------------------ @@ -412,7 +436,7 @@ struct remove_const template struct is_floating_point { - static const bool value = is_same::value || is_same::value; + static __constant__ const bool value = is_same::value || is_same::value; }; // --------------------------> is_specialization <------------------------------------ @@ -425,12 +449,12 @@ struct is_floating_point template class TemplateType> struct is_specialization { - static const bool value = false; + static __constant__ const bool value = false; }; template class TemplateType> struct is_specialization, TemplateType> { - static const bool value = true; + static __constant__ const bool value = true; }; // --------------------------> Value Map <------------------------------------ @@ -495,19 +519,19 @@ struct 
BuildToValueMap // --------------------------> utility functions related to alignment <------------------------------------ /// @brief return true if the specified pointer is aligned -__hostdev__ inline static bool isAligned(const void* p) +__hostdev__ inline static bool isAligned(__global__ const void* p) { return uint64_t(p) % NANOVDB_DATA_ALIGNMENT == 0; } /// @brief return true if the specified pointer is aligned and not NULL -__hostdev__ inline static bool isValid(const void* p) +__hostdev__ inline static bool isValid(__global__ const void* p) { return p != nullptr && uint64_t(p) % NANOVDB_DATA_ALIGNMENT == 0; } /// @brief return the smallest number of bytes that when added to the specified pointer results in an aligned pointer -__hostdev__ inline static uint64_t alignmentPadding(const void* p) +__hostdev__ inline static uint64_t alignmentPadding(__global__ const void* p) { NANOVDB_ASSERT(p); return (NANOVDB_DATA_ALIGNMENT - (uint64_t(p) % NANOVDB_DATA_ALIGNMENT)) % NANOVDB_DATA_ALIGNMENT; @@ -515,43 +539,66 @@ __hostdev__ inline static uint64_t alignmentPadding(const void* p) /// @brief offset the specified pointer so it is aligned. template -__hostdev__ inline static T* alignPtr(T* p) +__hostdev__ inline static __global__ T* alignPtr(__global__ T* p) { NANOVDB_ASSERT(p); - return reinterpret_cast( (uint8_t*)p + alignmentPadding(p) ); + return reinterpret_cast<__global__ T*>( (__global__ uint8_t*)p + alignmentPadding(p) ); } /// @brief offset the specified pointer so it is aligned. 
template -__hostdev__ inline static const T* alignPtr(const T* p) +__hostdev__ inline static __global__ const T* alignPtr(__global__ const T* p) { NANOVDB_ASSERT(p); - return reinterpret_cast( (const uint8_t*)p + alignmentPadding(p) ); + return reinterpret_cast<__global__ const T*>( (__global__ const uint8_t*)p + alignmentPadding(p) ); } // --------------------------> PtrDiff PtrAdd <------------------------------------ template -__hostdev__ inline static int64_t PtrDiff(const T1* p, const T2* q) +__hostdev__ inline static int64_t PtrDiff(__global__ const T1* p, __global__ const T2* q) { NANOVDB_ASSERT(p && q); - return reinterpret_cast(p) - reinterpret_cast(q); + return reinterpret_cast<__global__ const char*>(p) - reinterpret_cast<__global__ const char*>(q); } +#if defined(__KERNEL_METAL__) +template +__hostdev__ inline static int64_t PtrDiff(__local__ const T1* p, __local__ const T2* q) +{ + NANOVDB_ASSERT(p && q); + return reinterpret_cast<__local__ const char*>(p) - reinterpret_cast<__local__ const char*>(q); +} +#endif template -__hostdev__ inline static DstT* PtrAdd(SrcT *p, int64_t offset) +__hostdev__ inline static __global__ DstT* PtrAdd(__global__ SrcT *p, int64_t offset) { NANOVDB_ASSERT(p); - return reinterpret_cast(reinterpret_cast(p) + offset); + return reinterpret_cast<__global__ DstT*>(reinterpret_cast<__global__ char*>(p) + offset); } +#if defined(__KERNEL_METAL__) +template +__hostdev__ inline static __local__ DstT* PtrAdd(__local__ SrcT *p, int64_t offset) +{ + NANOVDB_ASSERT(p); + return reinterpret_cast<__local__ DstT*>(reinterpret_cast<__local__ char*>(p) + offset); +} +#endif template -__hostdev__ inline static const DstT* PtrAdd(const SrcT *p, int64_t offset) +__hostdev__ inline static __global__ const DstT* PtrAdd(__global__ const SrcT *p, int64_t offset) { NANOVDB_ASSERT(p); - return reinterpret_cast(reinterpret_cast(p) + offset); + return reinterpret_cast<__global__ const DstT*>(reinterpret_cast<__global__ const char*>(p) + offset); } - 
+#if defined(__KERNEL_METAL__) +template +__hostdev__ inline static __local__ const DstT* PtrAdd(__local__ const SrcT *p, int64_t offset) +{ + NANOVDB_ASSERT(p); + return reinterpret_cast<__local__ const DstT*>(reinterpret_cast<__local__ const char*>(p) + offset); +} +#endif // --------------------------> Rgba8 <------------------------------------ /// @brief 8-bit red, green, blue, alpha packed into 32 bit unsigned int @@ -562,13 +609,13 @@ class Rgba8 uint32_t packed;// 32 bit packed representation } mData; public: - static const int SIZE = 4; + static __constant__ const int SIZE = 4; using ValueType = uint8_t; - Rgba8(const Rgba8&) = default; - Rgba8(Rgba8&&) = default; - Rgba8& operator=(Rgba8&&) = default; - Rgba8& operator=(const Rgba8&) = default; + Rgba8(__global__ const Rgba8&) = default; + Rgba8(__global__ Rgba8&&) = default; + __global__ Rgba8& operator=(__global__ Rgba8&&) __global__ = default; + __global__ Rgba8& operator=(__global__ const Rgba8&) __global__ = default; __hostdev__ Rgba8() : mData{0,0,0,0} {static_assert(sizeof(uint32_t) == sizeof(Rgba8),"Unexpected sizeof");} __hostdev__ Rgba8(uint8_t r, uint8_t g, uint8_t b, uint8_t a = 255u) : mData{r, g, b, a} {} explicit __hostdev__ Rgba8(uint8_t v) : Rgba8(v,v,v,v) {} @@ -579,8 +626,8 @@ public: (uint8_t(0.5f + a * 255.0f))}// round to nearest { } - __hostdev__ bool operator<(const Rgba8& rhs) const { return mData.packed < rhs.mData.packed; } - __hostdev__ bool operator==(const Rgba8& rhs) const { return mData.packed == rhs.mData.packed; } + __hostdev__ bool operator<(__global__ const Rgba8& rhs) const { return mData.packed < rhs.mData.packed; } + __hostdev__ bool operator==(__global__ const Rgba8& rhs) const { return mData.packed == rhs.mData.packed; } __hostdev__ float lengthSqr() const { return 0.0000153787005f*(float(mData.c[0])*mData.c[0] + @@ -588,18 +635,18 @@ public: float(mData.c[2])*mData.c[2]);//1/255^2 } __hostdev__ float length() const { return sqrtf(this->lengthSqr() ); } - 
__hostdev__ const uint8_t& operator[](int n) const { return mData.c[n]; } - __hostdev__ uint8_t& operator[](int n) { return mData.c[n]; } - __hostdev__ const uint32_t& packed() const { return mData.packed; } - __hostdev__ uint32_t& packed() { return mData.packed; } - __hostdev__ const uint8_t& r() const { return mData.c[0]; } - __hostdev__ const uint8_t& g() const { return mData.c[1]; } - __hostdev__ const uint8_t& b() const { return mData.c[2]; } - __hostdev__ const uint8_t& a() const { return mData.c[3]; } - __hostdev__ uint8_t& r() { return mData.c[0]; } - __hostdev__ uint8_t& g() { return mData.c[1]; } - __hostdev__ uint8_t& b() { return mData.c[2]; } - __hostdev__ uint8_t& a() { return mData.c[3]; } + __hostdev__ __global__ const uint8_t& operator[](int n) const __global__ { return mData.c[n]; } + __hostdev__ __global__ uint8_t& operator[](int n) __global__ { return mData.c[n]; } + __hostdev__ __global__ const uint32_t& packed() const __global__ { return mData.packed; } + __hostdev__ __global__ uint32_t& packed() __global__ { return mData.packed; } + __hostdev__ __global__ const uint8_t& r() const __global__ { return mData.c[0]; } + __hostdev__ __global__ const uint8_t& g() const __global__ { return mData.c[1]; } + __hostdev__ __global__ const uint8_t& b() const __global__ { return mData.c[2]; } + __hostdev__ __global__ const uint8_t& a() const __global__ { return mData.c[3]; } + __hostdev__ __global__ uint8_t& r() __global__ { return mData.c[0]; } + __hostdev__ __global__ uint8_t& g() __global__ { return mData.c[1]; } + __hostdev__ __global__ uint8_t& b() __global__ { return mData.c[2]; } + __hostdev__ __global__ uint8_t& a() __global__ { return mData.c[3]; } };// Rgba8 using PackedRGBA8 = Rgba8;// for backwards compatibility @@ -660,17 +707,17 @@ public: NANOVDB_ASSERT(minor < (1u << 11));// max value of minor is 2047 NANOVDB_ASSERT(patch < (1u << 10));// max value of patch is 1023 } - __hostdev__ bool operator==(const Version &rhs) const {return mData == 
rhs.mData;} - __hostdev__ bool operator< (const Version &rhs) const {return mData < rhs.mData;} - __hostdev__ bool operator<=(const Version &rhs) const {return mData <= rhs.mData;} - __hostdev__ bool operator> (const Version &rhs) const {return mData > rhs.mData;} - __hostdev__ bool operator>=(const Version &rhs) const {return mData >= rhs.mData;} + __hostdev__ bool operator==(__global__ const Version &rhs) const {return mData == rhs.mData;} + __hostdev__ bool operator< (__global__ const Version &rhs) const {return mData < rhs.mData;} + __hostdev__ bool operator<=(__global__ const Version &rhs) const {return mData <= rhs.mData;} + __hostdev__ bool operator> (__global__ const Version &rhs) const {return mData > rhs.mData;} + __hostdev__ bool operator>=(__global__ const Version &rhs) const {return mData >= rhs.mData;} __hostdev__ uint32_t id() const { return mData; } __hostdev__ uint32_t getMajor() const { return (mData >> 21) & ((1u << 11) - 1);} __hostdev__ uint32_t getMinor() const { return (mData >> 10) & ((1u << 11) - 1);} __hostdev__ uint32_t getPatch() const { return mData & ((1u << 10) - 1);} -#ifndef __CUDACC_RTC__ +#if !defined(__CUDACC_RTC__) && !defined(__KERNEL_METAL__) const char* c_str() const { char *buffer = (char*)malloc(4 + 1 + 4 + 1 + 4 + 1);// xxxx.xxxx.xxxx\0 @@ -749,7 +796,7 @@ struct Maximum //@} template -__hostdev__ inline bool isApproxZero(const Type& x) +__hostdev__ inline bool isApproxZero(__global__ const Type& x) { return !(x > Tolerance::value()) && !(x < -Tolerance::value()); } @@ -771,10 +818,12 @@ __hostdev__ inline float Min(float a, float b) { return fminf(a, b); } +#ifndef __KERNEL_METAL__ __hostdev__ inline double Min(double a, double b) { return fmin(a, b); } +#endif template __hostdev__ inline Type Max(Type a, Type b) { @@ -793,45 +842,55 @@ __hostdev__ inline float Max(float a, float b) { return fmaxf(a, b); } +#ifndef __KERNEL_METAL__ __hostdev__ inline double Max(double a, double b) { return fmax(a, b); } +#endif 
__hostdev__ inline float Clamp(float x, float a, float b) { return Max(Min(x, b), a); } +#ifndef __KERNEL_METAL__ __hostdev__ inline double Clamp(double x, double a, double b) { return Max(Min(x, b), a); } +#endif __hostdev__ inline float Fract(float x) { return x - floorf(x); } +#ifndef __KERNEL_METAL__ __hostdev__ inline double Fract(double x) { return x - floor(x); } +#endif __hostdev__ inline int32_t Floor(float x) { return int32_t(floorf(x)); } +#ifndef __KERNEL_METAL__ __hostdev__ inline int32_t Floor(double x) { return int32_t(floor(x)); } +#endif __hostdev__ inline int32_t Ceil(float x) { return int32_t(ceilf(x)); } +#ifndef __KERNEL_METAL__ __hostdev__ inline int32_t Ceil(double x) { return int32_t(ceil(x)); } +#endif template __hostdev__ inline T Pow2(T x) @@ -875,46 +934,78 @@ __hostdev__ inline int Abs(int x) } template class Vec3T> -__hostdev__ inline CoordT Round(const Vec3T& xyz); +__hostdev__ inline CoordT Round(__global__ const Vec3T& xyz); +#if defined(__KERNEL_METAL__) +template class Vec3T> +__hostdev__ inline CoordT Round(__local__ const Vec3T& xyz); +#endif template class Vec3T> -__hostdev__ inline CoordT Round(const Vec3T& xyz) +__hostdev__ inline CoordT Round(__global__ const Vec3T& xyz) { return CoordT(int32_t(rintf(xyz[0])), int32_t(rintf(xyz[1])), int32_t(rintf(xyz[2]))); //return CoordT(int32_t(roundf(xyz[0])), int32_t(roundf(xyz[1])), int32_t(roundf(xyz[2])) ); //return CoordT(int32_t(floorf(xyz[0] + 0.5f)), int32_t(floorf(xyz[1] + 0.5f)), int32_t(floorf(xyz[2] + 0.5f))); } +#if defined(__KERNEL_METAL__) +template class Vec3T> +__hostdev__ inline CoordT Round(__local__ const Vec3T& xyz) +{ + return CoordT(int32_t(rintf(xyz[0])), int32_t(rintf(xyz[1])), int32_t(rintf(xyz[2]))); + //return CoordT(int32_t(roundf(xyz[0])), int32_t(roundf(xyz[1])), int32_t(roundf(xyz[2])) ); + //return CoordT(int32_t(floorf(xyz[0] + 0.5f)), int32_t(floorf(xyz[1] + 0.5f)), int32_t(floorf(xyz[2] + 0.5f))); +} +#endif template class Vec3T> -__hostdev__ inline 
CoordT Round(const Vec3T& xyz) +__hostdev__ inline CoordT Round(__global__ const Vec3T& xyz) { return CoordT(int32_t(floor(xyz[0] + 0.5)), int32_t(floor(xyz[1] + 0.5)), int32_t(floor(xyz[2] + 0.5))); } +#if defined(__KERNEL_METAL__) +template class Vec3T> +__hostdev__ inline CoordT Round(__local__ const Vec3T& xyz) +{ + return CoordT(int32_t(floor(xyz[0] + 0.5)), int32_t(floor(xyz[1] + 0.5)), int32_t(floor(xyz[2] + 0.5))); +} +#endif template class Vec3T> -__hostdev__ inline CoordT RoundDown(const Vec3T& xyz) +__hostdev__ inline CoordT RoundDown(__global__ const Vec3T& xyz) { return CoordT(Floor(xyz[0]), Floor(xyz[1]), Floor(xyz[2])); } - +#if defined(__KERNEL_METAL__) +template class Vec3T> +__hostdev__ inline CoordT RoundDown(__local__ const Vec3T& xyz) +{ + return CoordT(Floor(xyz[0]), Floor(xyz[1]), Floor(xyz[2])); +} +#endif //@{ /// Return the square root of a floating-point value. __hostdev__ inline float Sqrt(float x) { return sqrtf(x); } +#ifndef __KERNEL_METAL__ __hostdev__ inline double Sqrt(double x) { return sqrt(x); } +#endif //@} /// Return the sign of the given value as an integer (either -1, 0 or 1). 
template -__hostdev__ inline T Sign(const T &x) { return ((T(0) < x)?T(1):T(0)) - ((x < T(0))?T(1):T(0)); } +__hostdev__ inline T Sign(__global__ const T &x) { return ((T(0) < x)?T(1):T(0)) - ((x < T(0))?T(1):T(0)); } +#if defined(__KERNEL_METAL__) +template +__hostdev__ inline T Sign(__local__ const T &x) { return ((T(0) < x)?T(1):T(0)) - ((x < T(0))?T(1):T(0)); } +#endif template -__hostdev__ inline int MinIndex(const Vec3T& v) +__hostdev__ inline int MinIndex(__global__ const Vec3T& v) { #if 0 static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values @@ -930,8 +1021,27 @@ __hostdev__ inline int MinIndex(const Vec3T& v) #endif } +#if defined(__KERNEL_METAL__) template -__hostdev__ inline int MaxIndex(const Vec3T& v) +__hostdev__ inline int MinIndex(__local__ const Vec3T& v) +{ +#if 0 + static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values + const int hashKey = ((v[0] < v[1]) << 2) + ((v[0] < v[2]) << 1) + (v[1] < v[2]); // ?*4+?*2+?*1 + return hashTable[hashKey]; +#else + if (v[0] < v[1] && v[0] < v[2]) + return 0; + if (v[1] < v[2]) + return 1; + else + return 2; +#endif +} +#endif + +template +__hostdev__ inline int MaxIndex(__global__ const Vec3T& v) { #if 0 static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values @@ -947,6 +1057,25 @@ __hostdev__ inline int MaxIndex(const Vec3T& v) #endif } +#if defined(__KERNEL_METAL__) +template +__hostdev__ inline int MaxIndex(__local__ const Vec3T& v) +{ +#if 0 + static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values + const int hashKey = ((v[0] > v[1]) << 2) + ((v[0] > v[2]) << 1) + (v[1] > v[2]); // ?*4+?*2+?*1 + return hashTable[hashKey]; +#else + if (v[0] > v[1] && v[0] > v[2]) + return 0; + if (v[1] > v[2]) + return 1; + else + return 2; +#endif +} +#endif + /// @brief round up byteSize to the nearest wordSize, e.g. 
to align to machine word: AlignUp - __hostdev__ Coord& operator=(const CoordT &other) + __hostdev__ __global__ Coord& operator=(__global__ const CoordT &other) __global__ { static_assert(sizeof(Coord) == sizeof(CoordT), "Mis-matched sizeof"); mVec[0] = other[0]; @@ -1025,6 +1160,17 @@ public: mVec[2] = other[2]; return *this; } +#if defined(__KERNEL_METAL__) + template + __hostdev__ __local__ Coord& operator=(__local__ const CoordT &other) __local__ + { + static_assert(sizeof(Coord) == sizeof(CoordT), "Mis-matched sizeof"); + mVec[0] = other[0]; + mVec[1] = other[1]; + mVec[2] = other[2]; + return *this; + } +#endif /// @brief Return a new instance with coordinates masked by the given unsigned integer. __hostdev__ Coord operator&(IndexType n) const { return Coord(mVec[0] & n, mVec[1] & n, mVec[2] & n); } @@ -1036,52 +1182,52 @@ public: __hostdev__ Coord operator>>(IndexType n) const { return Coord(mVec[0] >> n, mVec[1] >> n, mVec[2] >> n); } /// @brief Return true if this Coord is lexicographically less than the given Coord. - __hostdev__ bool operator<(const Coord& rhs) const + __hostdev__ bool operator<(__global__ const Coord& rhs) const { return mVec[0] < rhs[0] ? true : mVec[0] > rhs[0] ? false : mVec[1] < rhs[1] ? true : mVec[1] > rhs[1] ? false : mVec[2] < rhs[2] ? true : false; } // @brief Return true if the Coord components are identical. 
- __hostdev__ bool operator==(const Coord& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } - __hostdev__ bool operator!=(const Coord& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } - __hostdev__ Coord& operator&=(int n) + __hostdev__ bool operator==(__global__ const Coord& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } + __hostdev__ bool operator!=(__global__ const Coord& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } + __hostdev__ __global__ Coord& operator&=(int n) __global__ { mVec[0] &= n; mVec[1] &= n; mVec[2] &= n; return *this; } - __hostdev__ Coord& operator<<=(uint32_t n) + __hostdev__ __global__ Coord& operator<<=(uint32_t n) __global__ { mVec[0] <<= n; mVec[1] <<= n; mVec[2] <<= n; return *this; } - __hostdev__ Coord& operator>>=(uint32_t n) + __hostdev__ __global__ Coord& operator>>=(uint32_t n) __global__ { mVec[0] >>= n; mVec[1] >>= n; mVec[2] >>= n; return *this; } - __hostdev__ Coord& operator+=(int n) + __hostdev__ __global__ Coord& operator+=(int n) __global__ { mVec[0] += n; mVec[1] += n; mVec[2] += n; return *this; } - __hostdev__ Coord operator+(const Coord& rhs) const { return Coord(mVec[0] + rhs[0], mVec[1] + rhs[1], mVec[2] + rhs[2]); } - __hostdev__ Coord operator-(const Coord& rhs) const { return Coord(mVec[0] - rhs[0], mVec[1] - rhs[1], mVec[2] - rhs[2]); } - __hostdev__ Coord& operator+=(const Coord& rhs) + __hostdev__ Coord operator+(__global__ const Coord& rhs) const { return Coord(mVec[0] + rhs[0], mVec[1] + rhs[1], mVec[2] + rhs[2]); } + __hostdev__ Coord operator-(__global__ const Coord& rhs) const { return Coord(mVec[0] - rhs[0], mVec[1] - rhs[1], mVec[2] - rhs[2]); } + __hostdev__ __global__ Coord& operator+=(__global__ const Coord& rhs) __global__ { mVec[0] += rhs[0]; mVec[1] += rhs[1]; mVec[2] += rhs[2]; return *this; } - __hostdev__ Coord& operator-=(const Coord& rhs) + __hostdev__ 
__global__ Coord& operator-=(__global__ const Coord& rhs) __global__ { mVec[0] -= rhs[0]; mVec[1] -= rhs[1]; @@ -1090,7 +1236,7 @@ public: } /// @brief Perform a component-wise minimum with the other Coord. - __hostdev__ Coord& minComponent(const Coord& other) + __hostdev__ __global__ Coord& minComponent(__global__ const Coord& other) __global__ { if (other[0] < mVec[0]) mVec[0] = other[0]; @@ -1102,7 +1248,7 @@ public: } /// @brief Perform a component-wise maximum with the other Coord. - __hostdev__ Coord& maxComponent(const Coord& other) + __hostdev__ __global__ Coord& maxComponent(__global__ const Coord& other) __global__ { if (other[0] > mVec[0]) mVec[0] = other[0]; @@ -1113,16 +1259,16 @@ public: return *this; } - __hostdev__ Coord offsetBy(ValueType dx, ValueType dy, ValueType dz) const + __hostdev__ Coord offsetBy(ValueType dx, ValueType dy, ValueType dz) const __global__ { return Coord(mVec[0] + dx, mVec[1] + dy, mVec[2] + dz); } - __hostdev__ Coord offsetBy(ValueType n) const { return this->offsetBy(n, n, n); } + __hostdev__ Coord offsetBy(ValueType n) const __global__ { return this->offsetBy(n, n, n); } /// Return true if any of the components of @a a are smaller than the /// corresponding components of @a b. - __hostdev__ static inline bool lessThan(const Coord& a, const Coord& b) + __hostdev__ static inline bool lessThan(__global__ const Coord& a, __global__ const Coord& b) { return (a[0] < b[0] || a[1] < b[1] || a[2] < b[2]); } @@ -1130,7 +1276,13 @@ public: /// @brief Return the largest integer coordinates that are not greater /// than @a xyz (node centered conversion). 
template - __hostdev__ static Coord Floor(const Vec3T& xyz) { return Coord(nanovdb::Floor(xyz[0]), nanovdb::Floor(xyz[1]), nanovdb::Floor(xyz[2])); } + __hostdev__ static Coord Floor(__global__ const Vec3T& xyz) { return Coord(nanovdb::Floor(xyz[0]), nanovdb::Floor(xyz[1]), nanovdb::Floor(xyz[2])); } +#if defined __KERNEL_METAL__ + /// @brief Return the largest integer coordinates that are not greater + /// than @a xyz (node centered conversion). + template + __hostdev__ static Coord Floor(__local__ const Vec3T& xyz) { return Coord(nanovdb::Floor(xyz[0]), nanovdb::Floor(xyz[1]), nanovdb::Floor(xyz[2])); } +#endif /// @brief Return a hash key derived from the existing coordinates. /// @details For details on this hash function please see the VDB paper. @@ -1159,7 +1311,7 @@ class Vec3 T mVec[3]; public: - static const int SIZE = 3; + static __constant__ const int SIZE = 3; using ValueType = T; Vec3() = default; __hostdev__ explicit Vec3(T x) @@ -1171,30 +1323,36 @@ public: { } template - __hostdev__ explicit Vec3(const Vec3& v) + __hostdev__ explicit Vec3(__global__ const Vec3& v) : mVec{T(v[0]), T(v[1]), T(v[2])} { } - __hostdev__ explicit Vec3(const Coord& ijk) + __hostdev__ explicit Vec3(__global__ const Coord& ijk) : mVec{T(ijk[0]), T(ijk[1]), T(ijk[2])} { } - __hostdev__ bool operator==(const Vec3& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } - __hostdev__ bool operator!=(const Vec3& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } + __hostdev__ bool operator==(__global__ const Vec3& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } + __hostdev__ bool operator!=(__global__ const Vec3& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } template - __hostdev__ Vec3& operator=(const Vec3T& rhs) + __hostdev__ __global__ Vec3& operator=(__global__ const Vec3T& rhs) { mVec[0] = rhs[0]; mVec[1] = rhs[1]; mVec[2] = rhs[2]; return 
*this; } - __hostdev__ const T& operator[](int i) const { return mVec[i]; } - __hostdev__ T& operator[](int i) { return mVec[i]; } + __hostdev__ __global__ const T& operator[](int i) const __global__ { return mVec[i]; } +#if defined(__KERNEL_METAL__) + __hostdev__ __local__ const T& operator[](int i) const __local__ { return mVec[i]; } +#endif + __hostdev__ __global__ T& operator[](int i) __global__ { return mVec[i]; } +#if defined(__KERNEL_METAL__) + __hostdev__ __local__ T& operator[](int i) __local__ { return mVec[i]; } +#endif template - __hostdev__ T dot(const Vec3T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2]; } + __hostdev__ T dot(__global__ const Vec3T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2]; } template - __hostdev__ Vec3 cross(const Vec3T& v) const + __hostdev__ Vec3 cross(__global__ const Vec3T& v) const { return Vec3(mVec[1] * v[2] - mVec[2] * v[1], mVec[2] * v[0] - mVec[0] * v[2], @@ -1206,37 +1364,62 @@ public: } __hostdev__ T length() const { return Sqrt(this->lengthSqr()); } __hostdev__ Vec3 operator-() const { return Vec3(-mVec[0], -mVec[1], -mVec[2]); } - __hostdev__ Vec3 operator*(const Vec3& v) const { return Vec3(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2]); } - __hostdev__ Vec3 operator/(const Vec3& v) const { return Vec3(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2]); } - __hostdev__ Vec3 operator+(const Vec3& v) const { return Vec3(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2]); } - __hostdev__ Vec3 operator-(const Vec3& v) const { return Vec3(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2]); } - __hostdev__ Vec3 operator*(const T& s) const { return Vec3(s * mVec[0], s * mVec[1], s * mVec[2]); } - __hostdev__ Vec3 operator/(const T& s) const { return (T(1) / s) * (*this); } - __hostdev__ Vec3& operator+=(const Vec3& v) + __hostdev__ Vec3 operator*(__global__ const Vec3& v) const { return Vec3(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2]); } +#if defined(__KERNEL_METAL__) + __hostdev__ 
Vec3 operator*(__local__ const Vec3& v) const { return Vec3(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2]); } +#endif + __hostdev__ Vec3 operator/(__global__ const Vec3& v) const { return Vec3(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2]); } +#if defined(__KERNEL_METAL__) + __hostdev__ Vec3 operator/(__local__ const Vec3& v) const { return Vec3(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2]); } +#endif + __hostdev__ Vec3 operator+(__global__ const Vec3& v) const { return Vec3(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2]); } +#if defined(__KERNEL_METAL__) + __hostdev__ Vec3 operator-(__local__ const Vec3& v) const { return Vec3(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2]); } + __hostdev__ Vec3 operator+(__local__ const Vec3& v) const { return Vec3(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2]); } +#endif + __hostdev__ Vec3 operator-(__global__ const Vec3& v) const { return Vec3(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2]); } + __hostdev__ Vec3 operator*(__global__ const T& s) const { return Vec3(s * mVec[0], s * mVec[1], s * mVec[2]); } +#if defined(__KERNEL_METAL__) + __hostdev__ Vec3 operator*(__local__ const T& s) const { return Vec3(s * mVec[0], s * mVec[1], s * mVec[2]); } +#endif + __hostdev__ Vec3 operator/(__global__ const T& s) const { return (T(1) / s) * (*this); } + __hostdev__ __global__ Vec3& operator+=(__global__ const Vec3& v) { mVec[0] += v[0]; mVec[1] += v[1]; mVec[2] += v[2]; return *this; } - __hostdev__ Vec3& operator-=(const Vec3& v) + __hostdev__ __global__ Vec3& operator-=(__global__ const Vec3& v) { mVec[0] -= v[0]; mVec[1] -= v[1]; mVec[2] -= v[2]; return *this; } - __hostdev__ Vec3& operator*=(const T& s) + __hostdev__ __global__ Vec3& operator*=(__global__ const T& s) { mVec[0] *= s; mVec[1] *= s; mVec[2] *= s; return *this; } - __hostdev__ Vec3& operator/=(const T& s) { return (*this) *= T(1) / s; } - __hostdev__ Vec3& normalize() { return (*this) /= this->length(); } +#if defined __KERNEL_METAL__ + __hostdev__ __local__ Vec3& 
operator*=(__local__ const T& s) + { + mVec[0] *= s; + mVec[1] *= s; + mVec[2] *= s; + return *this; + } +#endif + __hostdev__ __global__ Vec3& operator/=(__global__ const T& s) { return (*this) *= T(1) / s; } +#if defined __KERNEL_METAL__ + __hostdev__ __local__ Vec3& operator/=(__local__ const T& s) { return (*this) *= T(1) / s; } +#endif + __hostdev__ __global__ Vec3& normalize() { return (*this) /= this->length(); } /// @brief Perform a component-wise minimum with the other Coord. - __hostdev__ Vec3& minComponent(const Vec3& other) + __hostdev__ __global__ Vec3& minComponent(__global__ const Vec3& other) { if (other[0] < mVec[0]) mVec[0] = other[0]; @@ -1248,7 +1431,7 @@ public: } /// @brief Perform a component-wise maximum with the other Coord. - __hostdev__ Vec3& maxComponent(const Vec3& other) + __hostdev__ __global__ Vec3& maxComponent(__global__ const Vec3& other) { if (other[0] > mVec[0]) mVec[0] = other[0]; @@ -1274,15 +1457,29 @@ public: }; // Vec3 template -__hostdev__ inline Vec3 operator*(T1 scalar, const Vec3& vec) +__hostdev__ inline Vec3 operator*(T1 scalar, __global__ const Vec3& vec) { return Vec3(scalar * vec[0], scalar * vec[1], scalar * vec[2]); } +#if defined(__KERNEL_METAL__) template -__hostdev__ inline Vec3 operator/(T1 scalar, const Vec3& vec) +__hostdev__ inline Vec3 operator*(T1 scalar, __local__ const Vec3& vec) +{ + return Vec3(scalar * vec[0], scalar * vec[1], scalar * vec[2]); +} +#endif +template +__hostdev__ inline Vec3 operator/(T1 scalar, __global__ const Vec3& vec) { return Vec3(scalar / vec[0], scalar / vec[1], scalar / vec[2]); } +#if defined(__KERNEL_METAL__) +template +__hostdev__ inline Vec3 operator/(T1 scalar, __local__ const Vec3& vec) +{ + return Vec3(scalar / vec[0], scalar / vec[1], scalar / vec[2]); +} +#endif using Vec3R = Vec3; using Vec3d = Vec3; @@ -1304,7 +1501,7 @@ class Vec4 T mVec[4]; public: - static const int SIZE = 4; + static __constant__ const int SIZE = 4; using ValueType = T; Vec4() = default; 
__hostdev__ explicit Vec4(T x) @@ -1316,14 +1513,14 @@ public: { } template - __hostdev__ explicit Vec4(const Vec4& v) + __hostdev__ explicit Vec4(__global__ const Vec4& v) : mVec{T(v[0]), T(v[1]), T(v[2]), T(v[3])} { } - __hostdev__ bool operator==(const Vec4& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2] && mVec[3] == rhs[3]; } - __hostdev__ bool operator!=(const Vec4& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2] || mVec[3] != rhs[3]; } + __hostdev__ bool operator==(__global__ const Vec4& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2] && mVec[3] == rhs[3]; } + __hostdev__ bool operator!=(__global__ const Vec4& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2] || mVec[3] != rhs[3]; } template - __hostdev__ Vec4& operator=(const Vec4T& rhs) + __hostdev__ __global__ Vec4& operator=(__global__ const Vec4T& rhs) { mVec[0] = rhs[0]; mVec[1] = rhs[1]; @@ -1331,23 +1528,23 @@ public: mVec[3] = rhs[3]; return *this; } - __hostdev__ const T& operator[](int i) const { return mVec[i]; } - __hostdev__ T& operator[](int i) { return mVec[i]; } + __hostdev__ __global__ const T& operator[](int i) const { return mVec[i]; } + __hostdev__ __global__ T& operator[](int i) { return mVec[i]; } template - __hostdev__ T dot(const Vec4T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2] + mVec[3] * v[3]; } + __hostdev__ T dot(__global__ const Vec4T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2] + mVec[3] * v[3]; } __hostdev__ T lengthSqr() const { return mVec[0] * mVec[0] + mVec[1] * mVec[1] + mVec[2] * mVec[2] + mVec[3] * mVec[3]; // 7 flops } __hostdev__ T length() const { return Sqrt(this->lengthSqr()); } __hostdev__ Vec4 operator-() const { return Vec4(-mVec[0], -mVec[1], -mVec[2], -mVec[3]); } - __hostdev__ Vec4 operator*(const Vec4& v) const { return Vec4(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2], mVec[3] * 
v[3]); } - __hostdev__ Vec4 operator/(const Vec4& v) const { return Vec4(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2], mVec[3] / v[3]); } - __hostdev__ Vec4 operator+(const Vec4& v) const { return Vec4(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2], mVec[3] + v[3]); } - __hostdev__ Vec4 operator-(const Vec4& v) const { return Vec4(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2], mVec[3] - v[3]); } - __hostdev__ Vec4 operator*(const T& s) const { return Vec4(s * mVec[0], s * mVec[1], s * mVec[2], s * mVec[3]); } - __hostdev__ Vec4 operator/(const T& s) const { return (T(1) / s) * (*this); } - __hostdev__ Vec4& operator+=(const Vec4& v) + __hostdev__ Vec4 operator*(__global__ const Vec4& v) const { return Vec4(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2], mVec[3] * v[3]); } + __hostdev__ Vec4 operator/(__global__ const Vec4& v) const { return Vec4(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2], mVec[3] / v[3]); } + __hostdev__ Vec4 operator+(__global__ const Vec4& v) const { return Vec4(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2], mVec[3] + v[3]); } + __hostdev__ Vec4 operator-(__global__ const Vec4& v) const { return Vec4(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2], mVec[3] - v[3]); } + __hostdev__ Vec4 operator*(__global__ const T& s) const { return Vec4(s * mVec[0], s * mVec[1], s * mVec[2], s * mVec[3]); } + __hostdev__ Vec4 operator/(__global__ const T& s) const { return (T(1) / s) * (*this); } + __hostdev__ __global__ Vec4& operator+=(__global__ const Vec4& v) { mVec[0] += v[0]; mVec[1] += v[1]; @@ -1355,7 +1552,7 @@ public: mVec[3] += v[3]; return *this; } - __hostdev__ Vec4& operator-=(const Vec4& v) + __hostdev__ __global__ Vec4& operator-=(__global__ const Vec4& v) { mVec[0] -= v[0]; mVec[1] -= v[1]; @@ -1363,7 +1560,7 @@ public: mVec[3] -= v[3]; return *this; } - __hostdev__ Vec4& operator*=(const T& s) + __hostdev__ __global__ Vec4& operator*=(__global__ const T& s) { mVec[0] *= s; mVec[1] *= s; @@ -1371,10 +1568,10 @@ public: mVec[3] *= s; 
return *this; } - __hostdev__ Vec4& operator/=(const T& s) { return (*this) *= T(1) / s; } - __hostdev__ Vec4& normalize() { return (*this) /= this->length(); } + __hostdev__ __global__ Vec4& operator/=(__global__ const T& s) { return (*this) *= T(1) / s; } + __hostdev__ __global__ Vec4& normalize() { return (*this) /= this->length(); } /// @brief Perform a component-wise minimum with the other Coord. - __hostdev__ Vec4& minComponent(const Vec4& other) + __hostdev__ __global__ Vec4& minComponent(__global__ const Vec4& other) { if (other[0] < mVec[0]) mVec[0] = other[0]; @@ -1388,7 +1585,7 @@ public: } /// @brief Perform a component-wise maximum with the other Coord. - __hostdev__ Vec4& maxComponent(const Vec4& other) + __hostdev__ __global__ Vec4& maxComponent(__global__ const Vec4& other) { if (other[0] > mVec[0]) mVec[0] = other[0]; @@ -1403,12 +1600,12 @@ public: }; // Vec4 template -__hostdev__ inline Vec4 operator*(T1 scalar, const Vec4& vec) +__hostdev__ inline Vec4 operator*(T1 scalar, __global__ const Vec4& vec) { return Vec4(scalar * vec[0], scalar * vec[1], scalar * vec[2], scalar * vec[3]); } template -__hostdev__ inline Vec4 operator/(T1 scalar, const Vec3& vec) +__hostdev__ inline Vec4 operator/(T1 scalar, __global__ const Vec3& vec) { return Vec4(scalar / vec[0], scalar / vec[1], scalar / vec[2], scalar / vec[3]); } @@ -1428,23 +1625,23 @@ struct TensorTraits; template struct TensorTraits { - static const int Rank = 0; // i.e. scalar - static const bool IsScalar = true; - static const bool IsVector = false; - static const int Size = 1; + static __constant__ const int Rank = 0; // i.e. scalar + static __constant__ const bool IsScalar = true; + static __constant__ const bool IsVector = false; + static __constant__ const int Size = 1; using ElementType = T; - static T scalar(const T& s) { return s; } + static T scalar(__global__ const T& s) { return s; } }; template struct TensorTraits { - static const int Rank = 1; // i.e. 
vector - static const bool IsScalar = false; - static const bool IsVector = true; - static const int Size = T::SIZE; + static __constant__ const int Rank = 1; // i.e. vector + static __constant__ const bool IsScalar = false; + static __constant__ const bool IsVector = true; + static __constant__ const int Size = T::SIZE; using ElementType = typename T::ValueType; - static ElementType scalar(const T& v) { return v.length(); } + static ElementType scalar(__global__ const T& v) { return v.length(); } }; // ----------------------------> FloatTraits <-------------------------------------- @@ -1528,71 +1725,80 @@ __hostdev__ inline GridType mapToGridType() // ----------------------------> matMult <-------------------------------------- template -__hostdev__ inline Vec3T matMult(const float* mat, const Vec3T& xyz) +__hostdev__ inline Vec3T matMult(__global__ const float* mat, __global__ const Vec3T& xyz) { return Vec3T(fmaf(xyz[0], mat[0], fmaf(xyz[1], mat[1], xyz[2] * mat[2])), fmaf(xyz[0], mat[3], fmaf(xyz[1], mat[4], xyz[2] * mat[5])), fmaf(xyz[0], mat[6], fmaf(xyz[1], mat[7], xyz[2] * mat[8]))); // 6 fmaf + 3 mult = 9 flops } - +#if defined(__KERNEL_METAL__) template -__hostdev__ inline Vec3T matMult(const double* mat, const Vec3T& xyz) +__hostdev__ inline Vec3T matMult(__global__ const float* mat, __local__ const Vec3T& xyz) +{ + return Vec3T(fmaf(xyz[0], mat[0], fmaf(xyz[1], mat[1], xyz[2] * mat[2])), + fmaf(xyz[0], mat[3], fmaf(xyz[1], mat[4], xyz[2] * mat[5])), + fmaf(xyz[0], mat[6], fmaf(xyz[1], mat[7], xyz[2] * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} +#endif +#ifndef __KERNEL_METAL__ +template +__hostdev__ inline Vec3T matMult(__global__ const double* mat, __global__ const Vec3T& xyz) { return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], static_cast(xyz[2]) * mat[2])), fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[5])), fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], 
static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops } - +#endif template -__hostdev__ inline Vec3T matMult(const float* mat, const float* vec, const Vec3T& xyz) +__hostdev__ inline Vec3T matMult(__global__ const float* mat, __global__ const float* vec, __global__ const Vec3T& xyz) { return Vec3T(fmaf(xyz[0], mat[0], fmaf(xyz[1], mat[1], fmaf(xyz[2], mat[2], vec[0]))), fmaf(xyz[0], mat[3], fmaf(xyz[1], mat[4], fmaf(xyz[2], mat[5], vec[1]))), fmaf(xyz[0], mat[6], fmaf(xyz[1], mat[7], fmaf(xyz[2], mat[8], vec[2])))); // 9 fmaf = 9 flops } - +#ifndef __KERNEL_METAL__ template -__hostdev__ inline Vec3T matMult(const double* mat, const double* vec, const Vec3T& xyz) +__hostdev__ inline Vec3T matMult(__global__ const double* mat, __global__ const double* vec, __global__ const Vec3T& xyz) { return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], fma(static_cast(xyz[2]), mat[2], vec[0]))), fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[5], vec[1]))), fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops } - +#endif // matMultT: Multiply with the transpose: template -__hostdev__ inline Vec3T matMultT(const float* mat, const Vec3T& xyz) +__hostdev__ inline Vec3T matMultT(__global__ const float* mat, __global__ const Vec3T& xyz) { return Vec3T(fmaf(xyz[0], mat[0], fmaf(xyz[1], mat[3], xyz[2] * mat[6])), fmaf(xyz[0], mat[1], fmaf(xyz[1], mat[4], xyz[2] * mat[7])), fmaf(xyz[0], mat[2], fmaf(xyz[1], mat[5], xyz[2] * mat[8]))); // 6 fmaf + 3 mult = 9 flops } - +#ifndef __KERNEL_METAL__ template -__hostdev__ inline Vec3T matMultT(const double* mat, const Vec3T& xyz) +__hostdev__ inline Vec3T matMultT(__global__ const double* mat, __global__ const Vec3T& xyz) { return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], static_cast(xyz[2]) * mat[6])), fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), 
mat[4], static_cast(xyz[2]) * mat[7])), fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops } - +#endif template -__hostdev__ inline Vec3T matMultT(const float* mat, const float* vec, const Vec3T& xyz) +__hostdev__ inline Vec3T matMultT(__global__ const float* mat, __global__ const float* vec, __global__ const Vec3T& xyz) { return Vec3T(fmaf(xyz[0], mat[0], fmaf(xyz[1], mat[3], fmaf(xyz[2], mat[6], vec[0]))), fmaf(xyz[0], mat[1], fmaf(xyz[1], mat[4], fmaf(xyz[2], mat[7], vec[1]))), fmaf(xyz[0], mat[2], fmaf(xyz[1], mat[5], fmaf(xyz[2], mat[8], vec[2])))); // 9 fmaf = 9 flops } - +#ifndef __KERNEL_METAL__ template -__hostdev__ inline Vec3T matMultT(const double* mat, const double* vec, const Vec3T& xyz) +__hostdev__ inline Vec3T matMultT(__global__ const double* mat, __global__ const double* vec, __global__ const Vec3T& xyz) { return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], fma(static_cast(xyz[2]), mat[6], vec[0]))), fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[7], vec[1]))), fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops } - +#endif // ----------------------------> BBox <------------------------------------- // Base-class for static polymorphism (cannot be constructed directly) @@ -1600,22 +1806,34 @@ template struct BaseBBox { Vec3T mCoord[2]; - __hostdev__ bool operator==(const BaseBBox& rhs) const { return mCoord[0] == rhs.mCoord[0] && mCoord[1] == rhs.mCoord[1]; }; - __hostdev__ bool operator!=(const BaseBBox& rhs) const { return mCoord[0] != rhs.mCoord[0] || mCoord[1] != rhs.mCoord[1]; }; - __hostdev__ const Vec3T& operator[](int i) const { return mCoord[i]; } - __hostdev__ Vec3T& operator[](int i) { return mCoord[i]; } - __hostdev__ Vec3T& min() { return mCoord[0]; } - __hostdev__ Vec3T& max() { return mCoord[1]; } - 
__hostdev__ const Vec3T& min() const { return mCoord[0]; } - __hostdev__ const Vec3T& max() const { return mCoord[1]; } - __hostdev__ Coord& translate(const Vec3T& xyz) + __hostdev__ bool operator==(__global__ const BaseBBox& rhs) const __global__ { return mCoord[0] == rhs.mCoord[0] && mCoord[1] == rhs.mCoord[1]; }; + __hostdev__ bool operator!=(__global__ const BaseBBox& rhs) const __global__ { return mCoord[0] != rhs.mCoord[0] || mCoord[1] != rhs.mCoord[1]; }; + __hostdev__ __global__ const Vec3T& operator[](int i) const __global__ { return mCoord[i]; } + __hostdev__ __global__ Vec3T& operator[](int i) __global__ { return mCoord[i]; } + __hostdev__ __global__ Vec3T& min() __global__ { return mCoord[0]; } +#if defined(__KERNEL_METAL__) + __hostdev__ __global__ Vec3T& min() __local__ { return mCoord[0]; } +#endif + __hostdev__ __global__ Vec3T& max() __global__ { return mCoord[1]; } +#if defined(__KERNEL_METAL__) + __hostdev__ __global__ Vec3T& max() __local__ { return mCoord[1]; } +#endif + __hostdev__ __global__ const Vec3T& min() const __global__ { return mCoord[0]; } +#if defined(__KERNEL_METAL__) + __hostdev__ __local__ const Vec3T& min() const __local__ { return mCoord[0]; } +#endif + __hostdev__ __global__ const Vec3T& max() const __global__ { return mCoord[1]; } +#if defined(__KERNEL_METAL__) + __hostdev__ __local__ const Vec3T& max() const __local__ { return mCoord[1]; } +#endif + __hostdev__ __global__ Coord& translate(__global__ const Vec3T& xyz) __global__ { mCoord[0] += xyz; mCoord[1] += xyz; return *this; } // @brief Expand this bounding box to enclose point (i, j, k). - __hostdev__ BaseBBox& expand(const Vec3T& xyz) + __hostdev__ __global__ BaseBBox& expand(__global__ const Vec3T& xyz) __global__ { mCoord[0].minComponent(xyz); mCoord[1].maxComponent(xyz); @@ -1623,7 +1841,7 @@ struct BaseBBox } /// @brief Intersect this bounding box with the given bounding box. 
- __hostdev__ BaseBBox& intersect(const BaseBBox& bbox) + __hostdev__ __global__ BaseBBox& intersect(__global__ const BaseBBox& bbox) __global__ { mCoord[0].maxComponent(bbox.min()); mCoord[1].minComponent(bbox.max()); @@ -1634,7 +1852,7 @@ struct BaseBBox //{ // return BaseBBox(mCoord[0].offsetBy(-padding),mCoord[1].offsetBy(padding)); //} - __hostdev__ bool isInside(const Vec3T& xyz) + __hostdev__ bool isInside(__global__ const Vec3T& xyz) { if (xyz[0] < mCoord[0][0] || xyz[1] < mCoord[0][1] || xyz[2] < mCoord[0][2]) return false; @@ -1642,10 +1860,20 @@ struct BaseBBox return false; return true; } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isInside(__local__ const Vec3T& xyz) + { + if (xyz[0] < mCoord[0][0] || xyz[1] < mCoord[0][1] || xyz[2] < mCoord[0][2]) + return false; + if (xyz[0] > mCoord[1][0] || xyz[1] > mCoord[1][1] || xyz[2] > mCoord[1][2]) + return false; + return true; + } +#endif protected: __hostdev__ BaseBBox() {} - __hostdev__ BaseBBox(const Vec3T& min, const Vec3T& max) + __hostdev__ BaseBBox(__global__ const Vec3T& min, __global__ const Vec3T& max) : mCoord{min, max} { } @@ -1659,38 +1887,45 @@ struct BBox; /// @note Min is inclusive and max is exclusive. If min = max the dimension of /// the bounding box is zero and therefore it is also empty. 
template -struct BBox : public BaseBBox +struct BBox +#if !defined(__KERNEL_METAL__) + : public BaseBBox +#endif { using Vec3Type = Vec3T; using ValueType = typename Vec3T::ValueType; static_assert(is_floating_point::value, "Expected a floating point coordinate type"); using BaseT = BaseBBox; +#if defined(__KERNEL_METAL__) + BaseBBox mCoord; +#else using BaseT::mCoord; +#endif + __hostdev__ BBox() : BaseT(Vec3T( Maximum::value()), Vec3T(-Maximum::value())) { } - __hostdev__ BBox(const Vec3T& min, const Vec3T& max) + __hostdev__ BBox(__global__ const Vec3T& min, __global__ const Vec3T& max) : BaseT(min, max) { } - __hostdev__ BBox(const Coord& min, const Coord& max) + __hostdev__ BBox(__global__ const Coord& min, __global__ const Coord& max) : BaseT(Vec3T(ValueType(min[0]), ValueType(min[1]), ValueType(min[2])), Vec3T(ValueType(max[0] + 1), ValueType(max[1] + 1), ValueType(max[2] + 1))) { } - __hostdev__ static BBox createCube(const Coord& min, typename Coord::ValueType dim) + __hostdev__ static BBox createCube(__global__ const Coord& min, typename Coord::ValueType dim) { return BBox(min, min.offsetBy(dim)); } - - __hostdev__ BBox(const BaseBBox& bbox) : BBox(bbox[0], bbox[1]) {} + __hostdev__ BBox(__global__ const BaseBBox& bbox) __global__ : BBox(bbox[0], bbox[1]) {} __hostdev__ bool empty() const { return mCoord[0][0] >= mCoord[1][0] || mCoord[0][1] >= mCoord[1][1] || mCoord[0][2] >= mCoord[1][2]; } __hostdev__ Vec3T dim() const { return this->empty() ? Vec3T(0) : this->max() - this->min(); } - __hostdev__ bool isInside(const Vec3T& p) const + __hostdev__ bool isInside(__global__ const Vec3T& p) const { return p[0] > mCoord[0][0] && p[1] > mCoord[0][1] && p[2] > mCoord[0][2] && p[0] < mCoord[1][0] && p[1] < mCoord[1][1] && p[2] < mCoord[1][2]; @@ -1703,24 +1938,32 @@ struct BBox : public BaseBBox /// @note Both min and max are INCLUDED in the bbox so dim = max - min + 1. So, /// if min = max the bounding box contains exactly one point and dim = 1! 
template -struct BBox : public BaseBBox +struct BBox +#if !defined(__KERNEL_METAL__) + : public BaseBBox +#endif { + static_assert(is_same::value, "Expected \"int\" coordinate type"); using BaseT = BaseBBox; +#if defined(__KERNEL_METAL__) + BaseBBox mCoord; +#else using BaseT::mCoord; +#endif /// @brief Iterator over the domain covered by a BBox /// @details z is the fastest-moving coordinate. class Iterator { - const BBox& mBBox; + __global__ const BBox& mBBox; CoordT mPos; public: - __hostdev__ Iterator(const BBox& b) + __hostdev__ Iterator(__global__ const BBox& b) : mBBox(b) , mPos(b.min()) { } - __hostdev__ Iterator& operator++() + __hostdev__ __global__ Iterator& operator++() { if (mPos[2] < mBBox[1][2]) {// this is the most common case ++mPos[2]; @@ -1734,7 +1977,7 @@ struct BBox : public BaseBBox } return *this; } - __hostdev__ Iterator operator++(int) + __hostdev__ Iterator operator++(int) __global__ { auto tmp = *this; ++(*this); @@ -1742,20 +1985,20 @@ struct BBox : public BaseBBox } /// @brief Return @c true if the iterator still points to a valid coordinate. 
__hostdev__ operator bool() const { return mPos[0] <= mBBox[1][0]; } - __hostdev__ const CoordT& operator*() const { return mPos; } + __hostdev__ __global__ const CoordT& operator*() const { return mPos; } }; // Iterator __hostdev__ Iterator begin() const { return Iterator{*this}; } __hostdev__ BBox() : BaseT(CoordT::max(), CoordT::min()) { } - __hostdev__ BBox(const CoordT& min, const CoordT& max) + __hostdev__ BBox(__global__ const CoordT& min, __global__ const CoordT& max) : BaseT(min, max) { } template - __hostdev__ BBox(BBox& other, const SplitT&) + __hostdev__ BBox(__global__ BBox& other, __global__ const SplitT&) : BaseT(other.mCoord[0], other.mCoord[1]) { NANOVDB_ASSERT(this->is_divisible()); @@ -1764,7 +2007,7 @@ struct BBox : public BaseBBox other.mCoord[0][n] = mCoord[1][n] + 1; } - __hostdev__ static BBox createCube(const CoordT& min, typename CoordT::ValueType dim) + __hostdev__ static BBox createCube(__global__ const CoordT& min, typename CoordT::ValueType dim) { return BBox(min, min.offsetBy(dim - 1)); } @@ -1778,15 +2021,23 @@ struct BBox : public BaseBBox mCoord[0][2] > mCoord[1][2]; } __hostdev__ CoordT dim() const { return this->empty() ? Coord(0) : this->max() - this->min() + Coord(1); } __hostdev__ uint64_t volume() const { auto d = this->dim(); return uint64_t(d[0])*uint64_t(d[1])*uint64_t(d[2]); } - __hostdev__ bool isInside(const CoordT& p) const { return !(CoordT::lessThan(p, this->min()) || CoordT::lessThan(this->max(), p)); } - /// @brief Return @c true if the given bounding box is inside this bounding box. 
- __hostdev__ bool isInside(const BBox& b) const + __hostdev__ bool isInside(__global__ const CoordT& p) const { return !(CoordT::lessThan(p, this->min()) || CoordT::lessThan(this->max(), p)); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isInside(__local__ const CoordT& p) const { return !(CoordT::lessThan(p, this->min()) || CoordT::lessThan(this->max(), p)); } +#endif + __hostdev__ bool isInside(__global__ const BBox& b) const { return !(CoordT::lessThan(b.min(), this->min()) || CoordT::lessThan(this->max(), b.max())); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isInside(__local__ const BBox& b) const + { + return !(CoordT::lessThan(b.min(), this->min()) || CoordT::lessThan(this->max(), b.max())); + } +#endif /// @brief Return @c true if the given bounding box overlaps with this bounding box. - __hostdev__ bool hasOverlap(const BBox& b) const + __hostdev__ bool hasOverlap(__global__ const BBox& b) const { return !(CoordT::lessThan(this->max(), b.min()) || CoordT::lessThan(b.max(), this->min())); } @@ -1826,6 +2077,8 @@ __hostdev__ static inline uint32_t FindLowestOn(uint32_t v) return static_cast(index); #elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) return static_cast(__builtin_ctzl(v)); +#elif defined(__KERNEL_METAL__) + return ctz(v); #else //#warning Using software implementation for FindLowestOn(uint32_t) static const unsigned char DeBruijn[32] = { @@ -1856,6 +2109,8 @@ __hostdev__ static inline uint32_t FindHighestOn(uint32_t v) return static_cast(index); #elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) return sizeof(unsigned long) * 8 - 1 - __builtin_clzl(v); +#elif defined(__KERNEL_METAL__) + return clz(v); #else //#warning Using software implementation for FindHighestOn(uint32_t) static const unsigned char DeBruijn[32] = { @@ -1884,6 +2139,8 @@ __hostdev__ static inline uint32_t FindLowestOn(uint64_t v) return static_cast(index); #elif (defined(__GNUC__) || 
defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) return static_cast(__builtin_ctzll(v)); +#elif defined(__KERNEL_METAL__) + return ctz(v); #else //#warning Using software implementation for FindLowestOn(uint64_t) static const unsigned char DeBruijn[64] = { @@ -1918,6 +2175,8 @@ __hostdev__ static inline uint32_t FindHighestOn(uint64_t v) return static_cast(index); #elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) return sizeof(unsigned long) * 8 - 1 - __builtin_clzll(v); +#elif defined(__KERNEL_METAL__) + return clz(v); #else const uint32_t* p = reinterpret_cast(&v); return p[1] ? 32u + FindHighestOn(p[1]) : FindHighestOn(p[0]); @@ -1955,8 +2214,8 @@ __hostdev__ inline uint32_t CountOn(uint64_t v) template class Mask { - static constexpr uint32_t SIZE = 1U << (3 * LOG2DIM); // Number of bits in mask - static constexpr uint32_t WORD_COUNT = SIZE >> 6; // Number of 64 bit words + static __constant__ constexpr uint32_t SIZE = 1U << (3 * LOG2DIM); // Number of bits in mask + static __constant__ constexpr uint32_t WORD_COUNT = SIZE >> 6; // Number of 64 bit words uint64_t mWords[WORD_COUNT]; public: @@ -1973,7 +2232,7 @@ public: __hostdev__ uint32_t countOn() const { uint32_t sum = 0, n = WORD_COUNT; - for (const uint64_t* w = mWords; n--; ++w) + for (__global__ const uint64_t* w = mWords; n--; ++w) sum += CountOn(*w); return sum; } @@ -1982,7 +2241,7 @@ public: inline __hostdev__ uint32_t countOn(uint32_t i) const { uint32_t n = i >> 6, sum = CountOn( mWords[n] & ((uint64_t(1) << (i & 63u))-1u) ); - for (const uint64_t* w = mWords; n--; ++w) sum += CountOn(*w); + for (__global__ const uint64_t* w = mWords; n--; ++w) sum += CountOn(*w); return sum; } @@ -1990,13 +2249,21 @@ public: class Iterator { public: - __hostdev__ Iterator() : mPos(Mask::SIZE), mParent(nullptr){} - __hostdev__ Iterator(uint32_t pos, const Mask* parent) : mPos(pos), mParent(parent){} - Iterator& operator=(const Iterator&) = default; + __hostdev__ Iterator() + 
: mPos(Mask::SIZE) + , mParent(nullptr) + { + } + __hostdev__ Iterator(uint32_t pos, __global__ const Mask* parent) + : mPos(pos) + , mParent(parent) + { + } + __global__ Iterator& operator=(__global__ const Iterator&) = default; __hostdev__ uint32_t operator*() const { return mPos; } __hostdev__ uint32_t pos() const { return mPos; } __hostdev__ operator bool() const { return mPos != Mask::SIZE; } - __hostdev__ Iterator& operator++() + __hostdev__ __global__ Iterator& operator++() { mPos = mParent->findNext(mPos + 1); return *this; @@ -2010,7 +2277,7 @@ public: private: uint32_t mPos; - const Mask* mParent; + __global__ const Mask* mParent; }; // Member class Iterator using OnIterator = Iterator; @@ -2034,7 +2301,7 @@ public: } /// @brief Copy constructor - __hostdev__ Mask(const Mask& other) + __hostdev__ Mask(__global__ const Mask& other) { for (uint32_t i = 0; i < WORD_COUNT; ++i) mWords[i] = other.mWords[i]; @@ -2042,36 +2309,36 @@ public: /// @brief Return a const reference to the nth word of the bit mask, for a word of arbitrary size. template - __hostdev__ const WordT& getWord(int n) const + __hostdev__ __global__ const WordT& getWord(int n) const { NANOVDB_ASSERT(n * 8 * sizeof(WordT) < SIZE); - return reinterpret_cast(mWords)[n]; + return reinterpret_cast<__global__ const WordT*>(mWords)[n]; } /// @brief Return a reference to the nth word of the bit mask, for a word of arbitrary size. 
template - __hostdev__ WordT& getWord(int n) + __hostdev__ __global__ WordT& getWord(int n) { NANOVDB_ASSERT(n * 8 * sizeof(WordT) < SIZE); - return reinterpret_cast(mWords)[n]; + return reinterpret_cast<__global__ WordT*>(mWords)[n]; } /// @brief Assignment operator that works with openvdb::util::NodeMask template - __hostdev__ Mask& operator=(const MaskT& other) + __hostdev__ __global__ Mask& operator=(__global__ const MaskT& other) { static_assert(sizeof(Mask) == sizeof(MaskT), "Mismatching sizeof"); static_assert(WORD_COUNT == MaskT::WORD_COUNT, "Mismatching word count"); static_assert(LOG2DIM == MaskT::LOG2DIM, "Mismatching LOG2DIM"); - auto *src = reinterpret_cast(&other); - uint64_t *dst = mWords; + __global__ auto *src = reinterpret_cast<__global__ const uint64_t*>(&other); + __global__ uint64_t *dst = mWords; for (uint32_t i = 0; i < WORD_COUNT; ++i) { *dst++ = *src++; } return *this; } - __hostdev__ bool operator==(const Mask& other) const + __hostdev__ bool operator==(__global__ const Mask& other) const { for (uint32_t i = 0; i < WORD_COUNT; ++i) { if (mWords[i] != other.mWords[i]) return false; @@ -2079,22 +2346,33 @@ public: return true; } - __hostdev__ bool operator!=(const Mask& other) const { return !((*this) == other); } + __hostdev__ bool operator!=(__global__ const Mask& other) const { return !((*this) == other); } /// @brief Return true if the given bit is set. - __hostdev__ bool isOn(uint32_t n) const { return 0 != (mWords[n >> 6] & (uint64_t(1) << (n & 63))); } - + __hostdev__ bool isOn(uint32_t n) const __global__ { return 0 != (mWords[n >> 6] & (uint64_t(1) << (n & 63))); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isOn(uint32_t n) const __local__ { return 0 != (mWords[n >> 6] & (uint64_t(1) << (n & 63))); } +#endif /// @brief Return true if the given bit is NOT set. 
- __hostdev__ bool isOff(uint32_t n) const { return 0 == (mWords[n >> 6] & (uint64_t(1) << (n & 63))); } + __hostdev__ bool isOff(uint32_t n) const __global__ { return 0 == (mWords[n >> 6] & (uint64_t(1) << (n & 63))); } /// @brief Return true if all the bits are set in this Mask. - __hostdev__ bool isOn() const + __hostdev__ bool isOn() const __global__ { for (uint32_t i = 0; i < WORD_COUNT; ++i) if (mWords[i] != ~uint64_t(0)) return false; return true; } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isOn() const __local__ + { + for (uint32_t i = 0; i < WORD_COUNT; ++i) + if (mWords[i] != ~uint64_t(0)) + return false; + return true; + } +#endif /// @brief Return true if none of the bits are set in this Mask. __hostdev__ bool isOff() const @@ -2115,7 +2393,7 @@ public: __hostdev__ void set(uint32_t n, bool On) { #if 1 // switch between branchless - auto &word = mWords[n >> 6]; + __global__ auto &word = mWords[n >> 6]; n &= 63; word &= ~(uint64_t(1) << n); word |= uint64_t(On) << n; @@ -2149,40 +2427,40 @@ public: __hostdev__ void toggle() { uint32_t n = WORD_COUNT; - for (auto* w = mWords; n--; ++w) + for (__global__ auto* w = mWords; n--; ++w) *w = ~*w; } __hostdev__ void toggle(uint32_t n) { mWords[n >> 6] ^= uint64_t(1) << (n & 63); } /// @brief Bitwise intersection - __hostdev__ Mask& operator&=(const Mask& other) + __hostdev__ __global__ Mask& operator&=(__global__ const Mask& other) { - uint64_t *w1 = mWords; - const uint64_t *w2 = other.mWords; + __global__ uint64_t *w1 = mWords; + __global__ const uint64_t *w2 = other.mWords; for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 &= *w2; return *this; } /// @brief Bitwise union - __hostdev__ Mask& operator|=(const Mask& other) + __hostdev__ __global__ Mask& operator|=(__global__ const Mask& other) { - uint64_t *w1 = mWords; - const uint64_t *w2 = other.mWords; + __global__ uint64_t *w1 = mWords; + __global__ const uint64_t *w2 = other.mWords; for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 |= *w2; return 
*this; } /// @brief Bitwise difference - __hostdev__ Mask& operator-=(const Mask& other) + __hostdev__ __global__ Mask& operator-=(__global__ const Mask& other) { - uint64_t *w1 = mWords; - const uint64_t *w2 = other.mWords; + __global__ uint64_t *w1 = mWords; + __global__ const uint64_t *w2 = other.mWords; for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 &= ~*w2; return *this; } /// @brief Bitwise XOR - __hostdev__ Mask& operator^=(const Mask& other) + __hostdev__ __global__ Mask& operator^=(__global__ const Mask& other) { - uint64_t *w1 = mWords; - const uint64_t *w2 = other.mWords; + __global__ uint64_t *w1 = mWords; + __global__ const uint64_t *w2 = other.mWords; for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 ^= *w2; return *this; } @@ -2194,7 +2472,7 @@ private: __hostdev__ uint32_t findFirst() const { uint32_t n = 0; - const uint64_t* w = mWords; + __global__ const uint64_t* w = mWords; for (; n - __hostdev__ void set(const Mat3T& mat, const Mat3T& invMat, const Vec3T& translate, double taper); + __hostdev__ void set(__global__ const Mat3T& mat, __global__ const Mat3T& invMat, __global__ const Vec3T& translate, double taper) __global__; /// @brief Initialize the member data /// @note The last (4th) row of invMat is actually ignored. 
template - __hostdev__ void set(const Mat4T& mat, const Mat4T& invMat, double taper) {this->set(mat, invMat, mat[3], taper);} + __hostdev__ void set(__global__ const Mat4T& mat, __global__ const Mat4T& invMat, double taper) __global__ {this->set(mat, invMat, mat[3], taper);} template - __hostdev__ void set(double scale, const Vec3T &translation, double taper); + __hostdev__ void set(double scale, __global__ const Vec3T &translation, double taper) __global__; template - __hostdev__ Vec3T applyMap(const Vec3T& xyz) const { return matMult(mMatD, mVecD, xyz); } + __hostdev__ Vec3T applyMap(__global__ const Vec3T& xyz) const { return matMult(mMatD, mVecD, xyz); } template - __hostdev__ Vec3T applyMapF(const Vec3T& xyz) const { return matMult(mMatF, mVecF, xyz); } + __hostdev__ Vec3T applyMapF(__global__ const Vec3T& xyz) const { return matMult(mMatF, mVecF, xyz); } template - __hostdev__ Vec3T applyJacobian(const Vec3T& xyz) const { return matMult(mMatD, xyz); } + __hostdev__ Vec3T applyJacobian(__global__ const Vec3T& xyz) const { return matMult(mMatD, xyz); } template - __hostdev__ Vec3T applyJacobianF(const Vec3T& xyz) const { return matMult(mMatF, xyz); } + __hostdev__ Vec3T applyJacobianF(__global__ const Vec3T& xyz) const { return matMult(mMatF, xyz); } template - __hostdev__ Vec3T applyInverseMap(const Vec3T& xyz) const + __hostdev__ Vec3T applyInverseMap(__global__ const Vec3T& xyz) const __global__ { return matMult(mInvMatD, Vec3T(xyz[0] - mVecD[0], xyz[1] - mVecD[1], xyz[2] - mVecD[2])); } +#if defined(__KERNEL_METAL__) template - __hostdev__ Vec3T applyInverseMapF(const Vec3T& xyz) const + __hostdev__ Vec3T applyInverseMap(__local__ const Vec3T& xyz) const __global__ + { + return matMult(mInvMatD, Vec3T(xyz[0] - mVecD[0], xyz[1] - mVecD[1], xyz[2] - mVecD[2])); + } +#endif + template + __hostdev__ Vec3T applyInverseMapF(const __global__ Vec3T& xyz) const __global__ { return matMult(mInvMatF, Vec3T(xyz[0] - mVecF[0], xyz[1] - mVecF[1], xyz[2] - mVecF[2])); } 
+#if defined(__KERNEL_METAL__) + template + __hostdev__ Vec3T applyInverseMapF(const __local__ Vec3T& xyz) const __global__ + { + return matMult(mInvMatF, Vec3T(xyz[0] - mVecF[0], xyz[1] - mVecF[1], xyz[2] - mVecF[2])); + } +#endif template - __hostdev__ Vec3T applyInverseJacobian(const Vec3T& xyz) const { return matMult(mInvMatD, xyz); } + __hostdev__ Vec3T applyInverseJacobian(__global__ const Vec3T& xyz) const __global__ { return matMult(mInvMatD, xyz); } template - __hostdev__ Vec3T applyInverseJacobianF(const Vec3T& xyz) const { return matMult(mInvMatF, xyz); } + __hostdev__ Vec3T applyInverseJacobianF(__global__ const Vec3T& xyz) const __global__ { return matMult(mInvMatF, xyz); } +#if defined(__KERNEL_METAL__) + template + __hostdev__ Vec3T applyInverseJacobianF(__local__ const Vec3T& xyz) const __global__ { return matMult(mInvMatF, xyz); } +#endif template - __hostdev__ Vec3T applyIJT(const Vec3T& xyz) const { return matMultT(mInvMatD, xyz); } + __hostdev__ Vec3T applyIJT(__global__ const Vec3T& xyz) const { return matMultT(mInvMatD, xyz); } template - __hostdev__ Vec3T applyIJTF(const Vec3T& xyz) const { return matMultT(mInvMatF, xyz); } + __hostdev__ Vec3T applyIJTF(__global__ const Vec3T& xyz) const { return matMultT(mInvMatF, xyz); } }; // Map template -__hostdev__ inline void Map::set(const Mat3T& mat, const Mat3T& invMat, const Vec3T& translate, double taper) +__hostdev__ inline void Map::set(__global__ const Mat3T& mat, __global__ const Mat3T& invMat, __global__ const Vec3T& translate, double taper) __global__ { - float *mf = mMatF, *vf = mVecF, *mif = mInvMatF; - double *md = mMatD, *vd = mVecD, *mid = mInvMatD; + __global__ float * mf = mMatF, *vf = mVecF; + __global__ float* mif = mInvMatF; + __global__ double *md = mMatD, *vd = mVecD; + __global__ double* mid = mInvMatD; mTaperF = static_cast(taper); mTaperD = taper; for (int i = 0; i < 3; ++i) { @@ -2295,8 +2593,19 @@ __hostdev__ inline void Map::set(const Mat3T& mat, const Mat3T& invMat, const 
Ve } template -__hostdev__ inline void Map::set(double dx, const Vec3T &trans, double taper) +__hostdev__ inline void Map::set(double dx, __global__ const Vec3T &trans, double taper) __global__ { +#if defined __KERNEL_METAL__ + const float mat[3][3] = { + {(float)dx, 0.0, 0.0}, // row 0 + {0.0, (float)dx, 0.0}, // row 1 + {0.0, 0.0, (float)dx}, // row 2 + }, idx = 1.0/(float)dx, invMat[3][3] = { + {idx, 0.0, 0.0}, // row 0 + {0.0, idx, 0.0}, // row 1 + {0.0, 0.0, idx}, // row 2 + }; +#else const double mat[3][3] = { {dx, 0.0, 0.0}, // row 0 {0.0, dx, 0.0}, // row 1 @@ -2306,6 +2615,7 @@ __hostdev__ inline void Map::set(double dx, const Vec3T &trans, double taper) {0.0, idx, 0.0}, // row 1 {0.0, 0.0, idx}, // row 2 }; +#endif this->set(mat, invMat, trans, taper); } @@ -2313,7 +2623,7 @@ __hostdev__ inline void Map::set(double dx, const Vec3T &trans, double taper) struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridBlindMetaData { - static const int MaxNameSize = 256;// due to NULL termination the maximum length is one less! + static __constant__ const int MaxNameSize = 256;// due to NULL termination the maximum length is one less! int64_t mByteOffset; // byte offset to the blind data, relative to the GridData. uint64_t mElementCount; // number of elements, e.g. point count uint32_t mFlags; // flags @@ -2328,10 +2638,10 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridBlindMetaData return blindDataCount * sizeof(GridBlindMetaData); } - __hostdev__ void setBlindData(void *ptr) { mByteOffset = PtrDiff(ptr, this); } + __hostdev__ void setBlindData(__global__ void *ptr) __global__ { mByteOffset = PtrDiff(ptr, this); } template - __hostdev__ const T* getBlindData() const { return PtrAdd(this, mByteOffset); } + __hostdev__ __global__ const T* getBlindData() const { return PtrAdd(this, mByteOffset); } }; // GridBlindMetaData @@ -2430,7 +2740,7 @@ struct NodeTrait /// @note No client code should (or can) interface with this struct so it can safely be ignored! 
struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData {// sizeof(GridData) = 672B - static const int MaxNameSize = 256;// due to NULL termination the maximum length is one less + static __constant__ const int MaxNameSize = 256;// due to NULL termination the maximum length is one less uint64_t mMagic; // 8B (0) magic to validate it is valid grid data. uint64_t mChecksum; // 8B (8). Checksum of grid buffer. Version mVersion;// 4B (16) major, minor, and patch version numbers @@ -2450,8 +2760,8 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData uint64_t mData1, mData2;// 2x8B (656) padding to 32 B alignment. mData1 is use for the total number of values indexed by an IndexGrid // Set and unset various bit flags - __hostdev__ void setFlagsOff() { mFlags = uint32_t(0); } - __hostdev__ void setMinMaxOn(bool on = true) + __hostdev__ void setFlagsOff() __global__ { mFlags = uint32_t(0); } + __hostdev__ void setMinMaxOn(bool on = true) __global__ { if (on) { mFlags |= static_cast(GridFlags::HasMinMax); @@ -2459,7 +2769,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData mFlags &= ~static_cast(GridFlags::HasMinMax); } } - __hostdev__ void setBBoxOn(bool on = true) + __hostdev__ void setBBoxOn(bool on = true) __global__ { if (on) { mFlags |= static_cast(GridFlags::HasBBox); @@ -2467,7 +2777,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData mFlags &= ~static_cast(GridFlags::HasBBox); } } - __hostdev__ void setLongGridNameOn(bool on = true) + __hostdev__ void setLongGridNameOn(bool on = true) __global__ { if (on) { mFlags |= static_cast(GridFlags::HasLongGridName); @@ -2475,7 +2785,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData mFlags &= ~static_cast(GridFlags::HasLongGridName); } } - __hostdev__ void setAverageOn(bool on = true) + __hostdev__ void setAverageOn(bool on = true) __global__ { if (on) { mFlags |= static_cast(GridFlags::HasAverage); @@ -2483,7 +2793,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData mFlags &= 
~static_cast(GridFlags::HasAverage); } } - __hostdev__ void setStdDeviationOn(bool on = true) + __hostdev__ void setStdDeviationOn(bool on = true) __global__ { if (on) { mFlags |= static_cast(GridFlags::HasStdDeviation); @@ -2491,7 +2801,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData mFlags &= ~static_cast(GridFlags::HasStdDeviation); } } - __hostdev__ void setBreadthFirstOn(bool on = true) + __hostdev__ void setBreadthFirstOn(bool on = true) __global__ { if (on) { mFlags |= static_cast(GridFlags::IsBreadthFirst); @@ -2502,37 +2812,49 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData // Affine transformations based on double precision template - __hostdev__ Vec3T applyMap(const Vec3T& xyz) const { return mMap.applyMap(xyz); } // Pos: index -> world + __hostdev__ Vec3T applyMap(__global__ const Vec3T& xyz) const __global__ { return mMap.applyMap(xyz); } // Pos: index -> world template - __hostdev__ Vec3T applyInverseMap(const Vec3T& xyz) const { return mMap.applyInverseMap(xyz); } // Pos: world -> index + __hostdev__ Vec3T applyInverseMap(__global__ const Vec3T& xyz) const __global__ { return mMap.applyInverseMap(xyz); } // Pos: world -> index +#if defined(__KERNEL_METAL__) template - __hostdev__ Vec3T applyJacobian(const Vec3T& xyz) const { return mMap.applyJacobian(xyz); } // Dir: index -> world + __hostdev__ Vec3T applyInverseMap(__local__ const Vec3T& xyz) const __global__ { return mMap.applyInverseMap(xyz); } // Pos: world -> index +#endif template - __hostdev__ Vec3T applyInverseJacobian(const Vec3T& xyz) const { return mMap.applyInverseJacobian(xyz); } // Dir: world -> index + __hostdev__ Vec3T applyJacobian(__global__ const Vec3T& xyz) const __global__ { return mMap.applyJacobian(xyz); } // Dir: index -> world template - __hostdev__ Vec3T applyIJT(const Vec3T& xyz) const { return mMap.applyIJT(xyz); } + __hostdev__ Vec3T applyInverseJacobian(__global__ const Vec3T& xyz) const __global__ { return mMap.applyInverseJacobian(xyz); } // Dir: 
world -> index + template + __hostdev__ Vec3T applyIJT(__global__ const Vec3T& xyz) const __global__ { return mMap.applyIJT(xyz); } // Affine transformations based on single precision template - __hostdev__ Vec3T applyMapF(const Vec3T& xyz) const { return mMap.applyMapF(xyz); } // Pos: index -> world + __hostdev__ Vec3T applyMapF(__global__ const Vec3T& xyz) const __global__ { return mMap.applyMapF(xyz); } // Pos: index -> world template - __hostdev__ Vec3T applyInverseMapF(const Vec3T& xyz) const { return mMap.applyInverseMapF(xyz); } // Pos: world -> index + __hostdev__ Vec3T applyInverseMapF(__global__ const Vec3T& xyz) const __global__ { return mMap.applyInverseMapF(xyz); } // Pos: world -> index +#if defined(__KERNEL_METAL__) template - __hostdev__ Vec3T applyJacobianF(const Vec3T& xyz) const { return mMap.applyJacobianF(xyz); } // Dir: index -> world + __hostdev__ Vec3T applyInverseMapF(__local__ const Vec3T& xyz) const __global__ { return mMap.applyInverseMapF(xyz); } // Pos: world -> index +#endif template - __hostdev__ Vec3T applyInverseJacobianF(const Vec3T& xyz) const { return mMap.applyInverseJacobianF(xyz); } // Dir: world -> index + __hostdev__ Vec3T applyJacobianF(__global__ const Vec3T& xyz) const __global__ { return mMap.applyJacobianF(xyz); } // Dir: index -> world template - __hostdev__ Vec3T applyIJTF(const Vec3T& xyz) const { return mMap.applyIJTF(xyz); } + __hostdev__ Vec3T applyInverseJacobianF(__global__ const Vec3T& xyz) const __global__ { return mMap.applyInverseJacobianF(xyz); } // Dir: world -> index +#if defined(__KERNEL_METAL__) + template + __hostdev__ Vec3T applyInverseJacobianF(__local__ const Vec3T& xyz) const __global__ { return mMap.applyInverseJacobianF(xyz); } // Dir: world -> index +#endif + template + __hostdev__ Vec3T applyIJTF(__global__ const Vec3T& xyz) const __global__ { return mMap.applyIJTF(xyz); } // @brief Return a non-const void pointer to the tree - __hostdev__ void* treePtr() { return this + 1; } + __hostdev__ 
__global__ void* treePtr() __global__ { return this + 1; } // @brief Return a const void pointer to the tree - __hostdev__ const void* treePtr() const { return this + 1; } + __hostdev__ __global__ const void* treePtr() const __global__ { return this + 1; } /// @brief Returns a const reference to the blindMetaData at the specified linear offset. /// /// @warning The linear offset is assumed to be in the valid range - __hostdev__ const GridBlindMetaData* blindMetaData(uint32_t n) const + __hostdev__ __global__ const GridBlindMetaData* blindMetaData(uint32_t n) const __global__ { NANOVDB_ASSERT(n < mBlindMetadataCount); return PtrAdd(this, mBlindMetadataOffset) + n; @@ -2552,8 +2874,17 @@ using DefaultReadAccessor = ReadAccessor; /// /// @note This the API of this class to interface with client code template -class Grid : private GridData +class Grid +#if !defined(__KERNEL_METAL__) + : private GridData +#endif { +#if defined(__KERNEL_METAL__) + GridData _base; +#define BASE(v) _base.v +#else +#define BASE(v) DataType::v +#endif public: using TreeType = TreeT; using RootType = typename TreeT::RootType; @@ -2566,183 +2897,195 @@ public: /// @brief Disallow constructions, copy and assignment /// /// @note Only a Serializer, defined elsewhere, can instantiate this class - Grid(const Grid&) = delete; - Grid& operator=(const Grid&) = delete; + Grid(__global__ const Grid&) __global__ = delete; + __global__ Grid& operator=(__global__ const Grid&) __global__ = delete; ~Grid() = delete; - __hostdev__ Version version() const { return DataType::mVersion; } + __hostdev__ Version version() const __global__ { return BASE(mVersion); } - __hostdev__ DataType* data() { return reinterpret_cast(this); } + __hostdev__ __global__ DataType* data() __global__ { return reinterpret_cast<__global__ DataType*>(this); } - __hostdev__ const DataType* data() const { return reinterpret_cast(this); } + __hostdev__ __global__ const DataType* data() const __global__ { return reinterpret_cast<__global__ 
const DataType*>(this); } /// @brief Return memory usage in bytes for this class only. __hostdev__ static uint64_t memUsage() { return sizeof(GridData); } /// @brief Return the memory footprint of the entire grid, i.e. including all nodes and blind data - __hostdev__ uint64_t gridSize() const { return DataType::mGridSize; } + __hostdev__ uint64_t gridSize() const __global__ { return BASE(mGridSize); } /// @brief Return index of this grid in the buffer - __hostdev__ uint32_t gridIndex() const { return DataType::mGridIndex; } + __hostdev__ uint32_t gridIndex() const __global__ { return BASE(mGridIndex); } /// @brief Return total number of grids in the buffer - __hostdev__ uint32_t gridCount() const { return DataType::mGridCount; } + __hostdev__ uint32_t gridCount() const __global__ { return BASE(mGridCount); } /// @brief @brief Return the total number of values indexed by this IndexGrid /// /// @note This method is only defined for IndexGrid = NanoGrid template - __hostdev__ typename enable_if::value, const uint64_t&>::type valueCount() const {return DataType::mData1;} + __hostdev__ typename enable_if::value, uint64_t>::type valueCount() const {return BASE(mData1);} /// @brief Return a const reference to the tree - __hostdev__ const TreeT& tree() const { return *reinterpret_cast(this->treePtr()); } + __hostdev__ __global__ const TreeT& tree() const __global__ { return *reinterpret_cast<__global__ const TreeT*>(BASE(treePtr)()); } /// @brief Return a non-const reference to the tree - __hostdev__ TreeT& tree() { return *reinterpret_cast(this->treePtr()); } + __hostdev__ __global__ TreeT& tree() __global__ { return *reinterpret_cast<__global__ TreeT*>(BASE(treePtr)()); } /// @brief Return a new instance of a ReadAccessor used to access values in this grid - __hostdev__ AccessorType getAccessor() const { return AccessorType(this->tree().root()); } + __hostdev__ AccessorType getAccessor() const __global__ { return AccessorType(this->tree().root()); } /// @brief Return a 
const reference to the size of a voxel in world units - __hostdev__ const Vec3R& voxelSize() const { return DataType::mVoxelSize; } + __hostdev__ const __global__ Vec3R& voxelSize() const __global__ { return BASE(mVoxelSize); } /// @brief Return a const reference to the Map for this grid - __hostdev__ const Map& map() const { return DataType::mMap; } + __hostdev__ const __global__ Map& map() const __global__ { return BASE(mMap); } /// @brief world to index space transformation template - __hostdev__ Vec3T worldToIndex(const Vec3T& xyz) const { return this->applyInverseMap(xyz); } + __hostdev__ Vec3T worldToIndex(__global__ const Vec3T& xyz) const __global__ { return BASE(applyInverseMap)(xyz); } + +#if defined(__KERNEL_METAL__) + template + __hostdev__ Vec3T worldToIndex(__local__ const Vec3T& xyz) const __global__ { return BASE(applyInverseMap)(xyz); } +#endif /// @brief index to world space transformation template - __hostdev__ Vec3T indexToWorld(const Vec3T& xyz) const { return this->applyMap(xyz); } + __hostdev__ Vec3T indexToWorld(__global__ const Vec3T& xyz) const __global__ { return BASE(applyMap)(xyz); } /// @brief transformation from index space direction to world space direction /// @warning assumes dir to be normalized template - __hostdev__ Vec3T indexToWorldDir(const Vec3T& dir) const { return this->applyJacobian(dir); } + __hostdev__ Vec3T indexToWorldDir(__global__ const Vec3T& dir) const __global__ { return BASE(applyJacobian)(dir); } /// @brief transformation from world space direction to index space direction /// @warning assumes dir to be normalized template - __hostdev__ Vec3T worldToIndexDir(const Vec3T& dir) const { return this->applyInverseJacobian(dir); } + __hostdev__ Vec3T worldToIndexDir(__global__ const Vec3T& dir) const __global__ { return BASE(applyInverseJacobian)(dir); } /// @brief transform the gradient from index space to world space. /// @details Applies the inverse jacobian transform map. 
template - __hostdev__ Vec3T indexToWorldGrad(const Vec3T& grad) const { return this->applyIJT(grad); } + __hostdev__ Vec3T indexToWorldGrad(__global__ const Vec3T& grad) const __global__ { return BASE(applyIJT)(grad); } /// @brief world to index space transformation template - __hostdev__ Vec3T worldToIndexF(const Vec3T& xyz) const { return this->applyInverseMapF(xyz); } + __hostdev__ Vec3T worldToIndexF(__global__ const Vec3T& xyz) const __global__ { return BASE(applyInverseMapF)(xyz); } +#if defined(__KERNEL_METAL__) + template + __hostdev__ Vec3T worldToIndexF(__local__ const Vec3T& xyz) const __global__ { return BASE(applyInverseMapF)(xyz); } +#endif /// @brief index to world space transformation template - __hostdev__ Vec3T indexToWorldF(const Vec3T& xyz) const { return this->applyMapF(xyz); } + __hostdev__ Vec3T indexToWorldF(__global__ const Vec3T& xyz) const __global__ { return BASE(applyMapF)(xyz); } /// @brief transformation from index space direction to world space direction /// @warning assumes dir to be normalized template - __hostdev__ Vec3T indexToWorldDirF(const Vec3T& dir) const { return this->applyJacobianF(dir); } + __hostdev__ Vec3T indexToWorldDirF(__global__ const Vec3T& dir) const __global__ { return BASE(applyJacobianF)(dir); } /// @brief transformation from world space direction to index space direction /// @warning assumes dir to be normalized template - __hostdev__ Vec3T worldToIndexDirF(const Vec3T& dir) const { return this->applyInverseJacobianF(dir); } + __hostdev__ Vec3T worldToIndexDirF(__global__ const Vec3T& dir) const __global__ { return BASE(applyInverseJacobianF)(dir); } +#if defined(__KERNEL_METAL__) + template + __hostdev__ Vec3T worldToIndexDirF(__local__ const Vec3T& dir) const __global__ { return BASE(applyInverseJacobianF)(dir); } +#endif /// @brief Transforms the gradient from index space to world space. /// @details Applies the inverse jacobian transform map. 
template - __hostdev__ Vec3T indexToWorldGradF(const Vec3T& grad) const { return DataType::applyIJTF(grad); } + __hostdev__ Vec3T indexToWorldGradF(__global__ const Vec3T& grad) const __global__ { return BASE(applyIJTF(grad)); } /// @brief Computes a AABB of active values in world space - __hostdev__ const BBox& worldBBox() const { return DataType::mWorldBBox; } + __hostdev__ __global__ const BBox& worldBBox() const __global__ { return BASE(mWorldBBox); } /// @brief Computes a AABB of active values in index space /// /// @note This method is returning a floating point bounding box and not a CoordBBox. This makes /// it more useful for clipping rays. - __hostdev__ const BBox& indexBBox() const { return this->tree().bbox(); } + __hostdev__ __global__ const BBox& indexBBox() const __global__ { return this->tree().bbox(); } /// @brief Return the total number of active voxels in this tree. - __hostdev__ uint64_t activeVoxelCount() const { return this->tree().activeVoxelCount(); } + __hostdev__ uint64_t activeVoxelCount() const __global__ { return this->tree().activeVoxelCount(); } /// @brief Methods related to the classification of this grid - __hostdev__ bool isValid() const { return DataType::mMagic == NANOVDB_MAGIC_NUMBER; } - __hostdev__ const GridType& gridType() const { return DataType::mGridType; } - __hostdev__ const GridClass& gridClass() const { return DataType::mGridClass; } - __hostdev__ bool isLevelSet() const { return DataType::mGridClass == GridClass::LevelSet; } - __hostdev__ bool isFogVolume() const { return DataType::mGridClass == GridClass::FogVolume; } - __hostdev__ bool isStaggered() const { return DataType::mGridClass == GridClass::Staggered; } - __hostdev__ bool isPointIndex() const { return DataType::mGridClass == GridClass::PointIndex; } - __hostdev__ bool isGridIndex() const { return DataType::mGridClass == GridClass::IndexGrid; } - __hostdev__ bool isPointData() const { return DataType::mGridClass == GridClass::PointData; } - __hostdev__ bool 
isMask() const { return DataType::mGridClass == GridClass::Topology; } - __hostdev__ bool isUnknown() const { return DataType::mGridClass == GridClass::Unknown; } - __hostdev__ bool hasMinMax() const { return DataType::mFlags & static_cast(GridFlags::HasMinMax); } - __hostdev__ bool hasBBox() const { return DataType::mFlags & static_cast(GridFlags::HasBBox); } - __hostdev__ bool hasLongGridName() const { return DataType::mFlags & static_cast(GridFlags::HasLongGridName); } - __hostdev__ bool hasAverage() const { return DataType::mFlags & static_cast(GridFlags::HasAverage); } - __hostdev__ bool hasStdDeviation() const { return DataType::mFlags & static_cast(GridFlags::HasStdDeviation); } - __hostdev__ bool isBreadthFirst() const { return DataType::mFlags & static_cast(GridFlags::IsBreadthFirst); } + __hostdev__ bool isValid() const __global__ { return BASE(mMagic) == NANOVDB_MAGIC_NUMBER; } + __hostdev__ const __global__ GridType& gridType() const __global__ { return BASE(mGridType); } + __hostdev__ const __global__ GridClass& gridClass() const __global__ { return BASE(mGridClass); } + __hostdev__ bool isLevelSet() const __global__ { return BASE(mGridClass) == GridClass::LevelSet; } + __hostdev__ bool isFogVolume() const __global__ { return BASE(mGridClass) == GridClass::FogVolume; } + __hostdev__ bool isStaggered() const __global__ { return BASE(mGridClass) == GridClass::Staggered; } + __hostdev__ bool isPointIndex() const __global__ { return BASE(mGridClass) == GridClass::PointIndex; } + __hostdev__ bool isGridIndex() const __global__ { return BASE(mGridClass) == GridClass::IndexGrid; } + __hostdev__ bool isPointData() const __global__ { return BASE(mGridClass) == GridClass::PointData; } + __hostdev__ bool isMask() const __global__ { return BASE(mGridClass) == GridClass::Topology; } + __hostdev__ bool isUnknown() const __global__ { return BASE(mGridClass) == GridClass::Unknown; } + __hostdev__ bool hasMinMax() const __global__ { return BASE(mFlags) & 
static_cast(GridFlags::HasMinMax); } + __hostdev__ bool hasBBox() const __global__ { return BASE(mFlags) & static_cast(GridFlags::HasBBox); } + __hostdev__ bool hasLongGridName() const __global__ { return BASE(mFlags) & static_cast(GridFlags::HasLongGridName); } + __hostdev__ bool hasAverage() const __global__ { return BASE(mFlags) & static_cast(GridFlags::HasAverage); } + __hostdev__ bool hasStdDeviation() const __global__ { return BASE(mFlags) & static_cast(GridFlags::HasStdDeviation); } + __hostdev__ bool isBreadthFirst() const __global__ { return BASE(mFlags) & static_cast(GridFlags::IsBreadthFirst); } /// @brief return true if the specified node type is layed out breadth-first in memory and has a fixed size. /// This allows for sequential access to the nodes. template - __hostdev__ bool isSequential() const { return NodeT::FIXED_SIZE && this->isBreadthFirst(); } + __hostdev__ bool isSequential() const __global__ { return NodeT::FIXED_SIZE && this->isBreadthFirst(); } /// @brief return true if the specified node level is layed out breadth-first in memory and has a fixed size. /// This allows for sequential access to the nodes. 
template - __hostdev__ bool isSequential() const { return NodeTrait::type::FIXED_SIZE && this->isBreadthFirst(); } + __hostdev__ bool isSequential() const __global__ { return NodeTrait::type::FIXED_SIZE && this->isBreadthFirst(); } /// @brief Return a c-string with the name of this grid - __hostdev__ const char* gridName() const + __hostdev__ __global__ const char* gridName() const __global__ { if (this->hasLongGridName()) { NANOVDB_ASSERT(DataType::mBlindMetadataCount>0); - const auto &metaData = this->blindMetaData(DataType::mBlindMetadataCount-1);// always the last + __global__ const auto &metaData = this->blindMetaData(BASE(mBlindMetadataCount)-1);// always the last NANOVDB_ASSERT(metaData.mDataClass == GridBlindDataClass::GridName); return metaData.template getBlindData(); } - return DataType::mGridName; + return BASE(mGridName); } /// @brief Return a c-string with the name of this grid, truncated to 255 characters - __hostdev__ const char* shortGridName() const { return DataType::mGridName; } - + __hostdev__ __global__ const char* shortGridName() const __global__ { return BASE(mGridName); } /// @brief Return checksum of the grid buffer. - __hostdev__ uint64_t checksum() const { return DataType::mChecksum; } + __hostdev__ uint64_t checksum() const __global__ { return BASE(mChecksum); } /// @brief Return true if this grid is empty, i.e. contains no values or nodes. - __hostdev__ bool isEmpty() const { return this->tree().isEmpty(); } + __hostdev__ bool isEmpty() const __global__ { return this->tree().isEmpty(); } /// @brief Return the count of blind-data encoded in this grid - __hostdev__ uint32_t blindDataCount() const { return DataType::mBlindMetadataCount; } + __hostdev__ uint32_t blindDataCount() const __global__ { return BASE(mBlindMetadataCount); } /// @brief Return the index of the blind data with specified semantic if found, otherwise -1. 
- __hostdev__ int findBlindDataForSemantic(GridBlindDataSemantic semantic) const; + __hostdev__ int findBlindDataForSemantic(GridBlindDataSemantic semantic) const __global__; /// @brief Returns a const pointer to the blindData at the specified linear offset. /// /// @warning Point might be NULL and the linear offset is assumed to be in the valid range - __hostdev__ const void* blindData(uint32_t n) const + __hostdev__ __global__ const void* blindData(uint32_t n) const __global__ { - if (DataType::mBlindMetadataCount == 0u) { + if (BASE(mBlindMetadataCount) == 0u) { return nullptr; } NANOVDB_ASSERT(n < DataType::mBlindMetadataCount); return this->blindMetaData(n).template getBlindData(); } - - __hostdev__ const GridBlindMetaData& blindMetaData(uint32_t n) const { return *DataType::blindMetaData(n); } + + __hostdev__ __global__ const GridBlindMetaData& blindMetaData(uint32_t n) const __global__ { return *BASE(blindMetaData)(n); } private: static_assert(sizeof(GridData) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(GridData) is misaligned"); }; // Class Grid template -__hostdev__ int Grid::findBlindDataForSemantic(GridBlindDataSemantic semantic) const +__hostdev__ int Grid::findBlindDataForSemantic(GridBlindDataSemantic semantic) const __global__ { for (uint32_t i = 0, n = this->blindDataCount(); i < n; ++i) if (this->blindMetaData(i).mSemantic == semantic) @@ -2762,14 +3105,14 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) TreeData uint64_t mVoxelCount;// 8B, total number of active voxels in the root and all its child nodes. 
// No padding since it's always 32B aligned template - __hostdev__ void setRoot(const RootT* root) { mNodeOffset[3] = PtrDiff(root, this); } + __hostdev__ void setRoot(__global__ const RootT* root) __global__ { mNodeOffset[3] = PtrDiff(root, this); } template - __hostdev__ RootT* getRoot() { return PtrAdd(this, mNodeOffset[3]); } + __hostdev__ __global__ RootT* getRoot() __global__ { return PtrAdd(this, mNodeOffset[3]); } template - __hostdev__ const RootT* getRoot() const { return PtrAdd(this, mNodeOffset[3]); } + __hostdev__ __global__ const RootT* getRoot() const __global__ { return PtrAdd(this, mNodeOffset[3]); } template - __hostdev__ void setFirstNode(const NodeT* node) + __hostdev__ void setFirstNode(__global__ const NodeT* node) __global__ { mNodeOffset[NodeT::LEVEL] = node ? PtrDiff(node, this) : 0; } @@ -2795,8 +3138,17 @@ struct GridTree /// @brief VDB Tree, which is a thin wrapper around a RootNode. template -class Tree : private TreeData +class Tree +#if !defined(__KERNEL_METAL__) + : private TreeData +#endif { +#if defined(__KERNEL_METAL__) + TreeData _base; +#define BASE(v) _base.v +#else +#define BASE(v) DataType::v +#endif static_assert(RootT::LEVEL == 3, "Tree depth is not supported"); static_assert(RootT::ChildNodeType::LOG2DIM == 5, "Tree configuration is not supported"); static_assert(RootT::ChildNodeType::ChildNodeType::LOG2DIM == 4, "Tree configuration is not supported"); @@ -2817,79 +3169,86 @@ public: using Node0 = LeafNodeType; /// @brief This class cannot be constructed or deleted - Tree() = delete; - Tree(const Tree&) = delete; - Tree& operator=(const Tree&) = delete; - ~Tree() = delete; + Tree() __global__ = delete; + Tree(__global__ const Tree&) __global__ = delete; + __global__ Tree& operator=(__global__ const Tree&) __global__ = delete; + ~Tree() __global__ = delete; - __hostdev__ DataType* data() { return reinterpret_cast(this); } + __hostdev__ __global__ DataType* data() __global__ { return reinterpret_cast<__global__ 
DataType*>(this); } - __hostdev__ const DataType* data() const { return reinterpret_cast(this); } + __hostdev__ __global__ const DataType* data() const __global__ { return reinterpret_cast<__global__ const DataType*>(this); } /// @brief return memory usage in bytes for the class __hostdev__ static uint64_t memUsage() { return sizeof(DataType); } - __hostdev__ RootT& root() { return *DataType::template getRoot(); } + __hostdev__ __global__ RootT& root() __global__ { return *BASE(template) getRoot(); } - __hostdev__ const RootT& root() const { return *DataType::template getRoot(); } + __hostdev__ __global__ const RootT& root() const __global__ { return *BASE(template) getRoot(); } - __hostdev__ AccessorType getAccessor() const { return AccessorType(this->root()); } + __hostdev__ AccessorType getAccessor() const __global__ { return AccessorType(this->root()); } /// @brief Return the value of the given voxel (regardless of state or location in the tree.) - __hostdev__ ValueType getValue(const CoordType& ijk) const { return this->root().getValue(ijk); } + __hostdev__ ValueType getValue(__global__ const CoordType& ijk) const __global__ { return this->root().getValue(ijk); } +#if defined(__KERNEL_METAL__) + __hostdev__ ValueType getValue(__local__ const CoordType& ijk) const __global__ { return this->root().getValue(ijk); } +#endif /// @brief Return the active state of the given voxel (regardless of state or location in the tree.) - __hostdev__ bool isActive(const CoordType& ijk) const { return this->root().isActive(ijk); } + __hostdev__ bool isActive(__global__ const CoordType& ijk) const __global__ { return this->root().isActive(ijk); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isActive(__local__ const CoordType& ijk) const __global__ { return this->root().isActive(ijk); } + __hostdev__ bool isActive(__local__ const CoordType& ijk) const __local__ { return this->root().isActive(ijk); } +#endif /// @brief Return true if this tree is empty, i.e. 
contains no values or nodes - __hostdev__ bool isEmpty() const { return this->root().isEmpty(); } + __hostdev__ bool isEmpty() const __global__ { return this->root().isEmpty(); } /// @brief Combines the previous two methods in a single call - __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->root().probeValue(ijk, v); } + __hostdev__ bool probeValue(__global__ const CoordType& ijk, __global__ ValueType& v) const { return this->root().probeValue(ijk, v); } /// @brief Return a const reference to the background value. - __hostdev__ const ValueType& background() const { return this->root().background(); } + __hostdev__ __global__ const ValueType& background() const __global__ { return this->root().background(); } /// @brief Sets the extrema values of all the active values in this tree, i.e. in all nodes of the tree - __hostdev__ void extrema(ValueType& min, ValueType& max) const; + __hostdev__ void extrema(__global__ ValueType& min, __global__ ValueType& max) const __global__; /// @brief Return a const reference to the index bounding box of all the active values in this tree, i.e. in all nodes of the tree - __hostdev__ const BBox& bbox() const { return this->root().bbox(); } + __hostdev__ __global__ const BBox& bbox() const __global__ { return this->root().bbox(); } /// @brief Return the total number of active voxels in this tree. - __hostdev__ uint64_t activeVoxelCount() const { return DataType::mVoxelCount; } + __hostdev__ uint64_t activeVoxelCount() const __global__ { return BASE(mVoxelCount); } /// @brief Return the total number of active tiles at the specified level of the tree. /// /// @details level = 1,2,3 corresponds to active tile count in lower internal nodes, upper /// internal nodes, and the root level. Note active values at the leaf level are /// referred to as active voxels (see activeVoxelCount defined above). 
- __hostdev__ const uint32_t& activeTileCount(uint32_t level) const + __hostdev__ __global__ const uint32_t& activeTileCount(uint32_t level) const __global__ { NANOVDB_ASSERT(level > 0 && level <= 3);// 1, 2, or 3 - return DataType::mTileCount[level - 1]; + return BASE(mTileCount)[level - 1]; } template - __hostdev__ uint32_t nodeCount() const + __hostdev__ uint32_t nodeCount() const __global__ { static_assert(NodeT::LEVEL < 3, "Invalid NodeT"); - return DataType::mNodeCount[NodeT::LEVEL]; + return BASE(mNodeCount)[NodeT::LEVEL]; } - __hostdev__ uint32_t nodeCount(int level) const + __hostdev__ uint32_t nodeCount(int level) const __global__ { NANOVDB_ASSERT(level < 3); - return DataType::mNodeCount[level]; + return BASE(mNodeCount)[level]; } /// @brief return a pointer to the first node of the specified type /// /// @warning Note it may return NULL if no nodes exist template - __hostdev__ NodeT* getFirstNode() + __hostdev__ __global__ NodeT* getFirstNode() __global__ { - const uint64_t offset = DataType::mNodeOffset[NodeT::LEVEL]; + const uint64_t offset = BASE(mNodeOffset)[NodeT::LEVEL]; return offset>0 ? PtrAdd(this, offset) : nullptr; } @@ -2897,9 +3256,9 @@ public: /// /// @warning Note it may return NULL if no nodes exist template - __hostdev__ const NodeT* getFirstNode() const + __hostdev__ __global__ const NodeT* getFirstNode() const __global__ { - const uint64_t offset = DataType::mNodeOffset[NodeT::LEVEL]; + const uint64_t offset = BASE(mNodeOffset)[NodeT::LEVEL]; return offset>0 ? 
PtrAdd(this, offset) : nullptr; } @@ -2907,8 +3266,8 @@ public: /// /// @warning Note it may return NULL if no nodes exist template - __hostdev__ typename NodeTrait::type* - getFirstNode() + __hostdev__ __global__ typename NodeTrait::type* + getFirstNode() __global__ { return this->template getFirstNode::type>(); } @@ -2917,27 +3276,28 @@ public: /// /// @warning Note it may return NULL if no nodes exist template - __hostdev__ const typename NodeTrait::type* - getFirstNode() const + __hostdev__ __global__ const typename NodeTrait::type* + getFirstNode() const __global__ { return this->template getFirstNode::type>(); } /// @brief Template specializations of getFirstNode - __hostdev__ LeafNodeType* getFirstLeaf() {return this->getFirstNode();} - __hostdev__ const LeafNodeType* getFirstLeaf() const {return this->getFirstNode();} - __hostdev__ typename NodeTrait::type* getFirstLower() {return this->getFirstNode<1>();} - __hostdev__ const typename NodeTrait::type* getFirstLower() const {return this->getFirstNode<1>();} - __hostdev__ typename NodeTrait::type* getFirstUpper() {return this->getFirstNode<2>();} - __hostdev__ const typename NodeTrait::type* getFirstUpper() const {return this->getFirstNode<2>();} + __hostdev__ __global__ LeafNodeType* getFirstLeaf() {return this->getFirstNode();} + __hostdev__ __global__ const LeafNodeType* getFirstLeaf() const {return this->getFirstNode();} + __hostdev__ __global__ typename NodeTrait::type* getFirstLower() {return this->getFirstNode<1>();} + __hostdev__ __global__ const typename NodeTrait::type* getFirstLower() const {return this->getFirstNode<1>();} + __hostdev__ __global__ typename NodeTrait::type* getFirstUpper() {return this->getFirstNode<2>();} + __hostdev__ __global__ const typename NodeTrait::type* getFirstUpper() const {return this->getFirstNode<2>();} private: static_assert(sizeof(DataType) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(TreeData) is misaligned"); +#undef BASE }; // Tree class template -__hostdev__ void 
Tree::extrema(ValueType& min, ValueType& max) const +__hostdev__ void Tree::extrema(__global__ ValueType& min, __global__ ValueType& max) const __global__ { min = this->root().minimum(); max = this->root().maximum(); @@ -2955,13 +3315,13 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) RootData using BuildT = typename ChildT::BuildType;// in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool using CoordT = typename ChildT::CoordType; using StatsT = typename ChildT::FloatType; - static constexpr bool FIXED_SIZE = false; + static __constant__ constexpr bool FIXED_SIZE = false; /// @brief Return a key based on the coordinates of a voxel #ifdef USE_SINGLE_ROOT_KEY using KeyT = uint64_t; template - __hostdev__ static KeyT CoordToKey(const CoordType& ijk) + __hostdev__ static KeyT CoordToKey(__global__ const CoordType& ijk) { static_assert(sizeof(CoordT) == sizeof(CoordType), "Mismatching sizeof"); static_assert(32 - ChildT::TOTAL <= 21, "Cannot use 64 bit root keys"); @@ -2969,17 +3329,28 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) RootData (KeyT(uint32_t(ijk[1]) >> ChildT::TOTAL) << 21) | // y is the middle 21 bits (KeyT(uint32_t(ijk[0]) >> ChildT::TOTAL) << 42); // x is the upper 21 bits } - __hostdev__ static CoordT KeyToCoord(const KeyT& key) +#if defined(__KERNEL_METAL__) + template + __hostdev__ static KeyT CoordToKey(__local__ const CoordType& ijk) + { + static_assert(sizeof(CoordT) == sizeof(CoordType), "Mismatching sizeof"); + static_assert(32 - ChildT::TOTAL <= 21, "Cannot use 64 bit root keys"); + return (KeyT(uint32_t(ijk[2]) >> ChildT::TOTAL)) | // z is the lower 21 bits + (KeyT(uint32_t(ijk[1]) >> ChildT::TOTAL) << 21) | // y is the middle 21 bits + (KeyT(uint32_t(ijk[0]) >> ChildT::TOTAL) << 42); // x is the upper 21 bits + } +#endif + static __constant__ constexpr uint64_t MASK = (1u << 21) - 1; + __hostdev__ static CoordT KeyToCoord(__global__ const KeyT& key) { - static constexpr uint64_t MASK = (1u << 21) - 1; 
return CoordT(((key >> 42) & MASK) << ChildT::TOTAL, ((key >> 21) & MASK) << ChildT::TOTAL, (key & MASK) << ChildT::TOTAL); } #else using KeyT = CoordT; - __hostdev__ static KeyT CoordToKey(const CoordT& ijk) { return ijk & ~ChildT::MASK; } - __hostdev__ static CoordT KeyToCoord(const KeyT& key) { return key; } + __hostdev__ static KeyT CoordToKey(__global__ const CoordT& ijk) { return ijk & ~ChildT::MASK; } + __hostdev__ static CoordT KeyToCoord(__global__ const KeyT& key) { return key; } #endif BBox mBBox; // 24B. AABB of active values in index space. uint32_t mTableSize; // 4B. number of tiles and child pointers in the root node @@ -3000,23 +3371,23 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) RootData struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) Tile { template - __hostdev__ void setChild(const CoordType& k, const ChildT *ptr, const RootData *data) + __hostdev__ void setChild(__global__ const CoordType& k, __global__ const ChildT *ptr, __global__ const RootData *data) { key = CoordToKey(k); child = PtrDiff(ptr, data); } template - __hostdev__ void setValue(const CoordType& k, bool s, const ValueType &v) + __hostdev__ void setValue(__global__ const CoordType& k, bool s, __global__ const ValueType &v) { key = CoordToKey(k); state = s; value = v; child = 0; } - __hostdev__ bool isChild() const { return child!=0; } - __hostdev__ bool isValue() const { return child==0; } - __hostdev__ bool isActive() const { return child==0 && state; } - __hostdev__ CoordT origin() const { return KeyToCoord(key); } + __hostdev__ bool isChild() const __global__ { return child!=0; } + __hostdev__ bool isValue() const __global__ { return child==0; } + __hostdev__ bool isActive() const __global__ { return child==0 && state; } + __hostdev__ CoordT origin() const __global__ { return KeyToCoord(key); } KeyT key; // USE_SINGLE_ROOT_KEY ? 8B : 12B int64_t child; // 8B. signed byte offset from this node to the child node. 0 means it is a constant tile, so use value. uint32_t state; // 4B. 
state of tile value @@ -3026,53 +3397,64 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) RootData /// @brief Returns a non-const reference to the tile at the specified linear offset. /// /// @warning The linear offset is assumed to be in the valid range - __hostdev__ const Tile* tile(uint32_t n) const + __hostdev__ __global__ const Tile* tile(uint32_t n) const { NANOVDB_ASSERT(n < mTableSize); - return reinterpret_cast(this + 1) + n; + return reinterpret_cast<__global__ const Tile*>(this + 1) + n; } - __hostdev__ Tile* tile(uint32_t n) + __hostdev__ __global__ Tile* tile(uint32_t n) { NANOVDB_ASSERT(n < mTableSize); - return reinterpret_cast(this + 1) + n; + return reinterpret_cast<__global__ Tile*>(this + 1) + n; } /// @brief Returns a const reference to the child node in the specified tile. /// /// @warning A child node is assumed to exist in the specified tile - __hostdev__ ChildT* getChild(const Tile* tile) + __hostdev__ __global__ ChildT* getChild(__global__ const Tile* tile) __global__ { NANOVDB_ASSERT(tile->child); return PtrAdd(this, tile->child); } - __hostdev__ const ChildT* getChild(const Tile* tile) const + __hostdev__ __global__ const ChildT* getChild(__global__ const Tile* tile) const __global__ { NANOVDB_ASSERT(tile->child); return PtrAdd(this, tile->child); } - __hostdev__ const ValueT& getMin() const { return mMinimum; } - __hostdev__ const ValueT& getMax() const { return mMaximum; } - __hostdev__ const StatsT& average() const { return mAverage; } - __hostdev__ const StatsT& stdDeviation() const { return mStdDevi; } + __hostdev__ __global__ const ValueT& getMin() const { return mMinimum; } + __hostdev__ __global__ const ValueT& getMax() const { return mMaximum; } + __hostdev__ __global__ const StatsT& average() const { return mAverage; } + __hostdev__ __global__ const StatsT& stdDeviation() const { return mStdDevi; } - __hostdev__ void setMin(const ValueT& v) { mMinimum = v; } - __hostdev__ void setMax(const ValueT& v) { mMaximum = v; } - 
__hostdev__ void setAvg(const StatsT& v) { mAverage = v; } - __hostdev__ void setDev(const StatsT& v) { mStdDevi = v; } + __hostdev__ void setMin(__global__ const ValueT& v) { mMinimum = v; } + __hostdev__ void setMax(__global__ const ValueT& v) { mMaximum = v; } + __hostdev__ void setAvg(__global__ const StatsT& v) { mAverage = v; } + __hostdev__ void setDev(__global__ const StatsT& v) { mStdDevi = v; } /// @brief This class cannot be constructed or deleted RootData() = delete; - RootData(const RootData&) = delete; - RootData& operator=(const RootData&) = delete; + RootData(__global__ const RootData&) = delete; + __global__ RootData& operator=(__global__ const RootData&) = delete; ~RootData() = delete; }; // RootData /// @brief Top-most node of the VDB tree structure. template -class RootNode : private RootData +class RootNode +#if !defined(__KERNEL_METAL__) + : private RootData +#endif { public: +#if defined(__KERNEL_METAL__) + + RootData _base; +#define BASE(v) _base.v +#else +#define BASE(v) DataType::v +#endif + using DataType = RootData; using LeafNodeType = typename ChildT::LeafNodeType; using ChildNodeType = ChildT; @@ -3086,27 +3468,27 @@ public: using BBoxType = BBox; using AccessorType = DefaultReadAccessor; using Tile = typename DataType::Tile; - static constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; + static __constant__ constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; - static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf + static __constant__ constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf class ChildIterator { - const DataType *mParent; - uint32_t mPos, mSize; + __global__ const DataType *mParent; + uint32_t mPos, mSize; public: __hostdev__ ChildIterator() : mParent(nullptr), mPos(0), mSize(0) {} - __hostdev__ ChildIterator(const RootNode *parent) : mParent(parent->data()), mPos(0), mSize(parent->tileCount()) { + __hostdev__ ChildIterator(__global__ const RootNode *parent) : mParent(parent->data()), mPos(0), 
mSize(parent->tileCount()) { NANOVDB_ASSERT(mParent); while (mPostile(mPos)->isChild()) ++mPos; } - ChildIterator& operator=(const ChildIterator&) = default; - __hostdev__ const ChildT& operator*() const {NANOVDB_ASSERT(*this); return *mParent->getChild(mParent->tile(mPos));} - __hostdev__ const ChildT* operator->() const {NANOVDB_ASSERT(*this); return mParent->getChild(mParent->tile(mPos));} + __global__ ChildIterator& operator=(__global__ const ChildIterator&) = default; + __hostdev__ __global__ const ChildT& operator*() const {NANOVDB_ASSERT(*this); return *mParent->getChild(mParent->tile(mPos));} + __hostdev__ __global__ const ChildT* operator->() const {NANOVDB_ASSERT(*this); return mParent->getChild(mParent->tile(mPos));} __hostdev__ CoordType getOrigin() const { NANOVDB_ASSERT(*this); mParent->tile(mPos)->origin();} __hostdev__ operator bool() const {return mPos < mSize;} __hostdev__ uint32_t pos() const {return mPos;} - __hostdev__ ChildIterator& operator++() { + __hostdev__ __global__ ChildIterator& operator++() { NANOVDB_ASSERT(mParent); ++mPos; while (mPos < mSize && mParent->tile(mPos)->isValue()) ++mPos; @@ -3123,21 +3505,21 @@ public: class ValueIterator { - const DataType *mParent; - uint32_t mPos, mSize; + __global__ const DataType *mParent; + uint32_t mPos, mSize; public: __hostdev__ ValueIterator() : mParent(nullptr), mPos(0), mSize(0) {} - __hostdev__ ValueIterator(const RootNode *parent) : mParent(parent->data()), mPos(0), mSize(parent->tileCount()){ + __hostdev__ ValueIterator(__global__ const RootNode *parent) : mParent(parent->data()), mPos(0), mSize(parent->tileCount()){ NANOVDB_ASSERT(mParent); while (mPos < mSize && mParent->tile(mPos)->isChild()) ++mPos; } - ValueIterator& operator=(const ValueIterator&) = default; + __global__ ValueIterator& operator=(__global__ const ValueIterator&) = default; __hostdev__ ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->tile(mPos)->value;} __hostdev__ bool isActive() const 
{NANOVDB_ASSERT(*this); return mParent->tile(mPos)->state;} __hostdev__ operator bool() const {return mPos < mSize;} __hostdev__ uint32_t pos() const {return mPos;} __hostdev__ CoordType getOrigin() const { NANOVDB_ASSERT(*this); mParent->tile(mPos)->origin();} - __hostdev__ ValueIterator& operator++() { + __hostdev__ __global__ ValueIterator& operator++() { NANOVDB_ASSERT(mParent); ++mPos; while (mPos < mSize && mParent->tile(mPos)->isChild()) ++mPos; @@ -3154,20 +3536,20 @@ public: class ValueOnIterator { - const DataType *mParent; + __global__ const DataType *mParent; uint32_t mPos, mSize; public: __hostdev__ ValueOnIterator() : mParent(nullptr), mPos(0), mSize(0) {} - __hostdev__ ValueOnIterator(const RootNode *parent) : mParent(parent->data()), mPos(0), mSize(parent->tileCount()){ + __hostdev__ ValueOnIterator(__global__ const RootNode *parent) : mParent(parent->data()), mPos(0), mSize(parent->tileCount()){ NANOVDB_ASSERT(mParent); while (mPos < mSize && !mParent->tile(mPos)->isActive()) ++mPos; } - ValueOnIterator& operator=(const ValueOnIterator&) = default; + __global__ ValueOnIterator& operator=(__global__ const ValueOnIterator&) = default; __hostdev__ ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->tile(mPos)->value;} __hostdev__ operator bool() const {return mPos < mSize;} __hostdev__ uint32_t pos() const {return mPos;} __hostdev__ CoordType getOrigin() const { NANOVDB_ASSERT(*this); mParent->tile(mPos)->origin();} - __hostdev__ ValueOnIterator& operator++() { + __hostdev__ __global__ ValueOnIterator& operator++() { NANOVDB_ASSERT(mParent); ++mPos; while (mPos < mSize && !mParent->tile(mPos)->isActive()) ++mPos; @@ -3183,75 +3565,107 @@ public: ValueOnIterator beginValueOn() const {return ValueOnIterator(this);} /// @brief This class cannot be constructed or deleted - RootNode() = delete; - RootNode(const RootNode&) = delete; - RootNode& operator=(const RootNode&) = delete; - ~RootNode() = delete; + RootNode() __global__ = delete; + 
RootNode(__global__ const RootNode&) __global__ = delete; + __global__ RootNode& operator=(__global__ const RootNode&) __global__ = delete; + ~RootNode() __global__ = delete; - __hostdev__ AccessorType getAccessor() const { return AccessorType(*this); } + __hostdev__ AccessorType getAccessor() const __global__ { return AccessorType(*this); } - __hostdev__ DataType* data() { return reinterpret_cast(this); } + __hostdev__ __global__ DataType* data() __global__ { return reinterpret_cast<__global__ DataType*>(this); } - __hostdev__ const DataType* data() const { return reinterpret_cast(this); } + __hostdev__ __global__ const DataType* data() const __global__ { return reinterpret_cast<__global__ const DataType*>(this); } /// @brief Return a const reference to the index bounding box of all the active values in this tree, i.e. in all nodes of the tree - __hostdev__ const BBoxType& bbox() const { return DataType::mBBox; } + __hostdev__ __global__ const BBoxType& bbox() const __global__ { return BASE(mBBox); } /// @brief Return the total number of active voxels in the root and all its child nodes. /// @brief Return a const reference to the background value, i.e. the value associated with /// any coordinate location that has not been set explicitly. 
- __hostdev__ const ValueType& background() const { return DataType::mBackground; } + __hostdev__ __global__ const ValueType& background() const __global__ { return DataType::mBackground; } /// @brief Return the number of tiles encoded in this root node - __hostdev__ const uint32_t& tileCount() const { return DataType::mTableSize; } + __hostdev__ __global__ const uint32_t& tileCount() const __global__ { return DataType::mTableSize; } /// @brief Return a const reference to the minimum active value encoded in this root node and any of its child nodes - __hostdev__ const ValueType& minimum() const { return this->getMin(); } + __hostdev__ __global__ const ValueType& minimum() const __global__ { return this->getMin(); } /// @brief Return a const reference to the maximum active value encoded in this root node and any of its child nodes - __hostdev__ const ValueType& maximum() const { return this->getMax(); } + __hostdev__ __global__ const ValueType& maximum() const __global__ { return this->getMax(); } /// @brief Return a const reference to the average of all the active values encoded in this root node and any of its child nodes - __hostdev__ const FloatType& average() const { return DataType::mAverage; } + __hostdev__ __global__ const FloatType& average() const __global__ { return DataType::mAverage; } /// @brief Return the variance of all the active values encoded in this root node and any of its child nodes - __hostdev__ FloatType variance() const { return DataType::mStdDevi * DataType::mStdDevi; } + __hostdev__ FloatType variance() const __global__ { return DataType::mStdDevi * DataType::mStdDevi; } /// @brief Return a const reference to the standard deviation of all the active values encoded in this root node and any of its child nodes - __hostdev__ const FloatType& stdDeviation() const { return DataType::mStdDevi; } + __hostdev__ __global__ const FloatType& stdDeviation() const __global__ { return DataType::mStdDevi; } /// @brief Return the expected memory 
footprint in bytes with the specified number of tiles __hostdev__ static uint64_t memUsage(uint32_t tableSize) { return sizeof(RootNode) + tableSize * sizeof(Tile); } /// @brief Return the actual memory footprint of this root node - __hostdev__ uint64_t memUsage() const { return sizeof(RootNode) + DataType::mTableSize * sizeof(Tile); } + __hostdev__ uint64_t memUsage() const __global__ { return sizeof(RootNode) + DataType::mTableSize * sizeof(Tile); } /// @brief Return the value of the given voxel - __hostdev__ ValueType getValue(const CoordType& ijk) const + __hostdev__ ValueType getValue(__global__ const CoordType& ijk) const __global__ { - if (const Tile* tile = this->probeTile(ijk)) { + if (__global__ const Tile* tile = this->probeTile(ijk)) { return tile->isChild() ? this->getChild(tile)->getValue(ijk) : tile->value; } return DataType::mBackground; } - - __hostdev__ bool isActive(const CoordType& ijk) const +#if defined(__KERNEL_METAL__) + __hostdev__ ValueType getValue(__local__ const CoordType& ijk) const __global__ { - if (const Tile* tile = this->probeTile(ijk)) { - return tile->isChild() ? this->getChild(tile)->isActive(ijk) : tile->state; + if (__global__ const Tile* tile = this->findTile(ijk)) { + return tile->isChild() ? this->getChild(tile)->getValue(ijk) : tile->value; + } + return DataType::mBackground; + } + __hostdev__ ValueType getValue(__local__ const CoordType& ijk) const __local__ + { + if (__global__ const Tile* tile = this->findTile(ijk)) { + return tile->isChild() ? this->getChild(tile)->getValue(ijk) : tile->value; + } + return DataType::mBackground; + } +#endif + + __hostdev__ bool isActive(__global__ const CoordType& ijk) const __global__ + { + if (__global__ const Tile* tile = this->findTile(ijk)) { + return tile->isChild() ? 
BASE(getChild)(tile)->isActive(ijk) : tile->state; } return false; } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isActive(__local__ const CoordType& ijk) const __global__ + { + if (__global__ const Tile* tile = this->findTile(ijk)) { + return tile->isChild() ? BASE(getChild)(tile)->isActive(ijk) : tile->state; + } + return false; + } + __hostdev__ bool isActive(__local__ const CoordType& ijk) const __local__ + { + if (__global__ const Tile* tile = this->findTile(ijk)) { + return tile->isChild() ? BASE(getChild)(tile)->isActive(ijk) : tile->state; + } + return false; + } +#endif /// @brief Return true if this RootNode is empty, i.e. contains no values or nodes - __hostdev__ bool isEmpty() const { return DataType::mTableSize == uint32_t(0); } + __hostdev__ bool isEmpty() const __global__ { return BASE(mTableSize) == uint32_t(0); } - __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const + __hostdev__ bool probeValue(__global__ const CoordType& ijk, __global__ ValueType& v) const __global__ { - if (const Tile* tile = this->probeTile(ijk)) { + if (__global__ const Tile* tile = this->probeTile(ijk)) { if (tile->isChild()) { - const auto *child = this->getChild(tile); + __global__ const auto *child = this->getChild(tile); return child->probeValue(ijk, v); } v = tile->value; @@ -3260,33 +3674,49 @@ public: v = DataType::mBackground; return false; } - - __hostdev__ const LeafNodeType* probeLeaf(const CoordType& ijk) const +#if defined(__KERNEL_METAL__) + __hostdev__ bool probeValue(__local__ const CoordType& ijk, __local__ ValueType& v) const __global__ { - const Tile* tile = this->probeTile(ijk); + if (__global__ const Tile* tile = this->findTile(ijk)) { + if (tile->isChild()) { + __global__ const auto *child = BASE(getChild)(tile); + return child->probeValue(ijk, v); + } + v = tile->value; + return tile->state; + } + v = BASE(mBackground); + return false; + } +#endif + + __hostdev__ __global__ const LeafNodeType* probeLeaf(__global__ const CoordType& 
ijk) const + { + __global__ const Tile* tile = this->probeTile(ijk); if (tile && tile->isChild()) { - const auto *child = this->getChild(tile); + const __global__ auto *child = this->getChild(tile); return child->probeLeaf(ijk); } return nullptr; } - __hostdev__ const ChildNodeType* probeChild(const CoordType& ijk) const + __hostdev__ __global__ const ChildNodeType* probeChild(__global__ const CoordType& ijk) const { - const Tile* tile = this->probeTile(ijk); + __global__ const Tile* tile = this->probeTile(ijk); if (tile && tile->isChild()) { return this->getChild(tile); } return nullptr; } + /// @brief Find and return a Tile of this root node - __hostdev__ const Tile* probeTile(const CoordType& ijk) const + __hostdev__ __global__ const Tile* probeTile(__global__ const CoordType& ijk) const __global__ { - const Tile* tiles = reinterpret_cast(this + 1); - const auto key = DataType::CoordToKey(ijk); + __global__ const Tile* tiles = reinterpret_cast<__global__ const Tile*>(this + 1); + const auto key = BASE(CoordToKey)(ijk); #if 1 // switch between linear and binary seach - for (uint32_t i = 0; i < DataType::mTableSize; ++i) { + for (uint32_t i = 0; i < BASE(mTableSize); ++i) { if (tiles[i].key == key) return &tiles[i]; } #else// do not enable binary search if tiles are not guaranteed to be sorted!!!!!! @@ -3306,6 +3736,33 @@ public: #endif return nullptr; } +#if defined(__KERNEL_METAL__) + __hostdev__ __global__ const Tile* findTile(__local__ const CoordType& ijk) const __global__ + { + __global__ const Tile* tiles = reinterpret_cast<__global__ const Tile*>(this + 1); + const auto key = BASE(CoordToKey)(ijk); +#if 1 // switch between linear and binary seach + for (uint32_t i = 0; i < BASE(mTableSize); ++i) { + if (tiles[i].key == key) return &tiles[i]; + } +#else// do not enable binary search if tiles are not guaranteed to be sorted!!!!!! 
+ // binary-search of pre-sorted elements + int32_t low = 0, high = DataType::mTableSize; // low is inclusive and high is exclusive + while (low != high) { + int mid = low + ((high - low) >> 1); + const Tile* tile = &tiles[mid]; + if (tile->key == key) { + return tile; + } else if (tile->key < key) { + low = mid + 1; + } else { + high = mid; + } + } +#endif + return nullptr; + } +#endif private: static_assert(sizeof(DataType) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(RootData) is misaligned"); @@ -3319,12 +3776,12 @@ private: /// @brief Private method to return node information and update a ReadAccessor template - __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(const CoordType& ijk, const AccT& acc) const + __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(__global__ const CoordType& ijk, __global__ const AccT& acc) const { using NodeInfoT = typename AccT::NodeInfo; - if (const Tile* tile = this->probeTile(ijk)) { + if (__global__ const Tile* tile = this->probeTile(ijk)) { if (tile->isChild()) { - const auto *child = this->getChild(tile); + __global__ const auto *child = this->getChild(tile); acc.insert(ijk, child); return child->getNodeInfoAndCache(ijk, acc); } @@ -3337,11 +3794,11 @@ private: /// @brief Private method to return a voxel value and update a ReadAccessor template - __hostdev__ ValueType getValueAndCache(const CoordType& ijk, const AccT& acc) const + __hostdev__ ValueType getValueAndCache(__global__ const CoordType& ijk, __global__ const AccT& acc) const __global__ { - if (const Tile* tile = this->probeTile(ijk)) { + if (__global__ const Tile* tile = this->probeTile(ijk)) { if (tile->isChild()) { - const auto *child = this->getChild(tile); + __global__ const auto *child = this->getChild(tile); acc.insert(ijk, child); return child->getValueAndCache(ijk, acc); } @@ -3349,25 +3806,66 @@ private: } return DataType::mBackground; } +#if defined(__KERNEL_METAL__) + template + __hostdev__ ValueType getValueAndCache(__local__ const CoordType& ijk, 
__local__ const AccT& acc) const __global__ + { + if (__global__ const Tile* tile = this->findTile(ijk)) { + if (tile->isChild()) { + __global__ const auto *child = BASE(getChild)(tile); + acc.insert(ijk, child); + return child->getValueAndCache(ijk, acc); + } + return tile->value; + } + return BASE(mBackground); + } + template + __hostdev__ ValueType getValueAndCache(__local__ const CoordType& ijk, __local__ const AccT& acc) const __local__ + { + if (__global__ const Tile* tile = this->findTile(ijk)) { + if (tile->isChild()) { + __global__ const auto *child = BASE(getChild)(tile); + acc.insert(ijk, child); + return child->getValueAndCache(ijk, acc); + } + return tile->value; + } + return BASE(mBackground); + } +#endif template - __hostdev__ bool isActiveAndCache(const CoordType& ijk, const AccT& acc) const + __hostdev__ bool isActiveAndCache(__global__ const CoordType& ijk, __global__ const AccT& acc) const { - const Tile* tile = this->probeTile(ijk); + __global__ const Tile* tile = this->probeTile(ijk); if (tile && tile->isChild()) { - const auto *child = this->getChild(tile); + __global__ const auto *child = BASE(getChild)(tile); acc.insert(ijk, child); return child->isActiveAndCache(ijk, acc); } return false; } +#if defined(__KERNEL_METAL__) + template + __hostdev__ bool isActiveAndCache(__local__ const CoordType& ijk, __local__ const AccT& acc) const __global__ + { + __global__ const Tile* tile = this->findTile(ijk); + if (tile && tile->isChild()) { + __global__ const auto *child = BASE(getChild)(tile); + acc.insert(ijk, child); + return child->isActiveAndCache(ijk, acc); + } + return false; + } +#endif template - __hostdev__ bool probeValueAndCache(const CoordType& ijk, ValueType& v, const AccT& acc) const + __hostdev__ bool probeValueAndCache(__global__ const CoordType& ijk, __global__ ValueType& v, __global__ const AccT& acc) const { - if (const Tile* tile = this->probeTile(ijk)) { + if (__global__ const Tile* tile = this->probeTile(ijk)) { if 
(tile->isChild()) { - const auto *child = this->getChild(tile); + __global__ const auto *child = BASE(getChild)(tile); acc.insert(ijk, child); return child->probeValueAndCache(ijk, v, acc); } @@ -3379,11 +3877,11 @@ private: } template - __hostdev__ const LeafNodeType* probeLeafAndCache(const CoordType& ijk, const AccT& acc) const + __hostdev__ __global__ const LeafNodeType* probeLeafAndCache(__global__ const CoordType& ijk, __global__ const AccT& acc) const { - const Tile* tile = this->probeTile(ijk); + __global__ const Tile* tile = this->probeTile(ijk); if (tile && tile->isChild()) { - const auto *child = this->getChild(tile); + __global__ const auto *child = BASE(getChild)(tile); acc.insert(ijk, child); return child->probeLeafAndCache(ijk, acc); } @@ -3391,11 +3889,11 @@ private: } template - __hostdev__ uint32_t getDimAndCache(const CoordType& ijk, const RayT& ray, const AccT& acc) const + __hostdev__ uint32_t getDimAndCache(__global__ const CoordType& ijk, __global__ const RayT& ray, __global__ const AccT& acc) const __global__ { - if (const Tile* tile = this->probeTile(ijk)) { + if (__global__ const Tile* tile = this->probeTile(ijk)) { if (tile->isChild()) { - const auto *child = this->getChild(tile); + __global__ const auto *child = BASE(getChild)(tile); acc.insert(ijk, child); return child->getDimAndCache(ijk, ray, acc); } @@ -3403,7 +3901,23 @@ private: } return ChildNodeType::dim(); // background } +#if defined(__KERNEL_METAL__) + template + __hostdev__ uint32_t getDimAndCache(__local__ const CoordType& ijk, __local__ const RayT& ray, __local__ const AccT& acc) const __global__ + { + if (__global__ const Tile* tile = this->probeTile(ijk)) { + if (tile->isChild()) { + __global__ const auto *child = BASE(getChild)(tile); + acc.insert(ijk, child); + return child->getDimAndCache(ijk, ray, acc); + } + return 1 << ChildT::TOTAL; //tile value + } + return ChildNodeType::dim(); // background + } +#endif +#undef BASE }; // RootNode class // After the RootNode the 
memory layout is assumed to be the sorted Tiles @@ -3421,7 +3935,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData using StatsT = typename ChildT::FloatType; using CoordT = typename ChildT::CoordType; using MaskT = typename ChildT::template MaskType; - static constexpr bool FIXED_SIZE = true; + static __constant__ constexpr bool FIXED_SIZE = true; union Tile { @@ -3429,8 +3943,8 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData int64_t child;//signed 64 bit byte offset relative to the InternalData!! /// @brief This class cannot be constructed or deleted Tile() = delete; - Tile(const Tile&) = delete; - Tile& operator=(const Tile&) = delete; + Tile(__global__ const Tile&) = delete; + __global__ Tile& operator=(__global__ const Tile&) = delete; ~Tile() = delete; }; @@ -3456,32 +3970,32 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData __hostdev__ static uint64_t memUsage() { return sizeof(InternalData); } - __hostdev__ void setChild(uint32_t n, const void *ptr) + __hostdev__ void setChild(uint32_t n, __global__ const void *ptr) { NANOVDB_ASSERT(mChildMask.isOn(n)); mTable[n].child = PtrDiff(ptr, this); } template - __hostdev__ void setValue(uint32_t n, const ValueT &v) + __hostdev__ void setValue(uint32_t n, __global__ const ValueT &v) { NANOVDB_ASSERT(!mChildMask.isOn(n)); mTable[n].value = v; } /// @brief Returns a pointer to the child node at the specifed linear offset. 
- __hostdev__ ChildT* getChild(uint32_t n) + __hostdev__ __global__ ChildT* getChild(uint32_t n) __global__ { NANOVDB_ASSERT(mChildMask.isOn(n)); return PtrAdd(this, mTable[n].child); } - __hostdev__ const ChildT* getChild(uint32_t n) const + __hostdev__ __global__ const ChildT* getChild(uint32_t n) const __global__ { NANOVDB_ASSERT(mChildMask.isOn(n)); return PtrAdd(this, mTable[n].child); } - __hostdev__ ValueT getValue(uint32_t n) const + __hostdev__ ValueT getValue(uint32_t n) const __global__ { NANOVDB_ASSERT(!mChildMask.isOn(n)); return mTable[n].value; @@ -3496,29 +4010,38 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData __hostdev__ bool isChild(uint32_t n) const {return mChildMask.isOn(n);} template - __hostdev__ void setOrigin(const T& ijk) { mBBox[0] = ijk; } + __hostdev__ void setOrigin(__global__ const T& ijk) { mBBox[0] = ijk; } - __hostdev__ const ValueT& getMin() const { return mMinimum; } - __hostdev__ const ValueT& getMax() const { return mMaximum; } - __hostdev__ const StatsT& average() const { return mAverage; } - __hostdev__ const StatsT& stdDeviation() const { return mStdDevi; } + __hostdev__ __global__ const ValueT& getMin() const { return mMinimum; } + __hostdev__ __global__ const ValueT& getMax() const { return mMaximum; } + __hostdev__ __global__ const StatsT& average() const { return mAverage; } + __hostdev__ __global__ const StatsT& stdDeviation() const { return mStdDevi; } - __hostdev__ void setMin(const ValueT& v) { mMinimum = v; } - __hostdev__ void setMax(const ValueT& v) { mMaximum = v; } - __hostdev__ void setAvg(const StatsT& v) { mAverage = v; } - __hostdev__ void setDev(const StatsT& v) { mStdDevi = v; } + __hostdev__ void setMin(__global__ const ValueT& v) { mMinimum = v; } + __hostdev__ void setMax(__global__ const ValueT& v) { mMaximum = v; } + __hostdev__ void setAvg(__global__ const StatsT& v) { mAverage = v; } + __hostdev__ void setDev(__global__ const StatsT& v) { mStdDevi = v; } /// @brief This class cannot be 
constructed or deleted InternalData() = delete; - InternalData(const InternalData&) = delete; - InternalData& operator=(const InternalData&) = delete; + InternalData(__global__ const InternalData&) = delete; + __global__ InternalData& operator=(__global__ const InternalData&) = delete; ~InternalData() = delete; }; // InternalData /// @brief Internal nodes of a VDB treedim(), template -class InternalNode : private InternalData +class InternalNode +#if !defined(__KERNEL_METAL__) + : private InternalData +#endif { +#if defined(__KERNEL_METAL__) + InternalData _base; +#define BASE(v) _base.v +#else +#define BASE(v) DataType::v +#endif public: using DataType = InternalData; using ValueType = typename DataType::ValueT; @@ -3527,76 +4050,109 @@ public: using LeafNodeType = typename ChildT::LeafNodeType; using ChildNodeType = ChildT; using CoordType = typename ChildT::CoordType; - static constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; + static __constant__ constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; template using MaskType = typename ChildT::template MaskType; template using MaskIterT = typename Mask::template Iterator; - static constexpr uint32_t LOG2DIM = Log2Dim; - static constexpr uint32_t TOTAL = LOG2DIM + ChildT::TOTAL; // dimension in index space - static constexpr uint32_t DIM = 1u << TOTAL; // number of voxels along each axis of this node - static constexpr uint32_t SIZE = 1u << (3 * LOG2DIM); // number of tile values (or child pointers) - static constexpr uint32_t MASK = (1u << TOTAL) - 1u; - static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf - static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + static __constant__ constexpr uint32_t LOG2DIM = Log2Dim; + static __constant__ constexpr uint32_t TOTAL = LOG2DIM + ChildT::TOTAL; // dimension in index space + static __constant__ constexpr uint32_t DIM = 1u << TOTAL; // number of voxels along each axis of this node + static 
__constant__ constexpr uint32_t SIZE = 1u << (3 * LOG2DIM); // number of tile values (or child pointers) + static __constant__ constexpr uint32_t MASK = (1u << TOTAL) - 1u; + static __constant__ constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf + static __constant__ constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node /// @brief Visits child nodes of this node only - class ChildIterator : public MaskIterT + class ChildIterator +#if !defined (__KERNEL_METAL__) + : public MaskIterT +#endif { +#if defined (__KERNEL_METAL__) + MaskIterT BaseT; +#define BASE(v) BaseT.v +#else using BaseT = MaskIterT; - const DataType *mParent; +#define BASE(v) BaseT::v +#endif + __global__ const DataType *mParent; public: __hostdev__ ChildIterator() : BaseT(), mParent(nullptr) {} - __hostdev__ ChildIterator(const InternalNode* parent) : BaseT(parent->data()->mChildMask.beginOn()), mParent(parent->data()) {} - ChildIterator& operator=(const ChildIterator&) = default; - __hostdev__ const ChildT& operator*() const {NANOVDB_ASSERT(*this); return *mParent->getChild(BaseT::pos());} - __hostdev__ const ChildT* operator->() const {NANOVDB_ASSERT(*this); return mParent->getChild(BaseT::pos());} + __hostdev__ ChildIterator(__global__ const InternalNode* parent) : BaseT(parent->data()->mChildMask.beginOn()), mParent(parent->data()) {} + __global__ ChildIterator& operator=(__global__ const ChildIterator&) = default; + __hostdev__ __global__ const ChildT& operator*() const {NANOVDB_ASSERT(*this); return *mParent->getChild(BASE(pos)());} + __hostdev__ __global__ const ChildT* operator->() const {NANOVDB_ASSERT(*this); return mParent->getChild(BASE(pos)());} __hostdev__ CoordType getOrigin() const { NANOVDB_ASSERT(*this); return (*this)->origin();} }; // Member class ChildIterator ChildIterator beginChild() const {return ChildIterator(this);} /// @brief Visits all tile values in this node, i.e. 
both inactive and active tiles - class ValueIterator : public MaskIterT + class ValueIterator +#if !defined (__KERNEL_METAL__) + : public MaskIterT +#endif { +#if defined (__KERNEL_METAL__) + MaskIterT BaseT; +#define BASE(v) BaseT.v +#else using BaseT = MaskIterT; - const InternalNode *mParent; +#define BASE(v) BaseT::v +#endif + __global__ const InternalNode *mParent; public: __hostdev__ ValueIterator() : BaseT(), mParent(nullptr) {} - __hostdev__ ValueIterator(const InternalNode* parent) : BaseT(parent->data()->mChildMask.beginOff()), mParent(parent) {} - ValueIterator& operator=(const ValueIterator&) = default; - __hostdev__ ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->data()->getValue(BaseT::pos());} - __hostdev__ CoordType getOrigin() const { NANOVDB_ASSERT(*this); return mParent->localToGlobalCoord(BaseT::pos());} - __hostdev__ bool isActive() const { NANOVDB_ASSERT(*this); return mParent->data()->isActive(BaseT::mPos);} + __hostdev__ ValueIterator(__global__ const InternalNode* parent) : BaseT(parent->data()->mChildMask.beginOff()), mParent(parent) {} + __global__ ValueIterator& operator=(__global__ const ValueIterator&) = default; + __hostdev__ ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->data()->getValue(BASE(pos)());} + __hostdev__ CoordType getOrigin() const { NANOVDB_ASSERT(*this); return mParent->localToGlobalCoord(BASE(pos)());} + __hostdev__ bool isActive() const { NANOVDB_ASSERT(*this); return mParent->data()->isActive(BASE(mPos));} }; // Member class ValueIterator ValueIterator beginValue() const {return ValueIterator(this);} /// @brief Visits active tile values of this node only - class ValueOnIterator : public MaskIterT + class ValueOnIterator +#if !defined (__KERNEL_METAL__) + : public MaskIterT +#endif { +#if defined (__KERNEL_METAL__) + MaskIterT BaseT; +#define BASE(v) BaseT.v +#else using BaseT = MaskIterT; - const InternalNode *mParent; +#define BASE(v) BaseT::v +#endif + __global__ const 
InternalNode *mParent; public: __hostdev__ ValueOnIterator() : BaseT(), mParent(nullptr) {} - __hostdev__ ValueOnIterator(const InternalNode* parent) : BaseT(parent->data()->mValueMask.beginOn()), mParent(parent) {} - ValueOnIterator& operator=(const ValueOnIterator&) = default; - __hostdev__ ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->data()->getValue(BaseT::pos());} - __hostdev__ CoordType getOrigin() const { NANOVDB_ASSERT(*this); return mParent->localToGlobalCoord(BaseT::pos());} + __hostdev__ ValueOnIterator(__global__ const InternalNode* parent) : BaseT(parent->data()->mValueMask.beginOn()), mParent(parent) {} + __global__ ValueOnIterator& operator=(__global__ const ValueOnIterator&) = default; + __hostdev__ ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->data()->getValue(BASE(pos)());} + __hostdev__ CoordType getOrigin() const { NANOVDB_ASSERT(*this); return mParent->localToGlobalCoord(BASE(pos)());} }; // Member class ValueOnIterator ValueOnIterator beginValueOn() const {return ValueOnIterator(this);} +#if defined(__KERNEL_METAL__) +#define BASE(v) _base.v +#else +#define BASE(v) DataType::v +#endif + /// @brief This class cannot be constructed or deleted - InternalNode() = delete; - InternalNode(const InternalNode&) = delete; - InternalNode& operator=(const InternalNode&) = delete; + InternalNode() __global__ = delete; + InternalNode(__global__ const InternalNode&) __global__ = delete; + __global__ InternalNode& operator=(__global__ const InternalNode&) __global__ = delete; ~InternalNode() = delete; - __hostdev__ DataType* data() { return reinterpret_cast(this); } + __hostdev__ __global__ DataType* data() __global__ { return reinterpret_cast<__global__ DataType*>(this); } - __hostdev__ const DataType* data() const { return reinterpret_cast(this); } + __hostdev__ __global__ const DataType* data() const __global__ { return reinterpret_cast<__global__ const DataType*>(this); } /// @brief Return the dimension, in 
voxel units, of this internal node (typically 8*16 or 8*16*32) __hostdev__ static uint32_t dim() { return 1u << TOTAL; } @@ -3605,47 +4161,66 @@ public: __hostdev__ static size_t memUsage() { return DataType::memUsage(); } /// @brief Return a const reference to the bit mask of active voxels in this internal node - __hostdev__ const MaskType& valueMask() const { return DataType::mValueMask; } + __hostdev__ __global__ const MaskType& valueMask() const __global__ { return BASE(mValueMask); } /// @brief Return a const reference to the bit mask of child nodes in this internal node - __hostdev__ const MaskType& childMask() const { return DataType::mChildMask; } + __hostdev__ __global__ const MaskType& childMask() const __global__ { return DataType::mChildMask; } /// @brief Return the origin in index space of this leaf node - __hostdev__ CoordType origin() const { return DataType::mBBox.min() & ~MASK; } + __hostdev__ CoordType origin() const __global__ { return DataType::mBBox.min() & ~MASK; } /// @brief Return a const reference to the minimum active value encoded in this internal node and any of its child nodes - __hostdev__ const ValueType& minimum() const { return this->getMin(); } + __hostdev__ __global__ const ValueType& minimum() const __global__ { return this->getMin(); } /// @brief Return a const reference to the maximum active value encoded in this internal node and any of its child nodes - __hostdev__ const ValueType& maximum() const { return this->getMax(); } + __hostdev__ __global__ const ValueType& maximum() const __global__ { return this->getMax(); } /// @brief Return a const reference to the average of all the active values encoded in this internal node and any of its child nodes - __hostdev__ const FloatType& average() const { return DataType::mAverage; } + __hostdev__ __global__ const FloatType& average() const __global__ { return DataType::mAverage; } /// @brief Return the variance of all the active values encoded in this internal node and any of its 
child nodes - __hostdev__ FloatType variance() const { return DataType::mStdDevi*DataType::mStdDevi; } + __hostdev__ FloatType variance() const __global__ { return DataType::mStdDevi*DataType::mStdDevi; } /// @brief Return a const reference to the standard deviation of all the active values encoded in this internal node and any of its child nodes - __hostdev__ const FloatType& stdDeviation() const { return DataType::mStdDevi; } + __hostdev__ __global__ const FloatType& stdDeviation() const __global__ { return DataType::mStdDevi; } /// @brief Return a const reference to the bounding box in index space of active values in this internal node and any of its child nodes - __hostdev__ const BBox& bbox() const { return DataType::mBBox; } + __hostdev__ __global__ const BBox& bbox() const __global__ { return DataType::mBBox; } /// @brief Return the value of the given voxel - __hostdev__ ValueType getValue(const CoordType& ijk) const + __hostdev__ ValueType getValue(__global__ const CoordType& ijk) const __global__ { const uint32_t n = CoordToOffset(ijk); return DataType::mChildMask.isOn(n) ? this->getChild(n)->getValue(ijk) : DataType::getValue(n); } +#if defined(__KERNEL_METAL__) + __hostdev__ ValueType getValue(__local__ const CoordType& ijk) const __global__ + { + const uint32_t n = CoordToOffset(ijk); + return DataType::mChildMask.isOn(n) ? this->getChild(n)->getValue(ijk) : DataType::mTable[n].value; + } +#endif - __hostdev__ bool isActive(const CoordType& ijk) const + __hostdev__ bool isActive(__global__ const CoordType& ijk) const __global__ { const uint32_t n = CoordToOffset(ijk); return DataType::mChildMask.isOn(n) ? this->getChild(n)->isActive(ijk) : DataType::isActive(n); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isActive(__local__ const CoordType& ijk) const __global__ + { + const uint32_t n = CoordToOffset(ijk); + return DataType::mChildMask.isOn(n) ? 
this->getChild(n)->isActive(ijk) : DataType::isActive(n); + } + __hostdev__ bool isActive(__local__ const CoordType& ijk) const __local__ + { + const uint32_t n = CoordToOffset(ijk); + return DataType::mChildMask.isOn(n) ? this->getChild(n)->isActive(ijk) : DataType::isActive(n); + } +#endif /// @brief return the state and updates the value of the specified voxel - __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const + __hostdev__ bool probeValue(__global__ const CoordType& ijk, __global__ ValueType& v) const __global__ { const uint32_t n = CoordToOffset(ijk); if (DataType::mChildMask.isOn(n)) @@ -3653,8 +4228,18 @@ public: v = DataType::getValue(n); return DataType::isActive(n); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool probeValue(__local__ const CoordType& ijk, __local__ ValueType& v) const __global__ + { + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOn(n)) + return this->getChild(n)->probeValue(ijk, v); + v = DataType::getValue(n); + return DataType::isActive(n); + } +#endif - __hostdev__ const LeafNodeType* probeLeaf(const CoordType& ijk) const + __hostdev__ __global__ const LeafNodeType* probeLeaf(__global__ const CoordType& ijk) const __global__ { const uint32_t n = CoordToOffset(ijk); if (DataType::mChildMask.isOn(n)) @@ -3662,14 +4247,14 @@ public: return nullptr; } - __hostdev__ const ChildNodeType* probeChild(const CoordType& ijk) const + __hostdev__ __global__ const ChildNodeType* probeChild(__global__ const CoordType& ijk) const __global__ { const uint32_t n = CoordToOffset(ijk); return DataType::mChildMask.isOn(n) ? 
this->getChild(n) : nullptr; } /// @brief Return the linear offset corresponding to the given coordinate - __hostdev__ static uint32_t CoordToOffset(const CoordType& ijk) + __hostdev__ static uint32_t CoordToOffset(__global__ const CoordType& ijk) { #if 0 return (((ijk[0] & MASK) >> ChildT::TOTAL) << (2 * LOG2DIM)) + @@ -3681,6 +4266,20 @@ public: ((ijk[2] & MASK) >> ChildT::TOTAL); #endif } +#if defined(__KERNEL_METAL__) + __hostdev__ static uint32_t CoordToOffset(__local__ const CoordType& ijk) + { +#if 0 + return (((ijk[0] & MASK) >> ChildT::TOTAL) << (2 * LOG2DIM)) + + (((ijk[1] & MASK) >> ChildT::TOTAL) << (LOG2DIM)) + + ((ijk[2] & MASK) >> ChildT::TOTAL); +#else + return (((ijk[0] & MASK) >> ChildT::TOTAL) << (2 * LOG2DIM)) | + (((ijk[1] & MASK) >> ChildT::TOTAL) << (LOG2DIM)) | + ((ijk[2] & MASK) >> ChildT::TOTAL); +#endif + } +#endif /// @return the local coordinate of the n'th tile or child node __hostdev__ static Coord OffsetToLocalCoord(uint32_t n) @@ -3691,13 +4290,13 @@ public: } /// @brief modifies local coordinates to global coordinates of a tile or child node - __hostdev__ void localToGlobalCoord(Coord& ijk) const + __hostdev__ void localToGlobalCoord(__global__ Coord& ijk) const __global__ { ijk <<= ChildT::TOTAL; ijk += this->origin(); } - __hostdev__ Coord offsetToGlobalCoord(uint32_t n) const + __hostdev__ Coord offsetToGlobalCoord(uint32_t n) const __global__ { Coord ijk = InternalNode::OffsetToLocalCoord(n); this->localToGlobalCoord(ijk); @@ -3705,13 +4304,24 @@ public: } /// @brief Return true if this node or any of its child nodes contain active values - __hostdev__ bool isActive() const + __hostdev__ bool isActive() const __global__ { return DataType::mFlags & uint32_t(2); } +#if defined(__KERNEL_METAL__) + /// @brief Return true if this node or any of its child nodes contain active values + __hostdev__ bool isActive() const __local__ + { + return DataType::mFlags & uint32_t(2); + } +#endif private: +#if !defined(__KERNEL_METAL__)
static_assert(sizeof(DataType) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(InternalData) is misaligned"); +#else + static_assert(sizeof(_base) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(InternalData) is misaligned"); +#endif //static_assert(offsetof(DataType, mTable) % 32 == 0, "InternalData::mTable is misaligned"); template @@ -3724,18 +4334,30 @@ private: /// @brief Private read access method used by the ReadAccessor template - __hostdev__ ValueType getValueAndCache(const CoordType& ijk, const AccT& acc) const + __hostdev__ ValueType getValueAndCache(__global__ const CoordType& ijk, __global__ const AccT& acc) const __global__ { const uint32_t n = CoordToOffset(ijk); if (!DataType::mChildMask.isOn(n)) - return DataType::getValue(n); - const ChildT* child = this->getChild(n); + return BASE(getValue)(n); + __global__ const ChildT* child = BASE(getChild)(n); acc.insert(ijk, child); return child->getValueAndCache(ijk, acc); } +#if defined(__KERNEL_METAL__) + template + __hostdev__ ValueType getValueAndCache(__local__ const CoordType& ijk, __local__ const AccT& acc) const __global__ + { + const uint32_t n = CoordToOffset(ijk); + if (!BASE(mChildMask).isOn(n)) + return BASE(getValue)(n); + __global__ const ChildT* child = BASE(getChild)(n); + acc.insert(ijk, child); + return child->getValueAndCache(ijk, acc); + } +#endif template - __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(const CoordType& ijk, const AccT& acc) const + __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(__global__ const CoordType& ijk, __global__ const AccT& acc) const __global__ { using NodeInfoT = typename AccT::NodeInfo; const uint32_t n = CoordToOffset(ijk); @@ -3743,61 +4365,91 @@ private: return NodeInfoT{LEVEL, this->dim(), this->minimum(), this->maximum(), this->average(), this->stdDeviation(), this->bbox()[0], this->bbox()[1]}; } - const ChildT* child = this->getChild(n); + __global__ const ChildT* child = BASE(getChild)(n); acc.insert(ijk, child); return 
child->getNodeInfoAndCache(ijk, acc); } template - __hostdev__ bool isActiveAndCache(const CoordType& ijk, const AccT& acc) const + __hostdev__ bool isActiveAndCache(__global__ const CoordType& ijk, __global__ const AccT& acc) const __global__ { const uint32_t n = CoordToOffset(ijk); if (!DataType::mChildMask.isOn(n)) return DataType::isActive(n); - const ChildT* child = this->getChild(n); + __global__ const ChildT* child = BASE(getChild)(n); acc.insert(ijk, child); return child->isActiveAndCache(ijk, acc); } +#if defined(__KERNEL_METAL__) + template + __hostdev__ bool isActiveAndCache(__local__ const CoordType& ijk, __local__ const AccT& acc) const __global__ + { + const uint32_t n = CoordToOffset(ijk); + if (!BASE(mChildMask).isOn(n)) + return BASE(mValueMask).isOn(n); + __global__ const ChildT* child = BASE(getChild)(n); + acc.insert(ijk, child); + return child->isActiveAndCache(ijk, acc); + } +#endif template - __hostdev__ bool probeValueAndCache(const CoordType& ijk, ValueType& v, const AccT& acc) const + __hostdev__ bool probeValueAndCache(__global__ const CoordType& ijk, __global__ ValueType& v, __global__ const AccT& acc) const __global__ { const uint32_t n = CoordToOffset(ijk); if (!DataType::mChildMask.isOn(n)) { v = DataType::getValue(n); return DataType::isActive(n); } - const ChildT* child = this->getChild(n); + __global__ const ChildT* child = BASE(getChild)(n); acc.insert(ijk, child); return child->probeValueAndCache(ijk, v, acc); } template - __hostdev__ const LeafNodeType* probeLeafAndCache(const CoordType& ijk, const AccT& acc) const + __hostdev__ __global__ const LeafNodeType* probeLeafAndCache(__global__ const CoordType& ijk, __global__ const AccT& acc) const __global__ { const uint32_t n = CoordToOffset(ijk); if (!DataType::mChildMask.isOn(n)) return nullptr; - const ChildT* child = this->getChild(n); + __global__ const ChildT* child = BASE(getChild)(n); acc.insert(ijk, child); return child->probeLeafAndCache(ijk, acc); } template - __hostdev__ 
uint32_t getDimAndCache(const CoordType& ijk, const RayT& ray, const AccT& acc) const + __hostdev__ uint32_t getDimAndCache(__global__ const CoordType& ijk, __global__ const RayT& ray, __global__ const AccT& acc) const __global__ { if (DataType::mFlags & uint32_t(1u)) return this->dim(); // skip this node if the 1st bit is set //if (!ray.intersects( this->bbox() )) return 1<getChild(n); + __global__ const ChildT* child = BASE(getChild)(n); acc.insert(ijk, child); return child->getDimAndCache(ijk, ray, acc); } return ChildNodeType::dim(); // tile value } +#if defined(__KERNEL_METAL__) + template + __hostdev__ uint32_t getDimAndCache(__local__ const CoordType& ijk, __local__ const RayT& ray, __local__ const AccT& acc) const __global__ + { + if (BASE(mFlags) & uint32_t(1)) + return this->dim(); // skip this node if first bit is set + //if (!ray.intersects( this->bbox() )) return 1<getDimAndCache(ijk, ray, acc); + } + return ChildNodeType::dim(); // tile value + } +#endif + +#undef BASE }; // InternalNode class // --------------------------> LeafNode <------------------------------------ @@ -3814,7 +4466,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData using BuildType = ValueT; using FloatType = typename FloatTraits::FloatType; using ArrayType = ValueT;// type used for the internal mValue array - static constexpr bool FIXED_SIZE = true; + static __constant__ constexpr bool FIXED_SIZE = true; CoordT mBBoxMin; // 12B. uint8_t mBBoxDif[3]; // 3B.
@@ -3826,7 +4478,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData FloatType mAverage; // typically 4B, average of all the active values in this node and its child nodes FloatType mStdDevi; // typically 4B, standard deviation of all the active values in this node and its child nodes alignas(32) ValueType mValues[1u << 3 * LOG2DIM]; - + /// @brief Return padding of this class in bytes, due to aliasing and 32B alignment /// /// @note The extra bytes are not necessarily at the end, but can come from aliasing of individual data members. @@ -3838,32 +4490,35 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } //__hostdev__ const ValueType* values() const { return mValues; } - __hostdev__ ValueType getValue(uint32_t i) const { return mValues[i]; } - __hostdev__ void setValueOnly(uint32_t offset, const ValueType& value) { mValues[offset] = value; } - __hostdev__ void setValue(uint32_t offset, const ValueType& value) + __hostdev__ ValueType getValue(uint32_t i) const __global__ { return mValues[i]; } +#if defined(__KERNEL_METAL__) + __hostdev__ ValueType getValue(uint32_t i) const __local__ { return mValues[i]; } +#endif + __hostdev__ void setValueOnly(uint32_t offset, __global__ const ValueType& value) __global__ { mValues[offset] = value; } + __hostdev__ void setValue(uint32_t offset, __global__ const ValueType& value) __global__ { mValueMask.setOn(offset); mValues[offset] = value; } - __hostdev__ ValueType getMin() const { return mMinimum; } - __hostdev__ ValueType getMax() const { return mMaximum; } - __hostdev__ FloatType getAvg() const { return mAverage; } - __hostdev__ FloatType getDev() const { return mStdDevi; } + __hostdev__ ValueType getMin() const __global__ { return mMinimum; } + __hostdev__ ValueType getMax() const __global__ { return mMaximum; } + __hostdev__ FloatType getAvg() const __global__ { return mAverage; } + __hostdev__ FloatType getDev() const __global__ { return 
mStdDevi; } - __hostdev__ void setMin(const ValueType& v) { mMinimum = v; } - __hostdev__ void setMax(const ValueType& v) { mMaximum = v; } - __hostdev__ void setAvg(const FloatType& v) { mAverage = v; } - __hostdev__ void setDev(const FloatType& v) { mStdDevi = v; } + __hostdev__ void setMin(__global__ const ValueType& v) __global__ { mMinimum = v; } + __hostdev__ void setMax(__global__ const ValueType& v) __global__ { mMaximum = v; } + __hostdev__ void setAvg(__global__ const FloatType& v) __global__ { mAverage = v; } + __hostdev__ void setDev(__global__ const FloatType& v) __global__ { mStdDevi = v; } template - __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + __hostdev__ void setOrigin(__global__ const T& ijk) __global__ { mBBoxMin = ijk; } /// @brief This class cannot be constructed or deleted - LeafData() = delete; - LeafData(const LeafData&) = delete; - LeafData& operator=(const LeafData&) = delete; - ~LeafData() = delete; + LeafData() __global__ = delete; + LeafData(__global__ const LeafData&) __global__ = delete; + __global__ LeafData& operator=(__global__ const LeafData&) __global__ = delete; + ~LeafData() __global__ = delete; }; // LeafData /// @brief Base-class for quantized float leaf nodes @@ -3892,39 +4547,39 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafFnBase __hostdev__ static constexpr uint32_t padding() { return sizeof(LeafFnBase) - (12 + 3 + 1 + sizeof(MaskT) + 2*4 + 4*2); } - __hostdev__ void init(float min, float max, uint8_t bitWidth) + __hostdev__ void init(float min, float max, uint8_t bitWidth) __global__ { mMinimum = min; mQuantum = (max - min)/float((1 << bitWidth)-1); } /// @brief return the quantized minimum of the active values in this node - __hostdev__ float getMin() const { return mMin*mQuantum + mMinimum; } + __hostdev__ float getMin() const __global__ { return mMin*mQuantum + mMinimum; } /// @brief return the quantized maximum of the active values in this node - __hostdev__ float getMax() const { return 
mMax*mQuantum + mMinimum; } + __hostdev__ float getMax() const __global__ { return mMax*mQuantum + mMinimum; } /// @brief return the quantized average of the active values in this node - __hostdev__ float getAvg() const { return mAvg*mQuantum + mMinimum; } + __hostdev__ float getAvg() const __global__ { return mAvg*mQuantum + mMinimum; } /// @brief return the quantized standard deviation of the active values in this node /// @note 0 <= StdDev <= max-min or 0 <= StdDev/(max-min) <= 1 - __hostdev__ float getDev() const { return mDev*mQuantum; } + __hostdev__ float getDev() const __global__ { return mDev*mQuantum; } /// @note min <= X <= max or 0 <= (X-min)/(min-max) <= 1 - __hostdev__ void setMin(float min) { mMin = uint16_t((min - mMinimum)/mQuantum + 0.5f); } + __hostdev__ void setMin(float min) __global__ { mMin = uint16_t((min - mMinimum)/mQuantum + 0.5f); } /// @note min <= X <= max or 0 <= (X-min)/(min-max) <= 1 - __hostdev__ void setMax(float max) { mMax = uint16_t((max - mMinimum)/mQuantum + 0.5f); } + __hostdev__ void setMax(float max) __global__ { mMax = uint16_t((max - mMinimum)/mQuantum + 0.5f); } /// @note min <= avg <= max or 0 <= (avg-min)/(min-max) <= 1 - __hostdev__ void setAvg(float avg) { mAvg = uint16_t((avg - mMinimum)/mQuantum + 0.5f); } + __hostdev__ void setAvg(float avg) __global__ { mAvg = uint16_t((avg - mMinimum)/mQuantum + 0.5f); } /// @note 0 <= StdDev <= max-min or 0 <= StdDev/(max-min) <= 1 - __hostdev__ void setDev(float dev) { mDev = uint16_t(dev/mQuantum + 0.5f); } + __hostdev__ void setDev(float dev) __global__ { mDev = uint16_t(dev/mQuantum + 0.5f); } template - __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + __hostdev__ void setOrigin(__global__ const T& ijk) __global__ { mBBoxMin = ijk; } };// LeafFnBase /// @brief Stuct with all the member data of the LeafNode (useful during serialization of an openvdb LeafNode) @@ -3932,12 +4587,24 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafFnBase /// @note No client 
code should (or can) interface with this struct so it can safely be ignored! template class MaskT, uint32_t LOG2DIM> struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +#if !defined(__KERNEL_METAL__) : public LeafFnBase +#endif { +#if defined(__KERNEL_METAL__) + LeafFnBase _base; +#define BASE(v) _base.v +#else +#define BASE(v) BaseT::v +#endif using BaseT = LeafFnBase; using BuildType = Fp4; using ArrayType = uint8_t;// type used for the internal mValue array - static constexpr bool FIXED_SIZE = true; +#if defined(__KERNEL_METAL__) + using ValueType = typename BaseT::ValueType; + using FloatType = typename BaseT::FloatType; +#endif + static __constant__ constexpr bool FIXED_SIZE = true; alignas(32) uint8_t mCode[1u << (3 * LOG2DIM - 1)];// LeafFnBase is 32B aligned and so is mCode __hostdev__ static constexpr uint64_t memUsage() { return sizeof(LeafData); } @@ -3947,31 +4614,53 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData>1]; return ( (i&1) ? c >> 4 : c & uint8_t(15) )*BaseT::mQuantum + BaseT::mMinimum; #else - return ((mCode[i>>1] >> ((i&1)<<2)) & uint8_t(15))*BaseT::mQuantum + BaseT::mMinimum; + return ((mCode[i>>1] >> ((i&1)<<2)) & uint8_t(15))*BASE(mQuantum) + BASE(mMinimum); #endif } +#endif +#if defined(__KERNEL_METAL__) +__hostdev__ float getValue(uint32_t i) const __local__ + { +#if 0 + const uint8_t c = mCode[i>>1]; + return ( (i&1) ? 
c >> 4 : c & uint8_t(15) )*BaseT::mQuantum + BaseT::mMinimum; +#else + return ((mCode[i>>1] >> ((i&1)<<2)) & uint8_t(15))*BASE(mQuantum) + BASE(mMinimum); +#endif + } +#endif /// @brief This class cannot be constructed or deleted - LeafData() = delete; - LeafData(const LeafData&) = delete; - LeafData& operator=(const LeafData&) = delete; - ~LeafData() = delete; + LeafData() __global__ = delete; + LeafData(__global__ const LeafData&) __global__ = delete; + __global__ LeafData& operator=(__global__ const LeafData&) __global__ = delete; + ~LeafData() __global__ = delete; +#undef BASE }; // LeafData template class MaskT, uint32_t LOG2DIM> struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +#if !defined(__KERNEL_METAL__) : public LeafFnBase +#endif { +#if defined(__KERNEL_METAL__) + LeafFnBase _base; +#define BASE(v) _base.v +#else +#define BASE(v) BaseT::v +#endif using BaseT = LeafFnBase; using BuildType = Fp8; using ArrayType = uint8_t;// type used for the internal mValue array - static constexpr bool FIXED_SIZE = true; + static __constant__ constexpr bool FIXED_SIZE = true; alignas(32) uint8_t mCode[1u << 3 * LOG2DIM]; __hostdev__ static constexpr int64_t memUsage() { return sizeof(LeafData); } __hostdev__ static constexpr uint32_t padding() { @@ -3980,25 +4669,44 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData template class MaskT, uint32_t LOG2DIM> struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +#if !defined(__KERNEL_METAL__) : public LeafFnBase +#endif { +#if defined(__KERNEL_METAL__) + LeafFnBase _base; +#define BASE(v) _base.v +#else +#define BASE(v) BaseT::v +#endif using BaseT = LeafFnBase; using BuildType = Fp16; using ArrayType = uint16_t;// type used for the internal mValue array - static constexpr bool FIXED_SIZE = true; +#if defined(__KERNEL_METAL__) + using ValueType = typename BaseT::ValueType; + using FloatType = typename BaseT::FloatType; +#endif + static __constant__ constexpr bool FIXED_SIZE = true; alignas(32) uint16_t mCode[1u 
<< 3 * LOG2DIM]; __hostdev__ static constexpr uint64_t memUsage() { return sizeof(LeafData); } @@ -4008,35 +4716,93 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData template class MaskT, uint32_t LOG2DIM> struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +#if !defined(__KERNEL_METAL__) : public LeafFnBase +#endif {// this class has no data members, however every instance is immediately followed // bitWidth*64 bytes. Since its base class is 32B aligned so are the bitWidth*64 bytes +#if defined(__KERNEL_METAL__) + LeafFnBase _base; +#define BASE(v) _base.v +#else +#define BASE(v) BaseT::v +#endif using BaseT = LeafFnBase; using BuildType = FpN; - static constexpr bool FIXED_SIZE = false; + static __constant__ constexpr bool FIXED_SIZE = false; +#if defined(__KERNEL_METAL__) + using ValueType = typename BaseT::ValueType; + using FloatType = typename BaseT::FloatType; +#endif __hostdev__ static constexpr uint32_t padding() { static_assert(BaseT::padding()==0, "expected no padding in LeafFnBase"); return 0; } - __hostdev__ uint8_t bitWidth() const { return 1 << (BaseT::mFlags >> 5); }// 4,8,16,32 = 2^(2,3,4,5) - __hostdev__ size_t memUsage() const { return sizeof(*this) + this->bitWidth()*64; } + __hostdev__ uint8_t bitWidth() const __global__ { return 1 << (BaseT::mFlags >> 5); }// 4,8,16,32 = 2^(2,3,4,5) + __hostdev__ size_t memUsage() const __global__ { return sizeof(*this) + this->bitWidth()*64; } __hostdev__ static size_t memUsage(uint32_t bitWidth) { return 96u + bitWidth*64; } - __hostdev__ float getValue(uint32_t i) const + __hostdev__ float getValue(uint32_t i) const __global__ + { +#ifdef NANOVDB_FPN_BRANCHLESS// faster + const int b = BASE(mFlags) >> 5;// b = 0, 1, 2, 3, 4 corresponding to 1, 2, 4, 8, 16 bits +#if 0// use LUT + uint16_t code = reinterpret_cast(this + 1)[i >> (4 - b)]; + const static uint8_t shift[5] = {15, 7, 3, 1, 0}; + const static uint16_t mask[5] = {1, 3, 15, 255, 65535}; + code >>= (i & shift[b]) << b; + code &= mask[b]; 
+#else// no LUT + uint32_t code = reinterpret_cast<__global__ const uint32_t*>(this + 1)[i >> (5 - b)]; + //code >>= (i & ((16 >> b) - 1)) << b; + code >>= (i & ((32 >> b) - 1)) << b; + code &= (1 << (1 << b)) - 1; +#endif +#else// use branched version (slow) + float code; + __global__ auto *values = reinterpret_cast(this+1); + switch (BaseT::mFlags >> 5) { + case 0u:// 1 bit float + code = float((values[i>>3] >> (i&7) ) & uint8_t(1)); + break; + case 1u:// 2 bits float + code = float((values[i>>2] >> ((i&3)<<1)) & uint8_t(3)); + break; + case 2u:// 4 bits float + code = float((values[i>>1] >> ((i&1)<<2)) & uint8_t(15)); + break; + case 3u:// 8 bits float + code = float(values[i]); + break; + default:// 16 bits float + code = float(reinterpret_cast(values)[i]); + } +#endif + return float(code) * BASE(mQuantum) + BASE(mMinimum);// code * (max-min)/UNITS + min + } +#if defined(__KERNEL_METAL__) + __hostdev__ float getValue(uint32_t i) const __local__ { #ifdef NANOVDB_FPN_BRANCHLESS// faster const int b = BaseT::mFlags >> 5;// b = 0, 1, 2, 3, 4 corresponding to 1, 2, 4, 8, 16 bits @@ -4047,14 +4813,14 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData>= (i & shift[b]) << b; code &= mask[b]; #else// no LUT - uint32_t code = reinterpret_cast(this + 1)[i >> (5 - b)]; + uint32_t code = reinterpret_cast<__global__ const uint32_t*>(this + 1)[i >> (5 - b)]; //code >>= (i & ((16 >> b) - 1)) << b; code >>= (i & ((32 >> b) - 1)) << b; code &= (1 << (1 << b)) - 1; #endif #else// use branched version (slow) float code; - auto *values = reinterpret_cast(this+1); + __global__ auto *values = reinterpret_cast(this+1); switch (BaseT::mFlags >> 5) { case 0u:// 1 bit float code = float((values[i>>3] >> (i&7) ) & uint8_t(1)); @@ -4074,12 +4840,15 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData // Partial template specialization of LeafData with bool @@ -4092,7 +4861,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData;// type used for the internal mValue array - 
static constexpr bool FIXED_SIZE = true; + static __constant__ constexpr bool FIXED_SIZE = true; CoordT mBBoxMin; // 12B. uint8_t mBBoxDif[3]; // 3B. @@ -4104,31 +4873,34 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData) - 16u;} __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } - //__hostdev__ const ValueType* values() const { return nullptr; } - __hostdev__ bool getValue(uint32_t i) const { return mValues.isOn(i); } - __hostdev__ bool getMin() const { return false; }// dummy - __hostdev__ bool getMax() const { return false; }// dummy - __hostdev__ bool getAvg() const { return false; }// dummy - __hostdev__ bool getDev() const { return false; }// dummy - __hostdev__ void setValue(uint32_t offset, bool v) + //__hostdev__ __global__ const ValueType* values() const __global__ { return nullptr; } + __hostdev__ bool getValue(uint32_t i) const __global__ { return mValues.isOn(i); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool getValue(uint32_t i) const __local__ { return mValues.isOn(i); } +#endif + __hostdev__ bool getMin() const __global__ { return false; }// dummy + __hostdev__ bool getMax() const __global__ { return false; }// dummy + __hostdev__ bool getAvg() const __global__ { return false; }// dummy + __hostdev__ bool getDev() const __global__ { return false; }// dummy + __hostdev__ void setValue(uint32_t offset, bool v) __global__ { mValueMask.setOn(offset); mValues.set(offset, v); } - __hostdev__ void setMin(const bool&) {}// no-op - __hostdev__ void setMax(const bool&) {}// no-op - __hostdev__ void setAvg(const bool&) {}// no-op - __hostdev__ void setDev(const bool&) {}// no-op + __hostdev__ void setMin(__global__ const bool&) __global__ {}// no-op + __hostdev__ void setMax(__global__ const bool&) __global__ {}// no-op + __hostdev__ void setAvg(__global__ const bool&) __global__ {}// no-op + __hostdev__ void setDev(__global__ const bool&) __global__ {}// no-op template - __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } 
+ __hostdev__ void setOrigin(__global__ const T& ijk) __global__ { mBBoxMin = ijk; } /// @brief This class cannot be constructed or deleted - LeafData() = delete; - LeafData(const LeafData&) = delete; - LeafData& operator=(const LeafData&) = delete; - ~LeafData() = delete; + LeafData() __global__ = delete; + LeafData(__global__ const LeafData&) __global__ = delete; + __global__ LeafData& operator=(__global__ const LeafData&) __global__ = delete; + ~LeafData() __global__ = delete; }; // LeafData // Partial template specialization of LeafData with ValueMask @@ -4141,7 +4913,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData - __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + __hostdev__ void setOrigin(__global__ const T& ijk) __global__ { mBBoxMin = ijk; } /// @brief This class cannot be constructed or deleted - LeafData() = delete; - LeafData(const LeafData&) = delete; - LeafData& operator=(const LeafData&) = delete; - ~LeafData() = delete; + LeafData() __global__ = delete; + LeafData(__global__ const LeafData&) __global__ = delete; + __global__ LeafData& operator=(__global__ const LeafData&) __global__ = delete; + ~LeafData() __global__ = delete; }; // LeafData // Partial template specialization of LeafData with ValueIndex @@ -4191,7 +4966,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData - __hostdev__ void setMin(const T &min, T *p) { NANOVDB_ASSERT(mStatsOff); p[mStatsOff + 0] = min; } + __hostdev__ void setMin(__global__ const T &min, __global__ T *p) __global__ { NANOVDB_ASSERT(mStatsOff); p[mStatsOff + 0] = min; } template - __hostdev__ void setMax(const T &max, T *p) { NANOVDB_ASSERT(mStatsOff); p[mStatsOff + 1] = max; } + __hostdev__ void setMax(__global__ const T &max, __global__ T *p) __global__ { NANOVDB_ASSERT(mStatsOff); p[mStatsOff + 1] = max; } template - __hostdev__ void setAvg(const T &avg, T *p) { NANOVDB_ASSERT(mStatsOff); p[mStatsOff + 2] = avg; } + __hostdev__ void setAvg(__global__ const T &avg, __global__ T *p) 
__global__ { NANOVDB_ASSERT(mStatsOff); p[mStatsOff + 2] = avg; } template - __hostdev__ void setDev(const T &dev, T *p) { NANOVDB_ASSERT(mStatsOff); p[mStatsOff + 3] = dev; } + __hostdev__ void setDev(__global__ const T &dev, __global__ T *p) __global__ { NANOVDB_ASSERT(mStatsOff); p[mStatsOff + 3] = dev; } template - __hostdev__ void setOrigin(const T &ijk) { mBBoxMin = ijk; } + __hostdev__ void setOrigin(__global__ const T &ijk) __global__ { mBBoxMin = ijk; } /// @brief This class cannot be constructed or deleted - LeafData() = delete; - LeafData(const LeafData&) = delete; - LeafData& operator=(const LeafData&) = delete; - ~LeafData() = delete; + LeafData() __global__ = delete; + LeafData(__global__ const LeafData&) __global__ = delete; + __global__ LeafData& operator=(__global__ const LeafData&) __global__ = delete; + ~LeafData() __global__ = delete; }; // LeafData /// @brief Leaf nodes of the VDB tree. (defaults to 8x8x8 = 512 voxels) @@ -4248,13 +5031,22 @@ template class MaskT = Mask, uint32_t Log2Dim = 3> -class LeafNode : private LeafData +class LeafNode +#if !defined(__KERNEL_METAL__) + : private LeafData +#endif { +#if defined(__KERNEL_METAL__) + LeafData _base; +#define BASE(v) _base.v +#else +#define BASE(v) DataType::v +#endif public: struct ChildNodeType { - static constexpr uint32_t TOTAL = 0; - static constexpr uint32_t DIM = 1; + static __constant__ constexpr uint32_t TOTAL = 0; + static __constant__ constexpr uint32_t DIM = 1; __hostdev__ static uint32_t dim() { return 1u; } }; // Voxel using LeafNodeType = LeafNode; @@ -4263,38 +5055,56 @@ public: using FloatType = typename DataType::FloatType; using BuildType = typename DataType::BuildType; using CoordType = CoordT; - static constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; + static __constant__ constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; template using MaskType = MaskT; template using MaskIterT = typename Mask::template Iterator; /// @brief Visits all active values in a leaf node - class 
ValueOnIterator : public MaskIterT + class ValueOnIterator +#if !defined (__KERNEL_METAL__) + : public MaskIterT +#endif { +#if defined(__KERNEL_METAL__) + MaskIterT BaseT; +#define BASE(v) BaseT.v +#else using BaseT = MaskIterT; - const LeafNode *mParent; +#define BASE(v) BaseT::v +#endif + __global__ const LeafNode *mParent; public: __hostdev__ ValueOnIterator() : BaseT(), mParent(nullptr) {} - __hostdev__ ValueOnIterator(const LeafNode* parent) : BaseT(parent->data()->mValueMask.beginOn()), mParent(parent) {} - ValueOnIterator& operator=(const ValueOnIterator&) = default; - __hostdev__ ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->getValue(BaseT::pos());} - __hostdev__ CoordT getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + __hostdev__ ValueOnIterator(__global__ const LeafNode* parent) : BaseT(parent->data()->mValueMask.beginOn()), mParent(parent) {} + __global__ ValueOnIterator& operator=(__global__ const ValueOnIterator&) = default; + __hostdev__ ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->getValue(BASE(pos)());} + __hostdev__ CoordT getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BASE(pos)());} }; // Member class ValueOnIterator ValueOnIterator beginValueOn() const {return ValueOnIterator(this);} /// @brief Visits all inactive values in a leaf node - class ValueOffIterator : public MaskIterT + class ValueOffIterator +#if !defined (__KERNEL_METAL__) + : public MaskIterT +#endif { +#if defined(__KERNEL_METAL__) + MaskIterT BaseT; +#define BASE(v) BaseT.v +#else using BaseT = MaskIterT; - const LeafNode *mParent; +#define BASE(v) BaseT::v +#endif + __global__ const LeafNode *mParent; public: __hostdev__ ValueOffIterator() : BaseT(), mParent(nullptr) {} - __hostdev__ ValueOffIterator(const LeafNode* parent) : BaseT(parent->data()->mValueMask.beginOff()), mParent(parent) {} - ValueOffIterator& operator=(const ValueOffIterator&) = default; - 
__hostdev__ ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->getValue(BaseT::pos());} - __hostdev__ CoordT getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BaseT::pos());} + __hostdev__ ValueOffIterator(__global__ const LeafNode* parent) : BaseT(parent->data()->mValueMask.beginOff()), mParent(parent) {} + __global__ ValueOffIterator& operator=(__global__ const ValueOffIterator&) = default; + __hostdev__ ValueType operator*() const {NANOVDB_ASSERT(*this); return mParent->getValue(BASE(pos)());} + __hostdev__ CoordT getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(BASE(pos)());} }; // Member class ValueOffIterator ValueOffIterator beginValueOff() const {return ValueOffIterator(this);} @@ -4302,17 +5112,17 @@ public: /// @brief Visits all values in a leaf node, i.e. both active and inactive values class ValueIterator { - const LeafNode *mParent; + __global__ const LeafNode *mParent; uint32_t mPos; public: __hostdev__ ValueIterator() : mParent(nullptr), mPos(1u << 3 * Log2Dim) {} - __hostdev__ ValueIterator(const LeafNode* parent) : mParent(parent), mPos(0) {NANOVDB_ASSERT(parent);} - ValueIterator& operator=(const ValueIterator&) = default; + __hostdev__ ValueIterator(__global__ const LeafNode* parent) : mParent(parent), mPos(0) {NANOVDB_ASSERT(parent);} + __global__ ValueIterator& operator=(__global__ const ValueIterator&) = default; __hostdev__ ValueType operator*() const { NANOVDB_ASSERT(*this); return mParent->getValue(mPos);} __hostdev__ CoordT getCoord() const { NANOVDB_ASSERT(*this); return mParent->offsetToGlobalCoord(mPos);} __hostdev__ bool isActive() const { NANOVDB_ASSERT(*this); return mParent->isActive(mPos);} __hostdev__ operator bool() const {return mPos < (1u << 3 * Log2Dim);} - __hostdev__ ValueIterator& operator++() {++mPos; return *this;} + __hostdev__ __global__ ValueIterator& operator++() {++mPos; return *this;} __hostdev__ ValueIterator operator++(int) { auto tmp = 
*this; ++(*this); @@ -4320,43 +5130,49 @@ public: } }; // Member class ValueIterator +#if defined(__KERNEL_METAL__) +#define BASE(v) _base.v +#else +#define BASE(v) DataType::v +#endif + ValueIterator beginValue() const {return ValueIterator(this);} static_assert(is_same::Type>::value, "Mismatching BuildType"); - static constexpr uint32_t LOG2DIM = Log2Dim; - static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes - static constexpr uint32_t DIM = 1u << TOTAL; // number of voxels along each axis of this node - static constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node - static constexpr uint32_t MASK = (1u << LOG2DIM) - 1u; // mask for bit operations - static constexpr uint32_t LEVEL = 0; // level 0 = leaf - static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + static __constant__ constexpr uint32_t LOG2DIM = Log2Dim; + static __constant__ constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes + static __constant__ constexpr uint32_t DIM = 1u << TOTAL; // number of voxels along each axis of this node + static __constant__ constexpr uint32_t SIZE = 1u << 3 * LOG2DIM; // total number of voxels represented by this node + static __constant__ constexpr uint32_t MASK = (1u << LOG2DIM) - 1u; // mask for bit operations + static __constant__ constexpr uint32_t LEVEL = 0; // level 0 = leaf + static __constant__ constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node - __hostdev__ DataType* data() { return reinterpret_cast(this); } + __hostdev__ __global__ DataType* data() __global__ { return reinterpret_cast<__global__ DataType*>(this); } - __hostdev__ const DataType* data() const { return reinterpret_cast(this); } + __hostdev__ __global__ const DataType* data() __global__ const { return reinterpret_cast<__global__ const DataType*>(this); } /// @brief Return a const reference to the bit mask of active 
voxels in this leaf node - __hostdev__ const MaskType& valueMask() const { return DataType::mValueMask; } + __hostdev__ __global__ const MaskType& valueMask() const __global__ { return DataType::mValueMask; } /// @brief Return a const reference to the minimum active value encoded in this leaf node - __hostdev__ ValueType minimum() const { return this->getMin(); } + __hostdev__ ValueType minimum() const __global__ { return this->getMin(); } /// @brief Return a const reference to the maximum active value encoded in this leaf node - __hostdev__ ValueType maximum() const { return this->getMax(); } + __hostdev__ ValueType maximum() const __global__ { return this->getMax(); } /// @brief Return a const reference to the average of all the active values encoded in this leaf node - __hostdev__ FloatType average() const { return DataType::getAvg(); } + __hostdev__ FloatType average() const __global__ { return DataType::getAvg(); } /// @brief Return the variance of all the active values encoded in this leaf node - __hostdev__ FloatType variance() const { return DataType::getDev()*DataType::getDev(); } + __hostdev__ FloatType variance() const __global__ { return DataType::getDev()*DataType::getDev(); } /// @brief Return a const reference to the standard deviation of all the active values encoded in this leaf node - __hostdev__ FloatType stdDeviation() const { return DataType::getDev(); } + __hostdev__ FloatType stdDeviation() const __global__ { return DataType::getDev(); } - __hostdev__ uint8_t flags() const { return DataType::mFlags; } + __hostdev__ uint8_t flags() const __global__ { return DataType::mFlags; } /// @brief Return the origin in index space of this leaf node - __hostdev__ CoordT origin() const { return DataType::mBBoxMin & ~MASK; } + __hostdev__ CoordT origin() const __global__ { return DataType::mBBoxMin & ~MASK; } __hostdev__ static CoordT OffsetToLocalCoord(uint32_t n) { @@ -4366,9 +5182,9 @@ public: } /// @brief Converts (in place) a local index coordinate to 
a global index coordinate - __hostdev__ void localToGlobalCoord(Coord& ijk) const { ijk += this->origin(); } + __hostdev__ void localToGlobalCoord(__global__ Coord& ijk) const __global__ { ijk += this->origin(); } - __hostdev__ CoordT offsetToGlobalCoord(uint32_t n) const + __hostdev__ CoordT offsetToGlobalCoord(uint32_t n) const __global__ { return OffsetToLocalCoord(n) + this->origin(); } @@ -4377,7 +5193,7 @@ public: __hostdev__ static uint32_t dim() { return 1u << LOG2DIM; } /// @brief Return the bounding box in index space of active values in this leaf node - __hostdev__ BBox bbox() const + __hostdev__ BBox bbox() const __global__ { BBox bbox(DataType::mBBoxMin, DataType::mBBoxMin); if ( this->hasBBox() ) { @@ -4399,54 +5215,85 @@ public: __hostdev__ uint64_t memUsage() { return DataType::memUsage(); } /// @brief This class cannot be constructed or deleted - LeafNode() = delete; - LeafNode(const LeafNode&) = delete; - LeafNode& operator=(const LeafNode&) = delete; - ~LeafNode() = delete; + LeafNode() __global__ = delete; + LeafNode(__global__ const LeafNode&) __global__ = delete; + __global__ LeafNode& operator=(__global__ const LeafNode&) __global__ = delete; + ~LeafNode() __global__ = delete; /// @brief Return the voxel value at the given offset. - __hostdev__ ValueType getValue(uint32_t offset) const { return DataType::getValue(offset); } + + __hostdev__ ValueType getValue(uint32_t offset) const __global__ { return DataType::getValue(offset); } +#if defined(__KERNEL_METAL__) + __hostdev__ ValueType getValue(uint32_t offset) const __local__ { return DataType::getValue(offset); } +#endif /// @brief Return the voxel value at the given coordinate. 
- __hostdev__ ValueType getValue(const CoordT& ijk) const { return DataType::getValue(CoordToOffset(ijk)); } + __hostdev__ ValueType getValue(__global__ const CoordT& ijk) const __global__ { return BASE(getValue)(CoordToOffset(ijk)); } +#if defined(__KERNEL_METAL__) + __hostdev__ ValueType getValue(__local__ const CoordT& ijk) const __global__ { return BASE(getValue)(CoordToOffset(ijk)); } + __hostdev__ ValueType getValue(__local__ const CoordT& ijk) const __local__ { return BASE(getValue)(CoordToOffset(ijk)); } +#endif /// @brief Sets the value at the specified location and activate its state. /// /// @note This is safe since it does not change the topology of the tree (unlike setValue methods on the other nodes) - __hostdev__ void setValue(const CoordT& ijk, const ValueType& v) { DataType::setValue(CoordToOffset(ijk), v); } + __hostdev__ void setValue(__global__ const CoordT& ijk, __global__ const ValueType& v) __global__ { DataType::setValue(CoordToOffset(ijk), v); } /// @brief Sets the value at the specified location but leaves its state unchanged. /// /// @note This is safe since it does not change the topology of the tree (unlike setValue methods on the other nodes) - __hostdev__ void setValueOnly(uint32_t offset, const ValueType& v) { DataType::setValueOnly(offset, v); } - __hostdev__ void setValueOnly(const CoordT& ijk, const ValueType& v) { DataType::setValueOnly(CoordToOffset(ijk), v); } + __hostdev__ void setValueOnly(uint32_t offset, __global__ const ValueType& v) __global__ { DataType::setValueOnly(offset, v); } + __hostdev__ void setValueOnly(__global__ const CoordT& ijk, __global__ const ValueType& v) __global__ { DataType::setValueOnly(CoordToOffset(ijk), v); } /// @brief Return @c true if the voxel value at the given coordinate is active. 
- __hostdev__ bool isActive(const CoordT& ijk) const { return DataType::mValueMask.isOn(CoordToOffset(ijk)); } - __hostdev__ bool isActive(uint32_t n) const { return DataType::mValueMask.isOn(n); } + __hostdev__ bool isActive(__global__ const CoordT& ijk) const __global__ { return BASE(mValueMask).isOn(CoordToOffset(ijk)); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isActive(__local__ const CoordT& ijk) const __global__ { return BASE(mValueMask).isOn(CoordToOffset(ijk)); } + __hostdev__ bool isActive(__local__ const CoordT& ijk) const __local__ { return BASE(mValueMask).isOn(CoordToOffset(ijk)); } +#endif + + __hostdev__ bool isActive(uint32_t n) const __global__ { return BASE(mValueMask).isOn(n); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isActive(uint32_t n) const __local__ { return BASE(mValueMask).isOn(n); } +#endif /// @brief Return @c true if any of the voxel value are active in this leaf node. - __hostdev__ bool isActive() const + __hostdev__ bool isActive() const __global__ { //NANOVDB_ASSERT( bool(DataType::mFlags & uint8_t(2)) != DataType::mValueMask.isOff() ); //return DataType::mFlags & uint8_t(2); return !DataType::mValueMask.isOff(); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isActive() const __local__ + { + NANOVDB_ASSERT( bool(DataType::mFlags & uint8_t(2)) != BASE(mValueMask).isOff() ); + return DataType::mFlags & uint8_t(2); + } +#endif __hostdev__ bool hasBBox() const {return DataType::mFlags & uint8_t(2);} /// @brief Return @c true if the voxel value at the given coordinate is active and updates @c v with the value. 
- __hostdev__ bool probeValue(const CoordT& ijk, ValueType& v) const + __hostdev__ bool probeValue(__global__ const CoordT& ijk, __global__ ValueType& v) const __global__ { const uint32_t n = CoordToOffset(ijk); v = DataType::getValue(n); - return DataType::mValueMask.isOn(n); + return BASE(mValueMask).isOn(n); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool probeValue(__local__ const CoordT& ijk, __local__ ValueType& v) const __global__ + { + const uint32_t n = CoordToOffset(ijk); + v = BASE(getValue)(n); + return BASE(mValueMask).isOn(n); + } +#endif - __hostdev__ const LeafNode* probeLeaf(const CoordT&) const { return this; } + __hostdev__ __global__ const LeafNode* probeLeaf(__global__ const CoordT&) const __global__ { return this; } /// @brief Return the linear offset corresponding to the given coordinate - __hostdev__ static uint32_t CoordToOffset(const CoordT& ijk) + __hostdev__ static uint32_t CoordToOffset(__global__ const CoordT& ijk) { #if 0 return ((ijk[0] & MASK) << (2 * LOG2DIM)) + ((ijk[1] & MASK) << LOG2DIM) + (ijk[2] & MASK); @@ -4454,6 +5301,16 @@ public: return ((ijk[0] & MASK) << (2 * LOG2DIM)) | ((ijk[1] & MASK) << LOG2DIM) | (ijk[2] & MASK); #endif } +#if defined(__KERNEL_METAL__) + __hostdev__ static uint32_t CoordToOffset(__local__ const CoordT& ijk) + { + #if 0 + return ((ijk[0] & MASK) << (2 * LOG2DIM)) + ((ijk[1] & MASK) << LOG2DIM) + (ijk[2] & MASK); + #else + return ((ijk[0] & MASK) << (2 * LOG2DIM)) | ((ijk[1] & MASK) << LOG2DIM) | (ijk[2] & MASK); + #endif + } +#endif /// @brief Updates the local bounding box of active voxels in this node. Return true if bbox was updated. /// @@ -4461,8 +5318,9 @@ public: /// /// @details This method is based on few (intrinsic) bit operations and hence is relatively fast. /// However, it should only only be called of either the value mask has changed or if the + /// active bounding box is still undefined. e.g. during construction of this node. 
- __hostdev__ bool updateBBox(); + __hostdev__ bool updateBBox() __global__; private: static_assert(sizeof(DataType) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(LeafData) is misaligned"); @@ -4478,49 +5336,77 @@ private: /// @brief Private method to return a voxel value and update a (dummy) ReadAccessor template - __hostdev__ ValueType getValueAndCache(const CoordT& ijk, const AccT&) const { return this->getValue(ijk); } + __hostdev__ ValueType getValueAndCache(__global__ const CoordT& ijk, __global__ const AccT&) const __global__ { return this->getValue(ijk); } +#if defined(__KERNEL_METAL__) + template + __hostdev__ ValueType getValueAndCache(__local__ const CoordT& ijk, __local__ const AccT&) const __global__ { return this->getValue(ijk); } +#endif /// @brief Return the node information. template - __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(const CoordType& /*ijk*/, const AccT& /*acc*/) const { + __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(__global__ const CoordType& /*ijk*/, __global__ const AccT& /*acc*/) const __global__ { using NodeInfoT = typename AccT::NodeInfo; return NodeInfoT{LEVEL, this->dim(), this->minimum(), this->maximum(), this->average(), this->stdDeviation(), this->bbox()[0], this->bbox()[1]}; } template - __hostdev__ bool isActiveAndCache(const CoordT& ijk, const AccT&) const { return this->isActive(ijk); } + __hostdev__ bool isActiveAndCache(__global__ const CoordT& ijk, __global__ const AccT&) const __global__ { return this->isActive(ijk); } +#if defined(__KERNEL_METAL__) + template + __hostdev__ bool isActiveAndCache(__local__ const CoordT& ijk, __local__ const AccT&) const __global__ { return this->isActive(ijk); } +#endif template - __hostdev__ bool probeValueAndCache(const CoordT& ijk, ValueType& v, const AccT&) const { return this->probeValue(ijk, v); } + __hostdev__ bool probeValueAndCache(__global__ const CoordT& ijk, __global__ ValueType& v, __global__ const AccT&) const __global__ { return this->probeValue(ijk, v); 
} template - __hostdev__ const LeafNode* probeLeafAndCache(const CoordT&, const AccT&) const { return this; } + __hostdev__ __global__ const LeafNode* probeLeafAndCache(__global__ const CoordT&, __global__ const AccT&) const __global__ { return this; } template - __hostdev__ uint32_t getDimAndCache(const CoordT&, const RayT& /*ray*/, const AccT&) const + __hostdev__ uint32_t getDimAndCache(__global__ const CoordT&, __global__ const RayT& /*ray*/, __global__ const AccT&) const __global__ { if (DataType::mFlags & uint8_t(1u)) return this->dim(); // skip this node if the 1st bit is set //if (!ray.intersects( this->bbox() )) return 1 << LOG2DIM; return ChildNodeType::dim(); } - +#if defined(__KERNEL_METAL__) + template + __hostdev__ uint32_t getDimAndCache(__local__ const CoordT&, __local__ const RayT& /*ray*/, __local__ const AccT&) const __global__ + { + if (BASE(mFlags) & uint8_t(1)) + return this->dim(); // skip this node if first bit is set + //if (!ray.intersects( this->bbox() )) return 1 << LOG2DIM; + return ChildNodeType::dim(); + } +#endif +#undef BASE }; // LeafNode class template class MaskT, uint32_t LOG2DIM> -__hostdev__ inline bool LeafNode::updateBBox() +__hostdev__ inline bool LeafNode::updateBBox() __global__ { static_assert(LOG2DIM == 3, "LeafNode::updateBBox: only supports LOGDIM = 3!"); if (DataType::mValueMask.isOff()) { DataType::mFlags &= ~uint8_t(2);// set 2nd bit off, which indicates that this nodes has no bbox return false; } +#if defined(__KERNEL_METAL__) + struct Update { + static void update(__global__ DataType &d, uint32_t min, uint32_t max, int axis) { + NANOVDB_ASSERT(min <= max && max < 8); + d.mBBoxMin[axis] = (d.mBBoxMin[axis] & ~MASK) + int(min); + d.mBBoxDif[axis] = uint8_t(max - min); + } + }; +#else auto update = [&](uint32_t min, uint32_t max, int axis) { NANOVDB_ASSERT(min <= max && max < 8); DataType::mBBoxMin[axis] = (DataType::mBBoxMin[axis] & ~MASK) + int(min); DataType::mBBoxDif[axis] = uint8_t(max - min); }; +#endif 
uint64_t word64 = DataType::mValueMask.template getWord(0); uint32_t Xmin = word64 ? 0u : 8u; uint32_t Xmax = Xmin; @@ -4534,6 +5420,17 @@ __hostdev__ inline bool LeafNode::updateBBox() } } NANOVDB_ASSERT(word64); +#if defined(__KERNEL_METAL__) + Update::update(this, Xmin, Xmax, 0); + Update::update(this, FindLowestOn(word64) >> 3, FindHighestOn(word64) >> 3, 1); + __local__ const uint32_t *p = reinterpret_cast<__local__ const uint32_t*>(&word64), word32 = p[0] | p[1]; + __local__ const uint16_t *q = reinterpret_cast<__local__ const uint16_t*>(&word32), word16 = q[0] | q[1]; + __local__ const uint8_t *b = reinterpret_cast<__local__ const uint8_t* >(&word16), byte = b[0] | b[1]; + NANOVDB_ASSERT(byte); + Update::update(this, FindLowestOn(static_cast(byte)), FindHighestOn(static_cast(byte)), 2); + DataType::mFlags |= uint8_t(2);// set 2nd bit on, which indicates that this nodes has a bbox + return true; +#else update(Xmin, Xmax, 0); update(FindLowestOn(word64) >> 3, FindHighestOn(word64) >> 3, 1); const uint32_t *p = reinterpret_cast(&word64), word32 = p[0] | p[1]; @@ -4541,8 +5438,9 @@ __hostdev__ inline bool LeafNode::updateBBox() const uint8_t *b = reinterpret_cast(&word16), byte = b[0] | b[1]; NANOVDB_ASSERT(byte); update(FindLowestOn(static_cast(byte)), FindHighestOn(static_cast(byte)), 2); - DataType::mFlags |= uint8_t(2);// set 2nd bit on, which indicates that this nodes has a bbox + DataType::mFlags |= uint8_t(2);// set 2nd bit on, which indicates that this nodes has a bbox return true; +#endif } // LeafNode::updateBBox // --------------------------> Template specializations and traits <------------------------------------ @@ -4651,12 +5549,12 @@ class ReadAccessor using FloatType = typename RootT::FloatType; using CoordValueType = typename RootT::CoordType::ValueType; - mutable const RootT* mRoot; // 8 bytes (mutable to allow for access methods to be const) + mutable __global__ const RootT* mRoot; // 8 bytes (mutable to allow for access methods to be const) 
public: using ValueType = typename RootT::ValueType; using CoordType = typename RootT::CoordType; - static const int CacheLevels = 0; + static __constant__ const int CacheLevels = 0; struct NodeInfo { uint32_t mLevel; // 4B @@ -4670,60 +5568,77 @@ public: }; /// @brief Constructor from a root node - __hostdev__ ReadAccessor(const RootT& root) : mRoot{&root} {} + __hostdev__ ReadAccessor(__global__ const RootT& root) __local__ : mRoot{&root} {} /// @brief Constructor from a grid - __hostdev__ ReadAccessor(const GridT& grid) : ReadAccessor(grid.tree().root()) {} + __hostdev__ ReadAccessor(__global__ const GridT& grid) __local__ : ReadAccessor(grid.tree().root()) {} /// @brief Constructor from a tree - __hostdev__ ReadAccessor(const TreeT& tree) : ReadAccessor(tree.root()) {} + __hostdev__ ReadAccessor(__global__ const TreeT& tree) __local__ : ReadAccessor(tree.root()) {} /// @brief Reset this access to its initial state, i.e. with an empty cache /// @node Noop since this template specialization has no cache __hostdev__ void clear() {} - __hostdev__ const RootT& root() const { return *mRoot; } + __hostdev__ __global__ const RootT& root() const __global__ { return *mRoot; } +#if defined(__KERNEL_METAL__) + __hostdev__ __global__ const RootT& root() const __local__ { return *mRoot; } +#endif /// @brief Defaults constructors - ReadAccessor(const ReadAccessor&) = default; - ~ReadAccessor() = default; - ReadAccessor& operator=(const ReadAccessor&) = default; + ReadAccessor(__local__ const ReadAccessor&) __local__ = default; + ~ReadAccessor() __local__ = default; + __local__ ReadAccessor& operator=(__local__ const ReadAccessor&) __local__ = default; - __hostdev__ ValueType getValue(const CoordType& ijk) const + __hostdev__ ValueType getValue(__global__ const CoordType& ijk) const __local__ { return mRoot->getValueAndCache(ijk, *this); } - __hostdev__ ValueType operator()(const CoordType& ijk) const + +#if defined(__KERNEL_METAL__) + __hostdev__ ValueType getValue(__local__ 
const CoordType& ijk) const __local__ + { + return mRoot->getValueAndCache(ijk, *this); + } +#endif + + __hostdev__ ValueType operator()(__global__ const CoordType& ijk) const __local__ { return this->getValue(ijk); } - __hostdev__ ValueType operator()(int i, int j, int k) const + __hostdev__ ValueType operator()(int i, int j, int k) const __local__ { return this->getValue(CoordType(i,j,k)); } - __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const + __hostdev__ NodeInfo getNodeInfo(__global__ const CoordType& ijk) const __local__ { return mRoot->getNodeInfoAndCache(ijk, *this); } - __hostdev__ bool isActive(const CoordType& ijk) const + __hostdev__ bool isActive(__global__ const CoordType& ijk) const __local__ { return mRoot->isActiveAndCache(ijk, *this); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isActive(__local__ const CoordType& ijk) const __local__ + { + return mRoot->isActiveAndCache(ijk, *this); + } +#endif - __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const + __hostdev__ bool probeValue(__global__ const CoordType& ijk, __global__ ValueType& v) const __local__ { return mRoot->probeValueAndCache(ijk, v, *this); } - __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const + __hostdev__ __global__ const LeafT* probeLeaf(__global__ const CoordType& ijk) const __local__ { return mRoot->probeLeafAndCache(ijk, *this); } template - __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const + __hostdev__ uint32_t getDim(__global__ const CoordType& ijk, __global__ const RayT& ray) const __local__ { return mRoot->getDimAndCache(ijk, ray, *this); } @@ -4739,7 +5654,11 @@ private: /// @brief No-op template - __hostdev__ void insert(const CoordType&, const NodeT*) const {} + __hostdev__ void insert(__global__ const CoordType&, __global__ const NodeT*) const __local__ {} +#if defined(__KERNEL_METAL__) + template + __hostdev__ void insert(__local__ const CoordType&, __global__ const NodeT*) const __local__ {} +#endif 
}; // ReadAccessor class /// @brief Node caching at a single tree level @@ -4761,19 +5680,19 @@ class ReadAccessor//e.g. 0, 1, 2 // All member data are mutable to allow for access methods to be const mutable CoordT mKey; // 3*4 = 12 bytes - mutable const RootT* mRoot; // 8 bytes - mutable const NodeT* mNode; // 8 bytes + mutable __global__ const RootT* mRoot; // 8 bytes + mutable __global__ const NodeT* mNode; // 8 bytes public: using ValueType = ValueT; using CoordType = CoordT; - static const int CacheLevels = 1; + static __constant__ const int CacheLevels = 1; using NodeInfo = typename ReadAccessor::NodeInfo; /// @brief Constructor from a root node - __hostdev__ ReadAccessor(const RootT& root) + __hostdev__ ReadAccessor(__global__ const RootT& root) __local__ : mKey(CoordType::max()) , mRoot(&root) , mNode(nullptr) @@ -4781,10 +5700,10 @@ public: } /// @brief Constructor from a grid - __hostdev__ ReadAccessor(const GridT& grid) : ReadAccessor(grid.tree().root()) {} + __hostdev__ ReadAccessor(__global__ const GridT& grid) __local__ : ReadAccessor(grid.tree().root()) {} /// @brief Constructor from a tree - __hostdev__ ReadAccessor(const TreeT& tree) : ReadAccessor(tree.root()) {} + __hostdev__ ReadAccessor(__global__ const TreeT& tree) __local__ : ReadAccessor(tree.root()) {} /// @brief Reset this access to its initial state, i.e. 
with an empty cache __hostdev__ void clear() @@ -4793,37 +5712,64 @@ public: mNode = nullptr; } - __hostdev__ const RootT& root() const { return *mRoot; } + __hostdev__ __global__ const RootT& root() const __global__ { return *mRoot; } +#if defined(__KERNEL_METAL__) + __hostdev__ __global__ const RootT& root() const __local__ { return *mRoot; } +#endif /// @brief Defaults constructors - ReadAccessor(const ReadAccessor&) = default; - ~ReadAccessor() = default; - ReadAccessor& operator=(const ReadAccessor&) = default; + ReadAccessor(__global__ const ReadAccessor&) __global__ = default; + ~ReadAccessor() __global__ = default; + __global__ ReadAccessor& operator=(__global__ const ReadAccessor&) __global__ = default; - __hostdev__ bool isCached(const CoordType& ijk) const + __hostdev__ bool isCached(__global__ const CoordType& ijk) const __global__ { return (ijk[0] & int32_t(~NodeT::MASK)) == mKey[0] && (ijk[1] & int32_t(~NodeT::MASK)) == mKey[1] && (ijk[2] & int32_t(~NodeT::MASK)) == mKey[2]; } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isCached(__local__ const CoordType& ijk) const __global__ + { + return (ijk[0] & int32_t(~NodeT::MASK)) == mKey[0] && + (ijk[1] & int32_t(~NodeT::MASK)) == mKey[1] && + (ijk[2] & int32_t(~NodeT::MASK)) == mKey[2]; + } + __hostdev__ bool isCached(__local__ const CoordType& ijk) const __local__ + { + return (ijk[0] & int32_t(~NodeT::MASK)) == mKey[0] && + (ijk[1] & int32_t(~NodeT::MASK)) == mKey[1] && + (ijk[2] & int32_t(~NodeT::MASK)) == mKey[2]; + } +#endif - __hostdev__ ValueType getValue(const CoordType& ijk) const + __hostdev__ ValueType getValue(__global__ const CoordType& ijk) const __global__ { if (this->isCached(ijk)) { return mNode->getValueAndCache(ijk, *this); } return mRoot->getValueAndCache(ijk, *this); } - __hostdev__ ValueType operator()(const CoordType& ijk) const +#if defined(__KERNEL_METAL__) + __hostdev__ ValueType getValue(__local__ const CoordType& ijk) const __global__ + { + if (this->isCached(ijk)) { + 
return mNode->getValueAndCache(ijk, *this); + } + return mRoot->getValueAndCache(ijk, *this); + } +#endif + + __hostdev__ ValueType operator()(__global__ const CoordType& ijk) const __global__ { return this->getValue(ijk); } - __hostdev__ ValueType operator()(int i, int j, int k) const + __hostdev__ ValueType operator()(int i, int j, int k) const __global__ { return this->getValue(CoordType(i,j,k)); } - __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const + __hostdev__ NodeInfo getNodeInfo(__global__ const CoordType& ijk) const __global__ { if (this->isCached(ijk)) { return mNode->getNodeInfoAndCache(ijk, *this); @@ -4831,15 +5777,24 @@ public: return mRoot->getNodeInfoAndCache(ijk, *this); } - __hostdev__ bool isActive(const CoordType& ijk) const + __hostdev__ bool isActive(__global__ const CoordType& ijk) const __global__ { if (this->isCached(ijk)) { return mNode->isActiveAndCache(ijk, *this); } return mRoot->isActiveAndCache(ijk, *this); } +#if defined(__KERNEL_METAL__) + __hostdev__ bool isActive(__global__ const CoordType& ijk) const __local__ + { + if (this->isCached(ijk)) { + return mNode->isActiveAndCache(ijk, *this); + } + return mRoot->isActiveAndCache(ijk, *this); + } +#endif - __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const + __hostdev__ bool probeValue(__global__ const CoordType& ijk, __global__ ValueType& v) const __global__ { if (this->isCached(ijk)) { return mNode->probeValueAndCache(ijk, v, *this); @@ -4847,7 +5802,7 @@ public: return mRoot->probeValueAndCache(ijk, v, *this); } - __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const + __hostdev__ __global__ const LeafT* probeLeaf(__global__ const CoordType& ijk) const __global__ { if (this->isCached(ijk)) { return mNode->probeLeafAndCache(ijk, *this); @@ -4856,7 +5811,7 @@ public: } template - __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const + __hostdev__ uint32_t getDim(__global__ const CoordType& ijk, __global__ const RayT& ray) const 
__global__ { if (this->isCached(ijk)) { return mNode->getDimAndCache(ijk, ray, *this); @@ -4874,15 +5829,26 @@ private: friend class LeafNode; /// @brief Inserts a leaf node and key pair into this ReadAccessor - __hostdev__ void insert(const CoordType& ijk, const NodeT* node) const + __hostdev__ void insert(__global__ const CoordType& ijk, __global__ const NodeT* node) const __local__ { mKey = ijk & ~NodeT::MASK; mNode = node; } +#if defined(__KERNEL_METAL__) + __hostdev__ void insert(__local__ const CoordType& ijk, __global__ const NodeT* node) const __local__ + { + mKey = ijk & ~NodeT::MASK; + mNode = node; + } +#endif // no-op template - __hostdev__ void insert(const CoordType&, const OtherNodeT*) const {} + __hostdev__ void insert(__global__ const CoordType&, __global__ const OtherNodeT*) const __local__ {} +#if defined(__KERNEL_METAL__) + template + __hostdev__ void insert(__local__ const CoordType&, __global__ const OtherNodeT*) const __local__ {} +#endif }; // ReadAccessor @@ -4909,20 +5875,20 @@ class ReadAccessor//e.g. 
(0,1), (1,2), (0,2) #else // 68 bytes total mutable CoordT mKeys[2]; // 2*3*4 = 24 bytes #endif - mutable const RootT* mRoot; - mutable const Node1T* mNode1; - mutable const Node2T* mNode2; + mutable __global__ const RootT* mRoot; + mutable __global__ const Node1T* mNode1; + mutable __global__ const Node2T* mNode2; public: using ValueType = ValueT; using CoordType = CoordT; - static const int CacheLevels = 2; + static __constant__ const int CacheLevels = 2; using NodeInfo = typename ReadAccessor::NodeInfo; /// @brief Constructor from a root node - __hostdev__ ReadAccessor(const RootT& root) + __hostdev__ ReadAccessor(__global__ const RootT& root) __local__ #ifdef USE_SINGLE_ACCESSOR_KEY : mKey(CoordType::max()) #else @@ -4935,10 +5901,10 @@ public: } /// @brief Constructor from a grid - __hostdev__ ReadAccessor(const GridT& grid) : ReadAccessor(grid.tree().root()) {} + __hostdev__ ReadAccessor(__global__ const GridT& grid) __local__ : ReadAccessor(grid.tree().root()) {} /// @brief Constructor from a tree - __hostdev__ ReadAccessor(const TreeT& tree) : ReadAccessor(tree.root()) {} + __hostdev__ ReadAccessor(__global__ const TreeT& tree) __local__ : ReadAccessor(tree.root()) {} /// @brief Reset this access to its initial state, i.e. 
with an empty cache __hostdev__ void clear() @@ -4952,15 +5918,18 @@ public: mNode2 = nullptr; } - __hostdev__ const RootT& root() const { return *mRoot; } + __hostdev__ __global__ const RootT& root() const __global__ { return *mRoot; } +#if defined(__KERNEL_METAL__) + __hostdev__ __global__ const RootT& root() const __local__ { return *mRoot; } +#endif /// @brief Defaults constructors - ReadAccessor(const ReadAccessor&) = default; + ReadAccessor(__global__ const ReadAccessor&) __global__ = default; ~ReadAccessor() = default; - ReadAccessor& operator=(const ReadAccessor&) = default; + __global__ ReadAccessor& operator=(__global__ const ReadAccessor&) __global__ = default; #ifdef USE_SINGLE_ACCESSOR_KEY - __hostdev__ bool isCached1(CoordValueType dirty) const + __hostdev__ bool isCached1(CoordValueType dirty) const __global__ { if (!mNode1) return false; @@ -4970,7 +5939,7 @@ public: } return true; } - __hostdev__ bool isCached2(CoordValueType dirty) const + __hostdev__ bool isCached2(CoordValueType dirty) const __global__ { if (!mNode2) return false; @@ -4980,18 +5949,18 @@ public: } return true; } - __hostdev__ CoordValueType computeDirty(const CoordType& ijk) const + __hostdev__ CoordValueType computeDirty(__global__ const CoordType& ijk) const __global__ { return (ijk[0] ^ mKey[0]) | (ijk[1] ^ mKey[1]) | (ijk[2] ^ mKey[2]); } #else - __hostdev__ bool isCached1(const CoordType& ijk) const + __hostdev__ bool isCached1(__global__ const CoordType& ijk) const __global__ { return (ijk[0] & int32_t(~Node1T::MASK)) == mKeys[0][0] && (ijk[1] & int32_t(~Node1T::MASK)) == mKeys[0][1] && (ijk[2] & int32_t(~Node1T::MASK)) == mKeys[0][2]; } - __hostdev__ bool isCached2(const CoordType& ijk) const + __hostdev__ bool isCached2(__global__ const CoordType& ijk) const __global__ { return (ijk[0] & int32_t(~Node2T::MASK)) == mKeys[1][0] && (ijk[1] & int32_t(~Node2T::MASK)) == mKeys[1][1] && @@ -4999,12 +5968,12 @@ public: } #endif - __hostdev__ ValueType getValue(const CoordType& 
ijk) const + __hostdev__ ValueType getValue(__global__ const CoordType& ijk) const __global__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __global__ auto&& dirty = ijk; #endif if (this->isCached1(dirty)) { return mNode1->getValueAndCache(ijk, *this); @@ -5013,21 +5982,37 @@ public: } return mRoot->getValueAndCache(ijk, *this); } - __hostdev__ ValueType operator()(const CoordType& ijk) const - { - return this->getValue(ijk); - } - __hostdev__ ValueType operator()(int i, int j, int k) const - { - return this->getValue(CoordType(i,j,k)); - } - - __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const +#if defined(__KERNEL_METAL__) + __hostdev__ ValueType getValue(__local__ const CoordType& ijk) const __global__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __global__ auto&& dirty = ijk; +#endif + if (this->isCached1(dirty)) { + return mNode1->getValueAndCache(ijk, *this); + } else if (this->isCached2(dirty)) { + return mNode2->getValueAndCache(ijk, *this); + } + return mRoot->getValueAndCache(ijk, *this); + } +#endif + __hostdev__ ValueType operator()(__global__ const CoordType& ijk) const __global__ + { + return this->getValue(ijk); + } + __hostdev__ ValueType operator()(int i, int j, int k) const __global__ + { + return this->getValue(CoordType(i,j,k)); + } + + __hostdev__ NodeInfo getNodeInfo(__global__ const CoordType& ijk) const __global__ + { +#ifdef USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + __global__ auto&& dirty = ijk; #endif if (this->isCached1(dirty)) { return mNode1->getNodeInfoAndCache(ijk, *this); @@ -5037,12 +6022,12 @@ public: return mRoot->getNodeInfoAndCache(ijk, *this); } - __hostdev__ bool isActive(const CoordType& ijk) const + __hostdev__ bool isActive(__global__ const CoordType& ijk) const __global__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType 
dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __global__ auto&& dirty = ijk; #endif if (this->isCached1(dirty)) { return mNode1->isActiveAndCache(ijk, *this); @@ -5052,12 +6037,12 @@ public: return mRoot->isActiveAndCache(ijk, *this); } - __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const + __hostdev__ bool probeValue(__global__ const CoordType& ijk, __global__ ValueType& v) const __global__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __global__ auto&& dirty = ijk; #endif if (this->isCached1(dirty)) { return mNode1->probeValueAndCache(ijk, v, *this); @@ -5067,12 +6052,12 @@ public: return mRoot->probeValueAndCache(ijk, v, *this); } - __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const + __hostdev__ __global__ const LeafT* probeLeaf(__global__ const CoordType& ijk) const __global__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __global__ auto&& dirty = ijk; #endif if (this->isCached1(dirty)) { return mNode1->probeLeafAndCache(ijk, *this); @@ -5083,12 +6068,12 @@ public: } template - __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const + __hostdev__ uint32_t getDim(__global__ const CoordType& ijk, __global__ const RayT& ray) const __global__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __global__ auto&& dirty = ijk; #endif if (this->isCached1(dirty)) { return mNode1->getDimAndCache(ijk, ray, *this); @@ -5108,7 +6093,7 @@ private: friend class LeafNode; /// @brief Inserts a leaf node and key pair into this ReadAccessor - __hostdev__ void insert(const CoordType& ijk, const Node1T* node) const + __hostdev__ void insert(__global__ const CoordType& ijk, __global__ const Node1T* node) const __local__ { #ifdef USE_SINGLE_ACCESSOR_KEY mKey = ijk; @@ -5117,7 +6102,7 @@ private: #endif mNode1 = 
node; } - __hostdev__ void insert(const CoordType& ijk, const Node2T* node) const + __hostdev__ void insert(__local__ const CoordType& ijk, __global__ const Node2T* node) const __local__ { #ifdef USE_SINGLE_ACCESSOR_KEY mKey = ijk; @@ -5127,7 +6112,11 @@ private: mNode2 = node; } template - __hostdev__ void insert(const CoordType&, const OtherNodeT*) const {} + __hostdev__ void insert(__global__ const CoordType&, __global__ const OtherNodeT*) const __local__ {} +#if defined(__KERNEL_METAL__) + template + __hostdev__ void insert(__local__ const CoordType&, __global__ const OtherNodeT*) const __local__ {} +#endif }; // ReadAccessor @@ -5145,7 +6134,7 @@ class ReadAccessor using ValueT = typename RootT::ValueType; using FloatType = typename RootT::FloatType; - using CoordValueType = typename RootT::CoordT::ValueType; + using CoordValueType = typename RootT::CoordType::ValueType; // All member data are mutable to allow for access methods to be const #ifdef USE_SINGLE_ACCESSOR_KEY // 44 bytes total @@ -5153,19 +6142,19 @@ class ReadAccessor #else // 68 bytes total mutable CoordT mKeys[3]; // 3*3*4 = 36 bytes #endif - mutable const RootT* mRoot; - mutable const void* mNode[3]; // 4*8 = 32 bytes + mutable __global__ const RootT* mRoot; + mutable __global__ const void* mNode[3]; // 4*8 = 32 bytes public: using ValueType = ValueT; using CoordType = CoordT; - static const int CacheLevels = 3; + static __constant__ const int CacheLevels = 3; using NodeInfo = typename ReadAccessor::NodeInfo; /// @brief Constructor from a root node - __hostdev__ ReadAccessor(const RootT& root) + __hostdev__ ReadAccessor(__global__ const RootT& root) __local__ #ifdef USE_SINGLE_ACCESSOR_KEY : mKey(CoordType::max()) #else @@ -5177,35 +6166,38 @@ public: } /// @brief Constructor from a grid - __hostdev__ ReadAccessor(const GridT& grid) : ReadAccessor(grid.tree().root()) {} + __hostdev__ ReadAccessor(__global__ const GridT& grid) __local__ : ReadAccessor(grid.tree().root()) {} /// @brief 
Constructor from a tree - __hostdev__ ReadAccessor(const TreeT& tree) : ReadAccessor(tree.root()) {} + __hostdev__ ReadAccessor(__global__ const TreeT& tree) __local__ : ReadAccessor(tree.root()) {} - __hostdev__ const RootT& root() const { return *mRoot; } + __hostdev__ __global__ const RootT& root() const __global__ { return *mRoot; } +#if defined(__KERNEL_METAL__) + __hostdev__ __global__ const RootT& root() const __local__ { return *mRoot; } +#endif /// @brief Defaults constructors - ReadAccessor(const ReadAccessor&) = default; - ~ReadAccessor() = default; - ReadAccessor& operator=(const ReadAccessor&) = default; + ReadAccessor(__local__ const ReadAccessor&) __local__ = default; + ~ReadAccessor() __global__ = default; + __global__ ReadAccessor& operator=(__global__ const ReadAccessor&) __global__ = default; /// @brief Return a const point to the cached node of the specified type /// /// @warning The return value could be NULL. template - __hostdev__ const NodeT* getNode() const + __hostdev__ __global__ const NodeT* getNode() const __global__ { using T = typename NodeTrait::type; static_assert(is_same::value, "ReadAccessor::getNode: Invalid node type"); - return reinterpret_cast(mNode[NodeT::LEVEL]); + return reinterpret_cast<__global__ const T*>(mNode[NodeT::LEVEL]); } template - __hostdev__ const typename NodeTrait::type* getNode() const + __hostdev__ __global__ const typename NodeTrait::type* getNode() const { using T = typename NodeTrait::type; static_assert(LEVEL>=0 && LEVEL<=2, "ReadAccessor::getNode: Invalid node type"); - return reinterpret_cast(mNode[LEVEL]); + return reinterpret_cast<__global__ const T*>(mNode[LEVEL]); } @@ -5222,7 +6214,7 @@ public: #ifdef USE_SINGLE_ACCESSOR_KEY template - __hostdev__ bool isCached(CoordValueType dirty) const + __hostdev__ bool isCached(CoordValueType dirty) const __global__ { if (!mNode[NodeT::LEVEL]) return false; @@ -5233,128 +6225,229 @@ public: return true; } - __hostdev__ CoordValueType computeDirty(const 
CoordType& ijk) const + __hostdev__ CoordValueType computeDirty(const CoordType& ijk) const __global__ { return (ijk[0] ^ mKey[0]) | (ijk[1] ^ mKey[1]) | (ijk[2] ^ mKey[2]); } #else template - __hostdev__ bool isCached(const CoordType& ijk) const + __hostdev__ bool isCached(__global__ const CoordType& ijk) const __global__ { return (ijk[0] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][0] && (ijk[1] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][1] && (ijk[2] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][2]; } +#if defined(__KERNEL_METAL__) + template + __hostdev__ bool isCached(__local__ const CoordType& ijk) const __global__ + { + return (ijk[0] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][0] && (ijk[1] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][1] && (ijk[2] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][2]; + } + template + __hostdev__ bool isCached(__local__ const CoordType& ijk) const __local__ + { + return (ijk[0] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][0] && (ijk[1] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][1] && (ijk[2] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][2]; + } +#endif // __KERNEL_METAL__ #endif - __hostdev__ ValueType getValue(const CoordType& ijk) const + __hostdev__ ValueType getValue(__global__ const CoordType& ijk) const __global__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __global__ auto&& dirty = ijk; #endif if (this->isCached(dirty)) { - return ((LeafT*)mNode[0])->getValue(ijk); + return ((__global__ LeafT*)mNode[0])->getValue(ijk); } else if (this->isCached(dirty)) { - return ((NodeT1*)mNode[1])->getValueAndCache(ijk, *this); + return ((__global__ NodeT1*)mNode[1])->getValueAndCache(ijk, *this); } else if (this->isCached(dirty)) { - return ((NodeT2*)mNode[2])->getValueAndCache(ijk, *this); + return ((__global__ NodeT2*)mNode[2])->getValueAndCache(ijk, *this); } return mRoot->getValueAndCache(ijk, *this); } - __hostdev__ ValueType 
operator()(const CoordType& ijk) const +#if defined(__KERNEL_METAL__) + __hostdev__ ValueType getValue(__local__ const CoordType& ijk) const __global__ + { +#ifdef USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + __local__ auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((__global__ LeafT*)mNode[0])->getValue(ijk); + } else if (this->isCached(dirty)) { + return ((__global__ NodeT1*)mNode[1])->getValueAndCache(ijk, *this); + } else if (this->isCached(dirty)) { + return ((__global__ NodeT2*)mNode[2])->getValueAndCache(ijk, *this); + } + return mRoot->getValueAndCache(ijk, *this); + } + __hostdev__ ValueType getValue(__local__ const CoordType& ijk) const __local__ + { +#ifdef USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + __local__ auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((__global__ LeafT*)mNode[0])->getValue(ijk); + } else if (this->isCached(dirty)) { + return ((__global__ NodeT1*)mNode[1])->getValueAndCache(ijk, *this); + } else if (this->isCached(dirty)) { + return ((__global__ NodeT2*)mNode[2])->getValueAndCache(ijk, *this); + } + return mRoot->getValueAndCache(ijk, *this); + } +#endif // __KERNEL_METAL__ + + __hostdev__ ValueType operator()(__global__ const CoordType& ijk) const __global__ { return this->getValue(ijk); } - __hostdev__ ValueType operator()(int i, int j, int k) const + __hostdev__ ValueType operator()(int i, int j, int k) const __global__ { return this->getValue(CoordType(i,j,k)); } - __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const + __hostdev__ NodeInfo getNodeInfo(__global__ const CoordType& ijk) const __global__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __global__ auto&& dirty = ijk; #endif if (this->isCached(dirty)) { - return ((LeafT*)mNode[0])->getNodeInfoAndCache(ijk, *this); + return ((__global__ 
LeafT*)mNode[0])->getNodeInfoAndCache(ijk, *this); } else if (this->isCached(dirty)) { - return ((NodeT1*)mNode[1])->getNodeInfoAndCache(ijk, *this); + return ((__global__ NodeT1*)mNode[1])->getNodeInfoAndCache(ijk, *this); } else if (this->isCached(dirty)) { - return ((NodeT2*)mNode[2])->getNodeInfoAndCache(ijk, *this); + return ((__global__ NodeT2*)mNode[2])->getNodeInfoAndCache(ijk, *this); } return mRoot->getNodeInfoAndCache(ijk, *this); } - __hostdev__ bool isActive(const CoordType& ijk) const + __hostdev__ bool isActive(__global__ const CoordType& ijk) const __global__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __global__ auto&& dirty = ijk; #endif if (this->isCached(dirty)) { - return ((LeafT*)mNode[0])->isActive(ijk); + return ((__global__ LeafT*)mNode[0])->isActive(ijk); } else if (this->isCached(dirty)) { - return ((NodeT1*)mNode[1])->isActiveAndCache(ijk, *this); + return ((__global__ NodeT1*)mNode[1])->isActiveAndCache(ijk, *this); } else if (this->isCached(dirty)) { - return ((NodeT2*)mNode[2])->isActiveAndCache(ijk, *this); + return ((__global__ NodeT2*)mNode[2])->isActiveAndCache(ijk, *this); } return mRoot->isActiveAndCache(ijk, *this); } - - __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const +#if defined(__KERNEL_METAL__) + __hostdev__ bool isActive(__local__ const CoordType& ijk) const __local__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __local__ auto&& dirty = ijk; #endif if (this->isCached(dirty)) { - return ((LeafT*)mNode[0])->probeValue(ijk, v); + return ((__global__ LeafT*)mNode[0])->isActive(ijk); } else if (this->isCached(dirty)) { - return ((NodeT1*)mNode[1])->probeValueAndCache(ijk, v, *this); + return ((__global__ NodeT1*)mNode[1])->isActiveAndCache(ijk, *this); } else if (this->isCached(dirty)) { - return ((NodeT2*)mNode[2])->probeValueAndCache(ijk, v, *this); + return 
((__global__ NodeT2*)mNode[2])->isActiveAndCache(ijk, *this); + } + return mRoot->isActiveAndCache(ijk, *this); + } +#endif + + __hostdev__ bool probeValue(__global__ const CoordType& ijk, __global__ ValueType& v) const __global__ + { +#ifdef USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + __global__ auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((__global__ LeafT*)mNode[0])->probeValue(ijk, v); + } else if (this->isCached(dirty)) { + return ((__global__ NodeT1*)mNode[1])->probeValueAndCache(ijk, v, *this); + } else if (this->isCached(dirty)) { + return ((__global__ NodeT2*)mNode[2])->probeValueAndCache(ijk, v, *this); } return mRoot->probeValueAndCache(ijk, v, *this); } - __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const + __hostdev__ __global__ const LeafT* probeLeaf(__global__ const CoordType& ijk) const __global__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __global__ auto&& dirty = ijk; #endif if (this->isCached(dirty)) { - return ((LeafT*)mNode[0]); + return ((__global__ LeafT*)mNode[0]); } else if (this->isCached(dirty)) { - return ((NodeT1*)mNode[1])->probeLeafAndCache(ijk, *this); + return ((__global__ NodeT1*)mNode[1])->probeLeafAndCache(ijk, *this); } else if (this->isCached(dirty)) { - return ((NodeT2*)mNode[2])->probeLeafAndCache(ijk, *this); + return ((__global__ NodeT2*)mNode[2])->probeLeafAndCache(ijk, *this); } return mRoot->probeLeafAndCache(ijk, *this); } template - __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const + __hostdev__ uint32_t getDim(__global__ const CoordType& ijk, __global__ const RayT& ray) const __global__ { #ifdef USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else - auto&& dirty = ijk; + __global__ auto&& dirty = ijk; #endif if (this->isCached(dirty)) { - return ((LeafT*)mNode[0])->getDimAndCache(ijk, ray, *this); + return 
((__global__ LeafT*)mNode[0])->getDimAndCache(ijk, ray, *this); } else if (this->isCached(dirty)) { - return ((NodeT1*)mNode[1])->getDimAndCache(ijk, ray, *this); + return ((__global__ NodeT1*)mNode[1])->getDimAndCache(ijk, ray, *this); } else if (this->isCached(dirty)) { - return ((NodeT2*)mNode[2])->getDimAndCache(ijk, ray, *this); + return ((__global__ NodeT2*)mNode[2])->getDimAndCache(ijk, ray, *this); } return mRoot->getDimAndCache(ijk, ray, *this); } +#if defined(__KERNEL_METAL__) + template + __hostdev__ uint32_t getDim(__global__ const CoordType& ijk, __local__ const RayT& ray) const __global__ + { +#ifdef USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + __global__ auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((__global__ LeafT*)mNode[0])->getDimAndCache(ijk, ray, *this); + } else if (this->isCached(dirty)) { + return ((__global__ NodeT1*)mNode[1])->getDimAndCache(ijk, ray, *this); + } else if (this->isCached(dirty)) { + return ((__global__ NodeT2*)mNode[2])->getDimAndCache(ijk, ray, *this); + } + return mRoot->getDimAndCache(ijk, ray, *this); + } + template + __hostdev__ uint32_t getDim(__local__ const CoordType& ijk, __local__ const RayT& ray) const __local__ + { +#ifdef USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + __local__ auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((__global__ LeafT*)mNode[0])->getDimAndCache(ijk, ray, *this); + } else if (this->isCached(dirty)) { + return ((__global__ NodeT1*)mNode[1])->getDimAndCache(ijk, ray, *this); + } else if (this->isCached(dirty)) { + return ((__global__ NodeT2*)mNode[2])->getDimAndCache(ijk, ray, *this); + } + return mRoot->getDimAndCache(ijk, ray, *this); + } +#endif // __KERNEL_METAL__ private: /// @brief Allow nodes to insert themselves into the cache. 
@@ -5367,7 +6460,7 @@ private: /// @brief Inserts a leaf node and key pair into this ReadAccessor template - __hostdev__ void insert(const CoordType& ijk, const NodeT* node) const + __hostdev__ void insert(__global__ const CoordType& ijk, __global__ const NodeT* node) const __local__ { #ifdef USE_SINGLE_ACCESSOR_KEY mKey = ijk; @@ -5376,6 +6469,28 @@ private: #endif mNode[NodeT::LEVEL] = node; } +#if defined(__KERNEL_METAL__) + template + __hostdev__ void insert(__local__ const CoordType& ijk, __global__ const NodeT* node) const __local__ + { +#ifdef USE_SINGLE_ACCESSOR_KEY + mKey = ijk; +#else + mKeys[NodeT::LEVEL] = ijk & ~NodeT::MASK; +#endif + mNode[NodeT::LEVEL] = node; + } + template + __hostdev__ void insert(__local__ const CoordType& ijk, __global__ const NodeT* node) const __global__ + { +#ifdef USE_SINGLE_ACCESSOR_KEY + mKey = ijk; +#else + mKeys[NodeT::LEVEL] = ijk & ~NodeT::MASK; +#endif + mNode[NodeT::LEVEL] = node; + } +#endif // __KERNEL_METAL__ }; // ReadAccessor ////////////////////////////////////////////////// @@ -5393,19 +6508,19 @@ private: /// createAccessor<0,1,2>(grid): Caching of all nodes at all tree levels template -ReadAccessor createAccessor(const NanoGrid &grid) +ReadAccessor createAccessor(__global__ const NanoGrid &grid) { return ReadAccessor(grid); } template -ReadAccessor createAccessor(const NanoTree &tree) +ReadAccessor createAccessor(__global__ const NanoTree &tree) { return ReadAccessor(tree); } template -ReadAccessor createAccessor(const NanoRoot &root) +ReadAccessor createAccessor(__global__ const NanoRoot &root) { return ReadAccessor(root); } @@ -5424,52 +6539,59 @@ class GridMetaData // memory-layout of the data structure and the reasons why certain methods are safe // to call and others are not! 
using GridT = NanoGrid; - __hostdev__ const GridT& grid() const { return *reinterpret_cast(this); } + __hostdev__ __global__ const GridT& grid() const __global__ { return *reinterpret_cast<__global__ const GridT*>(this); } public: - __hostdev__ bool isValid() const { return this->grid().isValid(); } - __hostdev__ uint64_t gridSize() const { return this->grid().gridSize(); } - __hostdev__ uint32_t gridIndex() const { return this->grid().gridIndex(); } - __hostdev__ uint32_t gridCount() const { return this->grid().gridCount(); } - __hostdev__ const char* shortGridName() const { return this->grid().shortGridName(); } - __hostdev__ GridType gridType() const { return this->grid().gridType(); } - __hostdev__ GridClass gridClass() const { return this->grid().gridClass(); } - __hostdev__ bool isLevelSet() const { return this->grid().isLevelSet(); } - __hostdev__ bool isFogVolume() const { return this->grid().isFogVolume(); } - __hostdev__ bool isPointIndex() const { return this->grid().isPointIndex(); } - __hostdev__ bool isPointData() const { return this->grid().isPointData(); } - __hostdev__ bool isMask() const { return this->grid().isMask(); } - __hostdev__ bool isStaggered() const { return this->grid().isStaggered(); } - __hostdev__ bool isUnknown() const { return this->grid().isUnknown(); } - __hostdev__ const Map& map() const { return this->grid().map(); } - __hostdev__ const BBox& worldBBox() const { return this->grid().worldBBox(); } - __hostdev__ const BBox& indexBBox() const { return this->grid().indexBBox(); } - __hostdev__ Vec3R voxelSize() const { return this->grid().voxelSize(); } - __hostdev__ int blindDataCount() const { return this->grid().blindDataCount(); } - __hostdev__ const GridBlindMetaData& blindMetaData(uint32_t n) const { return this->grid().blindMetaData(n); } - __hostdev__ uint64_t activeVoxelCount() const { return this->grid().activeVoxelCount(); } - __hostdev__ const uint32_t& activeTileCount(uint32_t level) const { return 
this->grid().tree().activeTileCount(level); } - __hostdev__ uint32_t nodeCount(uint32_t level) const { return this->grid().tree().nodeCount(level); } - __hostdev__ uint64_t checksum() const { return this->grid().checksum(); } - __hostdev__ bool isEmpty() const { return this->grid().isEmpty(); } - __hostdev__ Version version() const { return this->grid().version(); } + __hostdev__ bool isValid() const __global__ { return this->grid().isValid(); } + __hostdev__ uint64_t gridSize() const __global__ { return this->grid().gridSize(); } + __hostdev__ uint32_t gridIndex() const __global__ { return this->grid().gridIndex(); } + __hostdev__ uint32_t gridCount() const __global__ { return this->grid().gridCount(); } + __hostdev__ __global__ const char* shortGridName() const __global__ { return this->grid().shortGridName(); } + __hostdev__ GridType gridType() const __global__ { return this->grid().gridType(); } + __hostdev__ GridClass gridClass() const __global__ { return this->grid().gridClass(); } + __hostdev__ bool isLevelSet() const __global__ { return this->grid().isLevelSet(); } + __hostdev__ bool isFogVolume() const __global__ { return this->grid().isFogVolume(); } + __hostdev__ bool isPointIndex() const __global__ { return this->grid().isPointIndex(); } + __hostdev__ bool isPointData() const __global__ { return this->grid().isPointData(); } + __hostdev__ bool isMask() const __global__ { return this->grid().isMask(); } + __hostdev__ bool isStaggered() const __global__ { return this->grid().isStaggered(); } + __hostdev__ bool isUnknown() const __global__ { return this->grid().isUnknown(); } + __hostdev__ __global__ const Map& map() const __global__ { return this->grid().map(); } + __hostdev__ __global__ const BBox& worldBBox() const __global__ { return this->grid().worldBBox(); } + __hostdev__ __global__ const BBox& indexBBox() const __global__ { return this->grid().indexBBox(); } + __hostdev__ Vec3R voxelSize() const __global__ { return this->grid().voxelSize(); } + 
__hostdev__ int blindDataCount() const __global__ { return this->grid().blindDataCount(); } + __hostdev__ __global__ const GridBlindMetaData& blindMetaData(uint32_t n) const __global__ { return this->grid().blindMetaData(n); } + __hostdev__ uint64_t activeVoxelCount() const __global__ { return this->grid().activeVoxelCount(); } + __hostdev__ __global__ const uint32_t& activeTileCount(uint32_t level) const __global__ { return this->grid().tree().activeTileCount(level); } + __hostdev__ uint32_t nodeCount(uint32_t level) const __global__ { return this->grid().tree().nodeCount(level); } + __hostdev__ uint64_t checksum() const __global__ { return this->grid().checksum(); } + __hostdev__ bool isEmpty() const __global__ { return this->grid().isEmpty(); } + __hostdev__ Version version() const __global__ { return this->grid().version(); } }; // GridMetaData /// @brief Class to access points at a specific voxel location template -class PointAccessor : public DefaultReadAccessor +class PointAccessor +#if !defined(__KERNEL_METAL__) + : public DefaultReadAccessor +#endif { +#if defined(__KERNEL_METAL__) + DefaultReadAccessor AccT; +#else using AccT = DefaultReadAccessor; - const UInt32Grid* mGrid; - const AttT* mData; +#endif + const __global__ UInt32Grid* mGrid; + const __global__ AttT* mData; public: using LeafNodeType = typename NanoRoot::LeafNodeType; - PointAccessor(const UInt32Grid& grid) + PointAccessor(__global__ const UInt32Grid& grid) __local__ : AccT(grid.tree().root()) , mGrid(&grid) - , mData(reinterpret_cast(grid.blindData(0))) + , mData(reinterpret_cast<__global__ const AttT*>(grid.blindData(0))) { NANOVDB_ASSERT(grid.gridType() == GridType::UInt32); NANOVDB_ASSERT((grid.gridClass() == GridClass::PointIndex && is_same::value) || @@ -5478,7 +6600,7 @@ public: } /// @brief Return the total number of point in the grid and set the /// iterators to the complete range of points. 
- __hostdev__ uint64_t gridPoints(const AttT*& begin, const AttT*& end) const + __hostdev__ uint64_t gridPoints(__global__ const AttT*& begin, __global__ const AttT*& end) const __global__ { const uint64_t count = mGrid->blindMetaData(0u).mElementCount; begin = mData; @@ -5488,9 +6610,9 @@ public: /// @brief Return the number of points in the leaf node containing the coordinate @a ijk. /// If this return value is larger than zero then the iterators @a begin and @a end /// will point to all the attributes contained within that leaf node. - __hostdev__ uint64_t leafPoints(const Coord& ijk, const AttT*& begin, const AttT*& end) const + __hostdev__ uint64_t leafPoints(__global__ const Coord& ijk, __global__ const AttT*& begin, __global__ const AttT*& end) const __global__ { - auto* leaf = this->probeLeaf(ijk); + __global__ auto* leaf = this->probeLeaf(ijk); if (leaf == nullptr) { return 0; } @@ -5500,14 +6622,14 @@ public: } /// @brief get iterators over offsets to points at a specific voxel location - __hostdev__ uint64_t voxelPoints(const Coord& ijk, const AttT*& begin, const AttT*& end) const + __hostdev__ uint64_t voxelPoints(__global__ const Coord& ijk, __global__ const AttT*& begin, __global__ const AttT*& end) const __global__ { - auto* leaf = this->probeLeaf(ijk); + __global__ auto* leaf = this->probeLeaf(ijk); if (leaf == nullptr) return 0; const uint32_t offset = LeafNodeType::CoordToOffset(ijk); if (leaf->isActive(offset)) { - auto* p = mData + leaf->minimum(); + __global__ auto* p = mData + leaf->minimum(); begin = p + (offset == 0 ? 0 : leaf->getValue(offset - 1)); end = p + leaf->getValue(offset); return end - begin; @@ -5520,11 +6642,20 @@ public: /// /// @note The ChannelT template parameter can be either const and non-const. 
template -class ChannelAccessor : public DefaultReadAccessor +class ChannelAccessor +#if !defined (__KERNEL_METAL__) + : public DefaultReadAccessor +#endif { +#if defined (__KERNEL_METAL__) + DefaultReadAccessor BaseT; +#define BASE(v) BaseT.v +#else using BaseT = DefaultReadAccessor; - const IndexGrid &mGrid; - ChannelT *mChannel; +#define BASE(v) BaseT::v +#endif + __global__ const IndexGrid &mGrid; + __global__ ChannelT *mChannel; public: using ValueType = ChannelT; @@ -5533,7 +6664,7 @@ public: /// @brief Ctor from an IndexGrid and an integer ID of an internal channel /// that is assumed to exist as blind data in the IndexGrid. - __hostdev__ ChannelAccessor(const IndexGrid& grid, uint32_t channelID = 0u) + __hostdev__ ChannelAccessor(__global__ const IndexGrid& grid, uint32_t channelID = 0u) : BaseT(grid.tree().root()) , mGrid(grid) , mChannel(nullptr) @@ -5544,7 +6675,7 @@ public: } /// @brief Ctor from an IndexGrid and an external channel - __hostdev__ ChannelAccessor(const IndexGrid& grid, ChannelT *channelPtr) + __hostdev__ ChannelAccessor(__global__ const IndexGrid& grid, __global__ ChannelT *channelPtr) : BaseT(grid.tree().root()) , mGrid(grid) , mChannel(channelPtr) @@ -5555,19 +6686,19 @@ public: } /// @brief Return a const reference to the IndexGrid - __hostdev__ const IndexGrid &grid() const {return mGrid;} + __hostdev__ __global__ const IndexGrid &grid() const {return mGrid;} /// @brief Return a const reference to the tree of the IndexGrid - __hostdev__ const IndexTree &tree() const {return mGrid.tree();} + __hostdev__ __global__ const IndexTree &tree() const {return mGrid.tree();} /// @brief Return a vector of the axial voxel sizes - __hostdev__ const Vec3R& voxelSize() const { return mGrid.voxelSize(); } + __hostdev__ __global__ const Vec3R& voxelSize() const { return mGrid.voxelSize(); } /// @brief Return total number of values indexed by the IndexGrid - __hostdev__ const uint64_t& valueCount() const { return mGrid.valueCount(); } + __hostdev__ 
uint64_t valueCount() const { return mGrid.valueCount(); } /// @brief Change to an external channel - __hostdev__ void setChannel(ChannelT *channelPtr) + __hostdev__ void setChannel(__global__ ChannelT *channelPtr) { mChannel = channelPtr; NANOVDB_ASSERT(mChannel); @@ -5577,23 +6708,24 @@ public: /// in the IndexGrid. __hostdev__ void setChannel(uint32_t channelID) { - this->setChannel(reinterpret_cast(const_cast(mGrid.blindData(channelID)))); + this->setChannel(reinterpret_cast<__global__ ChannelT*>(const_cast<__global__ void*>(mGrid.blindData(channelID)))); } /// @brief Return the linear offset into a channel that maps to the specified coordinate - __hostdev__ uint64_t getIndex(const Coord& ijk) const {return BaseT::getValue(ijk);} - __hostdev__ uint64_t idx(int i, int j, int k) const {return BaseT::getValue(Coord(i,j,k));} + __hostdev__ uint64_t getIndex(__global__ const Coord& ijk) const {return BASE(getValue)(ijk);} + __hostdev__ uint64_t idx(int i, int j, int k) const {return BASE(getValue)(Coord(i,j,k));} /// @brief Return the value from a cached channel that maps to the specified coordinate - __hostdev__ ChannelT& getValue(const Coord& ijk) const {return mChannel[BaseT::getValue(ijk)];} - __hostdev__ ChannelT& operator()(const Coord& ijk) const {return this->getValue(ijk);} - __hostdev__ ChannelT& operator()(int i, int j, int k) const {return this->getValue(Coord(i,j,k));} + __hostdev__ __global__ ChannelT& getValue(__global__ const Coord& ijk) const {return mChannel[BASE(getValue)(ijk)];} + __hostdev__ __global__ ChannelT& operator()(__global__ const Coord& ijk) const {return this->getValue(ijk);} + __hostdev__ __global__ ChannelT& operator()(int i, int j, int k) const {return this->getValue(Coord(i,j,k));} /// @brief return the state and updates the value of the specified voxel - __hostdev__ bool probeValue(const CoordType& ijk, typename remove_const::type &v) const + using CoordType = DefaultReadAccessor::CoordType; + __hostdev__ bool 
probeValue(__global__ const CoordType& ijk, __global__ typename remove_const::type &v) const { uint64_t idx; - const bool isActive = BaseT::probeValue(ijk, idx); + const bool isActive = BASE(probeValue)(ijk, idx); v = mChannel[idx]; return isActive; } @@ -5601,7 +6733,7 @@ public: /// /// @note The template parameter can be either const or non-const template - __hostdev__ T& getValue(const Coord& ijk, T* channelPtr) const {return channelPtr[BaseT::getValue(ijk)];} + __hostdev__ __global__ T& getValue(__global__ const Coord& ijk, __global__ T* channelPtr) const {return channelPtr[BASE(getValue)(ijk)];} }; // ChannelAccessor @@ -5643,6 +6775,7 @@ namespace io { /// @throw std::invalid_argument if buffer does not point to a valid NanoVDB grid. /// /// @warning This is pretty ugly code that involves lots of pointer and bit manipulations - not for the faint of heart :) +#if !defined(__KERNEL_METAL__) template // StreamT class must support: "void write(char*, size_t)" void writeUncompressedGrid(StreamT &os, const void *buffer) { @@ -5768,7 +6901,7 @@ VecT readUncompressedGrids(const char *fileName, const typename Gri } return readUncompressedGrids(is, buffer); }// readUncompressedGrids - +#endif // #if !defined(__KERNEL_METAL__) } // namespace io #endif// if !defined(__CUDA_ARCH__) && !defined(__HIP__) diff --git a/nanovdb/nanovdb/util/SampleFromVoxels.h b/nanovdb/nanovdb/util/SampleFromVoxels.h index e779d66..e2f9283 100644 --- a/nanovdb/nanovdb/util/SampleFromVoxels.h +++ b/nanovdb/nanovdb/util/SampleFromVoxels.h @@ -1,983 +1,1120 @@ -// Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 - -////////////////////////////////////////////////////////////////////////// -/// -/// @file SampleFromVoxels.h -/// -/// @brief NearestNeighborSampler, TrilinearSampler, TriquadraticSampler and TricubicSampler -/// -/// @note These interpolators employ internal caching for better performance when used repeatedly -/// in the same voxel location, so try 
to reuse an instance of these classes more than once. -/// -/// @warning While all the interpolators defined below work with both scalars and vectors -/// values (e.g. float and Vec3) TrilinarSampler::zeroCrossing and -/// Trilinear::gradient will only compile with floating point value types. -/// -/// @author Ken Museth -/// -/////////////////////////////////////////////////////////////////////////// - -#ifndef NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED -#define NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED - -// Only define __hostdev__ when compiling as NVIDIA CUDA -#if defined(__CUDACC__) || defined(__HIP__) -#define __hostdev__ __host__ __device__ -#else -#include // for floor -#define __hostdev__ -#endif - -namespace nanovdb { - -// Forward declaration of sampler with specific polynomial orders -template -class SampleFromVoxels; - -/// @brief Factory free-function for a sampler of specific polynomial orders -/// -/// @details This allows for the compact syntax: -/// @code -/// auto acc = grid.getAccessor(); -/// auto smp = nanovdb::createSampler<1>( acc ); -/// @endcode -template -__hostdev__ SampleFromVoxels createSampler(const TreeOrAccT& acc) -{ - return SampleFromVoxels(acc); -} - -/// @brief Utility function that returns the Coord of the round-down of @a xyz -/// and redefined @xyz as the fractional part, ie xyz-in = return-value + xyz-out -template class Vec3T> -__hostdev__ inline CoordT Floor(Vec3T& xyz); - -/// @brief Template specialization of Floor for Vec3 -template class Vec3T> -__hostdev__ inline CoordT Floor(Vec3T& xyz) -{ - const float ijk[3] = {floorf(xyz[0]), floorf(xyz[1]), floorf(xyz[2])}; - xyz[0] -= ijk[0]; - xyz[1] -= ijk[1]; - xyz[2] -= ijk[2]; - return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); -} - -/// @brief Template specialization of Floor for Vec3 -template class Vec3T> -__hostdev__ inline CoordT Floor(Vec3T& xyz) -{ - const double ijk[3] = {floor(xyz[0]), floor(xyz[1]), floor(xyz[2])}; - xyz[0] -= ijk[0]; - 
xyz[1] -= ijk[1]; - xyz[2] -= ijk[2]; - return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); -} - -// ------------------------------> NearestNeighborSampler <-------------------------------------- - -/// @brief Nearest neighbor, i.e. zero order, interpolator with caching -template -class SampleFromVoxels -{ -public: - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - - static const int ORDER = 0; - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) - : mAcc(acc) - , mPos(CoordT::max()) - { - } - - __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } - - /// @note xyz is in index space space - template - inline __hostdev__ ValueT operator()(const Vec3T& xyz) const; - - inline __hostdev__ ValueT operator()(const CoordT& ijk) const; - -private: - const TreeOrAccT& mAcc; - mutable CoordT mPos; - mutable ValueT mVal; // private cache -}; // SampleFromVoxels - -/// @brief Nearest neighbor, i.e. 
zero order, interpolator without caching -template -class SampleFromVoxels -{ -public: - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - static const int ORDER = 0; - - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) - : mAcc(acc) - { - } - - __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } - - /// @note xyz is in index space space - template - inline __hostdev__ ValueT operator()(const Vec3T& xyz) const; - - inline __hostdev__ ValueT operator()(const CoordT& ijk) const { return mAcc.getValue(ijk);} - -private: - const TreeOrAccT& mAcc; -}; // SampleFromVoxels - -template -template -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const -{ - const CoordT ijk = Round(xyz); - if (ijk != mPos) { - mPos = ijk; - mVal = mAcc.getValue(mPos); - } - return mVal; -} - -template -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT& ijk) const -{ - if (ijk != mPos) { - mPos = ijk; - mVal = mAcc.getValue(mPos); - } - return mVal; -} - -template -template -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const Vec3T& xyz) const -{ - return mAcc.getValue(Round(xyz)); -} - -// ------------------------------> TrilinearSampler <-------------------------------------- - -/// @brief Tri-linear sampler, i.e. 
first order, interpolator -template -class TrilinearSampler -{ -protected: - const TreeOrAccT& mAcc; - -public: - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - static const int ORDER = 1; - - /// @brief Protected constructor from a Tree or ReadAccessor - __hostdev__ TrilinearSampler(const TreeOrAccT& acc) : mAcc(acc) {} - - __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } - - /// @brief Extract the stencil of 8 values - inline __hostdev__ void stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const; - - template class Vec3T> - static inline __hostdev__ ValueT sample(const Vec3T &uvw, const ValueT (&v)[2][2][2]); - - template class Vec3T> - static inline __hostdev__ Vec3T gradient(const Vec3T &uvw, const ValueT (&v)[2][2][2]); - - static inline __hostdev__ bool zeroCrossing(const ValueT (&v)[2][2][2]); -}; // TrilinearSamplerBase - -template -__hostdev__ void TrilinearSampler::stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const -{ - v[0][0][0] = mAcc.getValue(ijk); // i, j, k - - ijk[2] += 1; - v[0][0][1] = mAcc.getValue(ijk); // i, j, k + 1 - - ijk[1] += 1; - v[0][1][1] = mAcc.getValue(ijk); // i, j+1, k + 1 - - ijk[2] -= 1; - v[0][1][0] = mAcc.getValue(ijk); // i, j+1, k - - ijk[0] += 1; - ijk[1] -= 1; - v[1][0][0] = mAcc.getValue(ijk); // i+1, j, k - - ijk[2] += 1; - v[1][0][1] = mAcc.getValue(ijk); // i+1, j, k + 1 - - ijk[1] += 1; - v[1][1][1] = mAcc.getValue(ijk); // i+1, j+1, k + 1 - - ijk[2] -= 1; - v[1][1][0] = mAcc.getValue(ijk); // i+1, j+1, k -} - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType TrilinearSampler::sample(const Vec3T &uvw, const ValueT (&v)[2][2][2]) -{ -#if 0 - auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a - //auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b, fma(-w, a, a));};// = (1-w)*a + w*b -#else - auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); }; -#endif - return 
lerp(lerp(lerp(v[0][0][0], v[0][0][1], uvw[2]), lerp(v[0][1][0], v[0][1][1], uvw[2]), uvw[1]), - lerp(lerp(v[1][0][0], v[1][0][1], uvw[2]), lerp(v[1][1][0], v[1][1][1], uvw[2]), uvw[1]), - uvw[0]); -} - -template -template class Vec3T> -__hostdev__ Vec3T TrilinearSampler::gradient(const Vec3T &uvw, const ValueT (&v)[2][2][2]) -{ - static_assert(is_floating_point::value, "TrilinearSampler::gradient requires a floating-point type"); -#if 0 - auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a - //auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b, fma(-w, a, a));};// = (1-w)*a + w*b -#else - auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); }; -#endif - - ValueT D[4] = {v[0][0][1] - v[0][0][0], v[0][1][1] - v[0][1][0], v[1][0][1] - v[1][0][0], v[1][1][1] - v[1][1][0]}; - - // Z component - Vec3T grad(0, 0, lerp(lerp(D[0], D[1], uvw[1]), lerp(D[2], D[3], uvw[1]), uvw[0])); - - const ValueT w = ValueT(uvw[2]); - D[0] = v[0][0][0] + D[0] * w; - D[1] = v[0][1][0] + D[1] * w; - D[2] = v[1][0][0] + D[2] * w; - D[3] = v[1][1][0] + D[3] * w; - - // X component - grad[0] = lerp(D[2], D[3], uvw[1]) - lerp(D[0], D[1], uvw[1]); - - // Y component - grad[1] = lerp(D[1] - D[0], D[3] - D[2], uvw[0]); - - return grad; -} - -template -__hostdev__ bool TrilinearSampler::zeroCrossing(const ValueT (&v)[2][2][2]) -{ - static_assert(is_floating_point::value, "TrilinearSampler::zeroCrossing requires a floating-point type"); - const bool less = v[0][0][0] < ValueT(0); - return (less ^ (v[0][0][1] < ValueT(0))) || - (less ^ (v[0][1][1] < ValueT(0))) || - (less ^ (v[0][1][0] < ValueT(0))) || - (less ^ (v[1][0][0] < ValueT(0))) || - (less ^ (v[1][0][1] < ValueT(0))) || - (less ^ (v[1][1][1] < ValueT(0))) || - (less ^ (v[1][1][0] < ValueT(0))); -} - -/// @brief Template specialization that does not use caching of stencil points -template -class SampleFromVoxels : public TrilinearSampler -{ - using BaseT = 
TrilinearSampler; - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - -public: - - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc) {} - - /// @note xyz is in index space space - template class Vec3T> - inline __hostdev__ ValueT operator()(Vec3T xyz) const; - - /// @note ijk is in index space space - __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} - - /// @brief Return the gradient in index space. - /// - /// @warning Will only compile with floating point value types - template class Vec3T> - inline __hostdev__ Vec3T gradient(Vec3T xyz) const; - - /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. - /// - /// @warning Will only compile with floating point value types - template class Vec3T> - inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; - -}; // SampleFromVoxels - -/// @brief Template specialization with caching of stencil values -template -class SampleFromVoxels : public TrilinearSampler -{ - using BaseT = TrilinearSampler; - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - - mutable CoordT mPos; - mutable ValueT mVal[2][2][2]; - - template class Vec3T> - __hostdev__ void cache(Vec3T& xyz) const; -public: - - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc), mPos(CoordT::max()){} - - /// @note xyz is in index space space - template class Vec3T> - inline __hostdev__ ValueT operator()(Vec3T xyz) const; - - // @note ijk is in index space space - __hostdev__ ValueT operator()(const CoordT &ijk) const; - - /// @brief Return the gradient in index space. 
- /// - /// @warning Will only compile with floating point value types - template class Vec3T> - inline __hostdev__ Vec3T gradient(Vec3T xyz) const; - - /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. - /// - /// @warning Will only compile with floating point value types - template class Vec3T> - inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; - - /// @brief Return true if the cached tri-linear stencil has a zero crossing. - /// - /// @warning Will only compile with floating point value types - __hostdev__ bool zeroCrossing() const { return BaseT::zeroCrossing(mVal); } - -}; // SampleFromVoxels - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const -{ - this->cache(xyz); - return BaseT::sample(xyz, mVal); -} - -template -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT &ijk) const -{ - return ijk == mPos ? mVal[0][0][0] : BaseT::mAcc.getValue(ijk); -} - -template -template class Vec3T> -__hostdev__ Vec3T SampleFromVoxels::gradient(Vec3T xyz) const -{ - this->cache(xyz); - return BaseT::gradient(xyz, mVal); -} - -template -template class Vec3T> -__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const -{ - this->cache(xyz); - return BaseT::zeroCrossing(mVal); -} - -template -template class Vec3T> -__hostdev__ void SampleFromVoxels::cache(Vec3T& xyz) const -{ - CoordT ijk = Floor(xyz); - if (ijk != mPos) { - mPos = ijk; - BaseT::stencil(ijk, mVal); - } -} - -#if 0 - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const -{ - ValueT val[2][2][2]; - CoordT ijk = Floor(xyz); - BaseT::stencil(ijk, val); - return BaseT::sample(xyz, val); -} - -#else - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const -{ - auto lerp = [](ValueT a, ValueT b, RealT w) { return a + 
ValueT(w) * (b - a); }; - - CoordT coord = Floor(xyz); - - ValueT vx, vx1, vy, vy1, vz, vz1; - - vz = BaseT::mAcc.getValue(coord); - coord[2] += 1; - vz1 = BaseT::mAcc.getValue(coord); - vy = lerp(vz, vz1, xyz[2]); - - coord[1] += 1; - - vz1 = BaseT::mAcc.getValue(coord); - coord[2] -= 1; - vz = BaseT::mAcc.getValue(coord); - vy1 = lerp(vz, vz1, xyz[2]); - - vx = lerp(vy, vy1, xyz[1]); - - coord[0] += 1; - - vz = BaseT::mAcc.getValue(coord); - coord[2] += 1; - vz1 = BaseT::mAcc.getValue(coord); - vy1 = lerp(vz, vz1, xyz[2]); - - coord[1] -= 1; - - vz1 = BaseT::mAcc.getValue(coord); - coord[2] -= 1; - vz = BaseT::mAcc.getValue(coord); - vy = lerp(vz, vz1, xyz[2]); - - vx1 = lerp(vy, vy1, xyz[1]); - - return lerp(vx, vx1, xyz[0]); -} -#endif - - -template -template class Vec3T> -__hostdev__ inline Vec3T SampleFromVoxels::gradient(Vec3T xyz) const -{ - ValueT val[2][2][2]; - CoordT ijk = Floor(xyz); - BaseT::stencil(ijk, val); - return BaseT::gradient(xyz, val); -} - -template -template class Vec3T> -__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const -{ - ValueT val[2][2][2]; - CoordT ijk = Floor(xyz); - BaseT::stencil(ijk, val); - return BaseT::zeroCrossing(val); -} - -// ------------------------------> TriquadraticSampler <-------------------------------------- - -/// @brief Tri-quadratic sampler, i.e. 
second order, interpolator -template -class TriquadraticSampler -{ -protected: - const TreeOrAccT& mAcc; - -public: - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - static const int ORDER = 1; - - /// @brief Protected constructor from a Tree or ReadAccessor - __hostdev__ TriquadraticSampler(const TreeOrAccT& acc) : mAcc(acc) {} - - __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } - - /// @brief Extract the stencil of 27 values - inline __hostdev__ void stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const; - - template class Vec3T> - static inline __hostdev__ ValueT sample(const Vec3T &uvw, const ValueT (&v)[3][3][3]); - - static inline __hostdev__ bool zeroCrossing(const ValueT (&v)[3][3][3]); -}; // TriquadraticSamplerBase - -template -__hostdev__ void TriquadraticSampler::stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const -{ - CoordT p(ijk[0] - 1, 0, 0); - for (int dx = 0; dx < 3; ++dx, ++p[0]) { - p[1] = ijk[1] - 1; - for (int dy = 0; dy < 3; ++dy, ++p[1]) { - p[2] = ijk[2] - 1; - for (int dz = 0; dz < 3; ++dz, ++p[2]) { - v[dx][dy][dz] = mAcc.getValue(p);// extract the stencil of 27 values - } - } - } -} - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType TriquadraticSampler::sample(const Vec3T &uvw, const ValueT (&v)[3][3][3]) -{ - auto kernel = [](const ValueT* value, double weight)->ValueT { - return weight * (weight * (0.5f * (value[0] + value[2]) - value[1]) + - 0.5f * (value[2] - value[0])) + value[1]; - }; - - ValueT vx[3]; - for (int dx = 0; dx < 3; ++dx) { - ValueT vy[3]; - for (int dy = 0; dy < 3; ++dy) { - vy[dy] = kernel(&v[dx][dy][0], uvw[2]); - }//loop over y - vx[dx] = kernel(vy, uvw[1]); - }//loop over x - return kernel(vx, uvw[0]); -} - -template -__hostdev__ bool TriquadraticSampler::zeroCrossing(const ValueT (&v)[3][3][3]) -{ - static_assert(is_floating_point::value, "TrilinearSampler::zeroCrossing requires a floating-point type"); - const bool 
less = v[0][0][0] < ValueT(0); - for (int dx = 0; dx < 3; ++dx) { - for (int dy = 0; dy < 3; ++dy) { - for (int dz = 0; dz < 3; ++dz) { - if (less ^ (v[dx][dy][dz] < ValueT(0))) return true; - } - } - } - return false; -} - -/// @brief Template specialization that does not use caching of stencil points -template -class SampleFromVoxels : public TriquadraticSampler -{ - using BaseT = TriquadraticSampler; - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; -public: - - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc) {} - - /// @note xyz is in index space space - template class Vec3T> - inline __hostdev__ ValueT operator()(Vec3T xyz) const; - - __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} - - /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. - /// - /// @warning Will only compile with floating point value types - template class Vec3T> - inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; - -}; // SampleFromVoxels - -/// @brief Template specialization with caching of stencil values -template -class SampleFromVoxels : public TriquadraticSampler -{ - using BaseT = TriquadraticSampler; - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - - mutable CoordT mPos; - mutable ValueT mVal[3][3][3]; - - template class Vec3T> - __hostdev__ void cache(Vec3T& xyz) const; -public: - - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) : BaseT(acc), mPos(CoordT::max()){} - - /// @note xyz is in index space space - template class Vec3T> - inline __hostdev__ ValueT operator()(Vec3T xyz) const; - - inline __hostdev__ ValueT operator()(const CoordT &ijk) const; - - /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. 
- /// - /// @warning Will only compile with floating point value types - template class Vec3T> - inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; - - /// @brief Return true if the cached tri-linear stencil has a zero crossing. - /// - /// @warning Will only compile with floating point value types - __hostdev__ bool zeroCrossing() const { return BaseT::zeroCrossing(mVal); } - -}; // SampleFromVoxels - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const -{ - this->cache(xyz); - return BaseT::sample(xyz, mVal); -} - -template -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(const CoordT &ijk) const -{ - return ijk == mPos ? mVal[1][1][1] : BaseT::mAcc.getValue(ijk); -} - -template -template class Vec3T> -__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const -{ - this->cache(xyz); - return BaseT::zeroCrossing(mVal); -} - -template -template class Vec3T> -__hostdev__ void SampleFromVoxels::cache(Vec3T& xyz) const -{ - CoordT ijk = Floor(xyz); - if (ijk != mPos) { - mPos = ijk; - BaseT::stencil(ijk, mVal); - } -} - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const -{ - ValueT val[3][3][3]; - CoordT ijk = Floor(xyz); - BaseT::stencil(ijk, val); - return BaseT::sample(xyz, val); -} - -template -template class Vec3T> -__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const -{ - ValueT val[3][3][3]; - CoordT ijk = Floor(xyz); - BaseT::stencil(ijk, val); - return BaseT::zeroCrossing(val); -} - -// ------------------------------> TricubicSampler <-------------------------------------- - -/// @brief Tri-cubic sampler, i.e. third order, interpolator. -/// -/// @details See the following paper for implementation details: -/// Lekien, F. and Marsden, J.: Tricubic interpolation in three dimensions. -/// In: International Journal for Numerical Methods -/// in Engineering (2005), No. 63, p. 
455-471 - -template -class TricubicSampler -{ -protected: - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - - const TreeOrAccT& mAcc; - -public: - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ TricubicSampler(const TreeOrAccT& acc) - : mAcc(acc) - { - } - - __hostdev__ const TreeOrAccT& accessor() const { return mAcc; } - - /// @brief Extract the stencil of 8 values - inline __hostdev__ void stencil(const CoordT& ijk, ValueT (&c)[64]) const; - - template class Vec3T> - static inline __hostdev__ ValueT sample(const Vec3T &uvw, const ValueT (&c)[64]); -}; // TricubicSampler - -template -__hostdev__ void TricubicSampler::stencil(const CoordT& ijk, ValueT (&C)[64]) const -{ - auto fetch = [&](int i, int j, int k) -> ValueT& { return C[((i + 1) << 4) + ((j + 1) << 2) + k + 1]; }; - - // fetch 64 point stencil values - for (int i = -1; i < 3; ++i) { - for (int j = -1; j < 3; ++j) { - fetch(i, j, -1) = mAcc.getValue(ijk + CoordT(i, j, -1)); - fetch(i, j, 0) = mAcc.getValue(ijk + CoordT(i, j, 0)); - fetch(i, j, 1) = mAcc.getValue(ijk + CoordT(i, j, 1)); - fetch(i, j, 2) = mAcc.getValue(ijk + CoordT(i, j, 2)); - } - } - const ValueT half(0.5), quarter(0.25), eighth(0.125); - const ValueT X[64] = {// values of f(x,y,z) at the 8 corners (each from 1 stencil value). - fetch(0, 0, 0), - fetch(1, 0, 0), - fetch(0, 1, 0), - fetch(1, 1, 0), - fetch(0, 0, 1), - fetch(1, 0, 1), - fetch(0, 1, 1), - fetch(1, 1, 1), - // values of df/dx at the 8 corners (each from 2 stencil values). - half * (fetch(1, 0, 0) - fetch(-1, 0, 0)), - half * (fetch(2, 0, 0) - fetch(0, 0, 0)), - half * (fetch(1, 1, 0) - fetch(-1, 1, 0)), - half * (fetch(2, 1, 0) - fetch(0, 1, 0)), - half * (fetch(1, 0, 1) - fetch(-1, 0, 1)), - half * (fetch(2, 0, 1) - fetch(0, 0, 1)), - half * (fetch(1, 1, 1) - fetch(-1, 1, 1)), - half * (fetch(2, 1, 1) - fetch(0, 1, 1)), - // values of df/dy at the 8 corners (each from 2 stencil values). 
- half * (fetch(0, 1, 0) - fetch(0, -1, 0)), - half * (fetch(1, 1, 0) - fetch(1, -1, 0)), - half * (fetch(0, 2, 0) - fetch(0, 0, 0)), - half * (fetch(1, 2, 0) - fetch(1, 0, 0)), - half * (fetch(0, 1, 1) - fetch(0, -1, 1)), - half * (fetch(1, 1, 1) - fetch(1, -1, 1)), - half * (fetch(0, 2, 1) - fetch(0, 0, 1)), - half * (fetch(1, 2, 1) - fetch(1, 0, 1)), - // values of df/dz at the 8 corners (each from 2 stencil values). - half * (fetch(0, 0, 1) - fetch(0, 0, -1)), - half * (fetch(1, 0, 1) - fetch(1, 0, -1)), - half * (fetch(0, 1, 1) - fetch(0, 1, -1)), - half * (fetch(1, 1, 1) - fetch(1, 1, -1)), - half * (fetch(0, 0, 2) - fetch(0, 0, 0)), - half * (fetch(1, 0, 2) - fetch(1, 0, 0)), - half * (fetch(0, 1, 2) - fetch(0, 1, 0)), - half * (fetch(1, 1, 2) - fetch(1, 1, 0)), - // values of d2f/dxdy at the 8 corners (each from 4 stencil values). - quarter * (fetch(1, 1, 0) - fetch(-1, 1, 0) - fetch(1, -1, 0) + fetch(-1, -1, 0)), - quarter * (fetch(2, 1, 0) - fetch(0, 1, 0) - fetch(2, -1, 0) + fetch(0, -1, 0)), - quarter * (fetch(1, 2, 0) - fetch(-1, 2, 0) - fetch(1, 0, 0) + fetch(-1, 0, 0)), - quarter * (fetch(2, 2, 0) - fetch(0, 2, 0) - fetch(2, 0, 0) + fetch(0, 0, 0)), - quarter * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, -1, 1) + fetch(-1, -1, 1)), - quarter * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, -1, 1) + fetch(0, -1, 1)), - quarter * (fetch(1, 2, 1) - fetch(-1, 2, 1) - fetch(1, 0, 1) + fetch(-1, 0, 1)), - quarter * (fetch(2, 2, 1) - fetch(0, 2, 1) - fetch(2, 0, 1) + fetch(0, 0, 1)), - // values of d2f/dxdz at the 8 corners (each from 4 stencil values). 
- quarter * (fetch(1, 0, 1) - fetch(-1, 0, 1) - fetch(1, 0, -1) + fetch(-1, 0, -1)), - quarter * (fetch(2, 0, 1) - fetch(0, 0, 1) - fetch(2, 0, -1) + fetch(0, 0, -1)), - quarter * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, 1, -1) + fetch(-1, 1, -1)), - quarter * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, 1, -1) + fetch(0, 1, -1)), - quarter * (fetch(1, 0, 2) - fetch(-1, 0, 2) - fetch(1, 0, 0) + fetch(-1, 0, 0)), - quarter * (fetch(2, 0, 2) - fetch(0, 0, 2) - fetch(2, 0, 0) + fetch(0, 0, 0)), - quarter * (fetch(1, 1, 2) - fetch(-1, 1, 2) - fetch(1, 1, 0) + fetch(-1, 1, 0)), - quarter * (fetch(2, 1, 2) - fetch(0, 1, 2) - fetch(2, 1, 0) + fetch(0, 1, 0)), - // values of d2f/dydz at the 8 corners (each from 4 stencil values). - quarter * (fetch(0, 1, 1) - fetch(0, -1, 1) - fetch(0, 1, -1) + fetch(0, -1, -1)), - quarter * (fetch(1, 1, 1) - fetch(1, -1, 1) - fetch(1, 1, -1) + fetch(1, -1, -1)), - quarter * (fetch(0, 2, 1) - fetch(0, 0, 1) - fetch(0, 2, -1) + fetch(0, 0, -1)), - quarter * (fetch(1, 2, 1) - fetch(1, 0, 1) - fetch(1, 2, -1) + fetch(1, 0, -1)), - quarter * (fetch(0, 1, 2) - fetch(0, -1, 2) - fetch(0, 1, 0) + fetch(0, -1, 0)), - quarter * (fetch(1, 1, 2) - fetch(1, -1, 2) - fetch(1, 1, 0) + fetch(1, -1, 0)), - quarter * (fetch(0, 2, 2) - fetch(0, 0, 2) - fetch(0, 2, 0) + fetch(0, 0, 0)), - quarter * (fetch(1, 2, 2) - fetch(1, 0, 2) - fetch(1, 2, 0) + fetch(1, 0, 0)), - // values of d3f/dxdydz at the 8 corners (each from 8 stencil values). 
- eighth * (fetch(1, 1, 1) - fetch(-1, 1, 1) - fetch(1, -1, 1) + fetch(-1, -1, 1) - fetch(1, 1, -1) + fetch(-1, 1, -1) + fetch(1, -1, -1) - fetch(-1, -1, -1)), - eighth * (fetch(2, 1, 1) - fetch(0, 1, 1) - fetch(2, -1, 1) + fetch(0, -1, 1) - fetch(2, 1, -1) + fetch(0, 1, -1) + fetch(2, -1, -1) - fetch(0, -1, -1)), - eighth * (fetch(1, 2, 1) - fetch(-1, 2, 1) - fetch(1, 0, 1) + fetch(-1, 0, 1) - fetch(1, 2, -1) + fetch(-1, 2, -1) + fetch(1, 0, -1) - fetch(-1, 0, -1)), - eighth * (fetch(2, 2, 1) - fetch(0, 2, 1) - fetch(2, 0, 1) + fetch(0, 0, 1) - fetch(2, 2, -1) + fetch(0, 2, -1) + fetch(2, 0, -1) - fetch(0, 0, -1)), - eighth * (fetch(1, 1, 2) - fetch(-1, 1, 2) - fetch(1, -1, 2) + fetch(-1, -1, 2) - fetch(1, 1, 0) + fetch(-1, 1, 0) + fetch(1, -1, 0) - fetch(-1, -1, 0)), - eighth * (fetch(2, 1, 2) - fetch(0, 1, 2) - fetch(2, -1, 2) + fetch(0, -1, 2) - fetch(2, 1, 0) + fetch(0, 1, 0) + fetch(2, -1, 0) - fetch(0, -1, 0)), - eighth * (fetch(1, 2, 2) - fetch(-1, 2, 2) - fetch(1, 0, 2) + fetch(-1, 0, 2) - fetch(1, 2, 0) + fetch(-1, 2, 0) + fetch(1, 0, 0) - fetch(-1, 0, 0)), - eighth * (fetch(2, 2, 2) - fetch(0, 2, 2) - fetch(2, 0, 2) + fetch(0, 0, 2) - fetch(2, 2, 0) + fetch(0, 2, 0) + fetch(2, 0, 0) - fetch(0, 0, 0))}; - - // 4Kb of static table (int8_t has a range of -127 -> 127 which suffices) - static const int8_t A[64][64] = { - {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-3, 3, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {2, -2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {9, -9, -9, 9, 0, 0, 0, 0, 6, 3, -6, -3, 0, 0, 0, 0, 6, -6, 3, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-6, 6, 6, -6, 0, 0, 0, 0, -3, -3, 3, 3, 0, 0, 0, 0, -4, 4, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-6, 6, 6, -6, 0, 0, 0, 0, -4, -2, 4, 2, 0, 0, 0, 0, -3, 3, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {4, -4, -4, 4, 0, 0, 0, 0, 2, 2, -2, -2, 0, 0, 0, 0, 2, -2, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -9, -9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, -6, -3, 0, 0, 0, 0, 6, -6, 3, -3, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, 3, 3, 0, 0, 0, 0, -4, 4, -2, 2, 0, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -2, 4, 2, 0, 0, 0, 0, -3, 3, -3, 3, 0, 0, 0, 0, -2, -1, -2, -1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4, -4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, -2, -2, 0, 0, 0, 0, 2, -2, 2, -2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0}, - {-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {9, -9, 0, 0, -9, 9, 0, 0, 6, 3, 0, 0, -6, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, -6, 0, 0, 3, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-6, 6, 0, 0, 6, -6, 0, 0, -3, -3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 4, 0, 0, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -9, 0, 0, -9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0, -6, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, -6, 0, 0, 3, -3, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 0, 0, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 4, 0, 0, -2, 2, 0, 0, -2, -2, 0, 0, -1, -1, 0, 0}, - {9, 0, -9, 0, -9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0, -6, 0, -3, 0, 6, 0, -6, 0, 3, 0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 9, 0, -9, 0, -9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0, -6, 0, -3, 0, 6, 0, -6, 0, 3, 0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0}, - {-27, 27, 27, -27, 27, -27, -27, 27, -18, -9, 18, 9, 18, 9, -18, -9, -18, 18, -9, 9, 18, -18, 9, -9, -18, 18, 18, -18, -9, 9, 9, -9, -12, -6, -6, -3, 12, 6, 6, 3, -12, -6, 12, 6, -6, -3, 6, 3, -12, 12, -6, 6, -6, 6, -3, 3, -8, -4, -4, -2, -4, -2, -2, -1}, - {18, 
-18, -18, 18, -18, 18, 18, -18, 9, 9, -9, -9, -9, -9, 9, 9, 12, -12, 6, -6, -12, 12, -6, 6, 12, -12, -12, 12, 6, -6, -6, 6, 6, 6, 3, 3, -6, -6, -3, -3, 6, 6, -6, -6, 3, 3, -3, -3, 8, -8, 4, -4, 4, -4, 2, -2, 4, 4, 2, 2, 2, 2, 1, 1}, - {-6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, -3, 0, 3, 0, 3, 0, -4, 0, 4, 0, -2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -2, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, -6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, -3, 0, 3, 0, 3, 0, -4, 0, 4, 0, -2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -2, 0, -1, 0, -1, 0}, - {18, -18, -18, 18, -18, 18, 18, -18, 12, 6, -12, -6, -12, -6, 12, 6, 9, -9, 9, -9, -9, 9, -9, 9, 12, -12, -12, 12, 6, -6, -6, 6, 6, 3, 6, 3, -6, -3, -6, -3, 8, 4, -8, -4, 4, 2, -4, -2, 6, -6, 6, -6, 3, -3, 3, -3, 4, 2, 4, 2, 2, 1, 2, 1}, - {-12, 12, 12, -12, 12, -12, -12, 12, -6, -6, 6, 6, 6, 6, -6, -6, -6, 6, -6, 6, 6, -6, 6, -6, -8, 8, 8, -8, -4, 4, 4, -4, -3, -3, -3, -3, 3, 3, 3, 3, -4, -4, 4, 4, -2, -2, 2, 2, -4, 4, -4, 4, -2, 2, -2, 2, -2, -2, -2, -2, -1, -1, -1, -1}, - {2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {-6, 6, 0, 0, 6, -6, 0, 0, -4, -2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {4, -4, 0, 0, -4, 4, 0, 0, 2, 2, 0, 0, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 0, 0, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -3, 3, 0, 0, -2, -1, 0, 0, -2, -1, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4, 0, 0, -4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 2, -2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}, - {-6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 0, -2, 0, 4, 0, 2, 0, -3, 0, 3, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, -6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 0, -2, 0, 4, 0, 2, 0, -3, 0, 3, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, -2, 0, -1, 0}, - {18, -18, -18, 18, -18, 18, 18, -18, 12, 6, -12, -6, -12, -6, 12, 6, 12, -12, 6, -6, -12, 12, -6, 6, 9, -9, -9, 9, 9, -9, -9, 9, 8, 4, 4, 2, -8, -4, -4, -2, 6, 3, -6, -3, 6, 3, -6, -3, 6, -6, 3, -3, 6, -6, 3, -3, 4, 2, 2, 1, 4, 2, 2, 1}, - {-12, 12, 12, -12, 12, -12, -12, 12, -6, -6, 6, 6, 6, 6, -6, -6, -8, 8, -4, 4, 8, -8, 4, -4, -6, 6, 6, -6, -6, 6, 6, -6, -4, -4, -2, -2, 4, 4, 2, 2, -3, -3, 3, 3, -3, -3, 3, 3, -4, 4, -2, 2, -4, 4, -2, 2, -2, -2, -1, -1, -2, -2, -1, -1}, - {4, 0, -4, 0, -4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, -2, 0, -2, 0, 2, 0, -2, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 4, 0, -4, 0, -4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, -2, 0, -2, 0, 2, 0, -2, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
1, 0, 1, 0, 1, 0, 1, 0}, - {-12, 12, 12, -12, 12, -12, -12, 12, -8, -4, 8, 4, 8, 4, -8, -4, -6, 6, -6, 6, 6, -6, 6, -6, -6, 6, 6, -6, -6, 6, 6, -6, -4, -2, -4, -2, 4, 2, 4, 2, -4, -2, 4, 2, -4, -2, 4, 2, -3, 3, -3, 3, -3, 3, -3, 3, -2, -1, -2, -1, -2, -1, -2, -1}, - {8, -8, -8, 8, -8, 8, 8, -8, 4, 4, -4, -4, -4, -4, 4, 4, 4, -4, 4, -4, -4, 4, -4, 4, 4, -4, -4, 4, 4, -4, -4, 4, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, -2, -2, 2, 2, -2, -2, 2, -2, 2, -2, 2, -2, 2, -2, 1, 1, 1, 1, 1, 1, 1, 1}}; - - for (int i = 0; i < 64; ++i) { // C = A * X - C[i] = ValueT(0); -#if 0 - for (int j = 0; j < 64; j += 4) { - C[i] = fma(A[i][j], X[j], fma(A[i][j+1], X[j+1], fma(A[i][j+2], X[j+2], fma(A[i][j+3], X[j+3], C[i])))); - } -#else - for (int j = 0; j < 64; j += 4) { - C[i] += A[i][j] * X[j] + A[i][j + 1] * X[j + 1] + A[i][j + 2] * X[j + 2] + A[i][j + 3] * X[j + 3]; - } -#endif - } -} - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType TricubicSampler::sample(const Vec3T &xyz, const ValueT (&C)[64]) -{ - ValueT zPow(1), sum(0); - for (int k = 0, n = 0; k < 4; ++k) { - ValueT yPow(1); - for (int j = 0; j < 4; ++j, n += 4) { -#if 0 - sum = fma( yPow, zPow * fma(xyz[0], fma(xyz[0], fma(xyz[0], C[n + 3], C[n + 2]), C[n + 1]), C[n]), sum); -#else - sum += yPow * zPow * (C[n] + xyz[0] * (C[n + 1] + xyz[0] * (C[n + 2] + xyz[0] * C[n + 3]))); -#endif - yPow *= xyz[1]; - } - zPow *= xyz[2]; - } - return sum; -} - -template -class SampleFromVoxels : public TricubicSampler -{ - using BaseT = TricubicSampler; - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - - mutable CoordT mPos; - mutable ValueT mC[64]; - - template class Vec3T> - __hostdev__ void cache(Vec3T& xyz) const; - -public: - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) - : BaseT(acc) - { - } - - /// @note xyz is in index space space - template class Vec3T> - inline __hostdev__ ValueT operator()(Vec3T 
xyz) const; - - // @brief Return value at the coordinate @a ijk in index space space - __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} - -}; // SampleFromVoxels - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const -{ - this->cache(xyz); - return BaseT::sample(xyz, mC); -} - -template -template class Vec3T> -__hostdev__ void SampleFromVoxels::cache(Vec3T& xyz) const -{ - CoordT ijk = Floor(xyz); - if (ijk != mPos) { - mPos = ijk; - BaseT::stencil(ijk, mC); - } -} - -template -class SampleFromVoxels : public TricubicSampler -{ - using BaseT = TricubicSampler; - using ValueT = typename TreeOrAccT::ValueType; - using CoordT = typename TreeOrAccT::CoordType; - -public: - /// @brief Construction from a Tree or ReadAccessor - __hostdev__ SampleFromVoxels(const TreeOrAccT& acc) - : BaseT(acc) - { - } - - /// @note xyz is in index space space - template class Vec3T> - inline __hostdev__ ValueT operator()(Vec3T xyz) const; - - __hostdev__ ValueT operator()(const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} - -}; // SampleFromVoxels - -template -template class Vec3T> -__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const -{ - ValueT C[64]; - CoordT ijk = Floor(xyz); - BaseT::stencil(ijk, C); - return BaseT::sample(xyz, C); -} - -} // namespace nanovdb - -#endif // NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED +// SampleFromVoxels.h +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +////////////////////////////////////////////////////////////////////////// +/// +/// @file SampleFromVoxels.h +/// +/// @brief NearestNeighborSampler, TrilinearSampler, TriquadraticSampler and TricubicSampler +/// +/// @note These interpolators employ internal caching for better performance when used repeatedly +/// in the same voxel location, so try to reuse an instance of these classes more than 
once. +/// +/// @warning While all the interpolators defined below work with both scalars and vectors +/// values (e.g. float and Vec3) TrilinarSampler::zeroCrossing and +/// Trilinear::gradient will only compile with floating point value types. +/// +/// @author Ken Museth +/// +/////////////////////////////////////////////////////////////////////////// + +#ifndef NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED +#define NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED + +// Only define __hostdev__ when compiling as NVIDIA CUDA +#ifdef __CUDACC__ +#define __hostdev__ __host__ __device__ +#elif defined(__KERNEL_METAL__) +#else +#include // for floor +#define __hostdev__ +#endif + +namespace nanovdb { + +// Forward declaration of sampler with specific polynomial orders +template +class SampleFromVoxels; + +/// @brief Factory free-function for a sampler of specific polynomial orders +/// +/// @details This allows for the compact syntax: +/// @code +/// auto acc = grid.getAccessor(); +/// auto smp = nanovdb::createSampler<1>( acc ); +/// @endcode +template +__hostdev__ SampleFromVoxels createSampler(__global__ const TreeOrAccT& acc) +{ + return SampleFromVoxels(acc); +} + +/// @brief Utility function that returns the Coord of the round-down of @a xyz +/// and redefined @xyz as the fractional part, ie xyz-in = return-value + xyz-out +template class Vec3T> +__hostdev__ inline CoordT Floor(__global__ Vec3T& xyz); + +/// @brief Template specialization of Floor for Vec3 +template class Vec3T> +__hostdev__ inline CoordT Floor(__global__ Vec3T& xyz) +{ + const float ijk[3] = {floorf(xyz[0]), floorf(xyz[1]), floorf(xyz[2])}; + xyz[0] -= ijk[0]; + xyz[1] -= ijk[1]; + xyz[2] -= ijk[2]; + return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); +} + +/// @brief Template specialization of Floor for Vec3 +template class Vec3T> +__hostdev__ inline CoordT Floor(__global__ Vec3T& xyz) +{ + const double ijk[3] = {floor(xyz[0]), floor(xyz[1]), floor(xyz[2])}; + xyz[0] -= ijk[0]; + 
xyz[1] -= ijk[1]; + xyz[2] -= ijk[2]; + return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); +} + +#if defined(__KERNEL_METAL__) +/// @brief Template specialization of Floor for Vec3 +template class Vec3T> +__hostdev__ inline CoordT Floor(__local__ Vec3T& xyz) +{ + const float ijk[3] = {floorf(xyz[0]), floorf(xyz[1]), floorf(xyz[2])}; + xyz[0] -= ijk[0]; + xyz[1] -= ijk[1]; + xyz[2] -= ijk[2]; + return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); +} + +/// @brief Template specialization of Floor for Vec3 +template class Vec3T> +__hostdev__ inline CoordT Floor(__local__ Vec3T& xyz) +{ + const double ijk[3] = {floor(xyz[0]), floor(xyz[1]), floor(xyz[2])}; + xyz[0] -= ijk[0]; + xyz[1] -= ijk[1]; + xyz[2] -= ijk[2]; + return CoordT(int32_t(ijk[0]), int32_t(ijk[1]), int32_t(ijk[2])); +} +#endif + +// ------------------------------> NearestNeighborSampler <-------------------------------------- + +/// @brief Nearest neighbor, i.e. zero order, interpolator with caching +template +class SampleFromVoxels +{ +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + static __constant__ const int ORDER = 0; + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(__local__ const TreeOrAccT& acc) + : mAcc(acc) + , mPos(CoordT::max()) + { + } + + __hostdev__ __global__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @note xyz is in index space space + template + inline __hostdev__ ValueT operator()(__global__ const Vec3T& xyz) const __local__; +#if defined(__KERNEL_METAL__) + template + inline __hostdev__ ValueT operator()(__local__ const Vec3T& xyz) const __local__; +#endif + + inline __hostdev__ ValueT operator()(__global__ const CoordT& ijk) const __local__; + + inline __hostdev__ ValueT operator()() const; + +private: + __global__ const TreeOrAccT& mAcc; + mutable CoordT mPos; + mutable ValueT mVal; // private cache +}; // SampleFromVoxels + +/// @brief 
Nearest neighbor, i.e. zero order, interpolator without caching +template +class SampleFromVoxels +{ +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + static __constant__ const int ORDER = 0; + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(__local__ const TreeOrAccT& acc) + : mAcc(acc) + { + } + + __hostdev__ __global__ const TreeOrAccT& accessor() const __local__ { return mAcc; } + + /// @note xyz is in index space space + template + inline __hostdev__ ValueT operator()(__global__ const Vec3T& xyz) const __local__; +#if defined(__KERNEL_METAL__) + template + inline __hostdev__ ValueT operator()(__local__ const Vec3T& xyz) const __local__; +#endif + + inline __hostdev__ ValueT operator()(__global__ const CoordT& ijk) const __local__ { return mAcc.getValue(ijk);} + +private: + __local__ const TreeOrAccT& mAcc; +}; // SampleFromVoxels + +template +template +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(__global__ const Vec3T& xyz) const __local__ +{ + const CoordT ijk = Round(xyz); + if (ijk != mPos) { + mPos = ijk; + mVal = mAcc.getValue(mPos); + } + return mVal; +} +#if defined(__KERNEL_METAL__) +template +template +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(__local__ const Vec3T& xyz) const __local__ +{ + const CoordT ijk = Round(xyz); + if (ijk != mPos) { + mPos = ijk; + mVal = mAcc.getValue(mPos); + } + return mVal; +} +#endif + +template +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(__global__ const CoordT& ijk) const __local__ +{ + if (ijk != mPos) { + mPos = ijk; + mVal = mAcc.getValue(mPos); + } + return mVal; +} + +template +template +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(__global__ const Vec3T& xyz) const __local__ +{ + return mAcc.getValue(Round(xyz)); +} + +#if defined(__KERNEL_METAL__) +template +template +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(__local__ const Vec3T& 
xyz) const __local__ +{ + return mAcc.getValue(Round(xyz)); +} +#endif + +// ------------------------------> TrilinearSampler <-------------------------------------- + +/// @brief Tri-linear sampler, i.e. first order, interpolator +template +class TrilinearSampler +{ +#if defined(__KERNEL_METAL__) +public: +#else +protected: +#endif + __local__ const TreeOrAccT& mAcc; + +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + static __constant__ const int ORDER = 1; + + /// @brief Protected constructor from a Tree or ReadAccessor + __hostdev__ TrilinearSampler(__local__ const TreeOrAccT& acc) : mAcc(acc) {} + + __hostdev__ __global__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @brief Extract the stencil of 8 values + inline __hostdev__ void stencil(__global__ CoordT& ijk, __global__ ValueT (&v)[2][2][2]) const; + + template class Vec3T> + static inline __hostdev__ ValueT sample(__global__ const Vec3T &uvw, __global__ const ValueT (&v)[2][2][2]); + + template class Vec3T> + static inline __hostdev__ Vec3T gradient(__global__ const Vec3T &uvw, __global__ const ValueT (&v)[2][2][2]); + + static inline __hostdev__ bool zeroCrossing(__global__ const ValueT (&v)[2][2][2]); +}; // TrilinearSamplerBase + +template +void TrilinearSampler::stencil(__global__ CoordT& ijk, __global__ ValueT (&v)[2][2][2]) const +{ + v[0][0][0] = mAcc.getValue(ijk); // i, j, k + + ijk[2] += 1; + v[0][0][1] = mAcc.getValue(ijk); // i, j, k + 1 + + ijk[1] += 1; + v[0][1][1] = mAcc.getValue(ijk); // i, j+1, k + 1 + + ijk[2] -= 1; + v[0][1][0] = mAcc.getValue(ijk); // i, j+1, k + + ijk[0] += 1; + ijk[1] -= 1; + v[1][0][0] = mAcc.getValue(ijk); // i+1, j, k + + ijk[2] += 1; + v[1][0][1] = mAcc.getValue(ijk); // i+1, j, k + 1 + + ijk[1] += 1; + v[1][1][1] = mAcc.getValue(ijk); // i+1, j+1, k + 1 + + ijk[2] -= 1; + v[1][1][0] = mAcc.getValue(ijk); // i+1, j+1, k +} + +template +template class Vec3T> +typename TreeOrAccT::ValueType 
TrilinearSampler::sample(__global__ const Vec3T &uvw, __global__ const ValueT (&v)[2][2][2]) +{ +#if 0 + auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a + //auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b, fma(-w, a, a));};// = (1-w)*a + w*b +#else + struct Lerp { + static ValueT lerp(ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); } + }; +#endif + return Lerp::lerp(Lerp::lerp(Lerp::lerp(v[0][0][0], v[0][0][1], uvw[2]), Lerp::lerp(v[0][1][0], v[0][1][1], uvw[2]), uvw[1]), + Lerp::lerp(Lerp::lerp(v[1][0][0], v[1][0][1], uvw[2]), Lerp::lerp(v[1][1][0], v[1][1][1], uvw[2]), uvw[1]), + uvw[0]); +} + +template +template class Vec3T> +Vec3T TrilinearSampler::gradient(__global__ const Vec3T &uvw, __global__ const ValueT (&v)[2][2][2]) +{ + static_assert(is_floating_point::value, "TrilinearSampler::gradient requires a floating-point type"); +#if 0 + auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a + //auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b, fma(-w, a, a));};// = (1-w)*a + w*b +#else + struct Lerp { + static ValueT lerp(ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); } + }; +#endif + + ValueT D[4] = {v[0][0][1] - v[0][0][0], v[0][1][1] - v[0][1][0], v[1][0][1] - v[1][0][0], v[1][1][1] - v[1][1][0]}; + + // Z component + Vec3T grad(0, 0, Lerp::lerp(Lerp::lerp(D[0], D[1], uvw[1]), lerp(D[2], D[3], uvw[1]), uvw[0])); + + const ValueT w = ValueT(uvw[2]); + D[0] = v[0][0][0] + D[0] * w; + D[1] = v[0][1][0] + D[1] * w; + D[2] = v[1][0][0] + D[2] * w; + D[3] = v[1][1][0] + D[3] * w; + + // X component + grad[0] = Lerp::lerp(D[2], D[3], uvw[1]) - Lerp::lerp(D[0], D[1], uvw[1]); + + // Y component + grad[1] = Lerp::lerp(D[1] - D[0], D[3] - D[2], uvw[0]); + + return grad; +} + +template +bool TrilinearSampler::zeroCrossing(__global__ const ValueT (&v)[2][2][2]) +{ + static_assert(is_floating_point::value, "TrilinearSampler::zeroCrossing 
requires a floating-point type"); + const bool less = v[0][0][0] < ValueT(0); + return (less ^ (v[0][0][1] < ValueT(0))) || + (less ^ (v[0][1][1] < ValueT(0))) || + (less ^ (v[0][1][0] < ValueT(0))) || + (less ^ (v[1][0][0] < ValueT(0))) || + (less ^ (v[1][0][1] < ValueT(0))) || + (less ^ (v[1][1][1] < ValueT(0))) || + (less ^ (v[1][1][0] < ValueT(0))); +} + +/// @brief Template specialization that does not use caching of stencil points +template +class SampleFromVoxels +#if !defined(__KERNEL_METAL__) + : public TrilinearSampler +#endif +{ +#if defined(__KERNEL_METAL__) + + TrilinearSampler _base; +#define BASE(v) _base.v +#else +#define BASE(v) BaseT::v + +#endif + using BaseT = TrilinearSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + +public: + + /// @brief Construction from a Tree or ReadAccessor +#if defined(__KERNEL_METAL__) + __hostdev__ SampleFromVoxels(__local__ const TreeOrAccT& acc) : _base(acc) {} +#else + __hostdev__ SampleFromVoxels(__local__ const TreeOrAccT& acc) : BaseT(acc) {} +#endif + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + /// @note ijk is in index space space + __hostdev__ ValueT operator()(__global__ const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} + + /// @brief Return the gradient in index space. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ Vec3T gradient(Vec3T xyz) const; + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. 
+ /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + +}; // SampleFromVoxels + +/// @brief Template specialization with caching of stencil values +template +class SampleFromVoxels +#if !defined(__KERNEL_METAL__) + : public TrilinearSampler +#endif +{ +#if defined(__KERNEL_METAL__) + TrilinearSampler _base; +#endif + using BaseT = TrilinearSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + mutable CoordT mPos; + mutable ValueT mVal[2][2][2]; + + template class Vec3T> + __hostdev__ void cache(__global__ Vec3T& xyz) const; +public: + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(__local__ const TreeOrAccT& acc) : BaseT(acc), mPos(CoordT::max()){} + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + // @note ijk is in index space space + __hostdev__ ValueT operator()(__global__ const CoordT &ijk) const; + + /// @brief Return the gradient in index space. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ Vec3T gradient(Vec3T xyz) const; + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + + /// @brief Return true if the cached tri-linear stencil has a zero crossing. 
+ /// + /// @warning Will only compile with floating point value types + __hostdev__ bool zeroCrossing() const { return BaseT::zeroCrossing(mVal); } + +}; // SampleFromVoxels + +template +template class Vec3T> +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::sample(xyz, mVal); +} + +template +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(__global__ const CoordT &ijk) const +{ + return ijk == mPos ? mVal[0][0][0] : BaseT::mAcc.getValue(ijk); +} + +template +template class Vec3T> +Vec3T SampleFromVoxels::gradient(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::gradient(xyz, mVal); +} + +template +template class Vec3T> +__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::zeroCrossing(mVal); +} + +template +template class Vec3T> +void SampleFromVoxels::cache(__global__ Vec3T& xyz) const +{ + CoordT ijk = Floor(xyz); + if (ijk != mPos) { + mPos = ijk; + BaseT::stencil(ijk, mVal); + } +} + +#if 0 + +template +template class Vec3T> +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + ValueT val[2][2][2]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::sample(xyz, val); +} + +#else + +template +template class Vec3T> +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + struct Lerp { + static ValueT lerp(ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); } + }; + + CoordT coord = Floor(xyz); + + ValueT vx, vx1, vy, vy1, vz, vz1; + + vz = BASE(mAcc).getValue(coord); + coord[2] += 1; + vz1 = BASE(mAcc).getValue(coord); + vy = Lerp::lerp(vz, vz1, xyz[2]); + + coord[1] += 1; + + vz1 = BASE(mAcc).getValue(coord); + coord[2] -= 1; + vz = BASE(mAcc).getValue(coord); + vy1 = Lerp::lerp(vz, vz1, xyz[2]); + + vx = Lerp::lerp(vy, vy1, xyz[1]); + + coord[0] += 1; + + vz = BASE(mAcc).getValue(coord); + coord[2] += 1; + vz1 = BASE(mAcc).getValue(coord); + 
vy1 = Lerp::lerp(vz, vz1, xyz[2]); + + coord[1] -= 1; + + vz1 = BASE(mAcc).getValue(coord); + coord[2] -= 1; + vz = BASE(mAcc).getValue(coord); + vy = Lerp::lerp(vz, vz1, xyz[2]); + + vx1 = Lerp::lerp(vy, vy1, xyz[1]); + + return Lerp::lerp(vx, vx1, xyz[0]); +} +#endif + + +template +template class Vec3T> +inline Vec3T SampleFromVoxels::gradient(Vec3T xyz) const +{ + ValueT val[2][2][2]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::gradient(xyz, val); +} + +template +template class Vec3T> +bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + ValueT val[2][2][2]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::zeroCrossing(val); +} + +// ------------------------------> TriquadraticSampler <-------------------------------------- + +/// @brief Tri-quadratic sampler, i.e. second order, interpolator +template +class TriquadraticSampler +{ +protected: + __local__ const TreeOrAccT& mAcc; + +public: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + static __constant__ const int ORDER = 1; + + /// @brief Protected constructor from a Tree or ReadAccessor + __hostdev__ TriquadraticSampler(__local__ const TreeOrAccT& acc) : mAcc(acc) {} + + __hostdev__ __global__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @brief Extract the stencil of 27 values + inline __hostdev__ void stencil(__local__ const CoordT &ijk, __local__ ValueT (&v)[3][3][3]) const; + + template class Vec3T> + static inline __hostdev__ ValueT sample(__local__ const Vec3T &uvw, __local__ const ValueT (&v)[3][3][3]); + + static inline __hostdev__ bool zeroCrossing(__global__ const ValueT (&v)[3][3][3]); +}; // TriquadraticSamplerBase + +template +void TriquadraticSampler::stencil(__local__ const CoordT &ijk, __local__ ValueT (&v)[3][3][3]) const +{ + CoordT p(ijk[0] - 1, 0, 0); + for (int dx = 0; dx < 3; ++dx, ++p[0]) { + p[1] = ijk[1] - 1; + for (int dy = 0; dy < 3; ++dy, ++p[1]) { + p[2] = ijk[2] 
- 1; + for (int dz = 0; dz < 3; ++dz, ++p[2]) { + v[dx][dy][dz] = mAcc.getValue(p);// extract the stencil of 27 values + } + } + } +} + +template +template class Vec3T> +typename TreeOrAccT::ValueType TriquadraticSampler::sample(__local__ const Vec3T &uvw, __local__ const ValueT (&v)[3][3][3]) +{ + struct Kernel { + static ValueT _kernel(__local__ const ValueT* value, double weight) { + return weight * (weight * (0.5f * (value[0] + value[2]) - value[1]) + 0.5f * (value[2] - value[0])) + value[1]; + } + }; + + ValueT vx[3]; + for (int dx = 0; dx < 3; ++dx) { + ValueT vy[3]; + for (int dy = 0; dy < 3; ++dy) { + vy[dy] = Kernel::_kernel(&v[dx][dy][0], uvw[2]); + }//loop over y + vx[dx] = Kernel::_kernel(vy, uvw[1]); + }//loop over x + return Kernel::_kernel(vx, uvw[0]); +} + +template +bool TriquadraticSampler::zeroCrossing(__global__ const ValueT (&v)[3][3][3]) +{ + static_assert(is_floating_point::value, "TrilinearSampler::zeroCrossing requires a floating-point type"); + const bool less = v[0][0][0] < ValueT(0); + for (int dx = 0; dx < 3; ++dx) { + for (int dy = 0; dy < 3; ++dy) { + for (int dz = 0; dz < 3; ++dz) { + if (less ^ (v[dx][dy][dz] < ValueT(0))) return true; + } + } + } + return false; +} + +/// @brief Template specialization that does not use caching of stencil points +template +class SampleFromVoxels +#if !defined(__KERNEL_METAL__) + : public TriquadraticSampler +#endif +{ +#if defined(__KERNEL_METAL__) + TriquadraticSampler _base; +#define BASE(v) _base.v +#else +#define BASE(v) BaseT::v +#endif + using BaseT = TriquadraticSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; +public: + + /// @brief Construction from a Tree or ReadAccessor +#if defined(__KERNEL_METAL__) + __hostdev__ SampleFromVoxels(__local__ const TreeOrAccT& acc) : _base(acc) {} +#else + __hostdev__ SampleFromVoxels(__local__ const TreeOrAccT& acc) : BaseT(acc) {} +#endif + + /// @note xyz is in index space space + template class 
Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + __hostdev__ ValueT operator()(__global__ const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);} + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + +}; // SampleFromVoxels + +/// @brief Template specialization with caching of stencil values +template +class SampleFromVoxels +#if !defined(__KERNEL_METAL__) + : public TriquadraticSampler +#endif +{ +#if defined(__KERNEL_METAL__) + TriquadraticSampler _base; +#define BASE(v) _base.v +#else +#define BASE(v) BaseT::v +#endif + using BaseT = TriquadraticSampler; + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + mutable CoordT mPos; + mutable ValueT mVal[3][3][3]; + + template class Vec3T> + __hostdev__ void cache(__global__ Vec3T& xyz) const; +public: + + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ SampleFromVoxels(__local__ const TreeOrAccT& acc) : BaseT(acc), mPos(CoordT::max()){} + + /// @note xyz is in index space space + template class Vec3T> + inline __hostdev__ ValueT operator()(Vec3T xyz) const; + + inline __hostdev__ ValueT operator()(__global__ const CoordT &ijk) const; + + /// @brief Return true if the tr-linear stencil has a zero crossing at the specified index position. + /// + /// @warning Will only compile with floating point value types + template class Vec3T> + inline __hostdev__ bool zeroCrossing(Vec3T xyz) const; + + /// @brief Return true if the cached tri-linear stencil has a zero crossing. 
+ /// + /// @warning Will only compile with floating point value types + __hostdev__ bool zeroCrossing() const { return BaseT::zeroCrossing(mVal); } + +}; // SampleFromVoxels + +template +template class Vec3T> +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::sample(xyz, mVal); +} + +template +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(__global__ const CoordT &ijk) const +{ + return ijk == mPos ? mVal[1][1][1] : BaseT::mAcc.getValue(ijk); +} + +template +template class Vec3T> +__hostdev__ bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + this->cache(xyz); + return BaseT::zeroCrossing(mVal); +} + +template +template class Vec3T> +void SampleFromVoxels::cache(__global__ Vec3T& xyz) const +{ + CoordT ijk = Floor(xyz); + if (ijk != mPos) { + mPos = ijk; + BaseT::stencil(ijk, mVal); + } +} + +template +template class Vec3T> +typename TreeOrAccT::ValueType SampleFromVoxels::operator()(Vec3T xyz) const +{ + ValueT val[3][3][3]; + CoordT ijk = Floor(xyz); + BASE(stencil)(ijk, val); + return BaseT::sample(xyz, val); +} + +template +template class Vec3T> +bool SampleFromVoxels::zeroCrossing(Vec3T xyz) const +{ + ValueT val[3][3][3]; + CoordT ijk = Floor(xyz); + BaseT::stencil(ijk, val); + return BaseT::zeroCrossing(val); +} + +// ------------------------------> TricubicSampler <-------------------------------------- + +/// @brief Tri-cubic sampler, i.e. third order, interpolator. +/// +/// @details See the following paper for implementation details: +/// Lekien, F. and Marsden, J.: Tricubic interpolation in three dimensions. +/// In: International Journal for Numerical Methods +/// in Engineering (2005), No. 63, p. 
455-471 + +template +class TricubicSampler +{ +protected: + using ValueT = typename TreeOrAccT::ValueType; + using CoordT = typename TreeOrAccT::CoordType; + + __global__ const TreeOrAccT& mAcc; + +public: + /// @brief Construction from a Tree or ReadAccessor + __hostdev__ TricubicSampler(__global__ const TreeOrAccT& acc) + : mAcc(acc) + { + } + + __hostdev__ __global__ const TreeOrAccT& accessor() const { return mAcc; } + + /// @brief Extract the stencil of 8 values + inline __hostdev__ void stencil(__global__ const CoordT& ijk, __global__ ValueT (&c)[64]) const; + + template class Vec3T> + static inline __hostdev__ ValueT sample(__global__ const Vec3T &uvw, __global__ const ValueT (&c)[64]); +}; // TricubicSampler + +// 4Kb of static table (int8_t has a range of -127 -> 127 which suffices) +static __constant__ const int8_t TricubicSampler_A[64][64] = { + {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-3, 3, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {2, -2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {9, -9, -9, 9, 0, 0, 0, 0, 6, 3, -6, -3, 0, 0, 0, 0, 6, -6, 3, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 6, -6, 0, 0, 0, 0, -3, -3, 3, 3, 0, 0, 0, 0, -4, 4, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 6, -6, 0, 0, 0, 0, -4, -2, 4, 2, 0, 0, 0, 0, -3, 3, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {4, -4, -4, 4, 0, 0, 0, 0, 2, 2, -2, -2, 0, 0, 0, 0, 2, -2, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, -2, 0, -1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -9, -9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, -6, -3, 0, 0, 0, 0, 6, -6, 3, -3, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, 3, 3, 0, 0, 0, 0, -4, 4, -2, 2, 0, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -2, 4, 2, 0, 0, 0, 0, -3, 3, -3, 3, 0, 0, 0, 0, -2, -1, -2, -1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4, -4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, -2, -2, 0, 0, 0, 0, 2, -2, 2, -2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0}, + {-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {9, -9, 0, 0, -9, 9, 0, 0, 6, 3, 0, 0, -6, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, -6, 0, 0, 3, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 0, 0, 6, -6, 0, 0, -3, -3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 4, 0, 0, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, 0, 0, -1, -1, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, -1, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -9, 0, 0, -9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0, -6, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, -6, 0, 0, 3, -3, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 0, 0, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 4, 0, 0, -2, 2, 0, 0, -2, -2, 0, 0, -1, -1, 0, 0}, + {9, 0, -9, 0, -9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0, -6, 0, -3, 0, 6, 0, -6, 0, 3, 0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 9, 0, -9, 0, -9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0, -6, 0, -3, 0, 6, 0, -6, 0, 3, 0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0}, + {-27, 27, 27, -27, 27, -27, -27, 27, -18, -9, 18, 9, 18, 9, -18, -9, -18, 18, -9, 9, 18, -18, 9, -9, -18, 18, 18, -18, -9, 9, 9, -9, -12, -6, -6, -3, 12, 6, 6, 3, -12, -6, 12, 6, -6, -3, 6, 3, -12, 12, -6, 6, -6, 6, -3, 3, -8, -4, -4, -2, -4, -2, -2, -1}, + {18, -18, -18, 18, -18, 18, 18, -18, 9, 9, -9, -9, -9, -9, 9, 9, 12, -12, 6, -6, -12, 12, -6, 6, 12, -12, -12, 12, 6, -6, -6, 6, 6, 6, 3, 3, -6, -6, -3, -3, 6, 6, -6, -6, 3, 3, -3, -3, 8, -8, 4, -4, 4, -4, 2, -2, 4, 4, 2, 2, 2, 2, 1, 1}, + {-6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, -3, 0, 3, 0, 3, 0, -4, 0, 4, 0, -2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -2, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -6, 0, 6, 0, 
6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 0, -3, 0, 3, 0, 3, 0, -4, 0, 4, 0, -2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -2, 0, -1, 0, -1, 0}, + {18, -18, -18, 18, -18, 18, 18, -18, 12, 6, -12, -6, -12, -6, 12, 6, 9, -9, 9, -9, -9, 9, -9, 9, 12, -12, -12, 12, 6, -6, -6, 6, 6, 3, 6, 3, -6, -3, -6, -3, 8, 4, -8, -4, 4, 2, -4, -2, 6, -6, 6, -6, 3, -3, 3, -3, 4, 2, 4, 2, 2, 1, 2, 1}, + {-12, 12, 12, -12, 12, -12, -12, 12, -6, -6, 6, 6, 6, 6, -6, -6, -6, 6, -6, 6, 6, -6, 6, -6, -8, 8, 8, -8, -4, 4, 4, -4, -3, -3, -3, -3, 3, 3, 3, 3, -4, -4, 4, 4, -2, -2, 2, 2, -4, 4, -4, 4, -2, 2, -2, 2, -2, -2, -2, -2, -1, -1, -1, -1}, + {2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-6, 6, 0, 0, 6, -6, 0, 0, -4, -2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {4, -4, 0, 0, -4, 4, 0, 0, 2, 2, 0, 0, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -6, 6, 0, 0, 6, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 
0, 0, -3, 3, 0, 0, -2, -1, 0, 0, -2, -1, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4, 0, 0, -4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 2, -2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0}, + {-6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 0, -2, 0, 4, 0, 2, 0, -3, 0, 3, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, -6, 0, 6, 0, 6, 0, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 0, -2, 0, 4, 0, 2, 0, -3, 0, 3, 0, -3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, 0, -2, 0, -1, 0}, + {18, -18, -18, 18, -18, 18, 18, -18, 12, 6, -12, -6, -12, -6, 12, 6, 12, -12, 6, -6, -12, 12, -6, 6, 9, -9, -9, 9, 9, -9, -9, 9, 8, 4, 4, 2, -8, -4, -4, -2, 6, 3, -6, -3, 6, 3, -6, -3, 6, -6, 3, -3, 6, -6, 3, -3, 4, 2, 2, 1, 4, 2, 2, 1}, + {-12, 12, 12, -12, 12, -12, -12, 12, -6, -6, 6, 6, 6, 6, -6, -6, -8, 8, -4, 4, 8, -8, 4, -4, -6, 6, 6, -6, -6, 6, 6, -6, -4, -4, -2, -2, 4, 4, 2, 2, -3, -3, 3, 3, -3, -3, 3, 3, -4, 4, -2, 2, -4, 4, -2, 2, -2, -2, -1, -1, -2, -2, -1, -1}, + {4, 0, -4, 0, -4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, -2, 0, -2, 0, 2, 0, -2, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 4, 0, -4, 0, -4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, -2, 0, -2, 0, 2, 0, -2, 0, 2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0}, + {-12, 12, 12, -12, 12, -12, -12, 12, -8, -4, 8, 4, 8, 4, -8, -4, -6, 6, -6, 6, 6, -6, 6, -6, -6, 6, 6, -6, -6, 6, 6, -6, -4, -2, -4, -2, 4, 2, 4, 2, -4, -2, 4, 2, -4, -2, 4, 2, -3, 3, -3, 3, -3, 3, -3, 3, -2, -1, -2, -1, -2, -1, -2, -1}, + {8, -8, -8, 8, -8, 8, 8, -8, 4, 4, -4, -4, -4, -4, 4, 4, 4, -4, 4, -4, -4, 4, -4, 4, 4, -4, -4, 4, 4, -4, -4, 4, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, -2, -2, 2, 2, -2, -2, 2, -2, 2, -2, 2, -2, 2, -2, 1, 1, 1, 1, 1, 
1, 1, 1}};
+
+/// @brief Gather the 4x4x4 neighborhood around @a ijk and premultiply it with the
+///        tricubic coefficient matrix A, leaving the 64 interpolation coefficients in @a C.
+template<typename TreeOrAccT>
+void TricubicSampler<TreeOrAccT>::stencil(__global__ const CoordT& ijk, __global__ ValueT (&C)[64]) const
+{
+    // Metal has no C++ lambdas, so the (i,j,k)->flat-index accessor lives in a local struct
+    // bound to C by reference.
+    struct Fetch {
+        Fetch(__global__ ValueT (&_C)[64]):C(_C) {}
+        __global__ ValueT& fetch(int i, int j, int k) { return C[((i + 1) << 4) + ((j + 1) << 2) + k + 1]; }
+
+        __global__ ValueT (&C)[64];
+    };
+    Fetch f(C);
+
+    // fetch 64 point stencil values
+    // BUG FIX: fetch is a non-static member, so it must be called on the instance `f`
+    // (previously written as Fetch::fetch(...), which is ill-formed and left `f` unused here).
+    for (int i = -1; i < 3; ++i) {
+        for (int j = -1; j < 3; ++j) {
+            f.fetch(i, j, -1) = mAcc.getValue(ijk + CoordT(i, j, -1));
+            f.fetch(i, j, 0) = mAcc.getValue(ijk + CoordT(i, j, 0));
+            f.fetch(i, j, 1) = mAcc.getValue(ijk + CoordT(i, j, 1));
+            f.fetch(i, j, 2) = mAcc.getValue(ijk + CoordT(i, j, 2));
+        }
+    }
+    const ValueT _half(0.5), quarter(0.25), eighth(0.125);
+    const ValueT X[64] = {// values of f(x,y,z) at the 8 corners (each from 1 stencil value).
+                          f.fetch(0, 0, 0),
+                          f.fetch(1, 0, 0),
+                          f.fetch(0, 1, 0),
+                          f.fetch(1, 1, 0),
+                          f.fetch(0, 0, 1),
+                          f.fetch(1, 0, 1),
+                          f.fetch(0, 1, 1),
+                          f.fetch(1, 1, 1),
+                          // values of df/dx at the 8 corners (each from 2 stencil values).
+                          _half * (f.fetch(1, 0, 0) - f.fetch(-1, 0, 0)),
+                          _half * (f.fetch(2, 0, 0) - f.fetch(0, 0, 0)),
+                          _half * (f.fetch(1, 1, 0) - f.fetch(-1, 1, 0)),
+                          _half * (f.fetch(2, 1, 0) - f.fetch(0, 1, 0)),
+                          _half * (f.fetch(1, 0, 1) - f.fetch(-1, 0, 1)),
+                          _half * (f.fetch(2, 0, 1) - f.fetch(0, 0, 1)),
+                          _half * (f.fetch(1, 1, 1) - f.fetch(-1, 1, 1)),
+                          _half * (f.fetch(2, 1, 1) - f.fetch(0, 1, 1)),
+                          // values of df/dy at the 8 corners (each from 2 stencil values). 
+ _half * (f.fetch(0, 1, 0) - f.fetch(0, -1, 0)), + _half * (f.fetch(1, 1, 0) - f.fetch(1, -1, 0)), + _half * (f.fetch(0, 2, 0) - f.fetch(0, 0, 0)), + _half * (f.fetch(1, 2, 0) - f.fetch(1, 0, 0)), + _half * (f.fetch(0, 1, 1) - f.fetch(0, -1, 1)), + _half * (f.fetch(1, 1, 1) - f.fetch(1, -1, 1)), + _half * (f.fetch(0, 2, 1) - f.fetch(0, 0, 1)), + _half * (f.fetch(1, 2, 1) - f.fetch(1, 0, 1)), + // values of df/dz at the 8 corners (each from 2 stencil values). + _half * (f.fetch(0, 0, 1) - f.fetch(0, 0, -1)), + _half * (f.fetch(1, 0, 1) - f.fetch(1, 0, -1)), + _half * (f.fetch(0, 1, 1) - f.fetch(0, 1, -1)), + _half * (f.fetch(1, 1, 1) - f.fetch(1, 1, -1)), + _half * (f.fetch(0, 0, 2) - f.fetch(0, 0, 0)), + _half * (f.fetch(1, 0, 2) - f.fetch(1, 0, 0)), + _half * (f.fetch(0, 1, 2) - f.fetch(0, 1, 0)), + _half * (f.fetch(1, 1, 2) - f.fetch(1, 1, 0)), + // values of d2f/dxdy at the 8 corners (each from 4 stencil values). + quarter * (f.fetch(1, 1, 0) - f.fetch(-1, 1, 0) - f.fetch(1, -1, 0) + f.fetch(-1, -1, 0)), + quarter * (f.fetch(2, 1, 0) - f.fetch(0, 1, 0) - f.fetch(2, -1, 0) + f.fetch(0, -1, 0)), + quarter * (f.fetch(1, 2, 0) - f.fetch(-1, 2, 0) - f.fetch(1, 0, 0) + f.fetch(-1, 0, 0)), + quarter * (f.fetch(2, 2, 0) - f.fetch(0, 2, 0) - f.fetch(2, 0, 0) + f.fetch(0, 0, 0)), + quarter * (f.fetch(1, 1, 1) - f.fetch(-1, 1, 1) - f.fetch(1, -1, 1) + f.fetch(-1, -1, 1)), + quarter * (f.fetch(2, 1, 1) - f.fetch(0, 1, 1) - f.fetch(2, -1, 1) + f.fetch(0, -1, 1)), + quarter * (f.fetch(1, 2, 1) - f.fetch(-1, 2, 1) - f.fetch(1, 0, 1) + f.fetch(-1, 0, 1)), + quarter * (f.fetch(2, 2, 1) - f.fetch(0, 2, 1) - f.fetch(2, 0, 1) + f.fetch(0, 0, 1)), + // values of d2f/dxdz at the 8 corners (each from 4 stencil values). 
+ quarter * (f.fetch(1, 0, 1) - f.fetch(-1, 0, 1) - f.fetch(1, 0, -1) + f.fetch(-1, 0, -1)), + quarter * (f.fetch(2, 0, 1) - f.fetch(0, 0, 1) - f.fetch(2, 0, -1) + f.fetch(0, 0, -1)), + quarter * (f.fetch(1, 1, 1) - f.fetch(-1, 1, 1) - f.fetch(1, 1, -1) + f.fetch(-1, 1, -1)), + quarter * (f.fetch(2, 1, 1) - f.fetch(0, 1, 1) - f.fetch(2, 1, -1) + f.fetch(0, 1, -1)), + quarter * (f.fetch(1, 0, 2) - f.fetch(-1, 0, 2) - f.fetch(1, 0, 0) + f.fetch(-1, 0, 0)), + quarter * (f.fetch(2, 0, 2) - f.fetch(0, 0, 2) - f.fetch(2, 0, 0) + f.fetch(0, 0, 0)), + quarter * (f.fetch(1, 1, 2) - f.fetch(-1, 1, 2) - f.fetch(1, 1, 0) + f.fetch(-1, 1, 0)), + quarter * (f.fetch(2, 1, 2) - f.fetch(0, 1, 2) - f.fetch(2, 1, 0) + f.fetch(0, 1, 0)), + // values of d2f/dydz at the 8 corners (each from 4 stencil values). + quarter * (f.fetch(0, 1, 1) - f.fetch(0, -1, 1) - f.fetch(0, 1, -1) + f.fetch(0, -1, -1)), + quarter * (f.fetch(1, 1, 1) - f.fetch(1, -1, 1) - f.fetch(1, 1, -1) + f.fetch(1, -1, -1)), + quarter * (f.fetch(0, 2, 1) - f.fetch(0, 0, 1) - f.fetch(0, 2, -1) + f.fetch(0, 0, -1)), + quarter * (f.fetch(1, 2, 1) - f.fetch(1, 0, 1) - f.fetch(1, 2, -1) + f.fetch(1, 0, -1)), + quarter * (f.fetch(0, 1, 2) - f.fetch(0, -1, 2) - f.fetch(0, 1, 0) + f.fetch(0, -1, 0)), + quarter * (f.fetch(1, 1, 2) - f.fetch(1, -1, 2) - f.fetch(1, 1, 0) + f.fetch(1, -1, 0)), + quarter * (f.fetch(0, 2, 2) - f.fetch(0, 0, 2) - f.fetch(0, 2, 0) + f.fetch(0, 0, 0)), + quarter * (f.fetch(1, 2, 2) - f.fetch(1, 0, 2) - f.fetch(1, 2, 0) + f.fetch(1, 0, 0)), + // values of d3f/dxdydz at the 8 corners (each from 8 stencil values). 
+            eighth * (f.fetch(1, 1, 1) - f.fetch(-1, 1, 1) - f.fetch(1, -1, 1) + f.fetch(-1, -1, 1) - f.fetch(1, 1, -1) + f.fetch(-1, 1, -1) + f.fetch(1, -1, -1) - f.fetch(-1, -1, -1)),
+            eighth * (f.fetch(2, 1, 1) - f.fetch(0, 1, 1) - f.fetch(2, -1, 1) + f.fetch(0, -1, 1) - f.fetch(2, 1, -1) + f.fetch(0, 1, -1) + f.fetch(2, -1, -1) - f.fetch(0, -1, -1)),
+            eighth * (f.fetch(1, 2, 1) - f.fetch(-1, 2, 1) - f.fetch(1, 0, 1) + f.fetch(-1, 0, 1) - f.fetch(1, 2, -1) + f.fetch(-1, 2, -1) + f.fetch(1, 0, -1) - f.fetch(-1, 0, -1)),
+            eighth * (f.fetch(2, 2, 1) - f.fetch(0, 2, 1) - f.fetch(2, 0, 1) + f.fetch(0, 0, 1) - f.fetch(2, 2, -1) + f.fetch(0, 2, -1) + f.fetch(2, 0, -1) - f.fetch(0, 0, -1)),
+            eighth * (f.fetch(1, 1, 2) - f.fetch(-1, 1, 2) - f.fetch(1, -1, 2) + f.fetch(-1, -1, 2) - f.fetch(1, 1, 0) + f.fetch(-1, 1, 0) + f.fetch(1, -1, 0) - f.fetch(-1, -1, 0)),
+            eighth * (f.fetch(2, 1, 2) - f.fetch(0, 1, 2) - f.fetch(2, -1, 2) + f.fetch(0, -1, 2) - f.fetch(2, 1, 0) + f.fetch(0, 1, 0) + f.fetch(2, -1, 0) - f.fetch(0, -1, 0)),
+            eighth * (f.fetch(1, 2, 2) - f.fetch(-1, 2, 2) - f.fetch(1, 0, 2) + f.fetch(-1, 0, 2) - f.fetch(1, 2, 0) + f.fetch(-1, 2, 0) + f.fetch(1, 0, 0) - f.fetch(-1, 0, 0)),
+            eighth * (f.fetch(2, 2, 2) - f.fetch(0, 2, 2) - f.fetch(2, 0, 2) + f.fetch(0, 0, 2) - f.fetch(2, 2, 0) + f.fetch(0, 2, 0) + f.fetch(2, 0, 0) - f.fetch(0, 0, 0))};
+
+    for (int i = 0; i < 64; ++i) { // C = A * X
+        C[i] = ValueT(0);
+#if 0
+        for (int j = 0; j < 64; j += 4) {
+            C[i] = fma(A[i][j], X[j], fma(A[i][j+1], X[j+1], fma(A[i][j+2], X[j+2], fma(A[i][j+3], X[j+3], C[i]))));
+        }
+#else
+        for (int j = 0; j < 64; j += 4) {
+            C[i] += TricubicSampler_A[i][j] * X[j] + TricubicSampler_A[i][j + 1] * X[j + 1] +
+                    TricubicSampler_A[i][j + 2] * X[j + 2] + TricubicSampler_A[i][j + 3] * X[j + 3];
+        }
+#endif
+    }
+}
+
+template<typename TreeOrAccT>
+template<typename RealT, template<typename...> class Vec3T>
+__hostdev__ typename TreeOrAccT::ValueType TricubicSampler<TreeOrAccT>::sample(__global__ const Vec3T<RealT> &xyz, __global__ const ValueT (&C)[64])
+{
+    ValueT zPow(1), sum(0);
+    for (int k = 0, n = 0; k < 4; ++k) {
+        ValueT yPow(1);
+        for (int j = 0; j < 4; ++j, n += 4) {
+#if 0
+            sum = fma( yPow, zPow * fma(xyz[0], fma(xyz[0], fma(xyz[0], C[n + 3], C[n + 2]), C[n + 1]), C[n]), sum);
+#else
+            sum += yPow * zPow * (C[n] + xyz[0] * (C[n + 1] + xyz[0] * (C[n + 2] + xyz[0] * C[n + 3])));
+#endif
+            yPow *= xyz[1];
+        }
+        zPow *= xyz[2];
+    }
+    return sum;
+}
+
+template<typename TreeOrAccT>
+class SampleFromVoxels<TreeOrAccT, 3, true>
+#if !defined(__KERNEL_METAL__)
+    : public TricubicSampler<TreeOrAccT>
+#endif
+{
+#if defined(__KERNEL_METAL__)
+    TricubicSampler<TreeOrAccT> _base;
+#define BASE(v) _base.v
+#else
+#define BASE(v) BaseT::v
+#endif
+    using BaseT = TricubicSampler<TreeOrAccT>;
+    using ValueT = typename TreeOrAccT::ValueType;
+    using CoordT = typename TreeOrAccT::CoordType;
+
+    mutable CoordT mPos;
+    mutable ValueT mC[64];
+
+    template<typename RealT, template<typename...> class Vec3T>
+    __hostdev__ void cache(__global__ Vec3T<RealT>& xyz) const;
+
+public:
+    /// @brief Construction from a Tree or ReadAccessor
+    __hostdev__ SampleFromVoxels(__local__ const TreeOrAccT& acc)
+        : BaseT(acc)
+    {
+    }
+
+    /// @note xyz is in index space space
+    template<typename RealT, template<typename...> class Vec3T>
+    inline __hostdev__ ValueT operator()(Vec3T<RealT> xyz) const;
+
+    // @brief Return value at the coordinate @a ijk in index space space
+    __hostdev__ ValueT operator()(__global__ const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);}
+
+}; // SampleFromVoxels
+
+template<typename TreeOrAccT>
+template<typename RealT, template<typename...> class Vec3T>
+typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 3, true>::operator()(Vec3T<RealT> xyz) const
+{
+    this->cache(xyz);
+    return BaseT::sample(xyz, mC);
+}
+
+template<typename TreeOrAccT>
+template<typename RealT, template<typename...> class Vec3T>
+void SampleFromVoxels<TreeOrAccT, 3, true>::cache(__global__ Vec3T<RealT>& xyz) const
+{
+    CoordT ijk = Floor<CoordT>(xyz);
+    if (ijk != mPos) {
+        mPos = ijk;
+        BaseT::stencil(ijk, mC);
+    }
+}
+
+template<typename TreeOrAccT>
+class SampleFromVoxels<TreeOrAccT, 3, false>
+#if !defined(__KERNEL_METAL__)
+    : public TricubicSampler<TreeOrAccT>
+#endif
+{
+#if defined(__KERNEL_METAL__)
+    TricubicSampler<TreeOrAccT> _base;
+#define BASE(v) _base.v
+#else
+#define BASE(v) BaseT::v
+#endif
+    using BaseT = TricubicSampler<TreeOrAccT>;
+    using ValueT = typename TreeOrAccT::ValueType;
+    using CoordT = typename TreeOrAccT::CoordType;
+
+public:
+    /// @brief Construction from a Tree or ReadAccessor
+    __hostdev__ SampleFromVoxels(__local__ const TreeOrAccT& acc)
+        : BaseT(acc)
+    {
+    }
+
+    /// @note xyz is in index space space
+    template<typename RealT, template<typename...> class Vec3T>
+    inline __hostdev__ ValueT operator()(Vec3T<RealT> xyz) const;
+
+    __hostdev__ ValueT operator()(__global__ const CoordT &ijk) const {return BaseT::mAcc.getValue(ijk);}
+
+}; // SampleFromVoxels
+
+template<typename TreeOrAccT>
+template<typename RealT, template<typename...> class Vec3T>
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 3, false>::operator()(Vec3T<RealT> xyz) const
+{
+    ValueT C[64];
+    CoordT ijk = Floor<CoordT>(xyz);
+    BaseT::stencil(ijk, C);
+    return BaseT::sample(xyz, C);
+}
+
+} // namespace nanovdb
+
+#endif // NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED