debugging

4349abe1 · Chao Liu · e55cfe15 · 4349abe1 · 4349abe1 · 4349abe1
Commit 4349abe1 authored 6 years ago by Chao Liu
Hide whitespace changes
Inline Side-by-side

Showing

with 109 additions and 93 deletions
+109 -93
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp
@@ -225,9 +225,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
                                                GemmDataPerReadB);

        constexpr index_t in_block_space =
-            in_c_n1_b_n2_block_mem_desc.GetElementSpace(Number<max_align>{});
+            math::integer_least_multiple(in_c_n1_b_n2_block_mem_desc.GetElementSpace(), max_align);

-        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number<max_align>{});
+        constexpr index_t wei_block_space =
+            math::integer_least_multiple(wei_c_k_block_desc.GetElementSpace(), max_align);

        __shared__ Float p_in_block_double[2 * in_block_space];
        __shared__ Float p_wei_block_double[2 * wei_block_space];

--- a/composable_kernel/include/utility/config_amd.hpp.in
+++ b/composable_kernel/include/utility/config_amd.hpp.in
@@ -17,12 +17,6 @@ typedef float float2_t __attribute__((ext_vector_type(2)));
 typedef float float4_t __attribute__((ext_vector_type(4)));

 using index_t = uint32_t;
-
-__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
-{
-    d += s0 * s1;
-}
-
 } // namespace ck

 #endif
--- a/composable_kernel/include/utility/config_nvidia.hpp.in
+++ b/composable_kernel/include/utility/config_nvidia.hpp.in
@@ -21,34 +21,6 @@ using float2_t = float2;
 using float4_t = float4;

 using index_t = uint32_t;
-
-__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
-{
-    d += s0 * s1;
-}
-
-#if 0
-__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
-
-__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
-{
-    d += s0.x * s1.x + s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
-
-// TODO:: this interface is misleading, s0, s1 are actually int8x4
-//  need to make a better interface
-__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
-{
-    d = __dp4a(s0, s1, d);
-}
 #endif

 } // namespace ck

--- a/composable_kernel/include/utility/vector_type.hpp
+++ b/composable_kernel/include/utility/vector_type.hpp
@@ -64,6 +64,11 @@ struct vector_type<float, 4>
    }
 };

+__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
+{
+    d += s0 * s1;
+}
+
 } // namespace ck

 #endif
--- a/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
@@ -93,6 +93,42 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+#elif 1
+    // blocksize = 512
+    // give wrong result
+    constexpr index_t BlockSize = 512;
+
+    constexpr index_t BPerBlock = 32;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 8;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 1, 1, 4>;
+    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<8, 2, 32, 1>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
+
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    using WeiBlockCopySubLengths_E_K            = Sequence<2, 1>;
+    using WeiBlockCopyClusterLengths_E_K        = Sequence<4, 128>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 2;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #elif 1
    constexpr index_t BlockSize = 256;


--- a/driver/include/tensor.hpp
+++ b/driver/include/tensor.hpp
@@ -8,6 +8,8 @@
 #include <utility>
 #include <cassert>
 #include <iostream>
+#include "config.hpp"
+#include "common_header.hpp"

 template <class Range>
 std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
@@ -269,4 +271,46 @@ struct Tensor
    std::vector<T> mData;
 };

+// this is ugly, only for 4d
+template <class TConstTensorDesc>
+void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::cout)
+{
+    using namespace ck;
+
+    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
+
+    constexpr auto I0   = Number<0>{};
+    constexpr auto I1   = Number<1>{};
+    constexpr auto I2   = Number<2>{};
+    constexpr auto I3   = Number<3>{};
+    constexpr auto desc = TConstTensorDesc{};
+
+    os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", "
+       << desc.GetLength(I2) << ", " << desc.GetLength(I3) << "}, "
+       << "Strides: {" << desc.GetStride(I0) << ", " << desc.GetStride(I1) << ", "
+       << desc.GetStride(I2) << ", " << desc.GetStride(I3) << "}" << std::endl;
+}
+
+// this is ugly, only for 4d
+template <class TConstTensorDesc>
+auto make_TensorDescriptor(TConstTensorDesc)
+{
+    using namespace ck;
+
+    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
+
+    constexpr auto I0   = Number<0>{};
+    constexpr auto I1   = Number<1>{};
+    constexpr auto I2   = Number<2>{};
+    constexpr auto I3   = Number<3>{};
+    constexpr auto desc = TConstTensorDesc{};
+
+    std::initializer_list<index_t> lengths = {
+        desc.GetLength(I0), desc.GetLength(I1), desc.GetLength(I2), desc.GetLength(I3)};
+    std::initializer_list<index_t> strides = {
+        desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)};
+
+    return TensorDescriptor(lengths, strides);
+}
+
 #endif
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -6,6 +6,7 @@
 #include "config.hpp"
 #include "ConstantTensorDescriptor.hpp"
 #include "device.hpp"
+#include "tensor.hpp"
 #include "conv_common.hpp"
 #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
@@ -65,44 +66,6 @@ struct GeneratorTensor_Checkboard
    }
 };

-// this is ugly, only for 4d
-template <class TConstTensorDesc>
-void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::cout)
-{
-    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
-
-    constexpr auto I0   = Number<0>{};
-    constexpr auto I1   = Number<1>{};
-    constexpr auto I2   = Number<2>{};
-    constexpr auto I3   = Number<3>{};
-    constexpr auto desc = TConstTensorDesc{};
-
-    os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", "
-       << desc.GetLength(I2) << ", " << desc.GetLength(I3) << "}, "
-       << "Strides: {" << desc.GetStride(I0) << ", " << desc.GetStride(I1) << ", "
-       << desc.GetStride(I2) << ", " << desc.GetStride(I3) << "}" << std::endl;
-}
-
-// this is ugly, only for 4d
-template <class TConstTensorDesc>
-auto make_TensorDescriptor(TConstTensorDesc)
-{
-    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
-
-    constexpr auto I0   = Number<0>{};
-    constexpr auto I1   = Number<1>{};
-    constexpr auto I2   = Number<2>{};
-    constexpr auto I3   = Number<3>{};
-    constexpr auto desc = TConstTensorDesc{};
-
-    std::initializer_list<index_t> lengths = {
-        desc.GetLength(I0), desc.GetLength(I1), desc.GetLength(I2), desc.GetLength(I3)};
-    std::initializer_list<index_t> strides = {
-        desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)};
-
-    return TensorDescriptor(lengths, strides);
-}
-
 template <class TIn,
          class TWei,
          class TOut,
@@ -460,7 +423,7 @@ int main(int argc, char* argv[])
    constexpr index_t C  = 256;
    constexpr index_t HI = 28;
    constexpr index_t WI = 28;
-    constexpr index_t K  = 128;
+    constexpr index_t K  = 512;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

@@ -651,30 +614,31 @@ int main(int argc, char* argv[])
 #endif
    }

-#if 1
 #if 0
-    device_convolution_direct_v2_nchw_kcyx_nkhw
+    device_convolution_direct_v2_nchw_kcyx_nkhw(
+        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
+    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(
+        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
-    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
+    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(
+        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
-    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
+    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(
+        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
-    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
+    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
+        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 1
-    device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
-#endif
-    (in_nchw_desc,
-     in_nchw,
-     wei_kcyx_desc,
-     wei_kcyx,
-     out_nkhw_desc,
-     out_nkhw_device,
-     ConvStrides{},
-     ConvDilations{},
-     nrepeat);
-
+    device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(in_nchw_desc,
+                                                       in_nchw,
+                                                       wei_kcyx_desc,
+                                                       wei_kcyx,
+                                                       out_nkhw_desc,
+                                                       out_nkhw_device,
+                                                       ConvStrides{},
+                                                       ConvDilations{},
+                                                       nrepeat);
 #elif 0
    device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(in_nchw_desc,
                                                             in_nchw,