Commit 0fe6e258 authored by Chao Liu
Browse files

tweak gemm

Showing with 2392 additions and 335 deletions
+2392 -335
......@@ -31,7 +31,7 @@ __host__ __device__ constexpr auto make_left_pad_transform(
return LeftPad<LowLength, LeftPadLength, SkipIsValidCheck>{low_length, left_pad};
}
template <typename LowLength, typename RightPadLength, bool SkipIsValidCheck>
template <typename LowLength, typename RightPadLength, bool SkipIsValidCheck = false>
__host__ __device__ constexpr auto make_right_pad_transform(
const LowLength& low_length,
const RightPadLength& right_pad,
......
......@@ -29,7 +29,7 @@ __global__ void
FloatC* __restrict__ p_c_grid,
const AK0MK1GridDesc a_k0_m_k1_grid_desc,
const BK0NK1GridDesc b_k0_n_k1_grid_desc,
const CM0N0M1N1M2M3M4N2GridDesc c_m0_m1_m2_n_grid_desc,
const CM0N0M1N1M2M3M4N2GridDesc c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
const CBlockClusterAdaptor c_block_cluster_adaptor)
{
constexpr index_t shared_block_size =
......@@ -132,7 +132,9 @@ template <index_t BlockSize,
typename CGridStepHacks,
typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowStepHacks,
bool CAccessOrderMRepeatNRepeat>
bool CAccessOrderMRepeatNRepeat,
bool ABlockLdsExtraM,
bool BBlockLdsExtraN>
struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
{
static constexpr auto I0 = Number<0>{};
......@@ -151,14 +153,34 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
constexpr auto max_lds_align = K1;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
constexpr auto a_k0_m_k1_block_desc = [&]() {
if constexpr(ABlockLdsExtraM)
{
return make_naive_tensor_descriptor(
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1),
make_tuple(Number<MPerBlock + 1>{} * K1, K1, I1));
}
else
{
return make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
}
}();
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
constexpr auto b_k0_n_k1_block_desc = [&]() {
if constexpr(BBlockLdsExtraN)
{
return make_naive_tensor_descriptor(
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1),
make_tuple(Number<NPerBlock + 1>{} * K1, K1, I1));
}
else
{
return make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
}
}();
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_space_size =
......@@ -170,29 +192,45 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
return (a_block_space_size + b_block_space_size) * sizeof(FloatAB);
}
// block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
__host__ __device__ static constexpr bool
CheckValidity(const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
const CMNGridDesc& c_m_n_grid_desc)
const CMNGridDesc& c_m_n_grid_desc,
index_t M01,
index_t N01)
{
// TODO: turn on this
static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
"wrong! K1 need to be known at compile-time");
static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) &&
(NPerBlock % (NRepeat * NPerXDL)) == 0,
"Invalid tuning param!");
const auto M = a_k0_m_k1_grid_desc.GetLength(I1);
const auto N = b_k0_n_k1_grid_desc.GetLength(I1);
const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0);
static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) &&
(NPerBlock % (NRepeat * NPerXDL)) == 0,
"Invalid tuning param!");
if(!(M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
K0 == b_k0_n_k1_grid_desc.GetLength(I0) && K1 == a_k0_m_k1_grid_desc.GetLength(I2) &&
K1 == b_k0_n_k1_grid_desc.GetLength(I2)))
return false;
if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % KPerBlock == 0))
return false;
// check M01, N01
constexpr auto M1 = Number<MPerBlock>{};
constexpr auto N1 = Number<NPerBlock>{};
const auto M0 = M / M1;
const auto N0 = N / N1;
if(!(M0 % M01 == 0 && N0 % N01 == 0))
return false;
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
K0 == b_k0_n_k1_grid_desc.GetLength(I0) &&
K1 == a_k0_m_k1_grid_desc.GetLength(I2) &&
K1 == b_k0_n_k1_grid_desc.GetLength(I2)) &&
(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % KPerBlock == 0);
return true;
}
__host__ __device__ static constexpr index_t
......@@ -211,11 +249,35 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
{
constexpr auto max_lds_align = K1;
constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_k0_m_k1_block_desc = [&]() {
if constexpr(ABlockLdsExtraM)
{
return make_naive_tensor_descriptor(
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1),
make_tuple(Number<MPerBlock + 1>{} * K1, K1, I1));
}
else
{
return make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
}
}();
constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
// B matrix in LDS memory, dst of blockwise copy
constexpr auto b_k0_n_k1_block_desc = [&]() {
if constexpr(BBlockLdsExtraN)
{
return make_naive_tensor_descriptor(
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1),
make_tuple(Number<NPerBlock + 1>{} * K1, K1, I1));
}
else
{
return make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
}
}();
using BlockwiseGemm =
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
......@@ -231,8 +293,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
return BlockwiseGemm::MakeCM0N0M1N1M2M3M4N2GridDescriptor(c_m_n_grid_desc);
}
// return block_id to C matrix tile idx (m0, n0) mapping
__host__ __device__ static constexpr auto
MakeCBlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc)
MakeCBlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc, index_t M01, index_t N01)
{
const auto M = c_m_n_grid_desc.GetLength(I0);
const auto N = c_m_n_grid_desc.GetLength(I1);
......@@ -243,23 +306,31 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
const auto M0 = M / M1;
const auto N0 = N / N1;
#if 1
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(M0, N0))),
make_tuple(Sequence<0, 1>{}),
make_tuple(Sequence<0>{}));
#elif 1
const auto M00 = M0 / M01;
const auto N00 = N0 / N01;
const auto m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor =
make_single_stage_tensor_adaptor(
make_tuple(make_unmerge_transform(make_tuple(M00, M01)),
make_unmerge_transform(make_tuple(N00, N01))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{}));
const auto c_blockid_to_m00_m01_n00_n01_block_cluster_adaptor =
make_single_stage_tensor_adaptor(
make_tuple(make_merge_transform(make_tuple(M00, N00, M01, N01))),
make_tuple(Sequence<0, 1, 2, 3>{}),
make_tuple(Sequence<0>{}));
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(N0, M0))),
make_tuple(Sequence<1, 0>{}),
make_tuple(Sequence<0>{}));
#endif
chain_tensor_adaptors(m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor,
c_blockid_to_m00_m01_n00_n01_block_cluster_adaptor);
return c_blockid_to_m0_n0_block_cluster_adaptor;
}
using CM0N0M1N1M2M3M4N2GridDesc = decltype(MakeCM0N0M1N1M2M3M4N2GridDescriptor(CMNGridDesc{}));
using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}));
using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1));
__device__ static void Run(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
......@@ -294,14 +365,34 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
constexpr auto max_lds_align = K1;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
constexpr auto a_k0_m_k1_block_desc = [&]() {
if constexpr(ABlockLdsExtraM)
{
return make_naive_tensor_descriptor(
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1),
make_tuple(Number<MPerBlock + 1>{} * K1, K1, I1));
}
else
{
return make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
}
}();
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
constexpr auto b_k0_n_k1_block_desc = [&]() {
if constexpr(BBlockLdsExtraN)
{
return make_naive_tensor_descriptor(
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1),
make_tuple(Number<NPerBlock + 1>{} * K1, K1, I1));
}
else
{
return make_naive_tensor_descriptor_aligned(
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
}
}();
// A matrix blockwise copy
auto a_blockwise_copy =
......
// Host-side debug knobs for the xdlops v2r3 GEMM driver. M01/N01 control the
// block_id -> C-matrix tile index (m0, n0) mapping used when building the
// block-cluster adaptor; both default to 1 (plain row-major tile order).
// NOTE(review): `static` at namespace scope in a header gives every translation
// unit that includes this file its OWN copy of M01/N01 — a value set in one TU
// is not seen by another. Confirm the header is included from a single TU, or
// consider C++17 `inline` variables if shared state is intended.
// NOTE(review): the guard name "DEBUG_HPP" is very generic and risks colliding
// with another debug header elsewhere in the build.
#ifndef DEBUG_HPP
#define DEBUG_HPP
namespace debug_driver_gemm_xdlops_v2r3 {
// these vars are on host, they control block_id to C matrix tile idx (m0, n0) mapping
static ck::index_t M01 = 1;
static ck::index_t N01 = 1;
} // namespace debug_driver_gemm_xdlops_v2r3
#endif
......@@ -4,16 +4,8 @@
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename ABType,
typename AccType,
typename CType,
typename ADesc,
typename BDesc,
typename CDesc>
void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
const BDesc& b_k_n_grid_desc,
const CDesc& c_m_n_grid_desc,
const Tensor<ABType>& a_k_m,
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_km_kn_mn(const Tensor<ABType>& a_k_m,
const Tensor<ABType>& b_k_n,
Tensor<CType>& c_m_n,
ck::index_t nrepeat)
......@@ -22,9 +14,6 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
DeviceMem a_k_m_device_buf(sizeof(ABType) * a_k_m.mDesc.GetElementSpace());
DeviceMem b_k_n_device_buf(sizeof(ABType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_m_n_device_buf(sizeof(CType) * c_m_n.mDesc.GetElementSpace());
......@@ -62,7 +51,91 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
......@@ -89,8 +162,36 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 128;
......@@ -116,46 +217,101 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#endif
const auto K = a_k_m_grid_desc.GetLength(I0);
const auto M = a_k_m_grid_desc.GetLength(I1);
const auto N = b_k_n_grid_desc.GetLength(I1);
const auto K = a_k_m.mDesc.GetLengths()[0];
const auto M = a_k_m.mDesc.GetLengths()[1];
const auto N = b_k_n.mDesc.GetLengths()[1];
constexpr auto K1Number = Number<K1>{};
const auto K0 = K / K1Number;
const auto a_k0_m_k1_grid_desc =
transform_tensor_descriptor(a_k_m_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(M)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
make_tuple(K1Number * a_k_m.mDesc.GetStrides()[0],
a_k_m.mDesc.GetStrides()[1],
a_k_m.mDesc.GetStrides()[0]));
const auto b_k0_n_k1_grid_desc =
transform_tensor_descriptor(b_k_n_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
make_tuple(K1Number * b_k_n.mDesc.GetStrides()[0],
b_k_n.mDesc.GetStrides()[1],
b_k_n.mDesc.GetStrides()[0]));
const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
make_tuple(M, N), make_tuple(c_m_n.mDesc.GetStrides()[0], c_m_n.mDesc.GetStrides()[1]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto a_k0_m_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: M
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: M
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: N
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: N
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto a_k0_m_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: M
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: M
Sequence<0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: N
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: N
Sequence<0>{})); // 2-: K1
constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
......@@ -175,9 +331,9 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
for(index_t i = 0; i < 5; ++i)
{
......@@ -222,13 +378,17 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
false // CAccessOrderMRepeatNRepeat
false, // CAccessOrderMRepeatNRepeat
true, // ABlockLdsExtraM
true // BBlockLdsExtraN
>(static_cast<ABType*>(a_k_m_device_buf.GetDeviceBuffer()),
static_cast<ABType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CType*>(c_m_n_device_buf.GetDeviceBuffer()),
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m_n_grid_desc,
debug_driver_gemm_xdlops_v2r3::M01,
debug_driver_gemm_xdlops_v2r3::N01,
a_k0_m_k1_grid_step_hacks,
b_k0_n_k1_grid_step_hacks,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
// Host driver: runs the xdlops v2r3 GEMM with A stored K-major as a_k_m (K x M),
// B stored K-major as b_k_n (K x N), and the result written through an (M, N)
// descriptor whose strides are taken from the transposed output tensor c_n_m
// (see the c_m_n_grid_desc construction below). Copies inputs to device,
// launches the GEMM 5 times (each timed over `nrepeat` repeats by
// driver_gemm_xdlops_v2r3), prints average time and TFlop/s, then copies the
// result back into c_n_m.
//
// ABType  - element type of A and B (e.g. fp16/fp32)
// AccType - accumulator type used by the blockwise GEMM
// CType   - element type of C
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_km_kn_nm(const Tensor<ABType>& a_k_m,
const Tensor<ABType>& b_k_n,
Tensor<CType>& c_n_m,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
// Allocate device buffers sized by element *space* (covers padded/strided
// layouts, not just element count).
DeviceMem a_k_m_device_buf(sizeof(ABType) * a_k_m.mDesc.GetElementSpace());
DeviceMem b_k_n_device_buf(sizeof(ABType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_n_m_device_buf(sizeof(CType) * c_n_m.mDesc.GetElementSpace());
a_k_m_device_buf.ToDevice(a_k_m.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
// C is also uploaded; with InMemoryDataOperationEnum_t::Set below the kernel
// overwrites it, so this upload only matters for untouched padding regions.
c_n_m_device_buf.ToDevice(c_n_m.mData.data());
// Compile-time tuning-parameter selection: exactly one branch of this
// #if/#elif chain is active and defines the block/XDL tile shape and the
// blockwise-copy thread layouts used by the kernel instantiation below.
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
// ACTIVE configuration (first true branch):
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
// NOTE(review): unreachable — the `#elif 1` above is always taken first, so
// this [128, 128, 4, 8] fp16 configuration is dead unless the branch above
// is flipped to 0.
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#endif
// Problem sizes from the host tensor descriptors: A is (K, M), B is (K, N).
const auto K = a_k_m.mDesc.GetLengths()[0];
const auto M = a_k_m.mDesc.GetLengths()[1];
const auto N = b_k_n.mDesc.GetLengths()[1];
constexpr auto K1Number = Number<K1>{};
// Split K into (K0, K1). NOTE(review): no check that K % K1 == 0 here —
// presumably validated elsewhere (e.g. GridwiseGemm::CheckValidity); confirm.
const auto K0 = K / K1Number;
// Grid descriptors re-express the 2-D host tensors as (K0, dim, K1) views
// using the host strides: index k decomposes as k = k0 * K1 + k1.
const auto a_k0_m_k1_grid_desc =
make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
make_tuple(K1Number * a_k_m.mDesc.GetStrides()[0],
a_k_m.mDesc.GetStrides()[1],
a_k_m.mDesc.GetStrides()[0]));
const auto b_k0_n_k1_grid_desc =
make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
make_tuple(K1Number * b_k_n.mDesc.GetStrides()[0],
b_k_n.mDesc.GetStrides()[1],
b_k_n.mDesc.GetStrides()[0]));
// C descriptor: logical shape (M, N) but strides pulled from c_n_m with the
// axes swapped, so the kernel's (m, n) writes land in the transposed n-by-m
// host layout.
const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
make_tuple(M, N), make_tuple(c_n_m.mDesc.GetStrides()[1], c_n_m.mDesc.GetStrides()[0]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
// (all-zero sequences here => no special-cased index arithmetic is enabled).
constexpr auto a_k0_m_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: M
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: M
Sequence<0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: N
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: N
Sequence<0>{})); // 2-: K1
constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4+: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5+: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6+: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 7+: N2
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4-: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5-: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
// Launch the GEMM 5 times to get stable timing; each call internally runs
// `nrepeat` repetitions and returns the average kernel time in ms.
for(index_t i = 0; i < 5; ++i)
{
float ave_time =
driver_gemm_xdlops_v2r3<BlockSize,
ABType,
AccType,
CType,
InMemoryDataOperationEnum_t::Set,
decltype(a_k0_m_k1_grid_desc),
decltype(b_k0_n_k1_grid_desc),
decltype(c_m_n_grid_desc),
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
Sequence<0, 2, 1>,
Sequence<0, 2, 1>,
1,
ABlockTransferSrcScalarPerVector_M,
ABlockTransferDstScalarPerVector_K1,
false, // don't move back src coordinate after threadwise copy
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
Sequence<0, 2, 1>,
Sequence<0, 2, 1>,
1,
BBlockTransferSrcScalarPerVector_N,
BBlockTransferDstScalarPerVector_K1,
false, // don't move back src coordinate after threadwise copy
Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
6,
CThreadTransferDstScalarPerVector,
decltype(a_k0_m_k1_grid_step_hacks),
decltype(b_k0_n_k1_grid_step_hacks),
decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
false // CAccessOrderMRepeatNRepeat
>(static_cast<ABType*>(a_k_m_device_buf.GetDeviceBuffer()),
static_cast<ABType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CType*>(c_n_m_device_buf.GetDeviceBuffer()),
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m_n_grid_desc,
a_k0_m_k1_grid_step_hacks,
b_k0_n_k1_grid_step_hacks,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
a_k0_m_k1_grid_move_slice_window_step_hacks,
b_k0_n_k1_grid_move_slice_window_step_hacks,
nrepeat);
// 2*M*N*K flops; GFlop count divided by ms yields TFlop/s.
float perf = static_cast<float>((std::size_t(2) * M * N * K)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
// copy result back to host
c_n_m_device_buf.FromDevice(c_n_m.mData.data());
}
......@@ -4,16 +4,8 @@
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename ABType,
typename AccType,
typename CType,
typename ADesc,
typename BDesc,
typename CDesc>
void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
const BDesc& b_n_k_grid_desc,
const CDesc& c_m_n_grid_desc,
const Tensor<ABType>& a_k_m,
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_km_nk_mn(const Tensor<ABType>& a_k_m,
const Tensor<ABType>& b_n_k,
Tensor<CType>& c_m_n,
ck::index_t nrepeat)
......@@ -22,9 +14,6 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
DeviceMem a_k_m_device_buf(sizeof(ABType) * a_k_m.mDesc.GetElementSpace());
DeviceMem b_n_k_device_buf(sizeof(ABType) * b_n_k.mDesc.GetElementSpace());
DeviceMem c_m_n_device_buf(sizeof(CType) * c_m_n.mDesc.GetElementSpace());
......@@ -62,7 +51,91 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
......@@ -89,8 +162,36 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 128;
......@@ -116,46 +217,101 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#endif
const auto K = a_k_m_grid_desc.GetLength(I0);
const auto M = a_k_m_grid_desc.GetLength(I1);
const auto N = b_n_k_grid_desc.GetLength(I0);
const auto K = a_k_m.mDesc.GetLengths()[0];
const auto M = a_k_m.mDesc.GetLengths()[1];
const auto N = b_n_k.mDesc.GetLengths()[0];
constexpr auto K1Number = Number<K1>{};
const auto K0 = K / K1Number;
const auto a_k0_m_k1_grid_desc =
transform_tensor_descriptor(a_k_m_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(M)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
make_tuple(K1Number * a_k_m.mDesc.GetStrides()[0],
a_k_m.mDesc.GetStrides()[1],
a_k_m.mDesc.GetStrides()[0]));
const auto b_k0_n_k1_grid_desc =
transform_tensor_descriptor(b_n_k_grid_desc,
make_tuple(make_pass_through_transform(N),
make_unmerge_transform(make_tuple(K0, K1Number))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}));
make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
make_tuple(K1Number * b_n_k.mDesc.GetStrides()[1],
b_n_k.mDesc.GetStrides()[0],
b_n_k.mDesc.GetStrides()[1]));
const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
make_tuple(M, N), make_tuple(c_m_n.mDesc.GetStrides()[0], c_m_n.mDesc.GetStrides()[1]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto a_k0_m_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: M
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: M
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: N
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: N
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto a_k0_m_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: M
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: M
Sequence<0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: N
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: N
Sequence<0>{})); // 2-: K1
constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
......@@ -175,9 +331,9 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
for(index_t i = 0; i < 5; ++i)
{
......@@ -222,13 +378,17 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
false // CAccessOrderMRepeatNRepeat
false, // CAccessOrderMRepeatNRepeat
true, // ABlockLdsExtraM
true // BBlockLdsExtraN
>(static_cast<ABType*>(a_k_m_device_buf.GetDeviceBuffer()),
static_cast<ABType*>(b_n_k_device_buf.GetDeviceBuffer()),
static_cast<CType*>(c_m_n_device_buf.GetDeviceBuffer()),
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m_n_grid_desc,
debug_driver_gemm_xdlops_v2r3::M01,
debug_driver_gemm_xdlops_v2r3::N01,
a_k0_m_k1_grid_step_hacks,
b_k0_n_k1_grid_step_hacks,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
// Host-side driver: GEMM on xdlops with A stored as [K, M], B stored as
// [N, K], and the result written into a transposed output tensor c_n_m
// ([N, M]).  The kernel itself computes C[M, N]; the transposed layout is
// realized purely by swapping the strides of c_n_m when building
// c_m_n_grid_desc below — no extra transpose pass is launched.
//
// a_k_m   : input matrix A, lengths [K, M]
// b_n_k   : input matrix B, lengths [N, K]
// c_n_m   : output matrix C^T, lengths [N, M]; overwritten (Set operation)
// nrepeat : number of kernel repetitions per timed run inside the driver
//
// NOTE(review): assumes K is divisible by the selected K1 (K0 = K / K1 is
// used as an exact length) — TODO confirm callers guarantee this.
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_km_nk_nm(const Tensor<ABType>& a_k_m,
                                 const Tensor<ABType>& b_n_k,
                                 Tensor<CType>& c_n_m,
                                 ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    // Allocate device buffers and upload all three tensors.  C is uploaded
    // too so the buffer holds defined data, even though the kernel fully
    // overwrites it (InMemoryDataOperationEnum_t::Set).
    DeviceMem a_k_m_device_buf(sizeof(ABType) * a_k_m.mDesc.GetElementSpace());
    DeviceMem b_n_k_device_buf(sizeof(ABType) * b_n_k.mDesc.GetElementSpace());
    DeviceMem c_n_m_device_buf(sizeof(CType) * c_n_m.mDesc.GetElementSpace());

    a_k_m_device_buf.ToDevice(a_k_m.mData.data());
    b_n_k_device_buf.ToDevice(b_n_k.mData.data());
    c_n_m_device_buf.ToDevice(c_n_m.mData.data());

    // Compile-time tuning presets.  Only the FIRST branch whose condition is
    // true is compiled; later "#elif 1" branches are intentionally dead and
    // kept only as ready-to-enable configurations.
#if 0
    // [M, N, K0, K1] = [256, 128, 4, 4] for fp32
    constexpr index_t BlockSize = 256;

    constexpr index_t MPerBlock = 256;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;

    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1     = 4;

    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;

    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<1, 4, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_M  = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;

    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<1, 2, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;

    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;

    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
    // [M, N, K0, K1] = [128, 256, 4, 4] for fp32
    constexpr index_t BlockSize = 256;

    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 256;
    constexpr index_t KPerBlock = 4;

    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1     = 4;

    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 4;

    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<1, 2, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_M  = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;

    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<1, 4, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;

    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;

    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
    constexpr index_t BlockSize = 256;

    constexpr index_t MPerBlock = 256;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;

    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1     = 8;

    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;

    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<1, 4, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_M  = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;

    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<1, 2, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;

    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;

    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
    // [M, N, K0, K1] = [128, 128, 4, 8] for fp16
    constexpr index_t BlockSize = 128;

    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;

    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1     = 8;

    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;

    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<1, 4, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_M  = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;

    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<1, 4, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;

    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;

    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#endif

    const auto K = a_k_m.mDesc.GetLengths()[0];
    const auto M = a_k_m.mDesc.GetLengths()[1];
    const auto N = b_n_k.mDesc.GetLengths()[0];

    constexpr auto K1Number = Number<K1>{};

    // Split K into K0 * K1 without an explicit transform: build the
    // [K0, M, K1] / [K0, N, K1] views directly from the host tensor strides.
    const auto K0 = K / K1Number;

    const auto a_k0_m_k1_grid_desc =
        make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
                                     make_tuple(K1Number * a_k_m.mDesc.GetStrides()[0],
                                                a_k_m.mDesc.GetStrides()[1],
                                                a_k_m.mDesc.GetStrides()[0]));
    const auto b_k0_n_k1_grid_desc =
        make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
                                     make_tuple(K1Number * b_n_k.mDesc.GetStrides()[1],
                                                b_n_k.mDesc.GetStrides()[0],
                                                b_n_k.mDesc.GetStrides()[1]));
    // View the [N, M] output tensor as [M, N] by swapping its strides; this
    // is what makes the kernel's C[M, N] land in c_n_m transposed.
    const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
        make_tuple(M, N), make_tuple(c_n_m.mDesc.GetStrides()[1], c_n_m.mDesc.GetStrides()[0]));

    // HACK: hacks that control index calculation when iterating over A, B, C matrix
    constexpr auto a_k0_m_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                                                                    Sequence<0>{},   // 1+: M
                                                                    Sequence<0>{}),  // 2+: K1
                                                          make_tuple(Sequence<0>{},   // 0-: K0
                                                                     Sequence<0>{},   // 1-: M
                                                                     Sequence<0>{})); // 2-: K1

    constexpr auto b_k0_n_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                                                                    Sequence<0>{},   // 1+: N
                                                                    Sequence<0>{}),  // 2+: K1
                                                          make_tuple(Sequence<0>{},   // 0-: K0
                                                                     Sequence<0>{},   // 1-: N
                                                                     Sequence<0>{})); // 2-: K1

    constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 0+: M0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 1+: N0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 2+: M1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 3+: N1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 4+: M2
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 5+: M3
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 6+: M4
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 7+: N2
                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 0-: M0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 1-: N0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 2-: M1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 3-: N1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 4-: M2
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 5-: M3
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 6-: M4
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2

    constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
    constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};

    for(index_t i = 0; i < 5; ++i)
    {
        float ave_time =
            driver_gemm_xdlops_v2r3<BlockSize,
                                    ABType,
                                    AccType,
                                    CType,
                                    InMemoryDataOperationEnum_t::Set,
                                    decltype(a_k0_m_k1_grid_desc),
                                    decltype(b_k0_n_k1_grid_desc),
                                    decltype(c_m_n_grid_desc),
                                    MPerBlock,
                                    NPerBlock,
                                    KPerBlock,
                                    MPerXDL,
                                    NPerXDL,
                                    K1,
                                    MRepeat,
                                    NRepeat,
                                    ABlockTransferThreadSliceLengths_K0_M_K1,
                                    ABlockTransferThreadClusterLengths_K0_M_K1,
                                    Sequence<0, 2, 1>,
                                    Sequence<0, 2, 1>,
                                    1,
                                    ABlockTransferSrcScalarPerVector_M,
                                    ABlockTransferDstScalarPerVector_K1,
                                    false, // don't move back src coordinate after threadwise copy
                                    BBlockTransferThreadSliceLengths_K0_N_K1,
                                    BBlockTransferThreadClusterLengths_K0_N_K1,
                                    Sequence<1, 0, 2>,
                                    Sequence<1, 0, 2>,
                                    2,
                                    BBlockTransferSrcScalarPerVector_K1,
                                    BBlockTransferDstScalarPerVector_K1,
                                    false, // don't move back src coordinate after threadwise copy
                                    Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
                                    6,
                                    CThreadTransferDstScalarPerVector,
                                    decltype(a_k0_m_k1_grid_step_hacks),
                                    decltype(b_k0_n_k1_grid_step_hacks),
                                    decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
                                    decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
                                    decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
                                    false, // CAccessOrderMRepeatNRepeat
                                    true,  // ABlockLdsExtraM (pad LDS M dim, as in sibling drivers)
                                    true   // BBlockLdsExtraN (pad LDS N dim, as in sibling drivers)
                                    >(static_cast<ABType*>(a_k_m_device_buf.GetDeviceBuffer()),
                                      static_cast<ABType*>(b_n_k_device_buf.GetDeviceBuffer()),
                                      static_cast<CType*>(c_n_m_device_buf.GetDeviceBuffer()),
                                      a_k0_m_k1_grid_desc,
                                      b_k0_n_k1_grid_desc,
                                      c_m_n_grid_desc,
                                      debug_driver_gemm_xdlops_v2r3::M01,
                                      debug_driver_gemm_xdlops_v2r3::N01,
                                      a_k0_m_k1_grid_step_hacks,
                                      b_k0_n_k1_grid_step_hacks,
                                      c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
                                      a_k0_m_k1_grid_move_slice_window_step_hacks,
                                      b_k0_n_k1_grid_move_slice_window_step_hacks,
                                      nrepeat);

        // 2*M*N*K flops per GEMM; ave_time is in milliseconds, so this
        // yields TFlop/s.
        float perf = static_cast<float>((std::size_t(2) * M * N * K)) /
                     (std::size_t(1000) * 1000 * 1000) / ave_time;

        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
    }

    // copy result back to host
    c_n_m_device_buf.FromDevice(c_n_m.mData.data());
}
......@@ -4,16 +4,8 @@
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename ABType,
typename AccType,
typename CType,
typename ADesc,
typename BDesc,
typename CDesc>
void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
const BDesc& b_k_n_grid_desc,
const CDesc& c_m_n_grid_desc,
const Tensor<ABType>& a_m_k,
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_mk_kn_mn(const Tensor<ABType>& a_m_k,
const Tensor<ABType>& b_k_n,
Tensor<CType>& c_m_n,
ck::index_t nrepeat)
......@@ -22,9 +14,6 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
DeviceMem a_m_k_device_buf(sizeof(ABType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_k_n_device_buf(sizeof(ABType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_m_n_device_buf(sizeof(CType) * c_m_n.mDesc.GetElementSpace());
......@@ -34,7 +23,119 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
c_m_n_device_buf.ToDevice(c_m_n.mData.data());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
......@@ -89,8 +190,8 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 128;
......@@ -116,46 +217,101 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#endif
const auto K = a_m_k_grid_desc.GetLength(I1);
const auto M = a_m_k_grid_desc.GetLength(I0);
const auto N = b_k_n_grid_desc.GetLength(I1);
const auto K = a_m_k.mDesc.GetLengths()[1];
const auto M = a_m_k.mDesc.GetLengths()[0];
const auto N = b_k_n.mDesc.GetLengths()[1];
constexpr auto K1Number = Number<K1>{};
const auto K0 = K / K1Number;
const auto a_k0_m_k1_grid_desc =
transform_tensor_descriptor(a_m_k_grid_desc,
make_tuple(make_pass_through_transform(M),
make_unmerge_transform(make_tuple(K0, K1Number))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}));
make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
make_tuple(K1Number * a_m_k.mDesc.GetStrides()[1],
a_m_k.mDesc.GetStrides()[0],
a_m_k.mDesc.GetStrides()[1]));
const auto b_k0_n_k1_grid_desc =
transform_tensor_descriptor(b_k_n_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
make_tuple(K1Number * b_k_n.mDesc.GetStrides()[0],
b_k_n.mDesc.GetStrides()[1],
b_k_n.mDesc.GetStrides()[0]));
const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
make_tuple(M, N), make_tuple(c_m_n.mDesc.GetStrides()[0], c_m_n.mDesc.GetStrides()[1]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto a_k0_m_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: M
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: M
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: N
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: N
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto a_k0_m_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: M
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: M
Sequence<0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: N
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: N
Sequence<0>{})); // 2-: K1
constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
......@@ -175,9 +331,9 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
for(index_t i = 0; i < 5; ++i)
{
......@@ -222,13 +378,17 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
false // CAccessOrderMRepeatNRepeat
false, // CAccessOrderMRepeatNRepeat
true, // ABlockLdsExtraM
true // BBlockLdsExtraN
>(static_cast<ABType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<ABType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CType*>(c_m_n_device_buf.GetDeviceBuffer()),
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m_n_grid_desc,
debug_driver_gemm_xdlops_v2r3::M01,
debug_driver_gemm_xdlops_v2r3::N01,
a_k0_m_k1_grid_step_hacks,
b_k0_n_k1_grid_step_hacks,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_mk_kn_nm(const Tensor<ABType>& a_m_k,
const Tensor<ABType>& b_k_n,
Tensor<CType>& c_n_m,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ABType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_k_n_device_buf(sizeof(ABType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_n_m_device_buf(sizeof(CType) * c_n_m.mDesc.GetElementSpace());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
c_n_m_device_buf.ToDevice(c_n_m.mData.data());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#endif
const auto K = a_m_k.mDesc.GetLengths()[1];
const auto M = a_m_k.mDesc.GetLengths()[0];
const auto N = b_k_n.mDesc.GetLengths()[1];
constexpr auto K1Number = Number<K1>{};
const auto K0 = K / K1Number;
const auto a_k0_m_k1_grid_desc =
make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
make_tuple(K1Number * a_m_k.mDesc.GetStrides()[1],
a_m_k.mDesc.GetStrides()[0],
a_m_k.mDesc.GetStrides()[1]));
const auto b_k0_n_k1_grid_desc =
make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
make_tuple(K1Number * b_k_n.mDesc.GetStrides()[0],
b_k_n.mDesc.GetStrides()[1],
b_k_n.mDesc.GetStrides()[0]));
const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
make_tuple(M, N), make_tuple(c_n_m.mDesc.GetStrides()[1], c_n_m.mDesc.GetStrides()[0]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto a_k0_m_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: M
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: M
Sequence<0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: N
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: N
Sequence<0>{})); // 2-: K1
constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4+: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5+: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6+: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 7+: N2
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4-: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5-: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
for(index_t i = 0; i < 5; ++i)
{
float ave_time =
driver_gemm_xdlops_v2r3<BlockSize,
ABType,
AccType,
CType,
InMemoryDataOperationEnum_t::Set,
decltype(a_k0_m_k1_grid_desc),
decltype(b_k0_n_k1_grid_desc),
decltype(c_m_n_grid_desc),
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
Sequence<1, 0, 2>,
Sequence<1, 0, 2>,
2,
ABlockTransferSrcScalarPerVector_K1,
ABlockTransferDstScalarPerVector_K1,
false, // don't move back src coordinate after threadwise copy
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
Sequence<0, 2, 1>,
Sequence<0, 2, 1>,
1,
BBlockTransferSrcScalarPerVector_N,
BBlockTransferDstScalarPerVector_K1,
false, // don't move back src coordinate after threadwise copy
Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
6,
CThreadTransferDstScalarPerVector,
decltype(a_k0_m_k1_grid_step_hacks),
decltype(b_k0_n_k1_grid_step_hacks),
decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
false // CAccessOrderMRepeatNRepeat
>(static_cast<ABType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<ABType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CType*>(c_n_m_device_buf.GetDeviceBuffer()),
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m_n_grid_desc,
a_k0_m_k1_grid_step_hacks,
b_k0_n_k1_grid_step_hacks,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
a_k0_m_k1_grid_move_slice_window_step_hacks,
b_k0_n_k1_grid_move_slice_window_step_hacks,
nrepeat);
float perf = static_cast<float>((std::size_t(2) * M * N * K)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
// copy result back to host
c_n_m_device_buf.FromDevice(c_n_m.mData.data());
}
......@@ -4,16 +4,8 @@
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename ABType,
typename AccType,
typename CType,
typename ADesc,
typename BDesc,
typename CDesc>
void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
const BDesc& b_n_k_grid_desc,
const CDesc& c_m_n_grid_desc,
const Tensor<ABType>& a_m_k,
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_mk_nk_mn(const Tensor<ABType>& a_m_k,
const Tensor<ABType>& b_n_k,
Tensor<CType>& c_m_n,
ck::index_t nrepeat)
......@@ -22,9 +14,6 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
DeviceMem a_m_k_device_buf(sizeof(ABType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_n_k_device_buf(sizeof(ABType) * b_n_k.mDesc.GetElementSpace());
DeviceMem c_m_n_device_buf(sizeof(CType) * c_m_n.mDesc.GetElementSpace());
......@@ -34,6 +23,34 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
c_m_n_device_buf.ToDevice(c_m_n.mData.data());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr index_t BlockSize = 256;
......@@ -62,7 +79,63 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
// [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
......@@ -90,7 +163,7 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
// [M, N, K0, K1] = [128, 256, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
......@@ -117,8 +190,8 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 128;
......@@ -144,46 +217,131 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 8], C = 64, for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#endif
const auto K = a_m_k_grid_desc.GetLength(I1);
const auto M = a_m_k_grid_desc.GetLength(I0);
const auto N = b_n_k_grid_desc.GetLength(I0);
const auto K = a_m_k.mDesc.GetLengths()[1];
const auto M = a_m_k.mDesc.GetLengths()[0];
const auto N = b_n_k.mDesc.GetLengths()[0];
constexpr auto K1Number = Number<K1>{};
const auto K0 = K / K1Number;
#if 1
// non-padded GEMM
const auto a_k0_m_k1_grid_desc =
transform_tensor_descriptor(a_m_k_grid_desc,
make_tuple(make_pass_through_transform(M),
make_unmerge_transform(make_tuple(K0, K1Number))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}));
make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
make_tuple(K1Number * a_m_k.mDesc.GetStrides()[1],
a_m_k.mDesc.GetStrides()[0],
a_m_k.mDesc.GetStrides()[1]));
const auto b_k0_n_k1_grid_desc =
transform_tensor_descriptor(b_n_k_grid_desc,
make_tuple(make_pass_through_transform(N),
make_unmerge_transform(make_tuple(K0, K1Number))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}));
make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
make_tuple(K1Number * b_n_k.mDesc.GetStrides()[1],
b_n_k.mDesc.GetStrides()[0],
b_n_k.mDesc.GetStrides()[1]));
const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
make_tuple(M, N), make_tuple(c_m_n.mDesc.GetStrides()[0], c_m_n.mDesc.GetStrides()[1]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto a_k0_m_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: M
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: M
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: N
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: N
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto a_k0_m_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: M
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: M
Sequence<0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: N
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: N
Sequence<0>{})); // 2-: K1
constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
......@@ -203,9 +361,80 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
#else
// padded GEMM
const auto a_k0_m_k1_grid_desc_tmp =
make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
make_tuple(K1Number * a_m_k.mDesc.GetStrides()[1],
a_m_k.mDesc.GetStrides()[0],
a_m_k.mDesc.GetStrides()[1]));
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
const auto MRightPad = math::integer_divide_ceil(M, MPerBlock) * MPerBlock - M;
const auto a_k0_m_k1_grid_desc =
transform_tensor_descriptor(a_k0_m_k1_grid_desc_tmp,
make_tuple(make_pass_through_transform(K0),
make_right_pad_transform(M, MRightPad),
make_pass_through_transform(K1Number)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
const auto b_k0_n_k1_grid_desc =
make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
make_tuple(K1Number * b_n_k.mDesc.GetStrides()[1],
b_n_k.mDesc.GetStrides()[0],
b_n_k.mDesc.GetStrides()[1]));
const auto c_m_n_grid_desc_tmp = make_naive_tensor_descriptor(
make_tuple(M, N), make_tuple(c_m_n.mDesc.GetStrides()[0], c_m_n.mDesc.GetStrides()[1]));
const auto c_m_n_grid_desc = transform_tensor_descriptor(
c_m_n_grid_desc_tmp,
make_tuple(make_right_pad_transform(M, MRightPad), make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto a_k0_m_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0, 0>{}, // 1+: M
Sequence<0, 0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0, 0>{}, // 1-: M
Sequence<0, 0, 0, 0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{}, // 0+: K0
Sequence<0>{}, // 1+: N
Sequence<0>{}), // 2+: K1
make_tuple(Sequence<0>{}, // 0-: K0
Sequence<0>{}, // 1-: N
Sequence<0>{})); // 2-: K1
constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4+: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5+: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6+: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 7+: N2
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4-: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5-: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0, 0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
#endif
for(index_t i = 0; i < 5; ++i)
{
......@@ -250,13 +479,17 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
false // CAccessOrderMRepeatNRepeat
false, // CAccessOrderMRepeatNRepeat
true, // ABlockLdsExtraM
true // BBlockLdsExtraN
>(static_cast<ABType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<ABType*>(b_n_k_device_buf.GetDeviceBuffer()),
static_cast<CType*>(c_m_n_device_buf.GetDeviceBuffer()),
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m_n_grid_desc,
debug_driver_gemm_xdlops_v2r3::M01,
debug_driver_gemm_xdlops_v2r3::N01,
a_k0_m_k1_grid_step_hacks,
b_k0_n_k1_grid_step_hacks,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
// Host-side benchmark driver for an xdlops GEMM (v2r3):
//
//     C[m, n] = sum_k A[m, k] * B[n, k]
//
// A is an M x K tensor, B is an N x K tensor (K-contiguous per row of N),
// and the result lives in the N x M tensor c_n_m; the C grid descriptor
// below swaps c_n_m's strides so the kernel operates on a logical M x N
// view of it.
//
// ABType  : element type of A and B (fp16 / fp32 in the configs below)
// AccType : accumulator type used by the MFMA (xdlops) instructions
// CType   : element type of C
// nrepeat : kernel repetition count forwarded to driver_gemm_xdlops_v2r3
//
// Tuning parameters are selected by the #if/#elif chain below; the first
// branch whose condition is 1 is the active configuration.
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_mk_nk_nm(const Tensor<ABType>& a_m_k,
                                 const Tensor<ABType>& b_n_k,
                                 Tensor<CType>& c_n_m,
                                 ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    // device buffers for A, B and C; upload the host tensors before running
    DeviceMem a_m_k_device_buf(sizeof(ABType) * a_m_k.mDesc.GetElementSpace());
    DeviceMem b_n_k_device_buf(sizeof(ABType) * b_n_k.mDesc.GetElementSpace());
    DeviceMem c_n_m_device_buf(sizeof(CType) * c_n_m.mDesc.GetElementSpace());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_n_k_device_buf.ToDevice(b_n_k.mData.data());
    c_n_m_device_buf.ToDevice(c_n_m.mData.data());

#if 0
    // [M, N, K0, K1] = [256, 128, 4, 4] for fp32
    constexpr index_t BlockSize = 256;

    constexpr index_t MPerBlock = 256;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;

    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1      = 4;

    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;

    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<1, 4, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;

    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<1, 2, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;

    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;

    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
    // [M, N, K0, K1] = [128, 256, 4, 4] for fp32
    constexpr index_t BlockSize = 256;

    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 256;
    constexpr index_t KPerBlock = 4;

    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1      = 4;

    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 4;

    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<1, 2, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;

    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<1, 4, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;

    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;

    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
    constexpr index_t BlockSize = 256;

    constexpr index_t MPerBlock = 256;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;

    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1      = 8;

    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;

    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<1, 4, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;

    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<1, 2, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;

    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;

    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
    // [M, N, K0, K1] = [128, 256, 4, 8] for fp16
    constexpr index_t BlockSize = 256;

    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 256;
    constexpr index_t KPerBlock = 4;

    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1      = 8;

    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 4;

    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<1, 2, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;

    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<1, 4, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;

    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;

    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
    // [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
    constexpr index_t BlockSize = 128;

    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;

    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1      = 8;

    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;

    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<1, 4, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;

    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<1, 4, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;

    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;

    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
    // [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
    constexpr index_t BlockSize = 256;

    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;

    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1      = 8;

    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 2;

    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<1, 2, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;

    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<1, 2, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;

    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;

    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
    // [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
    constexpr index_t BlockSize = 256;

    constexpr index_t MPerBlock = 64;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;

    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1      = 8;

    constexpr index_t MRepeat = 1;
    constexpr index_t NRepeat = 2;

    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<1, 1, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;

    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<1, 2, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;

    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;

    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#endif

    const auto K = a_m_k.mDesc.GetLengths()[1];
    const auto M = a_m_k.mDesc.GetLengths()[0];
    const auto N = b_n_k.mDesc.GetLengths()[0];

    constexpr auto K1Number = Number<K1>{};

    // split K into K0 * K1; K1 is the innermost, vectorized dimension
    const auto K0 = K / K1Number;

    // view A (M x K) as a [K0, M, K1] grid
    const auto a_k0_m_k1_grid_desc =
        make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
                                     make_tuple(K1Number * a_m_k.mDesc.GetStrides()[1],
                                                a_m_k.mDesc.GetStrides()[0],
                                                a_m_k.mDesc.GetStrides()[1]));

    // view B (N x K) as a [K0, N, K1] grid
    const auto b_k0_n_k1_grid_desc =
        make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
                                     make_tuple(K1Number * b_n_k.mDesc.GetStrides()[1],
                                                b_n_k.mDesc.GetStrides()[0],
                                                b_n_k.mDesc.GetStrides()[1]));

    // logical M x N view of the N x M output tensor (strides swapped)
    const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
        make_tuple(M, N), make_tuple(c_n_m.mDesc.GetStrides()[1], c_n_m.mDesc.GetStrides()[0]));

    // HACK: hacks that control index calculation when iterating over A, B, C matrix
    constexpr auto a_k0_m_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                                                                    Sequence<0>{},   // 1+: M
                                                                    Sequence<0>{}),  // 2+: K1
                                                          make_tuple(Sequence<0>{},   // 0-: K0
                                                                     Sequence<0>{},  // 1-: M
                                                                     Sequence<0>{})); // 2-: K1

    constexpr auto b_k0_n_k1_grid_step_hacks = make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                                                                    Sequence<0>{},   // 1+: N
                                                                    Sequence<0>{}),  // 2+: K1
                                                          make_tuple(Sequence<0>{},   // 0-: K0
                                                                     Sequence<0>{},  // 1-: N
                                                                     Sequence<0>{})); // 2-: K1

    constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2

    constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
    constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};

    // run the benchmark 5 times and report each average kernel time
    for(index_t i = 0; i < 5; ++i)
    {
        float ave_time =
            driver_gemm_xdlops_v2r3<BlockSize,
                                    ABType,
                                    AccType,
                                    CType,
                                    InMemoryDataOperationEnum_t::Set,
                                    decltype(a_k0_m_k1_grid_desc),
                                    decltype(b_k0_n_k1_grid_desc),
                                    decltype(c_m_n_grid_desc),
                                    MPerBlock,
                                    NPerBlock,
                                    KPerBlock,
                                    MPerXDL,
                                    NPerXDL,
                                    K1,
                                    MRepeat,
                                    NRepeat,
                                    ABlockTransferThreadSliceLengths_K0_M_K1,
                                    ABlockTransferThreadClusterLengths_K0_M_K1,
                                    Sequence<1, 0, 2>,
                                    Sequence<1, 0, 2>,
                                    2,
                                    ABlockTransferSrcScalarPerVector_K1,
                                    ABlockTransferDstScalarPerVector_K1,
                                    false, // don't move back src coordinate after threadwise copy
                                    BBlockTransferThreadSliceLengths_K0_N_K1,
                                    BBlockTransferThreadClusterLengths_K0_N_K1,
                                    Sequence<1, 0, 2>,
                                    Sequence<1, 0, 2>,
                                    2,
                                    BBlockTransferSrcScalarPerVector_K1,
                                    BBlockTransferDstScalarPerVector_K1,
                                    false, // don't move back src coordinate after threadwise copy
                                    Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
                                    6,
                                    CThreadTransferDstScalarPerVector,
                                    decltype(a_k0_m_k1_grid_step_hacks),
                                    decltype(b_k0_n_k1_grid_step_hacks),
                                    decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
                                    decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
                                    decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
                                    false, // CAccessOrderMRepeatNRepeat
                                    true,  // ABlockLdsExtraM: pad A's LDS rows to avoid bank conflicts
                                    true   // BBlockLdsExtraN: pad B's LDS rows to avoid bank conflicts
                                    >(static_cast<ABType*>(a_m_k_device_buf.GetDeviceBuffer()),
                                      static_cast<ABType*>(b_n_k_device_buf.GetDeviceBuffer()),
                                      static_cast<CType*>(c_n_m_device_buf.GetDeviceBuffer()),
                                      a_k0_m_k1_grid_desc,
                                      b_k0_n_k1_grid_desc,
                                      c_m_n_grid_desc,
                                      debug_driver_gemm_xdlops_v2r3::M01,
                                      debug_driver_gemm_xdlops_v2r3::N01,
                                      a_k0_m_k1_grid_step_hacks,
                                      b_k0_n_k1_grid_step_hacks,
                                      c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
                                      a_k0_m_k1_grid_move_slice_window_step_hacks,
                                      b_k0_n_k1_grid_move_slice_window_step_hacks,
                                      nrepeat);

        // 2*M*N*K flops; ave_time is in ms, so this yields TFlop/s
        float perf = static_cast<float>((std::size_t(2) * M * N * K)) /
                     (std::size_t(1000) * 1000 * 1000) / ave_time;

        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
    }

    // copy result back to host
    c_n_m_device_buf.FromDevice(c_n_m.mData.data());
}
#ifndef DRIVER_GEMM_XDLOPS_V2R3
#define DRIVER_GEMM_XDLOPS_V2R3
#ifndef DRIVER_GEMM_XDLOPS_V2R3_HPP
#define DRIVER_GEMM_XDLOPS_V2R3_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
......@@ -46,13 +46,17 @@ template <ck::index_t BlockSize,
typename CGridStepHacks,
typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowStepHacks,
bool CAccessOrderMRepeatNRepeat>
bool CAccessOrderMRepeatNRepeat,
bool ABlockLdsAddExtraM,
bool BBlockLdsAddExtraN>
__host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
const FloatAB* p_b_grid,
FloatC* p_c_grid,
const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
const CMNGridDesc& c_m_n_grid_desc,
ck::index_t M01,
ck::index_t N01,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
......@@ -108,7 +112,9 @@ __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
CAccessOrderMRepeatNRepeat>;
CAccessOrderMRepeatNRepeat,
ABlockLdsAddExtraM,
BBlockLdsAddExtraN>;
{
std::cout << "a_k0_m_k1_grid_desc{" << a_k0_m_k1_grid_desc.GetLength(I0) << ", "
......@@ -123,7 +129,8 @@ __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
<< c_m_n_grid_desc.GetLength(I1) << "}" << std::endl;
}
if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc))
if(!GridwiseGemm::CheckValidity(
a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc, M01, N01))
{
throw std::runtime_error(
"wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting");
......@@ -134,7 +141,8 @@ __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
using CM0N0M1N1M2M3M4N2GridDesc = decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc);
const auto c_block_cluster_adaptor = GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
const auto c_block_cluster_adaptor =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc, M01, N01);
using CBlockClusterAdaptor = decltype(c_block_cluster_adaptor);
......
......@@ -16,7 +16,7 @@
#include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_MODE 1
#define USE_CONV_BWD_V4R1_XDL_NHWC 1
#define USE_CONV_BWD_V4R1_XDL_NHWC 0
#define USE_CONV_BWD_V4R1R2_XDL_NHWC 1
enum ConvBackwardDataAlgo
......
......@@ -24,7 +24,7 @@
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 1
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1
enum ConvForwardAlgo
......
......@@ -5,6 +5,7 @@
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
......@@ -16,11 +17,19 @@
#include "device_gemm_xdlops_mk_nk_mn.hpp"
#include "device_gemm_xdlops_km_kn_mn.hpp"
#include "device_gemm_xdlops_km_nk_mn.hpp"
#include "device_gemm_xdlops_mk_kn_nm.hpp"
#include "device_gemm_xdlops_mk_nk_nm.hpp"
#include "device_gemm_xdlops_km_kn_nm.hpp"
#include "device_gemm_xdlops_km_nk_nm.hpp"
#define USE_GEMM_XDL_MK_KN_MN 1
#define USE_GEMM_XDL_MK_NK_MN 1
#define USE_GEMM_XDL_KM_KN_MN 1
#define USE_GEMM_XDL_KM_NK_MN 1
#define USE_GEMM_XDL_MK_KN_NM 0
#define USE_GEMM_XDL_MK_NK_NM 0
#define USE_GEMM_XDL_KM_KN_NM 0
#define USE_GEMM_XDL_KM_NK_NM 0
enum GemmAlgo
{
......@@ -28,21 +37,21 @@ enum GemmAlgo
Xdl_MK_NK_MN, // 1
Xdl_KM_KN_MN, // 2
Xdl_KM_NK_MN, // 3
Xdl_MK_KN_NM, // 4
Xdl_MK_NK_NM, // 5
Xdl_KM_KN_NM, // 6
Xdl_KM_NK_NM, // 7
};
int main(int argc, char* argv[])
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
// dynamic mode
if(argc != 10)
if(argc != 12)
{
printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: M, N, K\n");
printf("rest: M, N, K, debug_driver_gemm_xdlops_v2r3::M01, "
"debug_driver_gemm_xdlops_v2r3::N01\n");
exit(1);
}
......@@ -57,6 +66,9 @@ int main(int argc, char* argv[])
const index_t N = std::stoi(argv[8]);
const index_t K = std::stoi(argv[9]);
debug_driver_gemm_xdlops_v2r3::M01 = std::stoi(argv[10]);
debug_driver_gemm_xdlops_v2r3::N01 = std::stoi(argv[11]);
#if 0
using ab_data_t = float;
using acc_data_t = float;
......@@ -74,69 +86,44 @@ int main(int argc, char* argv[])
std::vector<std::size_t> a_lengths_host(2), b_lengths_host(2), c_lengths_host(2);
std::vector<std::size_t> a_strides_host(2), b_strides_host(2), c_strides_host(2);
if(layout == GemmMatrixLayout::MK_KN_MN)
// A
if(layout == GemmMatrixLayout::MK_KN_MN || layout == GemmMatrixLayout::MK_NK_MN ||
layout == GemmMatrixLayout::MK_KN_NM || layout == GemmMatrixLayout::MK_NK_NM)
{
a_lengths_host[0] = static_cast<std::size_t>(M);
a_lengths_host[1] = static_cast<std::size_t>(K);
a_strides_host[0] = static_cast<std::size_t>(K);
a_strides_host[1] = static_cast<std::size_t>(1);
b_lengths_host[0] = static_cast<std::size_t>(K);
b_lengths_host[1] = static_cast<std::size_t>(N);
b_strides_host[0] = static_cast<std::size_t>(N);
b_strides_host[1] = static_cast<std::size_t>(1);
c_lengths_host[0] = static_cast<std::size_t>(M);
c_lengths_host[1] = static_cast<std::size_t>(N);
c_strides_host[0] = static_cast<std::size_t>(N);
c_strides_host[1] = static_cast<std::size_t>(1);
}
else if(layout == GemmMatrixLayout::MK_NK_MN)
else
{
a_lengths_host[0] = static_cast<std::size_t>(M);
a_lengths_host[1] = static_cast<std::size_t>(K);
a_strides_host[0] = static_cast<std::size_t>(K);
a_lengths_host[0] = static_cast<std::size_t>(K);
a_lengths_host[1] = static_cast<std::size_t>(M);
a_strides_host[0] = static_cast<std::size_t>(M);
a_strides_host[1] = static_cast<std::size_t>(1);
}
// B
if(layout == GemmMatrixLayout::MK_NK_MN || layout == GemmMatrixLayout::KM_NK_MN ||
layout == GemmMatrixLayout::MK_NK_NM || layout == GemmMatrixLayout::KM_NK_NM)
{
b_lengths_host[0] = static_cast<std::size_t>(N);
b_lengths_host[1] = static_cast<std::size_t>(K);
b_strides_host[0] = static_cast<std::size_t>(K);
b_strides_host[1] = static_cast<std::size_t>(1);
c_lengths_host[0] = static_cast<std::size_t>(M);
c_lengths_host[1] = static_cast<std::size_t>(N);
c_strides_host[0] = static_cast<std::size_t>(N);
c_strides_host[1] = static_cast<std::size_t>(1);
}
else if(layout == GemmMatrixLayout::KM_KN_MN)
else
{
a_lengths_host[0] = static_cast<std::size_t>(K);
a_lengths_host[1] = static_cast<std::size_t>(M);
a_strides_host[0] = static_cast<std::size_t>(M);
a_strides_host[1] = static_cast<std::size_t>(1);
b_lengths_host[0] = static_cast<std::size_t>(K);
b_lengths_host[1] = static_cast<std::size_t>(N);
b_strides_host[0] = static_cast<std::size_t>(N);
b_strides_host[1] = static_cast<std::size_t>(1);
c_lengths_host[0] = static_cast<std::size_t>(M);
c_lengths_host[1] = static_cast<std::size_t>(N);
c_strides_host[0] = static_cast<std::size_t>(N);
c_strides_host[1] = static_cast<std::size_t>(1);
}
else if(layout == GemmMatrixLayout::KM_NK_MN)
{
a_lengths_host[0] = static_cast<std::size_t>(K);
a_lengths_host[1] = static_cast<std::size_t>(M);
a_strides_host[0] = static_cast<std::size_t>(M);
a_strides_host[1] = static_cast<std::size_t>(1);
b_lengths_host[0] = static_cast<std::size_t>(N);
b_lengths_host[1] = static_cast<std::size_t>(K);
b_strides_host[0] = static_cast<std::size_t>(K);
b_strides_host[1] = static_cast<std::size_t>(1);
// C
if(layout == GemmMatrixLayout::MK_KN_MN || layout == GemmMatrixLayout::KM_KN_MN ||
layout == GemmMatrixLayout::MK_NK_MN || layout == GemmMatrixLayout::KM_NK_MN)
{
c_lengths_host[0] = static_cast<std::size_t>(M);
c_lengths_host[1] = static_cast<std::size_t>(N);
c_strides_host[0] = static_cast<std::size_t>(N);
......@@ -144,7 +131,10 @@ int main(int argc, char* argv[])
}
else
{
std::runtime_error("wrong! not implemented");
c_lengths_host[0] = static_cast<std::size_t>(N);
c_lengths_host[1] = static_cast<std::size_t>(M);
c_strides_host[0] = static_cast<std::size_t>(M);
c_strides_host[1] = static_cast<std::size_t>(1);
}
Tensor<ab_data_t> a(a_lengths_host, a_strides_host);
......@@ -185,38 +175,6 @@ int main(int argc, char* argv[])
b.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
}
auto f_make_for_device_mk_kn_mn = [&]() {
const auto a_desc = make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(K, I1));
const auto b_desc = make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(N, I1));
const auto c_desc = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(N, I1));
return make_tuple(a_desc, b_desc, c_desc);
};
auto f_make_for_device_mk_nk_mn = [&]() {
const auto a_desc = make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(K, I1));
const auto b_desc = make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(K, I1));
const auto c_desc = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(N, I1));
return make_tuple(a_desc, b_desc, c_desc);
};
auto f_make_for_device_km_kn_mn = [&]() {
const auto a_desc = make_naive_tensor_descriptor(make_tuple(K, M), make_tuple(M, I1));
const auto b_desc = make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(N, I1));
const auto c_desc = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(N, I1));
return make_tuple(a_desc, b_desc, c_desc);
};
auto f_make_for_device_km_nk_mn = [&]() {
const auto a_desc = make_naive_tensor_descriptor(make_tuple(K, M), make_tuple(M, I1));
const auto b_desc = make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(K, I1));
const auto c_desc = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(N, I1));
return make_tuple(a_desc, b_desc, c_desc);
};
#if USE_GEMM_XDL_MK_KN_MN
if(algo == GemmAlgo::Xdl_MK_KN_MN)
{
......@@ -225,10 +183,7 @@ int main(int argc, char* argv[])
throw std::runtime_error("wrong! layout");
}
const auto descs = f_make_for_device_mk_kn_mn();
device_gemm_xdlops_mk_kn_mn<ab_data_t, acc_data_t, c_data_t>(
descs[I0], descs[I1], descs[I2], a, b, c_device, nrepeat);
device_gemm_xdlops_mk_kn_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
......@@ -240,10 +195,7 @@ int main(int argc, char* argv[])
throw std::runtime_error("wrong! layout");
}
const auto descs = f_make_for_device_mk_nk_mn();
device_gemm_xdlops_mk_nk_mn<ab_data_t, acc_data_t, c_data_t>(
descs[I0], descs[I1], descs[I2], a, b, c_device, nrepeat);
device_gemm_xdlops_mk_nk_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
......@@ -255,10 +207,7 @@ int main(int argc, char* argv[])
throw std::runtime_error("wrong! layout");
}
const auto descs = f_make_for_device_km_kn_mn();
device_gemm_xdlops_km_kn_mn<ab_data_t, acc_data_t, c_data_t>(
descs[I0], descs[I1], descs[I2], a, b, c_device, nrepeat);
device_gemm_xdlops_km_kn_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
......@@ -270,10 +219,55 @@ int main(int argc, char* argv[])
throw std::runtime_error("wrong! layout");
}
const auto descs = f_make_for_device_km_nk_mn();
device_gemm_xdlops_km_nk_mn<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_MK_KN_NM
if(algo == GemmAlgo::Xdl_MK_KN_NM)
{
if(layout != GemmMatrixLayout::MK_KN_NM)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_mk_kn_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_MK_NK_NM
if(algo == GemmAlgo::Xdl_MK_NK_NM)
{
if(layout != GemmMatrixLayout::MK_NK_NM)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_mk_nk_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_KM_KN_NM
if(algo == GemmAlgo::Xdl_KM_KN_NM)
{
if(layout != GemmMatrixLayout::KM_KN_NM)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_km_kn_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_KM_NK_NM
if(algo == GemmAlgo::Xdl_KM_NK_NM)
{
if(layout != GemmMatrixLayout::KM_NK_NM)
{
throw std::runtime_error("wrong! layout");
}
device_gemm_xdlops_km_nk_mn<ab_data_t, acc_data_t, c_data_t>(
descs[I0], descs[I1], descs[I2], a, b, c_device, nrepeat);
device_gemm_xdlops_km_nk_nm<ab_data_t, acc_data_t, c_data_t>(a, b, c_device, nrepeat);
}
#endif
......
......@@ -7,6 +7,10 @@ enum GemmMatrixLayout
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
MK_KN_NM, // 4
MK_NK_NM, // 5
KM_KN_NM, // 6
KM_NK_NM, // 7
};
#endif
......@@ -80,6 +80,78 @@ void host_gemm(const Tensor<AType>& a,
make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::MK_KN_NM)
{
auto f_mk_kn_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_mk_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::MK_NK_NM)
{
auto f_mk_nk_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_mk_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_KN_NM)
{
auto f_km_kn_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_km_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_NK_NM)
{
auto f_km_nk_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_km_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment