Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
liyinrong
composable_kernel
Commits
0fe6e258
Commit
0fe6e258
authored
3 years ago
by
Chao Liu
Browse files
Options
Download
Email Patches
Plain Diff
tweak gemm
parent
50fc3865
test
add_bfp16_gemm
add_bfp16_int8_gemm
add_bfp16_xdlops
improve_gemm
No related merge requests found
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
composable_kernel/include/tensor_description/multi_index_transform_helper.hpp
+1
-1
...clude/tensor_description/multi_index_transform_helper.hpp
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
+131
-40
...el/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
host/driver_offline/include/debug.hpp
+11
-0
host/driver_offline/include/debug.hpp
host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp
+207
-47
host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp
host/driver_offline/include/device_gemm_xdlops_km_kn_nm.hpp
+263
-0
host/driver_offline/include/device_gemm_xdlops_km_kn_nm.hpp
host/driver_offline/include/device_gemm_xdlops_km_nk_mn.hpp
+207
-47
host/driver_offline/include/device_gemm_xdlops_km_nk_mn.hpp
host/driver_offline/include/device_gemm_xdlops_km_nk_nm.hpp
+263
-0
host/driver_offline/include/device_gemm_xdlops_km_nk_nm.hpp
host/driver_offline/include/device_gemm_xdlops_mk_kn_mn.hpp
+207
-47
host/driver_offline/include/device_gemm_xdlops_mk_kn_mn.hpp
host/driver_offline/include/device_gemm_xdlops_mk_kn_nm.hpp
+291
-0
host/driver_offline/include/device_gemm_xdlops_mk_kn_nm.hpp
host/driver_offline/include/device_gemm_xdlops_mk_nk_mn.hpp
+281
-48
host/driver_offline/include/device_gemm_xdlops_mk_nk_mn.hpp
host/driver_offline/include/device_gemm_xdlops_mk_nk_nm.hpp
+347
-0
host/driver_offline/include/device_gemm_xdlops_mk_nk_nm.hpp
host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp
+14
-6
host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp
host/driver_offline/src/conv_bwd_driver_offline.cpp
+1
-1
host/driver_offline/src/conv_bwd_driver_offline.cpp
host/driver_offline/src/conv_fwd_driver_offline.cpp
+1
-1
host/driver_offline/src/conv_fwd_driver_offline.cpp
host/driver_offline/src/gemm_driver_offline.cpp
+91
-97
host/driver_offline/src/gemm_driver_offline.cpp
host/host_tensor/include/gemm_common.hpp
+4
-0
host/host_tensor/include/gemm_common.hpp
host/host_tensor/include/host_gemm.hpp
+72
-0
host/host_tensor/include/host_gemm.hpp
with
2392 additions
and
335 deletions
+2392
-335
composable_kernel/include/tensor_description/multi_index_transform_helper.hpp
View file @
0fe6e258
...
...
@@ -31,7 +31,7 @@ __host__ __device__ constexpr auto make_left_pad_transform(
return
LeftPad
<
LowLength
,
LeftPadLength
,
SkipIsValidCheck
>
{
low_length
,
left_pad
};
}
template
<
typename
LowLength
,
typename
RightPadLength
,
bool
SkipIsValidCheck
>
template
<
typename
LowLength
,
typename
RightPadLength
,
bool
SkipIsValidCheck
=
false
>
__host__
__device__
constexpr
auto
make_right_pad_transform
(
const
LowLength
&
low_length
,
const
RightPadLength
&
right_pad
,
...
...
This diff is collapsed.
Click to expand it.
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
View file @
0fe6e258
...
...
@@ -29,7 +29,7 @@ __global__ void
FloatC
*
__restrict__
p_c_grid
,
const
AK0MK1GridDesc
a_k0_m_k1_grid_desc
,
const
BK0NK1GridDesc
b_k0_n_k1_grid_desc
,
const
CM0N0M1N1M2M3M4N2GridDesc
c_m0_
m1_m2_n
_grid_desc
,
const
CM0N0M1N1M2M3M4N2GridDesc
c_m0_
n0_m1_n1_m2_m3_m4_n2
_grid_desc
,
const
CBlockClusterAdaptor
c_block_cluster_adaptor
)
{
constexpr
index_t
shared_block_size
=
...
...
@@ -132,7 +132,9 @@ template <index_t BlockSize,
typename
CGridStepHacks
,
typename
AGridMoveSliceWindowStepHacks
,
typename
BGridMoveSliceWindowStepHacks
,
bool
CAccessOrderMRepeatNRepeat
>
bool
CAccessOrderMRepeatNRepeat
,
bool
ABlockLdsExtraM
,
bool
BBlockLdsExtraN
>
struct
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
...
...
@@ -151,14 +153,34 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
constexpr
auto
max_lds_align
=
K1
;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
a_k0_m_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
constexpr
auto
a_k0_m_k1_block_desc
=
[
&
]()
{
if
constexpr
(
ABlockLdsExtraM
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
make_tuple
(
Number
<
MPerBlock
+
1
>
{}
*
K1
,
K1
,
I1
));
}
else
{
return
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
}
}();
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
b_k0_n_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
constexpr
auto
b_k0_n_k1_block_desc
=
[
&
]()
{
if
constexpr
(
BBlockLdsExtraN
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
make_tuple
(
Number
<
NPerBlock
+
1
>
{}
*
K1
,
K1
,
I1
));
}
else
{
return
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
}
}();
// LDS allocation for A and B: be careful of alignment
constexpr
auto
a_block_space_size
=
...
...
@@ -170,29 +192,45 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
return
(
a_block_space_size
+
b_block_space_size
)
*
sizeof
(
FloatAB
);
}
// block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
__host__
__device__
static
constexpr
bool
CheckValidity
(
const
AK0MK1GridDesc
&
a_k0_m_k1_grid_desc
,
const
BK0NK1GridDesc
&
b_k0_n_k1_grid_desc
,
const
CMNGridDesc
&
c_m_n_grid_desc
)
const
CMNGridDesc
&
c_m_n_grid_desc
,
index_t
M01
,
index_t
N01
)
{
// TODO: turn on this
static_assert
(
is_known_at_compile_time
<
remove_cv_t
<
decltype
(
K1
)
>>::
value
,
"wrong! K1 need to be known at compile-time"
);
static_assert
((
MPerBlock
%
(
MPerXDL
*
MRepeat
)
==
0
)
&&
(
NPerBlock
%
(
NRepeat
*
NPerXDL
))
==
0
,
"Invalid tuning param!"
);
const
auto
M
=
a_k0_m_k1_grid_desc
.
GetLength
(
I1
);
const
auto
N
=
b_k0_n_k1_grid_desc
.
GetLength
(
I1
);
const
auto
K0
=
a_k0_m_k1_grid_desc
.
GetLength
(
I0
);
static_assert
((
MPerBlock
%
(
MPerXDL
*
MRepeat
)
==
0
)
&&
(
NPerBlock
%
(
NRepeat
*
NPerXDL
))
==
0
,
"Invalid tuning param!"
);
if
(
!
(
M
==
c_m_n_grid_desc
.
GetLength
(
I0
)
&&
N
==
c_m_n_grid_desc
.
GetLength
(
I1
)
&&
K0
==
b_k0_n_k1_grid_desc
.
GetLength
(
I0
)
&&
K1
==
a_k0_m_k1_grid_desc
.
GetLength
(
I2
)
&&
K1
==
b_k0_n_k1_grid_desc
.
GetLength
(
I2
)))
return
false
;
if
(
!
(
M
%
MPerBlock
==
0
&&
N
%
NPerBlock
==
0
&&
K0
%
KPerBlock
==
0
))
return
false
;
// check M01, N01
constexpr
auto
M1
=
Number
<
MPerBlock
>
{};
constexpr
auto
N1
=
Number
<
NPerBlock
>
{};
const
auto
M0
=
M
/
M1
;
const
auto
N0
=
N
/
N1
;
if
(
!
(
M0
%
M01
==
0
&&
N0
%
N01
==
0
))
return
false
;
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
return
(
M
==
c_m_n_grid_desc
.
GetLength
(
I0
)
&&
N
==
c_m_n_grid_desc
.
GetLength
(
I1
)
&&
K0
==
b_k0_n_k1_grid_desc
.
GetLength
(
I0
)
&&
K1
==
a_k0_m_k1_grid_desc
.
GetLength
(
I2
)
&&
K1
==
b_k0_n_k1_grid_desc
.
GetLength
(
I2
))
&&
(
M
%
MPerBlock
==
0
&&
N
%
NPerBlock
==
0
&&
K0
%
KPerBlock
==
0
);
return
true
;
}
__host__
__device__
static
constexpr
index_t
...
...
@@ -211,11 +249,35 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
{
constexpr
auto
max_lds_align
=
K1
;
constexpr
auto
a_k0_m_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
// A matrix in LDS memory, dst of blockwise copy
constexpr
auto
a_k0_m_k1_block_desc
=
[
&
]()
{
if
constexpr
(
ABlockLdsExtraM
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
make_tuple
(
Number
<
MPerBlock
+
1
>
{}
*
K1
,
K1
,
I1
));
}
else
{
return
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
}
}();
constexpr
auto
b_k0_n_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
constexpr
auto
b_k0_n_k1_block_desc
=
[
&
]()
{
if
constexpr
(
BBlockLdsExtraN
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
make_tuple
(
Number
<
NPerBlock
+
1
>
{}
*
K1
,
K1
,
I1
));
}
else
{
return
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
}
}();
using
BlockwiseGemm
=
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
<
BlockSize
,
...
...
@@ -231,8 +293,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
return
BlockwiseGemm
::
MakeCM0N0M1N1M2M3M4N2GridDescriptor
(
c_m_n_grid_desc
);
}
// return block_id to C matrix tile idx (m0, n0) mapping
__host__
__device__
static
constexpr
auto
MakeCBlockClusterAdaptor
(
const
CMNGridDesc
&
c_m_n_grid_desc
)
MakeCBlockClusterAdaptor
(
const
CMNGridDesc
&
c_m_n_grid_desc
,
index_t
M01
,
index_t
N01
)
{
const
auto
M
=
c_m_n_grid_desc
.
GetLength
(
I0
);
const
auto
N
=
c_m_n_grid_desc
.
GetLength
(
I1
);
...
...
@@ -243,23 +306,31 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
const
auto
M0
=
M
/
M1
;
const
auto
N0
=
N
/
N1
;
#if 1
const
auto
c_blockid_to_m0_n0_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
M0
,
N0
))),
make_tuple
(
Sequence
<
0
,
1
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
#elif 1
const
auto
M00
=
M0
/
M01
;
const
auto
N00
=
N0
/
N01
;
const
auto
m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_unmerge_transform
(
make_tuple
(
M00
,
M01
)),
make_unmerge_transform
(
make_tuple
(
N00
,
N01
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
,
3
>
{}));
const
auto
c_blockid_to_m00_m01_n00_n01_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
M00
,
N00
,
M01
,
N01
))),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
c_blockid_to_m0_n0_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
N0
,
M0
))),
make_tuple
(
Sequence
<
1
,
0
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
#endif
chain_tensor_adaptors
(
m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor
,
c_blockid_to_m00_m01_n00_n01_block_cluster_adaptor
);
return
c_blockid_to_m0_n0_block_cluster_adaptor
;
}
using
CM0N0M1N1M2M3M4N2GridDesc
=
decltype
(
MakeCM0N0M1N1M2M3M4N2GridDescriptor
(
CMNGridDesc
{}));
using
CBlockClusterAdaptor
=
decltype
(
MakeCBlockClusterAdaptor
(
CMNGridDesc
{}));
using
CBlockClusterAdaptor
=
decltype
(
MakeCBlockClusterAdaptor
(
CMNGridDesc
{}
,
1
,
1
));
__device__
static
void
Run
(
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
...
...
@@ -294,14 +365,34 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
constexpr
auto
max_lds_align
=
K1
;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
a_k0_m_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
constexpr
auto
a_k0_m_k1_block_desc
=
[
&
]()
{
if
constexpr
(
ABlockLdsExtraM
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
make_tuple
(
Number
<
MPerBlock
+
1
>
{}
*
K1
,
K1
,
I1
));
}
else
{
return
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
}
}();
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
b_k0_n_k1_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
constexpr
auto
b_k0_n_k1_block_desc
=
[
&
]()
{
if
constexpr
(
BBlockLdsExtraN
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
make_tuple
(
Number
<
NPerBlock
+
1
>
{}
*
K1
,
K1
,
I1
));
}
else
{
return
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
}
}();
// A matrix blockwise copy
auto
a_blockwise_copy
=
...
...
This diff is collapsed.
Click to expand it.
host/driver_offline/include/debug.hpp
0 → 100644
View file @
0fe6e258
#ifndef DEBUG_HPP
#define DEBUG_HPP
namespace
debug_driver_gemm_xdlops_v2r3
{
// these vars are on host, they control block_id to C matrix tile idx (m0, n0) mapping
static
ck
::
index_t
M01
=
1
;
static
ck
::
index_t
N01
=
1
;
}
// namespace debug_driver_gemm_xdlops_v2r3
#endif
This diff is collapsed.
Click to expand it.
host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp
View file @
0fe6e258
...
...
@@ -4,16 +4,8 @@
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template
<
typename
ABType
,
typename
AccType
,
typename
CType
,
typename
ADesc
,
typename
BDesc
,
typename
CDesc
>
void
device_gemm_xdlops_km_kn_mn
(
const
ADesc
&
a_k_m_grid_desc
,
const
BDesc
&
b_k_n_grid_desc
,
const
CDesc
&
c_m_n_grid_desc
,
const
Tensor
<
ABType
>&
a_k_m
,
template
<
typename
ABType
,
typename
AccType
,
typename
CType
>
void
device_gemm_xdlops_km_kn_mn
(
const
Tensor
<
ABType
>&
a_k_m
,
const
Tensor
<
ABType
>&
b_k_n
,
Tensor
<
CType
>&
c_m_n
,
ck
::
index_t
nrepeat
)
...
...
@@ -22,9 +14,6 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
std
::
cout
<<
__func__
<<
std
::
endl
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
DeviceMem
a_k_m_device_buf
(
sizeof
(
ABType
)
*
a_k_m
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
ABType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CType
)
*
c_m_n
.
mDesc
.
GetElementSpace
());
...
...
@@ -62,7 +51,91 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif
0
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
2
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
64
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
1
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
1
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
1
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
2
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
256
;
...
...
@@ -89,8 +162,36 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
MPerBlock
=
128
;
...
...
@@ -116,46 +217,101 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
2
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
64
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
1
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
1
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
1
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
2
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#endif
const
auto
K
=
a_k_m
_grid_d
esc
.
GetLength
(
I0
)
;
const
auto
M
=
a_k_m
_grid_d
esc
.
GetLength
(
I1
)
;
const
auto
N
=
b_k_n
_grid_d
esc
.
GetLength
(
I1
)
;
const
auto
K
=
a_k_m
.
mD
esc
.
GetLength
s
()[
0
]
;
const
auto
M
=
a_k_m
.
mD
esc
.
GetLength
s
()[
1
]
;
const
auto
N
=
b_k_n
.
mD
esc
.
GetLength
s
()[
1
]
;
constexpr
auto
K1Number
=
Number
<
K1
>
{};
const
auto
K0
=
K
/
K1Number
;
const
auto
a_k0_m_k1_grid_desc
=
transform_tensor_descriptor
(
a_k_m_grid_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
K0
,
K1Number
)),
make_pass_through_transform
(
M
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
M
,
K1Number
),
make_tuple
(
K1Number
*
a_k_m
.
mDesc
.
GetStrides
()[
0
],
a_k_m
.
mDesc
.
GetStrides
()[
1
],
a_k_m
.
mDesc
.
GetStrides
()[
0
]));
const
auto
b_k0_n_k1_grid_desc
=
transform_tensor_descriptor
(
b_k_n_grid_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
K0
,
K1Number
)),
make_pass_through_transform
(
N
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
N
,
K1Number
),
make_tuple
(
K1Number
*
b_k_n
.
mDesc
.
GetStrides
()[
0
],
b_k_n
.
mDesc
.
GetStrides
()[
1
],
b_k_n
.
mDesc
.
GetStrides
()[
0
]));
const
auto
c_m_n_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
c_m_n
.
mDesc
.
GetStrides
()[
0
],
c_m_n
.
mDesc
.
GetStrides
()[
1
]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0+: K0
Sequence
<
0
,
0
,
0
>
{},
// 1+: M
Sequence
<
0
,
0
,
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0-: K0
Sequence
<
0
,
0
,
0
>
{},
// 1-: M
Sequence
<
0
,
0
,
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0+: K0
Sequence
<
0
,
0
,
0
>
{},
// 1+: N
Sequence
<
0
,
0
,
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0-: K0
Sequence
<
0
,
0
,
0
>
{},
// 1-: N
Sequence
<
0
,
0
,
0
>
{}));
// 2-: K1
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: M
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: M
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: N
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: N
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: M0
...
...
@@ -175,9 +331,9 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6-: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 7-: N2
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
...
...
@@ -222,13 +378,17 @@ void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
decltype
(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
),
decltype
(
a_k0_m_k1_grid_move_slice_window_step_hacks
),
decltype
(
b_k0_n_k1_grid_move_slice_window_step_hacks
),
false
// CAccessOrderMRepeatNRepeat
false
,
// CAccessOrderMRepeatNRepeat
true
,
// ABlockLdsExtraM
true
// BBlockLdsExtraN
>
(
static_cast
<
ABType
*>
(
a_k_m_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ABType
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CType
*>
(
c_m_n_device_buf
.
GetDeviceBuffer
()),
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
,
debug_driver_gemm_xdlops_v2r3
::
M01
,
debug_driver_gemm_xdlops_v2r3
::
N01
,
a_k0_m_k1_grid_step_hacks
,
b_k0_n_k1_grid_step_hacks
,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
,
...
...
This diff is collapsed.
Click to expand it.
host/driver_offline/include/device_gemm_xdlops_km_kn_nm.hpp
0 → 100644
View file @
0fe6e258
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template
<
typename
ABType
,
typename
AccType
,
typename
CType
>
void
device_gemm_xdlops_km_kn_nm
(
const
Tensor
<
ABType
>&
a_k_m
,
const
Tensor
<
ABType
>&
b_k_n
,
Tensor
<
CType
>&
c_n_m
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
std
::
cout
<<
__func__
<<
std
::
endl
;
DeviceMem
a_k_m_device_buf
(
sizeof
(
ABType
)
*
a_k_m
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
ABType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
c_n_m_device_buf
(
sizeof
(
CType
)
*
c_n_m
.
mDesc
.
GetElementSpace
());
a_k_m_device_buf
.
ToDevice
(
a_k_m
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
c_n_m_device_buf
.
ToDevice
(
c_n_m
.
mData
.
data
());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif
0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
256
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
4
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
4
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
2
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
4
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
4
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#endif
const
auto
K
=
a_k_m
.
mDesc
.
GetLengths
()[
0
];
const
auto
M
=
a_k_m
.
mDesc
.
GetLengths
()[
1
];
const
auto
N
=
b_k_n
.
mDesc
.
GetLengths
()[
1
];
constexpr
auto
K1Number
=
Number
<
K1
>
{};
const
auto
K0
=
K
/
K1Number
;
const
auto
a_k0_m_k1_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
M
,
K1Number
),
make_tuple
(
K1Number
*
a_k_m
.
mDesc
.
GetStrides
()[
0
],
a_k_m
.
mDesc
.
GetStrides
()[
1
],
a_k_m
.
mDesc
.
GetStrides
()[
0
]));
const
auto
b_k0_n_k1_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
N
,
K1Number
),
make_tuple
(
K1Number
*
b_k_n
.
mDesc
.
GetStrides
()[
0
],
b_k_n
.
mDesc
.
GetStrides
()[
1
],
b_k_n
.
mDesc
.
GetStrides
()[
0
]));
const
auto
c_m_n_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
c_n_m
.
mDesc
.
GetStrides
()[
1
],
c_n_m
.
mDesc
.
GetStrides
()[
0
]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: M
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: M
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: N
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: N
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1+: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2+: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3+: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4+: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5+: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6+: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
// 7+: N2
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0-: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1-: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2-: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3-: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4-: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5-: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6-: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 7-: N2
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
float
ave_time
=
driver_gemm_xdlops_v2r3
<
BlockSize
,
ABType
,
AccType
,
CType
,
InMemoryDataOperationEnum_t
::
Set
,
decltype
(
a_k0_m_k1_grid_desc
),
decltype
(
b_k0_n_k1_grid_desc
),
decltype
(
c_m_n_grid_desc
),
MPerBlock
,
NPerBlock
,
KPerBlock
,
MPerXDL
,
NPerXDL
,
K1
,
MRepeat
,
NRepeat
,
ABlockTransferThreadSliceLengths_K0_M_K1
,
ABlockTransferThreadClusterLengths_K0_M_K1
,
Sequence
<
0
,
2
,
1
>
,
Sequence
<
0
,
2
,
1
>
,
1
,
ABlockTransferSrcScalarPerVector_M
,
ABlockTransferDstScalarPerVector_K1
,
false
,
// don't move back src coordinate after threadwise copy
BBlockTransferThreadSliceLengths_K0_N_K1
,
BBlockTransferThreadClusterLengths_K0_N_K1
,
Sequence
<
0
,
2
,
1
>
,
Sequence
<
0
,
2
,
1
>
,
1
,
BBlockTransferSrcScalarPerVector_N
,
BBlockTransferDstScalarPerVector_K1
,
false
,
// don't move back src coordinate after threadwise copy
Sequence
<
2
,
3
,
0
,
1
,
7
,
5
,
4
,
6
>
,
6
,
CThreadTransferDstScalarPerVector
,
decltype
(
a_k0_m_k1_grid_step_hacks
),
decltype
(
b_k0_n_k1_grid_step_hacks
),
decltype
(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
),
decltype
(
a_k0_m_k1_grid_move_slice_window_step_hacks
),
decltype
(
b_k0_n_k1_grid_move_slice_window_step_hacks
),
false
// CAccessOrderMRepeatNRepeat
>
(
static_cast
<
ABType
*>
(
a_k_m_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ABType
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CType
*>
(
c_n_m_device_buf
.
GetDeviceBuffer
()),
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
,
a_k0_m_k1_grid_step_hacks
,
b_k0_n_k1_grid_step_hacks
,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
,
a_k0_m_k1_grid_move_slice_window_step_hacks
,
b_k0_n_k1_grid_move_slice_window_step_hacks
,
nrepeat
);
float
perf
=
static_cast
<
float
>
((
std
::
size_t
(
2
)
*
M
*
N
*
K
))
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
}
// copy result back to host
c_n_m_device_buf
.
FromDevice
(
c_n_m
.
mData
.
data
());
}
This diff is collapsed.
Click to expand it.
host/driver_offline/include/device_gemm_xdlops_km_nk_mn.hpp
View file @
0fe6e258
...
...
@@ -4,16 +4,8 @@
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template
<
typename
ABType
,
typename
AccType
,
typename
CType
,
typename
ADesc
,
typename
BDesc
,
typename
CDesc
>
void
device_gemm_xdlops_km_nk_mn
(
const
ADesc
&
a_k_m_grid_desc
,
const
BDesc
&
b_n_k_grid_desc
,
const
CDesc
&
c_m_n_grid_desc
,
const
Tensor
<
ABType
>&
a_k_m
,
template
<
typename
ABType
,
typename
AccType
,
typename
CType
>
void
device_gemm_xdlops_km_nk_mn
(
const
Tensor
<
ABType
>&
a_k_m
,
const
Tensor
<
ABType
>&
b_n_k
,
Tensor
<
CType
>&
c_m_n
,
ck
::
index_t
nrepeat
)
...
...
@@ -22,9 +14,6 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
std
::
cout
<<
__func__
<<
std
::
endl
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
DeviceMem
a_k_m_device_buf
(
sizeof
(
ABType
)
*
a_k_m
.
mDesc
.
GetElementSpace
());
DeviceMem
b_n_k_device_buf
(
sizeof
(
ABType
)
*
b_n_k
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CType
)
*
c_m_n
.
mDesc
.
GetElementSpace
());
...
...
@@ -62,7 +51,91 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif
0
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
64
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
1
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
1
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
1
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
256
;
...
...
@@ -89,8 +162,36 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
MPerBlock
=
128
;
...
...
@@ -116,46 +217,101 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
64
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
1
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
1
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
1
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#endif
const
auto
K
=
a_k_m
_grid_d
esc
.
GetLength
(
I0
)
;
const
auto
M
=
a_k_m
_grid_d
esc
.
GetLength
(
I1
)
;
const
auto
N
=
b_n_k
_grid_d
esc
.
GetLength
(
I0
)
;
const
auto
K
=
a_k_m
.
mD
esc
.
GetLength
s
()[
0
]
;
const
auto
M
=
a_k_m
.
mD
esc
.
GetLength
s
()[
1
]
;
const
auto
N
=
b_n_k
.
mD
esc
.
GetLength
s
()[
0
]
;
constexpr
auto
K1Number
=
Number
<
K1
>
{};
const
auto
K0
=
K
/
K1Number
;
const
auto
a_k0_m_k1_grid_desc
=
transform_tensor_descriptor
(
a_k_m_grid_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
K0
,
K1Number
)),
make_pass_through_transform
(
M
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
M
,
K1Number
),
make_tuple
(
K1Number
*
a_k_m
.
mDesc
.
GetStrides
()[
0
],
a_k_m
.
mDesc
.
GetStrides
()[
1
],
a_k_m
.
mDesc
.
GetStrides
()[
0
]));
const
auto
b_k0_n_k1_grid_desc
=
transform_tensor_descriptor
(
b_n_k_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_unmerge_transform
(
make_tuple
(
K0
,
K1Number
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
,
2
>
{}));
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
N
,
K1Number
),
make_tuple
(
K1Number
*
b_n_k
.
mDesc
.
GetStrides
()[
1
],
b_n_k
.
mDesc
.
GetStrides
()[
0
],
b_n_k
.
mDesc
.
GetStrides
()[
1
]));
const
auto
c_m_n_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
c_m_n
.
mDesc
.
GetStrides
()[
0
],
c_m_n
.
mDesc
.
GetStrides
()[
1
]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0+: K0
Sequence
<
0
,
0
,
0
>
{},
// 1+: M
Sequence
<
0
,
0
,
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0-: K0
Sequence
<
0
,
0
,
0
>
{},
// 1-: M
Sequence
<
0
,
0
,
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0+: K0
Sequence
<
0
,
0
,
0
>
{},
// 1+: N
Sequence
<
0
,
0
,
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0-: K0
Sequence
<
0
,
0
,
0
>
{},
// 1-: N
Sequence
<
0
,
0
,
0
>
{}));
// 2-: K1
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: M
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: M
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: N
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: N
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: M0
...
...
@@ -175,9 +331,9 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6-: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 7-: N2
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
...
...
@@ -222,13 +378,17 @@ void device_gemm_xdlops_km_nk_mn(const ADesc& a_k_m_grid_desc,
decltype
(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
),
decltype
(
a_k0_m_k1_grid_move_slice_window_step_hacks
),
decltype
(
b_k0_n_k1_grid_move_slice_window_step_hacks
),
false
// CAccessOrderMRepeatNRepeat
false
,
// CAccessOrderMRepeatNRepeat
true
,
// ABlockLdsExtraM
true
// BBlockLdsExtraN
>
(
static_cast
<
ABType
*>
(
a_k_m_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ABType
*>
(
b_n_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CType
*>
(
c_m_n_device_buf
.
GetDeviceBuffer
()),
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
,
debug_driver_gemm_xdlops_v2r3
::
M01
,
debug_driver_gemm_xdlops_v2r3
::
N01
,
a_k0_m_k1_grid_step_hacks
,
b_k0_n_k1_grid_step_hacks
,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
,
...
...
This diff is collapsed.
Click to expand it.
host/driver_offline/include/device_gemm_xdlops_km_nk_nm.hpp
0 → 100644
View file @
0fe6e258
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template
<
typename
ABType
,
typename
AccType
,
typename
CType
>
void
device_gemm_xdlops_km_nk_nm
(
const
Tensor
<
ABType
>&
a_k_m
,
const
Tensor
<
ABType
>&
b_n_k
,
Tensor
<
CType
>&
c_n_m
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
std
::
cout
<<
__func__
<<
std
::
endl
;
DeviceMem
a_k_m_device_buf
(
sizeof
(
ABType
)
*
a_k_m
.
mDesc
.
GetElementSpace
());
DeviceMem
b_n_k_device_buf
(
sizeof
(
ABType
)
*
b_n_k
.
mDesc
.
GetElementSpace
());
DeviceMem
c_n_m_device_buf
(
sizeof
(
CType
)
*
c_n_m
.
mDesc
.
GetElementSpace
());
a_k_m_device_buf
.
ToDevice
(
a_k_m
.
mData
.
data
());
b_n_k_device_buf
.
ToDevice
(
b_n_k
.
mData
.
data
());
c_n_m_device_buf
.
ToDevice
(
c_n_m
.
mData
.
data
());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif
0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
256
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
4
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
4
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
4
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
4
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#endif
const
auto
K
=
a_k_m
.
mDesc
.
GetLengths
()[
0
];
const
auto
M
=
a_k_m
.
mDesc
.
GetLengths
()[
1
];
const
auto
N
=
b_n_k
.
mDesc
.
GetLengths
()[
0
];
constexpr
auto
K1Number
=
Number
<
K1
>
{};
const
auto
K0
=
K
/
K1Number
;
const
auto
a_k0_m_k1_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
M
,
K1Number
),
make_tuple
(
K1Number
*
a_k_m
.
mDesc
.
GetStrides
()[
0
],
a_k_m
.
mDesc
.
GetStrides
()[
1
],
a_k_m
.
mDesc
.
GetStrides
()[
0
]));
const
auto
b_k0_n_k1_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
N
,
K1Number
),
make_tuple
(
K1Number
*
b_n_k
.
mDesc
.
GetStrides
()[
1
],
b_n_k
.
mDesc
.
GetStrides
()[
0
],
b_n_k
.
mDesc
.
GetStrides
()[
1
]));
const
auto
c_m_n_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
c_n_m
.
mDesc
.
GetStrides
()[
1
],
c_n_m
.
mDesc
.
GetStrides
()[
0
]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: M
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: M
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: N
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: N
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1+: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2+: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3+: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4+: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5+: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6+: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
// 7+: N2
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0-: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1-: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2-: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3-: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4-: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5-: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6-: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 7-: N2
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
float
ave_time
=
driver_gemm_xdlops_v2r3
<
BlockSize
,
ABType
,
AccType
,
CType
,
InMemoryDataOperationEnum_t
::
Set
,
decltype
(
a_k0_m_k1_grid_desc
),
decltype
(
b_k0_n_k1_grid_desc
),
decltype
(
c_m_n_grid_desc
),
MPerBlock
,
NPerBlock
,
KPerBlock
,
MPerXDL
,
NPerXDL
,
K1
,
MRepeat
,
NRepeat
,
ABlockTransferThreadSliceLengths_K0_M_K1
,
ABlockTransferThreadClusterLengths_K0_M_K1
,
Sequence
<
0
,
2
,
1
>
,
Sequence
<
0
,
2
,
1
>
,
1
,
ABlockTransferSrcScalarPerVector_M
,
ABlockTransferDstScalarPerVector_K1
,
false
,
// don't move back src coordinate after threadwise copy
BBlockTransferThreadSliceLengths_K0_N_K1
,
BBlockTransferThreadClusterLengths_K0_N_K1
,
Sequence
<
1
,
0
,
2
>
,
Sequence
<
1
,
0
,
2
>
,
2
,
BBlockTransferSrcScalarPerVector_K1
,
BBlockTransferDstScalarPerVector_K1
,
false
,
// don't move back src coordinate after threadwise copy
Sequence
<
2
,
3
,
0
,
1
,
7
,
5
,
4
,
6
>
,
6
,
CThreadTransferDstScalarPerVector
,
decltype
(
a_k0_m_k1_grid_step_hacks
),
decltype
(
b_k0_n_k1_grid_step_hacks
),
decltype
(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
),
decltype
(
a_k0_m_k1_grid_move_slice_window_step_hacks
),
decltype
(
b_k0_n_k1_grid_move_slice_window_step_hacks
),
false
// CAccessOrderMRepeatNRepeat
>
(
static_cast
<
ABType
*>
(
a_k_m_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ABType
*>
(
b_n_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CType
*>
(
c_n_m_device_buf
.
GetDeviceBuffer
()),
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
,
a_k0_m_k1_grid_step_hacks
,
b_k0_n_k1_grid_step_hacks
,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
,
a_k0_m_k1_grid_move_slice_window_step_hacks
,
b_k0_n_k1_grid_move_slice_window_step_hacks
,
nrepeat
);
float
perf
=
static_cast
<
float
>
((
std
::
size_t
(
2
)
*
M
*
N
*
K
))
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
}
// copy result back to host
c_n_m_device_buf
.
FromDevice
(
c_n_m
.
mData
.
data
());
}
This diff is collapsed.
Click to expand it.
host/driver_offline/include/device_gemm_xdlops_mk_kn_mn.hpp
View file @
0fe6e258
...
...
@@ -4,16 +4,8 @@
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template
<
typename
ABType
,
typename
AccType
,
typename
CType
,
typename
ADesc
,
typename
BDesc
,
typename
CDesc
>
void
device_gemm_xdlops_mk_kn_mn
(
const
ADesc
&
a_m_k_grid_desc
,
const
BDesc
&
b_k_n_grid_desc
,
const
CDesc
&
c_m_n_grid_desc
,
const
Tensor
<
ABType
>&
a_m_k
,
template
<
typename
ABType
,
typename
AccType
,
typename
CType
>
void
device_gemm_xdlops_mk_kn_mn
(
const
Tensor
<
ABType
>&
a_m_k
,
const
Tensor
<
ABType
>&
b_k_n
,
Tensor
<
CType
>&
c_m_n
,
ck
::
index_t
nrepeat
)
...
...
@@ -22,9 +14,6 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
std
::
cout
<<
__func__
<<
std
::
endl
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
DeviceMem
a_m_k_device_buf
(
sizeof
(
ABType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
ABType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CType
)
*
c_m_n
.
mDesc
.
GetElementSpace
());
...
...
@@ -34,7 +23,119 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
c_m_n_device_buf
.
ToDevice
(
c_m_n
.
mData
.
data
());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif
0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
2
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
64
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
1
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
1
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
2
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
256
;
...
...
@@ -89,8 +190,8 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif
1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
#elif
0
// [M, N, K0, K1] = [128, 128, 4, 8]
, C = 128,
for fp16
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
MPerBlock
=
128
;
...
...
@@ -116,46 +217,101 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
2
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
64
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
1
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
1
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
2
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#endif
const
auto
K
=
a_m_k
_grid_d
esc
.
GetLength
(
I1
)
;
const
auto
M
=
a_m_k
_grid_d
esc
.
GetLength
(
I0
)
;
const
auto
N
=
b_k_n
_grid_d
esc
.
GetLength
(
I1
)
;
const
auto
K
=
a_m_k
.
mD
esc
.
GetLength
s
()[
1
]
;
const
auto
M
=
a_m_k
.
mD
esc
.
GetLength
s
()[
0
]
;
const
auto
N
=
b_k_n
.
mD
esc
.
GetLength
s
()[
1
]
;
constexpr
auto
K1Number
=
Number
<
K1
>
{};
const
auto
K0
=
K
/
K1Number
;
const
auto
a_k0_m_k1_grid_desc
=
transform_tensor_descriptor
(
a_m_k_grid_desc
,
make_tuple
(
make_pass_through_transform
(
M
),
make_unmerge_transform
(
make_tuple
(
K0
,
K1Number
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
,
2
>
{}));
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
M
,
K1Number
),
make_tuple
(
K1Number
*
a_m_k
.
mDesc
.
GetStrides
()[
1
],
a_m_k
.
mDesc
.
GetStrides
()[
0
],
a_m_k
.
mDesc
.
GetStrides
()[
1
]));
const
auto
b_k0_n_k1_grid_desc
=
transform_tensor_descriptor
(
b_k_n_grid_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
K0
,
K1Number
)),
make_pass_through_transform
(
N
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
N
,
K1Number
),
make_tuple
(
K1Number
*
b_k_n
.
mDesc
.
GetStrides
()[
0
],
b_k_n
.
mDesc
.
GetStrides
()[
1
],
b_k_n
.
mDesc
.
GetStrides
()[
0
]));
const
auto
c_m_n_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
c_m_n
.
mDesc
.
GetStrides
()[
0
],
c_m_n
.
mDesc
.
GetStrides
()[
1
]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0+: K0
Sequence
<
0
,
0
,
0
>
{},
// 1+: M
Sequence
<
0
,
0
,
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0-: K0
Sequence
<
0
,
0
,
0
>
{},
// 1-: M
Sequence
<
0
,
0
,
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0+: K0
Sequence
<
0
,
0
,
0
>
{},
// 1+: N
Sequence
<
0
,
0
,
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0-: K0
Sequence
<
0
,
0
,
0
>
{},
// 1-: N
Sequence
<
0
,
0
,
0
>
{}));
// 2-: K1
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: M
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: M
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: N
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: N
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: M0
...
...
@@ -175,9 +331,9 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6-: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 7-: N2
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
...
...
@@ -222,13 +378,17 @@ void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
decltype
(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
),
decltype
(
a_k0_m_k1_grid_move_slice_window_step_hacks
),
decltype
(
b_k0_n_k1_grid_move_slice_window_step_hacks
),
false
// CAccessOrderMRepeatNRepeat
false
,
// CAccessOrderMRepeatNRepeat
true
,
// ABlockLdsExtraM
true
// BBlockLdsExtraN
>
(
static_cast
<
ABType
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ABType
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CType
*>
(
c_m_n_device_buf
.
GetDeviceBuffer
()),
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
,
debug_driver_gemm_xdlops_v2r3
::
M01
,
debug_driver_gemm_xdlops_v2r3
::
N01
,
a_k0_m_k1_grid_step_hacks
,
b_k0_n_k1_grid_step_hacks
,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
,
...
...
This diff is collapsed.
Click to expand it.
host/driver_offline/include/device_gemm_xdlops_mk_kn_nm.hpp
0 → 100644
View file @
0fe6e258
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template
<
typename
ABType
,
typename
AccType
,
typename
CType
>
void
device_gemm_xdlops_mk_kn_nm
(
const
Tensor
<
ABType
>&
a_m_k
,
const
Tensor
<
ABType
>&
b_k_n
,
Tensor
<
CType
>&
c_n_m
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
std
::
cout
<<
__func__
<<
std
::
endl
;
DeviceMem
a_m_k_device_buf
(
sizeof
(
ABType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
ABType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
c_n_m_device_buf
(
sizeof
(
CType
)
*
c_n_m
.
mDesc
.
GetElementSpace
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
c_n_m_device_buf
.
ToDevice
(
c_n_m
.
mData
.
data
());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif
0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
256
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
4
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
4
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
2
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 1
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
4
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
4
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_N
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#endif
const
auto
K
=
a_m_k
.
mDesc
.
GetLengths
()[
1
];
const
auto
M
=
a_m_k
.
mDesc
.
GetLengths
()[
0
];
const
auto
N
=
b_k_n
.
mDesc
.
GetLengths
()[
1
];
constexpr
auto
K1Number
=
Number
<
K1
>
{};
const
auto
K0
=
K
/
K1Number
;
const
auto
a_k0_m_k1_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
M
,
K1Number
),
make_tuple
(
K1Number
*
a_m_k
.
mDesc
.
GetStrides
()[
1
],
a_m_k
.
mDesc
.
GetStrides
()[
0
],
a_m_k
.
mDesc
.
GetStrides
()[
1
]));
const
auto
b_k0_n_k1_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
N
,
K1Number
),
make_tuple
(
K1Number
*
b_k_n
.
mDesc
.
GetStrides
()[
0
],
b_k_n
.
mDesc
.
GetStrides
()[
1
],
b_k_n
.
mDesc
.
GetStrides
()[
0
]));
const
auto
c_m_n_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
c_n_m
.
mDesc
.
GetStrides
()[
1
],
c_n_m
.
mDesc
.
GetStrides
()[
0
]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: M
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: M
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: N
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: N
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1+: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2+: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3+: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4+: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5+: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6+: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
// 7+: N2
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0-: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1-: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2-: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3-: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4-: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5-: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6-: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 7-: N2
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
float
ave_time
=
driver_gemm_xdlops_v2r3
<
BlockSize
,
ABType
,
AccType
,
CType
,
InMemoryDataOperationEnum_t
::
Set
,
decltype
(
a_k0_m_k1_grid_desc
),
decltype
(
b_k0_n_k1_grid_desc
),
decltype
(
c_m_n_grid_desc
),
MPerBlock
,
NPerBlock
,
KPerBlock
,
MPerXDL
,
NPerXDL
,
K1
,
MRepeat
,
NRepeat
,
ABlockTransferThreadSliceLengths_K0_M_K1
,
ABlockTransferThreadClusterLengths_K0_M_K1
,
Sequence
<
1
,
0
,
2
>
,
Sequence
<
1
,
0
,
2
>
,
2
,
ABlockTransferSrcScalarPerVector_K1
,
ABlockTransferDstScalarPerVector_K1
,
false
,
// don't move back src coordinate after threadwise copy
BBlockTransferThreadSliceLengths_K0_N_K1
,
BBlockTransferThreadClusterLengths_K0_N_K1
,
Sequence
<
0
,
2
,
1
>
,
Sequence
<
0
,
2
,
1
>
,
1
,
BBlockTransferSrcScalarPerVector_N
,
BBlockTransferDstScalarPerVector_K1
,
false
,
// don't move back src coordinate after threadwise copy
Sequence
<
2
,
3
,
0
,
1
,
7
,
5
,
4
,
6
>
,
6
,
CThreadTransferDstScalarPerVector
,
decltype
(
a_k0_m_k1_grid_step_hacks
),
decltype
(
b_k0_n_k1_grid_step_hacks
),
decltype
(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
),
decltype
(
a_k0_m_k1_grid_move_slice_window_step_hacks
),
decltype
(
b_k0_n_k1_grid_move_slice_window_step_hacks
),
false
// CAccessOrderMRepeatNRepeat
>
(
static_cast
<
ABType
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ABType
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CType
*>
(
c_n_m_device_buf
.
GetDeviceBuffer
()),
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
,
a_k0_m_k1_grid_step_hacks
,
b_k0_n_k1_grid_step_hacks
,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
,
a_k0_m_k1_grid_move_slice_window_step_hacks
,
b_k0_n_k1_grid_move_slice_window_step_hacks
,
nrepeat
);
float
perf
=
static_cast
<
float
>
((
std
::
size_t
(
2
)
*
M
*
N
*
K
))
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
}
// copy result back to host
c_n_m_device_buf
.
FromDevice
(
c_n_m
.
mData
.
data
());
}
This diff is collapsed.
Click to expand it.
host/driver_offline/include/device_gemm_xdlops_mk_nk_mn.hpp
View file @
0fe6e258
...
...
@@ -4,16 +4,8 @@
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template
<
typename
ABType
,
typename
AccType
,
typename
CType
,
typename
ADesc
,
typename
BDesc
,
typename
CDesc
>
void
device_gemm_xdlops_mk_nk_mn
(
const
ADesc
&
a_m_k_grid_desc
,
const
BDesc
&
b_n_k_grid_desc
,
const
CDesc
&
c_m_n_grid_desc
,
const
Tensor
<
ABType
>&
a_m_k
,
template
<
typename
ABType
,
typename
AccType
,
typename
CType
>
void
device_gemm_xdlops_mk_nk_mn
(
const
Tensor
<
ABType
>&
a_m_k
,
const
Tensor
<
ABType
>&
b_n_k
,
Tensor
<
CType
>&
c_m_n
,
ck
::
index_t
nrepeat
)
...
...
@@ -22,9 +14,6 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
std
::
cout
<<
__func__
<<
std
::
endl
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
DeviceMem
a_m_k_device_buf
(
sizeof
(
ABType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
b_n_k_device_buf
(
sizeof
(
ABType
)
*
b_n_k
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CType
)
*
c_m_n
.
mDesc
.
GetElementSpace
());
...
...
@@ -34,6 +23,34 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
c_m_n_device_buf
.
ToDevice
(
c_m_n
.
mData
.
data
());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif
0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr
index_t
BlockSize
=
256
;
...
...
@@ -62,7 +79,63 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
// [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
64
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
1
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
1
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
256
;
...
...
@@ -90,7 +163,7 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
// [M, N, K0, K1] = [128, 256, 4, 8]
, C = 128,
for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
...
...
@@ -117,8 +190,8 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif
1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
#elif
0
// [M, N, K0, K1] = [128, 128, 4, 8]
, C = 128,
for fp16
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
MPerBlock
=
128
;
...
...
@@ -144,46 +217,131 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 8], C = 64, for fp16
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
MPerBlock
=
64
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
64
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
1
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
1
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
1
;
#endif
const
auto
K
=
a_m_k
_grid_d
esc
.
GetLength
(
I1
)
;
const
auto
M
=
a_m_k
_grid_d
esc
.
GetLength
(
I0
)
;
const
auto
N
=
b_n_k
_grid_d
esc
.
GetLength
(
I0
)
;
const
auto
K
=
a_m_k
.
mD
esc
.
GetLength
s
()[
1
]
;
const
auto
M
=
a_m_k
.
mD
esc
.
GetLength
s
()[
0
]
;
const
auto
N
=
b_n_k
.
mD
esc
.
GetLength
s
()[
0
]
;
constexpr
auto
K1Number
=
Number
<
K1
>
{};
const
auto
K0
=
K
/
K1Number
;
#if 1
// non-padded GEMM
const
auto
a_k0_m_k1_grid_desc
=
transform_tensor_descriptor
(
a_m_k_grid_desc
,
make_tuple
(
make_pass_through_transform
(
M
),
make_unmerge_transform
(
make_tuple
(
K0
,
K1Number
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
,
2
>
{}));
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
M
,
K1Number
),
make_tuple
(
K1Number
*
a_m_k
.
mDesc
.
GetStrides
()[
1
],
a_m_k
.
mDesc
.
GetStrides
()[
0
],
a_m_k
.
mDesc
.
GetStrides
()[
1
]));
const
auto
b_k0_n_k1_grid_desc
=
transform_tensor_descriptor
(
b_n_k_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_unmerge_transform
(
make_tuple
(
K0
,
K1Number
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
,
2
>
{}));
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
N
,
K1Number
),
make_tuple
(
K1Number
*
b_n_k
.
mDesc
.
GetStrides
()[
1
],
b_n_k
.
mDesc
.
GetStrides
()[
0
],
b_n_k
.
mDesc
.
GetStrides
()[
1
]));
const
auto
c_m_n_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
c_m_n
.
mDesc
.
GetStrides
()[
0
],
c_m_n
.
mDesc
.
GetStrides
()[
1
]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0+: K0
Sequence
<
0
,
0
,
0
>
{},
// 1+: M
Sequence
<
0
,
0
,
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0-: K0
Sequence
<
0
,
0
,
0
>
{},
// 1-: M
Sequence
<
0
,
0
,
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0+: K0
Sequence
<
0
,
0
,
0
>
{},
// 1+: N
Sequence
<
0
,
0
,
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
// 0-: K0
Sequence
<
0
,
0
,
0
>
{},
// 1-: N
Sequence
<
0
,
0
,
0
>
{}));
// 2-: K1
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: M
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: M
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: N
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: N
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: M0
...
...
@@ -203,9 +361,80 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6-: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 7-: N2
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
#else
// padded GEMM
const
auto
a_k0_m_k1_grid_desc_tmp
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
M
,
K1Number
),
make_tuple
(
K1Number
*
a_m_k
.
mDesc
.
GetStrides
()[
1
],
a_m_k
.
mDesc
.
GetStrides
()[
0
],
a_m_k
.
mDesc
.
GetStrides
()[
1
]));
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
,
0
,
0
>
{};
const
auto
MRightPad
=
math
::
integer_divide_ceil
(
M
,
MPerBlock
)
*
MPerBlock
-
M
;
const
auto
a_k0_m_k1_grid_desc
=
transform_tensor_descriptor
(
a_k0_m_k1_grid_desc_tmp
,
make_tuple
(
make_pass_through_transform
(
K0
),
make_right_pad_transform
(
M
,
MRightPad
),
make_pass_through_transform
(
K1Number
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}));
const
auto
b_k0_n_k1_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
N
,
K1Number
),
make_tuple
(
K1Number
*
b_n_k
.
mDesc
.
GetStrides
()[
1
],
b_n_k
.
mDesc
.
GetStrides
()[
0
],
b_n_k
.
mDesc
.
GetStrides
()[
1
]));
const
auto
c_m_n_grid_desc_tmp
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
c_m_n
.
mDesc
.
GetStrides
()[
0
],
c_m_n
.
mDesc
.
GetStrides
()[
1
]));
const
auto
c_m_n_grid_desc
=
transform_tensor_descriptor
(
c_m_n_grid_desc_tmp
,
make_tuple
(
make_right_pad_transform
(
M
,
MRightPad
),
make_pass_through_transform
(
N
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
>
{},
// 0+: K0
Sequence
<
0
,
0
,
0
,
0
>
{},
// 1+: M
Sequence
<
0
,
0
,
0
,
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
>
{},
// 0-: K0
Sequence
<
0
,
0
,
0
,
0
>
{},
// 1-: M
Sequence
<
0
,
0
,
0
,
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: N
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: N
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1+: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2+: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3+: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4+: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5+: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6+: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
// 7+: N2
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0-: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1-: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2-: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3-: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4-: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5-: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6-: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 7-: N2
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
,
0
,
0
,
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
#endif
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
...
...
@@ -250,13 +479,17 @@ void device_gemm_xdlops_mk_nk_mn(const ADesc& a_m_k_grid_desc,
decltype
(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
),
decltype
(
a_k0_m_k1_grid_move_slice_window_step_hacks
),
decltype
(
b_k0_n_k1_grid_move_slice_window_step_hacks
),
false
// CAccessOrderMRepeatNRepeat
false
,
// CAccessOrderMRepeatNRepeat
true
,
// ABlockLdsExtraM
true
// BBlockLdsExtraN
>
(
static_cast
<
ABType
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ABType
*>
(
b_n_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CType
*>
(
c_m_n_device_buf
.
GetDeviceBuffer
()),
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
,
debug_driver_gemm_xdlops_v2r3
::
M01
,
debug_driver_gemm_xdlops_v2r3
::
N01
,
a_k0_m_k1_grid_step_hacks
,
b_k0_n_k1_grid_step_hacks
,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
,
...
...
This diff is collapsed.
Click to expand it.
host/driver_offline/include/device_gemm_xdlops_mk_nk_nm.hpp
0 → 100644
View file @
0fe6e258
#pragma once
#include <unistd.h>
#include "debug.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template
<
typename
ABType
,
typename
AccType
,
typename
CType
>
void
device_gemm_xdlops_mk_nk_nm
(
const
Tensor
<
ABType
>&
a_m_k
,
const
Tensor
<
ABType
>&
b_n_k
,
Tensor
<
CType
>&
c_n_m
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
std
::
cout
<<
__func__
<<
std
::
endl
;
DeviceMem
a_m_k_device_buf
(
sizeof
(
ABType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
b_n_k_device_buf
(
sizeof
(
ABType
)
*
b_n_k
.
mDesc
.
GetElementSpace
());
DeviceMem
c_n_m_device_buf
(
sizeof
(
CType
)
*
c_n_m
.
mDesc
.
GetElementSpace
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_n_k_device_buf
.
ToDevice
(
b_n_k
.
mData
.
data
());
c_n_m_device_buf
.
ToDevice
(
c_n_m
.
mData
.
data
());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif
0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 0
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
256
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
4
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
4
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
4
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
4
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
64
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
1
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
1
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#endif
const
auto
K
=
a_m_k
.
mDesc
.
GetLengths
()[
1
];
const
auto
M
=
a_m_k
.
mDesc
.
GetLengths
()[
0
];
const
auto
N
=
b_n_k
.
mDesc
.
GetLengths
()[
0
];
constexpr
auto
K1Number
=
Number
<
K1
>
{};
const
auto
K0
=
K
/
K1Number
;
const
auto
a_k0_m_k1_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
M
,
K1Number
),
make_tuple
(
K1Number
*
a_m_k
.
mDesc
.
GetStrides
()[
1
],
a_m_k
.
mDesc
.
GetStrides
()[
0
],
a_m_k
.
mDesc
.
GetStrides
()[
1
]));
const
auto
b_k0_n_k1_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
N
,
K1Number
),
make_tuple
(
K1Number
*
b_n_k
.
mDesc
.
GetStrides
()[
1
],
b_n_k
.
mDesc
.
GetStrides
()[
0
],
b_n_k
.
mDesc
.
GetStrides
()[
1
]));
const
auto
c_m_n_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
c_n_m
.
mDesc
.
GetStrides
()[
1
],
c_n_m
.
mDesc
.
GetStrides
()[
0
]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: M
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: M
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: N
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: N
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1+: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2+: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3+: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4+: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5+: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6+: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
// 7+: N2
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0-: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1-: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2-: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3-: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4-: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5-: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6-: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 7-: N2
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
float
ave_time
=
driver_gemm_xdlops_v2r3
<
BlockSize
,
ABType
,
AccType
,
CType
,
InMemoryDataOperationEnum_t
::
Set
,
decltype
(
a_k0_m_k1_grid_desc
),
decltype
(
b_k0_n_k1_grid_desc
),
decltype
(
c_m_n_grid_desc
),
MPerBlock
,
NPerBlock
,
KPerBlock
,
MPerXDL
,
NPerXDL
,
K1
,
MRepeat
,
NRepeat
,
ABlockTransferThreadSliceLengths_K0_M_K1
,
ABlockTransferThreadClusterLengths_K0_M_K1
,
Sequence
<
1
,
0
,
2
>
,
Sequence
<
1
,
0
,
2
>
,
2
,
ABlockTransferSrcScalarPerVector_K1
,
ABlockTransferDstScalarPerVector_K1
,
false
,
// don't move back src coordinate after threadwise copy
BBlockTransferThreadSliceLengths_K0_N_K1
,
BBlockTransferThreadClusterLengths_K0_N_K1
,
Sequence
<
1
,
0
,
2
>
,
Sequence
<
1
,
0
,
2
>
,
2
,
BBlockTransferSrcScalarPerVector_K1
,
BBlockTransferDstScalarPerVector_K1
,
false
,
// don't move back src coordinate after threadwise copy
Sequence
<
2
,
3
,
0
,
1
,
7
,
5
,
4
,
6
>
,
6
,
CThreadTransferDstScalarPerVector
,
decltype
(
a_k0_m_k1_grid_step_hacks
),
decltype
(
b_k0_n_k1_grid_step_hacks
),
decltype
(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
),
decltype
(
a_k0_m_k1_grid_move_slice_window_step_hacks
),
decltype
(
b_k0_n_k1_grid_move_slice_window_step_hacks
),
false
// CAccessOrderMRepeatNRepeat
>
(
static_cast
<
ABType
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ABType
*>
(
b_n_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CType
*>
(
c_n_m_device_buf
.
GetDeviceBuffer
()),
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
,
a_k0_m_k1_grid_step_hacks
,
b_k0_n_k1_grid_step_hacks
,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
,
a_k0_m_k1_grid_move_slice_window_step_hacks
,
b_k0_n_k1_grid_move_slice_window_step_hacks
,
nrepeat
);
float
perf
=
static_cast
<
float
>
((
std
::
size_t
(
2
)
*
M
*
N
*
K
))
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
}
// copy result back to host
c_n_m_device_buf
.
FromDevice
(
c_n_m
.
mData
.
data
());
}
This diff is collapsed.
Click to expand it.
host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp
View file @
0fe6e258
#ifndef DRIVER_GEMM_XDLOPS_V2R3
#define DRIVER_GEMM_XDLOPS_V2R3
#ifndef DRIVER_GEMM_XDLOPS_V2R3
_HPP
#define DRIVER_GEMM_XDLOPS_V2R3
_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
...
...
@@ -46,13 +46,17 @@ template <ck::index_t BlockSize,
typename
CGridStepHacks
,
typename
AGridMoveSliceWindowStepHacks
,
typename
BGridMoveSliceWindowStepHacks
,
bool
CAccessOrderMRepeatNRepeat
>
bool
CAccessOrderMRepeatNRepeat
,
bool
ABlockLdsAddExtraM
,
bool
BBlockLdsAddExtraN
>
__host__
float
driver_gemm_xdlops_v2r3
(
const
FloatAB
*
p_a_grid
,
const
FloatAB
*
p_b_grid
,
FloatC
*
p_c_grid
,
const
AK0MK1GridDesc
&
a_k0_m_k1_grid_desc
,
const
BK0NK1GridDesc
&
b_k0_n_k1_grid_desc
,
const
CMNGridDesc
&
c_m_n_grid_desc
,
ck
::
index_t
M01
,
ck
::
index_t
N01
,
AGridStepHacks
,
BGridStepHacks
,
CGridStepHacks
,
...
...
@@ -108,7 +112,9 @@ __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
CGridStepHacks
,
AGridMoveSliceWindowStepHacks
,
BGridMoveSliceWindowStepHacks
,
CAccessOrderMRepeatNRepeat
>
;
CAccessOrderMRepeatNRepeat
,
ABlockLdsAddExtraM
,
BBlockLdsAddExtraN
>
;
{
std
::
cout
<<
"a_k0_m_k1_grid_desc{"
<<
a_k0_m_k1_grid_desc
.
GetLength
(
I0
)
<<
", "
...
...
@@ -123,7 +129,8 @@ __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
<<
c_m_n_grid_desc
.
GetLength
(
I1
)
<<
"}"
<<
std
::
endl
;
}
if
(
!
GridwiseGemm
::
CheckValidity
(
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
))
if
(
!
GridwiseGemm
::
CheckValidity
(
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
,
M01
,
N01
))
{
throw
std
::
runtime_error
(
"wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting"
);
...
...
@@ -134,7 +141,8 @@ __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
using
CM0N0M1N1M2M3M4N2GridDesc
=
decltype
(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc
);
const
auto
c_block_cluster_adaptor
=
GridwiseGemm
::
MakeCBlockClusterAdaptor
(
c_m_n_grid_desc
);
const
auto
c_block_cluster_adaptor
=
GridwiseGemm
::
MakeCBlockClusterAdaptor
(
c_m_n_grid_desc
,
M01
,
N01
);
using
CBlockClusterAdaptor
=
decltype
(
c_block_cluster_adaptor
);
...
...
This diff is collapsed.
Click to expand it.
host/driver_offline/src/conv_bwd_driver_offline.cpp
View file @
0fe6e258
...
...
@@ -16,7 +16,7 @@
#include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_MODE 1
#define USE_CONV_BWD_V4R1_XDL_NHWC
1
#define USE_CONV_BWD_V4R1_XDL_NHWC
0
#define USE_CONV_BWD_V4R1R2_XDL_NHWC 1
enum
ConvBackwardDataAlgo
...
...
This diff is collapsed.
Click to expand it.
host/driver_offline/src/conv_fwd_driver_offline.cpp
View file @
0fe6e258
...
...
@@ -24,7 +24,7 @@
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW
1
#define USE_CONV_FWD_V4R4R2_XDL_NCHW
0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1
enum
ConvForwardAlgo
...
...
This diff is collapsed.
Click to expand it.
host/driver_offline/src/gemm_driver_offline.cpp
View file @
0fe6e258
...
...
@@ -5,6 +5,7 @@
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -16,11 +17,19 @@
#include "device_gemm_xdlops_mk_nk_mn.hpp"
#include "device_gemm_xdlops_km_kn_mn.hpp"
#include "device_gemm_xdlops_km_nk_mn.hpp"
#include "device_gemm_xdlops_mk_kn_nm.hpp"
#include "device_gemm_xdlops_mk_nk_nm.hpp"
#include "device_gemm_xdlops_km_kn_nm.hpp"
#include "device_gemm_xdlops_km_nk_nm.hpp"
#define USE_GEMM_XDL_MK_KN_MN 1
#define USE_GEMM_XDL_MK_NK_MN 1
#define USE_GEMM_XDL_KM_KN_MN 1
#define USE_GEMM_XDL_KM_NK_MN 1
#define USE_GEMM_XDL_MK_KN_NM 0
#define USE_GEMM_XDL_MK_NK_NM 0
#define USE_GEMM_XDL_KM_KN_NM 0
#define USE_GEMM_XDL_KM_NK_NM 0
enum
GemmAlgo
{
...
...
@@ -28,21 +37,21 @@ enum GemmAlgo
Xdl_MK_NK_MN
,
// 1
Xdl_KM_KN_MN
,
// 2
Xdl_KM_NK_MN
,
// 3
Xdl_MK_KN_NM
,
// 4
Xdl_MK_NK_NM
,
// 5
Xdl_KM_KN_NM
,
// 6
Xdl_KM_NK_NM
,
// 7
};
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
// dynamic mode
if
(
argc
!=
10
)
if
(
argc
!=
12
)
{
printf
(
"arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat
\n
"
);
printf
(
"rest: M, N, K
\n
"
);
printf
(
"rest: M, N, K, debug_driver_gemm_xdlops_v2r3::M01, "
"debug_driver_gemm_xdlops_v2r3::N01
\n
"
);
exit
(
1
);
}
...
...
@@ -57,6 +66,9 @@ int main(int argc, char* argv[])
const
index_t
N
=
std
::
stoi
(
argv
[
8
]);
const
index_t
K
=
std
::
stoi
(
argv
[
9
]);
debug_driver_gemm_xdlops_v2r3
::
M01
=
std
::
stoi
(
argv
[
10
]);
debug_driver_gemm_xdlops_v2r3
::
N01
=
std
::
stoi
(
argv
[
11
]);
#if 0
using ab_data_t = float;
using acc_data_t = float;
...
...
@@ -74,69 +86,44 @@ int main(int argc, char* argv[])
std
::
vector
<
std
::
size_t
>
a_lengths_host
(
2
),
b_lengths_host
(
2
),
c_lengths_host
(
2
);
std
::
vector
<
std
::
size_t
>
a_strides_host
(
2
),
b_strides_host
(
2
),
c_strides_host
(
2
);
if
(
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
// A
if
(
layout
==
GemmMatrixLayout
::
MK_KN_MN
||
layout
==
GemmMatrixLayout
::
MK_NK_MN
||
layout
==
GemmMatrixLayout
::
MK_KN_NM
||
layout
==
GemmMatrixLayout
::
MK_NK_NM
)
{
a_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
M
);
a_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K
);
a_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
a_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
b_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
b_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
N
);
b_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
b_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
c_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
M
);
c_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
N
);
c_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
c_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
}
else
if
(
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
else
{
a_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
M
);
a_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K
);
a_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
a_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
a_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
M
);
a_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
M
);
a_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
}
// B
if
(
layout
==
GemmMatrixLayout
::
MK_NK_MN
||
layout
==
GemmMatrixLayout
::
KM_NK_MN
||
layout
==
GemmMatrixLayout
::
MK_NK_NM
||
layout
==
GemmMatrixLayout
::
KM_NK_NM
)
{
b_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
b_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K
);
b_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
b_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
c_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
M
);
c_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
N
);
c_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
c_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
}
else
if
(
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
else
{
a_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
a_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
M
);
a_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
M
);
a_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
b_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
b_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
N
);
b_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
b_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
c_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
M
);
c_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
N
);
c_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
c_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
}
else
if
(
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
a_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
a_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
M
);
a_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
M
);
a_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
b_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
b_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K
);
b_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K
);
b_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
// C
if
(
layout
==
GemmMatrixLayout
::
MK_KN_MN
||
layout
==
GemmMatrixLayout
::
KM_KN_MN
||
layout
==
GemmMatrixLayout
::
MK_NK_MN
||
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
c_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
M
);
c_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
N
);
c_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
...
...
@@ -144,7 +131,10 @@ int main(int argc, char* argv[])
}
else
{
std
::
runtime_error
(
"wrong! not implemented"
);
c_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
c_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
M
);
c_strides_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
M
);
c_strides_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
1
);
}
Tensor
<
ab_data_t
>
a
(
a_lengths_host
,
a_strides_host
);
...
...
@@ -185,38 +175,6 @@ int main(int argc, char* argv[])
b
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
-
0.5
,
0.5
},
num_thread
);
}
auto
f_make_for_device_mk_kn_mn
=
[
&
]()
{
const
auto
a_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
K
),
make_tuple
(
K
,
I1
));
const
auto
b_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K
,
N
),
make_tuple
(
N
,
I1
));
const
auto
c_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
N
,
I1
));
return
make_tuple
(
a_desc
,
b_desc
,
c_desc
);
};
auto
f_make_for_device_mk_nk_mn
=
[
&
]()
{
const
auto
a_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
K
),
make_tuple
(
K
,
I1
));
const
auto
b_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
N
,
K
),
make_tuple
(
K
,
I1
));
const
auto
c_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
N
,
I1
));
return
make_tuple
(
a_desc
,
b_desc
,
c_desc
);
};
auto
f_make_for_device_km_kn_mn
=
[
&
]()
{
const
auto
a_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K
,
M
),
make_tuple
(
M
,
I1
));
const
auto
b_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K
,
N
),
make_tuple
(
N
,
I1
));
const
auto
c_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
N
,
I1
));
return
make_tuple
(
a_desc
,
b_desc
,
c_desc
);
};
auto
f_make_for_device_km_nk_mn
=
[
&
]()
{
const
auto
a_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K
,
M
),
make_tuple
(
M
,
I1
));
const
auto
b_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
N
,
K
),
make_tuple
(
K
,
I1
));
const
auto
c_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
N
,
I1
));
return
make_tuple
(
a_desc
,
b_desc
,
c_desc
);
};
#if USE_GEMM_XDL_MK_KN_MN
if
(
algo
==
GemmAlgo
::
Xdl_MK_KN_MN
)
{
...
...
@@ -225,10 +183,7 @@ int main(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! layout"
);
}
const
auto
descs
=
f_make_for_device_mk_kn_mn
();
device_gemm_xdlops_mk_kn_mn
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
descs
[
I0
],
descs
[
I1
],
descs
[
I2
],
a
,
b
,
c_device
,
nrepeat
);
device_gemm_xdlops_mk_kn_mn
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
a
,
b
,
c_device
,
nrepeat
);
}
#endif
...
...
@@ -240,10 +195,7 @@ int main(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! layout"
);
}
const
auto
descs
=
f_make_for_device_mk_nk_mn
();
device_gemm_xdlops_mk_nk_mn
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
descs
[
I0
],
descs
[
I1
],
descs
[
I2
],
a
,
b
,
c_device
,
nrepeat
);
device_gemm_xdlops_mk_nk_mn
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
a
,
b
,
c_device
,
nrepeat
);
}
#endif
...
...
@@ -255,10 +207,7 @@ int main(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! layout"
);
}
const
auto
descs
=
f_make_for_device_km_kn_mn
();
device_gemm_xdlops_km_kn_mn
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
descs
[
I0
],
descs
[
I1
],
descs
[
I2
],
a
,
b
,
c_device
,
nrepeat
);
device_gemm_xdlops_km_kn_mn
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
a
,
b
,
c_device
,
nrepeat
);
}
#endif
...
...
@@ -270,10 +219,55 @@ int main(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! layout"
);
}
const
auto
descs
=
f_make_for_device_km_nk_mn
();
device_gemm_xdlops_km_nk_mn
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
a
,
b
,
c_device
,
nrepeat
);
}
#endif
#if USE_GEMM_XDL_MK_KN_NM
if
(
algo
==
GemmAlgo
::
Xdl_MK_KN_NM
)
{
if
(
layout
!=
GemmMatrixLayout
::
MK_KN_NM
)
{
throw
std
::
runtime_error
(
"wrong! layout"
);
}
device_gemm_xdlops_mk_kn_nm
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
a
,
b
,
c_device
,
nrepeat
);
}
#endif
#if USE_GEMM_XDL_MK_NK_NM
if
(
algo
==
GemmAlgo
::
Xdl_MK_NK_NM
)
{
if
(
layout
!=
GemmMatrixLayout
::
MK_NK_NM
)
{
throw
std
::
runtime_error
(
"wrong! layout"
);
}
device_gemm_xdlops_mk_nk_nm
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
a
,
b
,
c_device
,
nrepeat
);
}
#endif
#if USE_GEMM_XDL_KM_KN_NM
if
(
algo
==
GemmAlgo
::
Xdl_KM_KN_NM
)
{
if
(
layout
!=
GemmMatrixLayout
::
KM_KN_NM
)
{
throw
std
::
runtime_error
(
"wrong! layout"
);
}
device_gemm_xdlops_km_kn_nm
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
a
,
b
,
c_device
,
nrepeat
);
}
#endif
#if USE_GEMM_XDL_KM_NK_NM
if
(
algo
==
GemmAlgo
::
Xdl_KM_NK_NM
)
{
if
(
layout
!=
GemmMatrixLayout
::
KM_NK_NM
)
{
throw
std
::
runtime_error
(
"wrong! layout"
);
}
device_gemm_xdlops_km_nk_mn
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
descs
[
I0
],
descs
[
I1
],
descs
[
I2
],
a
,
b
,
c_device
,
nrepeat
);
device_gemm_xdlops_km_nk_nm
<
ab_data_t
,
acc_data_t
,
c_data_t
>
(
a
,
b
,
c_device
,
nrepeat
);
}
#endif
...
...
This diff is collapsed.
Click to expand it.
host/host_tensor/include/gemm_common.hpp
View file @
0fe6e258
...
...
@@ -7,6 +7,10 @@ enum GemmMatrixLayout
MK_NK_MN
,
// 1
KM_KN_MN
,
// 2
KM_NK_MN
,
// 3
MK_KN_NM
,
// 4
MK_NK_NM
,
// 5
KM_KN_NM
,
// 6
KM_NK_NM
,
// 7
};
#endif
This diff is collapsed.
Click to expand it.
host/host_tensor/include/host_gemm.hpp
View file @
0fe6e258
...
...
@@ -80,6 +80,78 @@ void host_gemm(const Tensor<AType>& a,
make_ParallelTensorFunctor
(
f_km_nk_mn
,
c
.
mDesc
.
GetLengths
()[
0
],
c
.
mDesc
.
GetLengths
()[
1
])(
std
::
thread
::
hardware_concurrency
());
}
else
if
(
layout
==
GemmMatrixLayout
::
MK_KN_NM
)
{
auto
f_mk_kn_nm
=
[
&
](
auto
n
,
auto
m
)
{
const
int
K
=
a
.
mDesc
.
GetLengths
()[
1
];
double
v
=
0
;
for
(
int
k
=
0
;
k
<
K
;
++
k
)
{
v
+=
static_cast
<
const
double
>
(
a
(
m
,
k
))
*
static_cast
<
const
double
>
(
b
(
k
,
n
));
}
c
(
n
,
m
)
=
v
;
};
make_ParallelTensorFunctor
(
f_mk_kn_nm
,
c
.
mDesc
.
GetLengths
()[
0
],
c
.
mDesc
.
GetLengths
()[
1
])(
std
::
thread
::
hardware_concurrency
());
}
else
if
(
layout
==
GemmMatrixLayout
::
MK_NK_NM
)
{
auto
f_mk_nk_nm
=
[
&
](
auto
n
,
auto
m
)
{
const
int
K
=
a
.
mDesc
.
GetLengths
()[
1
];
double
v
=
0
;
for
(
int
k
=
0
;
k
<
K
;
++
k
)
{
v
+=
static_cast
<
const
double
>
(
a
(
m
,
k
))
*
static_cast
<
const
double
>
(
b
(
n
,
k
));
}
c
(
n
,
m
)
=
v
;
};
make_ParallelTensorFunctor
(
f_mk_nk_nm
,
c
.
mDesc
.
GetLengths
()[
0
],
c
.
mDesc
.
GetLengths
()[
1
])(
std
::
thread
::
hardware_concurrency
());
}
else
if
(
layout
==
GemmMatrixLayout
::
KM_KN_NM
)
{
auto
f_km_kn_nm
=
[
&
](
auto
n
,
auto
m
)
{
const
int
K
=
a
.
mDesc
.
GetLengths
()[
0
];
double
v
=
0
;
for
(
int
k
=
0
;
k
<
K
;
++
k
)
{
v
+=
static_cast
<
const
double
>
(
a
(
k
,
m
))
*
static_cast
<
const
double
>
(
b
(
k
,
n
));
}
c
(
n
,
m
)
=
v
;
};
make_ParallelTensorFunctor
(
f_km_kn_nm
,
c
.
mDesc
.
GetLengths
()[
0
],
c
.
mDesc
.
GetLengths
()[
1
])(
std
::
thread
::
hardware_concurrency
());
}
else
if
(
layout
==
GemmMatrixLayout
::
KM_NK_NM
)
{
auto
f_km_nk_nm
=
[
&
](
auto
n
,
auto
m
)
{
const
int
K
=
a
.
mDesc
.
GetLengths
()[
0
];
double
v
=
0
;
for
(
int
k
=
0
;
k
<
K
;
++
k
)
{
v
+=
static_cast
<
const
double
>
(
a
(
k
,
m
))
*
static_cast
<
const
double
>
(
b
(
n
,
k
));
}
c
(
n
,
m
)
=
v
;
};
make_ParallelTensorFunctor
(
f_km_nk_nm
,
c
.
mDesc
.
GetLengths
()[
0
],
c
.
mDesc
.
GetLengths
()[
1
])(
std
::
thread
::
hardware_concurrency
());
}
else
{
throw
std
::
runtime_error
(
"wrong! not supported layout"
);
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment
Menu
Projects
Groups
Snippets
Help