Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
liyinrong
composable_kernel
Commits
3d66465a
Commit
3d66465a
authored
4 years ago
by
Jing Zhang
Browse files
Options
Download
Email Patches
Plain Diff
add non-pad
parent
bf111ac6
No related merge requests found
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+74
-35
...convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+11
-6
...tion_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
+2
-2
...ble_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
+8
-114
...nel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
+23
-0
...le_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+3
-3
...convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
driver/src/conv_driver.cpp
+15
-1
driver/src/conv_driver.cpp
with
136 additions
and
161 deletions
+136
-161
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
View file @
3d66465a
...
...
@@ -31,6 +31,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
{
template
<
typename
...
Wei
,
typename
...
In
,
typename
...
Add
,
typename
...
Out
,
typename
ConvStrides
,
typename
ConvDilations
,
...
...
@@ -38,6 +39,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
typename
InRightPads
>
__host__
void
Run
(
const
DynamicTensorDescriptor
<
Wei
...
>&
wei_k_c_y_x_global_desc
,
const
DynamicTensorDescriptor
<
In
...
>&
in_n_c_hi_wi_global_desc
,
const
DynamicTensorDescriptor
<
Add
...
>&
add_n_k0_hox2_wox2_k1_global_desc
,
const
DynamicTensorDescriptor
<
Out
...
>&
out_n_k0_ho_wo_k1_global_desc
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
...
...
@@ -45,6 +47,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
const
InRightPads
&
in_right_pads
,
const
FloatAB
*
__restrict__
p_wei_global
,
const
FloatAB
*
__restrict__
p_in_global
,
const
FloatC
*
__restrict__
p_d_global
,
FloatC
*
__restrict__
p_out_global
)
const
{
constexpr
auto
I0
=
Number
<
0
>
{};
...
...
@@ -63,6 +66,9 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
const
auto
Ho
=
out_n_k0_ho_wo_k1_global_desc
.
GetLength
(
I2
);
const
auto
Wo
=
out_n_k0_ho_wo_k1_global_desc
.
GetLength
(
I3
);
const
auto
Hox2
=
Ho
*
2
;
const
auto
Wox2
=
Wo
*
2
;
const
auto
K1
=
out_n_k0_ho_wo_k1_global_desc
.
GetLength
(
I4
);
const
auto
K
=
wei_k_c_y_x_global_desc
.
GetLength
(
I0
);
...
...
@@ -127,6 +133,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
make_tuple
(
Sequence
<
1
,
4
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
// add tensor
const
auto
add_k_n_hopx2_wopx2_global_desc
=
transform_dynamic_tensor_descriptor
(
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
N
,
K0
,
Hox2
,
Wox2
)),
make_tuple
(
make_pass_through_transform
(
K0
),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
Hox2
),
make_pass_through_transform
(
Wox2
)),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
E
=
C
*
Y
*
X
;
if
(
!
((
K
%
KPerBlock
)
==
0
&&
(
Ho
%
HoPerBlock
)
==
0
&&
(
Wo
%
WoPerBlock
)
==
0
&&
...
...
@@ -158,16 +174,15 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr
auto
c_k_n_ho_wo_global_tensor_iterator_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
1
,
0
,
0
,
0
>
{},
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
2
,
0
,
0
,
0
>
{},
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}));
#if 0
// GEMM
using
gridwise_gemm
=
GridwiseDynamicGemm_km_kn_mn_v3
<
BlockSize
,
...
...
@@ -177,6 +192,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
InMemoryDataOperation
::
Set
,
decltype
(
wei_e_k_global_desc
),
decltype
(
in_e_n_ho_wo_global_desc
),
decltype
(
add_k_n_hopx2_wopx2_global_desc
),
decltype
(
out_k_n_ho_wo_global_desc
),
KPerBlock
,
HoPerBlock
,
...
...
@@ -228,13 +244,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
for
(
index_t
j
=
0
;
j
<
nrepeat
;
++
j
)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
#if 0
//if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(add_k_n_hopx2_wopx2_global_desc),
const FloatC*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, true>,
...
...
@@ -249,22 +268,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
add_k_n_hopx2_wopx2_global_desc,
p_d_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
#elif
1
// else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
decltype
(
wei_e_k_global_desc
),
const
FloatAB
*
,
decltype
(
in_e_n_ho_wo_global_desc
),
const
FloatAB
*
,
decltype
(
add_k_n_hopx2_wopx2_global_desc
),
const
FloatC
*
,
decltype
(
out_k_n_ho_wo_global_desc
),
FloatC
*
,
integral_constant
<
bool
,
true
>
,
integral_constant
<
bool
,
false
>>
;
launch_kernel
(
kernel
,
dim3
(
GridSize
),
...
...
@@ -275,22 +300,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global
,
in_e_n_ho_wo_global_desc
,
p_in_global
,
add_k_n_hopx2_wopx2_global_desc
,
p_d_global
,
out_k_n_ho_wo_global_desc
,
p_out_global
,
integral_constant
<
bool
,
true
>
{},
integral_constant
<
bool
,
false
>
{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
#elif 1
// else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
decltype
(
wei_e_k_global_desc
),
const
FloatAB
*
,
decltype
(
in_e_n_ho_wo_global_desc
),
const
FloatAB
*
,
decltype
(
add_k_n_hopx2_wopx2_global_desc
),
const
FloatC
*
,
decltype
(
out_k_n_ho_wo_global_desc
),
FloatC
*
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
true
>>
;
launch_kernel
(
kernel
,
dim3
(
GridSize
),
...
...
@@ -301,22 +332,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global
,
in_e_n_ho_wo_global_desc
,
p_in_global
,
add_k_n_hopx2_wopx2_global_desc
,
p_d_global
,
out_k_n_ho_wo_global_desc
,
p_out_global
,
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
true
>
{});
}
else
#elif 1
// else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
decltype
(
wei_e_k_global_desc
),
const
FloatAB
*
,
decltype
(
in_e_n_ho_wo_global_desc
),
const
FloatAB
*
,
decltype
(
add_k_n_hopx2_wopx2_global_desc
),
const
FloatC
*
,
decltype
(
out_k_n_ho_wo_global_desc
),
FloatC
*
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
false
>>
;
launch_kernel
(
kernel
,
dim3
(
GridSize
),
...
...
@@ -327,11 +364,14 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global
,
in_e_n_ho_wo_global_desc
,
p_in_global
,
add_k_n_hopx2_wopx2_global_desc
,
p_d_global
,
out_k_n_ho_wo_global_desc
,
p_out_global
,
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
false
>
{});
}
#endif
}
timer
.
End
();
...
...
@@ -346,7 +386,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
}
#endif
}
};
}
// namespace ck
...
...
This diff is collapsed.
Click to expand it.
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
View file @
3d66465a
...
...
@@ -192,11 +192,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr
auto
c_k_n_ho_wo_global_tensor_iterator_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
1
,
0
,
0
,
0
>
{},
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
2
,
0
,
0
,
0
>
{},
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}));
...
...
@@ -262,7 +262,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
for
(
index_t
j
=
0
;
j
<
nrepeat
;
++
j
)
{
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
#if 0
//if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
...
...
@@ -293,7 +294,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
#elif
1
// else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
...
...
@@ -324,7 +326,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant
<
bool
,
true
>
{},
integral_constant
<
bool
,
false
>
{});
}
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
#elif 1
// else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
...
...
@@ -355,7 +358,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
true
>
{});
}
else
#elif 1
// else
{
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
...
...
@@ -386,6 +390,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
false
>
{});
}
#endif
}
timer
.
End
();
...
...
This diff is collapsed.
Click to expand it.
composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
View file @
3d66465a
...
...
@@ -134,8 +134,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
constexpr
auto
KPerThreadSubC
=
4
;
constexpr
auto
HoPerThreadSubC
=
2
;
constexpr
auto
WoPerThreadSubC
=
2
;
constexpr
auto
HoPerThreadSubC
=
HPerThread
;
constexpr
auto
WoPerThreadSubC
=
WPerThread
;
static_assert
(
KPerThread
%
KPerThreadSubC
==
0
,
""
);
static_assert
(
HPerThread
%
HoPerThreadSubC
==
0
,
""
);
...
...
This diff is collapsed.
Click to expand it.
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
View file @
3d66465a
...
...
@@ -175,6 +175,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
const
index_t
ho_block_data_on_global
=
ho_block_work_id
*
HoPerBlock
;
const
index_t
wo_block_data_on_global
=
wo_block_work_id
*
WoPerBlock
;
const
index_t
k_thread_data_on_global
=
k_block_data_on_global
+
k_thread_id
*
KPerThread
;
const
index_t
ho_thread_data_on_global
=
ho_block_data_on_global
+
ho_thread_id
*
HoPerThread
;
const
index_t
wo_thread_data_on_global
=
...
...
@@ -262,11 +263,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
a_blockwise_copy
.
RunWrite
(
a_e_k_desc
,
p_a_block
);
}
__syncthreads
();
block_sync_lds
();
//__syncthreads();
#if 1
constexpr
auto
KPerThreadAdd
=
KPerThread
/
CThreadTransferDstScalarPerVector
;
constexpr
auto
HoPerThreadx2
=
HoPerThread
*
2
;
constexpr
auto
WoPerThreadx2
=
WoPerThread
*
2
;
...
...
@@ -281,25 +281,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
constexpr
auto
c_k_n_ho_wo_global_tensor_iterator_hacks
=
CGlobalIteratorHacks
{};
const
index_t
hox2_block_data_on_global
=
ho_block_work_id
*
HoPerBlock
*
2
;
const
index_t
wox2_block_data_on_global
=
wo_block_work_id
*
WoPerBlock
*
2
;
const
index_t
hox2_thread_data_on_global
=
hox2_block_data_on_global
+
ho_thread_id
*
HoPerThreadx2
;
const
index_t
wox2_thread_data_on_global
=
wox2_block_data_on_global
+
wo_thread_id
*
WoPerThreadx2
;
const
index_t
hox2_thread_data_on_global
=
ho_thread_data_on_global
*
2
;
const
index_t
wox2_thread_data_on_global
=
wo_thread_data_on_global
*
2
;
const
index_t
k_thread_data_on_global_add
=
k_thread_data_on_global
/
CThreadTransferDstScalarPerVector
;
static_assert
(
KPerThread
%
CThreadTransferDstScalarPerVector
==
0
,
""
);
static_assert
(
CThreadTransferDstScalarPerVector
==
16
,
""
);
const
index_t
k_block_data_on_global_add
=
k_block_work_id
*
KPerBlock
/
CThreadTransferDstScalarPerVector
;
const
index_t
k_thread_data_on_global_add
=
k_block_data_on_global_add
+
k_thread_id
*
KPerThreadAdd
;
static_assert
(
vec_len
==
256
,
""
);
vector_type
<
int8_t
,
vec_len
>
d_vec
;
#if 1
{
ThreadwiseDynamicTensorSliceTransfer_v2
<
...
...
@@ -419,104 +412,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
#endif
// output: register to global memory
#if 0
{
constexpr auto HoPerThreadx2 = HoPerThread * 2;
constexpr auto WoPerThreadx2 = WoPerThread * 2;
const index_t hox2_block_data_on_global = ho_block_work_id * HoPerBlock * 2;
const index_t wox2_block_data_on_global = wo_block_work_id * WoPerBlock * 2;
const index_t hox2_thread_data_on_global =
hox2_block_data_on_global + ho_thread_id * HoPerThreadx2;
const index_t wox2_thread_data_on_global =
wox2_block_data_on_global + wo_thread_id * WoPerThreadx2;
static_assert(KPerThread % CThreadTransferDstScalarPerVector == 0, "");
constexpr auto KPerThreadAdd = KPerThread / CThreadTransferDstScalarPerVector;
const index_t k_block_data_on_global_add =
k_block_work_id * KPerBlock / CThreadTransferDstScalarPerVector;
const index_t k_thread_data_on_global_add =
k_block_data_on_global_add + k_thread_id * KPerThreadAdd;
constexpr auto d_k_n_hox2_wox2_thread_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<1>{}, Number<1>{}, Number<1>{}, Number<1>{}));
constexpr auto vector_len = CThreadTransferDstScalarPerVector;
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
vector_type<int8_t, vector_len> d_vec;
for(index_t k_i = 0; k_i < KPerThreadAdd; ++k_i)
{
for(index_t h_i = 0; h_i < HoPerThreadx2; ++h_i)
{
for(index_t w_i = 0; w_i < WoPerThreadx2; ++w_i)
{
ThreadwiseDynamicTensorSliceTransfer_v2<
FloatC,
decltype(d_vec),
decltype(d_k_n_hox2_wox2_global_desc),
decltype(d_k_n_hox2_wox2_thread_desc),
Sequence<1, 1, 1, 1>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
// CThreadTransferDstScalarPerVector,
1,
AddressSpace::Global,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
1,
true>(d_k_n_hox2_wox2_global_desc,
make_multi_index(k_thread_data_on_global_add + k_i,
0,
hox2_thread_data_on_global + h_i,
wox2_thread_data_on_global + w_i))
.Run2(d_k_n_hox2_wox2_global_desc,
p_d_global,
d_k_n_hox2_wox2_thread_desc,
make_tuple(I0, I0, I0, I0),
d_vec,
c_k_n_ho_wo_global_tensor_iterator_hacks);
static_for<0, vector_len, 1>{}([&](auto i) {
d_vec.template AsType<int8_t>()(i) +=
p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset(
make_tuple(k_i * vector_len + i, 0, h_i / 2, w_i / 2))];
});
ThreadwiseDynamicTensorSliceTransfer_v1r3<
decltype(d_vec),
FloatC,
decltype(d_k_n_hox2_wox2_thread_desc),
decltype(d_k_n_hox2_wox2_global_desc),
Sequence<1, 1, 1, 1>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
// CThreadTransferDstScalarPerVector,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
CGlobalMemoryDataOperation,
1,
true>(d_k_n_hox2_wox2_global_desc,
make_multi_index(k_thread_data_on_global_add + k_i,
0,
hox2_thread_data_on_global + h_i,
wox2_thread_data_on_global + w_i))
.Run2(d_k_n_hox2_wox2_thread_desc,
make_tuple(I0, I0, I0, I0),
d_vec,
d_k_n_hox2_wox2_global_desc,
p_c_global,
c_k_n_ho_wo_global_tensor_iterator_hacks);
}
}
}
}
#else
{
static_for
<
0
,
KPerThreadAdd
,
1
>
{}([
&
](
auto
k_i
)
{
static_for
<
0
,
HoPerThreadx2
,
1
>
{}([
&
](
auto
h_i
)
{
...
...
@@ -569,7 +464,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
p_c_global
,
c_k_n_ho_wo_global_tensor_iterator_hacks
);
}
#endif
}
// pass tensor descriptor by reference
...
...
This diff is collapsed.
Click to expand it.
composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
View file @
3d66465a
...
...
@@ -109,6 +109,29 @@ struct ThreadwiseGemm_km_kn_mn_v3
p_c
[
c_offset_2
],
p_c
[
c_offset_3
]);
}
else
if
constexpr
(
H
==
1
&&
W
==
4
)
{
constexpr
auto
b_offset_0
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
0
,
0
));
constexpr
auto
b_offset_1
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
0
,
1
));
constexpr
auto
b_offset_2
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
0
,
2
));
constexpr
auto
b_offset_3
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
0
,
3
));
constexpr
auto
c_offset_0
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
0
,
0
));
constexpr
auto
c_offset_1
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
0
,
1
));
constexpr
auto
c_offset_2
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
0
,
2
));
constexpr
auto
c_offset_3
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
0
,
3
));
amd_assembly_outer_product_1x4
(
p_a
[
a_offset
],
p_b
[
b_offset_0
],
p_b
[
b_offset_1
],
p_b
[
b_offset_2
],
p_b
[
b_offset_3
],
p_c
[
c_offset_0
],
p_c
[
c_offset_1
],
p_c
[
c_offset_2
],
p_c
[
c_offset_3
]);
}
else
{
static_for
<
0
,
H
,
1
>
{}([
&
](
auto
h
)
{
...
...
This diff is collapsed.
Click to expand it.
driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
View file @
3d66465a
...
...
@@ -142,8 +142,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr
index_t
EPerBlock
=
C0
;
constexpr
index_t
KPerThread
=
KPerBlock
;
constexpr
index_t
HoPerThread
=
2
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
HoPerThread
=
4
;
constexpr
index_t
WoPerThread
=
1
;
constexpr
index_t
EPerThread
=
EPerBlock
;
using
ABlockTransferThreadSliceLengths_E_K
=
Sequence
<
9
,
1
>
;
...
...
@@ -184,7 +184,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
#endif
constexpr
auto
conv_driver
=
#if
0
#if
1
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
...
...
This diff is collapsed.
Click to expand it.
driver/src/conv_driver.cpp
View file @
3d66465a
...
...
@@ -95,7 +95,7 @@ int main(int argc, char* argv[])
#elif 1
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
HI
=
54
0
;
constexpr
index_t
HI
=
54
4
;
constexpr
index_t
WI
=
960
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
...
...
@@ -118,6 +118,20 @@ int main(int argc, char* argv[])
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
HI
=
135
;
constexpr
index_t
WI
=
240
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif 0
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment
Menu
Projects
Groups
Snippets
Help