Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
liyinrong
composable_kernel
Commits
7ec80cef
Commit
7ec80cef
authored
5 years ago
by
kevin.chang
Browse files
Options
Download
Email Patches
Plain Diff
enable vectorx4-load,vectorx2-load and fake-extend odd to even image
parent
4904281f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+68
-2
...ridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+73
-50
...tensor_operation/threadwise_generic_tensor_slice_copy.hpp
composable_kernel/include/utility/common_header.hpp
+3
-0
composable_kernel/include/utility/common_header.hpp
driver/src/conv_driver.cpp
+13
-1
driver/src/conv_driver.cpp
with
157 additions
and
53 deletions
+157
-53
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
7ec80cef
...
...
@@ -75,6 +75,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
constexpr
index_t
ConvDilationH
=
ConvDilations
{}[
0
];
constexpr
index_t
ConvDilationW
=
ConvDilations
{}[
1
];
#if CK_EXTEND_IMAGE_SIZE_PAD_W
#else
// sanity-check for vectorized memory load
static_assert
((
Wo
==
1
||
(
ConvStrideW
==
1
||
GemmBBlockCopySrcDataPerRead_GemmN
==
1
))
&&
(
X
==
1
||
ConvDilationW
%
GemmBBlockCopySrcDataPerRead_GemmN
==
0
)
&&
...
...
@@ -82,6 +84,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
InRightPads
{}[
1
]
%
GemmBBlockCopySrcDataPerRead_GemmN
==
0
,
"wrong! aligment requirement for vectorized global load of input tensor will "
"be violated"
);
#endif
// weight tensor
constexpr
auto
wei_e_k_global_desc
=
reorder_tensor_descriptor_given_upper2lower
(
...
...
@@ -107,20 +110,83 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
Embed
<
Wip
,
Sequence
<
X
,
Wo
>
,
Sequence
<
ConvDilationW
,
ConvStrideW
,
0
>>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
#if CK_EXTEND_IMAGE_SIZE_PAD_HW
constexpr
auto
in_n_c_y_ho_x_wo_global_desc_unfold
=
transform_tensor_descriptor
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
PassThrough
<
N
>
{},
PassThrough
<
C
>
{},
PassThrough
<
Y
>
{},
PassThrough
<
X
>
{},
Merge
<
Sequence
<
Ho
,
Wo
>>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
4
>
{},
Sequence
<
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
constexpr
auto
in_n_c_y_ho_x_wo_global_desc_unfold_padd
=
transform_tensor_descriptor
(
in_n_c_y_ho_x_wo_global_desc_unfold
,
make_tuple
(
PassThrough
<
N
>
{},
PassThrough
<
C
>
{},
PassThrough
<
Y
>
{},
PassThrough
<
X
>
{},
Pad
<
Sequence
<
Ho
*
Wo
>
,
Sequence
<
0
>
,
Sequence
<
3
>>
{}),
//Transforms,
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
constexpr
auto
in_e_b_global_desc
=
transform_tensor_descriptor
(
in_n_c_y_ho_x_wo_global_desc_unfold_padd
,
make_tuple
(
Merge
<
Sequence
<
C
,
Y
,
X
>>
{},
Merge
<
Sequence
<
N
,
52
>>
{}),
make_tuple
(
Sequence
<
1
,
2
,
3
>
{},
Sequence
<
0
,
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
#else
constexpr
auto
in_e_b_global_desc
=
transform_tensor_descriptor
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
Merge
<
Sequence
<
C
,
Y
,
X
>>
{},
Merge
<
Sequence
<
N
,
Ho
,
Wo
>>
{}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
#endif
#if CK_EXTEND_IMAGE_SIZE_PAD_W
constexpr
auto
out_nkhw_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
N
,
K
,
Hi
,
Wi
>
{});
constexpr
auto
out_nkhw_desc_native
=
make_native_tensor_descriptor
(
out_nkhw_desc
.
GetLengths
(),
out_nkhw_desc
.
GetStrides
());
constexpr
auto
out_n_k_ho_wo_global_desc_pad
=
transform_tensor_descriptor
(
out_nkhw_desc_native
,
make_tuple
(
PassThrough
<
N
>
{},
PassThrough
<
K
>
{},
Pad
<
Sequence
<
Hi
,
Wi
>
,
InLeftPads
,
InRightPads
>
{}),
//Transforms,
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{}),
//LowDimensionIds,
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{}));
// UpDimensionIds
// output tensor
constexpr
auto
out_k_b_global_desc
=
transform_tensor_descriptor
(
out_n_k_ho_wo_global_desc_pad
,
make_tuple
(
PassThrough
<
K
>
{},
Merge
<
Sequence
<
N
,
Ho
,
Wo
>>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
,
2
,
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
#elif CK_EXTEND_IMAGE_SIZE_PAD_HW
constexpr
auto
out_nkhw_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
N
,
K
,
Hi
,
Wi
>
{});
constexpr
auto
out_nkhw_desc_native
=
make_native_tensor_descriptor
(
out_nkhw_desc
.
GetLengths
(),
out_nkhw_desc
.
GetStrides
());
constexpr
auto
out_nkhw_desc_unfold
=
transform_tensor_descriptor
(
out_nkhw_desc_native
,
make_tuple
(
PassThrough
<
N
>
{},
PassThrough
<
K
>
{},
Merge
<
Sequence
<
Hi
,
Wi
>>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}));
constexpr
auto
out_nkhw_desc_unfold_pad
=
transform_tensor_descriptor
(
out_nkhw_desc_unfold
,
make_tuple
(
PassThrough
<
N
>
{},
PassThrough
<
K
>
{},
Pad
<
Sequence
<
Hi
*
Wi
>
,
Sequence
<
0
>
,
Sequence
<
3
>>
{}),
//Transforms,
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
//LowDimensionIds,
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}));
// UpDimensionIds
constexpr
auto
out_k_b_global_desc
=
transform_tensor_descriptor
(
out_nkhw_desc_unfold_pad
,
make_tuple
(
PassThrough
<
K
>
{},
Merge
<
Sequence
<
N
,
52
>>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
,
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
#else
// output tensor
constexpr
auto
out_k_b_global_desc
=
transform_tensor_descriptor
(
unfold_tensor_descriptor
(
out_n_k_ho_wo_global_desc
,
I2
,
I3
),
make_tuple
(
PassThrough
<
K
>
{},
Merge
<
Sequence
<
N
,
Ho
*
Wo
>>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
,
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
#endif
// GEMM
constexpr
auto
gridwise_gemm
=
GridwiseGemmTransposedANormalBNormalC_v1
<
GridSize
,
...
...
This diff is collapsed.
Click to expand it.
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
View file @
7ec80cef
...
...
@@ -5,6 +5,8 @@
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp"
#define CK_VECTORX4_FLAG 0
#define CK_VECTORX2_FLAG 1
namespace
ck
{
...
...
@@ -89,15 +91,82 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
long_vector_size
*
long_vector_access_id
[
vector_access_dim
];
// buffer to hold a src long-vector
SrcData
p_src_long_vector
[
long_vector_size
];
SrcData
p_src_long_vector
[
long_vector_size
*
2
];
// zero out buffer
for
(
index_t
i
=
0
;
i
<
long_vector_size
;
++
i
)
{
p_src_long_vector
[
i
]
=
0
;
}
#if 0 //original code
#if CK_VECTORX4_FLAG //vectorloadx4
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
auto
src_coord
=
mSrcSliceOrigin
+
long_vector_data_begin_id
;
scalar_id
(
vector_access_dim
)
=
3
;
if
((
long_vector_size
==
4
)
&&
src_coord
.
IsOffsetValidAssumingUpperIndexIsValid
()
&&
((
src_coord
.
CalculateOffsetDiff
(
scalar_id
))
==
3
)){
transfer_data
<
SrcData
,
4
,
SrcAddressSpace
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
>
(
p_src
,
src_coord
.
GetOffset
(),
p_src_long_vector
,
0
);
}
else
{
//original code
// load data from src to the long-vector buffer
for
(
index_t
i
=
0
;
i
<
long_vector_size
;
++
i
)
{
scalar_id
(
vector_access_dim
)
=
i
;
src_coord
=
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
);
// Check src data's valid mapping situation, only check the first data in this src
// vector. It's user's responsiblity to make sure all data in the src vector
// has the valid/invalid mapping situation
if
(
src_coord
.
IsOffsetValidAssumingUpperIndexIsValid
())
{
transfer_data
<
SrcData
,
1
,
SrcAddressSpace
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
>
(
p_src
,
src_coord
.
GetOffset
(),
p_src_long_vector
,
i
);
}
}
}
}
#elif CK_VECTORX2_FLAG //vectorloadx2
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
auto
src_coord
=
mSrcSliceOrigin
+
long_vector_data_begin_id
;
scalar_id
(
vector_access_dim
)
=
1
;
if
((
long_vector_size
==
2
)
&&
src_coord
.
IsOffsetValidAssumingUpperIndexIsValid
()
&&
((
src_coord
.
CalculateOffsetDiff
(
scalar_id
))
==
1
)){
transfer_data
<
SrcData
,
2
,
SrcAddressSpace
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
>
(
p_src
,
src_coord
.
GetOffset
(),
p_src_long_vector
,
0
);
}
else
{
//original code
// load data from src to the long-vector buffer
for
(
index_t
i
=
0
;
i
<
long_vector_size
;
++
i
)
{
scalar_id
(
vector_access_dim
)
=
i
;
src_coord
=
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
);
// Check src data's valid mapping situation, only check the first data in this src
// vector. It's user's responsiblity to make sure all data in the src vector
// has the valid/invalid mapping situation
if
(
src_coord
.
IsOffsetValidAssumingUpperIndexIsValid
())
{
transfer_data
<
SrcData
,
1
,
SrcAddressSpace
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
>
(
p_src
,
src_coord
.
GetOffset
(),
p_src_long_vector
,
i
);
}
}
}
}
#else //original code
// load data from src to the long-vector buffer
for
(
index_t
i
=
0
;
i
<
long_vector_size
/
src_data_per_access
;
++
i
)
{
...
...
@@ -121,53 +190,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
p_src
,
src_coord
.
GetOffset
(),
p_src_long_vector
,
buffer_offset
);
}
}
#else
//try vector load
// load data from src to the long-vector buffer
index_t
i
=
0
;
while
(
i
<
long_vector_size
){
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
vector_access_dim
)
=
i
;
const
index_t
buffer_offset
=
i
;
const
auto
src_coord
=
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
);
scalar_id
(
vector_access_dim
)
=
i
+
3
;
const
auto
src_coord3
=
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
);
scalar_id
(
vector_access_dim
)
=
i
+
1
;
const
auto
src_coord1
=
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
);
if
(((
long_vector_size
-
i
)
>=
4
)
&&
src_coord
.
IsOffsetValidAssumingUpperIndexIsValid
()
&&
src_coord3
.
IsOffsetValidAssumingUpperIndexIsValid
()
&&
(
src_coord
.
GetOffset
()
+
3
==
src_coord3
.
GetOffset
())){
transfer_data
<
SrcData
,
4
,
SrcAddressSpace
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
>
(
p_src
,
src_coord
.
GetOffset
(),
p_src_long_vector
,
buffer_offset
);
i
=
i
+
4
;
}
else
if
(((
long_vector_size
-
i
)
>=
2
)
&&
src_coord
.
IsOffsetValidAssumingUpperIndexIsValid
()
&&
src_coord1
.
IsOffsetValidAssumingUpperIndexIsValid
()
&&
((
src_coord
.
GetOffset
()
+
1
)
==
src_coord1
.
GetOffset
())){
transfer_data
<
SrcData
,
2
,
SrcAddressSpace
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
>
(
p_src
,
src_coord
.
GetOffset
(),
p_src_long_vector
,
buffer_offset
);
i
=
i
+
2
;
}
else
{
// Check src data's valid mapping situation, only check the first data in this src
// vector. It's user's responsiblity to make sure all data in the src vector
// has the valid/invalid mapping situation
if
(
src_coord
.
IsOffsetValidAssumingUpperIndexIsValid
())
{
transfer_data
<
SrcData
,
1
,
SrcAddressSpace
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
>
(
p_src
,
src_coord
.
GetOffset
(),
p_src_long_vector
,
buffer_offset
);
}
i
++
;
}
}
#endif
#endif
// SrcData to DstData conversion
DstData
p_dst_long_vector
[
long_vector_size
];
...
...
This diff is collapsed.
Click to expand it.
composable_kernel/include/utility/common_header.hpp
View file @
7ec80cef
...
...
@@ -29,4 +29,7 @@
#include "amd_xdlops.hpp"
#endif
#define CK_EXTEND_IMAGE_SIZE_PAD_W 1 //fake pad 7x7 into 7x8 to make it dwordx4
#define CK_EXTEND_IMAGE_SIZE_PAD_HW 0 //fake pad 7x7 into 7x7+3 to make it dwordx4
#endif
This diff is collapsed.
Click to expand it.
driver/src/conv_driver.cpp
View file @
7ec80cef
...
...
@@ -29,7 +29,7 @@ int main(int argc, char* argv[])
{
using
namespace
ck
;
#if
0
#if
1
// 1x1 for vector memory access , 7x7 image size
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
256
;
...
...
@@ -43,7 +43,11 @@ int main(int argc, char* argv[])
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
#if CK_EXTEND_IMAGE_SIZE_PAD_W
using
RightPads
=
Sequence
<
0
,
1
>
;
#else
using
RightPads
=
Sequence
<
0
,
0
>
;
#endif
#elif 1
// 1x1 for vector memory access, 13x13 image size
constexpr
index_t
N
=
128
;
...
...
@@ -379,6 +383,10 @@ int main(int argc, char* argv[])
auto
wei_kcyx_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
out_nkhw_desc
=
get_convolution_output_default_4d_tensor_descriptor_deprecated
(
in_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{});
#if CK_EXTEND_IMAGE_SIZE_PAD_W
auto
out_nkhw_desc_nopadd
=
get_convolution_output_default_4d_tensor_descriptor_deprecated
(
in_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
Sequence
<
0
,
0
>
{});
//RightPads{});
#endif
ostream_ConstantTensorDescriptor
(
in_nchw_desc
,
std
::
cout
<<
"in_nchw_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_kcyx_desc
,
std
::
cout
<<
"wei_kcyx_desc: "
);
...
...
@@ -392,7 +400,11 @@ int main(int argc, char* argv[])
using
out_data_t
=
float
;
Tensor
<
in_data_t
>
in_nchw
(
make_TensorDescriptor
(
in_nchw_desc
));
Tensor
<
in_data_t
>
wei_kcyx
(
make_TensorDescriptor
(
wei_kcyx_desc
));
#if CK_EXTEND_IMAGE_SIZE_PAD_W
Tensor
<
out_data_t
>
out_nkhw_host
(
make_TensorDescriptor
(
out_nkhw_desc_nopadd
));
#else
Tensor
<
out_data_t
>
out_nkhw_host
(
make_TensorDescriptor
(
out_nkhw_desc
));
#endif
Tensor
<
out_data_t
>
out_nkhw_device
(
make_TensorDescriptor
(
out_nkhw_desc
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment
Menu
Projects
Groups
Snippets
Help