Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
liyinrong
composable_kernel
Commits
4349abe1
Commit
4349abe1
authored
6 years ago
by
Chao Liu
Browse files
Options
Download
Email Patches
Plain Diff
debugging
parent
e55cfe15
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp
+3
-2
...ion_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp
composable_kernel/include/utility/config_amd.hpp.in
+0
-6
composable_kernel/include/utility/config_amd.hpp.in
composable_kernel/include/utility/config_nvidia.hpp.in
+0
-28
composable_kernel/include/utility/config_nvidia.hpp.in
composable_kernel/include/utility/vector_type.hpp
+5
-0
composable_kernel/include/utility/vector_type.hpp
driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
+36
-0
...de/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
driver/include/tensor.hpp
+44
-0
driver/include/tensor.hpp
driver/src/driver.cpp
+21
-57
driver/src/driver.cpp
with
109 additions
and
93 deletions
+109
-93
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp
View file @
4349abe1
...
...
@@ -225,9 +225,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
GemmDataPerReadB
);
constexpr
index_t
in_block_space
=
in_c_n1_b_n2_block_mem_desc
.
GetElementSpace
(
Number
<
max_align
>
{}
);
math
::
integer_least_multiple
(
in_c_n1_b_n2_block_mem_desc
.
GetElementSpace
(
),
max_align
);
constexpr
index_t
wei_block_space
=
wei_c_k_block_desc
.
GetElementSpace
(
Number
<
max_align
>
{});
constexpr
index_t
wei_block_space
=
math
::
integer_least_multiple
(
wei_c_k_block_desc
.
GetElementSpace
(),
max_align
);
__shared__
Float
p_in_block_double
[
2
*
in_block_space
];
__shared__
Float
p_wei_block_double
[
2
*
wei_block_space
];
...
...
This diff is collapsed.
Click to expand it.
composable_kernel/include/utility/config_amd.hpp.in
View file @
4349abe1
...
...
@@ -17,12 +17,6 @@ typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
using index_t = uint32_t;
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{
d += s0 * s1;
}
} // namespace ck
#endif
This diff is collapsed.
Click to expand it.
composable_kernel/include/utility/config_nvidia.hpp.in
View file @
4349abe1
...
...
@@ -21,34 +21,6 @@ using float2_t = float2;
using float4_t = float4;
using index_t = uint32_t;
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{
d += s0 * s1;
}
#if 0
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x;
d += s0.y * s1.y;
}
__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x + s0.y * s1.y;
}
__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
// TODO:: this interface is misleading, s0, s1 are actually int8x4
// need to make a better interface
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
{
d = __dp4a(s0, s1, d);
}
#endif
} // namespace ck
...
...
This diff is collapsed.
Click to expand it.
composable_kernel/include/utility/vector_type.hpp
View file @
4349abe1
...
...
@@ -64,6 +64,11 @@ struct vector_type<float, 4>
}
};
__device__
void
fused_multiply_accumulate
(
float
&
d
,
const
float
&
s0
,
const
float
&
s1
)
{
d
+=
s0
*
s1
;
}
}
// namespace ck
#endif
This diff is collapsed.
Click to expand it.
driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
View file @
4349abe1
...
...
@@ -93,6 +93,42 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif
1
// blocksize = 512
// give wrong result
constexpr
index_t
BlockSize
=
512
;
constexpr
index_t
BPerBlock
=
32
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
EPerBlock
=
8
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
4
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
8
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockCopySubLengths_E_N1_B_N2
=
Sequence
<
1
,
1
,
1
,
4
>
;
using
InBlockCopyClusterLengths_E_N1_B_N2
=
Sequence
<
8
,
2
,
32
,
1
>
;
using
InBlockCopyThreadClusterArrangeOrder
=
Sequence
<
0
,
1
,
3
,
2
>
;
// [E, N1, N2, B]
using
InBlockCopySrcAccessOrder
=
Sequence
<
0
,
1
,
3
,
2
>
;
// [E, N1, N2, B]
using
InBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
,
2
,
3
>
;
// [E, N1, B, N2]
constexpr
index_t
InBlockCopySrcDataPerRead_B
=
1
;
constexpr
index_t
InBlockCopyDstDataPerWrite_N2
=
4
;
using
WeiBlockCopySubLengths_E_K
=
Sequence
<
2
,
1
>
;
using
WeiBlockCopyClusterLengths_E_K
=
Sequence
<
4
,
128
>
;
using
WeiBlockCopyThreadClusterArrangeOrder
=
Sequence
<
1
,
0
>
;
// [K, E]
using
WeiBlockCopySrcAccessOrder
=
Sequence
<
1
,
0
>
;
// [K, E]
using
WeiBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
>
;
// [E, K]
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
2
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
1
;
#elif 1
constexpr
index_t
BlockSize
=
256
;
...
...
This diff is collapsed.
Click to expand it.
driver/include/tensor.hpp
View file @
4349abe1
...
...
@@ -8,6 +8,8 @@
#include <utility>
#include <cassert>
#include <iostream>
#include "config.hpp"
#include "common_header.hpp"
template
<
class
Range
>
std
::
ostream
&
LogRange
(
std
::
ostream
&
os
,
Range
&&
range
,
std
::
string
delim
)
...
...
@@ -269,4 +271,46 @@ struct Tensor
std
::
vector
<
T
>
mData
;
};
// this is ugly, only for 4d
template
<
class
TConstTensorDesc
>
void
ostream_ConstantTensorDescriptor
(
TConstTensorDesc
,
std
::
ostream
&
os
=
std
::
cout
)
{
using
namespace
ck
;
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
os
<<
"Lengths: {"
<<
desc
.
GetLength
(
I0
)
<<
", "
<<
desc
.
GetLength
(
I1
)
<<
", "
<<
desc
.
GetLength
(
I2
)
<<
", "
<<
desc
.
GetLength
(
I3
)
<<
"}, "
<<
"Strides: {"
<<
desc
.
GetStride
(
I0
)
<<
", "
<<
desc
.
GetStride
(
I1
)
<<
", "
<<
desc
.
GetStride
(
I2
)
<<
", "
<<
desc
.
GetStride
(
I3
)
<<
"}"
<<
std
::
endl
;
}
// this is ugly, only for 4d
template
<
class
TConstTensorDesc
>
auto
make_TensorDescriptor
(
TConstTensorDesc
)
{
using
namespace
ck
;
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
std
::
initializer_list
<
index_t
>
lengths
=
{
desc
.
GetLength
(
I0
),
desc
.
GetLength
(
I1
),
desc
.
GetLength
(
I2
),
desc
.
GetLength
(
I3
)};
std
::
initializer_list
<
index_t
>
strides
=
{
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
)};
return
TensorDescriptor
(
lengths
,
strides
);
}
#endif
This diff is collapsed.
Click to expand it.
driver/src/driver.cpp
View file @
4349abe1
...
...
@@ -6,6 +6,7 @@
#include "config.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "device.hpp"
#include "tensor.hpp"
#include "conv_common.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
...
...
@@ -65,44 +66,6 @@ struct GeneratorTensor_Checkboard
}
};
// this is ugly, only for 4d
template
<
class
TConstTensorDesc
>
void
ostream_ConstantTensorDescriptor
(
TConstTensorDesc
,
std
::
ostream
&
os
=
std
::
cout
)
{
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
os
<<
"Lengths: {"
<<
desc
.
GetLength
(
I0
)
<<
", "
<<
desc
.
GetLength
(
I1
)
<<
", "
<<
desc
.
GetLength
(
I2
)
<<
", "
<<
desc
.
GetLength
(
I3
)
<<
"}, "
<<
"Strides: {"
<<
desc
.
GetStride
(
I0
)
<<
", "
<<
desc
.
GetStride
(
I1
)
<<
", "
<<
desc
.
GetStride
(
I2
)
<<
", "
<<
desc
.
GetStride
(
I3
)
<<
"}"
<<
std
::
endl
;
}
// this is ugly, only for 4d
template
<
class
TConstTensorDesc
>
auto
make_TensorDescriptor
(
TConstTensorDesc
)
{
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
std
::
initializer_list
<
index_t
>
lengths
=
{
desc
.
GetLength
(
I0
),
desc
.
GetLength
(
I1
),
desc
.
GetLength
(
I2
),
desc
.
GetLength
(
I3
)};
std
::
initializer_list
<
index_t
>
strides
=
{
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
)};
return
TensorDescriptor
(
lengths
,
strides
);
}
template
<
class
TIn
,
class
TWei
,
class
TOut
,
...
...
@@ -460,7 +423,7 @@ int main(int argc, char* argv[])
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
K
=
12
8
;
constexpr
index_t
K
=
5
12
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
...
...
@@ -651,30 +614,31 @@ int main(int argc, char* argv[])
#endif
}
#if 1
#if 0
device_convolution_direct_v2_nchw_kcyx_nkhw
device_convolution_direct_v2_nchw_kcyx_nkhw(
in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif
0
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif 0
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif 1
device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
#endif
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
ConvStrides
{},
ConvDilations
{},
nrepeat
);
device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
ConvStrides
{},
ConvDilations
{},
nrepeat
);
#elif 0
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded
(
in_nchw_desc
,
in_nchw
,
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment
Menu
Projects
Groups
Snippets
Help