Commit 7ec80cef authored by kevin.chang's avatar kevin.chang

Enable vectorx4-load and vectorx2-load, and fake-extend odd image sizes to even ones

parent 4904281f
Showing with 157 additions and 53 deletions
......@@ -75,6 +75,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1];
#if CK_EXTEND_IMAGE_SIZE_PAD_W
#else
// sanity-check for vectorized memory load
static_assert((Wo == 1 || (ConvStrideW == 1 || GemmBBlockCopySrcDataPerRead_GemmN == 1)) &&
(X == 1 || ConvDilationW % GemmBBlockCopySrcDataPerRead_GemmN == 0) &&
......@@ -82,6 +84,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
InRightPads{}[1] % GemmBBlockCopySrcDataPerRead_GemmN == 0,
"wrong! aligment requirement for vectorized global load of input tensor will "
"be violated");
#endif
// weight tensor
constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower(
......@@ -107,20 +110,83 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
#if CK_EXTEND_IMAGE_SIZE_PAD_HW
        constexpr auto in_n_c_y_ho_x_wo_global_desc_unfold = transform_tensor_descriptor(
            in_n_c_y_ho_x_wo_global_desc,
            make_tuple(PassThrough<N>{}, PassThrough<C>{}, PassThrough<Y>{}, PassThrough<X>{}, Merge<Sequence<Ho, Wo>>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        // fake-extend the merged Ho*Wo dimension by 3 on the right (49 -> 52 for a 7x7 output)
        // so its length becomes a multiple of 4 and dwordx4 loads stay aligned
        constexpr auto in_n_c_y_ho_x_wo_global_desc_unfold_padd = transform_tensor_descriptor(
            in_n_c_y_ho_x_wo_global_desc_unfold,
            make_tuple(PassThrough<N>{},
                       PassThrough<C>{},
                       PassThrough<Y>{},
                       PassThrough<X>{},
                       Pad<Sequence<Ho * Wo>, Sequence<0>, Sequence<3>>{}), // Transforms
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), // LowDimensionIds
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); // UpDimensionIds

        constexpr auto in_e_b_global_desc = transform_tensor_descriptor(
            in_n_c_y_ho_x_wo_global_desc_unfold_padd,
            make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho * Wo + 3>>{}), // 52 when Ho = Wo = 7
            make_tuple(Sequence<1, 2, 3>{}, Sequence<0, 4>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));
#else
constexpr auto in_e_b_global_desc = transform_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
#endif
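        // Illustration only, not part of this commit: a minimal compile-time sketch of the
        // fake-extend arithmetic above, assuming the 7x7 output of the 1x1 host configuration --
        // padding the merged Ho*Wo dimension from 49 to 49 + 3 = 52 makes it a multiple of 4,
        // which is what keeps dwordx4 (float4) global loads aligned over the GemmN dimension.
        static_assert((7 * 7 + 3) % 4 == 0, "sketch: padded Ho*Wo is no longer a multiple of 4");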
#if CK_EXTEND_IMAGE_SIZE_PAD_W
        // output tensor: fake-extend W on the right via InRightPads (e.g. 7x7 -> 7x8) so the
        // innermost dimension length becomes a multiple of 4 for dwordx4 access
        constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Hi, Wi>{});

        constexpr auto out_nkhw_desc_native =
            make_native_tensor_descriptor(out_nkhw_desc.GetLengths(), out_nkhw_desc.GetStrides());

        constexpr auto out_n_k_ho_wo_global_desc_pad = transform_tensor_descriptor(
            out_nkhw_desc_native,
            make_tuple(PassThrough<N>{},
                       PassThrough<K>{},
                       Pad<Sequence<Hi, Wi>, InLeftPads, InRightPads>{}), // Transforms
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),   // LowDimensionIds
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));  // UpDimensionIds

        constexpr auto out_k_b_global_desc = transform_tensor_descriptor(
            out_n_k_ho_wo_global_desc_pad,
            make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho, Wo>>{}),
            make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));
#elif CK_EXTEND_IMAGE_SIZE_PAD_HW
        // output tensor: unfold H and W, then fake-extend the merged Hi*Wi dimension by 3
        // (49 -> 52 for a 7x7 output) so it becomes a multiple of 4 for dwordx4 access
        constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Hi, Wi>{});

        constexpr auto out_nkhw_desc_native =
            make_native_tensor_descriptor(out_nkhw_desc.GetLengths(), out_nkhw_desc.GetStrides());

        constexpr auto out_nkhw_desc_unfold = transform_tensor_descriptor(
            out_nkhw_desc_native,
            make_tuple(PassThrough<N>{}, PassThrough<K>{}, Merge<Sequence<Hi, Wi>>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));

        constexpr auto out_nkhw_desc_unfold_pad = transform_tensor_descriptor(
            out_nkhw_desc_unfold,
            make_tuple(PassThrough<N>{},
                       PassThrough<K>{},
                       Pad<Sequence<Hi * Wi>, Sequence<0>, Sequence<3>>{}), // Transforms
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),        // LowDimensionIds
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));       // UpDimensionIds

        constexpr auto out_k_b_global_desc = transform_tensor_descriptor(
            out_nkhw_desc_unfold_pad,
            make_tuple(PassThrough<K>{}, Merge<Sequence<N, Hi * Wi + 3>>{}), // 52 when Hi = Wi = 7
            make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));
#else
// output tensor
constexpr auto out_k_b_global_desc =
transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
#endif
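        // Illustration only, not part of this commit: the corresponding sketch for the
        // CK_EXTEND_IMAGE_SIZE_PAD_W mode -- the host side pads W on the right by 1 (see the
        // RightPads change in the host driver below), so a 7x7 image is handled as 7x8 and
        // each row then holds 8 floats, letting dwordx4 accesses stay within one row.
        static_assert((7 + 1) % 4 == 0, "sketch: padded row length is no longer a multiple of 4");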
// GEMM
constexpr auto gridwise_gemm =
GridwiseGemmTransposedANormalBNormalC_v1<GridSize,
......
......@@ -5,6 +5,8 @@
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp"
// vectorized-load path selection in ThreadwiseGenericTensorSliceCopy_v4r2:
// CK_VECTORX4_FLAG takes precedence over CK_VECTORX2_FLAG; with both set to 0
// the original per-element load path is used
#define CK_VECTORX4_FLAG 0
#define CK_VECTORX2_FLAG 1
namespace ck {
......@@ -89,15 +91,82 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
long_vector_size * long_vector_access_id[vector_access_dim];
// buffer to hold a src long-vector
SrcData p_src_long_vector[long_vector_size];
SrcData p_src_long_vector[long_vector_size*2];
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
#if 0 //original code
#if CK_VECTORX4_FLAG // vector load x4
            {
                auto scalar_id = make_zero_array<index_t, nDim>();
                auto src_coord = mSrcSliceOrigin + long_vector_data_begin_id;

                // if all 4 elements are contiguous in memory (the offset of lane 3 is base + 3)
                // and the base offset is valid, issue a single dwordx4 load
                scalar_id(vector_access_dim) = 3;

                if((long_vector_size == 4) && src_coord.IsOffsetValidAssumingUpperIndexIsValid() &&
                   (src_coord.CalculateOffsetDiff(scalar_id) == 3))
                {
                    transfer_data<SrcData,
                                  4,
                                  SrcAddressSpace,
                                  AddressSpace::Vgpr,
                                  InMemoryDataOperation::Set>(
                        p_src, src_coord.GetOffset(), p_src_long_vector, 0);
                }
                else
                {
                    // fall back to the original per-element load from src into the long-vector buffer
                    for(index_t i = 0; i < long_vector_size; ++i)
                    {
                        scalar_id(vector_access_dim) = i;

                        src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);

                        // Check src data's valid mapping situation, only check the first data in this
                        // src vector. It's the user's responsibility to make sure all data in the src
                        // vector has the same valid/invalid mapping situation
                        if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
                        {
                            transfer_data<SrcData,
                                          1,
                                          SrcAddressSpace,
                                          AddressSpace::Vgpr,
                                          InMemoryDataOperation::Set>(
                                p_src, src_coord.GetOffset(), p_src_long_vector, i);
                        }
                    }
                }
            }
#elif CK_VECTORX2_FLAG // vector load x2
            {
                auto scalar_id = make_zero_array<index_t, nDim>();
                auto src_coord = mSrcSliceOrigin + long_vector_data_begin_id;

                // if both elements are contiguous in memory (the offset of lane 1 is base + 1)
                // and the base offset is valid, issue a single dwordx2 load
                scalar_id(vector_access_dim) = 1;

                if((long_vector_size == 2) && src_coord.IsOffsetValidAssumingUpperIndexIsValid() &&
                   (src_coord.CalculateOffsetDiff(scalar_id) == 1))
                {
                    transfer_data<SrcData,
                                  2,
                                  SrcAddressSpace,
                                  AddressSpace::Vgpr,
                                  InMemoryDataOperation::Set>(
                        p_src, src_coord.GetOffset(), p_src_long_vector, 0);
                }
                else
                {
                    // fall back to the original per-element load from src into the long-vector buffer
                    for(index_t i = 0; i < long_vector_size; ++i)
                    {
                        scalar_id(vector_access_dim) = i;

                        src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);

                        // Check src data's valid mapping situation, only check the first data in this
                        // src vector. It's the user's responsibility to make sure all data in the src
                        // vector has the same valid/invalid mapping situation
                        if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
                        {
                            transfer_data<SrcData,
                                          1,
                                          SrcAddressSpace,
                                          AddressSpace::Vgpr,
                                          InMemoryDataOperation::Set>(
                                p_src, src_coord.GetOffset(), p_src_long_vector, i);
                        }
                    }
                }
            }
#else //original code
// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
......@@ -121,53 +190,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
}
}
#else // try vector load
            // load data from src to the long-vector buffer
            index_t i = 0;

            while(i < long_vector_size)
            {
                auto scalar_id               = make_zero_array<index_t, nDim>();
                scalar_id(vector_access_dim) = i;

                const index_t buffer_offset = i;

                const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);

                scalar_id(vector_access_dim) = i + 3;
                const auto src_coord3 = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);

                scalar_id(vector_access_dim) = i + 1;
                const auto src_coord1 = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);

                // 4 contiguous, valid elements remaining: one dwordx4 load
                if(((long_vector_size - i) >= 4) && src_coord.IsOffsetValidAssumingUpperIndexIsValid() &&
                   src_coord3.IsOffsetValidAssumingUpperIndexIsValid() &&
                   (src_coord.GetOffset() + 3 == src_coord3.GetOffset()))
                {
                    transfer_data<SrcData,
                                  4,
                                  SrcAddressSpace,
                                  AddressSpace::Vgpr,
                                  InMemoryDataOperation::Set>(
                        p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
                    i = i + 4;
                }
                // 2 contiguous, valid elements remaining: one dwordx2 load
                else if(((long_vector_size - i) >= 2) && src_coord.IsOffsetValidAssumingUpperIndexIsValid() &&
                        src_coord1.IsOffsetValidAssumingUpperIndexIsValid() &&
                        ((src_coord.GetOffset() + 1) == src_coord1.GetOffset()))
                {
                    transfer_data<SrcData,
                                  2,
                                  SrcAddressSpace,
                                  AddressSpace::Vgpr,
                                  InMemoryDataOperation::Set>(
                        p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
                    i = i + 2;
                }
                else
                {
                    // Check src data's valid mapping situation, only check the first data in this
                    // src vector. It's the user's responsibility to make sure all data in the src
                    // vector has the same valid/invalid mapping situation
                    if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
                    {
                        transfer_data<SrcData,
                                      1,
                                      SrcAddressSpace,
                                      AddressSpace::Vgpr,
                                      InMemoryDataOperation::Set>(
                            p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
                    }
                    i++;
                }
            }
#endif
#endif
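            // Illustration only, not part of this commit: both flag-controlled paths above apply
            // the same contiguity test before issuing a wide load -- the base offset must be valid
            // and the last lane must sit exactly (width - 1) elements after it. As a standalone
            // sketch over plain offsets (helper name and signature are assumptions, not existing
            // CK code):
            //
            //   __host__ __device__ constexpr bool offsets_allow_vector_load(bool base_offset_valid,
            //                                                                index_t offset_diff_to_last,
            //                                                                index_t width)
            //   {
            //       return base_offset_valid && (offset_diff_to_last == width - 1);
            //   }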
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
......
......@@ -29,4 +29,7 @@
#include "amd_xdlops.hpp"
#endif
#define CK_EXTEND_IMAGE_SIZE_PAD_W 1  // fake-pad 7x7 into 7x8 to make it dwordx4
#define CK_EXTEND_IMAGE_SIZE_PAD_HW 0 // fake-pad 7x7 into 7x7+3 to make it dwordx4
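// Not part of this commit -- the two fake-extend modes are not meant to be enabled together:
// the gridwise kernel checks CK_EXTEND_IMAGE_SIZE_PAD_W first and only falls through to the
// _PAD_HW path for the output descriptor, while the input descriptor is controlled by _PAD_HW
// alone. A guard like the following (an assumption, not existing CK code) would catch a
// misconfiguration:
static_assert(!(CK_EXTEND_IMAGE_SIZE_PAD_W && CK_EXTEND_IMAGE_SIZE_PAD_HW),
              "enable at most one of CK_EXTEND_IMAGE_SIZE_PAD_W / CK_EXTEND_IMAGE_SIZE_PAD_HW");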
#endif
......@@ -29,7 +29,7 @@ int main(int argc, char* argv[])
{
using namespace ck;
#if 0
#if 1
    // 1x1 for vector memory access, 7x7 image size
constexpr index_t N = 128;
constexpr index_t C = 256;
......@@ -43,7 +43,11 @@ int main(int argc, char* argv[])
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
#if CK_EXTEND_IMAGE_SIZE_PAD_W
using RightPads = Sequence<0, 1>;
#else
using RightPads = Sequence<0, 0>;
#endif
#elif 1
// 1x1 for vector memory access, 13x13 image size
constexpr index_t N = 128;
......@@ -379,6 +383,10 @@ int main(int argc, char* argv[])
auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor_deprecated(
in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
#if CK_EXTEND_IMAGE_SIZE_PAD_W
    // host-side reference output keeps the true, un-extended size: no fake right padding
    auto out_nkhw_desc_nopadd = get_convolution_output_default_4d_tensor_descriptor_deprecated(
        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, Sequence<0, 0>{});
#endif
ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
......@@ -392,7 +400,11 @@ int main(int argc, char* argv[])
using out_data_t = float;
Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
#if CK_EXTEND_IMAGE_SIZE_PAD_W
Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc_nopadd));
#else
Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
#endif
Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
std::size_t num_thread = std::thread::hardware_concurrency();
......