Commit 3d66465a authored by Jing Zhang

add non-pad

parent bf111ac6
Showing with 136 additions and 161 deletions
@@ -31,6 +31,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
{
template <typename... Wei,
typename... In,
typename... Add,
typename... Out,
typename ConvStrides,
typename ConvDilations,
@@ -38,6 +39,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
typename InRightPads>
__host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Add...>& add_n_k0_hox2_wox2_k1_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
@@ -45,6 +47,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
const InRightPads& in_right_pads,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
const FloatC* __restrict__ p_d_global,
FloatC* __restrict__ p_out_global) const
{
constexpr auto I0 = Number<0>{};
@@ -63,6 +66,9 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto Hox2 = Ho * 2;
const auto Wox2 = Wo * 2;
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
@@ -127,6 +133,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
// add tensor
const auto add_k_n_hopx2_wopx2_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Hox2, Wox2)),
make_tuple(make_pass_through_transform(K0),
make_pass_through_transform(N),
make_pass_through_transform(Hox2),
make_pass_through_transform(Wox2)),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
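// The add ("D") tensor is stored packed as N x K0 x Hox2 x Wox2, i.e. at twice the
// spatial resolution of the convolution output (Hox2 = 2 * Ho, Wox2 = 2 * Wo); the
// pass-through transforms above only reorder that view to K0 x N x Hox2 x Wox2 so it
// matches the K-major layout of out_k_n_ho_wo_global_desc used by the gridwise GEMM.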
const auto E = C * Y * X;
if(!((K % KPerBlock) == 0 && (Ho % HoPerBlock) == 0 && (Wo % WoPerBlock) == 0 &&
@@ -158,16 +174,15 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
#if 0
// GEMM
using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v3<
BlockSize,
@@ -177,6 +192,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
InMemoryDataOperation::Set,
decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc),
decltype(add_k_n_hopx2_wopx2_global_desc),
decltype(out_k_n_ho_wo_global_desc),
KPerBlock,
HoPerBlock,
@@ -228,13 +244,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
#if 0
//if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(add_k_n_hopx2_wopx2_global_desc),
const FloatC*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, true>,
@@ -249,22 +268,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
add_k_n_hopx2_wopx2_global_desc,
p_d_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
#elif 1
// else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(add_k_n_hopx2_wopx2_global_desc),
const FloatC*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
@@ -275,22 +300,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
add_k_n_hopx2_wopx2_global_desc,
p_d_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
#elif 1
// else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(add_k_n_hopx2_wopx2_global_desc),
const FloatC*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
@@ -301,22 +332,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
add_k_n_hopx2_wopx2_global_desc,
p_d_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
#elif 1
// else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(add_k_n_hopx2_wopx2_global_desc),
const FloatC*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
@@ -327,11 +364,14 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
add_k_n_hopx2_wopx2_global_desc,
p_d_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
#endif
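// Note: the preprocessor compiles only the first branch whose condition is
// non-zero, so with "#if 0 / #elif 1 / #elif 1 / #elif 1" above only the
// (has_main_k_block_loop && !has_double_tail_k_block_loop) instantiation is
// built; the commented-out conditions record which case each branch covers.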
}
timer.End();
@@ -346,7 +386,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#endif
}
};
} // namespace ck
......
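For reference, a minimal standalone sketch of the index mapping encoded by add_k_n_hopx2_wopx2_global_desc above, assuming a plain packed N x K0 x Hox2 x Wox2 buffer; AddTensorView and offset are illustrative names, not part of the library:

#include <cstddef>

// Illustrative only: coordinate (k0, n, h, w) in the reordered
// K0 x N x Hox2 x Wox2 view maps back to packed N x K0 x Hox2 x Wox2 storage.
struct AddTensorView
{
    std::size_t N, K0, Hox2, Wox2;

    std::size_t offset(std::size_t k0, std::size_t n, std::size_t h, std::size_t w) const
    {
        return ((n * K0 + k0) * Hox2 + h) * Wox2 + w;
    }
};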
@@ -192,11 +192,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
@@ -262,7 +262,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
#if 0
//if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
@@ -293,7 +294,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
#elif 1
// else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
@@ -324,7 +326,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
#elif 1
// else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
@@ -355,7 +358,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
#elif 1
// else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
@@ -386,6 +390,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
#endif
}
timer.End();
......
@@ -134,8 +134,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
constexpr auto KPerThreadSubC = 4;
constexpr auto HoPerThreadSubC = 2;
constexpr auto WoPerThreadSubC = 2;
constexpr auto HoPerThreadSubC = HPerThread;
constexpr auto WoPerThreadSubC = WPerThread;
static_assert(KPerThread % KPerThreadSubC == 0, "");
static_assert(HPerThread % HoPerThreadSubC == 0, "");
......
@@ -175,6 +175,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;
const index_t k_thread_data_on_global = k_block_data_on_global + k_thread_id * KPerThread;
const index_t ho_thread_data_on_global =
ho_block_data_on_global + ho_thread_id * HoPerThread;
const index_t wo_thread_data_on_global =
@@ -262,11 +263,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
a_blockwise_copy.RunWrite(a_e_k_desc, p_a_block);
}
__syncthreads();
block_sync_lds();
//__syncthreads();
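// block_sync_lds() replaces the full __syncthreads() here; presumably it is an
// LDS-scoped workgroup barrier (on AMD GPUs roughly "s_waitcnt lgkmcnt(0); s_barrier"),
// so only the LDS writes of the blockwise copies are fenced before the blockwise
// GEMM reads the tile.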
#if 1
constexpr auto KPerThreadAdd = KPerThread / CThreadTransferDstScalarPerVector;
constexpr auto HoPerThreadx2 = HoPerThread * 2;
constexpr auto WoPerThreadx2 = WoPerThread * 2;
@@ -281,25 +281,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
const index_t hox2_block_data_on_global = ho_block_work_id * HoPerBlock * 2;
const index_t wox2_block_data_on_global = wo_block_work_id * WoPerBlock * 2;
const index_t hox2_thread_data_on_global =
hox2_block_data_on_global + ho_thread_id * HoPerThreadx2;
const index_t wox2_thread_data_on_global =
wox2_block_data_on_global + wo_thread_id * WoPerThreadx2;
const index_t hox2_thread_data_on_global = ho_thread_data_on_global * 2;
const index_t wox2_thread_data_on_global = wo_thread_data_on_global * 2;
const index_t k_thread_data_on_global_add =
k_thread_data_on_global / CThreadTransferDstScalarPerVector;
static_assert(KPerThread % CThreadTransferDstScalarPerVector == 0, "");
static_assert(CThreadTransferDstScalarPerVector == 16, "");
const index_t k_block_data_on_global_add =
k_block_work_id * KPerBlock / CThreadTransferDstScalarPerVector;
const index_t k_thread_data_on_global_add =
k_block_data_on_global_add + k_thread_id * KPerThreadAdd;
static_assert(vec_len == 256, "");
vector_type<int8_t, vec_len> d_vec;
#if 1
{
ThreadwiseDynamicTensorSliceTransfer_v2<
@@ -419,104 +412,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
#endif
// output: register to global memory
#if 0
{
constexpr auto HoPerThreadx2 = HoPerThread * 2;
constexpr auto WoPerThreadx2 = WoPerThread * 2;
const index_t hox2_block_data_on_global = ho_block_work_id * HoPerBlock * 2;
const index_t wox2_block_data_on_global = wo_block_work_id * WoPerBlock * 2;
const index_t hox2_thread_data_on_global =
hox2_block_data_on_global + ho_thread_id * HoPerThreadx2;
const index_t wox2_thread_data_on_global =
wox2_block_data_on_global + wo_thread_id * WoPerThreadx2;
static_assert(KPerThread % CThreadTransferDstScalarPerVector == 0, "");
constexpr auto KPerThreadAdd = KPerThread / CThreadTransferDstScalarPerVector;
const index_t k_block_data_on_global_add =
k_block_work_id * KPerBlock / CThreadTransferDstScalarPerVector;
const index_t k_thread_data_on_global_add =
k_block_data_on_global_add + k_thread_id * KPerThreadAdd;
constexpr auto d_k_n_hox2_wox2_thread_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<1>{}, Number<1>{}, Number<1>{}, Number<1>{}));
constexpr auto vector_len = CThreadTransferDstScalarPerVector;
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
vector_type<int8_t, vector_len> d_vec;
for(index_t k_i = 0; k_i < KPerThreadAdd; ++k_i)
{
for(index_t h_i = 0; h_i < HoPerThreadx2; ++h_i)
{
for(index_t w_i = 0; w_i < WoPerThreadx2; ++w_i)
{
ThreadwiseDynamicTensorSliceTransfer_v2<
FloatC,
decltype(d_vec),
decltype(d_k_n_hox2_wox2_global_desc),
decltype(d_k_n_hox2_wox2_thread_desc),
Sequence<1, 1, 1, 1>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
// CThreadTransferDstScalarPerVector,
1,
AddressSpace::Global,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
1,
true>(d_k_n_hox2_wox2_global_desc,
make_multi_index(k_thread_data_on_global_add + k_i,
0,
hox2_thread_data_on_global + h_i,
wox2_thread_data_on_global + w_i))
.Run2(d_k_n_hox2_wox2_global_desc,
p_d_global,
d_k_n_hox2_wox2_thread_desc,
make_tuple(I0, I0, I0, I0),
d_vec,
c_k_n_ho_wo_global_tensor_iterator_hacks);
static_for<0, vector_len, 1>{}([&](auto i) {
d_vec.template AsType<int8_t>()(i) +=
p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset(
make_tuple(k_i * vector_len + i, 0, h_i / 2, w_i / 2))];
});
ThreadwiseDynamicTensorSliceTransfer_v1r3<
decltype(d_vec),
FloatC,
decltype(d_k_n_hox2_wox2_thread_desc),
decltype(d_k_n_hox2_wox2_global_desc),
Sequence<1, 1, 1, 1>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
// CThreadTransferDstScalarPerVector,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
CGlobalMemoryDataOperation,
1,
true>(d_k_n_hox2_wox2_global_desc,
make_multi_index(k_thread_data_on_global_add + k_i,
0,
hox2_thread_data_on_global + h_i,
wox2_thread_data_on_global + w_i))
.Run2(d_k_n_hox2_wox2_thread_desc,
make_tuple(I0, I0, I0, I0),
d_vec,
d_k_n_hox2_wox2_global_desc,
p_c_global,
c_k_n_ho_wo_global_tensor_iterator_hacks);
}
}
}
}
#else
{
static_for<0, KPerThreadAdd, 1>{}([&](auto k_i) {
static_for<0, HoPerThreadx2, 1>{}([&](auto h_i) {
@@ -569,7 +464,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
p_c_global,
c_k_n_ho_wo_global_tensor_iterator_hacks);
}
#endif
}
// pass tensor descriptor by reference
......
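The disabled reference loops in the #if 0 block above spell out what the vectorized epilogue computes: each per-thread convolution result c(k, ho, wo) is added to the four elements of the double-resolution D ("add") tensor at (2*ho + dh, 2*wo + dw), and the sums are written to the output. A scalar model under those assumptions (plain arrays, hypothetical name fused_add_epilogue_sketch; the real code packs K into int8x16 vectors and uses vectorized transfers):

#include <cstdint>

// Illustrative only: out = d + nearest_neighbour_upsample_2x(c).
void fused_add_epilogue_sketch(const int8_t* c,  // conv results, K x Ho x Wo
                               const int8_t* d,  // add tensor,   K x 2Ho x 2Wo
                               int8_t* out,      // output,       K x 2Ho x 2Wo
                               int K, int Ho, int Wo)
{
    for(int k = 0; k < K; ++k)
        for(int h = 0; h < 2 * Ho; ++h)
            for(int w = 0; w < 2 * Wo; ++w)
            {
                const int o = (k * 2 * Ho + h) * 2 * Wo + w;
                out[o]      = static_cast<int8_t>(d[o] + c[(k * Ho + h / 2) * Wo + w / 2]);
            }
}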
@@ -109,6 +109,29 @@ struct ThreadwiseGemm_km_kn_mn_v3
p_c[c_offset_2],
p_c[c_offset_3]);
}
else if constexpr(H == 1 && W == 4)
{
constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 2));
constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 3));
constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 2));
constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 3));
amd_assembly_outer_product_1x4(p_a[a_offset],
p_b[b_offset_0],
p_b[b_offset_1],
p_b[b_offset_2],
p_b[b_offset_3],
p_c[c_offset_0],
p_c[c_offset_1],
p_c[c_offset_2],
p_c[c_offset_3]);
}
else
{
static_for<0, H, 1>{}([&](auto h) {
......
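Conceptually, the new H == 1 && W == 4 case above updates four accumulators with the same A operand against four different B operands. A plain C++ sketch of that 1x4 outer-product step (illustrative; amd_assembly_outer_product_1x4 presumably wraps the corresponding AMD multiply-accumulate/dot instructions, and for packed int8 operands each update is itself a 4-way dot product):

// Illustrative only: one A value against a row of four B values,
// accumulating into four independent C registers.
template <typename TA, typename TB, typename TC>
void outer_product_1x4_sketch(TA a, TB b0, TB b1, TB b2, TB b3,
                              TC& c0, TC& c1, TC& c2, TC& c3)
{
    c0 += a * b0;
    c1 += a * b1;
    c2 += a * b2;
    c3 += a * b3;
}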
@@ -142,8 +142,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr index_t EPerBlock = C0;
constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t HoPerThread = 4;
constexpr index_t WoPerThread = 1;
constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>;
@@ -184,7 +184,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
#endif
constexpr auto conv_driver =
#if 0
#if 1
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
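// with "#if 1" the DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
// driver above is the one instantiated; the _outpad variant in this #else branch
// is not compiled.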
......
@@ -95,7 +95,7 @@ int main(int argc, char* argv[])
#elif 1
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t HI = 540;
constexpr index_t HI = 544;
constexpr index_t WI = 960;
constexpr index_t K = 16;
constexpr index_t Y = 3;
@@ -118,6 +118,20 @@ int main(int argc, char* argv[])
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t HI = 135;
constexpr index_t WI = 240;
constexpr index_t K = 16;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
......