diff --git a/3rdparty/tvm b/3rdparty/tvm index fba6ef955..1cc769cd7 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit fba6ef9552e0f04e39d5ecf1b5253412e1f607df +Subproject commit 1cc769cd75cc9a497c5077cb71e68d7e60225f28 diff --git a/benchmark/tilelang/benchmark_tilelang_matmul.py b/benchmark/tilelang/benchmark_tilelang_matmul.py index e9590116a..5d8f49bfa 100644 --- a/benchmark/tilelang/benchmark_tilelang_matmul.py +++ b/benchmark/tilelang/benchmark_tilelang_matmul.py @@ -4,6 +4,12 @@ from tvm.tl.autotuner import * import itertools +import logging + +logger = logging.getLogger(__name__) + +logger.setLevel(logging.DEBUG) + def ref_program(A, B): return A @ B.T @@ -16,8 +22,11 @@ def get_configs(): num_stages = [0, 1, 2, 3, 4] thread_num = [128, 256] enable_rasteration = [True, False] + k_pack = [1, 2] + _configs = list( - itertools.product(block_M, block_N, block_K, num_stages, thread_num, enable_rasteration)) + itertools.product(block_M, block_N, block_K, num_stages, thread_num, enable_rasteration, + k_pack)) configs = [{ 'block_M': c[0], @@ -25,7 +34,8 @@ def get_configs(): 'block_K': c[2], 'num_stages': c[3], 'thread_num': c[4], - 'enable_rasteration': c[5] + 'enable_rasteration': c[5], + 'k_pack': c[6] } for c in _configs] return configs @@ -34,14 +44,17 @@ def matmul(M, N, K): @autotune( configs=get_configs(), - keys=['block_M', 'block_N', 'block_K', 'num_stages', 'thread_num'], + keys=[ + 'block_M', 'block_N', 'block_K', 'num_stages', 'thread_num', 'enable_rasteration', + 'k_pack' + ], warmup=3, rep=5) @jit( out_idx=[2], supply_type=tl.TensorSupplyType.Integer, ref_prog=ref_program, - skip_check=True, + skip_check=False, profiler="tvm", target="hip") def kernel(block_M=None, @@ -49,7 +62,8 @@ def kernel(block_M=None, block_K=None, num_stages=None, thread_num=None, - enable_rasteration=None): + enable_rasteration=None, + k_pack=None): dtype = "float16" accum_dtype = "float" @@ -66,9 +80,9 @@ def main(A: T.Buffer((M, K), dtype), B: T.Buffer((N, K), dtype), C: T.Buffer((M, T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - T.copy(A[by * block_M, k * block_K], A_shared) - T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm(A_shared, B_shared, C_local, transpose_B=True) + T.copy(A[by * block_M, k * block_K], A_shared, coalesced_width=4 * k_pack) + T.copy(B[bx * block_N, k * block_K], B_shared, coalesced_width=4 * k_pack) + T.gemm(A_shared, B_shared, C_local, transpose_B=True, k_pack=k_pack) T.copy(C_local, C[by * block_M, bx * block_N]) return main