diff --git a/src/qs8-dwconv/gen/up16x9-minmax-wasmsimd-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-wasmsimd-mul16.c index eb7b3cfad9c..273fa8d3265 100644 --- a/src/qs8-dwconv/gen/up16x9-minmax-wasmsimd-mul16.c +++ b/src/qs8-dwconv/gen/up16x9-minmax-wasmsimd-mul16.c @@ -214,19 +214,14 @@ void xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16( w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)); - const v128_t vsign0123 = wasm_i32x4_shr(vacc0123, 31); - const v128_t vsign4567 = wasm_i32x4_shr(vacc4567, 31); - const v128_t vsign89AB = wasm_i32x4_shr(vacc89AB, 31); - const v128_t vsignCDEF = wasm_i32x4_shr(vaccCDEF, 31); - - const v128_t vacc01 = wasm_v32x4_shuffle(vacc0123, vsign0123, 0, 4, 1, 5); - const v128_t vacc23 = wasm_v32x4_shuffle(vacc0123, vsign0123, 2, 6, 3, 7); - const v128_t vacc45 = wasm_v32x4_shuffle(vacc4567, vsign4567, 0, 4, 1, 5); - const v128_t vacc67 = wasm_v32x4_shuffle(vacc4567, vsign4567, 2, 6, 3, 7); - const v128_t vacc89 = wasm_v32x4_shuffle(vacc89AB, vsign89AB, 0, 4, 1, 5); - const v128_t vaccAB = wasm_v32x4_shuffle(vacc89AB, vsign89AB, 2, 6, 3, 7); - const v128_t vaccCD = wasm_v32x4_shuffle(vaccCDEF, vsignCDEF, 0, 4, 1, 5); - const v128_t vaccEF = wasm_v32x4_shuffle(vaccCDEF, vsignCDEF, 2, 6, 3, 7); + const v128_t vacc01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0123); + const v128_t vacc23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0123); + const v128_t vacc45 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc4567); + const v128_t vacc67 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc4567); + const v128_t vacc89 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc89AB); + const v128_t vaccAB = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc89AB); + const v128_t vaccCD = __builtin_wasm_widen_low_s_i32x4_i64x2(vaccCDEF); + const v128_t vaccEF = __builtin_wasm_widen_high_s_i32x4_i64x2(vaccCDEF); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); @@ -359,13 +354,10 @@ void xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16( w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t)); k += 8; - const v128_t vsign0123 = wasm_i32x4_shr(vacc0123, 31); - const v128_t vsign4567 = wasm_i32x4_shr(vacc4567, 31); - - const v128_t vacc01 = wasm_v32x4_shuffle(vacc0123, vsign0123, 0, 4, 1, 5); - const v128_t vacc23 = wasm_v32x4_shuffle(vacc0123, vsign0123, 2, 6, 3, 7); - const v128_t vacc45 = wasm_v32x4_shuffle(vacc4567, vsign4567, 0, 4, 1, 5); - const v128_t vacc67 = wasm_v32x4_shuffle(vacc4567, vsign4567, 2, 6, 3, 7); + const v128_t vacc01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0123); + const v128_t vacc23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0123); + const v128_t vacc45 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc4567); + const v128_t vacc67 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc4567); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); diff --git a/src/qs8-dwconv/gen/up24x9-minmax-wasmsimd-mul16.c b/src/qs8-dwconv/gen/up24x9-minmax-wasmsimd-mul16.c index f5efd68bcf0..f239a11796a 100644 --- a/src/qs8-dwconv/gen/up24x9-minmax-wasmsimd-mul16.c +++ b/src/qs8-dwconv/gen/up24x9-minmax-wasmsimd-mul16.c @@ -261,25 +261,18 @@ void xnn_qs8_dwconv_minmax_ukernel_up24x9__wasmsimd_mul16( w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t)); - const v128_t vsign0123 = wasm_i32x4_shr(vacc0123, 31); - const v128_t vsign4567 = wasm_i32x4_shr(vacc4567, 31); - const v128_t vsign89AB = wasm_i32x4_shr(vacc89AB, 31); - const v128_t vsignCDEF = wasm_i32x4_shr(vaccCDEF, 31); - const v128_t vsignGHIJ = wasm_i32x4_shr(vaccGHIJ, 31); - const v128_t vsignKLMN = wasm_i32x4_shr(vaccKLMN, 31); - - const v128_t vacc01 = wasm_v32x4_shuffle(vacc0123, vsign0123, 0, 4, 1, 5); - const v128_t vacc23 = wasm_v32x4_shuffle(vacc0123, vsign0123, 2, 6, 3, 7); - const v128_t vacc45 = wasm_v32x4_shuffle(vacc4567, vsign4567, 0, 4, 1, 5); - const v128_t vacc67 = wasm_v32x4_shuffle(vacc4567, vsign4567, 2, 6, 3, 7); - const v128_t vacc89 = wasm_v32x4_shuffle(vacc89AB, vsign89AB, 0, 4, 1, 5); - const v128_t vaccAB = wasm_v32x4_shuffle(vacc89AB, vsign89AB, 2, 6, 3, 7); - const v128_t vaccCD = wasm_v32x4_shuffle(vaccCDEF, vsignCDEF, 0, 4, 1, 5); - const v128_t vaccEF = wasm_v32x4_shuffle(vaccCDEF, vsignCDEF, 2, 6, 3, 7); - const v128_t vaccGH = wasm_v32x4_shuffle(vaccGHIJ, vsignGHIJ, 0, 4, 1, 5); - const v128_t vaccIJ = wasm_v32x4_shuffle(vaccGHIJ, vsignGHIJ, 2, 6, 3, 7); - const v128_t vaccKL = wasm_v32x4_shuffle(vaccKLMN, vsignKLMN, 0, 4, 1, 5); - const v128_t vaccMN = wasm_v32x4_shuffle(vaccKLMN, vsignKLMN, 2, 6, 3, 7); + const v128_t vacc01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0123); + const v128_t vacc23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0123); + const v128_t vacc45 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc4567); + const v128_t vacc67 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc4567); + const v128_t vacc89 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc89AB); + const v128_t vaccAB = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc89AB); + const v128_t vaccCD = __builtin_wasm_widen_low_s_i32x4_i64x2(vaccCDEF); + const v128_t vaccEF = __builtin_wasm_widen_high_s_i32x4_i64x2(vaccCDEF); + const v128_t vaccGH = __builtin_wasm_widen_low_s_i32x4_i64x2(vaccGHIJ); + const v128_t vaccIJ = __builtin_wasm_widen_high_s_i32x4_i64x2(vaccGHIJ); + const v128_t vaccKL = __builtin_wasm_widen_low_s_i32x4_i64x2(vaccKLMN); + const v128_t vaccMN = __builtin_wasm_widen_high_s_i32x4_i64x2(vaccKLMN); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); @@ -425,13 +418,10 @@ void xnn_qs8_dwconv_minmax_ukernel_up24x9__wasmsimd_mul16( w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t)); k += 8; - const v128_t vsign0123 = wasm_i32x4_shr(vacc0123, 31); - const v128_t vsign4567 = wasm_i32x4_shr(vacc4567, 31); - - const v128_t vacc01 = wasm_v32x4_shuffle(vacc0123, vsign0123, 0, 4, 1, 5); - const v128_t vacc23 = wasm_v32x4_shuffle(vacc0123, vsign0123, 2, 6, 3, 7); - const v128_t vacc45 = wasm_v32x4_shuffle(vacc4567, vsign4567, 0, 4, 1, 5); - const v128_t vacc67 = wasm_v32x4_shuffle(vacc4567, vsign4567, 2, 6, 3, 7); + const v128_t vacc01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0123); + const v128_t vacc23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0123); + const v128_t vacc45 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc4567); + const v128_t vacc67 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc4567); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); diff --git a/src/qs8-dwconv/gen/up8x9-minmax-wasmsimd-mul16.c b/src/qs8-dwconv/gen/up8x9-minmax-wasmsimd-mul16.c index cd229d56e06..9a7aab87915 100644 --- a/src/qs8-dwconv/gen/up8x9-minmax-wasmsimd-mul16.c +++ b/src/qs8-dwconv/gen/up8x9-minmax-wasmsimd-mul16.c @@ -167,13 +167,10 @@ void xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16( w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)); - const v128_t vsign0123 = wasm_i32x4_shr(vacc0123, 31); - const v128_t vsign4567 = wasm_i32x4_shr(vacc4567, 31); - - const v128_t vacc01 = wasm_v32x4_shuffle(vacc0123, vsign0123, 0, 4, 1, 5); - const v128_t vacc23 = wasm_v32x4_shuffle(vacc0123, vsign0123, 2, 6, 3, 7); - const v128_t vacc45 = wasm_v32x4_shuffle(vacc4567, vsign4567, 0, 4, 1, 5); - const v128_t vacc67 = wasm_v32x4_shuffle(vacc4567, vsign4567, 2, 6, 3, 7); + const v128_t vacc01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0123); + const v128_t vacc23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0123); + const v128_t vacc45 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc4567); + const v128_t vacc67 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc4567); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); @@ -283,13 +280,10 @@ void xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16( vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod8x01234567)); - const v128_t vsign0123 = wasm_i32x4_shr(vacc0123, 31); - const v128_t vsign4567 = wasm_i32x4_shr(vacc4567, 31); - - const v128_t vacc01 = wasm_v32x4_shuffle(vacc0123, vsign0123, 0, 4, 1, 5); - const v128_t vacc23 = wasm_v32x4_shuffle(vacc0123, vsign0123, 2, 6, 3, 7); - const v128_t vacc45 = wasm_v32x4_shuffle(vacc4567, vsign4567, 0, 4, 1, 5); - const v128_t vacc67 = wasm_v32x4_shuffle(vacc4567, vsign4567, 2, 6, 3, 7); + const v128_t vacc01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0123); + const v128_t vacc23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0123); + const v128_t vacc45 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc4567); + const v128_t vacc67 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc4567); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); diff --git a/src/qs8-dwconv/unipass-wasmsimd-mul16.c.in b/src/qs8-dwconv/unipass-wasmsimd-mul16.c.in index 7a74d3b5403..8603c02dc0e 100644 --- a/src/qs8-dwconv/unipass-wasmsimd-mul16.c.in +++ b/src/qs8-dwconv/unipass-wasmsimd-mul16.c.in @@ -65,11 +65,8 @@ void xnn_qs8_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__wasmsimd_mu w = (const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(int8_t)); $for C in range(0, CHANNEL_TILE, 4): - const v128_t vsign${ABC[C:C+4]} = wasm_i32x4_shr(vacc${ABC[C:C+4]}, 31); - - $for C in range(0, CHANNEL_TILE, 4): - const v128_t vacc${ABC[C:C+2]} = wasm_v32x4_shuffle(vacc${ABC[C:C+4]}, vsign${ABC[C:C+4]}, 0, 4, 1, 5); - const v128_t vacc${ABC[C+2:C+4]} = wasm_v32x4_shuffle(vacc${ABC[C:C+4]}, vsign${ABC[C:C+4]}, 2, 6, 3, 7); + const v128_t vacc${ABC[C:C+2]} = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc${ABC[C:C+4]}); + const v128_t vacc${ABC[C+2:C+4]} = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc${ABC[C:C+4]}); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); @@ -140,13 +137,10 @@ void xnn_qs8_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__wasmsimd_mu w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t)); k += 8; - const v128_t vsign${ABC[0:4]} = wasm_i32x4_shr(vacc${ABC[0:4]}, 31); - const v128_t vsign${ABC[4:8]} = wasm_i32x4_shr(vacc${ABC[4:8]}, 31); - - const v128_t vacc${ABC[0:2]} = wasm_v32x4_shuffle(vacc${ABC[0:4]}, vsign${ABC[0:4]}, 0, 4, 1, 5); - const v128_t vacc${ABC[2:4]} = wasm_v32x4_shuffle(vacc${ABC[0:4]}, vsign${ABC[0:4]}, 2, 6, 3, 7); - const v128_t vacc${ABC[4:6]} = wasm_v32x4_shuffle(vacc${ABC[4:8]}, vsign${ABC[4:8]}, 0, 4, 1, 5); - const v128_t vacc${ABC[6:8]} = wasm_v32x4_shuffle(vacc${ABC[4:8]}, vsign${ABC[4:8]}, 2, 6, 3, 7); + const v128_t vacc${ABC[0:2]} = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc${ABC[0:4]}); + const v128_t vacc${ABC[2:4]} = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc${ABC[0:4]}); + const v128_t vacc${ABC[4:6]} = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc${ABC[4:8]}); + const v128_t vacc${ABC[6:8]} = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc${ABC[4:8]}); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); diff --git a/src/qs8-gemm/MRx4c8-wasmsimd.c.in b/src/qs8-gemm/MRx4c8-wasmsimd.c.in index dbc36b27ca7..2a95019e752 100644 --- a/src/qs8-gemm/MRx4c8-wasmsimd.c.in +++ b/src/qs8-gemm/MRx4c8-wasmsimd.c.in @@ -124,16 +124,13 @@ void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_ukernel_${MR}x4c8__wasmsimd${LOAD_SUFFIX} v128_t vacc${M}x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc${M}x02, vacc${M}x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc${M}x02, vacc${M}x13, 2, 6, 3, 7)); $for M in range(MR): - const v128_t vsign${M}x0123 = wasm_i32x4_lt(vacc${M}x0123, vzero); - - $for M in range(MR): - const v128_t vacc${M}x01 = wasm_v32x4_shuffle(vacc${M}x0123, vsign${M}x0123, 0, 4, 1, 5); + const v128_t vacc${M}x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc${M}x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); $for M in range(MR): const v128_t vprod${M}x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc${M}x01, vmultiplier), vrounding); - const v128_t vacc${M}x23 = wasm_v32x4_shuffle(vacc${M}x0123, vsign${M}x0123, 2, 6, 3, 7); + const v128_t vacc${M}x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc${M}x0123); $for M in range(MR): const v128_t vprod${M}x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc${M}x23, vmultiplier), vrounding); diff --git a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c index ba3e4f19728..1ae9711a7e7 100644 --- a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c +++ b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c @@ -85,14 +85,12 @@ void xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128( v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); diff --git a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c index 428426a177d..396bee0aeb5 100644 --- a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c +++ b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c @@ -81,14 +81,12 @@ void xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64( v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c index 7cc764350de..71debe00a10 100644 --- a/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c +++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c @@ -81,14 +81,12 @@ void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__wasmsimd( v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); diff --git a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c index a77acde9d7e..43605700681 100644 --- a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c +++ b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c @@ -112,18 +112,15 @@ void xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128( v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - const v128_t vsign1x0123 = wasm_i32x4_lt(vacc1x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); - const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); + const v128_t vacc1x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc1x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding); - const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7); + const v128_t vacc1x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc1x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding); diff --git a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c index c3f54833360..f0f1be9aff8 100644 --- a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c +++ b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c @@ -108,18 +108,15 @@ void xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64( v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - const v128_t vsign1x0123 = wasm_i32x4_lt(vacc1x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); - const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); + const v128_t vacc1x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc1x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding); - const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7); + const v128_t vacc1x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc1x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding); diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c index 00a234e535c..a79c6da4977 100644 --- a/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c +++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c @@ -108,18 +108,15 @@ void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd( v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - const v128_t vsign1x0123 = wasm_i32x4_lt(vacc1x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); - const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); + const v128_t vacc1x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc1x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding); - const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7); + const v128_t vacc1x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc1x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding); diff --git a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c index 8a037edc544..be1bec15346 100644 --- a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c +++ b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c @@ -139,22 +139,18 @@ void xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128( v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - const v128_t vsign1x0123 = wasm_i32x4_lt(vacc1x0123, vzero); - const v128_t vsign2x0123 = wasm_i32x4_lt(vacc2x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); - const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5); - const v128_t vacc2x01 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); + const v128_t vacc1x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc1x0123); + const v128_t vacc2x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc2x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding); - const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7); + const v128_t vacc1x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc1x0123); const v128_t vprod2x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc2x01, vmultiplier), vrounding); - const v128_t vacc2x23 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 2, 6, 3, 7); + const v128_t vacc2x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc2x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding); diff --git a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c index f63c0f0b3b5..e294b281844 100644 --- a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c +++ b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c @@ -135,22 +135,18 @@ void xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64( v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - const v128_t vsign1x0123 = wasm_i32x4_lt(vacc1x0123, vzero); - const v128_t vsign2x0123 = wasm_i32x4_lt(vacc2x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); - const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5); - const v128_t vacc2x01 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); + const v128_t vacc1x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc1x0123); + const v128_t vacc2x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc2x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding); - const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7); + const v128_t vacc1x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc1x0123); const v128_t vprod2x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc2x01, vmultiplier), vrounding); - const v128_t vacc2x23 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 2, 6, 3, 7); + const v128_t vacc2x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc2x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding); diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c index 42583376042..ba76a230fa2 100644 --- a/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c +++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c @@ -135,22 +135,18 @@ void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd( v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - const v128_t vsign1x0123 = wasm_i32x4_lt(vacc1x0123, vzero); - const v128_t vsign2x0123 = wasm_i32x4_lt(vacc2x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); - const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5); - const v128_t vacc2x01 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); + const v128_t vacc1x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc1x0123); + const v128_t vacc2x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc2x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding); - const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7); + const v128_t vacc1x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc1x0123); const v128_t vprod2x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc2x01, vmultiplier), vrounding); - const v128_t vacc2x23 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 2, 6, 3, 7); + const v128_t vacc2x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc2x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding); diff --git a/src/qs8-igemm/MRx4c8-wasmsimd.c.in b/src/qs8-igemm/MRx4c8-wasmsimd.c.in index 51dc5cb09d8..faa46acb1fe 100644 --- a/src/qs8-igemm/MRx4c8-wasmsimd.c.in +++ b/src/qs8-igemm/MRx4c8-wasmsimd.c.in @@ -134,16 +134,13 @@ void xnn_qs8_igemm${GEMM_SUFFIX}_minmax_ukernel_${MR}x4c8__wasmsimd${LOAD_SUFFIX v128_t vacc${M}x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc${M}x02, vacc${M}x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc${M}x02, vacc${M}x13, 2, 6, 3, 7)); $for M in range(MR): - const v128_t vsign${M}x0123 = wasm_i32x4_lt(vacc${M}x0123, vzero); - - $for M in range(MR): - const v128_t vacc${M}x01 = wasm_v32x4_shuffle(vacc${M}x0123, vsign${M}x0123, 0, 4, 1, 5); + const v128_t vacc${M}x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc${M}x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); $for M in range(MR): const v128_t vprod${M}x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc${M}x01, vmultiplier), vrounding); - const v128_t vacc${M}x23 = wasm_v32x4_shuffle(vacc${M}x0123, vsign${M}x0123, 2, 6, 3, 7); + const v128_t vacc${M}x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc${M}x0123); $for M in range(MR): const v128_t vprod${M}x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc${M}x23, vmultiplier), vrounding); diff --git a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c index 8a104c0b793..5839158b6d2 100644 --- a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c +++ b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c @@ -98,14 +98,12 @@ void xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld128( v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); diff --git a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c index 15bc8bb1c81..0d3fb58f484 100644 --- a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c +++ b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c @@ -94,14 +94,12 @@ void xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64( v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); diff --git a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c index 11188d49a7d..590f02fcf71 100644 --- a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c +++ b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c @@ -127,18 +127,15 @@ void xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128( v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - const v128_t vsign1x0123 = wasm_i32x4_lt(vacc1x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); - const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); + const v128_t vacc1x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc1x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding); - const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7); + const v128_t vacc1x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc1x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding); diff --git a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c index a85d4d77327..5bb67f264e3 100644 --- a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c +++ b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c @@ -123,18 +123,15 @@ void xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64( v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - const v128_t vsign1x0123 = wasm_i32x4_lt(vacc1x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); - const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); + const v128_t vacc1x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc1x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding); - const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7); + const v128_t vacc1x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc1x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding); diff --git a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c index 0c95932f1ff..bbbc7c7e3ef 100644 --- a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c +++ b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c @@ -156,22 +156,18 @@ void xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128( v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - const v128_t vsign1x0123 = wasm_i32x4_lt(vacc1x0123, vzero); - const v128_t vsign2x0123 = wasm_i32x4_lt(vacc2x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); - const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5); - const v128_t vacc2x01 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); + const v128_t vacc1x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc1x0123); + const v128_t vacc2x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc2x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding); - const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7); + const v128_t vacc1x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc1x0123); const v128_t vprod2x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc2x01, vmultiplier), vrounding); - const v128_t vacc2x23 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 2, 6, 3, 7); + const v128_t vacc2x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc2x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding); diff --git a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c index 7f22e3d6472..98383dbc878 100644 --- a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c +++ b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c @@ -152,22 +152,18 @@ void xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64( v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7)); - const v128_t vsign0x0123 = wasm_i32x4_lt(vacc0x0123, vzero); - const v128_t vsign1x0123 = wasm_i32x4_lt(vacc1x0123, vzero); - const v128_t vsign2x0123 = wasm_i32x4_lt(vacc2x0123, vzero); - - const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5); - const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5); - const v128_t vacc2x01 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 0, 4, 1, 5); + const v128_t vacc0x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc0x0123); + const v128_t vacc1x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc1x0123); + const v128_t vacc2x01 = __builtin_wasm_widen_low_s_i32x4_i64x2(vacc2x0123); const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier); const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding); const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding); - const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7); + const v128_t vacc0x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc0x0123); const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding); - const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7); + const v128_t vacc1x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc1x0123); const v128_t vprod2x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc2x01, vmultiplier), vrounding); - const v128_t vacc2x23 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 2, 6, 3, 7); + const v128_t vacc2x23 = __builtin_wasm_widen_high_s_i32x4_i64x2(vacc2x0123); const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding); const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding); diff --git a/src/qs8-requantization/q31-wasmsimd.c b/src/qs8-requantization/q31-wasmsimd.c index be8b99fc030..a007dd3844c 100644 --- a/src/qs8-requantization/q31-wasmsimd.c +++ b/src/qs8-requantization/q31-wasmsimd.c @@ -61,20 +61,15 @@ void xnn_qs8_requantize_q31__wasmsimd( const v128_t w = wasm_v128_load(input + 12); input += 16; - const v128_t x_sign = wasm_i32x4_lt(x, vzero); - const v128_t y_sign = wasm_i32x4_lt(y, vzero); - const v128_t z_sign = wasm_i32x4_lt(z, vzero); - const v128_t w_sign = wasm_i32x4_lt(w, vzero); - - const v128_t x_lo = wasm_v32x4_shuffle(x, x_sign, 0, 4, 1, 5); - const v128_t y_lo = wasm_v32x4_shuffle(y, y_sign, 0, 4, 1, 5); - const v128_t z_lo = wasm_v32x4_shuffle(z, z_sign, 0, 4, 1, 5); - const v128_t w_lo = wasm_v32x4_shuffle(w, w_sign, 0, 4, 1, 5); - - const v128_t x_hi = wasm_v32x4_shuffle(x, x_sign, 2, 6, 3, 7); - const v128_t y_hi = wasm_v32x4_shuffle(y, y_sign, 2, 6, 3, 7); - const v128_t z_hi = wasm_v32x4_shuffle(z, z_sign, 2, 6, 3, 7); - const v128_t w_hi = wasm_v32x4_shuffle(w, w_sign, 2, 6, 3, 7); + const v128_t x_lo = __builtin_wasm_widen_low_s_i32x4_i64x2(x); + const v128_t y_lo = __builtin_wasm_widen_low_s_i32x4_i64x2(y); + const v128_t z_lo = __builtin_wasm_widen_low_s_i32x4_i64x2(z); + const v128_t w_lo = __builtin_wasm_widen_low_s_i32x4_i64x2(w); + + const v128_t x_hi = __builtin_wasm_widen_high_s_i32x4_i64x2(x); + const v128_t y_hi = __builtin_wasm_widen_high_s_i32x4_i64x2(y); + const v128_t z_hi = __builtin_wasm_widen_high_s_i32x4_i64x2(z); + const v128_t w_hi = __builtin_wasm_widen_high_s_i32x4_i64x2(w); const v128_t x_product_lo = wasm_i64x2_add(wasm_i64x2_mul(x_lo, vmultiplier), vtwice_q31rounding); const v128_t y_product_lo = wasm_i64x2_add(wasm_i64x2_mul(y_lo, vmultiplier), vtwice_q31rounding); diff --git a/src/qu8-requantization/q31-wasmsimd.c b/src/qu8-requantization/q31-wasmsimd.c index 2e64ffb03eb..896093e89bd 100644 --- a/src/qu8-requantization/q31-wasmsimd.c +++ b/src/qu8-requantization/q31-wasmsimd.c @@ -61,20 +61,15 @@ void xnn_qu8_requantize_q31__wasmsimd( const v128_t w = wasm_v128_load(input + 12); input += 16; - const v128_t x_sign = wasm_i32x4_lt(x, vzero); - const v128_t y_sign = wasm_i32x4_lt(y, vzero); - const v128_t z_sign = wasm_i32x4_lt(z, vzero); - const v128_t w_sign = wasm_i32x4_lt(w, vzero); - - const v128_t x_lo = wasm_v32x4_shuffle(x, x_sign, 0, 4, 1, 5); - const v128_t y_lo = wasm_v32x4_shuffle(y, y_sign, 0, 4, 1, 5); - const v128_t z_lo = wasm_v32x4_shuffle(z, z_sign, 0, 4, 1, 5); - const v128_t w_lo = wasm_v32x4_shuffle(w, w_sign, 0, 4, 1, 5); - - const v128_t x_hi = wasm_v32x4_shuffle(x, x_sign, 2, 6, 3, 7); - const v128_t y_hi = wasm_v32x4_shuffle(y, y_sign, 2, 6, 3, 7); - const v128_t z_hi = wasm_v32x4_shuffle(z, z_sign, 2, 6, 3, 7); - const v128_t w_hi = wasm_v32x4_shuffle(w, w_sign, 2, 6, 3, 7); + const v128_t x_lo = __builtin_wasm_widen_low_s_i32x4_i64x2(x); + const v128_t y_lo = __builtin_wasm_widen_low_s_i32x4_i64x2(y); + const v128_t z_lo = __builtin_wasm_widen_low_s_i32x4_i64x2(z); + const v128_t w_lo = __builtin_wasm_widen_low_s_i32x4_i64x2(w); + + const v128_t x_hi = __builtin_wasm_widen_high_s_i32x4_i64x2(x); + const v128_t y_hi = __builtin_wasm_widen_high_s_i32x4_i64x2(y); + const v128_t z_hi = __builtin_wasm_widen_high_s_i32x4_i64x2(z); + const v128_t w_hi = __builtin_wasm_widen_high_s_i32x4_i64x2(w); const v128_t x_product_lo = wasm_i64x2_add(wasm_i64x2_mul(x_lo, vmultiplier), vtwice_q31rounding); const v128_t y_product_lo = wasm_i64x2_add(wasm_i64x2_mul(y_lo, vmultiplier), vtwice_q31rounding);