diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 613606d43b8097..98e848bd23e93c 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -644,6 +644,10 @@ class CodeGen final : public CodeGenInterface void genAmd64EmitterUnitTestsCTEST(); #endif +#if defined(TARGET_WASM) + void genWasmEmitterUnitTestsSimd(); +#endif + #endif // defined(DEBUG) #ifdef TARGET_ARM64 diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h index ea160060717233..e696c886cd9e28 100644 --- a/src/coreclr/jit/codegeninterface.h +++ b/src/coreclr/jit/codegeninterface.h @@ -212,6 +212,12 @@ class CodeGenInterface bool IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op); #endif // TARGET_XARCH +#if defined(TARGET_WASM) + // On wasm, we store the simd element size in the upper 7 bits of the instruction info. + // The lower bit is reserved as an FP flag. + static constexpr unsigned InstInfoElemSizeShift = 1; + static uint8_t instSimdElemSize(instruction ins); +#endif //------------------------------------------------------------------------- // Liveness-related fields & methods public: diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 9ab94d99448c94..00b5ccf2740ac2 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -2703,6 +2703,7 @@ void CodeGen::genCodeForSetcc(GenTreeCC* setcc) * Possible values for JitEmitUnitTestsSections: * Amd64: all, sse2 * Arm64: all, general, advsimd, sve + * Wasm: all, simd */ #if defined(DEBUG) @@ -2727,7 +2728,14 @@ void CodeGen::genEmitterUnitTests() // Jump over the generated tests as they are not intended to be run. BasicBlock* skipLabel = genCreateTempLabel(); +#ifndef TARGET_WASM inst_JMP(EJ_jmp, skipLabel); +#else + // On Wasm, we skip over the generated emitter test code by nesting it in a block where the + // first instruction branches to the end of the block. 
+ GetEmitter()->emitIns_BlockTy(INS_block); + GetEmitter()->emitIns_J(INS_br, EA_4BYTE, 0, nullptr); +#endif // Add NOPs at the start and end for easier script parsing. instGen(INS_nop); @@ -2777,6 +2785,13 @@ void CodeGen::genEmitterUnitTests() { genArm64EmitterUnitTestsPac(); } + +#elif defined(TARGET_WASM) + if (unitTestSectionAll || (strstr(unitTestSection, "simd") != nullptr)) + { + genWasmEmitterUnitTestsSimd(); + } + instGen(INS_end); #endif genDefineTempLabel(skipLabel); diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp index db26846e35e0b7..6f9ddcf8a1fd65 100644 --- a/src/coreclr/jit/codegenwasm.cpp +++ b/src/coreclr/jit/codegenwasm.cpp @@ -3445,6 +3445,238 @@ void CodeGen::inst_JMP(emitJumpKind jmp, BasicBlock* tgtBlock) GetEmitter()->emitIns_J(instr, EA_4BYTE, depth, tgtBlock); } +#if defined(DEBUG) + +//------------------------------------------------------------------------ +// genWasmEmitterUnitTestsSimd: Exercise the packed SIMD instruction emit +// functions added for Wasm (v128.const, extract/replace lane, shuffle, +// load/store lane, and plain-opcode SIMD instructions). +// +// This is a temporary debug-only test that verifies the encoding paths +// do not assert or crash. Each instruction is emitted with valid stack +// operands so the resulting bytecode is semantically valid Wasm. +// +void CodeGen::genWasmEmitterUnitTestsSimd() +{ + emitter* emit = GetEmitter(); + + // Helper macros to push typed constants, ensuring valid stack state. 
+ // clang-format off +#define PUSH_V128(bytes) emit->emitIns_V128Imm(INS_v128_const, (bytes)) +#define PUSH_I32(val) emit->emitIns_I(INS_i32_const, EA_4BYTE, (val)) +#define PUSH_I64(val) emit->emitIns_I(INS_i64_const, EA_8BYTE, (val)) +#define PUSH_F32(val) emit->emitIns_I(INS_f32_const, EA_4BYTE, (val)) +#define PUSH_F64(val) emit->emitIns_I(INS_f64_const, EA_8BYTE, (val)) +#define DROP emit->emitIns(INS_drop) + + // Unary v128 -> result: push operand, emit instruction, drop result +#define TEST_UNARY_V128(bytes, ins) \ + PUSH_V128(bytes); \ + emit->emitIns(ins); \ + DROP + + // Binary v128 x v128 -> v128: push two operands, emit instruction, drop result +#define TEST_BINARY_V128(bytes, ins) \ + PUSH_V128(bytes); \ + PUSH_V128(bytes); \ + emit->emitIns(ins); \ + DROP + + // Extract lane: v128 -> scalar (i32/i64/f32/f64), then drop +#define TEST_EXTRACT_LANE(bytes, ins, attr, lane) \ + PUSH_V128(bytes); \ + emit->emitIns_Lane(ins, attr, lane); \ + DROP + + // Replace lane: [v128, scalar] -> v128, then drop +#define TEST_REPLACE_LANE_I32(bytes, ins, attr, lane) \ + PUSH_V128(bytes); \ + PUSH_I32(42); \ + emit->emitIns_Lane(ins, attr, lane); \ + DROP + +#define TEST_REPLACE_LANE_I64(bytes, ins, attr, lane) \ + PUSH_V128(bytes); \ + PUSH_I64(42); \ + emit->emitIns_Lane(ins, attr, lane); \ + DROP + +#define TEST_REPLACE_LANE_F32(bytes, ins, attr, lane) \ + PUSH_V128(bytes); \ + PUSH_F32(0); \ + emit->emitIns_Lane(ins, attr, lane); \ + DROP + +#define TEST_REPLACE_LANE_F64(bytes, ins, attr, lane) \ + PUSH_V128(bytes); \ + PUSH_F64(0); \ + emit->emitIns_Lane(ins, attr, lane); \ + DROP + + // Load lane: [i32_addr, v128] -> v128, then drop +#define TEST_LOAD_LANE(bytes, ins, attr, offset, lane) \ + PUSH_I32(0); \ + PUSH_V128(bytes); \ + emit->emitIns_MemargLane(ins, attr, offset, lane); \ + DROP + + // Store lane: [i32_addr, v128] -> void +#define TEST_STORE_LANE(bytes, ins, attr, offset, lane) \ + PUSH_I32(0); \ + PUSH_V128(bytes); \ + emit->emitIns_MemargLane(ins, 
attr, offset, lane) + + // Shuffle: [v128, v128] -> v128, then drop +#define TEST_SHUFFLE(bytes, shuffleBytes) \ + PUSH_V128(bytes); \ + PUSH_V128(bytes); \ + emit->emitIns_V128Imm(INS_i8x16_shuffle, shuffleBytes); \ + DROP + // clang-format on + + // --- IF_V128: v128.const with 16 raw bytes --- + const uint8_t v128Bytes[16] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}; + PUSH_V128(v128Bytes); + DROP; + + // All-zeros and all-ones constants + const uint8_t v128Zeros[16] = {0}; + PUSH_V128(v128Zeros); + DROP; + + const uint8_t v128Ones[16] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}; + PUSH_V128(v128Ones); + DROP; + + // --- IF_LANE: extract/replace lane instructions --- + // i8x16 lanes (0..15) + TEST_EXTRACT_LANE(v128Ones, INS_i8x16_extract_lane_s, EA_1BYTE, 0); + TEST_EXTRACT_LANE(v128Ones, INS_i8x16_extract_lane_u, EA_1BYTE, 15); + TEST_REPLACE_LANE_I32(v128Ones, INS_i8x16_replace_lane, EA_1BYTE, 7); + + // i16x8 lanes (0..7) + TEST_EXTRACT_LANE(v128Ones, INS_i16x8_extract_lane_s, EA_2BYTE, 0); + TEST_EXTRACT_LANE(v128Ones, INS_i16x8_extract_lane_u, EA_2BYTE, 7); + TEST_REPLACE_LANE_I32(v128Ones, INS_i16x8_replace_lane, EA_2BYTE, 3); + + // i32x4 lanes (0..3) + TEST_EXTRACT_LANE(v128Ones, INS_i32x4_extract_lane, EA_4BYTE, 0); + TEST_REPLACE_LANE_I32(v128Ones, INS_i32x4_replace_lane, EA_4BYTE, 3); + + // i64x2 lanes (0..1) + TEST_EXTRACT_LANE(v128Ones, INS_i64x2_extract_lane, EA_8BYTE, 0); + TEST_REPLACE_LANE_I64(v128Ones, INS_i64x2_replace_lane, EA_8BYTE, 1); + + // f32x4 lanes (0..3) + TEST_EXTRACT_LANE(v128Ones, INS_f32x4_extract_lane, EA_4BYTE, 3); + TEST_REPLACE_LANE_F32(v128Ones, INS_f32x4_replace_lane, EA_4BYTE, 0); + + // f64x2 lanes (0..1) + TEST_EXTRACT_LANE(v128Ones, INS_f64x2_extract_lane, EA_8BYTE, 0); + TEST_REPLACE_LANE_F64(v128Ones, INS_f64x2_replace_lane, EA_8BYTE, 1); + + // --- IF_MEMARG_LANE: load/store lane with memarg --- + 
TEST_LOAD_LANE(v128Ones, INS_v128_load8_lane, EA_1BYTE, 0, 5); + TEST_LOAD_LANE(v128Ones, INS_v128_load16_lane, EA_2BYTE, 16, 3); + TEST_LOAD_LANE(v128Ones, INS_v128_load32_lane, EA_4BYTE, 64, 2); + TEST_LOAD_LANE(v128Ones, INS_v128_load64_lane, EA_8BYTE, 128, 1); + TEST_STORE_LANE(v128Ones, INS_v128_store8_lane, EA_1BYTE, 0, 0); + TEST_STORE_LANE(v128Ones, INS_v128_store16_lane, EA_2BYTE, 8, 7); + TEST_STORE_LANE(v128Ones, INS_v128_store32_lane, EA_4BYTE, 32, 1); + TEST_STORE_LANE(v128Ones, INS_v128_store64_lane, EA_8BYTE, 256, 0); + + // --- IF_V128: i8x16.shuffle with 16 lane-index bytes --- + // Identity shuffle + const uint8_t identityShuffle[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + TEST_SHUFFLE(v128Bytes, identityShuffle); + + // Reverse bytes + const uint8_t reverseShuffle[16] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + TEST_SHUFFLE(v128Bytes, reverseShuffle); + + // Cross-operand shuffle (indices 16..31 refer to the second operand) + const uint8_t crossShuffle[16] = {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31}; + TEST_SHUFFLE(v128Bytes, crossShuffle); + + // --- IF_OPCODE: plain opcode SIMD instructions (representative sample) --- + // Splat operations: push scalar, splat to v128, drop + PUSH_I32(1); + emit->emitIns(INS_i8x16_splat); + DROP; + + PUSH_I32(2); + emit->emitIns(INS_i16x8_splat); + DROP; + + PUSH_I32(3); + emit->emitIns(INS_i32x4_splat); + DROP; + + PUSH_I64(4); + emit->emitIns(INS_i64x2_splat); + DROP; + + PUSH_F32(0); + emit->emitIns(INS_f32x4_splat); + DROP; + + PUSH_F64(0); + emit->emitIns(INS_f64x2_splat); + DROP; + + // Swizzle: [v128, v128] -> v128 + TEST_BINARY_V128(v128Ones, INS_i8x16_swizzle); + + // A few comparisons: [v128, v128] -> v128 + TEST_BINARY_V128(v128Ones, INS_i8x16_eq); + TEST_BINARY_V128(v128Ones, INS_i32x4_ne); + TEST_BINARY_V128(v128Ones, INS_f64x2_lt); + + // A few arithmetic ops + TEST_BINARY_V128(v128Ones, INS_i8x16_add); + TEST_BINARY_V128(v128Ones, 
INS_i32x4_mul); + TEST_UNARY_V128(v128Ones, INS_f32x4_sqrt); + TEST_UNARY_V128(v128Ones, INS_f64x2_neg); + + // Bitwise ops + TEST_UNARY_V128(v128Ones, INS_v128_not); + TEST_BINARY_V128(v128Ones, INS_v128_and); + TEST_BINARY_V128(v128Ones, INS_v128_or); + TEST_BINARY_V128(v128Ones, INS_v128_xor); + TEST_BINARY_V128(v128Ones, INS_v128_andnot); + + // Bitmask / any_true / all_true: v128 -> i32 + TEST_UNARY_V128(v128Ones, INS_v128_any_true); + TEST_UNARY_V128(v128Ones, INS_i8x16_all_true); + TEST_UNARY_V128(v128Ones, INS_i32x4_bitmask); + + // Conversion operations: v128 -> v128 + TEST_UNARY_V128(v128Ones, INS_f32x4_convert_s_i32x4); + TEST_UNARY_V128(v128Ones, INS_f64x2_convert_low_u_i32x4); + TEST_UNARY_V128(v128Ones, INS_i32x4_trunc_sat_s_f32x4); + +#undef PUSH_V128 +#undef PUSH_I32 +#undef PUSH_I64 +#undef PUSH_F32 +#undef PUSH_F64 +#undef DROP +#undef TEST_UNARY_V128 +#undef TEST_BINARY_V128 +#undef TEST_EXTRACT_LANE +#undef TEST_REPLACE_LANE_I32 +#undef TEST_REPLACE_LANE_I64 +#undef TEST_REPLACE_LANE_F32 +#undef TEST_REPLACE_LANE_F64 +#undef TEST_LOAD_LANE +#undef TEST_STORE_LANE +#undef TEST_SHUFFLE +} + +#endif // defined(DEBUG) + void CodeGen::genCreateAndStoreGCInfo(unsigned codeSize, unsigned prologSize, unsigned epilogSize DEBUGARG(void* code)) { IAllocator* allowZeroAlloc = new (m_compiler, CMK_GC) CompIAllocator(m_compiler->getAllocatorGC()); diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 08326526046229..014d5f962e3013 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -689,6 +689,9 @@ class emitter // TODO-LoongArch64: not include SIMD-vector. 
static_assert(INS_count <= 512); instruction _idIns : 9; +#elif defined(TARGET_WASM) + static_assert(INS_count <= 512); + instruction _idIns : 9; #else static_assert(INS_count <= 256); instruction _idIns : 8; @@ -1321,6 +1324,17 @@ class emitter { return _idInsFmt == IF_TRY_TABLE; } + + bool idIsV128Imm() const + { + return _idInsFmt == IF_V128; + } + + bool idIsMemargLaneImm() const + { + return _idInsFmt == IF_MEMARG_LANE; + } + #endif #ifdef TARGET_ARM64 @@ -2414,6 +2428,41 @@ class emitter imm = i; } }; + + struct instrDescV128Imm : instrDesc + { + instrDescV128Imm() = delete; + + uint8_t v128Bytes[16]; + + void idV128Const(const uint8_t bytes[16]) + { + assert(bytes != nullptr); + memcpy(v128Bytes, bytes, 16); + } + + const uint8_t* idV128Const() const + { + return v128Bytes; + } + }; + + struct instrDescMemargLane : instrDescCns + { + instrDescMemargLane() = delete; + + uint8_t lane; + + void idLaneIdx(uint8_t idx) + { + lane = idx; + } + + uint8_t idLaneIdx() const + { + return lane; + } + }; #endif // TARGET_WASM #ifdef TARGET_RISCV64 diff --git a/src/coreclr/jit/emitfmtswasm.h b/src/coreclr/jit/emitfmtswasm.h index 9d65047ed0a510..0f052e198f2f07 100644 --- a/src/coreclr/jit/emitfmtswasm.h +++ b/src/coreclr/jit/emitfmtswasm.h @@ -46,6 +46,9 @@ IF_DEF(CALL_INDIRECT, IS_NONE, NONE) // IF_DEF(TRY_TABLE, IS_NONE, NONE) // IF_DEF(CATCH_DECL, IS_NONE, NONE) // +IF_DEF(V128, IS_NONE, NONE) // <16 raw bytes> +IF_DEF(LANE, IS_NONE, NONE) // <1 byte lane index> +IF_DEF(MEMARG_LANE, IS_NONE, NONE) // <ULEB128 align> <ULEB128 offset> <1 byte lane index> #undef IF_DEF #endif // !DEFINE_ID_OPS diff --git a/src/coreclr/jit/emitwasm.cpp b/src/coreclr/jit/emitwasm.cpp index e897fd1126a255..46e21686deb1a9 100644 --- a/src/coreclr/jit/emitwasm.cpp +++ b/src/coreclr/jit/emitwasm.cpp @@ -17,6 +17,44 @@ }; // clang-format on +bool isValidSimdElemSize(unsigned elemSize) +{ + // Valid SIMD configurations are i8x16, i16x8, i32x4, i64x2, f32x4, f64x2 + return (elemSize == 1) || (elemSize == 2) || (elemSize == 4) || (elemSize == 8); +} + +// 
-------------------------------------------------------------------------------------------------- +// isValidVectorIndex - returns true if the specified index is valid for the given SIMD element size +// Arguments: +// elemSize - element size in bytes (1, 2, 4, or 8) +// index - the index to validate + +bool emitter::isValidVectorIndex(uint8_t elemSize, uint8_t index) +{ + assert(isValidSimdElemSize(elemSize)); + + bool isValid = false; + switch (elemSize) + { + case 1: + isValid = (index < 16); + break; + case 2: + isValid = (index < 8); + break; + case 4: + isValid = (index < 4); + break; + case 8: + isValid = (index < 2); + break; + default: + unreached(); + } + + return isValid; +} + void emitter::emitIns(instruction ins) { instrDesc* id = emitNewInstrSmall(EA_8BYTE); @@ -374,6 +412,109 @@ unsigned int emitter::emitGetValTypeImmImm(const instrDesc* id) return static_cast(id)->imm; } +const uint8_t* emitter::emitGetV128ImmValue(const instrDesc* id) +{ + assert(id->idIsV128Imm()); + return static_cast<const instrDescV128Imm*>(id)->v128Bytes; +} + +uint8_t emitter::emitGetLaneImmValue(const instrDesc* id) +{ + if (id->idIsMemargLaneImm()) + { + return static_cast<const instrDescMemargLane*>(id)->lane; + } + else if (id->idInsFmt() == IF_LANE) + { + cnsval_size_t lane = emitGetInsSC(id); + assert(FitsIn<uint8_t>(lane)); + return static_cast<uint8_t>(lane); + } + else + { + unreached(); + } + + return 0; +} + +//------------------------------------------------------------------------ +// Packed SIMD instruction emit functions +//------------------------------------------------------------------------ + +//------------------------------------------------------------------------ +// emitIns_V128Imm: Emit a packed SIMD instruction with a 16 byte vector immediate. 
+// +// Arguments: +// ins - instruction (currently used with INS_v128_const and INS_i8x16_shuffle) +// bytes - pointer to 16 bytes of constant data +// +void emitter::emitIns_V128Imm(instruction ins, const uint8_t bytes[16]) +{ + assert(bytes != nullptr); + instrDescV128Imm* id = static_cast<instrDescV128Imm*>(emitAllocAnyInstr(sizeof(instrDescV128Imm), EA_16BYTE)); + insFormat fmt = emitInsFormat(ins); + assert(fmt == IF_V128); + + id->idInsFmt(fmt); + id->idIns(ins); + id->idV128Const(bytes); + + dispIns(id); + appendToCurIG(id); +} + +//------------------------------------------------------------------------ +// emitIns_Lane: Emit a SIMD extract/replace lane instruction. +// +// Arguments: +// ins - instruction (e.g., INS_i8x16_extract_lane_s) +// attr - emit attribute indicating the lane element size +// laneIdx - lane index byte +// +void emitter::emitIns_Lane(instruction ins, emitAttr attr, uint8_t laneIdx) +{ + instrDesc* id = emitNewInstrSC(attr, laneIdx); + insFormat fmt = emitInsFormat(ins); + uint8_t elemSize = CodeGenInterface::instSimdElemSize(ins); + assert(fmt == IF_LANE); + assert(isValidVectorIndex(elemSize, laneIdx)); + + id->idInsFmt(fmt); + id->idIns(ins); + + dispIns(id); + appendToCurIG(id); +} + +//------------------------------------------------------------------------ +// emitIns_MemargLane: Emit a SIMD load/store lane instruction with memarg + lane index. 
+// +// Arguments: +// ins - instruction (e.g., INS_v128_load8_lane) +// attr - emit attribute indicating the memory access size +// offset - memory offset for the memarg +// laneIdx - lane index byte +// +void emitter::emitIns_MemargLane(instruction ins, emitAttr attr, cnsval_ssize_t offset, uint8_t laneIdx) +{ + instrDescMemargLane* id = static_cast<instrDescMemargLane*>(emitAllocAnyInstr(sizeof(instrDescMemargLane), attr)); + insFormat fmt = emitInsFormat(ins); + uint8_t elemSize = CodeGenInterface::instSimdElemSize(ins); + assert(fmt == IF_MEMARG_LANE); + assert(offset >= 0); + assert(isValidVectorIndex(elemSize, laneIdx)); + + id->idInsFmt(fmt); + id->idIns(ins); + id->idcCnsVal = offset; + id->idSetIsLargeCns(); + id->idLaneIdx(laneIdx); + + dispIns(id); + appendToCurIG(id); +} + emitter::insFormat emitter::emitInsFormat(instruction ins) { static_assert(IF_COUNT < 255); @@ -425,6 +566,16 @@ size_t emitter::emitSizeOfInsDsc(instrDesc* id) const return SMALL_IDSC_SIZE; } + if (id->idIsMemargLaneImm()) + { + return sizeof(instrDescMemargLane); + } + + if (id->idIsV128Imm()) + { + return sizeof(instrDescV128Imm); + } + if (id->idIsLargeCns()) { assert(!id->idIsLargeDsp()); @@ -576,6 +727,21 @@ unsigned emitter::instrDesc::idCodeSize() const size += SizeOfULEB128(emitGetInsSC(this)); // control flow stack offset break; } + case IF_V128: + size += 16; // 16 raw bytes for the v128 constant + break; + case IF_LANE: + size += 1; // 1 byte lane index + break; + case IF_MEMARG_LANE: + { + uint64_t align = emitGetAlignHintLog2(this); + assert(align < 64); // spec: an align field >= 2^6 (bit 6 set) means a memidx follows (multiple memories). + size += SizeOfULEB128(align); + size += idIsCnsReloc() ? 
PADDED_RELOC_SIZE : SizeOfULEB128(emitGetInsSC(this)); + size += 1; // 1 byte lane index + break; + } default: unreached(); } @@ -897,6 +1063,32 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutputULEB128Padded(dst, (int64_t)size); break; } + case IF_V128: + { + dst += emitOutputOpcode(dst, ins); + const uint8_t* v128Value = emitGetV128ImmValue(id); + dst += emitRawBytes(dst, v128Value, 16); + break; + } + case IF_LANE: + { + dst += emitOutputOpcode(dst, ins); + uint8_t laneIdx = emitGetLaneImmValue(id); + dst += emitOutputByte(dst, laneIdx); + break; + } + case IF_MEMARG_LANE: + { + dst += emitOutputOpcode(dst, ins); + uint8_t laneIdx = emitGetLaneImmValue(id); + uint64_t align = emitGetAlignHintLog2(id); + uint64_t offset = emitGetInsSC(id); + assert(align < 64); + dst += emitOutputULEB128(dst, align); + dst += emitOutputULEB128(dst, offset); + dst += emitOutputByte(dst, laneIdx); + break; + } default: NYI_WASM("emitOutputInstr"); break; @@ -1182,6 +1374,35 @@ void emitter::emitDispIns( } break; + case IF_V128: + { + const uint8_t* imm = emitGetV128ImmValue(id); + for (int i = 0; i < 16; i++) + { + printf(" 0x%02x", imm[i]); + } + } + break; + + case IF_LANE: + { + uint8_t lane = emitGetLaneImmValue(id); + printf(" [%u]", (unsigned)lane); + } + break; + + case IF_MEMARG_LANE: + { + unsigned log2align = emitGetAlignHintLog2(id); + cnsval_ssize_t offset = emitGetInsSC(id); + printf(" %u %llu", log2align, (unsigned long long)offset); + dispLclVarInfoIfAny(); + + uint8_t lane = emitGetLaneImmValue(id); + printf(" [%u]", (unsigned)lane); + } + break; + default: unreached(); } diff --git a/src/coreclr/jit/emitwasm.h b/src/coreclr/jit/emitwasm.h index 030c73f7550889..b5bbff12e7cc8d 100644 --- a/src/coreclr/jit/emitwasm.h +++ b/src/coreclr/jit/emitwasm.h @@ -16,6 +16,7 @@ void emitDispInst(instruction ins); /************************************************************************/ public: +bool isValidVectorIndex(uint8_t elemSize, uint8_t 
index); void emitIns(instruction ins); void emitIns_BlockTy(instruction ins, WasmValueType valType = WasmValueType::Invalid); void emitIns_I(instruction ins, emitAttr attr, cnsval_ssize_t imm); @@ -31,6 +32,11 @@ void emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2) void emitIns_S_R(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs); +// Packed SIMD instruction emit functions +void emitIns_V128Imm(instruction ins, const uint8_t bytes[16]); +void emitIns_Lane(instruction ins, emitAttr attr, uint8_t laneIdx); +void emitIns_MemargLane(instruction ins, emitAttr attr, cnsval_ssize_t offset, uint8_t laneIdx); + void emitAddressConstant(void* address); void emitFuncletAddressConstant(cnsval_ssize_t funcletId); @@ -47,6 +53,9 @@ instrDesc* emitNewInstrValTypeImm(emitAttr attr, WasmValueType type, u static WasmValueType emitGetValTypeImmType(const instrDesc* id); static unsigned int emitGetValTypeImmImm(const instrDesc* id); +static const uint8_t* emitGetV128ImmValue(const instrDesc* id); +static uint8_t emitGetLaneImmValue(const instrDesc* id); + /************************************************************************/ /* Private members that deal with target-dependent instr. descriptors */ /************************************************************************/ diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 8e7451a9bc1a86..500e94832339cc 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -598,6 +598,14 @@ bool CodeGenInterface::instHasPseudoName(instruction ins) } #endif // TARGET_XARCH +#if defined(TARGET_WASM) +uint8_t CodeGenInterface::instSimdElemSize(instruction ins) +{ + assert((unsigned)ins < ArrLen(instInfo)); + return static_cast<uint8_t>(instInfo[ins] >> InstInfoElemSizeShift); +} +#endif + /***************************************************************************** * * Generate a set instruction. 
diff --git a/src/coreclr/jit/instrswasm.h b/src/coreclr/jit/instrswasm.h index 9982598dfdbb5b..90e87b177e7cba 100644 --- a/src/coreclr/jit/instrswasm.h +++ b/src/coreclr/jit/instrswasm.h @@ -244,6 +244,288 @@ INST2(i64_trunc_sat_f64_u, "i64.trunc_sat_f64_u", 0, IF_OPCODE, 0xFC, 7) INST2(memory_copy, "memory.copy", 0, IF_MEMIDX_MEMIDX, 0xFC, 10) INST2(memory_fill, "memory.fill", 0, IF_ULEB128, 0xFC, 11) +// 5.4.9 Vector Instructions (SIMD, 0xFD prefix) +// +// Memory operations (memarg format) +INST2(v128_load, "v128.load", 0, IF_MEMARG, 0xFD, 0) +INST2(v128_load8x8_s, "v128.load8x8_s", 0, IF_MEMARG, 0xFD, 1) +INST2(v128_load8x8_u, "v128.load8x8_u", 0, IF_MEMARG, 0xFD, 2) +INST2(v128_load16x4_s, "v128.load16x4_s", 0, IF_MEMARG, 0xFD, 3) +INST2(v128_load16x4_u, "v128.load16x4_u", 0, IF_MEMARG, 0xFD, 4) +INST2(v128_load32x2_s, "v128.load32x2_s", 0, IF_MEMARG, 0xFD, 5) +INST2(v128_load32x2_u, "v128.load32x2_u", 0, IF_MEMARG, 0xFD, 6) +INST2(v128_load8_splat, "v128.load8_splat", 0, IF_MEMARG, 0xFD, 7) +INST2(v128_load16_splat, "v128.load16_splat", 0, IF_MEMARG, 0xFD, 8) +INST2(v128_load32_splat, "v128.load32_splat", 0, IF_MEMARG, 0xFD, 9) +INST2(v128_load64_splat, "v128.load64_splat", 0, IF_MEMARG, 0xFD, 10) +INST2(v128_store, "v128.store", 0, IF_MEMARG, 0xFD, 11) + +// v128.const and i8x16.shuffle (special formats) +INST2(v128_const, "v128.const", 0, IF_V128, 0xFD, 12) +INST2(i8x16_shuffle, "i8x16.shuffle", 0, IF_V128, 0xFD, 13) + +// i8x16.swizzle (plain opcode) +INST2(i8x16_swizzle, "i8x16.swizzle", 0, IF_OPCODE, 0xFD, 14) + +// Splat operations +INST2(i8x16_splat, "i8x16.splat", 0, IF_OPCODE, 0xFD, 15) +INST2(i16x8_splat, "i16x8.splat", 0, IF_OPCODE, 0xFD, 16) +INST2(i32x4_splat, "i32x4.splat", 0, IF_OPCODE, 0xFD, 17) +INST2(i64x2_splat, "i64x2.splat", 0, IF_OPCODE, 0xFD, 18) +INST2(f32x4_splat, "f32x4.splat", 0, IF_OPCODE, 0xFD, 19) +INST2(f64x2_splat, "f64x2.splat", 0, IF_OPCODE, 0xFD, 20) + +// Extract/replace lane operations (lane index byte), info byte is 
(elemSize << 1) +INST2(i8x16_extract_lane_s, "i8x16.extract_lane_s", 2, IF_LANE, 0xFD, 21) +INST2(i8x16_extract_lane_u, "i8x16.extract_lane_u", 2, IF_LANE, 0xFD, 22) +INST2(i8x16_replace_lane, "i8x16.replace_lane", 2, IF_LANE, 0xFD, 23) +INST2(i16x8_extract_lane_s, "i16x8.extract_lane_s", 4, IF_LANE, 0xFD, 24) +INST2(i16x8_extract_lane_u, "i16x8.extract_lane_u", 4, IF_LANE, 0xFD, 25) +INST2(i16x8_replace_lane, "i16x8.replace_lane", 4, IF_LANE, 0xFD, 26) +INST2(i32x4_extract_lane, "i32x4.extract_lane", 8, IF_LANE, 0xFD, 27) +INST2(i32x4_replace_lane, "i32x4.replace_lane", 8, IF_LANE, 0xFD, 28) +INST2(i64x2_extract_lane, "i64x2.extract_lane", 16, IF_LANE, 0xFD, 29) +INST2(i64x2_replace_lane, "i64x2.replace_lane", 16, IF_LANE, 0xFD, 30) +INST2(f32x4_extract_lane, "f32x4.extract_lane", 8, IF_LANE, 0xFD, 31) +INST2(f32x4_replace_lane, "f32x4.replace_lane", 8, IF_LANE, 0xFD, 32) +INST2(f64x2_extract_lane, "f64x2.extract_lane", 16, IF_LANE, 0xFD, 33) +INST2(f64x2_replace_lane, "f64x2.replace_lane", 16, IF_LANE, 0xFD, 34) + +// i8x16 comparisons +INST2(i8x16_eq, "i8x16.eq", 0, IF_OPCODE, 0xFD, 35) +INST2(i8x16_ne, "i8x16.ne", 0, IF_OPCODE, 0xFD, 36) +INST2(i8x16_lt_s, "i8x16.lt_s", 0, IF_OPCODE, 0xFD, 37) +INST2(i8x16_lt_u, "i8x16.lt_u", 0, IF_OPCODE, 0xFD, 38) +INST2(i8x16_gt_s, "i8x16.gt_s", 0, IF_OPCODE, 0xFD, 39) +INST2(i8x16_gt_u, "i8x16.gt_u", 0, IF_OPCODE, 0xFD, 40) +INST2(i8x16_le_s, "i8x16.le_s", 0, IF_OPCODE, 0xFD, 41) +INST2(i8x16_le_u, "i8x16.le_u", 0, IF_OPCODE, 0xFD, 42) +INST2(i8x16_ge_s, "i8x16.ge_s", 0, IF_OPCODE, 0xFD, 43) +INST2(i8x16_ge_u, "i8x16.ge_u", 0, IF_OPCODE, 0xFD, 44) + +// i16x8 comparisons +INST2(i16x8_eq, "i16x8.eq", 0, IF_OPCODE, 0xFD, 45) +INST2(i16x8_ne, "i16x8.ne", 0, IF_OPCODE, 0xFD, 46) +INST2(i16x8_lt_s, "i16x8.lt_s", 0, IF_OPCODE, 0xFD, 47) +INST2(i16x8_lt_u, "i16x8.lt_u", 0, IF_OPCODE, 0xFD, 48) +INST2(i16x8_gt_s, "i16x8.gt_s", 0, IF_OPCODE, 0xFD, 49) +INST2(i16x8_gt_u, "i16x8.gt_u", 0, IF_OPCODE, 0xFD, 50) +INST2(i16x8_le_s, 
"i16x8.le_s", 0, IF_OPCODE, 0xFD, 51) +INST2(i16x8_le_u, "i16x8.le_u", 0, IF_OPCODE, 0xFD, 52) +INST2(i16x8_ge_s, "i16x8.ge_s", 0, IF_OPCODE, 0xFD, 53) +INST2(i16x8_ge_u, "i16x8.ge_u", 0, IF_OPCODE, 0xFD, 54) + +// i32x4 comparisons +INST2(i32x4_eq, "i32x4.eq", 0, IF_OPCODE, 0xFD, 55) +INST2(i32x4_ne, "i32x4.ne", 0, IF_OPCODE, 0xFD, 56) +INST2(i32x4_lt_s, "i32x4.lt_s", 0, IF_OPCODE, 0xFD, 57) +INST2(i32x4_lt_u, "i32x4.lt_u", 0, IF_OPCODE, 0xFD, 58) +INST2(i32x4_gt_s, "i32x4.gt_s", 0, IF_OPCODE, 0xFD, 59) +INST2(i32x4_gt_u, "i32x4.gt_u", 0, IF_OPCODE, 0xFD, 60) +INST2(i32x4_le_s, "i32x4.le_s", 0, IF_OPCODE, 0xFD, 61) +INST2(i32x4_le_u, "i32x4.le_u", 0, IF_OPCODE, 0xFD, 62) +INST2(i32x4_ge_s, "i32x4.ge_s", 0, IF_OPCODE, 0xFD, 63) +INST2(i32x4_ge_u, "i32x4.ge_u", 0, IF_OPCODE, 0xFD, 64) + +// i64x2 comparisons +INST2(i64x2_eq, "i64x2.eq", 0, IF_OPCODE, 0xFD, 214) +INST2(i64x2_ne, "i64x2.ne", 0, IF_OPCODE, 0xFD, 215) +INST2(i64x2_lt_s, "i64x2.lt_s", 0, IF_OPCODE, 0xFD, 216) +INST2(i64x2_gt_s, "i64x2.gt_s", 0, IF_OPCODE, 0xFD, 217) +INST2(i64x2_le_s, "i64x2.le_s", 0, IF_OPCODE, 0xFD, 218) +INST2(i64x2_ge_s, "i64x2.ge_s", 0, IF_OPCODE, 0xFD, 219) + +// f32x4 comparisons +INST2(f32x4_eq, "f32x4.eq", 0, IF_OPCODE, 0xFD, 65) +INST2(f32x4_ne, "f32x4.ne", 0, IF_OPCODE, 0xFD, 66) +INST2(f32x4_lt, "f32x4.lt", 0, IF_OPCODE, 0xFD, 67) +INST2(f32x4_gt, "f32x4.gt", 0, IF_OPCODE, 0xFD, 68) +INST2(f32x4_le, "f32x4.le", 0, IF_OPCODE, 0xFD, 69) +INST2(f32x4_ge, "f32x4.ge", 0, IF_OPCODE, 0xFD, 70) + +// f64x2 comparisons +INST2(f64x2_eq, "f64x2.eq", 0, IF_OPCODE, 0xFD, 71) +INST2(f64x2_ne, "f64x2.ne", 0, IF_OPCODE, 0xFD, 72) +INST2(f64x2_lt, "f64x2.lt", 0, IF_OPCODE, 0xFD, 73) +INST2(f64x2_gt, "f64x2.gt", 0, IF_OPCODE, 0xFD, 74) +INST2(f64x2_le, "f64x2.le", 0, IF_OPCODE, 0xFD, 75) +INST2(f64x2_ge, "f64x2.ge", 0, IF_OPCODE, 0xFD, 76) + +// v128 bitwise operations +INST2(v128_not, "v128.not", 0, IF_OPCODE, 0xFD, 77) +INST2(v128_and, "v128.and", 0, IF_OPCODE, 0xFD, 78) +INST2(v128_andnot, 
"v128.andnot", 0, IF_OPCODE, 0xFD, 79) +INST2(v128_or, "v128.or", 0, IF_OPCODE, 0xFD, 80) +INST2(v128_xor, "v128.xor", 0, IF_OPCODE, 0xFD, 81) +INST2(v128_bitselect, "v128.bitselect", 0, IF_OPCODE, 0xFD, 82) +INST2(v128_any_true, "v128.any_true", 0, IF_OPCODE, 0xFD, 83) + +// Load/store lane operations (memarg + lane index), info byte is (elemSize << 1) +INST2(v128_load8_lane, "v128.load8_lane", 2, IF_MEMARG_LANE, 0xFD, 84) +INST2(v128_load16_lane, "v128.load16_lane", 4, IF_MEMARG_LANE, 0xFD, 85) +INST2(v128_load32_lane, "v128.load32_lane", 8, IF_MEMARG_LANE, 0xFD, 86) +INST2(v128_load64_lane, "v128.load64_lane", 16, IF_MEMARG_LANE, 0xFD, 87) +INST2(v128_store8_lane, "v128.store8_lane", 2, IF_MEMARG_LANE, 0xFD, 88) +INST2(v128_store16_lane, "v128.store16_lane", 4, IF_MEMARG_LANE, 0xFD, 89) +INST2(v128_store32_lane, "v128.store32_lane", 8, IF_MEMARG_LANE, 0xFD, 90) +INST2(v128_store64_lane, "v128.store64_lane", 16, IF_MEMARG_LANE, 0xFD, 91) + +// Load zero operations (memarg) +INST2(v128_load32_zero, "v128.load32_zero", 0, IF_MEMARG, 0xFD, 92) +INST2(v128_load64_zero, "v128.load64_zero", 0, IF_MEMARG, 0xFD, 93) + +// Conversion: demote/promote (no sign specifier) +INST2(f32x4_demote_f64x2_zero, "f32x4.demote_f64x2_zero", 0, IF_OPCODE, 0xFD, 94) +INST2(f64x2_promote_low_f32x4, "f64x2.promote_low_f32x4", 0, IF_OPCODE, 0xFD, 95) + +// i8x16 arithmetic and other operations +INST2(i8x16_abs, "i8x16.abs", 0, IF_OPCODE, 0xFD, 96) +INST2(i8x16_neg, "i8x16.neg", 0, IF_OPCODE, 0xFD, 97) +INST2(i8x16_popcnt, "i8x16.popcnt", 0, IF_OPCODE, 0xFD, 98) +INST2(i8x16_all_true, "i8x16.all_true", 0, IF_OPCODE, 0xFD, 99) +INST2(i8x16_bitmask, "i8x16.bitmask", 0, IF_OPCODE, 0xFD, 100) +INST2(i8x16_narrow_i16x8_s, "i8x16.narrow_i16x8_s", 0, IF_OPCODE, 0xFD, 101) +INST2(i8x16_narrow_i16x8_u, "i8x16.narrow_i16x8_u", 0, IF_OPCODE, 0xFD, 102) +INST2(f32x4_ceil, "f32x4.ceil", 0, IF_OPCODE, 0xFD, 103) +INST2(f32x4_floor, "f32x4.floor", 0, IF_OPCODE, 0xFD, 104) +INST2(f32x4_trunc, "f32x4.trunc", 
0, IF_OPCODE, 0xFD, 105)
+INST2(f32x4_nearest,                 "f32x4.nearest",                 0, IF_OPCODE, 0xFD, 106)
+INST2(i8x16_shl,                     "i8x16.shl",                     0, IF_OPCODE, 0xFD, 107)
+INST2(i8x16_shr_s,                   "i8x16.shr_s",                   0, IF_OPCODE, 0xFD, 108)
+INST2(i8x16_shr_u,                   "i8x16.shr_u",                   0, IF_OPCODE, 0xFD, 109)
+INST2(i8x16_add,                     "i8x16.add",                     0, IF_OPCODE, 0xFD, 110)
+INST2(i8x16_add_sat_s,               "i8x16.add_sat_s",               0, IF_OPCODE, 0xFD, 111)
+INST2(i8x16_add_sat_u,               "i8x16.add_sat_u",               0, IF_OPCODE, 0xFD, 112)
+INST2(i8x16_sub,                     "i8x16.sub",                     0, IF_OPCODE, 0xFD, 113)
+INST2(i8x16_sub_sat_s,               "i8x16.sub_sat_s",               0, IF_OPCODE, 0xFD, 114)
+INST2(i8x16_sub_sat_u,               "i8x16.sub_sat_u",               0, IF_OPCODE, 0xFD, 115)
+INST2(f64x2_ceil,                    "f64x2.ceil",                    0, IF_OPCODE, 0xFD, 116)
+INST2(f64x2_floor,                   "f64x2.floor",                   0, IF_OPCODE, 0xFD, 117)
+INST2(i8x16_min_s,                   "i8x16.min_s",                   0, IF_OPCODE, 0xFD, 118)
+INST2(i8x16_min_u,                   "i8x16.min_u",                   0, IF_OPCODE, 0xFD, 119)
+INST2(i8x16_max_s,                   "i8x16.max_s",                   0, IF_OPCODE, 0xFD, 120)
+INST2(i8x16_max_u,                   "i8x16.max_u",                   0, IF_OPCODE, 0xFD, 121)
+INST2(f64x2_trunc,                   "f64x2.trunc",                   0, IF_OPCODE, 0xFD, 122)
+INST2(i8x16_avgr_u,                  "i8x16.avgr_u",                  0, IF_OPCODE, 0xFD, 123)
+
+// i16x8 operations (the i32x4.extadd_pairwise pair sits here as well, keeping the table in opcode order)
+INST2(i16x8_extadd_pairwise_s_i8x16, "i16x8.extadd_pairwise_s_i8x16", 0, IF_OPCODE, 0xFD, 124)
+INST2(i16x8_extadd_pairwise_u_i8x16, "i16x8.extadd_pairwise_u_i8x16", 0, IF_OPCODE, 0xFD, 125)
+INST2(i32x4_extadd_pairwise_s_i16x8, "i32x4.extadd_pairwise_s_i16x8", 0, IF_OPCODE, 0xFD, 126)
+INST2(i32x4_extadd_pairwise_u_i16x8, "i32x4.extadd_pairwise_u_i16x8", 0, IF_OPCODE, 0xFD, 127)
+INST2(i16x8_abs,                     "i16x8.abs",                     0, IF_OPCODE, 0xFD, 128)
+INST2(i16x8_neg,                     "i16x8.neg",                     0, IF_OPCODE, 0xFD, 129)
+INST2(i16x8_q15mulr_sat_s,           "i16x8.q15mulr_sat_s",           0, IF_OPCODE, 0xFD, 130)
+INST2(i16x8_all_true,                "i16x8.all_true",                0, IF_OPCODE, 0xFD, 131)
+INST2(i16x8_bitmask,                 "i16x8.bitmask",                 0, IF_OPCODE, 0xFD, 132)
+INST2(i16x8_narrow_i32x4_s,          "i16x8.narrow_i32x4_s",          0, IF_OPCODE, 0xFD, 133)
+INST2(i16x8_narrow_i32x4_u,          "i16x8.narrow_i32x4_u",          0, IF_OPCODE, 0xFD, 134)
+INST2(i16x8_extend_low_s_i8x16,      "i16x8.extend_low_s_i8x16",      0, IF_OPCODE, 0xFD, 135)
+INST2(i16x8_extend_high_s_i8x16,     "i16x8.extend_high_s_i8x16",     0, IF_OPCODE, 0xFD, 136)
+INST2(i16x8_extend_low_u_i8x16,      "i16x8.extend_low_u_i8x16",      0, IF_OPCODE, 0xFD, 137)
+INST2(i16x8_extend_high_u_i8x16,     "i16x8.extend_high_u_i8x16",     0, IF_OPCODE, 0xFD, 138)
+INST2(i16x8_shl,                     "i16x8.shl",                     0, IF_OPCODE, 0xFD, 139)
+INST2(i16x8_shr_s,                   "i16x8.shr_s",                   0, IF_OPCODE, 0xFD, 140)
+INST2(i16x8_shr_u,                   "i16x8.shr_u",                   0, IF_OPCODE, 0xFD, 141)
+INST2(i16x8_add,                     "i16x8.add",                     0, IF_OPCODE, 0xFD, 142)
+INST2(i16x8_add_sat_s,               "i16x8.add_sat_s",               0, IF_OPCODE, 0xFD, 143)
+INST2(i16x8_add_sat_u,               "i16x8.add_sat_u",               0, IF_OPCODE, 0xFD, 144)
+INST2(i16x8_sub,                     "i16x8.sub",                     0, IF_OPCODE, 0xFD, 145)
+INST2(i16x8_sub_sat_s,               "i16x8.sub_sat_s",               0, IF_OPCODE, 0xFD, 146)
+INST2(i16x8_sub_sat_u,               "i16x8.sub_sat_u",               0, IF_OPCODE, 0xFD, 147)
+INST2(f64x2_nearest,                 "f64x2.nearest",                 0, IF_OPCODE, 0xFD, 148)
+INST2(i16x8_mul,                     "i16x8.mul",                     0, IF_OPCODE, 0xFD, 149)
+INST2(i16x8_min_s,                   "i16x8.min_s",                   0, IF_OPCODE, 0xFD, 150)
+INST2(i16x8_min_u,                   "i16x8.min_u",                   0, IF_OPCODE, 0xFD, 151)
+INST2(i16x8_max_s,                   "i16x8.max_s",                   0, IF_OPCODE, 0xFD, 152)
+INST2(i16x8_max_u,                   "i16x8.max_u",                   0, IF_OPCODE, 0xFD, 153)
+INST2(i16x8_avgr_u,                  "i16x8.avgr_u",                  0, IF_OPCODE, 0xFD, 155)
+INST2(i16x8_extmul_low_s_i8x16,      "i16x8.extmul_low_s_i8x16",      0, IF_OPCODE, 0xFD, 156)
+INST2(i16x8_extmul_high_s_i8x16,     "i16x8.extmul_high_s_i8x16",     0, IF_OPCODE, 0xFD, 157)
+INST2(i16x8_extmul_low_u_i8x16,      "i16x8.extmul_low_u_i8x16",      0, IF_OPCODE, 0xFD, 158)
+INST2(i16x8_extmul_high_u_i8x16,     "i16x8.extmul_high_u_i8x16",     0, IF_OPCODE, 0xFD, 159)
+
+// i32x4 operations
+INST2(i32x4_abs,                     "i32x4.abs",                     0, IF_OPCODE, 0xFD, 160)
+INST2(i32x4_neg,                     "i32x4.neg",                     0, IF_OPCODE, 0xFD, 161)
+INST2(i32x4_all_true,                "i32x4.all_true",                0, IF_OPCODE, 0xFD, 163)
+INST2(i32x4_bitmask,                 "i32x4.bitmask",                 0, IF_OPCODE, 0xFD, 164)
+INST2(i32x4_extend_low_s_i16x8,      "i32x4.extend_low_s_i16x8",      0, IF_OPCODE, 0xFD, 167)
+INST2(i32x4_extend_high_s_i16x8,     "i32x4.extend_high_s_i16x8",     0, IF_OPCODE, 0xFD, 168)
+INST2(i32x4_extend_low_u_i16x8,      "i32x4.extend_low_u_i16x8",      0, IF_OPCODE, 0xFD, 169)
+INST2(i32x4_extend_high_u_i16x8,     "i32x4.extend_high_u_i16x8",     0, IF_OPCODE, 0xFD, 170)
+INST2(i32x4_shl,                     "i32x4.shl",                     0, IF_OPCODE, 0xFD, 171)
+INST2(i32x4_shr_s,                   "i32x4.shr_s",                   0, IF_OPCODE, 0xFD, 172)
+INST2(i32x4_shr_u,                   "i32x4.shr_u",                   0, IF_OPCODE, 0xFD, 173)
+INST2(i32x4_add,                     "i32x4.add",                     0, IF_OPCODE, 0xFD, 174)
+INST2(i32x4_sub,                     "i32x4.sub",                     0, IF_OPCODE, 0xFD, 177)
+INST2(i32x4_mul,                     "i32x4.mul",                     0, IF_OPCODE, 0xFD, 181)
+INST2(i32x4_min_s,                   "i32x4.min_s",                   0, IF_OPCODE, 0xFD, 182)
+INST2(i32x4_min_u,                   "i32x4.min_u",                   0, IF_OPCODE, 0xFD, 183)
+INST2(i32x4_max_s,                   "i32x4.max_s",                   0, IF_OPCODE, 0xFD, 184)
+INST2(i32x4_max_u,                   "i32x4.max_u",                   0, IF_OPCODE, 0xFD, 185)
+INST2(i32x4_dot_i16x8_s,             "i32x4.dot_i16x8_s",             0, IF_OPCODE, 0xFD, 186)
+INST2(i32x4_extmul_low_s_i16x8,      "i32x4.extmul_low_s_i16x8",      0, IF_OPCODE, 0xFD, 188)
+INST2(i32x4_extmul_high_s_i16x8,     "i32x4.extmul_high_s_i16x8",     0, IF_OPCODE, 0xFD, 189)
+INST2(i32x4_extmul_low_u_i16x8,      "i32x4.extmul_low_u_i16x8",      0, IF_OPCODE, 0xFD, 190)
+INST2(i32x4_extmul_high_u_i16x8,     "i32x4.extmul_high_u_i16x8",     0, IF_OPCODE, 0xFD, 191)
+
+// i64x2 operations
+INST2(i64x2_abs,                     "i64x2.abs",                     0, IF_OPCODE, 0xFD, 192)
+INST2(i64x2_neg,                     "i64x2.neg",                     0, IF_OPCODE, 0xFD, 193)
+INST2(i64x2_all_true,                "i64x2.all_true",                0, IF_OPCODE, 0xFD, 195)
+INST2(i64x2_bitmask,                 "i64x2.bitmask",                 0, IF_OPCODE, 0xFD, 196)
+INST2(i64x2_extend_low_s_i32x4,      "i64x2.extend_low_s_i32x4",      0, IF_OPCODE, 0xFD, 199)
+INST2(i64x2_extend_high_s_i32x4,     "i64x2.extend_high_s_i32x4",     0, IF_OPCODE, 0xFD, 200)
+INST2(i64x2_extend_low_u_i32x4,      "i64x2.extend_low_u_i32x4",      0, IF_OPCODE, 0xFD, 201)
+INST2(i64x2_extend_high_u_i32x4,     "i64x2.extend_high_u_i32x4",     0, IF_OPCODE, 0xFD, 202)
+INST2(i64x2_shl,                     "i64x2.shl",                     0, IF_OPCODE, 0xFD, 203)
+INST2(i64x2_shr_s,                   "i64x2.shr_s",                   0, IF_OPCODE, 0xFD, 204)
+INST2(i64x2_shr_u,                   "i64x2.shr_u",                   0, IF_OPCODE, 0xFD, 205)
+INST2(i64x2_add,                     "i64x2.add",                     0, IF_OPCODE, 0xFD, 206)
+INST2(i64x2_sub,                     "i64x2.sub",                     0, IF_OPCODE, 0xFD, 209)
+INST2(i64x2_mul,                     "i64x2.mul",                     0, IF_OPCODE, 0xFD, 213)
+INST2(i64x2_extmul_low_s_i32x4,      "i64x2.extmul_low_s_i32x4",      0, IF_OPCODE, 0xFD, 220)
+INST2(i64x2_extmul_high_s_i32x4,     "i64x2.extmul_high_s_i32x4",     0, IF_OPCODE, 0xFD, 221)
+INST2(i64x2_extmul_low_u_i32x4,      "i64x2.extmul_low_u_i32x4",      0, IF_OPCODE, 0xFD, 222)
+INST2(i64x2_extmul_high_u_i32x4,     "i64x2.extmul_high_u_i32x4",     0, IF_OPCODE, 0xFD, 223)
+
+// f32x4 arithmetic
+INST2(f32x4_abs,                     "f32x4.abs",                     0, IF_OPCODE, 0xFD, 224)
+INST2(f32x4_neg,                     "f32x4.neg",                     0, IF_OPCODE, 0xFD, 225)
+INST2(f32x4_sqrt,                    "f32x4.sqrt",                    0, IF_OPCODE, 0xFD, 227)
+INST2(f32x4_add,                     "f32x4.add",                     0, IF_OPCODE, 0xFD, 228)
+INST2(f32x4_sub,                     "f32x4.sub",                     0, IF_OPCODE, 0xFD, 229)
+INST2(f32x4_mul,                     "f32x4.mul",                     0, IF_OPCODE, 0xFD, 230)
+INST2(f32x4_div,                     "f32x4.div",                     0, IF_OPCODE, 0xFD, 231)
+INST2(f32x4_min,                     "f32x4.min",                     0, IF_OPCODE, 0xFD, 232)
+INST2(f32x4_max,                     "f32x4.max",                     0, IF_OPCODE, 0xFD, 233)
+INST2(f32x4_pmin,                    "f32x4.pmin",                    0, IF_OPCODE, 0xFD, 234)
+INST2(f32x4_pmax,                    "f32x4.pmax",                    0, IF_OPCODE, 0xFD, 235)
+
+// f64x2 arithmetic
+INST2(f64x2_abs,                     "f64x2.abs",                     0, IF_OPCODE, 0xFD, 236)
+INST2(f64x2_neg,                     "f64x2.neg",                     0, IF_OPCODE, 0xFD, 237)
+INST2(f64x2_sqrt,                    "f64x2.sqrt",                    0, IF_OPCODE, 0xFD, 239)
+INST2(f64x2_add,                     "f64x2.add",                     0, IF_OPCODE, 0xFD, 240)
+INST2(f64x2_sub,                     "f64x2.sub",                     0, IF_OPCODE, 0xFD, 241)
+INST2(f64x2_mul,                     "f64x2.mul",                     0, IF_OPCODE, 0xFD, 242)
+INST2(f64x2_div,                     "f64x2.div",                     0, IF_OPCODE, 0xFD, 243)
+INST2(f64x2_min,                     "f64x2.min",                     0, IF_OPCODE, 0xFD, 244)
+INST2(f64x2_max,                     "f64x2.max",                     0, IF_OPCODE, 0xFD, 245)
+INST2(f64x2_pmin,                    "f64x2.pmin",                    0, IF_OPCODE, 0xFD, 246)
+INST2(f64x2_pmax,                    "f64x2.pmax",                    0, IF_OPCODE, 0xFD, 247)
+
+// Conversion operations (sign specifier before source type per spec)
+INST2(i32x4_trunc_sat_s_f32x4,       "i32x4.trunc_sat_s_f32x4",       0, IF_OPCODE, 0xFD, 248)
+INST2(i32x4_trunc_sat_u_f32x4,       "i32x4.trunc_sat_u_f32x4",       0, IF_OPCODE, 0xFD, 249)
+INST2(f32x4_convert_s_i32x4,         "f32x4.convert_s_i32x4",         0, IF_OPCODE, 0xFD, 250)
+INST2(f32x4_convert_u_i32x4,         "f32x4.convert_u_i32x4",         0, IF_OPCODE, 0xFD, 251)
+INST2(i32x4_trunc_sat_s_f64x2_zero,  "i32x4.trunc_sat_s_f64x2_zero",  0, IF_OPCODE, 0xFD, 252)
+INST2(i32x4_trunc_sat_u_f64x2_zero,  "i32x4.trunc_sat_u_f64x2_zero",  0, IF_OPCODE, 0xFD, 253)
+INST2(f64x2_convert_low_s_i32x4,     "f64x2.convert_low_s_i32x4",     0, IF_OPCODE, 0xFD, 254)
+INST2(f64x2_convert_low_u_i32x4,     "f64x2.convert_low_u_i32x4",     0, IF_OPCODE, 0xFD, 255)
+
 // clang-format on
 #undef INST