diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 613606d43b8097..98e848bd23e93c 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -644,6 +644,10 @@ class CodeGen final : public CodeGenInterface
     void genAmd64EmitterUnitTestsCTEST();
 #endif
 
+#if defined(TARGET_WASM)
+    void genWasmEmitterUnitTestsSimd();
+#endif
+
 #endif // defined(DEBUG)
 
 #ifdef TARGET_ARM64
diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h
index ea160060717233..e696c886cd9e28 100644
--- a/src/coreclr/jit/codegeninterface.h
+++ b/src/coreclr/jit/codegeninterface.h
@@ -212,6 +212,12 @@ class CodeGenInterface
 
     bool IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op);
 #endif // TARGET_XARCH
+#if defined(TARGET_WASM)
+    // On wasm, we store the simd element size in the upper 7 bits of the instruction info.
+    // The lower bit is reserved as an FP flag.
+    static constexpr unsigned InstInfoElemSizeShift = 1;
+    static uint8_t            instSimdElemSize(instruction ins);
+#endif
     //-------------------------------------------------------------------------
     // Liveness-related fields & methods
 public:
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 9ab94d99448c94..00b5ccf2740ac2 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -2703,6 +2703,7 @@ void CodeGen::genCodeForSetcc(GenTreeCC* setcc)
  * Possible values for JitEmitUnitTestsSections:
  * Amd64: all, sse2
  * Arm64: all, general, advsimd, sve
+ * Wasm:  all, simd
  */
 
 #if defined(DEBUG)
@@ -2727,7 +2728,14 @@ void CodeGen::genEmitterUnitTests()
 
     // Jump over the generated tests as they are not intended to be run.
     BasicBlock* skipLabel = genCreateTempLabel();
+#ifndef TARGET_WASM
     inst_JMP(EJ_jmp, skipLabel);
+#else
+    // On Wasm, we skip over the generated emitter test code by nesting it in a block where the
+    // first instruction branches to the end of the block.
+    GetEmitter()->emitIns_BlockTy(INS_block);
+    GetEmitter()->emitIns_J(INS_br, EA_4BYTE, 0, nullptr);
+#endif
 
     // Add NOPs at the start and end for easier script parsing.
     instGen(INS_nop);
@@ -2777,6 +2785,13 @@ void CodeGen::genEmitterUnitTests()
     {
         genArm64EmitterUnitTestsPac();
     }
+
+#elif defined(TARGET_WASM)
+    if (unitTestSectionAll || (strstr(unitTestSection, "simd") != nullptr))
+    {
+        genWasmEmitterUnitTestsSimd();
+    }
+    instGen(INS_end);
 #endif
 
     genDefineTempLabel(skipLabel);
diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp
index db26846e35e0b7..6f9ddcf8a1fd65 100644
--- a/src/coreclr/jit/codegenwasm.cpp
+++ b/src/coreclr/jit/codegenwasm.cpp
@@ -3445,6 +3445,238 @@ void CodeGen::inst_JMP(emitJumpKind jmp, BasicBlock* tgtBlock)
     GetEmitter()->emitIns_J(instr, EA_4BYTE, depth, tgtBlock);
 }
 
+#if defined(DEBUG)
+
+//------------------------------------------------------------------------
+// genWasmEmitterUnitTestsSimd: Exercise the Wasm packed SIMD emit functions:
+//   emitIns_V128Imm (v128.const, i8x16.shuffle), emitIns_Lane (extract/replace
+//   lane), emitIns_MemargLane (load/store lane) and emitIns (plain opcodes).
+//
+//   Temporary debug-only smoke test: it only checks that the encoding paths do
+//   not assert or crash. Every instruction gets its stack operands pushed first
+//   and its result dropped, so the emitted code is intended to stay valid Wasm.
+//
+void CodeGen::genWasmEmitterUnitTestsSimd()
+{
+    emitter* emit = GetEmitter();
+
+    // Push/drop helpers. NOTE: the multi-statement TEST_* macros are only safe as standalone statements.
+    // clang-format off
+#define PUSH_V128(bytes) emit->emitIns_V128Imm(INS_v128_const, (bytes))
+#define PUSH_I32(val)    emit->emitIns_I(INS_i32_const, EA_4BYTE, (val))
+#define PUSH_I64(val)    emit->emitIns_I(INS_i64_const, EA_8BYTE, (val))
+#define PUSH_F32(val)    emit->emitIns_I(INS_f32_const, EA_4BYTE, (val))
+#define PUSH_F64(val)    emit->emitIns_I(INS_f64_const, EA_8BYTE, (val))
+#define DROP             emit->emitIns(INS_drop)
+
+    // Unary v128 -> result: push operand, emit instruction, drop result
+#define TEST_UNARY_V128(bytes, ins) \
+    PUSH_V128(bytes);               \
+    emit->emitIns(ins);             \
+    DROP
+
+    // Binary v128 x v128 -> v128: push two operands, emit instruction, drop result
+#define TEST_BINARY_V128(bytes, ins) \
+    PUSH_V128(bytes);                \
+    PUSH_V128(bytes);                \
+    emit->emitIns(ins);                  \
+    DROP
+
+    // Extract lane: v128 -> scalar (i32/i64/f32/f64), then drop
+#define TEST_EXTRACT_LANE(bytes, ins, attr, lane) \
+    PUSH_V128(bytes);                             \
+    emit->emitIns_Lane(ins, attr, lane);              \
+    DROP
+
+    // Replace lane: [v128, scalar] -> v128, then drop
+#define TEST_REPLACE_LANE_I32(bytes, ins, attr, lane) \
+    PUSH_V128(bytes);                                 \
+    PUSH_I32(42);                                     \
+    emit->emitIns_Lane(ins, attr, lane);                  \
+    DROP
+
+#define TEST_REPLACE_LANE_I64(bytes, ins, attr, lane) \
+    PUSH_V128(bytes);                                 \
+    PUSH_I64(42);                                     \
+    emit->emitIns_Lane(ins, attr, lane);                  \
+    DROP
+
+#define TEST_REPLACE_LANE_F32(bytes, ins, attr, lane) \
+    PUSH_V128(bytes);                                 \
+    PUSH_F32(0);                                      \
+    emit->emitIns_Lane(ins, attr, lane);                  \
+    DROP
+
+#define TEST_REPLACE_LANE_F64(bytes, ins, attr, lane) \
+    PUSH_V128(bytes);                                 \
+    PUSH_F64(0);                                      \
+    emit->emitIns_Lane(ins, attr, lane);                  \
+    DROP
+
+    // Load lane: [i32_addr, v128] -> v128, then drop
+#define TEST_LOAD_LANE(bytes, ins, attr, offset, lane) \
+    PUSH_I32(0);                                       \
+    PUSH_V128(bytes);                                  \
+    emit->emitIns_MemargLane(ins, attr, offset, lane); \
+    DROP
+
+    // Store lane: [i32_addr, v128] -> void
+#define TEST_STORE_LANE(bytes, ins, attr, offset, lane) \
+    PUSH_I32(0);                                        \
+    PUSH_V128(bytes);                                   \
+    emit->emitIns_MemargLane(ins, attr, offset, lane)
+
+    // Shuffle: [v128, v128] -> v128, then drop
+#define TEST_SHUFFLE(bytes, shuffleBytes) \
+    PUSH_V128(bytes);                     \
+    PUSH_V128(bytes);                     \
+    emit->emitIns_V128Imm(INS_i8x16_shuffle, shuffleBytes); \
+    DROP
+    // clang-format on
+
+    // --- IF_V128: v128.const with 16 raw bytes ---
+    const uint8_t v128Bytes[16] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                   0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F};
+    PUSH_V128(v128Bytes);
+    DROP;
+
+    // All-zeros and all-ones constants
+    const uint8_t v128Zeros[16] = {0};
+    PUSH_V128(v128Zeros);
+    DROP;
+
+    const uint8_t v128Ones[16] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+                                  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+    PUSH_V128(v128Ones);
+    DROP;
+
+    // --- IF_LANE: extract/replace lane instructions ---
+    // i8x16 lanes (0..15)
+    TEST_EXTRACT_LANE(v128Ones, INS_i8x16_extract_lane_s, EA_1BYTE, 0);
+    TEST_EXTRACT_LANE(v128Ones, INS_i8x16_extract_lane_u, EA_1BYTE, 15);
+    TEST_REPLACE_LANE_I32(v128Ones, INS_i8x16_replace_lane, EA_1BYTE, 7);
+
+    // i16x8 lanes (0..7)
+    TEST_EXTRACT_LANE(v128Ones, INS_i16x8_extract_lane_s, EA_2BYTE, 0);
+    TEST_EXTRACT_LANE(v128Ones, INS_i16x8_extract_lane_u, EA_2BYTE, 7);
+    TEST_REPLACE_LANE_I32(v128Ones, INS_i16x8_replace_lane, EA_2BYTE, 3);
+
+    // i32x4 lanes (0..3)
+    TEST_EXTRACT_LANE(v128Ones, INS_i32x4_extract_lane, EA_4BYTE, 0);
+    TEST_REPLACE_LANE_I32(v128Ones, INS_i32x4_replace_lane, EA_4BYTE, 3);
+
+    // i64x2 lanes (0..1)
+    TEST_EXTRACT_LANE(v128Ones, INS_i64x2_extract_lane, EA_8BYTE, 0);
+    TEST_REPLACE_LANE_I64(v128Ones, INS_i64x2_replace_lane, EA_8BYTE, 1);
+
+    // f32x4 lanes (0..3)
+    TEST_EXTRACT_LANE(v128Ones, INS_f32x4_extract_lane, EA_4BYTE, 3);
+    TEST_REPLACE_LANE_F32(v128Ones, INS_f32x4_replace_lane, EA_4BYTE, 0);
+
+    // f64x2 lanes (0..1)
+    TEST_EXTRACT_LANE(v128Ones, INS_f64x2_extract_lane, EA_8BYTE, 0);
+    TEST_REPLACE_LANE_F64(v128Ones, INS_f64x2_replace_lane, EA_8BYTE, 1);
+
+    // --- IF_MEMARG_LANE: load/store lane with memarg ---
+    TEST_LOAD_LANE(v128Ones, INS_v128_load8_lane, EA_1BYTE, 0, 5);
+    TEST_LOAD_LANE(v128Ones, INS_v128_load16_lane, EA_2BYTE, 16, 3);
+    TEST_LOAD_LANE(v128Ones, INS_v128_load32_lane, EA_4BYTE, 64, 2);
+    TEST_LOAD_LANE(v128Ones, INS_v128_load64_lane, EA_8BYTE, 128, 1);
+    TEST_STORE_LANE(v128Ones, INS_v128_store8_lane, EA_1BYTE, 0, 0);
+    TEST_STORE_LANE(v128Ones, INS_v128_store16_lane, EA_2BYTE, 8, 7);
+    TEST_STORE_LANE(v128Ones, INS_v128_store32_lane, EA_4BYTE, 32, 1);
+    TEST_STORE_LANE(v128Ones, INS_v128_store64_lane, EA_8BYTE, 256, 0);
+
+    // --- IF_V128: i8x16.shuffle with 16 lane-index bytes ---
+    // Identity shuffle
+    const uint8_t identityShuffle[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    TEST_SHUFFLE(v128Bytes, identityShuffle);
+
+    // Reverse bytes
+    const uint8_t reverseShuffle[16] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+    TEST_SHUFFLE(v128Bytes, reverseShuffle);
+
+    // Cross-operand shuffle (indices 16..31 refer to the second operand)
+    const uint8_t crossShuffle[16] = {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31};
+    TEST_SHUFFLE(v128Bytes, crossShuffle);
+
+    // --- IF_OPCODE: plain opcode SIMD instructions (representative sample) ---
+    // Splat operations: push scalar, splat to v128, drop
+    PUSH_I32(1);
+    emit->emitIns(INS_i8x16_splat);
+    DROP;
+
+    PUSH_I32(2);
+    emit->emitIns(INS_i16x8_splat);
+    DROP;
+
+    PUSH_I32(3);
+    emit->emitIns(INS_i32x4_splat);
+    DROP;
+
+    PUSH_I64(4);
+    emit->emitIns(INS_i64x2_splat);
+    DROP;
+
+    PUSH_F32(0);
+    emit->emitIns(INS_f32x4_splat);
+    DROP;
+
+    PUSH_F64(0);
+    emit->emitIns(INS_f64x2_splat);
+    DROP;
+
+    // Swizzle: [v128, v128] -> v128
+    TEST_BINARY_V128(v128Ones, INS_i8x16_swizzle);
+
+    // A few comparisons: [v128, v128] -> v128
+    TEST_BINARY_V128(v128Ones, INS_i8x16_eq);
+    TEST_BINARY_V128(v128Ones, INS_i32x4_ne);
+    TEST_BINARY_V128(v128Ones, INS_f64x2_lt);
+
+    // A few arithmetic ops
+    TEST_BINARY_V128(v128Ones, INS_i8x16_add);
+    TEST_BINARY_V128(v128Ones, INS_i32x4_mul);
+    TEST_UNARY_V128(v128Ones, INS_f32x4_sqrt);
+    TEST_UNARY_V128(v128Ones, INS_f64x2_neg);
+
+    // Bitwise ops
+    TEST_UNARY_V128(v128Ones, INS_v128_not);
+    TEST_BINARY_V128(v128Ones, INS_v128_and);
+    TEST_BINARY_V128(v128Ones, INS_v128_or);
+    TEST_BINARY_V128(v128Ones, INS_v128_xor);
+    TEST_BINARY_V128(v128Ones, INS_v128_andnot);
+
+    // Bitmask / any_true / all_true: v128 -> i32
+    TEST_UNARY_V128(v128Ones, INS_v128_any_true);
+    TEST_UNARY_V128(v128Ones, INS_i8x16_all_true);
+    TEST_UNARY_V128(v128Ones, INS_i32x4_bitmask);
+
+    // Conversion operations: v128 -> v128
+    TEST_UNARY_V128(v128Ones, INS_f32x4_convert_s_i32x4);
+    TEST_UNARY_V128(v128Ones, INS_f64x2_convert_low_u_i32x4);
+    TEST_UNARY_V128(v128Ones, INS_i32x4_trunc_sat_s_f32x4);
+
+#undef PUSH_V128
+#undef PUSH_I32
+#undef PUSH_I64
+#undef PUSH_F32
+#undef PUSH_F64
+#undef DROP
+#undef TEST_UNARY_V128
+#undef TEST_BINARY_V128
+#undef TEST_EXTRACT_LANE
+#undef TEST_REPLACE_LANE_I32
+#undef TEST_REPLACE_LANE_I64
+#undef TEST_REPLACE_LANE_F32
+#undef TEST_REPLACE_LANE_F64
+#undef TEST_LOAD_LANE
+#undef TEST_STORE_LANE
+#undef TEST_SHUFFLE
+}
+
+#endif // defined(DEBUG)
+
 void CodeGen::genCreateAndStoreGCInfo(unsigned codeSize, unsigned prologSize, unsigned epilogSize DEBUGARG(void* code))
 {
     IAllocator*    allowZeroAlloc = new (m_compiler, CMK_GC) CompIAllocator(m_compiler->getAllocatorGC());
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 08326526046229..014d5f962e3013 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -689,6 +689,9 @@ class emitter
         // TODO-LoongArch64: not include SIMD-vector.
         static_assert(INS_count <= 512);
         instruction _idIns : 9;
+#elif defined(TARGET_WASM)
+        static_assert(INS_count <= 512);
+        instruction _idIns : 9;
 #else
         static_assert(INS_count <= 256);
         instruction _idIns : 8;
@@ -1321,6 +1324,17 @@ class emitter
         {
             return _idInsFmt == IF_TRY_TABLE;
         }
+
+        bool idIsV128Imm() const
+        {
+            return _idInsFmt == IF_V128; // such descriptors are allocated as instrDescV128Imm
+        }
+
+        bool idIsMemargLaneImm() const
+        {
+            return _idInsFmt == IF_MEMARG_LANE; // such descriptors are allocated as instrDescMemargLane
+        }
+
 #endif
 
 #ifdef TARGET_ARM64
@@ -2414,6 +2428,41 @@ class emitter
             imm = i;
         }
     };
+
+    struct instrDescV128Imm : instrDesc
+    {
+        instrDescV128Imm() = delete;
+
+        uint8_t v128Bytes[16]; // immediate bytes, written verbatim after the opcode
+
+        void idV128Const(const uint8_t bytes[16])
+        {
+            assert(bytes != nullptr);
+            memcpy(v128Bytes, bytes, sizeof(v128Bytes));
+        }
+
+        const uint8_t* idV128Const() const
+        {
+            return &v128Bytes[0];
+        }
+    };
+
+    struct instrDescMemargLane : instrDescCns
+    {
+        instrDescMemargLane() = delete;
+
+        uint8_t lane; // lane index immediate; the encoding places it after the memarg
+
+        void idLaneIdx(uint8_t idx)
+        {
+            lane = idx;
+        }
+
+        uint8_t idLaneIdx() const
+        {
+            return lane;
+        }
+    };
 #endif // TARGET_WASM
 
 #ifdef TARGET_RISCV64
diff --git a/src/coreclr/jit/emitfmtswasm.h b/src/coreclr/jit/emitfmtswasm.h
index 9d65047ed0a510..0f052e198f2f07 100644
--- a/src/coreclr/jit/emitfmtswasm.h
+++ b/src/coreclr/jit/emitfmtswasm.h
@@ -46,6 +46,9 @@ IF_DEF(CALL_INDIRECT, IS_NONE, NONE) // <opcode> <ULEB128 immediate> <ULEB128 im
 IF_DEF(MEMIDX_MEMIDX, IS_NONE, NONE) // <memory index> <memory index>
 IF_DEF(TRY_TABLE,     IS_NONE, NONE) // <opcode> <sig = 0x40> <len = 0x01>
 IF_DEF(CATCH_DECL,    IS_NONE, NONE) // <catch-type> <ULEB128 immediate (type reloc)> <ULEB128 immediate>
+IF_DEF(V128,          IS_NONE, NONE) // <opcode> <16 raw bytes>
+IF_DEF(LANE,          IS_NONE, NONE) // <opcode> <u8 lane index>
+IF_DEF(MEMARG_LANE,   IS_NONE, NONE) // <opcode> <memarg> <u8 lane index>
 
 #undef IF_DEF
 #endif // !DEFINE_ID_OPS
diff --git a/src/coreclr/jit/emitwasm.cpp b/src/coreclr/jit/emitwasm.cpp
index e897fd1126a255..46e21686deb1a9 100644
--- a/src/coreclr/jit/emitwasm.cpp
+++ b/src/coreclr/jit/emitwasm.cpp
@@ -17,6 +17,44 @@
 };
 // clang-format on
 
+bool isValidSimdElemSize(unsigned elemSize)
+{
+    // Lane widths of i8x16, i16x8, i32x4/f32x4 and i64x2/f64x2: the powers of two from 1 to 8 bytes.
+    return (elemSize != 0) && (elemSize <= 8) && ((elemSize & (elemSize - 1)) == 0);
+}
+
+//------------------------------------------------------------------------
+// isValidVectorIndex: Check that a lane index is in range for a SIMD element size.
+// Arguments:
+//   elemSize - element size in bytes (1, 2, 4, or 8)
+//   index    - lane index to validate; valid values are 0 .. (16 / elemSize) - 1
+//
+bool emitter::isValidVectorIndex(uint8_t elemSize, uint8_t index)
+{
+    assert(isValidSimdElemSize(elemSize));
+
+    bool isValid = false;
+    switch (elemSize)
+    {
+        case 1:
+            isValid = (index < 16);
+            break;
+        case 2:
+            isValid = (index < 8);
+            break;
+        case 4:
+            isValid = (index < 4);
+            break;
+        case 8:
+            isValid = (index < 2);
+            break;
+        default:
+            unreached();
+    }
+
+    return isValid;
+}
+
 void emitter::emitIns(instruction ins)
 {
     instrDesc* id  = emitNewInstrSmall(EA_8BYTE);
@@ -374,6 +412,109 @@ unsigned int emitter::emitGetValTypeImmImm(const instrDesc* id)
     return static_cast<const instrDescValTypeImm*>(id)->imm;
 }
 
+const uint8_t* emitter::emitGetV128ImmValue(const instrDesc* id)
+{
+    assert(id->idIsV128Imm()); // the downcast below is only valid for IF_V128 descriptors
+    return static_cast<const instrDescV128Imm*>(id)->idV128Const();
+}
+
+uint8_t emitter::emitGetLaneImmValue(const instrDesc* id)
+{
+    // Load/store-lane descriptors carry the index in a dedicated field.
+    if (id->idIsMemargLaneImm())
+    {
+        return static_cast<const instrDescMemargLane*>(id)->idLaneIdx();
+    }
+
+    // Every other caller must pass an IF_LANE descriptor, whose index is its constant.
+    if (id->idInsFmt() != IF_LANE)
+    {
+        unreached();
+        return 0;
+    }
+
+    const cnsval_size_t laneCns = emitGetInsSC(id);
+    assert(FitsIn<uint8_t>(laneCns));
+    return static_cast<uint8_t>(laneCns);
+}
+
+//------------------------------------------------------------------------
+// Packed SIMD instruction emit functions
+//------------------------------------------------------------------------
+
+//------------------------------------------------------------------------
+// emitIns_V128Imm: Emit a packed SIMD instruction with a 16 byte vector immediate.
+//
+// Arguments:
+//   ins   - instruction whose format is IF_V128 (INS_v128_const or INS_i8x16_shuffle)
+//   bytes - the 16 immediate bytes (constant data or shuffle lane indices); copied into the descriptor
+//
+void emitter::emitIns_V128Imm(instruction ins, const uint8_t bytes[16])
+{
+    assert(bytes != nullptr);
+    instrDescV128Imm* id  = static_cast<instrDescV128Imm*>(emitAllocAnyInstr(sizeof(instrDescV128Imm), EA_16BYTE));
+    insFormat         fmt = emitInsFormat(ins);
+    assert(fmt == IF_V128);
+
+    id->idInsFmt(fmt);
+    id->idIns(ins);
+    id->idV128Const(bytes);
+
+    dispIns(id);
+    appendToCurIG(id);
+}
+
+//------------------------------------------------------------------------
+// emitIns_Lane: Emit a SIMD extract/replace lane instruction (format IF_LANE).
+//
+// Arguments:
+//   ins     - instruction (e.g., INS_i8x16_extract_lane_s)
+//   attr    - emit attribute indicating the lane element size
+//   laneIdx - lane index byte; asserted valid for the instruction's SIMD element size
+//
+void emitter::emitIns_Lane(instruction ins, emitAttr attr, uint8_t laneIdx)
+{
+    instrDesc* id       = emitNewInstrSC(attr, laneIdx);
+    insFormat  fmt      = emitInsFormat(ins);
+    uint8_t    elemSize = CodeGenInterface::instSimdElemSize(ins);
+    assert(fmt == IF_LANE);
+    assert(isValidVectorIndex(elemSize, laneIdx));
+
+    id->idInsFmt(fmt);
+    id->idIns(ins);
+
+    dispIns(id);
+    appendToCurIG(id);
+}
+
+//------------------------------------------------------------------------
+// emitIns_MemargLane: Emit a SIMD load/store lane instruction with memarg + lane index.
+//
+// Arguments:
+//   ins     - instruction (e.g., INS_v128_load8_lane)
+//   attr    - emit attribute indicating the memory access size
+//   offset  - memory offset for the memarg; asserted to be non-negative
+//   laneIdx - lane index byte; asserted valid for the instruction's SIMD element size
+//
+void emitter::emitIns_MemargLane(instruction ins, emitAttr attr, cnsval_ssize_t offset, uint8_t laneIdx)
+{
+    instrDescMemargLane* id  = static_cast<instrDescMemargLane*>(emitAllocAnyInstr(sizeof(instrDescMemargLane), attr));
+    insFormat            fmt = emitInsFormat(ins);
+    uint8_t              elemSize = CodeGenInterface::instSimdElemSize(ins);
+    assert(fmt == IF_MEMARG_LANE);
+    assert(offset >= 0);
+    assert(isValidVectorIndex(elemSize, laneIdx));
+
+    id->idInsFmt(fmt);
+    id->idIns(ins);
+    id->idcCnsVal = offset;
+    id->idSetIsLargeCns();
+    id->idLaneIdx(laneIdx);
+
+    dispIns(id);
+    appendToCurIG(id);
+}
+
 emitter::insFormat emitter::emitInsFormat(instruction ins)
 {
     static_assert(IF_COUNT < 255);
@@ -425,6 +566,16 @@ size_t emitter::emitSizeOfInsDsc(instrDesc* id) const
         return SMALL_IDSC_SIZE;
     }
 
+    if (id->idIsMemargLaneImm())
+    {
+        return sizeof(instrDescMemargLane);
+    }
+
+    if (id->idIsV128Imm())
+    {
+        return sizeof(instrDescV128Imm);
+    }
+
     if (id->idIsLargeCns())
     {
         assert(!id->idIsLargeDsp());
@@ -576,6 +727,21 @@ unsigned emitter::instrDesc::idCodeSize() const
             size += SizeOfULEB128(emitGetInsSC(this)); // control flow stack offset
             break;
         }
+        case IF_V128:
+            size += 16; // 16 raw bytes for the v128 constant
+            break;
+        case IF_LANE:
+            size += 1; // 1 byte lane index
+            break;
+        case IF_MEMARG_LANE:
+        {
+            uint64_t align = emitGetAlignHintLog2(this);
+            assert(align < 64); // spec says align > 2^6 produces a memidx for multiple memories.
+            size += SizeOfULEB128(align);
+            size += idIsCnsReloc() ? PADDED_RELOC_SIZE : SizeOfULEB128(emitGetInsSC(this));
+            size += 1; // 1 byte lane index
+            break;
+        }
         default:
             unreached();
     }
@@ -897,6 +1063,32 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             dst += emitOutputULEB128Padded(dst, (int64_t)size);
             break;
         }
+        case IF_V128:
+        {
+            dst += emitOutputOpcode(dst, ins);
+            const uint8_t* v128Value = emitGetV128ImmValue(id);
+            dst += emitRawBytes(dst, v128Value, 16);
+            break;
+        }
+        case IF_LANE:
+        {
+            dst += emitOutputOpcode(dst, ins);
+            uint8_t laneIdx = emitGetLaneImmValue(id);
+            dst += emitOutputByte(dst, laneIdx);
+            break;
+        }
+        case IF_MEMARG_LANE:
+        {
+            dst += emitOutputOpcode(dst, ins);
+            uint8_t  laneIdx = emitGetLaneImmValue(id);
+            uint64_t align   = emitGetAlignHintLog2(id);
+            uint64_t offset  = emitGetInsSC(id);
+            assert(align < 64);
+            dst += emitOutputULEB128(dst, align);
+            dst += emitOutputULEB128(dst, offset);
+            dst += emitOutputByte(dst, laneIdx);
+            break;
+        }
         default:
             NYI_WASM("emitOutputInstr");
             break;
@@ -1182,6 +1374,35 @@ void emitter::emitDispIns(
         }
         break;
 
+        case IF_V128:
+        {
+            const uint8_t* imm = emitGetV128ImmValue(id);
+            for (int i = 0; i < 16; i++)
+            {
+                printf(" 0x%02x", imm[i]);
+            }
+        }
+        break;
+
+        case IF_LANE:
+        {
+            uint8_t lane = emitGetLaneImmValue(id);
+            printf(" [%u]", (uint8_t)lane);
+        }
+        break;
+
+        case IF_MEMARG_LANE:
+        {
+            unsigned       log2align = emitGetAlignHintLog2(id);
+            cnsval_ssize_t offset    = emitGetInsSC(id);
+            printf(" %u %llu", log2align, (uint64_t)offset);
+            dispLclVarInfoIfAny();
+
+            uint8_t lane = emitGetLaneImmValue(id);
+            printf(" [%u]", (uint8_t)lane);
+        }
+        break;
+
         default:
             unreached();
     }
diff --git a/src/coreclr/jit/emitwasm.h b/src/coreclr/jit/emitwasm.h
index 030c73f7550889..b5bbff12e7cc8d 100644
--- a/src/coreclr/jit/emitwasm.h
+++ b/src/coreclr/jit/emitwasm.h
@@ -16,6 +16,7 @@ void emitDispInst(instruction ins);
 /************************************************************************/
 
 public:
+static bool isValidVectorIndex(uint8_t elemsize, uint8_t index); // stateless lane-range check; needs no emitter instance
 void emitIns(instruction ins);
 void emitIns_BlockTy(instruction ins, WasmValueType valType = WasmValueType::Invalid);
 void emitIns_I(instruction ins, emitAttr attr, cnsval_ssize_t imm);
@@ -31,6 +32,11 @@ void emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2)
 
 void emitIns_S_R(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs);
 
+// Packed SIMD instruction emit functions
+void emitIns_V128Imm(instruction ins, const uint8_t bytes[16]);
+void emitIns_Lane(instruction ins, emitAttr attr, uint8_t laneIdx);
+void emitIns_MemargLane(instruction ins, emitAttr attr, cnsval_ssize_t offset, uint8_t laneIdx);
+
 void emitAddressConstant(void* address);
 void emitFuncletAddressConstant(cnsval_ssize_t funcletId);
 
@@ -47,6 +53,9 @@ instrDesc*           emitNewInstrValTypeImm(emitAttr attr, WasmValueType type, u
 static WasmValueType emitGetValTypeImmType(const instrDesc* id);
 static unsigned int  emitGetValTypeImmImm(const instrDesc* id);
 
+static const uint8_t* emitGetV128ImmValue(const instrDesc* id); // static like the getters above: reads only `id`
+static uint8_t        emitGetLaneImmValue(const instrDesc* id); // static like the getters above: reads only `id`
+
 /************************************************************************/
 /*  Private members that deal with target-dependent instr. descriptors  */
 /************************************************************************/
diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
index 8e7451a9bc1a86..500e94832339cc 100644
--- a/src/coreclr/jit/instr.cpp
+++ b/src/coreclr/jit/instr.cpp
@@ -598,6 +598,14 @@ bool CodeGenInterface::instHasPseudoName(instruction ins)
 }
 #endif // TARGET_XARCH
 
+#if defined(TARGET_WASM)
+uint8_t CodeGenInterface::instSimdElemSize(instruction ins)
+{
+    assert(static_cast<unsigned>(ins) < ArrLen(instInfo)); // guard the table lookup below
+    return static_cast<uint8_t>(instInfo[ins] >> InstInfoElemSizeShift); // shift out the low flag bit
+}
+#endif
+
 /*****************************************************************************
  *
  *  Generate a set instruction.
diff --git a/src/coreclr/jit/instrswasm.h b/src/coreclr/jit/instrswasm.h
index 9982598dfdbb5b..90e87b177e7cba 100644
--- a/src/coreclr/jit/instrswasm.h
+++ b/src/coreclr/jit/instrswasm.h
@@ -244,6 +244,288 @@ INST2(i64_trunc_sat_f64_u, "i64.trunc_sat_f64_u", 0, IF_OPCODE,  0xFC, 7)
 INST2(memory_copy,         "memory.copy",         0, IF_MEMIDX_MEMIDX, 0xFC, 10)
 INST2(memory_fill,         "memory.fill",         0, IF_ULEB128,       0xFC, 11)
 
+// 5.4.9 Vector Instructions (SIMD, 0xFD prefix)
+//
+// Memory operations (memarg format): v128.load/store, loadNxM_{s,u} and loadN_splat; opcodes 0-11
+INST2(v128_load,            "v128.load",            0, IF_MEMARG,      0xFD, 0)
+INST2(v128_load8x8_s,      "v128.load8x8_s",      0, IF_MEMARG,      0xFD, 1)
+INST2(v128_load8x8_u,      "v128.load8x8_u",      0, IF_MEMARG,      0xFD, 2)
+INST2(v128_load16x4_s,     "v128.load16x4_s",     0, IF_MEMARG,      0xFD, 3)
+INST2(v128_load16x4_u,     "v128.load16x4_u",     0, IF_MEMARG,      0xFD, 4)
+INST2(v128_load32x2_s,     "v128.load32x2_s",     0, IF_MEMARG,      0xFD, 5)
+INST2(v128_load32x2_u,     "v128.load32x2_u",     0, IF_MEMARG,      0xFD, 6)
+INST2(v128_load8_splat,    "v128.load8_splat",    0, IF_MEMARG,      0xFD, 7)
+INST2(v128_load16_splat,   "v128.load16_splat",   0, IF_MEMARG,      0xFD, 8)
+INST2(v128_load32_splat,   "v128.load32_splat",   0, IF_MEMARG,      0xFD, 9)
+INST2(v128_load64_splat,   "v128.load64_splat",   0, IF_MEMARG,      0xFD, 10)
+INST2(v128_store,           "v128.store",           0, IF_MEMARG,      0xFD, 11)
+
+// v128.const and i8x16.shuffle (special IF_V128 format), opcodes 12-13
+INST2(v128_const,           "v128.const",           0, IF_V128,  0xFD, 12)
+INST2(i8x16_shuffle,       "i8x16.shuffle",       0, IF_V128,     0xFD, 13)
+
+// i8x16.swizzle (plain IF_OPCODE format, unlike i8x16.shuffle), opcode 14
+INST2(i8x16_swizzle,       "i8x16.swizzle",       0, IF_OPCODE,      0xFD, 14)
+
+// Splat operations, one per lane shape (opcodes 15-20)
+INST2(i8x16_splat,         "i8x16.splat",         0, IF_OPCODE,      0xFD, 15)
+INST2(i16x8_splat,         "i16x8.splat",         0, IF_OPCODE,      0xFD, 16)
+INST2(i32x4_splat,         "i32x4.splat",         0, IF_OPCODE,      0xFD, 17)
+INST2(i64x2_splat,         "i64x2.splat",         0, IF_OPCODE,      0xFD, 18)
+INST2(f32x4_splat,         "f32x4.splat",         0, IF_OPCODE,      0xFD, 19)
+INST2(f64x2_splat,         "f64x2.splat",         0, IF_OPCODE,      0xFD, 20)
+
+// Extract/replace lane operations (lane index byte); info byte is (elemSize in bytes << 1), read by instSimdElemSize
+INST2(i8x16_extract_lane_s, "i8x16.extract_lane_s", 2, IF_LANE,     0xFD, 21)
+INST2(i8x16_extract_lane_u, "i8x16.extract_lane_u", 2, IF_LANE,     0xFD, 22)
+INST2(i8x16_replace_lane,   "i8x16.replace_lane",   2, IF_LANE,     0xFD, 23)
+INST2(i16x8_extract_lane_s, "i16x8.extract_lane_s", 4, IF_LANE,     0xFD, 24)
+INST2(i16x8_extract_lane_u, "i16x8.extract_lane_u", 4, IF_LANE,     0xFD, 25)
+INST2(i16x8_replace_lane,   "i16x8.replace_lane",   4, IF_LANE,     0xFD, 26)
+INST2(i32x4_extract_lane,   "i32x4.extract_lane",   8, IF_LANE,     0xFD, 27)
+INST2(i32x4_replace_lane,   "i32x4.replace_lane",   8, IF_LANE,     0xFD, 28)
+INST2(i64x2_extract_lane,   "i64x2.extract_lane",   16, IF_LANE,     0xFD, 29)
+INST2(i64x2_replace_lane,   "i64x2.replace_lane",   16, IF_LANE,     0xFD, 30)
+INST2(f32x4_extract_lane,   "f32x4.extract_lane",   8, IF_LANE,     0xFD, 31)
+INST2(f32x4_replace_lane,   "f32x4.replace_lane",   8, IF_LANE,     0xFD, 32)
+INST2(f64x2_extract_lane,   "f64x2.extract_lane",   16, IF_LANE,     0xFD, 33)
+INST2(f64x2_replace_lane,   "f64x2.replace_lane",   16, IF_LANE,     0xFD, 34)
+
+// i8x16 comparisons (opcodes 35-44)
+INST2(i8x16_eq,             "i8x16.eq",             0, IF_OPCODE,    0xFD, 35)
+INST2(i8x16_ne,             "i8x16.ne",             0, IF_OPCODE,    0xFD, 36)
+INST2(i8x16_lt_s,           "i8x16.lt_s",           0, IF_OPCODE,    0xFD, 37)
+INST2(i8x16_lt_u,           "i8x16.lt_u",           0, IF_OPCODE,    0xFD, 38)
+INST2(i8x16_gt_s,           "i8x16.gt_s",           0, IF_OPCODE,    0xFD, 39)
+INST2(i8x16_gt_u,           "i8x16.gt_u",           0, IF_OPCODE,    0xFD, 40)
+INST2(i8x16_le_s,           "i8x16.le_s",           0, IF_OPCODE,    0xFD, 41)
+INST2(i8x16_le_u,           "i8x16.le_u",           0, IF_OPCODE,    0xFD, 42)
+INST2(i8x16_ge_s,           "i8x16.ge_s",           0, IF_OPCODE,    0xFD, 43)
+INST2(i8x16_ge_u,           "i8x16.ge_u",           0, IF_OPCODE,    0xFD, 44)
+
+// i16x8 comparisons (opcodes 45-54)
+INST2(i16x8_eq,             "i16x8.eq",             0, IF_OPCODE,    0xFD, 45)
+INST2(i16x8_ne,             "i16x8.ne",             0, IF_OPCODE,    0xFD, 46)
+INST2(i16x8_lt_s,           "i16x8.lt_s",           0, IF_OPCODE,    0xFD, 47)
+INST2(i16x8_lt_u,           "i16x8.lt_u",           0, IF_OPCODE,    0xFD, 48)
+INST2(i16x8_gt_s,           "i16x8.gt_s",           0, IF_OPCODE,    0xFD, 49)
+INST2(i16x8_gt_u,           "i16x8.gt_u",           0, IF_OPCODE,    0xFD, 50)
+INST2(i16x8_le_s,           "i16x8.le_s",           0, IF_OPCODE,    0xFD, 51)
+INST2(i16x8_le_u,           "i16x8.le_u",           0, IF_OPCODE,    0xFD, 52)
+INST2(i16x8_ge_s,           "i16x8.ge_s",           0, IF_OPCODE,    0xFD, 53)
+INST2(i16x8_ge_u,           "i16x8.ge_u",           0, IF_OPCODE,    0xFD, 54)
+
+// i32x4 comparisons (opcodes 55-64)
+INST2(i32x4_eq,             "i32x4.eq",             0, IF_OPCODE,    0xFD, 55)
+INST2(i32x4_ne,             "i32x4.ne",             0, IF_OPCODE,    0xFD, 56)
+INST2(i32x4_lt_s,           "i32x4.lt_s",           0, IF_OPCODE,    0xFD, 57)
+INST2(i32x4_lt_u,           "i32x4.lt_u",           0, IF_OPCODE,    0xFD, 58)
+INST2(i32x4_gt_s,           "i32x4.gt_s",           0, IF_OPCODE,    0xFD, 59)
+INST2(i32x4_gt_u,           "i32x4.gt_u",           0, IF_OPCODE,    0xFD, 60)
+INST2(i32x4_le_s,           "i32x4.le_s",           0, IF_OPCODE,    0xFD, 61)
+INST2(i32x4_le_u,           "i32x4.le_u",           0, IF_OPCODE,    0xFD, 62)
+INST2(i32x4_ge_s,           "i32x4.ge_s",           0, IF_OPCODE,    0xFD, 63)
+INST2(i32x4_ge_u,           "i32x4.ge_u",           0, IF_OPCODE,    0xFD, 64)
+
+// i64x2 comparisons (no unsigned variants listed; opcodes 214-219, grouped here out of numeric order)
+INST2(i64x2_eq,             "i64x2.eq",             0, IF_OPCODE,    0xFD, 214)
+INST2(i64x2_ne,             "i64x2.ne",             0, IF_OPCODE,    0xFD, 215)
+INST2(i64x2_lt_s,           "i64x2.lt_s",           0, IF_OPCODE,    0xFD, 216)
+INST2(i64x2_gt_s,           "i64x2.gt_s",           0, IF_OPCODE,    0xFD, 217)
+INST2(i64x2_le_s,           "i64x2.le_s",           0, IF_OPCODE,    0xFD, 218)
+INST2(i64x2_ge_s,           "i64x2.ge_s",           0, IF_OPCODE,    0xFD, 219)
+
+// f32x4 comparisons (opcodes 65-70)
+INST2(f32x4_eq,             "f32x4.eq",             0, IF_OPCODE,    0xFD, 65)
+INST2(f32x4_ne,             "f32x4.ne",             0, IF_OPCODE,    0xFD, 66)
+INST2(f32x4_lt,             "f32x4.lt",             0, IF_OPCODE,    0xFD, 67)
+INST2(f32x4_gt,             "f32x4.gt",             0, IF_OPCODE,    0xFD, 68)
+INST2(f32x4_le,             "f32x4.le",             0, IF_OPCODE,    0xFD, 69)
+INST2(f32x4_ge,             "f32x4.ge",             0, IF_OPCODE,    0xFD, 70)
+
+// f64x2 comparisons (opcodes 71-76)
+INST2(f64x2_eq,             "f64x2.eq",             0, IF_OPCODE,    0xFD, 71)
+INST2(f64x2_ne,             "f64x2.ne",             0, IF_OPCODE,    0xFD, 72)
+INST2(f64x2_lt,             "f64x2.lt",             0, IF_OPCODE,    0xFD, 73)
+INST2(f64x2_gt,             "f64x2.gt",             0, IF_OPCODE,    0xFD, 74)
+INST2(f64x2_le,             "f64x2.le",             0, IF_OPCODE,    0xFD, 75)
+INST2(f64x2_ge,             "f64x2.ge",             0, IF_OPCODE,    0xFD, 76)
+
+// v128 bitwise operations, bitselect and any_true (opcodes 77-83)
+INST2(v128_not,             "v128.not",             0, IF_OPCODE,    0xFD, 77)
+INST2(v128_and,             "v128.and",             0, IF_OPCODE,    0xFD, 78)
+INST2(v128_andnot,          "v128.andnot",          0, IF_OPCODE,    0xFD, 79)
+INST2(v128_or,              "v128.or",              0, IF_OPCODE,    0xFD, 80)
+INST2(v128_xor,             "v128.xor",             0, IF_OPCODE,    0xFD, 81)
+INST2(v128_bitselect,       "v128.bitselect",       0, IF_OPCODE,    0xFD, 82)
+INST2(v128_any_true,        "v128.any_true",        0, IF_OPCODE,    0xFD, 83)
+
+// Load/store lane operations (memarg + lane index); info byte is (elemSize in bytes << 1); opcodes 84-91
+INST2(v128_load8_lane,     "v128.load8_lane",     2, IF_MEMARG_LANE, 0xFD, 84)
+INST2(v128_load16_lane,    "v128.load16_lane",    4, IF_MEMARG_LANE, 0xFD, 85)
+INST2(v128_load32_lane,    "v128.load32_lane",    8, IF_MEMARG_LANE, 0xFD, 86)
+INST2(v128_load64_lane,    "v128.load64_lane",    16, IF_MEMARG_LANE, 0xFD, 87)
+INST2(v128_store8_lane,    "v128.store8_lane",    2, IF_MEMARG_LANE, 0xFD, 88)
+INST2(v128_store16_lane,   "v128.store16_lane",   4, IF_MEMARG_LANE, 0xFD, 89)
+INST2(v128_store32_lane,   "v128.store32_lane",   8, IF_MEMARG_LANE, 0xFD, 90)
+INST2(v128_store64_lane,   "v128.store64_lane",   16, IF_MEMARG_LANE, 0xFD, 91)
+
+// Load zero operations (memarg format), opcodes 92-93
+INST2(v128_load32_zero,    "v128.load32_zero",    0, IF_MEMARG,      0xFD, 92)
+INST2(v128_load64_zero,    "v128.load64_zero",    0, IF_MEMARG,      0xFD, 93)
+
+// Conversion: demote/promote between f64x2 and f32x4 (no sign specifier), opcodes 94-95
+INST2(f32x4_demote_f64x2_zero,  "f32x4.demote_f64x2_zero",  0, IF_OPCODE, 0xFD, 94)
+INST2(f64x2_promote_low_f32x4,  "f64x2.promote_low_f32x4",  0, IF_OPCODE, 0xFD, 95)
+
+// i8x16 operations (opcodes 96-123); f32x4 rounding ops and f64x2 ceil/floor/trunc are interleaved in opcode order
+INST2(i8x16_abs,            "i8x16.abs",            0, IF_OPCODE,    0xFD, 96)
+INST2(i8x16_neg,            "i8x16.neg",            0, IF_OPCODE,    0xFD, 97)
+INST2(i8x16_popcnt,         "i8x16.popcnt",         0, IF_OPCODE,    0xFD, 98)
+INST2(i8x16_all_true,       "i8x16.all_true",       0, IF_OPCODE,    0xFD, 99)
+INST2(i8x16_bitmask,        "i8x16.bitmask",        0, IF_OPCODE,    0xFD, 100)
+INST2(i8x16_narrow_i16x8_s, "i8x16.narrow_i16x8_s", 0, IF_OPCODE,  0xFD, 101)
+INST2(i8x16_narrow_i16x8_u, "i8x16.narrow_i16x8_u", 0, IF_OPCODE,  0xFD, 102)
+INST2(f32x4_ceil,           "f32x4.ceil",           0, IF_OPCODE,    0xFD, 103)
+INST2(f32x4_floor,          "f32x4.floor",          0, IF_OPCODE,    0xFD, 104)
+INST2(f32x4_trunc,          "f32x4.trunc",          0, IF_OPCODE,    0xFD, 105)
+INST2(f32x4_nearest,        "f32x4.nearest",        0, IF_OPCODE,    0xFD, 106)
+INST2(i8x16_shl,            "i8x16.shl",            0, IF_OPCODE,    0xFD, 107)
+INST2(i8x16_shr_s,          "i8x16.shr_s",          0, IF_OPCODE,    0xFD, 108)
+INST2(i8x16_shr_u,          "i8x16.shr_u",          0, IF_OPCODE,    0xFD, 109)
+INST2(i8x16_add,            "i8x16.add",            0, IF_OPCODE,    0xFD, 110)
+INST2(i8x16_add_sat_s,      "i8x16.add_sat_s",      0, IF_OPCODE,    0xFD, 111)
+INST2(i8x16_add_sat_u,      "i8x16.add_sat_u",      0, IF_OPCODE,    0xFD, 112)
+INST2(i8x16_sub,            "i8x16.sub",            0, IF_OPCODE,    0xFD, 113)
+INST2(i8x16_sub_sat_s,      "i8x16.sub_sat_s",      0, IF_OPCODE,    0xFD, 114)
+INST2(i8x16_sub_sat_u,      "i8x16.sub_sat_u",      0, IF_OPCODE,    0xFD, 115)
+INST2(f64x2_ceil,           "f64x2.ceil",           0, IF_OPCODE,    0xFD, 116)
+INST2(f64x2_floor,          "f64x2.floor",          0, IF_OPCODE,    0xFD, 117)
+INST2(i8x16_min_s,          "i8x16.min_s",          0, IF_OPCODE,    0xFD, 118)
+INST2(i8x16_min_u,          "i8x16.min_u",          0, IF_OPCODE,    0xFD, 119)
+INST2(i8x16_max_s,          "i8x16.max_s",          0, IF_OPCODE,    0xFD, 120)
+INST2(i8x16_max_u,          "i8x16.max_u",          0, IF_OPCODE,    0xFD, 121)
+INST2(f64x2_trunc,          "f64x2.trunc",          0, IF_OPCODE,    0xFD, 122)
+INST2(i8x16_avgr_u,         "i8x16.avgr_u",         0, IF_OPCODE,    0xFD, 123)
+
+// i16x8 operations (opcodes 124-159, none at 154); i32x4.extadd_pairwise_* and f64x2.nearest sit here in opcode order
+INST2(i16x8_extadd_pairwise_s_i8x16, "i16x8.extadd_pairwise_s_i8x16", 0, IF_OPCODE, 0xFD, 124)
+INST2(i16x8_extadd_pairwise_u_i8x16, "i16x8.extadd_pairwise_u_i8x16", 0, IF_OPCODE, 0xFD, 125)
+INST2(i32x4_extadd_pairwise_s_i16x8, "i32x4.extadd_pairwise_s_i16x8", 0, IF_OPCODE, 0xFD, 126)
+INST2(i32x4_extadd_pairwise_u_i16x8, "i32x4.extadd_pairwise_u_i16x8", 0, IF_OPCODE, 0xFD, 127)
+INST2(i16x8_abs,            "i16x8.abs",            0, IF_OPCODE,    0xFD, 128)
+INST2(i16x8_neg,            "i16x8.neg",            0, IF_OPCODE,    0xFD, 129)
+INST2(i16x8_q15mulr_sat_s,  "i16x8.q15mulr_sat_s",  0, IF_OPCODE,    0xFD, 130)
+INST2(i16x8_all_true,       "i16x8.all_true",       0, IF_OPCODE,    0xFD, 131)
+INST2(i16x8_bitmask,        "i16x8.bitmask",        0, IF_OPCODE,    0xFD, 132)
+INST2(i16x8_narrow_i32x4_s, "i16x8.narrow_i32x4_s", 0, IF_OPCODE,  0xFD, 133)
+INST2(i16x8_narrow_i32x4_u, "i16x8.narrow_i32x4_u", 0, IF_OPCODE,  0xFD, 134)
+INST2(i16x8_extend_low_s_i8x16,  "i16x8.extend_low_s_i8x16",  0, IF_OPCODE, 0xFD, 135)
+INST2(i16x8_extend_high_s_i8x16, "i16x8.extend_high_s_i8x16", 0, IF_OPCODE, 0xFD, 136)
+INST2(i16x8_extend_low_u_i8x16,  "i16x8.extend_low_u_i8x16",  0, IF_OPCODE, 0xFD, 137)
+INST2(i16x8_extend_high_u_i8x16, "i16x8.extend_high_u_i8x16", 0, IF_OPCODE, 0xFD, 138)
+INST2(i16x8_shl,            "i16x8.shl",            0, IF_OPCODE,    0xFD, 139)
+INST2(i16x8_shr_s,          "i16x8.shr_s",          0, IF_OPCODE,    0xFD, 140)
+INST2(i16x8_shr_u,          "i16x8.shr_u",          0, IF_OPCODE,    0xFD, 141)
+INST2(i16x8_add,            "i16x8.add",            0, IF_OPCODE,    0xFD, 142)
+INST2(i16x8_add_sat_s,      "i16x8.add_sat_s",      0, IF_OPCODE,    0xFD, 143)
+INST2(i16x8_add_sat_u,      "i16x8.add_sat_u",      0, IF_OPCODE,    0xFD, 144)
+INST2(i16x8_sub,            "i16x8.sub",            0, IF_OPCODE,    0xFD, 145)
+INST2(i16x8_sub_sat_s,      "i16x8.sub_sat_s",      0, IF_OPCODE,    0xFD, 146)
+INST2(i16x8_sub_sat_u,      "i16x8.sub_sat_u",      0, IF_OPCODE,    0xFD, 147)
+INST2(f64x2_nearest,        "f64x2.nearest",        0, IF_OPCODE,    0xFD, 148)
+INST2(i16x8_mul,            "i16x8.mul",            0, IF_OPCODE,    0xFD, 149)
+INST2(i16x8_min_s,          "i16x8.min_s",          0, IF_OPCODE,    0xFD, 150)
+INST2(i16x8_min_u,          "i16x8.min_u",          0, IF_OPCODE,    0xFD, 151)
+INST2(i16x8_max_s,          "i16x8.max_s",          0, IF_OPCODE,    0xFD, 152)
+INST2(i16x8_max_u,          "i16x8.max_u",          0, IF_OPCODE,    0xFD, 153)
+INST2(i16x8_avgr_u,         "i16x8.avgr_u",         0, IF_OPCODE,    0xFD, 155)
+INST2(i16x8_extmul_low_s_i8x16,  "i16x8.extmul_low_s_i8x16",  0, IF_OPCODE, 0xFD, 156)
+INST2(i16x8_extmul_high_s_i8x16, "i16x8.extmul_high_s_i8x16", 0, IF_OPCODE, 0xFD, 157)
+INST2(i16x8_extmul_low_u_i8x16,  "i16x8.extmul_low_u_i8x16",  0, IF_OPCODE, 0xFD, 158)
+INST2(i16x8_extmul_high_u_i8x16, "i16x8.extmul_high_u_i8x16", 0, IF_OPCODE, 0xFD, 159)
+
+// i32x4 operations (opcodes 160-191; the numbering has gaps, e.g. 162, 165-166, 175-176, 178-180, 187)
+INST2(i32x4_abs,            "i32x4.abs",            0, IF_OPCODE,    0xFD, 160)
+INST2(i32x4_neg,            "i32x4.neg",            0, IF_OPCODE,    0xFD, 161)
+INST2(i32x4_all_true,       "i32x4.all_true",       0, IF_OPCODE,    0xFD, 163)
+INST2(i32x4_bitmask,        "i32x4.bitmask",        0, IF_OPCODE,    0xFD, 164)
+INST2(i32x4_extend_low_s_i16x8,  "i32x4.extend_low_s_i16x8",  0, IF_OPCODE, 0xFD, 167)
+INST2(i32x4_extend_high_s_i16x8, "i32x4.extend_high_s_i16x8", 0, IF_OPCODE, 0xFD, 168)
+INST2(i32x4_extend_low_u_i16x8,  "i32x4.extend_low_u_i16x8",  0, IF_OPCODE, 0xFD, 169)
+INST2(i32x4_extend_high_u_i16x8, "i32x4.extend_high_u_i16x8", 0, IF_OPCODE, 0xFD, 170)
+INST2(i32x4_shl,            "i32x4.shl",            0, IF_OPCODE,    0xFD, 171)
+INST2(i32x4_shr_s,          "i32x4.shr_s",          0, IF_OPCODE,    0xFD, 172)
+INST2(i32x4_shr_u,          "i32x4.shr_u",          0, IF_OPCODE,    0xFD, 173)
+INST2(i32x4_add,            "i32x4.add",            0, IF_OPCODE,    0xFD, 174)
+INST2(i32x4_sub,            "i32x4.sub",            0, IF_OPCODE,    0xFD, 177)
+INST2(i32x4_mul,            "i32x4.mul",            0, IF_OPCODE,    0xFD, 181)
+INST2(i32x4_min_s,          "i32x4.min_s",          0, IF_OPCODE,    0xFD, 182)
+INST2(i32x4_min_u,          "i32x4.min_u",          0, IF_OPCODE,    0xFD, 183)
+INST2(i32x4_max_s,          "i32x4.max_s",          0, IF_OPCODE,    0xFD, 184)
+INST2(i32x4_max_u,          "i32x4.max_u",          0, IF_OPCODE,    0xFD, 185)
+INST2(i32x4_dot_i16x8_s,   "i32x4.dot_i16x8_s",   0, IF_OPCODE,    0xFD, 186)
+INST2(i32x4_extmul_low_s_i16x8,  "i32x4.extmul_low_s_i16x8",  0, IF_OPCODE, 0xFD, 188)
+INST2(i32x4_extmul_high_s_i16x8, "i32x4.extmul_high_s_i16x8", 0, IF_OPCODE, 0xFD, 189)
+INST2(i32x4_extmul_low_u_i16x8,  "i32x4.extmul_low_u_i16x8",  0, IF_OPCODE, 0xFD, 190)
+INST2(i32x4_extmul_high_u_i16x8, "i32x4.extmul_high_u_i16x8", 0, IF_OPCODE, 0xFD, 191)
+
+// i64x2 operations (opcodes 192-223, with gaps; 214-219 are the i64x2 comparisons listed with the other comparisons)
+INST2(i64x2_abs,            "i64x2.abs",            0, IF_OPCODE,    0xFD, 192)
+INST2(i64x2_neg,            "i64x2.neg",            0, IF_OPCODE,    0xFD, 193)
+INST2(i64x2_all_true,       "i64x2.all_true",       0, IF_OPCODE,    0xFD, 195)
+INST2(i64x2_bitmask,        "i64x2.bitmask",        0, IF_OPCODE,    0xFD, 196)
+INST2(i64x2_extend_low_s_i32x4,  "i64x2.extend_low_s_i32x4",  0, IF_OPCODE, 0xFD, 199)
+INST2(i64x2_extend_high_s_i32x4, "i64x2.extend_high_s_i32x4", 0, IF_OPCODE, 0xFD, 200)
+INST2(i64x2_extend_low_u_i32x4,  "i64x2.extend_low_u_i32x4",  0, IF_OPCODE, 0xFD, 201)
+INST2(i64x2_extend_high_u_i32x4, "i64x2.extend_high_u_i32x4", 0, IF_OPCODE, 0xFD, 202)
+INST2(i64x2_shl,            "i64x2.shl",            0, IF_OPCODE,    0xFD, 203)
+INST2(i64x2_shr_s,          "i64x2.shr_s",          0, IF_OPCODE,    0xFD, 204)
+INST2(i64x2_shr_u,          "i64x2.shr_u",          0, IF_OPCODE,    0xFD, 205)
+INST2(i64x2_add,            "i64x2.add",            0, IF_OPCODE,    0xFD, 206)
+INST2(i64x2_sub,            "i64x2.sub",            0, IF_OPCODE,    0xFD, 209)
+INST2(i64x2_mul,            "i64x2.mul",            0, IF_OPCODE,    0xFD, 213)
+INST2(i64x2_extmul_low_s_i32x4,  "i64x2.extmul_low_s_i32x4",  0, IF_OPCODE, 0xFD, 220)
+INST2(i64x2_extmul_high_s_i32x4, "i64x2.extmul_high_s_i32x4", 0, IF_OPCODE, 0xFD, 221)
+INST2(i64x2_extmul_low_u_i32x4,  "i64x2.extmul_low_u_i32x4",  0, IF_OPCODE, 0xFD, 222)
+INST2(i64x2_extmul_high_u_i32x4, "i64x2.extmul_high_u_i32x4", 0, IF_OPCODE, 0xFD, 223)
+
+// f32x4 arithmetic (opcodes 224-235; no entry for 226)
+INST2(f32x4_abs,            "f32x4.abs",            0, IF_OPCODE,    0xFD, 224)
+INST2(f32x4_neg,            "f32x4.neg",            0, IF_OPCODE,    0xFD, 225)
+INST2(f32x4_sqrt,           "f32x4.sqrt",           0, IF_OPCODE,    0xFD, 227)
+INST2(f32x4_add,            "f32x4.add",            0, IF_OPCODE,    0xFD, 228)
+INST2(f32x4_sub,            "f32x4.sub",            0, IF_OPCODE,    0xFD, 229)
+INST2(f32x4_mul,            "f32x4.mul",            0, IF_OPCODE,    0xFD, 230)
+INST2(f32x4_div,            "f32x4.div",            0, IF_OPCODE,    0xFD, 231)
+INST2(f32x4_min,            "f32x4.min",            0, IF_OPCODE,    0xFD, 232)
+INST2(f32x4_max,            "f32x4.max",            0, IF_OPCODE,    0xFD, 233)
+INST2(f32x4_pmin,           "f32x4.pmin",           0, IF_OPCODE,    0xFD, 234)
+INST2(f32x4_pmax,           "f32x4.pmax",           0, IF_OPCODE,    0xFD, 235)
+
+// f64x2 arithmetic (opcodes 236-247; no entry for 238)
+INST2(f64x2_abs,            "f64x2.abs",            0, IF_OPCODE,    0xFD, 236)
+INST2(f64x2_neg,            "f64x2.neg",            0, IF_OPCODE,    0xFD, 237)
+INST2(f64x2_sqrt,           "f64x2.sqrt",           0, IF_OPCODE,    0xFD, 239)
+INST2(f64x2_add,            "f64x2.add",            0, IF_OPCODE,    0xFD, 240)
+INST2(f64x2_sub,            "f64x2.sub",            0, IF_OPCODE,    0xFD, 241)
+INST2(f64x2_mul,            "f64x2.mul",            0, IF_OPCODE,    0xFD, 242)
+INST2(f64x2_div,            "f64x2.div",            0, IF_OPCODE,    0xFD, 243)
+INST2(f64x2_min,            "f64x2.min",            0, IF_OPCODE,    0xFD, 244)
+INST2(f64x2_max,            "f64x2.max",            0, IF_OPCODE,    0xFD, 245)
+INST2(f64x2_pmin,           "f64x2.pmin",           0, IF_OPCODE,    0xFD, 246)
+INST2(f64x2_pmax,           "f64x2.pmax",           0, IF_OPCODE,    0xFD, 247)
+
+// Conversion operations (sign specifier before source type per spec), opcodes 248-255
+INST2(i32x4_trunc_sat_s_f32x4,        "i32x4.trunc_sat_s_f32x4",        0, IF_OPCODE, 0xFD, 248)
+INST2(i32x4_trunc_sat_u_f32x4,        "i32x4.trunc_sat_u_f32x4",        0, IF_OPCODE, 0xFD, 249)
+INST2(f32x4_convert_s_i32x4,          "f32x4.convert_s_i32x4",          0, IF_OPCODE, 0xFD, 250)
+INST2(f32x4_convert_u_i32x4,          "f32x4.convert_u_i32x4",          0, IF_OPCODE, 0xFD, 251)
+INST2(i32x4_trunc_sat_s_f64x2_zero,   "i32x4.trunc_sat_s_f64x2_zero",   0, IF_OPCODE, 0xFD, 252)
+INST2(i32x4_trunc_sat_u_f64x2_zero,   "i32x4.trunc_sat_u_f64x2_zero",   0, IF_OPCODE, 0xFD, 253)
+INST2(f64x2_convert_low_s_i32x4,      "f64x2.convert_low_s_i32x4",      0, IF_OPCODE, 0xFD, 254)
+INST2(f64x2_convert_low_u_i32x4,      "f64x2.convert_low_u_i32x4",      0, IF_OPCODE, 0xFD, 255)
+
 // clang-format on
 
 #undef INST