cryscan
diff --git a/‎examples/puzzle15.rs‎ renamed to ‎examples/puzzle15/main.rs‎
Lines changed: 26 additions & 1 deletion b/‎examples/puzzle15.rs‎ renamed to ‎examples/puzzle15/main.rs‎
Lines changed: 26 additions & 1 deletion
diff --git a/‎examples/puzzle15/mul_exp.wgsl‎
Lines changed: 55 additions & 0 deletions b/‎examples/puzzle15/mul_exp.wgsl‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎examples/puzzle15/ops.rs‎
Lines changed: 81 additions & 0 deletions b/‎examples/puzzle15/ops.rs‎
Lines changed: 81 additions & 0 deletions
@@ -8,23 +8,28 @@ use half::f16;
 #[cfg(not(debug_assertions))]
 use itertools::Itertools;
 use memmap2::Mmap;
+use ops::TensorOpExt;
 use safetensors::SafeTensors;
 use tokio::{
     fs::File,
     io::{AsyncReadExt, BufReader},
 };
 use web_rwkv::{
     context::{Context, ContextBuilder, InstanceExt},
+    num::Float,
     runtime::{
         infer::{InferInput, InferInputBatch, InferOption},
         loader::Loader,
         model::{ContextAutoLimits, ModelBuilder, ModelInfo},
         v6, TokioRuntime,
     },
+    tensor::ops::TensorOp,
     tokenizer::Tokenizer,
     wgpu,
 };
 
+mod ops;
+
 const PROMPT: &str = r"<input>
 <board>
 15 0  2  12 
@@ -85,6 +90,25 @@ async fn load_tokenizer() -> Result<Tokenizer> {
     Ok(Tokenizer::new(&contents)?)
 }
 
+/// Builds the v6 hook map used by this example.
+///
+/// For every layer in `0..info.num_layer`, a `PreAttTimeDecayActivate` hook is
+/// registered that emits a single `mul_exp` op with `frame.buffer.time_decay`
+/// as its input and `frame.buffer.att_k` as its output.
+fn make_hooks<F: Float>(info: &ModelInfo) -> Result<v6::HookMap<F>> {
+    let mut hooks = v6::HookMap::new();
+
+    for layer in 0..info.num_layer {
+        let key = v6::Hook::PreAttTimeDecayActivate(layer);
+        hooks.insert(
+            key,
+            Box::new(move |frame: v6::Frame<F>| {
+                // One op per hook, still wrapped in a list like any other op sequence.
+                let op = TensorOp::mul_exp(&frame.buffer.time_decay, &frame.buffer.att_k)?;
+                Ok(TensorOp::List(vec![op]))
+            }),
+        );
+    }
+
+    Ok(hooks)
+}
+
 #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ValueEnum)]
 enum EmbedDevice {
     #[default]
@@ -138,12 +162,13 @@ async fn main() -> Result<()> {
 
     let embed_device = cli.embed_device.unwrap_or(EmbedDevice::Cpu).into();
 
+    let hooks = make_hooks(&info)?;
     let model = ModelBuilder::new(&context, model)
         .embed_device(embed_device)
         .rescale(0)
         .build_v6()
         .await?;
-    let bundle = v6::Bundle::<f16>::new(model, 1);
+    let bundle = v6::Bundle::<f16>::new_with_hooks(model, 1, hooks);
     let runtime = TokioRuntime::new(bundle).await;
 
     let tokens = tokenizer.encode(PROMPT.as_bytes())?;
 
@@ -0,0 +1,55 @@
+struct View { // Layout metadata for one tensor binding; read by `compute_index` and `mul_exp`.
+    shape: vec4<u32>, // `x` is divided by 4 to bound the per-token index; `y == 1u` makes the source reuse token 0.
+    stride: vec4<u32>, // `x >> 2u` is the step between tokens (in 4-wide elements); times `y` it is the step between batches.
+    offset: vec4<u32>, // `z` is added to the batch, `y` to the token, and `x >> 2u` to the index.
+};
+
+@group(0) @binding(0) var<uniform> source: View; // view used to index `input`
+@group(0) @binding(1) var<uniform> destination: View; // view used to index `output`
+
+#ifdef IN_FP16
+@group(0) @binding(2) var<storage, read> input: array<vec2<u32>>;      // (B, T, C); each element packs 4 f16 values into 2 u32
+#else
+@group(0) @binding(2) var<storage, read> input: array<vec4<f32>>;      // (B, T, C); each element holds 4 f32 values
+#endif
+#ifdef OUT_FP16
+@group(0) @binding(3) var<storage, read_write> output: array<vec2<u32>>;    // (B, T, C); read and overwritten in place, 4 f16 values per element
+#else
+@group(0) @binding(3) var<storage, read_write> output: array<vec4<f32>>;    // (B, T, C); read and overwritten in place, 4 f32 values per element
+#endif
+
+// Packs four f32 lanes into two u32 words of two f16 values each (`xy` first, then `zw`).
+fn pack4x16float(x: vec4<f32>) -> vec2<u32> {
+    let low = pack2x16float(x.xy);
+    let high = pack2x16float(x.zw);
+    return vec2<u32>(low, high);
+}
+
+// Expands two u32 words of packed f16 pairs back into four f32 lanes.
+fn unpack4x16float(x: vec2<u32>) -> vec4<f32> {
+    let low = unpack2x16float(x.x);
+    let high = unpack2x16float(x.y);
+    return vec4<f32>(low, high);
+}
+
+// Maps a (batch, token, index) coordinate to a flat element index of `view`.
+// `stride.x` and `offset.x` are shifted right by 2 to convert them to 4-wide element units.
+fn compute_index(view: View, batch: u32, token: u32, index: u32) -> u32 {
+    let token_step = view.stride.x >> 2u;
+    let batch_step = view.stride.y * token_step;
+
+    let b = batch + view.offset.z;
+    let t = token + view.offset.y;
+    let i = index + (view.offset.x >> 2u);
+
+    return b * batch_step + t * token_step + i;
+}
+
+// Computes `output = exp(min(input, 0)) * output` element-wise, four lanes per invocation.
+// Invocation x/y/z select the 4-wide index, the token, and the batch respectively.
+@compute @workgroup_size(BLOCK_SIZE, 1, 1)
+fn mul_exp(@builtin(global_invocation_id) invocation_id: vec3<u32>) {
+    let index = invocation_id.x;
+    let token = invocation_id.y;
+    let batch = invocation_id.z;
+
+    // Invocations past the last 4-wide element of the destination have nothing to do.
+    if index >= destination.shape.x / 4u {
+        return;
+    }
+
+    // A source with a single token row is reused for every destination token.
+    let source_token = select(token, 0u, source.shape.y == 1u);
+    let source_index = compute_index(source, batch, source_token, index);
+#ifdef IN_FP16
+    let x = unpack4x16float(input[source_index]);
+#else
+    let x = input[source_index];
+#endif
+    // Clamping the exponent to at most 0 keeps the factor within (0, 1].
+    let factor = exp(min(x, vec4<f32>(0.0)));
+
+    let bti = compute_index(destination, batch, token, index);
+#ifdef OUT_FP16
+    output[bti] = pack4x16float(factor * unpack4x16float(output[bti]));
+#else
+    output[bti] = factor * output[bti];
+#endif
+}
@@ -0,0 +1,81 @@
+use web_rwkv::{
+    context::Macros,
+    num::Float,
+    tensor::{ops::TensorOp, TensorError, TensorGpuView, TensorShape},
+    wgpu::{BindGroupDescriptor, BindGroupEntry},
+};
+
+pub trait TensorOpExt: Sized {
+    /// Scale `output` in place by the exponential of `input` clamped to at most 0: `output = exp(min(input, 0)) * output`.
+    /// - `input` shape: `[C, 1, B]` (the single token row is reused for every token) or `[C, T, B]`.
+    /// - `output` shape: `[C, T, B]`; `C` is processed in groups of 4, so it is presumably expected to be a multiple of 4.
+    fn mul_exp<'a, 'b, F0: Float, F1: Float>(
+        input: impl Into<TensorGpuView<'a, F0>>,
+        output: impl Into<TensorGpuView<'b, F1>>,
+    ) -> Result<Self, TensorError>;
+}
+
+impl TensorOpExt for TensorOp {
+    /// Builds a single compute dispatch of the `mul_exp` shader, which performs
+    /// `output = exp(min(input, 0)) * output` in place on `output`.
+    ///
+    /// - `input` shape: `[C, 1, B]` or `[C, T, B]`.
+    /// - `output` shape: `[C, T, B]`.
+    ///
+    /// # Errors
+    /// Returns the `TensorError` from `check_shape` when `input` matches neither
+    /// accepted shape, or when `output` is not of the form `[C, T, B, 1]`.
+    fn mul_exp<'a, 'b, F0: Float, F1: Float>(
+        input: impl Into<TensorGpuView<'a, F0>>,
+        output: impl Into<TensorGpuView<'b, F1>>,
+    ) -> Result<Self, TensorError> {
+        // Workgroup width along x; passed to the shader as the `BLOCK_SIZE` macro.
+        const BLOCK_SIZE: u32 = 128;
+
+        let input: TensorGpuView<_> = input.into();
+        let output: TensorGpuView<_> = output.into();
+
+        let shape = {
+            let [index, token, batch, _] = output.shape().into();
+            // Accept either a single-token input or one matching the output exactly.
+            // The fallback check only runs when the first one fails.
+            input
+                .check_shape([index, 1, batch, 1])
+                .or_else(|_| input.check_shape([index, token, batch, 1]))?;
+            output.check_shape([index, token, batch, 1])?;
+            output.shape()
+        };
+
+        let context = output.context();
+        let pipeline = context.checkout_pipeline(
+            "mul_exp",
+            include_str!("mul_exp.wgsl"),
+            "mul_exp",
+            None,
+            Macros::new()
+                .u32("BLOCK_SIZE", BLOCK_SIZE)
+                .tensor(&input, Some("IN"))
+                .tensor(&output, Some("OUT")),
+        );
+        // Binding indices mirror the shader: 0/1 are the view uniforms, 2/3 the data buffers.
+        let bindings = vec![context.device.create_bind_group(&BindGroupDescriptor {
+            label: None,
+            layout: &pipeline.layout,
+            entries: &[
+                BindGroupEntry {
+                    binding: 0,
+                    resource: input.meta_binding(),
+                },
+                BindGroupEntry {
+                    binding: 1,
+                    resource: output.meta_binding(),
+                },
+                BindGroupEntry {
+                    binding: 2,
+                    resource: input.binding(),
+                },
+                BindGroupEntry {
+                    binding: 3,
+                    resource: output.binding(),
+                },
+            ],
+        })];
+
+        Ok(Self::Atom {
+            pipeline,
+            bindings,
+            // x covers the 4-wide elements of one token in blocks of `BLOCK_SIZE`;
+            // y and z enumerate tokens and batches one invocation each.
+            dispatch: [
+                u32::div_ceil(shape[0] as u32 / 4, BLOCK_SIZE),
+                shape[1] as u32,
+                shape[2] as u32,
+            ],
+        })
+    }
+}