wolfSSL · dgarske · Jun 12, 2026 · Jun 12, 2026
diff --git a/.github/workflows/amd.yml b/.github/workflows/amd.yml
@@ -0,0 +1,178 @@
+name: AMD/Xilinx ports
+
+# Build (and QEMU boot-smoke) the bare-metal wolfIP ports under src/port/amd/:
+# ZCU102 (A53), Versal VMK180 (A72) and Zynq-7000 ZC702 (A9). app.elf builds
+# from repo sources only - no Vitis/FSBL/bootgen/hardware. (BOOT.BIN needs an
+# FSBL + bootgen and is out of scope here.)
+
+on:
+  push:
+    paths:
+      - 'src/port/amd/**'
+      - 'src/wolfip.c'
+      - 'wolfip.h'
+      - 'tools/scripts/amd/**'
+      - '.github/workflows/amd.yml'
+  pull_request:
+    paths:
+      - 'src/port/amd/**'
+      - 'src/wolfip.c'
+      - 'wolfip.h'
+      - 'tools/scripts/amd/**'
+      - '.github/workflows/amd.yml'
+
+# Cancel superseded runs on the same ref (runner optimization).
+concurrency:
+  group: amd-${{ github.ref }}
+  cancel-in-progress: true
+
+# Restrict GITHUB_TOKEN to read-only repository access: the jobs in this
+# workflow only check out sources, use the Actions cache and upload artifacts.
+permissions:
+  contents: read
+
+env:
+  # Arm GNU Toolchain release; feeds the download URL, the extracted directory
+  # names and the cache key. Quoted so it is always read as a string.
+  ARM_TC_VER: '14.3.rel1'
+  # Directory the toolchains are extracted into; also the cached path.
+  TC_ROOT: /home/runner/toolchains
+
+jobs:
+  # --------------------------------------------------------------------------
+  # Warm the toolchain cache a single time so each matrix leg restores it
+  # instead of re-downloading ~150 MB. Both cross compilers come from the same
+  # pinned Arm GNU Toolchain release, one tarball per target:
+  # aarch64-none-elf (ZCU102/Versal) and arm-none-eabi (Zynq-7000).
+  # --------------------------------------------------------------------------
+  toolchains:
+    timeout-minutes: 15
+    runs-on: ubuntu-latest
+    steps:
+      - id: tc-cache
+        name: Cache ARM GNU toolchains
+        uses: actions/cache@v4
+        with:
+          key: arm-gnu-${{ env.ARM_TC_VER }}-x86_64
+          path: ${{ env.TC_ROOT }}
+
+      # Only runs on a cache miss; the cache action saves TC_ROOT afterwards.
+      - name: Download + extract toolchains
+        if: steps.tc-cache.outputs.cache-hit != 'true'
+        run: |
+          set -euxo pipefail
+          url_base="https://developer.arm.com/-/media/Files/downloads/gnu/${ARM_TC_VER}/binrel"
+          mkdir -p "$TC_ROOT"
+          for target in aarch64-none-elf arm-none-eabi; do
+            tarball="arm-gnu-toolchain-${ARM_TC_VER}-x86_64-${target}.tar.xz"
+            curl -fSL --retry 3 --retry-delay 5 -o "/tmp/$tarball" "$url_base/$tarball"
+            tar -xf "/tmp/$tarball" -C "$TC_ROOT"
+          done
+          ls -d "$TC_ROOT"/*/
+
+  # --------------------------------------------------------------------------
+  # Build matrix: every board in each layout listed below, with and without
+  # SPEED_TEST (10 legs). Each board's CFLAGS already carries -Werror, so a
+  # clean compile is the pass criterion.
+  # --------------------------------------------------------------------------
+  build:
+    needs: toolchains
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { board: zcu102,   cross: aarch64-none-elf-, layout: ocm, speed: ""            }
+          - { board: zcu102,   cross: aarch64-none-elf-, layout: ocm, speed: "-DSPEED_TEST" }
+          - { board: zcu102,   cross: aarch64-none-elf-, layout: ddr, speed: ""            }
+          - { board: zcu102,   cross: aarch64-none-elf-, layout: ddr, speed: "-DSPEED_TEST" }
+          - { board: versal,   cross: aarch64-none-elf-, layout: ocm, speed: ""            }
+          - { board: versal,   cross: aarch64-none-elf-, layout: ocm, speed: "-DSPEED_TEST" }
+          - { board: versal,   cross: aarch64-none-elf-, layout: ddr, speed: ""            }
+          - { board: versal,   cross: aarch64-none-elf-, layout: ddr, speed: "-DSPEED_TEST" }
+          - { board: zynq7000, cross: arm-none-eabi-,    layout: ocm, speed: ""            }
+          - { board: zynq7000, cross: arm-none-eabi-,    layout: ocm, speed: "-DSPEED_TEST" }
+    steps:
+      - uses: actions/checkout@v4
+
+      # Hard-fail if the toolchains job did not leave a cache entry behind.
+      - name: Restore toolchains
+        uses: actions/cache/restore@v4
+        with:
+          fail-on-cache-miss: true
+          key: arm-gnu-${{ env.ARM_TC_VER }}-x86_64
+          path: ${{ env.TC_ROOT }}
+
+      - name: Add toolchains to PATH
+        run: |
+          for target in aarch64-none-elf arm-none-eabi; do
+            echo "$TC_ROOT/arm-gnu-toolchain-${ARM_TC_VER}-x86_64-${target}/bin" >> "$GITHUB_PATH"
+          done
+
+      - name: Build ${{ matrix.board }} (${{ matrix.layout }}${{ matrix.speed && ' SPEED' || '' }})
+        run: |
+          set -euxo pipefail
+          board_dir="src/port/amd/boards/${{ matrix.board }}"
+          ${{ matrix.cross }}gcc --version | head -1
+          # Collect the make variables in an array (plain shell variable, not
+          # exported, so make only sees what is passed on its command line).
+          make_args=("CROSS_COMPILE=${{ matrix.cross }}")
+          # zynq7000 is OCM-only (no LAYOUT switch / no target_ddr.ld).
+          [ "${{ matrix.board }}" = "zynq7000" ] || make_args+=("LAYOUT=${{ matrix.layout }}")
+          # An empty speed entry means the default (non-SPEED_TEST) build.
+          [ -z "${{ matrix.speed }}" ] || make_args+=("CFLAGS_EXTRA=${{ matrix.speed }}")
+          make -C "$board_dir" "${make_args[@]}"
+          ${{ matrix.cross }}size "$board_dir/app.elf"
+
+      - name: Upload app.elf
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          name: amd-${{ matrix.board }}-${{ matrix.layout }}-${{ matrix.speed != '' && 'speed' || 'default' }}
+          path: src/port/amd/boards/${{ matrix.board }}/app.elf
+
+  # --------------------------------------------------------------------------
+  # QEMU boot smoke: build each board with its default make settings and check
+  # that it boots to "Ready" under the matching Xilinx QEMU machine. Only
+  # zcu102 gates the workflow; versal and zynq7000 run with continue-on-error
+  # until their QEMU device models are confirmed - the machine/UART/load
+  # details may need iteration.
+  # --------------------------------------------------------------------------
+  qemu:
+    needs: toolchains
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    # A failing non-gating leg must not fail the workflow.
+    continue-on-error: ${{ !matrix.gate }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - board: zcu102
+            cross: aarch64-none-elf-
+            gate: true
+          - board: versal
+            cross: aarch64-none-elf-
+            gate: false
+          - board: zynq7000
+            cross: arm-none-eabi-
+            gate: false
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Restore toolchains
+        uses: actions/cache/restore@v4
+        with:
+          fail-on-cache-miss: true
+          key: arm-gnu-${{ env.ARM_TC_VER }}-x86_64
+          path: ${{ env.TC_ROOT }}
+
+      - name: Add toolchains to PATH
+        run: |
+          for target in aarch64-none-elf arm-none-eabi; do
+            echo "$TC_ROOT/arm-gnu-toolchain-${ARM_TC_VER}-x86_64-${target}/bin" >> "$GITHUB_PATH"
+          done
+
+      - name: Install QEMU
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends qemu-system-arm qemu-system-aarch64
+
+      - name: Build ${{ matrix.board }} (OCM, default)
+        run: |
+          set -euxo pipefail
+          make -C "src/port/amd/boards/${{ matrix.board }}" "CROSS_COMPILE=${{ matrix.cross }}"
+
+      # UART_LOG is handed to the smoke script through the environment; the
+      # same file name is uploaded by the next step.
+      - name: QEMU boot smoke
+        env:
+          UART_LOG: uart-${{ matrix.board }}.log
+        run: |
+          chmod +x tools/scripts/amd/qemu-smoke.sh
+          tools/scripts/amd/qemu-smoke.sh "${{ matrix.board }}"
+
+      # Runs even when the smoke step failed, so the log is kept for debugging.
+      - name: Upload UART log
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: warn
+          name: qemu-uart-${{ matrix.board }}
+          path: uart-${{ matrix.board }}.log
diff --git a/src/port/amd/README.md b/src/port/amd/README.md
@@ -0,0 +1,152 @@
+# wolfIP AMD/Xilinx bare-metal ports
+
+Bare-metal wolfIP ports for AMD/Xilinx PS-GEM SoCs, sharing one tree:
+
+- **ZCU102** - ZynqMP, Cortex-A53, AArch64, EL3
+- **Versal Gen 1 / VMK180** - Cortex-A72, AArch64, EL3
+- **Zynq-7000 / ZC702** - Cortex-A9, ARMv7-A, SVC
+
+All three are brought up on real hardware (DHCP, ICMP ping, UDP echo).
+
+## Layout
+
+Shared code lives once; each board's Makefile selects which components to
+compile (build-selected files, not `#ifdef` forks).
+
+```
+common/    arch- and SoC-independent
+  app.c app.h          shared UDP-echo + DHCP demo (board hooks: board.c)
+  gem_core.c gem.h     shared Cadence GEM core (init, MDIO, polled TX, diag)
+  gem_regs.h gem_port.h  GEM register map / internal hook interface
+  uart_util.c          shared UART helpers (puts/puthex/putdec/putip4)
+  entropy.c            memuse-pattern RNG (counter via arch_counter64)
+  wolfip_config.h      shared wolfIP profile (board config.h includes it)
+  gic.h uart.h mmu.h   driver API headers
+
+arch/aarch64/  cache.h timer.h mmu_aarch64.c startup_aarch64.S exception_aarch64.c
+arch/armv7/    cache.h timer.h mmu_armv7.c   startup_armv7.S
+
+ip/        per-IP-block drivers (build-selected)
+  uart_cadence.c uart_pl011.c          UART
+  gic_gicv2.c gic_gicv3.c              GIC
+  gem_swq.c gem_rx_swq_poll.c gem_rx_poll.c   RX delivery model (all boards poll)
+  gem_rx_irq.c                         reference IRQ-driven RX (not built; see file)
+  phy_dp83867.c phy_marvell.c          PHY drivers
+  phy_dispatch_dp83867.c phy_dispatch_multi.c   PHY vendor dispatch
+
+boards/<board>/   the build root for each board (keeps app.elf + JTAG in place)
+  board.h board.c board_gem.c config.h Makefile target*.ld jtag/ [bootgen/]
+```
+
+## Component selection per board
+
+| Component | ZCU102 | Versal | Zynq-7000 |
+|-----------|--------|--------|-----------|
+| arch      | aarch64 | aarch64 | armv7 |
+| UART      | cadence | pl011 | cadence |
+| GIC       | gicv2 | gicv3 | gicv2 |
+| GEM RX    | gem_rx_swq_poll + gem_swq | gem_rx_swq_poll + gem_swq | gem_rx_poll |
+| PHY       | dp83867 | dp83867 | dp83867 + marvell (multi) |
+| GEM inst  | GEM3 | GEM0 | GEM0 |
+
+## Build
+
+```
+cd boards/zcu102   && make CROSS_COMPILE=aarch64-none-elf-
+cd boards/versal   && make CROSS_COMPILE=aarch64-none-elf-
+cd boards/zynq7000 && make CROSS_COMPILE=arm-none-eabi-
+```
+
+Output is `app.elf` in the board directory. See each board's `README.md`
+for the JTAG / BOOT.BIN flow and bring-up notes.
+
+## Throughput test (SPEED_TEST)
+
+The default build runs the UDP echo + DHCP demo. Building with
+`CFLAGS_EXTRA=-DSPEED_TEST` instead brings up a TCP throughput server on
+**port 9** (a discard/chargen-style sink + source, in the spirit of iperf but
+without iperf3's JSON control channel, which is impractical on bare metal). On
+each accepted connection the board sinks everything the host sends (RX) and, in
+the same window, sources chargen data whenever the socket is writable (TX); on
+close it prints the byte totals and an average rate over the UART:
+
+```
+cd boards/zcu102 && make CROSS_COMPILE=aarch64-none-elf- CFLAGS_EXTRA=-DSPEED_TEST
+```
+
+Measure from a host on the same subnet as the board (replace `<ip>` with the
+leased address printed at DHCP bind):
+
+```
+# RX (host -> board): how fast the board sinks
+dd if=/dev/zero bs=1460 count=20000 | nc -q1 <ip> 9
+
+# TX (board -> host): how fast the board sources
+nc <ip> 9 </dev/null | pv -r >/dev/null
+```
+
+The board's own `SPEED done ... RX/TX bytes (~B/s)` UART line is the
+authoritative figure (it times the connection with the hardware clock). Note
+the RX and TX counters cover the same connection window, so during the RX run
+the board is also back-sourcing; the printed RX B/s is the host->board goodput
+under that concurrent load. iperf3 host-to-host on the same link is a useful
+*link* reference, but the board is not an iperf3 endpoint.
+
+The `SPEED_TEST` build also widens the TCP window (`RXBUF_SIZE`/`TXBUF_SIZE` to
+`LINK_MTU * 6` in `config.h`) and trims the UDP socket count to keep the larger
+per-socket buffers inside the 256 KB OCM budget.
+
+### Results
+
+Single Cortex core, 1 Gbps RGMII link, MTU 1500, host on the same switch.
+RX is the board's UART `~B/s` line (host -> board); TX is host-measured
+(board -> host). Mbps = B/s x 8 / 10^6.
+
+| Board (SoC, core)            | Layout / boot   | RX Mbps | TX Mbps |
+|------------------------------|-----------------|--------:|--------:|
+| VMK180 (Versal, A72 @ EL3)   | DDR (JTAG)      |   ~300  |   ~334  |
+| ZCU102 (ZynqMP, A53 @ EL3)   | DDR (SD boot)   |   ~126  |   ~194  |
+| ZC702 (Zynq-7000, A9 @ SVC)  | OCM (JTAG)      |    ~22  |    ~19  |
+| ZCU102 (ZynqMP, A53 @ EL3)   | OCM (JTAG)      |    ~10  |     ~9  |
+
+The single dominant factor is the **memory layout**: the OCM layout runs *all*
+code (and the rings) from Normal non-cacheable OCM, so every instruction fetch
+and frame copy is uncached. The DDR layout keeps code+data in cacheable DDR and
+maps only the GEM DMA region non-cacheable - ~13x (RX) to ~22x (TX) faster, as
+the two ZCU102 rows show directly (same SoC/core, OCM ~10/9 vs DDR ~126/194
+Mbps). The faster A72 (Versal) reaches ~300/334 on DDR.
+
+How each DDR number was loaded: Versal's PLM trains DDR from a boot PDI, so the
+DDR app loads cleanly over JTAG. On ZynqMP, JTAG writes into DDR after a bare
+`psu_init` are unreliable (the load goes through the A53 with a cache flush and
+either errors or lands corrupt - DDR itself is fine, a direct DAP memtest passes),
+so the ZCU102 DDR figure is from an **SD boot**: `FSBL_ELF=.../zynqmp_fsbl.elf
+make bootbin` produces a DDR-layout `BOOT.BIN` that the FSBL trains DDR for and
+DMA-loads (no JTAG memory writes). Copy it to the SD card's FAT boot partition
+and set SW6 = SD. The same applies to ZC702 (its OCM-only port has no DDR layout
+yet; a DDR profile is future work).
+
+What it took to get here:
+
+1. **NC-map the DMA rings in the DDR layout (correctness, not just speed).**
+   The DDR layout had mapped the GEM BD rings cacheable with per-BD
+   `cache_clean`. Because the 8-byte BDs share 64-byte cache lines, cleaning one
+   BD wrote stale neighbours back over MAC-set OWN bits and wedged the RX ring
+   under sustained (TCP-rate) load - the UDP-only profile never had two BDs live
+   in a line at once. The DMA region is now Normal-NC in both layouts, with
+   `.dma_buffers` in its own 2 MB block so `.text` stays cacheable.
+2. **Main-loop poll cadence.** The original loop called `wolfIP_poll()` then
+   `delay_ms(1)`, capping the stack at ~1 poll/ms (~12 Mbps) and feeding wolfIP
+   a `tick++` counter that only approximated real milliseconds. It now
+   busy-polls with a real-millisecond clock from the hardware timer
+   (`timer_now()/timer_freq()`), which also de-skews every DHCP/TCP/ARP timeout.
+3. **Drain RX fully, bounded TX per event.** Reading one chunk per READABLE
+   left the advertised TCP window stuck (~2 KB) and deadlocked; the SPEED server
+   now drains the rx buffer each event and does a bounded tx fill.
+4. **Word-wise `memcpy`/`memset`.** Frame-staging copies are 8 bytes at a time
+   (bytewise tail), which matters for the non-cacheable DMA buffers.
+
+Notes / remaining levers: ZCU102 uses the same poll-driven RX as the other two
+boards - its original IRQ-driven RX storms the CPU under sustained RX load.
+A DDR/BOOT.BIN profile for the OCM boards (cached code) and draining more than
+one frame per poll are the next levers.
diff --git a/src/port/amd/arch/aarch64/cache.h b/src/port/amd/arch/aarch64/cache.h
@@ -0,0 +1,41 @@
+/* cache.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * AArch64 (Cortex-A53/A72) cache maintenance for GEM DMA coherency. The
+ * cache line is 64 bytes. With D-cache enabled and BD/buffers in normal
+ * cacheable memory, CPU writes may sit in L1 and not be visible to the
+ * MAC's DMA path. cache_clean() writes back dirty lines before DMA
+ * reads; cache_inval() invalidates lines so CPU reads pull fresh
+ * DMA-written data.
+ */
+#ifndef AMD_CACHE_H
+#define AMD_CACHE_H
+
+#include <stdint.h>
+
+#define CACHE_LINE 64u
+
+/* Write back ("dc cvac") every cache line that overlaps [p, p + sz), then
+ * issue "dsb sy" so the maintenance has completed before returning.
+ *
+ * p  - start of the buffer (need not be line-aligned; rounded down)
+ * sz - length in bytes (end is rounded up to the next line boundary)
+ */
+static inline void cache_clean(const void *p, uint32_t sz)
+{
+    /* Build the line mask at pointer width. ~(CACHE_LINE - 1u) is a 32-bit
+     * unsigned value; ANDing it with a 64-bit uintptr_t zero-extends it and
+     * clears address bits [63:32], i.e. it breaks for memory above 4 GB. */
+    const uintptr_t mask = ~((uintptr_t)CACHE_LINE - 1u);
+    uintptr_t a   = (uintptr_t)p & mask;
+    uintptr_t end = ((uintptr_t)p + sz + CACHE_LINE - 1u) & mask;
+
+    for (; a < end; a += CACHE_LINE)
+        __asm__ volatile ("dc cvac, %0" :: "r"(a) : "memory");
+    __asm__ volatile ("dsb sy" ::: "memory");
+}
+
+/* Invalidate ("dc ivac") every cache line that overlaps [p, p + sz), then
+ * issue "dsb sy" so the maintenance has completed before returning.
+ *
+ * p  - start of the buffer (need not be line-aligned; rounded down)
+ * sz - length in bytes (end is rounded up to the next line boundary)
+ *
+ * Whole lines are invalidated, so bytes outside [p, p + sz) that share the
+ * first or last line are affected as well.
+ */
+static inline void cache_inval(const void *p, uint32_t sz)
+{
+    /* Build the line mask at pointer width. ~(CACHE_LINE - 1u) is a 32-bit
+     * unsigned value; ANDing it with a 64-bit uintptr_t zero-extends it and
+     * clears address bits [63:32], i.e. it breaks for memory above 4 GB. */
+    const uintptr_t mask = ~((uintptr_t)CACHE_LINE - 1u);
+    uintptr_t a   = (uintptr_t)p & mask;
+    uintptr_t end = ((uintptr_t)p + sz + CACHE_LINE - 1u) & mask;
+
+    for (; a < end; a += CACHE_LINE)
+        __asm__ volatile ("dc ivac, %0" :: "r"(a) : "memory");
+    __asm__ volatile ("dsb sy" ::: "memory");
+}
+
+#endif /* AMD_CACHE_H */