Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 178 additions & 0 deletions .github/workflows/amd.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
name: AMD/Xilinx ports

# Build (and QEMU boot-smoke) the bare-metal wolfIP ports under src/port/amd/:
# ZCU102 (A53), Versal VMK180 (A72) and Zynq-7000 ZC702 (A9). app.elf builds
# from repo sources only - no Vitis/FSBL/bootgen/hardware. (BOOT.BIN needs an
# FSBL + bootgen and is out of scope here.)

# Run only when the AMD ports, the core stack sources they compile, the smoke
# scripts, or this workflow itself change.
on:
  push:
    paths:
      - 'src/port/amd/**'
      - 'src/wolfip.c'
      - 'wolfip.h'
      - 'tools/scripts/amd/**'
      - '.github/workflows/amd.yml'
  pull_request:
    paths:
      - 'src/port/amd/**'
      - 'src/wolfip.c'
      - 'wolfip.h'
      - 'tools/scripts/amd/**'
      - '.github/workflows/amd.yml'

# Least privilege: this workflow only reads the repository (checkout, build,
# smoke test). Without this block GITHUB_TOKEN gets the repository/organization
# default scopes, which may include write access.
permissions:
  contents: read

# Cancel superseded runs on the same ref (runner optimization).
concurrency:
  group: amd-${{ github.ref }}
  cancel-in-progress: true

# ARM_TC_VER pins the ARM GNU Toolchain release; TC_ROOT is where it is
# extracted and is also the cached path.
env:
  ARM_TC_VER: '14.3.rel1'
  TC_ROOT: /home/runner/toolchains

jobs:
  # --------------------------------------------------------------------------
  # Warm the toolchain cache a single time so that no downstream matrix leg
  # has to fetch the ~150 MB of tarballs itself. The pinned official ARM GNU
  # Toolchain release provides BOTH cross compilers: aarch64-none-elf
  # (ZCU102/Versal) and arm-none-eabi (Zynq-7000).
  # --------------------------------------------------------------------------
  toolchains:
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      # On a hit, TC_ROOT is restored and the download step is skipped.
      - name: Cache ARM GNU toolchains
        id: tc-cache
        uses: actions/cache@v4
        with:
          path: ${{ env.TC_ROOT }}
          key: arm-gnu-${{ env.ARM_TC_VER }}-x86_64

      # Cache miss only: fetch each tarball and unpack it under TC_ROOT.
      - name: Download + extract toolchains
        if: steps.tc-cache.outputs.cache-hit != 'true'
        run: |
          set -euxo pipefail
          mkdir -p "$TC_ROOT"
          mirror="https://developer.arm.com/-/media/Files/downloads/gnu/${ARM_TC_VER}/binrel"
          for triple in aarch64-none-elf arm-none-eabi; do
            tarball="arm-gnu-toolchain-${ARM_TC_VER}-x86_64-${triple}.tar.xz"
            curl -fSL --retry 3 --retry-delay 5 -o "/tmp/${tarball}" "${mirror}/${tarball}"
            tar -xf "/tmp/${tarball}" -C "$TC_ROOT"
          done
          # List what was unpacked so the log shows the install directories.
          ls -d "$TC_ROOT"/*/

# --------------------------------------------------------------------------
# Full build matrix: per board x layout x default/SPEED_TEST (10 legs).
# -Werror is already in each board's CFLAGS, so a clean compile is the gate.
# --------------------------------------------------------------------------
  build:
    needs: toolchains
    runs-on: ubuntu-latest
    timeout-minutes: 10
    strategy:
      fail-fast: false
      matrix:
        include:
          - { board: zcu102, cross: aarch64-none-elf-, layout: ocm, speed: "" }
          - { board: zcu102, cross: aarch64-none-elf-, layout: ocm, speed: "-DSPEED_TEST" }
          - { board: zcu102, cross: aarch64-none-elf-, layout: ddr, speed: "" }
          - { board: zcu102, cross: aarch64-none-elf-, layout: ddr, speed: "-DSPEED_TEST" }
          - { board: versal, cross: aarch64-none-elf-, layout: ocm, speed: "" }
          - { board: versal, cross: aarch64-none-elf-, layout: ocm, speed: "-DSPEED_TEST" }
          - { board: versal, cross: aarch64-none-elf-, layout: ddr, speed: "" }
          - { board: versal, cross: aarch64-none-elf-, layout: ddr, speed: "-DSPEED_TEST" }
          - { board: zynq7000, cross: arm-none-eabi-, layout: ocm, speed: "" }
          - { board: zynq7000, cross: arm-none-eabi-, layout: ocm, speed: "-DSPEED_TEST" }
    steps:
      - uses: actions/checkout@v4

      # The toolchains job populated this cache; a miss here is a hard error
      # rather than a silent re-download.
      - name: Restore toolchains
        uses: actions/cache/restore@v4
        with:
          path: ${{ env.TC_ROOT }}
          key: arm-gnu-${{ env.ARM_TC_VER }}-x86_64
          fail-on-cache-miss: true

      - name: Add toolchains to PATH
        run: |
          echo "$TC_ROOT/arm-gnu-toolchain-${ARM_TC_VER}-x86_64-aarch64-none-elf/bin" >> "$GITHUB_PATH"
          echo "$TC_ROOT/arm-gnu-toolchain-${ARM_TC_VER}-x86_64-arm-none-eabi/bin" >> "$GITHUB_PATH"

      - name: Build ${{ matrix.board }} (${{ matrix.layout }}${{ matrix.speed && ' SPEED' || '' }})
        # Matrix values reach the script through the environment instead of
        # being spliced into the shell source. The MATRIX_ prefix keeps them
        # from colliding with make variables (make imports the environment),
        # e.g. an exported LAYOUT would leak into the zynq7000 build.
        env:
          MATRIX_BOARD: ${{ matrix.board }}
          MATRIX_CROSS: ${{ matrix.cross }}
          MATRIX_LAYOUT: ${{ matrix.layout }}
          MATRIX_SPEED: ${{ matrix.speed }}
        run: |
          set -euxo pipefail
          # sed reads the whole stream, so gcc cannot get SIGPIPE (which
          # pipefail would turn into a step failure) as it could with head.
          "${MATRIX_CROSS}gcc" --version | sed -n 1p
          # Array keeps each make argument as one word even if it has spaces.
          args=("CROSS_COMPILE=${MATRIX_CROSS}")
          # zynq7000 is OCM-only (no LAYOUT switch / no target_ddr.ld).
          if [ "$MATRIX_BOARD" != "zynq7000" ]; then
            args+=("LAYOUT=${MATRIX_LAYOUT}")
          fi
          if [ -n "$MATRIX_SPEED" ]; then
            args+=("CFLAGS_EXTRA=${MATRIX_SPEED}")
          fi
          make -C "src/port/amd/boards/${MATRIX_BOARD}" "${args[@]}"
          "${MATRIX_CROSS}size" "src/port/amd/boards/${MATRIX_BOARD}/app.elf"

      # Artifact names are unique per leg: board + layout + speed/default.
      - name: Upload app.elf
        uses: actions/upload-artifact@v4
        with:
          name: amd-${{ matrix.board }}-${{ matrix.layout }}-${{ matrix.speed != '' && 'speed' || 'default' }}
          path: src/port/amd/boards/${{ matrix.board }}/app.elf
          if-no-files-found: error

# --------------------------------------------------------------------------
# QEMU boot smoke test. Each board's OCM/default app is built and must reach
# "Ready" under the matching Xilinx QEMU machine. Only zcu102 gates the run;
# versal and zynq7000 are informational (continue-on-error) until their QEMU
# device models are confirmed, since the machine/UART/load details may still
# need iteration.
# --------------------------------------------------------------------------
  qemu:
    needs: toolchains
    runs-on: ubuntu-latest
    timeout-minutes: 15
    # A non-gating leg may fail without failing the workflow.
    continue-on-error: ${{ !matrix.gate }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - board: zcu102
            cross: aarch64-none-elf-
            gate: true
          - board: versal
            cross: aarch64-none-elf-
            gate: false
          - board: zynq7000
            cross: arm-none-eabi-
            gate: false
    steps:
      - uses: actions/checkout@v4

      # Cache was primed by the toolchains job; a miss is treated as an error.
      - name: Restore toolchains
        uses: actions/cache/restore@v4
        with:
          path: ${{ env.TC_ROOT }}
          key: arm-gnu-${{ env.ARM_TC_VER }}-x86_64
          fail-on-cache-miss: true

      - name: Add toolchains to PATH
        run: |
          echo "$TC_ROOT/arm-gnu-toolchain-${ARM_TC_VER}-x86_64-aarch64-none-elf/bin" >> "$GITHUB_PATH"
          echo "$TC_ROOT/arm-gnu-toolchain-${ARM_TC_VER}-x86_64-arm-none-eabi/bin" >> "$GITHUB_PATH"

      - name: Install QEMU
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends qemu-system-arm qemu-system-aarch64

      - name: Build ${{ matrix.board }} (OCM, default)
        run: |
          set -euxo pipefail
          make -C "src/port/amd/boards/${{ matrix.board }}" CROSS_COMPILE=${{ matrix.cross }}

      # The smoke script receives the board name as its argument and the UART
      # log path through UART_LOG.
      - name: QEMU boot smoke
        run: |
          chmod +x tools/scripts/amd/qemu-smoke.sh
          UART_LOG="uart-${{ matrix.board }}.log" tools/scripts/amd/qemu-smoke.sh "${{ matrix.board }}"

      # Always upload so a failed boot still leaves its UART log behind.
      - name: Upload UART log
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: qemu-uart-${{ matrix.board }}
          path: uart-${{ matrix.board }}.log
          if-no-files-found: warn
152 changes: 152 additions & 0 deletions src/port/amd/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# wolfIP AMD/Xilinx bare-metal ports

Bare-metal wolfIP ports for AMD/Xilinx PS-GEM SoCs, sharing one tree:

- **ZCU102** - ZynqMP, Cortex-A53, AArch64, EL3
- **Versal Gen 1 / VMK180** - Cortex-A72, AArch64, EL3
- **Zynq-7000 / ZC702** - Cortex-A9, ARMv7-A, SVC

All three are brought up on real hardware (DHCP, ICMP ping, UDP echo).

## Layout

Shared code lives once; each board's Makefile selects which components to
compile (build-selected files, not `#ifdef` forks).

```
common/ arch- and SoC-independent
app.c app.h shared UDP-echo + DHCP demo (board hooks: board.c)
gem_core.c gem.h shared Cadence GEM core (init, MDIO, polled TX, diag)
gem_regs.h gem_port.h GEM register map / internal hook interface
uart_util.c shared UART helpers (puts/puthex/putdec/putip4)
entropy.c memuse-pattern RNG (counter via arch_counter64)
wolfip_config.h shared wolfIP profile (board config.h includes it)
gic.h uart.h mmu.h driver API headers

arch/aarch64/ cache.h timer.h mmu_aarch64.c startup_aarch64.S exception_aarch64.c
arch/armv7/ cache.h timer.h mmu_armv7.c startup_armv7.S

ip/ per-IP-block drivers (build-selected)
uart_cadence.c uart_pl011.c UART
gic_gicv2.c gic_gicv3.c GIC
gem_swq.c gem_rx_swq_poll.c gem_rx_poll.c RX delivery model (all boards poll)
gem_rx_irq.c reference IRQ-driven RX (not built; see file)
phy_dp83867.c phy_marvell.c PHY drivers
phy_dispatch_dp83867.c phy_dispatch_multi.c PHY vendor dispatch

boards/<board>/ the build root for each board (keeps app.elf + JTAG in place)
board.h board.c board_gem.c config.h Makefile target*.ld jtag/ [bootgen/]
```

## Component selection per board

| Component | ZCU102 | Versal | Zynq-7000 |
|-----------|--------|--------|-----------|
| arch | aarch64 | aarch64 | armv7 |
| UART | cadence | pl011 | cadence |
| GIC | gicv2 | gicv3 | gicv2 |
| GEM RX | gem_rx_swq_poll + gem_swq | gem_rx_swq_poll + gem_swq | gem_rx_poll |
| PHY | dp83867 | dp83867 | dp83867 + marvell (multi) |
| GEM inst | GEM3 | GEM0 | GEM0 |

## Build

```
cd boards/zcu102 && make CROSS_COMPILE=aarch64-none-elf-
cd boards/versal && make CROSS_COMPILE=aarch64-none-elf-
cd boards/zynq7000 && make CROSS_COMPILE=arm-none-eabi-
```

Output is `app.elf` in the board directory. See each board's `README.md`
for the JTAG / BOOT.BIN flow and bring-up notes.

## Throughput test (SPEED_TEST)

The default build runs the UDP echo + DHCP demo. Building with
`CFLAGS_EXTRA=-DSPEED_TEST` instead brings up a TCP throughput server on
**port 9** (a discard/chargen-style sink + source, in the spirit of iperf but
without iperf3's JSON control channel, which is impractical on bare metal). On
each accepted connection the board sinks everything the host sends (RX) and, in
the same window, sources chargen data whenever the socket is writable (TX); on
close it prints the byte totals and an average rate over the UART:

```
cd boards/zcu102 && make CROSS_COMPILE=aarch64-none-elf- CFLAGS_EXTRA=-DSPEED_TEST
```

Measure from a host on the same subnet as the board (replace `<ip>` with the
leased address printed at DHCP bind):

```
# RX (host -> board): how fast the board sinks
dd if=/dev/zero bs=1460 count=20000 | nc -q1 <ip> 9

# TX (board -> host): how fast the board sources
nc <ip> 9 </dev/null | pv -r >/dev/null
```

The board's own `SPEED done ... RX/TX bytes (~B/s)` UART line is the
authoritative figure (it times the connection with the hardware clock). Note
the RX and TX counters cover the same connection window, so during the RX run
the board is also back-sourcing; the printed RX B/s is the host->board goodput
under that concurrent load. iperf3 host-to-host on the same link is a useful
*link* reference, but the board is not an iperf3 endpoint.

The `SPEED_TEST` build also widens the TCP window (`RXBUF_SIZE`/`TXBUF_SIZE` to
`LINK_MTU * 6` in `config.h`) and trims the UDP socket count to keep the larger
per-socket buffers inside the 256 KB OCM budget.

### Results

Single Cortex core, 1 Gbps RGMII link, MTU 1500, host on the same switch.
RX is the board's UART `~B/s` line (host -> board); TX is host-measured
(board -> host). To convert: Mbps = B/s x 8 / 10^6.

| Board (SoC, core) | Layout / boot | RX Mbps | TX Mbps |
|------------------------------|-----------------|--------:|--------:|
| VMK180 (Versal, A72 @ EL3) | DDR (JTAG) | ~300 | ~334 |
| ZCU102 (ZynqMP, A53 @ EL3) | DDR (SD boot) | ~126 | ~194 |
| ZC702 (Zynq-7000, A9 @ SVC) | OCM (JTAG) | ~22 | ~19 |
| ZCU102 (ZynqMP, A53 @ EL3) | OCM (JTAG) | ~10 | ~9 |

The single dominant factor is the **memory layout**: the OCM layout runs *all*
code (and the rings) from Normal non-cacheable OCM, so every instruction fetch
and frame copy is uncached. The DDR layout keeps code+data in cacheable DDR and
maps only the GEM DMA region non-cacheable - ~13x (RX) to ~22x (TX) faster, as
the two ZCU102 rows show directly (same SoC/core, OCM ~10/9 vs DDR ~126/194
Mbps). The faster A72 (Versal) reaches ~300/334 on DDR.

How each DDR number was loaded: Versal's PLM trains DDR from a boot PDI, so the
DDR app loads cleanly over JTAG. On ZynqMP, JTAG writes into DDR after a bare
`psu_init` are unreliable (the load goes through the A53 with a cache flush and
either errors or lands corrupt - DDR itself is fine, a direct DAP memtest passes),
so the ZCU102 DDR figure is from an **SD boot**: `FSBL_ELF=.../zynqmp_fsbl.elf
make bootbin` produces a DDR-layout `BOOT.BIN` that the FSBL trains DDR for and
DMA-loads (no JTAG memory writes). Copy it to the SD card's FAT boot partition
and set SW6 = SD. The same applies to ZC702 (its OCM-only port has no DDR layout
yet; a DDR profile is future work).

What it took to get here:

1. **NC-map the DMA rings in the DDR layout (correctness, not just speed).**
The DDR layout had mapped the GEM BD rings cacheable with per-BD
`cache_clean`. Because the 8-byte BDs share 64-byte cache lines, cleaning one
BD wrote stale neighbours back over MAC-set OWN bits and wedged the RX ring
under sustained (TCP-rate) load - the UDP-only profile never had two BDs live
in a line at once. The DMA region is now Normal-NC in both layouts, with
`.dma_buffers` in its own 2 MB block so `.text` stays cacheable.
2. **Main-loop poll cadence.** The original loop called `wolfIP_poll()` then
`delay_ms(1)`, capping the stack at ~1 poll/ms (~12 Mbps) and feeding wolfIP
a `tick++` counter that only approximated real milliseconds. It now
busy-polls with a real-millisecond clock from the hardware timer
(`timer_now()/timer_freq()`), which also de-skews every DHCP/TCP/ARP timeout.
3. **Drain RX fully, bounded TX per event.** Reading one chunk per READABLE
left the advertised TCP window stuck (~2 KB) and deadlocked; the SPEED server
now drains the rx buffer each event and does a bounded tx fill.
4. **Word-wise `memcpy`/`memset`.** Frame-staging copies are 8 bytes at a time
(bytewise tail), which matters for the non-cacheable DMA buffers.

Notes / remaining levers: ZCU102 uses the same poll-driven RX as the other two
boards - its original IRQ-driven RX storms the CPU under sustained RX load.
A DDR/BOOT.BIN profile for the OCM boards (cached code) and draining more than
one frame per poll are the next levers.
41 changes: 41 additions & 0 deletions src/port/amd/arch/aarch64/cache.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/* cache.h
*
* Copyright (C) 2026 wolfSSL Inc.
*
* This file is part of wolfIP TCP/IP stack.
*
* AArch64 (Cortex-A53/A72) cache maintenance for GEM DMA coherency. The
* cache line is 64 bytes. With D-cache enabled and BD/buffers in normal
* cacheable memory, CPU writes may sit in L1 and not be visible to the
* MAC's DMA path. cache_clean() writes back dirty lines before DMA
* reads; cache_inval() invalidates lines so CPU reads pull fresh
* DMA-written data.
*/
#ifndef AMD_CACHE_H
#define AMD_CACHE_H

#include <stdint.h>

#define CACHE_LINE 64u

/* Clean (write back) every D-cache line that overlaps [p, p + sz), then wait
 * for completion with a DSB, so that a following DMA read sees the CPU's
 * writes.
 *
 *   p  - start of the region; need not be cache-line aligned.
 *   sz - length of the region in bytes.
 *
 * The range is widened to whole CACHE_LINE-sized lines: start is rounded
 * down and end is rounded up.
 */
static inline void cache_clean(const void *p, uint32_t sz)
{
    /* Build the mask at pointer width. CACHE_LINE is a 32-bit unsigned
     * constant, so ~(CACHE_LINE - 1u) would be 0xFFFFFFC0 and, once
     * zero-extended for the AND with a 64-bit uintptr_t, would clear
     * address bits 63:32 - wrong lines for any buffer above 4 GiB. */
    const uintptr_t mask = ~((uintptr_t)CACHE_LINE - 1u);
    uintptr_t start = (uintptr_t)p & mask;
    uintptr_t end   = ((uintptr_t)p + sz + CACHE_LINE - 1u) & mask;
    uintptr_t a;

    for (a = start; a < end; a += CACHE_LINE)
        __asm__ volatile ("dc cvac, %0" :: "r"(a) : "memory");
    /* Barrier: make sure the maintenance has completed before returning. */
    __asm__ volatile ("dsb sy" ::: "memory");
}

/* Invalidate every D-cache line that overlaps [p, p + sz), then wait for
 * completion with a DSB, so that following CPU reads fetch the data the DMA
 * engine wrote to memory.
 *
 *   p  - start of the region; need not be cache-line aligned.
 *   sz - length of the region in bytes.
 *
 * The range is widened to whole CACHE_LINE-sized lines (start rounded down,
 * end rounded up), so any unrelated data sharing the first or last line is
 * invalidated too. Callers should keep DMA buffers line-aligned and
 * line-sized.
 */
static inline void cache_inval(const void *p, uint32_t sz)
{
    /* Build the mask at pointer width. CACHE_LINE is a 32-bit unsigned
     * constant, so ~(CACHE_LINE - 1u) would be 0xFFFFFFC0 and, once
     * zero-extended for the AND with a 64-bit uintptr_t, would clear
     * address bits 63:32 - wrong lines for any buffer above 4 GiB. */
    const uintptr_t mask = ~((uintptr_t)CACHE_LINE - 1u);
    uintptr_t start = (uintptr_t)p & mask;
    uintptr_t end   = ((uintptr_t)p + sz + CACHE_LINE - 1u) & mask;
    uintptr_t a;

    for (a = start; a < end; a += CACHE_LINE)
        __asm__ volatile ("dc ivac, %0" :: "r"(a) : "memory");
    /* Barrier: make sure the maintenance has completed before returning. */
    __asm__ volatile ("dsb sy" ::: "memory");
}

#endif /* AMD_CACHE_H */
Loading
Loading