diff --git a/Documentation/core-api/floating-point.rst b/Documentation/core-api/floating-point.rst new file mode 100644 index 000000000000..a8d0d4b05052 --- /dev/null +++ b/Documentation/core-api/floating-point.rst @@ -0,0 +1,78 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +Floating-point API +================== + +Kernel code is normally prohibited from using floating-point (FP) registers or +instructions, including the C float and double data types. This rule reduces +system call overhead, because the kernel does not need to save and restore the +userspace floating-point register state. + +However, occasionally drivers or library functions may need to include FP code. +This is supported by isolating the functions containing FP code to a separate +translation unit (a separate source file), and saving/restoring the FP register +state around calls to those functions. This creates "critical sections" of +floating-point usage. + +The reason for this isolation is to prevent the compiler from generating code +touching the FP registers outside these critical sections. Compilers sometimes +use FP registers to optimize inlined ``memcpy`` or variable assignment, as +floating-point registers may be wider than general-purpose registers. + +Usability of floating-point code within the kernel is architecture-specific. +Additionally, because a single kernel may be configured to support platforms +both with and without a floating-point unit, FPU availability must be checked +both at build time and at run time. + +Several architectures implement the generic kernel floating-point API from +``linux/fpu.h``, as described below. Some other architectures implement their +own unique APIs, which are documented separately. + +Build-time API +-------------- + +Floating-point code may be built if the option ``ARCH_HAS_KERNEL_FPU_SUPPORT`` +is enabled. For C code, such code must be placed in a separate file, and that +file must have its compilation flags adjusted using the following pattern:: + + CFLAGS_foo.o += $(CC_FLAGS_FPU) + CFLAGS_REMOVE_foo.o += $(CC_FLAGS_NO_FPU) + +Architectures are expected to define one or both of these variables in their +top-level Makefile as needed. For example:: + + CC_FLAGS_FPU := -mhard-float + +or:: + + CC_FLAGS_NO_FPU := -msoft-float + +Normal kernel code is assumed to use the equivalent of ``CC_FLAGS_NO_FPU``. + +Runtime API +----------- + +The runtime API is provided in ``linux/fpu.h``. This header cannot be included +from files implementing FP code (those with their compilation flags adjusted as +above). Instead, it must be included when defining the FP critical sections. + +.. c:function:: bool kernel_fpu_available( void ) + + This function reports if floating-point code can be used on this CPU or + platform. The value returned by this function is not expected to change + at runtime, so it only needs to be called once, not before every + critical section. + +.. c:function:: void kernel_fpu_begin( void ) + void kernel_fpu_end( void ) + + These functions create a floating-point critical section. It is only + valid to call ``kernel_fpu_begin()`` after a previous call to + ``kernel_fpu_available()`` returned ``true``. These functions are only + guaranteed to be callable from (preemptible or non-preemptible) process + context. + + Preemption may be disabled inside critical sections, so their size + should be minimized. They are *not* required to be reentrant. If the + caller expects to nest critical sections, it must implement its own + reference counting. diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 7a3a08d81f11..974beccd671f 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -48,6 +48,7 @@ Library functionality that is used throughout the kernel. errseq wrappers/atomic_t wrappers/atomic_bitops + floating-point Low level entry and exit ======================== diff --git a/Makefile b/Makefile index b7198af9e59b..c8993b8dd605 100644 --- a/Makefile +++ b/Makefile @@ -981,6 +981,11 @@ KBUILD_CFLAGS += $(CC_FLAGS_CFI) export CC_FLAGS_CFI endif +# Architectures can define flags to add/remove for floating-point support +CC_FLAGS_FPU += -D_LINUX_FPU_COMPILATION_UNIT +export CC_FLAGS_FPU +export CC_FLAGS_NO_FPU + ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0) KBUILD_CFLAGS += -falign-functions=$(CONFIG_FUNCTION_ALIGNMENT) endif diff --git a/arch/Kconfig b/arch/Kconfig index 20c2c93d2c88..603eb47c6268 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1480,6 +1480,12 @@ config ARCH_HAS_NONLEAF_PMD_YOUNG address translations. Page table walkers that clear the accessed bit may use this capability to reduce their search space. +config ARCH_HAS_KERNEL_FPU_SUPPORT + bool + help + Architectures that select this option can run floating-point code in + the kernel, as described in Documentation/core-api/floating-point.rst. + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 443f7566c9d1..eacbc4efe60c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -29,6 +29,7 @@ config RISCV select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_KERNEL_FPU_SUPPORT if 64BIT && FPU select ARCH_HAS_MMIOWB select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index b43a6bb7e4dc..aaf7b9bd2275 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -77,6 +77,9 @@ KBUILD_CFLAGS += -march=$(shell echo $(riscv-march-y) | sed -E 's/(rv32ima|rv64i KBUILD_AFLAGS += -march=$(riscv-march-y) +# For C code built with floating-point support, exclude V but keep F and D. +CC_FLAGS_FPU := -march=$(shell echo $(riscv-march-y) | sed -E 's/(rv32ima|rv64ima)([^v_]*)v?/\1\2/') + KBUILD_CFLAGS += -mno-save-restore KBUILD_CFLAGS += -DCONFIG_PAGE_OFFSET=$(CONFIG_PAGE_OFFSET) diff --git a/arch/riscv/include/asm/fpu.h b/arch/riscv/include/asm/fpu.h new file mode 100644 index 000000000000..91c04c244e12 --- /dev/null +++ b/arch/riscv/include/asm/fpu.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 SiFive + */ + +#ifndef _ASM_RISCV_FPU_H +#define _ASM_RISCV_FPU_H + +#include + +#define kernel_fpu_available() has_fpu() + +void kernel_fpu_begin(void); +void kernel_fpu_end(void); + +#endif /* ! _ASM_RISCV_FPU_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 95cf25d48405..a90f90eb0578 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -61,6 +61,7 @@ obj-$(CONFIG_MMU) += vdso.o vdso/ obj-$(CONFIG_RISCV_M_MODE) += traps_misaligned.o obj-$(CONFIG_FPU) += fpu.o +obj-$(CONFIG_FPU) += kernel_mode_fpu.o obj-$(CONFIG_RISCV_ISA_V) += vector.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/riscv/kernel/kernel_mode_fpu.c b/arch/riscv/kernel/kernel_mode_fpu.c new file mode 100644 index 000000000000..0ac8348876c4 --- /dev/null +++ b/arch/riscv/kernel/kernel_mode_fpu.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 SiFive + */ + +#include +#include + +#include +#include +#include +#include + +void kernel_fpu_begin(void) +{ + preempt_disable(); + fstate_save(current, task_pt_regs(current)); + csr_set(CSR_SSTATUS, SR_FS); +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ + csr_clear(CSR_SSTATUS, SR_FS); + fstate_restore(current, task_pt_regs(current)); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_end); diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c index e264e59e596e..a191ba7f8892 100644 --- a/arch/riscv/kernel/module-sections.c +++ b/arch/riscv/kernel/module-sections.c @@ -9,6 +9,7 @@ #include #include #include +#include unsigned long module_emit_got_entry(struct module *mod, unsigned long val) { @@ -55,19 +56,27 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val) return (unsigned long)&plt[i]; } -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y) +#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b)) + +static int cmp_rela(const void *a, const void *b) { - return x->r_info == y->r_info && x->r_addend == y->r_addend; + const Elf_Rela *x = a, *y = b; + int i; + + /* sort by type, symbol index and addend */ + i = cmp_3way(x->r_info, y->r_info); + if (i == 0) + i = cmp_3way(x->r_addend, y->r_addend); + return i; } static bool duplicate_rela(const Elf_Rela *rela, int idx) { - int i; - for (i = 0; i < idx; i++) { - if (is_rela_equal(&rela[i], &rela[idx])) - return true; - } - return false; + /* + * Entries are sorted by type, symbol index and addend. That means + * that, if a duplicate entry exists, it must be in the preceding slot. + */ + return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0; } static void count_max_entries(Elf_Rela *relas, int num, @@ -87,11 +96,33 @@ static void count_max_entries(Elf_Rela *relas, int num, } } +static bool rela_needs_plt_got(const Elf_Rela *rela) +{ + unsigned int type = ELF_R_TYPE(rela->r_info); + + return type == R_RISCV_CALL_PLT || type == R_RISCV_GOT_HI20; +} + +/* Copy PLT and GOT relas to the scratch array. */ +static unsigned int partition_plt_got_relas(const Elf_Rela *relas, Elf_Rela *scratch, + unsigned int num_rela) +{ + int j = 0; + + for (int i = 0; i < num_rela; i++) + if (rela_needs_plt_got(&relas[i])) + scratch[j++] = relas[i]; + + return j; +} + int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { unsigned int num_plts = 0; unsigned int num_gots = 0; + Elf_Rela *scratch = NULL; + size_t scratch_size = 0; int i; /* @@ -132,9 +163,26 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, if (!(dst_sec->sh_flags & SHF_EXECINSTR)) continue; - count_max_entries(relas, num_rela, &num_plts, &num_gots); + /* + * apply_relocate_add() relies on HI20 and LO12 relocation pairs being + * close together, so sort a copy of the section to avoid interfering. + */ + if (sechdrs[i].sh_size > scratch_size) { + scratch = kvrealloc(scratch, scratch_size, sechdrs[i].sh_size, GFP_KERNEL); + if (!scratch) + return -ENOMEM; + scratch_size = sechdrs[i].sh_size; + } + + /* sort relocations requiring a PLT or GOT entry so duplicates are adjacent */ + num_rela = partition_plt_got_relas(relas, scratch, num_rela); + sort(scratch, num_rela, sizeof(Elf_Rela), cmp_rela, NULL); + count_max_entries(scratch, num_rela, &num_plts, &num_gots); } + if (scratch) + kvfree(scratch); + mod->arch.plt.shdr->sh_type = SHT_NOBITS; mod->arch.plt.shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC; mod->arch.plt.shdr->sh_addralign = L1_CACHE_BYTES; diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig index 901d1961b739..5fcd4f778dc3 100644 --- a/drivers/gpu/drm/amd/display/Kconfig +++ b/drivers/gpu/drm/amd/display/Kconfig @@ -8,7 +8,7 @@ config DRM_AMD_DC depends on BROKEN || !CC_IS_CLANG || ARM64 || RISCV || SPARC64 || X86_64 select SND_HDA_COMPONENT if SND_HDA_CORE # !CC_IS_CLANG: https://github.com/ClangBuiltLinux/linux/issues/1752 - select DRM_AMD_DC_FP if (X86 || LOONGARCH || (PPC64 && ALTIVEC) || (ARM64 && KERNEL_MODE_NEON && !CC_IS_CLANG)) + select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && (!ARM64 || !CC_IS_CLANG) help Choose this option if you want to use the new display engine support for AMDGPU. This adds required support for Vega and diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c index 172aa10a8800..e46f8ce41d87 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c @@ -26,16 +26,7 @@ #include "dc_trace.h" -#if defined(CONFIG_X86) -#include -#elif defined(CONFIG_PPC64) -#include -#include -#elif defined(CONFIG_ARM64) -#include -#elif defined(CONFIG_LOONGARCH) -#include -#endif +#include /** * DOC: DC FPU manipulation overview @@ -60,11 +51,9 @@ static DEFINE_PER_CPU(int, fpu_recursion_depth); */ inline void dc_assert_fp_enabled(void) { - int *pcpu, depth = 0; + int depth; - pcpu = get_cpu_ptr(&fpu_recursion_depth); - depth = *pcpu; - put_cpu_ptr(&fpu_recursion_depth); + depth = __this_cpu_read(fpu_recursion_depth); ASSERT(depth >= 1); } @@ -84,33 +73,17 @@ inline void dc_assert_fp_enabled(void) */ void dc_fpu_begin(const char *function_name, const int line) { - int *pcpu; + int depth; - pcpu = get_cpu_ptr(&fpu_recursion_depth); - *pcpu += 1; - - if (*pcpu == 1) { -#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) - migrate_disable(); + WARN_ON_ONCE(!in_task()); + preempt_disable(); + depth = __this_cpu_inc_return(fpu_recursion_depth); + if (depth == 1) { + BUG_ON(!kernel_fpu_available()); kernel_fpu_begin(); -#elif defined(CONFIG_PPC64) - if (cpu_has_feature(CPU_FTR_VSX_COMP)) { - preempt_disable(); - enable_kernel_vsx(); - } else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP)) { - preempt_disable(); - enable_kernel_altivec(); - } else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) { - preempt_disable(); - enable_kernel_fp(); - } -#elif defined(CONFIG_ARM64) - kernel_neon_begin(); -#endif } - TRACE_DCN_FPU(true, function_name, line, *pcpu); - put_cpu_ptr(&fpu_recursion_depth); + TRACE_DCN_FPU(true, function_name, line, depth); } /** @@ -125,30 +98,15 @@ void dc_fpu_begin(const char *function_name, const int line) */ void dc_fpu_end(const char *function_name, const int line) { - int *pcpu; + int depth; - pcpu = get_cpu_ptr(&fpu_recursion_depth); - *pcpu -= 1; - if (*pcpu <= 0) { -#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) + depth = __this_cpu_dec_return(fpu_recursion_depth); + if (depth == 0) { kernel_fpu_end(); - migrate_enable(); -#elif defined(CONFIG_PPC64) - if (cpu_has_feature(CPU_FTR_VSX_COMP)) { - disable_kernel_vsx(); - preempt_enable(); - } else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP)) { - disable_kernel_altivec(); - preempt_enable(); - } else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) { - disable_kernel_fp(); - preempt_enable(); - } -#elif defined(CONFIG_ARM64) - kernel_neon_end(); -#endif + } else { + WARN_ON_ONCE(depth < 0); } - TRACE_DCN_FPU(false, function_name, line, *pcpu); - put_cpu_ptr(&fpu_recursion_depth); + TRACE_DCN_FPU(false, function_name, line, depth); + preempt_enable(); } diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile b/drivers/gpu/drm/amd/display/dc/dml/Makefile index 0ba9a7997d56..2d04de698dd6 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile @@ -25,40 +25,8 @@ # It provides the general basic services required by other DAL # subcomponents. -ifdef CONFIG_X86 -dml_ccflags-$(CONFIG_CC_IS_GCC) := -mhard-float -dml_ccflags := $(dml_ccflags-y) -msse -endif - -ifdef CONFIG_PPC64 -dml_ccflags := -mhard-float -maltivec -endif - -ifdef CONFIG_ARM64 -dml_rcflags := -mgeneral-regs-only -endif - -ifdef CONFIG_LOONGARCH -dml_ccflags := -mfpu=64 -dml_rcflags := -msoft-float -endif - -ifdef CONFIG_CC_IS_GCC -ifneq ($(call gcc-min-version, 70100),y) -IS_OLD_GCC = 1 -endif -endif - -ifdef CONFIG_X86 -ifdef IS_OLD_GCC -# Stack alignment mismatch, proceed with caution. -# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3 -# (8B stack alignment). -dml_ccflags += -mpreferred-stack-boundary=4 -else -dml_ccflags += -msse2 -endif -endif +dml_ccflags := $(CC_FLAGS_FPU) +dml_rcflags := $(CC_FLAGS_NO_FPU) ifneq ($(CONFIG_FRAME_WARN),0) ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y) diff --git a/include/linux/fpu.h b/include/linux/fpu.h new file mode 100644 index 000000000000..2fb63e22913b --- /dev/null +++ b/include/linux/fpu.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_FPU_H +#define _LINUX_FPU_H + +#ifdef _LINUX_FPU_COMPILATION_UNIT +#error FP code must be compiled separately. See Documentation/core-api/floating-point.rst. +#endif + +#include + +#endif