From 7d3018da4c4b7b39b376d259ade896c86103d4bd Mon Sep 17 00:00:00 2001
From: Scare Crowe <84860158+CWDSYSTEMS@users.noreply.github.com>
Date: Wed, 20 Oct 2021 18:55:35 +0500
Subject: [PATCH] Core 2.0 optimizations

* NVMe and SATA NAND security added
* Qortal Core exception fetcher redone
* Updated DT overlays for firmware
* Fix for BVB clock settings
* Fix for missing audio on desktop images (reported by crowetic)
* Normalize the fetch() stream while performing a peer-to-peer handshake between nodes
* Fix for an RNG token editing error while performing SHA256 encryption
* Under-voltage errors now blink the red LED constantly for 5 minutes, then go solid
* Improved kernel thread scaling for the Qortal 2.0 core
* The HDMI circuit is now enabled at power-up
* Added KMS
* Added line replication instead of interpolation for the VC4 GPU, resulting in slightly better frame rates
* Fix for longs and doubles
* The backplane clock is now set at the standard rate
* Capped HEVC clocks
* Added support for the Creative Cinema webcam
* More scanline XGA modes for unusual monitors
* TX/RX flow control support is now stable, with no lag over 1 Gbps Ethernet (hello, Qortal 3.0)
* Use flush cache instead of fetch for the QC 2.0 core, resulting in performance gains
* The VC4 clock is now enforced for desktop-oriented images
* The ondemand governor now waits 2 seconds instead of 0.5 ms before scaling down to the lowest safe clock frequency, preventing lag in the core
* The OC timeout was reduced from 90 ms to 35 ms, resulting in better clocks and sync for the Qortal 2.0 core
---
 Documentation/admin-guide/cgroup-v2.rst | 24 +-
 .../admin-guide/kernel-parameters.txt | 2 +-
 .../bindings/display/bridge/ti,sn65dsi83.yaml | 2 -
 .../bindings/display/bridge/ti,sn65dsi86.yaml | 2 +-
 .../display/panel/ilitek,ili9341.yaml | 2 +-
 .../bindings/interconnect/qcom,sdm660.yaml | 46 +-
 .../bindings/mmc/snps,dwcmshc-sdhci.yaml | 4 +-
 .../bindings/net/nxp,dwmac-imx.yaml | 1 -
 .../devicetree/bindings/net/snps,dwmac.yaml | 2 +
 .../bindings/pci/fsl,imx6q-pcie.yaml | 1 -
 .../bindings/spi/snps,dw-apb-ssi.yaml | 2 +-
 Documentation/filesystems/ntfs3.rst | 143 +-
 Documentation/gpu/amdgpu.rst | 4 +-
 Documentation/gpu/drm-internals.rst | 9 -
 Documentation/hwmon/k10temp.rst | 17 -
 Documentation/userspace-api/vduse.rst | 2 +-
 arch/arc/include/asm/pgtable.h | 5 -
 arch/arm/Kconfig | 2 -
 arch/arm/boot/dts/at91-sama5d27_som1_ek.dts | 1 -
 arch/arm/boot/dts/at91-sama7g5ek.dts | 45 +-
 arch/arm/boot/dts/bcm270x-rpi.dtsi | 13 +-
 arch/arm/boot/dts/bcm2711-rpi-4-b.dts | 6 +-
 arch/arm/boot/dts/imx53-m53menlo.dts | 8 +-
 arch/arm/boot/dts/imx6dl-yapp4-common.dtsi | 5 +
 arch/arm/boot/dts/imx6qdl-pico.dtsi | 11 +
 arch/arm/boot/dts/overlays/Makefile | 1 +
 arch/arm/boot/dts/overlays/README | 14 +
 .../dts/overlays/adafruit-st7735r-overlay.dts | 83 +
 .../hifiberry-dacplusadcpro-overlay.dts | 5 +
 arch/arm/boot/dts/qcom-apq8064.dtsi | 15 +-
 arch/arm/boot/dts/sama7g5.dtsi | 39 +
 arch/arm/boot/dts/vexpress-v2m-rs1.dtsi | 67 +-
 arch/arm/boot/dts/vexpress-v2m.dtsi | 65 +-
 arch/arm/boot/dts/vexpress-v2p-ca15-tc1.dts | 57 +-
 arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts | 57 +-
 arch/arm/boot/dts/vexpress-v2p-ca5s.dts | 57 +-
 arch/arm/boot/dts/vexpress-v2p-ca9.dts | 58 -
 arch/arm/common/sharpsl_param.c | 4 +-
 arch/arm/configs/bcm2709_defconfig | 1 +
 arch/arm/configs/bcm2711_defconfig | 2 +
 arch/arm/configs/bcmrpi_defconfig | 1 +
 arch/arm/configs/gemini_defconfig | 1 +
 arch/arm/configs/imx_v6_v7_defconfig | 1 +
 arch/arm/configs/multi_v7_defconfig | 4 +-
 arch/arm/configs/oxnas_v6_defconfig | 1 -
 arch/arm/configs/shmobile_defconfig | 1 -
 arch/arm/mach-at91/pm.c | 130 +-
 arch/arm/mach-at91/pm_suspend.S | 50 +-
 arch/arm/mach-dove/include/mach/uncompress.h | 4 +-
 arch/arm/mach-imx/mach-imx6q.c | 3 +
 arch/arm/mach-imx/pm-imx6.c | 2 +
 arch/arm/mach-imx/src.c | 40 +-
 arch/arm/mach-omap1/include/mach/memory.h | 12 -
 arch/arm/mach-omap1/usb.c | 116 +-
 arch/arm/mach-omap2/Kconfig | 1 -
 arch/arm/mach-omap2/omap_hwmod.c | 2 +
 arch/arm/net/bpf_jit_32.c | 19 +
 arch/arm64/Kconfig | 2 -
 arch/arm64/boot/dts/arm/foundation-v8.dtsi | 1 -
 arch/arm64/boot/dts/arm/fvp-base-revc.dts | 23 -
 arch/arm64/boot/dts/arm/juno-base.dtsi | 12 -
 arch/arm64/boot/dts/arm/juno-motherboard.dtsi | 21 +-
 arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts | 11 -
 .../boot/dts/arm/rtsm_ve-motherboard-rs2.dtsi | 2 +-
 .../boot/dts/arm/rtsm_ve-motherboard.dtsi | 20 +-
 .../boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts | 57 +-
 arch/arm64/boot/dts/arm/vexpress-v2m-rs1.dtsi | 67 +-
 .../arm64/boot/dts/freescale/fsl-ls1028a.dtsi | 4 +-
 .../freescale/imx8mm-kontron-n801x-som.dtsi | 1 +
 arch/arm64/boot/dts/freescale/imx8mq-evk.dts | 2 +
 arch/arm64/boot/dts/qcom/pm8150.dtsi | 4 +-
 arch/arm64/boot/dts/qcom/qrb5165-rb5.dts | 10 +
 arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi | 9 +-
 arch/arm64/boot/dts/qcom/sc7280.dtsi | 6 +-
 arch/arm64/boot/dts/qcom/sdm630.dtsi | 15 +-
 arch/arm64/boot/dts/qcom/sdm845.dtsi | 21 +-
 .../boot/dts/qcom/sdm850-lenovo-yoga-c630.dts | 34 +
 arch/arm64/configs/bcm2711_defconfig | 2 +
 arch/arm64/configs/bcmrpi3_defconfig | 1 +
 arch/arm64/configs/defconfig | 1 -
 arch/arm64/kvm/hyp/nvhe/Makefile | 2 +-
 arch/arm64/kvm/perf.c | 3 -
 arch/arm64/kvm/pmu-emul.c | 9 +-
 arch/arm64/mm/hugetlbpage.c | 2 +-
 arch/csky/Kconfig | 3 +-
 arch/csky/include/asm/bitops.h | 1 -
 arch/csky/kernel/ptrace.c | 3 +-
 arch/csky/kernel/signal.c | 4 +
 arch/ia64/Kconfig | 2 -
 arch/m68k/68000/entry.S | 4 -
 arch/m68k/Kconfig | 1 -
 arch/m68k/coldfire/entry.S | 4 -
 arch/m68k/include/asm/processor.h | 31 +-
 arch/m68k/include/asm/thread_info.h | 3 -
 arch/m68k/include/asm/tlbflush.h | 11 +-
 arch/m68k/include/asm/traps.h | 4 +
 arch/m68k/include/asm/uaccess.h | 215 +-
 arch/m68k/kernel/entry.S | 58 +-
 arch/m68k/kernel/process.c | 4 +-
 arch/m68k/kernel/signal.c | 197 +-
 arch/m68k/kernel/traps.c | 13 +-
 arch/m68k/mac/misc.c | 1 -
 arch/m68k/mm/cache.c | 25 +-
 arch/m68k/mm/init.c | 6 -
 arch/m68k/mm/kmap.c | 1 -
 arch/m68k/mm/memory.c | 1 -
 arch/m68k/sun3/config.c | 3 +-
 arch/m68k/sun3/mmu_emu.c | 6 +-
 arch/m68k/sun3/sun3ints.c | 1 -
 arch/m68k/sun3x/prom.c | 1 -
 arch/mips/Kconfig | 2 -
 arch/mips/include/asm/mips-cps.h | 23 +-
 arch/mips/net/bpf_jit.c | 57 +-
 arch/nios2/Kconfig.debug | 3 +-
 arch/nios2/kernel/setup.c | 2 -
 arch/parisc/Kconfig | 2 -
 arch/powerpc/include/asm/book3s/32/kup.h | 8 +
 arch/powerpc/include/asm/code-patching.h | 1 +
 arch/powerpc/include/asm/interrupt.h | 18 +-
 arch/powerpc/include/asm/security_features.h | 5 +
 arch/powerpc/kernel/dma-iommu.c | 9 +
 arch/powerpc/kernel/exceptions-64s.S | 25 +-
 arch/powerpc/kernel/irq.c | 6 +
 arch/powerpc/kernel/security.c | 5 +
 arch/powerpc/kernel/traps.c | 43 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 28 +-
 arch/powerpc/lib/code-patching.c | 7 +-
 arch/powerpc/net/bpf_jit.h | 33 +-
 arch/powerpc/net/bpf_jit64.h | 8 +-
 arch/powerpc/net/bpf_jit_comp.c | 6 +-
 arch/powerpc/net/bpf_jit_comp32.c | 16 +-
 arch/powerpc/net/bpf_jit_comp64.c | 100 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 4 +
 arch/powerpc/platforms/pseries/msi.c | 15 +
 arch/powerpc/sysdev/xive/common.c | 3 +-
 arch/riscv/Kconfig | 2 -
 arch/riscv/include/asm/syscall.h | 1 +
 arch/riscv/include/asm/vdso.h | 18 +-
 arch/riscv/include/uapi/asm/unistd.h | 3 +-
 arch/riscv/kernel/syscall_table.c | 1 -
 arch/riscv/kernel/vdso.c | 57 +-
 arch/riscv/kernel/vdso/vdso.lds.S | 3 +-
 arch/riscv/mm/cacheflush.c | 2 +
 arch/s390/include/asm/pci.h | 2 +
 arch/s390/kvm/interrupt.c | 4 +-
 arch/s390/kvm/kvm-s390.c | 2 +-
 arch/s390/kvm/kvm-s390.h | 2 +-
 arch/s390/lib/string.c | 13 +-
 arch/s390/net/bpf_jit_comp.c | 2 +-
 arch/s390/pci/pci.c | 45 +-
 arch/s390/pci/pci_event.c | 4 +-
 arch/x86/Kconfig | 5 +-
 arch/x86/crypto/sm4-aesni-avx-asm_64.S | 5 +-
 arch/x86/events/core.c | 1 +
 arch/x86/events/intel/core.c | 1 +
 arch/x86/events/msr.c | 1 +
 arch/x86/hyperv/hv_apic.c | 20 +-
 arch/x86/include/asm/entry-common.h | 2 +-
 arch/x86/include/asm/kvm_page_track.h | 2 +-
 arch/x86/include/asm/kvmclock.h | 14 +
 arch/x86/include/asm/xen/pci.h | 11 +-
 arch/x86/kernel/cpu/common.c | 1 +
 arch/x86/kernel/cpu/resctrl/core.c | 6 +-
 arch/x86/kernel/early-quirks.c | 6 -
 arch/x86/kernel/fpu/signal.c | 11 +-
 arch/x86/kernel/hpet.c | 81 +
 arch/x86/kernel/kvmclock.c | 13 +-
 arch/x86/kernel/sev-shared.c | 2 +
 arch/x86/kvm/emulate.c | 3 +-
 arch/x86/kvm/hyperv.c | 7 +-
 arch/x86/kvm/hyperv.h | 2 +-
 arch/x86/kvm/ioapic.c | 10 +-
 arch/x86/kvm/mmu/mmu.c | 17 +-
 arch/x86/kvm/mmu/page_track.c | 4 +-
 arch/x86/kvm/mmu/paging_tmpl.h | 46 +-
 arch/x86/kvm/svm/nested.c | 10 +-
 arch/x86/kvm/svm/sev.c | 92 +-
 arch/x86/kvm/svm/svm.c | 137 +-
 arch/x86/kvm/svm/svm.h | 3 +-
 arch/x86/kvm/vmx/evmcs.c | 12 +-
 arch/x86/kvm/vmx/nested.c | 24 +-
 arch/x86/kvm/vmx/vmx.c | 39 +-
 arch/x86/kvm/vmx/vmx.h | 5 +-
 arch/x86/kvm/x86.c | 28 +-
 arch/x86/net/bpf_jit_comp.c | 66 +-
 arch/x86/pci/xen.c | 15 +-
 arch/x86/platform/olpc/olpc.c | 2 +-
 arch/x86/platform/pvh/enlighten.c | 12 +-
 arch/x86/xen/Kconfig | 19 +-
 arch/x86/xen/Makefile | 2 +-
 arch/x86/xen/enlighten.c | 54 +-
 arch/x86/xen/enlighten_pv.c | 35 +-
 arch/x86/xen/enlighten_pvh.c | 10 +-
 arch/x86/xen/mmu_pv.c | 2 +-
 arch/x86/xen/xen-ops.h | 5 +-
 arch/xtensa/include/asm/kmem_layout.h | 2 +-
 arch/xtensa/kernel/irq.c | 2 +-
 arch/xtensa/kernel/setup.c | 12 +-
 arch/xtensa/mm/mmu.c | 2 +-
 arch/xtensa/platforms/xtfpga/setup.c | 12 +-
 block/Kconfig | 68 +-
 block/Makefile | 11 +-
 block/bdev.c | 1058 ++++++++++
 block/bfq-cgroup.c | 8 +
 block/bfq-iosched.c | 979 +++++++--
 block/bfq-iosched.h | 50 +-
 block/bfq-wf2q.c | 17 +-
 block/bio-integrity.c | 57 +-
 block/bio.c | 900 ++++----
 block/blk-cgroup.c | 301 ++-
 block/blk-core.c | 349 ++-
 block/blk-crypto-fallback.c | 20 +-
 block/blk-crypto.c | 5 +-
 block/blk-exec.c | 37 +-
 block/blk-flush.c | 62 +-
 block/blk-integrity.c | 21 +-
 block/blk-iocost.c | 355 ++--
 block/blk-iolatency.c | 38 +-
 block/blk-lib.c | 5 +-
 block/blk-map.c | 129 +-
 block/blk-merge.c | 72 +-
 block/blk-mq-debugfs.c | 26 +-
 block/blk-mq-sched.c | 146 +-
 block/blk-mq-sched.h | 6 +-
 block/blk-mq-sysfs.c | 55 -
 block/blk-mq-tag.c | 77 +-
 block/blk-mq-tag.h | 9 +-
 block/blk-mq.c | 492 +++--
 block/blk-mq.h | 47 +-
 block/blk-pm.h | 38 -
 block/blk-rq-qos.h | 14 +-
 block/blk-settings.c | 152 +-
 block/blk-sysfs.c | 102 +-
 block/blk-throttle.c | 41 +-
 block/blk-wbt.c | 14 +-
 block/blk-zoned.c | 166 +-
 block/blk.h | 171 +-
 block/bounce.c | 193 +-
 block/bsg-lib.c | 98 +-
 block/bsg.c | 466 +---
 block/disk-events.c | 69 +-
 block/elevator.c | 25 +-
 block/fops.c | 639 ++++++
 block/genhd.c | 1873 ++++-------------
 block/holder.c | 174 ++
 block/ioctl.c | 92 +-
 block/ioprio.c | 20 +-
 block/keyslot-manager.c | 175 ++
 block/kyber-iosched.c | 12 +-
 block/mq-deadline.c | 648 ++++--
 block/partitions/Kconfig | 1 -
 block/partitions/acorn.c | 4 +-
 block/partitions/aix.c | 20 +-
 block/partitions/amiga.c | 7 +-
 block/partitions/atari.c | 4 +-
 block/partitions/check.h | 2 +-
 block/partitions/cmdline.c | 275 ++-
 block/partitions/core.c | 457 ++--
 block/partitions/efi.c | 50 +-
 block/partitions/ibm.c | 4 +-
 block/partitions/ldm.c | 20 +-
 block/partitions/mac.c | 2 +-
 block/partitions/msdos.c | 8 +-
 block/partitions/sgi.c | 5 +-
 block/partitions/sun.c | 5 +-
 block/t10-pi.c | 16 +-
 drivers/Kconfig | 2 +
 drivers/acpi/arm64/gtdt.c | 2 +-
 drivers/acpi/nfit/core.c | 12 +
 drivers/acpi/x86/s2idle.c | 3 +-
 drivers/ata/libahci_platform.c | 5 +-
 drivers/ata/pata_legacy.c | 6 +-
 drivers/base/core.c | 93 +-
 drivers/base/test/Makefile | 4 +-
 drivers/block/brd.c | 44 +-
 drivers/block/nbd.c | 29 +-
 drivers/block/rnbd/rnbd-clt-sysfs.c | 4 +-
 drivers/block/virtio_blk.c | 37 +-
 drivers/bus/Kconfig | 12 -
 drivers/bus/Makefile | 2 +-
 drivers/bus/simple-pm-bus.c | 42 +-
 drivers/bus/ti-sysc.c | 4 +
 drivers/char/broadcom/vcio.c | 135 +-
 drivers/clk/qcom/Kconfig | 1 +
 drivers/clk/renesas/r9a07g044-cpg.c | 2 +
 drivers/clk/renesas/rzg2l-cpg.c | 2 +-
 drivers/clk/socfpga/clk-agilex.c | 9 -
 drivers/crypto/ccp/ccp-ops.c | 14 +-
 drivers/firmware/Kconfig | 5 +-
 drivers/firmware/arm_ffa/bus.c | 10 +-
 drivers/firmware/arm_scmi/Kconfig | 2 +-
 drivers/firmware/arm_scmi/virtio.c | 44 +-
 drivers/firmware/efi/cper.c | 4 +-
 drivers/firmware/efi/libstub/fdt.c | 2 +-
 drivers/firmware/efi/runtime-wrappers.c | 2 +-
 drivers/fpga/ice40-spi.c | 7 +
 drivers/gpio/Kconfig | 8 +
 drivers/gpio/Makefile | 1 +
 drivers/gpio/gpio-74x164.c | 8 +
 drivers/gpio/gpio-bcm-virt.c | 2 +-
 drivers/gpio/gpio-mockup.c | 21 +-
 drivers/gpio/gpio-pca953x.c | 27 +-
 drivers/gpio/gpio-pwm.c | 144 ++
 drivers/gpio/gpio-rockchip.c | 22 +
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 31 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 14 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 3 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 8 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 +-
 drivers/gpu/drm/amd/display/Kconfig | 2 +
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +
 .../gpu/drm/amd/display/dc/core/dc_link_dp.c | 21 +-
 .../amd/display/dc/dcn10/dcn10_link_encoder.h | 1 +
 .../display/dc/dcn31/dcn31_dio_link_encoder.c | 66 +-
 .../display/dc/dcn31/dcn31_dio_link_encoder.h | 14 +-
 .../drm/amd/display/dc/dcn31/dcn31_resource.c | 8 +-
 .../gpu/drm/amd/display/include/dal_asic_id.h | 2 +-
 .../include/asic_reg/dpcs/dpcs_4_2_0_offset.h | 27 +
 drivers/gpu/drm/drm_edid.c | 15 +-
 drivers/gpu/drm/drm_fb_helper.c | 6 +
 drivers/gpu/drm/exynos/exynos5433_drm_decon.c | 4 +-
 drivers/gpu/drm/exynos/exynos_drm_dsi.c | 4 +-
 drivers/gpu/drm/exynos/exynos_drm_fimc.c | 5 +-
 drivers/gpu/drm/exynos/exynos_drm_fimd.c | 4 +-
 drivers/gpu/drm/exynos/exynos_drm_g2d.c | 5 +-
 drivers/gpu/drm/exynos/exynos_drm_gsc.c | 6 +-
 drivers/gpu/drm/exynos/exynos_drm_rotator.c | 4 +-
 drivers/gpu/drm/exynos/exynos_drm_scaler.c | 4 +-
 drivers/gpu/drm/exynos/exynos_hdmi.c | 4 +-
 drivers/gpu/drm/hyperv/hyperv_drm.h | 1 +
 drivers/gpu/drm/hyperv/hyperv_drm_modeset.c | 1 +
 drivers/gpu/drm/hyperv/hyperv_drm_proto.c | 54 +-
 drivers/gpu/drm/i915/display/icl_dsi.c | 10 +-
 drivers/gpu/drm/i915/display/intel_acpi.c | 7 +-
 drivers/gpu/drm/i915/display/intel_audio.c | 5 +-
 drivers/gpu/drm/i915/display/intel_bios.c | 22 +-
 drivers/gpu/drm/i915/display/intel_ddi.c | 8 +-
 drivers/gpu/drm/i915/display/intel_display.c | 20 +-
 drivers/gpu/drm/i915/display/intel_vbt_defs.h | 5 +
 drivers/gpu/drm/i915/gem/i915_gem_context.c | 5 +-
 drivers/gpu/drm/i915/gem/i915_gem_shrinker.c | 7 +-
 drivers/gpu/drm/i915/gt/intel_context.c | 6 +-
 drivers/gpu/drm/i915/gt/intel_rps.c | 2 -
 drivers/gpu/drm/i915/gvt/scheduler.c | 4 +-
 drivers/gpu/drm/i915/i915_reg.h | 5 +
 drivers/gpu/drm/i915/i915_request.c | 11 +-
 drivers/gpu/drm/i915/intel_pm.c | 12 +
 drivers/gpu/drm/kmb/kmb_drv.c | 8 +-
 drivers/gpu/drm/kmb/kmb_drv.h | 5 +
 drivers/gpu/drm/kmb/kmb_plane.c | 81 +-
 drivers/gpu/drm/kmb/kmb_plane.h | 5 +-
 drivers/gpu/drm/kmb/kmb_regs.h | 3 +
 drivers/gpu/drm/mediatek/mtk_drm_crtc.c | 157 +-
 drivers/gpu/drm/msm/Kconfig | 4 +-
 drivers/gpu/drm/msm/adreno/a3xx_gpu.c | 9 +-
 drivers/gpu/drm/msm/adreno/a4xx_gpu.c | 9 +-
 drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 6 +
 drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 3 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 46 +-
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 11 +-
 drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c | 16 +
 drivers/gpu/drm/msm/dp/dp_display.c | 10 +-
 drivers/gpu/drm/msm/dsi/dsi.c | 4 +-
 drivers/gpu/drm/msm/dsi/dsi_host.c | 2 +-
 drivers/gpu/drm/msm/dsi/phy/dsi_phy_14nm.c | 30 +-
 .../gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c | 4 +-
 drivers/gpu/drm/msm/edp/edp_ctrl.c | 3 +-
 drivers/gpu/drm/msm/msm_drv.c | 15 +-
 drivers/gpu/drm/msm/msm_drv.h | 47 +-
 drivers/gpu/drm/msm/msm_gem_submit.c | 7 +-
 drivers/gpu/drm/msm/msm_gpu.h | 66 +-
 drivers/gpu/drm/msm/msm_gpu_devfreq.c | 6 +
 drivers/gpu/drm/msm/msm_submitqueue.c | 72 +-
 drivers/gpu/drm/nouveau/dispnv50/crc.c | 1 +
 drivers/gpu/drm/nouveau/include/nvif/class.h | 2 +
 .../drm/nouveau/include/nvkm/engine/fifo.h | 1 +
 drivers/gpu/drm/nouveau/nouveau_bo.c | 1 +
 drivers/gpu/drm/nouveau/nouveau_chan.c | 6 +-
 drivers/gpu/drm/nouveau/nouveau_debugfs.c | 1 +
 drivers/gpu/drm/nouveau/nouveau_drm.c | 4 +
 drivers/gpu/drm/nouveau/nouveau_gem.c | 4 +-
 drivers/gpu/drm/nouveau/nv84_fence.c | 2 +-
 .../gpu/drm/nouveau/nvkm/engine/device/base.c | 3 +
 .../gpu/drm/nouveau/nvkm/engine/fifo/Kbuild | 1 +
 .../drm/nouveau/nvkm/engine/fifo/chang84.c | 2 +-
 .../gpu/drm/nouveau/nvkm/engine/fifo/ga102.c | 311 +++
 .../gpu/drm/nouveau/nvkm/subdev/top/ga100.c | 7 +-
 drivers/gpu/drm/panel/Kconfig | 1 +
 drivers/gpu/drm/r128/ati_pcigart.c | 2 +-
 drivers/gpu/drm/rcar-du/rcar_du_encoder.c | 16 +-
 drivers/gpu/drm/rcar-du/rcar_lvds.c | 11 +
 drivers/gpu/drm/rcar-du/rcar_lvds.h | 5 +
 drivers/gpu/drm/rockchip/rockchip_drm_vop.c | 26 +-
 drivers/gpu/drm/sun4i/sun8i_dw_hdmi.c | 7 +-
 drivers/gpu/drm/sun4i/sun8i_dw_hdmi.h | 4 +-
 drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c | 97 +-
 drivers/gpu/drm/tegra/dc.c | 3 -
 drivers/gpu/drm/tegra/dc.h | 6 -
 drivers/gpu/drm/tegra/uapi.c | 2 +-
 drivers/gpu/drm/vc4/vc4_hdmi.c | 8 -
 drivers/gpu/host1x/fence.c | 6 +-
 drivers/hid/hid-apple.c | 7 +
 drivers/hid/hid-betopff.c | 13 +-
 drivers/hid/hid-u2fzero.c | 4 +-
 drivers/hid/wacom_wac.c | 8 +
 drivers/hwmon/k10temp.c | 6 -
 drivers/hwmon/ltc2947-core.c | 8 +-
 drivers/hwmon/mlxreg-fan.c | 12 +-
 drivers/hwmon/occ/common.c | 17 +-
 drivers/hwmon/pmbus/ibm-cffps.c | 10 +-
 drivers/hwmon/pmbus/mp2975.c | 2 +-
 drivers/hwmon/tmp421.c | 73 +-
 drivers/hwmon/w83791d.c | 29 +-
 drivers/hwmon/w83792d.c | 28 +-
 drivers/hwmon/w83793.c | 26 +-
 drivers/i2c/busses/i2c-mlxcpld.c | 4 +-
 drivers/i2c/busses/i2c-mt65xx.c | 11 +-
 drivers/i2c/i2c-core-acpi.c | 1 +
 drivers/iio/accel/fxls8962af-core.c | 2 +-
 drivers/iio/adc/ad7192.c | 1 +
 drivers/iio/adc/ad7780.c | 2 +-
 drivers/iio/adc/ad7793.c | 2 +-
 drivers/iio/adc/aspeed_adc.c | 1 +
 drivers/iio/adc/max1027.c | 3 +-
 drivers/iio/adc/mt6577_auxadc.c | 8 +
 drivers/iio/adc/rzg2l_adc.c | 6 +-
 drivers/iio/adc/ti-adc128s052.c | 6 +
 drivers/iio/common/ssp_sensors/ssp_spi.c | 11 +-
 drivers/iio/dac/ti-dac5571.c | 1 +
 drivers/iio/imu/adis16475.c | 3 +-
 drivers/iio/imu/adis16480.c | 14 +-
 drivers/iio/light/opt3001.c | 6 +-
 drivers/iio/test/Makefile | 1 +
 drivers/infiniband/core/cma.c | 51 +-
 drivers/infiniband/core/cma_priv.h | 1 +
 drivers/infiniband/hw/hfi1/ipoib_tx.c | 8 +-
 drivers/infiniband/hw/hns/hns_roce_cq.c | 31 +-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 13 +-
 drivers/infiniband/hw/irdma/cm.c | 4 +-
 drivers/infiniband/hw/irdma/hw.c | 14 +-
 drivers/infiniband/hw/irdma/i40iw_if.c | 2 +-
 drivers/infiniband/hw/irdma/main.h | 1 -
 drivers/infiniband/hw/irdma/user.h | 2 +
 drivers/infiniband/hw/irdma/utils.c | 2 +-
 drivers/infiniband/hw/irdma/verbs.c | 9 +-
 drivers/infiniband/hw/usnic/usnic_ib.h | 2 +-
 drivers/infiniband/hw/usnic/usnic_ib_main.c | 2 +-
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 16 +-
 drivers/input/joystick/xpad.c | 2 +
 drivers/input/keyboard/snvs_pwrkey.c | 29 +
 drivers/input/touchscreen.c | 42 +-
 .../input/touchscreen/resistive-adc-touch.c | 29 +-
 drivers/interconnect/qcom/sdm660.c | 25 +-
 drivers/iommu/Kconfig | 11 +-
 drivers/iommu/apple-dart.c | 56 +-
 drivers/iommu/arm/arm-smmu/Makefile | 3 +-
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 3 +-
 drivers/iommu/intel/dmar.c | 6 +-
 drivers/ipack/devices/ipoctal.c | 63 +-
 drivers/isdn/capi/kcapi.c | 5 +
 drivers/md/dm-clone-target.c | 2 +-
 drivers/md/dm-rq.c | 8 +
 drivers/md/dm-verity-target.c | 15 +-
 drivers/md/dm.c | 17 +-
 drivers/media/platform/Kconfig | 2 +-
 drivers/media/platform/s5p-jpeg/jpeg-core.c | 18 +-
 drivers/media/platform/s5p-jpeg/jpeg-core.h | 28 +-
 drivers/media/rc/ir_toy.c | 21 +-
 drivers/misc/Kconfig | 1 +
 drivers/misc/cb710/sgbuf2.c | 2 +-
 drivers/misc/eeprom/at25.c | 8 +
 drivers/misc/eeprom/eeprom_93xx46.c | 18 +
 drivers/misc/fastrpc.c | 2 +
 drivers/misc/gehc-achc.c | 1 +
 .../habanalabs/common/command_submission.c | 33 +-
 drivers/misc/mei/hbm.c | 12 +-
 drivers/misc/mei/hw-me-regs.h | 1 +
 drivers/misc/mei/pci-me.c | 1 +
 drivers/mmc/host/Kconfig | 2 +-
 drivers/mmc/host/dw_mmc.c | 15 +-
 drivers/mmc/host/meson-gx-mmc.c | 73 +-
 drivers/mmc/host/renesas_sdhi_core.c | 2 +
 drivers/mmc/host/sdhci-of-at91.c | 22 +-
 drivers/mtd/nand/raw/qcom_nandc.c | 8 +-
 drivers/net/dsa/microchip/ksz_common.c | 4 +-
 drivers/net/dsa/mv88e6xxx/chip.c | 142 +-
 drivers/net/dsa/mv88e6xxx/chip.h | 10 +
 drivers/net/dsa/mv88e6xxx/global1.c | 2 +
 drivers/net/dsa/mv88e6xxx/port.c | 23 +
 drivers/net/dsa/mv88e6xxx/port.h | 2 +
 drivers/net/dsa/ocelot/felix.c | 149 +-
 drivers/net/dsa/ocelot/felix.h | 1 +
 drivers/net/dsa/sja1105/sja1105_main.c | 3 +-
 drivers/net/dsa/sja1105/sja1105_ptp.c | 45 +-
 drivers/net/dsa/sja1105/sja1105_ptp.h | 19 -
 drivers/net/ethernet/Kconfig | 1 +
 drivers/net/ethernet/arc/Kconfig | 1 +
 .../net/ethernet/broadcom/bgmac-platform.c | 3 +
 .../net/ethernet/freescale/enetc/enetc_pf.c | 3 +-
 drivers/net/ethernet/google/gve/gve.h | 2 +-
 drivers/net/ethernet/google/gve/gve_main.c | 45 +-
 drivers/net/ethernet/google/gve/gve_rx.c | 8 +-
 drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 -
 .../net/ethernet/hisilicon/hns3/hns3_enet.c | 16 +-
 .../ethernet/hisilicon/hns3/hns3_ethtool.c | 6 +-
 .../hisilicon/hns3/hns3pf/hclge_cmd.c | 21 +-
 .../hisilicon/hns3/hns3pf/hclge_dcb.c | 29 +-
 .../hisilicon/hns3/hns3pf/hclge_debugfs.c | 28 +-
 .../hisilicon/hns3/hns3pf/hclge_main.c | 27 +-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 33 +-
 drivers/net/ethernet/hisilicon/hns_mdio.c | 2 +-
 drivers/net/ethernet/ibm/ibmvnic.c | 8 -
 drivers/net/ethernet/intel/e100.c | 22 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c | 5 +-
 drivers/net/ethernet/intel/iavf/iavf_main.c | 1 -
 drivers/net/ethernet/intel/ice/ice_ptp.c | 15 +-
 .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 +-
 drivers/net/ethernet/mellanox/mlx5/core/cq.c | 7 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h | 12 +-
 .../mellanox/mlx5/core/en/hv_vhca_stats.c | 6 +-
 .../net/ethernet/mellanox/mlx5/core/en/ptp.c | 3 +-
 .../net/ethernet/mellanox/mlx5/core/en/ptp.h | 2 +
 .../mellanox/mlx5/core/en/rep/bridge.c | 8 +-
 .../ethernet/mellanox/mlx5/core/en_ethtool.c | 11 +
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 239 ++-
 .../net/ethernet/mellanox/mlx5/core/en_rep.c | 9 +-
 .../net/ethernet/mellanox/mlx5/core/en_rx.c | 7 +-
 .../ethernet/mellanox/mlx5/core/en_stats.c | 11 +-
 .../mellanox/mlx5/core/esw/acl/egress_lgcy.c | 12 +-
 .../mellanox/mlx5/core/esw/acl/ingress_lgcy.c | 4 +-
 .../ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 4 +-
 .../ethernet/mellanox/mlx5/core/lib/clock.c | 37 +-
 .../net/ethernet/mellanox/mlx5/core/pci_irq.c | 9 +-
 .../ethernet/mellanox/mlxsw/core_thermal.c | 52 +-
 drivers/net/ethernet/micrel/Makefile | 6 +-
 drivers/net/ethernet/micrel/ks8851_common.c | 8 +
 .../ethernet/microchip/encx24j600-regmap.c | 10 +-
 drivers/net/ethernet/microchip/encx24j600.c | 5 +-
 .../net/ethernet/microchip/encx24j600_hw.h | 4 +-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 4 +-
 drivers/net/ethernet/mscc/ocelot.c | 111 +-
 drivers/net/ethernet/mscc/ocelot_net.c | 3 +-
 drivers/net/ethernet/mscc/ocelot_vcap.c | 4 +-
 drivers/net/ethernet/neterion/s2io.c | 2 +-
 .../net/ethernet/netronome/nfp/flower/main.c | 19 +-
 .../net/ethernet/pensando/ionic/ionic_lif.c | 8 +-
 .../ethernet/pensando/ionic/ionic_rx_filter.c | 3 -
 .../net/ethernet/pensando/ionic/ionic_stats.c | 9 -
 drivers/net/ethernet/qlogic/qed/qed_main.c | 1 +
 .../ethernet/stmicro/stmmac/dwmac-generic.c | 1 +
 .../net/ethernet/stmicro/stmmac/dwmac-rk.c | 5 +
 .../ethernet/stmicro/stmmac/dwmac1000_dma.c | 13 +-
 .../net/ethernet/stmicro/stmmac/dwmac4_dma.c | 6 +-
 .../ethernet/stmicro/stmmac/dwxgmac2_dma.c | 6 +-
 drivers/net/ethernet/stmicro/stmmac/hwif.h | 6 +-
 .../net/ethernet/stmicro/stmmac/stmmac_main.c | 10 +-
 .../ethernet/stmicro/stmmac/stmmac_platform.c | 8 +
 drivers/net/ethernet/sun/Kconfig | 1 +
 drivers/net/hamradio/Kconfig | 1 +
 drivers/net/ipa/Kconfig | 1 +
 drivers/net/mdio/mdio-ipq4019.c | 6 +-
 drivers/net/mdio/mdio-mscc-miim.c | 15 +-
 drivers/net/mhi_net.c | 6 +-
 drivers/net/pcs/pcs-xpcs.c | 45 +-
 drivers/net/phy/bcm7xxx.c | 114 +-
 drivers/net/phy/mdio_bus.c | 11 +
 drivers/net/phy/mxl-gpy.c | 23 +-
 drivers/net/phy/phy_device.c | 3 +
 drivers/net/usb/Kconfig | 4 +
 drivers/net/usb/r8152.c | 16 +-
 drivers/net/usb/smsc95xx.c | 3 +
 drivers/net/virtio_net.c | 2 +-
 drivers/net/wireless/ath/ath10k/Kconfig | 2 +-
 drivers/net/wireless/ath/ath5k/Kconfig | 4 +-
 drivers/net/wireless/ath/ath5k/led.c | 10 +-
 .../broadcom/brcm80211/brcmfmac/cfg80211.c | 17 +-
 drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 5 +-
 .../wireless/intel/iwlwifi/mvm/time-event.c | 3 +-
 drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 2 +
 drivers/net/wireless/mac80211_hwsim.c | 4 +-
 drivers/net/wireless/marvell/mwifiex/sta_tx.c | 4 +-
 .../net/wireless/marvell/mwifiex/uap_txrx.c | 4 +-
 drivers/nvdimm/pmem.c | 5 +-
 drivers/nvme/host/core.c | 25 +-
 drivers/nvme/host/multipath.c | 2 -
 drivers/nvme/host/nvme.h | 6 +
 drivers/nvme/host/pci.c | 5 +-
 drivers/nvmem/core.c | 3 +-
 drivers/of/base.c | 1 +
 drivers/pci/controller/pci-hyperv.c | 13 +-
 drivers/pci/hotplug/s390_pci_hpc.c | 9 +-
 drivers/pci/msi.c | 18 +-
 drivers/pci/pci-acpi.c | 3 +
 drivers/perf/arm_pmu.c | 2 +
 drivers/pinctrl/core.c | 2 +-
 drivers/pinctrl/pinctrl-amd.c | 19 +-
 drivers/pinctrl/pinctrl-amd.h | 1 +
 drivers/pinctrl/pinctrl-rockchip.c | 67 +
 drivers/pinctrl/pinctrl-rockchip.h | 10 +
 drivers/pinctrl/qcom/Kconfig | 3 +-
 drivers/pinctrl/qcom/pinctrl-sc7280.c | 1 +
 drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 37 +-
 drivers/platform/mellanox/mlxreg-io.c | 4 +-
 drivers/platform/x86/amd-pmc.c | 1 +
 drivers/platform/x86/dell/Kconfig | 1 +
 drivers/platform/x86/gigabyte-wmi.c | 1 +
 .../platform/x86/intel/int1092/intel_sar.c | 21 +-
 .../int3472/intel_skl_int3472_discrete.c | 2 +-
 drivers/platform/x86/intel_scu_ipc.c | 6 +-
 drivers/ptp/ptp_kvm_x86.c | 9 +-
 drivers/ptp/ptp_pch.c | 1 +
 drivers/rtc/rtc-pcf85063.c | 2 +
 drivers/s390/cio/blacklist.c | 8 +-
 drivers/s390/cio/css.c | 40 +-
 drivers/s390/cio/css.h | 10 +-
 drivers/s390/crypto/vfio_ap_ops.c | 4 +-
 drivers/scsi/arm/acornscsi.c | 2 +-
 drivers/scsi/csiostor/csio_init.c | 1 +
 drivers/scsi/elx/efct/efct_scsi.c | 3 +-
 drivers/scsi/libiscsi.c | 15 +-
 drivers/scsi/lpfc/lpfc_sli.c | 11 +-
 drivers/scsi/qla2xxx/qla_isr.c | 4 +-
 drivers/scsi/ufs/ufshcd.c | 55 +-
 drivers/scsi/ufs/ufshcd.h | 1 +
 drivers/scsi/virtio_scsi.c | 4 +-
 drivers/soc/canaan/Kconfig | 1 -
 drivers/soc/qcom/mdt_loader.c | 2 +-
 drivers/soc/qcom/socinfo.c | 2 +-
 drivers/soc/ti/omap_prm.c | 29 +-
 drivers/spi/spi-atmel.c | 4 +-
 drivers/spi/spi-bcm-qspi.c | 77 +-
 drivers/spi/spi-mt65xx.c | 64 +-
 drivers/spi/spi-mux.c | 7 +
 drivers/spi/spi-nxp-fspi.c | 26 +-
 drivers/spi/spi-tegra20-slink.c | 4 +-
 drivers/spi/spi.c | 27 +-
 drivers/spi/spidev.c | 14 +
 .../hive_isp_css_common/host/input_system.c | 2 +
 drivers/staging/media/hantro/hantro_drv.c | 2 +-
 .../staging/media/sunxi/cedrus/cedrus_video.c | 2 +-
 drivers/staging/r8188eu/hal/hal_intf.c | 2 +-
 .../interface/vchiq_arm/vchiq_arm.c | 2 +-
 drivers/tee/optee/core.c | 3 +
 drivers/tee/optee/device.c | 22 +
 drivers/tee/optee/optee_private.h | 1 +
 drivers/tee/optee/shm_pool.c | 2 +-
 drivers/thunderbolt/Makefile | 1 +
 drivers/tty/hvc/hvc_xen.c | 13 +-
 drivers/tty/serial/8250/Kconfig | 8 +-
 drivers/usb/chipidea/ci_hdrc_imx.c | 15 +-
 drivers/usb/class/cdc-acm.c | 8 +
 drivers/usb/class/cdc-wdm.c | 6 +-
 drivers/usb/common/Kconfig | 3 +-
 drivers/usb/dwc3/gadget.c | 2 +-
 drivers/usb/gadget/function/f_uac2.c | 14 +-
 drivers/usb/host/dwc_otg/dwc_otg_hcd_queue.c | 2 +-
 drivers/usb/host/ohci-omap.c | 72 +-
 drivers/usb/host/xhci-dbgtty.c | 28 +-
 drivers/usb/host/xhci-pci.c | 6 +-
 drivers/usb/host/xhci-ring.c | 39 +-
 drivers/usb/host/xhci-tegra.c | 12 +-
 drivers/usb/host/xhci.h | 1 +
 drivers/usb/musb/musb_dsps.c | 4 +-
 drivers/usb/serial/option.c | 8 +
 drivers/usb/serial/qcserial.c | 1 +
 drivers/usb/typec/tcpm/tcpci.c | 2 +-
 drivers/usb/typec/tcpm/tcpm.c | 1 +
 drivers/usb/typec/tipd/core.c | 8 +-
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 5 +
 drivers/vdpa/vdpa_user/vduse_dev.c | 10 +-
 drivers/vfio/pci/vfio_pci_core.c | 2 +-
 drivers/vhost/vdpa.c | 12 +-
 drivers/video/fbdev/Kconfig | 5 +-
 drivers/video/fbdev/gbefb.c | 2 +-
 drivers/virtio/virtio.c | 18 +-
 drivers/watchdog/Kconfig | 2 +-
 drivers/xen/Kconfig | 4 +-
 drivers/xen/balloon.c | 21 +-
 drivers/xen/privcmd.c | 18 +-
 fs/9p/cache.c | 8 +-
 fs/9p/fid.c | 14 +-
 fs/9p/v9fs.c | 8 +-
 fs/9p/vfs_addr.c | 14 +-
 fs/9p/vfs_file.c | 33 +-
 fs/9p/vfs_inode.c | 24 +-
 fs/9p/vfs_inode_dotl.c | 11 +-
 fs/afs/dir_silly.c | 4 +-
 fs/afs/write.c | 3 +-
 fs/binfmt_elf.c | 2 +-
 fs/btrfs/ctree.h | 2 +-
 fs/btrfs/dir-item.c | 48 +-
 fs/btrfs/extent-tree.c | 1 +
 fs/btrfs/file.c | 19 +-
 fs/btrfs/tree-log.c | 79 +-
 fs/debugfs/inode.c | 2 +-
 fs/ext4/dir.c | 6 +-
 fs/ext4/ext4.h | 3 -
 fs/ext4/extents.c | 19 +-
 fs/ext4/fast_commit.c | 6 +
 fs/ext4/inline.c | 150 +-
 fs/ext4/inode.c | 178 +-
 fs/ext4/super.c | 21 +-
 fs/fscache/object.c | 2 +-
 fs/fscache/operation.c | 3 +
 fs/io-wq.c | 5 +-
 fs/io_uring.c | 19 +-
 fs/kernfs/dir.c | 18 +-
 fs/ksmbd/auth.c | 205 --
 fs/ksmbd/crypto_ctx.c | 16 -
 fs/ksmbd/crypto_ctx.h | 8 -
 fs/ksmbd/misc.c | 17 +-
 fs/ksmbd/oplock.c | 41 +-
 fs/ksmbd/smb2misc.c | 98 +-
 fs/ksmbd/smb2ops.c | 5 -
 fs/ksmbd/smb2pdu.c | 313 ++-
 fs/ksmbd/smb2pdu.h | 10 +-
 fs/ksmbd/smb_common.c | 59 +-
 fs/ksmbd/smb_common.h | 10 +-
 fs/ksmbd/smbacl.c | 21 +-
 fs/ksmbd/transport_tcp.c | 4 +-
 fs/netfs/read_helper.c | 2 +-
 fs/nfs_common/grace.c | 1 -
 fs/nfsd/filecache.c | 2 +-
 fs/nfsd/nfs4xdr.c | 19 +-
 fs/nfsd/nfsctl.c | 7 +-
 fs/ntfs3/attrib.c | 20 +-
 fs/ntfs3/attrlist.c | 9 +-
 fs/ntfs3/bitfunc.c | 10 +-
 fs/ntfs3/bitmap.c | 14 +-
 fs/ntfs3/debug.h | 3 +
 fs/ntfs3/dir.c | 30 +-
 fs/ntfs3/file.c | 12 +-
 fs/ntfs3/frecord.c | 55 +-
 fs/ntfs3/fslog.c | 12 +-
 fs/ntfs3/fsntfs.c | 77 +-
 fs/ntfs3/index.c | 166 +-
 fs/ntfs3/inode.c | 159 +-
 fs/ntfs3/lib/decompress_common.h | 5 +
 fs/ntfs3/lib/lib.h | 6 +
 fs/ntfs3/lznt.c | 12 +-
 fs/ntfs3/namei.c | 24 -
 fs/ntfs3/ntfs.h | 20 +-
 fs/ntfs3/ntfs_fs.h | 67 +-
 fs/ntfs3/record.c | 3 -
 fs/ntfs3/run.c | 2 -
 fs/ntfs3/super.c | 651 +++---
 fs/ntfs3/upcase.c | 8 +-
 fs/ntfs3/xattr.c | 251 +--
 fs/overlayfs/dir.c | 10 +-
 fs/overlayfs/file.c | 15 +-
 fs/vboxsf/super.c | 12 +-
 fs/verity/enable.c | 2 +-
 fs/verity/open.c | 2 +-
 include/asm-generic/io.h | 2 +-
 include/kunit/test.h | 6 +-
 include/kvm/arm_pmu.h | 3 -
 include/linux/arm-smccc.h | 10 +
 include/linux/bpf.h | 3 +-
 include/linux/cpumask.h | 7 +-
 include/linux/dsa/mv88e6xxx.h | 13 +
 include/linux/dsa/ocelot.h | 49 +
 include/linux/dsa/sja1105.h | 44 +-
 include/linux/etherdevice.h | 2 +-
 include/linux/fwnode.h | 11 +-
 include/linux/genhd.h | 1 +
 include/linux/kvm_host.h | 6 -
 include/linux/mlx5/mlx5_ifc.h | 10 +-
 include/linux/perf/arm_pmu.h | 6 +
 include/linux/perf_event.h | 4 +-
 include/linux/platform_data/usb-omap1.h | 2 +
 include/linux/qcom_scm.h | 71 -
 include/linux/sched.h | 2 +-
 include/linux/spi/spi.h | 3 +
 include/linux/workqueue.h | 5 +-
 include/net/ip_fib.h | 2 +-
 include/net/mac80211.h | 8 +-
 include/net/netfilter/ipv6/nf_defrag_ipv6.h | 1 -
 include/net/netfilter/nf_tables.h | 2 +-
 include/net/netns/netfilter.h | 6 +
 include/net/nexthop.h | 2 +-
 include/net/pkt_sched.h | 1 +
 include/net/sock.h | 34 +-
 include/soc/mscc/ocelot.h | 55 +-
 include/soc/mscc/ocelot_ptp.h | 3 +
 include/soc/mscc/ocelot_vcap.h | 4 +-
 include/sound/hda_codec.h | 1 +
 include/sound/rawmidi.h | 1 +
 include/trace/events/cachefiles.h | 6 +-
 include/trace/events/kyber.h | 19 +-
 include/uapi/linux/hyperv.h | 2 +-
 include/uapi/linux/xfrm.h | 15 +-
 include/uapi/misc/habanalabs.h | 6 +-
 include/uapi/sound/asound.h | 1 +
 include/xen/xen-ops.h | 15 +-
 init/main.c | 1 +
 kernel/bpf/bpf_struct_ops.c | 7 +-
 kernel/bpf/core.c | 2 +-
 kernel/bpf/stackmap.c | 3 +-
 kernel/cgroup/cgroup.c | 17 +-
 kernel/cgroup/cpuset.c | 56 +-
 kernel/events/core.c | 34 +-
 kernel/module.c | 2 +
 kernel/sched/debug.c | 8 +-
 kernel/sched/fair.c | 6 +-
 kernel/trace/trace.c | 11 +-
 kernel/trace/trace_eprobe.c | 61 +-
 kernel/trace/trace_events_hist.c | 2 +-
 kernel/workqueue.c | 18 +-
 lib/Makefile | 2 +-
 lib/kunit/executor_test.c | 4 +-
 mm/memblock.c | 7 +-
 net/bpf/test_run.c | 14 +-
 net/bridge/br_multicast.c | 6 +-
 net/bridge/br_netlink.c | 3 +-
 net/bridge/br_private.h | 2 +-
 net/core/dev_addr_lists.c | 6 +
 net/core/net-procfs.c | 24 +-
 net/core/rtnetlink.c | 2 +-
 net/core/sock.c | 52 +-
 net/dsa/Kconfig | 5 -
 net/dsa/dsa2.c | 4 +-
 net/dsa/switch.c | 2 +-
 net/dsa/tag_dsa.c | 30 +-
 net/dsa/tag_ocelot.c | 1 -
 net/dsa/tag_ocelot_8021q.c | 40 +-
 net/dsa/tag_sja1105.c | 43 +
 net/ipv4/fib_semantics.c | 16 +-
 net/ipv4/icmp.c | 23 +-
 net/ipv4/inet_hashtables.c | 4 +-
 net/ipv4/netfilter/iptable_raw.c | 2 +-
 net/ipv4/netfilter/nf_defrag_ipv4.c | 30 +-
 net/ipv4/udp.c | 13 +-
 net/ipv6/inet6_hashtables.c | 2 +-
 net/ipv6/ioam6.c | 70 +-
 net/ipv6/ioam6_iptunnel.c | 6 +-
 net/ipv6/netfilter/ip6_tables.c | 1 +
 net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +-
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 25 +-
 net/ipv6/route.c | 5 +-
 net/ipv6/udp.c | 5 +-
 net/mac80211/mesh_pathtbl.c | 5 +-
 net/mac80211/mesh_ps.c | 3 +-
 net/mac80211/rate.c | 4 -
 net/mac80211/rx.c | 3 +-
 net/mac80211/tx.c | 12 +
 net/mac80211/wpa.c | 6 +
 net/mptcp/mptcp_diag.c | 2 +-
 net/mptcp/pm_netlink.c | 4 +-
 net/mptcp/protocol.c | 57 +-
 net/mptcp/protocol.h | 2 +-
 net/mptcp/subflow.c | 2 +-
 net/mptcp/syncookies.c | 13 +-
 net/mptcp/token.c | 11 +-
 net/mptcp/token_test.c | 14 +-
 net/netfilter/ipset/ip_set_hash_gen.h | 4 +-
 net/netfilter/ipvs/ip_vs_conn.c | 4 +
 net/netfilter/nf_conntrack_core.c | 154 +-
 net/netfilter/nf_nat_core.c | 17 +-
 net/netfilter/nf_nat_masquerade.c | 168 +-
 net/netfilter/nf_tables_api.c | 121 +-
 net/netfilter/nft_compat.c | 17 +-
 net/netfilter/nft_quota.c | 2 +-
 net/netfilter/xt_LOG.c | 10 +-
 net/netfilter/xt_NFLOG.c | 10 +-
 net/netlink/af_netlink.c | 14 +-
 net/nfc/af_nfc.c | 3 +
 net/nfc/digital_core.c | 9 +-
 net/nfc/digital_technology.c | 8 +-
 net/nfc/nci/rsp.c | 2 +
 net/sched/cls_flower.c | 6 +
 net/sched/sch_api.c | 6 +
 net/sched/sch_fifo.c | 3 +
 net/sched/sch_mqprio.c | 32 +-
 net/sched/sch_taprio.c | 4 +
 net/sctp/input.c | 2 +-
 net/sctp/sm_make_chunk.c | 2 +-
 net/smc/smc_cdc.c | 7 +-
 net/smc/smc_core.c | 20 +-
 net/smc/smc_llc.c | 63 +-
 net/smc/smc_tx.c | 22 +-
 net/smc/smc_wr.h | 14 +
 net/unix/af_unix.c | 94 +-
 net/xfrm/xfrm_user.c | 67 +-
 samples/bpf/Makefile | 17 +-
 samples/bpf/bpf_insn.h | 2 +-
 samples/bpf/xdp_redirect_map_multi.bpf.c | 5 -
 scripts/Makefile.gcc-plugins | 4 +
 scripts/checksyscalls.sh | 6 +-
 scripts/recordmcount.pl | 2 +-
 security/selinux/nlmsgtab.c | 4 +-
 sound/core/pcm_compat.c | 72 +-
 sound/core/rawmidi.c | 9 +
 sound/core/seq_device.c | 8 +-
 sound/drivers/pcsp/pcsp_lib.c | 2 +-
 sound/firewire/motu/amdtp-motu.c | 7 +-
 sound/firewire/oxfw/oxfw.c | 13 +-
 sound/hda/hdac_controller.c | 5 +-
 sound/pci/hda/hda_bind.c | 20 +-
 sound/pci/hda/hda_codec.c | 1 +
 sound/pci/hda/hda_controller.c | 24 +-
 sound/pci/hda/hda_controller.h | 2 +-
 sound/pci/hda/hda_intel.c | 41 +-
 sound/pci/hda/hda_intel.h | 4 +-
 sound/pci/hda/patch_cs8409.c | 3 +
 sound/pci/hda/patch_realtek.c | 191 +-
 sound/soc/bcm/Kconfig | 3 +-
 sound/soc/bcm/hifiberry_dacplusadcpro.c | 74 +-
 sound/soc/fsl/fsl_esai.c | 16 +-
 sound/soc/fsl/fsl_micfil.c | 15 +-
 sound/soc/fsl/fsl_sai.c | 14 +-
 sound/soc/fsl/fsl_spdif.c | 16 +-
 sound/soc/fsl/fsl_xcvr.c | 15 +-
 sound/soc/intel/boards/sof_sdw.c | 5 +
 sound/soc/mediatek/Kconfig | 3 +
 sound/soc/mediatek/common/mtk-afe-fe-dai.c | 19 +-
 .../mt8195/mt8195-mt6359-rt1019-rt5682.c | 7 +-
 sound/soc/sof/core.c | 4 +-
 sound/soc/sof/imx/imx8.c | 9 +-
 sound/soc/sof/imx/imx8m.c | 9 +-
 sound/soc/sof/loader.c | 8 +-
 sound/soc/sof/trace.c | 1 -
 sound/usb/card.c | 18 +-
 sound/usb/mixer.c | 26 +-
 sound/usb/mixer.h | 3 +-
 sound/usb/mixer_quirks.c | 2 +-
 sound/usb/mixer_scarlett_gen2.c | 2 +
 sound/usb/quirks-table.h | 42 +
 sound/usb/quirks.c | 2 +
 tools/include/uapi/sound/asound.h | 1 +
 tools/lib/bpf/libbpf.c | 3 +-
 tools/lib/bpf/linker.c | 8 +-
 tools/lib/bpf/strset.c | 1 +
 tools/lib/perf/tests/test-evlist.c | 6 +-
 tools/lib/perf/tests/test-evsel.c | 7 +-
 tools/objtool/arch/x86/decode.c | 2 +-
 tools/objtool/check.c | 16 +-
 tools/objtool/elf.c | 70 +-
 tools/objtool/include/objtool/elf.h | 1 -
 tools/objtool/orc_gen.c | 2 +-
 tools/objtool/special.c | 22 +-
 .../Documentation/jitdump-specification.txt | 2 +-
 tools/perf/Documentation/perf-c2c.txt | 2 +-
 tools/perf/Documentation/perf-intel-pt.txt | 2 +-
 tools/perf/Documentation/perf-lock.txt | 2 +-
 tools/perf/Documentation/perf-script-perl.txt | 2 +-
 .../perf/Documentation/perf-script-python.txt | 2 +-
 tools/perf/Documentation/perf-stat.txt | 2 +-
 tools/perf/Documentation/topdown.txt | 2 +-
 tools/perf/Makefile.config | 2 +-
 tools/perf/Makefile.perf | 2 +-
 tools/perf/arch/arm/util/auxtrace.c | 8 +-
 tools/perf/arch/arm/util/cs-etm.c | 24 +-
 tools/perf/arch/arm/util/perf_regs.c | 2 +-
 tools/perf/arch/arm/util/pmu.c | 2 +-
 tools/perf/arch/arm/util/unwind-libdw.c | 6 +-
 tools/perf/arch/arm/util/unwind-libunwind.c | 4 +-
 tools/perf/arch/x86/util/iostat.c | 2 +-
 tools/perf/builtin-stat.c | 2 +
 .../pmu-events/arch/powerpc/power8/other.json | 2 +-
 tools/perf/pmu-events/jevents.c | 2 +
 tools/perf/tests/attr/test-stat-default | 97 +
 tools/perf/tests/attr/test-stat-detailed-1 | 113 +-
 tools/perf/tests/attr/test-stat-detailed-2 | 137 +-
 tools/perf/tests/attr/test-stat-detailed-3 | 145 +-
 tools/perf/tests/code-reading.c | 4 +-
 tools/perf/tests/dwarf-unwind.c | 39 +-
 tools/perf/util/config.c | 2 +-
 tools/perf/util/session.c | 4 +-
 tools/testing/kunit/kunit.py | 24 +-
 tools/testing/kunit/kunit_tool_test.py | 8 +
 tools/testing/selftests/bpf/Makefile | 3 +-
 .../selftests/bpf/test_lwt_ip_encap.sh | 13 +-
 .../selftests/drivers/dma-buf/udmabuf.c | 5 +-
 .../test.d/dynevent/add_remove_eprobe.tc | 54 +-
 tools/testing/selftests/kvm/.gitignore | 1 +
 tools/testing/selftests/kvm/Makefile | 1 +
 .../selftests/kvm/access_tracking_perf_test.c | 6 +-
 .../selftests/kvm/demand_paging_test.c | 15 +-
 .../selftests/kvm/dirty_log_perf_test.c | 62 +-
 .../testing/selftests/kvm/include/test_util.h | 4 +-
 .../selftests/kvm/include/x86_64/processor.h | 34 +-
 .../selftests/kvm/kvm_page_table_test.c | 7 +-
 tools/testing/selftests/kvm/lib/test_util.c | 17 +-
 tools/testing/selftests/kvm/rseq_test.c | 70 +-
 tools/testing/selftests/kvm/steal_time.c | 4 +-
 .../selftests/kvm/x86_64/svm_int_ctl_test.c | 128 ++
 tools/testing/selftests/net/ioam6.sh | 24 +-
 tools/testing/selftests/net/ioam6_parser.c | 164 +-
 .../selftests/netfilter/nft_nat_zones.sh | 309 +++
 .../selftests/netfilter/nft_zones_many.sh | 156 ++
 virt/kvm/kvm_main.c | 68 +-
 997 files changed, 18164 insertions(+), 11257 deletions(-)
 create mode 100644 arch/arm/boot/dts/overlays/adafruit-st7735r-overlay.dts
 create mode 100644 block/bdev.c
 create mode 100644 block/fops.c
 create mode 100644 block/holder.c
 create mode 100644 drivers/gpio/gpio-pwm.c
 create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c
 create mode 100644 include/linux/dsa/mv88e6xxx.h
 create mode 100644 tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c
 create mode 100644 tools/testing/selftests/netfilter/nft_nat_zones.sh
 create mode 100644 tools/testing/selftests/netfilter/nft_zones_many.sh

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index babbe04c8d..4d8c27eca9 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1226,7 +1226,7 @@ PAGE_SIZE multiple when read back.
 	Note that all fields in this file are hierarchical and the
 	file modified event can be generated due to an event down the
-	hierarchy. For for the local events at the cgroup level see
+	hierarchy. For the local events at the cgroup level see
 	memory.events.local.
 
 	  low
@@ -2170,19 +2170,19 @@ existing device files.
 
 Cgroup v2 device controller has no interface files and is implemented
 on top of cgroup BPF. To control access to device files, a user may
-create bpf programs of the BPF_CGROUP_DEVICE type and attach them
-to cgroups. On an attempt to access a device file, corresponding
-BPF programs will be executed, and depending on the return value
-the attempt will succeed or fail with -EPERM.
+create bpf programs of type BPF_PROG_TYPE_CGROUP_DEVICE and attach
+them to cgroups with BPF_CGROUP_DEVICE flag. On an attempt to access a
+device file, corresponding BPF programs will be executed, and depending
+on the return value the attempt will succeed or fail with -EPERM.
 
-A BPF_CGROUP_DEVICE program takes a pointer to the bpf_cgroup_dev_ctx
-structure, which describes the device access attempt: access type
-(mknod/read/write) and device (type, major and minor numbers).
-If the program returns 0, the attempt fails with -EPERM, otherwise
-it succeeds.
+A BPF_PROG_TYPE_CGROUP_DEVICE program takes a pointer to the
+bpf_cgroup_dev_ctx structure, which describes the device access attempt:
+access type (mknod/read/write) and device (type, major and minor numbers).
+If the program returns 0, the attempt fails with -EPERM, otherwise it
+succeeds.
 
-An example of BPF_CGROUP_DEVICE program may be found in the kernel
-source tree in the tools/testing/selftests/bpf/progs/dev_cgroup.c file.
+An example of BPF_PROG_TYPE_CGROUP_DEVICE program may be found in
+tools/testing/selftests/bpf/progs/dev_cgroup.c in the kernel source tree.
 
 
 RDMA
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 91ba391f9b..43dc35fe5b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1266,7 +1266,7 @@
 			The VGA and EFI output is eventually overwritten by
 			the real console.
 
-			The xen output can only be used by Xen PV guests.
+			The xen option can only be used in Xen domains.
 
 			The sclp output can only be used on s390.
diff --git a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml
index 07b20383cb..b446d0f0f1 100644
--- a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml
@@ -50,7 +50,6 @@ properties:
         data-lanes:
           description: array of physical DSI data lane indexes.
          minItems: 1
-          maxItems: 4
          items:
            - const: 1
            - const: 2
@@ -71,7 +70,6 @@ properties:
         data-lanes:
           description: array of physical DSI data lane indexes.
           minItems: 1
-          maxItems: 4
           items:
             - const: 1
             - const: 2
diff --git a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml
index 1c2daf7c24..911564468c 100644
--- a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml
@@ -18,7 +18,7 @@ properties:
     const: ti,sn65dsi86
 
   reg:
-    const: 0x2d
+    enum: [ 0x2c, 0x2d ]
 
   enable-gpios:
     maxItems: 1
diff --git a/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml b/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml
index 2ed010f91e..20ce88ab4b 100644
--- a/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml
+++ b/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml
@@ -22,7 +22,7 @@ properties:
     items:
       - enum:
         # ili9341 240*320 Color on stm32f429-disco board
-        - st,sf-tc240t-9370-t
+          - st,sf-tc240t-9370-t
       - const: ilitek,ili9341
 
   reg: true
diff --git a/Documentation/devicetree/bindings/interconnect/qcom,sdm660.yaml b/Documentation/devicetree/bindings/interconnect/qcom,sdm660.yaml
index 29de7807df..bcd41e491f 100644
--- a/Documentation/devicetree/bindings/interconnect/qcom,sdm660.yaml
+++ b/Documentation/devicetree/bindings/interconnect/qcom,sdm660.yaml
@@ -31,11 +31,11 @@ properties:
 
   clocks:
     minItems: 1
-    maxItems: 3
+    maxItems: 7
 
   clock-names:
     minItems: 1
-    maxItems: 3
+    maxItems: 7
 
 required:
   - compatible
@@ -72,6 +72,32 @@ allOf:
           contains:
             enum:
               - qcom,sdm660-a2noc
+    then:
+      properties:
+        clocks:
+          items:
+            - description: Bus Clock.
+            - description: Bus A Clock.
+            - description: IPA Clock.
+            - description: UFS AXI Clock.
+            - description: Aggregate2 UFS AXI Clock.
+            - description: Aggregate2 USB3 AXI Clock.
+            - description: Config NoC USB2 AXI Clock.
+        clock-names:
+          items:
+            - const: bus
+            - const: bus_a
+            - const: ipa
+            - const: ufs_axi
+            - const: aggre2_ufs_axi
+            - const: aggre2_usb3_axi
+            - const: cfg_noc_usb2_axi
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
               - qcom,sdm660-bimc
               - qcom,sdm660-cnoc
               - qcom,sdm660-gnoc
@@ -91,6 +117,7 @@ examples:
   - |
       #include <dt-bindings/clock/qcom,rpmcc.h>
       #include <dt-bindings/clock/qcom,mmcc-sdm660.h>
+      #include <dt-bindings/clock/qcom,gcc-sdm660.h>
 
       bimc: interconnect@1008000 {
           compatible = "qcom,sdm660-bimc";
@@ -123,9 +150,20 @@ examples:
           compatible = "qcom,sdm660-a2noc";
           reg = <0x01704000 0xc100>;
           #interconnect-cells = <1>;
-          clock-names = "bus", "bus_a";
+          clock-names = "bus",
+                        "bus_a",
+                        "ipa",
+                        "ufs_axi",
+                        "aggre2_ufs_axi",
+                        "aggre2_usb3_axi",
+                        "cfg_noc_usb2_axi";
           clocks = <&rpmcc RPM_SMD_AGGR2_NOC_CLK>,
-                   <&rpmcc RPM_SMD_AGGR2_NOC_A_CLK>;
+                   <&rpmcc RPM_SMD_AGGR2_NOC_A_CLK>,
+                   <&rpmcc RPM_SMD_IPA_CLK>,
+                   <&gcc GCC_UFS_AXI_CLK>,
+                   <&gcc GCC_AGGRE2_UFS_AXI_CLK>,
+                   <&gcc GCC_AGGRE2_USB3_AXI_CLK>,
+                   <&gcc GCC_CFG_NOC_USB2_AXI_CLK>;
       };
 
       mnoc: interconnect@1745000 {
diff --git a/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml b/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml
index e6c9a2f77c..f300ced4cd 100644
--- a/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml
+++ b/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml
@@ -20,9 +20,7 @@ properties:
       - snps,dwcmshc-sdhci
 
   reg:
-    minItems: 1
-    items:
-      - description: Offset and length of the register set for the device
+    maxItems: 1
 
   interrupts:
     maxItems: 1
diff --git a/Documentation/devicetree/bindings/net/nxp,dwmac-imx.yaml b/Documentation/devicetree/bindings/net/nxp,dwmac-imx.yaml
index 5629b2e4cc..ee4afe361f 100644
--- a/Documentation/devicetree/bindings/net/nxp,dwmac-imx.yaml
+++ b/Documentation/devicetree/bindings/net/nxp,dwmac-imx.yaml
@@ -34,7 +34,6 @@ properties:
 
   clocks:
     minItems: 3
-    maxItems: 5
     items:
       - description: MAC host clock
      - description: MAC apb clock
diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
index 42689b7d03..c115c95ee5 100644
--- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml
+++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
@@ -21,6 +21,7 @@ select:
           contains:
             enum:
               - snps,dwmac
+              - snps,dwmac-3.40a
               - snps,dwmac-3.50a
               - snps,dwmac-3.610
               - snps,dwmac-3.70a
@@ -76,6 +77,7 @@ properties:
         - rockchip,rk3399-gmac
         - rockchip,rv1108-gmac
         - snps,dwmac
+        - snps,dwmac-3.40a
         - snps,dwmac-3.50a
         - snps,dwmac-3.610
         - snps,dwmac-3.70a
diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml
index 2911e565b2..acea1cd444 100644
--- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml
+++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml
@@ -41,7 +41,6 @@ properties:
       - description: builtin MSI controller.
 
   interrupt-names:
-    minItems: 1
     items:
       - const: msi
diff --git a/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml b/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml
index ca91201a99..d7e08b03e2 100644
--- a/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml
+++ b/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml
@@ -171,7 +171,7 @@ examples:
         cs-gpios = <&gpio0 13 0>, <&gpio0 14 0>;
         rx-sample-delay-ns = <3>;
-        spi-flash@1 {
+        flash@1 {
             compatible = "spi-nand";
             reg = <1>;
             rx-sample-delay-ns = <7>;
diff --git a/Documentation/filesystems/ntfs3.rst b/Documentation/filesystems/ntfs3.rst
index ffe9ea0c14..d67ccd22c6 100644
--- a/Documentation/filesystems/ntfs3.rst
+++ b/Documentation/filesystems/ntfs3.rst
@@ -4,103 +4,112 @@
 NTFS3
 =====
 
-
 Summary and Features
 ====================
 
-NTFS3 is fully functional NTFS Read-Write driver. The driver works with
-NTFS versions up to 3.1, normal/compressed/sparse files
-and journal replaying. File system type to use on mount is 'ntfs3'.
+NTFS3 is fully functional NTFS Read-Write driver. The driver works with NTFS
+versions up to 3.1. File system type to use on mount is *ntfs3*.
 
 - This driver implements NTFS read/write support for normal, sparse and
   compressed files.
-- Supports native journal replaying;
-- Supports extended attributes
-	Predefined extended attributes:
-	- 'system.ntfs_security' gets/sets security
-		descriptor (SECURITY_DESCRIPTOR_RELATIVE)
-	- 'system.ntfs_attrib' gets/sets ntfs file/dir attributes.
-		Note: applied to empty files, this allows to switch type between
-		sparse(0x200), compressed(0x800) and normal;
+- Supports native journal replaying.
 - Supports NFS export of mounted NTFS volumes.
+- Supports extended attributes. Predefined extended attributes:
+
+  - *system.ntfs_security* gets/sets security
+
+    Descriptor: SECURITY_DESCRIPTOR_RELATIVE
+
+  - *system.ntfs_attrib* gets/sets ntfs file/dir attributes.
+
+    Note: Applied to empty files, this allows to switch type between
+    sparse(0x200), compressed(0x800) and normal.
 
 Mount Options
 =============
 
 The list below describes mount options supported by NTFS3 driver in addition to
-generic ones.
+generic ones. You can use every mount option with **no** option. If it is in
+this table marked with no it means default is without **no**.
 
-===============================================================================
+.. flat-table::
+   :widths: 1 5
+   :fill-cells:
 
-nls=name		This option informs the driver how to interpret path
-			strings and translate them to Unicode and back. If
-			this option is not set, the default codepage will be
-			used (CONFIG_NLS_DEFAULT).
-			Examples:
-				'nls=utf8'
+   * - iocharset=name
+     - This option informs the driver how to interpret path strings and
+       translate them to Unicode and back. If this option is not set, the
+       default codepage will be used (CONFIG_NLS_DEFAULT).
 
-uid=
-gid=
-umask=			Controls the default permissions for files/directories created
-			after the NTFS volume is mounted.
+       Example: iocharset=utf8
 
-fmask=
-dmask=			Instead of specifying umask which applies both to
-			files and directories, fmask applies only to files and
-			dmask only to directories.
+   * - uid=
+     - :rspan:`1`
+   * - gid=
 
-nohidden		Files with the Windows-specific HIDDEN (FILE_ATTRIBUTE_HIDDEN)
-			attribute will not be shown under Linux.
+   * - umask=
+     - Controls the default permissions for files/directories created after
+       the NTFS volume is mounted.
 
-sys_immutable		Files with the Windows-specific SYSTEM
-			(FILE_ATTRIBUTE_SYSTEM) attribute will be marked as system
-			immutable files.
+   * - dmask=
+     - :rspan:`1` Instead of specifying umask which applies both to files and
+       directories, fmask applies only to files and dmask only to directories.
+   * - fmask=
 
-discard			Enable support of the TRIM command for improved performance
-			on delete operations, which is recommended for use with the
-			solid-state drives (SSD).
+   * - noacsrules
+     - "No access rules" mount option sets access rights for files/folders to
+       777 and owner/group to root. This mount option absorbs all other
+       permissions.
 
-force			Forces the driver to mount partitions even if 'dirty' flag
-			(volume dirty) is set. Not recommended for use.
+       - Permissions change for files/folders will be reported as successful,
+         but they will remain 777.
 
-sparse			Create new files as "sparse".
+       - Owner/group change will be reported as successful, but they will stay
+         as root.
 
-showmeta		Use this parameter to show all meta-files (System Files) on
-			a mounted NTFS partition.
-			By default, all meta-files are hidden.
+   * - nohidden
+     - Files with the Windows-specific HIDDEN (FILE_ATTRIBUTE_HIDDEN) attribute
+       will not be shown under Linux.
 
-prealloc		Preallocate space for files excessively when file size is
-			increasing on writes. Decreases fragmentation in case of
-			parallel write operations to different files.
+   * - sys_immutable
+     - Files with the Windows-specific SYSTEM (FILE_ATTRIBUTE_SYSTEM) attribute
+       will be marked as system immutable files.
 
-no_acs_rules		"No access rules" mount option sets access rights for
-			files/folders to 777 and owner/group to root. This mount
-			option absorbs all other permissions:
-			- permissions change for files/folders will be reported
-			  as successful, but they will remain 777;
-			- owner/group change will be reported as successful, but
-			  they will stay as root
+   * - discard
+     - Enable support of the TRIM command for improved performance on delete
+       operations, which is recommended for use with the solid-state drives
+       (SSD).
 
-acl			Support POSIX ACLs (Access Control Lists). Effective if
-			supported by Kernel. Not to be confused with NTFS ACLs.
-			The option specified as acl enables support for POSIX ACLs.
+   * - force
+     - Forces the driver to mount partitions even if volume is marked dirty.
+       Not recommended for use.
 
-noatime			All files and directories will not update their last access
-			time attribute if a partition is mounted with this parameter.
-			This option can speed up file system operation.
+   * - sparse
+     - Create new files as sparse.
 
-===============================================================================
+   * - showmeta
+     - Use this parameter to show all meta-files (System Files) on a mounted
+       NTFS partition. By default, all meta-files are hidden.
 
-ToDo list
+   * - prealloc
+     - Preallocate space for files excessively when file size is increasing on
+       writes. Decreases fragmentation in case of parallel write operations to
+       different files.
+
+   * - acl
+     - Support POSIX ACLs (Access Control Lists). Effective if supported by
+       Kernel. Not to be confused with NTFS ACLs. The option specified as acl
+       enables support for POSIX ACLs.
+
+Todo list
 =========
-
-- Full journaling support (currently journal replaying is supported) over JBD.
-
+- Full journaling support over JBD. Currently journal replaying is supported
+  which is not necessarily as effective as JBD would be.
 
 References
 ==========
-https://www.paragon-software.com/home/ntfs-linux-professional/
-	- Commercial version of the NTFS driver for Linux.
+- Commercial version of the NTFS driver for Linux.
+  https://www.paragon-software.com/home/ntfs-linux-professional/
 
-almaz.alexandrovich@paragon-software.com
-	- Direct e-mail address for feedback and requests on the NTFS3 implementation.
+- Direct e-mail address for feedback and requests on the NTFS3 implementation.
+  almaz.alexandrovich@paragon-software.com
diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst
index 364680cdad..8ba72e8980 100644
--- a/Documentation/gpu/amdgpu.rst
+++ b/Documentation/gpu/amdgpu.rst
@@ -300,8 +300,8 @@ pcie_replay_count
 .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
    :doc: pcie_replay_count
 
-+GPU SmartShift Information
-============================
+GPU SmartShift Information
+==========================
 
 GPU SmartShift information via sysfs
diff --git a/Documentation/gpu/drm-internals.rst b/Documentation/gpu/drm-internals.rst
index 06af044c88..607f78f0f1 100644
--- a/Documentation/gpu/drm-internals.rst
+++ b/Documentation/gpu/drm-internals.rst
@@ -111,15 +111,6 @@ Component Helper Usage
 .. kernel-doc:: drivers/gpu/drm/drm_drv.c
    :doc: component helper usage recommendations
 
-IRQ Helper Library
-~~~~~~~~~~~~~~~~~~
-
-.. kernel-doc:: drivers/gpu/drm/drm_irq.c
-   :doc: irq helpers
-
-.. kernel-doc:: drivers/gpu/drm/drm_irq.c
-   :export:
-
 Memory Manager Initialization
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/hwmon/k10temp.rst b/Documentation/hwmon/k10temp.rst
index 8557e26281..91b99adc6c 100644
--- a/Documentation/hwmon/k10temp.rst
+++ b/Documentation/hwmon/k10temp.rst
@@ -132,20 +132,3 @@ On Family 17h and Family 18h CPUs, additional temperature sensors may report
 Core Complex Die (CCD) temperatures. Up to 8 such temperatures are reported
 as temp{3..10}_input, labeled Tccd{1..8}. Actual support depends on the CPU
 variant.
-
-Various Family 17h and 18h CPUs report voltage and current telemetry
-information. The following attributes may be reported.
-
-Attribute	Label	Description
-===============	=======	================
-in0_input	Vcore	Core voltage
-in1_input	Vsoc	SoC voltage
-curr1_input	Icore	Core current
-curr2_input	Isoc	SoC current
-===============	=======	================
-
-Current values are raw (unscaled) as reported by the CPU. Core current is
-reported as multiples of 1A / LSB. SoC is reported as multiples of 0.25A
-/ LSB. The real current is board specific. Reported currents should be seen
-as rough guidance, and should be scaled using sensors3.conf as appropriate
-for a given board.
diff --git a/Documentation/userspace-api/vduse.rst b/Documentation/userspace-api/vduse.rst
index 42ef59ea53..bdb880e011 100644
--- a/Documentation/userspace-api/vduse.rst
+++ b/Documentation/userspace-api/vduse.rst
@@ -18,7 +18,7 @@ types can be added after the security issue of corresponding device driver
 is clarified or fixed in the future.
Create/Destroy VDUSE devices ------------------------- +---------------------------- VDUSE devices are created as follows: diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 9320b04c04..4cf45a99fd 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -26,11 +26,6 @@ extern char empty_zero_page[PAGE_SIZE]; extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE); -/* Macro to mark a page protection as uncacheable */ -#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) & ~_PAGE_CACHEABLE)) - -extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE); - /* to cope with aliasing VIPT cache */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fc196421b2..59baf6c132 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1989,8 +1989,6 @@ config ARCH_HIBERNATION_POSSIBLE endmenu -source "drivers/firmware/Kconfig" - if CRYPTO source "arch/arm/crypto/Kconfig" endif diff --git a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts index 614999dcb9..cd4672501a 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts +++ b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts @@ -71,7 +71,6 @@ apb { isc: isc@f0008000 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_isc_base &pinctrl_isc_data_8bit &pinctrl_isc_data_9_10 &pinctrl_isc_data_11_12>; - status = "okay"; }; qspi1: spi@f0024000 { diff --git a/arch/arm/boot/dts/at91-sama7g5ek.dts b/arch/arm/boot/dts/at91-sama7g5ek.dts index 4cbed98cc2..f3d6aaa3a7 100644 --- a/arch/arm/boot/dts/at91-sama7g5ek.dts +++ b/arch/arm/boot/dts/at91-sama7g5ek.dts @@ -196,11 +196,13 @@ vddioddr: VDD_DDR { regulator-state-standby { regulator-on-in-suspend; + regulator-suspend-microvolt = <1350000>; regulator-mode = <4>; }; regulator-state-mem { regulator-on-in-suspend; + regulator-suspend-microvolt = <1350000>; regulator-mode = <4>; }; }; @@ -353,7 +355,10 @@ &gmac0 { #address-cells = <1>; #size-cells = <0>; pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_gmac0_default &pinctrl_gmac0_txck_default &pinctrl_gmac0_phy_irq>; + pinctrl-0 = <&pinctrl_gmac0_default + &pinctrl_gmac0_mdio_default + &pinctrl_gmac0_txck_default + &pinctrl_gmac0_phy_irq>; phy-mode = "rgmii-id"; status = "okay"; @@ -368,7 +373,9 @@ &gmac1 { #address-cells = <1>; #size-cells = <0>; pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_gmac1_default &pinctrl_gmac1_phy_irq>; + pinctrl-0 = <&pinctrl_gmac1_default + &pinctrl_gmac1_mdio_default + &pinctrl_gmac1_phy_irq>; phy-mode = "rmii"; status = "okay"; @@ -423,14 +430,20 @@ pinctrl_gmac0_default: gmac0_default { , , , - , - , ; + slew-rate = <0>; + bias-disable; + }; + + pinctrl_gmac0_mdio_default: gmac0_mdio_default { + pinmux = , + ; bias-disable; }; pinctrl_gmac0_txck_default: gmac0_txck_default { pinmux = ; + slew-rate = <0>; bias-pull-up; }; @@ -447,8 +460,13 @@ pinctrl_gmac1_default: gmac1_default { , , , - , - , + ; + slew-rate = <0>; + bias-disable; + }; + + pinctrl_gmac1_mdio_default: gmac1_mdio_default { + pinmux = , ; bias-disable; }; @@ -540,6 +558,7 @@ cmd_data { , , ; + slew-rate = <0>; bias-pull-up; }; @@ -547,6 +566,7 @@ ck_cd_rstn_vddsel { pinmux = , , ; + slew-rate = <0>; bias-pull-up; }; }; @@ -558,6 +578,7 @@ cmd_data { , , ; + slew-rate = <0>; bias-pull-up; }; @@ -566,6 +587,7 @@ ck_cd_rstn_vddsel { , , ; + slew-rate = <0>; bias-pull-up; }; }; @@ -577,11 +599,13 @@ cmd_data { , , ; + slew-rate = <0>; bias-pull-up; }; ck { pinmux = ; + slew-rate = <0>; bias-pull-up; }; }; @@ -634,6 +658,15 @@ &sdmmc2 { pinctrl-0 
= <&pinctrl_sdmmc2_default>; }; +&shdwc { + atmel,shdwc-debouncer = <976>; + status = "okay"; + + input@0 { + reg = <0>; + }; +}; + &spdifrx { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_spdifrx_default>; diff --git a/arch/arm/boot/dts/bcm270x-rpi.dtsi b/arch/arm/boot/dts/bcm270x-rpi.dtsi index 68a7e1c09d..57e7d5f60d 100644 --- a/arch/arm/boot/dts/bcm270x-rpi.dtsi +++ b/arch/arm/boot/dts/bcm270x-rpi.dtsi @@ -65,12 +65,6 @@ fb: fb { status = "okay"; }; - vcsm: vcsm { - compatible = "raspberrypi,bcm2835-vcsm"; - firmware = <&firmware>; - status = "okay"; - }; - /* External sound card */ sound: sound { status = "disabled"; @@ -148,7 +142,14 @@ &vchiq { /* Onboard audio */ audio: bcm2835_audio { compatible = "brcm,bcm2835-audio"; + brcm,firmware = <&firmware>; brcm,pwm-channels = <8>; status = "disabled"; }; }; + +&firmware { + vcio: vcio { + compatible = "raspberrypi,vcio"; + }; +}; diff --git a/arch/arm/boot/dts/bcm2711-rpi-4-b.dts b/arch/arm/boot/dts/bcm2711-rpi-4-b.dts index a3d87cbe90..f6e0247f7d 100644 --- a/arch/arm/boot/dts/bcm2711-rpi-4-b.dts +++ b/arch/arm/boot/dts/bcm2711-rpi-4-b.dts @@ -40,8 +40,8 @@ sd_io_1v8_reg: sd_io_1v8_reg { regulator-always-on; regulator-settling-time-us = <5000>; gpios = <&expgpio 4 GPIO_ACTIVE_HIGH>; - states = <1800000 0x1 - 3300000 0x0>; + states = <1800000 0x1>, + <3300000 0x0>; status = "okay"; }; @@ -218,7 +218,7 @@ phy1: ethernet-phy@1 { &pcie0 { pci@0,0 { - device-type = "pci"; + device_type = "pci"; #address-cells = <3>; #size-cells = <2>; ranges; diff --git a/arch/arm/boot/dts/imx53-m53menlo.dts b/arch/arm/boot/dts/imx53-m53menlo.dts index f98691ae44..4f88e96d81 100644 --- a/arch/arm/boot/dts/imx53-m53menlo.dts +++ b/arch/arm/boot/dts/imx53-m53menlo.dts @@ -56,6 +56,7 @@ eth { panel { compatible = "edt,etm0700g0dh6"; pinctrl-0 = <&pinctrl_display_gpio>; + pinctrl-names = "default"; enable-gpios = <&gpio6 0 GPIO_ACTIVE_HIGH>; port { @@ -76,8 +77,7 @@ reg_usbh1_vbus: regulator-usbh1-vbus { regulator-name = "vbus"; regulator-min-microvolt = <5000000>; regulator-max-microvolt = <5000000>; - gpio = <&gpio1 2 GPIO_ACTIVE_HIGH>; - enable-active-high; + gpio = <&gpio1 2 0>; }; }; @@ -388,13 +388,13 @@ MX53_PAD_LVDS0_TX3_P__LDB_LVDS0_TX3 0x80000000 pinctrl_power_button: powerbutgrp { fsl,pins = < - MX53_PAD_SD2_DATA2__GPIO1_13 0x1e4 + MX53_PAD_SD2_DATA0__GPIO1_15 0x1e4 >; }; pinctrl_power_out: poweroutgrp { fsl,pins = < - MX53_PAD_SD2_DATA0__GPIO1_15 0x1e4 + MX53_PAD_SD2_DATA2__GPIO1_13 0x1e4 >; }; diff --git a/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi b/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi index cb8b539eb2..e5c4dc65fb 100644 --- a/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi +++ b/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi @@ -5,6 +5,7 @@ #include #include #include +#include #include / { @@ -277,6 +278,7 @@ chan@0 { led-cur = /bits/ 8 <0x20>; max-cur = /bits/ 8 <0x60>; reg = <0>; + color = ; }; chan@1 { @@ -284,6 +286,7 @@ chan@1 { led-cur = /bits/ 8 <0x20>; max-cur = /bits/ 8 <0x60>; reg = <1>; + color = ; }; chan@2 { @@ -291,6 +294,7 @@ chan@2 { led-cur = /bits/ 8 <0x20>; max-cur = /bits/ 8 <0x60>; reg = <2>; + color = ; }; chan@3 { @@ -298,6 +302,7 @@ chan@3 { led-cur = /bits/ 8 <0x0>; max-cur = /bits/ 8 <0x0>; reg = <3>; + color = ; }; }; diff --git a/arch/arm/boot/dts/imx6qdl-pico.dtsi b/arch/arm/boot/dts/imx6qdl-pico.dtsi index 5de4ccb979..f7a56d6b16 100644 --- a/arch/arm/boot/dts/imx6qdl-pico.dtsi +++ b/arch/arm/boot/dts/imx6qdl-pico.dtsi @@ -176,7 +176,18 @@ &fec { pinctrl-0 = <&pinctrl_enet>; phy-mode = "rgmii-id"; phy-reset-gpios = 
<&gpio1 26 GPIO_ACTIVE_LOW>; + phy-handle = <&phy>; status = "okay"; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + phy: ethernet-phy@1 { + reg = <1>; + qca,clk-out-frequency = <125000000>; + }; + }; }; &hdmi { diff --git a/arch/arm/boot/dts/overlays/Makefile b/arch/arm/boot/dts/overlays/Makefile index d10ff5c61a..b36c618b01 100644 --- a/arch/arm/boot/dts/overlays/Makefile +++ b/arch/arm/boot/dts/overlays/Makefile @@ -4,6 +4,7 @@ dtb-$(CONFIG_ARCH_BCM2835) += overlay_map.dtb dtbo-$(CONFIG_ARCH_BCM2835) += \ act-led.dtbo \ + adafruit-st7735r.dtbo \ adafruit18.dtbo \ adau1977-adc.dtbo \ adau7002-simple.dtbo \ diff --git a/arch/arm/boot/dts/overlays/README b/arch/arm/boot/dts/overlays/README index aa2271cd6c..5064d8eb20 100644 --- a/arch/arm/boot/dts/overlays/README +++ b/arch/arm/boot/dts/overlays/README @@ -299,9 +299,23 @@ Params: activelow Set to "on" to invert the sense of the LED REQUIRED +Name: adafruit-st7735r +Info: Overlay for the SPI-connected Adafruit 1.8" 160x128 or 128x128 displays, + based on the ST7735R chip. + This overlay uses the newer DRM/KMS "Tiny" driver. +Load: dtoverlay=adafruit-st7735r,<param>=<val> +Params: 128x128 Select the 128x128 driver (default 160x128) + rotate Display rotation {0,90,180,270} (default 90) + speed SPI bus speed in Hz (default 4000000) + dc_pin GPIO pin for D/C (default 24) + reset_pin GPIO pin for RESET (default 25) + led_pin GPIO used to control backlight (default 18) + + Name: adafruit18 Info: Overlay for the SPI-connected Adafruit 1.8" display (based on the ST7735R chip). It includes support for the "green tab" version. + This overlay uses the older fbtft driver. Load: dtoverlay=adafruit18,<param>=<val> Params: green Use the adafruit18_green variant. rotate Display rotation {0,90,180,270} diff --git a/arch/arm/boot/dts/overlays/adafruit-st7735r-overlay.dts b/arch/arm/boot/dts/overlays/adafruit-st7735r-overlay.dts new file mode 100644 index 0000000000..bf186811ec --- /dev/null +++ b/arch/arm/boot/dts/overlays/adafruit-st7735r-overlay.dts @@ -0,0 +1,83 @@ +/* + * adafruit-st7735r-overlay.dts + * + * ST7735R based SPI LCD displays.
Either + * Adafruit 1.8" 160x128 + * or + * Okaya 1.44" 128x128 + */ + +/dts-v1/; +/plugin/; + +#include + +/ { + compatible = "brcm,bcm2835"; + + fragment@0 { + target = <&spidev0>; + __overlay__ { + status = "disabled"; + }; + }; + + fragment@1 { + target = <&gpio>; + __overlay__ { + adafruit_pins: adafruit_pins { + brcm,pins = <25 24>; + brcm,function = <1>; /* out */ + }; + backlight_pins: backlight_pins { + brcm,pins = <18>; + brcm,function = <1>; /* out */ + }; + }; + }; + + fragment@2 { + target-path = "/"; + __overlay__ { + af18_backlight: backlight { + compatible = "gpio-backlight"; + gpios = <&gpio 18 GPIO_ACTIVE_HIGH>; + pinctrl-names = "default"; + pinctrl-0 = <&backlight_pins>; + }; + }; + }; + + fragment@3 { + target = <&spi0>; + __overlay__ { + /* needed to avoid dtc warning */ + #address-cells = <1>; + #size-cells = <0>; + status = "okay"; + + af18: adafruit18@0 { + compatible = "jianda,jd-t18003-t01"; + reg = <0>; + spi-max-frequency = <32000000>; + dc-gpios = <&gpio 24 GPIO_ACTIVE_HIGH>; + reset-gpios = <&gpio 25 GPIO_ACTIVE_HIGH>; + rotate = <90>; + pinctrl-names = "default"; + pinctrl-0 = <&adafruit_pins>; + backlight = <&af18_backlight>; + }; + }; + }; + + __overrides__ { + 128x128 = <&af18>, "compatible=okaya,rh128128t"; + speed = <&af18>,"spi-max-frequency:0"; + rotate = <&af18>,"rotate:0"; + dc_pin = <&af18>,"dc-gpios:4", <&adafruit_pins>,"brcm,pins:4"; + reset_pin = <&af18>,"reset-gpios:4", + <&adafruit_pins>,"brcm,pins:0"; + led_pin = <&af18_backlight>,"gpios:4", + <&backlight_pins>,"brcm,pins:0"; + }; +}; diff --git a/arch/arm/boot/dts/overlays/hifiberry-dacplusadcpro-overlay.dts b/arch/arm/boot/dts/overlays/hifiberry-dacplusadcpro-overlay.dts index cafa2ccd7f..561cd84bbb 100644 --- a/arch/arm/boot/dts/overlays/hifiberry-dacplusadcpro-overlay.dts +++ b/arch/arm/boot/dts/overlays/hifiberry-dacplusadcpro-overlay.dts @@ -43,6 +43,11 @@ hb_adc: pcm186x@4a { clocks = <&dacpro_osc>; status = "okay"; }; + hpamp: hpamp@60 { + compatible = "ti,tpa6130a2"; + reg = <0x60>; + status = "disabled"; + }; }; }; diff --git a/arch/arm/boot/dts/qcom-apq8064.dtsi b/arch/arm/boot/dts/qcom-apq8064.dtsi index 0b2bed6e7a..d1c1c6aab2 100644 --- a/arch/arm/boot/dts/qcom-apq8064.dtsi +++ b/arch/arm/boot/dts/qcom-apq8064.dtsi @@ -198,7 +198,7 @@ cxo_board: cxo_board { clock-frequency = <19200000>; }; - pxo_board { + pxo_board: pxo_board { compatible = "fixed-clock"; #clock-cells = <0>; clock-frequency = <27000000>; @@ -1148,22 +1148,21 @@ tcsr: syscon@1a400000 { }; gpu: adreno-3xx@4300000 { - compatible = "qcom,adreno-3xx"; + compatible = "qcom,adreno-320.2", "qcom,adreno"; reg = <0x04300000 0x20000>; reg-names = "kgsl_3d0_reg_memory"; interrupts = ; interrupt-names = "kgsl_3d0_irq"; clock-names = - "core_clk", - "iface_clk", - "mem_clk", - "mem_iface_clk"; + "core", + "iface", + "mem", + "mem_iface"; clocks = <&mmcc GFX3D_CLK>, <&mmcc GFX3D_AHB_CLK>, <&mmcc GFX3D_AXI_CLK>, <&mmcc MMSS_IMEM_AHB_CLK>; - qcom,chipid = <0x03020002>; iommus = <&gfx3d 0 &gfx3d 1 @@ -1306,7 +1305,7 @@ dsi0_phy: dsi-phy@4700200 { reg-names = "dsi_pll", "dsi_phy", "dsi_phy_regulator"; clock-names = "iface_clk", "ref"; clocks = <&mmcc DSI_M_AHB_CLK>, - <&cxo_board>; + <&pxo_board>; }; diff --git a/arch/arm/boot/dts/sama7g5.dtsi b/arch/arm/boot/dts/sama7g5.dtsi index cc6be6db7b..6c58c151c6 100644 --- a/arch/arm/boot/dts/sama7g5.dtsi +++ b/arch/arm/boot/dts/sama7g5.dtsi @@ -75,6 +75,17 @@ soc { #size-cells = <1>; ranges; + securam: securam@e0000000 { + compatible = "microchip,sama7g5-securam", "atmel,sama5d2-securam", 
"mmio-sram"; + reg = <0xe0000000 0x4000>; + clocks = <&pmc PMC_TYPE_PERIPHERAL 18>; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0 0xe0000000 0x4000>; + no-memory-wc; + status = "okay"; + }; + secumod: secumod@e0004000 { compatible = "microchip,sama7g5-secumod", "atmel,sama5d2-secumod", "syscon"; reg = <0xe0004000 0x4000>; @@ -111,6 +122,17 @@ pmc: pmc@e0018000 { clock-names = "td_slck", "md_slck", "main_xtal"; }; + shdwc: shdwc@e001d010 { + compatible = "microchip,sama7g5-shdwc", "syscon"; + reg = <0xe001d010 0x10>; + clocks = <&clk32k 0>; + #address-cells = <1>; + #size-cells = <0>; + atmel,wakeup-rtc-timer; + atmel,wakeup-rtt-timer; + status = "disabled"; + }; + rtt: rtt@e001d020 { compatible = "microchip,sama7g5-rtt", "microchip,sam9x60-rtt", "atmel,at91sam9260-rtt"; reg = <0xe001d020 0x30>; @@ -137,6 +159,11 @@ ps_wdt: watchdog@e001d180 { clocks = <&clk32k 0>; }; + chipid@e0020000 { + compatible = "microchip,sama7g5-chipid"; + reg = <0xe0020000 0x8>; + }; + sdmmc0: mmc@e1204000 { compatible = "microchip,sama7g5-sdhci", "microchip,sam9x60-sdhci"; reg = <0xe1204000 0x4000>; @@ -515,6 +542,18 @@ spi11: spi@400 { }; }; + uddrc: uddrc@e3800000 { + compatible = "microchip,sama7g5-uddrc"; + reg = <0xe3800000 0x4000>; + status = "okay"; + }; + + ddr3phy: ddr3phy@e3804000 { + compatible = "microchip,sama7g5-ddr3phy"; + reg = <0xe3804000 0x1000>; + status = "okay"; + }; + gic: interrupt-controller@e8c11000 { compatible = "arm,cortex-a7-gic"; #interrupt-cells = <3>; diff --git a/arch/arm/boot/dts/vexpress-v2m-rs1.dtsi b/arch/arm/boot/dts/vexpress-v2m-rs1.dtsi index 2ad9fd7c94..8af4b77fe6 100644 --- a/arch/arm/boot/dts/vexpress-v2m-rs1.dtsi +++ b/arch/arm/boot/dts/vexpress-v2m-rs1.dtsi @@ -17,6 +17,7 @@ * TAKE CARE WHEN MAINTAINING THIS FILE TO PROPAGATE ANY RELEVANT * CHANGES TO vexpress-v2m.dtsi! 
*/ +#include / { v2m_fixed_3v3: fixed-regulator-0 { @@ -101,16 +102,68 @@ led-8 { }; bus@8000000 { - motherboard-bus { - model = "V2M-P1"; + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <1>; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 63>; + interrupt-map = <0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, + <0 1 &gic GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <0 2 &gic GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>, + <0 3 &gic GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>, + <0 4 &gic GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>, + <0 5 &gic GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>, + <0 6 &gic GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, + <0 7 &gic GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>, + <0 8 &gic GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>, + <0 9 &gic GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>, + <0 10 &gic GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>, + <0 11 &gic GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>, + <0 12 &gic GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>, + <0 13 &gic GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>, + <0 14 &gic GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>, + <0 15 &gic GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>, + <0 16 &gic GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>, + <0 17 &gic GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>, + <0 18 &gic GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>, + <0 19 &gic GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>, + <0 20 &gic GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>, + <0 21 &gic GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>, + <0 22 &gic GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>, + <0 23 &gic GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>, + <0 24 &gic GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>, + <0 25 &gic GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>, + <0 26 &gic GIC_SPI 26 IRQ_TYPE_LEVEL_HIGH>, + <0 27 &gic GIC_SPI 27 IRQ_TYPE_LEVEL_HIGH>, + <0 28 &gic GIC_SPI 28 IRQ_TYPE_LEVEL_HIGH>, + <0 29 &gic GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>, + <0 30 &gic GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>, + <0 31 &gic GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>, + <0 32 &gic GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, + <0 33 &gic GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, + <0 34 &gic GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, + <0 35 &gic GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>, + <0 36 &gic GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>, + <0 37 &gic GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>, + <0 38 &gic GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>, + <0 39 &gic GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>, + <0 40 &gic GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>, + <0 41 &gic GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, + <0 42 &gic GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>; + + motherboard-bus@8000000 { arm,hbi = <0x190>; arm,vexpress,site = <0>; - arm,v2m-memory-map = "rs1"; compatible = "arm,vexpress,v2m-p1", "simple-bus"; #address-cells = <2>; /* SMB chipselect number and offset */ #size-cells = <1>; - #interrupt-cells = <1>; - ranges; + ranges = <0 0 0x08000000 0x04000000>, + <1 0 0x14000000 0x04000000>, + <2 0 0x18000000 0x04000000>, + <3 0 0x1c000000 0x04000000>, + <4 0 0x0c000000 0x04000000>, + <5 0 0x10000000 0x04000000>; nor_flash: flash@0 { compatible = "arm,vexpress-flash", "cfi-flash"; @@ -215,7 +268,7 @@ aaci@40000 { clock-names = "apb_pclk"; }; - mmci@50000 { + mmc@50000 { compatible = "arm,pl180", "arm,primecell"; reg = <0x050000 0x1000>; interrupts = <9>, <10>; @@ -275,7 +328,7 @@ v2m_serial3: serial@c0000 { clock-names = "uartclk", "apb_pclk"; }; - wdt@f0000 { + watchdog@f0000 { compatible = "arm,sp805", "arm,primecell"; reg = <0x0f0000 0x1000>; interrupts = <0>; diff --git a/arch/arm/boot/dts/vexpress-v2m.dtsi b/arch/arm/boot/dts/vexpress-v2m.dtsi index ec13ceb9ed..f434fe5cf4 100644 --- a/arch/arm/boot/dts/vexpress-v2m.dtsi +++ b/arch/arm/boot/dts/vexpress-v2m.dtsi @@ -17,18 +17,73 @@ * TAKE CARE WHEN MAINTAINING THIS FILE TO PROPAGATE ANY RELEVANT * CHANGES TO vexpress-v2m-rs1.dtsi! 
*/ +#include / { - bus@4000000 { - motherboard { - model = "V2M-P1"; + bus@40000000 { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0x40000000 0x40000000 0x10000000>, + <0x10000000 0x10000000 0x00020000>; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 63>; + interrupt-map = <0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, + <0 1 &gic GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <0 2 &gic GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>, + <0 3 &gic GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>, + <0 4 &gic GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>, + <0 5 &gic GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>, + <0 6 &gic GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, + <0 7 &gic GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>, + <0 8 &gic GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>, + <0 9 &gic GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>, + <0 10 &gic GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>, + <0 11 &gic GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>, + <0 12 &gic GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>, + <0 13 &gic GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>, + <0 14 &gic GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>, + <0 15 &gic GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>, + <0 16 &gic GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>, + <0 17 &gic GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>, + <0 18 &gic GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>, + <0 19 &gic GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>, + <0 20 &gic GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>, + <0 21 &gic GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>, + <0 22 &gic GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>, + <0 23 &gic GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>, + <0 24 &gic GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>, + <0 25 &gic GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>, + <0 26 &gic GIC_SPI 26 IRQ_TYPE_LEVEL_HIGH>, + <0 27 &gic GIC_SPI 27 IRQ_TYPE_LEVEL_HIGH>, + <0 28 &gic GIC_SPI 28 IRQ_TYPE_LEVEL_HIGH>, + <0 29 &gic GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>, + <0 30 &gic GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>, + <0 31 &gic GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>, + <0 32 &gic GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, + <0 33 &gic GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, + <0 34 &gic GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, + <0 35 &gic GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>, + <0 36 &gic GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>, + <0 37 &gic GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>, + <0 38 &gic GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>, + <0 39 &gic GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>, + <0 40 &gic GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>, + <0 41 &gic GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, + <0 42 &gic GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>; + + motherboard-bus@40000000 { arm,hbi = <0x190>; arm,vexpress,site = <0>; compatible = "arm,vexpress,v2m-p1", "simple-bus"; #address-cells = <2>; /* SMB chipselect number and offset */ #size-cells = <1>; - #interrupt-cells = <1>; - ranges; + ranges = <0 0 0x40000000 0x04000000>, + <1 0 0x44000000 0x04000000>, + <2 0 0x48000000 0x04000000>, + <3 0 0x4c000000 0x04000000>, + <7 0 0x10000000 0x00020000>; flash@0,00000000 { compatible = "arm,vexpress-flash", "cfi-flash"; diff --git a/arch/arm/boot/dts/vexpress-v2p-ca15-tc1.dts b/arch/arm/boot/dts/vexpress-v2p-ca15-tc1.dts index e63c5c0bfb..679537e17f 100644 --- a/arch/arm/boot/dts/vexpress-v2p-ca15-tc1.dts +++ b/arch/arm/boot/dts/vexpress-v2p-ca15-tc1.dts @@ -237,62 +237,7 @@ energy { }; bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - - #interrupt-cells = <1>; - interrupt-map-mask = <0 0 63>; - interrupt-map = <0 0 0 &gic 0 0 4>, - <0 0 1 &gic 0 1 4>, - <0 0 2 &gic 0 2 4>, - <0 0 3 &gic 0 3 4>, - <0 0 4 &gic 0 4 4>, - <0 0 5 &gic 0 5 4>, - <0 0 6 &gic 0 6 4>, - <0 
0 7 &gic 0 7 4>, - <0 0 8 &gic 0 8 4>, - <0 0 9 &gic 0 9 4>, - <0 0 10 &gic 0 10 4>, - <0 0 11 &gic 0 11 4>, - <0 0 12 &gic 0 12 4>, - <0 0 13 &gic 0 13 4>, - <0 0 14 &gic 0 14 4>, - <0 0 15 &gic 0 15 4>, - <0 0 16 &gic 0 16 4>, - <0 0 17 &gic 0 17 4>, - <0 0 18 &gic 0 18 4>, - <0 0 19 &gic 0 19 4>, - <0 0 20 &gic 0 20 4>, - <0 0 21 &gic 0 21 4>, - <0 0 22 &gic 0 22 4>, - <0 0 23 &gic 0 23 4>, - <0 0 24 &gic 0 24 4>, - <0 0 25 &gic 0 25 4>, - <0 0 26 &gic 0 26 4>, - <0 0 27 &gic 0 27 4>, - <0 0 28 &gic 0 28 4>, - <0 0 29 &gic 0 29 4>, - <0 0 30 &gic 0 30 4>, - <0 0 31 &gic 0 31 4>, - <0 0 32 &gic 0 32 4>, - <0 0 33 &gic 0 33 4>, - <0 0 34 &gic 0 34 4>, - <0 0 35 &gic 0 35 4>, - <0 0 36 &gic 0 36 4>, - <0 0 37 &gic 0 37 4>, - <0 0 38 &gic 0 38 4>, - <0 0 39 &gic 0 39 4>, - <0 0 40 &gic 0 40 4>, - <0 0 41 &gic 0 41 4>, - <0 0 42 &gic 0 42 4>; + ranges = <0x8000000 0 0x8000000 0x18000000>; }; site2: hsb@40000000 { diff --git a/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts b/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts index 012d40a722..511e87cc2b 100644 --- a/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts +++ b/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts @@ -609,62 +609,7 @@ etm2_out_port: endpoint { }; smb: bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - - #interrupt-cells = <1>; - interrupt-map-mask = <0 0 63>; - interrupt-map = <0 0 0 &gic 0 0 4>, - <0 0 1 &gic 0 1 4>, - <0 0 2 &gic 0 2 4>, - <0 0 3 &gic 0 3 4>, - <0 0 4 &gic 0 4 4>, - <0 0 5 &gic 0 5 4>, - <0 0 6 &gic 0 6 4>, - <0 0 7 &gic 0 7 4>, - <0 0 8 &gic 0 8 4>, - <0 0 9 &gic 0 9 4>, - <0 0 10 &gic 0 10 4>, - <0 0 11 &gic 0 11 4>, - <0 0 12 &gic 0 12 4>, - <0 0 13 &gic 0 13 4>, - <0 0 14 &gic 0 14 4>, - <0 0 15 &gic 0 15 4>, - <0 0 16 &gic 0 16 4>, - <0 0 17 &gic 0 17 4>, - <0 0 18 &gic 0 18 4>, - <0 0 19 &gic 0 19 4>, - <0 0 20 &gic 0 20 4>, - <0 0 21 &gic 0 21 4>, - <0 0 22 &gic 0 22 4>, - <0 0 23 &gic 0 23 4>, - <0 0 24 &gic 0 24 4>, - <0 0 25 &gic 0 25 4>, - <0 0 26 &gic 0 26 4>, - <0 0 27 &gic 0 27 4>, - <0 0 28 &gic 0 28 4>, - <0 0 29 &gic 0 29 4>, - <0 0 30 &gic 0 30 4>, - <0 0 31 &gic 0 31 4>, - <0 0 32 &gic 0 32 4>, - <0 0 33 &gic 0 33 4>, - <0 0 34 &gic 0 34 4>, - <0 0 35 &gic 0 35 4>, - <0 0 36 &gic 0 36 4>, - <0 0 37 &gic 0 37 4>, - <0 0 38 &gic 0 38 4>, - <0 0 39 &gic 0 39 4>, - <0 0 40 &gic 0 40 4>, - <0 0 41 &gic 0 41 4>, - <0 0 42 &gic 0 42 4>; + ranges = <0x8000000 0 0x8000000 0x18000000>; }; site2: hsb@40000000 { diff --git a/arch/arm/boot/dts/vexpress-v2p-ca5s.dts b/arch/arm/boot/dts/vexpress-v2p-ca5s.dts index 7aa64ae257..3b88209bac 100644 --- a/arch/arm/boot/dts/vexpress-v2p-ca5s.dts +++ b/arch/arm/boot/dts/vexpress-v2p-ca5s.dts @@ -207,62 +207,7 @@ temp-dcc { }; smb: bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0x08000000 0x04000000>, - <1 0 0x14000000 0x04000000>, - <2 0 0x18000000 0x04000000>, - <3 0 0x1c000000 0x04000000>, - <4 0 0x0c000000 0x04000000>, - <5 0 0x10000000 0x04000000>; - - #interrupt-cells = <1>; - interrupt-map-mask = <0 0 63>; - interrupt-map = <0 0 0 &gic 0 0 4>, - <0 0 1 &gic 0 1 4>, - <0 0 2 &gic 0 2 4>, - <0 0 3 &gic 0 3 4>, - <0 0 4 &gic 0 4 4>, - <0 0 5 &gic 0 5 4>, - <0 0 6 &gic 0 6 4>, - <0 0 7 &gic 0 7 4>, - <0 0 8 &gic 0 8 4>, - <0 0 9 &gic 0 9 4>, - <0 0 10 &gic 0 10 4>, - <0 0 11 
&gic 0 11 4>, - <0 0 12 &gic 0 12 4>, - <0 0 13 &gic 0 13 4>, - <0 0 14 &gic 0 14 4>, - <0 0 15 &gic 0 15 4>, - <0 0 16 &gic 0 16 4>, - <0 0 17 &gic 0 17 4>, - <0 0 18 &gic 0 18 4>, - <0 0 19 &gic 0 19 4>, - <0 0 20 &gic 0 20 4>, - <0 0 21 &gic 0 21 4>, - <0 0 22 &gic 0 22 4>, - <0 0 23 &gic 0 23 4>, - <0 0 24 &gic 0 24 4>, - <0 0 25 &gic 0 25 4>, - <0 0 26 &gic 0 26 4>, - <0 0 27 &gic 0 27 4>, - <0 0 28 &gic 0 28 4>, - <0 0 29 &gic 0 29 4>, - <0 0 30 &gic 0 30 4>, - <0 0 31 &gic 0 31 4>, - <0 0 32 &gic 0 32 4>, - <0 0 33 &gic 0 33 4>, - <0 0 34 &gic 0 34 4>, - <0 0 35 &gic 0 35 4>, - <0 0 36 &gic 0 36 4>, - <0 0 37 &gic 0 37 4>, - <0 0 38 &gic 0 38 4>, - <0 0 39 &gic 0 39 4>, - <0 0 40 &gic 0 40 4>, - <0 0 41 &gic 0 41 4>, - <0 0 42 &gic 0 42 4>; + ranges = <0 0x8000000 0x18000000>; }; site2: hsb@40000000 { diff --git a/arch/arm/boot/dts/vexpress-v2p-ca9.dts b/arch/arm/boot/dts/vexpress-v2p-ca9.dts index 4c58479558..5916e4877e 100644 --- a/arch/arm/boot/dts/vexpress-v2p-ca9.dts +++ b/arch/arm/boot/dts/vexpress-v2p-ca9.dts @@ -295,64 +295,6 @@ power-vd10-s3 { }; }; - smb: bus@4000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0x40000000 0x04000000>, - <1 0 0x44000000 0x04000000>, - <2 0 0x48000000 0x04000000>, - <3 0 0x4c000000 0x04000000>, - <7 0 0x10000000 0x00020000>; - - #interrupt-cells = <1>; - interrupt-map-mask = <0 0 63>; - interrupt-map = <0 0 0 &gic 0 0 4>, - <0 0 1 &gic 0 1 4>, - <0 0 2 &gic 0 2 4>, - <0 0 3 &gic 0 3 4>, - <0 0 4 &gic 0 4 4>, - <0 0 5 &gic 0 5 4>, - <0 0 6 &gic 0 6 4>, - <0 0 7 &gic 0 7 4>, - <0 0 8 &gic 0 8 4>, - <0 0 9 &gic 0 9 4>, - <0 0 10 &gic 0 10 4>, - <0 0 11 &gic 0 11 4>, - <0 0 12 &gic 0 12 4>, - <0 0 13 &gic 0 13 4>, - <0 0 14 &gic 0 14 4>, - <0 0 15 &gic 0 15 4>, - <0 0 16 &gic 0 16 4>, - <0 0 17 &gic 0 17 4>, - <0 0 18 &gic 0 18 4>, - <0 0 19 &gic 0 19 4>, - <0 0 20 &gic 0 20 4>, - <0 0 21 &gic 0 21 4>, - <0 0 22 &gic 0 22 4>, - <0 0 23 &gic 0 23 4>, - <0 0 24 &gic 0 24 4>, - <0 0 25 &gic 0 25 4>, - <0 0 26 &gic 0 26 4>, - <0 0 27 &gic 0 27 4>, - <0 0 28 &gic 0 28 4>, - <0 0 29 &gic 0 29 4>, - <0 0 30 &gic 0 30 4>, - <0 0 31 &gic 0 31 4>, - <0 0 32 &gic 0 32 4>, - <0 0 33 &gic 0 33 4>, - <0 0 34 &gic 0 34 4>, - <0 0 35 &gic 0 35 4>, - <0 0 36 &gic 0 36 4>, - <0 0 37 &gic 0 37 4>, - <0 0 38 &gic 0 38 4>, - <0 0 39 &gic 0 39 4>, - <0 0 40 &gic 0 40 4>, - <0 0 41 &gic 0 41 4>, - <0 0 42 &gic 0 42 4>; - }; - site2: hsb@e0000000 { compatible = "simple-bus"; #address-cells = <1>; diff --git a/arch/arm/common/sharpsl_param.c b/arch/arm/common/sharpsl_param.c index efeb5724d9..6237ede2f0 100644 --- a/arch/arm/common/sharpsl_param.c +++ b/arch/arm/common/sharpsl_param.c @@ -40,7 +40,9 @@ EXPORT_SYMBOL(sharpsl_param); void sharpsl_save_param(void) { - memcpy(&sharpsl_param, param_start(PARAM_BASE), sizeof(struct sharpsl_param_info)); + struct sharpsl_param_info *params = param_start(PARAM_BASE); + + memcpy(&sharpsl_param, params, sizeof(*params)); if (sharpsl_param.comadj_keyword != COMADJ_MAGIC) sharpsl_param.comadj=-1; diff --git a/arch/arm/configs/bcm2709_defconfig b/arch/arm/configs/bcm2709_defconfig index 39a79c6a0b..77553d3241 100644 --- a/arch/arm/configs/bcm2709_defconfig +++ b/arch/arm/configs/bcm2709_defconfig @@ -1320,6 +1320,7 @@ CONFIG_BTRFS_FS=m CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m CONFIG_F2FS_FS=y +CONFIG_F2FS_FS_SECURITY=y CONFIG_FS_ENCRYPTION=y CONFIG_FANOTIFY=y CONFIG_QFMT_V1=m diff --git a/arch/arm/configs/bcm2711_defconfig b/arch/arm/configs/bcm2711_defconfig index 
2db6625274..a15b3e0a02 100644 --- a/arch/arm/configs/bcm2711_defconfig +++ b/arch/arm/configs/bcm2711_defconfig @@ -509,6 +509,7 @@ CONFIG_BCMGENET=y CONFIG_ENC28J60=m CONFIG_QCA7000_SPI=m CONFIG_QCA7000_UART=m +CONFIG_R8169=m CONFIG_WIZNET_W5100=m CONFIG_WIZNET_W5100_SPI=m CONFIG_MICREL_PHY=y @@ -1358,6 +1359,7 @@ CONFIG_BTRFS_FS=m CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m CONFIG_F2FS_FS=y +CONFIG_F2FS_FS_SECURITY=y CONFIG_FS_ENCRYPTION=y CONFIG_FANOTIFY=y CONFIG_QFMT_V1=m diff --git a/arch/arm/configs/bcmrpi_defconfig b/arch/arm/configs/bcmrpi_defconfig index 2f64a60e1e..f1a78aa396 100644 --- a/arch/arm/configs/bcmrpi_defconfig +++ b/arch/arm/configs/bcmrpi_defconfig @@ -1331,6 +1331,7 @@ CONFIG_BTRFS_FS=m CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m CONFIG_F2FS_FS=y +CONFIG_F2FS_FS_SECURITY=y CONFIG_FANOTIFY=y CONFIG_QFMT_V1=m CONFIG_QFMT_V2=m diff --git a/arch/arm/configs/gemini_defconfig b/arch/arm/configs/gemini_defconfig index d2d5f1cf81..e6ff844821 100644 --- a/arch/arm/configs/gemini_defconfig +++ b/arch/arm/configs/gemini_defconfig @@ -76,6 +76,7 @@ CONFIG_REGULATOR_FIXED_VOLTAGE=y CONFIG_DRM=y CONFIG_DRM_PANEL_ILITEK_IL9322=y CONFIG_DRM_TVE200=y +CONFIG_FB=y CONFIG_LOGO=y CONFIG_USB=y CONFIG_USB_MON=y diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index ccee86d004..5e4128dadd 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -292,6 +292,7 @@ CONFIG_DRM_IMX_LDB=y CONFIG_DRM_IMX_HDMI=y CONFIG_DRM_ETNAVIV=y CONFIG_DRM_MXSFB=y +CONFIG_FB=y CONFIG_FB_MODE_HELPERS=y CONFIG_LCD_CLASS_DEVICE=y CONFIG_LCD_L4F00242T03=y diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index ba67c4717d..33572998db 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -197,7 +197,6 @@ CONFIG_PCI_EPF_TEST=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_OMAP_OCP2SCP=y -CONFIG_SIMPLE_PM_BUS=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y @@ -456,6 +455,7 @@ CONFIG_PINCTRL_STMFX=y CONFIG_PINCTRL_PALMAS=y CONFIG_PINCTRL_OWL=y CONFIG_PINCTRL_S500=y +CONFIG_PINCTRL_MSM=y CONFIG_PINCTRL_APQ8064=y CONFIG_PINCTRL_APQ8084=y CONFIG_PINCTRL_IPQ8064=y @@ -725,6 +725,7 @@ CONFIG_DRM_PL111=m CONFIG_DRM_LIMA=m CONFIG_DRM_PANFROST=m CONFIG_DRM_ASPEED_GFX=m +CONFIG_FB=y CONFIG_FB_EFI=y CONFIG_FB_WM8505=y CONFIG_FB_SH_MOBILE_LCDC=y @@ -1122,6 +1123,7 @@ CONFIG_PHY_DM816X_USB=m CONFIG_OMAP_USB2=y CONFIG_TI_PIPE3=y CONFIG_TWL4030_USB=m +CONFIG_RAS=y CONFIG_NVMEM_IMX_OCOTP=y CONFIG_ROCKCHIP_EFUSE=m CONFIG_NVMEM_SUNXI_SID=y diff --git a/arch/arm/configs/oxnas_v6_defconfig b/arch/arm/configs/oxnas_v6_defconfig index cae0db6b4e..de37f7e909 100644 --- a/arch/arm/configs/oxnas_v6_defconfig +++ b/arch/arm/configs/oxnas_v6_defconfig @@ -46,7 +46,6 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=64 -CONFIG_SIMPLE_PM_BUS=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y diff --git a/arch/arm/configs/shmobile_defconfig b/arch/arm/configs/shmobile_defconfig index d9a27e4e09..18d2a960b2 100644 --- a/arch/arm/configs/shmobile_defconfig +++ b/arch/arm/configs/shmobile_defconfig @@ -40,7 +40,6 @@ CONFIG_PCI_RCAR_GEN2=y CONFIG_PCIE_RCAR_HOST=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y -CONFIG_SIMPLE_PM_BUS=y CONFIG_MTD=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index d6cfe7c4bb..8711d6824c 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c 
@@ -47,12 +47,26 @@ struct at91_pm_bu { unsigned long ddr_phy_calibration[BACKUP_DDR_PHY_CALIBRATION]; }; +/* + * struct at91_pm_sfrbu_regs: registers mapping for SFRBU + * @pswbu: power switch BU control registers + */ +struct at91_pm_sfrbu_regs { + struct { + u32 key; + u32 ctrl; + u32 state; + u32 softsw; + } pswbu; +}; + /** * struct at91_soc_pm - AT91 SoC power management data structure * @config_shdwc_ws: wakeup sources configuration function for SHDWC * @config_pmc_ws: wakeup sources configuration function for PMC * @ws_ids: wakeup sources of_device_id array * @data: PM data to be used on last phase of suspend + * @sfrbu_regs: SFRBU registers mapping * @bu: backup unit mapped data (for backup mode) * @memcs: memory chip select */ @@ -62,6 +76,7 @@ struct at91_soc_pm { const struct of_device_id *ws_ids; struct at91_pm_bu *bu; struct at91_pm_data data; + struct at91_pm_sfrbu_regs sfrbu_regs; void *memcs; }; @@ -356,9 +371,36 @@ static int at91_suspend_finish(unsigned long val) return 0; } +static void at91_pm_switch_ba_to_vbat(void) +{ + unsigned int offset = offsetof(struct at91_pm_sfrbu_regs, pswbu); + unsigned int val; + + /* Just for safety. */ + if (!soc_pm.data.sfrbu) + return; + + val = readl(soc_pm.data.sfrbu + offset); + + /* Already on VBAT. */ + if (!(val & soc_pm.sfrbu_regs.pswbu.state)) + return; + + val &= ~soc_pm.sfrbu_regs.pswbu.softsw; + val |= soc_pm.sfrbu_regs.pswbu.key | soc_pm.sfrbu_regs.pswbu.ctrl; + writel(val, soc_pm.data.sfrbu + offset); + + /* Wait for update. */ + val = readl(soc_pm.data.sfrbu + offset); + while (val & soc_pm.sfrbu_regs.pswbu.state) + val = readl(soc_pm.data.sfrbu + offset); +} + static void at91_pm_suspend(suspend_state_t state) { if (soc_pm.data.mode == AT91_PM_BACKUP) { + at91_pm_switch_ba_to_vbat(); + cpu_suspend(0, at91_suspend_finish); /* The SRAM is lost between suspend cycles */ @@ -589,18 +631,22 @@ static const struct of_device_id ramc_phy_ids[] __initconst = { { /* Sentinel. */ }, }; -static __init void at91_dt_ramc(bool phy_mandatory) +static __init int at91_dt_ramc(bool phy_mandatory) { struct device_node *np; const struct of_device_id *of_id; int idx = 0; void *standby = NULL; const struct ramc_info *ramc; + int ret; for_each_matching_node_and_match(np, ramc_ids, &of_id) { soc_pm.data.ramc[idx] = of_iomap(np, 0); - if (!soc_pm.data.ramc[idx]) - panic(pr_fmt("unable to map ramc[%d] cpu registers\n"), idx); + if (!soc_pm.data.ramc[idx]) { + pr_err("unable to map ramc[%d] cpu registers\n", idx); + ret = -ENOMEM; + goto unmap_ramc; + } ramc = of_id->data; if (ramc) { @@ -612,25 +658,42 @@ static __init void at91_dt_ramc(bool phy_mandatory) idx++; } - if (!idx) - panic(pr_fmt("unable to find compatible ram controller node in dtb\n")); + if (!idx) { + pr_err("unable to find compatible ram controller node in dtb\n"); + ret = -ENODEV; + goto unmap_ramc; + } /* Lookup for DDR PHY node, if any.
*/ for_each_matching_node_and_match(np, ramc_phy_ids, &of_id) { soc_pm.data.ramc_phy = of_iomap(np, 0); - if (!soc_pm.data.ramc_phy) - panic(pr_fmt("unable to map ramc phy cpu registers\n")); + if (!soc_pm.data.ramc_phy) { + pr_err("unable to map ramc phy cpu registers\n"); + ret = -ENOMEM; + goto unmap_ramc; + } } - if (phy_mandatory && !soc_pm.data.ramc_phy) - panic(pr_fmt("DDR PHY is mandatory!\n")); + if (phy_mandatory && !soc_pm.data.ramc_phy) { + pr_err("DDR PHY is mandatory!\n"); + ret = -ENODEV; + goto unmap_ramc; + } if (!standby) { pr_warn("ramc no standby function available\n"); - return; + return 0; } at91_cpuidle_device.dev.platform_data = standby; + + return 0; + +unmap_ramc: + while (idx) + iounmap(soc_pm.data.ramc[--idx]); + + return ret; } static void at91rm9200_idle(void) @@ -1017,6 +1080,8 @@ static void __init at91_pm_init(void (*pm_idle)(void)) void __init at91rm9200_pm_init(void) { + int ret; + if (!IS_ENABLED(CONFIG_SOC_AT91RM9200)) return; @@ -1028,7 +1093,9 @@ void __init at91rm9200_pm_init(void) soc_pm.data.standby_mode = AT91_PM_STANDBY; soc_pm.data.suspend_mode = AT91_PM_ULP0; - at91_dt_ramc(false); + ret = at91_dt_ramc(false); + if (ret) + return; /* * AT91RM9200 SDRAM low-power mode cannot be used with self-refresh. @@ -1046,13 +1113,17 @@ void __init sam9x60_pm_init(void) static const int iomaps[] __initconst = { [AT91_PM_ULP1] = AT91_PM_IOMAP(SHDWC), }; + int ret; if (!IS_ENABLED(CONFIG_SOC_SAM9X60)) return; at91_pm_modes_validate(modes, ARRAY_SIZE(modes)); at91_pm_modes_init(iomaps, ARRAY_SIZE(iomaps)); - at91_dt_ramc(false); + ret = at91_dt_ramc(false); + if (ret) + return; + at91_pm_init(NULL); soc_pm.ws_ids = sam9x60_ws_ids; @@ -1061,6 +1132,8 @@ void __init sam9x60_pm_init(void) void __init at91sam9_pm_init(void) { + int ret; + if (!IS_ENABLED(CONFIG_SOC_AT91SAM9)) return; @@ -1072,7 +1145,10 @@ void __init at91sam9_pm_init(void) soc_pm.data.standby_mode = AT91_PM_STANDBY; soc_pm.data.suspend_mode = AT91_PM_ULP0; - at91_dt_ramc(false); + ret = at91_dt_ramc(false); + if (ret) + return; + at91_pm_init(at91sam9_idle); } @@ -1081,12 +1157,16 @@ void __init sama5_pm_init(void) static const int modes[] __initconst = { AT91_PM_STANDBY, AT91_PM_ULP0, AT91_PM_ULP0_FAST, }; + int ret; if (!IS_ENABLED(CONFIG_SOC_SAMA5)) return; at91_pm_modes_validate(modes, ARRAY_SIZE(modes)); - at91_dt_ramc(false); + ret = at91_dt_ramc(false); + if (ret) + return; + at91_pm_init(NULL); } @@ -1101,18 +1181,27 @@ void __init sama5d2_pm_init(void) [AT91_PM_BACKUP] = AT91_PM_IOMAP(SHDWC) | AT91_PM_IOMAP(SFRBU), }; + int ret; if (!IS_ENABLED(CONFIG_SOC_SAMA5D2)) return; at91_pm_modes_validate(modes, ARRAY_SIZE(modes)); at91_pm_modes_init(iomaps, ARRAY_SIZE(iomaps)); - at91_dt_ramc(false); + ret = at91_dt_ramc(false); + if (ret) + return; + at91_pm_init(NULL); soc_pm.ws_ids = sama5d2_ws_ids; soc_pm.config_shdwc_ws = at91_sama5d2_config_shdwc_ws; soc_pm.config_pmc_ws = at91_sama5d2_config_pmc_ws; + + soc_pm.sfrbu_regs.pswbu.key = (0x4BD20C << 8); + soc_pm.sfrbu_regs.pswbu.ctrl = BIT(0); + soc_pm.sfrbu_regs.pswbu.softsw = BIT(1); + soc_pm.sfrbu_regs.pswbu.state = BIT(3); } void __init sama7_pm_init(void) @@ -1127,18 +1216,27 @@ void __init sama7_pm_init(void) [AT91_PM_BACKUP] = AT91_PM_IOMAP(SFRBU) | AT91_PM_IOMAP(SHDWC), }; + int ret; if (!IS_ENABLED(CONFIG_SOC_SAMA7)) return; at91_pm_modes_validate(modes, ARRAY_SIZE(modes)); - at91_dt_ramc(true); + ret = at91_dt_ramc(true); + if (ret) + return; + at91_pm_modes_init(iomaps, ARRAY_SIZE(iomaps)); at91_pm_init(NULL); soc_pm.ws_ids = 
sama7g5_ws_ids; soc_pm.config_pmc_ws = at91_sam9x60_config_pmc_ws; + + soc_pm.sfrbu_regs.pswbu.key = (0x4BD20C << 8); + soc_pm.sfrbu_regs.pswbu.ctrl = BIT(0); + soc_pm.sfrbu_regs.pswbu.softsw = BIT(1); + soc_pm.sfrbu_regs.pswbu.state = BIT(2); } static int __init at91_pm_modes_select(char *str) diff --git a/arch/arm/mach-at91/pm_suspend.S b/arch/arm/mach-at91/pm_suspend.S index cbd61a3bca..fdb4f63ecd 100644 --- a/arch/arm/mach-at91/pm_suspend.S +++ b/arch/arm/mach-at91/pm_suspend.S @@ -1014,31 +1014,55 @@ ENTRY(at91_pm_suspend_in_sram) mov tmp1, #0 mcr p15, 0, tmp1, c7, c10, 4 - ldr tmp1, [r0, #PM_DATA_PMC] - str tmp1, .pmc_base - ldr tmp1, [r0, #PM_DATA_RAMC0] - str tmp1, .sramc_base - ldr tmp1, [r0, #PM_DATA_RAMC1] - str tmp1, .sramc1_base - ldr tmp1, [r0, #PM_DATA_RAMC_PHY] - str tmp1, .sramc_phy_base - ldr tmp1, [r0, #PM_DATA_MEMCTRL] - str tmp1, .memtype - ldr tmp1, [r0, #PM_DATA_MODE] - str tmp1, .pm_mode + /* Flush tlb. */ + mov r4, #0 + mcr p15, 0, r4, c8, c7, 0 + ldr tmp1, [r0, #PM_DATA_PMC_MCKR_OFFSET] str tmp1, .mckr_offset ldr tmp1, [r0, #PM_DATA_PMC_VERSION] str tmp1, .pmc_version - /* Both ldrne below are here to preload their address in the TLB */ + ldr tmp1, [r0, #PM_DATA_MEMCTRL] + str tmp1, .memtype + ldr tmp1, [r0, #PM_DATA_MODE] + str tmp1, .pm_mode + + /* + * ldrne below are here to preload their address in the TLB as access + * to RAM may be limited while in self-refresh. + */ + ldr tmp1, [r0, #PM_DATA_PMC] + str tmp1, .pmc_base + cmp tmp1, #0 + ldrne tmp2, [tmp1, #0] + + ldr tmp1, [r0, #PM_DATA_RAMC0] + str tmp1, .sramc_base + cmp tmp1, #0 + ldrne tmp2, [tmp1, #0] + + ldr tmp1, [r0, #PM_DATA_RAMC1] + str tmp1, .sramc1_base + cmp tmp1, #0 + ldrne tmp2, [tmp1, #0] + +#ifndef CONFIG_SOC_SAM_V4_V5 + /* ldrne below are here to preload their address in the TLB */ + ldr tmp1, [r0, #PM_DATA_RAMC_PHY] + str tmp1, .sramc_phy_base + cmp tmp1, #0 + ldrne tmp2, [tmp1, #0] + ldr tmp1, [r0, #PM_DATA_SHDWC] str tmp1, .shdwc cmp tmp1, #0 ldrne tmp2, [tmp1, #0] + ldr tmp1, [r0, #PM_DATA_SFRBU] str tmp1, .sfrbu cmp tmp1, #0 ldrne tmp2, [tmp1, #0x10] +#endif /* Active the self-refresh mode */ at91_sramc_self_refresh_ena diff --git a/arch/arm/mach-dove/include/mach/uncompress.h b/arch/arm/mach-dove/include/mach/uncompress.h index 7a4bd88380..ddf873f35e 100644 --- a/arch/arm/mach-dove/include/mach/uncompress.h +++ b/arch/arm/mach-dove/include/mach/uncompress.h @@ -11,7 +11,7 @@ #define LSR_THRE 0x20 -static void putc(const char c) +static inline void putc(const char c) { int i; @@ -24,7 +24,7 @@ static void putc(const char c) *UART_THR = c; } -static void flush(void) +static inline void flush(void) { } diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c index 11dcc369ec..c9d7c29d95 100644 --- a/arch/arm/mach-imx/mach-imx6q.c +++ b/arch/arm/mach-imx/mach-imx6q.c @@ -172,6 +172,9 @@ static void __init imx6q_init_machine(void) imx_get_soc_revision()); imx6q_enet_phy_init(); + + of_platform_default_populate(NULL, NULL, NULL); + imx_anatop_init(); cpu_is_imx6q() ? 
imx6q_pm_init() : imx6dl_pm_init(); imx6q_1588_init(); diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c index 9244437cb1..f2ecca3399 100644 --- a/arch/arm/mach-imx/pm-imx6.c +++ b/arch/arm/mach-imx/pm-imx6.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -619,6 +620,7 @@ static void __init imx6_pm_common_init(const struct imx6_pm_socdata static void imx6_pm_stby_poweroff(void) { + gic_cpu_if_down(0); imx6_set_lpm(STOP_POWER_OFF); imx6q_suspend_finish(0); diff --git a/arch/arm/mach-imx/src.c b/arch/arm/mach-imx/src.c index 95fd1fbb08..59a8e8cc44 100644 --- a/arch/arm/mach-imx/src.c +++ b/arch/arm/mach-imx/src.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -81,11 +82,6 @@ static const struct reset_control_ops imx_src_ops = { .reset = imx_src_reset_module, }; -static struct reset_controller_dev imx_reset_controller = { - .ops = &imx_src_ops, - .nr_resets = ARRAY_SIZE(sw_reset_bits), -}; - static void imx_gpcv2_set_m_core_pgc(bool enable, u32 offset) { writel_relaxed(enable, gpc_base + offset); @@ -177,10 +173,6 @@ void __init imx_src_init(void) src_base = of_iomap(np, 0); WARN_ON(!src_base); - imx_reset_controller.of_node = np; - if (IS_ENABLED(CONFIG_RESET_CONTROLLER)) - reset_controller_register(&imx_reset_controller); - /* * force warm reset sources to generate cold reset * for a more reliable restart @@ -214,3 +206,33 @@ void __init imx7_src_init(void) if (!gpc_base) return; } + +static const struct of_device_id imx_src_dt_ids[] = { + { .compatible = "fsl,imx51-src" }, + { /* sentinel */ } +}; + +static int imx_src_probe(struct platform_device *pdev) +{ + struct reset_controller_dev *rcdev; + + rcdev = devm_kzalloc(&pdev->dev, sizeof(*rcdev), GFP_KERNEL); + if (!rcdev) + return -ENOMEM; + + rcdev->ops = &imx_src_ops; + rcdev->dev = &pdev->dev; + rcdev->of_node = pdev->dev.of_node; + rcdev->nr_resets = ARRAY_SIZE(sw_reset_bits); + + return devm_reset_controller_register(&pdev->dev, rcdev); +} + +static struct platform_driver imx_src_driver = { + .driver = { + .name = "imx-src", + .of_match_table = imx_src_dt_ids, + }, + .probe = imx_src_probe, +}; +builtin_platform_driver(imx_src_driver); diff --git a/arch/arm/mach-omap1/include/mach/memory.h b/arch/arm/mach-omap1/include/mach/memory.h index 36bc0000cb..ba3a350479 100644 --- a/arch/arm/mach-omap1/include/mach/memory.h +++ b/arch/arm/mach-omap1/include/mach/memory.h @@ -9,16 +9,4 @@ /* REVISIT: omap1 legacy drivers still rely on this */ #include -/* - * Bus address is physical address, except for OMAP-1510 Local Bus. - * OMAP-1510 bus address is translated into a Local Bus address if the - * OMAP bus type is lbus. We do the address translation based on the - * device overriding the defaults used in the dma-mapping API. 
- */ - -/* - * OMAP-1510 Local Bus address offset - */ -#define OMAP1510_LB_OFFSET UL(0x30000000) - #endif diff --git a/arch/arm/mach-omap1/usb.c b/arch/arm/mach-omap1/usb.c index 86d3b3c157..e60831c82b 100644 --- a/arch/arm/mach-omap1/usb.c +++ b/arch/arm/mach-omap1/usb.c @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -206,8 +207,6 @@ static inline void udc_device_init(struct omap_usb_config *pdata) #endif -#if IS_ENABLED(CONFIG_USB_OHCI_HCD) - /* The dmamask must be set for OHCI to work */ static u64 ohci_dmamask = ~(u32)0; @@ -236,20 +235,15 @@ static struct platform_device ohci_device = { static inline void ohci_device_init(struct omap_usb_config *pdata) { + if (!IS_ENABLED(CONFIG_USB_OHCI_HCD)) + return; + if (cpu_is_omap7xx()) ohci_resources[1].start = INT_7XX_USB_HHC_1; pdata->ohci_device = &ohci_device; pdata->ocpi_enable = &ocpi_enable; } -#else - -static inline void ohci_device_init(struct omap_usb_config *pdata) -{ -} - -#endif - #if defined(CONFIG_USB_OTG) && defined(CONFIG_ARCH_OMAP_OTG) static struct resource otg_resources[] = { @@ -534,6 +528,79 @@ static u32 __init omap1_usb2_init(unsigned nwires, unsigned alt_pingroup) } #ifdef CONFIG_ARCH_OMAP15XX +/* OMAP-1510 OHCI has its own MMU for DMA */ +#define OMAP1510_LB_MEMSIZE 32 /* Should be same as SDRAM size */ +#define OMAP1510_LB_CLOCK_DIV 0xfffec10c +#define OMAP1510_LB_MMU_CTL 0xfffec208 +#define OMAP1510_LB_MMU_LCK 0xfffec224 +#define OMAP1510_LB_MMU_LD_TLB 0xfffec228 +#define OMAP1510_LB_MMU_CAM_H 0xfffec22c +#define OMAP1510_LB_MMU_CAM_L 0xfffec230 +#define OMAP1510_LB_MMU_RAM_H 0xfffec234 +#define OMAP1510_LB_MMU_RAM_L 0xfffec238 + +/* + * Bus address is physical address, except for OMAP-1510 Local Bus. + * OMAP-1510 bus address is translated into a Local Bus address if the + * OMAP bus type is lbus. + */ +#define OMAP1510_LB_OFFSET UL(0x30000000) + +/* + * OMAP-1510 specific Local Bus clock on/off + */ +static int omap_1510_local_bus_power(int on) +{ + if (on) { + omap_writel((1 << 1) | (1 << 0), OMAP1510_LB_MMU_CTL); + udelay(200); + } else { + omap_writel(0, OMAP1510_LB_MMU_CTL); + } + + return 0; +} + +/* + * OMAP-1510 specific Local Bus initialization + * NOTE: This assumes 32MB memory size in OMAP1510LB_MEMSIZE. + * See also arch/mach-omap/memory.h for __virt_to_dma() and + * __dma_to_virt() which need to match with the physical + * Local Bus address below. 
+ */ +static int omap_1510_local_bus_init(void) +{ + unsigned int tlb; + unsigned long lbaddr, physaddr; + + omap_writel((omap_readl(OMAP1510_LB_CLOCK_DIV) & 0xfffffff8) | 0x4, + OMAP1510_LB_CLOCK_DIV); + + /* Configure the Local Bus MMU table */ + for (tlb = 0; tlb < OMAP1510_LB_MEMSIZE; tlb++) { + lbaddr = tlb * 0x00100000 + OMAP1510_LB_OFFSET; + physaddr = tlb * 0x00100000 + PHYS_OFFSET; + omap_writel((lbaddr & 0x0fffffff) >> 22, OMAP1510_LB_MMU_CAM_H); + omap_writel(((lbaddr & 0x003ffc00) >> 6) | 0xc, + OMAP1510_LB_MMU_CAM_L); + omap_writel(physaddr >> 16, OMAP1510_LB_MMU_RAM_H); + omap_writel((physaddr & 0x0000fc00) | 0x300, OMAP1510_LB_MMU_RAM_L); + omap_writel(tlb << 4, OMAP1510_LB_MMU_LCK); + omap_writel(0x1, OMAP1510_LB_MMU_LD_TLB); + } + + /* Enable the walking table */ + omap_writel(omap_readl(OMAP1510_LB_MMU_CTL) | (1 << 3), OMAP1510_LB_MMU_CTL); + udelay(200); + + return 0; +} + +static void omap_1510_local_bus_reset(void) +{ + omap_1510_local_bus_power(1); + omap_1510_local_bus_init(); +} /* ULPD_DPLL_CTRL */ #define DPLL_IOB (1 << 13) @@ -543,25 +610,6 @@ static u32 __init omap1_usb2_init(unsigned nwires, unsigned alt_pingroup) /* ULPD_APLL_CTRL */ #define APLL_NDPLL_SWITCH (1 << 0) -static int omap_1510_usb_ohci_notifier(struct notifier_block *nb, - unsigned long event, void *data) -{ - struct device *dev = data; - - if (event != BUS_NOTIFY_ADD_DEVICE) - return NOTIFY_DONE; - - if (strncmp(dev_name(dev), "ohci", 4) == 0 && - dma_direct_set_offset(dev, PHYS_OFFSET, OMAP1510_LB_OFFSET, - (u64)-1)) - WARN_ONCE(1, "failed to set DMA offset\n"); - return NOTIFY_OK; -} - -static struct notifier_block omap_1510_usb_ohci_nb = { - .notifier_call = omap_1510_usb_ohci_notifier, -}; - static void __init omap_1510_usb_init(struct omap_usb_config *config) { unsigned int val; @@ -616,19 +664,19 @@ static void __init omap_1510_usb_init(struct omap_usb_config *config) } #endif -#if IS_ENABLED(CONFIG_USB_OHCI_HCD) - if (config->register_host) { + if (IS_ENABLED(CONFIG_USB_OHCI_HCD) && config->register_host) { int status; - bus_register_notifier(&platform_bus_type, - &omap_1510_usb_ohci_nb); ohci_device.dev.platform_data = config; + dma_direct_set_offset(&ohci_device.dev, PHYS_OFFSET, + OMAP1510_LB_OFFSET, (u64)-1); status = platform_device_register(&ohci_device); if (status) pr_debug("can't register OHCI device, %d\n", status); /* hcd explicitly gates 48MHz */ + + config->lb_reset = omap_1510_local_bus_reset; } -#endif } #else diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig index 7f13adf26e..02c253de9b 100644 --- a/arch/arm/mach-omap2/Kconfig +++ b/arch/arm/mach-omap2/Kconfig @@ -112,7 +112,6 @@ config ARCH_OMAP2PLUS select PM_GENERIC_DOMAINS select PM_GENERIC_DOMAINS_OF select RESET_CONTROLLER - select SIMPLE_PM_BUS select SOC_BUS select TI_SYSC select OMAP_IRQCHIP diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index 12b26e0468..0c2936c7a3 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -3614,6 +3614,8 @@ int omap_hwmod_init_module(struct device *dev, oh->flags |= HWMOD_SWSUP_SIDLE_ACT; if (data->cfg->quirks & SYSC_QUIRK_SWSUP_MSTANDBY) oh->flags |= HWMOD_SWSUP_MSTANDBY; + if (data->cfg->quirks & SYSC_QUIRK_CLKDM_NOAUTO) + oh->flags |= HWMOD_CLKDM_NOAUTO; error = omap_hwmod_check_module(dev, oh, data, sysc_fields, rev_offs, sysc_offs, syss_offs, diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index a951276f05..a903b26cde 100644 --- a/arch/arm/net/bpf_jit_32.c +++ 
b/arch/arm/net/bpf_jit_32.c @@ -36,6 +36,10 @@ * +-----+ * |RSVD | JIT scratchpad * current ARM_SP => +-----+ <= (BPF_FP - STACK_SIZE + SCRATCH_SIZE) + * | ... | caller-saved registers + * +-----+ + * | ... | arguments passed on stack + * ARM_SP during call => +-----| * | | * | ... | Function call stack * | | @@ -63,6 +67,12 @@ * * When popping registers off the stack at the end of a BPF function, we * reference them via the current ARM_FP register. + * + * Some eBPF operations are implemented via a call to a helper function. + * Such calls are "invisible" in the eBPF code, so it is up to the calling + * program to preserve any caller-saved ARM registers during the call. The + * JIT emits code to push and pop those registers onto the stack, immediately + * above the callee stack frame. */ #define CALLEE_MASK (1 << ARM_R4 | 1 << ARM_R5 | 1 << ARM_R6 | \ 1 << ARM_R7 | 1 << ARM_R8 | 1 << ARM_R9 | \ @@ -70,6 +80,8 @@ #define CALLEE_PUSH_MASK (CALLEE_MASK | 1 << ARM_LR) #define CALLEE_POP_MASK (CALLEE_MASK | 1 << ARM_PC) +#define CALLER_MASK (1 << ARM_R0 | 1 << ARM_R1 | 1 << ARM_R2 | 1 << ARM_R3) + enum { /* Stack layout - these are offsets from (top of stack - 4) */ BPF_R2_HI, @@ -464,6 +476,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) { + const int exclude_mask = BIT(ARM_R0) | BIT(ARM_R1); const s8 *tmp = bpf2a32[TMP_REG_1]; #if __LINUX_ARM_ARCH__ == 7 @@ -495,11 +508,17 @@ static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) emit(ARM_MOV_R(ARM_R0, rm), ctx); } + /* Push caller-saved registers on stack */ + emit(ARM_PUSH(CALLER_MASK & ~exclude_mask), ctx); + /* Call appropriate function */ emit_mov_i(ARM_IP, op == BPF_DIV ? 
(u32)jit_udiv32 : (u32)jit_mod32, ctx); emit_blx_r(ARM_IP, ctx); + /* Restore caller-saved registers from stack */ + emit(ARM_POP(CALLER_MASK & ~exclude_mask), ctx); + /* Save return value */ if (rd != ARM_R0) emit(ARM_MOV_R(rd, ARM_R0), ctx); diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5c7ae4c395..fee914c716 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1931,8 +1931,6 @@ source "drivers/cpufreq/Kconfig" endmenu -source "drivers/firmware/Kconfig" - source "drivers/acpi/Kconfig" source "arch/arm64/kvm/Kconfig" diff --git a/arch/arm64/boot/dts/arm/foundation-v8.dtsi b/arch/arm64/boot/dts/arm/foundation-v8.dtsi index 05ae893d1b..fbf13f7c2b 100644 --- a/arch/arm64/boot/dts/arm/foundation-v8.dtsi +++ b/arch/arm64/boot/dts/arm/foundation-v8.dtsi @@ -115,7 +115,6 @@ v2m_refclk32khz: refclk32khz { bus@8000000 { compatible = "arm,vexpress,v2m-p1", "simple-bus"; - arm,v2m-memory-map = "rs1"; #address-cells = <2>; /* SMB chipselect number and offset */ #size-cells = <1>; diff --git a/arch/arm64/boot/dts/arm/fvp-base-revc.dts b/arch/arm64/boot/dts/arm/fvp-base-revc.dts index b8a21092db..269b649934 100644 --- a/arch/arm64/boot/dts/arm/fvp-base-revc.dts +++ b/arch/arm64/boot/dts/arm/fvp-base-revc.dts @@ -192,32 +192,9 @@ panel_in: endpoint { remote-endpoint = <&clcd_pads>; }; }; - - panel-timing { - clock-frequency = <63500127>; - hactive = <1024>; - hback-porch = <152>; - hfront-porch = <48>; - hsync-len = <104>; - vactive = <768>; - vback-porch = <23>; - vfront-porch = <3>; - vsync-len = <4>; - }; }; bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - #interrupt-cells = <1>; interrupt-map-mask = <0 0 63>; interrupt-map = <0 0 0 &gic 0 0 GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm64/boot/dts/arm/juno-base.dtsi b/arch/arm64/boot/dts/arm/juno-base.dtsi index 8e7a66943b..6288e104a0 100644 --- a/arch/arm64/boot/dts/arm/juno-base.dtsi +++ b/arch/arm64/boot/dts/arm/juno-base.dtsi @@ -27,8 +27,6 @@ mailbox: mhu@2b1f0000 { reg = <0x0 0x2b1f0000 0x0 0x1000>; interrupts = , ; - interrupt-names = "mhu_lpri_rx", - "mhu_hpri_rx"; #mbox-cells = <1>; clocks = <&soc_refclk100mhz>; clock-names = "apb_pclk"; @@ -804,16 +802,6 @@ memory@80000000 { }; bus@8000000 { - compatible = "simple-bus"; - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - #interrupt-cells = <1>; interrupt-map-mask = <0 0 15>; interrupt-map = <0 0 0 &gic 0 GIC_SPI 68 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm64/boot/dts/arm/juno-motherboard.dtsi b/arch/arm64/boot/dts/arm/juno-motherboard.dtsi index 40d95c58b5..fefd2b5f01 100644 --- a/arch/arm64/boot/dts/arm/juno-motherboard.dtsi +++ b/arch/arm64/boot/dts/arm/juno-motherboard.dtsi @@ -92,16 +92,23 @@ nmi-button { }; bus@8000000 { - motherboard-bus { + compatible = "simple-bus"; + #address-cells = <2>; + #size-cells = <1>; + ranges = <0 0x8000000 0 0x8000000 0x18000000>; + + motherboard-bus@8000000 { compatible = "arm,vexpress,v2p-p1", "simple-bus"; #address-cells = <2>; /* SMB chipselect number and offset */ #size-cells = <1>; - #interrupt-cells = <1>; - ranges; - model = "V2M-Juno"; + ranges = <0 0 0 0x08000000 
0x04000000>, + <1 0 0 0x14000000 0x04000000>, + <2 0 0 0x18000000 0x04000000>, + <3 0 0 0x1c000000 0x04000000>, + <4 0 0 0x0c000000 0x04000000>, + <5 0 0 0x10000000 0x04000000>; arm,hbi = <0x252>; arm,vexpress,site = <0>; - arm,v2m-memory-map = "rs1"; flash@0 { /* 2 * 32MiB NOR Flash memory mounted on CS0 */ @@ -218,7 +225,7 @@ led7 { }; }; - mmci@50000 { + mmc@50000 { compatible = "arm,pl180", "arm,primecell"; reg = <0x050000 0x1000>; interrupts = <5>; @@ -246,7 +253,7 @@ kmi@70000 { clock-names = "KMIREFCLK", "apb_pclk"; }; - wdt@f0000 { + watchdog@f0000 { compatible = "arm,sp805", "arm,primecell"; reg = <0x0f0000 0x10000>; interrupts = <7>; diff --git a/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts b/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts index 3050f45bad..258991ad7c 100644 --- a/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts +++ b/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts @@ -133,17 +133,6 @@ panel_in: endpoint { }; bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - #interrupt-cells = <1>; interrupt-map-mask = <0 0 63>; interrupt-map = <0 0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm64/boot/dts/arm/rtsm_ve-motherboard-rs2.dtsi b/arch/arm64/boot/dts/arm/rtsm_ve-motherboard-rs2.dtsi index b917d9d3f1..33182d9e58 100644 --- a/arch/arm64/boot/dts/arm/rtsm_ve-motherboard-rs2.dtsi +++ b/arch/arm64/boot/dts/arm/rtsm_ve-motherboard-rs2.dtsi @@ -6,7 +6,7 @@ */ / { bus@8000000 { - motherboard-bus { + motherboard-bus@8000000 { arm,v2m-memory-map = "rs2"; iofpga-bus@300000000 { diff --git a/arch/arm64/boot/dts/arm/rtsm_ve-motherboard.dtsi b/arch/arm64/boot/dts/arm/rtsm_ve-motherboard.dtsi index 4c4a381d2c..5f6cab668a 100644 --- a/arch/arm64/boot/dts/arm/rtsm_ve-motherboard.dtsi +++ b/arch/arm64/boot/dts/arm/rtsm_ve-motherboard.dtsi @@ -77,13 +77,21 @@ dvimode { }; bus@8000000 { - motherboard-bus { - arm,v2m-memory-map = "rs1"; + compatible = "simple-bus"; + #address-cells = <2>; + #size-cells = <1>; + ranges = <0 0x8000000 0 0x8000000 0x18000000>; + + motherboard-bus@8000000 { compatible = "arm,vexpress,v2m-p1", "simple-bus"; #address-cells = <2>; /* SMB chipselect number and offset */ #size-cells = <1>; - #interrupt-cells = <1>; - ranges; + ranges = <0 0 0 0x08000000 0x04000000>, + <1 0 0 0x14000000 0x04000000>, + <2 0 0 0x18000000 0x04000000>, + <3 0 0 0x1c000000 0x04000000>, + <4 0 0 0x0c000000 0x04000000>, + <5 0 0 0x10000000 0x04000000>; flash@0 { compatible = "arm,vexpress-flash", "cfi-flash"; @@ -130,7 +138,7 @@ aaci@40000 { clock-names = "apb_pclk"; }; - mmci@50000 { + mmc@50000 { compatible = "arm,pl180", "arm,primecell"; reg = <0x050000 0x1000>; interrupts = <9>, <10>; @@ -190,7 +198,7 @@ v2m_serial3: serial@c0000 { clock-names = "uartclk", "apb_pclk"; }; - wdt@f0000 { + watchdog@f0000 { compatible = "arm,sp805", "arm,primecell"; reg = <0x0f0000 0x1000>; interrupts = <0>; diff --git a/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts b/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts index d859914500..5b6d9d8e93 100644 --- a/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts +++ b/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts @@ -145,61 +145,6 @@ temp-fpga { }; smb: bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 
0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - - #interrupt-cells = <1>; - interrupt-map-mask = <0 0 63>; - interrupt-map = <0 0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, - <0 0 1 &gic GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, - <0 0 2 &gic GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>, - <0 0 3 &gic GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>, - <0 0 4 &gic GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>, - <0 0 5 &gic GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>, - <0 0 6 &gic GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, - <0 0 7 &gic GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>, - <0 0 8 &gic GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>, - <0 0 9 &gic GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>, - <0 0 10 &gic GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>, - <0 0 11 &gic GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>, - <0 0 12 &gic GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>, - <0 0 13 &gic GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>, - <0 0 14 &gic GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>, - <0 0 15 &gic GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>, - <0 0 16 &gic GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>, - <0 0 17 &gic GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>, - <0 0 18 &gic GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>, - <0 0 19 &gic GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>, - <0 0 20 &gic GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>, - <0 0 21 &gic GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>, - <0 0 22 &gic GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>, - <0 0 23 &gic GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>, - <0 0 24 &gic GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>, - <0 0 25 &gic GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>, - <0 0 26 &gic GIC_SPI 26 IRQ_TYPE_LEVEL_HIGH>, - <0 0 27 &gic GIC_SPI 27 IRQ_TYPE_LEVEL_HIGH>, - <0 0 28 &gic GIC_SPI 28 IRQ_TYPE_LEVEL_HIGH>, - <0 0 29 &gic GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>, - <0 0 30 &gic GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>, - <0 0 31 &gic GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>, - <0 0 32 &gic GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, - <0 0 33 &gic GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, - <0 0 34 &gic GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, - <0 0 35 &gic GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>, - <0 0 36 &gic GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>, - <0 0 37 &gic GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>, - <0 0 38 &gic GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>, - <0 0 39 &gic GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>, - <0 0 40 &gic GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>, - <0 0 41 &gic GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, - <0 0 42 &gic GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>; + ranges = <0x8000000 0 0x8000000 0x18000000>; }; }; diff --git a/arch/arm64/boot/dts/arm/vexpress-v2m-rs1.dtsi b/arch/arm64/boot/dts/arm/vexpress-v2m-rs1.dtsi index 2ad9fd7c94..8af4b77fe6 100644 --- a/arch/arm64/boot/dts/arm/vexpress-v2m-rs1.dtsi +++ b/arch/arm64/boot/dts/arm/vexpress-v2m-rs1.dtsi @@ -17,6 +17,7 @@ * TAKE CARE WHEN MAINTAINING THIS FILE TO PROPAGATE ANY RELEVANT * CHANGES TO vexpress-v2m.dtsi! 
*/ +#include / { v2m_fixed_3v3: fixed-regulator-0 { @@ -101,16 +102,68 @@ led-8 { }; bus@8000000 { - motherboard-bus { - model = "V2M-P1"; + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <1>; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 63>; + interrupt-map = <0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, + <0 1 &gic GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <0 2 &gic GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>, + <0 3 &gic GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>, + <0 4 &gic GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>, + <0 5 &gic GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>, + <0 6 &gic GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, + <0 7 &gic GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>, + <0 8 &gic GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>, + <0 9 &gic GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>, + <0 10 &gic GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>, + <0 11 &gic GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>, + <0 12 &gic GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>, + <0 13 &gic GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>, + <0 14 &gic GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>, + <0 15 &gic GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>, + <0 16 &gic GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>, + <0 17 &gic GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>, + <0 18 &gic GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>, + <0 19 &gic GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>, + <0 20 &gic GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>, + <0 21 &gic GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>, + <0 22 &gic GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>, + <0 23 &gic GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>, + <0 24 &gic GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>, + <0 25 &gic GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>, + <0 26 &gic GIC_SPI 26 IRQ_TYPE_LEVEL_HIGH>, + <0 27 &gic GIC_SPI 27 IRQ_TYPE_LEVEL_HIGH>, + <0 28 &gic GIC_SPI 28 IRQ_TYPE_LEVEL_HIGH>, + <0 29 &gic GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>, + <0 30 &gic GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>, + <0 31 &gic GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>, + <0 32 &gic GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, + <0 33 &gic GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, + <0 34 &gic GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, + <0 35 &gic GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>, + <0 36 &gic GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>, + <0 37 &gic GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>, + <0 38 &gic GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>, + <0 39 &gic GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>, + <0 40 &gic GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>, + <0 41 &gic GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, + <0 42 &gic GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>; + + motherboard-bus@8000000 { arm,hbi = <0x190>; arm,vexpress,site = <0>; - arm,v2m-memory-map = "rs1"; compatible = "arm,vexpress,v2m-p1", "simple-bus"; #address-cells = <2>; /* SMB chipselect number and offset */ #size-cells = <1>; - #interrupt-cells = <1>; - ranges; + ranges = <0 0 0x08000000 0x04000000>, + <1 0 0x14000000 0x04000000>, + <2 0 0x18000000 0x04000000>, + <3 0 0x1c000000 0x04000000>, + <4 0 0x0c000000 0x04000000>, + <5 0 0x10000000 0x04000000>; nor_flash: flash@0 { compatible = "arm,vexpress-flash", "cfi-flash"; @@ -215,7 +268,7 @@ aaci@40000 { clock-names = "apb_pclk"; }; - mmci@50000 { + mmc@50000 { compatible = "arm,pl180", "arm,primecell"; reg = <0x050000 0x1000>; interrupts = <9>, <10>; @@ -275,7 +328,7 @@ v2m_serial3: serial@c0000 { clock-names = "uartclk", "apb_pclk"; }; - wdt@f0000 { + watchdog@f0000 { compatible = "arm,sp805", "arm,primecell"; reg = <0x0f0000 0x1000>; interrupts = <0>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index 343ecf0e89..06b36cc658 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -405,9 +405,9 @@ esdhc1: mmc@2150000 { interrupts = ; clock-frequency = <0>; /* fixed up by bootloader */ clocks = <&clockgen 
QORIQ_CLK_HWACCEL 1>; - voltage-ranges = <1800 1800 3300 3300>; + voltage-ranges = <1800 1800>; sdhci,auto-cmd12; - broken-cd; + non-removable; little-endian; bus-width = <4>; status = "disabled"; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-som.dtsi index d0456daefd..9db9b90bf2 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-som.dtsi @@ -102,6 +102,7 @@ reg_vdd_arm: BUCK2 { regulator-min-microvolt = <850000>; regulator-max-microvolt = <950000>; regulator-boot-on; + regulator-always-on; regulator-ramp-delay = <3125>; nxp,dvs-run-voltage = <950000>; nxp,dvs-standby-voltage = <850000>; diff --git a/arch/arm64/boot/dts/freescale/imx8mq-evk.dts b/arch/arm64/boot/dts/freescale/imx8mq-evk.dts index 49f9db971f..b83df77195 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-evk.dts +++ b/arch/arm64/boot/dts/freescale/imx8mq-evk.dts @@ -337,6 +337,8 @@ n25q256a: flash@0 { #size-cells = <1>; compatible = "micron,n25q256a", "jedec,spi-nor"; spi-max-frequency = <29000000>; + spi-tx-bus-width = <1>; + spi-rx-bus-width = <4>; }; }; diff --git a/arch/arm64/boot/dts/qcom/pm8150.dtsi b/arch/arm64/boot/dts/qcom/pm8150.dtsi index c566a64b13..0df76f7b1c 100644 --- a/arch/arm64/boot/dts/qcom/pm8150.dtsi +++ b/arch/arm64/boot/dts/qcom/pm8150.dtsi @@ -48,8 +48,10 @@ pm8150_0: pmic@0 { #size-cells = <0>; pon: power-on@800 { - compatible = "qcom,pm8916-pon"; + compatible = "qcom,pm8998-pon"; reg = <0x0800>; + mode-bootloader = <0x2>; + mode-recovery = <0x1>; pon_pwrkey: pwrkey { compatible = "qcom,pm8941-pwrkey"; diff --git a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts index 8ac96f8e79..28d5b55285 100644 --- a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts +++ b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts @@ -804,6 +804,16 @@ lt9611_rst_pin: lt9611-rst-pin { }; }; +&pon_pwrkey { + status = "okay"; +}; + +&pon_resin { + status = "okay"; + + linux,code = ; +}; + &qupv3_id_0 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi index 0f2b3c00e4..70c88c37de 100644 --- a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi @@ -273,7 +273,6 @@ sound: sound { "Headphone Jack", "HPOL", "Headphone Jack", "HPOR"; - #sound-dai-cells = <0>; #address-cells = <1>; #size-cells = <0>; @@ -301,11 +300,11 @@ sound_multimedia1_codec: codec { }; }; - dai-link@2 { + dai-link@5 { link-name = "MultiMedia2"; - reg = <2>; + reg = ; cpu { - sound-dai = <&lpass_cpu 2>; + sound-dai = <&lpass_cpu LPASS_DP_RX>; }; codec { @@ -782,7 +781,7 @@ secondary_mi2s: mi2s@1 { qcom,playback-sd-lines = <0>; }; - hdmi-primary@0 { + hdmi@5 { reg = ; }; }; diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi b/arch/arm64/boot/dts/qcom/sc7280.dtsi index 53a21d0861..fd78f16181 100644 --- a/arch/arm64/boot/dts/qcom/sc7280.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi @@ -1850,9 +1850,9 @@ rpmhcc: clock-controller { cpufreq_hw: cpufreq@18591000 { compatible = "qcom,cpufreq-epss"; - reg = <0 0x18591100 0 0x900>, - <0 0x18592100 0 0x900>, - <0 0x18593100 0 0x900>; + reg = <0 0x18591000 0 0x1000>, + <0 0x18592000 0 0x1000>, + <0 0x18593000 0 0x1000>; clocks = <&rpmhcc RPMH_CXO_CLK>, <&gcc GCC_GPLL0>; clock-names = "xo", "alternate"; #freq-domain-cells = <1>; diff --git a/arch/arm64/boot/dts/qcom/sdm630.dtsi b/arch/arm64/boot/dts/qcom/sdm630.dtsi index 
9153e6616b..9c7f87e42f 100644 --- a/arch/arm64/boot/dts/qcom/sdm630.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm630.dtsi @@ -654,9 +654,20 @@ a2noc: interconnect@1704000 { compatible = "qcom,sdm660-a2noc"; reg = <0x01704000 0xc100>; #interconnect-cells = <1>; - clock-names = "bus", "bus_a"; + clock-names = "bus", + "bus_a", + "ipa", + "ufs_axi", + "aggre2_ufs_axi", + "aggre2_usb3_axi", + "cfg_noc_usb2_axi"; clocks = <&rpmcc RPM_SMD_AGGR2_NOC_CLK>, - <&rpmcc RPM_SMD_AGGR2_NOC_A_CLK>; + <&rpmcc RPM_SMD_AGGR2_NOC_A_CLK>, + <&rpmcc RPM_SMD_IPA_CLK>, + <&gcc GCC_UFS_AXI_CLK>, + <&gcc GCC_AGGRE2_UFS_AXI_CLK>, + <&gcc GCC_AGGRE2_USB3_AXI_CLK>, + <&gcc GCC_CFG_NOC_USB2_AXI_CLK>; }; mnoc: interconnect@1745000 { diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi index 6d7172e6f4..b3b9119261 100644 --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi @@ -128,23 +128,28 @@ camera_mem: memory@8bf00000 { no-map; }; - wlan_msa_mem: memory@8c400000 { - reg = <0 0x8c400000 0 0x100000>; + ipa_fw_mem: memory@8c400000 { + reg = <0 0x8c400000 0 0x10000>; no-map; }; - gpu_mem: memory@8c515000 { - reg = <0 0x8c515000 0 0x2000>; + ipa_gsi_mem: memory@8c410000 { + reg = <0 0x8c410000 0 0x5000>; no-map; }; - ipa_fw_mem: memory@8c517000 { - reg = <0 0x8c517000 0 0x5a000>; + gpu_mem: memory@8c415000 { + reg = <0 0x8c415000 0 0x2000>; no-map; }; - adsp_mem: memory@8c600000 { - reg = <0 0x8c600000 0 0x1a00000>; + adsp_mem: memory@8c500000 { + reg = <0 0x8c500000 0 0x1a00000>; + no-map; + }; + + wlan_msa_mem: memory@8df00000 { + reg = <0 0x8df00000 0 0x100000>; no-map; }; diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts index 385e502943..2ba23aa582 100644 --- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts +++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts @@ -16,6 +16,17 @@ #include "sdm850.dtsi" #include "pm8998.dtsi" +/* + * Update following upstream (sdm845.dtsi) reserved + * memory mappings for firmware loading to succeed + * and enable the IPA device. 
+ */ +/delete-node/ &ipa_fw_mem; +/delete-node/ &ipa_gsi_mem; +/delete-node/ &gpu_mem; +/delete-node/ &adsp_mem; +/delete-node/ &wlan_msa_mem; + / { model = "Lenovo Yoga C630"; compatible = "lenovo,yoga-c630", "qcom,sdm845"; @@ -58,6 +69,29 @@ panel_in_edp: endpoint { }; }; + /* Reserved memory changes for IPA */ + reserved-memory { + wlan_msa_mem: memory@8c400000 { + reg = <0 0x8c400000 0 0x100000>; + no-map; + }; + + gpu_mem: memory@8c515000 { + reg = <0 0x8c515000 0 0x2000>; + no-map; + }; + + ipa_fw_mem: memory@8c517000 { + reg = <0 0x8c517000 0 0x5a000>; + no-map; + }; + + adsp_mem: memory@8c600000 { + reg = <0 0x8c600000 0 0x1a00000>; + no-map; + }; + }; + sn65dsi86_refclk: sn65dsi86-refclk { compatible = "fixed-clock"; #clock-cells = <0>; diff --git a/arch/arm64/configs/bcm2711_defconfig b/arch/arm64/configs/bcm2711_defconfig index d1966d9266..2a7e9aaf2b 100644 --- a/arch/arm64/configs/bcm2711_defconfig +++ b/arch/arm64/configs/bcm2711_defconfig @@ -506,6 +506,7 @@ CONFIG_BCMGENET=y CONFIG_ENC28J60=m CONFIG_QCA7000_SPI=m CONFIG_QCA7000_UART=m +CONFIG_R8169=m CONFIG_WIZNET_W5100=m CONFIG_WIZNET_W5100_SPI=m CONFIG_MICREL_PHY=y @@ -1367,6 +1368,7 @@ CONFIG_BTRFS_FS=m CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m CONFIG_F2FS_FS=y +CONFIG_F2FS_FS_SECURITY=y CONFIG_FS_ENCRYPTION=y CONFIG_FANOTIFY=y CONFIG_QFMT_V1=m diff --git a/arch/arm64/configs/bcmrpi3_defconfig b/arch/arm64/configs/bcmrpi3_defconfig index 6b0ca5314a..f168571630 100644 --- a/arch/arm64/configs/bcmrpi3_defconfig +++ b/arch/arm64/configs/bcmrpi3_defconfig @@ -1217,6 +1217,7 @@ CONFIG_BTRFS_FS=m CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m CONFIG_F2FS_FS=y +CONFIG_F2FS_FS_SECURITY=y CONFIG_FS_ENCRYPTION=y CONFIG_FANOTIFY=y CONFIG_QFMT_V1=m diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 156d96afbb..545197bc05 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -245,7 +245,6 @@ CONFIG_DEVTMPFS_MOUNT=y CONFIG_FW_LOADER_USER_HELPER=y CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y CONFIG_HISILICON_LPC=y -CONFIG_SIMPLE_PM_BUS=y CONFIG_FSL_MC_BUS=y CONFIG_TEGRA_ACONNECT=m CONFIG_GNSS=m diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 5df6193fc4..8d741f7137 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -54,7 +54,7 @@ $(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE # runtime. Because the hypervisor is part of the kernel binary, relocations # produce a kernel VA. We enumerate relocations targeting hyp at build time # and convert the kernel VAs at those positions to hyp VAs. -$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel +$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel FORCE $(call if_changed,hyprel) # 5) Compile hyp-reloc.S and link it into the existing partially linked object. 
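The FORCE prerequisite added to the hyp-reloc.S rule above is the standard kbuild idiom: $(call if_changed,...) re-runs its command when a prerequisite is newer or when the command line saved in the .*.cmd file differs, but the second check can only happen if make actually enters the recipe, which the phony FORCE target guarantees. A minimal sketch of the pattern (the cmd_hyprel body here is illustrative, not the kernel's exact invocation):

quiet_cmd_hyprel = HYPREL  $@
      cmd_hyprel = $(obj)/gen-hyprel $< > $@

$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel FORCE
	$(call if_changed,hyprel)

Without FORCE, an up-to-date prerequisite list would make the recipe a no-op even when the recorded command had changed, leaving a stale hyp-reloc.S.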
diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index f9bb3b1413..c84fe24b2e 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -50,9 +50,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { int kvm_perf_init(void) { - if (kvm_pmu_probe_pmuver() != ID_AA64DFR0_PMUVER_IMP_DEF && !is_protected_kvm_enabled()) - static_branch_enable(&kvm_arm_pmu_available); - return perf_register_guest_info_callbacks(&kvm_guest_cbs); } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index f5065f23b4..2af3c37445 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -740,7 +740,14 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, kvm_pmu_create_perf_event(vcpu, select_idx); } -int kvm_pmu_probe_pmuver(void) +void kvm_host_pmu_init(struct arm_pmu *pmu) +{ + if (pmu->pmuver != 0 && pmu->pmuver != ID_AA64DFR0_PMUVER_IMP_DEF && + !kvm_arm_support_pmu_v3() && !is_protected_kvm_enabled()) + static_branch_enable(&kvm_arm_pmu_available); +} + +static int kvm_pmu_probe_pmuver(void) { struct perf_event_attr attr = { }; struct perf_event *event; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 23505fc353..a8158c9489 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -43,7 +43,7 @@ void __init arm64_hugetlb_cma_reserve(void) #ifdef CONFIG_ARM64_4K_PAGES order = PUD_SHIFT - PAGE_SHIFT; #else - order = CONT_PMD_SHIFT + PMD_SHIFT - PAGE_SHIFT; + order = CONT_PMD_SHIFT - PAGE_SHIFT; #endif /* * HugeTLB CMA reservation is required for gigantic diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 9d4d898df7..823d3d5a9e 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -8,7 +8,7 @@ config CSKY select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS - select ARCH_WANT_FRAME_POINTERS if !CPU_CK610 + select ARCH_WANT_FRAME_POINTERS if !CPU_CK610 && $(cc-option,-mbacktrace) select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select COMMON_CLK select CLKSRC_MMIO @@ -241,6 +241,7 @@ endchoice menuconfig HAVE_TCM bool "Tightly-Coupled/Sram Memory" + depends on !COMPILE_TEST help The implementation are not only used by TCM (Tightly-Coupled Meory) but also used by sram on SOC bus. It follow existed linux tcm diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h index 91818787d8..02b72a0007 100644 --- a/arch/csky/include/asm/bitops.h +++ b/arch/csky/include/asm/bitops.h @@ -74,7 +74,6 @@ static __always_inline unsigned long __fls(unsigned long x) * bug fix, why only could use atomic!!!! 
*/ #include -#define __clear_bit(nr, vaddr) clear_bit(nr, vaddr) #include #include diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c index 0105ac81b4..1a5f54e0d2 100644 --- a/arch/csky/kernel/ptrace.c +++ b/arch/csky/kernel/ptrace.c @@ -99,7 +99,8 @@ static int gpr_set(struct task_struct *target, if (ret) return ret; - regs.sr = task_pt_regs(target)->sr; + /* BIT(0) of regs.sr is Condition Code/Carry bit */ + regs.sr = (regs.sr & BIT(0)) | (task_pt_regs(target)->sr & ~BIT(0)); #ifdef CONFIG_CPU_HAS_HILO regs.dcsr = task_pt_regs(target)->dcsr; #endif diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index bc4238b9f7..c7b763d2f5 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -52,10 +52,14 @@ static long restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { int err = 0; + unsigned long sr = regs->sr; /* sc_pt_regs is structured the same as the start of pt_regs */ err |= __copy_from_user(regs, &sc->sc_pt_regs, sizeof(struct pt_regs)); + /* BIT(0) of regs->sr is Condition Code/Carry bit */ + regs->sr = (sr & ~1) | (regs->sr & 1); + /* Restore the floating-point state. */ err |= restore_fpu_state(sc); diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 045792cde4..1e33666fa6 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -388,8 +388,6 @@ config CRASH_DUMP help Generate crash dump after being started by kexec. -source "drivers/firmware/Kconfig" - endmenu menu "Power management and ACPI options" diff --git a/arch/m68k/68000/entry.S b/arch/m68k/68000/entry.S index 259b3661b6..997b549330 100644 --- a/arch/m68k/68000/entry.S +++ b/arch/m68k/68000/entry.S @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -25,7 +24,6 @@ .globl system_call .globl resume .globl ret_from_exception -.globl ret_from_signal .globl sys_call_table .globl bad_interrupt .globl inthandler1 @@ -59,8 +57,6 @@ do_trace: subql #4,%sp /* dummy return address */ SAVE_SWITCH_STACK jbsr syscall_trace_leave - -ret_from_signal: RESTORE_SWITCH_STACK addql #4,%sp jra ret_from_exception diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 774c35f47e..0b50da08a9 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -29,7 +29,6 @@ config M68K select NO_DMA if !MMU && !COLDFIRE select OLD_SIGACTION select OLD_SIGSUSPEND3 - select SET_FS select UACCESS_MEMCPY if !MMU select VIRT_TO_BUS select ZONE_DMA diff --git a/arch/m68k/coldfire/entry.S b/arch/m68k/coldfire/entry.S index d43a02795a..9f337c7024 100644 --- a/arch/m68k/coldfire/entry.S +++ b/arch/m68k/coldfire/entry.S @@ -31,7 +31,6 @@ #include #include #include -#include #include #include @@ -51,7 +50,6 @@ sw_usp: .globl system_call .globl resume .globl ret_from_exception -.globl ret_from_signal .globl sys_call_table .globl inthandler @@ -98,8 +96,6 @@ ENTRY(system_call) subql #4,%sp /* dummy return address */ SAVE_SWITCH_STACK jbsr syscall_trace_leave - -ret_from_signal: RESTORE_SWITCH_STACK addql #4,%sp diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index 3750819ac5..f4d82c619a 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -9,7 +9,6 @@ #define __ASM_M68K_PROCESSOR_H #include -#include #include #include @@ -75,11 +74,37 @@ static inline void wrusp(unsigned long usp) #define TASK_UNMAPPED_BASE 0 #endif +/* Address spaces (or Function Codes in Motorola lingo) */ +#define USER_DATA 1 +#define USER_PROGRAM 2 +#define SUPER_DATA 5 +#define SUPER_PROGRAM 6 +#define CPU_SPACE 7 + 
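The constants just added are the 680x0 function codes driven on the FC2..FC0 lines during a bus cycle: 1 and 2 select user data and user program space, 5 and 6 the supervisor equivalents, and 7 CPU space. The set_fc() helper defined next loads both the source (SFC) and destination (DFC) alternate-space registers, which is what the moves instruction and the 040 ptest/pflush paths consult in place of the old set_fs() address limit. A short usage sketch, pairing the calls the way the cache-flush hunk later in this patch does:

	set_fc(SUPER_DATA);	/* subsequent moves/pflush target supervisor data space */
	flush_icache_user_range(address, endaddr);
	set_fc(USER_DATA);	/* restore the default expected by uaccess */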
+#ifdef CONFIG_CPU_HAS_ADDRESS_SPACES +/* + * Set the SFC/DFC registers for special MM operations. For most normal + * operation these remain set to USER_DATA for the uaccess routines. + */ +static inline void set_fc(unsigned long val) +{ + WARN_ON_ONCE(in_interrupt()); + + __asm__ __volatile__ ("movec %0,%/sfc\n\t" + "movec %0,%/dfc\n\t" + : /* no outputs */ : "r" (val) : "memory"); +} +#else +static inline void set_fc(unsigned long val) +{ +} +#endif /* CONFIG_CPU_HAS_ADDRESS_SPACES */ + struct thread_struct { unsigned long ksp; /* kernel stack pointer */ unsigned long usp; /* user stack pointer */ unsigned short sr; /* saved status register */ - unsigned short fs; /* saved fs (sfc, dfc) */ + unsigned short fc; /* saved fc (sfc, dfc) */ unsigned long crp[2]; /* cpu root pointer */ unsigned long esp0; /* points to SR of stack frame */ unsigned long faddr; /* info about last fault */ @@ -92,7 +117,7 @@ struct thread_struct { #define INIT_THREAD { \ .ksp = sizeof(init_stack) + (unsigned long) init_stack, \ .sr = PS_S, \ - .fs = __KERNEL_DS, \ + .fc = USER_DATA, \ } /* diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h index 15a757073f..c952658ba7 100644 --- a/arch/m68k/include/asm/thread_info.h +++ b/arch/m68k/include/asm/thread_info.h @@ -4,7 +4,6 @@ #include #include -#include /* * On machines with 4k pages we default to an 8k thread size, though we @@ -27,7 +26,6 @@ struct thread_info { struct task_struct *task; /* main task structure */ unsigned long flags; - mm_segment_t addr_limit; /* thread address space */ int preempt_count; /* 0 => preemptable, <0 => BUG */ __u32 cpu; /* should always be 0 on m68k */ unsigned long tp_value; /* thread pointer */ @@ -37,7 +35,6 @@ struct thread_info { #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ - .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ } diff --git a/arch/m68k/include/asm/tlbflush.h b/arch/m68k/include/asm/tlbflush.h index a6318ccd30..b882e2f4f5 100644 --- a/arch/m68k/include/asm/tlbflush.h +++ b/arch/m68k/include/asm/tlbflush.h @@ -13,13 +13,12 @@ static inline void flush_tlb_kernel_page(void *addr) if (CPU_IS_COLDFIRE) { mmu_write(MMUOR, MMUOR_CNL); } else if (CPU_IS_040_OR_060) { - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); + set_fc(SUPER_DATA); __asm__ __volatile__(".chip 68040\n\t" "pflush (%0)\n\t" ".chip 68k" : : "a" (addr)); - set_fs(old_fs); + set_fc(USER_DATA); } else if (CPU_IS_020_OR_030) __asm__ __volatile__("pflush #4,#4,(%0)" : : "a" (addr)); } @@ -84,12 +83,8 @@ static inline void flush_tlb_mm(struct mm_struct *mm) static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { - if (vma->vm_mm == current->active_mm) { - mm_segment_t old_fs = force_uaccess_begin(); - + if (vma->vm_mm == current->active_mm) __flush_tlb_one(addr); - force_uaccess_end(old_fs); - } } static inline void flush_tlb_range(struct vm_area_struct *vma, diff --git a/arch/m68k/include/asm/traps.h b/arch/m68k/include/asm/traps.h index 4aff3358fb..a9d5c1c870 100644 --- a/arch/m68k/include/asm/traps.h +++ b/arch/m68k/include/asm/traps.h @@ -267,6 +267,10 @@ struct frame { } un; }; +#ifdef CONFIG_M68040 +asmlinkage void berr_040cleanup(struct frame *fp); +#endif + #endif /* __ASSEMBLY__ */ #endif /* _M68K_TRAPS_H */ diff --git a/arch/m68k/include/asm/uaccess.h b/arch/m68k/include/asm/uaccess.h index f98208ccbb..ba67052388 100644 --- a/arch/m68k/include/asm/uaccess.h +++ b/arch/m68k/include/asm/uaccess.h @@ -9,13 +9,16 @@ */ #include #include -#include #include /* We let 
the MMU do all checking */ static inline int access_ok(const void __user *addr, unsigned long size) { + /* + * XXX: for !CONFIG_CPU_HAS_ADDRESS_SPACES this really needs to check + * for TASK_SIZE! + */ return 1; } @@ -35,12 +38,9 @@ static inline int access_ok(const void __user *addr, #define MOVES "move" #endif -extern int __put_user_bad(void); -extern int __get_user_bad(void); - -#define __put_user_asm(res, x, ptr, bwl, reg, err) \ +#define __put_user_asm(inst, res, x, ptr, bwl, reg, err) \ asm volatile ("\n" \ - "1: "MOVES"."#bwl" %2,%1\n" \ + "1: "inst"."#bwl" %2,%1\n" \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .even\n" \ @@ -56,6 +56,31 @@ asm volatile ("\n" \ : "+d" (res), "=m" (*(ptr)) \ : #reg (x), "i" (err)) +#define __put_user_asm8(inst, res, x, ptr) \ +do { \ + const void *__pu_ptr = (const void __force *)(ptr); \ + \ + asm volatile ("\n" \ + "1: "inst".l %2,(%1)+\n" \ + "2: "inst".l %R2,(%1)\n" \ + "3:\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "10: movel %3,%0\n" \ + " jra 3b\n" \ + " .previous\n" \ + "\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,10b\n" \ + " .long 2b,10b\n" \ + " .long 3b,10b\n" \ + " .previous" \ + : "+d" (res), "+a" (__pu_ptr) \ + : "r" (x), "i" (-EFAULT) \ + : "memory"); \ +} while (0) + /* * These are the main single-value transfer routines. They automatically * use the right size if we just have the right pointer type. @@ -68,51 +93,29 @@ asm volatile ("\n" \ __chk_user_ptr(ptr); \ switch (sizeof (*(ptr))) { \ case 1: \ - __put_user_asm(__pu_err, __pu_val, ptr, b, d, -EFAULT); \ + __put_user_asm(MOVES, __pu_err, __pu_val, ptr, b, d, -EFAULT); \ break; \ case 2: \ - __put_user_asm(__pu_err, __pu_val, ptr, w, r, -EFAULT); \ + __put_user_asm(MOVES, __pu_err, __pu_val, ptr, w, r, -EFAULT); \ break; \ case 4: \ - __put_user_asm(__pu_err, __pu_val, ptr, l, r, -EFAULT); \ + __put_user_asm(MOVES, __pu_err, __pu_val, ptr, l, r, -EFAULT); \ break; \ case 8: \ - { \ - const void __user *__pu_ptr = (ptr); \ - asm volatile ("\n" \ - "1: "MOVES".l %2,(%1)+\n" \ - "2: "MOVES".l %R2,(%1)\n" \ - "3:\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "10: movel %3,%0\n" \ - " jra 3b\n" \ - " .previous\n" \ - "\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,10b\n" \ - " .long 2b,10b\n" \ - " .long 3b,10b\n" \ - " .previous" \ - : "+d" (__pu_err), "+a" (__pu_ptr) \ - : "r" (__pu_val), "i" (-EFAULT) \ - : "memory"); \ + __put_user_asm8(MOVES, __pu_err, __pu_val, ptr); \ break; \ - } \ default: \ - __pu_err = __put_user_bad(); \ - break; \ + BUILD_BUG(); \ } \ __pu_err; \ }) #define put_user(x, ptr) __put_user(x, ptr) -#define __get_user_asm(res, x, ptr, type, bwl, reg, err) ({ \ +#define __get_user_asm(inst, res, x, ptr, type, bwl, reg, err) ({ \ type __gu_val; \ asm volatile ("\n" \ - "1: "MOVES"."#bwl" %2,%1\n" \ + "1: "inst"."#bwl" %2,%1\n" \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .even\n" \ @@ -130,53 +133,57 @@ asm volatile ("\n" \ (x) = (__force typeof(*(ptr)))(__force unsigned long)__gu_val; \ }) +#define __get_user_asm8(inst, res, x, ptr) \ +do { \ + const void *__gu_ptr = (const void __force *)(ptr); \ + union { \ + u64 l; \ + __typeof__(*(ptr)) t; \ + } __gu_val; \ + \ + asm volatile ("\n" \ + "1: "inst".l (%2)+,%1\n" \ + "2: "inst".l (%2),%R1\n" \ + "3:\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "10: move.l %3,%0\n" \ + " sub.l %1,%1\n" \ + " sub.l %R1,%R1\n" \ + " jra 3b\n" \ + " .previous\n" \ + "\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,10b\n" \ + " .long 
2b,10b\n" \ + " .previous" \ + : "+d" (res), "=&r" (__gu_val.l), \ + "+a" (__gu_ptr) \ + : "i" (-EFAULT) \ + : "memory"); \ + (x) = __gu_val.t; \ +} while (0) + #define __get_user(x, ptr) \ ({ \ int __gu_err = 0; \ __chk_user_ptr(ptr); \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_user_asm(__gu_err, x, ptr, u8, b, d, -EFAULT); \ + __get_user_asm(MOVES, __gu_err, x, ptr, u8, b, d, -EFAULT); \ break; \ case 2: \ - __get_user_asm(__gu_err, x, ptr, u16, w, r, -EFAULT); \ + __get_user_asm(MOVES, __gu_err, x, ptr, u16, w, r, -EFAULT); \ break; \ case 4: \ - __get_user_asm(__gu_err, x, ptr, u32, l, r, -EFAULT); \ + __get_user_asm(MOVES, __gu_err, x, ptr, u32, l, r, -EFAULT); \ break; \ - case 8: { \ - const void __user *__gu_ptr = (ptr); \ - union { \ - u64 l; \ - __typeof__(*(ptr)) t; \ - } __gu_val; \ - asm volatile ("\n" \ - "1: "MOVES".l (%2)+,%1\n" \ - "2: "MOVES".l (%2),%R1\n" \ - "3:\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "10: move.l %3,%0\n" \ - " sub.l %1,%1\n" \ - " sub.l %R1,%R1\n" \ - " jra 3b\n" \ - " .previous\n" \ - "\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,10b\n" \ - " .long 2b,10b\n" \ - " .previous" \ - : "+d" (__gu_err), "=&r" (__gu_val.l), \ - "+a" (__gu_ptr) \ - : "i" (-EFAULT) \ - : "memory"); \ - (x) = __gu_val.t; \ + case 8: \ + __get_user_asm8(MOVES, __gu_err, x, ptr); \ break; \ - } \ default: \ - __gu_err = __get_user_bad(); \ - break; \ + BUILD_BUG(); \ } \ __gu_err; \ }) @@ -322,16 +329,19 @@ __constant_copy_to_user(void __user *to, const void *from, unsigned long n) switch (n) { case 1: - __put_user_asm(res, *(u8 *)from, (u8 __user *)to, b, d, 1); + __put_user_asm(MOVES, res, *(u8 *)from, (u8 __user *)to, + b, d, 1); break; case 2: - __put_user_asm(res, *(u16 *)from, (u16 __user *)to, w, r, 2); + __put_user_asm(MOVES, res, *(u16 *)from, (u16 __user *)to, + w, r, 2); break; case 3: __constant_copy_to_user_asm(res, to, from, tmp, 3, w, b,); break; case 4: - __put_user_asm(res, *(u32 *)from, (u32 __user *)to, l, r, 4); + __put_user_asm(MOVES, res, *(u32 *)from, (u32 __user *)to, + l, r, 4); break; case 5: __constant_copy_to_user_asm(res, to, from, tmp, 5, l, b,); @@ -380,8 +390,65 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER -#define user_addr_max() \ - (uaccess_kernel() ? 
~0UL : TASK_SIZE) +#define HAVE_GET_KERNEL_NOFAULT + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + type *__gk_dst = (type *)(dst); \ + type *__gk_src = (type *)(src); \ + int __gk_err = 0; \ + \ + switch (sizeof(type)) { \ + case 1: \ + __get_user_asm("move", __gk_err, *__gk_dst, __gk_src, \ + u8, b, d, -EFAULT); \ + break; \ + case 2: \ + __get_user_asm("move", __gk_err, *__gk_dst, __gk_src, \ + u16, w, r, -EFAULT); \ + break; \ + case 4: \ + __get_user_asm("move", __gk_err, *__gk_dst, __gk_src, \ + u32, l, r, -EFAULT); \ + break; \ + case 8: \ + __get_user_asm8("move", __gk_err, *__gk_dst, __gk_src); \ + break; \ + default: \ + BUILD_BUG(); \ + } \ + if (unlikely(__gk_err)) \ + goto err_label; \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, err_label) \ +do { \ + type __pk_src = *(type *)(src); \ + type *__pk_dst = (type *)(dst); \ + int __pk_err = 0; \ + \ + switch (sizeof(type)) { \ + case 1: \ + __put_user_asm("move", __pk_err, __pk_src, __pk_dst, \ + b, d, -EFAULT); \ + break; \ + case 2: \ + __put_user_asm("move", __pk_err, __pk_src, __pk_dst, \ + w, r, -EFAULT); \ + break; \ + case 4: \ + __put_user_asm("move", __pk_err, __pk_src, __pk_dst, \ + l, r, -EFAULT); \ + break; \ + case 8: \ + __put_user_asm8("move", __pk_err, __pk_src, __pk_dst); \ + break; \ + default: \ + BUILD_BUG(); \ + } \ + if (unlikely(__pk_err)) \ + goto err_label; \ +} while (0) extern long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 9dd76fbb7c..9434fca68d 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -78,20 +77,38 @@ ENTRY(__sys_clone3) ENTRY(sys_sigreturn) SAVE_SWITCH_STACK - movel %sp,%sp@- | switch_stack pointer - pea %sp@(SWITCH_STACK_SIZE+4) | pt_regs pointer + movel %sp,%a1 | switch_stack pointer + lea %sp@(SWITCH_STACK_SIZE),%a0 | pt_regs pointer + lea %sp@(-84),%sp | leave a gap + movel %a1,%sp@- + movel %a0,%sp@- jbsr do_sigreturn - addql #8,%sp - RESTORE_SWITCH_STACK - rts + jra 1f | shared with rt_sigreturn() ENTRY(sys_rt_sigreturn) SAVE_SWITCH_STACK - movel %sp,%sp@- | switch_stack pointer - pea %sp@(SWITCH_STACK_SIZE+4) | pt_regs pointer + movel %sp,%a1 | switch_stack pointer + lea %sp@(SWITCH_STACK_SIZE),%a0 | pt_regs pointer + lea %sp@(-84),%sp | leave a gap + movel %a1,%sp@- + movel %a0,%sp@- + | stack contents: + | [original pt_regs address] [original switch_stack address] + | [gap] [switch_stack] [pt_regs] [exception frame] jbsr do_rt_sigreturn - addql #8,%sp + +1: + | stack contents now: + | [original pt_regs address] [original switch_stack address] + | [unused part of the gap] [moved switch_stack] [moved pt_regs] + | [replacement exception frame] + | return value of do_{rt_,}sigreturn() points to moved switch_stack. 
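The comment block above describes the new calling convention: instead of returning the syscall result in %d0, do_sigreturn() and do_rt_sigreturn() now return the address the stack pointer must be reset to, namely the (possibly moved) switch_stack, and the shared asm tail loads that value straight into %sp. Sketched in C, matching the prototypes changed later in this patch:

	asmlinkage void *do_sigreturn(struct pt_regs *regs, struct switch_stack *sw);
	asmlinkage void *do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw);
	/*
	 * On success both return (void *)sw minus the extra exception-frame
	 * bytes mangle_kernel_stack() grew the frame by; on a bad frame they
	 * force SIGSEGV and return sw unchanged.
	 */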
+ + movel %d0,%sp | discard the leftover junk RESTORE_SWITCH_STACK + | stack contents now is just [syscall return address] [pt_regs] [frame] + | return pt_regs.d0 + movel %sp@(PT_OFF_D0+4),%d0 rts ENTRY(buserr) @@ -182,25 +199,6 @@ do_trace_exit: addql #4,%sp jra .Lret_from_exception -ENTRY(ret_from_signal) - movel %curptr@(TASK_STACK),%a1 - tstb %a1@(TINFO_FLAGS+2) - jge 1f - jbsr syscall_trace -1: RESTORE_SWITCH_STACK - addql #4,%sp -/* on 68040 complete pending writebacks if any */ -#ifdef CONFIG_M68040 - bfextu %sp@(PT_OFF_FORMATVEC){#0,#4},%d0 - subql #7,%d0 | bus error frame ? - jbne 1f - movel %sp,%sp@- - jbsr berr_040cleanup - addql #4,%sp -1: -#endif - jra .Lret_from_exception - ENTRY(system_call) SAVE_ALL_SYS @@ -338,7 +336,7 @@ resume: /* save fs (sfc,%dfc) (may be pointing to kernel memory) */ movec %sfc,%d0 - movew %d0,%a0@(TASK_THREAD+THREAD_FS) + movew %d0,%a0@(TASK_THREAD+THREAD_FC) /* save usp */ /* it is better to use a movel here instead of a movew 8*) */ @@ -424,7 +422,7 @@ resume: movel %a0,%usp /* restore fs (sfc,%dfc) */ - movew %a1@(TASK_THREAD+THREAD_FS),%a0 + movew %a1@(TASK_THREAD+THREAD_FC),%a0 movec %a0,%sfc movec %a0,%dfc diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index db49f90917..1ab692b952 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -92,7 +92,7 @@ void show_regs(struct pt_regs * regs) void flush_thread(void) { - current->thread.fs = __USER_DS; + current->thread.fc = USER_DATA; #ifdef CONFIG_FPU if (!FPU_IS_EMU) { unsigned long zero = 0; @@ -155,7 +155,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, * Must save the current SFC/DFC value, NOT the value when * the parent was last descheduled - RGH 10-08-96 */ - p->thread.fs = get_fs().seg; + p->thread.fc = USER_DATA; if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* kernel thread */ diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 8f215e79e7..338817d0cb 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -447,7 +447,7 @@ static inline void save_fpu_state(struct sigcontext *sc, struct pt_regs *regs) if (CPU_IS_060 ? 
sc->sc_fpstate[2] : sc->sc_fpstate[0]) { fpu_version = sc->sc_fpstate[0]; - if (CPU_IS_020_OR_030 && + if (CPU_IS_020_OR_030 && !regs->stkadj && regs->vector >= (VEC_FPBRUC * 4) && regs->vector <= (VEC_FPNAN * 4)) { /* Clear pending exception in 68882 idle frame */ @@ -510,7 +510,7 @@ static inline int rt_save_fpu_state(struct ucontext __user *uc, struct pt_regs * if (!(CPU_IS_060 || CPU_IS_COLDFIRE)) context_size = fpstate[1]; fpu_version = fpstate[0]; - if (CPU_IS_020_OR_030 && + if (CPU_IS_020_OR_030 && !regs->stkadj && regs->vector >= (VEC_FPBRUC * 4) && regs->vector <= (VEC_FPNAN * 4)) { /* Clear pending exception in 68882 idle frame */ @@ -641,56 +641,35 @@ static inline void siginfo_build_tests(void) static int mangle_kernel_stack(struct pt_regs *regs, int formatvec, void __user *fp) { - int fsize = frame_extra_sizes(formatvec >> 12); - if (fsize < 0) { + int extra = frame_extra_sizes(formatvec >> 12); + char buf[sizeof_field(struct frame, un)]; + + if (extra < 0) { /* * user process trying to return with weird frame format */ pr_debug("user process returning with weird frame format\n"); - return 1; + return -1; } - if (!fsize) { - regs->format = formatvec >> 12; - regs->vector = formatvec & 0xfff; - } else { - struct switch_stack *sw = (struct switch_stack *)regs - 1; - /* yes, twice as much as max(sizeof(frame.un.fmt)) */ - unsigned long buf[sizeof_field(struct frame, un) / 2]; + if (extra && copy_from_user(buf, fp, extra)) + return -1; + regs->format = formatvec >> 12; + regs->vector = formatvec & 0xfff; + if (extra) { + void *p = (struct switch_stack *)regs - 1; + struct frame *new = (void *)regs - extra; + int size = sizeof(struct pt_regs)+sizeof(struct switch_stack); - /* that'll make sure that expansion won't crap over data */ - if (copy_from_user(buf + fsize / 4, fp, fsize)) - return 1; - - /* point of no return */ - regs->format = formatvec >> 12; - regs->vector = formatvec & 0xfff; -#define frame_offset (sizeof(struct pt_regs)+sizeof(struct switch_stack)) - __asm__ __volatile__ ( -#ifdef CONFIG_COLDFIRE - " movel %0,%/sp\n\t" - " bra ret_from_signal\n" -#else - " movel %0,%/a0\n\t" - " subl %1,%/a0\n\t" /* make room on stack */ - " movel %/a0,%/sp\n\t" /* set stack pointer */ - /* move switch_stack and pt_regs */ - "1: movel %0@+,%/a0@+\n\t" - " dbra %2,1b\n\t" - " lea %/sp@(%c3),%/a0\n\t" /* add offset of fmt */ - " lsrl #2,%1\n\t" - " subql #1,%1\n\t" - /* copy to the gap we'd made */ - "2: movel %4@+,%/a0@+\n\t" - " dbra %1,2b\n\t" - " bral ret_from_signal\n" + memmove(p - extra, p, size); + memcpy(p - extra + size, buf, extra); + current->thread.esp0 = (unsigned long)&new->ptregs; +#ifdef CONFIG_M68040 + /* on 68040 complete pending writebacks if any */ + if (new->ptregs.format == 7) // bus error frame + berr_040cleanup(new); #endif - : /* no outputs, it doesn't ever return */ - : "a" (sw), "d" (fsize), "d" (frame_offset/4-1), - "n" (frame_offset), "a" (buf + fsize/4) - : "a0"); -#undef frame_offset } - return 0; + return extra; } static inline int @@ -698,7 +677,6 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u { int formatvec; struct sigcontext context; - int err = 0; siginfo_build_tests(); @@ -707,7 +685,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u /* get previous context */ if (copy_from_user(&context, usc, sizeof(context))) - goto badframe; + return -1; /* restore passed registers */ regs->d0 = context.sc_d0; @@ -720,15 +698,10 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext 
__user *usc, void __u wrusp(context.sc_usp); formatvec = context.sc_formatvec; - err = restore_fpu_state(&context); + if (restore_fpu_state(&context)) + return -1; - if (err || mangle_kernel_stack(regs, formatvec, fp)) - goto badframe; - - return 0; - -badframe: - return 1; + return mangle_kernel_stack(regs, formatvec, fp); } static inline int @@ -745,7 +718,7 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw, err = __get_user(temp, &uc->uc_mcontext.version); if (temp != MCONTEXT_VERSION) - goto badframe; + return -1; /* restore passed registers */ err |= __get_user(regs->d0, &gregs[0]); err |= __get_user(regs->d1, &gregs[1]); @@ -774,22 +747,17 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw, err |= restore_altstack(&uc->uc_stack); if (err) - goto badframe; + return -1; - if (mangle_kernel_stack(regs, temp, &uc->uc_extra)) - goto badframe; - - return 0; - -badframe: - return 1; + return mangle_kernel_stack(regs, temp, &uc->uc_extra); } -asmlinkage int do_sigreturn(struct pt_regs *regs, struct switch_stack *sw) +asmlinkage void *do_sigreturn(struct pt_regs *regs, struct switch_stack *sw) { unsigned long usp = rdusp(); struct sigframe __user *frame = (struct sigframe __user *)(usp - 4); sigset_t set; + int size; if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -801,20 +769,22 @@ asmlinkage int do_sigreturn(struct pt_regs *regs, struct switch_stack *sw) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc, frame + 1)) + size = restore_sigcontext(regs, &frame->sc, frame + 1); + if (size < 0) goto badframe; - return regs->d0; + return (void *)sw - size; badframe: force_sig(SIGSEGV); - return 0; + return sw; } -asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) +asmlinkage void *do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) { unsigned long usp = rdusp(); struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(usp - 4); sigset_t set; + int size; if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -823,27 +793,34 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) set_current_blocked(&set); - if (rt_restore_ucontext(regs, sw, &frame->uc)) + size = rt_restore_ucontext(regs, sw, &frame->uc); + if (size < 0) goto badframe; - return regs->d0; + return (void *)sw - size; badframe: force_sig(SIGSEGV); - return 0; + return sw; +} + +static inline struct pt_regs *rte_regs(struct pt_regs *regs) +{ + return (void *)regs + regs->stkadj; } static void setup_sigcontext(struct sigcontext *sc, struct pt_regs *regs, unsigned long mask) { + struct pt_regs *tregs = rte_regs(regs); sc->sc_mask = mask; sc->sc_usp = rdusp(); sc->sc_d0 = regs->d0; sc->sc_d1 = regs->d1; sc->sc_a0 = regs->a0; sc->sc_a1 = regs->a1; - sc->sc_sr = regs->sr; - sc->sc_pc = regs->pc; - sc->sc_formatvec = regs->format << 12 | regs->vector; + sc->sc_sr = tregs->sr; + sc->sc_pc = tregs->pc; + sc->sc_formatvec = tregs->format << 12 | tregs->vector; save_a5_state(sc, regs); save_fpu_state(sc, regs); } @@ -851,6 +828,7 @@ static void setup_sigcontext(struct sigcontext *sc, struct pt_regs *regs, static inline int rt_setup_ucontext(struct ucontext __user *uc, struct pt_regs *regs) { struct switch_stack *sw = (struct switch_stack *)regs - 1; + struct pt_regs *tregs = rte_regs(regs); greg_t __user *gregs = uc->uc_mcontext.gregs; int err = 0; @@ -871,9 +849,9 @@ static inline int rt_setup_ucontext(struct ucontext __user *uc, struct pt_regs * err |= __put_user(sw->a5, &gregs[13]); err |= __put_user(sw->a6, 
&gregs[14]); err |= __put_user(rdusp(), &gregs[15]); - err |= __put_user(regs->pc, &gregs[16]); - err |= __put_user(regs->sr, &gregs[17]); - err |= __put_user((regs->format << 12) | regs->vector, &uc->uc_formatvec); + err |= __put_user(tregs->pc, &gregs[16]); + err |= __put_user(tregs->sr, &gregs[17]); + err |= __put_user((tregs->format << 12) | tregs->vector, &uc->uc_formatvec); err |= rt_save_fpu_state(uc, regs); return err; } @@ -890,13 +868,14 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct sigframe __user *frame; - int fsize = frame_extra_sizes(regs->format); + struct pt_regs *tregs = rte_regs(regs); + int fsize = frame_extra_sizes(tregs->format); struct sigcontext context; int err = 0, sig = ksig->sig; if (fsize < 0) { pr_debug("setup_frame: Unknown frame format %#x\n", - regs->format); + tregs->format); return -EFAULT; } @@ -907,7 +886,7 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, err |= __put_user(sig, &frame->sig); - err |= __put_user(regs->vector, &frame->code); + err |= __put_user(tregs->vector, &frame->code); err |= __put_user(&frame->sc, &frame->psc); if (_NSIG_WORDS > 1) @@ -933,34 +912,28 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, push_cache ((unsigned long) &frame->retcode); - /* - * Set up registers for signal handler. All the state we are about - * to destroy is successfully copied to sigframe. - */ - wrusp ((unsigned long) frame); - regs->pc = (unsigned long) ksig->ka.sa.sa_handler; - adjustformat(regs); - /* * This is subtle; if we build more than one sigframe, all but the * first one will see frame format 0 and have fsize == 0, so we won't * screw stkadj. */ - if (fsize) + if (fsize) { regs->stkadj = fsize; - - /* Prepare to skip over the extra stuff in the exception frame. */ - if (regs->stkadj) { - struct pt_regs *tregs = - (struct pt_regs *)((ulong)regs + regs->stkadj); + tregs = rte_regs(regs); pr_debug("Performing stackadjust=%04lx\n", regs->stkadj); - /* This must be copied with decreasing addresses to - handle overlaps. */ tregs->vector = 0; tregs->format = 0; - tregs->pc = regs->pc; tregs->sr = regs->sr; } + + /* + * Set up registers for signal handler. All the state we are about + * to destroy is successfully copied to sigframe. + */ + wrusp ((unsigned long) frame); + tregs->pc = (unsigned long) ksig->ka.sa.sa_handler; + adjustformat(regs); + return 0; } @@ -968,7 +941,8 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; - int fsize = frame_extra_sizes(regs->format); + struct pt_regs *tregs = rte_regs(regs); + int fsize = frame_extra_sizes(tregs->format); int err = 0, sig = ksig->sig; if (fsize < 0) { @@ -1018,34 +992,27 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, push_cache ((unsigned long) &frame->retcode); - /* - * Set up registers for signal handler. All the state we are about - * to destroy is successfully copied to sigframe. - */ - wrusp ((unsigned long) frame); - regs->pc = (unsigned long) ksig->ka.sa.sa_handler; - adjustformat(regs); - /* * This is subtle; if we build more than one sigframe, all but the * first one will see frame format 0 and have fsize == 0, so we won't * screw stkadj. */ - if (fsize) + if (fsize) { regs->stkadj = fsize; - - /* Prepare to skip over the extra stuff in the exception frame. 
*/ - if (regs->stkadj) { - struct pt_regs *tregs = - (struct pt_regs *)((ulong)regs + regs->stkadj); + tregs = rte_regs(regs); pr_debug("Performing stackadjust=%04lx\n", regs->stkadj); - /* This must be copied with decreasing addresses to - handle overlaps. */ tregs->vector = 0; tregs->format = 0; - tregs->pc = regs->pc; tregs->sr = regs->sr; } + + /* + * Set up registers for signal handler. All the state we are about + * to destroy is successfully copied to sigframe. + */ + wrusp ((unsigned long) frame); + tregs->pc = (unsigned long) ksig->ka.sa.sa_handler; + adjustformat(regs); return 0; } diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 5b19fcdcd6..9718ce94cc 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -181,9 +181,8 @@ static inline void access_error060 (struct frame *fp) static inline unsigned long probe040(int iswrite, unsigned long addr, int wbs) { unsigned long mmusr; - mm_segment_t old_fs = get_fs(); - set_fs(MAKE_MM_SEG(wbs)); + set_fc(wbs); if (iswrite) asm volatile (".chip 68040; ptestw (%0); .chip 68k" : : "a" (addr)); @@ -192,7 +191,7 @@ static inline unsigned long probe040(int iswrite, unsigned long addr, int wbs) asm volatile (".chip 68040; movec %%mmusr,%0; .chip 68k" : "=r" (mmusr)); - set_fs(old_fs); + set_fc(USER_DATA); return mmusr; } @@ -201,10 +200,8 @@ static inline int do_040writeback1(unsigned short wbs, unsigned long wba, unsigned long wbd) { int res = 0; - mm_segment_t old_fs = get_fs(); - /* set_fs can not be moved, otherwise put_user() may oops */ - set_fs(MAKE_MM_SEG(wbs)); + set_fc(wbs); switch (wbs & WBSIZ_040) { case BA_SIZE_BYTE: @@ -218,9 +215,7 @@ static inline int do_040writeback1(unsigned short wbs, unsigned long wba, break; } - /* set_fs can not be moved, otherwise put_user() may oops */ - set_fs(old_fs); - + set_fc(USER_DATA); pr_debug("do_040writeback1, res=%d\n", res); diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 90f4e9ca12..4fab347917 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -18,7 +18,6 @@ #include #include -#include #include #include #include diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c index b486c0889e..dde978e66f 100644 --- a/arch/m68k/mm/cache.c +++ b/arch/m68k/mm/cache.c @@ -49,24 +49,7 @@ static unsigned long virt_to_phys_slow(unsigned long vaddr) if (mmusr & MMU_R_040) return (mmusr & PAGE_MASK) | (vaddr & ~PAGE_MASK); } else { - unsigned short mmusr; - unsigned long *descaddr; - - asm volatile ("ptestr %3,%2@,#7,%0\n\t" - "pmove %%psr,%1" - : "=a&" (descaddr), "=m" (mmusr) - : "a" (vaddr), "d" (get_fs().seg)); - if (mmusr & (MMU_I|MMU_B|MMU_L)) - return 0; - descaddr = phys_to_virt((unsigned long)descaddr); - switch (mmusr & MMU_NUM) { - case 1: - return (*descaddr & 0xfe000000) | (vaddr & 0x01ffffff); - case 2: - return (*descaddr & 0xfffc0000) | (vaddr & 0x0003ffff); - case 3: - return (*descaddr & PAGE_MASK) | (vaddr & ~PAGE_MASK); - } + WARN_ON_ONCE(!CPU_IS_040_OR_060); } return 0; } @@ -107,11 +90,9 @@ void flush_icache_user_range(unsigned long address, unsigned long endaddr) void flush_icache_range(unsigned long address, unsigned long endaddr) { - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); + set_fc(SUPER_DATA); flush_icache_user_range(address, endaddr); - set_fs(old_fs); + set_fc(USER_DATA); } EXPORT_SYMBOL(flush_icache_range); diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 5d749e1882..1b47bec158 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -72,12 +72,6 @@ void __init paging_init(void) if 
(!empty_zero_page) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); - - /* - * Set up SFC/DFC registers (user data space). - */ - set_fs (USER_DS); - max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT; free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c index 1269d513b2..20ddf71b43 100644 --- a/arch/m68k/mm/kmap.c +++ b/arch/m68k/mm/kmap.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c index fe75aecfb2..c2c03b0a15 100644 --- a/arch/m68k/mm/memory.c +++ b/arch/m68k/mm/memory.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include diff --git a/arch/m68k/sun3/config.c b/arch/m68k/sun3/config.c index f7dd47232b..203f428a03 100644 --- a/arch/m68k/sun3/config.c +++ b/arch/m68k/sun3/config.c @@ -31,7 +31,6 @@ #include #include #include -#include #include char sun3_reserved_pmeg[SUN3_PMEGS_NUM]; @@ -89,7 +88,7 @@ void __init sun3_init(void) sun3_reserved_pmeg[249] = 1; sun3_reserved_pmeg[252] = 1; sun3_reserved_pmeg[253] = 1; - set_fs(KERNEL_DS); + set_fc(USER_DATA); } /* Without this, Bad Things happen when something calls arch_reset. */ diff --git a/arch/m68k/sun3/mmu_emu.c b/arch/m68k/sun3/mmu_emu.c index 7aa879b7c7..7ec20817c0 100644 --- a/arch/m68k/sun3/mmu_emu.c +++ b/arch/m68k/sun3/mmu_emu.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -191,14 +190,13 @@ void __init mmu_emu_init(unsigned long bootmem_end) for(seg = 0; seg < PAGE_OFFSET; seg += SUN3_PMEG_SIZE) sun3_put_segmap(seg, SUN3_INVALID_PMEG); - set_fs(MAKE_MM_SEG(3)); + set_fc(3); for(seg = 0; seg < 0x10000000; seg += SUN3_PMEG_SIZE) { i = sun3_get_segmap(seg); for(j = 1; j < CONTEXTS_NUM; j++) (*(romvec->pv_setctxt))(j, (void *)seg, i); } - set_fs(KERNEL_DS); - + set_fc(USER_DATA); } /* erase the mappings for a dead context. 
Uses the pg_dir for hints diff --git a/arch/m68k/sun3/sun3ints.c b/arch/m68k/sun3/sun3ints.c index 41ae422119..36cc280a45 100644 --- a/arch/m68k/sun3/sun3ints.c +++ b/arch/m68k/sun3/sun3ints.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m68k/sun3x/prom.c b/arch/m68k/sun3x/prom.c index 74d2fe5752..64c23bfaa9 100644 --- a/arch/m68k/sun3x/prom.c +++ b/arch/m68k/sun3x/prom.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 771ca53af0..6b8f591c50 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -3316,8 +3316,6 @@ source "drivers/cpuidle/Kconfig" endmenu -source "drivers/firmware/Kconfig" - source "arch/mips/kvm/Kconfig" source "arch/mips/vdso/Kconfig" diff --git a/arch/mips/include/asm/mips-cps.h b/arch/mips/include/asm/mips-cps.h index 35fb8ee6dd..fd43d87689 100644 --- a/arch/mips/include/asm/mips-cps.h +++ b/arch/mips/include/asm/mips-cps.h @@ -10,8 +10,6 @@ #include #include -#include - extern unsigned long __cps_access_bad_size(void) __compiletime_error("Bad size for CPS accessor"); @@ -167,30 +165,11 @@ static inline uint64_t mips_cps_cluster_config(unsigned int cluster) */ static inline unsigned int mips_cps_numcores(unsigned int cluster) { - unsigned int ncores; - if (!mips_cm_present()) return 0; /* Add one before masking to handle 0xff indicating no cores */ - ncores = (mips_cps_cluster_config(cluster) + 1) & CM_GCR_CONFIG_PCORES; - - if (IS_ENABLED(CONFIG_SOC_MT7621)) { - struct cpulaunch *launch; - - /* - * Ralink MT7621S SoC is single core, but the GCR_CONFIG method - * always reports 2 cores. Check the second core's LAUNCH_FREADY - * flag to detect if the second core is missing. This method - * only works before the core has been started. - */ - launch = (struct cpulaunch *)CKSEG0ADDR(CPULAUNCH); - launch += 2; /* MT7621 has 2 VPEs per core */ - if (!(launch->flags & LAUNCH_FREADY)) - ncores = 1; - } - - return ncores; + return (mips_cps_cluster_config(cluster) + 1) & CM_GCR_CONFIG_PCORES; } /** diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 0af88622c6..cb6d22439f 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -662,6 +662,11 @@ static void build_epilogue(struct jit_ctx *ctx) ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? 
func##_negative : func) : \ func##_positive) +static bool is_bad_offset(int b_off) +{ + return b_off > 0x1ffff || b_off < -0x20000; +} + static int build_body(struct jit_ctx *ctx) { const struct bpf_prog *prog = ctx->skf; @@ -728,7 +733,10 @@ static int build_body(struct jit_ctx *ctx) /* Load return register on DS for failures */ emit_reg_move(r_ret, r_zero, ctx); /* Return with error */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); break; case BPF_LD | BPF_W | BPF_IND: @@ -775,8 +783,10 @@ static int build_body(struct jit_ctx *ctx) emit_jalr(MIPS_R_RA, r_s0, ctx); emit_reg_move(MIPS_R_A0, r_skb, ctx); /* delay slot */ /* Check the error value */ - emit_bcond(MIPS_COND_NE, r_ret, 0, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_NE, r_ret, 0, b_off, ctx); emit_reg_move(r_ret, r_zero, ctx); /* We are good */ /* X <- P[1:K] & 0xf */ @@ -855,8 +865,10 @@ static int build_body(struct jit_ctx *ctx) /* A /= X */ ctx->flags |= SEEN_X | SEEN_A; /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); emit_load_imm(r_ret, 0, ctx); /* delay slot */ emit_div(r_A, r_X, ctx); break; @@ -864,8 +876,10 @@ static int build_body(struct jit_ctx *ctx) /* A %= X */ ctx->flags |= SEEN_X | SEEN_A; /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); emit_load_imm(r_ret, 0, ctx); /* delay slot */ emit_mod(r_A, r_X, ctx); break; @@ -926,7 +940,10 @@ static int build_body(struct jit_ctx *ctx) break; case BPF_JMP | BPF_JA: /* pc += K */ - emit_b(b_imm(i + k + 1, ctx), ctx); + b_off = b_imm(i + k + 1, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); break; case BPF_JMP | BPF_JEQ | BPF_K: @@ -1056,12 +1073,16 @@ static int build_body(struct jit_ctx *ctx) break; case BPF_RET | BPF_A: ctx->flags |= SEEN_A; - if (i != prog->len - 1) + if (i != prog->len - 1) { /* * If this is not the last instruction * then jump to the epilogue */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); + } emit_reg_move(r_ret, r_A, ctx); /* delay slot */ break; case BPF_RET | BPF_K: @@ -1075,7 +1096,10 @@ static int build_body(struct jit_ctx *ctx) * If this is not the last instruction * then jump to the epilogue */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); } break; @@ -1133,8 +1157,10 @@ static int build_body(struct jit_ctx *ctx) /* Load *dev pointer */ emit_load_ptr(r_s0, r_skb, off, ctx); /* error (0) in the delay slot */ - emit_bcond(MIPS_COND_EQ, r_s0, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_s0, r_zero, b_off, ctx); emit_reg_move(r_ret, r_zero, ctx); if (code == (BPF_ANC | SKF_AD_IFINDEX)) { BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); @@ -1244,7 +1270,10 @@ void bpf_jit_compile(struct bpf_prog *fp) /* Generate the actual JIT code */ build_prologue(&ctx); 
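The bounds tested by is_bad_offset() above fall out of the MIPS I-type encoding: a branch immediate is a signed 16-bit count of 32-bit words, shifted left by two when taken, so the reachable byte displacement runs from -0x8000 * 4 = -0x20000 up to 0x7fff * 4 = 0x1fffc. Emitted offsets are always word-aligned, so comparing against 0x1ffff is equivalent in practice. A self-contained check of that arithmetic:

	#include <assert.h>

	int main(void)
	{
		int imm_min = -0x8000;	/* most negative signed 16-bit immediate */
		int imm_max = 0x7fff;	/* most positive signed 16-bit immediate */

		assert(imm_min * 4 == -0x20000);	/* lower bound in is_bad_offset() */
		assert(imm_max * 4 == 0x1fffc);	/* stays under the 0x1ffff test */
		return 0;
	}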
- build_body(&ctx); + if (build_body(&ctx)) { + module_memfree(ctx.target); + goto out; + } build_epilogue(&ctx); /* Update the icache */ diff --git a/arch/nios2/Kconfig.debug b/arch/nios2/Kconfig.debug index a8bc06e96e..ca1beb87f9 100644 --- a/arch/nios2/Kconfig.debug +++ b/arch/nios2/Kconfig.debug @@ -3,9 +3,10 @@ config EARLY_PRINTK bool "Activate early kernel debugging" default y + depends on TTY select SERIAL_CORE_CONSOLE help - Enable early printk on console + Enable early printk on console. This is useful for kernel debugging when your machine crashes very early before the console code is initialized. You should normally say N here, unless you want to debug such a crash. diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index cf8d687a26..40bc8fb75e 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -149,8 +149,6 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low, void __init setup_arch(char **cmdline_p) { - int dram_start; - console_verbose(); memory_start = memblock_start_of_DRAM(); diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 4742b6f169..27a8b49af1 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -384,6 +384,4 @@ config KEXEC_FILE endmenu -source "drivers/firmware/Kconfig" - source "drivers/parisc/Kconfig" diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index d4b145b279..9f38040f06 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -136,6 +136,14 @@ static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap) if (kuap_is_disabled()) return; + if (unlikely(kuap != KUAP_NONE)) { + current->thread.kuap = KUAP_NONE; + kuap_lock(kuap, false); + } + + if (likely(regs->kuap == KUAP_NONE)) + return; + current->thread.kuap = regs->kuap; kuap_unlock(regs->kuap, false); diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index a95f63788c..4ba834599c 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -23,6 +23,7 @@ #define BRANCH_ABSOLUTE 0x2 bool is_offset_in_branch_range(long offset); +bool is_offset_in_cond_branch_range(long offset); int create_branch(struct ppc_inst *instr, const u32 *addr, unsigned long target, int flags); int create_cond_branch(struct ppc_inst *instr, const u32 *addr, diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 6b800d3e26..a1d238255f 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -265,13 +265,16 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte local_paca->irq_soft_mask = IRQS_ALL_DISABLED; local_paca->irq_happened |= PACA_IRQ_HARD_DIS; - if (is_implicit_soft_masked(regs)) { - // Adjust regs->softe soft implicit soft-mask, so - // arch_irq_disabled_regs(regs) behaves as expected. + if (!(regs->msr & MSR_EE) || is_implicit_soft_masked(regs)) { + /* + * Adjust regs->softe to be soft-masked if it had not been + * reconciled (e.g., interrupt entry with MSR[EE]=0 but softe + * not yet marked disabled), or if it was in an implicit soft + * masked state. This makes arch_irq_disabled_regs(regs) + * behave as expected.
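The bpf_jit_compile() change at the top of this hunk group is the caller-side half of the MIPS fix: once build_body() can fail, its return value must be checked and the partially emitted image freed rather than installed. A standalone sketch of that error path, with free() standing in for module_memfree() and the -E2BIG value hardcoded for illustration:

#include <stdio.h>
#include <stdlib.h>

struct jit_ctx { unsigned int *target; };

/* Stand-in for build_body(): pretend a branch target was out of range. */
static int build_body(struct jit_ctx *ctx) { (void)ctx; return -7; /* -E2BIG */ }

int main(void)
{
	struct jit_ctx ctx = { .target = malloc(4096) };

	if (build_body(&ctx)) {
		free(ctx.target);	/* never install a half-built image */
		puts("JIT aborted, falling back to the interpreter");
		return 0;
	}
	/* ... build epilogue, flush icache, install ... */
	free(ctx.target);
	return 0;
}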
+ */ regs->softe = IRQS_ALL_DISABLED; } - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) - BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE)); /* Don't do any per-CPU operations until interrupt state is fixed */ @@ -525,10 +528,9 @@ static __always_inline long ____##func(struct pt_regs *regs) /* kernel/traps.c */ DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception); #ifdef CONFIG_PPC_BOOK3S_64 -DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception); -#else -DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception); +DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async); #endif +DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception); DECLARE_INTERRUPT_HANDLER(SMIException); DECLARE_INTERRUPT_HANDLER(handle_hmi_exception); DECLARE_INTERRUPT_HANDLER(unknown_exception); diff --git a/arch/powerpc/include/asm/security_features.h b/arch/powerpc/include/asm/security_features.h index 792eefaf23..27574f218b 100644 --- a/arch/powerpc/include/asm/security_features.h +++ b/arch/powerpc/include/asm/security_features.h @@ -39,6 +39,11 @@ static inline bool security_ftr_enabled(u64 feature) return !!(powerpc_security_features & feature); } +#ifdef CONFIG_PPC_BOOK3S_64 +enum stf_barrier_type stf_barrier_type_get(void); +#else +static inline enum stf_barrier_type stf_barrier_type_get(void) { return STF_BARRIER_NONE; } +#endif // Features indicating support for Spectre/Meltdown mitigations diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 111249fd61..038ce8d906 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -184,6 +184,15 @@ u64 dma_iommu_get_required_mask(struct device *dev) struct iommu_table *tbl = get_iommu_table_base(dev); u64 mask; + if (dev_is_pci(dev)) { + u64 bypass_mask = dma_direct_get_required_mask(dev); + + if (dma_iommu_dma_supported(dev, bypass_mask)) { + dev_info(dev, "%s: returning bypass mask 0x%llx\n", __func__, bypass_mask); + return bypass_mask; + } + } + if (!tbl) return 0; diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 37859e62a8..eaf1f72131 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1243,7 +1243,7 @@ EXC_COMMON_BEGIN(machine_check_common) li r10,MSR_RI mtmsrd r10,1 addi r3,r1,STACK_FRAME_OVERHEAD - bl machine_check_exception + bl machine_check_exception_async b interrupt_return_srr @@ -1303,7 +1303,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) subi r12,r12,1 sth r12,PACA_IN_MCE(r13) - /* Invoke machine_check_exception to print MCE event and panic. */ + /* + * Invoke machine_check_exception to print MCE event and panic. + * This is the NMI version of the handler because we are called from + * the early handler which is a true NMI. + */ addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_exception @@ -1665,27 +1669,30 @@ EXC_COMMON_BEGIN(program_check_common) */ andi. r10,r12,MSR_PR - bne 2f /* If userspace, go normal path */ + bne .Lnormal_stack /* If userspace, go normal path */ andis. r10,r12,(SRR1_PROGTM)@h - bne 1f /* If TM, emergency */ + bne .Lemergency_stack /* If TM, emergency */ cmpdi r1,-INT_FRAME_SIZE /* check if r1 is in userspace */ - blt 2f /* normal path if not */ + blt .Lnormal_stack /* normal path if not */ /* Use the emergency stack */ -1: andi. r10,r12,MSR_PR /* Set CR0 correctly for label */ +.Lemergency_stack: + andi. 
r10,r12,MSR_PR /* Set CR0 correctly for label */ /* 3 in EXCEPTION_PROLOG_COMMON */ mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ __ISTACK(program_check)=0 __GEN_COMMON_BODY program_check - b 3f -2: + b .Ldo_program_check + +.Lnormal_stack: __ISTACK(program_check)=1 __GEN_COMMON_BODY program_check -3: + +.Ldo_program_check: addi r3,r1,STACK_FRAME_OVERHEAD bl program_check_exception REST_NVGPRS(r1) /* instruction emulation may change GPRs */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 551b653228..c4f1d6b7d9 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -229,6 +229,9 @@ notrace void arch_local_irq_restore(unsigned long mask) return; } + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(in_nmi() || in_hardirq()); + /* * After the stb, interrupts are unmasked and there are no interrupts * pending replay. The restart sequence makes this atomic with @@ -321,6 +324,9 @@ notrace void arch_local_irq_restore(unsigned long mask) if (mask) return; + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(in_nmi() || in_hardirq()); + /* * From this point onward, we can take interrupts, preempt, * etc... unless we got hard-disabled. We check if an event diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index 1a998490fe..15fb5ea1b9 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -263,6 +263,11 @@ static int __init handle_no_stf_barrier(char *p) early_param("no_stf_barrier", handle_no_stf_barrier); +enum stf_barrier_type stf_barrier_type_get(void) +{ + return stf_enabled_flush_types; +} + /* This is the generic flag used by other architectures */ static int __init handle_ssbd(char *p) { diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index aac8c0412f..11741703d2 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -340,10 +340,16 @@ static bool exception_common(int signr, struct pt_regs *regs, int code, return false; } - show_signal_msg(signr, regs, code, addr); + /* + * Must not enable interrupts even for a user-mode exception, because + * this can be called from machine check, which may be an NMI or IRQ + * which don't like interrupts being enabled. Could check for + * in_hardirq || in_nmi perhaps, but there doesn't seem to be a good + * reason why _exception() should enable irqs for an exception handler; + * the handlers themselves do that directly. + */ - if (arch_irqs_disabled()) - interrupt_cond_local_irq_enable(regs); + show_signal_msg(signr, regs, code, addr); current->thread.trap_nr = code; @@ -790,24 +796,22 @@ void die_mce(const char *str, struct pt_regs *regs, long err) * do_exit() checks for in_interrupt() and panics in that case, so * exit the irq/nmi before calling die. */ - if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) - irq_exit(); - else + if (in_nmi()) nmi_exit(); + else + irq_exit(); die(str, regs, err); } /* - * BOOK3S_64 does not call this handler as a non-maskable interrupt + * BOOK3S_64 does not usually call this handler as a non-maskable interrupt * (it uses its own early real-mode handler to handle the MCE proper * and then raises irq_work to call this handler when interrupts are - * enabled). + * enabled). The only time this is not true is when the early handler + * is unrecoverable; then it calls this directly to try to get a + * message out.
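The traps.c rework in this hunk folds the two machine_check_exception flavours into one shared body with an NMI wrapper and an async wrapper (the wrapper definitions follow just below). A userspace-style analogue of that shape; pt_regs and both entry points here are simplified stand-ins, not the kernel's macro-generated handlers:

#include <stdio.h>

struct pt_regs { unsigned long msr; };

/* One body shared by both entry flavours. */
static void machine_check_body(struct pt_regs *regs)
{
	printf("handling MCE, msr=%#lx\n", regs->msr);
}

/* Async flavour: raised via irq_work once interrupts are enabled. */
static void machine_check_exception_async(struct pt_regs *regs)
{
	machine_check_body(regs);
}

/* NMI flavour: the early real-mode path, or the unrecoverable case. */
static long machine_check_exception(struct pt_regs *regs)
{
	machine_check_body(regs);
	return 0;
}

int main(void)
{
	struct pt_regs regs = { .msr = 0x1032UL };

	machine_check_exception_async(&regs);
	return (int)machine_check_exception(&regs);
}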
*/ -#ifdef CONFIG_PPC_BOOK3S_64 -DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception) -#else -DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception) -#endif +static void __machine_check_exception(struct pt_regs *regs) { int recover = 0; @@ -841,12 +845,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception) /* Must die if the interrupt is not recoverable */ if (regs_is_unrecoverable(regs)) die_mce("Unrecoverable Machine check", regs, SIGBUS); +} #ifdef CONFIG_PPC_BOOK3S_64 - return; -#else - return 0; +DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async) +{ + __machine_check_exception(regs); +} #endif +DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception) +{ + __machine_check_exception(regs); + + return 0; } DEFINE_INTERRUPT_HANDLER(SMIException) /* async? */ diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 90484425a1..eb776d0c5d 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -255,13 +255,16 @@ kvm_novcpu_exit: * r3 contains the SRR1 wakeup value, SRR1 is trashed. */ _GLOBAL(idle_kvm_start_guest) - ld r4,PACAEMERGSP(r13) mfcr r5 mflr r0 - std r1,0(r4) - std r5,8(r4) - std r0,16(r4) - subi r1,r4,STACK_FRAME_OVERHEAD + std r5, 8(r1) // Save CR in caller's frame + std r0, 16(r1) // Save LR in caller's frame + // Create frame on emergency stack + ld r4, PACAEMERGSP(r13) + stdu r1, -SWITCH_FRAME_SIZE(r4) + // Switch to new frame on emergency stack + mr r1, r4 + std r3, 32(r1) // Save SRR1 wakeup value SAVE_NVGPRS(r1) /* @@ -313,6 +316,10 @@ kvm_unsplit_wakeup: kvm_secondary_got_guest: + // About to go to guest, clear saved SRR1 + li r0, 0 + std r0, 32(r1) + /* Set HSTATE_DSCR(r13) to something sensible */ ld r6, PACA_DSCR_DEFAULT(r13) std r6, HSTATE_DSCR(r13) @@ -392,13 +399,12 @@ kvm_no_guest: mfspr r4, SPRN_LPCR rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR, r4 - /* set up r3 for return */ - mfspr r3,SPRN_SRR1 + // Return SRR1 wakeup value, or 0 if we went into the guest + ld r3, 32(r1) REST_NVGPRS(r1) - addi r1, r1, STACK_FRAME_OVERHEAD - ld r0, 16(r1) - ld r5, 8(r1) - ld r1, 0(r1) + ld r1, 0(r1) // Switch back to caller stack + ld r0, 16(r1) // Reload LR + ld r5, 8(r1) // Reload CR mtlr r0 mtcr r5 blr diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index f9a3019e37..c5ed988238 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -228,6 +228,11 @@ bool is_offset_in_branch_range(long offset) return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3)); } +bool is_offset_in_cond_branch_range(long offset) +{ + return offset >= -0x8000 && offset <= 0x7fff && !(offset & 0x3); +} + /* * Helper to check if a given instruction is a conditional branch * Derived from the conditional checks in analyse_instr() @@ -280,7 +285,7 @@ int create_cond_branch(struct ppc_inst *instr, const u32 *addr, offset = offset - (unsigned long)addr; /* Check we can represent the target in the instruction format */ - if (offset < -0x8000 || offset > 0x7FFF || offset & 0x3) + if (!is_offset_in_cond_branch_range(offset)) return 1; /* Mask out the flags and target, so they don't step on each other. 
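is_offset_in_cond_branch_range(), added just above, encodes the Power ISA limit for conditional branches: the BD field is a 14-bit word displacement, i.e. a sign-extended 16-bit byte offset with the low two bits clear, so anything beyond +/-32 KiB needs the long-branch fallback. A standalone sketch with boundary cases (the sample offsets are illustrative):

#include <stdbool.h>
#include <stdio.h>

/* bc: 16-bit signed byte displacement, word aligned => +/-32 KiB. */
static bool is_offset_in_cond_branch_range(long offset)
{
	return offset >= -0x8000 && offset <= 0x7fff && !(offset & 0x3);
}

int main(void)
{
	printf("%d\n", is_offset_in_cond_branch_range(0x7ffc));  /* 1: max forward */
	printf("%d\n", is_offset_in_cond_branch_range(-0x8000)); /* 1: max backward */
	printf("%d\n", is_offset_in_cond_branch_range(0x8000));  /* 0: too far */
	printf("%d\n", is_offset_in_cond_branch_range(6));       /* 0: not word aligned */
	return 0;
}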
*/ diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 99fad093f4..7e9b978b76 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -24,16 +24,30 @@ #define EMIT(instr) PLANT_INSTR(image, ctx->idx, instr) /* Long jump; (unconditional 'branch') */ -#define PPC_JMP(dest) EMIT(PPC_INST_BRANCH | \ - (((dest) - (ctx->idx * 4)) & 0x03fffffc)) +#define PPC_JMP(dest) \ + do { \ + long offset = (long)(dest) - (ctx->idx * 4); \ + if (!is_offset_in_branch_range(offset)) { \ + pr_err_ratelimited("Branch offset 0x%lx (@%u) out of range\n", offset, ctx->idx); \ + return -ERANGE; \ + } \ + EMIT(PPC_INST_BRANCH | (offset & 0x03fffffc)); \ + } while (0) + /* blr; (unconditional 'branch' with link) to absolute address */ #define PPC_BL_ABS(dest) EMIT(PPC_INST_BL | \ (((dest) - (unsigned long)(image + ctx->idx)) & 0x03fffffc)) /* "cond" here covers BO:BI fields. */ -#define PPC_BCC_SHORT(cond, dest) EMIT(PPC_INST_BRANCH_COND | \ - (((cond) & 0x3ff) << 16) | \ - (((dest) - (ctx->idx * 4)) & \ - 0xfffc)) +#define PPC_BCC_SHORT(cond, dest) \ + do { \ + long offset = (long)(dest) - (ctx->idx * 4); \ + if (!is_offset_in_cond_branch_range(offset)) { \ + pr_err_ratelimited("Conditional branch offset 0x%lx (@%u) out of range\n", offset, ctx->idx); \ + return -ERANGE; \ + } \ + EMIT(PPC_INST_BRANCH_COND | (((cond) & 0x3ff) << 16) | (offset & 0xfffc)); \ + } while (0) + /* Sign-extended 32-bit immediate load */ #define PPC_LI32(d, i) do { \ if ((int)(uintptr_t)(i) >= -32768 && \ @@ -78,11 +92,6 @@ #define PPC_FUNC_ADDR(d,i) do { PPC_LI32(d, i); } while(0) #endif -static inline bool is_nearbranch(int offset) -{ - return (offset < 32768) && (offset >= -32768); -} - /* * The fly in the ointment of code size changing from pass to pass is * avoided by padding the short branch case with a NOP. If code size differs @@ -91,7 +100,7 @@ static inline bool is_nearbranch(int offset) * state. */ #define PPC_BCC(cond, dest) do { \ - if (is_nearbranch((dest) - (ctx->idx * 4))) { \ + if (is_offset_in_cond_branch_range((long)(dest) - (ctx->idx * 4))) { \ PPC_BCC_SHORT(cond, dest); \ EMIT(PPC_RAW_NOP()); \ } else { \ diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h index 7b713edfa7..b63b35e45e 100644 --- a/arch/powerpc/net/bpf_jit64.h +++ b/arch/powerpc/net/bpf_jit64.h @@ -16,18 +16,18 @@ * with our redzone usage. * * [ prev sp ] <------------- - * [ nv gpr save area ] 6*8 | + * [ nv gpr save area ] 5*8 | * [ tail_call_cnt ] 8 | - * [ local_tmp_var ] 8 | + * [ local_tmp_var ] 16 | * fp (r31) --> [ ebpf stack space ] upto 512 | * [ frame header ] 32/112 | * sp (r1) ---> [ stack pointer ] -------------- */ /* for gpr non volatile registers BPG_REG_6 to 10 */ -#define BPF_PPC_STACK_SAVE (6*8) +#define BPF_PPC_STACK_SAVE (5*8) /* for bpf JIT code internal usage */ -#define BPF_PPC_STACK_LOCALS 16 +#define BPF_PPC_STACK_LOCALS 24 /* stack frame excluding BPF stack, ensure this is quadword aligned */ #define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + \ BPF_PPC_STACK_LOCALS + BPF_PPC_STACK_SAVE) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 53aefee3fe..fcbf7a917c 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -210,7 +210,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* Now build the prologue, body code & epilogue for real. 
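The reworked PPC_BCC macro in bpf_jit.h keeps the existing near/far strategy: a target inside the conditional range gets "bcc target" plus a nop, an out-of-range one gets an inverted-condition hop over an unconditional branch, so both shapes occupy two slots and code size stays stable from pass to pass. A sketch of that selection (opcodes are symbolic placeholders, not real Power encodings):

#include <stdio.h>

enum { OP_BCC, OP_BCC_INV, OP_B, OP_NOP };

static int fits_cond_range(long off)
{
	return off >= -0x8000 && off <= 0x7fff;
}

/* Always emit two slots so pass-to-pass code size cannot change. */
static void emit_bcc(long off, int insns[2])
{
	if (fits_cond_range(off)) {
		insns[0] = OP_BCC;	/* bcc  target */
		insns[1] = OP_NOP;	/* padding */
	} else {
		insns[0] = OP_BCC_INV;	/* bcc with inverted cond, skip next insn */
		insns[1] = OP_B;	/* b target, 26-bit range */
	}
}

int main(void)
{
	int near[2], far[2];

	emit_bcc(0x100, near);
	emit_bcc(0x10000, far);
	printf("near: %d %d, far: %d %d\n", near[0], near[1], far[0], far[1]);
	return 0;
}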
*/ cgctx.idx = 0; bpf_jit_build_prologue(code_base, &cgctx); - bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass); + if (bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass)) { + bpf_jit_binary_free(bpf_hdr); + fp = org_fp; + goto out_addrs; + } bpf_jit_build_epilogue(code_base, &cgctx); if (bpf_jit_enable > 1) diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index beb12cbc8c..0da31d41d4 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -200,7 +200,7 @@ void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 fun } } -static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) +static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) { /* * By now, the eBPF program has already setup parameters in r3-r6 @@ -261,7 +261,9 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 bpf_jit_emit_common_epilogue(image, ctx); EMIT(PPC_RAW_BCTR()); + /* out: */ + return 0; } /* Assemble the body code between the prologue & epilogue */ @@ -355,7 +357,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * PPC_LI32(_R0, imm); EMIT(PPC_RAW_ADDC(dst_reg, dst_reg, _R0)); } - if (imm >= 0) + if (imm >= 0 || (BPF_OP(code) == BPF_SUB && imm == 0x80000000)) EMIT(PPC_RAW_ADDZE(dst_reg_h, dst_reg_h)); else EMIT(PPC_RAW_ADDME(dst_reg_h, dst_reg_h)); @@ -623,7 +625,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * EMIT(PPC_RAW_LI(dst_reg_h, 0)); break; case BPF_ALU | BPF_ARSH | BPF_X: /* (s32) dst >>= src */ - EMIT(PPC_RAW_SRAW(dst_reg_h, dst_reg, src_reg)); + EMIT(PPC_RAW_SRAW(dst_reg, dst_reg, src_reg)); break; case BPF_ALU64 | BPF_ARSH | BPF_X: /* (s64) dst >>= src */ bpf_set_seen_register(ctx, tmp_reg); @@ -1073,7 +1075,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * break; case BPF_JMP32 | BPF_JSET | BPF_K: /* andi does not sign-extend the immediate */ - if (imm >= -32768 && imm < 32768) { + if (imm >= 0 && imm < 32768) { /* PPC_ANDI is _only/always_ dot-form */ EMIT(PPC_RAW_ANDI(_R0, dst_reg, imm)); } else { @@ -1090,7 +1092,9 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * */ case BPF_JMP | BPF_TAIL_CALL: ctx->seen |= SEEN_TAILCALL; - bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); + ret = bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); + if (ret < 0) + return ret; break; default: @@ -1103,7 +1107,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * return -EOPNOTSUPP; } if (BPF_CLASS(code) == BPF_ALU && !fp->aux->verifier_zext && - !insn_is_zext(&insn[i + 1])) + !insn_is_zext(&insn[i + 1]) && !(BPF_OP(code) == BPF_END && imm == 64)) EMIT(PPC_RAW_LI(dst_reg_h, 0)); } diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index b87a63dba9..8b5157ccfe 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "bpf_jit64.h" @@ -35,9 +36,9 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) * [ prev sp ] <------------- * [ ... 
] | * sp (r1) ---> [ stack pointer ] -------------- - * [ nv gpr save area ] 6*8 + * [ nv gpr save area ] 5*8 * [ tail_call_cnt ] 8 - * [ local_tmp_var ] 8 + * [ local_tmp_var ] 16 * [ unused red zone ] 208 bytes protected */ static int bpf_jit_stack_local(struct codegen_context *ctx) @@ -45,12 +46,12 @@ static int bpf_jit_stack_local(struct codegen_context *ctx) if (bpf_has_stack_frame(ctx)) return STACK_FRAME_MIN_SIZE + ctx->stack_size; else - return -(BPF_PPC_STACK_SAVE + 16); + return -(BPF_PPC_STACK_SAVE + 24); } static int bpf_jit_stack_tailcallcnt(struct codegen_context *ctx) { - return bpf_jit_stack_local(ctx) + 8; + return bpf_jit_stack_local(ctx) + 16; } static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) @@ -206,7 +207,7 @@ void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 fun EMIT(PPC_RAW_BCTRL()); } -static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) +static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) { /* * By now, the eBPF program has already setup parameters in r3, r4 and r5 @@ -267,13 +268,38 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 bpf_jit_emit_common_epilogue(image, ctx); EMIT(PPC_RAW_BCTR()); + /* out: */ + return 0; } +/* + * We spill into the redzone always, even if the bpf program has its own stackframe. + * Offsets hardcoded based on BPF_PPC_STACK_SAVE -- see bpf_jit_stack_local() + */ +void bpf_stf_barrier(void); + +asm ( +" .global bpf_stf_barrier ;" +" bpf_stf_barrier: ;" +" std 21,-64(1) ;" +" std 22,-56(1) ;" +" sync ;" +" ld 21,-64(1) ;" +" ld 22,-56(1) ;" +" ori 31,31,0 ;" +" .rept 14 ;" +" b 1f ;" +" 1: ;" +" .endr ;" +" blr ;" +); + /* Assemble the body code between the prologue & epilogue */ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, u32 *addrs, bool extra_pass) { + enum stf_barrier_type stf_barrier = stf_barrier_type_get(); const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; int i, ret; @@ -328,18 +354,25 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * EMIT(PPC_RAW_SUB(dst_reg, dst_reg, src_reg)); goto bpf_alu32_trunc; case BPF_ALU | BPF_ADD | BPF_K: /* (u32) dst += (u32) imm */ - case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */ case BPF_ALU64 | BPF_ADD | BPF_K: /* dst += imm */ + if (!imm) { + goto bpf_alu32_trunc; + } else if (imm >= -32768 && imm < 32768) { + EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, IMM_L(imm))); + } else { + PPC_LI32(b2p[TMP_REG_1], imm); + EMIT(PPC_RAW_ADD(dst_reg, dst_reg, b2p[TMP_REG_1])); + } + goto bpf_alu32_trunc; + case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */ case BPF_ALU64 | BPF_SUB | BPF_K: /* dst -= imm */ - if (BPF_OP(code) == BPF_SUB) - imm = -imm; - if (imm) { - if (imm >= -32768 && imm < 32768) - EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, IMM_L(imm))); - else { - PPC_LI32(b2p[TMP_REG_1], imm); - EMIT(PPC_RAW_ADD(dst_reg, dst_reg, b2p[TMP_REG_1])); - } + if (!imm) { + goto bpf_alu32_trunc; + } else if (imm > -32768 && imm <= 32768) { + EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, IMM_L(-imm))); + } else { + PPC_LI32(b2p[TMP_REG_1], imm); + EMIT(PPC_RAW_SUB(dst_reg, dst_reg, b2p[TMP_REG_1])); } goto bpf_alu32_trunc; case BPF_ALU | BPF_MUL | BPF_X: /* (u32) dst *= (u32) src */ @@ -389,8 +422,14 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * case BPF_ALU64 | BPF_DIV | BPF_K: /* dst /= imm */ if (imm == 0) return -EINVAL; - else if (imm == 1) 
- goto bpf_alu32_trunc; + if (imm == 1) { + if (BPF_OP(code) == BPF_DIV) { + goto bpf_alu32_trunc; + } else { + EMIT(PPC_RAW_LI(dst_reg, 0)); + break; + } + } PPC_LI32(b2p[TMP_REG_1], imm); switch (BPF_CLASS(code)) { @@ -631,6 +670,29 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * * BPF_ST NOSPEC (speculation barrier) */ case BPF_ST | BPF_NOSPEC: + if (!security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) || + !security_ftr_enabled(SEC_FTR_STF_BARRIER)) + break; + + switch (stf_barrier) { + case STF_BARRIER_EIEIO: + EMIT(PPC_RAW_EIEIO() | 0x02000000); + break; + case STF_BARRIER_SYNC_ORI: + EMIT(PPC_RAW_SYNC()); + EMIT(PPC_RAW_LD(b2p[TMP_REG_1], _R13, 0)); + EMIT(PPC_RAW_ORI(_R31, _R31, 0)); + break; + case STF_BARRIER_FALLBACK: + EMIT(PPC_RAW_MFLR(b2p[TMP_REG_1])); + PPC_LI64(12, dereference_kernel_function_descriptor(bpf_stf_barrier)); + EMIT(PPC_RAW_MTCTR(12)); + EMIT(PPC_RAW_BCTRL()); + EMIT(PPC_RAW_MTLR(b2p[TMP_REG_1])); + break; + case STF_BARRIER_NONE: + break; + } break; /* @@ -993,7 +1055,9 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * */ case BPF_JMP | BPF_TAIL_CALL: ctx->seen |= SEEN_TAILCALL; - bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); + ret = bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); + if (ret < 0) + return ret; break; default: diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index bc15200852..09fafcf2d3 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -867,6 +867,10 @@ static int __init eeh_pseries_init(void) if (is_kdump_kernel() || reset_devices) { pr_info("Issue PHB reset ...\n"); list_for_each_entry(phb, &hose_list, list_node) { + // Skip if the slot is empty + if (list_empty(&PCI_DN(phb->dn)->child_list)) + continue; + pdn = list_first_entry(&PCI_DN(phb->dn)->child_list, struct pci_dn, list); config_addr = pseries_eeh_get_pe_config_addr(pdn); diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 1b305e4118..8627362f61 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -507,12 +507,27 @@ static void pseries_msi_unmask(struct irq_data *d) irq_chip_unmask_parent(d); } +static void pseries_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct msi_desc *entry = irq_data_get_msi_desc(data); + + /* + * Do not update the MSIx vector table. It's not strictly necessary + * because the table is initialized by the underlying hypervisor, PowerVM + * or QEMU/KVM. However, if the MSIx vector entry is cleared, any further + * activation will fail. This can happen in some drivers (eg. IPR) which + * deactivate an IRQ used for testing MSI support. + */ + entry->msg = *msg; +} + static struct irq_chip pseries_pci_msi_irq_chip = { .name = "pSeries-PCI-MSI", .irq_shutdown = pseries_msi_shutdown, .irq_mask = pseries_msi_mask, .irq_unmask = pseries_msi_unmask, .irq_eoi = irq_chip_eoi_parent, + .irq_write_msi_msg = pseries_msi_write_msg, }; static struct msi_domain_info pseries_msi_domain_info = { diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index c732ce5a3e..c5d75c02ad 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -945,7 +945,8 @@ static int xive_get_irqchip_state(struct irq_data *data, * interrupt to be inactive in that case. 
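The eeh_pseries.c guard above matters because list_first_entry() on an empty list is undefined: it would rebase the list head itself as if it were a pci_dn and dereference garbage. A freestanding illustration (list_head and container_of are reimplemented minimally here, not taken from kernel headers):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
struct pci_dn { int busno; struct list_head list; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static int list_empty(const struct list_head *head)
{
	return head->next == head;
}

int main(void)
{
	struct list_head child_list = { &child_list, &child_list }; /* empty PHB */

	if (list_empty(&child_list))
		puts("slot empty, PHB reset skipped");	/* the added guard */
	else
		printf("first child bus %d\n",
		       container_of(child_list.next, struct pci_dn, list)->busno);
	return 0;
}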
*/ *state = (pq != XIVE_ESB_INVALID) && !xd->stale_p && - (xd->saved_p || !!(pq & XIVE_ESB_VAL_P)); + (xd->saved_p || (!!(pq & XIVE_ESB_VAL_P) && + !irqd_irq_disabled(data))); return 0; default: return -EINVAL; } diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 301a54233c..6a6fa9e976 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -561,5 +561,3 @@ menu "Power management options" source "kernel/power/Kconfig" endmenu - -source "drivers/firmware/Kconfig" diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index b933b1583c..34fbb3ea21 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -82,4 +82,5 @@ static inline int syscall_get_arch(struct task_struct *task) #endif } +asmlinkage long sys_riscv_flush_icache(uintptr_t, uintptr_t, uintptr_t); #endif /* _ASM_RISCV_SYSCALL_H */ diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index 893e47195e..208e31bc5d 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -16,18 +16,24 @@ #ifdef CONFIG_MMU #include -#include +/* + * All systems with an MMU have a VDSO, but systems without an MMU don't + * support shared libraries and therefore don't have one. + */ +#ifdef CONFIG_MMU -#ifndef CONFIG_GENERIC_TIME_VSYSCALL -struct vdso_data { -}; -#endif +#define __VVAR_PAGES 1 + +#ifndef __ASSEMBLY__ +#include #define VDSO_SYMBOL(base, name) \ (void __user *)((unsigned long)(base) + __vdso_##name##_offset) #endif /* CONFIG_MMU */ -asmlinkage long sys_riscv_flush_icache(uintptr_t, uintptr_t, uintptr_t); +#endif /* !__ASSEMBLY__ */ + +#endif /* CONFIG_MMU */ #endif /* _ASM_RISCV_VDSO_H */ diff --git a/arch/riscv/include/uapi/asm/unistd.h b/arch/riscv/include/uapi/asm/unistd.h index 4b989ae15d..8062996c2d 100644 --- a/arch/riscv/include/uapi/asm/unistd.h +++ b/arch/riscv/include/uapi/asm/unistd.h @@ -18,9 +18,10 @@ #ifdef __LP64__ #define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_SET_GET_RLIMIT -#define __ARCH_WANT_SYS_CLONE3 #endif /* __LP64__ */ +#define __ARCH_WANT_SYS_CLONE3 + #include /* diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c index a63c667c27..44b1420a22 100644 --- a/arch/riscv/kernel/syscall_table.c +++ b/arch/riscv/kernel/syscall_table.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #undef __SYSCALL diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 25a3b88495..b70956d804 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -12,14 +12,24 @@ #include #include #include +#include + #ifdef CONFIG_GENERIC_TIME_VSYSCALL #include #else -#include +struct vdso_data { +}; #endif extern char vdso_start[], vdso_end[]; +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_NR_PAGES, +}; + +#define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT) + static unsigned int vdso_pages __ro_after_init; static struct page **vdso_pagelist __ro_after_init; @@ -38,7 +48,7 @@ static int __init vdso_init(void) vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT; vdso_pagelist = - kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL); + kcalloc(vdso_pages + VVAR_NR_PAGES, sizeof(struct page *), GFP_KERNEL); if (unlikely(vdso_pagelist == NULL)) { pr_err("vdso: pagelist allocation failed\n"); return -ENOMEM; } @@ -63,38 +73,41 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, unsigned long vdso_base, vdso_len; int ret; - vdso_len = (vdso_pages + 1) << PAGE_SHIFT; + BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + + vdso_len = (vdso_pages + VVAR_NR_PAGES) << 
PAGE_SHIFT; + + if (mmap_write_lock_killable(mm)) + return -EINTR; - mmap_write_lock(mm); vdso_base = get_unmapped_area(NULL, 0, vdso_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { ret = vdso_base; goto end; } + mm->context.vdso = NULL; + ret = install_special_mapping(mm, vdso_base, VVAR_SIZE, + (VM_READ | VM_MAYREAD), &vdso_pagelist[vdso_pages]); + if (unlikely(ret)) + goto end; + + ret = + install_special_mapping(mm, vdso_base + VVAR_SIZE, + vdso_pages << PAGE_SHIFT, + (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC), + vdso_pagelist); + + if (unlikely(ret)) + goto end; + /* * Put vDSO base into mm struct. We need to do this before calling * install_special_mapping or the perf counter mmap tracking code * will fail to recognise it as a vDSO (since arch_vma_name fails). */ - mm->context.vdso = (void *)vdso_base; + mm->context.vdso = (void *)vdso_base + VVAR_SIZE; - ret = - install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, - (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC), - vdso_pagelist); - - if (unlikely(ret)) { - mm->context.vdso = NULL; - goto end; - } - - vdso_base += (vdso_pages << PAGE_SHIFT); - ret = install_special_mapping(mm, vdso_base, PAGE_SIZE, - (VM_READ | VM_MAYREAD), &vdso_pagelist[vdso_pages]); - - if (unlikely(ret)) - mm->context.vdso = NULL; end: mmap_write_unlock(mm); return ret; @@ -105,7 +118,7 @@ const char *arch_vma_name(struct vm_area_struct *vma) if (vma->vm_mm && (vma->vm_start == (long)vma->vm_mm->context.vdso)) return "[vdso]"; if (vma->vm_mm && (vma->vm_start == - (long)vma->vm_mm->context.vdso + PAGE_SIZE)) + (long)vma->vm_mm->context.vdso - VVAR_SIZE)) return "[vdso_data]"; return NULL; } diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index e6f558bca7..e9111f700a 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -3,12 +3,13 @@ * Copyright (C) 2012 Regents of the University of California */ #include +#include OUTPUT_ARCH(riscv) SECTIONS { - PROVIDE(_vdso_data = . + PAGE_SIZE); + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); . 
= SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 0941186632..89f81067e0 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -16,6 +16,8 @@ static void ipi_remote_fence_i(void *info) void flush_icache_all(void) { + local_flush_icache_all(); + if (IS_ENABLED(CONFIG_RISCV_SBI)) sbi_remote_fence_i(NULL); else diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index e4803ec511..6b3c366af7 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -207,6 +207,8 @@ int zpci_enable_device(struct zpci_dev *); int zpci_disable_device(struct zpci_dev *); int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh); int zpci_deconfigure_device(struct zpci_dev *zdev); +void zpci_device_reserved(struct zpci_dev *zdev); +bool zpci_is_device_configured(struct zpci_dev *zdev); int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); int zpci_unregister_ioat(struct zpci_dev *, u8); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 16256e17a5..10722455fd 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 752a0ffab9..6a6dd5e1da 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4066,7 +4066,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } - clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index ecd741ee32..52bc8fbaa6 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + return test_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index cfcdf76d6a..a95ca6df4e 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -259,14 +259,13 @@ EXPORT_SYMBOL(strcmp); #ifdef __HAVE_ARCH_STRRCHR char *strrchr(const char *s, int c) { - size_t len = __strend(s) - s; + ssize_t len = __strend(s) - s; - if (len) - do { - if (s[len] == (char) c) - return (char *) s + len; - } while (--len > 0); - return NULL; + do { + if (s[len] == (char)c) + return (char *)s + len; + } while (--len >= 0); + return NULL; } EXPORT_SYMBOL(strrchr); #endif diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 840d859443..1a374d021e 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1826,7 +1826,7 @@ struct bpf_prog 
*bpf_int_jit_compile(struct bpf_prog *fp) jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); if (jit.addrs == NULL) { fp = orig_fp; - goto out; + goto free_addrs; } /* * Three initial passes: diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index e7e6788d75..b833155ce8 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -92,7 +92,7 @@ void zpci_remove_reserved_devices(void) spin_unlock(&zpci_list_lock); list_for_each_entry_safe(zdev, tmp, &remove, entry) - zpci_zdev_put(zdev); + zpci_device_reserved(zdev); } int pci_domain_nr(struct pci_bus *bus) @@ -751,6 +751,14 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) return ERR_PTR(rc); } +bool zpci_is_device_configured(struct zpci_dev *zdev) +{ + enum zpci_state state = zdev->state; + + return state != ZPCI_FN_STATE_RESERVED && + state != ZPCI_FN_STATE_STANDBY; +} + /** * zpci_scan_configured_device() - Scan a freshly configured zpci_dev * @zdev: The zpci_dev to be configured @@ -822,6 +830,31 @@ int zpci_deconfigure_device(struct zpci_dev *zdev) return 0; } +/** + * zpci_device_reserved() - Mark device as reserved + * @zdev: the zpci_dev that was reserved + * + * Handle the case that a given zPCI function was reserved by another system. + * After a call to this function the zpci_dev cannot be found via + * get_zdev_by_fid() anymore but may still be accessible via existing + * references though it will not be functional anymore. + */ +void zpci_device_reserved(struct zpci_dev *zdev) +{ + if (zdev->has_hp_slot) + zpci_exit_slot(zdev); + /* + * Remove device from zpci_list as it is going away. This also + * makes sure we ignore subsequent zPCI events for this device. + */ + spin_lock(&zpci_list_lock); + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); + zdev->state = ZPCI_FN_STATE_RESERVED; + zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + zpci_zdev_put(zdev); +} + void zpci_release_device(struct kref *kref) { struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); @@ -843,6 +876,12 @@ void zpci_release_device(struct kref *kref) case ZPCI_FN_STATE_STANDBY: if (zdev->has_hp_slot) zpci_exit_slot(zdev); + spin_lock(&zpci_list_lock); + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); + zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + fallthrough; + case ZPCI_FN_STATE_RESERVED: if (zdev->has_resources) zpci_cleanup_bus_resources(zdev); zpci_bus_device_unregister(zdev); @@ -851,10 +890,6 @@ void zpci_release_device(struct kref *kref) default: break; } - - spin_lock(&zpci_list_lock); - list_del(&zdev->entry); - spin_unlock(&zpci_list_lock); zpci_dbg(3, "rem fid:%x\n", zdev->fid); kfree(zdev); } diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index c856f80cb2..5b8d647523 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -140,7 +140,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) /* The 0x0304 event may immediately reserve the device */ if (!clp_get_state(zdev->fid, &state) && state == ZPCI_FN_STATE_RESERVED) { - zpci_zdev_put(zdev); + zpci_device_reserved(zdev); } } break; @@ -151,7 +151,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) case 0x0308: /* Standby -> Reserved */ if (!zdev) break; - zpci_zdev_put(zdev); + zpci_device_reserved(zdev); break; default: break; } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ab83c22d27..d9830e7e10 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1405,7 +1405,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M486SX && 
!M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 + depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6 select X86_PAE help Select this if you have a 32-bit processor and more than 4 @@ -1525,7 +1525,6 @@ config AMD_MEM_ENCRYPT config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT bool "Activate AMD Secure Memory Encryption (SME) by default" - default y depends on AMD_MEM_ENCRYPT help Say yes to have system memory encrypted by default if running on @@ -2832,8 +2831,6 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 -source "drivers/firmware/Kconfig" - source "arch/x86/kvm/Kconfig" source "arch/x86/Kconfig.assembler" diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index fa2c3f50ae..18d2f51991 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -367,10 +367,11 @@ SYM_FUNC_START(sm4_aesni_avx_crypt8) * %rdx: src (1..8 blocks) * %rcx: num blocks (1..8) */ - FRAME_BEGIN - cmpq $5, %rcx; jb sm4_aesni_avx_crypt4; + + FRAME_BEGIN + vmovdqu (0 * 16)(%rdx), RA0; vmovdqu (1 * 16)(%rdx), RA1; vmovdqu (2 * 16)(%rdx), RA2; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 2a57dbed48..6dfa8ddaa6 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2465,6 +2465,7 @@ static int x86_pmu_event_init(struct perf_event *event) if (err) { if (event->destroy) event->destroy(event); + event->destroy = NULL; } if (READ_ONCE(x86_pmu.attr_rdpmc) && diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7011e87be6..9a04443807 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -263,6 +263,7 @@ static struct event_constraint intel_icl_event_constraints[] = { INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf), + INTEL_EVENT_CONSTRAINT(0xef, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf), EVENT_CONSTRAINT_END }; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index c853b28efa..96c775abe3 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -68,6 +68,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_BROADWELL_D: case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_ATOM_SILVERMONT: case INTEL_FAM6_ATOM_SILVERMONT_D: diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 32a1ad356c..db2d92fb44 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -122,17 +122,27 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, ipi_arg->reserved = 0; ipi_arg->vp_set.valid_bank_mask = 0; - if (!cpumask_equal(mask, cpu_present_mask)) { + /* + * Use HV_GENERIC_SET_ALL and avoid converting cpumask to VP_SET + * when the IPI is sent to all currently present CPUs. + */ + if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; if (exclude_self) nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask); else nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); - } - if (nr_bank < 0) - goto ipi_mask_ex_done; - if (!nr_bank) + + /* + * 'nr_bank <= 0' means some CPUs in cpumask can't be + * represented in VP_SET. Return an error and fall back to + * native (architectural) method of sending IPIs. 
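The hv_apic.c change reduces to a format decision: a sparse VP_SET is needed only when the target mask differs from all present CPUs or when the sender excludes itself; otherwise HV_GENERIC_SET_ALL skips the cpumask conversion, and a failed conversion (nr_bank <= 0) falls back to the architectural IPI path. A sketch of that decision tree; the types and the representability flag are simplified stand-ins:

#include <stdbool.h>
#include <stdio.h>

enum vp_set_format { HV_GENERIC_SET_SPARSE_4K, HV_GENERIC_SET_ALL };

/* Stand-in: a result <= 0 means the mask has no VP_SET representation. */
static int cpumask_to_vpset_stub(bool representable)
{
	return representable ? 2 : -1;
}

static int pick_format(bool mask_is_all_present, bool exclude_self,
		       bool representable, enum vp_set_format *fmt)
{
	if (!mask_is_all_present || exclude_self) {
		int nr_bank = cpumask_to_vpset_stub(representable);

		if (nr_bank <= 0)
			return -1;	/* caller falls back to native IPIs */
		*fmt = HV_GENERIC_SET_SPARSE_4K;
		return nr_bank;
	}
	*fmt = HV_GENERIC_SET_ALL;
	return 0;
}

int main(void)
{
	enum vp_set_format fmt;

	printf("all present -> %d\n", pick_format(true, false, true, &fmt));
	printf("sparse      -> %d\n", pick_format(false, false, true, &fmt));
	printf("fallback    -> %d\n", pick_format(false, true, false, &fmt));
	return 0;
}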
+ */ + if (nr_bank <= 0) + goto ipi_mask_ex_done; + } else { ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; + } status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, ipi_arg, NULL); diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 14ebd21965..43184640b5 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -25,7 +25,7 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) * For !SMAP hardware we patch out CLAC on entry. */ if (boot_cpu_has(X86_FEATURE_SMAP) || - (IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV))) + (IS_ENABLED(CONFIG_64BIT) && boot_cpu_has(X86_FEATURE_XENPV))) mask |= X86_EFLAGS_AC; WARN_ON_ONCE(flags & mask); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index 87bd6025d9..6a5f3acf2b 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -46,7 +46,7 @@ struct kvm_page_track_notifier_node { struct kvm_page_track_notifier_node *node); }; -void kvm_page_track_init(struct kvm *kvm); +int kvm_page_track_init(struct kvm *kvm); void kvm_page_track_cleanup(struct kvm *kvm); void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h index eceea92990..6c57651921 100644 --- a/arch/x86/include/asm/kvmclock.h +++ b/arch/x86/include/asm/kvmclock.h @@ -2,6 +2,20 @@ #ifndef _ASM_X86_KVM_CLOCK_H #define _ASM_X86_KVM_CLOCK_H +#include + extern struct clocksource kvm_clock; +DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); + +static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +{ + return &this_cpu_read(hv_clock_per_cpu)->pvti; +} + +static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) +{ + return this_cpu_read(hv_clock_per_cpu); +} + #endif /* _ASM_X86_KVM_CLOCK_H */ diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 3506d8c598..4557f7cb0f 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -14,16 +14,19 @@ static inline int pci_xen_hvm_init(void) return -1; } #endif -#if defined(CONFIG_XEN_DOM0) +#ifdef CONFIG_XEN_PV_DOM0 int __init pci_xen_initial_domain(void); -int xen_find_device_domain_owner(struct pci_dev *dev); -int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); -int xen_unregister_device_domain_owner(struct pci_dev *dev); #else static inline int __init pci_xen_initial_domain(void) { return -1; } +#endif +#ifdef CONFIG_XEN_DOM0 +int xen_find_device_domain_owner(struct pci_dev *dev); +int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); +int xen_unregister_device_domain_owner(struct pci_dev *dev); +#else static inline int xen_find_device_domain_owner(struct pci_dev *dev) { return -1; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0f8885949e..b3410f1ac2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -326,6 +326,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_SMAP cr4_set_bits(X86_CR4_SMAP); #else + clear_cpu_cap(c, X86_FEATURE_SMAP); cr4_clear_bits(X86_CR4_SMAP); #endif } diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4b8813baff..bb1c3f5f60 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -527,12 +527,14 @@ static void domain_add_cpu(int cpu, struct 
rdt_resource *r) rdt_domain_reconfigure_cdp(r); if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - kfree(d); + kfree(hw_dom); return; } if (r->mon_capable && domain_setup_mon_state(r, d)) { - kfree(d); + kfree(hw_dom->ctrl_val); + kfree(hw_dom->mbps_val); + kfree(hw_dom); return; } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 38837dad46..391a4e2b86 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -714,12 +714,6 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3e20, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3ec4, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x8a12, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 445c57c9c5..831b25c5e7 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -379,9 +379,14 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, sizeof(fpu->state.fxsave))) return -EFAULT; - /* Reject invalid MXCSR values. */ - if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) - return -EINVAL; + if (IS_ENABLED(CONFIG_X86_64)) { + /* Reject invalid MXCSR values. */ + if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; + } else { + /* Mask invalid bits out for historical reasons (broken hardware). */ + fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + } /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 42fc41dd0e..882213df37 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -10,6 +10,7 @@ #include #include #include +#include #undef pr_fmt #define pr_fmt(fmt) "hpet: " fmt @@ -916,6 +917,83 @@ static bool __init hpet_counting(void) return false; } +static bool __init mwait_pc10_supported(void) +{ + unsigned int eax, ebx, ecx, mwait_substates; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) + return false; + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && + (ecx & CPUID5_ECX_INTERRUPT_BREAK) && + (mwait_substates & (0xF << 28)); +} + +/* + * Check whether the system supports PC10. If so force disable HPET as that + * stops counting in PC10. This check is overbroad as it does not take any + * of the following into account: + * + * - ACPI tables + * - Enablement of intel_idle + * - Command line arguments which limit intel_idle C-state support + * + * That's perfectly fine. HPET is a piece of hardware designed by committee + * and the only reason why it is still in use on modern systems is the + * fact that it is impossible to reliably query TSC and CPU frequency via + * CPUID or firmware. + * + * If HPET is functional it is useful for calibrating TSC, but this can be + * done via PMTIMER as well which seems to be the last remaining timer on + * X86/INTEL platforms that has not been completely wrecked by feature + * creep. 
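mwait_pc10_supported() above probes CPUID leaf 5 (MONITOR/MWAIT): ECX bits 0 and 1 advertise the enumeration extension and interrupt-break-event, and EDX reports MWAIT sub-states in 4-bit groups, with bits 31:28 covering the deepest package C-state. Below is an x86-only userspace probe of the same leaf; a sketch assuming GCC/Clang's <cpuid.h>, and it deliberately skips the MSR_PKG_CST_CONFIG_CONTROL half of the kernel check:

#include <stdbool.h>
#include <stdio.h>
#include <cpuid.h>

static bool mwait_pc10_supported(void)
{
	unsigned int eax, ebx, ecx, mwait_substates;

	if (!__get_cpuid(5, &eax, &ebx, &ecx, &mwait_substates))
		return false;

	return (ecx & 0x1) &&				/* CPUID5_ECX_EXTENSIONS_SUPPORTED */
	       (ecx & 0x2) &&				/* CPUID5_ECX_INTERRUPT_BREAK */
	       (mwait_substates & (0xFu << 28));	/* deepest C-state sub-states */
}

int main(void)
{
	printf("PC10-capable MWAIT: %s\n", mwait_pc10_supported() ? "yes" : "no");
	return 0;
}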
+ * + * In theory HPET support should be removed altogether, but there are older + * systems out there which depend on it because TSC and APIC timer are + * dysfunctional in deeper C-states. + * + * It's only 20 years now that hardware people have been asked to provide + * reliable and discoverable facilities which can be used for timekeeping + * and per CPU timer interrupts. + * + * The probability that this problem is going to be solved in the + * foreseeable future is close to zero, so the kernel has to be cluttered + * with heuristics to keep up with the ever growing amount of hardware and + * firmware trainwrecks. Hopefully some day hardware people will understand + * that the approach of "This can be fixed in software" is not sustainable. + * Hope dies last... + */ +static bool __init hpet_is_pc10_damaged(void) +{ + unsigned long long pcfg; + + /* Check whether PC10 substates are supported */ + if (!mwait_pc10_supported()) + return false; + + /* Check whether PC10 is enabled in PKG C-state limit */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + if ((pcfg & 0xF) < 8) + return false; + + if (hpet_force_user) { + pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n"); + return false; + } + + pr_info("HPET dysfunctional in PC10. Force disabled.\n"); + boot_hpet_disable = true; + return true; +} + /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ @@ -929,6 +1007,9 @@ int __init hpet_enable(void) if (!is_hpet_capable()) return 0; + if (hpet_is_pc10_damaged()) + return 0; + hpet_set_mapping(); if (!hpet_virt_address) return 0; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ad273e5861..73c74b961d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -49,18 +49,9 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); static struct pvclock_vsyscall_time_info hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); static struct pvclock_wall_clock wall_clock __bss_decrypted; -static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); static struct pvclock_vsyscall_time_info *hvclock_mem; - -static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) -{ - return &this_cpu_read(hv_clock_per_cpu)->pvti; -} - -static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) -{ - return this_cpu_read(hv_clock_per_cpu); -} +DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu); /* * The wallclock is the time of day when we booted. 
Since then, some time may diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 9f90f460a2..bf1033a62e 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -130,6 +130,8 @@ static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, } else { ret = ES_VMM_ERROR; } + } else if (ghcb->save.sw_exit_info_1 & 0xffffffff) { + ret = ES_VMM_ERROR; } else { ret = ES_OK; } diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2837110e66..9a144ca8e1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -435,7 +435,6 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); __FOP_RET(#op) asm(".pushsection .fixup, \"ax\"\n" - ".global kvm_fastop_exception \n" "kvm_fastop_exception: xor %esi, %esi; ret\n" ".popsection"); @@ -4206,7 +4205,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) u64 cr4 = ctxt->ops->get_cr(ctxt, 4); if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt)) - return emulate_ud(ctxt); + return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 232a86a6fa..d5124b520f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -939,7 +939,7 @@ static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_init(&hv_vcpu->stimer[i], i); - hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); + hv_vcpu->vp_index = vcpu->vcpu_idx; return 0; } @@ -1444,7 +1444,6 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) switch (msr) { case HV_X64_MSR_VP_INDEX: { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); - int vcpu_idx = kvm_vcpu_get_idx(vcpu); u32 new_vp_index = (u32)data; if (!host || new_vp_index >= KVM_MAX_VCPUS) @@ -1459,9 +1458,9 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) * VP index is changing, adjust num_mismatched_vp_indexes if * it now matches or no longer matches vcpu_idx. */ - if (hv_vcpu->vp_index == vcpu_idx) + if (hv_vcpu->vp_index == vcpu->vcpu_idx) atomic_inc(&hv->num_mismatched_vp_indexes); - else if (new_vp_index == vcpu_idx) + else if (new_vp_index == vcpu->vcpu_idx) atomic_dec(&hv->num_mismatched_vp_indexes); hv_vcpu->vp_index = new_vp_index; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 730da8537d..ed1c4e546d 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -83,7 +83,7 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); - return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu); + return hv_vcpu ? 
hv_vcpu->vp_index : vcpu->vcpu_idx; } int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index ff005fe738..8c065da73f 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -319,8 +319,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) unsigned index; bool mask_before, mask_after; union kvm_ioapic_redirect_entry *e; - unsigned long vcpu_bitmap; int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode; + DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -384,9 +384,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) irq.shorthand = APIC_DEST_NOSHORT; irq.dest_id = e->fields.dest_id; irq.msi_redir_hint = false; - bitmap_zero(&vcpu_bitmap, 16); + bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, - &vcpu_bitmap); + vcpu_bitmap); if (old_dest_mode != e->fields.dest_mode || old_dest_id != e->fields.dest_id) { /* @@ -399,10 +399,10 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) kvm_lapic_irq_dest_mode( !!e->fields.dest_mode); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, - &vcpu_bitmap); + vcpu_bitmap); } kvm_make_scan_ioapic_request_mask(ioapic->kvm, - &vcpu_bitmap); + vcpu_bitmap); } else { kvm_make_scan_ioapic_request(ioapic->kvm); } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2d7e61122a..1a64ba5b94 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2027,8 +2027,8 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) } while (!sp->unsync_children); } -static void mmu_sync_children(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *parent) +static int mmu_sync_children(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *parent, bool can_yield) { int i; struct kvm_mmu_page *sp; @@ -2055,12 +2055,18 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + if (!can_yield) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + return -EINTR; + } + cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); flush = false; } } kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + return 0; } static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) @@ -2146,9 +2152,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } - if (sp->unsync_children) - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - __clear_sp_write_flooding_count(sp); trace_get_page: @@ -3684,7 +3687,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) write_lock(&vcpu->kvm->mmu_lock); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); - mmu_sync_children(vcpu, sp); + mmu_sync_children(vcpu, sp, true); kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); write_unlock(&vcpu->kvm->mmu_lock); @@ -3700,7 +3703,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (IS_VALID_PAE_ROOT(root)) { root &= PT64_BASE_ADDR_MASK; sp = to_shadow_page(root); - mmu_sync_children(vcpu, sp); + mmu_sync_children(vcpu, sp, true); } } diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 269f11f92f..21427e84a8 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -164,13 +164,13 @@ void kvm_page_track_cleanup(struct kvm *kvm) cleanup_srcu_struct(&head->track_srcu); } -void kvm_page_track_init(struct kvm *kvm) +int kvm_page_track_init(struct 
kvm *kvm) { struct kvm_page_track_notifier_head *head; head = &kvm->arch.track_notifier_head; - init_srcu_struct(&head->track_srcu); INIT_HLIST_HEAD(&head->track_notifier_list); + return init_srcu_struct(&head->track_srcu); } /* diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 7d03e9b7cc..913d52a792 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -707,8 +707,27 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, - false, access); + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, + it.level-1, false, access); + /* + * We must synchronize the pagetable before linking it + * because the guest doesn't need to flush tlb when + * the gpte is changed from non-present to present. + * Otherwise, the guest may use the wrong mapping. + * + * For PG_LEVEL_4K, kvm_mmu_get_page() has already + * synchronized it transiently via kvm_sync_page(). + * + * For higher level pagetable, we synchronize it via + * the slower mmu_sync_children(). If it needs to + * break, some progress has been made; return + * RET_PF_RETRY and retry on the next #PF. + * KVM_REQ_MMU_SYNC is not necessary but it + * expedites the process. + */ + if (sp->unsync_children && + mmu_sync_children(vcpu, sp, false)) + return RET_PF_RETRY; } /* @@ -1047,14 +1066,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. - * - * Note: - * We should flush all tlbs if spte is dropped even though guest is - * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page - * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't - * used by guest then tlbs are not flushed, so guest is allowed to access the - * freed pages. - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1107,13 +1118,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - /* - * Update spte before increasing tlbs_dirty to make - * sure no tlb flush is lost after spte is zapped; see - * the comments in kvm_flush_remote_tlbs(). - */ - smp_wmb(); - vcpu->kvm->tlbs_dirty++; + set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; continue; } @@ -1128,12 +1133,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - /* - * The same as above where we are doing - * prefetch_invalid_gpte(). 
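
The can_yield contract introduced for mmu_sync_children() above follows a common kernel pattern: a walker that hits lock contention either reschedules (when the caller can tolerate it) or records a deferred request and bails so the fault path can retry. A minimal userspace sketch of that control flow; the names here (sync_children, need_yield, RETRY) are illustrative, not kernel APIs:

#include <stdbool.h>
#include <stdio.h>

enum { DONE = 0, RETRY = -1 };

static bool need_yield(int step)
{
    /* Stand-in for need_resched()/rwlock_needbreak(). */
    return step == 2;
}

static int sync_children(int nsteps, bool can_yield, bool *deferred)
{
    for (int i = 0; i < nsteps; i++) {
        if (need_yield(i)) {
            if (!can_yield) {
                /* Caller can't drop the lock here: queue the
                 * work (cf. KVM_REQ_MMU_SYNC) and let it retry. */
                *deferred = true;
                return RETRY;
            }
            /* cond_resched_rwlock_write() equivalent would go here. */
        }
        printf("synced child %d\n", i);
    }
    return DONE;
}

int main(void)
{
    bool deferred = false;

    if (sync_children(4, false, &deferred) == RETRY && deferred)
        printf("bailed early; retry on the next fault\n");
    return 0;
}
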
- */ - smp_wmb(); - vcpu->kvm->tlbs_dirty++; + set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; continue; } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 2545d0c619..510b833cbd 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -545,7 +545,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); - svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; svm->vmcb->control.int_state = svm->nested.ctl.int_state; svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; @@ -579,7 +578,7 @@ static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to } int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, - struct vmcb *vmcb12) + struct vmcb *vmcb12, bool from_vmrun) { struct vcpu_svm *svm = to_svm(vcpu); int ret; @@ -609,13 +608,16 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, - nested_npt_enabled(svm), true); + nested_npt_enabled(svm), from_vmrun); if (ret) return ret; if (!npt_enabled) vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested; + if (!from_vmrun) + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + svm_set_gif(svm, true); return 0; @@ -681,7 +683,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) svm->nested.nested_run_pending = 1; - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12)) + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true)) goto out_exit_err; if (nested_svm_vmrun_msrpm(svm)) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 75e0b21ad0..c36b5fe4c2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -595,43 +595,50 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) return 0; } +static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, + int *error) +{ + struct sev_data_launch_update_vmsa vmsa; + struct vcpu_svm *svm = to_svm(vcpu); + int ret; + + /* Perform some pre-encryption checks against the VMSA */ + ret = sev_es_sync_vmsa(svm); + if (ret) + return ret; + + /* + * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of + * the VMSA memory content (i.e it will write the same memory region + * with the guest's key), so invalidate it first. + */ + clflush_cache_range(svm->vmsa, PAGE_SIZE); + + vmsa.reserved = 0; + vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; + vmsa.address = __sme_pa(svm->vmsa); + vmsa.len = PAGE_SIZE; + return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); +} + static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - struct sev_data_launch_update_vmsa vmsa; struct kvm_vcpu *vcpu; int i, ret; if (!sev_es_guest(kvm)) return -ENOTTY; - vmsa.reserved = 0; - kvm_for_each_vcpu(i, vcpu, kvm) { - struct vcpu_svm *svm = to_svm(vcpu); - - /* Perform some pre-encryption checks against the VMSA */ - ret = sev_es_sync_vmsa(svm); + ret = mutex_lock_killable(&vcpu->mutex); if (ret) return ret; - /* - * The LAUNCH_UPDATE_VMSA command will perform in-place - * encryption of the VMSA memory content (i.e it will write - * the same memory region with the guest's key), so invalidate - * it first. 
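
The rewritten sev_launch_update_vmsa() loop above takes each vCPU's mutex with mutex_lock_killable(), so a fatal signal aborts the ioctl cleanly instead of leaving it stuck. Roughly this shape, using pthreads as a userspace stand-in (POSIX has no killable lock, so the nonzero-return check is only an analogy):

#include <pthread.h>
#include <stdio.h>

struct vcpu {
    pthread_mutex_t mutex;
    int id;
};

/* Stand-in for __sev_launch_update_vmsa(): encrypt one vCPU's state. */
static int update_one(struct vcpu *v)
{
    printf("updated VMSA for vcpu %d\n", v->id);
    return 0;
}

static int update_all(struct vcpu *vcpus, int n)
{
    for (int i = 0; i < n; i++) {
        /* In the kernel this is mutex_lock_killable(), so a fatal
         * signal fails the whole ioctl instead of blocking forever. */
        int ret = pthread_mutex_lock(&vcpus[i].mutex);
        if (ret)
            return ret;
        ret = update_one(&vcpus[i]);
        pthread_mutex_unlock(&vcpus[i].mutex);
        if (ret)
            return ret;    /* stop at the first failure */
    }
    return 0;
}

int main(void)
{
    struct vcpu v[2] = {
        { PTHREAD_MUTEX_INITIALIZER, 0 },
        { PTHREAD_MUTEX_INITIALIZER, 1 },
    };
    return update_all(v, 2);
}
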
- */ - clflush_cache_range(svm->vmsa, PAGE_SIZE); + ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error); - vmsa.handle = sev->handle; - vmsa.address = __sme_pa(svm->vmsa); - vmsa.len = PAGE_SIZE; - ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, - &argp->error); + mutex_unlock(&vcpu->mutex); if (ret) return ret; - - svm->vcpu.arch.guest_state_protected = true; } return 0; @@ -1397,8 +1404,10 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Bind ASID to this guest */ ret = sev_bind_asid(kvm, start.handle, error); - if (ret) + if (ret) { + sev_decommission(start.handle); goto e_free_session; + } params.handle = start.handle; if (copy_to_user((void __user *)(uintptr_t)argp->data, @@ -1464,7 +1473,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Pin guest memory */ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, - PAGE_SIZE, &n, 0); + PAGE_SIZE, &n, 1); if (IS_ERR(guest_page)) { ret = PTR_ERR(guest_page); goto e_free_trans; @@ -1501,6 +1510,20 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } +static bool cmd_allowed_from_miror(u32 cmd_id) +{ + /* + * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES + * active mirror VMs. Also allow the debugging and status commands. + */ + if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA || + cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT || + cmd_id == KVM_SEV_DBG_ENCRYPT) + return true; + + return false; +} + int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -1517,8 +1540,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) mutex_lock(&kvm->lock); - /* enc_context_owner handles all memory enc operations */ - if (is_mirroring_enc_context(kvm)) { + /* Only the enc_context_owner handles some memory enc operations. */ + if (is_mirroring_enc_context(kvm) && + !cmd_allowed_from_miror(sev_cmd.id)) { r = -EINVAL; goto out; } @@ -1715,8 +1739,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; - struct kvm_sev_info *mirror_sev; - unsigned int asid; + struct kvm_sev_info source_sev, *mirror_sev; int ret; source_kvm_file = fget(source_fd); @@ -1739,7 +1762,8 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) goto e_source_unlock; } - asid = to_kvm_svm(source_kvm)->sev_info.asid; + memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info, + sizeof(source_sev)); /* * The mirror kvm holds an enc_context_owner ref so its asid can't @@ -1759,8 +1783,16 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) /* Set enc_context_owner and copy its encryption context over */ mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; - mirror_sev->asid = asid; mirror_sev->active = true; + mirror_sev->asid = source_sev.asid; + mirror_sev->fd = source_sev.fd; + mirror_sev->es_active = source_sev.es_active; + mirror_sev->handle = source_sev.handle; + /* + * Do not copy ap_jump_table. Since the mirror does not share the same + * KVM contexts as the original, and they may have different + * memory-views. 
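
cmd_allowed_from_miror() above is a default-deny allow-list: a mirror VM may issue only the VMSA-update, status, and debug commands, and any new command stays owner-only until explicitly added. A self-contained sketch of the same shape; the enum values are illustrative, the real command IDs come from the KVM UAPI headers:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative command IDs only. */
enum sev_cmd {
    LAUNCH_UPDATE_VMSA,
    GUEST_STATUS,
    DBG_DECRYPT,
    DBG_ENCRYPT,
    LAUNCH_START,        /* owner-only */
};

/* Default-deny: unlisted commands are refused from a mirror VM. */
static bool cmd_allowed_from_mirror(enum sev_cmd id)
{
    switch (id) {
    case LAUNCH_UPDATE_VMSA:
    case GUEST_STATUS:
    case DBG_DECRYPT:
    case DBG_ENCRYPT:
        return true;
    default:
        return false;
    }
}

int main(void)
{
    printf("LAUNCH_START from mirror: %s\n",
           cmd_allowed_from_mirror(LAUNCH_START) ? "allowed" : "denied");
    return 0;
}
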
+ */ mutex_unlock(&kvm->lock); return 0; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 05e8d4d279..989685098b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1566,6 +1566,8 @@ static void svm_clear_vintr(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & V_IRQ_INJECTION_BITS_MASK; + + svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; } vmcb_mark_dirty(svm->vmcb, VMCB_INTR); @@ -2222,6 +2224,10 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (error_code) goto reinject; + /* All SVM instructions expect page aligned RAX */ + if (svm->vmcb->save.rax & ~PAGE_MASK) + goto reinject; + /* Decode the instruction for usage later */ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) goto reinject; @@ -4285,43 +4291,44 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) struct kvm_host_map map_save; int ret; - if (is_guest_mode(vcpu)) { - /* FED8h - SVM Guest */ - put_smstate(u64, smstate, 0x7ed8, 1); - /* FEE0h - SVM Guest VMCB Physical Address */ - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); + if (!is_guest_mode(vcpu)) + return 0; - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + /* FED8h - SVM Guest */ + put_smstate(u64, smstate, 0x7ed8, 1); + /* FEE0h - SVM Guest VMCB Physical Address */ + put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); - ret = nested_svm_vmexit(svm); - if (ret) - return ret; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; - /* - * KVM uses VMCB01 to store L1 host state while L2 runs but - * VMCB01 is going to be used during SMM and thus the state will - * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save - * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the - * format of the area is identical to guest save area offsetted - * by 0x400 (matches the offset of 'struct vmcb_save_area' - * within 'struct vmcb'). Note: HSAVE area may also be used by - * L1 hypervisor to save additional host context (e.g. KVM does - * that, see svm_prepare_guest_switch()) which must be - * preserved. - */ - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), - &map_save) == -EINVAL) - return 1; + ret = nested_svm_vmexit(svm); + if (ret) + return ret; - BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); + /* + * KVM uses VMCB01 to store L1 host state while L2 runs but + * VMCB01 is going to be used during SMM and thus the state will + * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save + * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the + * format of the area is identical to guest save area offsetted + * by 0x400 (matches the offset of 'struct vmcb_save_area' + * within 'struct vmcb'). Note: HSAVE area may also be used by + * L1 hypervisor to save additional host context (e.g. KVM does + * that, see svm_prepare_guest_switch()) which must be + * preserved. 
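
The new #GP filter above relies on SVM instructions taking a page-aligned physical address in RAX, so unaligned values are reinjected without even being decoded. The alignment test is a plain mask, as in this compilable sketch (PAGE_SIZE hard-coded to 4 KiB):

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096u
#define PAGE_MASK (~(uint64_t)(PAGE_SIZE - 1))

/* True iff addr sits on a page boundary (same test as rax & ~PAGE_MASK). */
static int page_aligned(uint64_t addr)
{
    return (addr & ~PAGE_MASK) == 0;
}

int main(void)
{
    assert(page_aligned(0x1000));
    assert(!page_aligned(0x1008));
    return 0;
}
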
+ */ + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), + &map_save) == -EINVAL) + return 1; - svm_copy_vmrun_state(map_save.hva + 0x400, - &svm->vmcb01.ptr->save); + BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); - kvm_vcpu_unmap(vcpu, &map_save, true); - } + svm_copy_vmrun_state(map_save.hva + 0x400, + &svm->vmcb01.ptr->save); + + kvm_vcpu_unmap(vcpu, &map_save, true); return 0; } @@ -4329,50 +4336,54 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map, map_save; - int ret = 0; + u64 saved_efer, vmcb12_gpa; + struct vmcb *vmcb12; + int ret; - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { - u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); - u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); - u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); - struct vmcb *vmcb12; + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return 0; - if (guest) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - return 1; + /* Non-zero if SMI arrived while vCPU was in guest mode. */ + if (!GET_SMSTATE(u64, smstate, 0x7ed8)) + return 0; - if (!(saved_efer & EFER_SVME)) - return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + return 1; - if (kvm_vcpu_map(vcpu, - gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) - return 1; + saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); + if (!(saved_efer & EFER_SVME)) + return 1; - if (svm_allocate_nested(svm)) - return 1; + vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) + return 1; - vmcb12 = map.hva; + ret = 1; + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL) + goto unmap_map; - nested_load_control_from_vmcb12(svm, &vmcb12->control); + if (svm_allocate_nested(svm)) + goto unmap_save; - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12); - kvm_vcpu_unmap(vcpu, &map, true); + /* + * Restore L1 host state from L1 HSAVE area as VMCB01 was + * used during SMM (see svm_enter_smm()) + */ - /* - * Restore L1 host state from L1 HSAVE area as VMCB01 was - * used during SMM (see svm_enter_smm()) - */ - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), - &map_save) == -EINVAL) - return 1; + svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400); - svm_copy_vmrun_state(&svm->vmcb01.ptr->save, - map_save.hva + 0x400); + /* + * Enter the nested guest now + */ - kvm_vcpu_unmap(vcpu, &map_save, true); - } - } + vmcb12 = map.hva; + nested_load_control_from_vmcb12(svm, &vmcb12->control); + ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); +unmap_save: + kvm_vcpu_unmap(vcpu, &map_save, true); +unmap_map: + kvm_vcpu_unmap(vcpu, &map, true); return ret; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 524d943f3e..128a54b1fb 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -459,7 +459,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } -int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12); +int enter_svm_guest_mode(struct kvm_vcpu *vcpu, + u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); void svm_leave_nested(struct vcpu_svm *svm); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 0dab1b7b52..ba6f99f584 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -353,14 +353,20 @@ void 
nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: - ctl_high &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; break; case MSR_IA32_VMX_ENTRY_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - ctl_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; break; case MSR_IA32_VMX_PROCBASED_CTLS2: - ctl_high &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + break; + case MSR_IA32_VMX_PINBASED_CTLS: + ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; + break; + case MSR_IA32_VMX_VMFUNC: + ctl_low &= ~EVMCS1_UNSUPPORTED_VMFUNC; break; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ccb03d6954..eedcebf580 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2583,8 +2583,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * Guest state is invalid and unrestricted guest is disabled, * which means L1 attempted VMEntry to L2 with invalid state. * Fail the VMEntry. + * + * However when force loading the guest state (SMM exit or + * loading nested state after migration, it is possible to + * have invalid guest state now, which will be later fixed by + * restoring L2 register state */ - if (CC(!vmx_guest_state_valid(vcpu))) { + if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -4351,6 +4356,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); + + to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) @@ -4899,14 +4906,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) return -ENOMEM; } -/* - * Emulate the VMXON instruction. - * Currently, we just remember that VMX is active, and do not save or even - * inspect the argument to VMXON (the so-called "VMXON pointer") because we - * do not currently need to store anything in that guest-allocated memory - * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their - * argument is different from the VMXON pointer (which the spec says they do). - */ +/* Emulate the VMXON instruction. */ static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; @@ -5903,6 +5903,12 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ return true; + case EXIT_REASON_BUS_LOCK: + /* + * At present, bus lock VM exit is never exposed to L1. + * Handle L2's bus locks in L0 directly. 
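
nested_evmcs_filter_control_msr() above now masks every control class, not just a few bits: VMX capability MSRs report allowed-0 settings in the low 32 bits and allowed-1 settings in the high 32 bits, and clearing a bit in the high half stops advertising that feature to the eVMCS guest. A sketch with an illustrative mask value (the real masks are the EVMCS1_UNSUPPORTED_* constants):

#include <stdint.h>
#include <stdio.h>

/* Illustrative mask of control bits eVMCSv1 cannot express. */
#define UNSUPPORTED_CTRL 0x00400000u

/* VMX capability MSRs: low 32 bits = allowed-0, high 32 bits = allowed-1. */
static uint64_t filter_ctl_msr(uint64_t msr)
{
    uint32_t lo = (uint32_t)msr;
    uint32_t hi = (uint32_t)(msr >> 32);

    hi &= ~UNSUPPORTED_CTRL;    /* stop advertising the feature */
    return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
    uint64_t msr = ((uint64_t)0x00c00000 << 32) | 0x16;

    printf("filtered: %#llx\n", (unsigned long long)filter_ctl_msr(msr));
    return 0;
}
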
+ */ + return true; default: break; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0c2c0d5ae8..116b08904a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1323,7 +1323,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) vmx_prepare_switch_to_host(to_vmx(vcpu)); } -static bool emulation_required(struct kvm_vcpu *vcpu) +bool vmx_emulation_required(struct kvm_vcpu *vcpu) { return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); } @@ -1367,7 +1367,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) - vmx->emulation_required = emulation_required(vcpu); + vmx->emulation_required = vmx_emulation_required(vcpu); } u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) @@ -1837,10 +1837,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &msr_info->data)) return 1; /* - * Enlightened VMCS v1 doesn't have certain fields, but buggy - * Hyper-V versions are still trying to use corresponding - * features when they are exposed. Filter out the essential - * minimum. + * Enlightened VMCS v1 doesn't have certain VMCS fields but + * instead of just ignoring the features, different Hyper-V + * versions are either trying to use them and fail or do some + * sanity checking and refuse to boot. Filter all unsupported + * features out. */ if (!msr_info->host_initiated && vmx->nested.enlightened_vmcs_enabled) @@ -3077,7 +3078,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } /* depends on vcpu->arch.cr0 to be set to a new value */ - vmx->emulation_required = emulation_required(vcpu); + vmx->emulation_required = vmx_emulation_required(vcpu); } static int vmx_get_max_tdp_level(void) @@ -3330,7 +3331,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int { __vmx_set_segment(vcpu, var, seg); - to_vmx(vcpu)->emulation_required = emulation_required(vcpu); + to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -6621,10 +6622,24 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->soft_vnmi_blocked)) vmx->loaded_vmcs->entry_time = ktime_get(); - /* Don't enter VMX if guest state is invalid, let the exit handler - start emulation until we arrive back to a valid state */ - if (vmx->emulation_required) + /* + * Don't enter VMX if guest state is invalid, let the exit handler + * start emulation until we arrive back to a valid state. Synthesize a + * consistency check VM-Exit due to invalid guest state and bail. 
+ */ + if (unlikely(vmx->emulation_required)) { + + /* We don't emulate invalid state of a nested guest */ + vmx->fail = is_guest_mode(vcpu); + + vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; + vmx->exit_reason.failed_vmentry = 1; + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + vmx->exit_qualification = ENTRY_FAIL_DEFAULT; + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + vmx->exit_intr_info = 0; return EXIT_FASTPATH_NONE; + } trace_kvm_entry(vcpu); @@ -6833,7 +6848,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) */ tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (tsx_ctrl) - vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR; } err = alloc_loaded_vmcs(&vmx->vmcs01); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 4858c5fd95..592217fd7d 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -248,12 +248,8 @@ struct vcpu_vmx { * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to * be loaded into hardware if those conditions aren't met. - * nr_active_uret_msrs tracks the number of MSRs that need to be loaded - * into hardware when running the guest. guest_uret_msrs[] is resorted - * whenever the number of "active" uret MSRs is modified. */ struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; - int nr_active_uret_msrs; bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; @@ -359,6 +355,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); +bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 28ef141557..aabd3a2ec1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1332,6 +1332,13 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; @@ -2969,7 +2976,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_set) kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); - if (v == kvm_get_vcpu(v->kvm, 0)) + if (!v->vcpu_idx) kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -7658,6 +7665,13 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) /* Process a latched INIT or SMI, if any. 
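
The tsx_ctrl hunk above fixes a find-then-use slip: once vmx_find_uret_msr() hands back a pointer, the mask must be written through that pointer, not through a loop index left over from older code. The general shape, with made-up MSR indices:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct uret_msr {
    uint32_t index;
    uint64_t mask;
};

static struct uret_msr *find_msr(struct uret_msr *tbl, size_t n, uint32_t idx)
{
    for (size_t i = 0; i < n; i++)
        if (tbl[i].index == idx)
            return &tbl[i];
    return NULL;
}

int main(void)
{
    struct uret_msr msrs[] = { { 0x10a, 0 }, { 0x122, 0 } };
    struct uret_msr *m = find_msr(msrs, 2, 0x122);

    if (m)                  /* write through the lookup result, */
        m->mask = ~0ull;    /* never through an unrelated index */
    printf("msrs[1].mask = %#llx\n", (unsigned long long)msrs[1].mask);
    return 0;
}
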
*/ kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * Even if KVM_SET_SREGS2 loaded PDPTRs out of band, + * on SMM exit we still need to reload them from + * guest memory + */ + vcpu->arch.pdptrs_from_userspace = false; } kvm_mmu_reset_context(vcpu); @@ -10652,6 +10666,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) int r; vcpu->arch.last_vmentry_cpu = -1; + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -10893,6 +10909,9 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0xfff0); + vcpu->arch.cr3 = 0; + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + /* * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions * of Intel's SDM list CD/NW as being set on INIT, but they contradict @@ -11139,9 +11158,15 @@ void kvm_arch_free_vm(struct kvm *kvm) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + int ret; + if (type) return -EINVAL; + ret = kvm_page_track_init(kvm); + if (ret) + return ret; + INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); @@ -11174,7 +11199,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); - kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); kvm_xen_init_vm(kvm); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0fe6aacef3..9ea57389c5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1341,9 +1341,10 @@ st: if (is_imm8(insn->off)) if (insn->imm == (BPF_AND | BPF_FETCH) || insn->imm == (BPF_OR | BPF_FETCH) || insn->imm == (BPF_XOR | BPF_FETCH)) { - u8 *branch_target; bool is64 = BPF_SIZE(insn->code) == BPF_DW; u32 real_src_reg = src_reg; + u32 real_dst_reg = dst_reg; + u8 *branch_target; /* * Can't be implemented with a single x86 insn. @@ -1354,11 +1355,13 @@ st: if (is_imm8(insn->off)) emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); if (src_reg == BPF_REG_0) real_src_reg = BPF_REG_AX; + if (dst_reg == BPF_REG_0) + real_dst_reg = BPF_REG_AX; branch_target = prog; /* Load old value */ emit_ldx(&prog, BPF_SIZE(insn->code), - BPF_REG_0, dst_reg, insn->off); + BPF_REG_0, real_dst_reg, insn->off); /* * Perform the (commutative) operation locally, * put the result in the AUX_REG. 
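
kvm_page_track_init() now returns the result of init_srcu_struct(), which can fail under memory pressure, and kvm_arch_init_vm() above calls it first and propagates the error instead of ignoring it. A minimal sketch of the idiom (tracker_init and vm_init are illustrative names):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct tracker { int *state; };

/* Allocation can fail, so initialization must report it. */
static int tracker_init(struct tracker *t)
{
    t->state = malloc(sizeof(*t->state));
    return t->state ? 0 : -ENOMEM;
}

static int vm_init(struct tracker *t)
{
    int ret = tracker_init(t);

    if (ret)
        return ret;    /* propagate; don't continue half-initialized */
    /* ... the rest of VM setup runs only on success ... */
    return 0;
}

int main(void)
{
    struct tracker t;

    printf("vm_init: %d\n", vm_init(&t));
    free(t.state);
    return 0;
}
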
@@ -1369,7 +1372,8 @@ st: if (is_imm8(insn->off)) add_2reg(0xC0, AUX_REG, real_src_reg)); /* Attempt to swap in new value */ err = emit_atomic(&prog, BPF_CMPXCHG, - dst_reg, AUX_REG, insn->off, + real_dst_reg, AUX_REG, + insn->off, BPF_SIZE(insn->code)); if (WARN_ON(err)) return err; @@ -1383,11 +1387,10 @@ st: if (is_imm8(insn->off)) /* Restore R0 after clobbering RAX */ emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); break; - } err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, - insn->off, BPF_SIZE(insn->code)); + insn->off, BPF_SIZE(insn->code)); if (err) return err; break; @@ -1744,7 +1747,7 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size, bool mod_ret) + struct bpf_prog *p, int stack_size, bool save_ret) { u8 *prog = *pprog; u8 *jmp_insn; @@ -1777,11 +1780,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (emit_call(&prog, p->bpf_func, prog)) return -EINVAL; - /* BPF_TRAMP_MODIFY_RETURN trampolines can modify the return + /* + * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return * of the previous call which is then passed on the stack to * the next BPF program. + * + * BPF_TRAMP_FENTRY trampoline may need to return the return + * value of BPF_PROG_TYPE_STRUCT_OPS prog. */ - if (mod_ret) + if (save_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); /* replace 2 nops with JE insn, since jmp target is known */ @@ -1828,13 +1835,15 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size) + struct bpf_tramp_progs *tp, int stack_size, + bool save_ret) { int i; u8 *prog = *pprog; for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false)) + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, + save_ret)) return -EINVAL; } *pprog = prog; @@ -1877,6 +1886,23 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, return 0; } +static bool is_valid_bpf_tramp_flags(unsigned int flags) +{ + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && + (flags & BPF_TRAMP_F_SKIP_FRAME)) + return false; + + /* + * BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, + * and it must be used alone. + */ + if ((flags & BPF_TRAMP_F_RET_FENTRY_RET) && + (flags & ~BPF_TRAMP_F_RET_FENTRY_RET)) + return false; + + return true; +} + /* Example: * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); * its 'struct btf_func_model' will be nr_args=2 @@ -1949,17 +1975,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; u8 **branches = NULL; u8 *prog; + bool save_ret; /* x86-64 supports up to 6 arguments. 
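
is_valid_bpf_tramp_flags() above centralizes two rules: BPF_TRAMP_F_RESTORE_REGS cannot be combined with BPF_TRAMP_F_SKIP_FRAME, and BPF_TRAMP_F_RET_FENTRY_RET must be used alone. The "must be used alone" test is the flags & ~FLAG idiom, shown here with stand-in flag bits:

#include <stdbool.h>
#include <stdio.h>

#define F_RESTORE_REGS   (1u << 0)
#define F_SKIP_FRAME     (1u << 1)
#define F_RET_FENTRY_RET (1u << 2)

static bool flags_valid(unsigned int flags)
{
    /* These two are mutually exclusive. */
    if ((flags & F_RESTORE_REGS) && (flags & F_SKIP_FRAME))
        return false;
    /* This one must not be combined with anything else. */
    if ((flags & F_RET_FENTRY_RET) && (flags & ~F_RET_FENTRY_RET))
        return false;
    return true;
}

int main(void)
{
    printf("%d %d\n",
           flags_valid(F_RET_FENTRY_RET),
           flags_valid(F_RET_FENTRY_RET | F_SKIP_FRAME));
    return 0;
}
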
7+ can be added in the future */ if (nr_args > 6) return -ENOTSUPP; - if ((flags & BPF_TRAMP_F_RESTORE_REGS) && - (flags & BPF_TRAMP_F_SKIP_FRAME)) + if (!is_valid_bpf_tramp_flags(flags)) return -EINVAL; - if (flags & BPF_TRAMP_F_CALL_ORIG) - stack_size += 8; /* room for return value of orig_call */ + /* room for return value of orig_call or fentry prog */ + save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); + if (save_ret) + stack_size += 8; if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; /* room for IP address argument */ @@ -2005,7 +2033,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fentry->nr_progs) - if (invoke_bpf(m, &prog, fentry, stack_size)) + if (invoke_bpf(m, &prog, fentry, stack_size, + flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; if (fmod_ret->nr_progs) { @@ -2052,7 +2081,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fexit->nr_progs) - if (invoke_bpf(m, &prog, fexit, stack_size)) { + if (invoke_bpf(m, &prog, fexit, stack_size, false)) { ret = -EINVAL; goto cleanup; } @@ -2072,9 +2101,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ret = -EINVAL; goto cleanup; } - /* restore original return value back into RAX */ - emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); } + /* restore return value of orig_call or fentry prog back into RAX */ + if (save_ret) + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 3d41a09c2c..5debe4ac6f 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -113,7 +113,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, false /* no mapping of GSI to PIRQ */); } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 static int xen_register_gsi(u32 gsi, int triggering, int polarity) { int rc, irq; @@ -261,7 +261,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return irq; } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 static bool __read_mostly pci_seg_supported = true; static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) @@ -375,10 +375,10 @@ static void xen_initdom_restore_msi_irqs(struct pci_dev *dev) WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret); } } -#else /* CONFIG_XEN_DOM0 */ +#else /* CONFIG_XEN_PV_DOM0 */ #define xen_initdom_setup_msi_irqs NULL #define xen_initdom_restore_msi_irqs NULL -#endif /* !CONFIG_XEN_DOM0 */ +#endif /* !CONFIG_XEN_PV_DOM0 */ static void xen_teardown_msi_irqs(struct pci_dev *dev) { @@ -555,7 +555,7 @@ int __init pci_xen_hvm_init(void) return 0; } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 int __init pci_xen_initial_domain(void) { int irq; @@ -583,6 +583,9 @@ int __init pci_xen_initial_domain(void) } return 0; } +#endif + +#ifdef CONFIG_XEN_DOM0 struct xen_device_domain_owner { domid_t domain; @@ -656,4 +659,4 @@ int xen_unregister_device_domain_owner(struct pci_dev *dev) return 0; } EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); -#endif +#endif /* CONFIG_XEN_DOM0 */ diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index ee2beda590..1d4a00e767 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -274,7 +274,7 @@ static struct olpc_ec_driver ec_xo1_driver = { static struct olpc_ec_driver ec_xo1_5_driver = { .ec_cmd = olpc_xo1_ec_cmd, -#ifdef CONFIG_OLPC_XO1_5_SCI +#ifdef CONFIG_OLPC_XO15_SCI /* * 
XO-1.5 EC wakeups are available when olpc-xo15-sci driver is * compiled in diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index 9ac7457f52..ed0442e354 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -16,15 +16,15 @@ /* * PVH variables. * - * pvh_bootparams and pvh_start_info need to live in the data segment since + * pvh_bootparams and pvh_start_info need to live in a data segment since * they are used after startup_{32|64}, which clear .bss, are invoked. */ -struct boot_params pvh_bootparams __section(".data"); -struct hvm_start_info pvh_start_info __section(".data"); +struct boot_params __initdata pvh_bootparams; +struct hvm_start_info __initdata pvh_start_info; -unsigned int pvh_start_info_sz = sizeof(pvh_start_info); +const unsigned int __initconst pvh_start_info_sz = sizeof(pvh_start_info); -static u64 pvh_get_root_pointer(void) +static u64 __init pvh_get_root_pointer(void) { return pvh_start_info.rsdp_paddr; } @@ -107,7 +107,7 @@ void __init __weak xen_pvh_init(struct boot_params *boot_params) BUG(); } -static void hypervisor_specific_init(bool xen_guest) +static void __init hypervisor_specific_init(bool xen_guest) { if (xen_guest) xen_pvh_init(&pvh_bootparams); diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index afc1da68b0..6bcd3d8ca6 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -43,13 +43,9 @@ config XEN_PV_SMP def_bool y depends on XEN_PV && SMP -config XEN_DOM0 - bool "Xen PV Dom0 support" - default y - depends on XEN_PV && PCI_XEN && SWIOTLB_XEN - depends on X86_IO_APIC && ACPI && PCI - help - Support running as a Xen PV Dom0 guest. +config XEN_PV_DOM0 + def_bool y + depends on XEN_PV && XEN_DOM0 config XEN_PVHVM def_bool y @@ -86,3 +82,12 @@ config XEN_PVH def_bool n help Support for running as a Xen PVH guest. + +config XEN_DOM0 + bool "Xen Dom0 support" + default XEN_PV + depends on (XEN_PV && SWIOTLB_XEN) || (XEN_PVH && X86_64) + depends on X86_IO_APIC && ACPI && PCI + select X86_X2APIC if XEN_PVH && X86_64 + help + Support running as a Xen Dom0 guest. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 40b5779fce..4953260e28 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -45,7 +45,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_DOM0) += vga.o +obj-$(CONFIG_XEN_PV_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c79bd0af2e..95d970359e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -3,6 +3,7 @@ #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG #include #endif +#include #include #include #include @@ -10,12 +11,15 @@ #include #include +#include +#include #include #include #include #include #include +#include #include "xen-ops.h" #include "smp.h" @@ -52,9 +56,6 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); DEFINE_PER_CPU(uint32_t, xen_vcpu_id); EXPORT_PER_CPU_SYMBOL(xen_vcpu_id); -enum xen_domain_type xen_domain_type = XEN_NATIVE; -EXPORT_SYMBOL_GPL(xen_domain_type); - unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; EXPORT_SYMBOL(machine_to_phys_mapping); unsigned long machine_to_phys_nr; @@ -69,10 +70,12 @@ __read_mostly int xen_have_vector_callback; EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* - * NB: needs to live in .data because it's used by xen_prepare_pvh which runs - * before clearing the bss. 
+ * NB: These need to live in .data or alike because they're used by + * xen_prepare_pvh() which runs before clearing the bss. */ -uint32_t xen_start_flags __section(".data") = 0; +enum xen_domain_type __ro_after_init xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); +uint32_t __ro_after_init xen_start_flags; EXPORT_SYMBOL(xen_start_flags); /* @@ -258,6 +261,45 @@ int xen_vcpu_setup(int cpu) return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0); } +void __init xen_banner(void) +{ + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); + struct xen_extraversion extra; + + HYPERVISOR_xen_version(XENVER_extraversion, &extra); + + pr_info("Booting kernel on %s\n", pv_info.name); + pr_info("Xen version: %u.%u%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, + xen_feature(XENFEAT_mmu_pt_update_preserve_ad) + ? " (preserve-AD)" : ""); +} + +/* Check if running on Xen version (major, minor) or later */ +bool xen_running_on_version_or_later(unsigned int major, unsigned int minor) +{ + unsigned int version; + + if (!xen_domain()) + return false; + + version = HYPERVISOR_xen_version(XENVER_version, NULL); + if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || + ((version >> 16) > major)) + return true; + return false; +} + +void __init xen_add_preferred_consoles(void) +{ + add_preferred_console("xenboot", 0, NULL); + if (!boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); + add_preferred_console("hvc", 0, NULL); + if (boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); +} + void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 6e0d0754f9..a7b7d674f5 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -109,17 +108,6 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); -static void __init xen_banner(void) -{ - unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); - struct xen_extraversion extra; - HYPERVISOR_xen_version(XENVER_extraversion, &extra); - - pr_info("Booting paravirtualized kernel on %s\n", pv_info.name); - pr_info("Xen version: %d.%d%s (preserve-AD)\n", - version >> 16, version & 0xffff, extra.extraversion); -} - static void __init xen_pv_init_platform(void) { populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); @@ -142,22 +130,6 @@ static void __init xen_pv_guest_late_init(void) #endif } -/* Check if running on Xen version (major, minor) or later */ -bool -xen_running_on_version_or_later(unsigned int major, unsigned int minor) -{ - unsigned int version; - - if (!xen_domain()) - return false; - - version = HYPERVISOR_xen_version(XENVER_version, NULL); - if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || - ((version >> 16) > major)) - return true; - return false; -} - static __read_mostly unsigned int cpuid_leaf5_ecx_val; static __read_mostly unsigned int cpuid_leaf5_edx_val; @@ -1364,7 +1336,6 @@ asmlinkage __visible void __init xen_start_kernel(void) boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; if (!xen_initial_domain()) { - add_preferred_console("xenboot", 0, NULL); if (pci_xen) x86_init.pci.arch_init = pci_xen_init; x86_platform.set_legacy_features = @@ -1409,11 +1380,7 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif } - if (!boot_params.screen_info.orig_video_isVGA) - 
add_preferred_console("tty", 0, NULL); - add_preferred_console("hvc", 0, NULL); - if (boot_params.screen_info.orig_video_isVGA) - add_preferred_console("tty", 0, NULL); + xen_add_preferred_consoles(); #ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. */ diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 0d5e34b9e6..bcae606bbc 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include @@ -18,10 +19,11 @@ /* * PVH variables. * - * The variable xen_pvh needs to live in the data segment since it is used + * The variable xen_pvh needs to live in a data segment since it is used * after startup_{32|64} is invoked, which will clear the .bss segment. */ -bool xen_pvh __section(".data") = 0; +bool __ro_after_init xen_pvh; +EXPORT_SYMBOL_GPL(xen_pvh); void __init xen_pvh_init(struct boot_params *boot_params) { @@ -36,6 +38,10 @@ void __init xen_pvh_init(struct boot_params *boot_params) pfn = __pa(hypercall_page); wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + if (xen_initial_domain()) + x86_init.oem.arch_setup = xen_add_preferred_consoles; + x86_init.oem.banner = xen_banner; + xen_efi_init(boot_params); } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 8d751939c6..3359c23573 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2398,7 +2398,7 @@ static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data) int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot, - unsigned int domid, bool no_translate, struct page **pages) + unsigned int domid, bool no_translate) { int err = 0; struct remap_data rmd; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8d7ec49a35..8bc8b72a20 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -51,6 +51,7 @@ void __init xen_remap_memory(void); phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); +void xen_banner(void); void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); @@ -109,7 +110,7 @@ static inline void xen_uninit_lock_cpu(int cpu) struct dom0_vga_console_info; -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, @@ -118,6 +119,8 @@ static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, } #endif +void xen_add_preferred_consoles(void); + void __init xen_init_apic(void); #ifdef CONFIG_XEN_EFI diff --git a/arch/xtensa/include/asm/kmem_layout.h b/arch/xtensa/include/asm/kmem_layout.h index 7cbf68ca71..6fc05cba61 100644 --- a/arch/xtensa/include/asm/kmem_layout.h +++ b/arch/xtensa/include/asm/kmem_layout.h @@ -78,7 +78,7 @@ #endif #define XCHAL_KIO_SIZE 0x10000000 -#if (!XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY) && defined(CONFIG_OF) +#if (!XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY) && defined(CONFIG_USE_OF) #define XCHAL_KIO_PADDR xtensa_get_kio_paddr() #ifndef __ASSEMBLY__ extern unsigned long xtensa_kio_paddr; diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c index 764b54bef7..15051a8a15 100644 --- a/arch/xtensa/kernel/irq.c +++ b/arch/xtensa/kernel/irq.c @@ -143,7 +143,7 @@ unsigned xtensa_get_ext_irq_no(unsigned irq) void __init init_IRQ(void) { -#ifdef CONFIG_OF 
+#ifdef CONFIG_USE_OF irqchip_init(); #else #ifdef CONFIG_HAVE_SMP diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index ed184106e4..ee9082a142 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -63,7 +63,7 @@ extern unsigned long initrd_end; extern int initrd_below_start_ok; #endif -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF void *dtb_start = __dtb_start; #endif @@ -125,7 +125,7 @@ __tagtable(BP_TAG_INITRD, parse_tag_initrd); #endif /* CONFIG_BLK_DEV_INITRD */ -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF static int __init parse_tag_fdt(const bp_tag_t *tag) { @@ -135,7 +135,7 @@ static int __init parse_tag_fdt(const bp_tag_t *tag) __tagtable(BP_TAG_FDT, parse_tag_fdt); -#endif /* CONFIG_OF */ +#endif /* CONFIG_USE_OF */ static int __init parse_tag_cmdline(const bp_tag_t* tag) { @@ -183,7 +183,7 @@ static int __init parse_bootparam(const bp_tag_t *tag) } #endif -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF #if !XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY unsigned long xtensa_kio_paddr = XCHAL_KIO_DEFAULT_PADDR; @@ -232,7 +232,7 @@ void __init early_init_devtree(void *params) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); } -#endif /* CONFIG_OF */ +#endif /* CONFIG_USE_OF */ /* * Initialize architecture. (Early stage) @@ -253,7 +253,7 @@ void __init init_arch(bp_tag_t *bp_start) if (bp_start) parse_bootparam(bp_start); -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF early_init_devtree(dtb_start); #endif diff --git a/arch/xtensa/mm/mmu.c b/arch/xtensa/mm/mmu.c index 7e4d97dc8b..38acda4f04 100644 --- a/arch/xtensa/mm/mmu.c +++ b/arch/xtensa/mm/mmu.c @@ -101,7 +101,7 @@ void init_mmu(void) void init_kio(void) { -#if XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY && defined(CONFIG_OF) +#if XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY && defined(CONFIG_USE_OF) /* * Update the IO area mapping in case xtensa_kio_paddr has changed */ diff --git a/arch/xtensa/platforms/xtfpga/setup.c b/arch/xtensa/platforms/xtfpga/setup.c index 4f7d6142d4..538e6748e8 100644 --- a/arch/xtensa/platforms/xtfpga/setup.c +++ b/arch/xtensa/platforms/xtfpga/setup.c @@ -51,8 +51,12 @@ void platform_power_off(void) void platform_restart(void) { - /* Flush and reset the mmu, simulate a processor reset, and - * jump to the reset vector. */ + /* Try software reset first. */ + WRITE_ONCE(*(u32 *)XTFPGA_SWRST_VADDR, 0xdead); + + /* If software reset did not work, flush and reset the mmu, + * simulate a processor reset, and jump to the reset vector. + */ cpu_reset(); /* control never gets here */ } @@ -66,7 +70,7 @@ void __init platform_calibrate_ccount(void) #endif -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF static void __init xtfpga_clk_setup(struct device_node *np) { @@ -284,4 +288,4 @@ static int __init xtavnet_init(void) */ arch_initcall(xtavnet_init); -#endif /* CONFIG_OF */ +#endif /* CONFIG_USE_OF */ diff --git a/block/Kconfig b/block/Kconfig index a2297edfdd..8e28ae7718 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -29,35 +29,15 @@ if BLOCK config BLK_RQ_ALLOC_TIME bool -config BLK_SCSI_REQUEST - bool - config BLK_CGROUP_RWSTAT bool -config BLK_DEV_BSG - bool "Block layer SG support v4" - default y - select BLK_SCSI_REQUEST - help - Saying Y here will enable generic SG (SCSI generic) v4 support - for any block device. - - Unlike SG v3 (aka block/scsi_ioctl.c drivers/scsi/sg.c), SG v4 - can handle complicated SCSI commands: tagged variable length cdbs - with bidirectional data transfers and generic request/response - protocols (e.g. 
Task Management Functions and SMP in Serial - Attached SCSI). - - This option is required by recent UDEV versions to properly - access device serial numbers, etc. - - If unsure, say Y. +config BLK_DEV_BSG_COMMON + tristate config BLK_DEV_BSGLIB bool "Block layer SG support v4 helper lib" - select BLK_DEV_BSG - select BLK_SCSI_REQUEST + select BLK_DEV_BSG_COMMON help Subsystems will normally enable this if needed. Users will not normally need to manually enable this. @@ -114,16 +94,6 @@ config BLK_DEV_THROTTLING_LOW Note, this is an experimental interface and could be changed someday. -config BLK_CMDLINE_PARSER - bool "Block device command line partition parser" - help - Enabling this option allows you to specify the partition layout from - the kernel boot args. This is typically of use for embedded devices - which don't otherwise have any standardized method for listing the - partitions on a block device. - - See Documentation/block/cmdline-partition.rst for more information. - config BLK_WBT bool "Enable support for block device writeback throttling" help @@ -133,6 +103,13 @@ config BLK_WBT dynamically on an algorithm loosely based on CoDel, factoring in the realtime performance of the disk. +config BLK_WBT_MQ + bool "Enable writeback throttling by default" + default y + depends on BLK_WBT + help + Enable writeback throttling by default for request-based block devices. + config BLK_CGROUP_IOLATENCY bool "Enable support for latency based cgroup IO protection" depends on BLK_CGROUP=y @@ -144,6 +121,15 @@ config BLK_CGROUP_IOLATENCY Note, this is an experimental interface and could be changed someday. +config BLK_CGROUP_FC_APPID + bool "Enable support to track FC I/O Traffic across cgroup applications" + depends on BLK_CGROUP && NVME_FC + help + Enabling this option enables the support to track FC I/O traffic across + cgroup applications. It enables the Fabric and the storage targets to + identify, monitor, and handle FC traffic based on VM tags by inserting + application specific identification into the FC frame. + config BLK_CGROUP_IOCOST bool "Enable support for cost model based cgroup IO controller" depends on BLK_CGROUP=y @@ -155,12 +141,14 @@ config BLK_CGROUP_IOCOST distributes IO capacity between different groups based on their share of the overall weight distribution. -config BLK_WBT_MQ - bool "Multiqueue writeback throttling" - default y - depends on BLK_WBT +config BLK_CGROUP_IOPRIO + bool "Cgroup I/O controller for assigning an I/O priority class" + depends on BLK_CGROUP help - Enable writeback throttling by default on multiqueue devices. + Enable the .prio interface for assigning an I/O priority class to + requests. The I/O priority class affects the order in which an I/O + scheduler and block devices process requests. Only some I/O schedulers + and some block devices support I/O priorities. 
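
Bool symbols such as BLK_CGROUP_IOPRIO above are consumed from C either through #ifdef or through the kernel's IS_ENABLED() macro from <linux/kconfig.h>, which also covers =m symbols and keeps both branches visible to the compiler. A userspace sketch with a mock IS_ENABLED (the real macro is implemented differently internally):

#include <stdio.h>

/* Userspace stand-in; in-tree code gets the real IS_ENABLED() from
 * <linux/kconfig.h> and the values from the generated autoconf header. */
#define CONFIG_BLK_CGROUP_IOPRIO 1
#define IS_ENABLED(cfg) (cfg)

static void setup_queue(void)
{
    if (IS_ENABLED(CONFIG_BLK_CGROUP_IOPRIO))
        printf("registering blk-ioprio policy\n");
    else
        printf("ioprio policy compiled out\n");
}

int main(void)
{
    setup_queue();
    return 0;
}
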
config BLK_DEBUG_FS bool "Block layer debugging information in debugfs" @@ -233,4 +221,8 @@ config BLK_MQ_RDMA config BLK_PM def_bool BLOCK && PM +# do not use in new code +config BLOCK_HOLDER_DEPRECATED + bool + source "block/Kconfig.iosched" diff --git a/block/Makefile b/block/Makefile index 8d841f5f98..41aa1ba69c 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,20 +3,21 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ - genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o + genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ + disk-events.o obj-$(CONFIG_BOUNCE) += bounce.o -obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o -obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o +obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o @@ -24,7 +25,6 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o @@ -38,3 +38,4 @@ obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o +obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/bdev.c b/block/bdev.c new file mode 100644 index 0000000000..485a258b0a --- /dev/null +++ b/block/bdev.c @@ -0,0 +1,1058 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright (C) 2016 - 2020 Christoph Hellwig + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../fs/internal.h" +#include "blk.h" + +struct bdev_inode { + struct block_device bdev; + struct inode vfs_inode; +}; + +static inline struct bdev_inode *BDEV_I(struct inode *inode) +{ + return container_of(inode, struct bdev_inode, vfs_inode); +} + +struct block_device *I_BDEV(struct inode *inode) +{ + return &BDEV_I(inode)->bdev; +} +EXPORT_SYMBOL(I_BDEV); + +static void bdev_write_inode(struct block_device *bdev) +{ + struct inode *inode = bdev->bd_inode; + int ret; + + spin_lock(&inode->i_lock); + while (inode->i_state & I_DIRTY) { + spin_unlock(&inode->i_lock); + ret = write_inode_now(inode, true); + if (ret) { + char name[BDEVNAME_SIZE]; + pr_warn_ratelimited("VFS: Dirty inode writeback failed " + "for block device %s (err=%d).\n", + bdevname(bdev, name), ret); + } + spin_lock(&inode->i_lock); + } + spin_unlock(&inode->i_lock); +} + +/* Kill _all_ buffers and pagecache , dirty or not.. 
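
BDEV_I()/I_BDEV() in the new bdev.c above are the standard embedded-object idiom: the block_device sits inside a bdev_inode next to the VFS inode, and container_of() recovers the outer structure from a pointer to either member. A self-contained version with a simplified container_of (the in-kernel macro adds type checking on top of the same pointer arithmetic):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct inode { int i_ino; };
struct block_device { int bd_openers; };

struct bdev_inode {
    struct block_device bdev;
    struct inode vfs_inode;
};

static struct block_device *I_BDEV(struct inode *inode)
{
    return &container_of(inode, struct bdev_inode, vfs_inode)->bdev;
}

int main(void)
{
    struct bdev_inode bi = { { 1 }, { 42 } };

    printf("openers=%d ino=%d\n",
           I_BDEV(&bi.vfs_inode)->bd_openers, bi.vfs_inode.i_ino);
    return 0;
}
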
*/ +static void kill_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping_empty(mapping)) + return; + + invalidate_bh_lrus(); + truncate_inode_pages(mapping, 0); +} + +/* Invalidate clean unused buffers and pagecache. */ +void invalidate_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages) { + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + } + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_invalidate_inode(mapping); +} +EXPORT_SYMBOL(invalidate_bdev); + +/* + * Drop all buffers & page cache for given bdev range. This function bails + * with error if bdev has other exclusive owner (such as filesystem). + */ +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, + loff_t lstart, loff_t lend) +{ + /* + * If we don't hold exclusive handle for the device, upgrade to it + * while we discard the buffer cache to avoid discarding buffers + * under live filesystem. + */ + if (!(mode & FMODE_EXCL)) { + int err = bd_prepare_to_claim(bdev, truncate_bdev_range); + if (err) + goto invalidate; + } + + truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); + if (!(mode & FMODE_EXCL)) + bd_abort_claiming(bdev, truncate_bdev_range); + return 0; + +invalidate: + /* + * Someone else has handle exclusively open. Try invalidating instead. + * The 'end' argument is inclusive so the rounding is safe. + */ + return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, + lstart >> PAGE_SHIFT, + lend >> PAGE_SHIFT); +} + +static void set_init_blocksize(struct block_device *bdev) +{ + unsigned int bsize = bdev_logical_block_size(bdev); + loff_t size = i_size_read(bdev->bd_inode); + + while (bsize < PAGE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} + +int set_blocksize(struct block_device *bdev, int size) +{ + /* Size must be a power of two, and between 512 and PAGE_SIZE */ + if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) + return -EINVAL; + + /* Size cannot be smaller than the size supported by the device */ + if (size < bdev_logical_block_size(bdev)) + return -EINVAL; + + /* Don't change the size if it is same as current */ + if (bdev->bd_inode->i_blkbits != blksize_bits(size)) { + sync_blockdev(bdev); + bdev->bd_inode->i_blkbits = blksize_bits(size); + kill_bdev(bdev); + } + return 0; +} + +EXPORT_SYMBOL(set_blocksize); + +int sb_set_blocksize(struct super_block *sb, int size) +{ + if (set_blocksize(sb->s_bdev, size)) + return 0; + /* If we get here, we know size is power of two + * and it's value is between 512 and PAGE_SIZE */ + sb->s_blocksize = size; + sb->s_blocksize_bits = blksize_bits(size); + return sb->s_blocksize; +} + +EXPORT_SYMBOL(sb_set_blocksize); + +int sb_min_blocksize(struct super_block *sb, int size) +{ + int minsize = bdev_logical_block_size(sb->s_bdev); + if (size < minsize) + size = minsize; + return sb_set_blocksize(sb, size); +} + +EXPORT_SYMBOL(sb_min_blocksize); + +int __sync_blockdev(struct block_device *bdev, int wait) +{ + if (!bdev) + return 0; + if (!wait) + return filemap_flush(bdev->bd_inode->i_mapping); + return filemap_write_and_wait(bdev->bd_inode->i_mapping); +} + +/* + * Write out and wait upon all the dirty data associated with a block + * device via its mapping. 
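
set_init_blocksize() above doubles the block size up from the logical block size for as long as the device size stays aligned, capping at PAGE_SIZE. Extracted as a pure function it is easy to test:

#include <stdio.h>

#define PAGE_SIZE 4096u

/* Largest power-of-two block size <= PAGE_SIZE that divides `size`,
 * starting from the device's logical block size. */
static unsigned int init_blocksize(unsigned int lbs, unsigned long long size)
{
    unsigned int bsize = lbs;

    while (bsize < PAGE_SIZE) {
        if (size & bsize)    /* the next doubling would misalign */
            break;
        bsize <<= 1;
    }
    return bsize;
}

int main(void)
{
    printf("%u\n", init_blocksize(512, 1024 * 1024));  /* 4096 */
    printf("%u\n", init_blocksize(512, 3 * 512));      /* 512  */
    return 0;
}
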
Does not take the superblock lock. + */ +int sync_blockdev(struct block_device *bdev) +{ + return __sync_blockdev(bdev, 1); +} +EXPORT_SYMBOL(sync_blockdev); + +/* + * Write out and wait upon all dirty data associated with this + * device. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_bdev(struct block_device *bdev) +{ + struct super_block *sb = get_super(bdev); + if (sb) { + int res = sync_filesystem(sb); + drop_super(sb); + return res; + } + return sync_blockdev(bdev); +} +EXPORT_SYMBOL(fsync_bdev); + +/** + * freeze_bdev -- lock a filesystem and force it into a consistent state + * @bdev: blockdevice to lock + * + * If a superblock is found on this device, we take the s_umount semaphore + * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. + */ +int freeze_bdev(struct block_device *bdev) +{ + struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (++bdev->bd_fsfreeze_count > 1) + goto done; + + sb = get_active_super(bdev); + if (!sb) + goto sync; + if (sb->s_op->freeze_super) + error = sb->s_op->freeze_super(sb); + else + error = freeze_super(sb); + deactivate_super(sb); + + if (error) { + bdev->bd_fsfreeze_count--; + goto done; + } + bdev->bd_fsfreeze_sb = sb; + +sync: + sync_blockdev(bdev); +done: + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; +} +EXPORT_SYMBOL(freeze_bdev); + +/** + * thaw_bdev -- unlock filesystem + * @bdev: blockdevice to unlock + * + * Unlocks the filesystem and marks it writeable again after freeze_bdev(). + */ +int thaw_bdev(struct block_device *bdev) +{ + struct super_block *sb; + int error = -EINVAL; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) + goto out; + + error = 0; + if (--bdev->bd_fsfreeze_count > 0) + goto out; + + sb = bdev->bd_fsfreeze_sb; + if (!sb) + goto out; + + if (sb->s_op->thaw_super) + error = sb->s_op->thaw_super(sb); + else + error = thaw_super(sb); + if (error) + bdev->bd_fsfreeze_count++; + else + bdev->bd_fsfreeze_sb = NULL; +out: + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; +} +EXPORT_SYMBOL(thaw_bdev); + +/** + * bdev_read_page() - Start reading a page from a block device + * @bdev: The device to read the page from + * @sector: The offset on the device to read the page to (need not be aligned) + * @page: The page to read + * + * On entry, the page should be locked. It will be unlocked when the page + * has been read. If the block driver implements rw_page synchronously, + * that will be true on exit from this function, but it need not be. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to read this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. 
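[Editor's illustration, not part of the patch.] A hedged usage sketch for the freeze/thaw pair above: bd_fsfreeze_count makes freezes nest, so only the final thaw_bdev() actually unfreezes the filesystem. It assumes kernel context and a valid bdev obtained from one of the blkdev_get_* helpers:

static int snapshot_while_frozen(struct block_device *bdev)
{
	int err;

	err = freeze_bdev(bdev);	/* count 0 -> 1: filesystem freezes */
	if (err)
		return err;

	err = freeze_bdev(bdev);	/* count 1 -> 2: nested, no-op */
	if (err)
		goto thaw;

	/* ... take the snapshot: writes are blocked here ... */

	thaw_bdev(bdev);		/* count 2 -> 1: still frozen */
thaw:
	thaw_bdev(bdev);		/* count 1 -> 0: really thaws */
	return err;
}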
+ */ +int bdev_read_page(struct block_device *bdev, sector_t sector, + struct page *page) +{ + const struct block_device_operations *ops = bdev->bd_disk->fops; + int result = -EOPNOTSUPP; + + if (!ops->rw_page || bdev_get_integrity(bdev)) + return result; + + result = blk_queue_enter(bdev->bd_disk->queue, 0); + if (result) + return result; + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, + REQ_OP_READ); + blk_queue_exit(bdev->bd_disk->queue); + return result; +} + +/** + * bdev_write_page() - Start writing a page to a block device + * @bdev: The device to write the page to + * @sector: The offset on the device to write the page to (need not be aligned) + * @page: The page to write + * @wbc: The writeback_control for the write + * + * On entry, the page should be locked and not currently under writeback. + * On exit, if the write started successfully, the page will be unlocked and + * under writeback. If the write failed already (eg the driver failed to + * queue the page to the device), the page will still be locked. If the + * caller is a ->writepage implementation, it will need to unlock the page. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to write this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_write_page(struct block_device *bdev, sector_t sector, + struct page *page, struct writeback_control *wbc) +{ + int result; + const struct block_device_operations *ops = bdev->bd_disk->fops; + + if (!ops->rw_page || bdev_get_integrity(bdev)) + return -EOPNOTSUPP; + result = blk_queue_enter(bdev->bd_disk->queue, 0); + if (result) + return result; + + set_page_writeback(page); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, + REQ_OP_WRITE); + if (result) { + end_page_writeback(page); + } else { + clean_page_buffers(page); + unlock_page(page); + } + blk_queue_exit(bdev->bd_disk->queue); + return result; +} + +/* + * pseudo-fs + */ + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); +static struct kmem_cache * bdev_cachep __read_mostly; + +static struct inode *bdev_alloc_inode(struct super_block *sb) +{ + struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); + + if (!ei) + return NULL; + memset(&ei->bdev, 0, sizeof(ei->bdev)); + return &ei->vfs_inode; +} + +static void bdev_free_inode(struct inode *inode) +{ + struct block_device *bdev = I_BDEV(inode); + + free_percpu(bdev->bd_stats); + kfree(bdev->bd_meta_info); + + if (!bdev_is_partition(bdev)) { + if (bdev->bd_disk && bdev->bd_disk->bdi) + bdi_put(bdev->bd_disk->bdi); + kfree(bdev->bd_disk); + } + + if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(bdev->bd_dev)); + + kmem_cache_free(bdev_cachep, BDEV_I(inode)); +} + +static void init_once(void *data) +{ + struct bdev_inode *ei = data; + + inode_init_once(&ei->vfs_inode); +} + +static void bdev_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + invalidate_inode_buffers(inode); /* is it needed here? 
*/ + clear_inode(inode); +} + +static const struct super_operations bdev_sops = { + .statfs = simple_statfs, + .alloc_inode = bdev_alloc_inode, + .free_inode = bdev_free_inode, + .drop_inode = generic_delete_inode, + .evict_inode = bdev_evict_inode, +}; + +static int bd_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC); + if (!ctx) + return -ENOMEM; + fc->s_iflags |= SB_I_CGROUPWB; + ctx->ops = &bdev_sops; + return 0; +} + +static struct file_system_type bd_type = { + .name = "bdev", + .init_fs_context = bd_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct super_block *blockdev_superblock __read_mostly; +EXPORT_SYMBOL_GPL(blockdev_superblock); + +void __init bdev_cache_init(void) +{ + int err; + static struct vfsmount *bd_mnt; + + bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), + 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), + init_once); + err = register_filesystem(&bd_type); + if (err) + panic("Cannot register bdev pseudo-fs"); + bd_mnt = kern_mount(&bd_type); + if (IS_ERR(bd_mnt)) + panic("Cannot create bdev pseudo-fs"); + blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ +} + +struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) +{ + struct block_device *bdev; + struct inode *inode; + + inode = new_inode(blockdev_superblock); + if (!inode) + return NULL; + inode->i_mode = S_IFBLK; + inode->i_rdev = 0; + inode->i_data.a_ops = &def_blk_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + + bdev = I_BDEV(inode); + mutex_init(&bdev->bd_fsfreeze_mutex); + spin_lock_init(&bdev->bd_size_lock); + bdev->bd_partno = partno; + bdev->bd_inode = inode; + bdev->bd_stats = alloc_percpu(struct disk_stats); + if (!bdev->bd_stats) { + iput(inode); + return NULL; + } + bdev->bd_disk = disk; + return bdev; +} + +void bdev_add(struct block_device *bdev, dev_t dev) +{ + bdev->bd_dev = dev; + bdev->bd_inode->i_rdev = dev; + bdev->bd_inode->i_ino = dev; + insert_inode_hash(bdev->bd_inode); +} + +long nr_blockdev_pages(void) +{ + struct inode *inode; + long ret = 0; + + spin_lock(&blockdev_superblock->s_inode_list_lock); + list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) + ret += inode->i_mapping->nrpages; + spin_unlock(&blockdev_superblock->s_inode_list_lock); + + return ret; +} + +/** + * bd_may_claim - test whether a block device can be claimed + * @bdev: block device of interest + * @whole: whole block device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Test whether @bdev can be claimed by @holder. + * + * CONTEXT: + * spin_lock(&bdev_lock). + * + * RETURNS: + * %true if @bdev can be claimed, %false otherwise. + */ +static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, + void *holder) +{ + if (bdev->bd_holder == holder) + return true; /* already a holder */ + else if (bdev->bd_holder != NULL) + return false; /* held by someone else */ + else if (whole == bdev) + return true; /* is a whole device which isn't held */ + + else if (whole->bd_holder == bd_may_claim) + return true; /* is a partition of a device that is being partitioned */ + else if (whole->bd_holder != NULL) + return false; /* is a partition of a held device */ + else + return true; /* is a partition of an un-held device */ +} + +/** + * bd_prepare_to_claim - claim a block device + * @bdev: block device of interest + * @holder: holder trying to claim @bdev + * + * Claim @bdev. 
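[Editor's illustration, not part of the patch.] The partition/whole-device interplay in bd_may_claim() above is easiest to see as a truth table. In this user-space model the sentinel plays the role the kernel gives the bd_may_claim function pointer itself: marking a whole device held only on behalf of its partitions.

#include <stdbool.h>
#include <stdio.h>

#define PART_SENTINEL ((void *)0x1)	/* stands in for bd_may_claim */

static bool may_claim(void *bdev_holder, void *whole_holder,
		      bool is_whole, void *holder)
{
	if (bdev_holder == holder)
		return true;		/* already a holder */
	if (bdev_holder)
		return false;		/* held by someone else */
	if (is_whole)
		return true;		/* unheld whole device */
	if (whole_holder == PART_SENTINEL)
		return true;		/* whole held only for partitions */
	if (whole_holder)
		return false;		/* partition of a held device */
	return true;			/* partition of an unheld device */
}

int main(void)
{
	void *fs = (void *)0x2;		/* some holder token */

	/* claim a partition while the whole disk is only partition-held */
	printf("%d\n", may_claim(NULL, PART_SENTINEL, false, fs));	/* 1 */
	/* claim a partition of an exclusively held disk */
	printf("%d\n", may_claim(NULL, fs, false, (void *)0x3));	/* 0 */
	return 0;
}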
This function fails if @bdev is already claimed by another
+ * holder and waits if another claiming is in progress. On successful return,
+ * the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+int bd_prepare_to_claim(struct block_device *bdev, void *holder)
+{
+	struct block_device *whole = bdev_whole(bdev);
+
+	if (WARN_ON_ONCE(!holder))
+		return -EINVAL;
+retry:
+	spin_lock(&bdev_lock);
+	/* if someone else claimed, fail */
+	if (!bd_may_claim(bdev, whole, holder)) {
+		spin_unlock(&bdev_lock);
+		return -EBUSY;
+	}
+
+	/* if claiming is already in progress, wait for it to finish */
+	if (whole->bd_claiming) {
+		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&bdev_lock);
+		schedule();
+		finish_wait(wq, &wait);
+		goto retry;
+	}
+
+	/* yay, all mine */
+	whole->bd_claiming = holder;
+	spin_unlock(&bdev_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
+
+static void bd_clear_claiming(struct block_device *whole, void *holder)
+{
+	lockdep_assert_held(&bdev_lock);
+	/* tell others that we're done */
+	BUG_ON(whole->bd_claiming != holder);
+	whole->bd_claiming = NULL;
+	wake_up_bit(&whole->bd_claiming, 0);
+}
+
+/**
+ * bd_finish_claiming - finish claiming of a block device
+ * @bdev: block device of interest
+ * @holder: holder that has claimed @bdev
+ *
+ * Finish exclusive open of a block device. Mark the device as exclusively
+ * open by the holder and wake up all waiters for exclusive open to finish.
+ */
+static void bd_finish_claiming(struct block_device *bdev, void *holder)
+{
+	struct block_device *whole = bdev_whole(bdev);
+
+	spin_lock(&bdev_lock);
+	BUG_ON(!bd_may_claim(bdev, whole, holder));
+	/*
+	 * Note that for a whole device bd_holders will be incremented twice,
+	 * and bd_holder will be set to bd_may_claim before being set to holder
+	 */
+	whole->bd_holders++;
+	whole->bd_holder = bd_may_claim;
+	bdev->bd_holders++;
+	bdev->bd_holder = holder;
+	bd_clear_claiming(whole, holder);
+	spin_unlock(&bdev_lock);
+}
+
+/**
+ * bd_abort_claiming - abort claiming of a block device
+ * @bdev: block device of interest
+ * @holder: holder that has claimed @bdev
+ *
+ * Abort claiming of a block device when the exclusive open failed. This can
+ * also be used when exclusive open is not actually desired and we just needed
+ * to block other exclusive openers for a while.
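[Editor's illustration, not part of the patch.] A sketch of the pattern the bd_abort_claiming() comment above describes: take the claim to fence off other exclusive openers, do the work, then abort the claim instead of finishing it. truncate_bdev_range() earlier in this file uses the same idiom.

static int with_exclusive_openers_blocked(struct block_device *bdev,
					  int (*work)(struct block_device *))
{
	int err;

	/* any unique token works as holder; a function pointer is common */
	err = bd_prepare_to_claim(bdev, with_exclusive_openers_blocked);
	if (err)
		return err;		/* -EBUSY: already exclusively held */

	err = work(bdev);		/* exclusive opens wait or fail here */

	/* the claim itself was never wanted, so abort rather than finish */
	bd_abort_claiming(bdev, with_exclusive_openers_blocked);
	return err;
}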
+ */ +void bd_abort_claiming(struct block_device *bdev, void *holder) +{ + spin_lock(&bdev_lock); + bd_clear_claiming(bdev_whole(bdev), holder); + spin_unlock(&bdev_lock); +} +EXPORT_SYMBOL(bd_abort_claiming); + +static void blkdev_flush_mapping(struct block_device *bdev) +{ + WARN_ON_ONCE(bdev->bd_holders); + sync_blockdev(bdev); + kill_bdev(bdev); + bdev_write_inode(bdev); +} + +static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) +{ + struct gendisk *disk = bdev->bd_disk; + int ret = 0; + + if (disk->fops->open) { + ret = disk->fops->open(bdev, mode); + if (ret) { + /* avoid ghost partitions on a removed medium */ + if (ret == -ENOMEDIUM && + test_bit(GD_NEED_PART_SCAN, &disk->state)) + bdev_disk_changed(disk, true); + return ret; + } + } + + if (!bdev->bd_openers) + set_init_blocksize(bdev); + if (test_bit(GD_NEED_PART_SCAN, &disk->state)) + bdev_disk_changed(disk, false); + bdev->bd_openers++; + return 0;; +} + +static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) +{ + if (!--bdev->bd_openers) + blkdev_flush_mapping(bdev); + if (bdev->bd_disk->fops->release) + bdev->bd_disk->fops->release(bdev->bd_disk, mode); +} + +static int blkdev_get_part(struct block_device *part, fmode_t mode) +{ + struct gendisk *disk = part->bd_disk; + int ret; + + if (part->bd_openers) + goto done; + + ret = blkdev_get_whole(bdev_whole(part), mode); + if (ret) + return ret; + + ret = -ENXIO; + if (!bdev_nr_sectors(part)) + goto out_blkdev_put; + + disk->open_partitions++; + set_init_blocksize(part); +done: + part->bd_openers++; + return 0; + +out_blkdev_put: + blkdev_put_whole(bdev_whole(part), mode); + return ret; +} + +static void blkdev_put_part(struct block_device *part, fmode_t mode) +{ + struct block_device *whole = bdev_whole(part); + + if (--part->bd_openers) + return; + blkdev_flush_mapping(part); + whole->bd_disk->open_partitions--; + blkdev_put_whole(whole, mode); +} + +struct block_device *blkdev_get_no_open(dev_t dev) +{ + struct block_device *bdev; + struct inode *inode; + + inode = ilookup(blockdev_superblock, dev); + if (!inode) { + blk_request_module(dev); + inode = ilookup(blockdev_superblock, dev); + if (!inode) + return NULL; + } + + /* switch from the inode reference to a device mode one: */ + bdev = &BDEV_I(inode)->bdev; + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + bdev = NULL; + iput(inode); + + if (!bdev) + return NULL; + if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) || + !try_module_get(bdev->bd_disk->fops->owner)) { + put_device(&bdev->bd_device); + return NULL; + } + + return bdev; +} + +void blkdev_put_no_open(struct block_device *bdev) +{ + module_put(bdev->bd_disk->fops->owner); + put_device(&bdev->bd_device); +} + +/** + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the block device described by device number @dev. If @mode includes + * %FMODE_EXCL, the block device is opened with exclusive access. Specifying + * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for + * the same @holder. + * + * Use this interface ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a device + * number. Everything else should use blkdev_get_by_path(). + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Reference to the block_device on success, ERR_PTR(-errno) on failure. 
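[Editor's illustration, not part of the patch.] A hedged usage sketch for the interface documented above: exclusive open by device number, with the holder token FMODE_EXCL requires and a matching mode on release. The MKDEV(8, 0) value (first SCSI disk) is illustrative only.

static int open_sda_exclusively(void)
{
	static char holder;		/* any stable address serves as token */
	struct block_device *bdev;

	bdev = blkdev_get_by_dev(MKDEV(8, 0), FMODE_READ | FMODE_EXCL,
				 &holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);	/* -ENXIO, -EBUSY, ... */

	/* ... submit bios or read through bdev->bd_inode->i_mapping ... */

	blkdev_put(bdev, FMODE_READ | FMODE_EXCL);	/* mode must match */
	return 0;
}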
+ */ +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +{ + bool unblock_events = true; + struct block_device *bdev; + struct gendisk *disk; + int ret; + + ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, + MAJOR(dev), MINOR(dev), + ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) | + ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0)); + if (ret) + return ERR_PTR(ret); + + bdev = blkdev_get_no_open(dev); + if (!bdev) + return ERR_PTR(-ENXIO); + disk = bdev->bd_disk; + + if (mode & FMODE_EXCL) { + ret = bd_prepare_to_claim(bdev, holder); + if (ret) + goto put_blkdev; + } + + disk_block_events(disk); + + mutex_lock(&disk->open_mutex); + ret = -ENXIO; + if (!disk_live(disk)) + goto abort_claiming; + if (bdev_is_partition(bdev)) + ret = blkdev_get_part(bdev, mode); + else + ret = blkdev_get_whole(bdev, mode); + if (ret) + goto abort_claiming; + if (mode & FMODE_EXCL) { + bd_finish_claiming(bdev, holder); + + /* + * Block event polling for write claims if requested. Any write + * holder makes the write_holder state stick until all are + * released. This is good enough and tracking individual + * writeable reference is too fragile given the way @mode is + * used in blkdev_get/put(). + */ + if ((mode & FMODE_WRITE) && !bdev->bd_write_holder && + (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { + bdev->bd_write_holder = true; + unblock_events = false; + } + } + mutex_unlock(&disk->open_mutex); + + if (unblock_events) + disk_unblock_events(disk); + return bdev; + +abort_claiming: + if (mode & FMODE_EXCL) + bd_abort_claiming(bdev, holder); + mutex_unlock(&disk->open_mutex); + disk_unblock_events(disk); +put_blkdev: + blkdev_put_no_open(bdev); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(blkdev_get_by_dev); + +/** + * blkdev_get_by_path - open a block device by name + * @path: path to the block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the block device described by the device file at @path. If @mode + * includes %FMODE_EXCL, the block device is opened with exclusive access. + * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may + * nest for the same @holder. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Reference to the block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder) +{ + struct block_device *bdev; + dev_t dev; + int error; + + error = lookup_bdev(path, &dev); + if (error) + return ERR_PTR(error); + + bdev = blkdev_get_by_dev(dev, mode, holder); + if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { + blkdev_put(bdev, mode); + return ERR_PTR(-EACCES); + } + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_path); + +void blkdev_put(struct block_device *bdev, fmode_t mode) +{ + struct gendisk *disk = bdev->bd_disk; + + /* + * Sync early if it looks like we're the last one. If someone else + * opens the block device between now and the decrement of bd_openers + * then we did a sync that we didn't need to, but that's not the end + * of the world and we want to avoid long (could be several minute) + * syncs while holding the mutex. + */ + if (bdev->bd_openers == 1) + sync_blockdev(bdev); + + mutex_lock(&disk->open_mutex); + if (mode & FMODE_EXCL) { + struct block_device *whole = bdev_whole(bdev); + bool bdev_free; + + /* + * Release a claim on the device. The holder fields + * are protected with bdev_lock. open_mutex is to + * synchronize disk_holder unlinking. 
+ */ + spin_lock(&bdev_lock); + + WARN_ON_ONCE(--bdev->bd_holders < 0); + WARN_ON_ONCE(--whole->bd_holders < 0); + + if ((bdev_free = !bdev->bd_holders)) + bdev->bd_holder = NULL; + if (!whole->bd_holders) + whole->bd_holder = NULL; + + spin_unlock(&bdev_lock); + + /* + * If this was the last claim, remove holder link and + * unblock evpoll if it was a write holder. + */ + if (bdev_free && bdev->bd_write_holder) { + disk_unblock_events(disk); + bdev->bd_write_holder = false; + } + } + + /* + * Trigger event checking and tell drivers to flush MEDIA_CHANGE + * event. This is to ensure detection of media removal commanded + * from userland - e.g. eject(1). + */ + disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); + + if (bdev_is_partition(bdev)) + blkdev_put_part(bdev, mode); + else + blkdev_put_whole(bdev, mode); + mutex_unlock(&disk->open_mutex); + + blkdev_put_no_open(bdev); +} +EXPORT_SYMBOL(blkdev_put); + +/** + * lookup_bdev - lookup a struct block_device by name + * @pathname: special file representing the block device + * @dev: return value of the block device's dev_t + * + * Get a reference to the blockdevice at @pathname in the current + * namespace if possible and return it. Return ERR_PTR(error) + * otherwise. + */ +int lookup_bdev(const char *pathname, dev_t *dev) +{ + struct inode *inode; + struct path path; + int error; + + if (!pathname || !*pathname) + return -EINVAL; + + error = kern_path(pathname, LOOKUP_FOLLOW, &path); + if (error) + return error; + + inode = d_backing_inode(path.dentry); + error = -ENOTBLK; + if (!S_ISBLK(inode->i_mode)) + goto out_path_put; + error = -EACCES; + if (!may_open_dev(&path)) + goto out_path_put; + + *dev = inode->i_rdev; + error = 0; +out_path_put: + path_put(&path); + return error; +} +EXPORT_SYMBOL(lookup_bdev); + +int __invalidate_device(struct block_device *bdev, bool kill_dirty) +{ + struct super_block *sb = get_super(bdev); + int res = 0; + + if (sb) { + /* + * no need to lock the super, get_super holds the + * read mutex so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb, kill_dirty); + drop_super(sb); + } + invalidate_bdev(bdev); + return res; +} +EXPORT_SYMBOL(__invalidate_device); + +void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +{ + struct inode *inode, *old_inode = NULL; + + spin_lock(&blockdev_superblock->s_inode_list_lock); + list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { + struct address_space *mapping = inode->i_mapping; + struct block_device *bdev; + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || + mapping->nrpages == 0) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * s_inode_list_lock We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * s_inode_list_lock. So we keep the reference and iput it + * later. 
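[Editor's illustration, not part of the patch.] A usage sketch for lookup_bdev() above, which resolves a device node path to a dev_t without opening anything; blkdev_get_by_path() earlier combines it with blkdev_get_by_dev(). The path is illustrative only.

static int print_devt_for(const char *path)
{
	dev_t dev;
	int err;

	err = lookup_bdev(path, &dev);	/* e.g. path = "/dev/sda1" */
	if (err)
		return err;	/* -ENOTBLK, -EACCES or a path lookup error */

	pr_info("%s is %u:%u\n", path, MAJOR(dev), MINOR(dev));
	return 0;
}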
+ */ + iput(old_inode); + old_inode = inode; + bdev = I_BDEV(inode); + + mutex_lock(&bdev->bd_disk->open_mutex); + if (bdev->bd_openers) + func(bdev, arg); + mutex_unlock(&bdev->bd_disk->open_mutex); + + spin_lock(&blockdev_superblock->s_inode_list_lock); + } + spin_unlock(&blockdev_superblock->s_inode_list_lock); + iput(old_inode); +} diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b791e2041e..85b8e1c3a7 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -547,6 +547,8 @@ static void bfq_pd_init(struct blkg_policy_data *pd) entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; + entity->last_bfqq_created = NULL; + bfqg->my_entity = entity; /* * the root_group's will be set to NULL * in bfq_init_queue() @@ -664,6 +666,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfqg_and_blkg_put(bfqq_group(bfqq)); + if (entity->parent && + entity->parent->last_bfqq_created == bfqq) + entity->parent->last_bfqq_created = NULL; + else if (bfqd->last_bfqq_created == bfqq) + bfqd->last_bfqq_created = NULL; + entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; /* pin down bfqg and its associated blkg */ diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c91dca641e..480e1a1348 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -125,6 +125,8 @@ #include #include +#include + #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" @@ -158,10 +160,9 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); -BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS \ -/* Expiration time of sync (0) and async (1) requests, in ns. */ +/* Expiration time of async (0) and sync (1) requests, in ns. */ static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; /* Maximum backwards seek (magic number lifted from CFQ), in KiB. */ @@ -363,6 +364,16 @@ static int ref_wr_duration[2]; */ static const unsigned long max_service_from_wr = 120000; +/* + * Maximum time between the creation of two queues, for stable merge + * to be activated (in ms) + */ +static const unsigned long bfq_activation_stable_merging = 600; +/* + * Minimum time to be waited before evaluating delayed stable merge (in ms) + */ +static const unsigned long bfq_late_stable_merging = 600; + #define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) @@ -371,9 +382,38 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) return bic->bfqq[is_sync]; } +static void bfq_put_stable_ref(struct bfq_queue *bfqq); + void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) { + /* + * If bfqq != NULL, then a non-stable queue merge between + * bic->bfqq and bfqq is happening here. This causes troubles + * in the following case: bic->bfqq has also been scheduled + * for a possible stable merge with bic->stable_merge_bfqq, + * and bic->stable_merge_bfqq == bfqq happens to + * hold. Troubles occur because bfqq may then undergo a split, + * thereby becoming eligible for a stable merge. Yet, if + * bic->stable_merge_bfqq points exactly to bfqq, then bfqq + * would be stably merged with itself. To avoid this anomaly, + * we cancel the stable merge if + * bic->stable_merge_bfqq == bfqq. 
+ */ bic->bfqq[is_sync] = bfqq; + + if (bfqq && bic->stable_merge_bfqq == bfqq) { + /* + * Actually, these same instructions are executed also + * in bfq_setup_cooperator, in case of abort or actual + * execution of a stable merge. We could avoid + * repeating these instructions there too, but if we + * did so, we would nest even more complexity in this + * function. + */ + bfq_put_stable_ref(bic->stable_merge_bfqq); + + bic->stable_merge_bfqq = NULL; + } } struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) @@ -1011,7 +1051,7 @@ static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) { - unsigned int old_wr_coeff = bfqq->wr_coeff; + unsigned int old_wr_coeff = 1; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); if (bic->saved_has_short_ttime) @@ -1024,9 +1064,22 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, else bfq_clear_bfqq_IO_bound(bfqq); + bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; + bfqq->inject_limit = bic->saved_inject_limit; + bfqq->decrease_time_jif = bic->saved_decrease_time_jif; + bfqq->entity.new_weight = bic->saved_weight; bfqq->ttime = bic->saved_ttime; - bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->io_start_time = bic->saved_io_start_time; + bfqq->tot_idle_time = bic->saved_tot_idle_time; + /* + * Restore weight coefficient only if low_latency is on + */ + if (bfqd->low_latency) { + old_wr_coeff = bfqq->wr_coeff; + bfqq->wr_coeff = bic->saved_wr_coeff; + } + bfqq->service_from_wr = bic->saved_service_from_wr; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; @@ -1061,7 +1114,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, static int bfqq_process_refs(struct bfq_queue *bfqq) { return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - - (bfqq->weight_counter != NULL); + (bfqq->weight_counter != NULL) - bfqq->stable_ref; } /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */ @@ -1647,6 +1700,8 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, return bfqq_weight > in_serv_weight; } +static bool bfq_better_to_idle(struct bfq_queue *bfqq); + static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, struct bfq_queue *bfqq, int old_wr_coeff, @@ -1671,19 +1726,36 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * - it is sync, * - it does not belong to a large burst, * - it has been idle for enough time or is soft real-time, - * - is linked to a bfq_io_cq (it is not shared in any sense). + * - is linked to a bfq_io_cq (it is not shared in any sense), + * - has a default weight (otherwise we assume the user wanted + * to control its weight explicitly) */ in_burst = bfq_bfqq_in_large_burst(bfqq); soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && !BFQQ_TOTALLY_SEEKY(bfqq) && !in_burst && time_is_before_jiffies(bfqq->soft_rt_next_start) && - bfqq->dispatched == 0; - *interactive = !in_burst && idle_for_long_time; + bfqq->dispatched == 0 && + bfqq->entity.new_weight == 40; + *interactive = !in_burst && idle_for_long_time && + bfqq->entity.new_weight == 40; + /* + * Merged bfq_queues are kept out of weight-raising + * (low-latency) mechanisms. The reason is that these queues + * are usually created for non-interactive and + * non-soft-real-time tasks. Yet this is not the case for + * stably-merged queues. 
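[Editor's note.] The new_weight == 40 tests in the hunk below check for the default weight. The value follows from bfq's ioprio-to-weight conversion, which lives in bfq-iosched.h rather than in this hunk; assuming that formula, weight = (IOPRIO_NR_LEVELS - ioprio) * 10, the default best-effort level 4 maps to 40. A user-space check of that assumed mapping:

#include <stdio.h>

#define IOPRIO_NR_LEVELS 8	/* as introduced by this series */

static int bfq_ioprio_to_weight(int ioprio)
{
	return (IOPRIO_NR_LEVELS - ioprio) * 10;
}

int main(void)
{
	/* the default best-effort priority level is 4 */
	printf("%d\n", bfq_ioprio_to_weight(4));	/* 40 */
	return 0;
}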
These queues are merged just because
+	 * they are created shortly after each other. So they may
+	 * easily serve the I/O of an interactive or soft-real-time
+	 * application, if the application happens to spawn multiple
+	 * processes. So let stably-merged queues enjoy weight
+	 * raising too.
+	 */
 	wr_or_deserves_wr = bfqd->low_latency &&
 		(bfqq->wr_coeff > 1 ||
 		 (bfq_bfqq_sync(bfqq) &&
-		  bfqq->bic && (*interactive || soft_rt)));
+		  (bfqq->bic || RQ_BIC(rq)->stably_merged) &&
+		  (*interactive || soft_rt)));
 
 	/*
 	 * Using the last flag, update budget and check whether bfqq
@@ -1717,17 +1789,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 
 	bfq_clear_bfqq_just_created(bfqq);
-
-	if (!bfq_bfqq_IO_bound(bfqq)) {
-		if (arrived_in_time) {
-			bfqq->requests_within_timer++;
-			if (bfqq->requests_within_timer >=
-			    bfqd->bfq_requests_within_timer)
-				bfq_mark_bfqq_IO_bound(bfqq);
-		} else
-			bfqq->requests_within_timer = 0;
-	}
-
 	if (bfqd->low_latency) {
 		if (unlikely(time_is_after_jiffies(bfqq->split_time)))
 			/* wraparound */
@@ -1755,10 +1816,10 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	bfq_add_bfqq_busy(bfqd, bfqq);
 
 	/*
-	 * Expire in-service queue only if preemption may be needed
-	 * for guarantees. In particular, we care only about two
-	 * cases. The first is that bfqq has to recover a service
-	 * hole, as explained in the comments on
+	 * Expire in-service queue if preemption may be needed for
+	 * guarantees or throughput. As for guarantees, we care
+	 * explicitly about two cases. The first is that bfqq has to
+	 * recover a service hole, as explained in the comments on
 	 * bfq_bfqq_update_budg_for_activation(), i.e., that
 	 * bfqq_wants_to_preempt is true. However, if bfqq does not
 	 * carry time-critical I/O, then bfqq's bandwidth is less
@@ -1785,11 +1846,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	 * timestamps of the in-service queue would need to be
 	 * updated, and this operation is quite costly (see the
 	 * comments on bfq_bfqq_update_budg_for_activation()).
+	 *
+	 * As for throughput, we ask bfq_better_to_idle() whether we
+	 * still need to plug I/O dispatching. If bfq_better_to_idle()
+	 * says no, then plugging is not needed any longer, either to
+	 * boost throughput or to preserve service guarantees. Then
+	 * the best option is to stop plugging I/O, as not doing so
+	 * would certainly lower throughput. We may end up in this
+	 * case if: (1) upon a dispatch attempt, we detected that it
+	 * was better to plug I/O dispatch, and to wait for a new
+	 * request to arrive for the currently in-service queue, but
+	 * (2) this switch of bfqq to busy changes the scenario.
*/ if (bfqd->in_service_queue && ((bfqq_wants_to_preempt && bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) || - bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) && + bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue) || + !bfq_better_to_idle(bfqd->in_service_queue)) && next_queue_may_preempt(bfqd)) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); @@ -1861,6 +1934,143 @@ static void bfq_reset_inject_limit(struct bfq_data *bfqd, bfqq->decrease_time_jif = jiffies; } +static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) +{ + u64 tot_io_time = now_ns - bfqq->io_start_time; + + if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq->dispatched == 0) + bfqq->tot_idle_time += + now_ns - bfqq->ttime.last_end_request; + + if (unlikely(bfq_bfqq_just_created(bfqq))) + return; + + /* + * Must be busy for at least about 80% of the time to be + * considered I/O bound. + */ + if (bfqq->tot_idle_time * 5 > tot_io_time) + bfq_clear_bfqq_IO_bound(bfqq); + else + bfq_mark_bfqq_IO_bound(bfqq); + + /* + * Keep an observation window of at most 200 ms in the past + * from now. + */ + if (tot_io_time > 200 * NSEC_PER_MSEC) { + bfqq->io_start_time = now_ns - (tot_io_time>>1); + bfqq->tot_idle_time >>= 1; + } +} + +/* + * Detect whether bfqq's I/O seems synchronized with that of some + * other queue, i.e., whether bfqq, after remaining empty, happens to + * receive new I/O only right after some I/O request of the other + * queue has been completed. We call waker queue the other queue, and + * we assume, for simplicity, that bfqq may have at most one waker + * queue. + * + * A remarkable throughput boost can be reached by unconditionally + * injecting the I/O of the waker queue, every time a new + * bfq_dispatch_request happens to be invoked while I/O is being + * plugged for bfqq. In addition to boosting throughput, this + * unblocks bfqq's I/O, thereby improving bandwidth and latency for + * bfqq. Note that these same results may be achieved with the general + * injection mechanism, but less effectively. For details on this + * aspect, see the comments on the choice of the queue for injection + * in bfq_select_queue(). + * + * Turning back to the detection of a waker queue, a queue Q is deemed + * as a waker queue for bfqq if, for three consecutive times, bfqq + * happens to become non empty right after a request of Q has been + * completed. In this respect, even if bfqq is empty, we do not check + * for a waker if it still has some in-flight I/O. In fact, in this + * case bfqq is actually still being served by the drive, and may + * receive new I/O on the completion of some of the in-flight + * requests. In particular, on the first time, Q is tentatively set as + * a candidate waker queue, while on the third consecutive time that Q + * is detected, the field waker_bfqq is set to Q, to confirm that Q is + * a waker queue for bfqq. These detection steps are performed only if + * bfqq has a long think time, so as to make it more likely that + * bfqq's I/O is actually being blocked by a synchronization. This + * last filter, plus the above three-times requirement, make false + * positives less likely. + * + * NOTE + * + * The sooner a waker queue is detected, the sooner throughput can be + * boosted by injecting I/O from the waker queue. Fortunately, + * detection is likely to be actually fast, for the following + * reasons. While blocked by synchronization, bfqq has a long think + * time. 
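[Editor's illustration, not part of the patch.] A condensed user-space model of the confirmation rule implemented by bfq_check_waker() below: a candidate must be seen three consecutive times before it is promoted to waker. The kernel additionally gates on think time, in-flight I/O and a 4 ms completion window, all omitted here.

#include <stdio.h>

struct waker_state {
	int tentative;		/* candidate waker queue id, 0 = none */
	int confirmed;		/* confirmed waker queue id, 0 = none */
	int detections;
};

static void see_completion(struct waker_state *s, int completed_queue)
{
	if (completed_queue != s->tentative) {
		s->tentative = completed_queue;	/* new candidate */
		s->detections = 1;
	} else if (++s->detections == 3) {
		s->confirmed = s->tentative;	/* promoted to waker */
		s->tentative = 0;
	}
}

int main(void)
{
	struct waker_state s = { 0, 0, 0 };

	see_completion(&s, 7);
	see_completion(&s, 7);
	see_completion(&s, 7);
	printf("confirmed waker: %d\n", s.confirmed);	/* 7 */
	return 0;
}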
This implies that bfqq's inject limit is at least equal to 1 + * (see the comments in bfq_update_inject_limit()). So, thanks to + * injection, the waker queue is likely to be served during the very + * first I/O-plugging time interval for bfqq. This triggers the first + * step of the detection mechanism. Thanks again to injection, the + * candidate waker queue is then likely to be confirmed no later than + * during the next I/O-plugging interval for bfqq. + * + * ISSUE + * + * On queue merging all waker information is lost. + */ +static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, + u64 now_ns) +{ + if (!bfqd->last_completed_rq_bfqq || + bfqd->last_completed_rq_bfqq == bfqq || + bfq_bfqq_has_short_ttime(bfqq) || + bfqq->dispatched > 0 || + now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || + bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) + return; + + if (bfqd->last_completed_rq_bfqq != + bfqq->tentative_waker_bfqq) { + /* + * First synchronization detected with a + * candidate waker queue, or with a different + * candidate waker queue from the current one. + */ + bfqq->tentative_waker_bfqq = + bfqd->last_completed_rq_bfqq; + bfqq->num_waker_detections = 1; + } else /* Same tentative waker queue detected again */ + bfqq->num_waker_detections++; + + if (bfqq->num_waker_detections == 3) { + bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; + bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * bfqq->waker_bfqq must be reset. To + * this goal, we maintain in each + * waker queue a list, woken_list, of + * all the queues that reference the + * waker queue through their + * waker_bfqq pointer. When the waker + * queue exits, the waker_bfqq pointer + * of all the queues in the woken_list + * is reset. + * + * In addition, if bfqq is already in + * the woken_list of a waker queue, + * then, before being inserted into + * the woken_list of a new waker + * queue, bfqq must be removed from + * the woken_list of the old waker + * queue. + */ + if (!hlist_unhashed(&bfqq->woken_list_node)) + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqd->last_completed_rq_bfqq->woken_list); + } +} + static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); @@ -1868,117 +2078,14 @@ static void bfq_add_request(struct request *rq) struct request *next_rq, *prev; unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; + u64 now_ns = ktime_get_ns(); bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; bfqd->queued++; if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { - /* - * Detect whether bfqq's I/O seems synchronized with - * that of some other queue, i.e., whether bfqq, after - * remaining empty, happens to receive new I/O only - * right after some I/O request of the other queue has - * been completed. We call waker queue the other - * queue, and we assume, for simplicity, that bfqq may - * have at most one waker queue. - * - * A remarkable throughput boost can be reached by - * unconditionally injecting the I/O of the waker - * queue, every time a new bfq_dispatch_request - * happens to be invoked while I/O is being plugged - * for bfqq. In addition to boosting throughput, this - * unblocks bfqq's I/O, thereby improving bandwidth - * and latency for bfqq. Note that these same results - * may be achieved with the general injection - * mechanism, but less effectively. 
For details on - * this aspect, see the comments on the choice of the - * queue for injection in bfq_select_queue(). - * - * Turning back to the detection of a waker queue, a - * queue Q is deemed as a waker queue for bfqq if, for - * two consecutive times, bfqq happens to become non - * empty right after a request of Q has been - * completed. In particular, on the first time, Q is - * tentatively set as a candidate waker queue, while - * on the second time, the flag - * bfq_bfqq_has_waker(bfqq) is set to confirm that Q - * is a waker queue for bfqq. These detection steps - * are performed only if bfqq has a long think time, - * so as to make it more likely that bfqq's I/O is - * actually being blocked by a synchronization. This - * last filter, plus the above two-times requirement, - * make false positives less likely. - * - * NOTE - * - * The sooner a waker queue is detected, the sooner - * throughput can be boosted by injecting I/O from the - * waker queue. Fortunately, detection is likely to be - * actually fast, for the following reasons. While - * blocked by synchronization, bfqq has a long think - * time. This implies that bfqq's inject limit is at - * least equal to 1 (see the comments in - * bfq_update_inject_limit()). So, thanks to - * injection, the waker queue is likely to be served - * during the very first I/O-plugging time interval - * for bfqq. This triggers the first step of the - * detection mechanism. Thanks again to injection, the - * candidate waker queue is then likely to be - * confirmed no later than during the next - * I/O-plugging interval for bfqq. - */ - if (bfqd->last_completed_rq_bfqq && - !bfq_bfqq_has_short_ttime(bfqq) && - ktime_get_ns() - bfqd->last_completion < - 200 * NSEC_PER_USEC) { - if (bfqd->last_completed_rq_bfqq != bfqq && - bfqd->last_completed_rq_bfqq != - bfqq->waker_bfqq) { - /* - * First synchronization detected with - * a candidate waker queue, or with a - * different candidate waker queue - * from the current one. - */ - bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; - - /* - * If the waker queue disappears, then - * bfqq->waker_bfqq must be reset. To - * this goal, we maintain in each - * waker queue a list, woken_list, of - * all the queues that reference the - * waker queue through their - * waker_bfqq pointer. When the waker - * queue exits, the waker_bfqq pointer - * of all the queues in the woken_list - * is reset. - * - * In addition, if bfqq is already in - * the woken_list of a waker queue, - * then, before being inserted into - * the woken_list of a new waker - * queue, bfqq must be removed from - * the woken_list of the old waker - * queue. 
- */ - if (!hlist_unhashed(&bfqq->woken_list_node)) - hlist_del_init(&bfqq->woken_list_node); - hlist_add_head(&bfqq->woken_list_node, - &bfqd->last_completed_rq_bfqq->woken_list); - - bfq_clear_bfqq_has_waker(bfqq); - } else if (bfqd->last_completed_rq_bfqq == - bfqq->waker_bfqq && - !bfq_bfqq_has_waker(bfqq)) { - /* - * synchronization with waker_bfqq - * seen for the second time - */ - bfq_mark_bfqq_has_waker(bfqq); - } - } + bfq_check_waker(bfqd, bfqq, now_ns); /* * Periodically reset inject limit, to make sure that @@ -2047,6 +2154,9 @@ static void bfq_add_request(struct request *rq) } } + if (bfq_bfqq_sync(bfqq)) + bfq_update_io_intensity(bfqq, now_ns); + elv_rb_add(&bfqq->sort_list, rq); /* @@ -2235,9 +2345,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock_irq(&bfqd->lock); if (free) blk_mq_free_request(free); - spin_unlock_irq(&bfqd->lock); return ret; } @@ -2251,6 +2361,9 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, __rq = bfq_find_rq_fmerge(bfqd, bio, q); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } @@ -2323,7 +2436,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, *next_bfqq = bfq_init_rq(next); if (!bfqq) - return; + goto remove; /* * If next and rq belong to the same bfq_queue and next is older @@ -2346,11 +2459,37 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +remove: + /* Merged request may be in the IO scheduler. Remove it. */ + if (!RB_EMPTY_NODE(&next->rb_node)) { + bfq_remove_request(next->q, next); + if (next_bfqq) + bfqg_stats_update_io_remove(bfqq_group(next_bfqq), + next->cmd_flags); + } } /* Must be called with bfqq != NULL */ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { + /* + * If bfqq has been enjoying interactive weight-raising, then + * reset soft_rt_next_start. We do it for the following + * reason. bfqq may have been conveying the I/O needed to load + * a soft real-time application. Such an application actually + * exhibits a soft real-time I/O pattern after it finishes + * loading, and finally starts doing its job. But, if bfqq has + * been receiving a lot of bandwidth so far (likely to happen + * on a fast device), then soft_rt_next_start now contains a + * high value that. So, without this reset, bfqq would be + * prevented from being possibly considered as soft_rt for a + * very long time. + */ + + if (bfqq->wr_cur_max_time != + bfqq->bfqd->bfq_wr_rt_max_time) + bfqq->soft_rt_next_start = jiffies; + if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; @@ -2369,7 +2508,7 @@ void bfq_end_wr_async_queues(struct bfq_data *bfqd, int i, j; for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) if (bfqg->async_bfqq[i][j]) bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); if (bfqg->async_idle_bfqq) @@ -2556,6 +2695,9 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, return true; } +static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, + struct bfq_queue *bfqq); + /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. 
Return @@ -2578,10 +2720,57 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, */ static struct bfq_queue * bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - void *io_struct, bool request) + void *io_struct, bool request, struct bfq_io_cq *bic) { struct bfq_queue *in_service_bfqq, *new_bfqq; + /* + * Check delayed stable merge for rotational or non-queueing + * devs. For this branch to be executed, bfqq must not be + * currently merged with some other queue (i.e., bfqq->bic + * must be non null). If we considered also merged queues, + * then we should also check whether bfqq has already been + * merged with bic->stable_merge_bfqq. But this would be + * costly and complicated. + */ + if (unlikely(!bfqd->nonrot_with_queueing)) { + /* + * Make sure also that bfqq is sync, because + * bic->stable_merge_bfqq may point to some queue (for + * stable merging) also if bic is associated with a + * sync queue, but this bfqq is async + */ + if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq && + !bfq_bfqq_just_created(bfqq) && + time_is_before_jiffies(bfqq->split_time + + msecs_to_jiffies(bfq_late_stable_merging)) && + time_is_before_jiffies(bfqq->creation_time + + msecs_to_jiffies(bfq_late_stable_merging))) { + struct bfq_queue *stable_merge_bfqq = + bic->stable_merge_bfqq; + int proc_ref = min(bfqq_process_refs(bfqq), + bfqq_process_refs(stable_merge_bfqq)); + + /* deschedule stable merge, because done or aborted here */ + bfq_put_stable_ref(stable_merge_bfqq); + + bic->stable_merge_bfqq = NULL; + + if (!idling_boosts_thr_without_issues(bfqd, bfqq) && + proc_ref > 0) { + /* next function will take at least one ref */ + struct bfq_queue *new_bfqq = + bfq_setup_merge(bfqq, stable_merge_bfqq); + + bic->stably_merged = true; + if (new_bfqq && new_bfqq->bic) + new_bfqq->bic->stably_merged = true; + return new_bfqq; + } else + return NULL; + } + } + /* * Do not perform queue merging if the device is non * rotational and performs internal queueing. 
In fact, such a @@ -2685,10 +2874,16 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; + bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; + bic->saved_inject_limit = bfqq->inject_limit; + bic->saved_decrease_time_jif = bfqq->decrease_time_jif; + bic->saved_weight = bfqq->entity.orig_weight; bic->saved_ttime = bfqq->ttime; bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_io_start_time = bfqq->io_start_time; + bic->saved_tot_idle_time = bfqq->tot_idle_time; bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); if (unlikely(bfq_bfqq_just_created(bfqq) && @@ -2711,11 +2906,23 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_wr_coeff = bfqq->wr_coeff; bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + bic->saved_service_from_wr = bfqq->service_from_wr; bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; } } + +static void +bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq) +{ + if (cur_bfqq->entity.parent && + cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq) + cur_bfqq->entity.parent->last_bfqq_created = new_bfqq; + else if (cur_bfqq->bfqd && cur_bfqq->bfqd->last_bfqq_created == cur_bfqq) + cur_bfqq->bfqd->last_bfqq_created = new_bfqq; +} + void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* @@ -2733,6 +2940,8 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq != bfqd->in_service_queue) bfq_del_bfqq_busy(bfqd, bfqq, false); + bfq_reassign_last_bfqq(bfqq, NULL); + bfq_put_queue(bfqq); } @@ -2749,6 +2958,29 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_mark_bfqq_IO_bound(new_bfqq); bfq_clear_bfqq_IO_bound(bfqq); + /* + * The processes associated with bfqq are cooperators of the + * processes associated with new_bfqq. So, if bfqq has a + * waker, then assume that all these processes will be happy + * to let bfqq's waker freely inject I/O when they have no + * I/O. + */ + if (bfqq->waker_bfqq && !new_bfqq->waker_bfqq && + bfqq->waker_bfqq != new_bfqq) { + new_bfqq->waker_bfqq = bfqq->waker_bfqq; + new_bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * new_bfqq->waker_bfqq must be reset. So insert + * new_bfqq into the woken_list of the waker. See + * bfq_check_waker for details. + */ + hlist_add_head(&new_bfqq->woken_list_node, + &new_bfqq->waker_bfqq->woken_list); + + } + /* * If bfqq is weight-raised, then let new_bfqq inherit * weight-raising. To reduce false positives, neglect the case @@ -2806,6 +3038,9 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, */ new_bfqq->pid = -1; bfqq->bic = NULL; + + bfq_reassign_last_bfqq(bfqq, new_bfqq); + bfq_release_process_ref(bfqd, bfqq); } @@ -2833,7 +3068,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * We take advantage of this function to perform an early merge * of the queues of possible cooperating processes. 
 	 */
-	new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+	new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false, bfqd->bio_bic);
 	if (new_bfqq) {
 		/*
 		 * bic still points to bfqq, then it has not yet been
@@ -3442,20 +3677,38 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
  * order until all the requests already queued in the device have been
  * served. The last sub-condition commented above somewhat mitigates
  * this problem for weight-raised queues.
+ *
+ * However, as an additional mitigation for this problem, we preserve
+ * plugging for a special symmetric case that may suddenly turn into
+ * asymmetric: the case where only bfqq is busy. In this case, not
+ * expiring bfqq does not cause any harm to any other queues in terms
+ * of service guarantees. In contrast, it avoids the following unlucky
+ * sequence of events: (1) bfqq is expired, (2) a new queue with a
+ * lower weight than bfqq becomes busy (or more queues), (3) the new
+ * queue is served until a new request arrives for bfqq, (4) when bfqq
+ * is finally served, there are so many requests of the new queue in
+ * the drive that the pending requests for bfqq take a lot of time to
+ * be served. In particular, event (2) may cause even already
+ * dispatched requests of bfqq to be delayed, inside the drive. So, to
+ * avoid this series of events, the scenario is preventively declared
+ * as asymmetric also if bfqq is the only busy queue.
 */
static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
						 struct bfq_queue *bfqq)
{
+	int tot_busy_queues = bfq_tot_busy_queues(bfqd);
+
 	/* No point in idling for bfqq if it won't get requests any longer */
 	if (unlikely(!bfqq_process_refs(bfqq)))
 		return false;
 
 	return (bfqq->wr_coeff > 1 &&
 		(bfqd->wr_busy_queues <
-		 bfq_tot_busy_queues(bfqd) ||
+		 tot_busy_queues ||
 		 bfqd->rq_in_driver >=
 		 bfqq->dispatched + 4)) ||
-		bfq_asymmetric_scenario(bfqd, bfqq);
+		bfq_asymmetric_scenario(bfqd, bfqq) ||
+		tot_busy_queues == 1;
 }
 
 static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
@@ -3939,10 +4192,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
 	    bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
 		bfq_bfqq_charge_time(bfqd, bfqq, delta);
 
-	if (reason == BFQQE_TOO_IDLE &&
-	    entity->service <= 2 * entity->budget / 10)
-		bfq_clear_bfqq_IO_bound(bfqq);
-
 	if (bfqd->low_latency && bfqq->wr_coeff == 1)
 		bfqq->last_wr_start_finish = jiffies;
 
@@ -3952,30 +4201,15 @@
 	 * If we get here, and there are no outstanding
 	 * requests, then the request pattern is isochronous
 	 * (see the comments on the function
-	 * bfq_bfqq_softrt_next_start()). Thus we can compute
-	 * soft_rt_next_start. And we do it, unless bfqq is in
-	 * interactive weight raising. We do not do it in the
-	 * latter subcase, for the following reason. bfqq may
-	 * be conveying the I/O needed to load a soft
-	 * real-time application. Such an application will
-	 * actually exhibit a soft real-time I/O pattern after
-	 * it finally starts doing its job. But, if
-	 * soft_rt_next_start is computed here for an
-	 * interactive bfqq, and bfqq had received a lot of
-	 * service before remaining with no outstanding
-	 * request (likely to happen on a fast device), then
-	 * soft_rt_next_start would be assigned such a high
-	 * value that, for a very long time, bfqq would be
-	 * prevented from being possibly considered as soft
-	 * real time.
+	 * bfq_bfqq_softrt_next_start()). Therefore we can
+	 * compute soft_rt_next_start.
* * If, instead, the queue still has outstanding * requests, then we have to wait for the completion * of all the outstanding requests to discover whether * the request pattern is actually isochronous. */ - if (bfqq->dispatched == 0 && - bfqq->wr_coeff != bfqd->bfq_wr_coeff) + if (bfqq->dispatched == 0) bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); else if (bfqq->dispatched > 0) { @@ -4419,9 +4653,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_bfqq_busy(bfqq->bic->bfqq[0]) && bfqq->bic->bfqq[0]->next_rq ? bfqq->bic->bfqq[0] : NULL; + struct bfq_queue *blocked_bfqq = + !hlist_empty(&bfqq->woken_list) ? + container_of(bfqq->woken_list.first, + struct bfq_queue, + woken_list_node) + : NULL; /* - * The next three mutually-exclusive ifs decide + * The next four mutually-exclusive ifs decide * whether to try injection, and choose the queue to * pick an I/O request from. * @@ -4454,7 +4694,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * next bfqq's I/O is brought forward dramatically, * for it is not blocked for milliseconds. * - * The third if checks whether bfqq is a queue for + * The third if checks whether there is a queue woken + * by bfqq, and currently with pending I/O. Such a + * woken queue does not steal bandwidth from bfqq, + * because it remains soon without I/O if bfqq is not + * served. So there is virtually no risk of loss of + * bandwidth for bfqq if this woken queue has I/O + * dispatched while bfqq is waiting for new I/O. + * + * The fourth if checks whether bfqq is a queue for * which it is better to avoid injection. It is so if * bfqq delivers more throughput when served without * any further I/O from other queues in the middle, or @@ -4474,11 +4722,11 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * bfq_update_has_short_ttime(), it is rather likely * that, if I/O is being plugged for bfqq and the * waker queue has pending I/O requests that are - * blocking bfqq's I/O, then the third alternative + * blocking bfqq's I/O, then the fourth alternative * above lets the waker queue get served before the * I/O-plugging timeout fires. So one may deem the * second alternative superfluous. It is not, because - * the third alternative may be way less effective in + * the fourth alternative may be way less effective in * case of a synchronization. For two main * reasons. First, throughput may be low because the * inject limit may be too low to guarantee the same @@ -4487,7 +4735,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * guarantees (the second alternative unconditionally * injects a pending I/O request of the waker queue * for each bfq_dispatch_request()). Second, with the - * third alternative, the duration of the plugging, + * fourth alternative, the duration of the plugging, * i.e., the time before bfqq finally receives new I/O, * may not be minimized, because the waker queue may * happen to be served only after other queues. 
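[Editor's illustration, not part of the patch.] The four mutually-exclusive cases described above reduce to a priority order over injection candidates. A condensed user-space model, illustrative only; the real checks additionally compare each candidate's next request against its remaining budget and several per-queue flags.

#include <stdbool.h>
#include <stddef.h>

struct cand { bool busy; bool next_rq_fits_budget; };

static bool ok(const struct cand *c)
{
	return c && c->busy && c->next_rq_fits_budget;
}

/* 0 = async queue of the same process, 1 = waker queue, 2 = queue woken
 * by bfqq, 3 = generic injection, -1 = keep waiting on bfqq itself. */
static int pick_injection(const struct cand *async, const struct cand *waker,
			  const struct cand *woken, bool idling_pays_off)
{
	if (ok(async))
		return 0;
	if (ok(waker))
		return 1;
	if (ok(woken))
		return 2;
	if (!idling_pays_off)
		return 3;
	return -1;
}

int main(void)
{
	struct cand waker = { true, true };

	return pick_injection(NULL, &waker, NULL, true);	/* 1 */
}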
@@ -4497,14 +4745,22 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= bfq_bfqq_budget_left(async_bfqq)) bfqq = bfqq->bic->bfqq[0]; - else if (bfq_bfqq_has_waker(bfqq) && + else if (bfqq->waker_bfqq && bfq_bfqq_busy(bfqq->waker_bfqq) && - bfqq->next_rq && + bfqq->waker_bfqq->next_rq && bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, bfqq->waker_bfqq) <= bfq_bfqq_budget_left(bfqq->waker_bfqq) ) bfqq = bfqq->waker_bfqq; + else if (blocked_bfqq && + bfq_bfqq_busy(blocked_bfqq) && + blocked_bfqq->next_rq && + bfq_serv_to_charge(blocked_bfqq->next_rq, + blocked_bfqq) <= + bfq_bfqq_budget_left(blocked_bfqq) + ) + bfqq = blocked_bfqq; else if (!idling_boosts_thr_without_issues(bfqd, bfqq) && (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 || !bfq_bfqq_has_short_ttime(bfqq))) @@ -4559,9 +4815,21 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->wr_cur_max_time)) { if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + - bfq_wr_duration(bfqd))) + bfq_wr_duration(bfqd))) { + /* + * Either in interactive weight + * raising, or in soft_rt weight + * raising with the + * interactive-weight-raising period + * elapsed (so no switch back to + * interactive weight raising). + */ bfq_bfqq_end_wr(bfqq); - else { + } else { /* + * soft_rt finishing while still in + * interactive period, switch back to + * interactive weight raising + */ switch_back_to_interactive_wr(bfqq, bfqd); bfqq->entity.prio_changed = 1; } @@ -4640,9 +4908,6 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - if (!atomic_read(&hctx->elevator_queued)) - return false; - /* * Avoiding lock: a race on bfqd->busy_queues should cause at * most a call to dispatch for nothing @@ -4892,7 +5157,6 @@ void bfq_put_queue(struct bfq_queue *bfqq) hlist_for_each_entry_safe(item, n, &bfqq->woken_list, woken_list_node) { item->waker_bfqq = NULL; - bfq_clear_bfqq_has_waker(item); hlist_del_init(&item->woken_list_node); } @@ -4903,6 +5167,12 @@ void bfq_put_queue(struct bfq_queue *bfqq) bfqg_and_blkg_put(bfqg); } +static void bfq_put_stable_ref(struct bfq_queue *bfqq) +{ + bfqq->stable_ref--; + bfq_put_queue(bfqq); +} + static void bfq_put_cooperator(struct bfq_queue *bfqq) { struct bfq_queue *__bfqq, *next; @@ -4959,6 +5229,24 @@ static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); + if (bic->stable_merge_bfqq) { + struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd; + + /* + * bfqd is NULL if scheduler already exited, and in + * that case this is the last time bfqq is accessed. 
+ */ + if (bfqd) { + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); + bfq_put_stable_ref(bic->stable_merge_bfqq); + spin_unlock_irqrestore(&bfqd->lock, flags); + } else { + bfq_put_stable_ref(bic->stable_merge_bfqq); + } + } + bfq_exit_icq_bfqq(bic, true); bfq_exit_icq_bfqq(bic, false); } @@ -4981,8 +5269,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) switch (ioprio_class) { default: pr_err("bdi %s: bfq: bad prio class %d\n", - bdi_dev_name(bfqq->bfqd->queue->backing_dev_info), - ioprio_class); + bdi_dev_name(bfqq->bfqd->queue->disk->bdi), + ioprio_class); fallthrough; case IOPRIO_CLASS_NONE: /* @@ -5005,19 +5293,22 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) break; } - if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio >= IOPRIO_NR_LEVELS) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR; + bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1; } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfq_log_bfqq(bfqd, bfqq, "new_ioprio %d new_weight %d", + bfqq->new_ioprio, bfqq->entity.new_weight); bfqq->entity.prio_changed = 1; } static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, - struct bfq_io_cq *bic); + struct bfq_io_cq *bic, + bool respawn); static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) { @@ -5037,7 +5328,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { bfq_release_process_ref(bfqd, bfqq); - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); + bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true); bic_set_bfqq(bic, bfqq, false); } @@ -5049,6 +5340,8 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, pid_t pid, int is_sync) { + u64 now_ns = ktime_get_ns(); + RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); @@ -5076,7 +5369,11 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_clear_bfqq_sync(bfqq); /* set end request to minus infinity from now */ - bfqq->ttime.last_end_request = ktime_get_ns() + 1; + bfqq->ttime.last_end_request = now_ns + 1; + + bfqq->creation_time = jiffies; + + bfqq->io_start_time = now_ns; bfq_mark_bfqq_IO_bound(bfqq); @@ -5114,7 +5411,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, case IOPRIO_CLASS_RT: return &bfqg->async_bfqq[0][ioprio]; case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; + ioprio = IOPRIO_BE_NORM; fallthrough; case IOPRIO_CLASS_BE: return &bfqg->async_bfqq[1][ioprio]; @@ -5125,9 +5422,156 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, } } +static struct bfq_queue * +bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, + struct bfq_queue *last_bfqq_created) +{ + struct bfq_queue *new_bfqq = + bfq_setup_merge(bfqq, last_bfqq_created); + + if (!new_bfqq) + return bfqq; + + if (new_bfqq->bic) + new_bfqq->bic->stably_merged = true; + bic->stably_merged = true; + + /* + * Reusing merge functions. This implies that + * bfqq->bic must be set too, for + * bfq_merge_bfqqs to correctly save bfqq's + * state before killing it. 
+ */ + bfqq->bic = bic; + bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); + + return new_bfqq; +} + +/* + * Many throughput-sensitive workloads are made of several parallel + * I/O flows, with all flows generated by the same application, or + * more generically by the same task (e.g., system boot). The most + * counterproductive action with these workloads is plugging I/O + * dispatch when one of the bfq_queues associated with these flows + * remains temporarily empty. + * + * To avoid this plugging, BFQ has been using a burst-handling + * mechanism for years now. This mechanism has proven effective for + * throughput, and not detrimental for service guarantees. The + * following function pushes this mechanism a little bit further, + * building on the following two facts. + * + * First, all the I/O flows of the same application or task + * contribute to the execution/completion of that common application + * or task. So the performance figures that matter are total + * throughput of the flows and task-wide I/O latency. In particular, + * these flows do not need to be protected from each other, in terms + * of individual bandwidth or latency. + * + * Second, the above fact holds regardless of the number of flows. + * + * Putting these two facts together, this commit stably merges the + * bfq_queues associated with these I/O flows, i.e., with the + * processes that generate these I/O flows, regardless of how many + * processes are involved. + * + * To decide whether a set of bfq_queues is actually associated with + * the I/O flows of a common application or task, and to merge these + * queues stably, this function operates as follows: given a bfq_queue, + * say Q2, currently being created, and the last bfq_queue, say Q1, + * created before Q2, Q2 is merged stably with Q1 if + * - very little time has elapsed since when Q1 was created + * - Q2 has the same ioprio as Q1 + * - Q2 belongs to the same group as Q1 + * + * Merging bfq_queues also reduces scheduling overhead. A fio test + * with ten random readers on /dev/nullb shows a throughput boost of + * 40% on a quadcore. Since BFQ's execution time amounts to ~50% of + * the total per-request processing time, the above throughput boost + * implies that BFQ's overhead is reduced by more than 50%. + * + * This new mechanism most certainly obsoletes the current + * burst-handling heuristics. We keep those heuristics for the moment. + */ +static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + struct bfq_queue **source_bfqq = bfqq->entity.parent ? + &bfqq->entity.parent->last_bfqq_created : + &bfqd->last_bfqq_created; + + struct bfq_queue *last_bfqq_created = *source_bfqq; + + /* + * If last_bfqq_created has not been set yet, then init it. If + * it has been set already, but too long ago, then move it + * forward to bfqq. Finally, move also if bfqq belongs to a + * different group than last_bfqq_created, or if bfqq has a + * different ioprio or ioprio_class. If none of these + * conditions holds true, then try an early stable merge or + * schedule a delayed stable merge. + * + * A delayed merge is scheduled (instead of performing an + * early merge), in case bfqq might soon prove to be more + * throughput-beneficial if not merged. Currently this is + * possible only if bfqd is rotational with no queueing. For + * such a drive, not merging bfqq is better for throughput if + * bfqq happens to contain sequential I/O.
So, we wait a + * little bit for enough I/O to flow through bfqq. After that, + * if such an I/O is sequential, then the merge is + * canceled. Otherwise the merge is finally performed. + */ + if (!last_bfqq_created || + time_before(last_bfqq_created->creation_time + + msecs_to_jiffies(bfq_activation_stable_merging), + bfqq->creation_time) || + bfqq->entity.parent != last_bfqq_created->entity.parent || + bfqq->ioprio != last_bfqq_created->ioprio || + bfqq->ioprio_class != last_bfqq_created->ioprio_class) + *source_bfqq = bfqq; + else if (time_after_eq(last_bfqq_created->creation_time + + bfqd->bfq_burst_interval, + bfqq->creation_time)) { + if (likely(bfqd->nonrot_with_queueing)) + /* + * With this type of drive, leaving + * bfqq alone may provide no + * throughput benefits compared with + * merging bfqq. So merge bfqq now. + */ + bfqq = bfq_do_early_stable_merge(bfqd, bfqq, + bic, + last_bfqq_created); + else { /* schedule tentative stable merge */ + /* + * get reference on last_bfqq_created, + * to prevent it from being freed, + * until we decide whether to merge + */ + last_bfqq_created->ref++; + /* + * need to keep track of stable refs, to + * compute process refs correctly + */ + last_bfqq_created->stable_ref++; + /* + * Record the bfqq to merge to. + */ + bic->stable_merge_bfqq = last_bfqq_created; + } + } + + return bfqq; +} + + static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, - struct bfq_io_cq *bic) + struct bfq_io_cq *bic, + bool respawn) { const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); @@ -5185,7 +5629,10 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, out: bfqq->ref++; /* get a process reference to this queue */ - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); + + if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn) + bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic); + rcu_read_unlock(); return bfqq; } @@ -5194,11 +5641,19 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_ttime *ttime = &bfqq->ttime; - u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + u64 elapsed; + /* + * We are really interested in how long it takes for the queue to + * become busy when there is no outstanding IO for this queue. So + * ignore cases when the bfq queue already has IO queued. + */ + if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) + return; + elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); - ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; + ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ttime->ttime_samples); @@ -5213,8 +5668,26 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->wr_coeff > 1 && bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - BFQQ_TOTALLY_SEEKY(bfqq)) - bfq_bfqq_end_wr(bfqq); + BFQQ_TOTALLY_SEEKY(bfqq)) { + if (time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + /* + * In soft_rt weight raising with the + * interactive-weight-raising period + * elapsed (so no switch back to + * interactive weight raising).
+ */ + bfq_bfqq_end_wr(bfqq); + } else { /* + * stopping soft_rt weight raising + * while still in interactive period, + * switch back to interactive weight + * raising + */ + switch_back_to_interactive_wr(bfqq, bfqd); + bfqq->entity.prio_changed = 1; + } + } } static void bfq_update_has_short_ttime(struct bfq_data *bfqd, @@ -5238,12 +5711,13 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, return; /* Think time is infinite if no process is linked to - * bfqq. Otherwise check average think time to - * decide whether to mark as has_short_ttime + * bfqq. Otherwise check average think time to decide whether + * to mark as has_short_ttime. To this end, compare average + * think time with half the I/O-plugging timeout. */ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || (bfq_sample_valid(bfqq->ttime.ttime_samples) && - bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) + bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle>>1)) has_short_ttime = false; state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq); @@ -5408,7 +5882,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq), - *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true, + RQ_BIC(rq)); bool waiting, idle_timer_disabled = false; if (new_bfqq) { @@ -5497,24 +5972,67 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct bfq_queue *bfqq; bool idle_timer_disabled = false; unsigned int cmd_flags; + LIST_HEAD(free); #ifdef CONFIG_BFQ_GROUP_IOSCHED if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) bfqg_stats_update_legacy_io(q, rq); #endif spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + blk_mq_free_requests(&free); return; } spin_unlock_irq(&bfqd->lock); - blk_mq_sched_request_inserted(rq); + trace_block_rq_insert(rq); spin_lock_irq(&bfqd->lock); bfqq = bfq_init_rq(rq); + + /* + * Reqs with at_head or passthrough flags set are to be put + * directly into the dispatch list. Additional case for putting rq + * directly into the dispatch queue: the only active + * bfq_queues are bfqq and either its waker bfq_queue or one + * of its woken bfq_queues. The rationale behind this + * additional condition is as follows: + * - consider a bfq_queue, say Q1, detected as a waker of + * another bfq_queue, say Q2 + * - by definition of a waker, Q1 blocks the I/O of Q2, i.e., + * some I/O of Q1 needs to be completed for new I/O of Q2 + * to arrive. A notable example of a waker is journald + * - so, Q1 and Q2 are in every respect the queues of two + * cooperating processes (or of two cooperating sets of + * processes): the goal of Q1's I/O is doing what needs to + * be done so that new Q2's I/O can finally be + * issued. Therefore, if the service of Q1's I/O is delayed, + * then Q2's I/O is delayed too. Conversely, if Q2's I/O is + * delayed, the goal of Q1's I/O is hindered. + * - as a consequence, if some I/O of Q1/Q2 arrives while + * Q2/Q1 is the only queue in service, there is absolutely + * no point in delaying the service of such an I/O.
The + * only possible result is a throughput loss + * - so, when the above condition holds, the best option is to + * have the new I/O dispatched as soon as possible + * - the most effective and efficient way to attain the above + * goal is to put the new I/O directly in the dispatch + * list + * - as an additional restriction, Q1 and Q2 must be the only + * busy queues for this commit to put the I/O of Q2/Q1 in + * the dispatch list. This is necessary, because, if also + * other queues are waiting for service, then putting new + * I/O directly in the dispatch list may evidently cause a + * violation of service guarantees for the other queues + */ + if (!bfqq || + (bfqq != bfqd->in_service_queue && + bfqd->in_service_queue != NULL && + bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) && + (bfqq->waker_bfqq == bfqd->in_service_queue || + bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) { if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); else @@ -5557,7 +6075,6 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); bfq_insert_request(hctx, rq, at_head); - atomic_inc(&hctx->elevator_queued); } } @@ -5655,7 +6172,19 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) 1UL<<(BFQ_RATE_SHIFT - 10)) bfq_update_rate_reset(bfqd, NULL); bfqd->last_completion = now_ns; - bfqd->last_completed_rq_bfqq = bfqq; + /* + * Shared queues are likely to receive I/O at a high + * rate. This may deceptively let them be considered as wakers + * of other queues. But a false waker will unjustly steal + * bandwidth from its supposedly woken queue. So considering + * also shared queues in the waking mechanism may cause more + * control troubles than throughput benefits. Then reset + * last_completed_rq_bfqq if bfqq is a shared queue. + */ + if (!bfq_bfqq_coop(bfqq)) + bfqd->last_completed_rq_bfqq = bfqq; + else + bfqd->last_completed_rq_bfqq = NULL; /* * If we are waiting to discover whether the request pattern @@ -5898,6 +6427,7 @@ static void bfq_finish_requeue_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd; + unsigned long flags; /* * rq either is not associated with any icq, or is an already @@ -5915,40 +6445,15 @@ static void bfq_finish_requeue_request(struct request *rq) rq->io_start_time_ns, rq->cmd_flags); + spin_lock_irqsave(&bfqd->lock, flags); if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - if (rq == bfqd->waited_rq) bfq_update_inject_limit(bfqd, bfqq); bfq_completed_request(bfqq, bfqd); - bfq_finish_requeue_request_body(bfqq); - atomic_dec(&rq->mq_hctx->elevator_queued); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, - * in which case we need to remove it (this should - * never happen in case of requeue). And we cannot - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. - * This situation seems to occur only in process - * context, as a consequence of a merge. In the - * current version of the code, this implies that the - * lock is held.
- */ - - if (!RB_EMPTY_NODE(&rq->rb_node)) { - bfq_remove_request(rq->q, rq); - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } - bfq_finish_requeue_request_body(bfqq); } + bfq_finish_requeue_request_body(bfqq); + spin_unlock_irqrestore(&bfqd->lock, flags); /* * Reset private fields. In case of a requeue, this allows @@ -6013,7 +6518,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, if (bfqq) bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { @@ -6134,8 +6639,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) if (likely(!new_queue)) { /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && + !bic->stably_merged) { + struct bfq_queue *old_bfqq = bfqq; /* Update bic before losing reference to bfqq */ if (bfq_bfqq_in_large_burst(bfqq)) @@ -6144,11 +6650,24 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) bfqq = bfq_split_bfqq(bic, bfqq); split = true; - if (!bfqq) + if (!bfqq) { bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); - else + bfqq->waker_bfqq = old_bfqq->waker_bfqq; + bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * new_bfqq->waker_bfqq must be + * reset. So insert new_bfqq into the + * woken_list of the waker. See + * bfq_check_waker for details. + */ + if (bfqq->waker_bfqq) + hlist_add_head(&bfqq->woken_list_node, + &bfqq->waker_bfqq->woken_list); + } else bfqq_already_existing = true; } } @@ -6306,7 +6825,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) int i, j; for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); @@ -6489,8 +7008,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_slice_idle = bfq_slice_idle; bfqd->bfq_timeout = bfq_timeout; - bfqd->bfq_requests_within_timer = 120; - bfqd->bfq_large_burst_thresh = 8; bfqd->bfq_burst_interval = msecs_to_jiffies(180); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 7038952245..a73488eec8 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -197,6 +197,9 @@ struct bfq_entity { /* flag, set if the entity is counted in groups_with_pending_reqs */ bool in_groups_with_pending_reqs; + + /* last child queue of entity created (for non-leaf entities) */ + struct bfq_queue *last_bfqq_created; }; struct bfq_group; @@ -230,6 +233,8 @@ struct bfq_ttime { struct bfq_queue { /* reference counter */ int ref; + /* counter of references from other queues for delayed stable merge */ + int stable_ref; /* parent bfq_data */ struct bfq_data *bfqd; @@ -291,6 +296,11 @@ struct bfq_queue { /* associated @bfq_ttime struct */ struct bfq_ttime ttime; + /* when bfqq started to do I/O within the last observation window */ + u64 io_start_time; + /* how long bfqq has remained empty during the last observ. 
window */ + u64 tot_idle_time; + /* bit vector: a 1 for each seeky requests in history */ u32 seek_history; @@ -360,6 +370,8 @@ struct bfq_queue { unsigned long first_IO_time; /* time of first I/O for this queue */ + unsigned long creation_time; /* when this queue is created */ + /* max service rate measured so far */ u32 max_service_rate; @@ -371,6 +383,11 @@ struct bfq_queue { * bfq_select_queue(). */ struct bfq_queue *waker_bfqq; + /* pointer to the curr. tentative waker queue, see bfq_check_waker() */ + struct bfq_queue *tentative_waker_bfqq; + /* number of times the same tentative waker has been detected */ + unsigned int num_waker_detections; + /* node for woken_list, see below */ struct hlist_node woken_list_node; /* @@ -407,6 +424,9 @@ struct bfq_io_cq { */ bool saved_IO_bound; + u64 saved_io_start_time; + u64 saved_tot_idle_time; + /* * Same purpose as the previous fields for the value of the * field keeping the queue's belonging to a large burst @@ -432,9 +452,20 @@ struct bfq_io_cq { */ unsigned long saved_wr_coeff; unsigned long saved_last_wr_start_finish; + unsigned long saved_service_from_wr; unsigned long saved_wr_start_at_switch_to_srt; unsigned int saved_wr_cur_max_time; struct bfq_ttime saved_ttime; + + /* Save also injection state */ + u64 saved_last_serv_time_ns; + unsigned int saved_inject_limit; + unsigned long saved_decrease_time_jif; + + /* candidate queue for a stable merge (due to close creation time) */ + struct bfq_queue *stable_merge_bfqq; + + bool stably_merged; /* non splittable if true */ }; /** @@ -559,6 +590,9 @@ struct bfq_data { /* bfqq owning the last completed rq */ struct bfq_queue *last_completed_rq_bfqq; + /* last bfqq created, among those in the root group */ + struct bfq_queue *last_bfqq_created; + /* time of last transition from empty to non-empty (ns) */ u64 last_empty_occupied_ns; @@ -641,14 +675,6 @@ struct bfq_data { */ unsigned int bfq_timeout; - /* - * Number of consecutive requests that must be issued within - * the idle time slice to set again idling to a queue which - * was marked as non-I/O-bound (see the definition of the - * IO_bound flag for further details). - */ - unsigned int bfq_requests_within_timer; - /* * Force device idling whenever needed to provide accurate * service guarantees, without caring about throughput @@ -770,7 +796,6 @@ enum bfqq_state_flags { */ BFQQF_coop, /* bfqq is shared */ BFQQF_split_coop, /* shared bfqq will be split */ - BFQQF_has_waker /* bfqq has a waker queue */ }; #define BFQ_BFQQ_FNS(name) \ @@ -790,7 +815,6 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); -BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS /* Expiration reasons. 
*/ @@ -907,7 +931,7 @@ struct bfq_group { void *bfqd; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct bfq_entity *my_entity; @@ -924,15 +948,13 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct rb_root rq_pos_tree; }; #endif -struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - /* --------------- main algorithm interface ----------------- */ #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 26776bdbdf..b74cc0da11 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -137,9 +137,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, sd->next_in_service = next_in_service; - if (!next_in_service) - return parent_sched_may_change; - return parent_sched_may_change; } @@ -508,7 +505,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, */ unsigned short bfq_ioprio_to_weight(int ioprio) { - return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; + return (IOPRIO_NR_LEVELS - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; } /** @@ -517,12 +514,12 @@ unsigned short bfq_ioprio_to_weight(int ioprio) * * To preserve as much as possible the old only-ioprio user interface, * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. + * larger than IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF. */ static unsigned short bfq_weight_to_ioprio(int weight) { return max_t(int, 0, - IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight); + IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight); } static void bfq_get_entity(struct bfq_entity *entity) @@ -1709,4 +1706,12 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; + + /* Move bfqq to the head of the woken list of its waker */ + if (!hlist_unhashed(&bfqq->woken_list_node) && + &bfqq->woken_list_node != bfqq->waker_bfqq->woken_list.first) { + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqq->waker_bfqq->woken_list); + } } diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 9ffd7e2895..6b47cddbbc 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -14,8 +14,6 @@ #include #include "blk.h" -#define BIP_INLINE_VECS 4 - static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; @@ -30,7 +28,7 @@ static void __bio_integrity_free(struct bio_set *bs, if (bs && mempool_initialized(&bs->bio_integrity_pool)) { if (bip->bip_vec) bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_slab); + bip->bip_max_vcnt); mempool_free(bip, &bs->bio_integrity_pool); } else { kfree(bip); @@ -63,7 +61,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, inline_vecs = nr_vecs; } else { bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIP_INLINE_VECS; + inline_vecs = BIO_INLINE_VECS; } if (unlikely(!bip)) @@ -72,14 +70,11 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, memset(bip, 0, sizeof(*bip)); if (nr_vecs > inline_vecs) { - unsigned long idx = 0; - - bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, - &bs->bvec_integrity_pool); + bip->bip_max_vcnt = nr_vecs; + bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, 
+ &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; - bip->bip_max_vcnt = bvec_nr_vecs(idx); - bip->bip_slab = idx; } else { bip->bip_vec = bip->bip_inline_vecs; bip->bip_max_vcnt = inline_vecs; @@ -109,8 +104,7 @@ void bio_integrity_free(struct bio *bio) struct bio_set *bs = bio->bi_pool; if (bip->bip_flags & BIP_BLOCK_INTEGRITY) - kfree(page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset); + kfree(bvec_virt(bip->bip_vec)); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; @@ -140,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; if (bip->bip_vcnt && - bvec_gap_to_prev(bio->bi_disk->queue, + bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue, &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; @@ -162,33 +156,29 @@ EXPORT_SYMBOL(bio_integrity_add_page); static blk_status_t bio_integrity_process(struct bio *bio, struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct blk_integrity_iter iter; struct bvec_iter bviter; struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); blk_status_t ret = BLK_STS_OK; - void *prot_buf = page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset; - iter.disk_name = bio->bi_disk->disk_name; + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; - iter.prot_buf = prot_buf; + iter.prot_buf = bvec_virt(bip->bip_vec); __bio_for_each_segment(bv, bio, bviter, *proc_iter) { - void *kaddr = kmap_atomic(bv.bv_page); + void *kaddr = bvec_kmap_local(&bv); - iter.data_buf = kaddr + bv.bv_offset; + iter.data_buf = kaddr; iter.data_size = bv.bv_len; - ret = proc_fn(&iter); - if (ret) { - kunmap_atomic(kaddr); - return ret; - } + kunmap_local(kaddr); + + if (ret) + break; - kunmap_atomic(kaddr); } return ret; } @@ -208,8 +198,7 @@ static blk_status_t bio_integrity_process(struct bio *bio, bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); - struct request_queue *q = bio->bi_disk->queue; + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); void *buf; unsigned long start, end; unsigned int len, nr_pages; @@ -243,7 +232,7 @@ bool bio_integrity_prep(struct bio *bio) /* Allocate kernel buffer for protection data */ len = intervals * bi->tuple_size; - buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); + buf = kmalloc(len, GFP_NOIO); status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); @@ -329,7 +318,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); /* * At the moment verify is called bio's iterator was advanced @@ -355,7 +344,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) */ bool __bio_integrity_endio(struct bio *bio) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && @@ -381,7 +370,7 @@ bool 
__bio_integrity_endio(struct bio *bio) void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); bip->bip_iter.bi_sector += bytes_done >> 9; @@ -397,7 +386,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) void bio_integrity_trim(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); } @@ -470,6 +459,6 @@ void __init bio_integrity_init(void) bip_slab = kmem_cache_create("bio_integrity_payload", sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIP_INLINE_VECS, + sizeof(struct bio_vec) * BIO_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } diff --git a/block/bio.c b/block/bio.c index 9c931df2d9..a6fb6a0b42 100644 --- a/block/bio.c +++ b/block/bio.c @@ -19,27 +19,45 @@ #include #include #include +#include #include #include "blk.h" #include "blk-rq-qos.h" -/* - * Test patch to inline a certain number of bi_io_vec's inside the bio - * itself, to shrink a bio data allocation from two mempool calls to one - */ -#define BIO_INLINE_VECS 4 - -/* - * if you change this list, also change bvec_alloc or things will - * break badly! cannot be bigger than what you can fit into an - * unsigned short - */ -#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n } -static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = { - BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max), +struct bio_alloc_cache { + struct bio_list free_list; + unsigned int nr; }; -#undef BV + +static struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +} bvec_slabs[] __read_mostly = { + { .nr_vecs = 16, .name = "biovec-16" }, + { .nr_vecs = 64, .name = "biovec-64" }, + { .nr_vecs = 128, .name = "biovec-128" }, + { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" }, +}; + +static struct biovec_slab *biovec_slab(unsigned short nr_vecs) +{ + switch (nr_vecs) { + /* smaller bios use inline vecs */ + case 5 ... 16: + return &bvec_slabs[0]; + case 17 ... 64: + return &bvec_slabs[1]; + case 65 ... 128: + return &bvec_slabs[2]; + case 129 ... 
BIO_MAX_VECS: + return &bvec_slabs[3]; + default: + BUG(); + return NULL; + } +} /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by @@ -58,178 +76,133 @@ struct bio_slab { char name[8]; }; static DEFINE_MUTEX(bio_slab_lock); -static struct bio_slab *bio_slabs; -static unsigned int bio_slab_nr, bio_slab_max; +static DEFINE_XARRAY(bio_slabs); -static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +static struct bio_slab *create_bio_slab(unsigned int size) { - unsigned int sz = sizeof(struct bio) + extra_size; - struct kmem_cache *slab = NULL; - struct bio_slab *bslab, *new_bio_slabs; - unsigned int new_bio_slab_max; - unsigned int i, entry = -1; + struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); + + if (!bslab) + return NULL; + + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); + bslab->slab = kmem_cache_create(bslab->name, size, + ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL); + if (!bslab->slab) + goto fail_alloc_slab; + + bslab->slab_ref = 1; + bslab->slab_size = size; + + if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) + return bslab; + + kmem_cache_destroy(bslab->slab); + +fail_alloc_slab: + kfree(bslab); + return NULL; +} + +static inline unsigned int bs_bio_slab_size(struct bio_set *bs) +{ + return bs->front_pad + sizeof(struct bio) + bs->back_pad; +} + +static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) +{ + unsigned int size = bs_bio_slab_size(bs); + struct bio_slab *bslab; mutex_lock(&bio_slab_lock); - - i = 0; - while (i < bio_slab_nr) { - bslab = &bio_slabs[i]; - - if (!bslab->slab && entry == -1) - entry = i; - else if (bslab->slab_size == sz) { - slab = bslab->slab; - bslab->slab_ref++; - break; - } - i++; - } - - if (slab) - goto out_unlock; - - if (bio_slab_nr == bio_slab_max && entry == -1) { - new_bio_slab_max = bio_slab_max << 1; - new_bio_slabs = krealloc(bio_slabs, - new_bio_slab_max * sizeof(struct bio_slab), - GFP_KERNEL); - if (!new_bio_slabs) - goto out_unlock; - bio_slab_max = new_bio_slab_max; - bio_slabs = new_bio_slabs; - } - if (entry == -1) - entry = bio_slab_nr++; - - bslab = &bio_slabs[entry]; - - snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); - slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN, - SLAB_HWCACHE_ALIGN, NULL); - if (!slab) - goto out_unlock; - - bslab->slab = slab; - bslab->slab_ref = 1; - bslab->slab_size = sz; -out_unlock: + bslab = xa_load(&bio_slabs, size); + if (bslab) + bslab->slab_ref++; + else + bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); - return slab; + + if (bslab) + return bslab->slab; + return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; - unsigned int i; + unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); - for (i = 0; i < bio_slab_nr; i++) { - if (bs->bio_slab == bio_slabs[i].slab) { - bslab = &bio_slabs[i]; - break; - } - } - + bslab = xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; + WARN_ON_ONCE(bslab->slab != bs->bio_slab); + WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; + xa_erase(&bio_slabs, slab_size); + kmem_cache_destroy(bslab->slab); - bslab->slab = NULL; + kfree(bslab); out: mutex_unlock(&bio_slab_lock); } -unsigned int bvec_nr_vecs(unsigned short idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - return bvec_slabs[--idx].nr_vecs; -} + BIO_BUG_ON(nr_vecs > BIO_MAX_VECS); -void bvec_free(mempool_t *pool, 
struct bio_vec *bv, unsigned int idx) -{ - if (!idx) - return; - idx--; - - BIO_BUG_ON(idx >= BVEC_POOL_NR); - - if (idx == BVEC_POOL_MAX) { + if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); - } else { - struct biovec_slab *bvs = bvec_slabs + idx; - - kmem_cache_free(bvs->slab, bv); - } + else if (nr_vecs > BIO_INLINE_VECS) + kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); } -struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, - mempool_t *pool) +/* + * Make the first allocation restricted and don't dump info on allocation + * failures, since we'll fall back to the mempool in case of failure. + */ +static inline gfp_t bvec_alloc_gfp(gfp_t gfp) { - struct bio_vec *bvl; + return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; +} - /* - * see comment near bvec_array define! - */ - switch (nr) { - case 1: - *idx = 0; - break; - case 2 ... 4: - *idx = 1; - break; - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: +struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, + gfp_t gfp_mask) +{ + struct biovec_slab *bvs = biovec_slab(*nr_vecs); + + if (WARN_ON_ONCE(!bvs)) return NULL; - } /* - * idx now points to the pool we want to allocate from. only the - * 1-vec entry pool is mempool backed. + * Upgrade the nr_vecs request to take full advantage of the allocation. + * We also rely on this in the bvec_free path. */ - if (*idx == BVEC_POOL_MAX) { -fallback: - bvl = mempool_alloc(pool, gfp_mask); - } else { - struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); + *nr_vecs = bvs->nr_vecs; - /* - * Make this allocation restricted and don't dump info on - * allocation failures, since we'll fallback to the mempool - * in case of failure. - */ - __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + /* + * Try a slab allocation first for all smaller allocations. If that + * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. + * The mempool is sized to handle up to BIO_MAX_VECS entries. + */ + if (*nr_vecs < BIO_MAX_VECS) { + struct bio_vec *bvl; - /* - * Try a slab allocation. 
If this fails and __GFP_DIRECT_RECLAIM - * is set, retry with the 1-entry mempool - */ - bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { - *idx = BVEC_POOL_MAX; - goto fallback; - } + bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); + if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) + return bvl; + *nr_vecs = BIO_MAX_VECS; } - (*idx)++; - return bvl; + return mempool_alloc(pool, gfp_mask); } void bio_uninit(struct bio *bio) @@ -255,7 +228,7 @@ static void bio_free(struct bio *bio) bio_uninit(bio); if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); /* * If we have front padding, adjust the bio pointer before freeing @@ -278,12 +251,40 @@ static void bio_free(struct bio *bio) void bio_init(struct bio *bio, struct bio_vec *table, unsigned short max_vecs) { - memset(bio, 0, sizeof(*bio)); + bio->bi_next = NULL; + bio->bi_bdev = NULL; + bio->bi_opf = 0; + bio->bi_flags = 0; + bio->bi_ioprio = 0; + bio->bi_write_hint = 0; + bio->bi_status = 0; + bio->bi_iter.bi_sector = 0; + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_idx = 0; + bio->bi_iter.bi_bvec_done = 0; + bio->bi_end_io = NULL; + bio->bi_private = NULL; +#ifdef CONFIG_BLK_CGROUP + bio->bi_blkg = NULL; + bio->bi_issue.value = 0; +#ifdef CONFIG_BLK_CGROUP_IOCOST + bio->bi_iocost_cost = 0; +#endif +#endif +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + bio->bi_crypt_context = NULL; +#endif +#ifdef CONFIG_BLK_DEV_INTEGRITY + bio->bi_integrity = NULL; +#endif + bio->bi_vcnt = 0; + atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); - bio->bi_io_vec = table; bio->bi_max_vecs = max_vecs; + bio->bi_io_vec = table; + bio->bi_pool = NULL; } EXPORT_SYMBOL(bio_init); @@ -299,12 +300,8 @@ EXPORT_SYMBOL(bio_init); */ void bio_reset(struct bio *bio) { - unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - bio_uninit(bio); - memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags; atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -405,122 +402,97 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * - * Description: - * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is - * backed by the @bs's mempool. + * Allocate a bio from the mempools in @bs. * - * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will - * always be able to allocate a bio. This is due to the mempool guarantees. - * To make this work, callers must never allocate more than 1 bio at a time - * from this pool. Callers that need to allocate more than 1 bio must always - * submit the previously allocated bio for IO before attempting to allocate - * a new one. Failure to do so can cause deadlocks under memory pressure. + * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to + * allocate a bio. This is due to the mempool guarantees. To make this work, + * callers must never allocate more than 1 bio at a time from the general pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under submit_bio_noacct() (i.e. 
any block - * driver), bios are not submitted until after you return - see the code in - * submit_bio_noacct() that converts recursion into iteration, to prevent - * stack overflows. + * Note that when running under submit_bio_noacct() (i.e. any block driver), + * bios are not submitted until after you return - see the code in + * submit_bio_noacct() that converts recursion into iteration, to prevent + * stack overflows. * - * This would normally mean allocating multiple bios under - * submit_bio_noacct() would be susceptible to deadlocks, but we have - * deadlock avoidance code that resubmits any blocked bios from a rescuer - * thread. + * This would normally mean allocating multiple bios under submit_bio_noacct() + * would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. * - * However, we do not guarantee forward progress for allocations from other - * mempools. Doing multiple allocations from the same mempool under - * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad - * for per bio allocations. + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. * - * RETURNS: - * Pointer to new bio on success, NULL on failure. + * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, +struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; - unsigned front_pad; - unsigned inline_vecs; - struct bio_vec *bvl = NULL; struct bio *bio; void *p; - if (!bs) { - if (nr_iovecs > UIO_MAXIOV) - return NULL; + /* should not use nobvec bioset for nr_iovecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0)) + return NULL; - p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); - front_pad = 0; - inline_vecs = nr_iovecs; - } else { - /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && - nr_iovecs > 0)) - return NULL; - /* - * submit_bio_noacct() converts recursion to iteration; this - * means if we're running beneath it, any bios we allocate and - * submit will not be submitted (and thus freed) until after we - * return. - * - * This exposes us to a potential deadlock if we allocate - * multiple bios from the same bio_set() while running - * underneath submit_bio_noacct(). If we were to allocate - * multiple bios (say a stacking block driver that was splitting - * bios), we would deadlock if we exhausted the mempool's - * reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are - * bios on current->bio_list, we first try the allocation - * without __GFP_DIRECT_RECLAIM; if that fails, we punt those - * bios we would be blocking to the rescuer workqueue before - * we retry with the original gfp_flags. - */ - - if (current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; + /* + * submit_bio_noacct() converts recursion to iteration; this means if + * we're running beneath it, any bios we allocate and submit will not be + * submitted (and thus freed) until after we return. 
+ * + * This exposes us to a potential deadlock if we allocate multiple bios + * from the same bio_set() while running underneath submit_bio_noacct(). + * If we were to allocate multiple bios (say a stacking block driver + * that was splitting bios), we would deadlock if we exhausted the + * mempool's reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are bios on + * current->bio_list, we first try the allocation without + * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be + * blocking to the rescuer workqueue before we retry with the original + * gfp_flags. + */ + if (current->bio_list && + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1])) && + bs->rescue_workqueue) + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(&bs->bio_pool, gfp_mask); - } - - front_pad = bs->front_pad; - inline_vecs = BIO_INLINE_VECS; } - if (unlikely(!p)) return NULL; - bio = p + front_pad; - bio_init(bio, NULL, 0); + bio = p + bs->front_pad; + if (nr_iovecs > BIO_INLINE_VECS) { + struct bio_vec *bvl = NULL; - if (nr_iovecs > inline_vecs) { - unsigned long idx = 0; - - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); } - if (unlikely(!bvl)) goto err_free; - bio->bi_flags |= idx << BVEC_POOL_OFFSET; + bio_init(bio, bvl, nr_iovecs); } else if (nr_iovecs) { - bvl = bio->bi_inline_vecs; + bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); + } else { + bio_init(bio, NULL, 0); } bio->bi_pool = bs; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bvl; return bio; err_free: @@ -529,20 +501,40 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, } EXPORT_SYMBOL(bio_alloc_bioset); -void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) +/** + * bio_kmalloc - kmalloc a bio for I/O + * @gfp_mask: the GFP_* mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * + * Use kmalloc to allocate and initialize a bio. + * + * Returns: Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) +{ + struct bio *bio; + + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); + if (unlikely(!bio)) + return NULL; + bio_init(bio, nr_iovecs ? 
bio->bi_inline_vecs : NULL, nr_iovecs); + bio->bi_pool = NULL; + return bio; +} +EXPORT_SYMBOL(bio_kmalloc); + +void zero_fill_bio(struct bio *bio) { - unsigned long flags; struct bio_vec bv; struct bvec_iter iter; - __bio_for_each_segment(bv, bio, iter, start) { - char *data = bvec_kmap_irq(&bv, &flags); - memset(data, 0, bv.bv_len); - flush_dcache_page(bv.bv_page); - bvec_kunmap_irq(data, &flags); - } + bio_for_each_segment(bv, bio, iter) + memzero_bvec(&bv); } -EXPORT_SYMBOL(zero_fill_bio_iter); +EXPORT_SYMBOL(zero_fill_bio); /** * bio_truncate - truncate the bio to small size of @new_size @@ -607,16 +599,7 @@ void bio_truncate(struct bio *bio, unsigned new_size) */ void guard_bio_eod(struct bio *bio) { - sector_t maxsector; - struct hd_struct *part; - - rcu_read_lock(); - part = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (part) - maxsector = part_nr_sects_read(part); - else - maxsector = get_capacity(bio->bi_disk); - rcu_read_unlock(); + sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; @@ -636,6 +619,53 @@ void guard_bio_eod(struct bio *bio) bio_truncate(bio, maxsector << 9); } +#define ALLOC_CACHE_MAX 512 +#define ALLOC_CACHE_SLACK 64 + +static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, + unsigned int nr) +{ + unsigned int i = 0; + struct bio *bio; + + while ((bio = bio_list_pop(&cache->free_list)) != NULL) { + cache->nr--; + bio_free(bio); + if (++i == nr) + break; + } +} + +static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) +{ + struct bio_set *bs; + + bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead); + if (bs->cache) { + struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu); + + bio_alloc_cache_prune(cache, -1U); + } + return 0; +} + +static void bio_alloc_cache_destroy(struct bio_set *bs) +{ + int cpu; + + if (!bs->cache) + return; + + cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); + for_each_possible_cpu(cpu) { + struct bio_alloc_cache *cache; + + cache = per_cpu_ptr(bs->cache, cpu); + bio_alloc_cache_prune(cache, -1U); + } + free_percpu(bs->cache); +} + /** * bio_put - release a reference to a bio * @bio: bio to release reference to @@ -646,16 +676,23 @@ void guard_bio_eod(struct bio *bio) **/ void bio_put(struct bio *bio) { - if (!bio_flagged(bio, BIO_REFFED)) - bio_free(bio); - else { + if (unlikely(bio_flagged(bio, BIO_REFFED))) { BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + if (!atomic_dec_and_test(&bio->__bi_cnt)) + return; + } - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->__bi_cnt)) - bio_free(bio); + if (bio_flagged(bio, BIO_PERCPU_CACHE)) { + struct bio_alloc_cache *cache; + + bio_uninit(bio); + cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); + bio_list_add_head(&cache->free_list, bio); + if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) + bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); + put_cpu(); + } else { + bio_free(bio); } } EXPORT_SYMBOL(bio_put); @@ -673,17 +710,18 @@ EXPORT_SYMBOL(bio_put); */ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) { - BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); + WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs); /* - * most users will be overriding ->bi_disk with a new target, + * most users will be overriding ->bi_bdev with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_disk = bio_src->bi_disk; - bio->bi_partno = bio_src->bi_partno; + bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) 
bio_set_flag(bio, BIO_THROTTLED); + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; @@ -730,7 +768,7 @@ EXPORT_SYMBOL(bio_clone_fast); const char *bio_devname(struct bio *bio, char *buf) { - return disk_name(bio->bi_disk, bio->bi_partno, buf); + return bdevname(bio->bi_bdev, buf); } EXPORT_SYMBOL(bio_devname); @@ -851,6 +889,39 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL(bio_add_pc_page); +/** + * bio_add_zone_append_page - attempt to add page to zone-append bio + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist of a bio that will be submitted + * for a zone-append request. This can fail for a number of reasons, such as the + * bio being full, the target block device not being a zoned block device, or + * other limitations of the target block device. The target block device must + * allow bios up to PAGE_SIZE, so it is always possible to add a single page + * to an empty bio. + * + * Returns: number of bytes added to the bio, or 0 in case of a failure. + */ +int bio_add_zone_append_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct request_queue *q = bio->bi_bdev->bd_disk->queue; + bool same_page = false; + + if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) + return 0; + + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + return 0; + + return bio_add_hw_page(q, bio, page, len, offset, + queue_max_zone_append_sectors(q), &same_page); +} +EXPORT_SYMBOL_GPL(bio_add_zone_append_page); + /** * __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio @@ -960,24 +1031,44 @@ void bio_release_pages(struct bio *bio, bool mark_dirty) } EXPORT_SYMBOL_GPL(bio_release_pages); -static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) +static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { - const struct bio_vec *bv = iter->bvec; - unsigned int len; - size_t size; + WARN_ON_ONCE(bio->bi_max_vecs); - if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) - return -EINVAL; + bio->bi_vcnt = iter->nr_segs; + bio->bi_io_vec = (struct bio_vec *)iter->bvec; + bio->bi_iter.bi_bvec_done = iter->iov_offset; + bio->bi_iter.bi_size = iter->count; + bio_set_flag(bio, BIO_NO_PAGE_REF); + bio_set_flag(bio, BIO_CLONED); +} - len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); - size = bio_add_page(bio, bv->bv_page, len, - bv->bv_offset + iter->iov_offset); - if (unlikely(size != len)) - return -EINVAL; - iov_iter_advance(iter, size); +static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +{ + __bio_iov_bvec_set(bio, iter); + iov_iter_advance(iter, iter->count); return 0; } +static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) +{ + struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct iov_iter i = *iter; + + iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9); + __bio_iov_bvec_set(bio, &i); + iov_iter_advance(iter, i.count); + return 0; +} + +static void bio_put_pages(struct page **pages, size_t size, size_t off) +{ + size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); + + for (i = 0; i < nr; i++) + put_page(pages[i]); +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -1022,8 +1113,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (same_page) put_page(page); } else { - if (WARN_ON_ONCE(bio_full(bio, len))) - return -EINVAL; + if (WARN_ON_ONCE(bio_full(bio, len))) { + bio_put_pages(pages + i, left, offset); + return -EINVAL; + } __bio_add_page(bio, page, len, offset); } offset = 0; @@ -1037,7 +1130,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) { unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; unsigned int max_append_sectors = queue_max_zone_append_sectors(q); struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; @@ -1068,6 +1161,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) len = min_t(size_t, PAGE_SIZE - offset, left); if (bio_add_hw_page(q, bio, page, len, offset, max_append_sectors, &same_page) != len) { + bio_put_pages(pages + i, left, offset); ret = -EINVAL; break; } @@ -1088,41 +1182,40 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those - * pages. If we're adding kernel pages, and the caller told us it's safe to - * do so, we just have to add the pages to the bio directly. We don't grab an - * extra reference to those pages (the user should already have that), and we - * don't put the page on IO completion. The caller needs to check if the bio is - * flagged BIO_NO_PAGE_REF on IO completion. 
If it isn't, then pages should be - * released. + * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided + * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs + * to ensure the bvecs and pages stay referenced until the submitted I/O is + * completed by a call to ->ki_complete() or returns with an error other than + * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF + * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. + * + * It's intended for direct IO, so doesn't do PSI tracking, the caller is + * responsible for setting BIO_WORKINGSET if necessary. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - const bool is_bvec = iov_iter_is_bvec(iter); - int ret; + int ret = 0; - if (WARN_ON_ONCE(bio->bi_vcnt)) - return -EINVAL; + if (iov_iter_is_bvec(iter)) { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + return bio_iov_bvec_set_append(bio, iter); + return bio_iov_bvec_set(bio, iter); + } do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - if (WARN_ON_ONCE(is_bvec)) - return -EINVAL; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) ret = __bio_iov_append_get_pages(bio, iter); - } else { - if (is_bvec) - ret = __bio_iov_bvec_add_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); - } + else + ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - if (is_bvec) - bio_set_flag(bio, BIO_NO_PAGE_REF); + /* don't account direct I/O as memory stall */ + bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 
0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); @@ -1145,7 +1238,8 @@ static void submit_bio_wait_endio(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); unsigned long hang_check; bio->bi_private = &done; @@ -1190,30 +1284,18 @@ EXPORT_SYMBOL(bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { - struct bio_vec src_bv, dst_bv; - void *src_p, *dst_p; - unsigned bytes; - while (src_iter->bi_size && dst_iter->bi_size) { - src_bv = bio_iter_iovec(src, *src_iter); - dst_bv = bio_iter_iovec(dst, *dst_iter); + struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); + struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); + unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); + void *src_buf; - bytes = min(src_bv.bv_len, dst_bv.bv_len); + src_buf = bvec_kmap_local(&src_bv); + memcpy_to_bvec(&dst_bv, src_buf); + kunmap_local(src_buf); - src_p = kmap_atomic(src_bv.bv_page); - dst_p = kmap_atomic(dst_bv.bv_page); - - memcpy(dst_p + dst_bv.bv_offset, - src_p + src_bv.bv_offset, - bytes); - - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - flush_dcache_page(dst_bv.bv_page); - - bio_advance_iter(src, src_iter, bytes); - bio_advance_iter(dst, dst_iter, bytes); + bio_advance_iter_single(src, src_iter, bytes); + bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); @@ -1235,43 +1317,6 @@ void bio_copy_data(struct bio *dst, struct bio *src) } EXPORT_SYMBOL(bio_copy_data); -/** - * bio_list_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * Stops when it reaches the end of either the @src list or @dst list - that is, - * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of - * bios). - */ -void bio_list_copy_data(struct bio *dst, struct bio *src) -{ - struct bvec_iter src_iter = src->bi_iter; - struct bvec_iter dst_iter = dst->bi_iter; - - while (1) { - if (!src_iter.bi_size) { - src = src->bi_next; - if (!src) - break; - - src_iter = src->bi_iter; - } - - if (!dst_iter.bi_size) { - dst = dst->bi_next; - if (!dst) - break; - - dst_iter = dst->bi_iter; - } - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } -} -EXPORT_SYMBOL(bio_list_copy_data); - void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; @@ -1411,8 +1456,7 @@ static inline bool bio_remaining_done(struct bio *bio) * * bio_endio() can be called several times on a bio that has been chained * using bio_chain(). The ->bi_end_io() function will only be called the - * last time. At this point the BLK_TA_COMPLETE tracing event will be - * generated if BIO_TRACE_COMPLETION is set. + * last time. 
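To make the new bvec contract concrete, a hedged sketch of a bvec-backed submitter: the caller must keep the bvec array and its pages alive until ->ki_complete() runs (or the submission returns something other than -EIOCBQUEUED), since the bio now aliases them instead of copying page by page:

	struct bio_vec bv = {
		.bv_page	= page,		/* caller-owned page */
		.bv_len		= PAGE_SIZE,
		.bv_offset	= 0,
	};
	struct iov_iter iter;

	iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
	/* bio_iov_iter_get_pages() takes the bvec-set path above: no page
	 * refs are grabbed (BIO_NO_PAGE_REF) and none are put on endio. */
	ret = bio_iov_iter_get_pages(bio, &iter);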
**/ void bio_endio(struct bio *bio) { @@ -1422,8 +1466,13 @@ void bio_endio(struct bio *bio) if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk) - rq_qos_done_bio(bio->bi_disk->queue, bio); + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED)) + rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); + + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + } /* * Need to have a real endio function for chained bios, otherwise @@ -1438,11 +1487,6 @@ void bio_endio(struct bio *bio) goto again; } - if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_disk->queue, bio); - bio_clear_flag(bio, BIO_TRACE_COMPLETION); - } - blk_throtl_bio_endio(bio); /* release cgroup info */ bio_uninit(bio); @@ -1500,12 +1544,15 @@ EXPORT_SYMBOL(bio_split); * @bio: bio to trim * @offset: number of sectors to trim from the front of @bio * @size: size we want to trim @bio to, in sectors + * + * This function is typically used for bios that are cloned and submitted + * to the underlying device in parts. */ -void bio_trim(struct bio *bio, int offset, int size) +void bio_trim(struct bio *bio, sector_t offset, sector_t size) { - /* 'bio' is a cloned bio which we need to trim to match - * the given offset and size. - */ + if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS || + offset + size > bio->bi_iter.bi_size)) + return; size <<= 9; if (offset == 0 && size == bio->bi_iter.bi_size) @@ -1516,7 +1563,6 @@ void bio_trim(struct bio *bio, int offset, int size) if (bio_integrity(bio)) bio_integrity_trim(bio); - } EXPORT_SYMBOL_GPL(bio_trim); @@ -1526,7 +1572,7 @@ EXPORT_SYMBOL_GPL(bio_trim); */ int biovec_init_pool(mempool_t *pool, int pool_entries) { - struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX; + struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } @@ -1539,6 +1585,7 @@ int biovec_init_pool(mempool_t *pool, int pool_entries) */ void bioset_exit(struct bio_set *bs) { + bio_alloc_cache_destroy(bs); if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); bs->rescue_workqueue = NULL; @@ -1579,15 +1626,17 @@ int bioset_init(struct bio_set *bs, unsigned int front_pad, int flags) { - unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); - bs->front_pad = front_pad; + if (flags & BIOSET_NEED_BVECS) + bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + else + bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); - bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; @@ -1598,12 +1647,18 @@ int bioset_init(struct bio_set *bs, biovec_init_pool(&bs->bvec_pool, pool_size)) goto bad; - if (!(flags & BIOSET_NEED_RESCUER)) - return 0; - - bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); - if (!bs->rescue_workqueue) - goto bad; + if (flags & BIOSET_NEED_RESCUER) { + bs->rescue_workqueue = alloc_workqueue("bioset", + WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + } + if (flags & BIOSET_PERCPU_CACHE) { + bs->cache = alloc_percpu(struct bio_alloc_cache); + if (!bs->cache) + goto bad; + cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); + } return 0; bad: @@ -1630,39 +1685,62 @@ int bioset_init_from_src(struct bio_set *bs, struct 
bio_set *src) } EXPORT_SYMBOL(bioset_init_from_src); -static void __init biovec_init_slabs(void) +/** + * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb + * @kiocb: kiocb describing the IO + * @nr_vecs: number of iovecs to pre-allocate + * @bs: bio_set to allocate from + * + * Description: + * Like @bio_alloc_bioset, but pass in the kiocb. The kiocb is only + * used to check if we should dip into the per-cpu bio_set allocation + * cache. The allocation uses GFP_KERNEL internally. On return, the + * bio is marked BIO_PERCPU_CACHEABLE, and the final put of the bio + * MUST be done from process context, not hard/soft IRQ. + * + */ +struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, + struct bio_set *bs) { - int i; + struct bio_alloc_cache *cache; + struct bio *bio; - for (i = 0; i < BVEC_POOL_NR; i++) { - int size; - struct biovec_slab *bvs = bvec_slabs + i; + if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS) + return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); - if (bvs->nr_vecs <= BIO_INLINE_VECS) { - bvs->slab = NULL; - continue; - } - - size = bvs->nr_vecs * sizeof(struct bio_vec); - bvs->slab = kmem_cache_create(bvs->name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + cache = per_cpu_ptr(bs->cache, get_cpu()); + bio = bio_list_pop(&cache->free_list); + if (bio) { + cache->nr--; + put_cpu(); + bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs); + bio->bi_pool = bs; + bio_set_flag(bio, BIO_PERCPU_CACHE); + return bio; } + put_cpu(); + bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); + bio_set_flag(bio, BIO_PERCPU_CACHE); + return bio; } +EXPORT_SYMBOL_GPL(bio_alloc_kiocb); static int __init init_bio(void) { - bio_slab_max = 2; - bio_slab_nr = 0; - bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab), - GFP_KERNEL); - - BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); - - if (!bio_slabs) - panic("bio: can't allocate bios\n"); + int i; bio_integrity_init(); - biovec_init_slabs(); + + for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { + struct biovec_slab *bvs = bvec_slabs + i; + + bvs->slab = kmem_cache_create(bvs->name, + bvs->nr_vecs * sizeof(struct bio_vec), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + } + + cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, + bio_cpu_dead); if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) panic("bio: can't allocate bios\n"); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f13688c4b9..38b9f76849 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -31,8 +31,7 @@ #include #include #include "blk.h" - -#define MAX_KEY_LEN 100 +#include "blk-ioprio.h" /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. 
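A hedged sketch of the intended consumer of bio_alloc_kiocb(): a driver creates its bioset with BIOSET_PERCPU_CACHE, and the helper transparently falls back to plain bio_alloc_bioset() when the kiocb lacks IOCB_ALLOC_CACHE or needs more than the inline vecs. The pool name below is illustrative:

	static struct bio_set example_dio_pool;

	/* once, at init time: opt the set into the per-cpu free list */
	ret = bioset_init(&example_dio_pool, 1024, 0,
			  BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE);

	/* per I/O, in the ->read_iter/->write_iter fast path */
	bio = bio_alloc_kiocb(iocb, nr_vecs, &example_dio_pool);

As the comment above spells out, the final bio_put() on such a bio must come from process context, since the per-cpu cache is not IRQ safe.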
@@ -57,6 +56,8 @@ static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ bool blkcg_debug_stats = false; static struct workqueue_struct *blkcg_punt_bio_wq; +#define BLKG_DESTROY_BATCH_SIZE 64 + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -423,7 +424,9 @@ static void blkg_destroy(struct blkcg_gq *blkg) static void blkg_destroy_all(struct request_queue *q) { struct blkcg_gq *blkg, *n; + int count = BLKG_DESTROY_BATCH_SIZE; +restart: spin_lock_irq(&q->queue_lock); list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; @@ -431,6 +434,17 @@ static void blkg_destroy_all(struct request_queue *q) spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); + + /* + * in order to avoid holding the spin lock for too long, release + * it when a batch of blkgs are destroyed. + */ + if (!(--count)) { + count = BLKG_DESTROY_BATCH_SIZE; + spin_unlock_irq(&q->queue_lock); + cond_resched(); + goto restart; + } } q->root_blkg = NULL; @@ -475,10 +489,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - /* some drivers (floppy) instantiate a queue w/o disk registered */ - if (blkg->q->backing_dev_info->dev) - return bdi_dev_name(blkg->q->backing_dev_info); - return NULL; + if (!blkg->q->disk || !blkg->q->disk->bdi->dev) + return NULL; + return bdi_dev_name(blkg->q->disk->bdi); } /** @@ -556,22 +569,22 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, } /** - * blkg_conf_prep - parse and prepare for per-blkg config update + * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update * @inputp: input string pointer * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update - * from @input and get and return the matching gendisk. *@inputp is + * from @input and get and return the matching bdev. *@inputp is * updated to point past the device node prefix. Returns an ERR_PTR() * value on error. * * Use this function iff blkg_conf_prep() can't be used for some reason. 
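The batching shape used in blkg_destroy_all() above, shown in isolation (hedged; the item type and bound are placeholders): at most one batch of entries is destroyed per lock hold, and the walk restarts from the list head, which is safe because destroyed entries are unlinked:

	struct item { struct list_head node; };

	static void destroy_all_batched(spinlock_t *lock, struct list_head *head)
	{
		struct item *pos, *n;
		int count = 64;			/* batch bound */

	restart:
		spin_lock_irq(lock);
		list_for_each_entry_safe(pos, n, head, node) {
			list_del(&pos->node);
			kfree(pos);
			if (!(--count)) {
				count = 64;
				spin_unlock_irq(lock);
				cond_resched();	/* bound lock hold + irq-off time */
				goto restart;
			}
		}
		spin_unlock_irq(lock);
	}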
*/ -struct gendisk *blkcg_conf_get_disk(char **inputp) +struct block_device *blkcg_conf_open_bdev(char **inputp) { char *input = *inputp; unsigned int major, minor; - struct gendisk *disk; - int key_len, part; + struct block_device *bdev; + int key_len; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return ERR_PTR(-EINVAL); @@ -581,16 +594,16 @@ struct gendisk *blkcg_conf_get_disk(char **inputp) return ERR_PTR(-EINVAL); input = skip_spaces(input); - disk = get_gendisk(MKDEV(major, minor), &part); - if (!disk) + bdev = blkdev_get_no_open(MKDEV(major, minor)); + if (!bdev) return ERR_PTR(-ENODEV); - if (part) { - put_disk_and_module(disk); + if (bdev_is_partition(bdev)) { + blkdev_put_no_open(bdev); return ERR_PTR(-ENODEV); } *inputp = input; - return disk; + return bdev; } /** @@ -607,18 +620,18 @@ struct gendisk *blkcg_conf_get_disk(char **inputp) */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(&disk->queue->queue_lock) + __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) { - struct gendisk *disk; + struct block_device *bdev; struct request_queue *q; struct blkcg_gq *blkg; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - q = disk->queue; + q = bdev->bd_disk->queue; rcu_read_lock(); spin_lock_irq(&q->queue_lock); @@ -689,7 +702,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: - ctx->disk = disk; + ctx->bdev = bdev; ctx->blkg = blkg; ctx->body = input; return 0; @@ -700,7 +713,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); fail: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -723,11 +736,11 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); * with blkg_conf_prep(). 
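How a policy that cannot go through blkg_conf_prep() might use the renamed helper (a hedged sketch; parse_body() is a placeholder for the policy's own parsing):

	char *input = buf;	/* e.g. "8:16 rbps=2097152" */
	struct block_device *bdev;

	bdev = blkcg_conf_open_bdev(&input);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* input now points past "MAJ:MIN " at the policy-specific body */
	parse_body(input);
	blkdev_put_no_open(bdev);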
*/ void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(&ctx->disk->queue->queue_lock) __releases(rcu) + __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) { - spin_unlock_irq(&ctx->disk->queue->queue_lock); + spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); rcu_read_unlock(); - put_disk_and_module(ctx->disk); + blkdev_put_no_open(ctx->bdev); } EXPORT_SYMBOL_GPL(blkg_conf_finish); @@ -766,12 +779,17 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; + /* Root-level stats are sourced from system-wide IO stats */ + if (!cgroup_parent(css->cgroup)) + return; + rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { struct blkcg_gq *parent = blkg->parent; struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); struct blkg_iostat cur, delta; + unsigned long flags; unsigned int seq; /* fetch the current per-cpu values */ @@ -781,21 +799,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) } while (u64_stats_fetch_retry(&bisc->sync, seq)); /* propagate percpu delta to global */ - u64_stats_update_begin(&blkg->iostat.sync); + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&delta, &cur); blkg_iostat_sub(&delta, &bisc->last); blkg_iostat_add(&blkg->iostat.cur, &delta); blkg_iostat_add(&bisc->last, &delta); - u64_stats_update_end(&blkg->iostat.sync); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); - /* propagate global delta to parent */ - if (parent) { - u64_stats_update_begin(&parent->iostat.sync); + /* propagate global delta to parent (unless that's root) */ + if (parent && parent->parent) { + flags = u64_stats_update_begin_irqsave(&parent->iostat.sync); blkg_iostat_set(&delta, &blkg->iostat.cur); blkg_iostat_sub(&delta, &blkg->iostat.last); blkg_iostat_add(&parent->iostat.cur, &delta); blkg_iostat_add(&blkg->iostat.last, &delta); - u64_stats_update_end(&parent->iostat.sync); + u64_stats_update_end_irqrestore(&parent->iostat.sync, flags); } } @@ -803,10 +821,11 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) } /* - * The rstat algorithms intentionally don't handle the root cgroup to avoid - * incurring overhead when no cgroups are defined. For that reason, - * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the - * iostat in the root cgroup's blkcg_gq. + * We source root cgroup stats from the system-wide stats to avoid + * tracking the same information twice and incurring overhead when no + * cgroups are defined. For that reason, cgroup_rstat_flush in + * blkcg_print_stat does not actually fill out the iostat in the root + * cgroup's blkcg_gq. * * However, we would like to re-use the printing code between the root and * non-root cgroups to the extent possible. 
For that reason, we simulate @@ -820,17 +839,18 @@ static void blkcg_fill_root_iostats(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part = disk_get_part(disk, 0); - struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct block_device *bdev = dev_to_bdev(dev); + struct blkcg_gq *blkg = + blk_queue_root_blkg(bdev->bd_disk->queue); struct blkg_iostat tmp; int cpu; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; + unsigned long flags; - cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); + cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += @@ -845,14 +865,70 @@ static void blkcg_fill_root_iostats(void) tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; - u64_stats_update_begin(&blkg->iostat.sync); + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&blkg->iostat.cur, &tmp); - u64_stats_update_end(&blkg->iostat.sync); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } - disk_put_part(part); } } +static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) +{ + struct blkg_iostat_set *bis = &blkg->iostat; + u64 rbytes, wbytes, rios, wios, dbytes, dios; + bool has_stats = false; + const char *dname; + unsigned seq; + int i; + + if (!blkg->online) + return; + + dname = blkg_dev_name(blkg); + if (!dname) + return; + + seq_printf(s, "%s ", dname); + + do { + seq = u64_stats_fetch_begin(&bis->sync); + + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); + + if (rbytes || wbytes || rios || wios) { + has_stats = true; + seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); + } + + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { + has_stats = true; + seq_printf(s, " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + atomic64_read(&blkg->delay_nsec)); + } + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; + + if (pol->pd_stat_fn(blkg->pd[i], s)) + has_stats = true; + } + + if (has_stats) + seq_printf(s, "\n"); +} + static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); @@ -864,86 +940,11 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkg_iostat_set *bis = &blkg->iostat; - const char *dname; - char *buf; - u64 rbytes, wbytes, rios, wios, dbytes, dios; - size_t size = seq_get_buf(sf, &buf), off = 0; - int i; - bool has_stats = false; - unsigned seq; - spin_lock_irq(&blkg->q->queue_lock); - - if (!blkg->online) - goto skip; - - dname = blkg_dev_name(blkg); - if (!dname) - goto skip; - - /* - * Hooray string manipulation, count is the size written NOT - * INCLUDING THE \0, so size is now count+1 less than what we - * had before, but we want to start writing the next bit from - * the \0 so we only add count to buf. 
- */ - off += scnprintf(buf+off, size-off, "%s ", dname); - - do { - seq = u64_stats_fetch_begin(&bis->sync); - - rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; - wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; - dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; - rios = bis->cur.ios[BLKG_IOSTAT_READ]; - wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; - dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; - } while (u64_stats_fetch_retry(&bis->sync, seq)); - - if (rbytes || wbytes || rios || wios) { - has_stats = true; - off += scnprintf(buf+off, size-off, - "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", - rbytes, wbytes, rios, wios, - dbytes, dios); - } - - if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { - has_stats = true; - off += scnprintf(buf+off, size-off, - " use_delay=%d delay_nsec=%llu", - atomic_read(&blkg->use_delay), - (unsigned long long)atomic64_read(&blkg->delay_nsec)); - } - - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - size_t written; - - if (!blkg->pd[i] || !pol->pd_stat_fn) - continue; - - written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); - if (written) - has_stats = true; - off += written; - } - - if (has_stats) { - if (off < size - 1) { - off += scnprintf(buf+off, size-off, "\n"); - seq_commit(sf, off); - } else { - seq_commit(sf, -1); - } - } - skip: + blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } - rcu_read_unlock(); return 0; } @@ -1181,6 +1182,10 @@ int blkcg_init_queue(struct request_queue *q) if (preloaded) radix_tree_preload_end(); + ret = blk_ioprio_init(q); + if (ret) + goto err_destroy_all; + ret = blk_throtl_init(q); if (ret) goto err_destroy_all; @@ -1190,6 +1195,7 @@ int blkcg_init_queue(struct request_queue *q) blk_throtl_exit(q); goto err_destroy_all; } + return 0; err_destroy_all: @@ -1215,32 +1221,6 @@ void blkcg_exit_queue(struct request_queue *q) blk_throtl_exit(q); } -/* - * We cannot support shared io contexts, as we have no mean to support - * two tasks with the same ioc in two different groups without major rework - * of the main cic data structures. For now we allow a task to change - * its cgroup only if it's the only owner of its ioc. 
- */ -static int blkcg_can_attach(struct cgroup_taskset *tset) -{ - struct task_struct *task; - struct cgroup_subsys_state *dst_css; - struct io_context *ioc; - int ret = 0; - - /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, dst_css, tset) { - task_lock(task); - ioc = task->io_context; - if (ioc && atomic_read(&ioc->nr_tasks) > 1) - ret = -EINVAL; - task_unlock(task); - if (ret) - break; - } - return ret; -} - static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; @@ -1273,7 +1253,6 @@ struct cgroup_subsys io_cgrp_subsys = { .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, - .can_attach = blkcg_can_attach, .css_rstat_flush = blkcg_rstat_flush, .bind = blkcg_bind, .dfl_cftypes = blkcg_files, @@ -1387,10 +1366,14 @@ int blkcg_activate_policy(struct request_queue *q, /* alloc failed, nothing's initialized yet, free everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { + struct blkcg *blkcg = blkg->blkcg; + + spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } + spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); ret = -ENOMEM; @@ -1422,12 +1405,16 @@ void blkcg_deactivate_policy(struct request_queue *q, __clear_bit(pol->plid, q->blkcg_pols); list_for_each_entry(blkg, &q->blkg_list, q_node) { + struct blkcg *blkcg = blkg->blkcg; + + spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { if (pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } + spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); @@ -1766,12 +1753,15 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) if (unlikely(current->flags & PF_KTHREAD)) return; - if (!blk_get_queue(q)) - return; + if (current->throttle_queue != q) { + if (!blk_get_queue(q)) + return; + + if (current->throttle_queue) + blk_put_queue(current->throttle_queue); + current->throttle_queue = q; + } - if (current->throttle_queue) - blk_put_queue(current->throttle_queue); - current->throttle_queue = q; if (use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); @@ -1809,7 +1799,8 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); - blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue); + blkg = blkg_lookup_create(css_to_blkcg(css), + bio->bi_bdev->bd_disk->queue); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; @@ -1845,8 +1836,8 @@ void bio_associate_blkg_from_css(struct bio *bio, if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { - blkg_get(bio->bi_disk->queue->root_blkg); - bio->bi_blkg = bio->bi_disk->queue->root_blkg; + blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg); + bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); diff --git a/block/blk-core.c b/block/blk-core.c index 2d53e2ff48..4d8f5fe915 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -14,7 +14,6 @@ */ #include #include -#include #include #include #include @@ -50,7 +49,6 @@ #include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-pm.h" -#include "blk-rq-qos.h" struct dentry *blk_debugfs_root; @@ -59,6 +57,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); 
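The locking shape added to policy (de)activation above, in isolation: blkcg->lock nests inside the already irq-disabled queue_lock, so the inner lock needs no irq protection of its own (a sketch of the pattern, not new code):

	spin_lock_irq(&q->queue_lock);
	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		spin_lock(&blkg->blkcg->lock);	/* nested, irqs already off */
		/* free or offline blkg->pd[pol->plid] here */
		spin_unlock(&blkg->blkcg->lock);
	}
	spin_unlock_irq(&q->queue_lock);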
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert); DEFINE_IDA(blk_queue_ida); @@ -121,7 +120,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; - refcount_set(&rq->ref, 1); blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); @@ -141,8 +139,6 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(ZONE_APPEND), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), - REQ_OP_NAME(SCSI_IN), - REQ_OP_NAME(SCSI_OUT), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), }; @@ -340,23 +336,25 @@ void blk_put_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_put_queue); -void blk_set_queue_dying(struct request_queue *q) +void blk_queue_start_drain(struct request_queue *q) { - blk_queue_flag_set(QUEUE_FLAG_DYING, q); - /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ blk_freeze_queue_start(q); - if (queue_is_mq(q)) blk_mq_wake_waiters(q); - /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); } + +void blk_set_queue_dying(struct request_queue *q) +{ + blk_queue_flag_set(QUEUE_FLAG_DYING, q); + blk_queue_start_drain(q); +} EXPORT_SYMBOL_GPL(blk_set_queue_dying); /** @@ -388,17 +386,9 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_freeze_queue(q); - rq_qos_exit(q); - blk_queue_flag_set(QUEUE_FLAG_DEAD, q); - /* for synchronous bio-based driver finish in-flight integrity i/o */ - blk_flush_integrity(); - - /* @q won't process any more request, flush async actions */ - del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); - if (queue_is_mq(q)) blk_mq_exit_queue(q); @@ -422,6 +412,30 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); +static bool blk_try_enter_queue(struct request_queue *q, bool pm) +{ + rcu_read_lock(); + if (!percpu_ref_tryget_live(&q->q_usage_counter)) + goto fail; + + /* + * The code that increments the pm_only counter must ensure that the + * counter is globally visible before the queue is unfrozen. + */ + if (blk_queue_pm_only(q) && + (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) + goto fail_put; + + rcu_read_unlock(); + return true; + +fail_put: + percpu_ref_put(&q->q_usage_counter); +fail: + rcu_read_unlock(); + return false; +} + /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer @@ -431,40 +445,18 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool pm = flags & BLK_MQ_REQ_PM; - while (true) { - bool success = false; - - rcu_read_lock(); - if (percpu_ref_tryget_live(&q->q_usage_counter)) { - /* - * The code that increments the pm_only counter is - * responsible for ensuring that that counter is - * globally visible before the queue is unfrozen. - */ - if ((pm && queue_rpm_status(q) != RPM_SUSPENDED) || - !blk_queue_pm_only(q)) { - success = true; - } else { - percpu_ref_put(&q->q_usage_counter); - } - } - rcu_read_unlock(); - - if (success) - return 0; - + while (!blk_try_enter_queue(q, pm)) { if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* - * read pair of barrier in blk_freeze_queue_start(), - * we need to order reading __PERCPU_REF_DEAD flag of - * .q_usage_counter and reading .mq_freeze_depth or - * queue dying flag, otherwise the following wait may - * never return if the two reads are reordered. 
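The fast path of blk_try_enter_queue() rests on a standard idiom worth spelling out (hedged sketch): percpu_ref_tryget_live() fails once blk_freeze_queue_start() has killed the ref, so it doubles as a cheap "not frozen, not dying" gate, and the RCU read section only needs to cover the tryget itself:

	rcu_read_lock();
	ok = percpu_ref_tryget_live(&q->q_usage_counter);
	rcu_read_unlock();
	if (ok) {
		/* queue entered; issue I/O, then drop the reference */
		blk_queue_exit(q);		/* i.e. percpu_ref_put() */
	}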
+ * read pair of barrier in blk_freeze_queue_start(), we need to + * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and + * reading .mq_freeze_depth or queue dying flag, otherwise the + * following wait may never return if the two reads are + * reordered. */ smp_rmb(); - wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(pm, q)) || @@ -472,23 +464,43 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) if (blk_queue_dying(q)) return -ENODEV; } + + return 0; } static inline int bio_queue_enter(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; - bool nowait = bio->bi_opf & REQ_NOWAIT; - int ret; + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct request_queue *q = disk->queue; - ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0); - if (unlikely(ret)) { - if (nowait && !blk_queue_dying(q)) + while (!blk_try_enter_queue(q, false)) { + if (bio->bi_opf & REQ_NOWAIT) { + if (test_bit(GD_DEAD, &disk->state)) + goto dead; bio_wouldblock_error(bio); - else - bio_io_error(bio); + return -EBUSY; + } + + /* + * read pair of barrier in blk_freeze_queue_start(), we need to + * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and + * reading .mq_freeze_depth or queue dying flag, otherwise the + * following wait may never return if the two reads are + * reordered. + */ + smp_rmb(); + wait_event(q->mq_freeze_wq, + (!q->mq_freeze_depth && + blk_pm_resume_queue(false, q)) || + test_bit(GD_DEAD, &disk->state)); + if (test_bit(GD_DEAD, &disk->state)) + goto dead; } - return ret; + return 0; +dead: + bio_io_error(bio); + return -ENODEV; } void blk_queue_exit(struct request_queue *q) @@ -531,24 +543,18 @@ struct request_queue *blk_alloc_queue(int node_id) if (q->id < 0) goto fail_q; - ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0); if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc(node_id); - if (!q->backing_dev_info) - goto fail_split; - q->stats = blk_alloc_queue_stats(); if (!q->stats) - goto fail_stats; + goto fail_split; q->node = node_id; atomic_set(&q->nr_active_requests_shared_sbitmap, 0); - timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, - laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); @@ -573,7 +579,7 @@ struct request_queue *blk_alloc_queue(int node_id) if (percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto fail_bdi; + goto fail_stats; if (blkcg_init_queue(q)) goto fail_ref; @@ -586,10 +592,8 @@ struct request_queue *blk_alloc_queue(int node_id) fail_ref: percpu_ref_exit(&q->q_usage_counter); -fail_bdi: - blk_free_queue_stats(q->stats); fail_stats: - bdi_put(q->backing_dev_info); + blk_free_queue_stats(q->stats); fail_split: bioset_exit(&q->bio_split); fail_id: @@ -598,7 +602,6 @@ struct request_queue *blk_alloc_queue(int node_id) kmem_cache_free(blk_requestq_cachep, q); return NULL; } -EXPORT_SYMBOL(blk_alloc_queue); /** * blk_get_queue - increment the request_queue refcount @@ -667,9 +670,9 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static bool should_fail_request(struct hd_struct *part, unsigned int bytes) +static bool should_fail_request(struct block_device *part, unsigned int bytes) { - return part->make_it_fail && should_fail(&fail_make_request, bytes); + return 
part->bd_make_it_fail && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) @@ -684,7 +687,7 @@ late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool should_fail_request(struct hd_struct *part, +static inline bool should_fail_request(struct block_device *part, unsigned int bytes) { return false; @@ -692,11 +695,9 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +static inline bool bio_check_ro(struct bio *bio) { - const int op = bio_op(bio); - - if (part->policy && op_is_write(op)) { + if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) @@ -704,7 +705,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) WARN_ONCE(1, "Trying to write to read-only block-device %s (partno %d)\n", - bio_devname(bio, b), part->partno); + bio_devname(bio, b), bio->bi_bdev->bd_partno); /* Older lvm-tools actually trigger this */ return false; } @@ -714,7 +715,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -725,8 +726,9 @@ ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); * This may well happen - the kernel calls bread() without checking the size of * the device, e.g., when mounting a file system. */ -static inline int bio_check_eod(struct bio *bio, sector_t maxsector) +static inline int bio_check_eod(struct bio *bio) { + sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); if (nr_sectors && maxsector && @@ -741,32 +743,20 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) /* * Remap block n of partition p to block n+start(p) of the disk. 
*/ -static inline int blk_partition_remap(struct bio *bio) +static int blk_partition_remap(struct bio *bio) { - struct hd_struct *p; - int ret = -EIO; + struct block_device *p = bio->bi_bdev; - rcu_read_lock(); - p = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (unlikely(!p)) - goto out; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) - goto out; - if (unlikely(bio_check_ro(bio, p))) - goto out; - + return -EIO; if (bio_sectors(bio)) { - if (bio_check_eod(bio, part_nr_sects_read(p))) - goto out; - bio->bi_iter.bi_sector += p->start_sect; - trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); + bio->bi_iter.bi_sector += p->bd_start_sect; + trace_block_bio_remap(bio, p->bd_dev, + bio->bi_iter.bi_sector - + p->bd_start_sect); } - bio->bi_partno = 0; - ret = 0; -out: - rcu_read_unlock(); - return ret; + bio_set_flag(bio, BIO_REMAPPED); + return 0; } /* @@ -806,7 +796,8 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, static noinline_for_stack bool submit_bio_checks(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct block_device *bdev = bio->bi_bdev; + struct request_queue *q = bdev->bd_disk->queue; blk_status_t status = BLK_STS_IOERR; struct blk_plug *plug; @@ -825,14 +816,12 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (should_fail_bio(bio)) goto end_io; - - if (bio->bi_partno) { - if (unlikely(blk_partition_remap(bio))) + if (unlikely(bio_check_ro(bio))) + goto end_io; + if (!bio_flagged(bio, BIO_REMAPPED)) { + if (unlikely(bio_check_eod(bio))) goto end_io; - } else { - if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) - goto end_io; - if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) + if (bdev->bd_partno && unlikely(blk_partition_remap(bio))) goto end_io; } @@ -850,7 +839,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) } if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - bio->bi_opf &= ~REQ_HIPRI; + bio_clear_hipri(bio); switch (bio_op(bio)) { case REQ_OP_DISCARD: @@ -907,7 +896,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_queue(q, bio); + trace_block_bio_queue(bio); /* Now that enqueuing has been traced, we need to trace * completion as well. */ @@ -925,14 +914,21 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) static blk_qc_t __submit_bio(struct bio *bio) { - struct gendisk *disk = bio->bi_disk; + struct gendisk *disk = bio->bi_bdev->bd_disk; blk_qc_t ret = BLK_QC_T_NONE; - if (blk_crypto_bio_prep(&bio)) { - if (!disk->fops->submit_bio) - return blk_mq_submit_bio(bio); + if (unlikely(bio_queue_enter(bio) != 0)) + return BLK_QC_T_NONE; + + if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio)) + goto queue_exit; + if (disk->fops->submit_bio) { ret = disk->fops->submit_bio(bio); + goto queue_exit; } + return blk_mq_submit_bio(bio); + +queue_exit: blk_queue_exit(disk->queue); return ret; } @@ -967,12 +963,9 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) current->bio_list = bio_list_on_stack; do { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct bio_list lower, same; - if (unlikely(bio_queue_enter(bio) != 0)) - continue; - /* * Create a fresh bio_list for all subordinate requests. 
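One consequence of the new flag (a hedged sketch, not something this patch adds): a submitter that already works in whole-disk sectors can mark the bio itself and skip the partition/EOD re-checks. Note that bio_set_dev() must come first, since changing the device clears BIO_REMAPPED:

	bio_set_dev(bio, whole_bdev);		/* whole-disk block_device */
	bio->bi_iter.bi_sector = abs_sector;	/* already disk-absolute */
	bio_set_flag(bio, BIO_REMAPPED);
	submit_bio(bio);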
*/ @@ -988,7 +981,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bio->bi_disk->queue) + if (q == bio->bi_bdev->bd_disk->queue) bio_list_add(&same, bio); else bio_list_add(&lower, bio); @@ -1008,23 +1001,12 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; - blk_qc_t ret = BLK_QC_T_NONE; + blk_qc_t ret; current->bio_list = bio_list; do { - struct gendisk *disk = bio->bi_disk; - - if (unlikely(bio_queue_enter(bio) != 0)) - continue; - - if (!blk_crypto_bio_prep(&bio)) { - blk_queue_exit(disk->queue); - ret = BLK_QC_T_NONE; - continue; - } - - ret = blk_mq_submit_bio(bio); + ret = __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; @@ -1042,9 +1024,6 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) */ blk_qc_t submit_bio_noacct(struct bio *bio) { - if (!submit_bio_checks(bio)) - return BLK_QC_T_NONE; - /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list @@ -1056,7 +1035,7 @@ blk_qc_t submit_bio_noacct(struct bio *bio) return BLK_QC_T_NONE; } - if (!bio->bi_disk->fops->submit_bio) + if (!bio->bi_bdev->bd_disk->fops->submit_bio) return __submit_bio_noacct_mq(bio); return __submit_bio_noacct(bio); } @@ -1068,7 +1047,7 @@ EXPORT_SYMBOL(submit_bio_noacct); * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The - * bio will be send to the device described by the bi_disk and bi_partno fields. + * bio will be send to the device described by the bi_bdev field. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback @@ -1088,7 +1067,8 @@ blk_qc_t submit_bio(struct bio *bio) unsigned int count; if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - count = queue_logical_block_size(bio->bi_disk->queue) >> 9; + count = queue_logical_block_size( + bio->bi_bdev->bd_disk->queue) >> 9; else count = bio_sectors(bio); @@ -1098,15 +1078,6 @@ blk_qc_t submit_bio(struct bio *bio) task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } - - if (unlikely(block_dump)) { - char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", - current->comm, task_pid_nr(current), - op_is_write(bio_op(bio)) ? "WRITE" : "READ", - (unsigned long long)bio->bi_iter.bi_sector, - bio_devname(bio, b), count); - } } /* @@ -1173,10 +1144,8 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, } /* - * queue's settings related to segment counting like q->bounce_pfn - * may differ from that of other stacking queues. - * Recalculate it to check the request correctly on this queue's - * limitation. + * The queue settings related to segment counting may differ from the + * original queue. 
*/ rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > queue_max_segments(q)) { @@ -1202,7 +1171,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return ret; if (rq->rq_disk && - should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) + should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -1261,17 +1230,18 @@ unsigned int blk_rq_err_bytes(const struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) +static void update_io_ticks(struct block_device *part, unsigned long now, + bool end) { unsigned long stamp; again: - stamp = READ_ONCE(part->stamp); - if (unlikely(stamp != now)) { - if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) + stamp = READ_ONCE(part->bd_stamp); + if (unlikely(time_after(now, stamp))) { + if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ? now - stamp : 1); } - if (part->partno) { - part = &part_to_disk(part)->part0; + if (part->bd_partno) { + part = bdev_whole(part); goto again; } } @@ -1280,11 +1250,9 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->part && blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; part_stat_lock(); - part = req->part; - part_stat_add(part, sectors[sgrp], bytes >> 9); + part_stat_add(req->part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -1299,17 +1267,12 @@ void blk_account_io_done(struct request *req, u64 now) if (req->part && blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; part_stat_lock(); - part = req->part; - - update_io_ticks(part, jiffies, true); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); - - hd_struct_put(part); } } @@ -1318,14 +1281,18 @@ void blk_account_io_start(struct request *rq) if (!blk_do_io_stat(rq)) return; - rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + /* passthrough requests can hold bios that do not have ->bi_bdev set */ + if (rq->bio && rq->bio->bi_bdev) + rq->part = rq->bio->bi_bdev; + else + rq->part = rq->rq_disk->part0; part_stat_lock(); update_io_ticks(rq->part, jiffies, false); part_stat_unlock(); } -static unsigned long __part_start_io_acct(struct hd_struct *part, +static unsigned long __part_start_io_acct(struct block_device *part, unsigned int sectors, unsigned int op) { const int sgrp = op_stat_group(op); @@ -1341,23 +1308,26 @@ static unsigned long __part_start_io_acct(struct hd_struct *part, return now; } -unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, - struct bio *bio) +/** + * bio_start_io_acct - start I/O accounting for bio based drivers + * @bio: bio to start account for + * + * Returns the start time that should be passed back to bio_end_io_acct(). 
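Why time_after() rather than the old inequality test in update_io_ticks(): it stays correct across jiffies wrap, so a stale stamp can never charge a bogus interval. A self-contained illustration (the values are arbitrary):

	unsigned long stamp = ULONG_MAX - 1;	/* just before wrap */
	unsigned long now = 2;			/* just after wrap */

	/* time_after(a, b) == ((long)((b) - (a)) < 0): wrap-safe ordering */
	WARN_ON(!time_after(now, stamp));	/* "now" still after "stamp" */

The accounting pair that follows keeps the same shape for remapping drivers: bio_start_io_acct() returns a start time, and bio_end_io_acct_remapped() charges it back to the bdev the bio was originally submitted to.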
+ */ +unsigned long bio_start_io_acct(struct bio *bio) { - *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); - - return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio)); + return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio)); } -EXPORT_SYMBOL_GPL(part_start_io_acct); +EXPORT_SYMBOL_GPL(bio_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { - return __part_start_io_acct(&disk->part0, sectors, op); + return __part_start_io_acct(disk->part0, sectors, op); } EXPORT_SYMBOL(disk_start_io_acct); -static void __part_end_io_acct(struct hd_struct *part, unsigned int op, +static void __part_end_io_acct(struct block_device *part, unsigned int op, unsigned long start_time) { const int sgrp = op_stat_group(op); @@ -1371,18 +1341,17 @@ static void __part_end_io_acct(struct hd_struct *part, unsigned int op, part_stat_unlock(); } -void part_end_io_acct(struct hd_struct *part, struct bio *bio, - unsigned long start_time) +void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, + struct block_device *orig_bdev) { - __part_end_io_acct(part, bio_op(bio), start_time); - hd_struct_put(part); + __part_end_io_acct(orig_bdev, bio_op(bio), start_time); } -EXPORT_SYMBOL_GPL(part_end_io_acct); +EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { - __part_end_io_acct(&disk->part0, op, start_time); + __part_end_io_acct(disk->part0, op, start_time); } EXPORT_SYMBOL(disk_end_io_acct); @@ -1408,26 +1377,22 @@ void blk_steal_bios(struct bio_list *list, struct request *rq) EXPORT_SYMBOL_GPL(blk_steal_bios); /** - * blk_update_request - Special helper function for request stacking drivers + * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed * @error: block status code - * @nr_bytes: number of bytes to complete @req + * @nr_bytes: number of bytes to complete for @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * - * This special helper function is only for request stacking drivers - * (e.g. request-based dm) so that they can handle partial completion. - * Actual device drivers should use blk_mq_end_request instead. - * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: - * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both - * blk_rq_bytes() and in blk_update_request(). + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function + * except in the consistency check at the end of this function. * * Return: * %false - this request doesn't have any more data diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c162b754ef..c322176a1e 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -80,6 +80,7 @@ static struct blk_crypto_keyslot { static struct blk_keyslot_manager blk_crypto_ksm; static struct workqueue_struct *blk_crypto_wq; static mempool_t *blk_crypto_bounce_page_pool; +static struct bio_set crypto_bio_split; /* * This is the key we set when evicting a keyslot. 
This *should* be the all 0's @@ -164,10 +165,12 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) struct bio_vec bv; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL); + bio = bio_kmalloc(GFP_NOIO, bio_segments(bio_src)); if (!bio) return NULL; - bio->bi_disk = bio_src->bi_disk; + bio->bi_bdev = bio_src->bi_bdev; + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; @@ -216,13 +219,14 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) bio_for_each_segment(bv, bio, iter) { num_sectors += bv.bv_len >> SECTOR_SHIFT; - if (++i == BIO_MAX_PAGES) + if (++i == BIO_MAX_VECS) break; } if (num_sectors < bio_sectors(bio)) { struct bio *split_bio; - split_bio = bio_split(bio, num_sectors, GFP_NOIO, NULL); + split_bio = bio_split(bio, num_sectors, GFP_NOIO, + &crypto_bio_split); if (!split_bio) { bio->bi_status = BLK_STS_RESOURCE; return false; @@ -536,9 +540,13 @@ static int blk_crypto_fallback_init(void) prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); - err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + err = bioset_init(&crypto_bio_split, 64, 0, 0); if (err) goto out; + + err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + if (err) + goto fail_free_bioset; err = -ENOMEM; blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; @@ -589,6 +597,8 @@ static int blk_crypto_fallback_init(void) destroy_workqueue(blk_crypto_wq); fail_free_ksm: blk_ksm_destroy(&blk_crypto_ksm); +fail_free_bioset: + bioset_exit(&crypto_bio_split); out: return err; } diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 5da43f0973..103c2e2d50 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -280,7 +280,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. 
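The dedicated bio_set for splitting is a forward-progress measure: bio_split() with a NULL set falls back to plain allocation, which can fail or stall under memory pressure on the writeback path, whereas a mempool-backed set always holds reserved bios. The shape, as wired up above (hedged):

	/* once, at fallback init: 64 reserved bios, no front pad, no flags */
	err = bioset_init(&crypto_bio_split, 64, 0, 0);

	/* per oversized bio: split out what fits in BIO_MAX_VECS segments */
	split_bio = bio_split(bio, num_sectors, GFP_NOIO, &crypto_bio_split);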
*/ - if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, + if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm, &bc_key->crypto_cfg)) return true; @@ -332,7 +332,7 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, if (mode->keysize == 0) return -EINVAL; - if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE) + if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; if (!is_power_of_2(data_unit_size)) @@ -409,3 +409,4 @@ int blk_crypto_evict_key(struct request_queue *q, */ return blk_crypto_fallback_evict_key(key); } +EXPORT_SYMBOL_GPL(blk_crypto_evict_key); diff --git a/block/blk-exec.c b/block/blk-exec.c index 85324d53d0..d6cd501c0d 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -21,7 +21,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) { struct completion *waiting = rq->end_io_data; - rq->end_io_data = NULL; + rq->end_io_data = (void *)(uintptr_t)error; /* * complete last, if this is a stack request the process (and thus @@ -31,8 +31,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) } /** - * blk_execute_rq_nowait - insert a request into queue for execution - * @q: queue to insert the request in + * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue @@ -45,9 +44,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) * Note: * This function will invoke @done directly if the queue is dead. */ -void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head, - rq_end_io_fn *done) +void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, + int at_head, rq_end_io_fn *done) { WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); @@ -65,9 +63,21 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); +static bool blk_rq_is_poll(struct request *rq) +{ + return rq->mq_hctx && rq->mq_hctx->type == HCTX_TYPE_POLL; +} + +static void blk_rq_poll_completion(struct request *rq, struct completion *wait) +{ + do { + blk_poll(rq->q, request_to_qc_t(rq->mq_hctx, rq), true); + cond_resched(); + } while (!completion_done(wait)); +} + /** * blk_execute_rq - insert a request into queue for execution - * @q: queue to insert the request in * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue @@ -75,21 +85,26 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. + * Return: The blk_status_t result provided to blk_mq_end_request(). 
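Caller-side view of the new blk_execute_rq() contract (a hedged sketch; the queue and disk variables are whatever the caller already has): the status a driver handed to blk_mq_end_request() now comes back directly, and requests on a polled hctx are reaped by polling rather than sleeping:

	struct request *rq;
	blk_status_t status;

	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	/* set up the passthrough payload here */
	status = blk_execute_rq(disk, rq, 0 /* at tail */);
	blk_put_request(rq);
	return blk_status_to_errno(status);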
*/ -void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head) +blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); unsigned long hang_check; rq->end_io_data = &wait; - blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); + blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq); /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; - if (hang_check) + + if (blk_rq_is_poll(rq)) + blk_rq_poll_completion(rq, &wait); + else if (hang_check) while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); else wait_for_completion_io(&wait); + + return (blk_status_t)(uintptr_t)rq->end_io_data; } EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-flush.c b/block/blk-flush.c index 7ee7e5e890..4201728bf3 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,7 +69,6 @@ #include #include #include -#include #include "blk.h" #include "blk-mq.h" @@ -139,7 +138,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct hd_struct *part = &rq->rq_disk->part0; + struct block_device *part = rq->rq_disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); @@ -263,6 +262,11 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) spin_unlock_irqrestore(&fq->mq_flush_lock, flags); } +bool is_flush_rq(struct request *rq) +{ + return rq->end_io == flush_end_io; +} + /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked @@ -330,6 +334,14 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; + /* + * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one + * implied in refcount_inc_not_zero() called from + * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref + * and READ flush_rq->end_io + */ + smp_wmb(); + refcount_set(&flush_rq->ref, 1); blk_flush_queue_rq(flush_rq, false); } @@ -432,23 +444,18 @@ void blk_insert_flush(struct request *rq) /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for - * @gfp_mask: memory allocation flags (for bio_alloc) * * Description: * Issue a flush for the block device in question. */ -int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) +int blkdev_issue_flush(struct block_device *bdev) { - struct bio *bio; - int ret = 0; + struct bio bio; - bio = bio_alloc(gfp_mask, 0); - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - - ret = submit_bio_wait(bio); - bio_put(bio); - return ret; + bio_init(&bio, NULL, 0); + bio_set_dev(&bio, bdev); + bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + return submit_bio_wait(&bio); } EXPORT_SYMBOL(blkdev_issue_flush); @@ -473,9 +480,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, INIT_LIST_HEAD(&fq->flush_queue[1]); INIT_LIST_HEAD(&fq->flush_data_in_flight); - lockdep_register_key(&fq->key); - lockdep_set_class(&fq->mq_flush_lock, &fq->key); - return fq; fail_rq: @@ -490,7 +494,31 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) if (!fq) return; - lockdep_unregister_key(&fq->key); kfree(fq->flush_rq); kfree(fq); } + +/* + * Allow driver to set its own lock class to fq->mq_flush_lock for + * avoiding lockdep complaint. 
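The on-stack bio in blkdev_issue_flush() removes both the allocation failure case and the gfp_mask parameter. The same idiom works for any small synchronous I/O; a hedged sketch for a one-page read, with bdev, sector, and page supplied by the caller:

	struct bio bio;
	struct bio_vec bvec;
	int ret;

	bio_init(&bio, &bvec, 1);	/* stack bio with one inline bvec */
	bio_set_dev(&bio, bdev);
	bio.bi_opf = REQ_OP_READ;
	bio.bi_iter.bi_sector = sector;
	bio_add_page(&bio, page, PAGE_SIZE, 0);
	ret = submit_bio_wait(&bio);	/* no bio_put(): not heap-allocated */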
+ * + * flush_end_io() may be called recursively from some driver, such as + * nvme-loop, so lockdep may complain 'possible recursive locking' because + * all 'struct blk_flush_queue' instances share the same mq_flush_lock lock + * class key. We need to assign a different lock class to such drivers' + * fq->mq_flush_lock to avoid the lockdep warning. + * + * Using a dynamically allocated lock class key for each 'blk_flush_queue' + * instance would be overkill; worse, it introduces a horrible boot delay + * because synchronize_rcu() is implied in lockdep_unregister_key(), which + * is called for each hctx release. SCSI probing may synchronously create and + * destroy lots of MQ request_queues for non-existent devices, and some robot + * test kernels always enable the lockdep option. More than half an hour has + * been observed during SCSI MQ probing with per-fq lock classes. + */ +void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, + struct lock_class_key *key) +{ + lockdep_set_class(&hctx->fq->mq_flush_lock, key); +} +EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 410da060d1..16d5d53383 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -426,18 +426,27 @@ EXPORT_SYMBOL(blk_integrity_register); */ void blk_integrity_unregister(struct gendisk *disk) { + struct blk_integrity *bi = &disk->queue->integrity; + + if (!bi->profile) + return; + + /* ensure all bios are off the integrity workqueue */ + blk_flush_integrity(); blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); - memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); + memset(bi, 0, sizeof(*bi)); } EXPORT_SYMBOL(blk_integrity_unregister); -void blk_integrity_add(struct gendisk *disk) +int blk_integrity_add(struct gendisk *disk) { - if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, - &disk_to_dev(disk)->kobj, "%s", "integrity")) - return; + int ret; - kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); + ret = kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, + &disk_to_dev(disk)->kobj, "%s", "integrity"); + if (!ret) + kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); + return ret; } void blk_integrity_del(struct gendisk *disk) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index e95b93f72b..b3880e4ba2 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -39,7 +39,7 @@ * On top of that, a size cost proportional to the length of the IO is * added. While simple, this model captures the operational * characteristics of a wide varienty of devices well enough. Default - * paramters for several different classes of devices are provided and the + * parameters for several different classes of devices are provided and the * parameters can be configured from userspace via * /sys/fs/cgroup/io.cost.model. * @@ -77,7 +77,7 @@ * * This constitutes the basis of IO capacity distribution. Each cgroup's * vtime is running at a rate determined by its hweight. A cgroup tracks - * the vtime consumed by past IOs and can issue a new IO iff doing so + * the vtime consumed by past IOs and can issue a new IO if doing so * wouldn't outrun the current device vtime. Otherwise, the IO is * suspended until the vtime has progressed enough to cover it. * @@ -155,7 +155,7 @@ * Instead of debugfs or other clumsy monitoring mechanisms, this * controller uses a drgn based monitoring script - * tools/cgroup/iocost_monitor.py. For details on drgn, please see - * https://github.com/osandov/drgn.
The ouput looks like the following. + * https://github.com/osandov/drgn. The output looks like the following. * * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% * active weight hweight% inflt% dbt delay usages% @@ -370,8 +370,6 @@ enum { AUTOP_SSD_FAST, }; -struct ioc_gq; - struct ioc_params { u32 qos[NR_QOS_PARAMS]; u64 i_lcoefs[NR_I_LCOEFS]; @@ -492,7 +490,7 @@ struct ioc_gq { /* * `vtime` is this iocg's vtime cursor which progresses as IOs are * issued. If lagging behind device vtime, the delta represents - * the currently available IO budget. If runnning ahead, the + * the currently available IO budget. If running ahead, the * overage. * * `vtime_done` is the same but progressed on completion rather @@ -973,6 +971,54 @@ static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); } +static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, + int nr_lagging, int nr_shortages, + int prev_busy_level, u32 *missed_ppm) +{ + u64 vrate = ioc->vtime_base_rate; + u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; + + if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { + if (ioc->busy_level != prev_busy_level || nr_lagging) + trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + return; + } + + /* rq_wait signal is always reliable, ignore user vrate_min */ + if (rq_wait_pct > RQ_WAIT_BUSY_PCT) + vrate_min = VRATE_MIN; + + /* + * If vrate is out of bounds, apply clamp gradually as the + * bounds can change abruptly. Otherwise, apply busy_level + * based adjustment. + */ + if (vrate < vrate_min) { + vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100); + vrate = min(vrate, vrate_min); + } else if (vrate > vrate_max) { + vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); + vrate = max(vrate, vrate_max); + } else { + int idx = min_t(int, abs(ioc->busy_level), + ARRAY_SIZE(vrate_adj_pct) - 1); + u32 adj_pct = vrate_adj_pct[idx]; + + if (ioc->busy_level > 0) + adj_pct = 100 - adj_pct; + else + adj_pct = 100 + adj_pct; + + vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), + vrate_min, vrate_max); + } + + trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + ioc->vtime_base_rate = vrate; + ioc_refresh_margins(ioc); +} + /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { @@ -1056,7 +1102,7 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, /* * The delta between inuse and active sums indicates that - * that much of weight is being given away. Parent's inuse + * much of weight is being given away. Parent's inuse * and active should reflect the ratio. */ if (parent->child_active_sum) { @@ -2082,13 +2128,91 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, } } +/* + * Check the active iocgs' state to avoid oversleeping and deactivate + * idle iocgs. + * + * Since waiters determine the sleep durations based on the vrate + * they saw at the time of sleep, if vrate has increased, some + * waiters could be sleeping for too long. Wake up tardy waiters + * which should have woken up in the last period and expire idle + * iocgs.
+ */ +static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) +{ + int nr_debtors = 0; + struct ioc_gq *iocg, *tiocg; + + list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && + !iocg->delay && !iocg_is_idle(iocg)) + continue; + + spin_lock(&iocg->waitq.lock); + + /* flush wait and indebt stat deltas */ + if (iocg->wait_since) { + iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->wait_since = now->now; + } + if (iocg->indebt_since) { + iocg->local_stat.indebt_us += + now->now - iocg->indebt_since; + iocg->indebt_since = now->now; + } + if (iocg->indelay_since) { + iocg->local_stat.indelay_us += + now->now - iocg->indelay_since; + iocg->indelay_since = now->now; + } + + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || + iocg->delay) { + /* might be oversleeping vtime / hweight changes, kick */ + iocg_kick_waitq(iocg, true, now); + if (iocg->abs_vdebt || iocg->delay) + nr_debtors++; + } else if (iocg_is_idle(iocg)) { + /* no waiter and idle, deactivate */ + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess; + + /* + * @iocg has been inactive for a full duration and will + * have a high budget. Account anything above target as + * error and throw away. On reactivation, it'll start + * with the target budget. + */ + excess = now->vnow - vtime - ioc->margins.target; + if (excess > 0) { + u32 old_hwi; + + current_hweight(iocg, NULL, &old_hwi); + ioc->vtime_err -= div64_u64(excess * old_hwi, + WEIGHT_ONE); + } + + TRACE_IOCG_PATH(iocg_idle, iocg, now, + atomic64_read(&iocg->active_period), + atomic64_read(&ioc->cur_period), vtime); + __propagate_weights(iocg, 0, 0, false, now); + list_del_init(&iocg->active_list); + } + + spin_unlock(&iocg->waitq.lock); + } + + commit_weights(ioc); + return nr_debtors; +} + static void ioc_timer_fn(struct timer_list *timer) { struct ioc *ioc = container_of(timer, struct ioc, timer); struct ioc_gq *iocg, *tiocg; struct ioc_now now; LIST_HEAD(surpluses); - int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0; + int nr_debtors, nr_shortages = 0, nr_lagging = 0; u64 usage_us_sum = 0; u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; @@ -2110,68 +2234,7 @@ static void ioc_timer_fn(struct timer_list *timer) return; } - /* - * Waiters determine the sleep durations based on the vrate they - * saw at the time of sleep. If vrate has increased, some waiters - * could be sleeping for too long. Wake up tardy waiters which - * should have woken up in the last period and expire idle iocgs. 
- */ - list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { - if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && - !iocg->delay && !iocg_is_idle(iocg)) - continue; - - spin_lock(&iocg->waitq.lock); - - /* flush wait and indebt stat deltas */ - if (iocg->wait_since) { - iocg->local_stat.wait_us += now.now - iocg->wait_since; - iocg->wait_since = now.now; - } - if (iocg->indebt_since) { - iocg->local_stat.indebt_us += - now.now - iocg->indebt_since; - iocg->indebt_since = now.now; - } - if (iocg->indelay_since) { - iocg->local_stat.indelay_us += - now.now - iocg->indelay_since; - iocg->indelay_since = now.now; - } - - if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || - iocg->delay) { - /* might be oversleeping vtime / hweight changes, kick */ - iocg_kick_waitq(iocg, true, &now); - if (iocg->abs_vdebt || iocg->delay) - nr_debtors++; - } else if (iocg_is_idle(iocg)) { - /* no waiter and idle, deactivate */ - u64 vtime = atomic64_read(&iocg->vtime); - s64 excess; - - /* - * @iocg has been inactive for a full duration and will - * have a high budget. Account anything above target as - * error and throw away. On reactivation, it'll start - * with the target budget. - */ - excess = now.vnow - vtime - ioc->margins.target; - if (excess > 0) { - u32 old_hwi; - - current_hweight(iocg, NULL, &old_hwi); - ioc->vtime_err -= div64_u64(excess * old_hwi, - WEIGHT_ONE); - } - - __propagate_weights(iocg, 0, 0, false, &now); - list_del_init(&iocg->active_list); - } - - spin_unlock(&iocg->waitq.lock); - } - commit_weights(ioc); + nr_debtors = ioc_check_iocgs(ioc, &now); /* * Wait and indebt stat are flushed above and the donation calculation @@ -2181,8 +2244,8 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, usage_us, usage_dur; - u32 usage, hw_active, hw_inuse; + u64 vdone, vtime, usage_us; + u32 hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent @@ -2213,30 +2276,32 @@ static void ioc_timer_fn(struct timer_list *timer) usage_us = iocg->usage_delta_us; usage_us_sum += usage_us; - if (vdone != vtime) { - u64 inflight_us = DIV64_U64_ROUND_UP( - cost_to_abs_cost(vtime - vdone, hw_inuse), - ioc->vtime_base_rate); - usage_us = max(usage_us, inflight_us); - } - - /* convert to hweight based usage ratio */ - if (time_after64(iocg->activated_at, ioc->period_at)) - usage_dur = max_t(u64, now.now - iocg->activated_at, 1); - else - usage_dur = max_t(u64, now.now - ioc->period_at, 1); - - usage = clamp_t(u32, - DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, - usage_dur), - 1, WEIGHT_ONE); - /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { - u32 hwa, old_hwi, hwm, new_hwi; + u32 hwa, old_hwi, hwm, new_hwi, usage; + u64 usage_dur; + + if (vdone != vtime) { + u64 inflight_us = DIV64_U64_ROUND_UP( + cost_to_abs_cost(vtime - vdone, hw_inuse), + ioc->vtime_base_rate); + + usage_us = max(usage_us, inflight_us); + } + + /* convert to hweight based usage ratio */ + if (time_after64(iocg->activated_at, ioc->period_at)) + usage_dur = max_t(u64, now.now - iocg->activated_at, 1); + else + usage_dur = max_t(u64, now.now - ioc->period_at, 1); + + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, + usage_dur), + 1, WEIGHT_ONE); /* * Already donating or 
accumulated enough to start. @@ -2320,51 +2385,8 @@ static void ioc_timer_fn(struct timer_list *timer) ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); - if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { - u64 vrate = ioc->vtime_base_rate; - u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; - - /* rq_wait signal is always reliable, ignore user vrate_min */ - if (rq_wait_pct > RQ_WAIT_BUSY_PCT) - vrate_min = VRATE_MIN; - - /* - * If vrate is out of bounds, apply clamp gradually as the - * bounds can change abruptly. Otherwise, apply busy_level - * based adjustment. - */ - if (vrate < vrate_min) { - vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), - 100); - vrate = min(vrate, vrate_min); - } else if (vrate > vrate_max) { - vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), - 100); - vrate = max(vrate, vrate_max); - } else { - int idx = min_t(int, abs(ioc->busy_level), - ARRAY_SIZE(vrate_adj_pct) - 1); - u32 adj_pct = vrate_adj_pct[idx]; - - if (ioc->busy_level > 0) - adj_pct = 100 - adj_pct; - else - adj_pct = 100 + adj_pct; - - vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), - vrate_min, vrate_max); - } - - trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, - nr_lagging, nr_shortages); - - ioc->vtime_base_rate = vrate; - ioc_refresh_margins(ioc); - } else if (ioc->busy_level != prev_busy_level || nr_lagging) { - trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), - missed_ppm, rq_wait_pct, nr_lagging, - nr_shortages); - } + ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages, + prev_busy_level, missed_ppm); ioc_refresh_params(ioc, false); @@ -2411,7 +2433,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, return cost; /* - * We only increase inuse during period and do so iff the margin has + * We only increase inuse during period and do so if the margin has * deteriorated since the previous adjustment. 
*/ if (margin >= iocg->saved_margin || margin >= margins->low || @@ -2966,34 +2988,29 @@ static void ioc_pd_free(struct blkg_policy_data *pd) kfree(iocg); } -static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; - size_t pos = 0; if (!ioc->enabled) - return 0; + return false; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( ioc->vtime_base_rate * 10000, VTIME_PER_USEC); - pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", - vp10k / 100, vp10k % 100); + seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); } - pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", - iocg->last_stat.usage_us); + seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); if (blkcg_debug_stats) - pos += scnprintf(buf + pos, size - pos, - " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", - iocg->last_stat.wait_us, - iocg->last_stat.indebt_us, - iocg->last_stat.indelay_us); - - return pos; + seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", + iocg->last_stat.wait_us, + iocg->last_stat.indebt_us, + iocg->last_stat.indelay_us); + return true; } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, @@ -3137,23 +3154,23 @@ static const match_table_t qos_tokens = { static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3248,12 +3265,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } @@ -3304,23 +3321,23 @@ static const match_table_t i_lcoef_tokens = { static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3373,13 +3390,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } diff --git a/block/blk-iolatency.c 
b/block/blk-iolatency.c index d8b0d8bd13..c0545f9da5 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -890,8 +890,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v) return 0; } -static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, - size_t size) +static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) { struct latency_stat stat; int cpu; @@ -906,39 +905,40 @@ static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, preempt_enable(); if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " missed=%llu total=%llu depth=max", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total); - return scnprintf(buf, size, " missed=%llu total=%llu depth=%u", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total, - iolat->rq_depth.max_depth); + seq_printf(s, " missed=%llu total=%llu depth=max", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total); + else + seq_printf(s, " missed=%llu total=%llu depth=%u", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total, + iolat->rq_depth.max_depth); + return true; } -static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, - size_t size) +static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct iolatency_grp *iolat = pd_to_lat(pd); unsigned long long avg_lat; unsigned long long cur_win; if (!blkcg_debug_stats) - return 0; + return false; if (iolat->ssd) - return iolatency_ssd_stat(iolat, buf, size); + return iolatency_ssd_stat(iolat, s); avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", - avg_lat, cur_win); - - return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu", - iolat->rq_depth.max_depth, avg_lat, cur_win); + seq_printf(s, " depth=max avg_lat=%llu win=%llu", + avg_lat, cur_win); + else + seq_printf(s, " depth=%u avg_lat=%llu win=%llu", + iolat->rq_depth.max_depth, avg_lat, cur_win); + return true; } - static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg) diff --git a/block/blk-lib.c b/block/blk-lib.c index e90614fd8d..9f09beadcb 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -21,6 +21,7 @@ struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp) return new; } +EXPORT_SYMBOL_GPL(blk_next_bio); int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, int flags, @@ -65,7 +66,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, /* In case the discard request is in a partition */ if (bdev_is_partition(bdev)) - part_offset = bdev->bd_part->start_sect; + part_offset = bdev->bd_start_sect; while (nr_sects) { sector_t granularity_aligned_lba, req_sects; @@ -296,7 +297,7 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) { sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512); - return min(pages, (sector_t)BIO_MAX_PAGES); + return min(pages, (sector_t)BIO_MAX_VECS); } static int __blkdev_issue_zero_pages(struct block_device *bdev, diff --git a/block/blk-map.c b/block/blk-map.c index 21630dccac..4526adde01 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -123,7 +123,6 @@ static int bio_uncopy_user(struct bio *bio) bio_free_pages(bio); } kfree(bmd); - bio_put(bio); return ret; } 
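As a usage sketch for the blk-map.c conversion in this file (not part of the patch): with bouncing now handled inside the mapping helpers, a passthrough submitter maps the user buffer, saves the mapped bio, executes the request, and unmaps in process context, consuming blk_execute_rq()'s new blk_status_t return. submit_user_cmd() below is a hypothetical caller modeled on the sg/scsi_ioctl pattern; its name, parameters, and error handling are illustrative only.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Hypothetical passthrough submitter, sketched against the post-series API. */
static int submit_user_cmd(struct gendisk *disk, void __user *ubuf,
			   unsigned long len)
{
	struct request_queue *q = disk->queue;
	struct request *rq;
	struct bio *bio;
	blk_status_t status;
	int ret;

	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* Maps the user pages, or copies when alignment or bouncing demands it. */
	ret = blk_rq_map_user(q, rq, NULL, ubuf, len, GFP_KERNEL);
	if (ret)
		goto out_free;
	bio = rq->bio;	/* save: rq->bio is advanced as the request completes */

	status = blk_execute_rq(disk, rq, 0);

	/* Pairs with blk_rq_map_user(); must still run in process context. */
	ret = blk_rq_unmap_user(bio);
	if (!ret)
		ret = blk_status_to_errno(status);
out_free:
	blk_mq_free_request(rq);
	return ret;
}

Saving rq->bio before execution matters because the block layer consumes rq->bio on completion; blk_rq_unmap_user() must be handed the original head of the bio chain.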
@@ -132,7 +131,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, { struct bio_map_data *bmd; struct page *page; - struct bio *bio, *bounce_bio; + struct bio *bio; int i = 0, ret; int nr_pages; unsigned int len = iter->count; @@ -150,9 +149,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bmd->is_our_pages = !map_data; bmd->is_null_mapped = (map_data && map_data->null_mapped); - nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - if (nr_pages > BIO_MAX_PAGES) - nr_pages = BIO_MAX_PAGES; + nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE)); ret = -ENOMEM; bio = bio_kmalloc(gfp_mask, nr_pages); @@ -183,7 +180,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, i++; } else { - page = alloc_page(rq->q->bounce_gfp | gfp_mask); + page = alloc_page(GFP_NOIO | gfp_mask); if (!page) { ret = -ENOMEM; goto cleanup; @@ -220,16 +217,9 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bio->bi_private = bmd; - bounce_bio = bio; - ret = blk_rq_append_bio(rq, &bounce_bio); + ret = blk_rq_append_bio(rq, bio); if (ret) goto cleanup; - - /* - * We link the bounce buffer in and could have to traverse it later, so - * we have to get a ref to prevent it from being freed - */ - bio_get(bounce_bio); return 0; cleanup: if (!map_data) @@ -244,14 +234,14 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { unsigned int max_sectors = queue_max_hw_sectors(rq->q); - struct bio *bio, *bounce_bio; + struct bio *bio; int ret; int j; if (!iov_iter_count(iter)) return -EINVAL; - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); + bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_VECS)); if (!bio) return -ENOMEM; bio->bi_opf |= req_op(rq); @@ -306,52 +296,20 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, break; } - /* - * Subtle: if we end up needing to bounce a bio, it would normally - * disappear when its bi_end_io is run. However, we need the original - * bio for the unmap, so grab an extra reference to it - */ - bio_get(bio); - - bounce_bio = bio; - ret = blk_rq_append_bio(rq, &bounce_bio); + ret = blk_rq_append_bio(rq, bio); if (ret) - goto out_put_orig; - - /* - * We link the bounce buffer in and could have to traverse it - * later, so we have to get a ref to prevent it from being freed - */ - bio_get(bounce_bio); + goto out_unmap; return 0; - out_put_orig: - bio_put(bio); out_unmap: bio_release_pages(bio, false); bio_put(bio); return ret; } -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from - * process context. - * - * bio_unmap_user() may sleep. 
- */ -static void bio_unmap_user(struct bio *bio) -{ - bio_release_pages(bio, bio_data_dir(bio) == READ); - bio_put(bio); - bio_put(bio); -} - static void bio_invalidate_vmalloc_pages(struct bio *bio) { -#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +#ifdef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE if (bio->bi_private && !op_is_write(bio_op(bio))) { unsigned long i, len = 0; @@ -442,7 +400,7 @@ static void bio_copy_kern_endio_read(struct bio *bio) struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { - memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + memcpy_from_bvec(p, bvec); p += bvec->bv_len; } @@ -488,7 +446,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, if (bytes > len) bytes = len; - page = alloc_page(q->bounce_gfp | gfp_mask); + page = alloc_page(GFP_NOIO | gfp_mask); if (!page) goto cleanup; @@ -521,33 +479,24 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, * Append a bio to a passthrough request. Only works if the bio can be merged * into the request based on the driver constraints. */ -int blk_rq_append_bio(struct request *rq, struct bio **bio) +int blk_rq_append_bio(struct request *rq, struct bio *bio) { - struct bio *orig_bio = *bio; struct bvec_iter iter; struct bio_vec bv; unsigned int nr_segs = 0; - blk_queue_bounce(rq->q, bio); - - bio_for_each_bvec(bv, *bio, iter) + bio_for_each_bvec(bv, bio, iter) nr_segs++; if (!rq->bio) { - blk_rq_bio_prep(rq, *bio, nr_segs); + blk_rq_bio_prep(rq, bio, nr_segs); } else { - if (!ll_back_merge_fn(rq, *bio, nr_segs)) { - if (orig_bio != *bio) { - bio_put(*bio); - *bio = orig_bio; - } + if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; - } - - rq->biotail->bi_next = *bio; - rq->biotail = *bio; - rq->__data_len += (*bio)->bi_iter.bi_size; - bio_crypt_free_ctx(*bio); + rq->biotail->bi_next = bio; + rq->biotail = bio; + rq->__data_len += (bio)->bi_iter.bi_size; + bio_crypt_free_ctx(bio); } return 0; @@ -568,12 +517,6 @@ EXPORT_SYMBOL(blk_rq_append_bio); * * A matching blk_rq_unmap_user() must be issued at the end of I/O, while * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. 
*/ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, @@ -590,6 +533,8 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (map_data) copy = true; + else if (blk_queue_may_bounce(q)) + copy = true; else if (iov_iter_alignment(iter) & align) copy = true; else if (queue_virt_boundary(q)) @@ -643,25 +588,21 @@ EXPORT_SYMBOL(blk_rq_map_user); */ int blk_rq_unmap_user(struct bio *bio) { - struct bio *mapped_bio; + struct bio *next_bio; int ret = 0, ret2; while (bio) { - mapped_bio = bio; - if (unlikely(bio_flagged(bio, BIO_BOUNCED))) - mapped_bio = bio->bi_private; - if (bio->bi_private) { - ret2 = bio_uncopy_user(mapped_bio); + ret2 = bio_uncopy_user(bio); if (ret2 && !ret) ret = ret2; } else { - bio_unmap_user(mapped_bio); + bio_release_pages(bio, bio_data_dir(bio) == READ); } - mapped_bio = bio; + next_bio = bio; bio = bio->bi_next; - bio_put(mapped_bio); + bio_put(next_bio); } return ret; @@ -686,7 +627,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, { int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; - struct bio *bio, *orig_bio; + struct bio *bio; int ret; if (len > (queue_max_hw_sectors(q) << 9)) @@ -694,7 +635,8 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (!len || !kbuf) return -EINVAL; - if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf)) + if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) || + blk_queue_may_bounce(q)) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else bio = bio_map_kern(q, kbuf, len, gfp_mask); @@ -705,14 +647,9 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= req_op(rq); - orig_bio = bio; - ret = blk_rq_append_bio(rq, &bio); - if (unlikely(ret)) { - /* request is too big */ - bio_put(orig_bio); - return ret; - } - - return 0; + ret = blk_rq_append_bio(rq, bio); + if (unlikely(ret)) + bio_put(bio); + return ret; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-merge.c b/block/blk-merge.c index 349cd7d3af..7a5c81c02c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -279,6 +279,14 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, return NULL; split: *segs = nsegs; + + /* + * Bio splitting may cause subtle trouble such as a hang when doing sync + * iopoll in the direct IO routine. Given that the performance gain of + * iopoll for big IO can be trivial, disable iopoll when a split is + * needed. + */ + bio_clear_hipri(bio); + return bio_split(bio, sectors, GFP_NOIO, bs); } @@ -290,14 +298,13 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, * Split a bio into two bios, chain the two bios, submit the second half and * store a pointer to the first half in *@bio. If the second bio is still too * big it will be split by a recursive call to this function. Since this - * function may allocate a new bio from @bio->bi_disk->queue->bio_split, it is - * the responsibility of the caller to ensure that - * @bio->bi_disk->queue->bio_split is only released after processing of the - * split bio has finished. + * function may allocate a new bio from q->bio_split, it is the responsibility + * of the caller to ensure that q->bio_split is only released after processing + * of the split bio has finished.
*/ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) { - struct request_queue *q = (*bio)->bi_disk->queue; + struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue; struct bio *split = NULL; switch (bio_op(*bio)) { @@ -338,9 +345,11 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) split->bi_opf |= REQ_NOMERGE; bio_chain(split, *bio); - trace_block_split(q, split, (*bio)->bi_iter.bi_sector); + trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); *bio = split; + + blk_throtl_charge_bio_split(*bio); } } @@ -350,9 +359,9 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) * * Split a bio into two bios, chains the two bios, submit the second half and * store a pointer to the first half in *@bio. Since this function may allocate - * a new bio from @bio->bi_disk->queue->bio_split, it is the responsibility of - * the caller to ensure that @bio->bi_disk->queue->bio_split is only released - * after processing of the split bio has finished. + * a new bio from q->bio_split, it is the responsibility of the caller to ensure + * that q->bio_split is only released after processing of the split bio has + * finished. */ void blk_queue_split(struct bio **bio) { @@ -695,27 +704,9 @@ static void blk_account_io_merge_request(struct request *req) part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); part_stat_unlock(); - - hd_struct_put(req->part); } } -/* - * Two cases of handling DISCARD merge: - * If max_discard_segments > 1, the driver takes every bio - * as a range and send them to controller together. The ranges - * needn't to be contiguous. - * Otherwise, the bios/requests will be handled as same as - * others which should be contiguous. - */ -static inline bool blk_discard_mergable(struct request *req) -{ - if (req_op(req) == REQ_OP_DISCARD && - queue_max_discard_segments(req->q) > 1) - return true; - return false; -} - static enum elv_merge blk_try_req_merge(struct request *req, struct request *next) { @@ -813,7 +804,7 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge_request(next); - trace_block_rq_merge(q, next); + trace_block_rq_merge(next); /* * ownership of bio passed from next to req, return 'next' for @@ -845,18 +836,15 @@ static struct request *attempt_front_merge(struct request_queue *q, return NULL; } -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, - struct request *next) +/* + * Try to merge 'next' into 'rq'. Return true if the merge happened, false + * otherwise. The caller is responsible for freeing 'next' if the merge + * happened. 
+ */ +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) { - struct request *free; - - free = attempt_merge(q, rq, next); - if (free) { - blk_put_request(free); - return 1; - } - - return 0; + return attempt_merge(q, rq, next); } bool blk_rq_merge_ok(struct request *rq, struct bio *bio) @@ -872,7 +860,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* must be same device */ - if (rq->rq_disk != bio->bi_disk) + if (rq->rq_disk != bio->bi_bdev->bd_disk) return false; /* only merge integrity protected bio into ditto rq */ @@ -936,7 +924,7 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, if (!ll_back_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; - trace_block_bio_backmerge(req->q, req, bio); + trace_block_bio_backmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) @@ -960,7 +948,7 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; - trace_block_bio_frontmerge(req->q, req, bio); + trace_block_bio_frontmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4de03da9a6..3b38d15723 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -129,6 +129,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(PCI_P2PDMA), QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), + QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(NOWAIT), }; #undef QUEUE_FLAG_NAME @@ -292,7 +293,6 @@ static const char *const cmd_flag_name[] = { #define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name static const char *const rqf_name[] = { - RQF_NAME(SORTED), RQF_NAME(STARTED), RQF_NAME(SOFTBARRIER), RQF_NAME(FLUSH_SEQ), @@ -303,7 +303,6 @@ static const char *const rqf_name[] = { RQF_NAME(QUIET), RQF_NAME(ELVPRIV), RQF_NAME(IO_STAT), - RQF_NAME(ALLOCED), RQF_NAME(PM), RQF_NAME(HASHED), RQF_NAME(STATS), @@ -939,6 +938,21 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q) q->sched_debugfs_dir = NULL; } +static const char *rq_qos_id_to_name(enum rq_qos_id id) +{ + switch (id) { + case RQ_QOS_WBT: + return "wbt"; + case RQ_QOS_LATENCY: + return "latency"; + case RQ_QOS_COST: + return "cost"; + case RQ_QOS_IOPRIO: + return "ioprio"; + } + return "unknown"; +} + void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { debugfs_remove_recursive(rqos->debugfs_dir); @@ -974,6 +988,14 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, { struct elevator_type *e = q->elevator->type; + /* + * If the parent debugfs directory has not been created yet, return; + * We will be called again later on with appropriate parent debugfs + * directory from blk_register_queue() + */ + if (!hctx->debugfs_dir) + return; + if (!e->hctx_debugfs_attrs) return; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 581be65a53..0f006cabfd 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -75,7 +75,8 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) blk_mq_run_hw_queue(hctx, true); } -static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b) +static int sched_rq_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); @@ -131,6 +132,7 @@ static int 
__blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) do { struct request *rq; + int budget_token; if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; @@ -140,12 +142,13 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) break; } - if (!blk_mq_get_dispatch_budget(q)) + budget_token = blk_mq_get_dispatch_budget(q); + if (budget_token < 0) break; rq = e->type->ops.dispatch_request(hctx); if (!rq) { - blk_mq_put_dispatch_budget(q); + blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the @@ -157,15 +160,27 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) break; } + blk_mq_set_rq_budget_token(rq, budget_token); + /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ list_add_tail(&rq->queuelist, &rq_list); + count++; if (rq->mq_hctx != hctx) multi_hctxs = true; - } while (++count < max_dispatch); + + /* + * If we cannot get a tag for the request, stop dequeueing + * requests from the IO scheduler. We are unlikely to be able + * to submit them anyway, and it creates a false impression for + * scheduling heuristics that the device can take more IO. + */ + if (!blk_mq_get_driver_tag(rq)) + break; + } while (count < max_dispatch); if (!count) { if (run_queue) @@ -230,6 +245,8 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) struct request *rq; do { + int budget_token; + if (!list_empty_careful(&hctx->dispatch)) { ret = -EAGAIN; break; } @@ -238,12 +255,13 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; - if (!blk_mq_get_dispatch_budget(q)) + budget_token = blk_mq_get_dispatch_budget(q); + if (budget_token < 0) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { - blk_mq_put_dispatch_budget(q); + blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching.
Holding the * budget could have blocked any "hctx"s with the @@ -255,6 +273,8 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) break; } + blk_mq_set_rq_budget_token(rq, budget_token); + /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() @@ -274,8 +294,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; - struct elevator_queue *e = q->elevator; - const bool has_sched_dispatch = e && e->type->ops.dispatch_request; + const bool has_sched = q->elevator; int ret = 0; LIST_HEAD(rq_list); @@ -306,12 +325,12 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) { - if (has_sched_dispatch) + if (has_sched) ret = blk_mq_do_dispatch_sched(hctx); else ret = blk_mq_do_dispatch_ctx(hctx); } - } else if (has_sched_dispatch) { + } else if (has_sched) { ret = blk_mq_do_dispatch_sched(hctx); } else if (hctx->dispatch_busy) { /* dequeue request one by one from sw queue if queue is busy */ @@ -380,20 +399,14 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, return ret; } -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { - return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); -void blk_mq_sched_request_inserted(struct request *rq) -{ - trace_block_rq_insert(rq->q, rq); -} -EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); - static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, - bool has_sched, struct request *rq) { /* @@ -410,9 +423,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq)) return true; - if (has_sched) - rq->rq_flags |= RQF_SORTED; - return false; } @@ -426,7 +436,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG)); - if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) { + if (blk_mq_sched_bypass_insert(hctx, rq)) { /* * Firstly normal IO request is inserted to scheduler queue or * sw queue, meantime we add flush request to dispatch queue( @@ -453,7 +463,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, goto run; } - if (e && e->type->ops.insert_requests) { + if (e) { LIST_HEAD(list); list_add(&rq->queuelist, &list); @@ -484,9 +494,9 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, percpu_ref_get(&q->q_usage_counter); e = hctx->queue->elevator; - if (e && e->type->ops.insert_requests) + if (e) { e->type->ops.insert_requests(hctx, list, false); - else { + } else { /* * try to issue requests directly if the hw queue isn't * busy in case of 'none' scheduler, and this way may save @@ -505,36 +515,23 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, percpu_ref_put(&q->q_usage_counter); } -static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - - if (hctx->sched_tags) { - blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); - 
blk_mq_free_rq_map(hctx->sched_tags, flags); - hctx->sched_tags = NULL; - } -} - static int blk_mq_sched_alloc_tags(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct blk_mq_tag_set *set = q->tag_set; - /* Clear HCTX_SHARED so tags are init'ed */ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; int ret; hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, - set->reserved_tags, flags); + set->reserved_tags, set->flags); if (!hctx->sched_tags) return -ENOMEM; ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); - if (ret) - blk_mq_sched_free_tags(set, hctx, hctx_idx); + if (ret) { + blk_mq_free_rq_map(hctx->sched_tags, set->flags); + hctx->sched_tags = NULL; + } return ret; } @@ -546,16 +543,50 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - /* Clear HCTX_SHARED so tags are freed */ - unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - if (hctx->sched_tags) { - blk_mq_free_rq_map(hctx->sched_tags, flags); + blk_mq_free_rq_map(hctx->sched_tags, hctx->flags); hctx->sched_tags = NULL; } } } +static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) +{ + struct blk_mq_tag_set *set = queue->tag_set; + int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); + struct blk_mq_hw_ctx *hctx; + int ret, i; + + /* + * Set initial depth at max so that we don't need to reallocate for + * updating nr_requests. + */ + ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, + &queue->sched_breserved_tags, + MAX_SCHED_RQ, set->reserved_tags, + set->numa_node, alloc_policy); + if (ret) + return ret; + + queue_for_each_hw_ctx(queue, hctx, i) { + hctx->sched_tags->bitmap_tags = + &queue->sched_bitmap_tags; + hctx->sched_tags->breserved_tags = + &queue->sched_breserved_tags; + } + + sbitmap_queue_resize(&queue->sched_bitmap_tags, + queue->nr_requests - set->reserved_tags); + + return 0; +} + +static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) +{ + sbitmap_queue_free(&queue->sched_bitmap_tags); + sbitmap_queue_free(&queue->sched_breserved_tags); +} + int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { struct blk_mq_hw_ctx *hctx; @@ -580,12 +611,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_sched_alloc_tags(q, hctx, i); if (ret) - goto err; + goto err_free_tags; + } + + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { + ret = blk_mq_init_sched_shared_sbitmap(q); + if (ret) + goto err_free_tags; } ret = e->ops.init_sched(q, e); if (ret) - goto err; + goto err_free_sbitmap; blk_mq_debugfs_register_sched(q); @@ -605,7 +642,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0; -err: +err_free_sbitmap: + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + blk_mq_exit_sched_shared_sbitmap(q); +err_free_tags: blk_mq_sched_free_requests(q); blk_mq_sched_tags_teardown(q); q->elevator = NULL; @@ -631,6 +671,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; unsigned int i; + unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { blk_mq_debugfs_unregister_sched_hctx(hctx); @@ -638,10 +679,13 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } + flags = hctx->flags; } blk_mq_debugfs_unregister_sched(q); if (e->type->ops.exit_sched) 
e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q); + if (blk_mq_is_sbitmap_shared(flags)) + blk_mq_exit_sched_shared_sbitmap(q); q->elevator = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 0476360f05..5246ae0407 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -5,14 +5,16 @@ #include "blk-mq.h" #include "blk-mq-tag.h" +#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ) + void blk_mq_sched_assign_ioc(struct request *rq); -void blk_mq_sched_request_inserted(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 7b52e7657b..253c857cba 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -45,60 +45,12 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) kfree(hctx); } -struct blk_mq_ctx_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct blk_mq_ctx *, char *); - ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); -}; - struct blk_mq_hw_ctx_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_mq_hw_ctx *, char *); ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); }; -static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->show) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->show(ctx, page); - mutex_unlock(&q->sysfs_lock); - return res; -} - -static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->store) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->store(ctx, page, length); - mutex_unlock(&q->sysfs_lock); - return res; -} - static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, struct attribute *attr, char *page) { @@ -198,23 +150,16 @@ static struct attribute *default_hw_ctx_attrs[] = { }; ATTRIBUTE_GROUPS(default_hw_ctx); -static const struct sysfs_ops blk_mq_sysfs_ops = { - .show = blk_mq_sysfs_show, - .store = blk_mq_sysfs_store, -}; - static const struct sysfs_ops blk_mq_hw_sysfs_ops = { .show = blk_mq_hw_sysfs_show, .store = blk_mq_hw_sysfs_store, }; static struct kobj_type blk_mq_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, .release = blk_mq_sysfs_release, }; static struct kobj_type blk_mq_ctx_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, .release = blk_mq_ctx_sysfs_release, }; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c4f2f6c123..ff5caeb825 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -13,6 +13,7 @@ #include #include "blk.h" #include "blk-mq.h" +#include 
"blk-mq-sched.h" #include "blk-mq-tag.h" /* @@ -207,7 +208,7 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; - if (!rq || !refcount_inc_not_zero(&rq->ref)) + if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref)) rq = NULL; spin_unlock_irqrestore(&tags->lock, flags); return rq; @@ -399,8 +400,8 @@ static bool blk_mq_tagset_count_completed_rqs(struct request *rq, } /** - * blk_mq_tagset_wait_completed_request - wait until all completed req's - * complete funtion is run + * blk_mq_tagset_wait_completed_request - Wait until all scheduled request + * completions have finished. * @tagset: Tag set to drain completed request * * Note: This function has to be run after all IO queues are shutdown @@ -471,39 +472,54 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, node); } +int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, + unsigned int queue_depth, unsigned int reserved, + int node, int alloc_policy) +{ + unsigned int depth = queue_depth - reserved; + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; + + if (bt_alloc(bitmap_tags, depth, round_robin, node)) + return -ENOMEM; + if (bt_alloc(breserved_tags, reserved, round_robin, node)) + goto free_bitmap_tags; + + return 0; + +free_bitmap_tags: + sbitmap_queue_free(bitmap_tags); + return -ENOMEM; +} + static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, int node, int alloc_policy) { - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; + int ret; - if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node)) - return -ENOMEM; - if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags, - round_robin, node)) - goto free_bitmap_tags; + ret = blk_mq_init_bitmaps(&tags->__bitmap_tags, + &tags->__breserved_tags, + tags->nr_tags, tags->nr_reserved_tags, + node, alloc_policy); + if (ret) + return ret; tags->bitmap_tags = &tags->__bitmap_tags; tags->breserved_tags = &tags->__breserved_tags; return 0; -free_bitmap_tags: - sbitmap_queue_free(&tags->__bitmap_tags); - return -ENOMEM; } -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) +int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set) { - unsigned int depth = set->queue_depth - set->reserved_tags; int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - int i, node = set->numa_node; + int i, ret; - if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node)) - return -ENOMEM; - if (bt_alloc(&set->__breserved_tags, set->reserved_tags, - round_robin, node)) - goto free_bitmap_tags; + ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags, + set->queue_depth, set->reserved_tags, + set->numa_node, alloc_policy); + if (ret) + return ret; for (i = 0; i < set->nr_hw_queues; i++) { struct blk_mq_tags *tags = set->tags[i]; @@ -513,9 +529,6 @@ int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) } return 0; -free_bitmap_tags: - sbitmap_queue_free(&set->__bitmap_tags); - return -ENOMEM; } void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) @@ -544,7 +557,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); - if (flags & BLK_MQ_F_TAG_HCTX_SHARED) + if (blk_mq_is_sbitmap_shared(flags)) return tags; if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 
0) { @@ -556,7 +569,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) { - if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) { + if (!blk_mq_is_sbitmap_shared(flags)) { sbitmap_queue_free(tags->bitmap_tags); sbitmap_queue_free(tags->breserved_tags); } @@ -578,8 +591,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, */ if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; - /* Only sched tags can grow, so clear HCTX_SHARED flag */ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; struct blk_mq_tags *new; bool ret; @@ -590,21 +601,21 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, * We need some sort of upper limit, set it high enough that * no valid use cases should require more. */ - if (tdepth > 16 * BLKDEV_MAX_RQ) + if (tdepth > MAX_SCHED_RQ) return -EINVAL; new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, - tags->nr_reserved_tags, flags); + tags->nr_reserved_tags, set->flags); if (!new) return -ENOMEM; ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); if (ret) { - blk_mq_free_rq_map(new, flags); + blk_mq_free_rq_map(new, set->flags); return -ENOMEM; } blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); - blk_mq_free_rq_map(*tagsptr, flags); + blk_mq_free_rq_map(*tagsptr, set->flags); *tagsptr = new; } else { /* diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index f887988e5e..8ed55af084 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -32,11 +32,14 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, unsigned int flags); extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); +extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, + unsigned int queue_depth, + unsigned int reserved, + int node, int alloc_policy); -extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, - unsigned int flags); +extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set); extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); - extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); diff --git a/block/blk-mq.c b/block/blk-mq.c index 044d0e3a15..bc026372de 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -41,7 +41,7 @@ #include "blk-mq-sched.h" #include "blk-rq-qos.h" -static DEFINE_PER_CPU(struct list_head, blk_cpu_done); +static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -95,7 +95,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, } struct mq_inflight { - struct hd_struct *part; + struct block_device *part; unsigned int inflight[2]; }; @@ -105,13 +105,15 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - if (rq->part == mi->part && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) + if ((!mi->part->bd_partno || rq->part == mi->part) && + blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; return true; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part) { struct mq_inflight mi = { .part = part }; @@ -120,8 +122,8 @@ unsigned int blk_mq_in_flight(struct 
request_queue *q, struct hd_struct *part) return mi.inflight[0] + mi.inflight[1]; } -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; @@ -186,9 +188,11 @@ void blk_mq_freeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); -void blk_mq_unfreeze_queue(struct request_queue *q) +void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { mutex_lock(&q->mq_freeze_lock); + if (force_atomic) + q->q_usage_counter.data->force_atomic = true; q->mq_freeze_depth--; WARN_ON_ONCE(q->mq_freeze_depth < 0); if (!q->mq_freeze_depth) { @@ -197,6 +201,11 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } mutex_unlock(&q->mq_freeze_lock); } + +void blk_mq_unfreeze_queue(struct request_queue *q) +{ + __blk_mq_unfreeze_queue(q, false); +} EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); /* @@ -359,11 +368,12 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) if (e) { /* - * Flush requests are special and go directly to the + * Flush/passthrough requests are special and go directly to the * dispatch list. Don't include reserved tags in the * limiting, as it isn't useful. */ if (!op_is_flush(data->cmd_flags) && + !blk_op_is_passthrough(data->cmd_flags) && e->type->ops.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) e->type->ops.limit_depth(data->cmd_flags, data); @@ -522,7 +532,7 @@ void blk_mq_free_request(struct request *rq) __blk_mq_dec_active_requests(hctx); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(q->backing_dev_info); + laptop_io_completion(q->disk->bdi); rq_qos_done(q, rq); @@ -565,80 +575,29 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) } EXPORT_SYMBOL(blk_mq_end_request); -/* - * Softirq action handler - move entries to local list and loop over them - * while passing them to the queue registered handler. - */ -static __latent_entropy void blk_done_softirq(struct softirq_action *h) +static void blk_complete_reqs(struct llist_head *list) { - struct list_head *cpu_list, local_list; + struct llist_node *entry = llist_reverse_order(llist_del_all(list)); + struct request *rq, *next; - local_irq_disable(); - cpu_list = this_cpu_ptr(&blk_cpu_done); - list_replace_init(cpu_list, &local_list); - local_irq_enable(); - - while (!list_empty(&local_list)) { - struct request *rq; - - rq = list_entry(local_list.next, struct request, ipi_list); - list_del_init(&rq->ipi_list); + llist_for_each_entry_safe(rq, next, entry, ipi_list) rq->q->mq_ops->complete(rq); - } } -static void blk_mq_trigger_softirq(struct request *rq) +static __latent_entropy void blk_done_softirq(struct softirq_action *h) { - struct list_head *list; - unsigned long flags; - - local_irq_save(flags); - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&rq->ipi_list, list); - - /* - * If the list only contains our just added request, signal a raise of - * the softirq. If there are already entries there, someone already - * raised the irq but it hasn't run yet. 
- */ - if (list->next == &rq->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_restore(flags); + blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } static int blk_softirq_cpu_dead(unsigned int cpu) { - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - this_cpu_ptr(&blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - + blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); return 0; } - static void __blk_mq_complete_request_remote(void *data) { - struct request *rq = data; - - /* - * For most of single queue controllers, there is only one irq vector - * for handling I/O completion, and the only irq's affinity is set - * to all possible CPUs. On most of ARCHs, this affinity means the irq - * is handled on one specific CPU. - * - * So complete I/O requests in softirq context in case of single queue - * devices to avoid degrading I/O performance due to irqsoff latency. - */ - if (rq->q->nr_hw_queues == 1) - blk_mq_trigger_softirq(rq); - else - rq->q->mq_ops->complete(rq); + __raise_softirq_irqoff(BLOCK_SOFTIRQ); } static inline bool blk_mq_complete_need_ipi(struct request *rq) @@ -648,6 +607,14 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) if (!IS_ENABLED(CONFIG_SMP) || !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) return false; + /* + * With force threaded interrupts enabled, raising softirq from an SMP + * function call will always result in waking the ksoftirqd thread. + * This is probably worse than completing the request on a different + * cache domain. + */ + if (force_irqthreads()) + return false; /* same CPU or cache domain? Complete locally */ if (cpu == rq->mq_ctx->cpu || @@ -659,6 +626,30 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) return cpu_online(rq->mq_ctx->cpu); } +static void blk_mq_complete_send_ipi(struct request *rq) +{ + struct llist_head *list; + unsigned int cpu; + + cpu = rq->mq_ctx->cpu; + list = &per_cpu(blk_cpu_done, cpu); + if (llist_add(&rq->ipi_list, list)) { + INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); + smp_call_function_single_async(cpu, &rq->csd); + } +} + +static void blk_mq_raise_softirq(struct request *rq) +{ + struct llist_head *list; + + preempt_disable(); + list = this_cpu_ptr(&blk_cpu_done); + if (llist_add(&rq->ipi_list, list)) + raise_softirq(BLOCK_SOFTIRQ); + preempt_enable(); +} + bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); @@ -671,17 +662,15 @@ bool blk_mq_complete_request_remote(struct request *rq) return false; if (blk_mq_complete_need_ipi(rq)) { - rq->csd.func = __blk_mq_complete_request_remote; - rq->csd.info = rq; - rq->csd.flags = 0; - smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); - } else { - if (rq->q->nr_hw_queues > 1) - return false; - blk_mq_trigger_softirq(rq); + blk_mq_complete_send_ipi(rq); + return true; } - return true; + if (rq->q->nr_hw_queues == 1) { + blk_mq_raise_softirq(rq); + return true; + } + return false; } EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); @@ -731,7 +720,7 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - trace_block_rq_issue(q, rq); + trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); @@ -758,7 +747,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_put_driver_tag(rq); - trace_block_rq_requeue(q, 
rq); + trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { @@ -929,7 +918,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) void blk_mq_put_rq_ref(struct request *rq) { - if (is_flush_rq(rq, rq->mq_hctx)) + if (is_flush_rq(rq)) rq->end_io(rq, 0); else if (refcount_dec_and_test(&rq->ref)) __blk_mq_free_request(rq); @@ -1102,7 +1091,7 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return true; } -static bool blk_mq_get_driver_tag(struct request *rq) +bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; @@ -1277,10 +1266,15 @@ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, bool need_budget) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + int budget_token = -1; - if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) { - blk_mq_put_driver_tag(rq); - return PREP_DISPATCH_NO_BUDGET; + if (need_budget) { + budget_token = blk_mq_get_dispatch_budget(rq->q); + if (budget_token < 0) { + blk_mq_put_driver_tag(rq); + return PREP_DISPATCH_NO_BUDGET; + } + blk_mq_set_rq_budget_token(rq, budget_token); } if (!blk_mq_get_driver_tag(rq)) { @@ -1297,7 +1291,7 @@ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, * together during handling partial dispatch */ if (need_budget) - blk_mq_put_dispatch_budget(rq->q); + blk_mq_put_dispatch_budget(rq->q, budget_token); return PREP_DISPATCH_NO_TAG; } } @@ -1307,12 +1301,16 @@ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ static void blk_mq_release_budgets(struct request_queue *q, - unsigned int nr_budgets) + struct list_head *list) { - int i; + struct request *rq; - for (i = 0; i < nr_budgets; i++) - blk_mq_put_dispatch_budget(q); + list_for_each_entry(rq, list, queuelist) { + int budget_token = blk_mq_get_rq_budget_token(rq); + + if (budget_token >= 0) + blk_mq_put_dispatch_budget(q, budget_token); + } } /* @@ -1385,7 +1383,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, break; default: errors++; - blk_mq_end_request(rq, BLK_STS_IOERR); + blk_mq_end_request(rq, ret); } } while (!list_empty(list)); out: @@ -1410,7 +1408,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; - blk_mq_release_budgets(q, nr_budgets); + if (nr_budgets) + blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); @@ -1476,31 +1475,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; - /* - * We should be running this queue from one of the CPUs that - * are mapped to it. 
- * - * There are at least two related races now between setting - * hctx->next_cpu from blk_mq_hctx_next_cpu() and running - * __blk_mq_run_hw_queue(): - * - * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), - * but later it becomes online, then this warning is harmless - * at all - * - * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), - * but later it becomes offline, then the warning can't be - * triggered, and we depend on blk-mq timeout handler to - * handle dispatched requests to this hctx - */ - if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)) { - printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", - raw_smp_processor_id(), - cpumask_empty(hctx->cpumask) ? "inactive": "active"); - dump_stack(); - } - /* * We can't run the queue inline with ints disabled. Ensure that * we catch bad users of this early. @@ -1573,7 +1547,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. - * @msecs: Microseconds of delay to wait before running the queue. + * @msecs: Milliseconds of delay to wait before running the queue. * * If !@async, try to run the queue now. Else, run the queue asynchronously and * with a delay of @msecs. @@ -1602,7 +1576,7 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, /** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. - * @msecs: Microseconds of delay to wait before running the queue. + * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs. */ @@ -1644,6 +1618,42 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) } EXPORT_SYMBOL(blk_mq_run_hw_queue); +/* + * Is the request queue handled by an IO scheduler that does not respect + * hardware queues when dispatching? + */ +static bool blk_mq_has_sqsched(struct request_queue *q) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.dispatch_request && + !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE)) + return true; + return false; +} + +/* + * Return preferred queue to dispatch from (if any) for non-mq aware IO + * scheduler. + */ +static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + + /* + * If the IO scheduler does not respect hardware queues when + * dispatching, we just don't bother with multiple HW queues and + * dispatch from hctx for the current CPU since running multiple queues + * just causes lock contention inside the scheduler and pointless cache + * bouncing. + */ + hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, + raw_smp_processor_id()); + if (!blk_mq_hctx_stopped(hctx)) + return hctx; + return NULL; +} + /** * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. 
@@ -1651,14 +1661,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queue); */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx, *sq_hctx; int i; + sq_hctx = NULL; + if (blk_mq_has_sqsched(q)) + sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; - - blk_mq_run_hw_queue(hctx, async); + /* + * Dispatch from this hctx either if there's no hctx preferred + * by IO scheduler or if it has requests that bypass the + * scheduler. + */ + if (!sq_hctx || sq_hctx == hctx || + !list_empty_careful(&hctx->dispatch)) + blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); @@ -1666,18 +1685,27 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. - * @msecs: Microseconds of delay to wait before running the queues. + * @msecs: Milliseconds of delay to wait before running the queues. */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx, *sq_hctx; int i; + sq_hctx = NULL; + if (blk_mq_has_sqsched(q)) + sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; - - blk_mq_delay_run_hw_queue(hctx, msecs); + /* + * Dispatch from this hctx either if there's no hctx preferred + * by IO scheduler or if it has requests that bypass the + * scheduler. + */ + if (!sq_hctx || sq_hctx == hctx || + !list_empty_careful(&hctx->dispatch)) + blk_mq_delay_run_hw_queue(hctx, msecs); } } EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); @@ -1800,7 +1828,7 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, lockdep_assert_held(&ctx->lock); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_lists[type]); @@ -1857,7 +1885,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); } spin_lock(&ctx->lock); @@ -1866,7 +1894,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, spin_unlock(&ctx->lock); } -static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) +static int plug_rq_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); @@ -1980,6 +2009,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = rq->q; bool run_queue = true; + int budget_token; /* * RCU or SRCU read lock is needed before checking quiesced flag. @@ -1997,11 +2027,14 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (q->elevator && !bypass_insert) goto insert; - if (!blk_mq_get_dispatch_budget(q)) + budget_token = blk_mq_get_dispatch_budget(q); + if (budget_token < 0) goto insert; + blk_mq_set_rq_budget_token(rq, budget_token); + if (!blk_mq_get_driver_tag(rq)) { - blk_mq_put_dispatch_budget(q); + blk_mq_put_dispatch_budget(q, budget_token); goto insert; } @@ -2109,6 +2142,18 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) } } +/* + * Allow 4x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple + * queues. 
This is important for md arrays to benefit from merging + * requests. + */ +static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) +{ + if (plug->multiple_queues) + return BLK_MAX_REQUEST_COUNT * 4; + return BLK_MAX_REQUEST_COUNT; +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. @@ -2126,7 +2171,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ blk_qc_t blk_mq_submit_bio(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_mq_alloc_data data = { @@ -2138,6 +2183,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) unsigned int nr_segs; blk_qc_t cookie; blk_status_t ret; + bool hipri; blk_queue_bounce(q, &bio); __blk_queue_split(&bio, &nr_segs); @@ -2154,6 +2200,8 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) rq_qos_throttle(q, bio); + hipri = bio->bi_opf & REQ_HIPRI; + data.cmd_flags = bio->bi_opf; rq = __blk_mq_alloc_request(&data); if (unlikely(!rq)) { @@ -2163,7 +2211,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) goto queue_exit; } - trace_block_getrq(q, bio, bio->bi_opf); + trace_block_getrq(bio); rq_qos_track(q, rq, bio); @@ -2202,7 +2250,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) else last = list_entry_rq(plug->mq_list.prev); - if (request_count >= BLK_MAX_REQUEST_COUNT || (last && + if (request_count >= blk_plug_max_rq_count(plug) || (last && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_flush_plug_list(plug, false); trace_block_plug(q); @@ -2247,6 +2295,8 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) blk_mq_sched_insert_request(rq, false, true, true); } + if (!hipri) + return BLK_QC_T_NONE; return cookie; queue_exit: blk_queue_exit(q); @@ -2589,16 +2639,49 @@ static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) &hctx->cpuhp_dead); } +/* + * Before freeing hw queue, clearing the flush request reference in + * tags->rqs[] for avoiding potential UAF. + */ +static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, + unsigned int queue_depth, struct request *flush_rq) +{ + int i; + unsigned long flags; + + /* The hw queue may not be mapped yet */ + if (!tags) + return; + + WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); + + for (i = 0; i < queue_depth; i++) + cmpxchg(&tags->rqs[i], flush_rq, NULL); + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. 
+ */ + spin_lock_irqsave(&tags->lock, flags); + spin_unlock_irqrestore(&tags->lock, flags); +} + /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + struct request *flush_rq = hctx->fq->flush_rq; + if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); + blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], + set->queue_depth, flush_rq); if (set->ops->exit_request) - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); + set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -2683,7 +2766,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, goto free_hctx; atomic_set(&hctx->nr_active, 0); - atomic_set(&hctx->elevator_queued, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; @@ -2706,7 +2788,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, goto free_cpumask; if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), - gfp, node)) + gfp, node, false, false)) goto free_ctxs; hctx->nr_ctx = 0; @@ -2911,10 +2993,12 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (shared) + if (shared) { hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; - else + } else { + blk_mq_tag_idle(hctx); hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; + } } } @@ -3026,27 +3110,23 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, +static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata) { - struct request_queue *uninit_q, *q; + struct request_queue *q; + int ret; - uninit_q = blk_alloc_queue(set->numa_node); - if (!uninit_q) + q = blk_alloc_queue(set->numa_node); + if (!q) return ERR_PTR(-ENOMEM); - uninit_q->queuedata = queuedata; - - /* - * Initialize the queue without an elevator. device_add_disk() will do - * the initialization. - */ - q = blk_mq_init_allocated_queue(set, uninit_q, false); - if (IS_ERR(q)) - blk_cleanup_queue(uninit_q); - + q->queuedata = queuedata; + ret = blk_mq_init_allocated_queue(set, q); + if (ret) { + blk_cleanup_queue(q); + return ERR_PTR(ret); + } return q; } -EXPORT_SYMBOL_GPL(blk_mq_init_queue_data); struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) { @@ -3054,39 +3134,24 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); -/* - * Helper for setting up a queue with mq ops, given queue depth, and - * the passed in mq ops flags. 
- */ -struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, - unsigned int queue_depth, - unsigned int set_flags) +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + struct lock_class_key *lkclass) { struct request_queue *q; - int ret; + struct gendisk *disk; - memset(set, 0, sizeof(*set)); - set->ops = ops; - set->nr_hw_queues = 1; - set->nr_maps = 1; - set->queue_depth = queue_depth; - set->numa_node = NUMA_NO_NODE; - set->flags = set_flags; + q = blk_mq_init_queue_data(set, queuedata); + if (IS_ERR(q)) + return ERR_CAST(q); - ret = blk_mq_alloc_tag_set(set); - if (ret) - return ERR_PTR(ret); - - q = blk_mq_init_queue(set); - if (IS_ERR(q)) { - blk_mq_free_tag_set(set); - return q; + disk = __alloc_disk_node(q, set->numa_node, lkclass); + if (!disk) { + blk_cleanup_queue(q); + return ERR_PTR(-ENOMEM); } - - return q; + return disk; } -EXPORT_SYMBOL(blk_mq_init_sq_queue); +EXPORT_SYMBOL(__blk_mq_alloc_disk); static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, @@ -3199,9 +3264,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q, - bool elevator_init) +int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) { /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -3235,8 +3299,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, set->map[HCTX_TYPE_POLL].nr_queues) blk_queue_flag_set(QUEUE_FLAG_POLL, q); - q->sg_reserved_size = INT_MAX; - INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); @@ -3251,11 +3313,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); - - if (elevator_init) - elevator_init_mq(q); - - return q; + return 0; err_hctxs: kfree(q->queue_hw_ctx); @@ -3266,7 +3324,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->poll_cb = NULL; err_exit: q->mq_ops = NULL; - return ERR_PTR(-ENOMEM); + return -ENOMEM; } EXPORT_SYMBOL(blk_mq_init_allocated_queue); @@ -3395,6 +3453,12 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, return 0; } +static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set, + int new_nr_hw_queues) +{ + return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues); +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. 
May adjust the @@ -3448,7 +3512,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) + if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) return -ENOMEM; ret = -ENOMEM; @@ -3472,7 +3536,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (blk_mq_is_sbitmap_shared(set->flags)) { atomic_set(&set->active_queues_shared_sbitmap, 0); - if (blk_mq_init_shared_sbitmap(set, set->flags)) { + if (blk_mq_init_shared_sbitmap(set)) { ret = -ENOMEM; goto out_free_mq_rq_maps; } @@ -3497,6 +3561,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_alloc_tag_set); +/* allocate and initialize a tagset for a simple single-queue device */ +int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int queue_depth, + unsigned int set_flags) +{ + memset(set, 0, sizeof(*set)); + set->ops = ops; + set->nr_hw_queues = 1; + set->nr_maps = 1; + set->queue_depth = queue_depth; + set->numa_node = NUMA_NO_NODE; + set->flags = set_flags; + return blk_mq_alloc_tag_set(set); +} +EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); + void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; @@ -3548,15 +3628,24 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); + if (blk_mq_is_sbitmap_shared(set->flags)) { + hctx->sched_tags->bitmap_tags = + &q->sched_bitmap_tags; + hctx->sched_tags->breserved_tags = + &q->sched_breserved_tags; + } } if (ret) break; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(hctx); } - - if (!ret) + if (!ret) { q->nr_requests = nr; + if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) + sbitmap_queue_resize(&q->sched_bitmap_tags, + nr - set->reserved_tags); + } blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); @@ -3867,7 +3956,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q, int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) { struct blk_mq_hw_ctx *hctx; - long state; + unsigned int state; if (!blk_qc_t_valid(cookie) || !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) @@ -3883,14 +3972,15 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) * the state. Like for the other success return cases, the * caller is responsible for checking if the IO completed. If * the IO isn't complete, we'll get called again and will go - * straight to the busy poll loop. + * straight to the busy poll loop. If specified not to spin, + * we also should not sleep. 
*/ - if (blk_mq_poll_hybrid(q, hctx, cookie)) + if (spin && blk_mq_poll_hybrid(q, hctx, cookie)) return 1; hctx->poll_considered++; - state = current->state; + state = get_current_state(); do { int ret; @@ -3906,7 +3996,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) if (signal_pending_state(state, current)) __set_current_state(TASK_RUNNING); - if (current->state == TASK_RUNNING) + if (task_is_running(current)) return 1; if (ret < 0 || !spin) break; @@ -3929,7 +4019,7 @@ static int __init blk_mq_init(void) int i; for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); + init_llist_head(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, diff --git a/block/blk-mq.h b/block/blk-mq.h index f792a0920e..d08779f77a 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -100,7 +100,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @q: request queue * @flags: request command flags - * @cpu: cpu ctx + * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, unsigned int flags, @@ -183,21 +183,39 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part); +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]); -static inline void blk_mq_put_dispatch_budget(struct request_queue *q) +static inline void blk_mq_put_dispatch_budget(struct request_queue *q, + int budget_token) { if (q->mq_ops->put_budget) - q->mq_ops->put_budget(q); + q->mq_ops->put_budget(q, budget_token); } -static inline bool blk_mq_get_dispatch_budget(struct request_queue *q) +static inline int blk_mq_get_dispatch_budget(struct request_queue *q) { if (q->mq_ops->get_budget) return q->mq_ops->get_budget(q); - return true; + return 0; +} + +static inline void blk_mq_set_rq_budget_token(struct request *rq, int token) +{ + if (token < 0) + return; + + if (rq->q->mq_ops->set_rq_budget_token) + rq->q->mq_ops->set_rq_budget_token(rq, token); +} + +static inline int blk_mq_get_rq_budget_token(struct request *rq) +{ + if (rq->q->mq_ops->get_rq_budget_token) + return rq->q->mq_ops->get_rq_budget_token(rq); + return -1; } static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) @@ -242,6 +260,8 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(rq->mq_hctx, rq); } +bool blk_mq_get_driver_tag(struct request *rq); + static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; @@ -282,6 +302,17 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q, return NULL; } +/* Free all requests on the list */ +static inline void blk_mq_free_requests(struct list_head *list) +{ + while (!list_empty(list)) { + struct request *rq = list_entry_rq(list->next); + + list_del_init(&rq->queuelist); + blk_mq_free_request(rq); + } +} + /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. 
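The blk-mq.c hunks above replace the per-CPU completion list (an IRQ-protected list_head) with a lock-free llist: a completing context only kicks the consumer (raise_softirq(BLOCK_SOFTIRQ), or an IPI via INIT_CSD()/smp_call_function_single_async()) when llist_add() reports the empty-to-nonempty transition, and blk_complete_reqs() drains everything with a single llist_del_all(), restoring FIFO order with llist_reverse_order(). Below is a minimal userspace model of that producer/consumer protocol, using C11 atomics instead of <linux/llist.h>; every name in it is illustrative, not kernel API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	int tag;			/* stands in for struct request */
	struct node *next;
};

struct llist_head {
	_Atomic(struct node *) first;
};

/* Push; returns true on the empty->nonempty transition (caller kicks the consumer). */
static bool llist_add(struct llist_head *h, struct node *n)
{
	struct node *first = atomic_load_explicit(&h->first, memory_order_relaxed);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak_explicit(&h->first, &first, n,
							memory_order_release,
							memory_order_relaxed));
	return first == NULL;
}

/* Detach the whole list in one shot; entries come back newest-first (LIFO). */
static struct node *llist_del_all(struct llist_head *h)
{
	return atomic_exchange_explicit(&h->first, NULL, memory_order_acquire);
}

/* LIFO -> FIFO, as llist_reverse_order() does in blk_complete_reqs(). */
static struct node *llist_reverse_order(struct node *n)
{
	struct node *rev = NULL;

	while (n) {
		struct node *next = n->next;

		n->next = rev;
		rev = n;
		n = next;
	}
	return rev;
}

int main(void)
{
	struct llist_head done = { NULL };
	struct node rq[3] = { { 1, NULL }, { 2, NULL }, { 3, NULL } };

	for (int i = 0; i < 3; i++) {
		if (llist_add(&done, &rq[i]))
			printf("tag %d was first: raise BLOCK_SOFTIRQ here\n",
			       rq[i].tag);
	}

	/* Consumer side: one atomic swap, then complete in FIFO order. */
	for (struct node *n = llist_reverse_order(llist_del_all(&done)); n;) {
		struct node *next = n->next;	/* mimics llist_for_each_entry_safe() */

		printf("complete tag %d\n", n->tag);
		n = next;
	}
	return 0;
}

The boolean returned by the push is what lets the kernel drop the local_irq_save()/list_add_tail() critical section entirely: concurrent producers race on one compare-and-swap, and only the one that observed an empty list pays for waking the consumer.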
diff --git a/block/blk-pm.h b/block/blk-pm.h index a2283cc9f7..8a5a0d4b35 100644 --- a/block/blk-pm.h +++ b/block/blk-pm.h @@ -21,31 +21,6 @@ static inline void blk_pm_mark_last_busy(struct request *rq) if (rq->q->dev && !(rq->rq_flags & RQF_PM)) pm_runtime_mark_last_busy(rq->q->dev); } - -static inline void blk_pm_requeue_request(struct request *rq) -{ - lockdep_assert_held(&rq->q->queue_lock); - - if (rq->q->dev && !(rq->rq_flags & RQF_PM)) - rq->q->nr_pending--; -} - -static inline void blk_pm_add_request(struct request_queue *q, - struct request *rq) -{ - lockdep_assert_held(&q->queue_lock); - - if (q->dev && !(rq->rq_flags & RQF_PM)) - q->nr_pending++; -} - -static inline void blk_pm_put_request(struct request *rq) -{ - lockdep_assert_held(&rq->q->queue_lock); - - if (rq->q->dev && !(rq->rq_flags & RQF_PM)) - --rq->q->nr_pending; -} #else static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { @@ -55,19 +30,6 @@ static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) static inline void blk_pm_mark_last_busy(struct request *rq) { } - -static inline void blk_pm_requeue_request(struct request *rq) -{ -} - -static inline void blk_pm_add_request(struct request_queue *q, - struct request *rq) -{ -} - -static inline void blk_pm_put_request(struct request *rq) -{ -} #endif #endif /* _BLOCK_BLK_PM_H_ */ diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 2bcb3495e3..f000f83e06 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -17,6 +17,7 @@ enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, + RQ_QOS_IOPRIO, }; struct rq_wait { @@ -79,19 +80,6 @@ static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) return rq_qos_id(q, RQ_QOS_LATENCY); } -static inline const char *rq_qos_id_to_name(enum rq_qos_id id) -{ - switch (id) { - case RQ_QOS_WBT: - return "wbt"; - case RQ_QOS_LATENCY: - return "latency"; - case RQ_QOS_COST: - return "cost"; - } - return "unknown"; -} - static inline void rq_wait_init(struct rq_wait *rq_wait) { atomic_set(&rq_wait->inflight, 0); diff --git a/block/blk-settings.c b/block/blk-settings.c index c3aa7f8ee3..a7c857ad7d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -7,7 +7,8 @@ #include <linux/init.h> #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/memblock.h> /* for max_pfn/max_low_pfn */ +#include <linux/pagemap.h> +#include <linux/backing-dev-defs.h> #include <linux/gcd.h> #include <linux/lcm.h> #include <linux/jiffies.h> @@ -17,11 +18,6 @@ #include "blk.h" #include "blk-wbt.h" -unsigned long blk_max_low_pfn; -EXPORT_SYMBOL(blk_max_low_pfn); - -unsigned long blk_max_pfn; - void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { q->rq_timeout = timeout; } @@ -55,11 +51,12 @@ void blk_set_default_limits(struct queue_limits *lim) lim->discard_alignment = 0; lim->discard_misaligned = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; - lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); + lim->bounce = BLK_BOUNCE_NONE; lim->alignment_offset = 0; lim->io_opt = 0; lim->misaligned = 0; lim->zoned = BLK_ZONED_NONE; + lim->zone_write_granularity = 0; } EXPORT_SYMBOL(blk_set_default_limits); @@ -91,39 +88,16 @@ EXPORT_SYMBOL(blk_set_stacking_limits); /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device - * @max_addr: the maximum address the device can handle + * @bounce: bounce limit to enforce * * Description: - * Different hardware can have different requirements as to what pages - * it can do I/O directly to. 
A low level driver can call - * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @max_addr. + * Force bouncing for ISA DMA ranges or highmem. + * + * DEPRECATED, don't use in new code. **/ -void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) +void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce) { - unsigned long b_pfn = max_addr >> PAGE_SHIFT; - int dma = 0; - - q->bounce_gfp = GFP_NOIO; -#if BITS_PER_LONG == 64 - /* - * Assume anything <= 4GB can be handled by IOMMU. Actually - * some IOMMUs can handle everything, but I don't know of a - * way to test this here. - */ - if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) - dma = 1; - q->limits.bounce_pfn = max(max_low_pfn, b_pfn); -#else - if (b_pfn < blk_max_low_pfn) - dma = 1; - q->limits.bounce_pfn = b_pfn; -#endif - if (dma) { - init_emergency_isa_pool(); - q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->limits.bounce_pfn = b_pfn; - } + q->limits.bounce = bounce; } EXPORT_SYMBOL(blk_queue_bounce_limit); @@ -157,11 +131,19 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto __func__, max_hw_sectors); } + max_hw_sectors = round_down(max_hw_sectors, + limits->logical_block_size >> SECTOR_SHIFT); limits->max_hw_sectors = max_hw_sectors; + max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + max_sectors = round_down(max_sectors, + limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; - q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); + + if (!q->disk) + return; + q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -321,13 +303,20 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); **/ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) { - q->limits.logical_block_size = size; + struct queue_limits *limits = &q->limits; - if (q->limits.physical_block_size < size) - q->limits.physical_block_size = size; + limits->logical_block_size = size; - if (q->limits.io_min < q->limits.physical_block_size) - q->limits.io_min = q->limits.physical_block_size; + if (limits->physical_block_size < size) + limits->physical_block_size = size; + + if (limits->io_min < limits->physical_block_size) + limits->io_min = limits->physical_block_size; + + limits->max_hw_sectors = + round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT); + limits->max_sectors = + round_down(limits->max_sectors, size >> SECTOR_SHIFT); } EXPORT_SYMBOL(blk_queue_logical_block_size); @@ -353,6 +342,28 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) } EXPORT_SYMBOL(blk_queue_physical_block_size); +/** + * blk_queue_zone_write_granularity - set zone write granularity for the queue + * @q: the request queue for the zoned device + * @size: the zone write granularity size, in bytes + * + * Description: + * This should be set to the lowest possible size allowing to write in + * sequential zones of a zoned block device. 
+ */ +void blk_queue_zone_write_granularity(struct request_queue *q, + unsigned int size) +{ + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + return; + + q->limits.zone_write_granularity = size; + + if (q->limits.zone_write_granularity < q->limits.logical_block_size) + q->limits.zone_write_granularity = q->limits.logical_block_size; +} +EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity); + /** * blk_queue_alignment_offset - set physical block alignment offset * @q: the request queue for the device @@ -372,18 +383,19 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) } EXPORT_SYMBOL(blk_queue_alignment_offset); -void blk_queue_update_readahead(struct request_queue *q) +void disk_update_readahead(struct gendisk *disk) { + struct request_queue *q = disk->queue; + /* * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. */ - q->backing_dev_info->ra_pages = + disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); - q->backing_dev_info->io_pages = - queue_max_sectors(q) >> (PAGE_SHIFT - 9); + disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); } -EXPORT_SYMBOL_GPL(blk_queue_update_readahead); +EXPORT_SYMBOL_GPL(disk_update_readahead); /** * blk_limits_io_min - set minimum request size for a device @@ -463,7 +475,9 @@ EXPORT_SYMBOL(blk_limits_io_opt); void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); - q->backing_dev_info->ra_pages = + if (!q->disk) + return; + q->disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); @@ -511,7 +525,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_write_zeroes_sectors); t->max_zone_append_sectors = min(t->max_zone_append_sectors, b->max_zone_append_sectors); - t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); + t->bounce = max(t->bounce, b->bounce); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); @@ -630,6 +644,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_granularity; } + t->zone_write_granularity = max(t->zone_write_granularity, + b->zone_write_granularity); t->zoned = max(t->zoned, b->zoned); return ret; } @@ -651,17 +667,11 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, struct request_queue *t = disk->queue; if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, - get_start_sect(bdev) + (offset >> 9)) < 0) { - char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; + get_start_sect(bdev) + (offset >> 9)) < 0) + pr_notice("%s: Warning: Device %pg is misaligned\n", + disk->disk_name, bdev); - disk_name(disk, 0, top); - bdevname(bdev, bottom); - - printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", - top, bottom); - } - - blk_queue_update_readahead(disk->queue); + disk_update_readahead(disk); } EXPORT_SYMBOL(disk_stack_limits); @@ -846,6 +856,8 @@ EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); */ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) { + struct request_queue *q = disk->queue; + switch (model) { case BLK_ZONED_HM: /* @@ -864,7 +876,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) * we do nothing special as far as the block layer is concerned. 
*/ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || - disk_has_partitions(disk)) + !xa_empty(&disk->part_tbl)) model = BLK_ZONED_NONE; break; case BLK_ZONED_NONE: @@ -874,14 +886,16 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) break; } - disk->queue->limits.zoned = model; + q->limits.zoned = model; + if (model != BLK_ZONED_NONE) { + /* + * Set the zone write granularity to the device logical block + * size by default. The driver can change this value if needed. + */ + blk_queue_zone_write_granularity(q, + queue_logical_block_size(q)); + } else { + blk_queue_clear_zone_settings(q); + } } EXPORT_SYMBOL_GPL(blk_queue_set_zoned); - -static int __init blk_settings_init(void) -{ - blk_max_low_pfn = max_low_pfn - 1; - blk_max_pfn = max_pfn - 1; - return 0; -} -subsys_initcall(blk_settings_init); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b513f1683a..614d9d47de 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -60,7 +60,7 @@ static ssize_t queue_var_store64(s64 *var, const char *page) static ssize_t queue_requests_show(struct request_queue *q, char *page) { - return queue_var_show(q->nr_requests, (page)); + return queue_var_show(q->nr_requests, page); } static ssize_t @@ -88,23 +88,26 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_ra_show(struct request_queue *q, char *page) { - unsigned long ra_kb = q->backing_dev_info->ra_pages << - (PAGE_SHIFT - 10); + unsigned long ra_kb; - return queue_var_show(ra_kb, (page)); + if (!q->disk) + return -EINVAL; + ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10); + return queue_var_show(ra_kb, page); } static ssize_t queue_ra_store(struct request_queue *q, const char *page, size_t count) { unsigned long ra_kb; - ssize_t ret = queue_var_store(&ra_kb, page, count); + ssize_t ret; + if (!q->disk) + return -EINVAL; + ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - - q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10); - + q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -112,28 +115,28 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) { int max_sectors_kb = queue_max_sectors(q) >> 1; - return queue_var_show(max_sectors_kb, (page)); + return queue_var_show(max_sectors_kb, page); } static ssize_t queue_max_segments_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_segments(q), (page)); + return queue_var_show(queue_max_segments(q), page); } static ssize_t queue_max_discard_segments_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_discard_segments(q), (page)); + return queue_var_show(queue_max_discard_segments(q), page); } static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { - return queue_var_show(q->limits.max_integrity_segments, (page)); + return queue_var_show(q->limits.max_integrity_segments, page); } static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_segment_size(q), (page)); + return queue_var_show(queue_max_segment_size(q), page); } static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) @@ -219,6 +222,12 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_zeroes_sectors << 9); } +static ssize_t queue_zone_write_granularity_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_zone_write_granularity(q), 
page); +} + static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) { unsigned long long max_sectors = q->limits.max_zone_append_sectors; @@ -245,7 +254,8 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; - q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); + if (q->disk) + q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(&q->queue_lock); return ret; @@ -255,7 +265,12 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) { int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1; - return queue_var_show(max_hw_sectors_kb, (page)); + return queue_var_show(max_hw_sectors_kb, page); +} + +static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.virt_boundary_mask, page); } #define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ @@ -428,10 +443,13 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, if (ret < 0) return ret; - if (poll_on) + if (poll_on) { blk_queue_flag_set(QUEUE_FLAG_POLL, q); - else + } else { + blk_mq_freeze_queue(q); blk_queue_flag_clear(QUEUE_FLAG_POLL, q); + blk_mq_unfreeze_queue(q); + } return ret; } @@ -585,6 +603,7 @@ QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); +QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_RO_ENTRY(queue_zoned, "zoned"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); @@ -600,6 +619,7 @@ QUEUE_RO_ENTRY(queue_fua, "fua"); QUEUE_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); +QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); @@ -639,6 +659,7 @@ static struct attribute *queue_attrs[] = { &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, + &queue_zone_write_granularity_entry.attr, &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, @@ -659,6 +680,7 @@ static struct attribute *queue_attrs[] = { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &blk_throtl_sample_time_entry.attr, #endif + &queue_virt_boundary_mask_entry.attr, NULL, }; @@ -748,13 +770,6 @@ static void blk_exit_queue(struct request_queue *q) * e.g. blkcg_print_blkgs() to crash. */ blkcg_exit_queue(q); - - /* - * Since the cgroup code may dereference the @q->backing_dev_info - * pointer, only decrease its reference count after having removed the - * association with the block cgroup controller. - */ - bdi_put(q->backing_dev_info); } /** @@ -841,29 +856,6 @@ int blk_register_queue(struct gendisk *disk) struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; - if (WARN_ON(!q)) - return -ENXIO; - - WARN_ONCE(blk_queue_registered(q), - "%s is registering an already registered queue\n", - kobject_name(&dev->kobj)); - - /* - * SCSI probing may synchronously create and destroy a lot of - * request_queues for non-existent devices. Shutting down a fully - * functional queue takes measurable wallclock time as RCU grace - * periods are involved. 
To avoid excessive latency in these - * cases, a request_queue starts out in a degraded mode which is - * faster to shut down and is made fully functional here as - * request_queues for non-existent devices never get registered. - */ - if (!blk_queue_init_done(q)) { - blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); - percpu_ref_switch_to_percpu(&q->q_usage_counter); - } - - blk_queue_update_readahead(q); - ret = blk_trace_init_sysfs(dev); if (ret) return ret; @@ -920,9 +912,23 @@ int blk_register_queue(struct gendisk *disk) ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); + + /* + * SCSI probing may synchronously create and destroy a lot of + * request_queues for non-existent devices. Shutting down a fully + * functional queue takes measurable wallclock time as RCU grace + * periods are involved. To avoid excessive latency in these + * cases, a request_queue starts out in a degraded mode which is + * faster to shut down and is made fully functional here as + * request_queues for non-existent devices never get registered. + */ + if (!blk_queue_init_done(q)) { + blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); + percpu_ref_switch_to_percpu(&q->q_usage_counter); + } + return ret; } -EXPORT_SYMBOL_GPL(blk_register_queue); /** * blk_unregister_queue - counterpart of blk_register_queue() diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b771c42999..7c4e7993ba 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -178,6 +178,9 @@ struct throtl_grp { unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; + atomic_t io_split_cnt[2]; + atomic_t last_io_split_cnt[2]; + struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; @@ -587,6 +590,7 @@ static void throtl_pd_online(struct blkg_policy_data *pd) tg_update_has_rules(tg); } +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void blk_throtl_update_limit_valid(struct throtl_data *td) { struct cgroup_subsys_state *pos_css; @@ -607,6 +611,11 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) td->limit_valid[LIMIT_LOW] = low_valid; } +#else +static inline void blk_throtl_update_limit_valid(struct throtl_data *td) +{ +} +#endif static void throtl_upgrade_state(struct throtl_data *td); static void throtl_pd_offline(struct blkg_policy_data *pd) @@ -771,6 +780,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; + atomic_set(&tg->io_split_cnt[rw], 0); + /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used @@ -793,6 +804,9 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + + atomic_set(&tg->io_split_cnt[rw], 0); + throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 
'R' : 'W', tg->slice_start[rw], @@ -1025,6 +1039,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } + if (iops_limit != UINT_MAX) + tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0); + if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) @@ -2046,12 +2063,14 @@ static void throtl_downgrade_check(struct throtl_grp *tg) } if (tg->iops[READ][LIMIT_LOW]) { + tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0); iops = tg->last_io_disp[READ] * HZ / elapsed_time; if (iops >= tg->iops[READ][LIMIT_LOW]) tg->last_low_overflow_time[READ] = now; } if (tg->iops[WRITE][LIMIT_LOW]) { + tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0); iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; if (iops >= tg->iops[WRITE][LIMIT_LOW]) tg->last_low_overflow_time[WRITE] = now; @@ -2170,9 +2189,28 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif +void blk_throtl_charge_bio_split(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct throtl_grp *parent = blkg_to_tg(blkg); + struct throtl_service_queue *parent_sq; + bool rw = bio_data_dir(bio); + + do { + if (!parent->has_rules[rw]) + break; + + atomic_inc(&parent->io_split_cnt[rw]); + atomic_inc(&parent->last_io_split_cnt[rw]); + + parent_sq = parent->service_queue.parent_sq; + parent = sq_to_tg(parent_sq); + } while (parent); +} + bool blk_throtl_bio(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); @@ -2420,6 +2458,7 @@ int blk_throtl_init(struct request_queue *q) void blk_throtl_exit(struct request_queue *q) { BUG_ON(!q->td); + del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); free_percpu(q->td->latency_buckets[READ]); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 35d81b5dea..874c1c37bf 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -97,7 +97,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb; + struct bdi_writeback *wb = &rwb->rqos.q->disk->bdi->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -234,7 +234,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -287,7 +287,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -359,7 +359,7 @@ static void wb_timer_fn(struct blk_stat_callback *cb) status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step, + trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step, inflight); /* @@ -519,7 +519,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, rq_qos_wait(rqw, &data, wbt_inflight_cb, 
wbt_cleanup_cb); } -static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) +static inline bool wbt_should_throttle(struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: @@ -546,7 +546,7 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; - } else if (wbt_should_throttle(rwb, bio)) { + } else if (wbt_should_throttle(bio)) { if (current_is_kswapd()) flags |= WBT_KSWAPD; if (bio_op(bio) == REQ_OP_DISCARD) @@ -564,7 +564,6 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) } /* - * Returns true if the IO request should be accounted, false if not. * May sleep, if we have exceeded the writeback limits. Caller can pass * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. @@ -840,7 +839,6 @@ int wbt_init(struct request_queue *q) rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = 1; rwb->rq_depth.default_depth = RWB_DEF_DEPTH; - wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ab7d7ebcf6..1d0c76c18f 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -52,14 +52,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -static inline sector_t blk_zone_start(struct request_queue *q, - sector_t sector) -{ - sector_t zone_mask = blk_queue_zone_sectors(q) - 1; - - return sector & ~zone_mask; -} - /* * Return true if a request is a write request that needs zone write locking. */ @@ -169,18 +161,89 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(blkdev_report_zones); -static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, - sector_t sector, - sector_t nr_sectors) +static inline unsigned long *blk_alloc_zone_bitmap(int node, + unsigned int nr_zones) { - if (!blk_queue_zone_resetall(bdev_get_queue(bdev))) - return false; + return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), + GFP_NOIO, node); +} +static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ /* - * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors - * of the applicable zone range is the entire disk. 
*/ - return !sector && nr_sectors == get_capacity(bdev->bd_disk); + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_READONLY: + case BLK_ZONE_COND_OFFLINE: + return 0; + default: + set_bit(idx, (unsigned long *)data); + return 0; + } +} + +static int blkdev_zone_reset_all_emulated(struct block_device *bdev, + gfp_t gfp_mask) +{ + struct request_queue *q = bdev_get_queue(bdev); + sector_t capacity = get_capacity(bdev->bd_disk); + sector_t zone_sectors = blk_queue_zone_sectors(q); + unsigned long *need_reset; + struct bio *bio = NULL; + sector_t sector = 0; + int ret; + + need_reset = blk_alloc_zone_bitmap(q->node, q->nr_zones); + if (!need_reset) + return -ENOMEM; + + ret = bdev->bd_disk->fops->report_zones(bdev->bd_disk, 0, + q->nr_zones, blk_zone_need_reset_cb, + need_reset); + if (ret < 0) + goto out_free_need_reset; + + ret = 0; + while (sector < capacity) { + if (!test_bit(blk_queue_zone_no(q, sector), need_reset)) { + sector += zone_sectors; + continue; + } + + bio = blk_next_bio(bio, 0, gfp_mask); + bio_set_dev(bio, bdev); + bio->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC; + bio->bi_iter.bi_sector = sector; + sector += zone_sectors; + + /* This may take a while, so be nice to others */ + cond_resched(); + } + + if (bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + +out_free_need_reset: + kfree(need_reset); + return ret; +} + +static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) +{ + struct bio bio; + + bio_init(&bio, NULL, 0); + bio_set_dev(&bio, bdev); + bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC; + + return submit_bio_wait(&bio); } /** @@ -208,7 +271,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, sector_t capacity = get_capacity(bdev->bd_disk); sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; - int ret; + int ret = 0; if (!blk_queue_is_zoned(q)) return -EOPNOTSUPP; @@ -230,20 +293,21 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity) return -EINVAL; + /* + * In the case of a zone reset operation over all zones, + * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this + * command. For other devices, we emulate this command behavior by + * identifying the zones needing a reset. + */ + if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { + if (!blk_queue_zone_resetall(q)) + return blkdev_zone_reset_all_emulated(bdev, gfp_mask); + return blkdev_zone_reset_all(bdev, gfp_mask); + } + while (sector < end_sector) { bio = blk_next_bio(bio, 0, gfp_mask); bio_set_dev(bio, bdev); - - /* - * Special case for the zone reset operation that reset all - * zones, this is useful for applications like mkfs. 
- */ - if (op == REQ_OP_ZONE_RESET && - blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { - bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC; - break; - } - bio->bi_opf = op | REQ_SYNC; bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -296,9 +360,6 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) return -EFAULT; @@ -357,9 +418,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (!(mode & FMODE_WRITE)) return -EBADF; @@ -404,13 +462,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, return ret; } -static inline unsigned long *blk_alloc_zone_bitmap(int node, - unsigned int nr_zones) -{ - return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), - GFP_NOIO, node); -} - void blk_queue_free_zone_bitmaps(struct request_queue *q) { kfree(q->conv_zones_bitmap); @@ -542,15 +593,29 @@ int blk_revalidate_disk_zones(struct gendisk *disk, noio_flag = memalloc_noio_save(); ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); + if (!ret) { + pr_warn("%s: No zones reported\n", disk->disk_name); + ret = -ENODEV; + } memalloc_noio_restore(noio_flag); + /* + * If zones were reported, make sure that the entire disk capacity + * has been checked. + */ + if (ret > 0 && args.sector != get_capacity(disk)) { + pr_warn("%s: Missing zones from sector %llu\n", + disk->disk_name, args.sector); + ret = -ENODEV; + } + /* * Install the new bitmaps and update nr_zones only once the queue is * stopped and all I/Os are completed (i.e. a scheduler is not * referencing the bitmaps). 
 	blk_mq_freeze_queue(q);
-	if (ret >= 0) {
+	if (ret > 0) {
 		blk_queue_chunk_sectors(q, args.zone_sectors);
 		q->nr_zones = args.nr_zones;
 		swap(q->seq_zones_wlock, args.seq_zones_wlock);
@@ -569,3 +634,20 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
+
+void blk_queue_clear_zone_settings(struct request_queue *q)
+{
+	blk_mq_freeze_queue(q);
+
+	blk_queue_free_zone_bitmaps(q);
+	blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
+	q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
+	q->nr_zones = 0;
+	q->max_open_zones = 0;
+	q->max_active_zones = 0;
+	q->limits.chunk_sectors = 0;
+	q->limits.zone_write_granularity = 0;
+	q->limits.max_zone_append_sectors = 0;
+
+	blk_mq_unfreeze_queue(q);
+}
diff --git a/block/blk.h b/block/blk.h
index dfab98465d..6c3c00a8fe 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include <linux/memblock.h>	/* for max_pfn/max_low_pfn */
 #include
 #include "blk-crypto-internal.h"
 #include "blk-mq.h"
@@ -25,7 +26,6 @@ struct blk_flush_queue {
 	struct list_head	flush_data_in_flight;
 	struct request		*flush_rq;
 
-	struct lock_class_key	key;
 	spinlock_t		mq_flush_lock;
 };
 
@@ -44,17 +44,20 @@ static inline void __blk_get_queue(struct request_queue *q)
 	kobject_get(&q->kobj);
 }
 
-static inline bool
-is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx)
-{
-	return hctx->fq->flush_rq == req;
-}
+bool is_flush_rq(struct request *req);
 
 struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
 					      gfp_t flags);
 void blk_free_flush_queue(struct blk_flush_queue *q);
 
 void blk_freeze_queue(struct request_queue *q);
+void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
+void blk_queue_start_drain(struct request_queue *q);
+
+#define BIO_INLINE_VECS 4
+struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
+		gfp_t gfp_mask);
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
 
 static inline bool biovec_phys_mergeable(struct request_queue *q,
 		struct bio_vec *vec1, struct bio_vec *vec2)
@@ -91,18 +94,6 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
 	return __bvec_gap_to_prev(q, bprv, offset);
 }
 
-static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
-		unsigned int nr_segs)
-{
-	rq->nr_phys_segments = nr_segs;
-	rq->__data_len = bio->bi_iter.bi_size;
-	rq->bio = rq->biotail = bio;
-	rq->ioprio = bio_prio(bio);
-
-	if (bio->bi_disk)
-		rq->rq_disk = bio->bi_disk;
-}
-
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 void blk_flush_integrity(void);
 bool __bio_integrity_endio(struct bio *);
@@ -139,7 +130,7 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
 				bip_next->bip_vec[0].bv_offset);
 }
 
-void blk_integrity_add(struct gendisk *);
+int blk_integrity_add(struct gendisk *disk);
 void blk_integrity_del(struct gendisk *);
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 static inline bool blk_integrity_merge_rq(struct request_queue *rq,
@@ -173,8 +164,9 @@ static inline bool bio_integrity_endio(struct bio *bio)
 static inline void bio_integrity_free(struct bio *bio)
 {
 }
-static inline void blk_integrity_add(struct gendisk *disk)
+static inline int blk_integrity_add(struct gendisk *disk)
 {
+	return 0;
 }
 static inline void blk_integrity_del(struct gendisk *disk)
 {
@@ -199,7 +191,6 @@ void blk_account_io_done(struct request *req, u64 now);
 
 void blk_insert_flush(struct request *rq);
 
-void elevator_init_mq(struct request_queue *q);
 int elevator_switch_mq(struct request_queue *q,
 		struct elevator_type *new_e);
void __elevator_exit(struct request_queue *, struct elevator_queue *); @@ -215,8 +206,6 @@ static inline void elevator_exit(struct request_queue *q, __elevator_exit(q, e); } -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); - ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, @@ -234,7 +223,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *, void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); @@ -303,11 +292,13 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); extern void blk_throtl_register_queue(struct request_queue *q); +extern void blk_throtl_charge_bio_split(struct bio *bio); bool blk_throtl_bio(struct bio *bio); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } +static inline void blk_throtl_charge_bio_split(struct bio *bio) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } #endif /* CONFIG_BLK_DEV_THROTTLING */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW @@ -321,18 +312,20 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { } static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } #endif -#ifdef CONFIG_BOUNCE -extern int init_emergency_isa_pool(void); -extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); -#else -static inline int init_emergency_isa_pool(void) +void __blk_queue_bounce(struct request_queue *q, struct bio **bio); + +static inline bool blk_queue_may_bounce(struct request_queue *q) { - return 0; + return IS_ENABLED(CONFIG_BOUNCE) && + q->limits.bounce == BLK_BOUNCE_HIGH && + max_low_pfn >= max_pfn; } + static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) { + if (unlikely(blk_queue_may_bounce(q) && bio_has_data(*bio))) + __blk_queue_bounce(q, bio); } -#endif /* CONFIG_BOUNCE */ #ifdef CONFIG_BLK_CGROUP_IOLATENCY extern int blk_iolatency_init(struct request_queue *q); @@ -344,104 +337,44 @@ struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp); #ifdef CONFIG_BLK_DEV_ZONED void blk_queue_free_zone_bitmaps(struct request_queue *q); +void blk_queue_clear_zone_settings(struct request_queue *q); #else static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} +static inline void blk_queue_clear_zone_settings(struct request_queue *q) {} #endif -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); - -int blk_alloc_devt(struct hd_struct *part, dev_t *devt); -void blk_free_devt(dev_t devt); -void blk_invalidate_devt(dev_t devt); -char *disk_name(struct gendisk *hd, int partno, char *buf); +int blk_alloc_ext_minor(void); +void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -void delete_partition(struct 
hd_struct *part); -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); -int bdev_del_partition(struct block_device *bdev, int partno); -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); -int disk_expand_part_tbl(struct gendisk *disk, int target); -int hd_ref_init(struct hd_struct *part); - -/* no need to get/put refcount of part0 */ -static inline int hd_struct_try_get(struct hd_struct *part) -{ - if (part->partno) - return percpu_ref_tryget_live(&part->ref); - return 1; -} - -static inline void hd_struct_put(struct hd_struct *part) -{ - if (part->partno) - percpu_ref_put(&part->ref); -} - -static inline void hd_free_part(struct hd_struct *part) -{ - free_percpu(part->dkstats); - kfree(part->info); - percpu_ref_exit(&part->ref); -} - -/* - * Any access of part->nr_sects which is not protected by partition - * bd_mutex or gendisk bdev bd_mutex, should be done using this - * accessor function. - * - * Code written along the lines of i_size_read() and i_size_write(). - * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption - * on. - */ -static inline sector_t part_nr_sects_read(struct hd_struct *part) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - sector_t nr_sects; - unsigned seq; - do { - seq = read_seqcount_begin(&part->nr_sects_seq); - nr_sects = part->nr_sects; - } while (read_seqcount_retry(&part->nr_sects_seq, seq)); - return nr_sects; -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - sector_t nr_sects; - - preempt_disable(); - nr_sects = part->nr_sects; - preempt_enable(); - return nr_sects; -#else - return part->nr_sects; -#endif -} - -/* - * Should be called with mutex lock held (typically bd_mutex) of partition - * to provide mutual exlusion among writers otherwise seqcount might be - * left in wrong state leaving the readers spinning infinitely. 
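For reference, the accessors being removed here relied on the standard seqcount retry pattern to read a 64-bit value on 32-bit SMP kernels; the shape of that pattern, as a conceptual kernel-style sketch (types and field names are generic, not code being added):

    struct wide_counter {
        seqcount_t seq;	/* writer generation counter */
        u64 value;		/* cannot be read atomically on 32-bit */
    };

    static u64 wide_counter_read(struct wide_counter *c)
    {
        unsigned int start;
        u64 v;

        do {
            start = read_seqcount_begin(&c->seq);
            v = c->value;			/* possibly torn 64-bit read */
        } while (read_seqcount_retry(&c->seq, start));	/* raced a writer: retry */
        return v;
    }

With partitions now represented as block_devices, the size lives in the backing inode and is read with i_size_read(), which encapsulates the same scheme internally.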
- */ -static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - preempt_disable(); - write_seqcount_begin(&part->nr_sects_seq); - part->nr_sects = size; - write_seqcount_end(&part->nr_sects_seq); - preempt_enable(); -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - preempt_disable(); - part->nr_sects = size; - preempt_enable(); -#else - part->nr_sects = size; -#endif -} +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); +int bdev_del_partition(struct gendisk *disk, int partno); +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); +struct request_queue *blk_alloc_queue(int node_id); + +int disk_alloc_events(struct gendisk *disk); +void disk_add_events(struct gendisk *disk); +void disk_del_events(struct gendisk *disk); +void disk_release_events(struct gendisk *disk); +extern struct device_attribute dev_attr_events; +extern struct device_attribute dev_attr_events_async; +extern struct device_attribute dev_attr_events_poll_msecs; + +static inline void bio_clear_hipri(struct bio *bio) +{ + /* can't support alloc cache if we turn off polling */ + bio_clear_flag(bio, BIO_PERCPU_CACHE); + bio->bi_opf &= ~REQ_HIPRI; +} + +extern const struct address_space_operations def_blk_aops; + #endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c index 162a6eee89..05fc714848 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -29,7 +28,7 @@ #define ISA_POOL_SIZE 16 static struct bio_set bounce_bio_set, bounce_bio_split; -static mempool_t page_pool, isa_page_pool; +static mempool_t page_pool; static void init_bounce_bioset(void) { @@ -49,11 +48,11 @@ static void init_bounce_bioset(void) bounce_bs_setup = true; } -#if defined(CONFIG_HIGHMEM) static __init int init_emergency_pool(void) { int ret; -#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) + +#ifndef CONFIG_MEMORY_HOTPLUG if (max_pfn <= max_low_pfn) return 0; #endif @@ -67,62 +66,6 @@ static __init int init_emergency_pool(void) } __initcall(init_emergency_pool); -#endif - -#ifdef CONFIG_HIGHMEM -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned char *vto; - - vto = kmap_atomic(to->bv_page); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto); -} - -#else /* CONFIG_HIGHMEM */ - -#define bounce_copy_vec(to, vfrom) \ - memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) - -#endif /* CONFIG_HIGHMEM */ - -/* - * allocate pages in the DMA region for the ISA pool - */ -static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) -{ - return mempool_alloc_pages(gfp_mask | GFP_DMA, data); -} - -static DEFINE_MUTEX(isa_mutex); - -/* - * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA - * as the max address, so check if the pool has already been created. 
- */ -int init_emergency_isa_pool(void) -{ - int ret; - - mutex_lock(&isa_mutex); - - if (mempool_initialized(&isa_page_pool)) { - mutex_unlock(&isa_mutex); - return 0; - } - - ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa, - mempool_free_pages, (void *) 0); - BUG_ON(ret); - - pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE); - init_bounce_bioset(); - mutex_unlock(&isa_mutex); - return 0; -} /* * Simple bounce buffer support for highmem pages. Depending on the @@ -131,7 +74,6 @@ int init_emergency_isa_pool(void) */ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { - unsigned char *vfrom; struct bio_vec tovec, fromvec; struct bvec_iter iter; /* @@ -149,17 +91,14 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) * been modified by the block layer, so use the original * copy, bounce_copy_vec already uses tovec->bv_len */ - vfrom = page_address(fromvec.bv_page) + - tovec.bv_offset; - - bounce_copy_vec(&tovec, vfrom); - flush_dcache_page(tovec.bv_page); + memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) + + tovec.bv_offset); } bio_advance_iter(from, &from_iter, tovec.bv_len); } } -static void bounce_end_io(struct bio *bio, mempool_t *pool) +static void bounce_end_io(struct bio *bio) { struct bio *bio_orig = bio->bi_private; struct bio_vec *bvec, orig_vec; @@ -173,7 +112,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) orig_vec = bio_iter_iovec(bio_orig, orig_iter); if (bvec->bv_page != orig_vec.bv_page) { dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); + mempool_free(bvec->bv_page, &page_pool); } bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); } @@ -185,37 +124,20 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) static void bounce_end_io_write(struct bio *bio) { - bounce_end_io(bio, &page_pool); + bounce_end_io(bio); } -static void bounce_end_io_write_isa(struct bio *bio) -{ - - bounce_end_io(bio, &isa_page_pool); -} - -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) +static void bounce_end_io_read(struct bio *bio) { struct bio *bio_orig = bio->bi_private; if (!bio->bi_status) copy_to_high_bio_irq(bio_orig, bio); - bounce_end_io(bio, pool); + bounce_end_io(bio); } -static void bounce_end_io_read(struct bio *bio) -{ - __bounce_end_io_read(bio, &page_pool); -} - -static void bounce_end_io_read_isa(struct bio *bio) -{ - __bounce_end_io_read(bio, &isa_page_pool); -} - -static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs) +static struct bio *bounce_clone_bio(struct bio *bio_src) { struct bvec_iter iter; struct bio_vec bv; @@ -230,10 +152,10 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, * - The point of cloning the biovec is to produce a bio with a biovec * the caller can modify: bi_idx and bi_bvec_done should be 0. * - * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * - The original bio could've had more than BIO_MAX_VECS biovecs; if * we tried to clone the whole thing bio_alloc_bioset() would fail. * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_PAGES. + * actually need to allocate is fewer than BIO_MAX_VECS. 
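The memcpy_to_bvec()/memcpy_from_bvec() helpers used by this rewrite wrap the kmap-and-copy sequence that bounce.c previously open-coded; conceptually they behave like the sketch below (a simplification, the real helpers live in include/linux/bvec.h and also handle cache flushing):

    /* copy a flat buffer into a (possibly highmem) bio_vec -- sketch only */
    static inline void memcpy_to_bvec_sketch(struct bio_vec *bv, const char *from)
    {
        void *to = kmap_local_page(bv->bv_page);

        memcpy(to + bv->bv_offset, from, bv->bv_len);
        kunmap_local(to);
    }

This is why the explicit kmap_atomic()/kunmap_atomic() pairs and the CONFIG_HIGHMEM special cases can disappear from this file.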
* * - Lastly, bi_vcnt should not be looked at or relied upon by code * that does not own the bio - reason being drivers don't use it for @@ -242,11 +164,11 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, * asking for trouble and would force extra work on * __bio_clone_fast() anyways. */ - - bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); - if (!bio) - return NULL; - bio->bi_disk = bio_src->bi_disk; + bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), + &bounce_bio_set); + bio->bi_bdev = bio_src->bi_bdev; + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; @@ -267,11 +189,11 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, break; } - if (bio_crypt_clone(bio, bio_src, gfp_mask) < 0) + if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0) goto err_put; if (bio_integrity(bio_src) && - bio_integrity_clone(bio, bio_src, gfp_mask) < 0) + bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0) goto err_put; bio_clone_blkg_association(bio, bio_src); @@ -284,8 +206,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, return NULL; } -static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool) +void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { struct bio *bio; int rw = bio_data_dir(*bio_orig); @@ -294,25 +215,23 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, unsigned i = 0; bool bounce = false; int sectors = 0; - bool passthrough = bio_is_passthrough(*bio_orig); bio_for_each_segment(from, *bio_orig, iter) { - if (i++ < BIO_MAX_PAGES) + if (i++ < BIO_MAX_VECS) sectors += from.bv_len >> 9; - if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn) + if (PageHighMem(from.bv_page)) bounce = true; } if (!bounce) return; - if (!passthrough && sectors < bio_sectors(*bio_orig)) { + if (sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); bio_chain(bio, *bio_orig); submit_bio_noacct(*bio_orig); *bio_orig = bio; } - bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : - &bounce_bio_set); + bio = bounce_clone_bio(*bio_orig); /* * Bvec table can't be updated by bio_for_each_segment_all(), @@ -320,70 +239,30 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, * because the 'bio' is single-page bvec. 
*/ for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { - struct page *page = to->bv_page; + struct page *bounce_page; - if (page_to_pfn(page) <= q->limits.bounce_pfn) + if (!PageHighMem(to->bv_page)) continue; - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - inc_zone_page_state(to->bv_page, NR_BOUNCE); + bounce_page = mempool_alloc(&page_pool, GFP_NOIO); + inc_zone_page_state(bounce_page, NR_BOUNCE); if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(page); - - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap_atomic(page) + to->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap_atomic(vfrom); + flush_dcache_page(to->bv_page); + memcpy_from_bvec(page_address(bounce_page), to); } + to->bv_page = bounce_page; } - trace_block_bio_bounce(q, *bio_orig); + trace_block_bio_bounce(*bio_orig); bio->bi_flags |= (1 << BIO_BOUNCED); - if (pool == &page_pool) { + if (rw == READ) + bio->bi_end_io = bounce_end_io_read; + else bio->bi_end_io = bounce_end_io_write; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - } else { - bio->bi_end_io = bounce_end_io_write_isa; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read_isa; - } bio->bi_private = *bio_orig; *bio_orig = bio; } - -void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) -{ - mempool_t *pool; - - /* - * Data-less bio, nothing to bounce - */ - if (!bio_has_data(*bio_orig)) - return; - - /* - * for non-isa bounce case, just check if the bounce pfn is equal - * to or bigger than the highest pfn in the system -- in that case, - * don't waste time iterating over bio segments - */ - if (!(q->bounce_gfp & GFP_DMA)) { - if (q->limits.bounce_pfn >= blk_max_pfn) - return; - pool = &page_pool; - } else { - BUG_ON(!mempool_initialized(&isa_page_pool)); - pool = &isa_page_pool; - } - - /* - * slow path - */ - __blk_queue_bounce(q, bio_orig, pool); -} diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 330fede772..ccb98276c9 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -6,6 +6,7 @@ * Copyright (C) 2011 Red Hat, Inc. All rights reserved. * Copyright (C) 2011 Mike Christie */ +#include #include #include #include @@ -19,36 +20,44 @@ struct bsg_set { struct blk_mq_tag_set tag_set; + struct bsg_device *bd; bsg_job_fn *job_fn; bsg_timeout_fn *timeout_fn; }; -static int bsg_transport_check_proto(struct sg_io_v4 *hdr) +static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, + fmode_t mode, unsigned int timeout) { + struct bsg_job *job; + struct request *rq; + struct bio *bio; + int ret; + if (hdr->protocol != BSG_PROTOCOL_SCSI || hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_TRANSPORT) return -EINVAL; if (!capable(CAP_SYS_RAWIO)) return -EPERM; - return 0; -} -static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, - fmode_t mode) -{ - struct bsg_job *job = blk_mq_rq_to_pdu(rq); - int ret; + rq = blk_get_request(q, hdr->dout_xfer_len ? 
+ REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); + if (IS_ERR(rq)) + return PTR_ERR(rq); + rq->timeout = timeout; + job = blk_mq_rq_to_pdu(rq); job->request_len = hdr->request_len; job->request = memdup_user(uptr64(hdr->request), hdr->request_len); - if (IS_ERR(job->request)) - return PTR_ERR(job->request); + if (IS_ERR(job->request)) { + ret = PTR_ERR(job->request); + goto out_put_request; + } if (hdr->dout_xfer_len && hdr->din_xfer_len) { - job->bidi_rq = blk_get_request(rq->q, REQ_OP_SCSI_IN, 0); + job->bidi_rq = blk_get_request(rq->q, REQ_OP_DRV_IN, 0); if (IS_ERR(job->bidi_rq)) { ret = PTR_ERR(job->bidi_rq); - goto out; + goto out_free_job_request; } ret = blk_rq_map_user(rq->q, job->bidi_rq, NULL, @@ -63,20 +72,20 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, job->bidi_bio = NULL; } - return 0; + ret = 0; + if (hdr->dout_xfer_len) { + ret = blk_rq_map_user(rq->q, rq, NULL, uptr64(hdr->dout_xferp), + hdr->dout_xfer_len, GFP_KERNEL); + } else if (hdr->din_xfer_len) { + ret = blk_rq_map_user(rq->q, rq, NULL, uptr64(hdr->din_xferp), + hdr->din_xfer_len, GFP_KERNEL); + } -out_free_bidi_rq: - if (job->bidi_rq) - blk_put_request(job->bidi_rq); -out: - kfree(job->request); - return ret; -} + if (ret) + goto out_unmap_bidi_rq; -static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) -{ - struct bsg_job *job = blk_mq_rq_to_pdu(rq); - int ret = 0; + bio = rq->bio; + blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); /* * The assignments below don't make much sense, but are kept for @@ -84,7 +93,7 @@ static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) */ hdr->device_status = job->result & 0xff; hdr->transport_status = host_byte(job->result); - hdr->driver_status = driver_byte(job->result); + hdr->driver_status = 0; hdr->info = 0; if (hdr->device_status || hdr->transport_status || hdr->driver_status) hdr->info |= SG_INFO_CHECK; @@ -119,28 +128,20 @@ static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) hdr->din_resid = 0; } + blk_rq_unmap_user(bio); +out_unmap_bidi_rq: + if (job->bidi_rq) + blk_rq_unmap_user(job->bidi_bio); +out_free_bidi_rq: + if (job->bidi_rq) + blk_put_request(job->bidi_rq); +out_free_job_request: + kfree(job->request); +out_put_request: + blk_put_request(rq); return ret; } -static void bsg_transport_free_rq(struct request *rq) -{ - struct bsg_job *job = blk_mq_rq_to_pdu(rq); - - if (job->bidi_rq) { - blk_rq_unmap_user(job->bidi_bio); - blk_put_request(job->bidi_rq); - } - - kfree(job->request); -} - -static const struct bsg_ops bsg_transport_ops = { - .check_proto = bsg_transport_check_proto, - .fill_hdr = bsg_transport_fill_hdr, - .complete_rq = bsg_transport_complete_rq, - .free_rq = bsg_transport_free_rq, -}; - /** * bsg_teardown_job - routine to teardown a bsg job * @kref: kref inside bsg_job that is to be torn down @@ -327,7 +328,7 @@ void bsg_remove_queue(struct request_queue *q) struct bsg_set *bset = container_of(q->tag_set, struct bsg_set, tag_set); - bsg_unregister_queue(q); + bsg_unregister_queue(bset->bd); blk_cleanup_queue(q); blk_mq_free_tag_set(&bset->tag_set); kfree(bset); @@ -396,10 +397,9 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, q->queuedata = dev; blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); - ret = bsg_register_queue(q, dev, name, &bsg_transport_ops); - if (ret) { - printk(KERN_ERR "%s: bsg interface failed to " - "initialize - register queue\n", dev->kobj.name); + bset->bd = bsg_register_queue(q, dev, name, 
bsg_transport_sg_io_fn); + if (IS_ERR(bset->bd)) { + ret = PTR_ERR(bset->bd); goto out_cleanup_queue; } diff --git a/block/bsg.c b/block/bsg.c index 3d78e843a8..882f56bff1 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -15,341 +15,97 @@ #include #include -#include -#include -#include #include #define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver" #define BSG_VERSION "0.4" -#define bsg_dbg(bd, fmt, ...) \ - pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__) - struct bsg_device { struct request_queue *queue; - spinlock_t lock; - struct hlist_node dev_list; - refcount_t ref_count; - char name[20]; + struct device device; + struct cdev cdev; int max_queue; + unsigned int timeout; + unsigned int reserved_size; + bsg_sg_io_fn *sg_io_fn; }; +static inline struct bsg_device *to_bsg_device(struct inode *inode) +{ + return container_of(inode->i_cdev, struct bsg_device, cdev); +} + #define BSG_DEFAULT_CMDS 64 #define BSG_MAX_DEVS 32768 -static DEFINE_MUTEX(bsg_mutex); -static DEFINE_IDR(bsg_minor_idr); - -#define BSG_LIST_ARRAY_SIZE 8 -static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE]; - +static DEFINE_IDA(bsg_minor_ida); static struct class *bsg_class; static int bsg_major; -static inline struct hlist_head *bsg_dev_idx_hash(int index) +static unsigned int bsg_timeout(struct bsg_device *bd, struct sg_io_v4 *hdr) { - return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; + unsigned int timeout = BLK_DEFAULT_SG_TIMEOUT; + + if (hdr->timeout) + timeout = msecs_to_jiffies(hdr->timeout); + else if (bd->timeout) + timeout = bd->timeout; + + return max_t(unsigned int, timeout, BLK_MIN_SG_TIMEOUT); } -#define uptr64(val) ((void __user *)(uintptr_t)(val)) - -static int bsg_scsi_check_proto(struct sg_io_v4 *hdr) +static int bsg_sg_io(struct bsg_device *bd, fmode_t mode, void __user *uarg) { - if (hdr->protocol != BSG_PROTOCOL_SCSI || - hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_CMD) - return -EINVAL; - return 0; -} - -static int bsg_scsi_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, - fmode_t mode) -{ - struct scsi_request *sreq = scsi_req(rq); - - if (hdr->dout_xfer_len && hdr->din_xfer_len) { - pr_warn_once("BIDI support in bsg has been removed.\n"); - return -EOPNOTSUPP; - } - - sreq->cmd_len = hdr->request_len; - if (sreq->cmd_len > BLK_MAX_CDB) { - sreq->cmd = kzalloc(sreq->cmd_len, GFP_KERNEL); - if (!sreq->cmd) - return -ENOMEM; - } - - if (copy_from_user(sreq->cmd, uptr64(hdr->request), sreq->cmd_len)) - return -EFAULT; - if (blk_verify_command(sreq->cmd, mode)) - return -EPERM; - return 0; -} - -static int bsg_scsi_complete_rq(struct request *rq, struct sg_io_v4 *hdr) -{ - struct scsi_request *sreq = scsi_req(rq); - int ret = 0; - - /* - * fill in all the output members - */ - hdr->device_status = sreq->result & 0xff; - hdr->transport_status = host_byte(sreq->result); - hdr->driver_status = driver_byte(sreq->result); - hdr->info = 0; - if (hdr->device_status || hdr->transport_status || hdr->driver_status) - hdr->info |= SG_INFO_CHECK; - hdr->response_len = 0; - - if (sreq->sense_len && hdr->response) { - int len = min_t(unsigned int, hdr->max_response_len, - sreq->sense_len); - - if (copy_to_user(uptr64(hdr->response), sreq->sense, len)) - ret = -EFAULT; - else - hdr->response_len = len; - } - - if (rq_data_dir(rq) == READ) - hdr->din_resid = sreq->resid_len; - else - hdr->dout_resid = sreq->resid_len; - - return ret; -} - -static void bsg_scsi_free_rq(struct request *rq) -{ - scsi_req_free_cmd(scsi_req(rq)); -} - -static const struct bsg_ops bsg_scsi_ops = { - .check_proto = 
bsg_scsi_check_proto, - .fill_hdr = bsg_scsi_fill_hdr, - .complete_rq = bsg_scsi_complete_rq, - .free_rq = bsg_scsi_free_rq, -}; - -static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg) -{ - struct request *rq; - struct bio *bio; struct sg_io_v4 hdr; int ret; if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - - if (!q->bsg_dev.class_dev) - return -ENXIO; - if (hdr.guard != 'Q') return -EINVAL; - ret = q->bsg_dev.ops->check_proto(&hdr); - if (ret) - return ret; - - rq = blk_get_request(q, hdr.dout_xfer_len ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); - if (IS_ERR(rq)) - return PTR_ERR(rq); - - ret = q->bsg_dev.ops->fill_hdr(rq, &hdr, mode); - if (ret) { - blk_put_request(rq); - return ret; - } - - rq->timeout = msecs_to_jiffies(hdr.timeout); - if (!rq->timeout) - rq->timeout = q->sg_timeout; - if (!rq->timeout) - rq->timeout = BLK_DEFAULT_SG_TIMEOUT; - if (rq->timeout < BLK_MIN_SG_TIMEOUT) - rq->timeout = BLK_MIN_SG_TIMEOUT; - - if (hdr.dout_xfer_len) { - ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr.dout_xferp), - hdr.dout_xfer_len, GFP_KERNEL); - } else if (hdr.din_xfer_len) { - ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr.din_xferp), - hdr.din_xfer_len, GFP_KERNEL); - } - - if (ret) - goto out_free_rq; - - bio = rq->bio; - - blk_execute_rq(q, NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL)); - ret = rq->q->bsg_dev.ops->complete_rq(rq, &hdr); - blk_rq_unmap_user(bio); - -out_free_rq: - rq->q->bsg_dev.ops->free_rq(rq); - blk_put_request(rq); + ret = bd->sg_io_fn(bd->queue, &hdr, mode, bsg_timeout(bd, &hdr)); if (!ret && copy_to_user(uarg, &hdr, sizeof(hdr))) return -EFAULT; return ret; } -static struct bsg_device *bsg_alloc_device(void) -{ - struct bsg_device *bd; - - bd = kzalloc(sizeof(struct bsg_device), GFP_KERNEL); - if (unlikely(!bd)) - return NULL; - - spin_lock_init(&bd->lock); - bd->max_queue = BSG_DEFAULT_CMDS; - INIT_HLIST_NODE(&bd->dev_list); - return bd; -} - -static int bsg_put_device(struct bsg_device *bd) -{ - struct request_queue *q = bd->queue; - - mutex_lock(&bsg_mutex); - - if (!refcount_dec_and_test(&bd->ref_count)) { - mutex_unlock(&bsg_mutex); - return 0; - } - - hlist_del(&bd->dev_list); - mutex_unlock(&bsg_mutex); - - bsg_dbg(bd, "tearing down\n"); - - /* - * close can always block - */ - kfree(bd); - blk_put_queue(q); - return 0; -} - -static struct bsg_device *bsg_add_device(struct inode *inode, - struct request_queue *rq, - struct file *file) -{ - struct bsg_device *bd; - unsigned char buf[32]; - - lockdep_assert_held(&bsg_mutex); - - if (!blk_get_queue(rq)) - return ERR_PTR(-ENXIO); - - bd = bsg_alloc_device(); - if (!bd) { - blk_put_queue(rq); - return ERR_PTR(-ENOMEM); - } - - bd->queue = rq; - - refcount_set(&bd->ref_count, 1); - hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); - - strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); - bsg_dbg(bd, "bound to <%s>, max queue %d\n", - format_dev_t(buf, inode->i_rdev), bd->max_queue); - - return bd; -} - -static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q) -{ - struct bsg_device *bd; - - lockdep_assert_held(&bsg_mutex); - - hlist_for_each_entry(bd, bsg_dev_idx_hash(minor), dev_list) { - if (bd->queue == q) { - refcount_inc(&bd->ref_count); - goto found; - } - } - bd = NULL; -found: - return bd; -} - -static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file) -{ - struct bsg_device *bd; - struct bsg_class_device *bcd; - - /* - * find the class device - */ - mutex_lock(&bsg_mutex); - bcd = 
idr_find(&bsg_minor_idr, iminor(inode)); - - if (!bcd) { - bd = ERR_PTR(-ENODEV); - goto out_unlock; - } - - bd = __bsg_get_device(iminor(inode), bcd->queue); - if (!bd) - bd = bsg_add_device(inode, bcd->queue, file); - -out_unlock: - mutex_unlock(&bsg_mutex); - return bd; -} - static int bsg_open(struct inode *inode, struct file *file) { - struct bsg_device *bd; - - bd = bsg_get_device(inode, file); - - if (IS_ERR(bd)) - return PTR_ERR(bd); - - file->private_data = bd; + if (!blk_get_queue(to_bsg_device(inode)->queue)) + return -ENXIO; return 0; } static int bsg_release(struct inode *inode, struct file *file) { - struct bsg_device *bd = file->private_data; - - file->private_data = NULL; - return bsg_put_device(bd); + blk_put_queue(to_bsg_device(inode)->queue); + return 0; } static int bsg_get_command_q(struct bsg_device *bd, int __user *uarg) { - return put_user(bd->max_queue, uarg); + return put_user(READ_ONCE(bd->max_queue), uarg); } static int bsg_set_command_q(struct bsg_device *bd, int __user *uarg) { - int queue; + int max_queue; - if (get_user(queue, uarg)) + if (get_user(max_queue, uarg)) return -EFAULT; - if (queue < 1) + if (max_queue < 1) return -EINVAL; - - spin_lock_irq(&bd->lock); - bd->max_queue = queue; - spin_unlock_irq(&bd->lock); + WRITE_ONCE(bd->max_queue, max_queue); return 0; } static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct bsg_device *bd = file->private_data; + struct bsg_device *bd = to_bsg_device(file_inode(file)); + struct request_queue *q = bd->queue; void __user *uarg = (void __user *) arg; + int __user *intp = uarg; + int val; switch (cmd) { /* @@ -364,17 +120,37 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) * SCSI/sg ioctls */ case SG_GET_VERSION_NUM: + return put_user(30527, intp); case SCSI_IOCTL_GET_IDLUN: + return put_user(0, intp); case SCSI_IOCTL_GET_BUS_NUMBER: + return put_user(0, intp); case SG_SET_TIMEOUT: + if (get_user(val, intp)) + return -EFAULT; + bd->timeout = clock_t_to_jiffies(val); + return 0; case SG_GET_TIMEOUT: + return jiffies_to_clock_t(bd->timeout); case SG_GET_RESERVED_SIZE: + return put_user(min(bd->reserved_size, queue_max_bytes(q)), + intp); case SG_SET_RESERVED_SIZE: + if (get_user(val, intp)) + return -EFAULT; + if (val < 0) + return -EINVAL; + bd->reserved_size = + min_t(unsigned int, val, queue_max_bytes(q)); + return 0; case SG_EMULATED_HOST: - case SCSI_IOCTL_SEND_COMMAND: - return scsi_cmd_ioctl(bd->queue, NULL, file->f_mode, cmd, uarg); + return put_user(1, intp); case SG_IO: - return bsg_sg_io(bd->queue, file->f_mode, uarg); + return bsg_sg_io(bd, file->f_mode, uarg); + case SCSI_IOCTL_SEND_COMMAND: + pr_warn_ratelimited("%s: calling unsupported SCSI_IOCTL_SEND_COMMAND\n", + current->comm); + return -EINVAL; default: return -ENOTTY; } @@ -389,92 +165,72 @@ static const struct file_operations bsg_fops = { .llseek = default_llseek, }; -void bsg_unregister_queue(struct request_queue *q) +static void bsg_device_release(struct device *dev) { - struct bsg_class_device *bcd = &q->bsg_dev; + struct bsg_device *bd = container_of(dev, struct bsg_device, device); - if (!bcd->class_dev) - return; + ida_simple_remove(&bsg_minor_ida, MINOR(bd->device.devt)); + kfree(bd); +} - mutex_lock(&bsg_mutex); - idr_remove(&bsg_minor_idr, bcd->minor); - if (q->kobj.sd) - sysfs_remove_link(&q->kobj, "bsg"); - device_unregister(bcd->class_dev); - bcd->class_dev = NULL; - mutex_unlock(&bsg_mutex); +void bsg_unregister_queue(struct bsg_device *bd) +{ + if 
(bd->queue->kobj.sd) + sysfs_remove_link(&bd->queue->kobj, "bsg"); + cdev_device_del(&bd->cdev, &bd->device); + put_device(&bd->device); } EXPORT_SYMBOL_GPL(bsg_unregister_queue); -int bsg_register_queue(struct request_queue *q, struct device *parent, - const char *name, const struct bsg_ops *ops) +struct bsg_device *bsg_register_queue(struct request_queue *q, + struct device *parent, const char *name, bsg_sg_io_fn *sg_io_fn) { - struct bsg_class_device *bcd; - dev_t dev; + struct bsg_device *bd; int ret; - struct device *class_dev = NULL; - /* - * we need a proper transport to send commands, not a stacked device - */ - if (!queue_is_mq(q)) - return 0; + bd = kzalloc(sizeof(*bd), GFP_KERNEL); + if (!bd) + return ERR_PTR(-ENOMEM); + bd->max_queue = BSG_DEFAULT_CMDS; + bd->reserved_size = INT_MAX; + bd->queue = q; + bd->sg_io_fn = sg_io_fn; - bcd = &q->bsg_dev; - memset(bcd, 0, sizeof(*bcd)); - - mutex_lock(&bsg_mutex); - - ret = idr_alloc(&bsg_minor_idr, bcd, 0, BSG_MAX_DEVS, GFP_KERNEL); + ret = ida_simple_get(&bsg_minor_ida, 0, BSG_MAX_DEVS, GFP_KERNEL); if (ret < 0) { - if (ret == -ENOSPC) { - printk(KERN_ERR "bsg: too many bsg devices\n"); - ret = -EINVAL; - } - goto unlock; + if (ret == -ENOSPC) + dev_err(parent, "bsg: too many bsg devices\n"); + kfree(bd); + return ERR_PTR(ret); } + bd->device.devt = MKDEV(bsg_major, ret); + bd->device.class = bsg_class; + bd->device.parent = parent; + bd->device.release = bsg_device_release; + dev_set_name(&bd->device, "%s", name); + device_initialize(&bd->device); - bcd->minor = ret; - bcd->queue = q; - bcd->ops = ops; - dev = MKDEV(bsg_major, bcd->minor); - class_dev = device_create(bsg_class, parent, dev, NULL, "%s", name); - if (IS_ERR(class_dev)) { - ret = PTR_ERR(class_dev); - goto idr_remove; - } - bcd->class_dev = class_dev; + cdev_init(&bd->cdev, &bsg_fops); + bd->cdev.owner = THIS_MODULE; + ret = cdev_device_add(&bd->cdev, &bd->device); + if (ret) + goto out_put_device; if (q->kobj.sd) { - ret = sysfs_create_link(&q->kobj, &bcd->class_dev->kobj, "bsg"); + ret = sysfs_create_link(&q->kobj, &bd->device.kobj, "bsg"); if (ret) - goto unregister_class_dev; + goto out_device_del; } - mutex_unlock(&bsg_mutex); - return 0; + return bd; -unregister_class_dev: - device_unregister(class_dev); -idr_remove: - idr_remove(&bsg_minor_idr, bcd->minor); -unlock: - mutex_unlock(&bsg_mutex); - return ret; +out_device_del: + cdev_device_del(&bd->cdev, &bd->device); +out_put_device: + put_device(&bd->device); + return ERR_PTR(ret); } - -int bsg_scsi_register_queue(struct request_queue *q, struct device *parent) -{ - if (!blk_queue_scsi_passthrough(q)) { - WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); - return -EINVAL; - } - - return bsg_register_queue(q, parent, dev_name(parent), &bsg_scsi_ops); -} -EXPORT_SYMBOL_GPL(bsg_scsi_register_queue); - -static struct cdev bsg_cdev; +EXPORT_SYMBOL_GPL(bsg_register_queue); static char *bsg_devnode(struct device *dev, umode_t *mode) { @@ -483,11 +239,8 @@ static char *bsg_devnode(struct device *dev, umode_t *mode) static int __init bsg_init(void) { - int ret, i; dev_t devid; - - for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++) - INIT_HLIST_HEAD(&bsg_device_list[i]); + int ret; bsg_class = class_create(THIS_MODULE, "bsg"); if (IS_ERR(bsg_class)) @@ -497,19 +250,12 @@ static int __init bsg_init(void) ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); if (ret) goto destroy_bsg_class; - bsg_major = MAJOR(devid); - cdev_init(&bsg_cdev, &bsg_fops); - ret = cdev_add(&bsg_cdev, MKDEV(bsg_major, 0), BSG_MAX_DEVS); - 
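For reference, the only I/O path the rewritten driver keeps is the v4 SG_IO ioctl; a minimal userspace sketch follows. The node path, subprotocol, and request payload depend on the transport behind the bsg node and are illustrative only:

    #include <fcntl.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <scsi/sg.h>		/* SG_IO */
    #include <linux/bsg.h>		/* struct sg_io_v4 */

    static int bsg_request(int fd, void *req, unsigned int req_len)
    {
        struct sg_io_v4 hdr;

        memset(&hdr, 0, sizeof(hdr));
        hdr.guard = 'Q';			/* checked in bsg_sg_io() */
        hdr.protocol = BSG_PROTOCOL_SCSI;
        hdr.subprotocol = BSG_SUB_PROTOCOL_SCSI_TRANSPORT;
        hdr.request = (uint64_t)(uintptr_t)req;	/* transport-defined blob */
        hdr.request_len = req_len;
        hdr.timeout = 10000;			/* ms, clamped by bsg_timeout() */
        return ioctl(fd, SG_IO, &hdr);
    }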
if (ret)
-		goto unregister_chrdev;
-
 	printk(KERN_INFO BSG_DESCRIPTION " version " BSG_VERSION
 	       " loaded (major %d)\n", bsg_major);
 	return 0;
 
-unregister_chrdev:
-	unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS);
+
 destroy_bsg_class:
 	class_destroy(bsg_class);
 	return ret;
diff --git a/block/disk-events.c b/block/disk-events.c
index a75931ff5d..8d5496e759 100644
--- a/block/disk-events.c
+++ b/block/disk-events.c
@@ -163,15 +163,31 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
 	spin_unlock_irq(&ev->lock);
 }
 
+/*
+ * Tell userland about new events. Only the events listed in @disk->events are
+ * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are
+ * processed internally but never get reported to userland.
+ */
+static void disk_event_uevent(struct gendisk *disk, unsigned int events)
+{
+	char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
+	int nr_events = 0, i;
+
+	for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
+		if (events & disk->events & (1 << i))
+			envp[nr_events++] = disk_uevents[i];
+
+	if (nr_events)
+		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+}
+
 static void disk_check_events(struct disk_events *ev,
 			      unsigned int *clearing_ptr)
 {
 	struct gendisk *disk = ev->disk;
-	char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
 	unsigned int clearing = *clearing_ptr;
 	unsigned int events;
 	unsigned long intv;
-	int nr_events = 0, i;
 
 	/* check events */
 	events = disk->fops->check_events(disk, clearing);
@@ -190,19 +206,11 @@ static void disk_check_events(struct disk_events *ev,
 
 	spin_unlock_irq(&ev->lock);
 
-	/*
-	 * Tell userland about new events. Only the events listed in
-	 * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT
-	 * is set. Otherwise, events are processed internally but never
-	 * get reported to userland.
-	 */
-	for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
-		if ((events & disk->events & (1 << i)) &&
-		    (disk->event_flags & DISK_EVENT_FLAG_UEVENT))
-			envp[nr_events++] = disk_uevents[i];
+	if (events & DISK_EVENT_MEDIA_CHANGE)
+		inc_diskseq(disk);
 
-	if (nr_events)
-		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+	if (disk->event_flags & DISK_EVENT_FLAG_UEVENT)
+		disk_event_uevent(disk, events);
 }
 
 /**
@@ -281,6 +289,32 @@ bool bdev_check_media_change(struct block_device *bdev)
 }
 EXPORT_SYMBOL(bdev_check_media_change);
 
+/**
+ * disk_force_media_change - force a media change event
+ * @disk: the disk which will raise the event
+ * @events: the events to raise
+ *
+ * Generate uevents for the disk. If DISK_EVENT_MEDIA_CHANGE is present,
+ * also attempt to free all dentries and inodes and invalidate all block
+ * device page cache entries for that device.
+ *
+ * Returns %true if DISK_EVENT_MEDIA_CHANGE was raised, or %false if not.
+ */
+bool disk_force_media_change(struct gendisk *disk, unsigned int events)
+{
+	disk_event_uevent(disk, events);
+
+	if (!(events & DISK_EVENT_MEDIA_CHANGE))
+		return false;
+
+	if (__invalidate_device(disk->part0, true))
+		pr_warn("VFS: busy inodes on changed media %s\n",
+			disk->disk_name);
+	set_bit(GD_NEED_PART_SCAN, &disk->state);
+	return true;
+}
+EXPORT_SYMBOL_GPL(disk_force_media_change);
+
 /*
  * Separate this part out so that a different pointer for clearing_ptr can be
  * passed in for disk_clear_events.
@@ -410,17 +444,17 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
 
 /*
  * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
*/ -void disk_alloc_events(struct gendisk *disk) +int disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; if (!disk->fops->check_events || !disk->events) - return; + return 0; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) { pr_warn("%s: failed to initialize events\n", disk->disk_name); - return; + return -ENOMEM; } INIT_LIST_HEAD(&ev->node); @@ -432,6 +466,7 @@ void disk_alloc_events(struct gendisk *disk) INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); disk->ev = ev; + return 0; } void disk_add_events(struct gendisk *disk) diff --git a/block/elevator.c b/block/elevator.c index 293c5c8139..ff45d8388f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -336,6 +336,9 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_BACK_MERGE; } @@ -350,9 +353,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, * we can append 'rq' to an existing request, so we can throw 'rq' away * afterwards. * - * Returns true if we merged, false otherwise + * Returns true if we merged, false otherwise. 'free' will contain all + * requests that need to be freed. */ -bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { struct request *__rq; bool ret; @@ -363,8 +368,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) /* * First try one-hit cache. */ - if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) { + list_add(&rq->queuelist, free); return true; + } if (blk_queue_noxmerges(q)) return false; @@ -378,6 +385,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) break; + list_add(&rq->queuelist, free); /* The merged request could be merged with others, try again */ ret = true; rq = __rq; @@ -522,6 +530,10 @@ void elv_unregister_queue(struct request_queue *q) int elv_register(struct elevator_type *e) { + /* insert_requests and dispatch_request are mandatory */ + if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request)) + return -EINVAL; + /* create icq_cache if requested */ if (e->icq_size) { if (WARN_ON(e->icq_size < sizeof(struct io_cq)) || @@ -621,7 +633,11 @@ static inline bool elv_support_iosched(struct request_queue *q) */ static struct elevator_type *elevator_get_default(struct request_queue *q) { - if (q->nr_hw_queues != 1) + if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + return NULL; + + if (q->nr_hw_queues != 1 && + !blk_mq_is_sbitmap_shared(q->tag_set->flags)) return NULL; return elevator_get(q, "mq-deadline", false); @@ -693,7 +709,6 @@ void elevator_init_mq(struct request_queue *q) } } - /* * switch to new_e io scheduler. 
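With the elv_attempt_insert_merge() change above, callers now own the requests displaced by a merge and must release them after dropping their locks. The calling pattern looks roughly like this (a caller-side sketch, assuming the blk-mq helper blk_mq_free_requests() used by the scheduler glue):

    LIST_HEAD(free);

    spin_lock(&sched_lock);
    merged = elv_attempt_insert_merge(q, rq, &free);
    spin_unlock(&sched_lock);

    /* requests moved onto 'free' are dead; drop them outside the lock */
    blk_mq_free_requests(&free);

Handing back a list instead of freeing inline matters because freeing a request can take other locks (e.g. the io-context lock) that must not nest inside the scheduler lock held while merging.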
be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
diff --git a/block/fops.c b/block/fops.c
new file mode 100644
index 0000000000..1e970c247e
--- /dev/null
+++ b/block/fops.c
@@ -0,0 +1,639 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 1991, 1992  Linus Torvalds
+ * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2016 - 2020 Christoph Hellwig
+ */
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/uio.h>
+#include <linux/namei.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/falloc.h>
+#include <linux/suspend.h>
+#include <linux/fs.h>
+#include "blk.h"
+
+static struct inode *bdev_file_inode(struct file *file)
+{
+	return file->f_mapping->host;
+}
+
+static int blkdev_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh, int create)
+{
+	bh->b_bdev = I_BDEV(inode);
+	bh->b_blocknr = iblock;
+	set_buffer_mapped(bh);
+	return 0;
+}
+
+static unsigned int dio_bio_write_op(struct kiocb *iocb)
+{
+	unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+
+	/* avoid the need for an I/O completion work item */
+	if (iocb->ki_flags & IOCB_DSYNC)
+		op |= REQ_FUA;
+	return op;
+}
+
+#define DIO_INLINE_BIO_VECS 4
+
+static void blkdev_bio_end_io_simple(struct bio *bio)
+{
+	struct task_struct *waiter = bio->bi_private;
+
+	WRITE_ONCE(bio->bi_private, NULL);
+	blk_wake_io_task(waiter);
+}
+
+static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
+		struct iov_iter *iter, unsigned int nr_pages)
+{
+	struct file *file = iocb->ki_filp;
+	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
+	loff_t pos = iocb->ki_pos;
+	bool should_dirty = false;
+	struct bio bio;
+	ssize_t ret;
+	blk_qc_t qc;
+
+	if ((pos | iov_iter_alignment(iter)) &
+	    (bdev_logical_block_size(bdev) - 1))
+		return -EINVAL;
+
+	if (nr_pages <= DIO_INLINE_BIO_VECS)
+		vecs = inline_vecs;
+	else {
+		vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+				     GFP_KERNEL);
+		if (!vecs)
+			return -ENOMEM;
+	}
+
+	bio_init(&bio, vecs, nr_pages);
+	bio_set_dev(&bio, bdev);
+	bio.bi_iter.bi_sector = pos >> 9;
+	bio.bi_write_hint = iocb->ki_hint;
+	bio.bi_private = current;
+	bio.bi_end_io = blkdev_bio_end_io_simple;
+	bio.bi_ioprio = iocb->ki_ioprio;
+
+	ret = bio_iov_iter_get_pages(&bio, iter);
+	if (unlikely(ret))
+		goto out;
+	ret = bio.bi_iter.bi_size;
+
+	if (iov_iter_rw(iter) == READ) {
+		bio.bi_opf = REQ_OP_READ;
+		if (iter_is_iovec(iter))
+			should_dirty = true;
+	} else {
+		bio.bi_opf = dio_bio_write_op(iocb);
+		task_io_account_write(ret);
+	}
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		bio.bi_opf |= REQ_NOWAIT;
+	if (iocb->ki_flags & IOCB_HIPRI)
+		bio_set_polled(&bio, iocb);
+
+	qc = submit_bio(&bio);
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!READ_ONCE(bio.bi_private))
+			break;
+		if (!(iocb->ki_flags & IOCB_HIPRI) ||
+		    !blk_poll(bdev_get_queue(bdev), qc, true))
+			blk_io_schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	bio_release_pages(&bio, should_dirty);
+	if (unlikely(bio.bi_status))
+		ret = blk_status_to_errno(bio.bi_status);
+
+out:
+	if (vecs != inline_vecs)
+		kfree(vecs);
+
+	bio_uninit(&bio);
+
+	return ret;
+}
+
+struct blkdev_dio {
+	union {
+		struct kiocb		*iocb;
+		struct task_struct	*waiter;
+	};
+	size_t			size;
+	atomic_t		ref;
+	bool			multi_bio : 1;
+	bool			should_dirty : 1;
+	bool			is_sync : 1;
+	struct bio		bio;
+};
+
+static struct bio_set blkdev_dio_pool;
+
+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+	struct request_queue *q
= bdev_get_queue(bdev); + + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); +} + +static void blkdev_bio_end_io(struct bio *bio) +{ + struct blkdev_dio *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; + + if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { + if (!dio->is_sync) { + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + if (likely(!dio->bio.bi_status)) { + ret = dio->size; + iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(dio->bio.bi_status); + } + + dio->iocb->ki_complete(iocb, ret, 0); + if (dio->multi_bio) + bio_put(&dio->bio); + } else { + struct task_struct *waiter = dio->waiter; + + WRITE_ONCE(dio->waiter, NULL); + blk_wake_io_task(waiter); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + unsigned int nr_pages) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(inode); + struct blk_plug plug; + struct blkdev_dio *dio; + struct bio *bio; + bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; + bool is_read = (iov_iter_rw(iter) == READ), is_sync; + loff_t pos = iocb->ki_pos; + blk_qc_t qc = BLK_QC_T_NONE; + int ret = 0; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); + + dio = container_of(bio, struct blkdev_dio, bio); + dio->is_sync = is_sync = is_sync_kiocb(iocb); + if (dio->is_sync) { + dio->waiter = current; + bio_get(bio); + } else { + dio->iocb = iocb; + } + + dio->size = 0; + dio->multi_bio = false; + dio->should_dirty = is_read && iter_is_iovec(iter); + + /* + * Don't plug for HIPRI/polled IO, as those should go straight + * to issue + */ + if (!is_poll) + blk_start_plug(&plug); + + for (;;) { + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = pos >> 9; + bio->bi_write_hint = iocb->ki_hint; + bio->bi_private = dio; + bio->bi_end_io = blkdev_bio_end_io; + bio->bi_ioprio = iocb->ki_ioprio; + + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + break; + } + + if (is_read) { + bio->bi_opf = REQ_OP_READ; + if (dio->should_dirty) + bio_set_pages_dirty(bio); + } else { + bio->bi_opf = dio_bio_write_op(iocb); + task_io_account_write(bio->bi_iter.bi_size); + } + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; + + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); + if (!nr_pages) { + bool polled = false; + + if (iocb->ki_flags & IOCB_HIPRI) { + bio_set_polled(bio, iocb); + polled = true; + } + + qc = submit_bio(bio); + + if (polled) + WRITE_ONCE(iocb->ki_cookie, qc); + break; + } + + if (!dio->multi_bio) { + /* + * AIO needs an extra reference to ensure the dio + * structure which is embedded into the first bio + * stays around. 
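Both the simple and the multi-bio paths above reject file positions and iovecs that are not aligned to the device's logical block size, returning -EINVAL otherwise. A userspace sketch of a conforming direct read (the 512-byte alignment is an assumption; portable code should query the actual size with the BLKSSZGET ioctl):

    #define _GNU_SOURCE		/* O_DIRECT */
    #include <fcntl.h>
    #include <stdlib.h>
    #include <unistd.h>

    static ssize_t direct_read(const char *dev, size_t len)
    {
        void *buf = NULL;
        ssize_t ret;
        int fd = open(dev, O_RDONLY | O_DIRECT);

        if (fd < 0)
            return -1;
        /* buffer, length and file offset must all be logical-block aligned */
        if (posix_memalign(&buf, 512, len)) {
            close(fd);
            return -1;
        }
        ret = pread(fd, buf, len, 0);
        free(buf);
        close(fd);
        return ret;
    }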
+ */ + if (!is_sync) + bio_get(bio); + dio->multi_bio = true; + atomic_set(&dio->ref, 2); + } else { + atomic_inc(&dio->ref); + } + + submit_bio(bio); + bio = bio_alloc(GFP_KERNEL, nr_pages); + } + + if (!is_poll) + blk_finish_plug(&plug); + + if (!is_sync) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !blk_poll(bdev_get_queue(bdev), qc, true)) + blk_io_schedule(); + } + __set_current_state(TASK_RUNNING); + + if (!ret) + ret = blk_status_to_errno(dio->bio.bi_status); + if (likely(!ret)) + ret = dio->size; + + bio_put(&dio->bio); + return ret; +} + +static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + unsigned int nr_pages; + + if (!iov_iter_count(iter)) + return 0; + + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); + if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) + return __blkdev_direct_IO_simple(iocb, iter, nr_pages); + + return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); +} + +static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, blkdev_get_block, wbc); +} + +static int blkdev_readpage(struct file * file, struct page * page) +{ + return block_read_full_page(page, blkdev_get_block); +} + +static void blkdev_readahead(struct readahead_control *rac) +{ + mpage_readahead(rac, blkdev_get_block); +} + +static int blkdev_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, struct page **pagep, + void **fsdata) +{ + return block_write_begin(mapping, pos, len, flags, pagep, + blkdev_get_block); +} + +static int blkdev_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + int ret; + ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + unlock_page(page); + put_page(page); + + return ret; +} + +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return generic_writepages(mapping, wbc); +} + +const struct address_space_operations def_blk_aops = { + .set_page_dirty = __set_page_dirty_buffers, + .readpage = blkdev_readpage, + .readahead = blkdev_readahead, + .writepage = blkdev_writepage, + .write_begin = blkdev_write_begin, + .write_end = blkdev_write_end, + .writepages = blkdev_writepages, + .direct_IO = blkdev_direct_IO, + .migratepage = buffer_migrate_page_norefs, + .is_dirty_writeback = buffer_check_dirty_writeback, +}; + +/* + * for a block special file file_inode(file)->i_size is zero + * so we compute the size by hand (just as in block_read/write above) + */ +static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *bd_inode = bdev_file_inode(file); + loff_t retval; + + inode_lock(bd_inode); + retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); + inode_unlock(bd_inode); + return retval; +} + +static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *bd_inode = bdev_file_inode(filp); + struct block_device *bdev = I_BDEV(bd_inode); + int error; + + error = file_write_and_wait_range(filp, start, end); + if (error) + return error; + + /* + * There is no need to serialise calls to blkdev_issue_flush with + * i_mutex and doing so causes performance issues with concurrent + * O_SYNC writers to a block device. 
+ */
+	error = blkdev_issue_flush(bdev);
+	if (error == -EOPNOTSUPP)
+		error = 0;
+
+	return error;
+}
+
+static int blkdev_open(struct inode *inode, struct file *filp)
+{
+	struct block_device *bdev;
+
+	/*
+	 * Preserve backwards compatibility and allow large file access
+	 * even if userspace doesn't ask for it explicitly. Some mkfs
+	 * binaries need it. We might want to drop this workaround
+	 * during an unstable branch.
+	 */
+	filp->f_flags |= O_LARGEFILE;
+	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+
+	if (filp->f_flags & O_NDELAY)
+		filp->f_mode |= FMODE_NDELAY;
+	if (filp->f_flags & O_EXCL)
+		filp->f_mode |= FMODE_EXCL;
+	if ((filp->f_flags & O_ACCMODE) == 3)
+		filp->f_mode |= FMODE_WRITE_IOCTL;
+
+	bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+	filp->f_mapping = bdev->bd_inode->i_mapping;
+	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
+	return 0;
+}
+
+static int blkdev_close(struct inode *inode, struct file *filp)
+{
+	struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
+
+	blkdev_put(bdev, filp->f_mode);
+	return 0;
+}
+
+static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+	fmode_t mode = file->f_mode;
+
+	/*
+	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
+	 * to update it before every ioctl.
+	 */
+	if (file->f_flags & O_NDELAY)
+		mode |= FMODE_NDELAY;
+	else
+		mode &= ~FMODE_NDELAY;
+
+	return blkdev_ioctl(bdev, mode, cmd, arg);
+}
+
+/*
+ * Write data to the block device. Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */ +static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *bd_inode = bdev_file_inode(file); + loff_t size = i_size_read(bd_inode); + struct blk_plug plug; + size_t shorted = 0; + ssize_t ret; + + if (bdev_read_only(I_BDEV(bd_inode))) + return -EPERM; + + if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) + return -ETXTBSY; + + if (!iov_iter_count(from)) + return 0; + + if (iocb->ki_pos >= size) + return -ENOSPC; + + if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) + return -EOPNOTSUPP; + + size -= iocb->ki_pos; + if (iov_iter_count(from) > size) { + shorted = iov_iter_count(from) - size; + iov_iter_truncate(from, size); + } + + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + iov_iter_reexpand(from, iov_iter_count(from) + shorted); + blk_finish_plug(&plug); + return ret; +} + +static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *bd_inode = bdev_file_inode(file); + loff_t size = i_size_read(bd_inode); + loff_t pos = iocb->ki_pos; + size_t shorted = 0; + ssize_t ret; + + if (pos >= size) + return 0; + + size -= pos; + if (iov_iter_count(to) > size) { + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); + } + + ret = generic_file_read_iter(iocb, to); + iov_iter_reexpand(to, iov_iter_count(to) + shorted); + return ret; +} + +#define BLKDEV_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) + +static long blkdev_fallocate(struct file *file, int mode, loff_t start, + loff_t len) +{ + struct inode *inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(inode); + loff_t end = start + len - 1; + loff_t isize; + int error; + + /* Fail if we don't recognize the flags. */ + if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* Don't go off the end of the device. */ + isize = i_size_read(bdev->bd_inode); + if (start >= isize) + return -EINVAL; + if (end >= isize) { + if (mode & FALLOC_FL_KEEP_SIZE) { + len = isize - start; + end = start + len - 1; + } else + return -EINVAL; + } + + /* + * Don't allow IO that isn't aligned to logical block size. + */ + if ((start | len) & (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + filemap_invalidate_lock(inode->i_mapping); + + /* Invalidate the page cache, including dirty pages. 
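The clamping in blkdev_read_iter()/blkdev_write_iter() above is directly observable from userspace: a read at or past the end of the device returns 0, while a write starting there fails with ENOSPC rather than extending the device. A sketch, assuming /dev/loop0 is set up:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	unsigned long long size;
	char c = 0;
	int fd = open("/dev/loop0", O_RDWR);	/* device is an assumption */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKGETSIZE64, &size))	/* device size in bytes */
		return 1;
	printf("read at end: %zd\n", pread(fd, &c, 1, size));	/* prints 0 */
	if (pwrite(fd, &c, 1, size) < 0)
		perror("pwrite at end");	/* ENOSPC per the check above */
	close(fd);
	return 0;
}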
+	 */
+	error = truncate_bdev_range(bdev, file->f_mode, start, end);
+	if (error)
+		goto fail;
+
+	switch (mode) {
+	case FALLOC_FL_ZERO_RANGE:
+	case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
+		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
+					     GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
+		break;
+	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
+		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
+					     GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
+		break;
+	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
+		error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
+					     GFP_KERNEL, 0);
+		break;
+	default:
+		error = -EOPNOTSUPP;
+	}
+
+ fail:
+	filemap_invalidate_unlock(inode->i_mapping);
+	return error;
+}
+
+const struct file_operations def_blk_fops = {
+	.open		= blkdev_open,
+	.release	= blkdev_close,
+	.llseek		= blkdev_llseek,
+	.read_iter	= blkdev_read_iter,
+	.write_iter	= blkdev_write_iter,
+	.iopoll		= blkdev_iopoll,
+	.mmap		= generic_file_mmap,
+	.fsync		= blkdev_fsync,
+	.unlocked_ioctl	= block_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= compat_blkdev_ioctl,
+#endif
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.fallocate	= blkdev_fallocate,
+};
+
+static __init int blkdev_init(void)
+{
+	return bioset_init(&blkdev_dio_pool, 4,
+				offsetof(struct blkdev_dio, bio),
+				BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
+}
+module_init(blkdev_init);
diff --git a/block/genhd.c b/block/genhd.c
index 796baf7612..b49858550f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * gendisk handling
+ *
+ * Portions Copyright (C) 2020 Christoph Hellwig
  */
 #include
@@ -17,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -25,57 +26,88 @@
 #include
 #include "blk.h"
+#include "blk-rq-qos.h"
 
-static DEFINE_MUTEX(block_class_lock);
 static struct kobject *block_depr;
 
+/*
+ * Unique, monotonically increasing sequential number associated with block
+ * device instances (i.e. incremented each time a device is attached).
+ * Associating uevents with block devices in userspace is difficult and racy:
+ * the uevent netlink socket is lossy, and on slow and overloaded systems has
+ * a very high latency.
+ * Block devices do not have exclusive owners in userspace; any process can set
+ * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0
+ * can be reused again and again).
+ * A userspace process setting up a block device and watching for its events
+ * thus cannot reliably tell whether an event relates to the device it just set
+ * up or another earlier instance with the same name.
+ * This sequential number allows userspace processes to solve this problem, and
+ * uniquely associate an uevent with the lifetime of a device.
+ */
+static atomic64_t diskseq;
+
 /* for extended dynamic devt allocation, currently only one major is used */
 #define NR_EXT_DEVT (1 << MINORBITS)
+static DEFINE_IDA(ext_devt_ida);
 
-/* For extended devt allocation. ext_devt_lock prevents look up
- * results from going away underneath its user.
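The mode switch in blkdev_fallocate() above maps fallocate(2) flag combinations onto the kernel's zeroout and discard primitives. A hedged sketch of the userspace side, assuming /dev/loop0 and offsets aligned to the logical block size (unaligned ranges fail with EINVAL per the alignment check above):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/loop0", O_WRONLY);	/* device is an assumption */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* maps to blkdev_issue_zeroout(..., BLKDEV_ZERO_NOUNMAP) above */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
		      0, 1 << 20))
		perror("fallocate");
	close(fd);
	return 0;
}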
- */
-static DEFINE_SPINLOCK(ext_devt_lock);
-static DEFINE_IDR(ext_devt_idr);
+void set_capacity(struct gendisk *disk, sector_t sectors)
+{
+	struct block_device *bdev = disk->part0;
 
-static void disk_check_events(struct disk_events *ev,
-			      unsigned int *clearing_ptr);
-static void disk_alloc_events(struct gendisk *disk);
-static void disk_add_events(struct gendisk *disk);
-static void disk_del_events(struct gendisk *disk);
-static void disk_release_events(struct gendisk *disk);
+	spin_lock(&bdev->bd_size_lock);
+	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+	spin_unlock(&bdev->bd_size_lock);
+}
+EXPORT_SYMBOL(set_capacity);
 
 /*
- * Set disk capacity and notify if the size is not currently
- * zero and will not be set to zero
+ * Set disk capacity and notify if the size is not currently zero and will not
+ * be set to zero.  Returns true if a uevent was sent, otherwise false.
  */
-bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
-					bool update_bdev)
+bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
 {
 	sector_t capacity = get_capacity(disk);
+	char *envp[] = { "RESIZE=1", NULL };
 
 	set_capacity(disk, size);
-	if (update_bdev)
-		revalidate_disk_size(disk, true);
 
-	if (capacity != size && capacity != 0 && size != 0) {
-		char *envp[] = { "RESIZE=1", NULL };
+	/*
+	 * Only print a message and send a uevent if the gendisk is user visible
+	 * and alive.  This avoids spamming the log and udev when setting the
+	 * initial capacity during probing.
+	 */
+	if (size == capacity ||
+	    !disk_live(disk) ||
+	    (disk->flags & GENHD_FL_HIDDEN))
+		return false;
 
-		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
-		return true;
-	}
+	pr_info("%s: detected capacity change from %lld to %lld\n",
+		disk->disk_name, capacity, size);
 
-	return false;
+	/*
+	 * Historically we did not send a uevent for changes to/from an empty
+	 * device.
+	 */
+	if (!capacity || !size)
+		return false;
+	kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+	return true;
 }
-
-EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify);
+EXPORT_SYMBOL_GPL(set_capacity_and_notify);
 
 /*
- * Format the device name of the indicated disk into the supplied buffer and
- * return a pointer to that same buffer for convenience.
+ * Format the device name of the indicated block device into the supplied buffer
+ * and return a pointer to that same buffer for convenience.
+ *
+ * Note: do not use this in new code, use the %pg specifier to sprintf and
+ * printk instead.
*/ -char *disk_name(struct gendisk *hd, int partno, char *buf) +const char *bdevname(struct block_device *bdev, char *buf) { + struct gendisk *hd = bdev->bd_disk; + int partno = bdev->bd_partno; + if (!partno) snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) @@ -85,20 +117,16 @@ char *disk_name(struct gendisk *hd, int partno, char *buf) return buf; } - -const char *bdevname(struct block_device *bdev, char *buf) -{ - return disk_name(bdev->bd_disk, bdev->bd_partno, buf); -} EXPORT_SYMBOL(bdevname); -static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +static void part_stat_read_all(struct block_device *part, + struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { - struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { @@ -112,7 +140,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) } } -static unsigned int part_in_flight(struct hd_struct *part) +static unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; @@ -127,7 +155,8 @@ static unsigned int part_in_flight(struct hd_struct *part) return inflight; } -static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) +static void part_in_flight_rw(struct block_device *part, + unsigned int inflight[2]) { int cpu; @@ -143,250 +172,6 @@ static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) inflight[1] = 0; } -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); - - if (unlikely(partno < 0 || partno >= ptbl->len)) - return NULL; - return rcu_dereference(ptbl->part[partno]); -} - -/** - * disk_get_part - get partition - * @disk: disk to look partition from - * @partno: partition number - * - * Look for partition @partno from @disk. If found, increment - * reference count and return it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Pointer to the found partition on success, NULL if not found. - */ -struct hd_struct *disk_get_part(struct gendisk *disk, int partno) -{ - struct hd_struct *part; - - rcu_read_lock(); - part = __disk_get_part(disk, partno); - if (part) - get_device(part_to_dev(part)); - rcu_read_unlock(); - - return part; -} - -/** - * disk_part_iter_init - initialize partition iterator - * @piter: iterator to initialize - * @disk: disk to iterate over - * @flags: DISK_PITER_* flags - * - * Initialize @piter so that it iterates over partitions of @disk. - * - * CONTEXT: - * Don't care. - */ -void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, - unsigned int flags) -{ - struct disk_part_tbl *ptbl; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - - piter->disk = disk; - piter->part = NULL; - - if (flags & DISK_PITER_REVERSE) - piter->idx = ptbl->len - 1; - else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) - piter->idx = 0; - else - piter->idx = 1; - - piter->flags = flags; - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(disk_part_iter_init); - -/** - * disk_part_iter_next - proceed iterator to the next partition and return it - * @piter: iterator of interest - * - * Proceed @piter to the next partition and return it. - * - * CONTEXT: - * Don't care. 
- */ -struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) -{ - struct disk_part_tbl *ptbl; - int inc, end; - - /* put the last partition */ - disk_put_part(piter->part); - piter->part = NULL; - - /* get part_tbl */ - rcu_read_lock(); - ptbl = rcu_dereference(piter->disk->part_tbl); - - /* determine iteration parameters */ - if (piter->flags & DISK_PITER_REVERSE) { - inc = -1; - if (piter->flags & (DISK_PITER_INCL_PART0 | - DISK_PITER_INCL_EMPTY_PART0)) - end = -1; - else - end = 0; - } else { - inc = 1; - end = ptbl->len; - } - - /* iterate to the next partition */ - for (; piter->idx != end; piter->idx += inc) { - struct hd_struct *part; - - part = rcu_dereference(ptbl->part[piter->idx]); - if (!part) - continue; - get_device(part_to_dev(part)); - piter->part = part; - if (!part_nr_sects_read(part) && - !(piter->flags & DISK_PITER_INCL_EMPTY) && - !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && - piter->idx == 0)) { - put_device(part_to_dev(part)); - piter->part = NULL; - continue; - } - - piter->idx += inc; - break; - } - - rcu_read_unlock(); - - return piter->part; -} -EXPORT_SYMBOL_GPL(disk_part_iter_next); - -/** - * disk_part_iter_exit - finish up partition iteration - * @piter: iter of interest - * - * Called when iteration is over. Cleans up @piter. - * - * CONTEXT: - * Don't care. - */ -void disk_part_iter_exit(struct disk_part_iter *piter) -{ - disk_put_part(piter->part); - piter->part = NULL; -} -EXPORT_SYMBOL_GPL(disk_part_iter_exit); - -static inline int sector_in_part(struct hd_struct *part, sector_t sector) -{ - return part->start_sect <= sector && - sector < part->start_sect + part_nr_sects_read(part); -} - -/** - * disk_map_sector_rcu - map sector to partition - * @disk: gendisk of interest - * @sector: sector to map - * - * Find out which partition @sector maps to on @disk. This is - * primarily used for stats accounting. - * - * CONTEXT: - * RCU read locked. The returned partition pointer is always valid - * because its refcount is grabbed except for part0, which lifetime - * is same with the disk. - * - * RETURNS: - * Found partition on success, part0 is returned if no partition matches - * or the matched partition is being deleted. - */ -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) -{ - struct disk_part_tbl *ptbl; - struct hd_struct *part; - int i; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - - part = rcu_dereference(ptbl->last_lookup); - if (part && sector_in_part(part, sector) && hd_struct_try_get(part)) - goto out_unlock; - - for (i = 1; i < ptbl->len; i++) { - part = rcu_dereference(ptbl->part[i]); - - if (part && sector_in_part(part, sector)) { - /* - * only live partition can be cached for lookup, - * so use-after-free on cached & deleting partition - * can be avoided - */ - if (!hd_struct_try_get(part)) - break; - rcu_assign_pointer(ptbl->last_lookup, part); - goto out_unlock; - } - } - - part = &disk->part0; -out_unlock: - rcu_read_unlock(); - return part; -} - -/** - * disk_has_partitions - * @disk: gendisk of interest - * - * Walk through the partition table and check if valid partition exists. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * True if the gendisk has at least one valid non-zero size partition. - * Otherwise false. 
- */
-bool disk_has_partitions(struct gendisk *disk)
-{
-	struct disk_part_tbl *ptbl;
-	int i;
-	bool ret = false;
-
-	rcu_read_lock();
-	ptbl = rcu_dereference(disk->part_tbl);
-
-	/* Iterate partitions skipping the whole device at index 0 */
-	for (i = 1; i < ptbl->len; i++) {
-		if (rcu_dereference(ptbl->part[i])) {
-			ret = true;
-			break;
-		}
-	}
-
-	rcu_read_unlock();
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(disk_has_partitions);
-
 /*
  * Can be deleted altogether. Later.
  *
@@ -396,7 +181,10 @@ static struct blk_major_name {
 	struct blk_major_name *next;
 	int major;
 	char name[16];
+	void (*probe)(dev_t devt);
 } *major_names[BLKDEV_MAJOR_HASH_SIZE];
+static DEFINE_MUTEX(major_names_lock);
+static DEFINE_SPINLOCK(major_names_spinlock);
 
 /* index in the above - for now: assume no multimajor ranges */
 static inline int major_to_index(unsigned major)
@@ -409,20 +197,21 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
 {
 	struct blk_major_name *dp;
 
-	mutex_lock(&block_class_lock);
+	spin_lock(&major_names_spinlock);
 	for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next)
 		if (dp->major == offset)
 			seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
-	mutex_unlock(&block_class_lock);
+	spin_unlock(&major_names_spinlock);
 }
 #endif /* CONFIG_PROC_FS */
 
 /**
- * register_blkdev - register a new block device
+ * __register_blkdev - register a new block device
  *
  * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
  *         @major = 0, try to allocate any unused major number.
  * @name: the name of the new block device as a zero terminated string
+ * @probe: callback that is called on access to any minor number of @major
  *
  * The @name must be unique within the system.
 *
@@ -436,13 +225,16 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
 *
 * See Documentation/admin-guide/devices.txt for the list of allocated
 * major numbers.
+ *
+ * Use register_blkdev instead for any new code.
*/ -int register_blkdev(unsigned int major, const char *name) +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)) { struct blk_major_name **n, *p; int index, ret = 0; - mutex_lock(&block_class_lock); + mutex_lock(&major_names_lock); /* temporary */ if (major == 0) { @@ -476,10 +268,12 @@ int register_blkdev(unsigned int major, const char *name) } p->major = major; + p->probe = probe; strlcpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); + spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) { if ((*n)->major == major) break; @@ -488,6 +282,7 @@ int register_blkdev(unsigned int major, const char *name) *n = p; else ret = -EBUSY; + spin_unlock(&major_names_spinlock); if (ret < 0) { printk("register_blkdev: cannot get major %u for %s\n", @@ -495,11 +290,10 @@ int register_blkdev(unsigned int major, const char *name) kfree(p); } out: - mutex_unlock(&block_class_lock); + mutex_unlock(&major_names_lock); return ret; } - -EXPORT_SYMBOL(register_blkdev); +EXPORT_SYMBOL(__register_blkdev); void unregister_blkdev(unsigned int major, const char *name) { @@ -507,7 +301,8 @@ void unregister_blkdev(unsigned int major, const char *name) struct blk_major_name *p = NULL; int index = major_to_index(major); - mutex_lock(&block_class_lock); + mutex_lock(&major_names_lock); + spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; @@ -517,117 +312,26 @@ void unregister_blkdev(unsigned int major, const char *name) p = *n; *n = p->next; } - mutex_unlock(&block_class_lock); + spin_unlock(&major_names_spinlock); + mutex_unlock(&major_names_lock); kfree(p); } EXPORT_SYMBOL(unregister_blkdev); -static struct kobj_map *bdev_map; - -/** - * blk_mangle_minor - scatter minor numbers apart - * @minor: minor number to mangle - * - * Scatter consecutively allocated @minor number apart if MANGLE_DEVT - * is enabled. Mangling twice gives the original value. - * - * RETURNS: - * Mangled value. - * - * CONTEXT: - * Don't care. - */ -static int blk_mangle_minor(int minor) +int blk_alloc_ext_minor(void) { -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT - int i; - - for (i = 0; i < MINORBITS / 2; i++) { - int low = minor & (1 << i); - int high = minor & (1 << (MINORBITS - 1 - i)); - int distance = MINORBITS - 1 - 2 * i; - - minor ^= low | high; /* clear both bits */ - low <<= distance; /* swap the positions */ - high >>= distance; - minor |= low | high; /* and set */ - } -#endif - return minor; -} - -/** - * blk_alloc_devt - allocate a dev_t for a partition - * @part: partition to allocate dev_t for - * @devt: out parameter for resulting dev_t - * - * Allocate a dev_t for block device. - * - * RETURNS: - * 0 on success, allocated dev_t is returned in *@devt. -errno on - * failure. - * - * CONTEXT: - * Might sleep. - */ -int blk_alloc_devt(struct hd_struct *part, dev_t *devt) -{ - struct gendisk *disk = part_to_disk(part); int idx; - /* in consecutive minor range? */ - if (part->partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + part->partno); - return 0; - } - - /* allocate ext devt */ - idr_preload(GFP_KERNEL); - - spin_lock_bh(&ext_devt_lock); - idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); - spin_unlock_bh(&ext_devt_lock); - - idr_preload_end(); - if (idx < 0) - return idx == -ENOSPC ? 
-EBUSY : idx; - - *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); - return 0; + idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); + if (idx == -ENOSPC) + return -EBUSY; + return idx; } -/** - * blk_free_devt - free a dev_t - * @devt: dev_t to free - * - * Free @devt which was allocated using blk_alloc_devt(). - * - * CONTEXT: - * Might sleep. - */ -void blk_free_devt(dev_t devt) +void blk_free_ext_minor(unsigned int minor) { - if (devt == MKDEV(0, 0)) - return; - - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); - } -} - -/* - * We invalidate devt by assigning NULL pointer for devt in idr. - */ -void blk_invalidate_devt(dev_t devt) -{ - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); - } + ida_free(&ext_devt_ida, minor); } static char *bdevt_str(dev_t devt, char *buf) @@ -642,42 +346,26 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } -/* - * Register device numbers dev..(dev+range-1) - * range must be nonzero - * The hash chain is sorted on range, so that subranges can override. - */ -void blk_register_region(dev_t devt, unsigned long range, struct module *module, - struct kobject *(*probe)(dev_t, int *, void *), - int (*lock)(dev_t, void *), void *data) +void disk_uevent(struct gendisk *disk, enum kobject_action action) { - kobj_map(bdev_map, devt, range, module, probe, lock, data); -} - -EXPORT_SYMBOL(blk_register_region); - -void blk_unregister_region(dev_t devt, unsigned long range) -{ - kobj_unmap(bdev_map, devt, range); -} - -EXPORT_SYMBOL(blk_unregister_region); - -static struct kobject *exact_match(dev_t devt, int *partno, void *data) -{ - struct gendisk *p = data; - - return &disk_to_dev(p)->kobj; -} - -static int exact_lock(dev_t devt, void *data) -{ - struct gendisk *p = data; - - if (!get_disk_and_module(p)) - return -1; - return 0; + struct block_device *part; + unsigned long idx; + + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (bdev_is_partition(part) && !bdev_nr_sectors(part)) + continue; + if (!kobject_get_unless_zero(&part->bd_device.kobj)) + continue; + + rcu_read_unlock(); + kobject_uevent(bdev_kobj(part), action); + put_device(&part->bd_device); + rcu_read_lock(); + } + rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(disk_uevent); static void disk_scan_partitions(struct gendisk *disk) { @@ -692,34 +380,78 @@ static void disk_scan_partitions(struct gendisk *disk) blkdev_put(bdev, FMODE_READ); } -static void register_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) +/** + * device_add_disk - add disk information to kernel list + * @parent: parent device for the disk + * @disk: per-device partitioning information + * @groups: Additional per-device sysfs groups + * + * This function registers the partitioning information in @disk + * with the kernel. + */ +int device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) + { struct device *ddev = disk_to_dev(disk); - struct disk_part_iter piter; - struct hd_struct *part; - int err; + int ret; - ddev->parent = parent; + /* + * The disk queue should now be all set with enough information about + * the device for the elevator code to pick an adequate default + * elevator if one is needed, that is, for devices requesting queue + * registration. 
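Both disk_uevent() above and set_capacity_and_notify() earlier emit KOBJ_CHANGE uevents, and the diskseq counter exists precisely so a listener can tie them to a device instance. A minimal sketch of such a listener on the kobject-uevent netlink socket, with no assumptions beyond libc and the UAPI headers:

#include <linux/netlink.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_nl sa = {
		.nl_family = AF_NETLINK,
		.nl_groups = 1,		/* kernel uevent multicast group */
	};
	char buf[8192];
	int fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);

	if (fd < 0 || bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		perror("netlink");
		return 1;
	}
	for (;;) {
		ssize_t len = recv(fd, buf, sizeof(buf) - 1, 0);

		if (len <= 0)
			break;
		buf[len] = '\0';
		/* a header line, then NUL-separated KEY=value pairs */
		for (ssize_t i = 0; i < len; i += strlen(buf + i) + 1)
			puts(buf + i);
	}
	close(fd);
	return 0;
}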
+	 */
+	elevator_init_mq(disk->queue);
 
-	dev_set_name(ddev, "%s", disk->disk_name);
+	/*
+	 * If the driver provides an explicit major number it also must provide
+	 * the number of minor numbers supported, and those will be used to
+	 * set up the gendisk.
+	 * Otherwise just allocate the device numbers for both the whole device
+	 * and all partitions from the extended dev_t space.
+	 */
+	if (disk->major) {
+		if (WARN_ON(!disk->minors))
+			return -EINVAL;
+
+		if (disk->minors > DISK_MAX_PARTS) {
+			pr_err("block: can't allocate more than %d partitions\n",
+				DISK_MAX_PARTS);
+			disk->minors = DISK_MAX_PARTS;
+		}
+	} else {
+		if (WARN_ON(disk->minors))
+			return -EINVAL;
+
+		ret = blk_alloc_ext_minor();
+		if (ret < 0)
+			return ret;
+		disk->major = BLOCK_EXT_MAJOR;
+		disk->first_minor = ret;
+		disk->flags |= GENHD_FL_EXT_DEVT;
+	}
+
+	ret = disk_alloc_events(disk);
+	if (ret)
+		goto out_free_ext_minor;
 
 	/* delay uevents, until we scanned partition table */
 	dev_set_uevent_suppress(ddev, 1);
 
-	if (groups) {
-		WARN_ON(ddev->groups);
-		ddev->groups = groups;
-	}
-	if (device_add(ddev))
-		return;
+	ddev->parent = parent;
+	ddev->groups = groups;
+	dev_set_name(ddev, "%s", disk->disk_name);
+	if (!(disk->flags & GENHD_FL_HIDDEN))
+		ddev->devt = MKDEV(disk->major, disk->first_minor);
+	ret = device_add(ddev);
+	if (ret)
+		goto out_disk_release_events;
 	if (!sysfs_deprecated) {
-		err = sysfs_create_link(block_depr, &ddev->kobj,
+		ret = sysfs_create_link(block_depr, &ddev->kobj,
 					kobject_name(&ddev->kobj));
-		if (err) {
-			device_del(ddev);
-			return;
-		}
+		if (ret)
+			goto out_device_del;
 	}
 
 	/*
@@ -729,79 +461,25 @@ static void register_disk(struct device *parent, struct gendisk *disk,
 	 */
 	pm_runtime_set_memalloc_noio(ddev, true);
 
-	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
+	ret = blk_integrity_add(disk);
+	if (ret)
+		goto out_del_block_link;
+
+	disk->part0->bd_holder_dir =
+		kobject_create_and_add("holders", &ddev->kobj);
+	if (!disk->part0->bd_holder_dir)
+		goto out_del_integrity;
 	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
+	if (!disk->slave_dir)
+		goto out_put_holder_dir;
 
-	if (disk->flags & GENHD_FL_HIDDEN)
-		return;
+	ret = bd_register_pending_holders(disk);
+	if (ret < 0)
+		goto out_put_slave_dir;
 
-	disk_scan_partitions(disk);
-
-	/* announce disk after possible partitions are created */
-	dev_set_uevent_suppress(ddev, 0);
-	kobject_uevent(&ddev->kobj, KOBJ_ADD);
-
-	/* announce possible partitions */
-	disk_part_iter_init(&piter, disk, 0);
-	while ((part = disk_part_iter_next(&piter)))
-		kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
-	disk_part_iter_exit(&piter);
-
-	if (disk->queue->backing_dev_info->dev) {
-		err = sysfs_create_link(&ddev->kobj,
-			  &disk->queue->backing_dev_info->dev->kobj,
-			  "bdi");
-		WARN_ON(err);
-	}
-}
-
-/**
- * __device_add_disk - add disk information to kernel list
- * @parent: parent device for the disk
- * @disk: per-device partitioning information
- * @groups: Additional per-device sysfs groups
- * @register_queue: register the queue if set to true
- *
- * This function registers the partitioning information in @disk
- * with the kernel.
- * - * FIXME: error handling - */ -static void __device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups, - bool register_queue) -{ - dev_t devt; - int retval; - - /* - * The disk queue should now be all set with enough information about - * the device for the elevator code to pick an adequate default - * elevator if one is needed, that is, for devices requesting queue - * registration. - */ - if (register_queue) - elevator_init_mq(disk->queue); - - /* minors == 0 indicates to use ext devt from part0 and should - * be accompanied with EXT_DEVT flag. Make sure all - * parameters make sense. - */ - WARN_ON(disk->minors && !(disk->major || disk->first_minor)); - WARN_ON(!disk->minors && - !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); - - disk->flags |= GENHD_FL_UP; - - retval = blk_alloc_devt(&disk->part0, &devt); - if (retval) { - WARN_ON(1); - return; - } - disk->major = MAJOR(devt); - disk->first_minor = MINOR(devt); - - disk_alloc_events(disk); + ret = blk_register_queue(disk); + if (ret) + goto out_put_slave_dir; if (disk->flags & GENHD_FL_HIDDEN) { /* @@ -811,65 +489,56 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { - struct backing_dev_info *bdi = disk->queue->backing_dev_info; - struct device *dev = disk_to_dev(disk); - int ret; + ret = bdi_register(disk->bdi, "%u:%u", + disk->major, disk->first_minor); + if (ret) + goto out_unregister_queue; + bdi_set_owner(disk->bdi, ddev); + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + if (ret) + goto out_unregister_bdi; - /* Register BDI before referencing it from bdev */ - dev->devt = devt; - ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); - WARN_ON(ret); - bdi_set_owner(bdi, dev); - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); + bdev_add(disk->part0, ddev->devt); + disk_scan_partitions(disk); + + /* + * Announce the disk and partitions after all partitions are + * created. (for hidden disks uevents remain suppressed forever) + */ + dev_set_uevent_suppress(ddev, 0); + disk_uevent(disk, KOBJ_ADD); } - register_disk(parent, disk, groups); - if (register_queue) - blk_register_queue(disk); - - /* - * Take an extra ref on queue which will be put on disk_release() - * so that it sticks around as long as @disk is there. 
- */ - WARN_ON_ONCE(!blk_get_queue(disk->queue)); + disk_update_readahead(disk); disk_add_events(disk); - blk_integrity_add(disk); -} + return 0; -void device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) - -{ - __device_add_disk(parent, disk, groups, true); +out_unregister_bdi: + if (!(disk->flags & GENHD_FL_HIDDEN)) + bdi_unregister(disk->bdi); +out_unregister_queue: + blk_unregister_queue(disk); +out_put_slave_dir: + kobject_put(disk->slave_dir); +out_put_holder_dir: + kobject_put(disk->part0->bd_holder_dir); +out_del_integrity: + blk_integrity_del(disk); +out_del_block_link: + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(ddev)); +out_device_del: + device_del(ddev); +out_disk_release_events: + disk_release_events(disk); +out_free_ext_minor: + if (disk->major == BLOCK_EXT_MAJOR) + blk_free_ext_minor(disk->first_minor); + return WARN_ON_ONCE(ret); /* keep until all callers handle errors */ } EXPORT_SYMBOL(device_add_disk); -void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) -{ - __device_add_disk(parent, disk, NULL, false); -} -EXPORT_SYMBOL(device_add_disk_no_queue_reg); - -static void invalidate_partition(struct gendisk *disk, int partno) -{ - struct block_device *bdev; - - bdev = bdget_disk(disk, partno); - if (!bdev) - return; - - fsync_bdev(bdev); - __invalidate_device(bdev, true); - - /* - * Unhash the bdev inode for this device so that it gets evicted as soon - * as last inode reference is dropped. - */ - remove_inode_hash(bdev->bd_inode); - bdput(bdev); -} - /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove @@ -891,62 +560,62 @@ static void invalidate_partition(struct gendisk *disk, int partno) */ void del_gendisk(struct gendisk *disk) { - struct disk_part_iter piter; - struct hd_struct *part; + struct request_queue *q = disk->queue; might_sleep(); + if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) + return; + blk_integrity_del(disk); disk_del_events(disk); + mutex_lock(&disk->open_mutex); + remove_inode_hash(disk->part0->bd_inode); + blk_drop_partitions(disk); + mutex_unlock(&disk->open_mutex); + + fsync_bdev(disk->part0); + __invalidate_device(disk->part0, true); + /* - * Block lookups of the disk until all bdevs are unhashed and the - * disk is marked as dead (GENHD_FL_UP cleared). + * Fail any new I/O. */ - down_write(&disk->lookup_sem); - /* invalidate stuff */ - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); - while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->partno); - delete_partition(part); - } - disk_part_iter_exit(&piter); - - invalidate_partition(disk, 0); + set_bit(GD_DEAD, &disk->state); set_capacity(disk, 0); - disk->flags &= ~GENHD_FL_UP; - up_write(&disk->lookup_sem); - if (!(disk->flags & GENHD_FL_HIDDEN)) + /* + * Prevent new I/O from crossing bio_queue_enter(). + */ + blk_queue_start_drain(q); + blk_mq_freeze_queue_wait(q); + + rq_qos_exit(q); + blk_sync_queue(q); + blk_flush_integrity(); + /* + * Allow using passthrough request again after the queue is torn down. + */ + blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); + __blk_mq_unfreeze_queue(q, true); + + if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); - if (disk->queue) { + /* * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). 
*/ - if (!(disk->flags & GENHD_FL_HIDDEN)) - bdi_unregister(disk->queue->backing_dev_info); - blk_unregister_queue(disk); - } else { - WARN_ON(1); + bdi_unregister(disk->bdi); } - if (!(disk->flags & GENHD_FL_HIDDEN)) - blk_unregister_region(disk_devt(disk), disk->minors); - /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. - */ - blk_invalidate_devt(disk_devt(disk)); + blk_unregister_queue(disk); - kobject_put(disk->part0.holder_dir); + kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); - part_stat_set_all(&disk->part0, 0); - disk->part0.stamp = 0; + part_stat_set_all(disk->part0, 0); + disk->part0->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -979,86 +648,26 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } -/** - * get_gendisk - get partitioning information for a given device - * @devt: device to get partitioning information for - * @partno: returned partition index - * - * This function gets the structure containing partitioning - * information for the given device @devt. - * - * Context: can sleep - */ -struct gendisk *get_gendisk(dev_t devt, int *partno) +void blk_request_module(dev_t devt) { - struct gendisk *disk = NULL; + unsigned int major = MAJOR(devt); + struct blk_major_name **n; - might_sleep(); - - if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - struct kobject *kobj; - - kobj = kobj_lookup(bdev_map, devt, partno); - if (kobj) - disk = dev_to_disk(kobj_to_dev(kobj)); - } else { - struct hd_struct *part; - - spin_lock_bh(&ext_devt_lock); - part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - if (part && get_disk_and_module(part_to_disk(part))) { - *partno = part->partno; - disk = part_to_disk(part); + mutex_lock(&major_names_lock); + for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) { + if ((*n)->major == major && (*n)->probe) { + (*n)->probe(devt); + mutex_unlock(&major_names_lock); + return; } - spin_unlock_bh(&ext_devt_lock); } + mutex_unlock(&major_names_lock); - if (!disk) - return NULL; - - /* - * Synchronize with del_gendisk() to not return disk that is being - * destroyed. - */ - down_read(&disk->lookup_sem); - if (unlikely((disk->flags & GENHD_FL_HIDDEN) || - !(disk->flags & GENHD_FL_UP))) { - up_read(&disk->lookup_sem); - put_disk_and_module(disk); - disk = NULL; - } else { - up_read(&disk->lookup_sem); - } - return disk; + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) + /* Make old-style 2.4 aliases work */ + request_module("block-major-%d", MAJOR(devt)); } -/** - * bdget_disk - do bdget() by gendisk and partition number - * @disk: gendisk of interest - * @partno: partition number - * - * Find partition @partno from @disk, do bdget() on it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Resulting block_device on success, NULL on failure. 
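blk_request_module() above is the consumer of the probe callback that __register_blkdev() now records: on first access to an unclaimed dev_t it invokes the registered probe instead of the removed kobj_map lookup. A hedged, kernel-side sketch of a driver using this hook; the module and device names are hypothetical:

#include <linux/genhd.h>
#include <linux/module.h>

static int sketch_major;

/* invoked via blk_request_module() on first access to this major */
static void sketch_probe(dev_t devt)
{
	pr_info("sketch: probe for %u:%u\n", MAJOR(devt), MINOR(devt));
	/* a real driver would allocate and add the gendisk for devt here */
}

static int __init sketch_init(void)
{
	/* major 0 asks for any unused major number */
	sketch_major = __register_blkdev(0, "sketch", sketch_probe);
	return sketch_major < 0 ? sketch_major : 0;
}

static void __exit sketch_exit(void)
{
	unregister_blkdev(sketch_major, "sketch");
}

module_init(sketch_init);
module_exit(sketch_exit);
MODULE_LICENSE("GPL");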
- */ -struct block_device *bdget_disk(struct gendisk *disk, int partno) -{ - struct hd_struct *part; - struct block_device *bdev = NULL; - - part = disk_get_part(disk, partno); - if (part) - bdev = bdget_part(part); - disk_put_part(part); - - return bdev; -} -EXPORT_SYMBOL(bdget_disk); - /* * print a full list of all partitions - intended for places where the root * filesystem can't be mounted and thus to give the victim some idea of what @@ -1072,10 +681,9 @@ void __init printk_all_partitions(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - struct disk_part_iter piter; - struct hd_struct *part; - char name_buf[BDEVNAME_SIZE]; + struct block_device *part; char devt_buf[BDEVT_SIZE]; + unsigned long idx; /* * Don't show empty devices or things that have been @@ -1086,29 +694,28 @@ void __init printk_all_partitions(void) continue; /* - * Note, unlike /proc/partitions, I am showing the - * numbers in hex - the same format as the root= - * option takes. + * Note, unlike /proc/partitions, I am showing the numbers in + * hex - the same format as the root= option takes. */ - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == &disk->part0; - - printk("%s%s %10llu %s %s", is_part0 ? "" : " ", - bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part_nr_sects_read(part) >> 1 - , disk_name(disk, part->partno, name_buf), - part->info ? part->info->uuid : ""); - if (is_part0) { - if (dev->parent && dev->parent->driver) - printk(" driver: %s\n", - dev->parent->driver->name); - else - printk(" (driver?)\n"); - } else + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (!bdev_nr_sectors(part)) + continue; + printk("%s%s %10llu %pg %s", + bdev_is_partition(part) ? " " : "", + bdevt_str(part->bd_dev, devt_buf), + bdev_nr_sectors(part) >> 1, part, + part->bd_meta_info ? 
+ part->bd_meta_info->uuid : ""); + if (bdev_is_partition(part)) printk("\n"); + else if (dev->parent && dev->parent->driver) + printk(" driver: %s\n", + dev->parent->driver->name); + else + printk(" (driver?)\n"); } - disk_part_iter_exit(&piter); + rcu_read_unlock(); } class_dev_iter_exit(&iter); } @@ -1173,9 +780,8 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; - struct disk_part_iter piter; - struct hd_struct *part; - char buf[BDEVNAME_SIZE]; + struct block_device *part; + unsigned long idx; /* Don't show non-partitionable removeable devices or empty devices */ if (!get_capacity(sgp) || (!disk_max_parts(sgp) && @@ -1184,15 +790,15 @@ static int show_partition(struct seq_file *seqf, void *v) if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) return 0; - /* show the full disk and all non-0 size partitions of it */ - disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) - seq_printf(seqf, "%4d %7d %10llu %s\n", - MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part_nr_sects_read(part) >> 1, - disk_name(sgp, part->partno, buf)); - disk_part_iter_exit(&piter); - + rcu_read_lock(); + xa_for_each(&sgp->part_tbl, idx, part) { + if (!bdev_nr_sectors(part)) + continue; + seq_printf(seqf, "%4d %7d %10llu %pg\n", + MAJOR(part->bd_dev), MINOR(part->bd_dev), + bdev_nr_sectors(part) >> 1, part); + } + rcu_read_unlock(); return 0; } @@ -1204,15 +810,6 @@ static const struct seq_operations partitions_op = { }; #endif - -static struct kobject *base_probe(dev_t devt, int *partno, void *data) -{ - if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) - /* Make old-style 2.4 aliases work */ - request_module("block-major-%d", MAJOR(devt)); - return NULL; -} - static int __init genhd_device_init(void) { int error; @@ -1221,7 +818,6 @@ static int __init genhd_device_init(void) error = class_register(&block_class); if (unlikely(error)) return error; - bdev_map = kobj_map_init(base_probe, &block_class_lock); blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); @@ -1279,25 +875,22 @@ static ssize_t disk_ro_show(struct device *dev, ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", - (unsigned long long)part_nr_sects_read(p)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; struct disk_stats stat; unsigned int inflight; - part_stat_read_all(p, &stat); + part_stat_read_all(bdev, &stat); if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p); + inflight = blk_mq_in_flight(q, bdev); else - inflight = part_in_flight(p); + inflight = part_in_flight(bdev); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1332,14 +925,14 @@ ssize_t part_stat_show(struct device *dev, ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; unsigned int inflight[2]; if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, 
p, inflight); + blk_mq_in_flight_rw(q, bdev, inflight); else - part_in_flight_rw(p, inflight); + part_in_flight_rw(bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1370,6 +963,14 @@ static ssize_t disk_discard_alignment_show(struct device *dev, return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } +static ssize_t diskseq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%llu\n", disk->diskseq); +} + static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); @@ -1382,25 +983,23 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); +static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->make_it_fail); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct hd_struct *p = dev_to_part(dev); int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = (i == 0) ? 0 : 1; + dev_to_bdev(dev)->bd_make_it_fail = i; return count; } @@ -1427,6 +1026,10 @@ static struct attribute *disk_attrs[] = { &dev_attr_stat.attr, &dev_attr_inflight.attr, &dev_attr_badblocks.attr, + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, + &dev_attr_diskseq.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1456,94 +1059,12 @@ static const struct attribute_group *disk_attr_groups[] = { NULL }; -/** - * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way - * @disk: disk to replace part_tbl for - * @new_ptbl: new part_tbl to install - * - * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The - * original ptbl is freed using RCU callback. - * - * LOCKING: - * Matching bd_mutex locked or the caller is the only user of @disk. - */ -static void disk_replace_part_tbl(struct gendisk *disk, - struct disk_part_tbl *new_ptbl) -{ - struct disk_part_tbl *old_ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - rcu_assign_pointer(disk->part_tbl, new_ptbl); - - if (old_ptbl) { - rcu_assign_pointer(old_ptbl->last_lookup, NULL); - kfree_rcu(old_ptbl, rcu_head); - } -} - -/** - * disk_expand_part_tbl - expand disk->part_tbl - * @disk: disk to expand part_tbl for - * @partno: expand such that this partno can fit in - * - * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl - * uses RCU to allow unlocked dereferencing for stats and other stuff. - * - * LOCKING: - * Matching bd_mutex locked or the caller is the only user of @disk. - * Might sleep. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int disk_expand_part_tbl(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *old_ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - struct disk_part_tbl *new_ptbl; - int len = old_ptbl ? old_ptbl->len : 0; - int i, target; - - /* - * check for int overflow, since we can get here from blkpg_ioctl() - * with a user passed 'partno'. 
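The attributes wired up here (part_stat_show(), part_inflight_show(), diskseq_show()) all surface under /sys/block/<disk>/. A small userspace sketch that dumps them, assuming a disk named sda; dump_attr() is a hypothetical helper:

#include <stdio.h>

static void dump_attr(const char *attr)
{
	char path[64], line[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/sda/%s", attr);
	f = fopen(path, "r");
	if (!f)
		return;
	if (fgets(line, sizeof(line), f))
		printf("%-10s %s", attr, line);
	fclose(f);
}

int main(void)
{
	dump_attr("stat");	/* part_stat_show() */
	dump_attr("inflight");	/* part_inflight_show() */
	dump_attr("diskseq");	/* diskseq_show() */
	return 0;
}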
- */ - target = partno + 1; - if (target < 0) - return -EINVAL; - - /* disk_max_parts() is zero during initialization, ignore if so */ - if (disk_max_parts(disk) && target > disk_max_parts(disk)) - return -EINVAL; - - if (target <= len) - return 0; - - new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL, - disk->node_id); - if (!new_ptbl) - return -ENOMEM; - - new_ptbl->len = target; - - for (i = 0; i < len; i++) - rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); - - disk_replace_part_tbl(disk, new_ptbl); - return 0; -} - /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk * * This function releases all allocated resources of the gendisk. * - * The struct gendisk refcount is incremented with get_gendisk() or - * get_disk_and_module(), and its refcount is decremented with - * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this - * function is called. - * * Drivers which used __device_add_disk() have a gendisk with a request_queue * assigned. Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the @@ -1557,18 +1078,26 @@ static void disk_release(struct device *dev) struct gendisk *disk = dev_to_disk(dev); might_sleep(); + WARN_ON_ONCE(disk_live(disk)); - blk_free_devt(dev->devt); disk_release_events(disk); kfree(disk->random); - disk_replace_part_tbl(disk, NULL); - hd_free_part(&disk->part0); - if (disk->queue) - blk_put_queue(disk->queue); - kfree(disk); + xa_destroy(&disk->part_tbl); + disk->queue->disk = NULL; + blk_put_queue(disk->queue); + iput(disk->part0->bd_inode); /* frees the disk */ } + +static int block_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct gendisk *disk = dev_to_disk(dev); + + return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); +} + struct class block_class = { .name = "block", + .dev_uevent = block_uevent, }; static char *block_devnode(struct device *dev, umode_t *mode, @@ -1599,11 +1128,10 @@ const struct device_type disk_type = { static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; - struct disk_part_iter piter; - struct hd_struct *hd; - char buf[BDEVNAME_SIZE]; + struct block_device *hd; unsigned int inflight; struct disk_stats stat; + unsigned long idx; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1613,23 +1141,24 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n\n"); */ - disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); - while ((hd = disk_part_iter_next(&piter))) { + rcu_read_lock(); + xa_for_each(&gp->part_tbl, idx, hd) { + if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) + continue; part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else inflight = part_in_flight(hd); - seq_printf(seqf, "%4d %7d %s " + seq_printf(seqf, "%4d %7d %pg " "%lu %lu %lu %u " "%lu %lu %lu %u " "%u %u %u " "%lu %lu %lu %u " "%lu %u" "\n", - MAJOR(part_devt(hd)), MINOR(part_devt(hd)), - disk_name(gp, hd->partno, buf), + MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd, stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], @@ -1657,7 +1186,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) NSEC_PER_MSEC) ); } - disk_part_iter_exit(&piter); + rcu_read_unlock(); return 0; } @@ -1678,6 +1207,20 @@ static int __init proc_genhd_init(void) module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ +dev_t part_devt(struct gendisk *disk, u8 
partno) +{ + struct block_device *part; + dev_t devt = 0; + + rcu_read_lock(); + part = xa_load(&disk->part_tbl, partno); + if (part) + devt = part->bd_dev; + rcu_read_unlock(); + + return devt; +} + dev_t blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); @@ -1687,7 +1230,6 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part; if (strcmp(dev_name(dev), name)) continue; @@ -1698,106 +1240,86 @@ dev_t blk_lookup_devt(const char *name, int partno) */ devt = MKDEV(MAJOR(dev->devt), MINOR(dev->devt) + partno); - break; + } else { + devt = part_devt(disk, partno); + if (devt) + break; } - part = disk_get_part(disk, partno); - if (part) { - devt = part_devt(part); - disk_put_part(part); - break; - } - disk_put_part(part); } class_dev_iter_exit(&iter); return devt; } -struct gendisk *__alloc_disk_node(int minors, int node_id) +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass) { struct gendisk *disk; - struct disk_part_tbl *ptbl; - if (minors > DISK_MAX_PARTS) { - printk(KERN_ERR - "block: can't allocate more than %d partitions\n", - DISK_MAX_PARTS); - minors = DISK_MAX_PARTS; - } + if (!blk_get_queue(q)) + return NULL; disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) - return NULL; + goto out_put_queue; - disk->part0.dkstats = alloc_percpu(struct disk_stats); - if (!disk->part0.dkstats) + disk->bdi = bdi_alloc(node_id); + if (!disk->bdi) goto out_free_disk; - init_rwsem(&disk->lookup_sem); + disk->part0 = bdev_alloc(disk, 0); + if (!disk->part0) + goto out_free_bdi; + disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) { - free_percpu(disk->part0.dkstats); - goto out_free_disk; - } + mutex_init(&disk->open_mutex); + xa_init(&disk->part_tbl); + if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) + goto out_destroy_part_tbl; - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], &disk->part0); - - /* - * set_capacity() and get_capacity() currently don't use - * seqcounter to read/update the part0->nr_sects. Still init - * the counter as we can read the sectors in IO submission - * patch using seqence counters. - * - * TODO: Ideally set_capacity() and get_capacity() should be - * converted to make use of bd_mutex and sequence counters. 
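__alloc_disk_node() here now takes the request_queue and wires part0 into the xarray; together with __blk_alloc_disk() and blk_cleanup_disk() just below, it defines the new allocate/teardown pairing for drivers. A hedged kernel-side sketch of that lifecycle; the names are hypothetical, and a real bio-based driver would also supply a .submit_bio handler:

#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/module.h>

static const struct block_device_operations sketch_fops = {
	.owner = THIS_MODULE,
	/* a real bio-based driver also provides .submit_bio */
};

static struct gendisk *sketch_disk;

static int sketch_create(void)
{
	int err;

	sketch_disk = blk_alloc_disk(NUMA_NO_NODE); /* wraps __blk_alloc_disk() */
	if (!sketch_disk)
		return -ENOMEM;

	sketch_disk->fops = &sketch_fops;
	snprintf(sketch_disk->disk_name, DISK_NAME_LEN, "sketch0");
	set_capacity(sketch_disk, 2048);	/* 1 MiB in 512-byte sectors */

	err = add_disk(sketch_disk);		/* -> device_add_disk(NULL, ...) */
	if (err)
		blk_cleanup_disk(sketch_disk);
	return err;
}

static void sketch_destroy(void)
{
	del_gendisk(sketch_disk);
	blk_cleanup_disk(sketch_disk);	/* tears down the queue, puts the disk */
}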
-	 */
-	hd_sects_seq_init(&disk->part0);
-	if (hd_ref_init(&disk->part0))
-		goto out_free_part0;
-
-	disk->minors = minors;
 	rand_initialize_disk(disk);
 	disk_to_dev(disk)->class = &block_class;
 	disk_to_dev(disk)->type = &disk_type;
 	device_initialize(disk_to_dev(disk));
+	inc_diskseq(disk);
+	disk->queue = q;
+	q->disk = disk;
+	lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
+#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
+	INIT_LIST_HEAD(&disk->slave_bdevs);
+#endif
 	return disk;
-out_free_part0:
-	hd_free_part(&disk->part0);
+out_destroy_part_tbl:
+	xa_destroy(&disk->part_tbl);
+	disk->part0->bd_disk = NULL;
+	iput(disk->part0->bd_inode);
+out_free_bdi:
+	bdi_put(disk->bdi);
 out_free_disk:
 	kfree(disk);
+out_put_queue:
+	blk_put_queue(q);
 	return NULL;
 }
 EXPORT_SYMBOL(__alloc_disk_node);
 
-/**
- * get_disk_and_module - increments the gendisk and gendisk fops module refcount
- * @disk: the struct gendisk to increment the refcount for
- *
- * This increments the refcount for the struct gendisk, and the gendisk's
- * fops module owner.
- *
- * Context: Any context.
- */
-struct kobject *get_disk_and_module(struct gendisk *disk)
+struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
 {
-	struct module *owner;
-	struct kobject *kobj;
+	struct request_queue *q;
+	struct gendisk *disk;
 
-	if (!disk->fops)
+	q = blk_alloc_queue(node);
+	if (!q)
 		return NULL;
-	owner = disk->fops->owner;
-	if (owner && !try_module_get(owner))
-		return NULL;
-	kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj);
-	if (kobj == NULL) {
-		module_put(owner);
+
+	disk = __alloc_disk_node(q, node, lkclass);
+	if (!disk) {
+		blk_cleanup_queue(q);
 		return NULL;
 	}
-	return kobj;
-
+	return disk;
 }
-EXPORT_SYMBOL(get_disk_and_module);
+EXPORT_SYMBOL(__blk_alloc_disk);
 
 /**
  * put_disk - decrements the gendisk refcount
@@ -1812,30 +1334,25 @@ EXPORT_SYMBOL(get_disk_and_module);
 void put_disk(struct gendisk *disk)
 {
 	if (disk)
-		kobject_put(&disk_to_dev(disk)->kobj);
+		put_device(disk_to_dev(disk));
 }
 EXPORT_SYMBOL(put_disk);
 
 /**
- * put_disk_and_module - decrements the module and gendisk refcount
- * @disk: the struct gendisk to decrement the refcount for
+ * blk_cleanup_disk - shut down a gendisk allocated by blk_alloc_disk
+ * @disk: gendisk to shut down
  *
- * This is a counterpart of get_disk_and_module() and thus also of
- * get_gendisk().
+ * Mark the queue hanging off @disk DYING, drain all pending requests, then mark
+ * the queue DEAD, destroy and put it and the gendisk structure.
  *
- * Context: Any context, but the last reference must not be dropped from
- * atomic context.
+ * Context: can sleep
  */
-void put_disk_and_module(struct gendisk *disk)
+void blk_cleanup_disk(struct gendisk *disk)
 {
-	if (disk) {
-		struct module *owner = disk->fops->owner;
-
-		put_disk(disk);
-		module_put(owner);
-	}
+	blk_cleanup_queue(disk->queue);
+	put_disk(disk);
 }
-EXPORT_SYMBOL(put_disk_and_module);
+EXPORT_SYMBOL(blk_cleanup_disk);
 
 static void set_disk_ro_uevent(struct gendisk *gd, int ro)
 {
@@ -1847,521 +1364,35 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro)
 	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
 }
 
-void set_device_ro(struct block_device *bdev, int flag)
+/**
+ * set_disk_ro - set a gendisk read-only
+ * @disk: gendisk to operate on
+ * @read_only: %true to set the disk read-only, %false to set the disk read/write
+ *
+ * This function is used to indicate whether a given disk device should have its
+ * read-only flag set.
set_disk_ro() is typically used by device drivers to + * indicate whether the underlying physical device is write-protected. + */ +void set_disk_ro(struct gendisk *disk, bool read_only) { - bdev->bd_part->policy = flag; -} - -EXPORT_SYMBOL(set_device_ro); - -void set_disk_ro(struct gendisk *disk, int flag) -{ - struct disk_part_iter piter; - struct hd_struct *part; - - if (disk->part0.policy != flag) { - set_disk_ro_uevent(disk, flag); - disk->part0.policy = flag; + if (read_only) { + if (test_and_set_bit(GD_READ_ONLY, &disk->state)) + return; + } else { + if (!test_and_clear_bit(GD_READ_ONLY, &disk->state)) + return; } - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) - part->policy = flag; - disk_part_iter_exit(&piter); + set_disk_ro_uevent(disk, read_only); } - EXPORT_SYMBOL(set_disk_ro); int bdev_read_only(struct block_device *bdev) { - if (!bdev) - return 0; - return bdev->bd_part->policy; + return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); } - EXPORT_SYMBOL(bdev_read_only); -/* - * Disk events - monitor disk events like media change and eject request. - */ -struct disk_events { - struct list_head node; /* all disk_event's */ - struct gendisk *disk; /* the associated disk */ - spinlock_t lock; - - struct mutex block_mutex; /* protects blocking */ - int block; /* event blocking depth */ - unsigned int pending; /* events already sent out */ - unsigned int clearing; /* events being cleared */ - - long poll_msecs; /* interval, -1 for default */ - struct delayed_work dwork; -}; - -static const char *disk_events_strs[] = { - [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", - [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", -}; - -static char *disk_uevents[] = { - [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", - [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", -}; - -/* list of all disk_events */ -static DEFINE_MUTEX(disk_events_mutex); -static LIST_HEAD(disk_events); - -/* disable in-kernel polling by default */ -static unsigned long disk_events_dfl_poll_msecs; - -static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +void inc_diskseq(struct gendisk *disk) { - struct disk_events *ev = disk->ev; - long intv_msecs = 0; - - /* - * If device-specific poll interval is set, always use it. If - * the default is being used, poll if the POLL flag is set. - */ - if (ev->poll_msecs >= 0) - intv_msecs = ev->poll_msecs; - else if (disk->event_flags & DISK_EVENT_FLAG_POLL) - intv_msecs = disk_events_dfl_poll_msecs; - - return msecs_to_jiffies(intv_msecs); -} - -/** - * disk_block_events - block and flush disk event checking - * @disk: disk to block events for - * - * On return from this function, it is guaranteed that event checking - * isn't in progress and won't happen until unblocked by - * disk_unblock_events(). Events blocking is counted and the actual - * unblocking happens after the matching number of unblocks are done. - * - * Note that this intentionally does not block event checking from - * disk_clear_events(). - * - * CONTEXT: - * Might sleep. - */ -void disk_block_events(struct gendisk *disk) -{ - struct disk_events *ev = disk->ev; - unsigned long flags; - bool cancel; - - if (!ev) - return; - - /* - * Outer mutex ensures that the first blocker completes canceling - * the event work before further blockers are allowed to finish. 
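bdev_read_only() above ORs the per-bdev policy bit (bd_read_only) with the disk-wide GD_READ_ONLY state, so either the driver or the administrator can make a device read-only. The classic administrative knob is the BLKROSET ioctl; a sketch, with the device path an assumption:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int ro = 1, fd = open("/dev/loop0", O_RDONLY); /* device is an assumption */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKROSET, &ro))	/* sets bdev->bd_read_only */
		perror("BLKROSET");
	if (!ioctl(fd, BLKROGET, &ro))
		printf("read-only: %d\n", ro);
	close(fd);
	return 0;
}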
- */ - mutex_lock(&ev->block_mutex); - - spin_lock_irqsave(&ev->lock, flags); - cancel = !ev->block++; - spin_unlock_irqrestore(&ev->lock, flags); - - if (cancel) - cancel_delayed_work_sync(&disk->ev->dwork); - - mutex_unlock(&ev->block_mutex); -} - -static void __disk_unblock_events(struct gendisk *disk, bool check_now) -{ - struct disk_events *ev = disk->ev; - unsigned long intv; - unsigned long flags; - - spin_lock_irqsave(&ev->lock, flags); - - if (WARN_ON_ONCE(ev->block <= 0)) - goto out_unlock; - - if (--ev->block) - goto out_unlock; - - intv = disk_events_poll_jiffies(disk); - if (check_now) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, 0); - else if (intv) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, intv); -out_unlock: - spin_unlock_irqrestore(&ev->lock, flags); -} - -/** - * disk_unblock_events - unblock disk event checking - * @disk: disk to unblock events for - * - * Undo disk_block_events(). When the block count reaches zero, it - * starts events polling if configured. - * - * CONTEXT: - * Don't care. Safe to call from irq context. - */ -void disk_unblock_events(struct gendisk *disk) -{ - if (disk->ev) - __disk_unblock_events(disk, false); -} - -/** - * disk_flush_events - schedule immediate event checking and flushing - * @disk: disk to check and flush events for - * @mask: events to flush - * - * Schedule immediate event checking on @disk if not blocked. Events in - * @mask are scheduled to be cleared from the driver. Note that this - * doesn't clear the events from @disk->ev. - * - * CONTEXT: - * If @mask is non-zero must be called with bdev->bd_mutex held. - */ -void disk_flush_events(struct gendisk *disk, unsigned int mask) -{ - struct disk_events *ev = disk->ev; - - if (!ev) - return; - - spin_lock_irq(&ev->lock); - ev->clearing |= mask; - if (!ev->block) - mod_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, 0); - spin_unlock_irq(&ev->lock); -} - -/** - * disk_clear_events - synchronously check, clear and return pending events - * @disk: disk to fetch and clear events from - * @mask: mask of events to be fetched and cleared - * - * Disk events are synchronously checked and pending events in @mask - * are cleared and returned. This ignores the block count. - * - * CONTEXT: - * Might sleep. - */ -static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) -{ - struct disk_events *ev = disk->ev; - unsigned int pending; - unsigned int clearing = mask; - - if (!ev) - return 0; - - disk_block_events(disk); - - /* - * store the union of mask and ev->clearing on the stack so that the - * race with disk_flush_events does not cause ambiguity (ev->clearing - * can still be modified even if events are blocked). - */ - spin_lock_irq(&ev->lock); - clearing |= ev->clearing; - ev->clearing = 0; - spin_unlock_irq(&ev->lock); - - disk_check_events(ev, &clearing); - /* - * if ev->clearing is not 0, the disk_flush_events got called in the - * middle of this function, so we want to run the workfn without delay. - */ - __disk_unblock_events(disk, ev->clearing ? 
true : false); - - /* then, fetch and clear pending events */ - spin_lock_irq(&ev->lock); - pending = ev->pending & mask; - ev->pending &= ~mask; - spin_unlock_irq(&ev->lock); - WARN_ON_ONCE(clearing & mask); - - return pending; -} - -/** - * bdev_check_media_change - check if a removable media has been changed - * @bdev: block device to check - * - * Check whether a removable media has been changed, and attempt to free all - * dentries and inodes and invalidates all block device page cache entries in - * that case. - * - * Returns %true if the block device changed, or %false if not. - */ -bool bdev_check_media_change(struct block_device *bdev) -{ - unsigned int events; - - events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE | - DISK_EVENT_EJECT_REQUEST); - if (!(events & DISK_EVENT_MEDIA_CHANGE)) - return false; - - if (__invalidate_device(bdev, true)) - pr_warn("VFS: busy inodes on changed media %s\n", - bdev->bd_disk->disk_name); - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); - return true; -} -EXPORT_SYMBOL(bdev_check_media_change); - -/* - * Separate this part out so that a different pointer for clearing_ptr can be - * passed in for disk_clear_events. - */ -static void disk_events_workfn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct disk_events *ev = container_of(dwork, struct disk_events, dwork); - - disk_check_events(ev, &ev->clearing); -} - -static void disk_check_events(struct disk_events *ev, - unsigned int *clearing_ptr) -{ - struct gendisk *disk = ev->disk; - char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; - unsigned int clearing = *clearing_ptr; - unsigned int events; - unsigned long intv; - int nr_events = 0, i; - - /* check events */ - events = disk->fops->check_events(disk, clearing); - - /* accumulate pending events and schedule next poll if necessary */ - spin_lock_irq(&ev->lock); - - events &= ~ev->pending; - ev->pending |= events; - *clearing_ptr &= ~clearing; - - intv = disk_events_poll_jiffies(disk); - if (!ev->block && intv) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, intv); - - spin_unlock_irq(&ev->lock); - - /* - * Tell userland about new events. Only the events listed in - * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT - * is set. Otherwise, events are processed internally but never - * get reported to userland. - */ - for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if ((events & disk->events & (1 << i)) && - (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - envp[nr_events++] = disk_uevents[i]; - - if (nr_events) - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); -} - -/* - * A disk events enabled device has the following sysfs nodes under - * its /sys/block/X/ directory. 
- * - * events : list of all supported events - * events_async : list of events which can be detected w/o polling - * (always empty, only for backwards compatibility) - * events_poll_msecs : polling interval, 0: disable, -1: system default - */ -static ssize_t __disk_events_show(unsigned int events, char *buf) -{ - const char *delim = ""; - ssize_t pos = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) - if (events & (1 << i)) { - pos += sprintf(buf + pos, "%s%s", - delim, disk_events_strs[i]); - delim = " "; - } - if (pos) - pos += sprintf(buf + pos, "\n"); - return pos; -} - -static ssize_t disk_events_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - return 0; - - return __disk_events_show(disk->events, buf); -} - -static ssize_t disk_events_async_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return 0; -} - -static ssize_t disk_events_poll_msecs_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (!disk->ev) - return sprintf(buf, "-1\n"); - - return sprintf(buf, "%ld\n", disk->ev->poll_msecs); -} - -static ssize_t disk_events_poll_msecs_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct gendisk *disk = dev_to_disk(dev); - long intv; - - if (!count || !sscanf(buf, "%ld", &intv)) - return -EINVAL; - - if (intv < 0 && intv != -1) - return -EINVAL; - - if (!disk->ev) - return -ENODEV; - - disk_block_events(disk); - disk->ev->poll_msecs = intv; - __disk_unblock_events(disk, true); - - return count; -} - -static const DEVICE_ATTR(events, 0444, disk_events_show, NULL); -static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); -static const DEVICE_ATTR(events_poll_msecs, 0644, - disk_events_poll_msecs_show, - disk_events_poll_msecs_store); - -static const struct attribute *disk_events_attrs[] = { - &dev_attr_events.attr, - &dev_attr_events_async.attr, - &dev_attr_events_poll_msecs.attr, - NULL, -}; - -/* - * The default polling interval can be specified by the kernel - * parameter block.events_dfl_poll_msecs which defaults to 0 - * (disable). This can also be modified runtime by writing to - * /sys/module/block/parameters/events_dfl_poll_msecs. - */ -static int disk_events_set_dfl_poll_msecs(const char *val, - const struct kernel_param *kp) -{ - struct disk_events *ev; - int ret; - - ret = param_set_ulong(val, kp); - if (ret < 0) - return ret; - - mutex_lock(&disk_events_mutex); - - list_for_each_entry(ev, &disk_events, node) - disk_flush_events(ev->disk, 0); - - mutex_unlock(&disk_events_mutex); - - return 0; -} - -static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { - .set = disk_events_set_dfl_poll_msecs, - .get = param_get_ulong, -}; - -#undef MODULE_PARAM_PREFIX -#define MODULE_PARAM_PREFIX "block." - -module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, - &disk_events_dfl_poll_msecs, 0644); - -/* - * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. 
- */
-static void disk_alloc_events(struct gendisk *disk)
-{
-	struct disk_events *ev;
-
-	if (!disk->fops->check_events || !disk->events)
-		return;
-
-	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
-	if (!ev) {
-		pr_warn("%s: failed to initialize events\n", disk->disk_name);
-		return;
-	}
-
-	INIT_LIST_HEAD(&ev->node);
-	ev->disk = disk;
-	spin_lock_init(&ev->lock);
-	mutex_init(&ev->block_mutex);
-	ev->block = 1;
-	ev->poll_msecs = -1;
-	INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
-
-	disk->ev = ev;
-}
-
-static void disk_add_events(struct gendisk *disk)
-{
-	/* FIXME: error handling */
-	if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
-		pr_warn("%s: failed to create sysfs files for events\n",
-			disk->disk_name);
-
-	if (!disk->ev)
-		return;
-
-	mutex_lock(&disk_events_mutex);
-	list_add_tail(&disk->ev->node, &disk_events);
-	mutex_unlock(&disk_events_mutex);
-
-	/*
-	 * Block count is initialized to 1 and the following initial
-	 * unblock kicks it into action.
-	 */
-	__disk_unblock_events(disk, true);
-}
-
-static void disk_del_events(struct gendisk *disk)
-{
-	if (disk->ev) {
-		disk_block_events(disk);
-
-		mutex_lock(&disk_events_mutex);
-		list_del_init(&disk->ev->node);
-		mutex_unlock(&disk_events_mutex);
-	}
-
-	sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
-}
-
-static void disk_release_events(struct gendisk *disk)
-{
-	/* the block count should be 1 from disk_del_events() */
-	WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
-	kfree(disk->ev);
+	disk->diskseq = atomic64_inc_return(&diskseq);
 }
diff --git a/block/holder.c b/block/holder.c
new file mode 100644
index 0000000000..9dc0841823
--- /dev/null
+++ b/block/holder.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/genhd.h>
+
+struct bd_holder_disk {
+	struct list_head	list;
+	struct block_device	*bdev;
+	int			refcnt;
+};
+
+static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
+						  struct gendisk *disk)
+{
+	struct bd_holder_disk *holder;
+
+	list_for_each_entry(holder, &disk->slave_bdevs, list)
+		if (holder->bdev == bdev)
+			return holder;
+	return NULL;
+}
+
+static int add_symlink(struct kobject *from, struct kobject *to)
+{
+	return sysfs_create_link(from, to, kobject_name(to));
+}
+
+static void del_symlink(struct kobject *from, struct kobject *to)
+{
+	sysfs_remove_link(from, kobject_name(to));
+}
+
+static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+	int ret;
+
+	ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
+	if (ret)
+		return ret;
+	ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
+	if (ret)
+		del_symlink(disk->slave_dir, bdev_kobj(bdev));
+	return ret;
+}
+
+/**
+ * bd_link_disk_holder - create symlinks between holding disk and slave bdev
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * This function creates the following sysfs symlinks.
+ *
+ * - from "slaves" directory of the holder @disk to the claimed @bdev
+ * - from "holders" directory of the @bdev to the holder @disk
+ *
+ * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
+ * passed to bd_link_disk_holder(), then:
+ *
+ *   /sys/block/dm-0/slaves/sda --> /sys/block/sda
+ *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
+ *
+ * The caller must have claimed @bdev before calling this function and
+ * ensure that both @bdev and @disk are valid during the creation and
+ * lifetime of these symlinks.
+ *
+ * CONTEXT:
+ * Might sleep.
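+ *
+ * Illustrative sketch only (the variable names are hypothetical, not taken
+ * from an in-tree driver): a stacking driver pairs the two calls like
+ *
+ *	err = bd_link_disk_holder(slave_bdev, holder_disk);
+ *	if (err)
+ *		goto out;
+ *	...
+ *	bd_unlink_disk_holder(slave_bdev, holder_disk);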
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+	struct bd_holder_disk *holder;
+	int ret = 0;
+
+	mutex_lock(&disk->open_mutex);
+
+	WARN_ON_ONCE(!bdev->bd_holder);
+
+	/* FIXME: remove the following once add_disk() handles errors */
+	if (WARN_ON(!bdev->bd_holder_dir))
+		goto out_unlock;
+
+	holder = bd_find_holder_disk(bdev, disk);
+	if (holder) {
+		holder->refcnt++;
+		goto out_unlock;
+	}
+
+	holder = kzalloc(sizeof(*holder), GFP_KERNEL);
+	if (!holder) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	INIT_LIST_HEAD(&holder->list);
+	holder->bdev = bdev;
+	holder->refcnt = 1;
+	if (disk->slave_dir) {
+		ret = __link_disk_holder(bdev, disk);
+		if (ret) {
+			kfree(holder);
+			goto out_unlock;
+		}
+	}
+
+	list_add(&holder->list, &disk->slave_bdevs);
+	/*
+	 * del_gendisk drops the initial reference to bd_holder_dir, so we need
+	 * to keep our own here to allow for cleanup past that point.
+	 */
+	kobject_get(bdev->bd_holder_dir);
+
+out_unlock:
+	mutex_unlock(&disk->open_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bd_link_disk_holder);
+
+static void __unlink_disk_holder(struct block_device *bdev,
+				 struct gendisk *disk)
+{
+	del_symlink(disk->slave_dir, bdev_kobj(bdev));
+	del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
+}
+
+/**
+ * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+	struct bd_holder_disk *holder;
+
+	mutex_lock(&disk->open_mutex);
+	holder = bd_find_holder_disk(bdev, disk);
+	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
+		if (disk->slave_dir)
+			__unlink_disk_holder(bdev, disk);
+		kobject_put(bdev->bd_holder_dir);
+		list_del_init(&holder->list);
+		kfree(holder);
+	}
+	mutex_unlock(&disk->open_mutex);
+}
+EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
+
+int bd_register_pending_holders(struct gendisk *disk)
+{
+	struct bd_holder_disk *holder;
+	int ret;
+
+	mutex_lock(&disk->open_mutex);
+	list_for_each_entry(holder, &disk->slave_bdevs, list) {
+		ret = __link_disk_holder(holder->bdev, disk);
+		if (ret)
+			goto out_undo;
+	}
+	mutex_unlock(&disk->open_mutex);
+	return 0;
+
+out_undo:
+	list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list)
+		__unlink_disk_holder(holder->bdev, disk);
+	mutex_unlock(&disk->open_mutex);
+	return ret;
+}
diff --git a/block/ioctl.c b/block/ioctl.c
index ed240e170e..eb0491e90b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -16,6 +16,7 @@ static int blkpg_do_ioctl(struct block_device *bdev,
 			  struct blkpg_partition __user *upart, int op)
 {
+	struct gendisk *disk = bdev->bd_disk;
 	struct blkpg_partition p;
 	long long start, length;
 
@@ -30,28 +31,19 @@ static int blkpg_do_ioctl(struct block_device *bdev,
 		return -EINVAL;
 
 	if (op == BLKPG_DEL_PARTITION)
-		return bdev_del_partition(bdev, p.pno);
+		return bdev_del_partition(disk, p.pno);
 
 	start = p.start >> SECTOR_SHIFT;
 	length = p.length >> SECTOR_SHIFT;
 
-	/* check for fit in a hd_struct */
-	if (sizeof(sector_t) < sizeof(long long)) {
-		long pstart = start, plength = length;
-
-		if (pstart != start || plength != length || pstart < 0 ||
-		    plength < 0 || p.pno > 65535)
-			return -EINVAL;
-	}
-
 	switch (op) {
 	case BLKPG_ADD_PARTITION:
 		/* check if partition is aligned to blocksize */
 		if (p.start & (bdev_logical_block_size(bdev) - 1))
return -EINVAL; - return bdev_add_partition(bdev, p.pno, start, length); + return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: - return bdev_resize_partition(bdev, p.pno, start, length); + return bdev_resize_partition(disk, p.pno, start, length); default: return -EINVAL; } @@ -98,7 +90,7 @@ static int blkdev_reread_part(struct block_device *bdev, fmode_t mode) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (bdev->bd_part_count) + if (bdev->bd_disk->open_partitions) return -EBUSY; /* @@ -228,23 +220,6 @@ static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) } #endif -int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) -{ - struct gendisk *disk = bdev->bd_disk; - - if (disk->fops->ioctl) - return disk->fops->ioctl(bdev, mode, cmd, arg); - - return -ENOTTY; -} -/* - * For the record: _GPL here is only because somebody decided to slap it - * on the previous export. Sheer idiocy, since it wasn't copyrightable - * at all and could be open-coded without any exports by anybody who cares. - */ -EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); - #ifdef CONFIG_COMPAT /* * This is the equivalent of compat_ptr_ioctl(), to be used by block @@ -355,38 +330,11 @@ static int blkdev_pr_clear(struct block_device *bdev, return ops->pr_clear(bdev, c.key); } -/* - * Is it an unrecognized ioctl? The correct returns are either - * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a - * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl - * code before returning. - * - * Confused drivers sometimes return EINVAL, which is wrong. It - * means "I understood the ioctl command, but the parameters to - * it were wrong". - * - * We should aim to just fix the broken drivers, the EINVAL case - * should go away. 
- */ -static inline int is_unrecognized_ioctl(int ret) -{ - return ret == -EINVAL || - ret == -ENOTTY || - ret == -ENOIOCTLCMD; -} - static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { - int ret; - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; - fsync_bdev(bdev); invalidate_bdev(bdev); return 0; @@ -400,12 +348,14 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (!capable(CAP_SYS_ADMIN)) return -EACCES; - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; if (get_user(n, (int __user *)arg)) return -EFAULT; - set_device_ro(bdev, n); + if (bdev->bd_disk->fops->set_read_only) { + ret = bdev->bd_disk->fops->set_read_only(bdev, n); + if (ret) + return ret; + } + bdev->bd_read_only = n; return 0; } @@ -520,6 +470,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, BLKDEV_DISCARD_SECURE); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKGETDISKSEQ: + return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); case BLKRESETZONE: @@ -555,7 +507,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; - bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; + bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: return blkdev_reread_part(bdev, mode); @@ -605,7 +557,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKFRAGET: if (!argp) return -EINVAL; - return put_long(argp, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512); + return put_long(argp, + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) @@ -628,10 +581,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); - if (ret == -ENOIOCTLCMD) - return __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (ret != -ENOIOCTLCMD) + return ret; - return ret; + if (!bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */ @@ -648,8 +603,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { int ret; void __user *argp = compat_ptr(arg); - struct inode *inode = file->f_mapping->host; - struct block_device *bdev = inode->i_bdev; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; fmode_t mode = file->f_mode; loff_t size; @@ -676,7 +630,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!argp) return -EINVAL; return compat_put_long(argp, - (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) diff --git a/block/ioprio.c b/block/ioprio.c index 364d2294ba..0e4ff245f2 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -74,9 +74,8 @@ int ioprio_check_cap(int ioprio) fallthrough; /* rt has prio field too */ case IOPRIO_CLASS_BE: - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; - break; case IOPRIO_CLASS_IDLE: break; @@ -119,11 +118,17 @@ SYSCALL_DEFINE3(ioprio_set, int, which, 
int, who, int, ioprio) pgrp = task_pgrp(current); else pgrp = find_vpid(who); + + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); - if (ret) - break; + if (ret) { + read_unlock(&tasklist_lock); + goto out; + } } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); + break; case IOPRIO_WHO_USER: uid = make_kuid(current_user_ns(), who); @@ -153,6 +158,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) ret = -EINVAL; } +out: rcu_read_unlock(); return ret; } @@ -164,7 +170,7 @@ static int get_task_ioprio(struct task_struct *p) ret = security_task_getioprio(p); if (ret) goto out; - ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + ret = IOPRIO_DEFAULT; task_lock(p); if (p->io_context) ret = p->io_context->ioprio; @@ -176,9 +182,9 @@ static int get_task_ioprio(struct task_struct *p) int ioprio_best(unsigned short aprio, unsigned short bprio) { if (!ioprio_valid(aprio)) - aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + aprio = IOPRIO_DEFAULT; if (!ioprio_valid(bprio)) - bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + bprio = IOPRIO_DEFAULT; return min(aprio, bprio); } diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c index 86f8195d80..2c4a55bea6 100644 --- a/block/keyslot-manager.c +++ b/block/keyslot-manager.c @@ -29,6 +29,7 @@ #define pr_fmt(fmt) "blk-crypto: " fmt #include +#include #include #include #include @@ -62,6 +63,11 @@ static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm) pm_runtime_put_sync(ksm->dev); } +static inline bool blk_ksm_is_passthrough(struct blk_keyslot_manager *ksm) +{ + return ksm->num_slots == 0; +} + /** * blk_ksm_init() - Initialize a keyslot manager * @ksm: The keyslot_manager to initialize. @@ -127,6 +133,34 @@ int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots) } EXPORT_SYMBOL_GPL(blk_ksm_init); +static void blk_ksm_destroy_callback(void *ksm) +{ + blk_ksm_destroy(ksm); +} + +/** + * devm_blk_ksm_init() - Resource-managed blk_ksm_init() + * @dev: The device which owns the blk_keyslot_manager. + * @ksm: The blk_keyslot_manager to initialize. + * @num_slots: The number of key slots to manage. + * + * Like blk_ksm_init(), but causes blk_ksm_destroy() to be called automatically + * on driver detach. + * + * Return: 0 on success, or else a negative error code. 
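+ *
+ * A minimal usage sketch (the surrounding probe function and "my_host" are
+ * hypothetical):
+ *
+ *	err = devm_blk_ksm_init(&pdev->dev, &my_host->ksm, 32);
+ *	if (err)
+ *		return err;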
+ */ +int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm, + unsigned int num_slots) +{ + int err = blk_ksm_init(ksm, num_slots); + + if (err) + return err; + + return devm_add_action_or_reset(dev, blk_ksm_destroy_callback, ksm); +} +EXPORT_SYMBOL_GPL(devm_blk_ksm_init); + static inline struct hlist_head * blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm, const struct blk_crypto_key *key) @@ -205,6 +239,10 @@ blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, int err; *slot_ptr = NULL; + + if (blk_ksm_is_passthrough(ksm)) + return BLK_STS_OK; + down_read(&ksm->lock); slot = blk_ksm_find_and_grab_keyslot(ksm, key); up_read(&ksm->lock); @@ -325,6 +363,16 @@ int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, struct blk_ksm_keyslot *slot; int err = 0; + if (blk_ksm_is_passthrough(ksm)) { + if (ksm->ksm_ll_ops.keyslot_evict) { + blk_ksm_hw_enter(ksm); + err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, -1); + blk_ksm_hw_exit(ksm); + return err; + } + return 0; + } + blk_ksm_hw_enter(ksm); slot = blk_ksm_find_keyslot(ksm, key); if (!slot) @@ -360,6 +408,9 @@ void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm) { unsigned int slot; + if (blk_ksm_is_passthrough(ksm)) + return; + /* This is for device initialization, so don't resume the device */ down_write(&ksm->lock); for (slot = 0; slot < ksm->num_slots; slot++) { @@ -401,3 +452,127 @@ void blk_ksm_unregister(struct request_queue *q) { q->ksm = NULL; } + +/** + * blk_ksm_intersect_modes() - restrict supported modes by child device + * @parent: The keyslot manager for parent device + * @child: The keyslot manager for child device, or NULL + * + * Clear any crypto mode support bits in @parent that aren't set in @child. + * If @child is NULL, then all parent bits are cleared. + * + * Only use this when setting up the keyslot manager for a layered device, + * before it's been exposed yet. + */ +void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, + const struct blk_keyslot_manager *child) +{ + if (child) { + unsigned int i; + + parent->max_dun_bytes_supported = + min(parent->max_dun_bytes_supported, + child->max_dun_bytes_supported); + for (i = 0; i < ARRAY_SIZE(child->crypto_modes_supported); + i++) { + parent->crypto_modes_supported[i] &= + child->crypto_modes_supported[i]; + } + } else { + parent->max_dun_bytes_supported = 0; + memset(parent->crypto_modes_supported, 0, + sizeof(parent->crypto_modes_supported)); + } +} +EXPORT_SYMBOL_GPL(blk_ksm_intersect_modes); + +/** + * blk_ksm_is_superset() - Check if a KSM supports a superset of crypto modes + * and DUN bytes that another KSM supports. Here, + * "superset" refers to the mathematical meaning of the + * word - i.e. if two KSMs have the *same* capabilities, + * they *are* considered supersets of each other. + * @ksm_superset: The KSM that we want to verify is a superset + * @ksm_subset: The KSM that we want to verify is a subset + * + * Return: True if @ksm_superset supports a superset of the crypto modes and DUN + * bytes that @ksm_subset supports. 
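+ *
+ * Illustrative sketch (not from in-tree code): a layered driver may use this
+ * to refuse a reconfiguration that would shrink the capabilities already
+ * advertised to upper layers:
+ *
+ *	if (!blk_ksm_is_superset(new_ksm, old_ksm))
+ *		return -EINVAL;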
+ */
+bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
+			 struct blk_keyslot_manager *ksm_subset)
+{
+	int i;
+
+	if (!ksm_subset)
+		return true;
+
+	if (!ksm_superset)
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(ksm_superset->crypto_modes_supported); i++) {
+		if (ksm_subset->crypto_modes_supported[i] &
+		    (~ksm_superset->crypto_modes_supported[i])) {
+			return false;
+		}
+	}
+
+	if (ksm_subset->max_dun_bytes_supported >
+	    ksm_superset->max_dun_bytes_supported) {
+		return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(blk_ksm_is_superset);
+
+/**
+ * blk_ksm_update_capabilities() - Update the restrictions of a KSM to those of
+ *				   another KSM
+ * @target_ksm: The KSM whose restrictions to update.
+ * @reference_ksm: The KSM whose restrictions @target_ksm's restrictions will
+ *		   be updated to match.
+ *
+ * Blk-crypto requires that crypto capabilities that were
+ * advertised when a bio was created continue to be supported by the
+ * device until that bio is ended. This in turn means that a device cannot
+ * shrink its advertised crypto capabilities without any explicit
+ * synchronization with upper layers. So if there's no such explicit
+ * synchronization, @reference_ksm must support all the crypto capabilities
+ * that @target_ksm does
+ * (i.e. we need blk_ksm_is_superset(@reference_ksm, @target_ksm) == true).
+ *
+ * Note also that as long as the crypto capabilities are being expanded, the
+ * order of updates becoming visible is not important because it's alright
+ * for blk-crypto to see stale values - they only cause blk-crypto to
+ * believe that a crypto capability isn't supported when it actually is (which
+ * might result in blk-crypto-fallback being used if available, or the bio
+ * being failed).
+ */
+void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
+				 struct blk_keyslot_manager *reference_ksm)
+{
+	memcpy(target_ksm->crypto_modes_supported,
+	       reference_ksm->crypto_modes_supported,
+	       sizeof(target_ksm->crypto_modes_supported));
+
+	target_ksm->max_dun_bytes_supported =
+		reference_ksm->max_dun_bytes_supported;
+}
+EXPORT_SYMBOL_GPL(blk_ksm_update_capabilities);
+
+/**
+ * blk_ksm_init_passthrough() - Init a passthrough keyslot manager
+ * @ksm: The keyslot manager to init
+ *
+ * Initialize a passthrough keyslot manager.
+ * Called by e.g. storage drivers to set up a keyslot manager in their
+ * request_queue, when the storage driver wants to manage its keys by itself.
+ * This is useful for inline encryption hardware that doesn't have the concept
+ * of keyslots, and for layered devices.
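+ *
+ * Usage sketch ("hba" and my_ksm_ll_ops are hypothetical driver names):
+ *
+ *	blk_ksm_init_passthrough(&hba->ksm);
+ *	hba->ksm.ksm_ll_ops = my_ksm_ll_ops;
+ *	hba->ksm.max_dun_bytes_supported = 8;
+ *	blk_ksm_register(&hba->ksm, q);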
+ */ +void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm) +{ + memset(ksm, 0, sizeof(*ksm)); + init_rwsem(&ksm->lock); +} +EXPORT_SYMBOL_GPL(blk_ksm_init_passthrough); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 81e3279ecd..a0ffbabfac 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -151,6 +151,7 @@ struct kyber_ctx_queue { struct kyber_queue_data { struct request_queue *q; + dev_t dev; /* * Each scheduling domain has a limited number of in-flight requests @@ -257,7 +258,7 @@ static int calculate_percentile(struct kyber_queue_data *kqd, } memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type])); - trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain], + trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain], kyber_latency_type_names[type], percentile, bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples); @@ -270,7 +271,7 @@ static void kyber_resize_domain(struct kyber_queue_data *kqd, depth = clamp(depth, 1U, kyber_depth[sched_domain]); if (depth != kqd->domain_tokens[sched_domain].sb.depth) { sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); - trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain], + trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain], depth); } } @@ -366,6 +367,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) goto err; kqd->q = q; + kqd->dev = disk_devt(q->disk); kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency, GFP_KERNEL | __GFP_ZERO); @@ -596,13 +598,13 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *head = &kcq->rq_list[sched_domain]; spin_lock(&kcq->lock); + trace_block_rq_insert(rq); if (at_head) list_move(&rq->queuelist, head); else list_move_tail(&rq->queuelist, head); sbitmap_set_bit(&khd->kcq_map[sched_domain], rq->mq_ctx->index_hw[hctx->type]); - trace_block_rq_insert(rq); spin_unlock(&kcq->lock); } } @@ -774,7 +776,7 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, list_del_init(&rq->queuelist); return rq; } else { - trace_kyber_throttled(kqd->q, + trace_kyber_throttled(kqd->dev, kyber_domain_names[khd->cur_domain]); } } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) { @@ -787,7 +789,7 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, list_del_init(&rq->queuelist); return rq; } else { - trace_kyber_throttled(kqd->q, + trace_kyber_throttled(kqd->dev, kyber_domain_names[khd->cur_domain]); } } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 2b9635d0dc..7f3c3932b7 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -18,6 +18,8 @@ #include #include +#include + #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -33,41 +35,122 @@ static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ +enum dd_data_dir { + DD_READ = READ, + DD_WRITE = WRITE, +}; + +enum { DD_DIR_COUNT = 2 }; + +enum dd_prio { + DD_RT_PRIO = 0, + DD_BE_PRIO = 1, + DD_IDLE_PRIO = 2, + DD_PRIO_MAX = 2, +}; + +enum { DD_PRIO_COUNT = 3 }; + +/* I/O statistics per I/O priority. */ +struct io_stats_per_prio { + local_t inserted; + local_t merged; + local_t dispatched; + local_t completed; +}; + +/* I/O statistics for all I/O priorities (enum dd_prio). */ +struct io_stats { + struct io_stats_per_prio stats[DD_PRIO_COUNT]; +}; + +/* + * Deadline scheduler data per I/O priority (enum dd_prio). 
Requests are + * present on both sort_list[] and fifo_list[]. + */ +struct dd_per_prio { + struct list_head dispatch; + struct rb_root sort_list[DD_DIR_COUNT]; + struct list_head fifo_list[DD_DIR_COUNT]; + /* Next request in FIFO order. Read, write or both are NULL. */ + struct request *next_rq[DD_DIR_COUNT]; +}; + struct deadline_data { /* * run time data */ - /* - * requests (deadline_rq s) are present on both sort_list and fifo_list - */ - struct rb_root sort_list[2]; - struct list_head fifo_list[2]; + struct dd_per_prio per_prio[DD_PRIO_COUNT]; - /* - * next in sort order. read, write or both are NULL - */ - struct request *next_rq[2]; + /* Data direction of latest dispatched request. */ + enum dd_data_dir last_dir; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ + struct io_stats __percpu *stats; + /* * settings that change how the i/o scheduler behaves */ - int fifo_expire[2]; + int fifo_expire[DD_DIR_COUNT]; int fifo_batch; int writes_starved; int front_merges; + u32 async_depth; spinlock_t lock; spinlock_t zone_lock; - struct list_head dispatch; +}; + +/* Count one event of type 'event_type' and with I/O priority 'prio' */ +#define dd_count(dd, event_type, prio) do { \ + struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \ + \ + BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ + BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ + local_inc(&io_stats->stats[(prio)].event_type); \ + put_cpu_ptr(io_stats); \ +} while (0) + +/* + * Returns the total number of dd_count(dd, event_type, prio) calls across all + * CPUs. No locking or barriers since it is fine if the returned sum is slightly + * outdated. + */ +#define dd_sum(dd, event_type, prio) ({ \ + unsigned int cpu; \ + u32 sum = 0; \ + \ + BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ + BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ + for_each_present_cpu(cpu) \ + sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \ + stats[(prio)].event_type); \ + sum; \ +}) + +/* Maps an I/O priority class to a deadline scheduler priority. */ +static const enum dd_prio ioprio_class_to_prio[] = { + [IOPRIO_CLASS_NONE] = DD_BE_PRIO, + [IOPRIO_CLASS_RT] = DD_RT_PRIO, + [IOPRIO_CLASS_BE] = DD_BE_PRIO, + [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; static inline struct rb_root * -deadline_rb_root(struct deadline_data *dd, struct request *rq) +deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { - return &dd->sort_list[rq_data_dir(rq)]; + return &per_prio->sort_list[rq_data_dir(rq)]; +} + +/* + * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a + * request. 
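+ *
+ * For example, a request submitted with IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4)
+ * makes this function return IOPRIO_CLASS_RT, which ioprio_class_to_prio[]
+ * then maps to DD_RT_PRIO.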
+ */ +static u8 dd_rq_ioclass(struct request *rq) +{ + return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); } /* @@ -85,38 +168,38 @@ deadline_latter_request(struct request *rq) } static void -deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) +deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { - struct rb_root *root = deadline_rb_root(dd, rq); + struct rb_root *root = deadline_rb_root(per_prio, rq); elv_rb_add(root, rq); } static inline void -deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) +deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { - const int data_dir = rq_data_dir(rq); + const enum dd_data_dir data_dir = rq_data_dir(rq); - if (dd->next_rq[data_dir] == rq) - dd->next_rq[data_dir] = deadline_latter_request(rq); + if (per_prio->next_rq[data_dir] == rq) + per_prio->next_rq[data_dir] = deadline_latter_request(rq); - elv_rb_del(deadline_rb_root(dd, rq), rq); + elv_rb_del(deadline_rb_root(per_prio, rq), rq); } /* * remove rq from rbtree and fifo. */ -static void deadline_remove_request(struct request_queue *q, struct request *rq) +static void deadline_remove_request(struct request_queue *q, + struct dd_per_prio *per_prio, + struct request *rq) { - struct deadline_data *dd = q->elevator->elevator_data; - list_del_init(&rq->queuelist); /* * We might not be on the rbtree, if we are doing an insert merge */ if (!RB_EMPTY_NODE(&rq->rb_node)) - deadline_del_rq_rb(dd, rq); + deadline_del_rq_rb(per_prio, rq); elv_rqhash_del(q, rq); if (q->last_merge == rq) @@ -127,19 +210,31 @@ static void dd_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(req); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; /* * if the merge was a front merge, we need to reposition request */ if (type == ELEVATOR_FRONT_MERGE) { - elv_rb_del(deadline_rb_root(dd, req), req); - deadline_add_rq_rb(dd, req); + elv_rb_del(deadline_rb_root(per_prio, req), req); + deadline_add_rq_rb(per_prio, req); } } +/* + * Callback function that is invoked after @next has been merged into @req. 
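+ * The merged-request statistics are updated for the I/O priority class of
+ * @next, since @next is the request that is taken off the lists.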
+ */ static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(next); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + + dd_count(dd, merged, prio); + /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo @@ -155,34 +250,34 @@ static void dd_merged_requests(struct request_queue *q, struct request *req, /* * kill knowledge of next, this one is a goner */ - deadline_remove_request(q, next); + deadline_remove_request(q, &dd->per_prio[prio], next); } /* * move an entry to dispatch queue */ static void -deadline_move_request(struct deadline_data *dd, struct request *rq) +deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + struct request *rq) { - const int data_dir = rq_data_dir(rq); + const enum dd_data_dir data_dir = rq_data_dir(rq); - dd->next_rq[READ] = NULL; - dd->next_rq[WRITE] = NULL; - dd->next_rq[data_dir] = deadline_latter_request(rq); + per_prio->next_rq[data_dir] = deadline_latter_request(rq); /* * take it off the sort and fifo list */ - deadline_remove_request(rq->q, rq); + deadline_remove_request(rq->q, per_prio, rq); } /* * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) */ -static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) +static inline int deadline_check_fifo(struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) { - struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); + struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); /* * rq is expired! @@ -198,19 +293,17 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) * dispatch using arrival ordered lists. */ static struct request * -deadline_fifo_request(struct deadline_data *dd, int data_dir) +deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; - if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; - if (list_empty(&dd->fifo_list[data_dir])) - return NULL; - - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); - if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) return rq; /* @@ -218,7 +311,7 @@ deadline_fifo_request(struct deadline_data *dd, int data_dir) * an unlocked target zone. */ spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { + list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { if (blk_req_can_dispatch_to_zone(rq)) goto out; } @@ -234,19 +327,17 @@ deadline_fifo_request(struct deadline_data *dd, int data_dir) * dispatch using sector position sorted lists. 
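 * For zoned block devices, only write requests whose target zone is not
 * locked are eligible; see the zone_lock handling in the function body.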
 */
static struct request *
-deadline_next_request(struct deadline_data *dd, int data_dir)
+deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+		      enum dd_data_dir data_dir)
 {
 	struct request *rq;
 	unsigned long flags;
 
-	if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
-		return NULL;
-
-	rq = dd->next_rq[data_dir];
+	rq = per_prio->next_rq[data_dir];
 	if (!rq)
 		return NULL;
 
-	if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+	if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
 		return rq;
 
 	/*
@@ -268,28 +359,27 @@
  * deadline_dispatch_requests selects the best request according to
  * read/write expire, fifo_batch, etc
  */
-static struct request *__dd_dispatch_request(struct deadline_data *dd)
+static struct request *__dd_dispatch_request(struct deadline_data *dd,
+					     struct dd_per_prio *per_prio)
 {
 	struct request *rq, *next_rq;
-	bool reads, writes;
-	int data_dir;
+	enum dd_data_dir data_dir;
+	enum dd_prio prio;
+	u8 ioprio_class;
 
-	if (!list_empty(&dd->dispatch)) {
-		rq = list_first_entry(&dd->dispatch, struct request, queuelist);
+	lockdep_assert_held(&dd->lock);
+
+	if (!list_empty(&per_prio->dispatch)) {
+		rq = list_first_entry(&per_prio->dispatch, struct request,
+				      queuelist);
 		list_del_init(&rq->queuelist);
 		goto done;
 	}
 
-	reads = !list_empty(&dd->fifo_list[READ]);
-	writes = !list_empty(&dd->fifo_list[WRITE]);
-
 	/*
 	 * batches are currently reads XOR writes
 	 */
-	rq = deadline_next_request(dd, WRITE);
-	if (!rq)
-		rq = deadline_next_request(dd, READ);
-
+	rq = deadline_next_request(dd, per_prio, dd->last_dir);
 	if (rq && dd->batching < dd->fifo_batch)
 		/* we have a next request and are still entitled to batch */
 		goto dispatch_request;
 
@@ -299,14 +389,14 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd)
 	 * data direction (read / write)
 	 */
 
-	if (reads) {
-		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
+	if (!list_empty(&per_prio->fifo_list[DD_READ])) {
+		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));
 
-		if (deadline_fifo_request(dd, WRITE) &&
+		if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
 		    (dd->starved++ >= dd->writes_starved))
 			goto dispatch_writes;
 
-		data_dir = READ;
+		data_dir = DD_READ;
 
 		goto dispatch_find_request;
 	}
 
@@ -315,13 +405,13 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd)
 	 * there are either no reads or writes have been starved
 	 */
 
-	if (writes) {
+	if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
 dispatch_writes:
-		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
+		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));
 
 		dd->starved = 0;
 
-		data_dir = WRITE;
+		data_dir = DD_WRITE;
 
 		goto dispatch_find_request;
 	}
 
@@ -332,14 +422,14 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd)
 	/*
 	 * we are not running a batch, find best request for selected data_dir
 	 */
-	next_rq = deadline_next_request(dd, data_dir);
-	if (deadline_check_fifo(dd, data_dir) || !next_rq) {
+	next_rq = deadline_next_request(dd, per_prio, data_dir);
+	if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
 		/*
 		 * A deadline has expired, the last request was in the other
 		 * direction, or we have run out of higher-sectored requests.
 		 * Start again from the request with the earliest expiry time.
*/ - rq = deadline_fifo_request(dd, data_dir); + rq = deadline_fifo_request(dd, per_prio, data_dir); } else { /* * The last req was the same dir and we have a next request in @@ -355,6 +445,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) if (!rq) return NULL; + dd->last_dir = data_dir; dd->batching = 0; dispatch_request: @@ -362,8 +453,11 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * rq is the selected appropriate request. */ dd->batching++; - deadline_move_request(dd, rq); + deadline_move_request(dd, per_prio, rq); done: + ioprio_class = dd_rq_ioclass(rq); + prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, dispatched, prio); /* * If the request needs its target zone locked, do it. */ @@ -373,6 +467,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) } /* + * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). + * * One confusing aspect here is that we get called for a specific * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared @@ -382,22 +478,70 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct request *rq; + enum dd_prio prio; spin_lock(&dd->lock); - rq = __dd_dispatch_request(dd); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); + if (rq) + break; + } spin_unlock(&dd->lock); - if (rq) - atomic_dec(&rq->mq_hctx->elevator_queued); return rq; } -static void dd_exit_queue(struct elevator_queue *e) +/* + * Called by __blk_mq_alloc_request(). The shallow_depth value set by this + * function is used by __blk_mq_get_tag(). + */ +static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ + struct deadline_data *dd = data->q->elevator->elevator_data; + + /* Do not throttle synchronous reads. */ + if (op_is_sync(op) && !op_is_write(op)) + return; + + /* + * Throttle asynchronous requests and writes such that these requests + * do not block the allocation of synchronous requests. + */ + data->shallow_depth = dd->async_depth; +} + +/* Called by blk_mq_update_nr_requests(). */ +static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + + dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); +} + +/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ +static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + dd_depth_updated(hctx); + return 0; +} + +static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; + enum dd_prio prio; - BUG_ON(!list_empty(&dd->fifo_list[READ])); - BUG_ON(!list_empty(&dd->fifo_list[WRITE])); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); + } + + free_percpu(dd->stats); kfree(dd); } @@ -405,55 +549,82 @@ static void dd_exit_queue(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). 
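 * The per-CPU statistics and the per-priority dispatch/FIFO/sort lists are
 * set up here; the unwind labels at the end free everything in reverse order
 * if an allocation fails.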
*/ -static int dd_init_queue(struct request_queue *q, struct elevator_type *e) +static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; struct elevator_queue *eq; + enum dd_prio prio; + int ret = -ENOMEM; eq = elevator_alloc(q, e); if (!eq) - return -ENOMEM; + return ret; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); - if (!dd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } + if (!dd) + goto put_eq; + eq->elevator_data = dd; - INIT_LIST_HEAD(&dd->fifo_list[READ]); - INIT_LIST_HEAD(&dd->fifo_list[WRITE]); - dd->sort_list[READ] = RB_ROOT; - dd->sort_list[WRITE] = RB_ROOT; - dd->fifo_expire[READ] = read_expire; - dd->fifo_expire[WRITE] = write_expire; + dd->stats = alloc_percpu_gfp(typeof(*dd->stats), + GFP_KERNEL | __GFP_ZERO); + if (!dd->stats) + goto free_dd; + + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + INIT_LIST_HEAD(&per_prio->dispatch); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); + per_prio->sort_list[DD_READ] = RB_ROOT; + per_prio->sort_list[DD_WRITE] = RB_ROOT; + } + dd->fifo_expire[DD_READ] = read_expire; + dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; + dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); - INIT_LIST_HEAD(&dd->dispatch); q->elevator = eq; return 0; + +free_dd: + kfree(dd); + +put_eq: + kobject_put(&eq->kobj); + return ret; } +/* + * Try to merge @bio into an existing request. If @bio has been merged into + * an existing request, store the pointer to that request into *@rq. + */ static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; sector_t sector = bio_end_sector(bio); struct request *__rq; if (!dd->front_merges) return ELEVATOR_NO_MERGE; - __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); + __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } } @@ -461,6 +632,10 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, return ELEVATOR_NO_MERGE; } +/* + * Attempt to merge a bio into an existing request. This function is called + * before @bio is associated with a request. 
+ */ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { @@ -486,7 +661,14 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; - const int data_dir = rq_data_dir(rq); + const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); + u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); + struct dd_per_prio *per_prio; + enum dd_prio prio; + LIST_HEAD(free); + + lockdep_assert_held(&dd->lock); /* * This may be a requeue of a write request that has locked its @@ -494,18 +676,22 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ blk_req_zone_write_unlock(rq); - if (blk_mq_sched_try_insert_merge(q, rq)) + prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, inserted, prio); + rq->elv.priv[0] = (void *)(uintptr_t)1; + + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { + blk_mq_free_requests(&free); return; + } - blk_mq_sched_request_inserted(rq); + trace_block_rq_insert(rq); - if (at_head || blk_rq_is_passthrough(rq)) { - if (at_head) - list_add(&rq->queuelist, &dd->dispatch); - else - list_add_tail(&rq->queuelist, &dd->dispatch); + per_prio = &dd->per_prio[prio]; + if (at_head) { + list_add(&rq->queuelist, &per_prio->dispatch); } else { - deadline_add_rq_rb(dd, rq); + deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -517,10 +703,13 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; - list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); + list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); } } +/* + * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests(). + */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool at_head) { @@ -534,20 +723,19 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, at_head); - atomic_inc(&hctx->elevator_queued); } spin_unlock(&dd->lock); } -/* - * Nothing to do here. This is defined only to ensure that .finish_request - * method is called upon request completion. - */ +/* Callback from inside blk_mq_rq_ctx_init(). */ static void dd_prepare_request(struct request *rq) { + rq->elv.priv[0] = NULL; } /* + * Callback from inside blk_mq_free_request(). + * * For zoned block devices, write unlock the target zone of * completed write requests. Do this while holding the zone lock * spinlock so that the zone is never unlocked while deadline_fifo_request() @@ -564,86 +752,99 @@ static void dd_prepare_request(struct request *rq) static void dd_finish_request(struct request *rq) { struct request_queue *q = rq->q; + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(rq); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + /* + * The block layer core may call dd_finish_request() without having + * called dd_insert_requests(). Hence only update statistics for + * requests for which dd_insert_requests() has been called. See also + * blk_mq_request_bypass_insert(). 
+ */ + if (rq->elv.priv[0]) + dd_count(dd, completed, prio); if (blk_queue_is_zoned(q)) { - struct deadline_data *dd = q->elevator->elevator_data; unsigned long flags; spin_lock_irqsave(&dd->zone_lock, flags); blk_req_zone_write_unlock(rq); - if (!list_empty(&dd->fifo_list[WRITE])) + if (!list_empty(&per_prio->fifo_list[DD_WRITE])) blk_mq_sched_mark_restart_hctx(rq->mq_hctx); spin_unlock_irqrestore(&dd->zone_lock, flags); } } +static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) +{ + return !list_empty_careful(&per_prio->dispatch) || + !list_empty_careful(&per_prio->fifo_list[DD_READ]) || + !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); +} + static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; + enum dd_prio prio; - if (!atomic_read(&hctx->elevator_queued)) - return false; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) + if (dd_has_work_for_prio(&dd->per_prio[prio])) + return true; - return !list_empty_careful(&dd->dispatch) || - !list_empty_careful(&dd->fifo_list[0]) || - !list_empty_careful(&dd->fifo_list[1]); + return false; } /* * sysfs parts below */ -static ssize_t -deadline_var_show(int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static void -deadline_var_store(int *var, const char *page) -{ - char *p = (char *) page; - - *var = simple_strtol(p, &p, 10); -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +#define SHOW_INT(__FUNC, __VAR) \ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct deadline_data *dd = e->elevator_data; \ - int __data = __VAR; \ - if (__CONV) \ - __data = jiffies_to_msecs(__data); \ - return deadline_var_show(__data, (page)); \ + \ + return sysfs_emit(page, "%d\n", __VAR); \ } -SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); -SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); -SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); -SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); -SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); -#undef SHOW_FUNCTION +#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) +SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); +SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); +SHOW_INT(deadline_writes_starved_show, dd->writes_starved); +SHOW_INT(deadline_front_merges_show, dd->front_merges); +SHOW_INT(deadline_async_depth_show, dd->front_merges); +SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); +#undef SHOW_INT +#undef SHOW_JIFFIES #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct deadline_data *dd = e->elevator_data; \ - int __data; \ - deadline_var_store(&__data, (page)); \ + int __data, __ret; \ + \ + __ret = kstrtoint(page, 0, &__data); \ + if (__ret < 0) \ + return __ret; \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ - if (__CONV) \ - *(__PTR) = msecs_to_jiffies(__data); \ - else \ - *(__PTR) = __data; \ + *(__PTR) = __CONV(__data); \ return count; \ } -STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); -STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); -STORE_FUNCTION(deadline_fifo_batch_store, 
&dd->fifo_batch, 0, INT_MAX, 0); +#define STORE_INT(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) +#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) +STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); +STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); +STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); +STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); +STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); +STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION +#undef STORE_INT +#undef STORE_JIFFIES #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) @@ -653,21 +854,23 @@ static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), + DD_ATTR(async_depth), DD_ATTR(fifo_batch), __ATTR_NULL }; #ifdef CONFIG_BLK_DEBUG_FS -#define DEADLINE_DEBUGFS_DDIR_ATTRS(ddir, name) \ +#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ static void *deadline_##name##_fifo_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ - return seq_list_start(&dd->fifo_list[ddir], *pos); \ + return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ } \ \ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ @@ -675,8 +878,9 @@ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ - return seq_list_next(v, &dd->fifo_list[ddir], pos); \ + return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ } \ \ static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ @@ -700,14 +904,20 @@ static int deadline_##name##_next_rq_show(void *data, \ { \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ - struct request *rq = dd->next_rq[ddir]; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + struct request *rq = per_prio->next_rq[data_dir]; \ \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } -DEADLINE_DEBUGFS_DDIR_ATTRS(READ, read) -DEADLINE_DEBUGFS_DDIR_ATTRS(WRITE, write) + +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) @@ -728,49 +938,120 @@ static int deadline_starved_show(void *data, struct seq_file *m) return 0; } -static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos) - __acquires(&dd->lock) +static int dd_async_depth_show(void *data, struct seq_file *m) { - struct request_queue *q = m->private; + struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; - spin_lock(&dd->lock); - return seq_list_start(&dd->dispatch, *pos); + seq_printf(m, "%u\n", dd->async_depth); +
return 0; } -static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos) +/* Number of requests queued for a given priority level. */ +static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) { - struct request_queue *q = m->private; + return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio); +} + +static int dd_queued_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; - return seq_list_next(v, &dd->dispatch, pos); + seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO), + dd_queued(dd, DD_BE_PRIO), + dd_queued(dd, DD_IDLE_PRIO)); + return 0; } -static void deadline_dispatch_stop(struct seq_file *m, void *v) - __releases(&dd->lock) +/* Number of requests owned by the block driver for a given priority. */ +static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) { - struct request_queue *q = m->private; + return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) + - dd_sum(dd, completed, prio); +} + +static int dd_owned_by_driver_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; - spin_unlock(&dd->lock); + seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO), + dd_owned_by_driver(dd, DD_BE_PRIO), + dd_owned_by_driver(dd, DD_IDLE_PRIO)); + return 0; } -static const struct seq_operations deadline_dispatch_seq_ops = { - .start = deadline_dispatch_start, - .next = deadline_dispatch_next, - .stop = deadline_dispatch_stop, - .show = blk_mq_debugfs_rq_show, -}; +#define DEADLINE_DISPATCH_ATTR(prio) \ +static void *deadline_dispatch##prio##_start(struct seq_file *m, \ + loff_t *pos) \ + __acquires(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + spin_lock(&dd->lock); \ + return seq_list_start(&per_prio->dispatch, *pos); \ +} \ + \ +static void *deadline_dispatch##prio##_next(struct seq_file *m, \ + void *v, loff_t *pos) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + return seq_list_next(v, &per_prio->dispatch, pos); \ +} \ + \ +static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ + __releases(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + \ + spin_unlock(&dd->lock); \ +} \ + \ +static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ + .start = deadline_dispatch##prio##_start, \ + .next = deadline_dispatch##prio##_next, \ + .stop = deadline_dispatch##prio##_stop, \ + .show = blk_mq_debugfs_rq_show, \ +} -#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ - {#name "_fifo_list", 0400, .seq_ops = &deadline_##name##_fifo_seq_ops}, \ +DEADLINE_DISPATCH_ATTR(0); +DEADLINE_DISPATCH_ATTR(1); +DEADLINE_DISPATCH_ATTR(2); +#undef DEADLINE_DISPATCH_ATTR + +#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ + {#name "_fifo_list", 0400, \ + .seq_ops = &deadline_##name##_fifo_seq_ops} +#define DEADLINE_NEXT_RQ_ATTR(name) \ {#name "_next_rq", 0400, deadline_##name##_next_rq_show} static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { - DEADLINE_QUEUE_DDIR_ATTRS(read), - DEADLINE_QUEUE_DDIR_ATTRS(write), + DEADLINE_QUEUE_DDIR_ATTRS(read0), + DEADLINE_QUEUE_DDIR_ATTRS(write0), + DEADLINE_QUEUE_DDIR_ATTRS(read1), + 
DEADLINE_QUEUE_DDIR_ATTRS(write1), + DEADLINE_QUEUE_DDIR_ATTRS(read2), + DEADLINE_QUEUE_DDIR_ATTRS(write2), + DEADLINE_NEXT_RQ_ATTR(read0), + DEADLINE_NEXT_RQ_ATTR(write0), + DEADLINE_NEXT_RQ_ATTR(read1), + DEADLINE_NEXT_RQ_ATTR(write1), + DEADLINE_NEXT_RQ_ATTR(read2), + DEADLINE_NEXT_RQ_ATTR(write2), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, - {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, + {"async_depth", 0400, dd_async_depth_show}, + {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, + {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, + {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, + {"owned_by_driver", 0400, dd_owned_by_driver_show}, + {"queued", 0400, dd_queued_show}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS @@ -778,6 +1059,8 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { static struct elevator_type mq_deadline = { .ops = { + .depth_updated = dd_depth_updated, + .limit_depth = dd_limit_depth, .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .prepare_request = dd_prepare_request, @@ -789,8 +1072,9 @@ static struct elevator_type mq_deadline = { .requests_merged = dd_merged_requests, .request_merged = dd_request_merged, .has_work = dd_has_work, - .init_sched = dd_init_queue, - .exit_sched = dd_exit_queue, + .init_sched = dd_init_sched, + .exit_sched = dd_exit_sched, + .init_hctx = dd_init_hctx, }, #ifdef CONFIG_BLK_DEBUG_FS @@ -817,6 +1101,6 @@ static void __exit deadline_exit(void) module_init(deadline_init); module_exit(deadline_exit); -MODULE_AUTHOR("Jens Axboe"); +MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MQ deadline IO scheduler"); diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 6e2a649669..278593b8e4 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -264,7 +264,6 @@ config SYSV68_PARTITION config CMDLINE_PARTITION bool "Command line partition support" if PARTITION_ADVANCED - select BLK_CMDLINE_PARSER help Say Y here if you want to read the partition table from bootargs. The format for the command line is just like mtdparts. diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index c64c57b958..2c381c694c 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -275,7 +275,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state) /* * Work out start of non-adfs partition. */ - nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; + nr_sects = get_capacity(state->disk) - start_sect; if (start_sect) { switch (id) { @@ -540,7 +540,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state) if (i != 0) { sector_t size; - size = get_capacity(state->bdev->bd_disk); + size = get_capacity(state->disk); put_partition(state, slot++, start, size - start); strlcat(state->pp_buf, "\n", PAGE_SIZE); } diff --git a/block/partitions/aix.c b/block/partitions/aix.c index c7b4fd1a4a..85f4b96756 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -66,22 +66,6 @@ struct pvd { #define LVM_MAXLVS 256 -/** - * last_lba(): return number of last logical block of device - * @bdev: block device - * - * Description: Returns last LBA value on success, 0 on error. - * This is stored (by sd and ide-geometry) in - * the part[0] entry for this disk, and is the number of - * physical sectors available on the disk. 
- */ -static u64 last_lba(struct block_device *bdev) -{ - if (!bdev || !bdev->bd_inode) - return 0; - return (bdev->bd_inode->i_size >> 9) - 1ULL; -} - /** * read_lba(): Read bytes from disk, starting at given LBA * @state @@ -89,7 +73,7 @@ static u64 last_lba(struct block_device *bdev) * @buffer * @count * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, @@ -97,7 +81,7 @@ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, { size_t totalreadcount = 0; - if (!buffer || lba + count / 512 > last_lba(state->bdev)) + if (!buffer || lba + count / 512 > get_capacity(state->disk) - 1ULL) return 0; while (count) { diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 9526491d9a..5c8624e26a 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -34,7 +34,6 @@ int amiga_partition(struct parsed_partitions *state) int start_sect, nr_sects, blk, part, res = 0; int blksize = 1; /* Multiplier for disk block size */ int slot = 1; - char b[BDEVNAME_SIZE]; for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) @@ -42,7 +41,7 @@ int amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, &sect); if (!data) { pr_err("Dev %s: unable to read RDB block %d\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); res = -1; goto rdb_done; } @@ -64,7 +63,7 @@ int amiga_partition(struct parsed_partitions *state) } pr_err("Dev %s: RDB in block %d has bad checksum\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); } /* blksize is blocks per 512 byte standard block */ @@ -84,7 +83,7 @@ int amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, &sect); if (!data) { pr_err("Dev %s: unable to read partition block %d\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); res = -1; goto rdb_done; } diff --git a/block/partitions/atari.c b/block/partitions/atari.c index 2305840c85..da59941754 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -47,7 +47,7 @@ int atari_partition(struct parsed_partitions *state) * ATARI partition scheme supports 512 lba only. If this is not * the case, bail early to avoid miscalculating hd_size. */ - if (bdev_logical_block_size(state->bdev) != 512) + if (queue_logical_block_size(state->disk->queue) != 512) return 0; rs = read_part_sector(state, 0, &sect); @@ -55,7 +55,7 @@ int atari_partition(struct parsed_partitions *state) return -1; /* Verify this is an Atari rootsector: */ - hd_size = state->bdev->bd_inode->i_size >> 9; + hd_size = get_capacity(state->disk); if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && diff --git a/block/partitions/check.h b/block/partitions/check.h index c577e9ee67..d5b28e309d 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -9,7 +9,7 @@ * description.
*/ struct parsed_partitions { - struct block_device *bdev; + struct gendisk *disk; char name[BDEVNAME_SIZE]; struct { sector_t from; diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 8f545c36cd..1af610f0ba 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -14,20 +14,248 @@ * For further information, see "Documentation/block/cmdline-partition.rst" * */ - -#include <linux/cmdline-parser.h> - +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/string.h> #include "check.h" + +/* partition flags */ +#define PF_RDONLY 0x01 /* Device is read only */ +#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ + +struct cmdline_subpart { + char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ + sector_t from; + sector_t size; + int flags; + struct cmdline_subpart *next_subpart; +}; + +struct cmdline_parts { + char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ + unsigned int nr_subparts; + struct cmdline_subpart *subpart; + struct cmdline_parts *next_parts; +}; + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); + new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ?
(sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +static void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} + +static int cmdline_parts_parse(struct cmdline_parts **parts, + const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} + +static struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} + static char *cmdline; static struct cmdline_parts *bdev_parts; -static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +static int add_part(int slot, struct cmdline_subpart *subpart, + struct parsed_partitions *state) { int label_min; struct partition_meta_info *info; char tmp[sizeof(info->volname) + 4]; - struct parsed_partitions *state = (struct parsed_partitions *)param; if (slot >= state->limit) return 1; @@ -50,6 +278,35 @@ static int add_part(int slot, struct cmdline_subpart *subpart, void *param) return 0; } +static int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + struct parsed_partitions *state) +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + int slot = 1; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, state)) + break; + } + + return slot; +} + static int __init cmdline_parts_setup(char *s) { cmdline = s; @@ -123,7 +380,6 @@ static void cmdline_parts_verifier(int slot, struct parsed_partitions *state) int cmdline_partition(struct parsed_partitions *state) { sector_t disk_size; - char bdev[BDEVNAME_SIZE]; struct cmdline_parts *parts; if (cmdline) { @@ -140,14 +396,13 @@ int cmdline_partition(struct parsed_partitions *state) if (!bdev_parts) return 0; - bdevname(state->bdev, bdev); - parts = cmdline_parts_find(bdev_parts, bdev); + parts = cmdline_parts_find(bdev_parts, state->disk->disk_name); if (!parts) return 0; - disk_size = get_capacity(state->bdev->bd_disk) << 9; + disk_size = get_capacity(state->disk) << 9; - cmdline_parts_set(parts, disk_size, 1, 
add_part, (void *)state); + cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); strlcat(state->pp_buf, "\n", PAGE_SIZE); diff --git a/block/partitions/core.c b/block/partitions/core.c index a02e224115..58c4c362c9 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -2,6 +2,7 @@ /* * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King + * Copyright (C) 2020 Christoph Hellwig */ #include #include @@ -85,6 +86,13 @@ static int (*check_part[])(struct parsed_partitions *) = { NULL }; +static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) +{ + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); +} + static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; @@ -112,8 +120,7 @@ static void free_partitions(struct parsed_partitions *state) kfree(state); } -static struct parsed_partitions *check_partition(struct gendisk *hd, - struct block_device *bdev) +static struct parsed_partitions *check_partition(struct gendisk *hd) { struct parsed_partitions *state; int i, res, err; @@ -128,8 +135,8 @@ static struct parsed_partitions *check_partition(struct gendisk *hd, } state->pp_buf[0] = '\0'; - state->bdev = bdev; - disk_name(hd, 0, state->name); + state->disk = hd; + snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); @@ -175,44 +182,39 @@ static struct parsed_partitions *check_partition(struct gendisk *hd, static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->partno); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); + return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->policy ? 
1 : 0); + return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, - p->start_sect)); + queue_limit_alignment_offset(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, - p->start_sect)); + queue_limit_discard_alignment(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -257,19 +259,17 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - struct hd_struct *p = dev_to_part(dev); - blk_free_devt(dev->devt); - hd_free_part(p); - kfree(p); + put_disk(dev_to_bdev(dev)->bd_disk); + iput(dev_to_bdev(dev)->bd_inode); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) { - struct hd_struct *part = dev_to_part(dev); + struct block_device *part = dev_to_bdev(dev); - add_uevent_var(env, "PARTN=%u", part->partno); - if (part->info && part->info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", part->info->volname); + add_uevent_var(env, "PARTN=%u", part->bd_partno); + if (part->bd_meta_info && part->bd_meta_info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); return 0; } @@ -280,73 +280,24 @@ struct device_type part_type = { .uevent = part_uevent, }; -static void hd_struct_free_work(struct work_struct *work) +static void delete_partition(struct block_device *part) { - struct hd_struct *part = - container_of(to_rcu_work(work), struct hd_struct, rcu_work); - struct gendisk *disk = part_to_disk(part); + lockdep_assert_held(&part->bd_disk->open_mutex); + + fsync_bdev(part); + __invalidate_device(part, true); + + xa_erase(&part->bd_disk->part_tbl, part->bd_partno); + kobject_put(part->bd_holder_dir); + device_del(&part->bd_device); /* - * Release the disk reference acquired in delete_partition here. - * We can't release it in hd_struct_free because the final put_device - * needs process context and thus can't be run directly from a - * percpu_ref ->release handler. + * Remove the block device from the inode hash, so that it cannot be + * looked up any more even when openers still hold references. */ - put_device(disk_to_dev(disk)); + remove_inode_hash(part->bd_inode); - part->start_sect = 0; - part->nr_sects = 0; - part_stat_set_all(part, 0); - put_device(part_to_dev(part)); -} - -static void hd_struct_free(struct percpu_ref *ref) -{ - struct hd_struct *part = container_of(ref, struct hd_struct, ref); - struct gendisk *disk = part_to_disk(part); - struct disk_part_tbl *ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - rcu_assign_pointer(ptbl->last_lookup, NULL); - - INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work); - queue_rcu_work(system_wq, &part->rcu_work); -} - -int hd_ref_init(struct hd_struct *part) -{ - if (percpu_ref_init(&part->ref, hd_struct_free, 0, GFP_KERNEL)) - return -ENOMEM; - return 0; -} - -/* - * Must be called either with bd_mutex held, before a disk can be opened or - * after all disk users are gone. 
- */ -void delete_partition(struct hd_struct *part) -{ - struct gendisk *disk = part_to_disk(part); - struct disk_part_tbl *ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - /* - * ->part_tbl is referenced in this part's release handler, so - * we have to hold the disk device - */ - get_device(disk_to_dev(disk)); - rcu_assign_pointer(ptbl->part[part->partno], NULL); - kobject_put(part->holder_dir); - device_del(part_to_dev(part)); - - /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. - */ - blk_invalidate_devt(part_devt(part)); - percpu_ref_kill(&part->ref); + put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, @@ -357,21 +308,25 @@ static ssize_t whole_disk_show(struct device *dev, static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* - * Must be called either with bd_mutex held, before a disk can be opened or + * Must be called either with open_mutex held, before a disk can be opened or * after all disk users are gone. */ -static struct hd_struct *add_partition(struct gendisk *disk, int partno, +static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { - struct hd_struct *p; dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; - struct disk_part_tbl *ptbl; + struct block_device *bdev; const char *dname; int err; + lockdep_assert_held(&disk->open_mutex); + + if (partno >= disk_max_parts(disk)) + return ERR_PTR(-EINVAL); + /* * Partitions are not supported on zoned block devices that are used as * such. 
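The per-disk partition table behind these hunks is now a plain xarray indexed by partition number, so the lookup, insert and delete paths reduce to xa_load(), xa_insert() and xa_erase() on disk->part_tbl. A minimal sketch of the lookup pattern, with a hypothetical helper name (the real callers, such as bdev_del_partition() below, open-code the xa_load() call):

    #include <linux/blkdev.h>
    #include <linux/xarray.h>

    /* Hypothetical helper: fetch a partition bdev by number. xa_load()
     * itself is lockless, so the caller must hold disk->open_mutex (as
     * bdev_del_partition() and bdev_resize_partition() do) to keep the
     * returned block_device from being deleted underneath it. */
    static struct block_device *part_lookup(struct gendisk *disk, int partno)
    {
            lockdep_assert_held(&disk->open_mutex);
            return xa_load(&disk->part_tbl, partno);
    }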
@@ -384,50 +339,27 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, case BLK_ZONED_HA: pr_info("%s: disabling host aware zoned block device support due to partitions\n", disk->disk_name); - disk->queue->limits.zoned = BLK_ZONED_NONE; + blk_queue_set_zoned(disk, BLK_ZONED_NONE); break; case BLK_ZONED_NONE: break; } - err = disk_expand_part_tbl(disk, partno); - if (err) - return ERR_PTR(err); - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - - if (ptbl->part[partno]) + if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return ERR_PTR(-EBUSY); + /* ensure we always have a reference to the whole disk */ + get_device(disk_to_dev(disk)); - p->dkstats = alloc_percpu(struct disk_stats); - if (!p->dkstats) { - err = -ENOMEM; - goto out_free; - } + err = -ENOMEM; + bdev = bdev_alloc(disk, partno); + if (!bdev) + goto out_put_disk; - hd_sects_seq_init(p); - pdev = part_to_dev(p); - - p->start_sect = start; - p->nr_sects = len; - p->partno = partno; - p->policy = get_disk_ro(disk); - - if (info) { - struct partition_meta_info *pinfo; - - pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); - if (!pinfo) { - err = -ENOMEM; - goto out_free_stats; - } - memcpy(pinfo, info, sizeof(*info)); - p->info = pinfo; - } + bdev->bd_start_sect = start; + bdev_set_nr_sectors(bdev, len); + pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); @@ -439,11 +371,24 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev->type = &part_type; pdev->parent = ddev; - err = blk_alloc_devt(p, &devt); - if (err) - goto out_free_info; + /* in consecutive minor range? */ + if (bdev->bd_partno < disk->minors) { + devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno); + } else { + err = blk_alloc_ext_minor(); + if (err < 0) + goto out_put; + devt = MKDEV(BLOCK_EXT_MAJOR, err); + } pdev->devt = devt; + if (info) { + err = -ENOMEM; + bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); + if (!bdev->bd_meta_info) + goto out_put; + } + /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); @@ -451,8 +396,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_put; err = -ENOMEM; - p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); - if (!p->holder_dir) + bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!bdev->bd_holder_dir) goto out_del; dev_set_uevent_suppress(pdev, 0); @@ -462,149 +407,118 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_del; } - err = hd_ref_init(p); - if (err) { - if (flags & ADDPART_FLAG_WHOLEDISK) - goto out_remove_file; - goto out_del; - } - /* everything is up and running, commence */ - rcu_assign_pointer(ptbl->part[partno], p); + err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); + if (err) + goto out_del; + bdev_add(bdev, devt); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - return p; + return bdev; -out_free_info: - kfree(p->info); -out_free_stats: - free_percpu(p->dkstats); -out_free: - kfree(p); - return ERR_PTR(err); -out_remove_file: - device_remove_file(pdev, &dev_attr_whole_disk); out_del: - kobject_put(p->holder_dir); + kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); +out_put_disk: + put_disk(disk); 
return ERR_PTR(err); } static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { - struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; bool overlap = false; + unsigned long idx; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) { - if (part->partno == skip_partno || - start >= part->start_sect + part->nr_sects || - start + length <= part->start_sect) - continue; - overlap = true; - break; + rcu_read_lock(); + xa_for_each_start(&disk->part_tbl, idx, part, 1) { + if (part->bd_partno != skip_partno && + start < part->bd_start_sect + bdev_nr_sectors(part) && + start + length > part->bd_start_sect) { + overlap = true; + break; + } } + rcu_read_unlock(); - disk_part_iter_exit(&piter); return overlap; } -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { - struct hd_struct *part; - - mutex_lock(&bdev->bd_mutex); - if (partition_overlaps(bdev->bd_disk, start, length, -1)) { - mutex_unlock(&bdev->bd_mutex); - return -EBUSY; - } - - part = add_partition(bdev->bd_disk, partno, start, length, - ADDPART_FLAG_NONE, NULL); - mutex_unlock(&bdev->bd_mutex); - return PTR_ERR_OR_ZERO(part); -} - -int bdev_del_partition(struct block_device *bdev, int partno) -{ - struct block_device *bdevp; - struct hd_struct *part = NULL; + struct block_device *part; int ret; - bdevp = bdget_disk(bdev->bd_disk, partno); - if (!bdevp) - return -ENXIO; + mutex_lock(&disk->open_mutex); + if (!disk_live(disk)) { + ret = -ENXIO; + goto out; + } - mutex_lock(&bdevp->bd_mutex); - mutex_lock_nested(&bdev->bd_mutex, 1); + if (partition_overlaps(disk, start, length, -1)) { + ret = -EBUSY; + goto out; + } - ret = -ENXIO; - part = disk_get_part(bdev->bd_disk, partno); + part = add_partition(disk, partno, start, length, + ADDPART_FLAG_NONE, NULL); + ret = PTR_ERR_OR_ZERO(part); +out: + mutex_unlock(&disk->open_mutex); + return ret; +} + +int bdev_del_partition(struct gendisk *disk, int partno) +{ + struct block_device *part = NULL; + int ret = -ENXIO; + + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; ret = -EBUSY; - if (bdevp->bd_openers) + if (part->bd_openers) goto out_unlock; - sync_blockdev(bdevp); - invalidate_bdev(bdevp); - delete_partition(part); ret = 0; out_unlock: - mutex_unlock(&bdev->bd_mutex); - mutex_unlock(&bdevp->bd_mutex); - bdput(bdevp); - if (part) - disk_put_part(part); + mutex_unlock(&disk->open_mutex); return ret; } -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { - struct block_device *bdevp; - struct hd_struct *part; - int ret = 0; + struct block_device *part = NULL; + int ret = -ENXIO; - part = disk_get_part(bdev->bd_disk, partno); + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, partno); if (!part) - return -ENXIO; - - ret = -ENOMEM; - bdevp = bdget_part(part); - if (!bdevp) - goto out_put_part; - - mutex_lock(&bdevp->bd_mutex); - mutex_lock_nested(&bdev->bd_mutex, 1); + goto out_unlock; ret = -EINVAL; - if (start != part->start_sect) + if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; - if (partition_overlaps(bdev->bd_disk, start, length, partno)) + if (partition_overlaps(disk, start, length, partno)) 
goto out_unlock; - part_nr_sects_write(part, length); - bd_set_nr_sectors(bdevp, length); + bdev_set_nr_sectors(part, length); ret = 0; out_unlock: - mutex_unlock(&bdevp->bd_mutex); - mutex_unlock(&bdev->bd_mutex); - bdput(bdevp); -out_put_part: - disk_put_part(part); + mutex_unlock(&disk->open_mutex); return ret; } @@ -624,35 +538,23 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -int blk_drop_partitions(struct block_device *bdev) +void blk_drop_partitions(struct gendisk *disk) { - struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; + unsigned long idx; - if (bdev->bd_part_count) - return -EBUSY; + lockdep_assert_held(&disk->open_mutex); - sync_blockdev(bdev); - invalidate_bdev(bdev); - - disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) + xa_for_each_start(&disk->part_tbl, idx, part, 1) delete_partition(part); - disk_part_iter_exit(&piter); - - return 0; } -#ifdef CONFIG_S390 -/* for historic reasons in the DASD driver */ -EXPORT_SYMBOL_GPL(blk_drop_partitions); -#endif -static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, +static bool blk_add_partition(struct gendisk *disk, struct parsed_partitions *state, int p) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; - struct hd_struct *part; + struct block_device *part; if (!size) return true; @@ -692,20 +594,20 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) - md_autodetect_dev(part_to_dev(part)->devt); + md_autodetect_dev(part->bd_dev); return true; } -int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) +static int blk_add_partitions(struct gendisk *disk) { struct parsed_partitions *state; - int ret = -EAGAIN, p, highest; + int ret = -EAGAIN, p; if (!disk_part_scan_enabled(disk)) return 0; - state = check_partition(disk, bdev); + state = check_partition(disk); if (!state) return 0; if (IS_ERR(state)) { @@ -748,17 +650,8 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - /* - * Detect the highest partition number and preallocate disk->part_tbl. - * This is an optimization and not strictly necessary. - */ - for (p = 1, highest = 0; p < state->limit; p++) - if (state->parts[p].size) - highest = p; - disk_expand_part_tbl(disk, highest); - for (p = 1; p < state->limit; p++) - if (!blk_add_partition(disk, bdev, state, p)) + if (!blk_add_partition(disk, state, p)) goto out_free_state; ret = 0; @@ -767,12 +660,64 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) return ret; } +int bdev_disk_changed(struct gendisk *disk, bool invalidate) +{ + int ret = 0; + + lockdep_assert_held(&disk->open_mutex); + + if (!disk_live(disk)) + return -ENXIO; + +rescan: + if (disk->open_partitions) + return -EBUSY; + sync_blockdev(disk->part0); + invalidate_bdev(disk->part0); + blk_drop_partitions(disk); + + clear_bit(GD_NEED_PART_SCAN, &disk->state); + + /* + * Historically we only set the capacity to zero for devices that + * support partitions (independent of actually having partitions created). + * Doing that is rather inconsistent, but changing it broke legacy + * udisks polling for legacy ide-cdrom devices.
Use the crude check + * below to get the sane behavior for most devices while not breaking + * userspace for this particular setup. + */ + if (invalidate) { + if (disk_part_scan_enabled(disk) || + !(disk->flags & GENHD_FL_REMOVABLE)) + set_capacity(disk, 0); + } + + if (get_capacity(disk)) { + ret = blk_add_partitions(disk); + if (ret == -EAGAIN) + goto rescan; + } else if (invalidate) { + /* + * Tell userspace that the media / partition table may have + * changed. + */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + } + + return ret; +} +/* + * Only exported for loop and dasd for historic reasons. Don't use in new + * code! + */ +EXPORT_SYMBOL_GPL(bdev_disk_changed); + void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { - struct address_space *mapping = state->bdev->bd_inode->i_mapping; + struct address_space *mapping = state->disk->part0->bd_inode->i_mapping; struct page *page; - if (n >= get_capacity(state->bdev->bd_disk)) { + if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; return NULL; } diff --git a/block/partitions/efi.c b/block/partitions/efi.c index b64bfdd432..7ca5c4c374 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -124,19 +124,17 @@ efi_crc32(const void *buf, unsigned long len) /** * last_lba(): return number of last logical block of device - * @bdev: block device + * @disk: block device * * Description: Returns last LBA value on success, 0 on error. * This is stored (by sd and ide-geometry) in * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. */ -static u64 last_lba(struct block_device *bdev) +static u64 last_lba(struct gendisk *disk) { - if (!bdev || !bdev->bd_inode) - return 0; - return div_u64(bdev->bd_inode->i_size, - bdev_logical_block_size(bdev)) - 1ULL; + return div_u64(disk->part0->bd_inode->i_size, + queue_logical_block_size(disk->queue)) - 1ULL; } static inline int pmbr_part_valid(gpt_mbr_record *part) @@ -231,17 +229,17 @@ static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) * @buffer: destination buffer * @count: bytes to read * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; - struct block_device *bdev = state->bdev; - sector_t n = lba * (bdev_logical_block_size(bdev) / 512); + sector_t n = lba * + (queue_logical_block_size(state->disk->queue) / 512); - if (!buffer || lba > last_lba(bdev)) + if (!buffer || lba > last_lba(state->disk)) return 0; while (count) { @@ -302,14 +300,14 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, * @lba: the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @state->bdev. + * and fills a GPT header starting at @ from @state->disk. * Note: remember to free gpt when finished with it.
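+ * + * For example, on a drive with 4096-byte logical blocks, ssz below is + * 4096, the header buffer covers exactly one logical block, and + * read_lba() maps GPT LBA 1 (the primary header) to 512-byte sector 8 + * via n = lba * (queue_logical_block_size(q) / 512).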
*/ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, u64 lba) { gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(state->bdev); + unsigned ssz = queue_logical_block_size(state->disk->queue); gpt = kmalloc(ssz, GFP_KERNEL); if (!gpt) @@ -356,10 +354,10 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the GUID Partition Table header size is too big */ if (le32_to_cpu((*gpt)->header_size) > - bdev_logical_block_size(state->bdev)) { + queue_logical_block_size(state->disk->queue)) { pr_debug("GUID Partition Table Header size is too large: %u > %u\n", le32_to_cpu((*gpt)->header_size), - bdev_logical_block_size(state->bdev)); + queue_logical_block_size(state->disk->queue)); goto fail; } @@ -395,7 +393,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the first_usable_lba and last_usable_lba are * within the disk. */ - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), @@ -587,13 +585,15 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; - sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; + struct gendisk *disk = state->disk; + const struct block_device_operations *fops = disk->fops; + sector_t total_sectors = get_capacity(state->disk); u64 lastlba; if (!ptes) return 0; - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. */ legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); @@ -621,6 +621,16 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, if (!good_agpt && force_gpt) good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); + if (!good_agpt && force_gpt && fops->alternative_gpt_sector) { + sector_t agpt_sector; + int err; + + err = fops->alternative_gpt_sector(disk, &agpt_sector); + if (!err) + good_agpt = is_gpt_valid(state, agpt_sector, + &agpt, &aptes); + } + /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) goto fail; @@ -682,7 +692,7 @@ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) } /** - * efi_partition(struct parsed_partitions *state) + * efi_partition - scan for GPT partitions * @state: disk parsed partitions * * Description: called from check.c, if the disk contains GPT @@ -705,7 +715,7 @@ int efi_partition(struct parsed_partitions *state) gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; - unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + unsigned ssz = queue_logical_block_size(state->disk->queue) / 512; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -722,7 +732,7 @@ int efi_partition(struct parsed_partitions *state) u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) + if (!is_pte_valid(&ptes[i], last_lba(state->disk))) continue; put_partition(state, i+1, start * ssz, size * ssz); diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 4b044e620d..9bca396aef 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -290,8 +290,8 @@ static int find_cms1_partitions(struct parsed_partitions *state, int ibm_partition(struct parsed_partitions 
*state) { int (*fn)(struct gendisk *disk, dasd_information2_t *info); - struct block_device *bdev = state->bdev; - struct gendisk *disk = bdev->bd_disk; + struct gendisk *disk = state->disk; + struct block_device *bdev = disk->part0; int blocksize, res; loff_t i_size, offset, size; dasd_information2_t *info; diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index cc86534c80..27f6c7d9c7 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) * * Copyright (C) 2001,2002 Richard Russon @@ -304,7 +304,7 @@ static bool ldm_validate_privheads(struct parsed_partitions *state, } } - num_sects = state->bdev->bd_inode->i_size >> 9; + num_sects = get_capacity(state->disk); if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { @@ -339,11 +339,11 @@ static bool ldm_validate_privheads(struct parsed_partitions *state, /** * ldm_validate_tocblocks - Validate the table of contents and its backups * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database + * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * Find and compare the four tables of contents of the LDM Database stored on - * @state->bdev and return the parsed information into @toc1. + * @state->disk and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * @@ -486,8 +486,8 @@ static bool ldm_validate_vmdb(struct parsed_partitions *state, * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. * - * Return: 'true' @state->bdev is a dynamic disk - * 'false' @state->bdev is not a dynamic disk, or an error occurred + * Return: 'true' @state->disk is a dynamic disk + * 'false' @state->disk is not a dynamic disk, or an error occurred */ static bool ldm_validate_partition_table(struct parsed_partitions *state) { @@ -1340,7 +1340,7 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database + * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, @@ -1432,10 +1432,10 @@ static void ldm_free_vblks (struct list_head *lh) * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. 
* - * Return: 1 Success, @state->bdev is a dynamic disk and we handled it - * 0 Success, @state->bdev is not a dynamic disk + * Return: 1 Success, @state->disk is a dynamic disk and we handled it + * 0 Success, @state->disk is not a dynamic disk * -1 An error occurred before enough information had been read - * Or @state->bdev is a dynamic disk, but it may be corrupted + * Or @state->disk is a dynamic disk, but it may be corrupted */ int ldm_partition(struct parsed_partitions *state) { diff --git a/block/partitions/mac.c b/block/partitions/mac.c index b609533563..7b521df00a 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -133,7 +133,7 @@ int mac_partition(struct parsed_partitions *state) } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) - note_bootable_part(state->bdev->bd_dev, found_root, + note_bootable_part(state->disk->part0->bd_dev, found_root, found_root_goodness); #endif diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index c94de377c5..b5d5c229cc 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -135,11 +135,12 @@ static void parse_extended(struct parsed_partitions *state, Sector sect; unsigned char *data; sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; int loopct = 0; /* number of links followed without finding a data partition */ int i; + sector_size = queue_logical_block_size(state->disk->queue) / 512; this_sector = first_sector; this_size = first_size; @@ -579,7 +580,7 @@ static struct { int msdos_partition(struct parsed_partitions *state) { - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; Sector sect; unsigned char *data; struct msdos_partition *p; @@ -587,6 +588,7 @@ int msdos_partition(struct parsed_partitions *state) int slot; u32 disksig; + sector_size = queue_logical_block_size(state->disk->queue) / 512; data = read_part_sector(state, 0, &sect); if (!data) return -1; @@ -620,7 +622,7 @@ int msdos_partition(struct parsed_partitions *state) for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { /* - * Even without a valid boot inidicator value + * Even without a valid boot indicator value * its still possible this is valid FAT filesystem * without a partition table.
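+ * (A valid boot indicator is 0x00 for an inactive entry or 0x80 for + * an active/bootable one; any other byte in that slot is usually a + * sign that sector 0 holds a FAT boot sector rather than an MBR.)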
*/ diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 4273f1bb05..9cc6b8c1ee 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -43,7 +43,6 @@ int sgi_partition(struct parsed_partitions *state) Sector sect; struct sgi_disklabel *label; struct sgi_partition *p; - char b[BDEVNAME_SIZE]; label = read_part_sector(state, 0, &sect); if (!label) @@ -52,7 +51,7 @@ int sgi_partition(struct parsed_partitions *state) magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { /*printk("Dev %s SGI disklabel: bad magic %08x\n", - bdevname(bdev, b), be32_to_cpu(magic));*/ + state->disk->disk_name, be32_to_cpu(magic));*/ put_dev_sector(sect); return 0; } @@ -63,7 +62,7 @@ int sgi_partition(struct parsed_partitions *state) } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 47dc53eccf..ddf9e6def4 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -65,7 +65,6 @@ int sun_partition(struct parsed_partitions *state) } * label; struct sun_partition *p; unsigned long spc; - char b[BDEVNAME_SIZE]; int use_vtoc; int nparts; @@ -76,7 +75,7 @@ int sun_partition(struct parsed_partitions *state) p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { /* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - bdevname(bdev, b), be16_to_cpu(label->magic)); */ + state->disk->disk_name, be16_to_cpu(label->magic)); */ put_dev_sector(sect); return 0; } @@ -86,7 +85,7 @@ int sun_partition(struct parsed_partitions *state) csum ^= *ush--; if (csum) { printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } diff --git a/block/t10-pi.c b/block/t10-pi.c index d910534b3a..00c203b2a9 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -147,11 +147,10 @@ static void t10_pi_type1_prepare(struct request *rq) break; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -161,8 +160,7 @@ static void t10_pi_type1_prepare(struct request *rq) ref_tag++; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } bip->bip_flags |= BIP_MAPPED_INTEGRITY; @@ -195,11 +193,10 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) struct bvec_iter iter; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -210,8 +207,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) intervals--; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } } } diff --git a/drivers/Kconfig b/drivers/Kconfig index 30d2db37cc..0d399ddaa1 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -17,6 +17,8 @@ source "drivers/bus/Kconfig" source "drivers/connector/Kconfig" +source "drivers/firmware/Kconfig" + source "drivers/gnss/Kconfig" source "drivers/mtd/Kconfig" diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 0a0a982f9c..c0e77c1c8e 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -36,7 +36,7 @@ struct
acpi_gtdt_descriptor { static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata; -static inline void *next_platform_timer(void *platform_timer) +static inline __init void *next_platform_timer(void *platform_timer) { struct acpi_gtdt_header *gh = platform_timer; diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index a3ef6cce64..7dd80acf92 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -3007,6 +3007,18 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, ndr_desc->target_node = NUMA_NO_NODE; } + /* Fallback to address based numa information if node lookup failed */ + if (ndr_desc->numa_node == NUMA_NO_NODE) { + ndr_desc->numa_node = memory_add_physaddr_to_nid(spa->address); + dev_info(acpi_desc->dev, "changing numa node from %d to %d for nfit region [%pa-%pa]", + NUMA_NO_NODE, ndr_desc->numa_node, &res.start, &res.end); + } + if (ndr_desc->target_node == NUMA_NO_NODE) { + ndr_desc->target_node = phys_to_target_node(spa->address); + dev_info(acpi_desc->dev, "changing target node from %d to %d for nfit region [%pa-%pa]", + NUMA_NO_NODE, ndr_desc->target_node, &res.start, &res.end); + } + /* * Persistence domain bits are hierarchical, if * ACPI_NFIT_CAPABILITY_CACHE_FLUSH is set then diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c index bd92b549fd..1c48358b43 100644 --- a/drivers/acpi/x86/s2idle.c +++ b/drivers/acpi/x86/s2idle.c @@ -371,7 +371,7 @@ static int lps0_device_attach(struct acpi_device *adev, return 0; if (acpi_s2idle_vendor_amd()) { - /* AMD0004, AMDI0005: + /* AMD0004, AMD0005, AMDI0005: * - Should use rev_id 0x0 * - function mask > 0x3: Should use AMD method, but has off by one bug * - function mask = 0x3: Should use Microsoft method @@ -390,6 +390,7 @@ static int lps0_device_attach(struct acpi_device *adev, ACPI_LPS0_DSM_UUID_MICROSOFT, 0, &lps0_dsm_guid_microsoft); if (lps0_dsm_func_mask > 0x3 && (!strcmp(hid, "AMD0004") || + !strcmp(hid, "AMD0005") || !strcmp(hid, "AMDI0005"))) { lps0_dsm_func_mask = (lps0_dsm_func_mask << 1) | 0x1; acpi_handle_debug(adev->handle, "_DSM UUID %s: Adjusted function mask: 0x%x\n", diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index b2f5520882..0910441321 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -440,10 +440,7 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, hpriv->phy_regulator = devm_regulator_get(dev, "phy"); if (IS_ERR(hpriv->phy_regulator)) { rc = PTR_ERR(hpriv->phy_regulator); - if (rc == -EPROBE_DEFER) - goto err_out; - rc = 0; - hpriv->phy_regulator = NULL; + goto err_out; } if (flags & AHCI_PLATFORM_GET_RESETS) { diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c index c3e6592712..0a8bf09a5c 100644 --- a/drivers/ata/pata_legacy.c +++ b/drivers/ata/pata_legacy.c @@ -352,7 +352,8 @@ static unsigned int pdc_data_xfer_vlb(struct ata_queued_cmd *qc, iowrite32_rep(ap->ioaddr.data_addr, buf, buflen >> 2); if (unlikely(slop)) { - __le32 pad; + __le32 pad = 0; + if (rw == READ) { pad = cpu_to_le32(ioread32(ap->ioaddr.data_addr)); memcpy(buf + buflen - slop, &pad, slop); @@ -742,7 +743,8 @@ static unsigned int vlb32_data_xfer(struct ata_queued_cmd *qc, ioread32_rep(ap->ioaddr.data_addr, buf, buflen >> 2); if (unlikely(slop)) { - __le32 pad; + __le32 pad = 0; + if (rw == WRITE) { memcpy(&pad, buf + buflen - slop, slop); iowrite32(le32_to_cpu(pad), ap->ioaddr.data_addr); diff --git a/drivers/base/core.c b/drivers/base/core.c index
e65dd803a4..249da49658 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -95,12 +95,29 @@ int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup) list_add(&link->s_hook, &sup->consumers); list_add(&link->c_hook, &con->suppliers); + pr_debug("%pfwP Linked as a fwnode consumer to %pfwP\n", + con, sup); out: mutex_unlock(&fwnode_link_lock); return ret; } +/** + * __fwnode_link_del - Delete a link between two fwnode_handles. + * @link: the fwnode_link to be deleted + * + * The fwnode_link_lock needs to be held when this function is called. + */ +static void __fwnode_link_del(struct fwnode_link *link) +{ + pr_debug("%pfwP Dropping the fwnode link to %pfwP\n", + link->consumer, link->supplier); + list_del(&link->s_hook); + list_del(&link->c_hook); + kfree(link); +} + /** * fwnode_links_purge_suppliers - Delete all supplier links of fwnode_handle. * @fwnode: fwnode whose supplier links need to be deleted @@ -112,11 +129,8 @@ static void fwnode_links_purge_suppliers(struct fwnode_handle *fwnode) struct fwnode_link *link, *tmp; mutex_lock(&fwnode_link_lock); - list_for_each_entry_safe(link, tmp, &fwnode->suppliers, c_hook) { - list_del(&link->s_hook); - list_del(&link->c_hook); - kfree(link); - } + list_for_each_entry_safe(link, tmp, &fwnode->suppliers, c_hook) + __fwnode_link_del(link); mutex_unlock(&fwnode_link_lock); } @@ -131,11 +145,8 @@ static void fwnode_links_purge_consumers(struct fwnode_handle *fwnode) struct fwnode_link *link, *tmp; mutex_lock(&fwnode_link_lock); - list_for_each_entry_safe(link, tmp, &fwnode->consumers, s_hook) { - list_del(&link->s_hook); - list_del(&link->c_hook); - kfree(link); - } + list_for_each_entry_safe(link, tmp, &fwnode->consumers, s_hook) + __fwnode_link_del(link); mutex_unlock(&fwnode_link_lock); } @@ -676,7 +687,8 @@ struct device_link *device_link_add(struct device *consumer, { struct device_link *link; - if (!consumer || !supplier || flags & ~DL_ADD_VALID_FLAGS || + if (!consumer || !supplier || consumer == supplier || + flags & ~DL_ADD_VALID_FLAGS || (flags & DL_FLAG_STATELESS && flags & DL_MANAGED_LINK_FLAGS) || (flags & DL_FLAG_SYNC_STATE_ONLY && (flags & ~DL_FLAG_INFERRED) != DL_FLAG_SYNC_STATE_ONLY) || @@ -975,6 +987,7 @@ int device_links_check_suppliers(struct device *dev) { struct device_link *link; int ret = 0; + struct fwnode_handle *sup_fw; /* * Device waiting for supplier to become available is not allowed to @@ -983,10 +996,11 @@ int device_links_check_suppliers(struct device *dev) mutex_lock(&fwnode_link_lock); if (dev->fwnode && !list_empty(&dev->fwnode->suppliers) && !fw_devlink_is_permissive()) { - dev_dbg(dev, "probe deferral - wait for supplier %pfwP\n", - list_first_entry(&dev->fwnode->suppliers, - struct fwnode_link, - c_hook)->supplier); + sup_fw = list_first_entry(&dev->fwnode->suppliers, + struct fwnode_link, + c_hook)->supplier; + dev_err_probe(dev, -EPROBE_DEFER, "wait for supplier %pfwP\n", + sup_fw); mutex_unlock(&fwnode_link_lock); return -EPROBE_DEFER; } @@ -1001,8 +1015,9 @@ int device_links_check_suppliers(struct device *dev) if (link->status != DL_STATE_AVAILABLE && !(link->flags & DL_FLAG_SYNC_STATE_ONLY)) { device_links_missing_supplier(dev); - dev_dbg(dev, "probe deferral - supplier %s not ready\n", - dev_name(link->supplier)); + dev_err_probe(dev, -EPROBE_DEFER, + "supplier %s not ready\n", + dev_name(link->supplier)); ret = -EPROBE_DEFER; break; } @@ -1722,6 +1737,25 @@ static int fw_devlink_create_devlink(struct device *con, struct device *sup_dev; int ret = 0; + /* + * In some cases, a 
device P might also be a supplier to its child node + * C. However, this would defer the probe of C until the probe of P + * completes successfully. This is perfectly fine in the device driver + * model. device_add() doesn't guarantee probe completion of the device + * by the time it returns. + * + * However, there are a few drivers that assume C will finish probing + * as soon as it's added and before P finishes probing. So, we provide + * a flag to let fw_devlink know not to delay the probe of C until the + * probe of P completes successfully. + * + * When such a flag is set, we can't create device links where P is the + * supplier of C as that would delay the probe of C. + */ + if (sup_handle->flags & FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD && + fwnode_is_ancestor_of(sup_handle, con->fwnode)) + return -EINVAL; + sup_dev = get_dev_from_fwnode(sup_handle); if (sup_dev) { /* @@ -1772,14 +1806,21 @@ static int fw_devlink_create_devlink(struct device *con, * be broken by applying logic. Check for these types of cycles and * break them so that devices in the cycle probe properly. * - * If the supplier's parent is dependent on the consumer, then - * the consumer-supplier dependency is a false dependency. So, - * treat it as an invalid link. + * If the supplier's parent is dependent on the consumer, then the + * consumer and supplier have a cyclic dependency. Since fw_devlink + * can't tell which of the inferred dependencies are incorrect, don't + * enforce probe ordering between any of the devices in this cyclic + * dependency. Do this by relaxing all the fw_devlink device links in + * this cycle and by treating the fwnode link between the consumer and + * the supplier as an invalid dependency. */ sup_dev = fwnode_get_next_parent_dev(sup_handle); if (sup_dev && device_is_dependent(con, sup_dev)) { - dev_dbg(con, "Not linking to %pfwP - False link\n", - sup_handle); + dev_info(con, "Fixing up cyclic dependency with %pfwP (%s)\n", + sup_handle, dev_name(sup_dev)); + device_links_write_lock(); + fw_devlink_relax_cycle(con, sup_dev); + device_links_write_unlock(); ret = -EINVAL; } else { /* @@ -1858,9 +1899,7 @@ static void __fw_devlink_link_to_consumers(struct device *dev) if (!own_link || ret == -EAGAIN) continue; - list_del(&link->s_hook); - list_del(&link->c_hook); - kfree(link); + __fwnode_link_del(link); } } @@ -1912,9 +1951,7 @@ static void __fw_devlink_link_to_suppliers(struct device *dev, if (!own_link || ret == -EAGAIN) continue; - list_del(&link->s_hook); - list_del(&link->c_hook); - kfree(link); + __fwnode_link_del(link); /* If no device link was created, nothing more to do. 
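For context, the dev_err_probe() calls introduced above in device_links_check_suppliers() replace plain dev_dbg() so that the reason for an -EPROBE_DEFER is logged consistently and recorded in debugfs (devices_deferred). A minimal sketch of that idiom, with a hypothetical probe function and a made-up "vref" supply name that are not part of this patch:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/platform_device.h>
#include <linux/regulator/consumer.h>

static int demo_probe(struct platform_device *pdev)
{
        struct regulator *vref;

        /* Hypothetical resource lookup; "vref" is an illustrative name. */
        vref = devm_regulator_get(&pdev->dev, "vref");
        if (IS_ERR(vref))
                /* One call: log (quietly for -EPROBE_DEFER), record the
                 * deferral reason, and return the error code. */
                return dev_err_probe(&pdev->dev, PTR_ERR(vref),
                                     "failed to get vref supply\n");

        return 0;
}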
*/ if (ret) diff --git a/drivers/base/test/Makefile b/drivers/base/test/Makefile index 2f15fae862..7f76fee6f9 100644 --- a/drivers/base/test/Makefile +++ b/drivers/base/test/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_TEST_ASYNC_DRIVER_PROBE) += test_async_driver_probe.o -obj-$(CONFIG_KUNIT_DRIVER_PE_TEST) += property-entry-test.o -CFLAGS_REMOVE_property-entry-test.o += -fplugin-arg-structleak_plugin-byref -fplugin-arg-structleak_plugin-byref-all +obj-$(CONFIG_DRIVER_PE_KUNIT_TEST) += property-entry-test.o +CFLAGS_property-entry-test.o += $(DISABLE_STRUCTLEAK_PLUGIN) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 58ec167aa0..530b312402 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -373,10 +373,22 @@ static int brd_alloc(int i) struct gendisk *disk; char buf[DISK_NAME_LEN]; + mutex_lock(&brd_devices_mutex); + list_for_each_entry(brd, &brd_devices, brd_list) { + if (brd->brd_number == i) { + mutex_unlock(&brd_devices_mutex); + return -EEXIST; + } + } brd = kzalloc(sizeof(*brd), GFP_KERNEL); - if (!brd) + if (!brd) { + mutex_unlock(&brd_devices_mutex); return -ENOMEM; + } brd->brd_number = i; + list_add_tail(&brd->brd_list, &brd_devices); + mutex_unlock(&brd_devices_mutex); + spin_lock_init(&brd->brd_lock); INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); @@ -411,37 +423,30 @@ static int brd_alloc(int i) blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); add_disk(disk); - list_add_tail(&brd->brd_list, &brd_devices); return 0; out_free_dev: + mutex_lock(&brd_devices_mutex); + list_del(&brd->brd_list); + mutex_unlock(&brd_devices_mutex); kfree(brd); return -ENOMEM; } static void brd_probe(dev_t dev) { - int i = MINOR(dev) / max_part; - struct brd_device *brd; - - mutex_lock(&brd_devices_mutex); - list_for_each_entry(brd, &brd_devices, brd_list) { - if (brd->brd_number == i) - goto out_unlock; - } - - brd_alloc(i); -out_unlock: - mutex_unlock(&brd_devices_mutex); + brd_alloc(MINOR(dev) / max_part); } static void brd_del_one(struct brd_device *brd) { - list_del(&brd->brd_list); del_gendisk(brd->brd_disk); blk_cleanup_disk(brd->brd_disk); brd_free_pages(brd); + mutex_lock(&brd_devices_mutex); + list_del(&brd->brd_list); + mutex_unlock(&brd_devices_mutex); kfree(brd); } @@ -491,25 +496,21 @@ static int __init brd_init(void) brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); - mutex_lock(&brd_devices_mutex); for (i = 0; i < rd_nr; i++) { err = brd_alloc(i); if (err) goto out_free; } - mutex_unlock(&brd_devices_mutex); - pr_info("brd: module loaded\n"); return 0; out_free: + unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); debugfs_remove_recursive(brd_debugfs_dir); list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); - mutex_unlock(&brd_devices_mutex); - unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); pr_info("brd: module NOT loaded !!!\n"); return err; @@ -519,13 +520,12 @@ static void __exit brd_exit(void) { struct brd_device *brd, *next; + unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); debugfs_remove_recursive(brd_debugfs_dir); list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); - unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); - pr_info("brd: module unloaded\n"); } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 5170a63077..1183f7872b 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -97,13 +97,18 @@ struct nbd_config { atomic_t recv_threads; wait_queue_head_t recv_wq; - loff_t blksize; + unsigned int 
blksize_bits; loff_t bytesize; #if IS_ENABLED(CONFIG_DEBUG_FS) struct dentry *dbg_dir; #endif }; +static inline unsigned int nbd_blksize(struct nbd_config *config) +{ + return 1u << config->blksize_bits; +} + struct nbd_device { struct blk_mq_tag_set tag_set; @@ -146,7 +151,7 @@ static struct dentry *nbd_dbg_dir; #define NBD_MAGIC 0x68797548 -#define NBD_DEF_BLKSIZE 1024 +#define NBD_DEF_BLKSIZE_BITS 10 static unsigned int nbds_max = 16; static int max_part = 16; @@ -317,12 +322,12 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize) { if (!blksize) - blksize = NBD_DEF_BLKSIZE; + blksize = 1u << NBD_DEF_BLKSIZE_BITS; if (blksize < 512 || blksize > PAGE_SIZE || !is_power_of_2(blksize)) return -EINVAL; nbd->config->bytesize = bytesize; - nbd->config->blksize = blksize; + nbd->config->blksize_bits = __ffs(blksize); if (!nbd->task_recv) return 0; @@ -1337,7 +1342,7 @@ static int nbd_start_device(struct nbd_device *nbd) args->index = i; queue_work(nbd->recv_workq, &args->work); } - return nbd_set_size(nbd, config->bytesize, config->blksize); + return nbd_set_size(nbd, config->bytesize, nbd_blksize(config)); } static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev) @@ -1406,11 +1411,11 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_SET_BLKSIZE: return nbd_set_size(nbd, config->bytesize, arg); case NBD_SET_SIZE: - return nbd_set_size(nbd, arg, config->blksize); + return nbd_set_size(nbd, arg, nbd_blksize(config)); case NBD_SET_SIZE_BLOCKS: - if (check_mul_overflow((loff_t)arg, config->blksize, &bytesize)) + if (check_shl_overflow(arg, config->blksize_bits, &bytesize)) return -EINVAL; - return nbd_set_size(nbd, bytesize, config->blksize); + return nbd_set_size(nbd, bytesize, nbd_blksize(config)); case NBD_SET_TIMEOUT: nbd_set_cmd_timeout(nbd, arg); return 0; @@ -1476,7 +1481,7 @@ static struct nbd_config *nbd_alloc_config(void) atomic_set(&config->recv_threads, 0); init_waitqueue_head(&config->recv_wq); init_waitqueue_head(&config->conn_wait); - config->blksize = NBD_DEF_BLKSIZE; + config->blksize_bits = NBD_DEF_BLKSIZE_BITS; atomic_set(&config->live_connections, 0); try_module_get(THIS_MODULE); return config; @@ -1604,7 +1609,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd) debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops); debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize); debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout); - debugfs_create_u64("blocksize", 0444, dir, &config->blksize); + debugfs_create_u32("blocksize_bits", 0444, dir, &config->blksize_bits); debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops); return 0; @@ -1826,7 +1831,7 @@ nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = { static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) { struct nbd_config *config = nbd->config; - u64 bsize = config->blksize; + u64 bsize = nbd_blksize(config); u64 bytes = config->bytesize; if (info->attrs[NBD_ATTR_SIZE_BYTES]) @@ -1835,7 +1840,7 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); - if (bytes != config->bytesize || bsize != config->blksize) + if (bytes != config->bytesize || bsize != nbd_blksize(config)) return nbd_set_size(nbd, bytes, bsize); return 0; } diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 4b93fd83bf..44e45af00e 100644 
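The nbd conversion above stores the block size as its base-2 logarithm instead of a loff_t, which turns the size computation into a shift that can be overflow-checked. A standalone sketch of that arithmetic, using the same kernel helpers; the 4096 upper bound stands in for PAGE_SIZE and the function name is invented:

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/log2.h>
#include <linux/overflow.h>
#include <linux/types.h>

static int demo_size_in_bytes(loff_t nr_blocks, unsigned int blksize,
                              loff_t *bytesize)
{
        unsigned int blksize_bits;

        if (blksize < 512 || blksize > 4096 || !is_power_of_2(blksize))
                return -EINVAL;

        blksize_bits = __ffs(blksize);  /* e.g. 1024 -> 10 */

        /* nr_blocks << blksize_bits, rejecting values that overflow */
        if (check_shl_overflow(nr_blocks, blksize_bits, bytesize))
                return -EINVAL;

        return 0;
}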
--- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -71,8 +71,10 @@ static int rnbd_clt_parse_map_options(const char *buf, size_t max_path_cnt, int opt_mask = 0; int token; int ret = -EINVAL; - int i, dest_port, nr_poll_queues; + int nr_poll_queues = 0; + int dest_port = 0; int p_cnt = 0; + int i; options = kstrdup(buf, GFP_KERNEL); if (!options) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 9b3bd083b4..303caf2d17 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -689,28 +689,6 @@ static const struct blk_mq_ops virtio_mq_ops = { static unsigned int virtblk_queue_depth; module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); -static int virtblk_validate(struct virtio_device *vdev) -{ - u32 blk_size; - - if (!vdev->config->get) { - dev_err(&vdev->dev, "%s failure: config access disabled\n", - __func__); - return -EINVAL; - } - - if (!virtio_has_feature(vdev, VIRTIO_BLK_F_BLK_SIZE)) - return 0; - - blk_size = virtio_cread32(vdev, - offsetof(struct virtio_blk_config, blk_size)); - - if (blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE) - __virtio_clear_bit(vdev, VIRTIO_BLK_F_BLK_SIZE); - - return 0; -} - static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -722,6 +700,12 @@ static int virtblk_probe(struct virtio_device *vdev) u8 physical_block_exp, alignment_offset; unsigned int queue_depth; + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS), GFP_KERNEL); if (err < 0) @@ -836,14 +820,6 @@ static int virtblk_probe(struct virtio_device *vdev) else blk_size = queue_logical_block_size(q); - if (blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE) { - dev_err(&vdev->dev, - "block size is changed unexpectedly, now is %u\n", - blk_size); - err = -EINVAL; - goto out_cleanup_disk; - } - /* Use topology information if available */ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, physical_block_exp, @@ -1009,7 +985,6 @@ static struct virtio_driver virtio_blk = { .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, - .validate = virtblk_validate, .probe = virtblk_probe, .remove = virtblk_remove, .config_changed = virtblk_config_changed, diff --git a/drivers/bus/Kconfig b/drivers/bus/Kconfig index a5b96f3aad..a4cf3d692d 100644 --- a/drivers/bus/Kconfig +++ b/drivers/bus/Kconfig @@ -152,18 +152,6 @@ config QCOM_EBI2 Interface 2, which can be used to connect things like NAND Flash, SRAM, ethernet adapters, FPGAs and LCD displays. -config SIMPLE_PM_BUS - tristate "Simple Power-Managed Bus Driver" - depends on OF && PM - help - Driver for transparent busses that don't need a real driver, but - where the bus controller is part of a PM domain, or under the control - of a functional clock, and thus relies on runtime PM for managing - this PM domain and/or clock. - An example of such a bus controller is the Renesas Bus State - Controller (BSC, sometimes called "LBSC within Bus Bridge", or - "External Bus Interface") as found on several Renesas ARM SoCs. 
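The simple-pm-bus.c hunk that follows matches several generic compatibles but only acts on "simple-pm-bus", using the of_device_id .data field as a sentinel. A minimal sketch of that pattern under invented names (the real driver's marker is its ONLY_BUS define):

#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/platform_device.h>

#define DEMO_ONLY_BUS ((void *)1)

static const struct of_device_id demo_of_match[] = {
        { .compatible = "demo,pm-bus" },                            /* full handling */
        { .compatible = "demo,plain-bus", .data = DEMO_ONLY_BUS },  /* match only */
        { /* sentinel */ }
};

static int demo_probe(struct platform_device *pdev)
{
        const void *data = of_device_get_match_data(&pdev->dev);

        if (data == DEMO_ONLY_BUS)
                return 0;       /* transparent bus: nothing to manage */

        /* ... enable runtime PM, populate child devices, etc. ... */
        return 0;
}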
- config SUN50I_DE2_BUS bool "Allwinner A64 DE2 Bus Driver" default ARM64 diff --git a/drivers/bus/Makefile b/drivers/bus/Makefile index 1c29c5e8ff..52c2f35a26 100644 --- a/drivers/bus/Makefile +++ b/drivers/bus/Makefile @@ -27,7 +27,7 @@ obj-$(CONFIG_OMAP_OCP2SCP) += omap-ocp2scp.o obj-$(CONFIG_QCOM_EBI2) += qcom-ebi2.o obj-$(CONFIG_SUN50I_DE2_BUS) += sun50i-de2.o obj-$(CONFIG_SUNXI_RSB) += sunxi-rsb.o -obj-$(CONFIG_SIMPLE_PM_BUS) += simple-pm-bus.o +obj-$(CONFIG_OF) += simple-pm-bus.o obj-$(CONFIG_TEGRA_ACONNECT) += tegra-aconnect.o obj-$(CONFIG_TEGRA_GMI) += tegra-gmi.o obj-$(CONFIG_TI_PWMSS) += ti-pwmss.o diff --git a/drivers/bus/simple-pm-bus.c b/drivers/bus/simple-pm-bus.c index 01a3d0cd08..6b8d6257ed 100644 --- a/drivers/bus/simple-pm-bus.c +++ b/drivers/bus/simple-pm-bus.c @@ -13,11 +13,36 @@ #include #include - static int simple_pm_bus_probe(struct platform_device *pdev) { - const struct of_dev_auxdata *lookup = dev_get_platdata(&pdev->dev); - struct device_node *np = pdev->dev.of_node; + const struct device *dev = &pdev->dev; + const struct of_dev_auxdata *lookup = dev_get_platdata(dev); + struct device_node *np = dev->of_node; + const struct of_device_id *match; + + /* + * Allow user to use driver_override to bind this driver to a + * transparent bus device which has a different compatible string + * that's not listed in simple_pm_bus_of_match. We don't want to do any + * of the simple-pm-bus tasks for these devices, so return early. + */ + if (pdev->driver_override) + return 0; + + match = of_match_device(dev->driver->of_match_table, dev); + /* + * These are transparent bus devices (not simple-pm-bus matches) that + * have their child nodes populated automatically. So, don't need to + * do anything more. We only match with the device if this driver is + * the most specific match because we don't want to incorrectly bind to + * a device that has a more specific driver. + */ + if (match && match->data) { + if (of_property_match_string(np, "compatible", match->compatible) == 0) + return 0; + else + return -ENODEV; + } dev_dbg(&pdev->dev, "%s\n", __func__); @@ -31,14 +56,25 @@ static int simple_pm_bus_probe(struct platform_device *pdev) static int simple_pm_bus_remove(struct platform_device *pdev) { + const void *data = of_device_get_match_data(&pdev->dev); + + if (pdev->driver_override || data) + return 0; + dev_dbg(&pdev->dev, "%s\n", __func__); pm_runtime_disable(&pdev->dev); return 0; } +#define ONLY_BUS ((void *) 1) /* Match if the device is only a bus. 
*/ + static const struct of_device_id simple_pm_bus_of_match[] = { { .compatible = "simple-pm-bus", }, + { .compatible = "simple-bus", .data = ONLY_BUS }, + { .compatible = "simple-mfd", .data = ONLY_BUS }, + { .compatible = "isa", .data = ONLY_BUS }, + { .compatible = "arm,amba-bus", .data = ONLY_BUS }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, simple_pm_bus_of_match); diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index a51c2a8fee..6a8b7fb5be 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -1464,6 +1464,9 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { /* Quirks that need to be set based on detected module */ SYSC_QUIRK("aess", 0, 0, 0x10, -ENODEV, 0x40000000, 0xffffffff, SYSC_MODULE_QUIRK_AESS), + /* Errata i893 handling for dra7 dcan1 and 2 */ + SYSC_QUIRK("dcan", 0x4ae3c000, 0x20, -ENODEV, -ENODEV, 0xa3170504, 0xffffffff, + SYSC_QUIRK_CLKDM_NOAUTO), SYSC_QUIRK("dcan", 0x48480000, 0x20, -ENODEV, -ENODEV, 0xa3170504, 0xffffffff, SYSC_QUIRK_CLKDM_NOAUTO), SYSC_QUIRK("dss", 0x4832a000, 0, 0x10, 0x14, 0x00000020, 0xffffffff, @@ -2954,6 +2957,7 @@ static int sysc_init_soc(struct sysc *ddata) break; case SOC_AM3: sysc_add_disabled(0x48310000); /* rng */ + break; default: break; } diff --git a/drivers/char/broadcom/vcio.c b/drivers/char/broadcom/vcio.c index a39155a94f..ac31461722 100644 --- a/drivers/char/broadcom/vcio.c +++ b/drivers/char/broadcom/vcio.c @@ -1,6 +1,7 @@ /* * Copyright (C) 2010 Broadcom * Copyright (C) 2015 Noralf Trønnes + * Copyright (C) 2021 Raspberry Pi (Trading) Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -8,8 +9,6 @@ * */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include @@ -19,24 +18,22 @@ #include #include #include +#include #include -#define MBOX_CHAN_PROPERTY 8 - +#define MODULE_NAME "vcio" #define VCIO_IOC_MAGIC 100 #define IOCTL_MBOX_PROPERTY _IOWR(VCIO_IOC_MAGIC, 0, char *) #ifdef CONFIG_COMPAT #define IOCTL_MBOX_PROPERTY32 _IOWR(VCIO_IOC_MAGIC, 0, compat_uptr_t) #endif -static struct { - dev_t devt; - struct cdev cdev; - struct class *class; +struct vcio_data { struct rpi_firmware *fw; -} vcio; + struct miscdevice misc_dev; +}; -static int vcio_user_property_list(void *user) +static int vcio_user_property_list(struct vcio_data *vcio, void *user) { u32 *buf, size; int ret; @@ -55,7 +52,7 @@ static int vcio_user_property_list(void *user) } /* Strip off protocol encapsulation */ - ret = rpi_firmware_property_list(vcio.fw, &buf[2], size - 12); + ret = rpi_firmware_property_list(vcio->fw, &buf[2], size - 12); if (ret) { kfree(buf); return ret; @@ -87,9 +84,12 @@ static int vcio_device_release(struct inode *inode, struct file *file) static long vcio_device_ioctl(struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) { + struct vcio_data *vcio = container_of(file->private_data, + struct vcio_data, misc_dev); + switch (ioctl_num) { case IOCTL_MBOX_PROPERTY: - return vcio_user_property_list((void *)ioctl_param); + return vcio_user_property_list(vcio, (void *)ioctl_param); default: pr_err("unknown ioctl: %x\n", ioctl_num); return -EINVAL; @@ -100,9 +100,12 @@ static long vcio_device_ioctl(struct file *file, unsigned int ioctl_num, static long vcio_device_compat_ioctl(struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) { + struct vcio_data *vcio = container_of(file->private_data, + struct vcio_data, misc_dev); + switch (ioctl_num) { case 
IOCTL_MBOX_PROPERTY32: - return vcio_user_property_list(compat_ptr(ioctl_param)); + return vcio_user_property_list(vcio, compat_ptr(ioctl_param)); default: pr_err("unknown ioctl: %x\n", ioctl_num); return -EINVAL; @@ -119,77 +122,65 @@ const struct file_operations vcio_fops = { .release = vcio_device_release, }; -static int __init vcio_init(void) +static int vcio_probe(struct platform_device *pdev) { - struct device_node *np; - static struct device *dev; - int ret; + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + struct device_node *fw_node; + struct rpi_firmware *fw; + struct vcio_data *vcio; - np = of_find_compatible_node(NULL, NULL, - "raspberrypi,bcm2835-firmware"); - if (!of_device_is_available(np)) - return -ENODEV; - - vcio.fw = rpi_firmware_get(np); - if (!vcio.fw) - return -ENODEV; - - ret = alloc_chrdev_region(&vcio.devt, 0, 1, "vcio"); - if (ret) { - pr_err("failed to allocate device number\n"); - return ret; + fw_node = of_get_parent(np); + if (!fw_node) { + dev_err(dev, "Missing firmware node\n"); + return -ENOENT; } - cdev_init(&vcio.cdev, &vcio_fops); - vcio.cdev.owner = THIS_MODULE; - ret = cdev_add(&vcio.cdev, vcio.devt, 1); - if (ret) { - pr_err("failed to register device\n"); - goto err_unregister_chardev; - } + fw = rpi_firmware_get(fw_node); + of_node_put(fw_node); + if (!fw) + return -EPROBE_DEFER; - /* - * Create sysfs entries - * 'bcm2708_vcio' is used for backwards compatibility so we don't break - * userspace. Raspian has a udev rule that changes the permissions. - */ - vcio.class = class_create(THIS_MODULE, "bcm2708_vcio"); - if (IS_ERR(vcio.class)) { - ret = PTR_ERR(vcio.class); - pr_err("failed to create class\n"); - goto err_cdev_del; - } + vcio = devm_kzalloc(dev, sizeof(struct vcio_data), GFP_KERNEL); + if (!vcio) + return -ENOMEM; - dev = device_create(vcio.class, NULL, vcio.devt, NULL, "vcio"); - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - pr_err("failed to create device\n"); - goto err_class_destroy; - } + vcio->fw = fw; + vcio->misc_dev.fops = &vcio_fops; + vcio->misc_dev.minor = MISC_DYNAMIC_MINOR; + vcio->misc_dev.name = "vcio"; + vcio->misc_dev.parent = dev; + return misc_register(&vcio->misc_dev); +} + +static int vcio_remove(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + + misc_deregister(dev_get_drvdata(dev)); return 0; - -err_class_destroy: - class_destroy(vcio.class); -err_cdev_del: - cdev_del(&vcio.cdev); -err_unregister_chardev: - unregister_chrdev_region(vcio.devt, 1); - - return ret; } -module_init(vcio_init); -static void __exit vcio_exit(void) -{ - device_destroy(vcio.class, vcio.devt); - class_destroy(vcio.class); - cdev_del(&vcio.cdev); - unregister_chrdev_region(vcio.devt, 1); -} -module_exit(vcio_exit); +static const struct of_device_id vcio_ids[] = { + { .compatible = "raspberrypi,vcio" }, + { } +}; +MODULE_DEVICE_TABLE(of, vcio_ids); + +static struct platform_driver vcio_driver = { + .driver = { + .name = MODULE_NAME, + .of_match_table = of_match_ptr(vcio_ids), + }, + .probe = vcio_probe, + .remove = vcio_remove, +}; + +module_platform_driver(vcio_driver); MODULE_AUTHOR("Gray Girling"); MODULE_AUTHOR("Noralf Trønnes"); MODULE_DESCRIPTION("Mailbox userspace access"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:rpi-vcio"); diff --git a/drivers/clk/qcom/Kconfig b/drivers/clk/qcom/Kconfig index 0a5596797b..9ef007b3cf 100644 --- a/drivers/clk/qcom/Kconfig +++ b/drivers/clk/qcom/Kconfig @@ -564,6 +564,7 @@ config SM_GCC_6125 config SM_GCC_6350 tristate "SM6350 Global Clock Controller" + 
select QCOM_GDSC help Support for the global clock controller on SM6350 devices. Say Y if you want to use peripheral devices such as UART, diff --git a/drivers/clk/renesas/r9a07g044-cpg.c b/drivers/clk/renesas/r9a07g044-cpg.c index 4c94b94c41..1490446985 100644 --- a/drivers/clk/renesas/r9a07g044-cpg.c +++ b/drivers/clk/renesas/r9a07g044-cpg.c @@ -186,6 +186,8 @@ static struct rzg2l_reset r9a07g044_resets[] = { static const unsigned int r9a07g044_crit_mod_clks[] __initconst = { MOD_CLK_BASE + R9A07G044_GIC600_GICCLK, + MOD_CLK_BASE + R9A07G044_IA55_CLK, + MOD_CLK_BASE + R9A07G044_DMAC_ACLK, }; const struct rzg2l_cpg_info r9a07g044_cpg_info = { diff --git a/drivers/clk/renesas/rzg2l-cpg.c b/drivers/clk/renesas/rzg2l-cpg.c index 3b3b2c3347..761922ea5d 100644 --- a/drivers/clk/renesas/rzg2l-cpg.c +++ b/drivers/clk/renesas/rzg2l-cpg.c @@ -391,7 +391,7 @@ static int rzg2l_mod_clock_is_enabled(struct clk_hw *hw) value = readl(priv->base + CLK_MON_R(clock->off)); - return !(value & bitmask); + return value & bitmask; } static const struct clk_ops rzg2l_mod_clock_ops = { diff --git a/drivers/clk/socfpga/clk-agilex.c b/drivers/clk/socfpga/clk-agilex.c index 242e94c0cf..bf8cd928c2 100644 --- a/drivers/clk/socfpga/clk-agilex.c +++ b/drivers/clk/socfpga/clk-agilex.c @@ -165,13 +165,6 @@ static const struct clk_parent_data mpu_mux[] = { .name = "boot_clk", }, }; -static const struct clk_parent_data s2f_usr0_mux[] = { - { .fw_name = "f2s-free-clk", - .name = "f2s-free-clk", }, - { .fw_name = "boot_clk", - .name = "boot_clk", }, -}; - static const struct clk_parent_data emac_mux[] = { { .fw_name = "emaca_free_clk", .name = "emaca_free_clk", }, @@ -312,8 +305,6 @@ static const struct stratix10_gate_clock agilex_gate_clks[] = { 4, 0x44, 28, 1, 0, 0, 0}, { AGILEX_CS_TIMER_CLK, "cs_timer_clk", NULL, noc_mux, ARRAY_SIZE(noc_mux), 0, 0x24, 5, 0, 0, 0, 0x30, 1, 0}, - { AGILEX_S2F_USER0_CLK, "s2f_user0_clk", NULL, s2f_usr0_mux, ARRAY_SIZE(s2f_usr0_mux), 0, 0x24, - 6, 0, 0, 0, 0, 0, 0}, { AGILEX_EMAC0_CLK, "emac0_clk", NULL, emac_mux, ARRAY_SIZE(emac_mux), 0, 0x7C, 0, 0, 0, 0, 0x94, 26, 0}, { AGILEX_EMAC1_CLK, "emac1_clk", NULL, emac_mux, ARRAY_SIZE(emac_mux), 0, 0x7C, diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c index bb88198c87..aa4e1a5006 100644 --- a/drivers/crypto/ccp/ccp-ops.c +++ b/drivers/crypto/ccp/ccp-ops.c @@ -778,7 +778,7 @@ ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) in_place ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE); if (ret) - goto e_ctx; + goto e_aad; if (in_place) { dst = src; @@ -863,7 +863,7 @@ ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) op.u.aes.size = 0; ret = cmd_q->ccp->vdata->perform->aes(&op); if (ret) - goto e_dst; + goto e_final_wa; if (aes->action == CCP_AES_ACTION_ENCRYPT) { /* Put the ciphered tag after the ciphertext. */ @@ -873,17 +873,19 @@ ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) ret = ccp_init_dm_workarea(&tag, cmd_q, authsize, DMA_BIDIRECTIONAL); if (ret) - goto e_tag; + goto e_final_wa; ret = ccp_set_dm_area(&tag, 0, p_tag, 0, authsize); - if (ret) - goto e_tag; + if (ret) { + ccp_dm_free(&tag); + goto e_final_wa; + } ret = crypto_memneq(tag.address, final_wa.address, authsize) ? 
-EBADMSG : 0; ccp_dm_free(&tag); } -e_tag: +e_final_wa: ccp_dm_free(&final_wa); e_dst: diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index 220a58cf0a..cda7d7162c 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -203,10 +203,7 @@ config INTEL_STRATIX10_RSU Say Y here if you want Intel RSU support. config QCOM_SCM - tristate "Qcom SCM driver" - depends on ARM || ARM64 - depends on HAVE_ARM_SMCCC - select RESET_CONTROLLER + tristate config QCOM_SCM_DOWNLOAD_MODE_DEFAULT bool "Qualcomm download mode enabled by default" diff --git a/drivers/firmware/arm_ffa/bus.c b/drivers/firmware/arm_ffa/bus.c index 00fe595a5b..641a918190 100644 --- a/drivers/firmware/arm_ffa/bus.c +++ b/drivers/firmware/arm_ffa/bus.c @@ -49,6 +49,13 @@ static int ffa_device_probe(struct device *dev) return ffa_drv->probe(ffa_dev); } +static void ffa_device_remove(struct device *dev) +{ + struct ffa_driver *ffa_drv = to_ffa_driver(dev->driver); + + ffa_drv->remove(to_ffa_dev(dev)); +} + static int ffa_device_uevent(struct device *dev, struct kobj_uevent_env *env) { struct ffa_device *ffa_dev = to_ffa_dev(dev); @@ -86,6 +93,7 @@ struct bus_type ffa_bus_type = { .name = "arm_ffa", .match = ffa_device_match, .probe = ffa_device_probe, + .remove = ffa_device_remove, .uevent = ffa_device_uevent, .dev_groups = ffa_device_attributes_groups, }; @@ -127,7 +135,7 @@ static void ffa_release_device(struct device *dev) static int __ffa_devices_unregister(struct device *dev, void *data) { - ffa_release_device(dev); + device_unregister(dev); return 0; } diff --git a/drivers/firmware/arm_scmi/Kconfig b/drivers/firmware/arm_scmi/Kconfig index 7f4d243550..3d7081e848 100644 --- a/drivers/firmware/arm_scmi/Kconfig +++ b/drivers/firmware/arm_scmi/Kconfig @@ -68,7 +68,7 @@ config ARM_SCMI_TRANSPORT_SMC config ARM_SCMI_TRANSPORT_VIRTIO bool "SCMI transport based on VirtIO" - depends on VIRTIO + depends on VIRTIO=y || VIRTIO=ARM_SCMI_PROTOCOL select ARM_SCMI_HAVE_TRANSPORT select ARM_SCMI_HAVE_MSG help diff --git a/drivers/firmware/arm_scmi/virtio.c b/drivers/firmware/arm_scmi/virtio.c index 224577f869..11e8efb713 100644 --- a/drivers/firmware/arm_scmi/virtio.c +++ b/drivers/firmware/arm_scmi/virtio.c @@ -110,18 +110,16 @@ static void scmi_finalize_message(struct scmi_vio_channel *vioch, if (vioch->is_rx) { scmi_vio_feed_vq_rx(vioch, msg); } else { - unsigned long flags; - - spin_lock_irqsave(&vioch->lock, flags); + /* Here IRQs are assumed to be already disabled by the caller */ + spin_lock(&vioch->lock); list_add(&msg->list, &vioch->free_list); - spin_unlock_irqrestore(&vioch->lock, flags); + spin_unlock(&vioch->lock); } } static void scmi_vio_complete_cb(struct virtqueue *vqueue) { unsigned long ready_flags; - unsigned long flags; unsigned int length; struct scmi_vio_channel *vioch; struct scmi_vio_msg *msg; @@ -140,7 +138,8 @@ static void scmi_vio_complete_cb(struct virtqueue *vqueue) goto unlock_ready_out; } - spin_lock_irqsave(&vioch->lock, flags); + /* IRQs already disabled here no need to irqsave */ + spin_lock(&vioch->lock); if (cb_enabled) { virtqueue_disable_cb(vqueue); cb_enabled = false; @@ -151,7 +150,7 @@ static void scmi_vio_complete_cb(struct virtqueue *vqueue) goto unlock_out; cb_enabled = true; } - spin_unlock_irqrestore(&vioch->lock, flags); + spin_unlock(&vioch->lock); if (msg) { msg->rx_len = length; @@ -161,11 +160,18 @@ static void scmi_vio_complete_cb(struct virtqueue *vqueue) scmi_finalize_message(vioch, msg); } + /* + * Release ready_lock and re-enable IRQs between loop iterations 
+ * to allow virtio_chan_free() to possibly kick in and set the + * flag vioch->ready to false even in between processing of + * messages, so as to force outstanding messages to be ignored + * when system is shutting down. + */ spin_unlock_irqrestore(&vioch->ready_lock, ready_flags); } unlock_out: - spin_unlock_irqrestore(&vioch->lock, flags); + spin_unlock(&vioch->lock); unlock_ready_out: spin_unlock_irqrestore(&vioch->ready_lock, ready_flags); } @@ -384,8 +390,11 @@ static int scmi_vio_probe(struct virtio_device *vdev) struct virtqueue *vqs[VIRTIO_SCMI_VQ_MAX_CNT]; /* Only one SCMI VirtiO device allowed */ - if (scmi_vdev) - return -EINVAL; + if (scmi_vdev) { + dev_err(dev, + "One SCMI Virtio device was already initialized: only one allowed.\n"); + return -EBUSY; + } have_vq_rx = scmi_vio_have_vq_rx(vdev); vq_cnt = have_vq_rx ? VIRTIO_SCMI_VQ_MAX_CNT : 1; @@ -428,16 +437,25 @@ static int scmi_vio_probe(struct virtio_device *vdev) } vdev->priv = channels; - scmi_vdev = vdev; + /* Ensure initialized scmi_vdev is visible */ + smp_store_mb(scmi_vdev, vdev); return 0; } static void scmi_vio_remove(struct virtio_device *vdev) { + /* + * Once we get here, virtio_chan_free() will have already been called by + * the SCMI core for any existing channel and, as a consequence, all the + * virtio channels will have been already marked NOT ready, causing any + * outstanding message on any vqueue to be ignored by complete_cb: now + * we can just stop processing buffers and destroy the vqueues. + */ vdev->config->reset(vdev); vdev->config->del_vqs(vdev); - scmi_vdev = NULL; + /* Ensure scmi_vdev is visible as NULL */ + smp_store_mb(scmi_vdev, NULL); } static int scmi_vio_validate(struct virtio_device *vdev) @@ -476,7 +494,7 @@ static int __init virtio_scmi_init(void) return register_virtio_driver(&virtio_scmi_driver); } -static void __exit virtio_scmi_exit(void) +static void virtio_scmi_exit(void) { unregister_virtio_driver(&virtio_scmi_driver); } diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 73bdbd207e..6ec8edec63 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -25,8 +25,6 @@ #include #include -static char rcd_decode_str[CPER_REC_LEN]; - /* * CPER record ID need to be unique even after reboot, because record * ID is used as index for ERST storage, while CPER records from @@ -312,6 +310,7 @@ const char *cper_mem_err_unpack(struct trace_seq *p, struct cper_mem_err_compact *cmem) { const char *ret = trace_seq_buffer_ptr(p); + char rcd_decode_str[CPER_REC_LEN]; if (cper_mem_err_location(cmem, rcd_decode_str)) trace_seq_printf(p, "%s", rcd_decode_str); @@ -326,6 +325,7 @@ static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem, int len) { struct cper_mem_err_compact cmem; + char rcd_decode_str[CPER_REC_LEN]; /* Don't trust UEFI 2.1/2.2 structure with bad validation bits */ if (len == sizeof(struct cper_sec_mem_err_old) && diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index 365c3a43a1..fe567be0f1 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -271,7 +271,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, return status; } - efi_info("Exiting boot services and installing virtual address map...\n"); + efi_info("Exiting boot services...\n"); map.map = &memory_map; status = efi_allocate_pages(MAX_FDT_SIZE, new_fdt_addr, ULONG_MAX); diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 
1410beaef5..f3e54f6616 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -414,7 +414,7 @@ static void virt_efi_reset_system(int reset_type, unsigned long data_size, efi_char16_t *data) { - if (down_interruptible(&efi_runtime_lock)) { + if (down_trylock(&efi_runtime_lock)) { pr_warn("failed to invoke the reset_system() runtime service:\n" "could not get exclusive access to the firmware\n"); return; diff --git a/drivers/fpga/ice40-spi.c b/drivers/fpga/ice40-spi.c index 69dec5af23..029d3cdb91 100644 --- a/drivers/fpga/ice40-spi.c +++ b/drivers/fpga/ice40-spi.c @@ -192,12 +192,19 @@ static const struct of_device_id ice40_fpga_of_match[] = { }; MODULE_DEVICE_TABLE(of, ice40_fpga_of_match); +static const struct spi_device_id ice40_fpga_spi_ids[] = { + { .name = "ice40-fpga-mgr", }, + {}, +}; +MODULE_DEVICE_TABLE(spi, ice40_fpga_spi_ids); + static struct spi_driver ice40_fpga_driver = { .probe = ice40_fpga_probe, .driver = { .name = "ice40spi", .of_match_table = of_match_ptr(ice40_fpga_of_match), }, + .id_table = ice40_fpga_spi_ids, }; module_spi_driver(ice40_fpga_driver); diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 43c30308b1..d5f139be43 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -485,6 +485,14 @@ config GPIO_PMIC_EIC_SPRD help Say yes here to support Spreadtrum PMIC EIC device. +config GPIO_PWM + tristate "PWM chip GPIO" + depends on OF_GPIO + depends on PWM + help + Turn on support for exposing a PWM chip as a GPIO + driver. + config GPIO_PXA bool "PXA GPIO support" depends on ARCH_PXA || ARCH_MMP || COMPILE_TEST diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 0900302c9c..0980f9b189 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -121,6 +121,7 @@ obj-$(CONFIG_GPIO_PCI_IDIO_16) += gpio-pci-idio-16.o obj-$(CONFIG_GPIO_PISOSR) += gpio-pisosr.o obj-$(CONFIG_GPIO_PL061) += gpio-pl061.o obj-$(CONFIG_GPIO_PMIC_EIC_SPRD) += gpio-pmic-eic-sprd.o +obj-$(CONFIG_GPIO_PWM) += gpio-pwm.o obj-$(CONFIG_GPIO_PXA) += gpio-pxa.o obj-$(CONFIG_GPIO_RASPBERRYPI_EXP) += gpio-raspberrypi-exp.o obj-$(CONFIG_GPIO_RC5T583) += gpio-rc5t583.o diff --git a/drivers/gpio/gpio-74x164.c b/drivers/gpio/gpio-74x164.c index 05637d5851..4a55cdf089 100644 --- a/drivers/gpio/gpio-74x164.c +++ b/drivers/gpio/gpio-74x164.c @@ -174,6 +174,13 @@ static int gen_74x164_remove(struct spi_device *spi) return 0; } +static const struct spi_device_id gen_74x164_spi_ids[] = { + { .name = "74hc595" }, + { .name = "74lvc594" }, + {}, +}; +MODULE_DEVICE_TABLE(spi, gen_74x164_spi_ids); + static const struct of_device_id gen_74x164_dt_ids[] = { { .compatible = "fairchild,74hc595" }, { .compatible = "nxp,74lvc594" }, @@ -188,6 +195,7 @@ static struct spi_driver gen_74x164_driver = { }, .probe = gen_74x164_probe, .remove = gen_74x164_remove, + .id_table = gen_74x164_spi_ids, }; module_spi_driver(gen_74x164_driver); diff --git a/drivers/gpio/gpio-bcm-virt.c b/drivers/gpio/gpio-bcm-virt.c index 49e28ad976..55c40190e8 100644 --- a/drivers/gpio/gpio-bcm-virt.c +++ b/drivers/gpio/gpio-bcm-virt.c @@ -49,7 +49,7 @@ static int brcmvirt_gpio_get(struct gpio_chip *gc, unsigned off) unsigned v; gpio = container_of(gc, struct brcmvirt_gpio, gc); v = readl(gpio->ts_base + off); - return (v >> off) & 1; + return (s16)((v >> 16) - v) > 0; } static void brcmvirt_gpio_set(struct gpio_chip *gc, unsigned off, int val) diff --git a/drivers/gpio/gpio-mockup.c b/drivers/gpio/gpio-mockup.c index 0a9d746a0f..d26bff2915 100644 --- 
a/drivers/gpio/gpio-mockup.c +++ b/drivers/gpio/gpio-mockup.c @@ -476,10 +476,19 @@ static struct platform_device *gpio_mockup_pdevs[GPIO_MOCKUP_MAX_GC]; static void gpio_mockup_unregister_pdevs(void) { + struct platform_device *pdev; + struct fwnode_handle *fwnode; int i; - for (i = 0; i < GPIO_MOCKUP_MAX_GC; i++) - platform_device_unregister(gpio_mockup_pdevs[i]); + for (i = 0; i < GPIO_MOCKUP_MAX_GC; i++) { + pdev = gpio_mockup_pdevs[i]; + if (!pdev) + continue; + + fwnode = dev_fwnode(&pdev->dev); + platform_device_unregister(pdev); + fwnode_remove_software_node(fwnode); + } } static __init char **gpio_mockup_make_line_names(const char *label, @@ -508,6 +517,7 @@ static int __init gpio_mockup_register_chip(int idx) struct property_entry properties[GPIO_MOCKUP_MAX_PROP]; struct platform_device_info pdevinfo; struct platform_device *pdev; + struct fwnode_handle *fwnode; char **line_names = NULL; char chip_label[32]; int prop = 0, base; @@ -536,13 +546,18 @@ static int __init gpio_mockup_register_chip(int idx) "gpio-line-names", line_names, ngpio); } + fwnode = fwnode_create_software_node(properties, NULL); + if (IS_ERR(fwnode)) + return PTR_ERR(fwnode); + pdevinfo.name = "gpio-mockup"; pdevinfo.id = idx; - pdevinfo.properties = properties; + pdevinfo.fwnode = fwnode; pdev = platform_device_register_full(&pdevinfo); kfree_strarray(line_names, ngpio); if (IS_ERR(pdev)) { + fwnode_remove_software_node(fwnode); pr_err("error registering device"); return PTR_ERR(pdev); } diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index f5cfc06987..d2fe76f3f3 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -468,15 +468,8 @@ static int pca953x_gpio_get_value(struct gpio_chip *gc, unsigned off) mutex_lock(&chip->i2c_lock); ret = regmap_read(chip->regmap, inreg, &reg_val); mutex_unlock(&chip->i2c_lock); - if (ret < 0) { - /* - * NOTE: - * diagnostic already emitted; that's all we should - * do unless gpio_*_value_cansleep() calls become different - * from their nonsleeping siblings (and report faults).
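The gpio-mockup conversion above moves the device properties onto a software node, which must outlive platform_device_register_full() and be removed explicitly after unregistration. A reduced sketch of that lifecycle, with a made-up device name and property:

#include <linux/err.h>
#include <linux/platform_device.h>
#include <linux/property.h>

static struct platform_device *demo_register(void)
{
        static const struct property_entry props[] = {
                PROPERTY_ENTRY_U32("nr-gpios", 8),      /* illustrative */
                { }
        };
        struct platform_device_info info = {
                .name = "demo",
                .id = PLATFORM_DEVID_NONE,
        };
        struct fwnode_handle *fwnode;
        struct platform_device *pdev;

        fwnode = fwnode_create_software_node(props, NULL);
        if (IS_ERR(fwnode))
                return ERR_CAST(fwnode);

        info.fwnode = fwnode;
        pdev = platform_device_register_full(&info);
        if (IS_ERR(pdev))
                fwnode_remove_software_node(fwnode);    /* clean up on failure */

        return pdev;
}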
- */ - return 0; - } + if (ret < 0) + return ret; return !!(reg_val & bit); } @@ -566,21 +559,21 @@ static int pca953x_gpio_set_pull_up_down(struct pca953x_chip *chip, mutex_lock(&chip->i2c_lock); - /* Disable pull-up/pull-down */ - ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, 0); - if (ret) - goto exit; - /* Configure pull-up/pull-down */ if (config == PIN_CONFIG_BIAS_PULL_UP) ret = regmap_write_bits(chip->regmap, pull_sel_reg, bit, bit); else if (config == PIN_CONFIG_BIAS_PULL_DOWN) ret = regmap_write_bits(chip->regmap, pull_sel_reg, bit, 0); + else + ret = 0; if (ret) goto exit; - /* Enable pull-up/pull-down */ - ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, bit); + /* Disable/Enable pull-up/pull-down */ + if (config == PIN_CONFIG_BIAS_DISABLE) + ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, 0); + else + ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, bit); exit: mutex_unlock(&chip->i2c_lock); @@ -594,7 +587,9 @@ static int pca953x_gpio_set_config(struct gpio_chip *gc, unsigned int offset, switch (pinconf_to_config_param(config)) { case PIN_CONFIG_BIAS_PULL_UP: + case PIN_CONFIG_BIAS_PULL_PIN_DEFAULT: case PIN_CONFIG_BIAS_PULL_DOWN: + case PIN_CONFIG_BIAS_DISABLE: return pca953x_gpio_set_pull_up_down(chip, offset, config); default: return -ENOTSUPP; diff --git a/drivers/gpio/gpio-pwm.c b/drivers/gpio/gpio-pwm.c new file mode 100644 index 0000000000..89f5d6b353 --- /dev/null +++ b/drivers/gpio/gpio-pwm.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * GPIO driver wrapping PWM API + * + * PWM 0% and PWM 100% are equivalent to digital GPIO + * outputs, and there are times where it is useful to use + * PWM outputs as straight GPIOs (eg outputs of NXP PCA9685 + * I2C PWM chip). This driver wraps the PWM API as a GPIO + * controller. + * + * Copyright (C) 2021 Raspberry Pi (Trading) Ltd. + */ + +#include +#include +#include +#include +#include + +struct pwm_gpio { + struct gpio_chip gc; + struct pwm_device **pwm; +}; + +static int pwm_gpio_get_direction(struct gpio_chip *gc, unsigned int off) +{ + return GPIO_LINE_DIRECTION_OUT; +} + +static void pwm_gpio_set(struct gpio_chip *gc, unsigned int off, int val) +{ + struct pwm_gpio *pwm_gpio = gpiochip_get_data(gc); + struct pwm_state state; + + pwm_get_state(pwm_gpio->pwm[off], &state); + state.duty_cycle = val ? state.period : 0; + pwm_apply_state(pwm_gpio->pwm[off], &state); +} + +static int pwm_gpio_parse_dt(struct pwm_gpio *pwm_gpio, + struct device *dev) +{ + struct device_node *node = dev->of_node; + struct pwm_state state; + int ret = 0, i, num_gpios; + const char *pwm_name; + + if (!node) + return -ENODEV; + + num_gpios = of_property_count_strings(node, "pwm-names"); + if (num_gpios <= 0) + return 0; + + pwm_gpio->pwm = devm_kzalloc(dev, + sizeof(*pwm_gpio->pwm) * num_gpios, + GFP_KERNEL); + if (!pwm_gpio->pwm) + return -ENOMEM; + + for (i = 0; i < num_gpios; i++) { + ret = of_property_read_string_index(node, "pwm-names", i, + &pwm_name); + if (ret) { + dev_err(dev, "unable to get pwm device index %d, name %s", + i, pwm_name); + goto error; + } + + pwm_gpio->pwm[i] = devm_pwm_get(dev, pwm_name); + if (IS_ERR(pwm_gpio->pwm[i])) { + ret = PTR_ERR(pwm_gpio->pwm[i]); + if (ret != -EPROBE_DEFER) + dev_err(dev, "unable to request PWM\n"); + goto error; + } + + /* Sync up PWM state. 
*/ + pwm_init_state(pwm_gpio->pwm[i], &state); + + state.duty_cycle = 0; + pwm_apply_state(pwm_gpio->pwm[i], &state); + } + + pwm_gpio->gc.ngpio = num_gpios; + +error: + return ret; +} + +static int pwm_gpio_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct pwm_gpio *pwm_gpio; + int ret; + + pwm_gpio = devm_kzalloc(dev, sizeof(*pwm_gpio), GFP_KERNEL); + if (!pwm_gpio) + return -ENOMEM; + + pwm_gpio->gc.parent = dev; + pwm_gpio->gc.label = "pwm-gpio"; + pwm_gpio->gc.owner = THIS_MODULE; + pwm_gpio->gc.of_node = dev->of_node; + pwm_gpio->gc.base = -1; + + pwm_gpio->gc.get_direction = pwm_gpio_get_direction; + pwm_gpio->gc.set = pwm_gpio_set; + pwm_gpio->gc.can_sleep = true; + + ret = pwm_gpio_parse_dt(pwm_gpio, dev); + if (ret) + return ret; + + if (!pwm_gpio->gc.ngpio) + return 0; + + return devm_gpiochip_add_data(dev, &pwm_gpio->gc, pwm_gpio); +} + +static int pwm_gpio_remove(struct platform_device *pdev) +{ + return 0; +} + +static const struct of_device_id pwm_gpio_of_match[] = { + { .compatible = "pwm-gpio" }, + { } +}; +MODULE_DEVICE_TABLE(of, pwm_gpio_of_match); + +static struct platform_driver pwm_gpio_driver = { + .driver = { + .name = "pwm-gpio", + .of_match_table = of_match_ptr(pwm_gpio_of_match), + }, + .probe = pwm_gpio_probe, + .remove = pwm_gpio_remove, +}; +module_platform_driver(pwm_gpio_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dave Stevenson "); +MODULE_DESCRIPTION("PWM GPIO driver"); diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c index 3335bd5776..ce63cbd14d 100644 --- a/drivers/gpio/gpio-rockchip.c +++ b/drivers/gpio/gpio-rockchip.c @@ -689,6 +689,7 @@ static int rockchip_gpio_probe(struct platform_device *pdev) struct device_node *pctlnp = of_get_parent(np); struct pinctrl_dev *pctldev = NULL; struct rockchip_pin_bank *bank = NULL; + struct rockchip_pin_output_deferred *cfg; static int gpio; int id, ret; @@ -716,12 +717,33 @@ static int rockchip_gpio_probe(struct platform_device *pdev) if (ret) return ret; + /* + * Prevent clashes with a deferred output setting + * being added right at this moment. 
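The gpio-pwm.c driver above emulates a digital output by pinning the PWM duty cycle to either zero or the full period. A minimal standalone sketch of that call sequence; the pwm_device is assumed to come from pwm_get() or devm_pwm_get(), and forcing the output enabled is a simplification not present in the driver's set() callback:

#include <linux/pwm.h>
#include <linux/types.h>

static int demo_pwm_write_level(struct pwm_device *pwm, bool high)
{
        struct pwm_state state;

        pwm_get_state(pwm, &state);
        state.enabled = true;                           /* sketch simplification */
        state.duty_cycle = high ? state.period : 0;     /* 100% or 0% duty */
        return pwm_apply_state(pwm, &state);
}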
+ */ + mutex_lock(&bank->deferred_lock); + ret = rockchip_gpiolib_register(bank); if (ret) { clk_disable_unprepare(bank->clk); + mutex_unlock(&bank->deferred_lock); return ret; } + while (!list_empty(&bank->deferred_output)) { + cfg = list_first_entry(&bank->deferred_output, + struct rockchip_pin_output_deferred, head); + list_del(&cfg->head); + + ret = rockchip_gpio_direction_output(&bank->gpio_chip, cfg->pin, cfg->arg); + if (ret) + dev_warn(dev, "setting output pin %u to %u failed\n", cfg->pin, cfg->arg); + + kfree(cfg); + } + + mutex_unlock(&bank->deferred_lock); + platform_set_drvdata(pdev, bank); dev_info(dev, "probed %pOF\n", np); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index d356e329e6..269437b013 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1087,6 +1087,7 @@ struct amdgpu_device { bool no_hw_access; struct pci_saved_state *pci_state; + pci_channel_state_t pci_channel_state; struct amdgpu_reset_control *reset_cntl; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 2d6b2d77b7..054c1a224d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -563,6 +563,7 @@ kfd_mem_dmaunmap_userptr(struct kgd_mem *mem, dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0); sg_free_table(ttm->sg); + kfree(ttm->sg); ttm->sg = NULL; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ab3794c42d..af9bdf16ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2394,10 +2394,6 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) goto init_failed; - r = amdgpu_amdkfd_resume_iommu(adev); - if (r) - goto init_failed; - r = amdgpu_device_ip_hw_init_phase1(adev); if (r) goto init_failed; @@ -2436,6 +2432,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (!adev->gmc.xgmi.pending_reset) amdgpu_amdkfd_device_init(adev); + r = amdgpu_amdkfd_resume_iommu(adev); + if (r) + goto init_failed; + amdgpu_fru_get_product_info(adev); init_failed: @@ -5399,6 +5399,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta return PCI_ERS_RESULT_DISCONNECT; } + adev->pci_channel_state = state; + switch (state) { case pci_channel_io_normal: return PCI_ERS_RESULT_CAN_RECOVER; @@ -5541,6 +5543,10 @@ void amdgpu_pci_resume(struct pci_dev *pdev) DRM_INFO("PCI error: resume callback!!\n"); + /* Only continue execution for the case of pci_channel_io_frozen */ + if (adev->pci_channel_state != pci_channel_io_frozen) + return; + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index 7a73167319..dc50c05f23 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -837,6 +837,28 @@ static int convert_tiling_flags_to_modifier(struct amdgpu_framebuffer *afb) return 0; } +/* Mirrors the is_displayable check in radeonsi's gfx6_compute_surface */ +static int check_tiling_flags_gfx6(struct amdgpu_framebuffer *afb) +{ + u64 micro_tile_mode; + + /* Zero swizzle mode means linear */ + if (AMDGPU_TILING_GET(afb->tiling_flags, SWIZZLE_MODE) == 0) + return 0; + + micro_tile_mode = AMDGPU_TILING_GET(afb->tiling_flags, MICRO_TILE_MODE); + switch 
(micro_tile_mode) { + case 0: /* DISPLAY */ + case 3: /* RENDER */ + return 0; + default: + drm_dbg_kms(afb->base.dev, + "Micro tile mode %llu not supported for scanout\n", + micro_tile_mode); + return -EINVAL; + } +} + static void get_block_dimensions(unsigned int block_log2, unsigned int cpp, unsigned int *width, unsigned int *height) { @@ -1103,6 +1125,7 @@ int amdgpu_display_framebuffer_init(struct drm_device *dev, const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object *obj) { + struct amdgpu_device *adev = drm_to_adev(dev); int ret, i; /* @@ -1122,6 +1145,14 @@ int amdgpu_display_framebuffer_init(struct drm_device *dev, if (ret) return ret; + if (!dev->mode_config.allow_fb_modifiers) { + drm_WARN_ONCE(dev, adev->family >= AMDGPU_FAMILY_AI, + "GFX9+ requires FB check based on format modifier\n"); + ret = check_tiling_flags_gfx6(rfb); + if (ret) + return ret; + } + if (dev->mode_config.allow_fb_modifiers && !(rfb->base.flags & DRM_MODE_FB_MODIFIERS)) { ret = convert_tiling_flags_to_modifier(rfb); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index e7f06bd0f0..1916ec84dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -31,6 +31,8 @@ /* delay 0.1 second to enable gfx off feature */ #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100) +#define GFX_OFF_NO_DELAY 0 + /* * GPU GFX IP block helpers function. */ @@ -558,6 +560,8 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev) void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) { + unsigned long delay = GFX_OFF_DELAY_ENABLE; + if (!(adev->pm.pp_feature & PP_GFXOFF_MASK)) return; @@ -573,8 +577,14 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) adev->gfx.gfx_off_req_count--; - if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state) - schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE); + if (adev->gfx.gfx_off_req_count == 0 && + !adev->gfx.gfx_off_state) { + /* If going to s2idle, no need to wait */ + if (adev->in_s0ix) + delay = GFX_OFF_NO_DELAY; + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, + delay); + } } else { if (adev->gfx.gfx_off_req_count == 0) { cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 603c259b07..025184a556 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3599,7 +3599,7 @@ static int gfx_v9_0_mqd_init(struct amdgpu_ring *ring) /* set static priority for a queue/ring */ gfx_v9_0_mqd_set_priority(ring, mqd); - mqd->cp_hqd_quantum = RREG32(mmCP_HQD_QUANTUM); + mqd->cp_hqd_quantum = RREG32_SOC15(GC, 0, mmCP_HQD_QUANTUM); /* map_queues packet doesn't need activate the queue, * so only kiq need set this field. 
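A reduced model (all names invented) of the request-count plus delayed-work idiom amdgpu_gfx_off_ctrl() implements in the hunk above: the last "allow" arms a timer so the block only powers down after staying idle, any "disallow" cancels it, and the S0ix path simply passes a zero delay:

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(demo_lock);
static unsigned int demo_req_count = 1;         /* power-down disallowed at boot */

static void demo_off_fn(struct work_struct *work)
{
        /* Power the block down here once things have stayed idle. */
}
static DECLARE_DELAYED_WORK(demo_off_work, demo_off_fn);

static void demo_allow_off(bool allow, unsigned long delay)
{
        mutex_lock(&demo_lock);
        if (allow) {
                /* Last consumer gone: arm the deferred power-down. */
                if (demo_req_count && --demo_req_count == 0)
                        schedule_delayed_work(&demo_off_work, delay);
        } else {
                /* First new consumer: make sure no power-down is pending. */
                if (demo_req_count++ == 0)
                        cancel_delayed_work_sync(&demo_off_work);
        }
        mutex_unlock(&demo_lock);
}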
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 41c3a0d70b..e47104a1f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -1098,6 +1098,8 @@ static int gmc_v10_0_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + gmc_v10_0_gart_disable(adev); + if (amdgpu_sriov_vf(adev)) { /* full access mode, so don't touch any GMC register */ DRM_DEBUG("For SRIOV client, shouldn't do anything.\n"); @@ -1106,7 +1108,6 @@ static int gmc_v10_0_hw_fini(void *handle) amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); - gmc_v10_0_gart_disable(adev); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index d90c16a6b2..5551359d5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1794,6 +1794,8 @@ static int gmc_v9_0_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + gmc_v9_0_gart_disable(adev); + if (amdgpu_sriov_vf(adev)) { /* full access mode, so don't touch any GMC register */ DRM_DEBUG("For SRIOV client, shouldn't do anything.\n"); @@ -1802,7 +1804,6 @@ static int gmc_v9_0_hw_fini(void *handle) amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); - gmc_v9_0_gart_disable(adev); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index 779f5c911e..e32efcfb0c 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -868,6 +868,12 @@ static int sdma_v5_2_start(struct amdgpu_device *adev) msleep(1000); } + /* TODO: check whether can submit a doorbell request to raise + * a doorbell fence to exit gfxoff. 
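A sketch of the bracket idiom the surrounding sdma_v5_2_start() hunk adds for S0ix resume: temporarily forbid GFXOFF around the hardware bring-up, then re-allow it. The functions here are invented stand-ins for amdgpu_gfx_off_ctrl() and the SDMA resume sequence:

#include <stdbool.h>

static void gfx_off_allow(bool allow)
{
        /* Stand-in for amdgpu_gfx_off_ctrl(adev, allow). */
}

static int sdma_bring_up(void)
{
        /* Stand-in for soft reset, unhalt, and ring resume. */
        return 0;
}

static int demo_start(bool in_s0ix)
{
        int r;

        if (in_s0ix)
                gfx_off_allow(false);   /* keep GFX powered during init */

        r = sdma_bring_up();

        if (in_s0ix)
                gfx_off_allow(true);    /* re-allow GFXOFF afterwards */

        return r;
}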
+ */ + if (adev->in_s0ix) + amdgpu_gfx_off_ctrl(adev, false); + sdma_v5_2_soft_reset(adev); /* unhalt the MEs */ sdma_v5_2_enable(adev, true); @@ -876,6 +882,8 @@ static int sdma_v5_2_start(struct amdgpu_device *adev) /* start the gfx rings and rlc compute queues */ r = sdma_v5_2_gfx_resume(adev); + if (adev->in_s0ix) + amdgpu_gfx_off_ctrl(adev, true); if (r) return r; r = sdma_v5_2_rlc_resume(adev); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index c2a4d920da..4a416231b2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1085,18 +1085,12 @@ static int kfd_resume(struct kfd_dev *kfd) int err = 0; err = kfd->dqm->ops.start(kfd->dqm); - if (err) { + if (err) dev_err(kfd_device, "Error starting queue manager for device %x:%x\n", kfd->pdev->vendor, kfd->pdev->device); - goto dqm_start_error; - } return err; - -dqm_start_error: - kfd_iommu_suspend(kfd); - return err; } static inline void kfd_queue_work(struct workqueue_struct *wq, diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig index 7dffc04a55..127667e549 100644 --- a/drivers/gpu/drm/amd/display/Kconfig +++ b/drivers/gpu/drm/amd/display/Kconfig @@ -25,6 +25,8 @@ config DRM_AMD_DC_HDCP config DRM_AMD_DC_SI bool "AMD DC support for Southern Islands ASICs" + depends on DRM_AMDGPU_SI + depends on DRM_AMD_DC default n help Choose this option to enable new AMD DC support for SI asics diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 66c799f5c7..1ea31dcc7a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1115,6 +1115,7 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) init_data.asic_id.pci_revision_id = adev->pdev->revision; init_data.asic_id.hw_internal_rev = adev->external_rev_id; + init_data.asic_id.chip_id = adev->pdev->device; init_data.asic_id.vram_width = adev->gmc.vram_width; /* TODO: initialize init_data.asic_id.vram_type here!!!! */ @@ -1719,6 +1720,7 @@ static int dm_late_init(void *handle) linear_lut[i] = 0xFFFF * i / 15; params.set = 0; + params.backlight_ramping_override = false; params.backlight_ramping_start = 0xCCCC; params.backlight_ramping_reduction = 0xCCCCCCCC; params.backlight_lut_array_size = 16; diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index f6dbc5a747..6d655e1582 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -1306,12 +1306,6 @@ static void override_training_settings( { uint32_t lane; - /* Override link settings */ - if (link->preferred_link_setting.link_rate != LINK_RATE_UNKNOWN) - lt_settings->link_settings.link_rate = link->preferred_link_setting.link_rate; - if (link->preferred_link_setting.lane_count != LANE_COUNT_UNKNOWN) - lt_settings->link_settings.lane_count = link->preferred_link_setting.lane_count; - /* Override link spread */ if (!link->dp_ss_off && overrides->downspread != NULL) lt_settings->link_settings.link_spread = *overrides->downspread ? 
@@ -1826,14 +1820,13 @@ bool perform_link_training_with_retries( if (panel_mode == DP_PANEL_MODE_EDP) { struct cp_psp *cp_psp = &stream->ctx->cp_psp; - if (cp_psp && cp_psp->funcs.enable_assr) { - if (!cp_psp->funcs.enable_assr(cp_psp->handle, link)) { - /* since eDP implies ASSR on, change panel - * mode to disable ASSR - */ - panel_mode = DP_PANEL_MODE_DEFAULT; - } - } + if (cp_psp && cp_psp->funcs.enable_assr) + /* ASSR is bound to fail with an unsigned PSP + * verstage used during the development phase. + * Report and continue with eDP panel mode to + * perform eDP link training with the right settings + */ + cp_psp->funcs.enable_assr(cp_psp->handle, link); } #endif diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.h b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.h index d8b22618b7..c337588231 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.h @@ -118,6 +118,7 @@ struct dcn10_link_enc_registers { uint32_t RDPCSTX_PHY_CNTL4; uint32_t RDPCSTX_PHY_CNTL5; uint32_t RDPCSTX_PHY_CNTL6; + uint32_t RDPCSPIPE_PHY_CNTL6; uint32_t RDPCSTX_PHY_CNTL7; uint32_t RDPCSTX_PHY_CNTL8; uint32_t RDPCSTX_PHY_CNTL9; diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.c index 90127c1f9e..b0892443fb 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.c @@ -37,6 +37,7 @@ #include "link_enc_cfg.h" #include "dc_dmub_srv.h" +#include "dal_asic_id.h" #define CTX \ enc10->base.ctx @@ -62,6 +63,10 @@ #define AUX_REG_WRITE(reg_name, val) \ dm_write_reg(CTX, AUX_REG(reg_name), val) +#ifndef MIN +#define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#endif + void dcn31_link_encoder_set_dio_phy_mux( struct link_encoder *enc, enum encoder_type_select sel, @@ -215,8 +220,8 @@ static const struct link_encoder_funcs dcn31_link_enc_funcs = { .fec_is_active = enc2_fec_is_active, .get_dig_frontend = dcn10_get_dig_frontend, .get_dig_mode = dcn10_get_dig_mode, - .is_in_alt_mode = dcn20_link_encoder_is_in_alt_mode, - .get_max_link_cap = dcn20_link_encoder_get_max_link_cap, + .is_in_alt_mode = dcn31_link_encoder_is_in_alt_mode, + .get_max_link_cap = dcn31_link_encoder_get_max_link_cap, .set_dio_phy_mux = dcn31_link_encoder_set_dio_phy_mux, }; @@ -404,3 +409,60 @@ void dcn31_link_encoder_disable_output( } } +bool dcn31_link_encoder_is_in_alt_mode(struct link_encoder *enc) +{ + struct dcn10_link_encoder *enc10 = TO_DCN10_LINK_ENC(enc); + uint32_t dp_alt_mode_disable; + bool is_usb_c_alt_mode = false; + + if (enc->features.flags.bits.DP_IS_USB_C) { + if (enc->ctx->asic_id.hw_internal_rev != YELLOW_CARP_B0) { + // [Note] no need to check hw_internal_rev once phy mux selection is ready + REG_GET(RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE, &dp_alt_mode_disable); + } else { + /* + * B0 phys use a new set of registers to check whether alt mode is disabled. + * If the value is 1, alt mode is disabled; otherwise it is enabled.
+ */ + if ((enc10->base.transmitter == TRANSMITTER_UNIPHY_A) + || (enc10->base.transmitter == TRANSMITTER_UNIPHY_B) + || (enc10->base.transmitter == TRANSMITTER_UNIPHY_E)) { + REG_GET(RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE, &dp_alt_mode_disable); + } else { + // [Note] need to change TRANSMITTER_UNIPHY_C/D to F/G once phy mux selection is ready + REG_GET(RDPCSPIPE_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE, &dp_alt_mode_disable); + } + } + + is_usb_c_alt_mode = (dp_alt_mode_disable == 0); + } + + return is_usb_c_alt_mode; +} + +void dcn31_link_encoder_get_max_link_cap(struct link_encoder *enc, + struct dc_link_settings *link_settings) +{ + struct dcn10_link_encoder *enc10 = TO_DCN10_LINK_ENC(enc); + uint32_t is_in_usb_c_dp4_mode = 0; + + dcn10_link_encoder_get_max_link_cap(enc, link_settings); + + /* in usb c dp2 mode, max lane count is 2 */ + if (enc->funcs->is_in_alt_mode && enc->funcs->is_in_alt_mode(enc)) { + if (enc->ctx->asic_id.hw_internal_rev != YELLOW_CARP_B0) { + // [Note] no need to check hw_internal_rev once phy mux selection is ready + REG_GET(RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DP4, &is_in_usb_c_dp4_mode); + } else { + if ((enc10->base.transmitter == TRANSMITTER_UNIPHY_A) + || (enc10->base.transmitter == TRANSMITTER_UNIPHY_B) + || (enc10->base.transmitter == TRANSMITTER_UNIPHY_E)) { + REG_GET(RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DP4, &is_in_usb_c_dp4_mode); + } else { + REG_GET(RDPCSPIPE_PHY_CNTL6, RDPCS_PHY_DPALT_DP4, &is_in_usb_c_dp4_mode); + } + } + if (!is_in_usb_c_dp4_mode) + link_settings->lane_count = MIN(LANE_COUNT_TWO, link_settings->lane_count); + } +} diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.h b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.h index 32d1463128..3454f1e7c1 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.h @@ -69,6 +69,7 @@ SRI(RDPCSTX_PHY_CNTL4, RDPCSTX, id), \ SRI(RDPCSTX_PHY_CNTL5, RDPCSTX, id), \ SRI(RDPCSTX_PHY_CNTL6, RDPCSTX, id), \ + SRI(RDPCSPIPE_PHY_CNTL6, RDPCSPIPE, id), \ SRI(RDPCSTX_PHY_CNTL7, RDPCSTX, id), \ SRI(RDPCSTX_PHY_CNTL8, RDPCSTX, id), \ SRI(RDPCSTX_PHY_CNTL9, RDPCSTX, id), \ @@ -115,7 +116,9 @@ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL6, RDPCS_PHY_DP_TX2_MPLL_EN, mask_sh),\ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL6, RDPCS_PHY_DP_TX3_MPLL_EN, mask_sh),\ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DP4, mask_sh),\ - LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE, mask_sh),\ + LE_SF(RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6, RDPCS_PHY_DPALT_DP4, mask_sh),\ + LE_SF(RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE, mask_sh),\ + LE_SF(RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE_ACK, mask_sh),\ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL7, RDPCS_PHY_DP_MPLLB_FRACN_QUOT, mask_sh),\ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL7, RDPCS_PHY_DP_MPLLB_FRACN_DEN, mask_sh),\ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL8, RDPCS_PHY_DP_MPLLB_SSC_PEAK, mask_sh),\ @@ -243,4 +246,13 @@ void dcn31_link_encoder_disable_output( struct link_encoder *enc, enum signal_type signal); +/* + * Check whether USB-C DP Alt mode is disabled + */ +bool dcn31_link_encoder_is_in_alt_mode( + struct link_encoder *enc); + +void dcn31_link_encoder_get_max_link_cap(struct link_encoder *enc, + struct dc_link_settings *link_settings); + #endif /* __DC_LINK_ENCODER__DCN31_H__ */ diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c index a7702d3c75..0006bbac46 100644 --- 
a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c @@ -928,7 +928,7 @@ static const struct dc_debug_options debug_defaults_drv = { .disable_dcc = DCC_ENABLE, .vsr_support = true, .performance_trace = false, - .max_downscale_src_width = 7680,/*upto 8K*/ + .max_downscale_src_width = 3840,/*upto 4K*/ .disable_pplib_wm_range = false, .scl_reset_length10 = true, .sanity_checks = false, @@ -1284,6 +1284,12 @@ static struct stream_encoder *dcn31_stream_encoder_create( if (!enc1 || !vpg || !afmt) return NULL; + if (ctx->asic_id.chip_family == FAMILY_YELLOW_CARP && + ctx->asic_id.hw_internal_rev == YELLOW_CARP_B0) { + if ((eng_id == ENGINE_ID_DIGC) || (eng_id == ENGINE_ID_DIGD)) + eng_id = eng_id + 3; // For B0 only. C->F, D->G. + } + dcn30_dio_stream_encoder_construct(enc1, ctx, ctx->dc_bios, eng_id, vpg, afmt, &stream_enc_regs[eng_id], diff --git a/drivers/gpu/drm/amd/display/include/dal_asic_id.h b/drivers/gpu/drm/amd/display/include/dal_asic_id.h index 381c17caac..5adc471bef 100644 --- a/drivers/gpu/drm/amd/display/include/dal_asic_id.h +++ b/drivers/gpu/drm/amd/display/include/dal_asic_id.h @@ -227,7 +227,7 @@ enum { #define FAMILY_YELLOW_CARP 146 #define YELLOW_CARP_A0 0x01 -#define YELLOW_CARP_B0 0x02 // TODO: DCN31 - update with correct B0 ID +#define YELLOW_CARP_B0 0x1A #define YELLOW_CARP_UNKNOWN 0xFF #ifndef ASICREV_IS_YELLOW_CARP diff --git a/drivers/gpu/drm/amd/include/asic_reg/dpcs/dpcs_4_2_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/dpcs/dpcs_4_2_0_offset.h index 92caf8441d..01a56556cd 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/dpcs/dpcs_4_2_0_offset.h +++ b/drivers/gpu/drm/amd/include/asic_reg/dpcs/dpcs_4_2_0_offset.h @@ -11932,5 +11932,32 @@ #define ixDPCSSYS_CR4_RAWLANEX_DIG_PCS_XF_RX_OVRD_OUT_2 0xe0c7 #define ixDPCSSYS_CR4_RAWLANEX_DIG_PCS_XF_TX_OVRD_IN_2 0xe0c8 +//RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6 +#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DP4__SHIFT 0x10 +#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE__SHIFT 0x11 +#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_ACK__SHIFT 0x12 +#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DP4_MASK 0x00010000L +#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_MASK 0x00020000L +#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_ACK_MASK 0x00040000L + +//RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6 +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DP4__SHIFT 0x10 +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE__SHIFT 0x11 +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_ACK__SHIFT 0x12 +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DP4_MASK 0x00010000L +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_MASK 0x00020000L +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_ACK_MASK 0x00040000L + +//[Note] Hack. RDPCSPIPE only has 2 instances. 
+#define regRDPCSPIPE0_RDPCSPIPE_PHY_CNTL6 0x2d73 +#define regRDPCSPIPE0_RDPCSPIPE_PHY_CNTL6_BASE_IDX 2 +#define regRDPCSPIPE1_RDPCSPIPE_PHY_CNTL6 0x2e4b +#define regRDPCSPIPE1_RDPCSPIPE_PHY_CNTL6_BASE_IDX 2 +#define regRDPCSPIPE2_RDPCSPIPE_PHY_CNTL6 0x2d73 +#define regRDPCSPIPE2_RDPCSPIPE_PHY_CNTL6_BASE_IDX 2 +#define regRDPCSPIPE3_RDPCSPIPE_PHY_CNTL6 0x2e4b +#define regRDPCSPIPE3_RDPCSPIPE_PHY_CNTL6_BASE_IDX 2 +#define regRDPCSPIPE4_RDPCSPIPE_PHY_CNTL6 0x2d73 +#define regRDPCSPIPE4_RDPCSPIPE_PHY_CNTL6_BASE_IDX 2 #endif diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 6325877c5f..ea9a79bc95 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -1834,11 +1834,20 @@ static void connector_bad_edid(struct drm_connector *connector, u8 *edid, int num_blocks) { int i; - u8 num_of_ext = edid[0x7e]; + u8 last_block; + + /* + * 0x7e in the EDID is the number of extension blocks. The EDID + * is 1 (base block) + num_ext_blocks big. That means we can think + * of 0x7e in the EDID as the _index_ of the last block in the + * combined chunk of memory. + */ + last_block = edid[0x7e]; /* Calculate real checksum for the last edid extension block data */ - connector->real_edid_checksum = - drm_edid_block_checksum(edid + num_of_ext * EDID_LENGTH); + if (last_block < num_blocks) + connector->real_edid_checksum = + drm_edid_block_checksum(edid + last_block * EDID_LENGTH); if (connector->bad_edid_counter++ && !drm_debug_enabled(DRM_UT_KMS)) return; diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 3ab0783210..8e7a124d6c 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -1506,6 +1506,7 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper, { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; + struct drm_mode_config *config = &dev->mode_config; int ret = 0; int crtc_count = 0; struct drm_connector_list_iter conn_iter; @@ -1663,6 +1664,11 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper, /* Handle our overallocation */ sizes.surface_height *= drm_fbdev_overalloc; sizes.surface_height /= 100; + if (sizes.surface_height > config->max_height) { + drm_dbg_kms(dev, "Fbdev over-allocation too large; clamping height to %d\n", + config->max_height); + sizes.surface_height = config->max_height; + } /* push down into drivers */ ret = (*fb_helper->funcs->fb_probe)(fb_helper, &sizes); diff --git a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c index 9870c4e6af..b5001db7a9 100644 --- a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c +++ b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c @@ -793,7 +793,6 @@ static int exynos5433_decon_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct decon_context *ctx; - struct resource *res; int ret; int i; @@ -818,8 +817,7 @@ static int exynos5433_decon_probe(struct platform_device *pdev) ctx->clks[i] = clk; } - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - ctx->addr = devm_ioremap_resource(dev, res); + ctx->addr = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(ctx->addr)) return PTR_ERR(ctx->addr); diff --git a/drivers/gpu/drm/exynos/exynos_drm_dsi.c b/drivers/gpu/drm/exynos/exynos_drm_dsi.c index e39fac889e..8d13785781 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_dsi.c +++ b/drivers/gpu/drm/exynos/exynos_drm_dsi.c @@ -1738,7 +1738,6 @@ static const struct component_ops exynos_dsi_component_ops = {
static int exynos_dsi_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct resource *res; struct exynos_dsi *dsi; int ret, i; @@ -1789,8 +1788,7 @@ } } - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - dsi->reg_base = devm_ioremap_resource(dev, res); + dsi->reg_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(dsi->reg_base)) return PTR_ERR(dsi->reg_base); diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimc.c b/drivers/gpu/drm/exynos/exynos_drm_fimc.c index a3c718148c..ecfd82d0af 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimc.c @@ -85,7 +85,6 @@ struct fimc_scaler { /* * A structure of fimc context. * - * @regs_res: register resources. * @regs: memory mapped io registers. * @lock: locking of operations. * @clocks: fimc clocks. @@ -103,7 +102,6 @@ struct fimc_context { struct exynos_drm_ipp_formats *formats; unsigned int num_formats; - struct resource *regs_res; void __iomem *regs; spinlock_t lock; struct clk *clocks[FIMC_CLKS_MAX]; @@ -1327,8 +1325,7 @@ static int fimc_probe(struct platform_device *pdev) ctx->num_formats = num_formats; /* resource memory */ - ctx->regs_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - ctx->regs = devm_ioremap_resource(dev, ctx->regs_res); + ctx->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(ctx->regs)) return PTR_ERR(ctx->regs); diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimd.c b/drivers/gpu/drm/exynos/exynos_drm_fimd.c index 700ca4fa66..c735e53939 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimd.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimd.c @@ -1202,9 +1202,7 @@ static int fimd_probe(struct platform_device *pdev) return PTR_ERR(ctx->lcd_clk); } - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - - ctx->regs = devm_ioremap_resource(dev, res); + ctx->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(ctx->regs)) return PTR_ERR(ctx->regs); diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index b00230626c..471fd6c813 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -1449,7 +1449,6 @@ static const struct component_ops g2d_component_ops = { static int g2d_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct resource *res; struct g2d_data *g2d; int ret; @@ -1491,9 +1490,7 @@ clear_bit(G2D_BIT_SUSPEND_RUNQUEUE, &g2d->flags); clear_bit(G2D_BIT_ENGINE_BUSY, &g2d->flags); - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - - g2d->regs = devm_ioremap_resource(dev, res); + g2d->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(g2d->regs)) { ret = PTR_ERR(g2d->regs); goto err_put_clk; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gsc.c b/drivers/gpu/drm/exynos/exynos_drm_gsc.c index 90d7bf9068..166a802628 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gsc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gsc.c @@ -86,7 +86,6 @@ struct gsc_scaler { /* * A structure of gsc context. * - * @regs_res: register resources. * @regs: memory mapped io registers. * @gsc_clk: gsc gate clock. * @sc: scaler information.
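The Exynos conversions in this patch (decon, dsi, fimc and g2d above; gsc, rotator, scaler and hdmi in the hunks that follow) all share one shape: the explicit platform_get_resource() + devm_ioremap_resource() pair collapses into a single devm_platform_ioremap_resource() call with the same error semantics. A minimal probe() sketch of the resulting pattern, using hypothetical foo_* names that are not taken from this patch:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/io.h>
#include <linux/platform_device.h>

struct foo_ctx {
	void __iomem *regs;	/* memory mapped io registers */
};

static int foo_probe(struct platform_device *pdev)
{
	struct foo_ctx *ctx;

	ctx = devm_kzalloc(&pdev->dev, sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	/*
	 * Replaces the two-step form:
	 *   res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	 *   ctx->regs = devm_ioremap_resource(&pdev->dev, res);
	 * The helper fetches MEM resource 0 and maps it in one call.
	 */
	ctx->regs = devm_platform_ioremap_resource(pdev, 0);
	if (IS_ERR(ctx->regs))
		return PTR_ERR(ctx->regs);

	return 0;
}

Beyond dropping the struct resource local, the helper keeps the ERR_PTR-based error reporting, so callers still need only the IS_ERR()/PTR_ERR() check shown in each hunk.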
@@ -103,7 +102,6 @@ struct gsc_context { struct exynos_drm_ipp_formats *formats; unsigned int num_formats; - struct resource *regs_res; void __iomem *regs; const char **clk_names; struct clk *clocks[GSC_MAX_CLOCKS]; @@ -1272,9 +1270,7 @@ static int gsc_probe(struct platform_device *pdev) } } - /* resource memory */ - ctx->regs_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - ctx->regs = devm_ioremap_resource(dev, ctx->regs_res); + ctx->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(ctx->regs)) return PTR_ERR(ctx->regs); diff --git a/drivers/gpu/drm/exynos/exynos_drm_rotator.c b/drivers/gpu/drm/exynos/exynos_drm_rotator.c index ee61be4cf1..dec7df35ba 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_rotator.c +++ b/drivers/gpu/drm/exynos/exynos_drm_rotator.c @@ -278,7 +278,6 @@ static const struct component_ops rotator_component_ops = { static int rotator_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct resource *regs_res; struct rot_context *rot; const struct rot_variant *variant; int irq; @@ -292,8 +291,7 @@ static int rotator_probe(struct platform_device *pdev) rot->formats = variant->formats; rot->num_formats = variant->num_formats; rot->dev = dev; - regs_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - rot->regs = devm_ioremap_resource(dev, regs_res); + rot->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(rot->regs)) return PTR_ERR(rot->regs); diff --git a/drivers/gpu/drm/exynos/exynos_drm_scaler.c b/drivers/gpu/drm/exynos/exynos_drm_scaler.c index f9ae5b038d..3a7851b7dc 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_scaler.c +++ b/drivers/gpu/drm/exynos/exynos_drm_scaler.c @@ -485,7 +485,6 @@ static const struct component_ops scaler_component_ops = { static int scaler_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct resource *regs_res; struct scaler_context *scaler; int irq; int ret, i; @@ -498,8 +497,7 @@ static int scaler_probe(struct platform_device *pdev) (struct scaler_data *)of_device_get_match_data(dev); scaler->dev = dev; - regs_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - scaler->regs = devm_ioremap_resource(dev, regs_res); + scaler->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(scaler->regs)) return PTR_ERR(scaler->regs); diff --git a/drivers/gpu/drm/exynos/exynos_hdmi.c b/drivers/gpu/drm/exynos/exynos_hdmi.c index c769dec576..7655142a46 100644 --- a/drivers/gpu/drm/exynos/exynos_hdmi.c +++ b/drivers/gpu/drm/exynos/exynos_hdmi.c @@ -1957,7 +1957,6 @@ static int hdmi_probe(struct platform_device *pdev) struct hdmi_audio_infoframe *audio_infoframe; struct device *dev = &pdev->dev; struct hdmi_context *hdata; - struct resource *res; int ret; hdata = devm_kzalloc(dev, sizeof(struct hdmi_context), GFP_KERNEL); @@ -1979,8 +1978,7 @@ static int hdmi_probe(struct platform_device *pdev) return ret; } - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - hdata->regs = devm_ioremap_resource(dev, res); + hdata->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(hdata->regs)) { ret = PTR_ERR(hdata->regs); return ret; diff --git a/drivers/gpu/drm/hyperv/hyperv_drm.h b/drivers/gpu/drm/hyperv/hyperv_drm.h index 886add4f9c..d2d8582b36 100644 --- a/drivers/gpu/drm/hyperv/hyperv_drm.h +++ b/drivers/gpu/drm/hyperv/hyperv_drm.h @@ -46,6 +46,7 @@ int hyperv_mode_config_init(struct hyperv_drm_device *hv); int hyperv_update_vram_location(struct hv_device *hdev, phys_addr_t vram_pp); int hyperv_update_situation(struct hv_device *hdev, u8 active, u32 bpp, u32 w, u32 h, 
u32 pitch); +int hyperv_hide_hw_ptr(struct hv_device *hdev); int hyperv_update_dirt(struct hv_device *hdev, struct drm_rect *rect); int hyperv_connect_vsp(struct hv_device *hdev); diff --git a/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c b/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c index 6dd4717d3e..8c97a20dfe 100644 --- a/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c +++ b/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c @@ -101,6 +101,7 @@ static void hyperv_pipe_enable(struct drm_simple_display_pipe *pipe, struct hyperv_drm_device *hv = to_hv(pipe->crtc.dev); struct drm_shadow_plane_state *shadow_plane_state = to_drm_shadow_plane_state(plane_state); + hyperv_hide_hw_ptr(hv->hdev); hyperv_update_situation(hv->hdev, 1, hv->screen_depth, crtc_state->mode.hdisplay, crtc_state->mode.vdisplay, diff --git a/drivers/gpu/drm/hyperv/hyperv_drm_proto.c b/drivers/gpu/drm/hyperv/hyperv_drm_proto.c index 6d4bdccfbd..c0155c6271 100644 --- a/drivers/gpu/drm/hyperv/hyperv_drm_proto.c +++ b/drivers/gpu/drm/hyperv/hyperv_drm_proto.c @@ -299,6 +299,55 @@ int hyperv_update_situation(struct hv_device *hdev, u8 active, u32 bpp, return 0; } +/* + * Hyper-V supports a hardware cursor feature. It's not used by Linux VM, + * but the Hyper-V host still draws a point as an extra mouse pointer, + * which is unwanted, especially when Xorg is running. + * + * The hyperv_fb driver uses synthvid_send_ptr() to hide the unwanted + * pointer, by setting msg.ptr_pos.is_visible = 1 and setting the + * msg.ptr_shape.data. Note: setting msg.ptr_pos.is_visible to 0 doesn't + * work in tests. + * + * Copy synthvid_send_ptr() to hyperv_drm and rename it to + * hyperv_hide_hw_ptr(). Note: hyperv_hide_hw_ptr() is also called in the + * handler of the SYNTHVID_FEATURE_CHANGE event, otherwise the host still + * draws an extra unwanted mouse pointer after the VM Connection window is + * closed and reopened. 
+ */ +int hyperv_hide_hw_ptr(struct hv_device *hdev) +{ + struct synthvid_msg msg; + + memset(&msg, 0, sizeof(struct synthvid_msg)); + msg.vid_hdr.type = SYNTHVID_POINTER_POSITION; + msg.vid_hdr.size = sizeof(struct synthvid_msg_hdr) + + sizeof(struct synthvid_pointer_position); + msg.ptr_pos.is_visible = 1; + msg.ptr_pos.video_output = 0; + msg.ptr_pos.image_x = 0; + msg.ptr_pos.image_y = 0; + hyperv_sendpacket(hdev, &msg); + + memset(&msg, 0, sizeof(struct synthvid_msg)); + msg.vid_hdr.type = SYNTHVID_POINTER_SHAPE; + msg.vid_hdr.size = sizeof(struct synthvid_msg_hdr) + + sizeof(struct synthvid_pointer_shape); + msg.ptr_shape.part_idx = SYNTHVID_CURSOR_COMPLETE; + msg.ptr_shape.is_argb = 1; + msg.ptr_shape.width = 1; + msg.ptr_shape.height = 1; + msg.ptr_shape.hot_x = 0; + msg.ptr_shape.hot_y = 0; + msg.ptr_shape.data[0] = 0; + msg.ptr_shape.data[1] = 1; + msg.ptr_shape.data[2] = 1; + msg.ptr_shape.data[3] = 1; + hyperv_sendpacket(hdev, &msg); + + return 0; +} + int hyperv_update_dirt(struct hv_device *hdev, struct drm_rect *rect) { struct hyperv_drm_device *hv = hv_get_drvdata(hdev); @@ -392,8 +441,11 @@ static void hyperv_receive_sub(struct hv_device *hdev) return; } - if (msg->vid_hdr.type == SYNTHVID_FEATURE_CHANGE) + if (msg->vid_hdr.type == SYNTHVID_FEATURE_CHANGE) { hv->dirt_needed = msg->feature_chg.is_dirt_needed; + if (hv->dirt_needed) + hyperv_hide_hw_ptr(hv->hdev); + } } static void hyperv_receive(void *ctx) diff --git a/drivers/gpu/drm/i915/display/icl_dsi.c b/drivers/gpu/drm/i915/display/icl_dsi.c index 43ec7fcd3f..a3eae3f3ea 100644 --- a/drivers/gpu/drm/i915/display/icl_dsi.c +++ b/drivers/gpu/drm/i915/display/icl_dsi.c @@ -1577,8 +1577,14 @@ static void gen11_dsi_sync_state(struct intel_encoder *encoder, const struct intel_crtc_state *crtc_state) { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); - struct intel_crtc *intel_crtc = to_intel_crtc(crtc_state->uapi.crtc); - enum pipe pipe = intel_crtc->pipe; + struct intel_crtc *intel_crtc; + enum pipe pipe; + + if (!crtc_state) + return; + + intel_crtc = to_intel_crtc(crtc_state->uapi.crtc); + pipe = intel_crtc->pipe; /* wa verify 1409054076:icl,jsl,ehl */ if (DISPLAY_VER(dev_priv) == 11 && pipe == PIPE_B && diff --git a/drivers/gpu/drm/i915/display/intel_acpi.c b/drivers/gpu/drm/i915/display/intel_acpi.c index 7cfe91fc05..68abeaf2d7 100644 --- a/drivers/gpu/drm/i915/display/intel_acpi.c +++ b/drivers/gpu/drm/i915/display/intel_acpi.c @@ -186,13 +186,16 @@ void intel_dsm_get_bios_data_funcs_supported(struct drm_i915_private *i915) { struct pci_dev *pdev = to_pci_dev(i915->drm.dev); acpi_handle dhandle; + union acpi_object *obj; dhandle = ACPI_HANDLE(&pdev->dev); if (!dhandle) return; - acpi_evaluate_dsm(dhandle, &intel_dsm_guid2, INTEL_DSM_REVISION_ID, - INTEL_DSM_FN_GET_BIOS_DATA_FUNCS_SUPPORTED, NULL); + obj = acpi_evaluate_dsm(dhandle, &intel_dsm_guid2, INTEL_DSM_REVISION_ID, + INTEL_DSM_FN_GET_BIOS_DATA_FUNCS_SUPPORTED, NULL); + if (obj) + ACPI_FREE(obj); } /* diff --git a/drivers/gpu/drm/i915/display/intel_audio.c b/drivers/gpu/drm/i915/display/intel_audio.c index 5322375885..4e0f96bf61 100644 --- a/drivers/gpu/drm/i915/display/intel_audio.c +++ b/drivers/gpu/drm/i915/display/intel_audio.c @@ -1308,8 +1308,9 @@ static void i915_audio_component_init(struct drm_i915_private *dev_priv) else aud_freq = aud_freq_init; - /* use BIOS provided value for TGL unless it is a known bad value */ - if (IS_TIGERLAKE(dev_priv) && aud_freq_init != AUD_FREQ_TGL_BROKEN) + /* use BIOS provided value for TGL and RKL unless it 
is a known bad value */ + if ((IS_TIGERLAKE(dev_priv) || IS_ROCKETLAKE(dev_priv)) && + aud_freq_init != AUD_FREQ_TGL_BROKEN) aud_freq = aud_freq_init; drm_dbg_kms(&dev_priv->drm, "use AUD_FREQ_CNTRL of 0x%x (init value 0x%x)\n", diff --git a/drivers/gpu/drm/i915/display/intel_bios.c b/drivers/gpu/drm/i915/display/intel_bios.c index e86e6ed2d3..fd71346aac 100644 --- a/drivers/gpu/drm/i915/display/intel_bios.c +++ b/drivers/gpu/drm/i915/display/intel_bios.c @@ -451,13 +451,23 @@ parse_lfp_backlight(struct drm_i915_private *i915, } i915->vbt.backlight.type = INTEL_BACKLIGHT_DISPLAY_DDI; - if (bdb->version >= 191 && - get_blocksize(backlight_data) >= sizeof(*backlight_data)) { - const struct lfp_backlight_control_method *method; + if (bdb->version >= 191) { + size_t exp_size; - method = &backlight_data->backlight_control[panel_type]; - i915->vbt.backlight.type = method->type; - i915->vbt.backlight.controller = method->controller; + if (bdb->version >= 236) + exp_size = sizeof(struct bdb_lfp_backlight_data); + else if (bdb->version >= 234) + exp_size = EXP_BDB_LFP_BL_DATA_SIZE_REV_234; + else + exp_size = EXP_BDB_LFP_BL_DATA_SIZE_REV_191; + + if (get_blocksize(backlight_data) >= exp_size) { + const struct lfp_backlight_control_method *method; + + method = &backlight_data->backlight_control[panel_type]; + i915->vbt.backlight.type = method->type; + i915->vbt.backlight.controller = method->controller; + } } i915->vbt.backlight.pwm_freq_hz = entry->pwm_freq_hz; diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 9903a78df8..bd184325d0 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -3807,7 +3807,13 @@ void hsw_ddi_get_config(struct intel_encoder *encoder, static void intel_ddi_sync_state(struct intel_encoder *encoder, const struct intel_crtc_state *crtc_state) { - if (intel_crtc_has_dp_encoder(crtc_state)) + struct drm_i915_private *i915 = to_i915(encoder->base.dev); + enum phy phy = intel_port_to_phy(i915, encoder->port); + + if (intel_phy_is_tc(i915, phy)) + intel_tc_port_sanitize(enc_to_dig_port(encoder)); + + if (crtc_state && intel_crtc_has_dp_encoder(crtc_state)) intel_dp_sync_state(encoder, crtc_state); } diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 134a6acbd8..17f44ffea5 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -13082,18 +13082,16 @@ static void intel_modeset_readout_hw_state(struct drm_device *dev) readout_plane_state(dev_priv); for_each_intel_encoder(dev, encoder) { + struct intel_crtc_state *crtc_state = NULL; + pipe = 0; if (encoder->get_hw_state(encoder, &pipe)) { - struct intel_crtc_state *crtc_state; - crtc = intel_get_crtc_for_pipe(dev_priv, pipe); crtc_state = to_intel_crtc_state(crtc->base.state); encoder->base.crtc = &crtc->base; intel_encoder_get_config(encoder, crtc_state); - if (encoder->sync_state) - encoder->sync_state(encoder, crtc_state); /* read out to slave crtc as well for bigjoiner */ if (crtc_state->bigjoiner) { @@ -13108,6 +13106,9 @@ static void intel_modeset_readout_hw_state(struct drm_device *dev) encoder->base.crtc = NULL; } + if (encoder->sync_state) + encoder->sync_state(encoder, crtc_state); + drm_dbg_kms(&dev_priv->drm, "[ENCODER:%d:%s] hw state readout: %s, pipe %c\n", encoder->base.base.id, encoder->base.name, @@ -13390,17 +13391,6 @@ intel_modeset_setup_hw_state(struct drm_device *dev, 
intel_modeset_readout_hw_state(dev); /* HW state is read out, now we need to sanitize this mess. */ - - /* Sanitize the TypeC port mode upfront, encoders depend on this */ - for_each_intel_encoder(dev, encoder) { - enum phy phy = intel_port_to_phy(dev_priv, encoder->port); - - /* We need to sanitize only the MST primary port. */ - if (encoder->type != INTEL_OUTPUT_DP_MST && - intel_phy_is_tc(dev_priv, phy)) - intel_tc_port_sanitize(enc_to_dig_port(encoder)); - } - get_encoder_power_domains(dev_priv); if (HAS_PCH_IBX(dev_priv)) diff --git a/drivers/gpu/drm/i915/display/intel_vbt_defs.h b/drivers/gpu/drm/i915/display/intel_vbt_defs.h index 330077c2e5..a2108a8f54 100644 --- a/drivers/gpu/drm/i915/display/intel_vbt_defs.h +++ b/drivers/gpu/drm/i915/display/intel_vbt_defs.h @@ -814,6 +814,11 @@ struct lfp_brightness_level { u16 reserved; } __packed; +#define EXP_BDB_LFP_BL_DATA_SIZE_REV_191 \ + offsetof(struct bdb_lfp_backlight_data, brightness_level) +#define EXP_BDB_LFP_BL_DATA_SIZE_REV_234 \ + offsetof(struct bdb_lfp_backlight_data, brightness_precision_bits) + struct bdb_lfp_backlight_data { u8 entry_size; struct lfp_backlight_data_entry data[16]; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 9ccf4b29b8..166bb46408 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -937,6 +937,10 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, unsigned int n; e = alloc_engines(num_engines); + if (!e) + return ERR_PTR(-ENOMEM); + e->num_engines = num_engines; + for (n = 0; n < num_engines; n++) { struct intel_context *ce; int ret; @@ -970,7 +974,6 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, goto free_engines; } } - e->num_engines = num_engines; return e; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c index e382b7f235..5ab136ffde 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c @@ -118,7 +118,7 @@ i915_gem_shrink(struct i915_gem_ww_ctx *ww, intel_wakeref_t wakeref = 0; unsigned long count = 0; unsigned long scanned = 0; - int err; + int err = 0; /* CHV + VTD workaround use stop_machine(); need to trylock vm->mutex */ bool trylock_vm = !ww && intel_vm_no_concurrent_access_wa(i915); @@ -242,12 +242,15 @@ i915_gem_shrink(struct i915_gem_ww_ctx *ww, list_splice_tail(&still_in_list, phase->list); spin_unlock_irqrestore(&i915->mm.obj_lock, flags); if (err) - return err; + break; } if (shrink & I915_SHRINK_BOUND) intel_runtime_pm_put(&i915->runtime_pm, wakeref); + if (err) + return err; + if (nr_scanned) *nr_scanned += scanned; return count; diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 745e84c72c..17ca4dc4d0 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -362,8 +362,9 @@ static int __intel_context_active(struct i915_active *active) return 0; } -static int sw_fence_dummy_notify(struct i915_sw_fence *sf, - enum i915_sw_fence_notify state) +static int __i915_sw_fence_call +sw_fence_dummy_notify(struct i915_sw_fence *sf, + enum i915_sw_fence_notify state) { return NOTIFY_DONE; } @@ -420,6 +421,7 @@ void intel_context_fini(struct intel_context *ce) mutex_destroy(&ce->pin_mutex); i915_active_fini(&ce->active); + i915_sw_fence_fini(&ce->guc_blocked); } void i915_context_module_exit(void) diff --git 
a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c index 591a522428..0a03fbed9f 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.c +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -882,8 +882,6 @@ void intel_rps_park(struct intel_rps *rps) if (!intel_rps_is_enabled(rps)) return; - GEM_BUG_ON(atomic_read(&rps->num_waiters)); - if (!intel_rps_clear_active(rps)) return; diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index b56a8e37a3..1bb1be5c48 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -576,7 +576,7 @@ static int prepare_shadow_batch_buffer(struct intel_vgpu_workload *workload) /* No one is going to touch shadow bb from now on. */ i915_gem_object_flush_map(bb->obj); - i915_gem_object_unlock(bb->obj); + i915_gem_ww_ctx_fini(&ww); } } return 0; @@ -630,7 +630,7 @@ static int prepare_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx) return ret; } - i915_gem_object_unlock(wa_ctx->indirect_ctx.obj); + i915_gem_ww_ctx_fini(&ww); /* FIXME: we are not tracking our pinned VMA leaving it * up to the core to fix up the stray pin_count upon diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 664970f2bc..4037030f09 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -8193,6 +8193,11 @@ enum { #define HSW_SPR_STRETCH_MAX_X1 REG_FIELD_PREP(HSW_SPR_STRETCH_MAX_MASK, 3) #define HSW_FBCQ_DIS (1 << 22) #define BDW_DPRS_MASK_VBLANK_SRD (1 << 0) +#define SKL_PLANE1_STRETCH_MAX_MASK REG_GENMASK(1, 0) +#define SKL_PLANE1_STRETCH_MAX_X8 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 0) +#define SKL_PLANE1_STRETCH_MAX_X4 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 1) +#define SKL_PLANE1_STRETCH_MAX_X2 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 2) +#define SKL_PLANE1_STRETCH_MAX_X1 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 3) #define CHICKEN_PIPESL_1(pipe) _MMIO_PIPE(pipe, _CHICKEN_PIPESL_1_A, _CHICKEN_PIPESL_1_B) #define _CHICKEN_TRANS_A 0x420c0 diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index ce446716d0..79da5eca60 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -829,8 +829,6 @@ static void __i915_request_ctor(void *arg) i915_sw_fence_init(&rq->submit, submit_notify); i915_sw_fence_init(&rq->semaphore, semaphore_notify); - dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, 0, 0); - rq->capture_list = NULL; init_llist_head(&rq->execute_cb); @@ -905,17 +903,12 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) rq->ring = ce->ring; rq->execution_mask = ce->engine->mask; - kref_init(&rq->fence.refcount); - rq->fence.flags = 0; - rq->fence.error = 0; - INIT_LIST_HEAD(&rq->fence.cb_list); - ret = intel_timeline_get_seqno(tl, rq, &seqno); if (ret) goto err_free; - rq->fence.context = tl->fence_context; - rq->fence.seqno = seqno; + dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, + tl->fence_context, seqno); RCU_INIT_POINTER(rq->timeline, tl); rq->hwsp_seqno = tl->hwsp_seqno; diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 65bc3709f5..a725792d52 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -76,6 +76,8 @@ struct intel_wm_config { static void gen9_init_clock_gating(struct drm_i915_private *dev_priv) { + enum pipe pipe; + if (HAS_LLC(dev_priv)) { /* * WaCompressedResourceDisplayNewHashMode:skl,kbl @@ -89,6 +91,16 @@ static void gen9_init_clock_gating(struct 
drm_i915_private *dev_priv) SKL_DE_COMPRESSED_HASH_MODE); } + for_each_pipe(dev_priv, pipe) { + /* + * "Plane N stretch max must be programmed to 11b (x1) + * when Async flips are enabled on that plane." + */ + if (!IS_GEMINILAKE(dev_priv) && intel_vtd_active()) + intel_uncore_rmw(&dev_priv->uncore, CHICKEN_PIPESL_1(pipe), + SKL_PLANE1_STRETCH_MAX_MASK, SKL_PLANE1_STRETCH_MAX_X1); + } + /* See Bspec note for PSR2_CTL bit 31, Wa#828:skl,bxt,kbl,cfl */ intel_uncore_write(&dev_priv->uncore, CHICKEN_PAR1_1, intel_uncore_read(&dev_priv->uncore, CHICKEN_PAR1_1) | SKL_EDP_PSR_FIX_RDWRAP); diff --git a/drivers/gpu/drm/kmb/kmb_drv.c b/drivers/gpu/drm/kmb/kmb_drv.c index 1c2f4799f4..12ce669650 100644 --- a/drivers/gpu/drm/kmb/kmb_drv.c +++ b/drivers/gpu/drm/kmb/kmb_drv.c @@ -172,10 +172,10 @@ static int kmb_setup_mode_config(struct drm_device *drm) ret = drmm_mode_config_init(drm); if (ret) return ret; - drm->mode_config.min_width = KMB_MIN_WIDTH; - drm->mode_config.min_height = KMB_MIN_HEIGHT; - drm->mode_config.max_width = KMB_MAX_WIDTH; - drm->mode_config.max_height = KMB_MAX_HEIGHT; + drm->mode_config.min_width = KMB_FB_MIN_WIDTH; + drm->mode_config.min_height = KMB_FB_MIN_HEIGHT; + drm->mode_config.max_width = KMB_FB_MAX_WIDTH; + drm->mode_config.max_height = KMB_FB_MAX_HEIGHT; drm->mode_config.funcs = &kmb_mode_config_funcs; ret = kmb_setup_crtc(drm); diff --git a/drivers/gpu/drm/kmb/kmb_drv.h b/drivers/gpu/drm/kmb/kmb_drv.h index ebbaa5f422..69a62e2d03 100644 --- a/drivers/gpu/drm/kmb/kmb_drv.h +++ b/drivers/gpu/drm/kmb/kmb_drv.h @@ -20,6 +20,11 @@ #define DRIVER_MAJOR 1 #define DRIVER_MINOR 1 +#define KMB_FB_MAX_WIDTH 1920 +#define KMB_FB_MAX_HEIGHT 1080 +#define KMB_FB_MIN_WIDTH 1 +#define KMB_FB_MIN_HEIGHT 1 + #define KMB_LCD_DEFAULT_CLK 200000000 #define KMB_SYS_CLK_MHZ 500 diff --git a/drivers/gpu/drm/kmb/kmb_plane.c b/drivers/gpu/drm/kmb/kmb_plane.c index ecee678261..06b0c42c9e 100644 --- a/drivers/gpu/drm/kmb/kmb_plane.c +++ b/drivers/gpu/drm/kmb/kmb_plane.c @@ -94,9 +94,10 @@ static int kmb_plane_atomic_check(struct drm_plane *plane, if (ret) return ret; - if (new_plane_state->crtc_w > KMB_MAX_WIDTH || new_plane_state->crtc_h > KMB_MAX_HEIGHT) - return -EINVAL; - if (new_plane_state->crtc_w < KMB_MIN_WIDTH || new_plane_state->crtc_h < KMB_MIN_HEIGHT) + if (new_plane_state->crtc_w > KMB_FB_MAX_WIDTH || + new_plane_state->crtc_h > KMB_FB_MAX_HEIGHT || + new_plane_state->crtc_w < KMB_FB_MIN_WIDTH || + new_plane_state->crtc_h < KMB_FB_MIN_HEIGHT) return -EINVAL; can_position = (plane->type == DRM_PLANE_TYPE_OVERLAY); crtc_state = @@ -277,6 +278,44 @@ static void config_csc(struct kmb_drm_private *kmb, int plane_id) kmb_write_lcd(kmb, LCD_LAYERn_CSC_OFF3(plane_id), csc_coef_lcd[11]); } +static void kmb_plane_set_alpha(struct kmb_drm_private *kmb, + const struct drm_plane_state *state, + unsigned char plane_id, + unsigned int *val) +{ + u16 plane_alpha = state->alpha; + u16 pixel_blend_mode = state->pixel_blend_mode; + int has_alpha = state->fb->format->has_alpha; + + if (plane_alpha != DRM_BLEND_ALPHA_OPAQUE) + *val |= LCD_LAYER_ALPHA_STATIC; + + if (has_alpha) { + switch (pixel_blend_mode) { + case DRM_MODE_BLEND_PIXEL_NONE: + break; + case DRM_MODE_BLEND_PREMULTI: + *val |= LCD_LAYER_ALPHA_EMBED | LCD_LAYER_ALPHA_PREMULT; + break; + case DRM_MODE_BLEND_COVERAGE: + *val |= LCD_LAYER_ALPHA_EMBED; + break; + default: + DRM_DEBUG("Missing pixel blend mode case (%s == %ld)\n", + __stringify(pixel_blend_mode), + (long)pixel_blend_mode); + break; + } + } + + if (plane_alpha ==
DRM_BLEND_ALPHA_OPAQUE && !has_alpha) { + *val &= LCD_LAYER_ALPHA_DISABLED; + return; + } + + kmb_write_lcd(kmb, LCD_LAYERn_ALPHA(plane_id), plane_alpha); +} + static void kmb_plane_atomic_update(struct drm_plane *plane, struct drm_atomic_state *state) { @@ -303,11 +342,12 @@ static void kmb_plane_atomic_update(struct drm_plane *plane, fb = new_plane_state->fb; if (!fb) return; + num_planes = fb->format->num_planes; kmb_plane = to_kmb_plane(plane); - plane_id = kmb_plane->id; kmb = to_kmb(plane->dev); + plane_id = kmb_plane->id; spin_lock_irq(&kmb->irq_lock); if (kmb->kmb_under_flow || kmb->kmb_flush_done) { @@ -400,20 +440,32 @@ static void kmb_plane_atomic_update(struct drm_plane *plane, config_csc(kmb, plane_id); } + kmb_plane_set_alpha(kmb, plane->state, plane_id, &val); + kmb_write_lcd(kmb, LCD_LAYERn_CFG(plane_id), val); + /* Configure LCD_CONTROL */ + ctrl = kmb_read_lcd(kmb, LCD_CONTROL); + + /* Set layer blending config */ + ctrl &= ~LCD_CTRL_ALPHA_ALL; + ctrl |= LCD_CTRL_ALPHA_BOTTOM_VL1 | + LCD_CTRL_ALPHA_BLEND_VL2; + + ctrl &= ~LCD_CTRL_ALPHA_BLEND_BKGND_DISABLE; + switch (plane_id) { case LAYER_0: - ctrl = LCD_CTRL_VL1_ENABLE; + ctrl |= LCD_CTRL_VL1_ENABLE; break; case LAYER_1: - ctrl = LCD_CTRL_VL2_ENABLE; + ctrl |= LCD_CTRL_VL2_ENABLE; break; case LAYER_2: - ctrl = LCD_CTRL_GL1_ENABLE; + ctrl |= LCD_CTRL_GL1_ENABLE; break; case LAYER_3: - ctrl = LCD_CTRL_GL2_ENABLE; + ctrl |= LCD_CTRL_GL2_ENABLE; break; } @@ -425,7 +477,7 @@ static void kmb_plane_atomic_update(struct drm_plane *plane, */ ctrl |= LCD_CTRL_VHSYNC_IDLE_LVL; - kmb_set_bitmask_lcd(kmb, LCD_CONTROL, ctrl); + kmb_write_lcd(kmb, LCD_CONTROL, ctrl); /* Enable pipeline AXI read transactions for the DMA * after setting graphics layers. This must be done @@ -490,6 +542,9 @@ struct kmb_plane *kmb_plane_init(struct drm_device *drm) enum drm_plane_type plane_type; const u32 *plane_formats; int num_plane_formats; + unsigned int blend_caps = BIT(DRM_MODE_BLEND_PIXEL_NONE) | + BIT(DRM_MODE_BLEND_PREMULTI) | + BIT(DRM_MODE_BLEND_COVERAGE); for (i = 0; i < KMB_MAX_PLANES; i++) { plane = drmm_kzalloc(drm, sizeof(*plane), GFP_KERNEL); @@ -521,8 +576,16 @@ struct kmb_plane *kmb_plane_init(struct drm_device *drm) drm_dbg(drm, "%s : %d i=%d type=%d", __func__, __LINE__, i, plane_type); + drm_plane_create_alpha_property(&plane->base_plane); + + drm_plane_create_blend_mode_property(&plane->base_plane, + blend_caps); + + drm_plane_create_zpos_immutable_property(&plane->base_plane, i); + drm_plane_helper_add(&plane->base_plane, &kmb_plane_helper_funcs); + if (plane_type == DRM_PLANE_TYPE_PRIMARY) { primary = plane; kmb->plane = plane; diff --git a/drivers/gpu/drm/kmb/kmb_plane.h b/drivers/gpu/drm/kmb/kmb_plane.h index 486490f7a3..6e8d22cf88 100644 --- a/drivers/gpu/drm/kmb/kmb_plane.h +++ b/drivers/gpu/drm/kmb/kmb_plane.h @@ -35,6 +35,9 @@ #define POSSIBLE_CRTCS 1 #define to_kmb_plane(x) container_of(x, struct kmb_plane, base_plane) +#define POSSIBLE_CRTCS 1 +#define KMB_MAX_PLANES 2 + enum layer_id { LAYER_0, LAYER_1, @@ -43,8 +46,6 @@ enum layer_id { /* KMB_MAX_PLANES */ }; -#define KMB_MAX_PLANES 1 - enum sub_plane_id { Y_PLANE, U_PLANE, diff --git a/drivers/gpu/drm/kmb/kmb_regs.h b/drivers/gpu/drm/kmb/kmb_regs.h index 48150569f7..9756101b0d 100644 --- a/drivers/gpu/drm/kmb/kmb_regs.h +++ b/drivers/gpu/drm/kmb/kmb_regs.h @@ -43,8 +43,10 @@ #define LCD_CTRL_OUTPUT_ENABLED BIT(19) #define LCD_CTRL_BPORCH_ENABLE BIT(21) #define LCD_CTRL_FPORCH_ENABLE BIT(22) +#define LCD_CTRL_ALPHA_BLEND_BKGND_DISABLE BIT(23) #define 
LCD_CTRL_PIPELINE_DMA BIT(28) #define LCD_CTRL_VHSYNC_IDLE_LVL BIT(31) +#define LCD_CTRL_ALPHA_ALL (0xff << 6) /* interrupts */ #define LCD_INT_STATUS (0x4 * 0x001) @@ -115,6 +117,7 @@ #define LCD_LAYER_ALPHA_EMBED BIT(5) #define LCD_LAYER_ALPHA_COMBI (LCD_LAYER_ALPHA_STATIC | \ LCD_LAYER_ALPHA_EMBED) +#define LCD_LAYER_ALPHA_DISABLED ~(LCD_LAYER_ALPHA_COMBI) /* RGB multiplied with alpha */ #define LCD_LAYER_ALPHA_PREMULT BIT(6) #define LCD_LAYER_INVERT_COL BIT(7) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c index 5f81489fc6..a4e80e4996 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c @@ -4,8 +4,6 @@ */ #include -#include -#include #include #include #include @@ -52,11 +50,8 @@ struct mtk_drm_crtc { bool pending_async_planes; #if IS_REACHABLE(CONFIG_MTK_CMDQ) - struct mbox_client cmdq_cl; - struct mbox_chan *cmdq_chan; - struct cmdq_pkt cmdq_handle; + struct cmdq_client *cmdq_client; u32 cmdq_event; - u32 cmdq_vblank_cnt; #endif struct device *mmsys_dev; @@ -227,79 +222,9 @@ struct mtk_ddp_comp *mtk_drm_ddp_comp_for_plane(struct drm_crtc *crtc, } #if IS_REACHABLE(CONFIG_MTK_CMDQ) -static int mtk_drm_cmdq_pkt_create(struct mbox_chan *chan, struct cmdq_pkt *pkt, - size_t size) +static void ddp_cmdq_cb(struct cmdq_cb_data data) { - struct device *dev; - dma_addr_t dma_addr; - - pkt->va_base = kzalloc(size, GFP_KERNEL); - if (!pkt->va_base) { - kfree(pkt); - return -ENOMEM; - } - pkt->buf_size = size; - - dev = chan->mbox->dev; - dma_addr = dma_map_single(dev, pkt->va_base, pkt->buf_size, - DMA_TO_DEVICE); - if (dma_mapping_error(dev, dma_addr)) { - dev_err(dev, "dma map failed, size=%u\n", (u32)(u64)size); - kfree(pkt->va_base); - kfree(pkt); - return -ENOMEM; - } - - pkt->pa_base = dma_addr; - - return 0; -} - -static void mtk_drm_cmdq_pkt_destroy(struct mbox_chan *chan, struct cmdq_pkt *pkt) -{ - dma_unmap_single(chan->mbox->dev, pkt->pa_base, pkt->buf_size, - DMA_TO_DEVICE); - kfree(pkt->va_base); - kfree(pkt); -} - -static void ddp_cmdq_cb(struct mbox_client *cl, void *mssg) -{ - struct mtk_drm_crtc *mtk_crtc = container_of(cl, struct mtk_drm_crtc, cmdq_cl); - struct cmdq_cb_data *data = mssg; - struct mtk_crtc_state *state; - unsigned int i; - - state = to_mtk_crtc_state(mtk_crtc->base.state); - - state->pending_config = false; - - if (mtk_crtc->pending_planes) { - for (i = 0; i < mtk_crtc->layer_nr; i++) { - struct drm_plane *plane = &mtk_crtc->planes[i]; - struct mtk_plane_state *plane_state; - - plane_state = to_mtk_plane_state(plane->state); - - plane_state->pending.config = false; - } - mtk_crtc->pending_planes = false; - } - - if (mtk_crtc->pending_async_planes) { - for (i = 0; i < mtk_crtc->layer_nr; i++) { - struct drm_plane *plane = &mtk_crtc->planes[i]; - struct mtk_plane_state *plane_state; - - plane_state = to_mtk_plane_state(plane->state); - - plane_state->pending.async_config = false; - } - mtk_crtc->pending_async_planes = false; - } - - mtk_crtc->cmdq_vblank_cnt = 0; - mtk_drm_cmdq_pkt_destroy(mtk_crtc->cmdq_chan, data->pkt); + cmdq_pkt_destroy(data.data); } #endif @@ -453,8 +378,7 @@ static void mtk_crtc_ddp_config(struct drm_crtc *crtc, state->pending_vrefresh, 0, cmdq_handle); - if (!cmdq_handle) - state->pending_config = false; + state->pending_config = false; } if (mtk_crtc->pending_planes) { @@ -474,12 +398,9 @@ static void mtk_crtc_ddp_config(struct drm_crtc *crtc, mtk_ddp_comp_layer_config(comp, local_layer, plane_state, cmdq_handle); - if (!cmdq_handle) - 
plane_state->pending.config = false; + plane_state->pending.config = false; } - - if (!cmdq_handle) - mtk_crtc->pending_planes = false; + mtk_crtc->pending_planes = false; } if (mtk_crtc->pending_async_planes) { @@ -499,12 +420,9 @@ static void mtk_crtc_ddp_config(struct drm_crtc *crtc, mtk_ddp_comp_layer_config(comp, local_layer, plane_state, cmdq_handle); - if (!cmdq_handle) - plane_state->pending.async_config = false; + plane_state->pending.async_config = false; } - - if (!cmdq_handle) - mtk_crtc->pending_async_planes = false; + mtk_crtc->pending_async_planes = false; } } @@ -512,7 +430,7 @@ static void mtk_drm_crtc_update_config(struct mtk_drm_crtc *mtk_crtc, bool needs_vblank) { #if IS_REACHABLE(CONFIG_MTK_CMDQ) - struct cmdq_pkt *cmdq_handle = &mtk_crtc->cmdq_handle; + struct cmdq_pkt *cmdq_handle; #endif struct drm_crtc *crtc = &mtk_crtc->base; struct mtk_drm_private *priv = crtc->dev->dev_private; @@ -550,24 +468,14 @@ static void mtk_drm_crtc_update_config(struct mtk_drm_crtc *mtk_crtc, mtk_mutex_release(mtk_crtc->mutex); } #if IS_REACHABLE(CONFIG_MTK_CMDQ) - if (mtk_crtc->cmdq_chan) { - mbox_flush(mtk_crtc->cmdq_chan, 2000); - cmdq_handle->cmd_buf_size = 0; + if (mtk_crtc->cmdq_client) { + mbox_flush(mtk_crtc->cmdq_client->chan, 2000); + cmdq_handle = cmdq_pkt_create(mtk_crtc->cmdq_client, PAGE_SIZE); cmdq_pkt_clear_event(cmdq_handle, mtk_crtc->cmdq_event); cmdq_pkt_wfe(cmdq_handle, mtk_crtc->cmdq_event, false); mtk_crtc_ddp_config(crtc, cmdq_handle); cmdq_pkt_finalize(cmdq_handle); - dma_sync_single_for_device(mtk_crtc->cmdq_chan->mbox->dev, - cmdq_handle->pa_base, - cmdq_handle->cmd_buf_size, - DMA_TO_DEVICE); - /* - * CMDQ command should execute in next vblank, - * If it fail to execute in next 2 vblank, timeout happen. - */ - mtk_crtc->cmdq_vblank_cnt = 2; - mbox_send_message(mtk_crtc->cmdq_chan, cmdq_handle); - mbox_client_txdone(mtk_crtc->cmdq_chan, 0); + cmdq_pkt_flush_async(cmdq_handle, ddp_cmdq_cb, cmdq_handle); } #endif mtk_crtc->config_updating = false; @@ -581,15 +489,12 @@ static void mtk_crtc_ddp_irq(void *data) struct mtk_drm_private *priv = crtc->dev->dev_private; #if IS_REACHABLE(CONFIG_MTK_CMDQ) - if (!priv->data->shadow_register && !mtk_crtc->cmdq_chan) - mtk_crtc_ddp_config(crtc, NULL); - else if (mtk_crtc->cmdq_vblank_cnt > 0 && --mtk_crtc->cmdq_vblank_cnt == 0) - DRM_ERROR("mtk_crtc %d CMDQ execute command timeout!\n", - drm_crtc_index(&mtk_crtc->base)); + if (!priv->data->shadow_register && !mtk_crtc->cmdq_client) #else if (!priv->data->shadow_register) - mtk_crtc_ddp_config(crtc, NULL); #endif + mtk_crtc_ddp_config(crtc, NULL); + mtk_drm_finish_page_flip(mtk_crtc); } @@ -924,20 +829,16 @@ int mtk_drm_crtc_create(struct drm_device *drm_dev, mutex_init(&mtk_crtc->hw_lock); #if IS_REACHABLE(CONFIG_MTK_CMDQ) - mtk_crtc->cmdq_cl.dev = mtk_crtc->mmsys_dev; - mtk_crtc->cmdq_cl.tx_block = false; - mtk_crtc->cmdq_cl.knows_txdone = true; - mtk_crtc->cmdq_cl.rx_callback = ddp_cmdq_cb; - mtk_crtc->cmdq_chan = - mbox_request_channel(&mtk_crtc->cmdq_cl, - drm_crtc_index(&mtk_crtc->base)); - if (IS_ERR(mtk_crtc->cmdq_chan)) { + mtk_crtc->cmdq_client = + cmdq_mbox_create(mtk_crtc->mmsys_dev, + drm_crtc_index(&mtk_crtc->base)); + if (IS_ERR(mtk_crtc->cmdq_client)) { dev_dbg(dev, "mtk_crtc %d failed to create mailbox client, writing register by CPU now\n", drm_crtc_index(&mtk_crtc->base)); - mtk_crtc->cmdq_chan = NULL; + mtk_crtc->cmdq_client = NULL; } - if (mtk_crtc->cmdq_chan) { + if (mtk_crtc->cmdq_client) { ret = of_property_read_u32_index(priv->mutex_node, 
"mediatek,gce-events", drm_crtc_index(&mtk_crtc->base), @@ -945,18 +846,8 @@ int mtk_drm_crtc_create(struct drm_device *drm_dev, if (ret) { dev_dbg(dev, "mtk_crtc %d failed to get mediatek,gce-events property\n", drm_crtc_index(&mtk_crtc->base)); - mbox_free_channel(mtk_crtc->cmdq_chan); - mtk_crtc->cmdq_chan = NULL; - } else { - ret = mtk_drm_cmdq_pkt_create(mtk_crtc->cmdq_chan, - &mtk_crtc->cmdq_handle, - PAGE_SIZE); - if (ret) { - dev_dbg(dev, "mtk_crtc %d failed to create cmdq packet\n", - drm_crtc_index(&mtk_crtc->base)); - mbox_free_channel(mtk_crtc->cmdq_chan); - mtk_crtc->cmdq_chan = NULL; - } + cmdq_mbox_destroy(mtk_crtc->cmdq_client); + mtk_crtc->cmdq_client = NULL; } } #endif diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig index e9c6af78b1..3ddf739a6f 100644 --- a/drivers/gpu/drm/msm/Kconfig +++ b/drivers/gpu/drm/msm/Kconfig @@ -17,7 +17,7 @@ config DRM_MSM select DRM_SCHED select SHMEM select TMPFS - select QCOM_SCM if ARCH_QCOM + select QCOM_SCM select WANT_DEV_COREDUMP select SND_SOC_HDMI_CODEC if SND_SOC select SYNC_FILE @@ -55,7 +55,7 @@ config DRM_MSM_GPU_SUDO config DRM_MSM_HDMI_HDCP bool "Enable HDMI HDCP support in MSM DRM driver" - depends on DRM_MSM && QCOM_SCM + depends on DRM_MSM default y help Choose this option to enable HDCP state machine diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c index 4534633fe7..8fb847c174 100644 --- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c @@ -571,13 +571,14 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev) } icc_path = devm_of_icc_get(&pdev->dev, "gfx-mem"); - ret = IS_ERR(icc_path); - if (ret) + if (IS_ERR(icc_path)) { + ret = PTR_ERR(icc_path); goto fail; + } ocmem_icc_path = devm_of_icc_get(&pdev->dev, "ocmem"); - ret = IS_ERR(ocmem_icc_path); - if (ret) { + if (IS_ERR(ocmem_icc_path)) { + ret = PTR_ERR(ocmem_icc_path); /* allow -ENODATA, ocmem icc is optional */ if (ret != -ENODATA) goto fail; diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c index 82bebb4023..a96ee79cc5 100644 --- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c @@ -699,13 +699,14 @@ struct msm_gpu *a4xx_gpu_init(struct drm_device *dev) } icc_path = devm_of_icc_get(&pdev->dev, "gfx-mem"); - ret = IS_ERR(icc_path); - if (ret) + if (IS_ERR(icc_path)) { + ret = PTR_ERR(icc_path); goto fail; + } ocmem_icc_path = devm_of_icc_get(&pdev->dev, "ocmem"); - ret = IS_ERR(ocmem_icc_path); - if (ret) { + if (IS_ERR(ocmem_icc_path)) { + ret = PTR_ERR(ocmem_icc_path); /* allow -ENODATA, ocmem icc is optional */ if (ret != -ENODATA) goto fail; diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index a7c5801895..8b73f70766 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -296,6 +296,8 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) u32 val; int request, ack; + WARN_ON_ONCE(!mutex_is_locked(&gmu->lock)); + if (state >= ARRAY_SIZE(a6xx_gmu_oob_bits)) return -EINVAL; @@ -337,6 +339,8 @@ void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) { int bit; + WARN_ON_ONCE(!mutex_is_locked(&gmu->lock)); + if (state >= ARRAY_SIZE(a6xx_gmu_oob_bits)) return; @@ -1482,6 +1486,8 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node) if (!pdev) return -ENODEV; + mutex_init(&gmu->lock); + gmu->dev = &pdev->dev; of_dma_configure(gmu->dev, node, true); 
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h index 3c74f64e31..84bd516f01 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h @@ -44,6 +44,9 @@ struct a6xx_gmu_bo { struct a6xx_gmu { struct device *dev; + /* For serializing communication with the GMU: */ + struct mutex lock; + struct msm_gem_address_space *aspace; void * __iomem mmio; diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 40c9fef457..33da25b816 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -106,7 +106,7 @@ static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu, u32 asid; u64 memptr = rbmemptr(ring, ttbr0); - if (ctx == a6xx_gpu->cur_ctx) + if (ctx->seqno == a6xx_gpu->cur_ctx_seqno) return; if (msm_iommu_pagetable_params(ctx->aspace->mmu, &ttbr, &asid)) @@ -139,7 +139,7 @@ static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu, OUT_PKT7(ring, CP_EVENT_WRITE, 1); OUT_RING(ring, 0x31); - a6xx_gpu->cur_ctx = ctx; + a6xx_gpu->cur_ctx_seqno = ctx->seqno; } static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) @@ -881,7 +881,7 @@ static int a6xx_zap_shader_init(struct msm_gpu *gpu) A6XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS | \ A6XX_RBBM_INT_0_MASK_UCHE_TRAP_INTR) -static int a6xx_hw_init(struct msm_gpu *gpu) +static int hw_init(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); @@ -1081,7 +1081,7 @@ static int a6xx_hw_init(struct msm_gpu *gpu) /* Always come up on rb 0 */ a6xx_gpu->cur_ring = gpu->rb[0]; - a6xx_gpu->cur_ctx = NULL; + a6xx_gpu->cur_ctx_seqno = 0; /* Enable the SQE to start the CP engine */ gpu_write(gpu, REG_A6XX_CP_SQE_CNTL, 1); @@ -1135,6 +1135,19 @@ static int a6xx_hw_init(struct msm_gpu *gpu) return ret; } +static int a6xx_hw_init(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); + int ret; + + mutex_lock(&a6xx_gpu->gmu.lock); + ret = hw_init(gpu); + mutex_unlock(&a6xx_gpu->gmu.lock); + + return ret; +} + static void a6xx_dump(struct msm_gpu *gpu) { DRM_DEV_INFO(&gpu->pdev->dev, "status: %08x\n", @@ -1509,7 +1522,9 @@ static int a6xx_pm_resume(struct msm_gpu *gpu) trace_msm_gpu_resume(0); + mutex_lock(&a6xx_gpu->gmu.lock); ret = a6xx_gmu_resume(a6xx_gpu); + mutex_unlock(&a6xx_gpu->gmu.lock); if (ret) return ret; @@ -1532,7 +1547,9 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu) msm_devfreq_suspend(gpu); + mutex_lock(&a6xx_gpu->gmu.lock); ret = a6xx_gmu_stop(a6xx_gpu); + mutex_unlock(&a6xx_gpu->gmu.lock); if (ret) return ret; @@ -1547,18 +1564,19 @@ static int a6xx_get_timestamp(struct msm_gpu *gpu, uint64_t *value) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); - static DEFINE_MUTEX(perfcounter_oob); - mutex_lock(&perfcounter_oob); + mutex_lock(&a6xx_gpu->gmu.lock); /* Force the GPU power on so we can read this register */ a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET); *value = gpu_read64(gpu, REG_A6XX_CP_ALWAYS_ON_COUNTER_LO, - REG_A6XX_CP_ALWAYS_ON_COUNTER_HI); + REG_A6XX_CP_ALWAYS_ON_COUNTER_HI); a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET); - mutex_unlock(&perfcounter_oob); + + mutex_unlock(&a6xx_gpu->gmu.lock); + return 0; } @@ -1622,6 +1640,16 @@ static unsigned long a6xx_gpu_busy(struct msm_gpu *gpu) return (unsigned long)busy_time; } +void
a6xx_gpu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); + + mutex_lock(&a6xx_gpu->gmu.lock); + a6xx_gmu_set_freq(gpu, opp); + mutex_unlock(&a6xx_gpu->gmu.lock); +} + static struct msm_gem_address_space * a6xx_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev) { @@ -1766,7 +1794,7 @@ static const struct adreno_gpu_funcs funcs = { #endif .gpu_busy = a6xx_gpu_busy, .gpu_get_freq = a6xx_gmu_get_freq, - .gpu_set_freq = a6xx_gmu_set_freq, + .gpu_set_freq = a6xx_gpu_set_freq, #if defined(CONFIG_DRM_MSM_GPU_STATE) .gpu_state_get = a6xx_gpu_state_get, .gpu_state_put = a6xx_gpu_state_put, diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h index 0bc2d062f5..8e5527c881 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -19,7 +19,16 @@ struct a6xx_gpu { uint64_t sqe_iova; struct msm_ringbuffer *cur_ring; - struct msm_file_private *cur_ctx; + + /** + * cur_ctx_seqno: + * + * The ctx->seqno value of the context with current pgtables + * installed. Tracked by seqno rather than pointer value to + * avoid dangling pointers, and cases where a ctx can be freed + * and a new one created with the same address. + */ + int cur_ctx_seqno; struct a6xx_gmu gmu; diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c index f482e0911d..bb7d066618 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c @@ -1125,6 +1125,20 @@ static void mdp5_crtc_reset(struct drm_crtc *crtc) __drm_atomic_helper_crtc_reset(crtc, &mdp5_cstate->base); } +static const struct drm_crtc_funcs mdp5_crtc_no_lm_cursor_funcs = { + .set_config = drm_atomic_helper_set_config, + .destroy = mdp5_crtc_destroy, + .page_flip = drm_atomic_helper_page_flip, + .reset = mdp5_crtc_reset, + .atomic_duplicate_state = mdp5_crtc_duplicate_state, + .atomic_destroy_state = mdp5_crtc_destroy_state, + .atomic_print_state = mdp5_crtc_atomic_print_state, + .get_vblank_counter = mdp5_crtc_get_vblank_counter, + .enable_vblank = msm_crtc_enable_vblank, + .disable_vblank = msm_crtc_disable_vblank, + .get_vblank_timestamp = drm_crtc_vblank_helper_get_vblank_timestamp, +}; + static const struct drm_crtc_funcs mdp5_crtc_funcs = { .set_config = drm_atomic_helper_set_config, .destroy = mdp5_crtc_destroy, @@ -1313,6 +1327,8 @@ struct drm_crtc *mdp5_crtc_init(struct drm_device *dev, mdp5_crtc->lm_cursor_enabled = cursor_plane ? false : true; drm_crtc_init_with_planes(dev, crtc, plane, cursor_plane, + cursor_plane ? 
+ &mdp5_crtc_no_lm_cursor_funcs : &mdp5_crtc_funcs, NULL); drm_flip_work_init(&mdp5_crtc->unref_cursor_work, diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index fbe4c2cd52..a0392e4d81 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -1309,14 +1309,14 @@ static int dp_pm_resume(struct device *dev) * can not declared display is connected unless * HDMI cable is plugged in and sink_count of * dongle become 1 + * also only signal audio when disconnected */ - if (dp->link->sink_count) + if (dp->link->sink_count) { dp->dp_display.is_connected = true; - else + } else { dp->dp_display.is_connected = false; - - dp_display_handle_plugged_change(g_dp_display, - dp->dp_display.is_connected); + dp_display_handle_plugged_change(g_dp_display, false); + } DRM_DEBUG_DP("After, sink_count=%d is_connected=%d core_inited=%d power_on=%d\n", dp->link->sink_count, dp->dp_display.is_connected, diff --git a/drivers/gpu/drm/msm/dsi/dsi.c b/drivers/gpu/drm/msm/dsi/dsi.c index 614dc7f26f..75ae3008b6 100644 --- a/drivers/gpu/drm/msm/dsi/dsi.c +++ b/drivers/gpu/drm/msm/dsi/dsi.c @@ -215,8 +215,10 @@ int msm_dsi_modeset_init(struct msm_dsi *msm_dsi, struct drm_device *dev, goto fail; } - if (!msm_dsi_manager_validate_current_config(msm_dsi->id)) + if (!msm_dsi_manager_validate_current_config(msm_dsi->id)) { + ret = -EINVAL; goto fail; + } msm_dsi->encoder = encoder; diff --git a/drivers/gpu/drm/msm/dsi/dsi_host.c b/drivers/gpu/drm/msm/dsi/dsi_host.c index e269df2851..c86b5090fa 100644 --- a/drivers/gpu/drm/msm/dsi/dsi_host.c +++ b/drivers/gpu/drm/msm/dsi/dsi_host.c @@ -451,7 +451,7 @@ static int dsi_bus_clk_enable(struct msm_dsi_host *msm_host) return 0; err: - for (; i > 0; i--) + while (--i >= 0) clk_disable_unprepare(msm_host->bus_clks[i]); return ret; diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_14nm.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_14nm.c index d13552b221..5b4e991f22 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_14nm.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_14nm.c @@ -110,14 +110,13 @@ static struct dsi_pll_14nm *pll_14nm_list[DSI_MAX]; static bool pll_14nm_poll_for_ready(struct dsi_pll_14nm *pll_14nm, u32 nb_tries, u32 timeout_us) { - bool pll_locked = false; + bool pll_locked = false, pll_ready = false; void __iomem *base = pll_14nm->phy->pll_base; u32 tries, val; tries = nb_tries; while (tries--) { - val = dsi_phy_read(base + - REG_DSI_14nm_PHY_PLL_RESET_SM_READY_STATUS); + val = dsi_phy_read(base + REG_DSI_14nm_PHY_PLL_RESET_SM_READY_STATUS); pll_locked = !!(val & BIT(5)); if (pll_locked) @@ -126,23 +125,24 @@ static bool pll_14nm_poll_for_ready(struct dsi_pll_14nm *pll_14nm, udelay(timeout_us); } - if (!pll_locked) { - tries = nb_tries; - while (tries--) { - val = dsi_phy_read(base + - REG_DSI_14nm_PHY_PLL_RESET_SM_READY_STATUS); - pll_locked = !!(val & BIT(0)); + if (!pll_locked) + goto out; - if (pll_locked) - break; + tries = nb_tries; + while (tries--) { + val = dsi_phy_read(base + REG_DSI_14nm_PHY_PLL_RESET_SM_READY_STATUS); + pll_ready = !!(val & BIT(0)); - udelay(timeout_us); - } + if (pll_ready) + break; + + udelay(timeout_us); } - DBG("DSI PLL is %slocked", pll_locked ? "" : "*not* "); +out: + DBG("DSI PLL is %slocked, %sready", pll_locked ? "" : "*not* ", pll_ready ? 
"" : "*not* "); - return pll_locked; + return pll_locked && pll_ready; } static void dsi_pll_14nm_config_init(struct dsi_pll_config *pconf) diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c index aaa37456f4..71ed4aa0dc 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c @@ -428,7 +428,7 @@ static int pll_28nm_register(struct dsi_pll_28nm *pll_28nm, struct clk_hw **prov bytediv->reg = pll_28nm->phy->pll_base + REG_DSI_28nm_8960_PHY_PLL_CTRL_9; snprintf(parent_name, 32, "dsi%dvco_clk", pll_28nm->phy->id); - snprintf(clk_name, 32, "dsi%dpllbyte", pll_28nm->phy->id); + snprintf(clk_name, 32, "dsi%dpllbyte", pll_28nm->phy->id + 1); bytediv_init.name = clk_name; bytediv_init.ops = &clk_bytediv_ops; @@ -442,7 +442,7 @@ static int pll_28nm_register(struct dsi_pll_28nm *pll_28nm, struct clk_hw **prov return ret; provided_clocks[DSI_BYTE_PLL_CLK] = &bytediv->hw; - snprintf(clk_name, 32, "dsi%dpll", pll_28nm->phy->id); + snprintf(clk_name, 32, "dsi%dpll", pll_28nm->phy->id + 1); /* DIV3 */ hw = devm_clk_hw_register_divider(dev, clk_name, parent_name, 0, pll_28nm->phy->pll_base + diff --git a/drivers/gpu/drm/msm/edp/edp_ctrl.c b/drivers/gpu/drm/msm/edp/edp_ctrl.c index 4fb397ee7c..fe1366b4c4 100644 --- a/drivers/gpu/drm/msm/edp/edp_ctrl.c +++ b/drivers/gpu/drm/msm/edp/edp_ctrl.c @@ -1116,7 +1116,7 @@ void msm_edp_ctrl_power(struct edp_ctrl *ctrl, bool on) int msm_edp_ctrl_init(struct msm_edp *edp) { struct edp_ctrl *ctrl = NULL; - struct device *dev = &edp->pdev->dev; + struct device *dev; int ret; if (!edp) { @@ -1124,6 +1124,7 @@ int msm_edp_ctrl_init(struct msm_edp *edp) return -EINVAL; } + dev = &edp->pdev->dev; ctrl = devm_kzalloc(dev, sizeof(*ctrl), GFP_KERNEL); if (!ctrl) return -ENOMEM; diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 2e6fc185e5..d4e09703a8 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -630,10 +630,11 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv) if (ret) goto err_msm_uninit; - ret = msm_disp_snapshot_init(ddev); - if (ret) - DRM_DEV_ERROR(dev, "msm_disp_snapshot_init failed ret = %d\n", ret); - + if (kms) { + ret = msm_disp_snapshot_init(ddev); + if (ret) + DRM_DEV_ERROR(dev, "msm_disp_snapshot_init failed ret = %d\n", ret); + } drm_mode_config_reset(ddev); #ifdef CONFIG_DRM_FBDEV_EMULATION @@ -682,6 +683,7 @@ static void load_gpu(struct drm_device *dev) static int context_init(struct drm_device *dev, struct drm_file *file) { + static atomic_t ident = ATOMIC_INIT(0); struct msm_drm_private *priv = dev->dev_private; struct msm_file_private *ctx; @@ -689,12 +691,17 @@ static int context_init(struct drm_device *dev, struct drm_file *file) if (!ctx) return -ENOMEM; + INIT_LIST_HEAD(&ctx->submitqueues); + rwlock_init(&ctx->queuelock); + kref_init(&ctx->ref); msm_submitqueue_init(dev, ctx); ctx->aspace = msm_gpu_create_private_address_space(priv->gpu, current); file->driver_priv = ctx; + ctx->seqno = atomic_inc_return(&ident); + return 0; } diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 8b005d1ac8..c552f0c389 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -53,14 +53,6 @@ struct msm_disp_state; #define FRAC_16_16(mult, div) (((mult) << 16) / (div)) -struct msm_file_private { - rwlock_t queuelock; - struct list_head submitqueues; - int queueid; - struct msm_gem_address_space *aspace; - struct kref 
ref; -}; - enum msm_mdp_plane_property { PLANE_PROP_ZPOS, PLANE_PROP_ALPHA, @@ -488,41 +480,6 @@ void msm_writel(u32 data, void __iomem *addr); u32 msm_readl(const void __iomem *addr); void msm_rmw(void __iomem *addr, u32 mask, u32 or); -struct msm_gpu_submitqueue; -int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx); -struct msm_gpu_submitqueue *msm_submitqueue_get(struct msm_file_private *ctx, - u32 id); -int msm_submitqueue_create(struct drm_device *drm, - struct msm_file_private *ctx, - u32 prio, u32 flags, u32 *id); -int msm_submitqueue_query(struct drm_device *drm, struct msm_file_private *ctx, - struct drm_msm_submitqueue_query *args); -int msm_submitqueue_remove(struct msm_file_private *ctx, u32 id); -void msm_submitqueue_close(struct msm_file_private *ctx); - -void msm_submitqueue_destroy(struct kref *kref); - -static inline void __msm_file_private_destroy(struct kref *kref) -{ - struct msm_file_private *ctx = container_of(kref, - struct msm_file_private, ref); - - msm_gem_address_space_put(ctx->aspace); - kfree(ctx); -} - -static inline void msm_file_private_put(struct msm_file_private *ctx) -{ - kref_put(&ctx->ref, __msm_file_private_destroy); -} - -static inline struct msm_file_private *msm_file_private_get( - struct msm_file_private *ctx) -{ - kref_get(&ctx->ref); - return ctx; -} - #define DBG(fmt, ...) DRM_DEBUG_DRIVER(fmt"\n", ##__VA_ARGS__) #define VERB(fmt, ...) if (0) DRM_DEBUG_DRIVER(fmt"\n", ##__VA_ARGS__) @@ -547,7 +504,7 @@ static inline int align_pitch(int width, int bpp) static inline unsigned long timeout_to_jiffies(const ktime_t *timeout) { ktime_t now = ktime_get(); - unsigned long remaining_jiffies; + s64 remaining_jiffies; if (ktime_compare(*timeout, now) < 0) { remaining_jiffies = 0; @@ -556,7 +513,7 @@ static inline unsigned long timeout_to_jiffies(const ktime_t *timeout) remaining_jiffies = ktime_divns(rem, NSEC_PER_SEC / HZ); } - return remaining_jiffies; + return clamp(remaining_jiffies, 0LL, (s64)INT_MAX); } #endif /* __MSM_DRV_H__ */ diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index fdc5367aec..151d19e445 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -46,7 +46,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, if (!submit) return ERR_PTR(-ENOMEM); - ret = drm_sched_job_init(&submit->base, &queue->entity, queue); + ret = drm_sched_job_init(&submit->base, queue->entity, queue); if (ret) { kfree(submit); return ERR_PTR(ret); @@ -171,7 +171,8 @@ static int submit_lookup_objects(struct msm_gem_submit *submit, static int submit_lookup_cmds(struct msm_gem_submit *submit, struct drm_msm_gem_submit *args, struct drm_file *file) { - unsigned i, sz; + unsigned i; + size_t sz; int ret = 0; for (i = 0; i < args->nr_cmds; i++) { @@ -907,7 +908,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, /* The scheduler owns a ref now: */ msm_gem_submit_get(submit); - drm_sched_entity_push_job(&submit->base, &queue->entity); + drm_sched_entity_push_job(&submit->base, queue->entity); args->fence = submit->fence_id; diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 0e4b45bff2..030f82f149 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -257,6 +257,39 @@ struct msm_gpu_perfcntr { */ #define NR_SCHED_PRIORITIES (1 + DRM_SCHED_PRIORITY_HIGH - DRM_SCHED_PRIORITY_MIN) +/** + * struct msm_file_private - per-drm_file context + * + * @queuelock: synchronizes access to 
submitqueues list + * @submitqueues: list of &msm_gpu_submitqueue created by userspace + * @queueid: counter incremented each time a submitqueue is created, + * used to assign &msm_gpu_submitqueue.id + * @aspace: the per-process GPU address-space + * @ref: reference count + * @seqno: unique per process seqno + */ +struct msm_file_private { + rwlock_t queuelock; + struct list_head submitqueues; + int queueid; + struct msm_gem_address_space *aspace; + struct kref ref; + int seqno; + + /** + * entities: + * + * Table of per-priority-level sched entities used by submitqueues + * associated with this &drm_file. Because some userspace apps + * make assumptions about rendering from multiple gl contexts + * (of the same priority) within the process happening in FIFO + * order without requiring any fencing beyond MakeCurrent(), we + * create at most one &drm_sched_entity per-process per-priority- + * level. + */ + struct drm_sched_entity *entities[NR_SCHED_PRIORITIES * MSM_GPU_MAX_RINGS]; +}; + /** * msm_gpu_convert_priority - Map userspace priority to ring # and sched priority * @@ -304,6 +337,8 @@ static inline int msm_gpu_convert_priority(struct msm_gpu *gpu, int prio, } /** + * struct msm_gpu_submitqueues - Userspace created context. + * * A submitqueue is associated with a gl context or vk queue (or equiv) * in userspace. * @@ -321,7 +356,7 @@ static inline int msm_gpu_convert_priority(struct msm_gpu *gpu, int prio, * seqno, protected by submitqueue lock * @lock: submitqueue lock * @ref: reference count - * @entity: the submit job-queue + * @entity: the submit job-queue */ struct msm_gpu_submitqueue { int id; @@ -333,7 +368,7 @@ struct msm_gpu_submitqueue { struct idr fence_idr; struct mutex lock; struct kref ref; - struct drm_sched_entity entity; + struct drm_sched_entity *entity; }; struct msm_gpu_state_bo { @@ -421,6 +456,33 @@ static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val) int msm_gpu_pm_suspend(struct msm_gpu *gpu); int msm_gpu_pm_resume(struct msm_gpu *gpu); +int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx); +struct msm_gpu_submitqueue *msm_submitqueue_get(struct msm_file_private *ctx, + u32 id); +int msm_submitqueue_create(struct drm_device *drm, + struct msm_file_private *ctx, + u32 prio, u32 flags, u32 *id); +int msm_submitqueue_query(struct drm_device *drm, struct msm_file_private *ctx, + struct drm_msm_submitqueue_query *args); +int msm_submitqueue_remove(struct msm_file_private *ctx, u32 id); +void msm_submitqueue_close(struct msm_file_private *ctx); + +void msm_submitqueue_destroy(struct kref *kref); + +void __msm_file_private_destroy(struct kref *kref); + +static inline void msm_file_private_put(struct msm_file_private *ctx) +{ + kref_put(&ctx->ref, __msm_file_private_destroy); +} + +static inline struct msm_file_private *msm_file_private_get( + struct msm_file_private *ctx) +{ + kref_get(&ctx->ref); + return ctx; +} + void msm_devfreq_init(struct msm_gpu *gpu); void msm_devfreq_cleanup(struct msm_gpu *gpu); void msm_devfreq_resume(struct msm_gpu *gpu); diff --git a/drivers/gpu/drm/msm/msm_gpu_devfreq.c b/drivers/gpu/drm/msm/msm_gpu_devfreq.c index 0a1ee20296..84e98c07c9 100644 --- a/drivers/gpu/drm/msm/msm_gpu_devfreq.c +++ b/drivers/gpu/drm/msm/msm_gpu_devfreq.c @@ -151,6 +151,9 @@ void msm_devfreq_active(struct msm_gpu *gpu) unsigned int idle_time; unsigned long target_freq = df->idle_freq; + if (!df->devfreq) + return; + /* * Hold devfreq lock to synchronize with get_dev_status()/ * target() callbacks @@ -186,6 
+189,9 @@ void msm_devfreq_idle(struct msm_gpu *gpu) struct msm_gpu_devfreq *df = &gpu->devfreq; unsigned long idle_freq, target_freq = 0; + if (!df->devfreq) + return; + /* * Hold devfreq lock to synchronize with get_dev_status()/ * target() callbacks diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index 32a55d81b5..b8621c6e05 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -7,6 +7,24 @@ #include "msm_gpu.h" +void __msm_file_private_destroy(struct kref *kref) +{ + struct msm_file_private *ctx = container_of(kref, + struct msm_file_private, ref); + int i; + + for (i = 0; i < ARRAY_SIZE(ctx->entities); i++) { + if (!ctx->entities[i]) + continue; + + drm_sched_entity_destroy(ctx->entities[i]); + kfree(ctx->entities[i]); + } + + msm_gem_address_space_put(ctx->aspace); + kfree(ctx); +} + void msm_submitqueue_destroy(struct kref *kref) { struct msm_gpu_submitqueue *queue = container_of(kref, @@ -14,8 +32,6 @@ void msm_submitqueue_destroy(struct kref *kref) idr_destroy(&queue->fence_idr); - drm_sched_entity_destroy(&queue->entity); - msm_file_private_put(queue->ctx); kfree(queue); @@ -61,13 +77,47 @@ void msm_submitqueue_close(struct msm_file_private *ctx) } } +static struct drm_sched_entity * +get_sched_entity(struct msm_file_private *ctx, struct msm_ringbuffer *ring, + unsigned ring_nr, enum drm_sched_priority sched_prio) +{ + static DEFINE_MUTEX(entity_lock); + unsigned idx = (ring_nr * NR_SCHED_PRIORITIES) + sched_prio; + + /* We should have already validated that the requested priority is + * valid by the time we get here. + */ + if (WARN_ON(idx >= ARRAY_SIZE(ctx->entities))) + return ERR_PTR(-EINVAL); + + mutex_lock(&entity_lock); + + if (!ctx->entities[idx]) { + struct drm_sched_entity *entity; + struct drm_gpu_scheduler *sched = &ring->sched; + int ret; + + entity = kzalloc(sizeof(*ctx->entities[idx]), GFP_KERNEL); + + ret = drm_sched_entity_init(entity, sched_prio, &sched, 1, NULL); + if (ret) { + kfree(entity); + return ERR_PTR(ret); + } + + ctx->entities[idx] = entity; + } + + mutex_unlock(&entity_lock); + + return ctx->entities[idx]; +} + int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, u32 prio, u32 flags, u32 *id) { struct msm_drm_private *priv = drm->dev_private; struct msm_gpu_submitqueue *queue; - struct msm_ringbuffer *ring; - struct drm_gpu_scheduler *sched; enum drm_sched_priority sched_prio; unsigned ring_nr; int ret; @@ -91,12 +141,10 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, queue->flags = flags; queue->ring_nr = ring_nr; - ring = priv->gpu->rb[ring_nr]; - sched = &ring->sched; - - ret = drm_sched_entity_init(&queue->entity, - sched_prio, &sched, 1, NULL); - if (ret) { + queue->entity = get_sched_entity(ctx, priv->gpu->rb[ring_nr], + ring_nr, sched_prio); + if (IS_ERR(queue->entity)) { + ret = PTR_ERR(queue->entity); kfree(queue); return ret; } @@ -140,10 +188,6 @@ int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx) */ default_prio = DIV_ROUND_UP(max_priority, 2); - INIT_LIST_HEAD(&ctx->submitqueues); - - rwlock_init(&ctx->queuelock); - return msm_submitqueue_create(drm, ctx, default_prio, 0, NULL); } diff --git a/drivers/gpu/drm/nouveau/dispnv50/crc.c b/drivers/gpu/drm/nouveau/dispnv50/crc.c index b8c31b6977..66f32d965c 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/crc.c +++ b/drivers/gpu/drm/nouveau/dispnv50/crc.c @@ -704,6 +704,7 @@ static const struct file_operations 
nv50_crc_flip_threshold_fops = { .open = nv50_crc_debugfs_flip_threshold_open, .read = seq_read, .write = nv50_crc_debugfs_flip_threshold_set, + .release = single_release, }; int nv50_head_crc_late_register(struct nv50_head *head) diff --git a/drivers/gpu/drm/nouveau/include/nvif/class.h b/drivers/gpu/drm/nouveau/include/nvif/class.h index c68cc95724..a582c0cb0c 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/class.h +++ b/drivers/gpu/drm/nouveau/include/nvif/class.h @@ -71,6 +71,7 @@ #define PASCAL_CHANNEL_GPFIFO_A /* cla06f.h */ 0x0000c06f #define VOLTA_CHANNEL_GPFIFO_A /* clc36f.h */ 0x0000c36f #define TURING_CHANNEL_GPFIFO_A /* clc36f.h */ 0x0000c46f +#define AMPERE_CHANNEL_GPFIFO_B /* clc36f.h */ 0x0000c76f #define NV50_DISP /* cl5070.h */ 0x00005070 #define G82_DISP /* cl5070.h */ 0x00008270 @@ -200,6 +201,7 @@ #define PASCAL_DMA_COPY_B 0x0000c1b5 #define VOLTA_DMA_COPY_A 0x0000c3b5 #define TURING_DMA_COPY_A 0x0000c5b5 +#define AMPERE_DMA_COPY_B 0x0000c7b5 #define FERMI_DECOMPRESS 0x000090b8 diff --git a/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h b/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h index 54fab7cc36..64ee82c7c1 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h @@ -77,4 +77,5 @@ int gp100_fifo_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct int gp10b_fifo_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fifo **); int gv100_fifo_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fifo **); int tu102_fifo_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fifo **); +int ga102_fifo_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fifo **); #endif diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 6d07e653f8..c58bcdba2c 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -844,6 +844,7 @@ nouveau_bo_move_init(struct nouveau_drm *drm) struct ttm_resource *, struct ttm_resource *); int (*init)(struct nouveau_channel *, u32 handle); } _methods[] = { + { "COPY", 4, 0xc7b5, nve0_bo_move_copy, nve0_bo_move_init }, { "COPY", 4, 0xc5b5, nve0_bo_move_copy, nve0_bo_move_init }, { "GRCE", 0, 0xc5b5, nve0_bo_move_copy, nvc0_bo_move_init }, { "COPY", 4, 0xc3b5, nve0_bo_move_copy, nve0_bo_move_init }, diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c index 80099ef757..ea7769135b 100644 --- a/drivers/gpu/drm/nouveau/nouveau_chan.c +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c @@ -250,7 +250,8 @@ static int nouveau_channel_ind(struct nouveau_drm *drm, struct nvif_device *device, u64 runlist, bool priv, struct nouveau_channel **pchan) { - static const u16 oclasses[] = { TURING_CHANNEL_GPFIFO_A, + static const u16 oclasses[] = { AMPERE_CHANNEL_GPFIFO_B, + TURING_CHANNEL_GPFIFO_A, VOLTA_CHANNEL_GPFIFO_A, PASCAL_CHANNEL_GPFIFO_A, MAXWELL_CHANNEL_GPFIFO_A, @@ -386,7 +387,8 @@ nouveau_channel_init(struct nouveau_channel *chan, u32 vram, u32 gart) nvif_object_map(&chan->user, NULL, 0); - if (chan->user.oclass >= FERMI_CHANNEL_GPFIFO) { + if (chan->user.oclass >= FERMI_CHANNEL_GPFIFO && + chan->user.oclass < AMPERE_CHANNEL_GPFIFO_B) { ret = nvif_notify_ctor(&chan->user, "abi16ChanKilled", nouveau_channel_killed, true, NV906F_V0_NTFY_KILLED, diff --git a/drivers/gpu/drm/nouveau/nouveau_debugfs.c b/drivers/gpu/drm/nouveau/nouveau_debugfs.c index c2bc05eb2e..1cbe01048b 100644 --- 
a/drivers/gpu/drm/nouveau/nouveau_debugfs.c +++ b/drivers/gpu/drm/nouveau/nouveau_debugfs.c @@ -207,6 +207,7 @@ static const struct file_operations nouveau_pstate_fops = { .open = nouveau_debugfs_pstate_open, .read = seq_read, .write = nouveau_debugfs_pstate_set, + .release = single_release, }; static struct drm_info_list nouveau_debugfs_list[] = { diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 1f828c9f69..6109cd9e33 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -345,6 +345,9 @@ nouveau_accel_gr_init(struct nouveau_drm *drm) u32 arg0, arg1; int ret; + if (device->info.family >= NV_DEVICE_INFO_V0_AMPERE) + return; + /* Allocate channel that has access to the graphics engine. */ if (device->info.family >= NV_DEVICE_INFO_V0_KEPLER) { arg0 = nvif_fifo_runlist(device, NV_DEVICE_HOST_RUNLIST_ENGINES_GR); @@ -469,6 +472,7 @@ nouveau_accel_init(struct nouveau_drm *drm) case PASCAL_CHANNEL_GPFIFO_A: case VOLTA_CHANNEL_GPFIFO_A: case TURING_CHANNEL_GPFIFO_A: + case AMPERE_CHANNEL_GPFIFO_B: ret = nvc0_fence_create(drm); break; default: diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 5b27845075..8c2ecc2827 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -247,10 +247,8 @@ nouveau_gem_new(struct nouveau_cli *cli, u64 size, int align, uint32_t domain, } ret = nouveau_bo_init(nvbo, size, align, domain, NULL, NULL); - if (ret) { - nouveau_bo_ref(NULL, &nvbo); + if (ret) return ret; - } /* we restrict allowed domains on nv50+ to only the types * that were requested at creation time. not possibly on diff --git a/drivers/gpu/drm/nouveau/nv84_fence.c b/drivers/gpu/drm/nouveau/nv84_fence.c index 7c9c928c31..c3526a8622 100644 --- a/drivers/gpu/drm/nouveau/nv84_fence.c +++ b/drivers/gpu/drm/nouveau/nv84_fence.c @@ -204,7 +204,7 @@ nv84_fence_create(struct nouveau_drm *drm) priv->base.context_new = nv84_fence_context_new; priv->base.context_del = nv84_fence_context_del; - priv->base.uevent = true; + priv->base.uevent = drm->client.device.info.family < NV_DEVICE_INFO_V0_AMPERE; mutex_init(&priv->mutex); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 93ddf63d11..ca75c5f6ec 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2602,6 +2602,7 @@ nv172_chipset = { .top = { 0x00000001, ga100_top_new }, .disp = { 0x00000001, ga102_disp_new }, .dma = { 0x00000001, gv100_dma_new }, + .fifo = { 0x00000001, ga102_fifo_new }, }; static const struct nvkm_device_chip @@ -2622,6 +2623,7 @@ nv174_chipset = { .top = { 0x00000001, ga100_top_new }, .disp = { 0x00000001, ga102_disp_new }, .dma = { 0x00000001, gv100_dma_new }, + .fifo = { 0x00000001, ga102_fifo_new }, }; static const struct nvkm_device_chip @@ -2642,6 +2644,7 @@ nv177_chipset = { .top = { 0x00000001, ga100_top_new }, .disp = { 0x00000001, ga102_disp_new }, .dma = { 0x00000001, gv100_dma_new }, + .fifo = { 0x00000001, ga102_fifo_new }, }; static int diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild index 3209eb7af6..5e831d347a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild @@ -18,6 +18,7 @@ nvkm-y += nvkm/engine/fifo/gp100.o nvkm-y += nvkm/engine/fifo/gp10b.o nvkm-y += nvkm/engine/fifo/gv100.o nvkm-y += 
nvkm/engine/fifo/tu102.o +nvkm-y += nvkm/engine/fifo/ga102.o nvkm-y += nvkm/engine/fifo/chan.o nvkm-y += nvkm/engine/fifo/channv50.o diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c index 353b77d9b3..3492c561f2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c @@ -82,7 +82,7 @@ g84_fifo_chan_engine_fini(struct nvkm_fifo_chan *base, if (offset < 0) return 0; - engn = fifo->base.func->engine_id(&fifo->base, engine); + engn = fifo->base.func->engine_id(&fifo->base, engine) - 1; save = nvkm_mask(device, 0x002520, 0x0000003f, 1 << engn); nvkm_wr32(device, 0x0032fc, chan->base.inst->addr >> 12); done = nvkm_msec(device, 2000, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c new file mode 100644 index 0000000000..c630dbd291 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c @@ -0,0 +1,311 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#define ga102_fifo(p) container_of((p), struct ga102_fifo, base.engine) +#define ga102_chan(p) container_of((p), struct ga102_chan, object) +#include <engine/fifo.h> +#include "user.h" + +#include <core/memory.h> +#include <subdev/mmu.h> +#include <subdev/timer.h> +#include <subdev/top.h> + +#include <nvif/cl0080.h> +#include <nvif/clc36f.h> +#include <nvif/class.h> + +struct ga102_fifo { + struct nvkm_fifo base; +}; + +struct ga102_chan { + struct nvkm_object object; + + struct { + u32 runl; + u32 chan; + } ctrl; + + struct nvkm_memory *mthd; + struct nvkm_memory *inst; + struct nvkm_memory *user; + struct nvkm_memory *runl; + + struct nvkm_vmm *vmm; +}; + +static int +ga102_chan_sclass(struct nvkm_object *object, int index, struct nvkm_oclass *oclass) +{ + if (index == 0) { + oclass->ctor = nvkm_object_new; + oclass->base = (struct nvkm_sclass) { -1, -1, AMPERE_DMA_COPY_B }; + return 0; + } + + return -EINVAL; +} + +static int +ga102_chan_map(struct nvkm_object *object, void *argv, u32 argc, + enum nvkm_object_map *type, u64 *addr, u64 *size) +{ + struct ga102_chan *chan = ga102_chan(object); + struct nvkm_device *device = chan->object.engine->subdev.device; + u64 bar2 = nvkm_memory_bar2(chan->user); + + if (bar2 == ~0ULL) + return -EFAULT; + + *type = NVKM_OBJECT_MAP_IO; + *addr = device->func->resource_addr(device, 3) + bar2; + *size = 0x1000; + return 0; +} + +static int +ga102_chan_fini(struct nvkm_object *object, bool suspend) +{ + struct ga102_chan *chan = ga102_chan(object); + struct nvkm_device *device = chan->object.engine->subdev.device; + + nvkm_wr32(device, chan->ctrl.chan, 0x00000003); + + nvkm_wr32(device, chan->ctrl.runl + 0x098, 0x01000000); + nvkm_msec(device, 2000, + if (!(nvkm_rd32(device, chan->ctrl.runl + 0x098) & 0x00100000)) + break; + ); + + nvkm_wr32(device, chan->ctrl.runl + 0x088, 0); + + nvkm_wr32(device, chan->ctrl.chan, 0xffffffff); + return 0; +} + +static int +ga102_chan_init(struct nvkm_object *object) +{ + struct ga102_chan *chan = ga102_chan(object); + struct nvkm_device *device = chan->object.engine->subdev.device; + + nvkm_mask(device, chan->ctrl.runl + 0x300, 0x80000000, 0x80000000); + + nvkm_wr32(device, chan->ctrl.runl + 0x080, lower_32_bits(nvkm_memory_addr(chan->runl))); + nvkm_wr32(device, chan->ctrl.runl + 0x084, upper_32_bits(nvkm_memory_addr(chan->runl))); + nvkm_wr32(device, chan->ctrl.runl + 0x088, 2); + + nvkm_wr32(device, chan->ctrl.chan, 0x00000002); + nvkm_wr32(device, chan->ctrl.runl + 0x0090, 0); + return 0; +} + +static void * +ga102_chan_dtor(struct nvkm_object *object) +{ + struct ga102_chan *chan = ga102_chan(object); + + if (chan->vmm) { + nvkm_vmm_part(chan->vmm, chan->inst); + nvkm_vmm_unref(&chan->vmm); + } + + nvkm_memory_unref(&chan->runl); + nvkm_memory_unref(&chan->user); + nvkm_memory_unref(&chan->inst); + nvkm_memory_unref(&chan->mthd); + return chan; +} + +static const struct nvkm_object_func +ga102_chan = { + .dtor = ga102_chan_dtor, + .init = ga102_chan_init, + .fini = ga102_chan_fini, + .map = ga102_chan_map, + .sclass = ga102_chan_sclass, +}; + +static int +ga102_chan_new(struct nvkm_device *device, + const struct nvkm_oclass *oclass, void *argv, u32 argc, struct nvkm_object **pobject) +{ + struct volta_channel_gpfifo_a_v0 *args = argv; + struct nvkm_top_device *tdev; + struct nvkm_vmm *vmm; + struct ga102_chan *chan; + int ret; + + if (argc != sizeof(*args)) + return -ENOSYS; + + vmm = nvkm_uvmm_search(oclass->client, args->vmm); + if (IS_ERR(vmm)) + return PTR_ERR(vmm); + + if (!(chan = kzalloc(sizeof(*chan), GFP_KERNEL))) + return -ENOMEM; + + nvkm_object_ctor(&ga102_chan, oclass, &chan->object); + *pobject = &chan->object; + +
list_for_each_entry(tdev, &device->top->device, head) { + if (tdev->type == NVKM_ENGINE_CE) { + chan->ctrl.runl = tdev->runlist; + break; + } + } + + if (!chan->ctrl.runl) + return -ENODEV; + + chan->ctrl.chan = nvkm_rd32(device, chan->ctrl.runl + 0x004) & 0xfffffff0; + + args->chid = 0; + args->inst = 0; + args->token = nvkm_rd32(device, chan->ctrl.runl + 0x008) & 0xffff0000; + + ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0x1000, true, &chan->mthd); + if (ret) + return ret; + + ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0x1000, true, &chan->inst); + if (ret) + return ret; + + nvkm_kmap(chan->inst); + nvkm_wo32(chan->inst, 0x010, 0x0000face); + nvkm_wo32(chan->inst, 0x030, 0x7ffff902); + nvkm_wo32(chan->inst, 0x048, lower_32_bits(args->ioffset)); + nvkm_wo32(chan->inst, 0x04c, upper_32_bits(args->ioffset) | + (order_base_2(args->ilength / 8) << 16)); + nvkm_wo32(chan->inst, 0x084, 0x20400000); + nvkm_wo32(chan->inst, 0x094, 0x30000001); + nvkm_wo32(chan->inst, 0x0ac, 0x00020000); + nvkm_wo32(chan->inst, 0x0e4, 0x00000000); + nvkm_wo32(chan->inst, 0x0e8, 0); + nvkm_wo32(chan->inst, 0x0f4, 0x00001000); + nvkm_wo32(chan->inst, 0x0f8, 0x10003080); + nvkm_mo32(chan->inst, 0x218, 0x00000000, 0x00000000); + nvkm_wo32(chan->inst, 0x220, lower_32_bits(nvkm_memory_bar2(chan->mthd))); + nvkm_wo32(chan->inst, 0x224, upper_32_bits(nvkm_memory_bar2(chan->mthd))); + nvkm_done(chan->inst); + + ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0x1000, true, &chan->user); + if (ret) + return ret; + + ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0x1000, true, &chan->runl); + if (ret) + return ret; + + nvkm_kmap(chan->runl); + nvkm_wo32(chan->runl, 0x00, 0x80030001); + nvkm_wo32(chan->runl, 0x04, 1); + nvkm_wo32(chan->runl, 0x08, 0); + nvkm_wo32(chan->runl, 0x0c, 0x00000000); + nvkm_wo32(chan->runl, 0x10, lower_32_bits(nvkm_memory_addr(chan->user))); + nvkm_wo32(chan->runl, 0x14, upper_32_bits(nvkm_memory_addr(chan->user))); + nvkm_wo32(chan->runl, 0x18, lower_32_bits(nvkm_memory_addr(chan->inst))); + nvkm_wo32(chan->runl, 0x1c, upper_32_bits(nvkm_memory_addr(chan->inst))); + nvkm_done(chan->runl); + + ret = nvkm_vmm_join(vmm, chan->inst); + if (ret) + return ret; + + chan->vmm = nvkm_vmm_ref(vmm); + return 0; +} + +static const struct nvkm_device_oclass +ga102_chan_oclass = { + .ctor = ga102_chan_new, +}; + +static int +ga102_user_new(struct nvkm_device *device, + const struct nvkm_oclass *oclass, void *argv, u32 argc, struct nvkm_object **pobject) +{ + return tu102_fifo_user_new(oclass, argv, argc, pobject); +} + +static const struct nvkm_device_oclass +ga102_user_oclass = { + .ctor = ga102_user_new, +}; + +static int +ga102_fifo_sclass(struct nvkm_oclass *oclass, int index, const struct nvkm_device_oclass **class) +{ + if (index == 0) { + oclass->base = (struct nvkm_sclass) { -1, -1, VOLTA_USERMODE_A }; + *class = &ga102_user_oclass; + return 0; + } else + if (index == 1) { + oclass->base = (struct nvkm_sclass) { 0, 0, AMPERE_CHANNEL_GPFIFO_B }; + *class = &ga102_chan_oclass; + return 0; + } + + return 2; +} + +static int +ga102_fifo_info(struct nvkm_engine *engine, u64 mthd, u64 *data) +{ + switch (mthd) { + case NV_DEVICE_HOST_CHANNELS: *data = 1; return 0; + default: + break; + } + + return -ENOSYS; +} + +static void * +ga102_fifo_dtor(struct nvkm_engine *engine) +{ + return ga102_fifo(engine); +} + +static const struct nvkm_engine_func +ga102_fifo = { + .dtor = ga102_fifo_dtor, + .info = ga102_fifo_info, + .base.sclass = ga102_fifo_sclass, +}; 
+ +int +ga102_fifo_new(struct nvkm_device *device, enum nvkm_subdev_type type, int inst, + struct nvkm_fifo **pfifo) +{ + struct ga102_fifo *fifo; + + if (!(fifo = kzalloc(sizeof(*fifo), GFP_KERNEL))) + return -ENOMEM; + + nvkm_engine_ctor(&ga102_fifo, device, type, inst, true, &fifo->base.engine); + *pfifo = &fifo->base; + return 0; +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/top/ga100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/top/ga100.c index 31933f3e5a..c982d834c8 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/top/ga100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/top/ga100.c @@ -54,7 +54,7 @@ ga100_top_oneinit(struct nvkm_top *top) info->reset = (data & 0x0000001f); break; case 2: - info->runlist = (data & 0x0000fc00) >> 10; + info->runlist = (data & 0x00fffc00); info->engine = (data & 0x00000003); break; default: @@ -85,9 +85,10 @@ ga100_top_oneinit(struct nvkm_top *top) } nvkm_debug(subdev, "%02x.%d (%8s): addr %06x fault %2d " - "runlist %2d engine %2d reset %2d\n", type, inst, + "runlist %6x engine %2d reset %2d\n", type, inst, info->type == NVKM_SUBDEV_NR ? "????????" : nvkm_subdev_type[info->type], - info->addr, info->fault, info->runlist, info->engine, info->reset); + info->addr, info->fault, info->runlist < 0 ? 0 : info->runlist, + info->engine, info->reset); info = NULL; } diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig index beb581b96e..418638e6e3 100644 --- a/drivers/gpu/drm/panel/Kconfig +++ b/drivers/gpu/drm/panel/Kconfig @@ -295,6 +295,7 @@ config DRM_PANEL_OLIMEX_LCD_OLINUXINO depends on OF depends on I2C depends on BACKLIGHT_CLASS_DEVICE + select CRC32 help The panel is used with different sizes LCDs, from 480x272 to 1280x800, and 24 bit per pixel. diff --git a/drivers/gpu/drm/r128/ati_pcigart.c b/drivers/gpu/drm/r128/ati_pcigart.c index 0ecccf25a3..d2a0f5394f 100644 --- a/drivers/gpu/drm/r128/ati_pcigart.c +++ b/drivers/gpu/drm/r128/ati_pcigart.c @@ -214,7 +214,7 @@ int drm_ati_pcigart_init(struct drm_device *dev, struct drm_ati_pcigart_info *ga } ret = 0; -#if defined(__i386__) || defined(__x86_64__) +#ifdef CONFIG_X86 wbinvd(); #else mb(); diff --git a/drivers/gpu/drm/rcar-du/rcar_du_encoder.c b/drivers/gpu/drm/rcar-du/rcar_du_encoder.c index 0daa8bba50..4bf4e25d7f 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_encoder.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_encoder.c @@ -86,12 +86,20 @@ int rcar_du_encoder_init(struct rcar_du_device *rcdu, } /* - * Create and initialize the encoder. On Gen3 skip the LVDS1 output if + * Create and initialize the encoder. On Gen3, skip the LVDS1 output if * the LVDS1 encoder is used as a companion for LVDS0 in dual-link - * mode. + * mode, or any LVDS output if it isn't connected. The latter may happen + * on D3 or E3 as the LVDS encoders are needed to provide the pixel + * clock to the DU, even when the LVDS outputs are not used. 
*/ - if (rcdu->info->gen >= 3 && output == RCAR_DU_OUTPUT_LVDS1) { - if (rcar_lvds_dual_link(bridge)) + if (rcdu->info->gen >= 3) { + if (output == RCAR_DU_OUTPUT_LVDS1 && + rcar_lvds_dual_link(bridge)) + return -ENOLINK; + + if ((output == RCAR_DU_OUTPUT_LVDS0 || + output == RCAR_DU_OUTPUT_LVDS1) && + !rcar_lvds_is_connected(bridge)) return -ENOLINK; } diff --git a/drivers/gpu/drm/rcar-du/rcar_lvds.c b/drivers/gpu/drm/rcar-du/rcar_lvds.c index d061b8de74..b672c5bd72 100644 --- a/drivers/gpu/drm/rcar-du/rcar_lvds.c +++ b/drivers/gpu/drm/rcar-du/rcar_lvds.c @@ -576,6 +576,9 @@ static int rcar_lvds_attach(struct drm_bridge *bridge, { struct rcar_lvds *lvds = bridge_to_rcar_lvds(bridge); + if (!lvds->next_bridge) + return 0; + return drm_bridge_attach(bridge->encoder, lvds->next_bridge, bridge, flags); } @@ -598,6 +601,14 @@ bool rcar_lvds_dual_link(struct drm_bridge *bridge) } EXPORT_SYMBOL_GPL(rcar_lvds_dual_link); +bool rcar_lvds_is_connected(struct drm_bridge *bridge) +{ + struct rcar_lvds *lvds = bridge_to_rcar_lvds(bridge); + + return lvds->next_bridge != NULL; +} +EXPORT_SYMBOL_GPL(rcar_lvds_is_connected); + /* ----------------------------------------------------------------------------- * Probe & Remove */ diff --git a/drivers/gpu/drm/rcar-du/rcar_lvds.h b/drivers/gpu/drm/rcar-du/rcar_lvds.h index 222ec0e607..eb7c6ef03b 100644 --- a/drivers/gpu/drm/rcar-du/rcar_lvds.h +++ b/drivers/gpu/drm/rcar-du/rcar_lvds.h @@ -16,6 +16,7 @@ struct drm_bridge; int rcar_lvds_clk_enable(struct drm_bridge *bridge, unsigned long freq); void rcar_lvds_clk_disable(struct drm_bridge *bridge); bool rcar_lvds_dual_link(struct drm_bridge *bridge); +bool rcar_lvds_is_connected(struct drm_bridge *bridge); #else static inline int rcar_lvds_clk_enable(struct drm_bridge *bridge, unsigned long freq) @@ -27,6 +28,10 @@ static inline bool rcar_lvds_dual_link(struct drm_bridge *bridge) { return false; } +static inline bool rcar_lvds_is_connected(struct drm_bridge *bridge) +{ + return false; +} #endif /* CONFIG_DRM_RCAR_LVDS */ #endif /* __RCAR_LVDS_H__ */ diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c index ba9e14da41..a25b98b7f5 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c @@ -1174,26 +1174,24 @@ static bool vop_crtc_mode_fixup(struct drm_crtc *crtc, * * Action plan: * - * 1. When DRM gives us a mode, we should add 999 Hz to it. That way - * if the clock we need is 60000001 Hz (~60 MHz) and DRM tells us to - * make 60000 kHz then the clock framework will actually give us - * the right clock. + * 1. Try to set the exact rate first, and confirm the clock framework + * can provide it. * - * NOTE: if the PLL (maybe through a divider) could actually make - * a clock rate 999 Hz higher instead of the one we want then this - * could be a problem. Unfortunately there's not much we can do - * since it's baked into DRM to use kHz. It shouldn't matter in - * practice since Rockchip PLLs are controlled by tables and - * even if there is a divider in the middle I wouldn't expect PLL - * rates in the table that are just a few kHz different. + * 2. If the clock framework cannot provide the exact rate, we should + * add 999 Hz to the requested rate. That way if the clock we need + * is 60000001 Hz (~60 MHz) and DRM tells us to make 60000 kHz then + * the clock framework will actually give us the right clock. * - * 2. Get the clock framework to round the rate for us to tell us + * 3. 
Get the clock framework to round the rate for us to tell us * what it will actually make. * - * 3. Store the rounded up rate so that we don't need to worry about + * 4. Store the rounded up rate so that we don't need to worry about * this in the actual clk_set_rate(). */ - rate = clk_round_rate(vop->dclk, adjusted_mode->clock * 1000 + 999); + rate = clk_round_rate(vop->dclk, adjusted_mode->clock * 1000); + if (rate / 1000 != adjusted_mode->clock) + rate = clk_round_rate(vop->dclk, + adjusted_mode->clock * 1000 + 999); adjusted_mode->clock = DIV_ROUND_UP(rate, 1000); return true; diff --git a/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.c b/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.c index f75fb157f2..016b877051 100644 --- a/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.c +++ b/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.c @@ -216,11 +216,13 @@ static int sun8i_dw_hdmi_bind(struct device *dev, struct device *master, goto err_disable_clk_tmds; } + ret = sun8i_hdmi_phy_init(hdmi->phy); + if (ret) + goto err_disable_clk_tmds; + drm_encoder_helper_add(encoder, &sun8i_dw_hdmi_encoder_helper_funcs); drm_simple_encoder_init(drm, encoder, DRM_MODE_ENCODER_TMDS); - sun8i_hdmi_phy_init(hdmi->phy); - plat_data->mode_valid = hdmi->quirks->mode_valid; plat_data->use_drm_infoframe = hdmi->quirks->use_drm_infoframe; sun8i_hdmi_phy_set_ops(hdmi->phy, plat_data); @@ -262,6 +264,7 @@ static void sun8i_dw_hdmi_unbind(struct device *dev, struct device *master, struct sun8i_dw_hdmi *hdmi = dev_get_drvdata(dev); dw_hdmi_unbind(hdmi->hdmi); + sun8i_hdmi_phy_deinit(hdmi->phy); clk_disable_unprepare(hdmi->clk_tmds); reset_control_assert(hdmi->rst_ctrl); gpiod_set_value(hdmi->ddc_en, 0); diff --git a/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.h b/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.h index 74f6ed0e25..bffe1b9cd3 100644 --- a/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.h +++ b/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.h @@ -169,6 +169,7 @@ struct sun8i_hdmi_phy { struct clk *clk_phy; struct clk *clk_pll0; struct clk *clk_pll1; + struct device *dev; unsigned int rcal; struct regmap *regs; struct reset_control *rst_phy; @@ -205,7 +206,8 @@ encoder_to_sun8i_dw_hdmi(struct drm_encoder *encoder) int sun8i_hdmi_phy_get(struct sun8i_dw_hdmi *hdmi, struct device_node *node); -void sun8i_hdmi_phy_init(struct sun8i_hdmi_phy *phy); +int sun8i_hdmi_phy_init(struct sun8i_hdmi_phy *phy); +void sun8i_hdmi_phy_deinit(struct sun8i_hdmi_phy *phy); void sun8i_hdmi_phy_set_ops(struct sun8i_hdmi_phy *phy, struct dw_hdmi_plat_data *plat_data); diff --git a/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c b/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c index c9239708d3..b64d93da65 100644 --- a/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c +++ b/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c @@ -506,9 +506,60 @@ static void sun8i_hdmi_phy_init_h3(struct sun8i_hdmi_phy *phy) phy->rcal = (val & SUN8I_HDMI_PHY_ANA_STS_RCAL_MASK) >> 2; } -void sun8i_hdmi_phy_init(struct sun8i_hdmi_phy *phy) +int sun8i_hdmi_phy_init(struct sun8i_hdmi_phy *phy) { + int ret; + + ret = reset_control_deassert(phy->rst_phy); + if (ret) { + dev_err(phy->dev, "Cannot deassert phy reset control: %d\n", ret); + return ret; + } + + ret = clk_prepare_enable(phy->clk_bus); + if (ret) { + dev_err(phy->dev, "Cannot enable bus clock: %d\n", ret); + goto err_assert_rst_phy; + } + + ret = clk_prepare_enable(phy->clk_mod); + if (ret) { + dev_err(phy->dev, "Cannot enable mod clock: %d\n", ret); + goto err_disable_clk_bus; + } + + if (phy->variant->has_phy_clk) { + ret = sun8i_phy_clk_create(phy, phy->dev, + phy->variant->has_second_pll); + if (ret) { + 
dev_err(phy->dev, "Couldn't create the PHY clock\n"); + goto err_disable_clk_mod; + } + + clk_prepare_enable(phy->clk_phy); + } + phy->variant->phy_init(phy); + + return 0; + +err_disable_clk_mod: + clk_disable_unprepare(phy->clk_mod); +err_disable_clk_bus: + clk_disable_unprepare(phy->clk_bus); +err_assert_rst_phy: + reset_control_assert(phy->rst_phy); + + return ret; +} + +void sun8i_hdmi_phy_deinit(struct sun8i_hdmi_phy *phy) +{ + clk_disable_unprepare(phy->clk_mod); + clk_disable_unprepare(phy->clk_bus); + clk_disable_unprepare(phy->clk_phy); + + reset_control_assert(phy->rst_phy); } void sun8i_hdmi_phy_set_ops(struct sun8i_hdmi_phy *phy, @@ -638,6 +689,7 @@ static int sun8i_hdmi_phy_probe(struct platform_device *pdev) return -ENOMEM; phy->variant = (struct sun8i_hdmi_phy_variant *)match->data; + phy->dev = dev; ret = of_address_to_resource(node, 0, &res); if (ret) { @@ -696,47 +748,10 @@ static int sun8i_hdmi_phy_probe(struct platform_device *pdev) goto err_put_clk_pll1; } - ret = reset_control_deassert(phy->rst_phy); - if (ret) { - dev_err(dev, "Cannot deassert phy reset control: %d\n", ret); - goto err_put_rst_phy; - } - - ret = clk_prepare_enable(phy->clk_bus); - if (ret) { - dev_err(dev, "Cannot enable bus clock: %d\n", ret); - goto err_deassert_rst_phy; - } - - ret = clk_prepare_enable(phy->clk_mod); - if (ret) { - dev_err(dev, "Cannot enable mod clock: %d\n", ret); - goto err_disable_clk_bus; - } - - if (phy->variant->has_phy_clk) { - ret = sun8i_phy_clk_create(phy, dev, - phy->variant->has_second_pll); - if (ret) { - dev_err(dev, "Couldn't create the PHY clock\n"); - goto err_disable_clk_mod; - } - - clk_prepare_enable(phy->clk_phy); - } - platform_set_drvdata(pdev, phy); return 0; -err_disable_clk_mod: - clk_disable_unprepare(phy->clk_mod); -err_disable_clk_bus: - clk_disable_unprepare(phy->clk_bus); -err_deassert_rst_phy: - reset_control_assert(phy->rst_phy); -err_put_rst_phy: - reset_control_put(phy->rst_phy); err_put_clk_pll1: clk_put(phy->clk_pll1); err_put_clk_pll0: @@ -753,12 +768,6 @@ static int sun8i_hdmi_phy_remove(struct platform_device *pdev) { struct sun8i_hdmi_phy *phy = platform_get_drvdata(pdev); - clk_disable_unprepare(phy->clk_mod); - clk_disable_unprepare(phy->clk_bus); - clk_disable_unprepare(phy->clk_phy); - - reset_control_assert(phy->rst_phy); - reset_control_put(phy->rst_phy); clk_put(phy->clk_pll0); diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c index 16c7aabb94..a29d64f875 100644 --- a/drivers/gpu/drm/tegra/dc.c +++ b/drivers/gpu/drm/tegra/dc.c @@ -1845,7 +1845,6 @@ tegra_crtc_update_memory_bandwidth(struct drm_crtc *crtc, bool prepare_bandwidth_transition) { const struct tegra_plane_state *old_tegra_state, *new_tegra_state; - const struct tegra_dc_state *old_dc_state, *new_dc_state; u32 i, new_avg_bw, old_avg_bw, new_peak_bw, old_peak_bw; const struct drm_plane_state *old_plane_state; const struct drm_crtc_state *old_crtc_state; @@ -1858,8 +1857,6 @@ tegra_crtc_update_memory_bandwidth(struct drm_crtc *crtc, return; old_crtc_state = drm_atomic_get_old_crtc_state(state, crtc); - old_dc_state = to_const_dc_state(old_crtc_state); - new_dc_state = to_const_dc_state(crtc->state); if (!crtc->state->active) { if (!old_crtc_state->active) diff --git a/drivers/gpu/drm/tegra/dc.h b/drivers/gpu/drm/tegra/dc.h index f0cb691852..40378308d5 100644 --- a/drivers/gpu/drm/tegra/dc.h +++ b/drivers/gpu/drm/tegra/dc.h @@ -35,12 +35,6 @@ static inline struct tegra_dc_state *to_dc_state(struct drm_crtc_state *state) return NULL; } -static inline const 
struct tegra_dc_state * -to_const_dc_state(const struct drm_crtc_state *state) -{ - return to_dc_state((struct drm_crtc_state *)state); -} - struct tegra_dc_stats { unsigned long frames; unsigned long vblank; diff --git a/drivers/gpu/drm/tegra/uapi.c b/drivers/gpu/drm/tegra/uapi.c index dc16a24f4d..690a339c52 100644 --- a/drivers/gpu/drm/tegra/uapi.c +++ b/drivers/gpu/drm/tegra/uapi.c @@ -222,7 +222,7 @@ int tegra_drm_ioctl_channel_map(struct drm_device *drm, void *data, struct drm_f mapping->iova = sg_dma_address(mapping->sgt->sgl); } - mapping->iova_end = mapping->iova + host1x_to_tegra_bo(mapping->bo)->size; + mapping->iova_end = mapping->iova + host1x_to_tegra_bo(mapping->bo)->gem.size; err = xa_alloc(&context->mappings, &args->mapping, mapping, XA_LIMIT(1, U32_MAX), GFP_KERNEL); diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index 93d9a3d9d6..d6ce4546ea 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -1457,14 +1457,6 @@ static int vc4_hdmi_audio_prepare(struct device *dev, void *data, return 0; } -static const struct snd_soc_dapm_widget vc4_hdmi_audio_widgets[] = { - SND_SOC_DAPM_OUTPUT("TX"), -}; - -static const struct snd_soc_dapm_route vc4_hdmi_audio_routes[] = { - { "TX", NULL, "Playback" }, -}; - static const struct snd_soc_component_driver vc4_hdmi_audio_cpu_dai_comp = { .name = "vc4-hdmi-cpu-dai-component", }; diff --git a/drivers/gpu/host1x/fence.c b/drivers/gpu/host1x/fence.c index 6941add95d..ecab728821 100644 --- a/drivers/gpu/host1x/fence.c +++ b/drivers/gpu/host1x/fence.c @@ -15,7 +15,7 @@ #include "intr.h" #include "syncpt.h" -DEFINE_SPINLOCK(lock); +static DEFINE_SPINLOCK(lock); struct host1x_syncpt_fence { struct dma_fence base; @@ -152,8 +152,10 @@ struct dma_fence *host1x_fence_create(struct host1x_syncpt *sp, u32 threshold) return ERR_PTR(-ENOMEM); fence->waiter = kzalloc(sizeof(*fence->waiter), GFP_KERNEL); - if (!fence->waiter) + if (!fence->waiter) { + kfree(fence); return ERR_PTR(-ENOMEM); + } fence->sp = sp; fence->threshold = threshold; diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 833fcf07ff..6ccfa0cb99 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -336,12 +336,19 @@ static int apple_event(struct hid_device *hdev, struct hid_field *field, /* * MacBook JIS keyboard has wrong logical maximum + * Magic Keyboard JIS has wrong logical maximum */ static __u8 *apple_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { struct apple_sc *asc = hid_get_drvdata(hdev); + if(*rsize >=71 && rdesc[70] == 0x65 && rdesc[64] == 0x65) { + hid_info(hdev, + "fixing up Magic Keyboard JIS report descriptor\n"); + rdesc[64] = rdesc[70] = 0xe7; + } + if ((asc->quirks & APPLE_RDESC_JIS) && *rsize >= 60 && rdesc[53] == 0x65 && rdesc[59] == 0x65) { hid_info(hdev, diff --git a/drivers/hid/hid-betopff.c b/drivers/hid/hid-betopff.c index 0790fbd3fc..467d789f9b 100644 --- a/drivers/hid/hid-betopff.c +++ b/drivers/hid/hid-betopff.c @@ -56,15 +56,22 @@ static int betopff_init(struct hid_device *hid) { struct betopff_device *betopff; struct hid_report *report; - struct hid_input *hidinput = - list_first_entry(&hid->inputs, struct hid_input, list); + struct hid_input *hidinput; struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list; - struct input_dev *dev = hidinput->input; + struct input_dev *dev; int field_count = 0; int error; int i, j; + if (list_empty(&hid->inputs)) { + hid_err(hid, "no inputs found\n"); + return -ENODEV; + } + + 
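+ /* Only now is list_first_entry() safe: on an empty list it would return the list head itself masquerading as a struct hid_input. */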
hidinput = list_first_entry(&hid->inputs, struct hid_input, list); + dev = hidinput->input; + if (list_empty(report_list)) { hid_err(hid, "no output reports found\n"); return -ENODEV; diff --git a/drivers/hid/hid-u2fzero.c b/drivers/hid/hid-u2fzero.c index 95e0807878..d70cd3d7f5 100644 --- a/drivers/hid/hid-u2fzero.c +++ b/drivers/hid/hid-u2fzero.c @@ -198,7 +198,9 @@ static int u2fzero_rng_read(struct hwrng *rng, void *data, } ret = u2fzero_recv(dev, &req, &resp); - if (ret < 0) + + /* ignore errors or packets without data */ + if (ret < offsetof(struct u2f_hid_msg, init.data)) return 0; /* only take the minimum amount of data it is safe to take */ diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index fd51769d09..33a6908995 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -4746,6 +4746,12 @@ static const struct wacom_features wacom_features_0x393 = { "Wacom Intuos Pro S", 31920, 19950, 8191, 63, INTUOSP2S_BT, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 7, .touch_max = 10 }; +static const struct wacom_features wacom_features_0x3c6 = + { "Wacom Intuos BT S", 15200, 9500, 4095, 63, + INTUOSHT3_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4 }; +static const struct wacom_features wacom_features_0x3c8 = + { "Wacom Intuos BT M", 21600, 13500, 4095, 63, + INTUOSHT3_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4 }; static const struct wacom_features wacom_features_HID_ANY_ID = { "Wacom HID", .type = HID_GENERIC, .oVid = HID_ANY_ID, .oPid = HID_ANY_ID }; @@ -4919,6 +4925,8 @@ const struct hid_device_id wacom_ids[] = { { USB_DEVICE_WACOM(0x37A) }, { USB_DEVICE_WACOM(0x37B) }, { BT_DEVICE_WACOM(0x393) }, + { BT_DEVICE_WACOM(0x3c6) }, + { BT_DEVICE_WACOM(0x3c8) }, { USB_DEVICE_WACOM(0x4001) }, { USB_DEVICE_WACOM(0x4004) }, { USB_DEVICE_WACOM(0x5000) }, diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index 38bc35ac81..3618a924e7 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -362,12 +362,6 @@ static const struct hwmon_channel_info *k10temp_info[] = { HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL), - HWMON_CHANNEL_INFO(in, - HWMON_I_INPUT | HWMON_I_LABEL, - HWMON_I_INPUT | HWMON_I_LABEL), - HWMON_CHANNEL_INFO(curr, - HWMON_C_INPUT | HWMON_C_LABEL, - HWMON_C_INPUT | HWMON_C_LABEL), NULL }; diff --git a/drivers/hwmon/ltc2947-core.c b/drivers/hwmon/ltc2947-core.c index bb3f7749a0..5423466de6 100644 --- a/drivers/hwmon/ltc2947-core.c +++ b/drivers/hwmon/ltc2947-core.c @@ -989,8 +989,12 @@ static int ltc2947_setup(struct ltc2947_data *st) return ret; /* check external clock presence */ - extclk = devm_clk_get(st->dev, NULL); - if (!IS_ERR(extclk)) { + extclk = devm_clk_get_optional(st->dev, NULL); + if (IS_ERR(extclk)) + return dev_err_probe(st->dev, PTR_ERR(extclk), + "Failed to get external clock\n"); + + if (extclk) { unsigned long rate_hz; u8 pre = 0, div, tbctl; u64 aux; diff --git a/drivers/hwmon/mlxreg-fan.c b/drivers/hwmon/mlxreg-fan.c index 116681fde3..89fe7b9fe2 100644 --- a/drivers/hwmon/mlxreg-fan.c +++ b/drivers/hwmon/mlxreg-fan.c @@ -315,8 +315,8 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, { struct mlxreg_fan *fan = cdev->devdata; unsigned long cur_state; + int i, config = 0; u32 regval; - int i; int err; /* @@ -329,6 +329,12 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, * overwritten. 
*/ if (state >= MLXREG_FAN_SPEED_MIN && state <= MLXREG_FAN_SPEED_MAX) { + /* + * This is configuration change, which is only supported through sysfs. + * For configuration non-zero value is to be returned to avoid thermal + * statistics update. + */ + config = 1; state -= MLXREG_FAN_MAX_STATE; for (i = 0; i < state; i++) fan->cooling_levels[i] = state; @@ -343,7 +349,7 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, cur_state = MLXREG_FAN_PWM_DUTY2STATE(regval); if (state < cur_state) - return 0; + return config; state = cur_state; } @@ -359,7 +365,7 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, dev_err(fan->dev, "Failed to write PWM duty\n"); return err; } - return 0; + return config; } static const struct thermal_cooling_device_ops mlxreg_fan_cooling_ops = { diff --git a/drivers/hwmon/occ/common.c b/drivers/hwmon/occ/common.c index 0d68a78be9..ae66461328 100644 --- a/drivers/hwmon/occ/common.c +++ b/drivers/hwmon/occ/common.c @@ -340,18 +340,11 @@ static ssize_t occ_show_temp_10(struct device *dev, if (val == OCC_TEMP_SENSOR_FAULT) return -EREMOTEIO; - /* - * VRM doesn't return temperature, only alarm bit. This - * attribute maps to tempX_alarm instead of tempX_input for - * VRM - */ - if (temp->fru_type != OCC_FRU_TYPE_VRM) { - /* sensor not ready */ - if (val == 0) - return -EAGAIN; + /* sensor not ready */ + if (val == 0) + return -EAGAIN; - val *= 1000; - } + val *= 1000; break; case 2: val = temp->fru_type; @@ -886,7 +879,7 @@ static int occ_setup_sensor_attrs(struct occ *occ) 0, i); attr++; - if (sensors->temp.version > 1 && + if (sensors->temp.version == 2 && temp->fru_type == OCC_FRU_TYPE_VRM) { snprintf(attr->name, sizeof(attr->name), "temp%d_alarm", s); diff --git a/drivers/hwmon/pmbus/ibm-cffps.c b/drivers/hwmon/pmbus/ibm-cffps.c index df712ce4b1..53f7d1418b 100644 --- a/drivers/hwmon/pmbus/ibm-cffps.c +++ b/drivers/hwmon/pmbus/ibm-cffps.c @@ -171,8 +171,14 @@ static ssize_t ibm_cffps_debugfs_read(struct file *file, char __user *buf, cmd = CFFPS_SN_CMD; break; case CFFPS_DEBUGFS_MAX_POWER_OUT: - rc = i2c_smbus_read_word_swapped(psu->client, - CFFPS_MAX_POWER_OUT_CMD); + if (psu->version == cffps1) { + rc = i2c_smbus_read_word_swapped(psu->client, + CFFPS_MAX_POWER_OUT_CMD); + } else { + rc = i2c_smbus_read_word_data(psu->client, + CFFPS_MAX_POWER_OUT_CMD); + } + if (rc < 0) return rc; diff --git a/drivers/hwmon/pmbus/mp2975.c b/drivers/hwmon/pmbus/mp2975.c index eb94bd5f4e..51986adfbf 100644 --- a/drivers/hwmon/pmbus/mp2975.c +++ b/drivers/hwmon/pmbus/mp2975.c @@ -54,7 +54,7 @@ #define MP2975_RAIL2_FUNC (PMBUS_HAVE_VOUT | PMBUS_HAVE_STATUS_VOUT | \ PMBUS_HAVE_IOUT | PMBUS_HAVE_STATUS_IOUT | \ - PMBUS_PHASE_VIRTUAL) + PMBUS_HAVE_POUT | PMBUS_PHASE_VIRTUAL) struct mp2975_data { struct pmbus_driver_info info; diff --git a/drivers/hwmon/tmp421.c b/drivers/hwmon/tmp421.c index ede66ea6a7..b963a369c5 100644 --- a/drivers/hwmon/tmp421.c +++ b/drivers/hwmon/tmp421.c @@ -100,71 +100,81 @@ struct tmp421_data { s16 temp[4]; }; -static int temp_from_s16(s16 reg) +static int temp_from_raw(u16 reg, bool extended) { /* Mask out status bits */ int temp = reg & ~0xf; - return (temp * 1000 + 128) / 256; + if (extended) + temp = temp - 64 * 256; + else + temp = (s16)temp; + + return DIV_ROUND_CLOSEST(temp * 1000, 256); } -static int temp_from_u16(u16 reg) +static int tmp421_update_device(struct tmp421_data *data) { - /* Mask out status bits */ - int temp = reg & ~0xf; - - /* Add offset for extended temperature range. 
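The new temp_from_raw() above merges the old signed and extended-range helpers and swaps the hand-rolled (temp * 1000 + 128) / 256 for DIV_ROUND_CLOSEST, which, unlike the old expression, also rounds negative readings to the nearest millidegree. A runnable userspace rendering of the same conversion; div_round_closest() here is a simplified stand-in for the kernel macro:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Round-to-nearest division; simplified stand-in for DIV_ROUND_CLOSEST. */
static int div_round_closest(int x, int d)
{
        return (x > 0 ? x + d / 2 : x - d / 2) / d;
}

/* Temperature lives in the top bits, status flags in bits 3:0.
 * Standard range: two's complement. Extended range: unsigned with a
 * -64 degree offset, so 0x4000 reads as 0 degrees C. */
static int temp_from_raw(uint16_t reg, bool extended)
{
        int temp = reg & ~0xf;

        if (extended)
                temp -= 64 * 256;
        else
                temp = (int16_t)temp;   /* sign-extend */

        return div_round_closest(temp * 1000, 256);     /* millidegrees */
}

int main(void)
{
        printf("%d\n", temp_from_raw(0x1900, false));   /* 25000 */
        printf("%d\n", temp_from_raw(0xe700, false));   /* -25000 */
        printf("%d\n", temp_from_raw(0x5900, true));    /* 25000 */
        return 0;
}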
*/ - temp -= 64 * 256; - - return (temp * 1000 + 128) / 256; -} - -static struct tmp421_data *tmp421_update_device(struct device *dev) -{ - struct tmp421_data *data = dev_get_drvdata(dev); struct i2c_client *client = data->client; + int ret = 0; int i; mutex_lock(&data->update_lock); if (time_after(jiffies, data->last_updated + (HZ / 2)) || !data->valid) { - data->config = i2c_smbus_read_byte_data(client, - TMP421_CONFIG_REG_1); + ret = i2c_smbus_read_byte_data(client, TMP421_CONFIG_REG_1); + if (ret < 0) + goto exit; + data->config = ret; for (i = 0; i < data->channels; i++) { - data->temp[i] = i2c_smbus_read_byte_data(client, - TMP421_TEMP_MSB[i]) << 8; - data->temp[i] |= i2c_smbus_read_byte_data(client, - TMP421_TEMP_LSB[i]); + ret = i2c_smbus_read_byte_data(client, TMP421_TEMP_MSB[i]); + if (ret < 0) + goto exit; + data->temp[i] = ret << 8; + + ret = i2c_smbus_read_byte_data(client, TMP421_TEMP_LSB[i]); + if (ret < 0) + goto exit; + data->temp[i] |= ret; } data->last_updated = jiffies; data->valid = 1; } +exit: mutex_unlock(&data->update_lock); - return data; + if (ret < 0) { + data->valid = 0; + return ret; + } + + return 0; } static int tmp421_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *val) { - struct tmp421_data *tmp421 = tmp421_update_device(dev); + struct tmp421_data *tmp421 = dev_get_drvdata(dev); + int ret = 0; + + ret = tmp421_update_device(tmp421); + if (ret) + return ret; switch (attr) { case hwmon_temp_input: - if (tmp421->config & TMP421_CONFIG_RANGE) - *val = temp_from_u16(tmp421->temp[channel]); - else - *val = temp_from_s16(tmp421->temp[channel]); + *val = temp_from_raw(tmp421->temp[channel], + tmp421->config & TMP421_CONFIG_RANGE); return 0; case hwmon_temp_fault: /* - * The OPEN bit signals a fault. This is bit 0 of the temperature - * register (low byte). 
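The tmp421_update_device() rework above is a common pattern for cached hardware reads: every bus access is checked, and any failure both propagates the error and clears the valid flag so a later reader cannot mistake stale data for fresh. A condensed, runnable sketch of that control flow; bus_read() is a stub standing in for the real I2C accessor, and the driver's locking is omitted for brevity:

#include <stdio.h>

/* Stand-in for i2c_smbus_read_byte_data(): 0..255 or negative errno. */
static int bus_read(int reg);

struct cache {
        int valid;
        int temp_raw;
};

/* On any bus error, drop validity and report the error, so stale data
 * is never served as fresh. */
static int update_cache(struct cache *c)
{
        int ret;

        ret = bus_read(0x00);           /* MSB */
        if (ret < 0)
                goto err;
        c->temp_raw = ret << 8;

        ret = bus_read(0x10);           /* LSB */
        if (ret < 0)
                goto err;
        c->temp_raw |= ret;

        c->valid = 1;
        return 0;
err:
        c->valid = 0;
        return ret;
}

static int fail_after = 2;

static int bus_read(int reg)
{
        (void)reg;
        return --fail_after < 0 ? -5 /* -EIO */ : 0x19;
}

int main(void)
{
        struct cache c = { 0 };

        printf("first:  %d (valid=%d)\n", update_cache(&c), c.valid);
        printf("second: %d (valid=%d)\n", update_cache(&c), c.valid);
        return 0;
}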
+ * Any of OPEN or /PVLD bits indicate a hardware malfunction + * and the conversion result may be incorrect */ - *val = tmp421->temp[channel] & 0x01; + *val = !!(tmp421->temp[channel] & 0x03); return 0; default: return -EOPNOTSUPP; @@ -177,9 +187,6 @@ static umode_t tmp421_is_visible(const void *data, enum hwmon_sensor_types type, { switch (attr) { case hwmon_temp_fault: - if (channel == 0) - return 0; - return 0444; case hwmon_temp_input: return 0444; default: diff --git a/drivers/hwmon/w83791d.c b/drivers/hwmon/w83791d.c index 37b25a1474..3c1be2c11f 100644 --- a/drivers/hwmon/w83791d.c +++ b/drivers/hwmon/w83791d.c @@ -273,9 +273,6 @@ struct w83791d_data { char valid; /* !=0 if following fields are valid */ unsigned long last_updated; /* In jiffies */ - /* array of 2 pointers to subclients */ - struct i2c_client *lm75[2]; - /* volts */ u8 in[NUMBER_OF_VIN]; /* Register value */ u8 in_max[NUMBER_OF_VIN]; /* Register value */ @@ -1257,7 +1254,6 @@ static const struct attribute_group w83791d_group_fanpwm45 = { static int w83791d_detect_subclients(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; - struct w83791d_data *data = i2c_get_clientdata(client); int address = client->addr; int i, id; u8 val; @@ -1280,22 +1276,19 @@ static int w83791d_detect_subclients(struct i2c_client *client) } val = w83791d_read(client, W83791D_REG_I2C_SUBADDR); - if (!(val & 0x08)) - data->lm75[0] = devm_i2c_new_dummy_device(&client->dev, adapter, - 0x48 + (val & 0x7)); - if (!(val & 0x80)) { - if (!IS_ERR(data->lm75[0]) && - ((val & 0x7) == ((val >> 4) & 0x7))) { - dev_err(&client->dev, - "duplicate addresses 0x%x, " - "use force_subclient\n", - data->lm75[0]->addr); - return -ENODEV; - } - data->lm75[1] = devm_i2c_new_dummy_device(&client->dev, adapter, - 0x48 + ((val >> 4) & 0x7)); + + if (!(val & 0x88) && (val & 0x7) == ((val >> 4) & 0x7)) { + dev_err(&client->dev, + "duplicate addresses 0x%x, use force_subclient\n", 0x48 + (val & 0x7)); + return -ENODEV; } + if (!(val & 0x08)) + devm_i2c_new_dummy_device(&client->dev, adapter, 0x48 + (val & 0x7)); + + if (!(val & 0x80)) + devm_i2c_new_dummy_device(&client->dev, adapter, 0x48 + ((val >> 4) & 0x7)); + return 0; } diff --git a/drivers/hwmon/w83792d.c b/drivers/hwmon/w83792d.c index abd5c3a722..1f175f3813 100644 --- a/drivers/hwmon/w83792d.c +++ b/drivers/hwmon/w83792d.c @@ -264,9 +264,6 @@ struct w83792d_data { char valid; /* !=0 if following fields are valid */ unsigned long last_updated; /* In jiffies */ - /* array of 2 pointers to subclients */ - struct i2c_client *lm75[2]; - u8 in[9]; /* Register value */ u8 in_max[9]; /* Register value */ u8 in_min[9]; /* Register value */ @@ -927,7 +924,6 @@ w83792d_detect_subclients(struct i2c_client *new_client) int address = new_client->addr; u8 val; struct i2c_adapter *adapter = new_client->adapter; - struct w83792d_data *data = i2c_get_clientdata(new_client); id = i2c_adapter_id(adapter); if (force_subclients[0] == id && force_subclients[1] == address) { @@ -946,21 +942,19 @@ w83792d_detect_subclients(struct i2c_client *new_client) } val = w83792d_read_value(new_client, W83792D_REG_I2C_SUBADDR); - if (!(val & 0x08)) - data->lm75[0] = devm_i2c_new_dummy_device(&new_client->dev, adapter, - 0x48 + (val & 0x7)); - if (!(val & 0x80)) { - if (!IS_ERR(data->lm75[0]) && - ((val & 0x7) == ((val >> 4) & 0x7))) { - dev_err(&new_client->dev, - "duplicate addresses 0x%x, use force_subclient\n", - data->lm75[0]->addr); - return -ENODEV; - } - data->lm75[1] = devm_i2c_new_dummy_device(&new_client->dev,
adapter, - 0x48 + ((val >> 4) & 0x7)); + + if (!(val & 0x88) && (val & 0x7) == ((val >> 4) & 0x7)) { + dev_err(&new_client->dev, + "duplicate addresses 0x%x, use force_subclient\n", 0x48 + (val & 0x7)); + return -ENODEV; } + if (!(val & 0x08)) + devm_i2c_new_dummy_device(&new_client->dev, adapter, 0x48 + (val & 0x7)); + + if (!(val & 0x80)) + devm_i2c_new_dummy_device(&new_client->dev, adapter, 0x48 + ((val >> 4) & 0x7)); + return 0; } diff --git a/drivers/hwmon/w83793.c b/drivers/hwmon/w83793.c index e7d0484eab..1d2854de1c 100644 --- a/drivers/hwmon/w83793.c +++ b/drivers/hwmon/w83793.c @@ -202,7 +202,6 @@ static inline s8 TEMP_TO_REG(long val, s8 min, s8 max) } struct w83793_data { - struct i2c_client *lm75[2]; struct device *hwmon_dev; struct mutex update_lock; char valid; /* !=0 if following fields are valid */ @@ -1566,7 +1565,6 @@ w83793_detect_subclients(struct i2c_client *client) int address = client->addr; u8 tmp; struct i2c_adapter *adapter = client->adapter; - struct w83793_data *data = i2c_get_clientdata(client); id = i2c_adapter_id(adapter); if (force_subclients[0] == id && force_subclients[1] == address) { @@ -1586,21 +1584,19 @@ w83793_detect_subclients(struct i2c_client *client) } tmp = w83793_read_value(client, W83793_REG_I2C_SUBADDR); - if (!(tmp & 0x08)) - data->lm75[0] = devm_i2c_new_dummy_device(&client->dev, adapter, - 0x48 + (tmp & 0x7)); - if (!(tmp & 0x80)) { - if (!IS_ERR(data->lm75[0]) - && ((tmp & 0x7) == ((tmp >> 4) & 0x7))) { - dev_err(&client->dev, - "duplicate addresses 0x%x, " - "use force_subclients\n", data->lm75[0]->addr); - return -ENODEV; - } - data->lm75[1] = devm_i2c_new_dummy_device(&client->dev, adapter, - 0x48 + ((tmp >> 4) & 0x7)); + + if (!(tmp & 0x88) && (tmp & 0x7) == ((tmp >> 4) & 0x7)) { + dev_err(&client->dev, + "duplicate addresses 0x%x, use force_subclient\n", 0x48 + (tmp & 0x7)); + return -ENODEV; } + if (!(tmp & 0x08)) + devm_i2c_new_dummy_device(&client->dev, adapter, 0x48 + (tmp & 0x7)); + + if (!(tmp & 0x80)) + devm_i2c_new_dummy_device(&client->dev, adapter, 0x48 + ((tmp >> 4) & 0x7)); + return 0; } diff --git a/drivers/i2c/busses/i2c-mlxcpld.c b/drivers/i2c/busses/i2c-mlxcpld.c index 4e0b7c2882..015e11c466 100644 --- a/drivers/i2c/busses/i2c-mlxcpld.c +++ b/drivers/i2c/busses/i2c-mlxcpld.c @@ -49,7 +49,7 @@ #define MLXCPLD_LPCI2C_NACK_IND 2 #define MLXCPLD_I2C_FREQ_1000KHZ_SET 0x04 -#define MLXCPLD_I2C_FREQ_400KHZ_SET 0x0f +#define MLXCPLD_I2C_FREQ_400KHZ_SET 0x0c #define MLXCPLD_I2C_FREQ_100KHZ_SET 0x42 enum mlxcpld_i2c_frequency { @@ -495,7 +495,7 @@ mlxcpld_i2c_set_frequency(struct mlxcpld_i2c_priv *priv, return err; /* Set frequency only if it is not 100KHz, which is default. 
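The three detect_subclients() rewrites above (w83791d, w83792d, w83793) converge on the same shape: decode the two 3-bit subclient addresses packed into one register, reject the configuration when both clients are enabled with equal bits, and only then create the dummy devices, whose handles no longer need storing since devm manages their lifetime. A runnable distillation of the decode-and-validate step, with printf standing in for device registration:

#include <stdio.h>

/* One register packs two optional subclient addresses: bits 2:0 and
 * 6:4 hold the low address bits, bits 3 and 7 disable the respective
 * client. Both enabled with equal bits would be the same I2C address. */
static int parse_subclients(unsigned char val)
{
        if (!(val & 0x88) && (val & 0x7) == ((val >> 4) & 0x7)) {
                printf("duplicate addresses 0x%x\n", 0x48 + (val & 0x7));
                return -19;     /* -ENODEV */
        }

        if (!(val & 0x08))
                printf("subclient 0 at 0x%x\n", 0x48 + (val & 0x7));

        if (!(val & 0x80))
                printf("subclient 1 at 0x%x\n", 0x48 + ((val >> 4) & 0x7));

        return 0;
}

int main(void)
{
        parse_subclients(0x21);         /* clients at 0x49 and 0x4a */
        parse_subclients(0x11);         /* both at 0x49: rejected */
        parse_subclients(0x81);         /* only subclient 0, at 0x49 */
        return 0;
}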
*/ - switch ((data->reg & data->mask) >> data->bit) { + switch ((regval & data->mask) >> data->bit) { case MLXCPLD_I2C_FREQ_1000KHZ: freq = MLXCPLD_I2C_FREQ_1000KHZ_SET; break; diff --git a/drivers/i2c/busses/i2c-mt65xx.c b/drivers/i2c/busses/i2c-mt65xx.c index 477480d1de..7d4b3eb707 100644 --- a/drivers/i2c/busses/i2c-mt65xx.c +++ b/drivers/i2c/busses/i2c-mt65xx.c @@ -41,6 +41,8 @@ #define I2C_HANDSHAKE_RST 0x0020 #define I2C_FIFO_ADDR_CLR 0x0001 #define I2C_DELAY_LEN 0x0002 +#define I2C_ST_START_CON 0x8001 +#define I2C_FS_START_CON 0x1800 #define I2C_TIME_CLR_VALUE 0x0000 #define I2C_TIME_DEFAULT_VALUE 0x0003 #define I2C_WRRD_TRANAC_VALUE 0x0002 @@ -480,6 +482,7 @@ static void mtk_i2c_init_hw(struct mtk_i2c *i2c) { u16 control_reg; u16 intr_stat_reg; + u16 ext_conf_val; mtk_i2c_writew(i2c, I2C_CHN_CLR_FLAG, OFFSET_START); intr_stat_reg = mtk_i2c_readw(i2c, OFFSET_INTR_STAT); @@ -518,8 +521,13 @@ static void mtk_i2c_init_hw(struct mtk_i2c *i2c) if (i2c->dev_comp->ltiming_adjust) mtk_i2c_writew(i2c, i2c->ltiming_reg, OFFSET_LTIMING); + if (i2c->speed_hz <= I2C_MAX_STANDARD_MODE_FREQ) + ext_conf_val = I2C_ST_START_CON; + else + ext_conf_val = I2C_FS_START_CON; + if (i2c->dev_comp->timing_adjust) { - mtk_i2c_writew(i2c, i2c->ac_timing.ext, OFFSET_EXT_CONF); + ext_conf_val = i2c->ac_timing.ext; mtk_i2c_writew(i2c, i2c->ac_timing.inter_clk_div, OFFSET_CLOCK_DIV); mtk_i2c_writew(i2c, I2C_SCL_MIS_COMP_VALUE, @@ -544,6 +552,7 @@ static void mtk_i2c_init_hw(struct mtk_i2c *i2c) OFFSET_HS_STA_STO_AC_TIMING); } } + mtk_i2c_writew(i2c, ext_conf_val, OFFSET_EXT_CONF); /* If use i2c pin from PMIC mt6397 side, need set PATH_DIR first */ if (i2c->have_pmic) diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c index aaeeacc121..546cc935e0 100644 --- a/drivers/i2c/i2c-core-acpi.c +++ b/drivers/i2c/i2c-core-acpi.c @@ -454,6 +454,7 @@ static int i2c_acpi_notify(struct notifier_block *nb, unsigned long value, break; i2c_acpi_register_device(adapter, adev, &info); + put_device(&adapter->dev); break; case ACPI_RECONFIG_DEVICE_REMOVE: if (!acpi_device_enumerated(adev)) diff --git a/drivers/iio/accel/fxls8962af-core.c b/drivers/iio/accel/fxls8962af-core.c index 0019f1ea7d..f41db9e024 100644 --- a/drivers/iio/accel/fxls8962af-core.c +++ b/drivers/iio/accel/fxls8962af-core.c @@ -738,7 +738,7 @@ static irqreturn_t fxls8962af_interrupt(int irq, void *p) if (reg & FXLS8962AF_INT_STATUS_SRC_BUF) { ret = fxls8962af_fifo_flush(indio_dev); - if (ret) + if (ret < 0) return IRQ_NONE; return IRQ_HANDLED; diff --git a/drivers/iio/adc/ad7192.c b/drivers/iio/adc/ad7192.c index ee8ed94810..2121a812b0 100644 --- a/drivers/iio/adc/ad7192.c +++ b/drivers/iio/adc/ad7192.c @@ -293,6 +293,7 @@ static const struct ad_sigma_delta_info ad7192_sigma_delta_info = { .has_registers = true, .addr_shift = 3, .read_mask = BIT(6), + .irq_flags = IRQF_TRIGGER_FALLING, }; static const struct ad_sd_calib_data ad7192_calib_arr[8] = { diff --git a/drivers/iio/adc/ad7780.c b/drivers/iio/adc/ad7780.c index 42bb952f47..b6e8c8abf6 100644 --- a/drivers/iio/adc/ad7780.c +++ b/drivers/iio/adc/ad7780.c @@ -203,7 +203,7 @@ static const struct ad_sigma_delta_info ad7780_sigma_delta_info = { .set_mode = ad7780_set_mode, .postprocess_sample = ad7780_postprocess_sample, .has_registers = false, - .irq_flags = IRQF_TRIGGER_LOW, + .irq_flags = IRQF_TRIGGER_FALLING, }; #define _AD7780_CHANNEL(_bits, _wordsize, _mask_all) \ diff --git a/drivers/iio/adc/ad7793.c b/drivers/iio/adc/ad7793.c index ef3e2d3ecb..0e7ab3fb07 100644 --- 
a/drivers/iio/adc/ad7793.c +++ b/drivers/iio/adc/ad7793.c @@ -206,7 +206,7 @@ static const struct ad_sigma_delta_info ad7793_sigma_delta_info = { .has_registers = true, .addr_shift = 3, .read_mask = BIT(6), - .irq_flags = IRQF_TRIGGER_LOW, + .irq_flags = IRQF_TRIGGER_FALLING, }; static const struct ad_sd_calib_data ad7793_calib_arr[6] = { diff --git a/drivers/iio/adc/aspeed_adc.c b/drivers/iio/adc/aspeed_adc.c index 19efaa41bc..34ec0c28b2 100644 --- a/drivers/iio/adc/aspeed_adc.c +++ b/drivers/iio/adc/aspeed_adc.c @@ -183,6 +183,7 @@ static int aspeed_adc_probe(struct platform_device *pdev) data = iio_priv(indio_dev); data->dev = &pdev->dev; + platform_set_drvdata(pdev, indio_dev); data->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(data->base)) diff --git a/drivers/iio/adc/max1027.c b/drivers/iio/adc/max1027.c index 655ab02d03..b753658bb4 100644 --- a/drivers/iio/adc/max1027.c +++ b/drivers/iio/adc/max1027.c @@ -103,7 +103,7 @@ MODULE_DEVICE_TABLE(of, max1027_adc_dt_ids); .sign = 'u', \ .realbits = depth, \ .storagebits = 16, \ - .shift = 2, \ + .shift = (depth == 10) ? 2 : 0, \ .endianness = IIO_BE, \ }, \ } @@ -142,7 +142,6 @@ MODULE_DEVICE_TABLE(of, max1027_adc_dt_ids); MAX1027_V_CHAN(11, depth) #define MAX1X31_CHANNELS(depth) \ - MAX1X27_CHANNELS(depth), \ MAX1X29_CHANNELS(depth), \ MAX1027_V_CHAN(12, depth), \ MAX1027_V_CHAN(13, depth), \ diff --git a/drivers/iio/adc/mt6577_auxadc.c b/drivers/iio/adc/mt6577_auxadc.c index 79c1dd68b9..d4fccd52ef 100644 --- a/drivers/iio/adc/mt6577_auxadc.c +++ b/drivers/iio/adc/mt6577_auxadc.c @@ -82,6 +82,10 @@ static const struct iio_chan_spec mt6577_auxadc_iio_channels[] = { MT6577_AUXADC_CHANNEL(15), }; +/* For Voltage calculation */ +#define VOLTAGE_FULL_RANGE 1500 /* VA voltage */ +#define AUXADC_PRECISE 4096 /* 12 bits */ + static int mt_auxadc_get_cali_data(int rawdata, bool enable_cali) { return rawdata; @@ -191,6 +195,10 @@ static int mt6577_auxadc_read_raw(struct iio_dev *indio_dev, } if (adc_dev->dev_comp->sample_data_cali) *val = mt_auxadc_get_cali_data(*val, true); + + /* Convert adc raw data to voltage: 0 - 1500 mV */ + *val = *val * VOLTAGE_FULL_RANGE / AUXADC_PRECISE; + return IIO_VAL_INT; default: diff --git a/drivers/iio/adc/rzg2l_adc.c b/drivers/iio/adc/rzg2l_adc.c index 9996d5eef2..32fbf57c36 100644 --- a/drivers/iio/adc/rzg2l_adc.c +++ b/drivers/iio/adc/rzg2l_adc.c @@ -401,7 +401,7 @@ static int rzg2l_adc_hw_init(struct rzg2l_adc *adc) exit_hw_init: clk_disable_unprepare(adc->pclk); - return 0; + return ret; } static void rzg2l_adc_pm_runtime_disable(void *data) @@ -570,8 +570,10 @@ static int __maybe_unused rzg2l_adc_pm_runtime_resume(struct device *dev) return ret; ret = clk_prepare_enable(adc->adclk); - if (ret) + if (ret) { + clk_disable_unprepare(adc->pclk); return ret; + } rzg2l_adc_pwr(adc, true); diff --git a/drivers/iio/adc/ti-adc128s052.c b/drivers/iio/adc/ti-adc128s052.c index 3143f35a65..83c1ae07b3 100644 --- a/drivers/iio/adc/ti-adc128s052.c +++ b/drivers/iio/adc/ti-adc128s052.c @@ -171,7 +171,13 @@ static int adc128_probe(struct spi_device *spi) mutex_init(&adc->lock); ret = iio_device_register(indio_dev); + if (ret) + goto err_disable_regulator; + return 0; + +err_disable_regulator: + regulator_disable(adc->reg); return ret; } diff --git a/drivers/iio/common/ssp_sensors/ssp_spi.c b/drivers/iio/common/ssp_sensors/ssp_spi.c index 4864c38b8d..769bd92805 100644 --- a/drivers/iio/common/ssp_sensors/ssp_spi.c +++ b/drivers/iio/common/ssp_sensors/ssp_spi.c @@ -137,7 +137,7 @@ static int 
ssp_print_mcu_debug(char *data_frame, int *data_index, if (length > received_len - *data_index || length <= 0) { ssp_dbg("[SSP]: MSG From MCU-invalid debug length(%d/%d)\n", length, received_len); - return length ? length : -EPROTO; + return -EPROTO; } ssp_dbg("[SSP]: MSG From MCU - %s\n", &data_frame[*data_index]); @@ -273,6 +273,8 @@ static int ssp_parse_dataframe(struct ssp_data *data, char *dataframe, int len) for (idx = 0; idx < len;) { switch (dataframe[idx++]) { case SSP_MSG2AP_INST_BYPASS_DATA: + if (idx >= len) + return -EPROTO; sd = dataframe[idx++]; if (sd < 0 || sd >= SSP_SENSOR_MAX) { dev_err(SSP_DEV, @@ -282,10 +284,13 @@ static int ssp_parse_dataframe(struct ssp_data *data, char *dataframe, int len) if (indio_devs[sd]) { spd = iio_priv(indio_devs[sd]); - if (spd->process_data) + if (spd->process_data) { + if (idx >= len) + return -EPROTO; spd->process_data(indio_devs[sd], &dataframe[idx], data->timestamp); + } } else { dev_err(SSP_DEV, "no client for frame\n"); } @@ -293,6 +298,8 @@ static int ssp_parse_dataframe(struct ssp_data *data, char *dataframe, int len) idx += ssp_offset_map[sd]; break; case SSP_MSG2AP_INST_DEBUG_DATA: + if (idx >= len) + return -EPROTO; sd = ssp_print_mcu_debug(dataframe, &idx, len); if (sd) { dev_err(SSP_DEV, diff --git a/drivers/iio/dac/ti-dac5571.c b/drivers/iio/dac/ti-dac5571.c index 2a5ba1b08a..546a4cf6c5 100644 --- a/drivers/iio/dac/ti-dac5571.c +++ b/drivers/iio/dac/ti-dac5571.c @@ -350,6 +350,7 @@ static int dac5571_probe(struct i2c_client *client, data->dac5571_pwrdwn = dac5571_pwrdwn_quad; break; default: + ret = -EINVAL; goto err; } diff --git a/drivers/iio/imu/adis16475.c b/drivers/iio/imu/adis16475.c index eb48102f94..287fff39a9 100644 --- a/drivers/iio/imu/adis16475.c +++ b/drivers/iio/imu/adis16475.c @@ -353,10 +353,11 @@ static int adis16475_set_freq(struct adis16475 *st, const u32 freq) if (dec > st->info->max_dec) dec = st->info->max_dec; - ret = adis_write_reg_16(&st->adis, ADIS16475_REG_DEC_RATE, dec); + ret = __adis_write_reg_16(&st->adis, ADIS16475_REG_DEC_RATE, dec); if (ret) goto error; + adis_dev_unlock(&st->adis); /* * If decimation is used, then gyro and accel data will have meaningful * bits on the LSB registers. This info is used on the trigger handler. 
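The ssp_parse_dataframe() and ssp_print_mcu_debug() changes a few hunks up add an idx >= len (or length-range) check before every byte the parser consumes, so a truncated or malformed frame fails with -EPROTO instead of reading past the buffer. A self-contained sketch of that defensive loop over a toy two-instruction format:

#include <stdio.h>

#define INST_DATA       1
#define INST_DEBUG      2
#define EPROTO          71

/* Every byte consumed is preceded by a bounds check, so a truncated
 * frame fails with -EPROTO instead of reading past the buffer. */
static int parse_frame(const unsigned char *buf, int len)
{
        int idx = 0;

        while (idx < len) {
                switch (buf[idx++]) {
                case INST_DATA:
                        if (idx >= len)
                                return -EPROTO; /* sensor id missing */
                        printf("data for sensor %d\n", buf[idx++]);
                        break;
                case INST_DEBUG:
                        if (idx >= len)
                                return -EPROTO; /* length byte missing */
                        printf("debug record, %d bytes\n", buf[idx++]);
                        break;
                default:
                        return -EPROTO;         /* unknown instruction */
                }
        }
        return 0;
}

int main(void)
{
        const unsigned char ok[] = { INST_DATA, 3, INST_DEBUG, 0 };
        const unsigned char bad[] = { INST_DATA };

        printf("ok:  %d\n", parse_frame(ok, sizeof(ok)));
        printf("bad: %d\n", parse_frame(bad, sizeof(bad)));
        return 0;
}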
diff --git a/drivers/iio/imu/adis16480.c b/drivers/iio/imu/adis16480.c index a869a6e52a..ed129321a1 100644 --- a/drivers/iio/imu/adis16480.c +++ b/drivers/iio/imu/adis16480.c @@ -144,6 +144,7 @@ struct adis16480_chip_info { unsigned int max_dec_rate; const unsigned int *filter_freqs; bool has_pps_clk_mode; + bool has_sleep_cnt; const struct adis_data adis_data; }; @@ -939,6 +940,7 @@ static const struct adis16480_chip_info adis16480_chip_info[] = { .temp_scale = 5650, /* 5.65 milli degree Celsius */ .int_clk = 2460000, .max_dec_rate = 2048, + .has_sleep_cnt = true, .filter_freqs = adis16480_def_filter_freqs, .adis_data = ADIS16480_DATA(16375, &adis16485_timeouts, 0), }, @@ -952,6 +954,7 @@ static const struct adis16480_chip_info adis16480_chip_info[] = { .temp_scale = 5650, /* 5.65 milli degree Celsius */ .int_clk = 2460000, .max_dec_rate = 2048, + .has_sleep_cnt = true, .filter_freqs = adis16480_def_filter_freqs, .adis_data = ADIS16480_DATA(16480, &adis16480_timeouts, 0), }, @@ -965,6 +968,7 @@ static const struct adis16480_chip_info adis16480_chip_info[] = { .temp_scale = 5650, /* 5.65 milli degree Celsius */ .int_clk = 2460000, .max_dec_rate = 2048, + .has_sleep_cnt = true, .filter_freqs = adis16480_def_filter_freqs, .adis_data = ADIS16480_DATA(16485, &adis16485_timeouts, 0), }, @@ -978,6 +982,7 @@ static const struct adis16480_chip_info adis16480_chip_info[] = { .temp_scale = 5650, /* 5.65 milli degree Celsius */ .int_clk = 2460000, .max_dec_rate = 2048, + .has_sleep_cnt = true, .filter_freqs = adis16480_def_filter_freqs, .adis_data = ADIS16480_DATA(16488, &adis16485_timeouts, 0), }, @@ -1425,9 +1430,12 @@ static int adis16480_probe(struct spi_device *spi) if (ret) return ret; - ret = devm_add_action_or_reset(&spi->dev, adis16480_stop, indio_dev); - if (ret) - return ret; + if (st->chip_info->has_sleep_cnt) { + ret = devm_add_action_or_reset(&spi->dev, adis16480_stop, + indio_dev); + if (ret) + return ret; + } ret = adis16480_config_irq_pin(spi->dev.of_node, st); if (ret) diff --git a/drivers/iio/light/opt3001.c b/drivers/iio/light/opt3001.c index 52963da401..1880bd5bb2 100644 --- a/drivers/iio/light/opt3001.c +++ b/drivers/iio/light/opt3001.c @@ -276,6 +276,8 @@ static int opt3001_get_lux(struct opt3001 *opt, int *val, int *val2) ret = wait_event_timeout(opt->result_ready_queue, opt->result_ready, msecs_to_jiffies(OPT3001_RESULT_READY_LONG)); + if (ret == 0) + return -ETIMEDOUT; } else { /* Sleep for result ready time */ timeout = (opt->int_time == OPT3001_INT_TIME_SHORT) ? 
@@ -312,9 +314,7 @@ static int opt3001_get_lux(struct opt3001 *opt, int *val, int *val2) /* Disallow IRQ to access the device while lock is active */ opt->ok_to_ignore_lock = false; - if (ret == 0) - return -ETIMEDOUT; - else if (ret < 0) + if (ret < 0) return ret; if (opt->use_irq) { diff --git a/drivers/iio/test/Makefile b/drivers/iio/test/Makefile index f1099b4953..467519a202 100644 --- a/drivers/iio/test/Makefile +++ b/drivers/iio/test/Makefile @@ -5,3 +5,4 @@ # Keep in alphabetical order obj-$(CONFIG_IIO_TEST_FORMAT) += iio-test-format.o +CFLAGS_iio-test-format.o += $(DISABLE_STRUCTLEAK_PLUGIN) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index c40791bace..704ce59554 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1746,15 +1746,16 @@ static void cma_cancel_route(struct rdma_id_private *id_priv) } } -static void cma_cancel_listens(struct rdma_id_private *id_priv) +static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; + lockdep_assert_held(&lock); + /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ - mutex_lock(&lock); list_del(&id_priv->list); while (!list_empty(&id_priv->listen_list)) { @@ -1768,6 +1769,12 @@ static void cma_cancel_listens(struct rdma_id_private *id_priv) rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } +} + +static void cma_cancel_listens(struct rdma_id_private *id_priv) +{ + mutex_lock(&lock); + _cma_cancel_listens(id_priv); mutex_unlock(&lock); } @@ -1776,6 +1783,14 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv, { switch (state) { case RDMA_CM_ADDR_QUERY: + /* + * We can avoid doing the rdma_addr_cancel() based on state, + * only RDMA_CM_ADDR_QUERY has a work that could still execute. + * Notice that the addr_handler work could still be exiting + * outside this state, however due to the interaction with the + * handler_mutex the work is guaranteed not to touch id_priv + * during exit. + */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: @@ -1810,6 +1825,8 @@ static void cma_release_port(struct rdma_id_private *id_priv) static void destroy_mc(struct rdma_id_private *id_priv, struct cma_multicast *mc) { + bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); + if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) ib_sa_free_multicast(mc->sa_mc); @@ -1826,7 +1843,10 @@ static void destroy_mc(struct rdma_id_private *id_priv, cma_set_mgid(id_priv, (struct sockaddr *)&mc->addr, &mgid); - cma_igmp_send(ndev, &mgid, false); + + if (!send_only) + cma_igmp_send(ndev, &mgid, false); + dev_put(ndev); } @@ -2574,7 +2594,7 @@ static int cma_listen_on_all(struct rdma_id_private *id_priv) return 0; err_listen: - list_del(&id_priv->list); + _cma_cancel_listens(id_priv); mutex_unlock(&lock); if (to_destroy) rdma_destroy_id(&to_destroy->id); @@ -3413,6 +3433,21 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { + /* + * The FSM can return back to RDMA_CM_ADDR_BOUND after + * rdma_resolve_ip() is called, eg through the error + * path in addr_handler(). If this happens the existing + * request must be canceled before issuing a new one. + * Since canceling a request is a bit slow and this + * oddball path is rare, keep track once a request has + * been issued. 
The track turns out to be a permanent + * state since this is the only cancel as it is + * immediately before rdma_resolve_ip(). + */ + if (id_priv->used_resolve_ip) + rdma_addr_cancel(&id->route.addr.dev_addr); + else + id_priv->used_resolve_ip = 1; ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, @@ -3771,9 +3806,13 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { + struct sockaddr_in any_in = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + /* For a well behaved ULP state will be RDMA_CM_IDLE */ - id->route.addr.src_addr.ss_family = AF_INET; - ret = rdma_bind_addr(id, cma_src_addr(id_priv)); + ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index 5c463da998..f92f101ea9 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -91,6 +91,7 @@ struct rdma_id_private { u8 afonly; u8 timeout; u8 min_rnr_timer; + u8 used_resolve_ip; enum ib_gid_type gid_type; /* diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index e74ddbe465..15b0cb0f36 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -876,14 +876,14 @@ void hfi1_ipoib_tx_timeout(struct net_device *dev, unsigned int q) struct hfi1_ipoib_txq *txq = &priv->txqs[q]; u64 completed = atomic64_read(&txq->complete_txreqs); - dd_dev_info(priv->dd, "timeout txq %llx q %u stopped %u stops %d no_desc %d ring_full %d\n", - (unsigned long long)txq, q, + dd_dev_info(priv->dd, "timeout txq %p q %u stopped %u stops %d no_desc %d ring_full %d\n", + txq, q, __netif_subqueue_stopped(dev, txq->q_idx), atomic_read(&txq->stops), atomic_read(&txq->no_desc), atomic_read(&txq->ring_full)); - dd_dev_info(priv->dd, "sde %llx engine %u\n", - (unsigned long long)txq->sde, + dd_dev_info(priv->dd, "sde %p engine %u\n", + txq->sde, txq->sde ? 
txq->sde->this_idx : 0); dd_dev_info(priv->dd, "flow %x\n", txq->flow.as_int); dd_dev_info(priv->dd, "sent %llu completed %llu used %llu\n", diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 1e9c3c5bee..d763f09759 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -326,19 +326,30 @@ static void set_cq_param(struct hns_roce_cq *hr_cq, u32 cq_entries, int vector, INIT_LIST_HEAD(&hr_cq->rq_list); } -static void set_cqe_size(struct hns_roce_cq *hr_cq, struct ib_udata *udata, - struct hns_roce_ib_create_cq *ucmd) +static int set_cqe_size(struct hns_roce_cq *hr_cq, struct ib_udata *udata, + struct hns_roce_ib_create_cq *ucmd) { struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device); - if (udata) { - if (udata->inlen >= offsetofend(typeof(*ucmd), cqe_size)) - hr_cq->cqe_size = ucmd->cqe_size; - else - hr_cq->cqe_size = HNS_ROCE_V2_CQE_SIZE; - } else { + if (!udata) { hr_cq->cqe_size = hr_dev->caps.cqe_sz; + return 0; } + + if (udata->inlen >= offsetofend(typeof(*ucmd), cqe_size)) { + if (ucmd->cqe_size != HNS_ROCE_V2_CQE_SIZE && + ucmd->cqe_size != HNS_ROCE_V3_CQE_SIZE) { + ibdev_err(&hr_dev->ib_dev, + "invalid cqe size %u.\n", ucmd->cqe_size); + return -EINVAL; + } + + hr_cq->cqe_size = ucmd->cqe_size; + } else { + hr_cq->cqe_size = HNS_ROCE_V2_CQE_SIZE; + } + + return 0; } int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, @@ -366,7 +377,9 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, set_cq_param(hr_cq, attr->cqe, attr->comp_vector, &ucmd); - set_cqe_size(hr_cq, udata, &ucmd); + ret = set_cqe_size(hr_cq, udata, &ucmd); + if (ret) + return ret; ret = alloc_cq_buf(hr_dev, hr_cq, udata, ucmd.buf_addr); if (ret) { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 5b99531057..d5f3faa162 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3299,7 +3299,7 @@ static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn, dest = get_cqe_v2(hr_cq, (prod_index + nfreed) & hr_cq->ib_cq.cqe); owner_bit = hr_reg_read(dest, CQE_OWNER); - memcpy(dest, cqe, sizeof(*cqe)); + memcpy(dest, cqe, hr_cq->cqe_size); hr_reg_write(dest, CQE_OWNER, owner_bit); } } @@ -4397,7 +4397,12 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_qp->path_mtu = ib_mtu; mtu = ib_mtu_enum_to_int(ib_mtu); - if (WARN_ON(mtu < 0)) + if (WARN_ON(mtu <= 0)) + return -EINVAL; +#define MAX_LP_MSG_LEN 65536 + /* MTU * (2 ^ LP_PKTN_INI) shouldn't be bigger than 64KB */ + lp_pktn_ini = ilog2(MAX_LP_MSG_LEN / mtu); + if (WARN_ON(lp_pktn_ini >= 0xF)) return -EINVAL; if (attr_mask & IB_QP_PATH_MTU) { @@ -4405,10 +4410,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_clear(qpc_mask, QPC_MTU); } -#define MAX_LP_MSG_LEN 65536 - /* MTU * (2 ^ LP_PKTN_INI) shouldn't be bigger than 64KB */ - lp_pktn_ini = ilog2(MAX_LP_MSG_LEN / mtu); - hr_reg_write(context, QPC_LP_PKTN_INI, lp_pktn_ini); hr_reg_clear(qpc_mask, QPC_LP_PKTN_INI); diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 6b62299abf..6dea0a49d1 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -3496,7 +3496,7 @@ static void irdma_cm_disconn_true(struct irdma_qp *iwqp) original_hw_tcp_state == IRDMA_TCP_STATE_TIME_WAIT || last_ae == IRDMA_AE_RDMAP_ROE_BAD_LLP_CLOSE || last_ae == IRDMA_AE_BAD_CLOSE || - last_ae == 
IRDMA_AE_LLP_CONNECTION_RESET || iwdev->reset)) { + last_ae == IRDMA_AE_LLP_CONNECTION_RESET || iwdev->rf->reset)) { issue_close = 1; iwqp->cm_id = NULL; qp->term_flags = 0; @@ -4250,7 +4250,7 @@ void irdma_cm_teardown_connections(struct irdma_device *iwdev, u32 *ipaddr, teardown_entry); attr.qp_state = IB_QPS_ERR; irdma_modify_qp(&cm_node->iwqp->ibqp, &attr, IB_QP_STATE, NULL); - if (iwdev->reset) + if (iwdev->rf->reset) irdma_cm_disconn(cm_node->iwqp); irdma_rem_ref_cm_node(cm_node); } diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 00de5ee9a2..7de525a5cc 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -176,6 +176,14 @@ static void irdma_set_flush_fields(struct irdma_sc_qp *qp, case IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR: qp->flush_code = FLUSH_GENERAL_ERR; break; + case IRDMA_AE_LLP_TOO_MANY_RETRIES: + qp->flush_code = FLUSH_RETRY_EXC_ERR; + break; + case IRDMA_AE_AMP_MWBIND_INVALID_RIGHTS: + case IRDMA_AE_AMP_MWBIND_BIND_DISABLED: + case IRDMA_AE_AMP_MWBIND_INVALID_BOUNDS: + qp->flush_code = FLUSH_MW_BIND_ERR; + break; default: qp->flush_code = FLUSH_FATAL_ERR; break; @@ -1489,7 +1497,7 @@ void irdma_reinitialize_ieq(struct irdma_sc_vsi *vsi) irdma_puda_dele_rsrc(vsi, IRDMA_PUDA_RSRC_TYPE_IEQ, false); if (irdma_initialize_ieq(iwdev)) { - iwdev->reset = true; + iwdev->rf->reset = true; rf->gen_ops.request_reset(rf); } } @@ -1632,13 +1640,13 @@ void irdma_rt_deinit_hw(struct irdma_device *iwdev) case IEQ_CREATED: if (!iwdev->roce_mode) irdma_puda_dele_rsrc(&iwdev->vsi, IRDMA_PUDA_RSRC_TYPE_IEQ, - iwdev->reset); + iwdev->rf->reset); fallthrough; case ILQ_CREATED: if (!iwdev->roce_mode) irdma_puda_dele_rsrc(&iwdev->vsi, IRDMA_PUDA_RSRC_TYPE_ILQ, - iwdev->reset); + iwdev->rf->reset); break; default: ibdev_warn(&iwdev->ibdev, "bad init_state = %d\n", iwdev->init_state); diff --git a/drivers/infiniband/hw/irdma/i40iw_if.c b/drivers/infiniband/hw/irdma/i40iw_if.c index bddf88194d..d219f64b2c 100644 --- a/drivers/infiniband/hw/irdma/i40iw_if.c +++ b/drivers/infiniband/hw/irdma/i40iw_if.c @@ -55,7 +55,7 @@ static void i40iw_close(struct i40e_info *cdev_info, struct i40e_client *client, iwdev = to_iwdev(ibdev); if (reset) - iwdev->reset = true; + iwdev->rf->reset = true; iwdev->iw_status = 0; irdma_port_ibevent(iwdev); diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 743d9e143a..b678fe7124 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -346,7 +346,6 @@ struct irdma_device { bool roce_mode:1; bool roce_dcqcn_en:1; bool dcb:1; - bool reset:1; bool iw_ooo:1; enum init_completion_state init_state; diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index ff705f3232..3dcbb1fbf2 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -102,6 +102,8 @@ enum irdma_flush_opcode { FLUSH_REM_OP_ERR, FLUSH_LOC_LEN_ERR, FLUSH_FATAL_ERR, + FLUSH_RETRY_EXC_ERR, + FLUSH_MW_BIND_ERR, }; enum irdma_cmpl_status { diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index e94470991f..ac91ea5296 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -2507,7 +2507,7 @@ void irdma_modify_qp_to_err(struct irdma_sc_qp *sc_qp) struct irdma_qp *qp = sc_qp->qp_uk.back_qp; struct ib_qp_attr attr; - if (qp->iwdev->reset) + if (qp->iwdev->rf->reset) return; attr.qp_state = IB_QPS_ERR; diff --git 
a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 4fc3234020..7110ebf834 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -535,8 +535,7 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) irdma_qp_rem_ref(&iwqp->ibqp); wait_for_completion(&iwqp->free_qp); irdma_free_lsmm_rsrc(iwqp); - if (!iwdev->reset) - irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); + irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); if (!iwqp->user_mode) { if (iwqp->iwscq) { @@ -2035,7 +2034,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, /* Kmode allocations */ int rsize; - if (entries > rf->max_cqe) { + if (entries < 1 || entries > rf->max_cqe) { err_code = -EINVAL; goto cq_free_rsrc; } @@ -3353,6 +3352,10 @@ static enum ib_wc_status irdma_flush_err_to_ib_wc_status(enum irdma_flush_opcode return IB_WC_LOC_LEN_ERR; case FLUSH_GENERAL_ERR: return IB_WC_WR_FLUSH_ERR; + case FLUSH_RETRY_EXC_ERR: + return IB_WC_RETRY_EXC_ERR; + case FLUSH_MW_BIND_ERR: + return IB_WC_MW_BIND_ERR; case FLUSH_FATAL_ERR: default: return IB_WC_FATAL_ERR; diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h index 84dd682d23..b350081aeb 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib.h +++ b/drivers/infiniband/hw/usnic/usnic_ib.h @@ -90,7 +90,7 @@ struct usnic_ib_dev { struct usnic_ib_vf { struct usnic_ib_dev *pf; - spinlock_t lock; + struct mutex lock; struct usnic_vnic *vnic; unsigned int qp_grp_ref_cnt; struct usnic_ib_pd *pd; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 228e9a36da..d346dd48e7 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -572,7 +572,7 @@ static int usnic_ib_pci_probe(struct pci_dev *pdev, } vf->pf = pf; - spin_lock_init(&vf->lock); + mutex_init(&vf->lock); mutex_lock(&pf->usdev_lock); list_add_tail(&vf->link, &pf->vf_dev_list); /* diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 06a4e9d454..756a83bcff 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -196,7 +196,7 @@ find_free_vf_and_create_qp_grp(struct ib_qp *qp, for (i = 0; dev_list[i]; i++) { dev = dev_list[i]; vf = dev_get_drvdata(dev); - spin_lock(&vf->lock); + mutex_lock(&vf->lock); vnic = vf->vnic; if (!usnic_vnic_check_room(vnic, res_spec)) { usnic_dbg("Found used vnic %s from %s\n", @@ -208,10 +208,10 @@ find_free_vf_and_create_qp_grp(struct ib_qp *qp, vf, pd, res_spec, trans_spec); - spin_unlock(&vf->lock); + mutex_unlock(&vf->lock); goto qp_grp_check; } - spin_unlock(&vf->lock); + mutex_unlock(&vf->lock); } usnic_uiom_free_dev_list(dev_list); @@ -220,7 +220,7 @@ find_free_vf_and_create_qp_grp(struct ib_qp *qp, /* Try to find resources on an unused vf */ list_for_each_entry(vf, &us_ibdev->vf_dev_list, link) { - spin_lock(&vf->lock); + mutex_lock(&vf->lock); vnic = vf->vnic; if (vf->qp_grp_ref_cnt == 0 && usnic_vnic_check_room(vnic, res_spec) == 0) { @@ -228,10 +228,10 @@ find_free_vf_and_create_qp_grp(struct ib_qp *qp, vf, pd, res_spec, trans_spec); - spin_unlock(&vf->lock); + mutex_unlock(&vf->lock); goto qp_grp_check; } - spin_unlock(&vf->lock); + mutex_unlock(&vf->lock); } usnic_info("No free qp grp found on %s\n", @@ -253,9 +253,9 @@ static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) WARN_ON(qp_grp->state != IB_QPS_RESET); - 
spin_lock(&vf->lock); + mutex_lock(&vf->lock); usnic_ib_qp_grp_destroy(qp_grp); - spin_unlock(&vf->lock); + mutex_unlock(&vf->lock); } static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 29de8412e4..4c914f75a9 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -334,6 +334,7 @@ static const struct xpad_device { { 0x24c6, 0x5b03, "Thrustmaster Ferrari 458 Racing Wheel", 0, XTYPE_XBOX360 }, { 0x24c6, 0x5d04, "Razer Sabertooth", 0, XTYPE_XBOX360 }, { 0x24c6, 0xfafe, "Rock Candy Gamepad for Xbox 360", 0, XTYPE_XBOX360 }, + { 0x3285, 0x0607, "Nacon GC-100", 0, XTYPE_XBOX360 }, { 0x3767, 0x0101, "Fanatec Speedster 3 Forceshock Wheel", 0, XTYPE_XBOX }, { 0xffff, 0xffff, "Chinese-made Xbox Controller", 0, XTYPE_XBOX }, { 0x0000, 0x0000, "Generic X-Box pad", 0, XTYPE_UNKNOWN } @@ -451,6 +452,7 @@ static const struct usb_device_id xpad_table[] = { XPAD_XBOXONE_VENDOR(0x24c6), /* PowerA Controllers */ XPAD_XBOXONE_VENDOR(0x2e24), /* Hyperkin Duke X-Box One pad */ XPAD_XBOX360_VENDOR(0x2f24), /* GameSir Controllers */ + XPAD_XBOX360_VENDOR(0x3285), /* Nacon GC-100 */ { } }; diff --git a/drivers/input/keyboard/snvs_pwrkey.c b/drivers/input/keyboard/snvs_pwrkey.c index 2f5e3ab5ed..65286762b0 100644 --- a/drivers/input/keyboard/snvs_pwrkey.c +++ b/drivers/input/keyboard/snvs_pwrkey.c @@ -3,6 +3,7 @@ // Driver for the IMX SNVS ON/OFF Power Key // Copyright (C) 2015 Freescale Semiconductor, Inc. All Rights Reserved. +#include #include #include #include @@ -99,6 +100,11 @@ static irqreturn_t imx_snvs_pwrkey_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static void imx_snvs_pwrkey_disable_clk(void *data) +{ + clk_disable_unprepare(data); +} + static void imx_snvs_pwrkey_act(void *pdata) { struct pwrkey_drv_data *pd = pdata; @@ -111,6 +117,7 @@ static int imx_snvs_pwrkey_probe(struct platform_device *pdev) struct pwrkey_drv_data *pdata; struct input_dev *input; struct device_node *np; + struct clk *clk; int error; u32 vid; @@ -134,6 +141,28 @@ static int imx_snvs_pwrkey_probe(struct platform_device *pdev) dev_warn(&pdev->dev, "KEY_POWER without setting in dts\n"); } + clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(clk)) { + dev_err(&pdev->dev, "Failed to get snvs clock (%pe)\n", clk); + return PTR_ERR(clk); + } + + error = clk_prepare_enable(clk); + if (error) { + dev_err(&pdev->dev, "Failed to enable snvs clock (%pe)\n", + ERR_PTR(error)); + return error; + } + + error = devm_add_action_or_reset(&pdev->dev, + imx_snvs_pwrkey_disable_clk, clk); + if (error) { + dev_err(&pdev->dev, + "Failed to register clock cleanup handler (%pe)\n", + ERR_PTR(error)); + return error; + } + pdata->wakeup = of_property_read_bool(np, "wakeup-source"); pdata->irq = platform_get_irq(pdev, 0); diff --git a/drivers/input/touchscreen.c b/drivers/input/touchscreen.c index dd18cb917c..4620e20d01 100644 --- a/drivers/input/touchscreen.c +++ b/drivers/input/touchscreen.c @@ -80,27 +80,27 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch, data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-x", input_abs_get_min(input, axis_x), - &minimum) | - touchscreen_get_prop_u32(dev, "touchscreen-size-x", - input_abs_get_max(input, - axis_x) + 1, - &maximum) | - touchscreen_get_prop_u32(dev, "touchscreen-fuzz-x", - input_abs_get_fuzz(input, axis_x), - &fuzz); + &minimum); + data_present |= touchscreen_get_prop_u32(dev, "touchscreen-size-x", + 
input_abs_get_max(input, + axis_x) + 1, + &maximum); + data_present |= touchscreen_get_prop_u32(dev, "touchscreen-fuzz-x", + input_abs_get_fuzz(input, axis_x), + &fuzz); if (data_present) touchscreen_set_params(input, axis_x, minimum, maximum - 1, fuzz); data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-y", input_abs_get_min(input, axis_y), - &minimum) | - touchscreen_get_prop_u32(dev, "touchscreen-size-y", - input_abs_get_max(input, - axis_y) + 1, - &maximum) | - touchscreen_get_prop_u32(dev, "touchscreen-fuzz-y", - input_abs_get_fuzz(input, axis_y), - &fuzz); + &minimum); + data_present |= touchscreen_get_prop_u32(dev, "touchscreen-size-y", + input_abs_get_max(input, + axis_y) + 1, + &maximum); + data_present |= touchscreen_get_prop_u32(dev, "touchscreen-fuzz-y", + input_abs_get_fuzz(input, axis_y), + &fuzz); if (data_present) touchscreen_set_params(input, axis_y, minimum, maximum - 1, fuzz); @@ -108,11 +108,11 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch, data_present = touchscreen_get_prop_u32(dev, "touchscreen-max-pressure", input_abs_get_max(input, axis), - &maximum) | - touchscreen_get_prop_u32(dev, - "touchscreen-fuzz-pressure", - input_abs_get_fuzz(input, axis), - &fuzz); + &maximum); + data_present |= touchscreen_get_prop_u32(dev, + "touchscreen-fuzz-pressure", + input_abs_get_fuzz(input, axis), + &fuzz); if (data_present) touchscreen_set_params(input, axis, 0, maximum, fuzz); diff --git a/drivers/input/touchscreen/resistive-adc-touch.c b/drivers/input/touchscreen/resistive-adc-touch.c index 744544a723..6f754a8d30 100644 --- a/drivers/input/touchscreen/resistive-adc-touch.c +++ b/drivers/input/touchscreen/resistive-adc-touch.c @@ -71,19 +71,22 @@ static int grts_cb(const void *data, void *private) unsigned int z2 = touch_info[st->ch_map[GRTS_CH_Z2]]; unsigned int Rt; - Rt = z2; - Rt -= z1; - Rt *= st->x_plate_ohms; - Rt = DIV_ROUND_CLOSEST(Rt, 16); - Rt *= x; - Rt /= z1; - Rt = DIV_ROUND_CLOSEST(Rt, 256); - /* - * On increased pressure the resistance (Rt) is decreasing - * so, convert values to make it looks as real pressure. - */ - if (Rt < GRTS_DEFAULT_PRESSURE_MAX) - press = GRTS_DEFAULT_PRESSURE_MAX - Rt; + if (likely(x && z1)) { + Rt = z2; + Rt -= z1; + Rt *= st->x_plate_ohms; + Rt = DIV_ROUND_CLOSEST(Rt, 16); + Rt *= x; + Rt /= z1; + Rt = DIV_ROUND_CLOSEST(Rt, 256); + /* + * On increased pressure the resistance (Rt) is + * decreasing so, convert values to make it looks as + * real pressure. 
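The guard added above, if (likely(x && z1)), exists because z1 is a divisor in the Rt computation and both z samples read zero when nothing touches the screen; without it the driver risks a division by zero on release events. A runnable sketch of the guarded pressure math; the plate resistance and full-scale constant are assumed values for illustration, not the driver's actual parameters:

#include <stdio.h>

#define PRESSURE_MAX    255     /* assumed full-scale pressure */
#define X_PLATE_OHMS    280     /* assumed plate resistance */

static unsigned int div_round_closest(unsigned int x, unsigned int d)
{
        return (x + d / 2) / d;
}

/* z1 is a divisor and both z samples are zero with no contact, so the
 * whole computation must be skipped unless x and z1 are nonzero.
 * Higher Rt means less pressure, hence the final inversion. */
static unsigned int pressure(unsigned int x, unsigned int z1, unsigned int z2)
{
        unsigned int rt;

        if (!x || !z1)
                return 0;

        rt = z2 - z1;
        rt *= X_PLATE_OHMS;
        rt = div_round_closest(rt, 16);
        rt *= x;
        rt /= z1;
        rt = div_round_closest(rt, 256);

        return rt < PRESSURE_MAX ? PRESSURE_MAX - rt : 0;
}

int main(void)
{
        printf("firm:     %u\n", pressure(2048, 900, 1000));    /* 239 */
        printf("light:    %u\n", pressure(2048, 200, 1000));    /* 0 */
        printf("released: %u\n", pressure(0, 0, 0));            /* 0 */
        return 0;
}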
+ */ + if (Rt < GRTS_DEFAULT_PRESSURE_MAX) + press = GRTS_DEFAULT_PRESSURE_MAX - Rt; + } } if ((!x && !y) || (st->pressure && (press < st->pressure_min))) { diff --git a/drivers/interconnect/qcom/sdm660.c b/drivers/interconnect/qcom/sdm660.c index 632dbdd219..fb23a5b780 100644 --- a/drivers/interconnect/qcom/sdm660.c +++ b/drivers/interconnect/qcom/sdm660.c @@ -44,9 +44,9 @@ #define NOC_PERM_MODE_BYPASS (1 << NOC_QOS_MODE_BYPASS) #define NOC_QOS_PRIORITYn_ADDR(n) (0x8 + (n * 0x1000)) -#define NOC_QOS_PRIORITY_MASK 0xf +#define NOC_QOS_PRIORITY_P1_MASK 0xc +#define NOC_QOS_PRIORITY_P0_MASK 0x3 #define NOC_QOS_PRIORITY_P1_SHIFT 0x2 -#define NOC_QOS_PRIORITY_P0_SHIFT 0x3 #define NOC_QOS_MODEn_ADDR(n) (0xc + (n * 0x1000)) #define NOC_QOS_MODEn_MASK 0x3 @@ -173,6 +173,16 @@ static const struct clk_bulk_data bus_mm_clocks[] = { { .id = "iface" }, }; +static const struct clk_bulk_data bus_a2noc_clocks[] = { + { .id = "bus" }, + { .id = "bus_a" }, + { .id = "ipa" }, + { .id = "ufs_axi" }, + { .id = "aggre2_ufs_axi" }, + { .id = "aggre2_usb3_axi" }, + { .id = "cfg_noc_usb2_axi" }, +}; + /** * struct qcom_icc_provider - Qualcomm specific interconnect provider * @provider: generic interconnect provider @@ -307,7 +317,7 @@ DEFINE_QNODE(slv_bimc_cfg, SDM660_SLAVE_BIMC_CFG, 4, -1, 56, true, -1, 0, -1, 0) DEFINE_QNODE(slv_prng, SDM660_SLAVE_PRNG, 4, -1, 44, true, -1, 0, -1, 0); DEFINE_QNODE(slv_spdm, SDM660_SLAVE_SPDM, 4, -1, 60, true, -1, 0, -1, 0); DEFINE_QNODE(slv_qdss_cfg, SDM660_SLAVE_QDSS_CFG, 4, -1, 63, true, -1, 0, -1, 0); -DEFINE_QNODE(slv_cnoc_mnoc_cfg, SDM660_SLAVE_BLSP_1, 4, -1, 66, true, -1, 0, -1, SDM660_MASTER_CNOC_MNOC_CFG); +DEFINE_QNODE(slv_cnoc_mnoc_cfg, SDM660_SLAVE_CNOC_MNOC_CFG, 4, -1, 66, true, -1, 0, -1, SDM660_MASTER_CNOC_MNOC_CFG); DEFINE_QNODE(slv_snoc_cfg, SDM660_SLAVE_SNOC_CFG, 4, -1, 70, true, -1, 0, -1, 0); DEFINE_QNODE(slv_qm_cfg, SDM660_SLAVE_QM_CFG, 4, -1, 212, true, -1, 0, -1, 0); DEFINE_QNODE(slv_clk_ctl, SDM660_SLAVE_CLK_CTL, 4, -1, 47, true, -1, 0, -1, 0); @@ -624,13 +634,12 @@ static int qcom_icc_noc_set_qos_priority(struct regmap *rmap, /* Must be updated one at a time, P1 first, P0 last */ val = qos->areq_prio << NOC_QOS_PRIORITY_P1_SHIFT; rc = regmap_update_bits(rmap, NOC_QOS_PRIORITYn_ADDR(qos->qos_port), - NOC_QOS_PRIORITY_MASK, val); + NOC_QOS_PRIORITY_P1_MASK, val); if (rc) return rc; - val = qos->prio_level << NOC_QOS_PRIORITY_P0_SHIFT; return regmap_update_bits(rmap, NOC_QOS_PRIORITYn_ADDR(qos->qos_port), - NOC_QOS_PRIORITY_MASK, val); + NOC_QOS_PRIORITY_P0_MASK, qos->prio_level); } static int qcom_icc_set_noc_qos(struct icc_node *src, u64 max_bw) @@ -810,6 +819,10 @@ static int qnoc_probe(struct platform_device *pdev) qp->bus_clks = devm_kmemdup(dev, bus_mm_clocks, sizeof(bus_mm_clocks), GFP_KERNEL); qp->num_clks = ARRAY_SIZE(bus_mm_clocks); + } else if (of_device_is_compatible(dev->of_node, "qcom,sdm660-a2noc")) { + qp->bus_clks = devm_kmemdup(dev, bus_a2noc_clocks, + sizeof(bus_a2noc_clocks), GFP_KERNEL); + qp->num_clks = ARRAY_SIZE(bus_a2noc_clocks); } else { if (of_device_is_compatible(dev->of_node, "qcom,sdm660-bimc")) qp->is_bimc_node = true; diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 124c41adec..3eb68fa1b8 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -308,7 +308,6 @@ config APPLE_DART config ARM_SMMU tristate "ARM Ltd. 
System MMU (SMMU) Support" depends on ARM64 || ARM || (COMPILE_TEST && !GENERIC_ATOMIC64) - depends on QCOM_SCM || !QCOM_SCM #if QCOM_SCM=m this can't be =y select IOMMU_API select IOMMU_IO_PGTABLE_LPAE select ARM_DMA_USE_IOMMU if ARM @@ -356,6 +355,14 @@ config ARM_SMMU_DISABLE_BYPASS_BY_DEFAULT 'arm-smmu.disable_bypass' will continue to override this config. +config ARM_SMMU_QCOM + def_tristate y + depends on ARM_SMMU && ARCH_QCOM + select QCOM_SCM + help + When running on a Qualcomm platform that has the custom variant + of the ARM SMMU, this needs to be built into the SMMU driver. + config ARM_SMMU_V3 tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support" depends on ARM64 @@ -438,7 +445,7 @@ config QCOM_IOMMU # Note: iommu drivers cannot (yet?) be built as modules bool "Qualcomm IOMMU Support" depends on ARCH_QCOM || (COMPILE_TEST && !GENERIC_ATOMIC64) - depends on QCOM_SCM=y + select QCOM_SCM select IOMMU_API select IOMMU_IO_PGTABLE_LPAE select ARM_DMA_USE_IOMMU diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 559db9259e..fdfa39ec2a 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -183,7 +183,6 @@ struct apple_dart_master_cfg { static struct platform_driver apple_dart_driver; static const struct iommu_ops apple_dart_iommu_ops; -static const struct iommu_flush_ops apple_dart_tlb_ops; static struct apple_dart_domain *to_dart_domain(struct iommu_domain *dom) { @@ -338,22 +337,6 @@ static void apple_dart_iotlb_sync_map(struct iommu_domain *domain, apple_dart_domain_flush_tlb(to_dart_domain(domain)); } -static void apple_dart_tlb_flush_all(void *cookie) -{ - apple_dart_domain_flush_tlb(cookie); -} - -static void apple_dart_tlb_flush_walk(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ - apple_dart_domain_flush_tlb(cookie); -} - -static const struct iommu_flush_ops apple_dart_tlb_ops = { - .tlb_flush_all = apple_dart_tlb_flush_all, - .tlb_flush_walk = apple_dart_tlb_flush_walk, -}; - static phys_addr_t apple_dart_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { @@ -435,7 +418,6 @@ static int apple_dart_finalize_domain(struct iommu_domain *domain, .ias = 32, .oas = 36, .coherent_walk = 1, - .tlb = &apple_dart_tlb_ops, .iommu_dev = dart->dev, }; @@ -661,16 +643,34 @@ static int apple_dart_of_xlate(struct device *dev, struct of_phandle_args *args) return -EINVAL; } +static DEFINE_MUTEX(apple_dart_groups_lock); + +static void apple_dart_release_group(void *iommu_data) +{ + int i, sid; + struct apple_dart_stream_map *stream_map; + struct apple_dart_master_cfg *group_master_cfg = iommu_data; + + mutex_lock(&apple_dart_groups_lock); + + for_each_stream_map(i, group_master_cfg, stream_map) + for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) + stream_map->dart->sid2group[sid] = NULL; + + kfree(iommu_data); + mutex_unlock(&apple_dart_groups_lock); +} + static struct iommu_group *apple_dart_device_group(struct device *dev) { - static DEFINE_MUTEX(lock); int i, sid; struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_stream_map *stream_map; + struct apple_dart_master_cfg *group_master_cfg; struct iommu_group *group = NULL; struct iommu_group *res = ERR_PTR(-EINVAL); - mutex_lock(&lock); + mutex_lock(&apple_dart_groups_lock); for_each_stream_map(i, cfg, stream_map) { for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) { @@ -698,6 +698,20 @@ static struct iommu_group *apple_dart_device_group(struct device *dev) #endif group = generic_device_group(dev); + res = 
ERR_PTR(-ENOMEM); + if (!group) + goto out; + + group_master_cfg = kzalloc(sizeof(*group_master_cfg), GFP_KERNEL); + if (!group_master_cfg) { + iommu_group_put(group); + goto out; + } + + memcpy(group_master_cfg, cfg, sizeof(*group_master_cfg)); + iommu_group_set_iommudata(group, group_master_cfg, + apple_dart_release_group); + for_each_stream_map(i, cfg, stream_map) for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) stream_map->dart->sid2group[sid] = group; @@ -705,7 +719,7 @@ static struct iommu_group *apple_dart_device_group(struct device *dev) res = group; out: - mutex_unlock(&lock); + mutex_unlock(&apple_dart_groups_lock); return res; } diff --git a/drivers/iommu/arm/arm-smmu/Makefile b/drivers/iommu/arm/arm-smmu/Makefile index e240a7bcf3..b0cc01aa20 100644 --- a/drivers/iommu/arm/arm-smmu/Makefile +++ b/drivers/iommu/arm/arm-smmu/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o obj-$(CONFIG_ARM_SMMU) += arm_smmu.o -arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-nvidia.o arm-smmu-qcom.o +arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-nvidia.o +arm_smmu-$(CONFIG_ARM_SMMU_QCOM) += arm-smmu-qcom.o diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c index 9f465e1467..2c25cce380 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c @@ -215,7 +215,8 @@ struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu) of_device_is_compatible(np, "nvidia,tegra186-smmu")) return nvidia_smmu_impl_init(smmu); - smmu = qcom_smmu_impl_init(smmu); + if (IS_ENABLED(CONFIG_ARM_SMMU_QCOM)) + smmu = qcom_smmu_impl_init(smmu); if (of_device_is_compatible(np, "marvell,ap806-smmu-500")) smmu->impl = &mrvl_mmu500_impl; diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 0ec5514c99..b7708b93f3 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1942,18 +1942,18 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type, reason = dmar_get_fault_reason(fault_reason, &fault_type); if (fault_type == INTR_REMAP) - pr_err("[INTR-REMAP] Request device [0x%02x:0x%02x.%d] fault index 0x%llx [fault reason 0x%02x] %s\n", + pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index 0x%llx [fault reason 0x%02x] %s\n", source_id >> 8, PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr >> 48, fault_reason, reason); else if (pasid == INVALID_IOASID) - pr_err("[%s NO_PASID] Request device [0x%02x:0x%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", + pr_err("[%s NO_PASID] Request device [%02x:%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", type ? "DMA Read" : "DMA Write", source_id >> 8, PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); else - pr_err("[%s PASID 0x%x] Request device [0x%02x:0x%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", + pr_err("[%s PASID 0x%x] Request device [%02x:%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", type ? 
"DMA Read" : "DMA Write", pasid, source_id >> 8, PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr, diff --git a/drivers/ipack/devices/ipoctal.c b/drivers/ipack/devices/ipoctal.c index c14e65a5d3..c709861198 100644 --- a/drivers/ipack/devices/ipoctal.c +++ b/drivers/ipack/devices/ipoctal.c @@ -33,6 +33,7 @@ struct ipoctal_channel { unsigned int pointer_read; unsigned int pointer_write; struct tty_port tty_port; + bool tty_registered; union scc2698_channel __iomem *regs; union scc2698_block __iomem *block_regs; unsigned int board_id; @@ -81,22 +82,34 @@ static int ipoctal_port_activate(struct tty_port *port, struct tty_struct *tty) return 0; } -static int ipoctal_open(struct tty_struct *tty, struct file *file) +static int ipoctal_install(struct tty_driver *driver, struct tty_struct *tty) { struct ipoctal_channel *channel = dev_get_drvdata(tty->dev); struct ipoctal *ipoctal = chan_to_ipoctal(channel, tty->index); - int err; - - tty->driver_data = channel; + int res; if (!ipack_get_carrier(ipoctal->dev)) return -EBUSY; - err = tty_port_open(&channel->tty_port, tty, file); - if (err) - ipack_put_carrier(ipoctal->dev); + res = tty_standard_install(driver, tty); + if (res) + goto err_put_carrier; - return err; + tty->driver_data = channel; + + return 0; + +err_put_carrier: + ipack_put_carrier(ipoctal->dev); + + return res; +} + +static int ipoctal_open(struct tty_struct *tty, struct file *file) +{ + struct ipoctal_channel *channel = tty->driver_data; + + return tty_port_open(&channel->tty_port, tty, file); } static void ipoctal_reset_stats(struct ipoctal_stats *stats) @@ -264,7 +277,6 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, int res; int i; struct tty_driver *tty; - char name[20]; struct ipoctal_channel *channel; struct ipack_region *region; void __iomem *addr; @@ -355,8 +367,11 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, /* Fill struct tty_driver with ipoctal data */ tty->owner = THIS_MODULE; tty->driver_name = KBUILD_MODNAME; - sprintf(name, KBUILD_MODNAME ".%d.%d.", bus_nr, slot); - tty->name = name; + tty->name = kasprintf(GFP_KERNEL, KBUILD_MODNAME ".%d.%d.", bus_nr, slot); + if (!tty->name) { + res = -ENOMEM; + goto err_put_driver; + } tty->major = 0; tty->minor_start = 0; @@ -371,8 +386,7 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, res = tty_register_driver(tty); if (res) { dev_err(&ipoctal->dev->dev, "Can't register tty driver.\n"); - tty_driver_kref_put(tty); - return res; + goto err_free_name; } /* Save struct tty_driver for use it when uninstalling the device */ @@ -383,7 +397,9 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, channel = &ipoctal->channel[i]; tty_port_init(&channel->tty_port); - tty_port_alloc_xmit_buf(&channel->tty_port); + res = tty_port_alloc_xmit_buf(&channel->tty_port); + if (res) + continue; channel->tty_port.ops = &ipoctal_tty_port_ops; ipoctal_reset_stats(&channel->stats); @@ -391,13 +407,15 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, spin_lock_init(&channel->lock); channel->pointer_read = 0; channel->pointer_write = 0; - tty_dev = tty_port_register_device(&channel->tty_port, tty, i, NULL); + tty_dev = tty_port_register_device_attr(&channel->tty_port, tty, + i, NULL, channel, NULL); if (IS_ERR(tty_dev)) { dev_err(&ipoctal->dev->dev, "Failed to register tty device.\n"); + tty_port_free_xmit_buf(&channel->tty_port); tty_port_destroy(&channel->tty_port); continue; } - 
dev_set_drvdata(tty_dev, channel); + channel->tty_registered = true; } /* @@ -409,6 +427,13 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, ipoctal_irq_handler, ipoctal); return 0; + +err_free_name: + kfree(tty->name); +err_put_driver: + tty_driver_kref_put(tty); + + return res; } static inline int ipoctal_copy_write_buffer(struct ipoctal_channel *channel, @@ -648,6 +673,7 @@ static void ipoctal_cleanup(struct tty_struct *tty) static const struct tty_operations ipoctal_fops = { .ioctl = NULL, + .install = ipoctal_install, .open = ipoctal_open, .close = ipoctal_close, .write = ipoctal_write_tty, @@ -690,12 +716,17 @@ static void __ipoctal_remove(struct ipoctal *ipoctal) for (i = 0; i < NR_CHANNELS; i++) { struct ipoctal_channel *channel = &ipoctal->channel[i]; + + if (!channel->tty_registered) + continue; + tty_unregister_device(ipoctal->tty_drv, i); tty_port_free_xmit_buf(&channel->tty_port); tty_port_destroy(&channel->tty_port); } tty_unregister_driver(ipoctal->tty_drv); + kfree(ipoctal->tty_drv->name); tty_driver_kref_put(ipoctal->tty_drv); kfree(ipoctal); } diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index cb0afe8971..7313454e40 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -480,6 +480,11 @@ int detach_capi_ctr(struct capi_ctr *ctr) ctr_down(ctr, CAPI_CTR_DETACHED); + if (ctr->cnr < 1 || ctr->cnr - 1 >= CAPI_MAXCONTR) { + err = -EINVAL; + goto unlock_out; + } + if (capi_controller[ctr->cnr - 1] != ctr) { err = -EINVAL; goto unlock_out; diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 84dbe08ad2..edd22e4d65 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -161,7 +161,7 @@ static const char *clone_device_name(struct clone *clone) static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode) { - const char *descs[] = { + static const char * const descs[] = { "read-write", "read-only", "fail" diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 5b95eea517..a896dea975 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -490,6 +490,14 @@ static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, struct mapped_device *md = tio->md; struct dm_target *ti = md->immutable_target; + /* + * blk-mq's unquiesce may come from outside events, such as + * elevator switch, updating nr_requests or others, and request may + * come during suspend, so simply ask for blk-mq to requeue it. 
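+ * Returning BLK_STS_RESOURCE makes blk-mq hold the request and dispatch it again later, once the device has been resumed and DMF_BLOCK_IO_FOR_SUSPEND has been cleared.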
+ */ + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) + return BLK_STS_RESOURCE; + if (unlikely(!ti)) { int srcu_idx; struct dm_table *map = dm_get_live_table(md, &srcu_idx); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 22a5ac8244..88288c8d6b 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -475,6 +475,7 @@ static int verity_verify_io(struct dm_verity_io *io) struct bvec_iter start; unsigned b; struct crypto_wait wait; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); for (b = 0; b < io->n_blocks; b++) { int r; @@ -529,9 +530,17 @@ static int verity_verify_io(struct dm_verity_io *io) else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, cur_block, NULL, &start) == 0) continue; - else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - cur_block)) - return -EIO; + else { + if (bio->bi_status) { + /* + * Error correction failed; just return the error + */ + return -EIO; + } + if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, + cur_block)) + return -EIO; + } } return 0; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a011d09cb0..76d9da49fd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -496,18 +496,17 @@ static void start_io_acct(struct dm_io *io) false, 0, &io->stats_aux); } -static void end_io_acct(struct dm_io *io) +static void end_io_acct(struct mapped_device *md, struct bio *bio, + unsigned long start_time, struct dm_stats_aux *stats_aux) { - struct mapped_device *md = io->md; - struct bio *bio = io->orig_bio; - unsigned long duration = jiffies - io->start_time; + unsigned long duration = jiffies - start_time; - bio_end_io_acct(bio, io->start_time); + bio_end_io_acct(bio, start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), bio->bi_iter.bi_sector, bio_sectors(bio), - true, duration, &io->stats_aux); + true, duration, stats_aux); /* nudge anyone waiting on suspend queue */ if (unlikely(wq_has_sleeper(&md->wait))) @@ -790,6 +789,8 @@ void dm_io_dec_pending(struct dm_io *io, blk_status_t error) blk_status_t io_error; struct bio *bio; struct mapped_device *md = io->md; + unsigned long start_time = 0; + struct dm_stats_aux stats_aux; /* Push-back supersedes any I/O errors */ if (unlikely(error)) { @@ -821,8 +822,10 @@ void dm_io_dec_pending(struct dm_io *io, blk_status_t error) } io_error = io->status; - end_io_acct(io); + start_time = io->start_time; + stats_aux = io->stats_aux; free_io(md, io); + end_io_acct(md, bio, start_time, &stats_aux); if (io_error == BLK_STS_DM_REQUEUE) return; diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig index 9d77ebc523..9263d3d047 100644 --- a/drivers/media/platform/Kconfig +++ b/drivers/media/platform/Kconfig @@ -566,7 +566,7 @@ config VIDEO_QCOM_VENUS depends on VIDEO_DEV && VIDEO_V4L2 && QCOM_SMEM depends on (ARCH_QCOM && IOMMU_DMA) || COMPILE_TEST select QCOM_MDT_LOADER if ARCH_QCOM - select QCOM_SCM if ARCH_QCOM + select QCOM_SCM select VIDEOBUF2_DMA_CONTIG select V4L2_MEM2MEM_DEV help diff --git a/drivers/media/platform/s5p-jpeg/jpeg-core.c b/drivers/media/platform/s5p-jpeg/jpeg-core.c index d402e456f2..7d0ab19c38 100644 --- a/drivers/media/platform/s5p-jpeg/jpeg-core.c +++ b/drivers/media/platform/s5p-jpeg/jpeg-core.c @@ -1140,8 +1140,8 @@ static bool s5p_jpeg_parse_hdr(struct s5p_jpeg_q_data *result, continue; length = 0; switch (c) { - /* SOF0: baseline JPEG */ - case SOF0: + /* JPEG_MARKER_SOF0: baseline JPEG */ + case JPEG_MARKER_SOF0: if
(get_word_be(&jpeg_buffer, &word)) break; length = (long)word - 2; @@ -1172,7 +1172,7 @@ static bool s5p_jpeg_parse_hdr(struct s5p_jpeg_q_data *result, notfound = 0; break; - case DQT: + case JPEG_MARKER_DQT: if (get_word_be(&jpeg_buffer, &word)) break; length = (long)word - 2; @@ -1185,7 +1185,7 @@ static bool s5p_jpeg_parse_hdr(struct s5p_jpeg_q_data *result, skip(&jpeg_buffer, length); break; - case DHT: + case JPEG_MARKER_DHT: if (get_word_be(&jpeg_buffer, &word)) break; length = (long)word - 2; @@ -1198,15 +1198,15 @@ static bool s5p_jpeg_parse_hdr(struct s5p_jpeg_q_data *result, skip(&jpeg_buffer, length); break; - case SOS: + case JPEG_MARKER_SOS: sos = jpeg_buffer.curr - 2; /* 0xffda */ break; /* skip payload-less markers */ - case RST ... RST + 7: - case SOI: - case EOI: - case TEM: + case JPEG_MARKER_RST ... JPEG_MARKER_RST + 7: + case JPEG_MARKER_SOI: + case JPEG_MARKER_EOI: + case JPEG_MARKER_TEM: break; /* skip uninteresting payload markers */ diff --git a/drivers/media/platform/s5p-jpeg/jpeg-core.h b/drivers/media/platform/s5p-jpeg/jpeg-core.h index a77d93c098..8473a019bb 100644 --- a/drivers/media/platform/s5p-jpeg/jpeg-core.h +++ b/drivers/media/platform/s5p-jpeg/jpeg-core.h @@ -37,15 +37,15 @@ #define EXYNOS3250_IRQ_TIMEOUT 0x10000000 /* a selection of JPEG markers */ -#define TEM 0x01 -#define SOF0 0xc0 -#define DHT 0xc4 -#define RST 0xd0 -#define SOI 0xd8 -#define EOI 0xd9 -#define SOS 0xda -#define DQT 0xdb -#define DHP 0xde +#define JPEG_MARKER_TEM 0x01 +#define JPEG_MARKER_SOF0 0xc0 +#define JPEG_MARKER_DHT 0xc4 +#define JPEG_MARKER_RST 0xd0 +#define JPEG_MARKER_SOI 0xd8 +#define JPEG_MARKER_EOI 0xd9 +#define JPEG_MARKER_SOS 0xda +#define JPEG_MARKER_DQT 0xdb +#define JPEG_MARKER_DHP 0xde /* Flags that indicate a format can be used for capture/output */ #define SJPEG_FMT_FLAG_ENC_CAPTURE (1 << 0) @@ -187,11 +187,11 @@ struct s5p_jpeg_marker { * @fmt: driver-specific format of this queue * @w: image width * @h: image height - * @sos: SOS marker's position relative to the buffer beginning - * @dht: DHT markers' positions relative to the buffer beginning - * @dqt: DQT markers' positions relative to the buffer beginning - * @sof: SOF0 marker's position relative to the buffer beginning - * @sof_len: SOF0 marker's payload length (without length field itself) + * @sos: JPEG_MARKER_SOS's position relative to the buffer beginning + * @dht: JPEG_MARKER_DHT markers' positions relative to the buffer beginning + * @dqt: JPEG_MARKER_DQT markers' positions relative to the buffer beginning + * @sof: JPEG_MARKER_SOF0's position relative to the buffer beginning + * @sof_len: JPEG_MARKER_SOF0's payload length (without length field itself) * @size: image buffer size in bytes */ struct s5p_jpeg_q_data { diff --git a/drivers/media/rc/ir_toy.c b/drivers/media/rc/ir_toy.c index 3e729a17b3..48d52baec1 100644 --- a/drivers/media/rc/ir_toy.c +++ b/drivers/media/rc/ir_toy.c @@ -24,6 +24,7 @@ static const u8 COMMAND_VERSION[] = { 'v' }; // End transmit and repeat reset command so we exit sump mode static const u8 COMMAND_RESET[] = { 0xff, 0xff, 0, 0, 0, 0, 0 }; static const u8 COMMAND_SMODE_ENTER[] = { 's' }; +static const u8 COMMAND_SMODE_EXIT[] = { 0 }; static const u8 COMMAND_TXSTART[] = { 0x26, 0x24, 0x25, 0x03 }; #define REPLY_XMITCOUNT 't' @@ -309,12 +310,30 @@ static int irtoy_tx(struct rc_dev *rc, uint *txbuf, uint count) buf[i] = cpu_to_be16(v); } - buf[count] = cpu_to_be16(0xffff); + buf[count] = 0xffff; irtoy->tx_buf = buf; irtoy->tx_len = size; irtoy->emitted = 0; + // There is an issue where if the
unit is receiving IR while the + // first TXSTART command is sent, the device might end up hanging + // with its LED on. It does not respond to any command when this + // happens. To work around this, re-enter sample mode. + err = irtoy_command(irtoy, COMMAND_SMODE_EXIT, + sizeof(COMMAND_SMODE_EXIT), STATE_RESET); + if (err) { + dev_err(irtoy->dev, "exit sample mode: %d\n", err); + return err; + } + + err = irtoy_command(irtoy, COMMAND_SMODE_ENTER, + sizeof(COMMAND_SMODE_ENTER), STATE_COMMAND); + if (err) { + dev_err(irtoy->dev, "enter sample mode: %d\n", err); + return err; + } + err = irtoy_command(irtoy, COMMAND_TXSTART, sizeof(COMMAND_TXSTART), STATE_TX); kfree(buf); diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index ccfc5c389d..6063314514 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -232,6 +232,7 @@ config HI6421V600_IRQ tristate "HiSilicon Hi6421v600 IRQ and powerkey" depends on OF depends on SPMI + depends on HAS_IOMEM select MFD_CORE select REGMAP_SPMI help diff --git a/drivers/misc/cb710/sgbuf2.c b/drivers/misc/cb710/sgbuf2.c index e5a4ed3701..a798fad5f0 100644 --- a/drivers/misc/cb710/sgbuf2.c +++ b/drivers/misc/cb710/sgbuf2.c @@ -47,7 +47,7 @@ static inline bool needs_unaligned_copy(const void *ptr) #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS return false; #else - return ((ptr - NULL) & 3) != 0; + return ((uintptr_t)ptr & 3) != 0; #endif } diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 4d09b672ac..6323254742 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -366,6 +366,13 @@ static const struct of_device_id at25_of_match[] = { }; MODULE_DEVICE_TABLE(of, at25_of_match); +static const struct spi_device_id at25_spi_ids[] = { + { .name = "at25",}, + { .name = "fm25",}, + { } +}; +MODULE_DEVICE_TABLE(spi, at25_spi_ids); + static int at25_probe(struct spi_device *spi) { struct at25_data *at25 = NULL; @@ -491,6 +498,7 @@ static struct spi_driver at25_driver = { .dev_groups = sernum_groups, }, .probe = at25_probe, + .id_table = at25_spi_ids, }; module_spi_driver(at25_driver); diff --git a/drivers/misc/eeprom/eeprom_93xx46.c b/drivers/misc/eeprom/eeprom_93xx46.c index 29d8971ec5..1f15399e5c 100644 --- a/drivers/misc/eeprom/eeprom_93xx46.c +++ b/drivers/misc/eeprom/eeprom_93xx46.c @@ -406,6 +406,23 @@ static const struct of_device_id eeprom_93xx46_of_table[] = { }; MODULE_DEVICE_TABLE(of, eeprom_93xx46_of_table); +static const struct spi_device_id eeprom_93xx46_spi_ids[] = { + { .name = "eeprom-93xx46", + .driver_data = (kernel_ulong_t)&at93c46_data, }, + { .name = "at93c46", + .driver_data = (kernel_ulong_t)&at93c46_data, }, + { .name = "at93c46d", + .driver_data = (kernel_ulong_t)&atmel_at93c46d_data, }, + { .name = "at93c56", + .driver_data = (kernel_ulong_t)&at93c56_data, }, + { .name = "at93c66", + .driver_data = (kernel_ulong_t)&at93c66_data, }, + { .name = "93lc46b", + .driver_data = (kernel_ulong_t)&microchip_93lc46b_data, }, + {} +}; +MODULE_DEVICE_TABLE(spi, eeprom_93xx46_spi_ids); + static int eeprom_93xx46_probe_dt(struct spi_device *spi) { const struct of_device_id *of_id = @@ -555,6 +572,7 @@ static struct spi_driver eeprom_93xx46_driver = { }, .probe = eeprom_93xx46_probe, .remove = eeprom_93xx46_remove, + .id_table = eeprom_93xx46_spi_ids, }; module_spi_driver(eeprom_93xx46_driver); diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index beda610e6b..ad6ced4546 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -814,10 +814,12 @@ static int fastrpc_get_args(u32
kernel, struct fastrpc_invoke_ctx *ctx) rpra[i].pv = (u64) ctx->args[i].ptr; pages[i].addr = ctx->maps[i]->phys; + mmap_read_lock(current->mm); vma = find_vma(current->mm, ctx->args[i].ptr); if (vma) pages[i].addr += ctx->args[i].ptr - vma->vm_start; + mmap_read_unlock(current->mm); pg_start = (ctx->args[i].ptr & PAGE_MASK) >> PAGE_SHIFT; pg_end = ((ctx->args[i].ptr + len - 1) & PAGE_MASK) >> diff --git a/drivers/misc/gehc-achc.c b/drivers/misc/gehc-achc.c index 02f33bc60c..4c9c5394da 100644 --- a/drivers/misc/gehc-achc.c +++ b/drivers/misc/gehc-achc.c @@ -539,6 +539,7 @@ static int gehc_achc_probe(struct spi_device *spi) static const struct spi_device_id gehc_achc_id[] = { { "ge,achc", 0 }, + { "achc", 0 }, { } }; MODULE_DEVICE_TABLE(spi, gehc_achc_id); diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 91b57544f7..6dafff375f 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -2649,11 +2649,18 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) free_seq_arr: kfree(cs_seq_arr); - /* update output args */ - memset(args, 0, sizeof(*args)); if (rc) return rc; + if (mcs_data.wait_status == -ERESTARTSYS) { + dev_err_ratelimited(hdev->dev, + "user process got signal while waiting for Multi-CS\n"); + return -EINTR; + } + + /* update output args */ + memset(args, 0, sizeof(*args)); + if (mcs_data.completion_bitmap) { args->out.status = HL_WAIT_CS_STATUS_COMPLETED; args->out.cs_completion_map = mcs_data.completion_bitmap; @@ -2667,8 +2674,6 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) /* update if some CS was gone */ if (mcs_data.timestamp) args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE; - } else if (mcs_data.wait_status == -ERESTARTSYS) { - args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED; } else { args->out.status = HL_WAIT_CS_STATUS_BUSY; } @@ -2688,16 +2693,17 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq, &status, &timestamp); + if (rc == -ERESTARTSYS) { + dev_err_ratelimited(hdev->dev, + "user process got signal while waiting for CS handle %llu\n", + seq); + return -EINTR; + } + memset(args, 0, sizeof(*args)); if (rc) { - if (rc == -ERESTARTSYS) { - dev_err_ratelimited(hdev->dev, - "user process got signal while waiting for CS handle %llu\n", - seq); - args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED; - rc = -EINTR; - } else if (rc == -ETIMEDOUT) { + if (rc == -ETIMEDOUT) { dev_err_ratelimited(hdev->dev, "CS %llu has timed-out while user process is waiting for it\n", seq); @@ -2823,7 +2829,6 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, dev_err_ratelimited(hdev->dev, "user process got signal while waiting for interrupt ID %d\n", interrupt->interrupt_id); - *status = HL_WAIT_CS_STATUS_INTERRUPTED; rc = -EINTR; } else { *status = CS_WAIT_STATUS_BUSY; } @@ -2878,8 +2883,6 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) args->in.interrupt_timeout_us, args->in.addr, args->in.target, interrupt_offset, &status); - memset(args, 0, sizeof(*args)); - if (rc) { if (rc != -EINTR) dev_err_ratelimited(hdev->dev, @@ -2888,6 +2891,8 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) return rc; } + memset(args, 0, sizeof(*args)); + switch (status) { case CS_WAIT_STATUS_COMPLETED: args->out.status = HL_WAIT_CS_STATUS_COMPLETED; diff --git
a/drivers/misc/mei/hbm.c b/drivers/misc/mei/hbm.c index 99b5c1ecc4..be41843df7 100644 --- a/drivers/misc/mei/hbm.c +++ b/drivers/misc/mei/hbm.c @@ -1298,7 +1298,8 @@ int mei_hbm_dispatch(struct mei_device *dev, struct mei_msg_hdr *hdr) if (dev->dev_state != MEI_DEV_INIT_CLIENTS || dev->hbm_state != MEI_HBM_STARTING) { - if (dev->dev_state == MEI_DEV_POWER_DOWN) { + if (dev->dev_state == MEI_DEV_POWER_DOWN || + dev->dev_state == MEI_DEV_POWERING_DOWN) { dev_dbg(dev->dev, "hbm: start: on shutdown, ignoring\n"); return 0; } @@ -1381,7 +1382,8 @@ int mei_hbm_dispatch(struct mei_device *dev, struct mei_msg_hdr *hdr) if (dev->dev_state != MEI_DEV_INIT_CLIENTS || dev->hbm_state != MEI_HBM_DR_SETUP) { - if (dev->dev_state == MEI_DEV_POWER_DOWN) { + if (dev->dev_state == MEI_DEV_POWER_DOWN || + dev->dev_state == MEI_DEV_POWERING_DOWN) { dev_dbg(dev->dev, "hbm: dma setup response: on shutdown, ignoring\n"); return 0; } @@ -1448,7 +1450,8 @@ int mei_hbm_dispatch(struct mei_device *dev, struct mei_msg_hdr *hdr) if (dev->dev_state != MEI_DEV_INIT_CLIENTS || dev->hbm_state != MEI_HBM_CLIENT_PROPERTIES) { - if (dev->dev_state == MEI_DEV_POWER_DOWN) { + if (dev->dev_state == MEI_DEV_POWER_DOWN || + dev->dev_state == MEI_DEV_POWERING_DOWN) { dev_dbg(dev->dev, "hbm: properties response: on shutdown, ignoring\n"); return 0; } @@ -1490,7 +1493,8 @@ int mei_hbm_dispatch(struct mei_device *dev, struct mei_msg_hdr *hdr) if (dev->dev_state != MEI_DEV_INIT_CLIENTS || dev->hbm_state != MEI_HBM_ENUM_CLIENTS) { - if (dev->dev_state == MEI_DEV_POWER_DOWN) { + if (dev->dev_state == MEI_DEV_POWER_DOWN || + dev->dev_state == MEI_DEV_POWERING_DOWN) { dev_dbg(dev->dev, "hbm: enumeration response: on shutdown, ignoring\n"); return 0; } diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h index cb34925e10..67bb6a25fd 100644 --- a/drivers/misc/mei/hw-me-regs.h +++ b/drivers/misc/mei/hw-me-regs.h @@ -92,6 +92,7 @@ #define MEI_DEV_ID_CDF 0x18D3 /* Cedar Fork */ #define MEI_DEV_ID_ICP_LP 0x34E0 /* Ice Lake Point LP */ +#define MEI_DEV_ID_ICP_N 0x38E0 /* Ice Lake Point N */ #define MEI_DEV_ID_JSP_N 0x4DE0 /* Jasper Lake Point N */ diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index c3393b383e..3a45aaf002 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -96,6 +96,7 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_CMP_H_3, MEI_ME_PCH8_ITOUCH_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_ICP_LP, MEI_ME_PCH12_CFG)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_ICP_N, MEI_ME_PCH12_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_TGP_LP, MEI_ME_PCH15_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_TGP_H, MEI_ME_PCH15_SPS_CFG)}, diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index d8046bde19..67d1d129f8 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -586,7 +586,7 @@ config MMC_SDHCI_MSM depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS select MMC_CQHCI - select QCOM_SCM if MMC_CRYPTO && ARCH_QCOM + select QCOM_SCM if MMC_CRYPTO help This selects the Secure Digital Host Controller Interface (SDHCI) support present in Qualcomm SOCs. The controller supports SD/MMC/SDIO devices.
The controller supports diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 6578cc64ae..380f9aa56e 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -1802,10 +1802,15 @@ static enum hrtimer_restart dw_mci_fault_timer(struct hrtimer *t) spin_lock_irqsave(&host->irq_lock, flags); - if (!host->data_status) + /* + * Only inject an error if we haven't already got an error or data over + * interrupt. + */ + if (!host->data_status) { host->data_status = SDMMC_INT_DCRC; - set_bit(EVENT_DATA_ERROR, &host->pending_events); - tasklet_schedule(&host->tasklet); + set_bit(EVENT_DATA_ERROR, &host->pending_events); + tasklet_schedule(&host->tasklet); + } spin_unlock_irqrestore(&host->irq_lock, flags); @@ -2721,12 +2726,16 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id) } if (pending & DW_MCI_DATA_ERROR_FLAGS) { + spin_lock(&host->irq_lock); + /* if there is an error report DATA_ERROR */ mci_writel(host, RINTSTS, DW_MCI_DATA_ERROR_FLAGS); host->data_status = pending; smp_wmb(); /* drain writebuffer */ set_bit(EVENT_DATA_ERROR, &host->pending_events); tasklet_schedule(&host->tasklet); + + spin_unlock(&host->irq_lock); } if (pending & SDMMC_INT_DATA_OVER) { diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 3f28eb4d17..8f36536cb1 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -746,7 +746,7 @@ static void meson_mmc_desc_chain_transfer(struct mmc_host *mmc, u32 cmd_cfg) writel(start, host->regs + SD_EMMC_START); } -/* local sg copy to buffer version with _to/fromio usage for dram_access_quirk */ +/* local sg copy for dram_access_quirk */ static void meson_mmc_copy_buffer(struct meson_host *host, struct mmc_data *data, size_t buflen, bool to_buffer) { @@ -764,21 +764,27 @@ static void meson_mmc_copy_buffer(struct meson_host *host, struct mmc_data *data sg_miter_start(&miter, sgl, nents, sg_flags); while ((offset < buflen) && sg_miter_next(&miter)) { - unsigned int len; + unsigned int buf_offset = 0; + unsigned int len, left; + u32 *buf = miter.addr; len = min(miter.length, buflen - offset); + left = len; - /* When dram_access_quirk, the bounce buffer is a iomem mapping */ - if (host->dram_access_quirk) { - if (to_buffer) - memcpy_toio(host->bounce_iomem_buf + offset, miter.addr, len); - else - memcpy_fromio(miter.addr, host->bounce_iomem_buf + offset, len); + if (to_buffer) { + do { + writel(*buf++, host->bounce_iomem_buf + offset + buf_offset); + + buf_offset += 4; + left -= 4; + } while (left); } else { - if (to_buffer) - memcpy(host->bounce_buf + offset, miter.addr, len); - else - memcpy(miter.addr, host->bounce_buf + offset, len); + do { + *buf++ = readl(host->bounce_iomem_buf + offset + buf_offset); + + buf_offset += 4; + left -= 4; + } while (left); } offset += len; @@ -830,7 +836,11 @@ static void meson_mmc_start_cmd(struct mmc_host *mmc, struct mmc_command *cmd) if (data->flags & MMC_DATA_WRITE) { cmd_cfg |= CMD_CFG_DATA_WR; WARN_ON(xfer_bytes > host->bounce_buf_size); - meson_mmc_copy_buffer(host, data, xfer_bytes, true); + if (host->dram_access_quirk) + meson_mmc_copy_buffer(host, data, xfer_bytes, true); + else + sg_copy_to_buffer(data->sg, data->sg_len, + host->bounce_buf, xfer_bytes); dma_wmb(); } @@ -849,12 +859,43 @@ static void meson_mmc_start_cmd(struct mmc_host *mmc, struct mmc_command *cmd) writel(cmd->arg, host->regs + SD_EMMC_CMD_ARG); } +static int meson_mmc_validate_dram_access(struct mmc_host *mmc, struct mmc_data *data) +{ + struct scatterlist *sg; + int i; 
+ + /* Reject the request if any element offset or size is not 32-bit aligned */ + for_each_sg(data->sg, sg, data->sg_len, i) { + if (!IS_ALIGNED(sg->offset, sizeof(u32)) || + !IS_ALIGNED(sg->length, sizeof(u32))) { + dev_err(mmc_dev(mmc), "unaligned sg offset %u len %u\n", + sg->offset, sg->length); + return -EINVAL; + } + } + + return 0; +} + static void meson_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) { struct meson_host *host = mmc_priv(mmc); bool needs_pre_post_req = mrq->data && !(mrq->data->host_cookie & SD_EMMC_PRE_REQ_DONE); + /* + * The memory at the end of the controller, used as a bounce buffer for + * the dram_access_quirk, only accepts 32-bit read/write access, so + * check the alignment and length of the data before starting the request. + */ + if (host->dram_access_quirk && mrq->data) { + mrq->cmd->error = meson_mmc_validate_dram_access(mmc, mrq->data); + if (mrq->cmd->error) { + mmc_request_done(mmc, mrq); + return; + } + } + if (needs_pre_post_req) { meson_mmc_get_transfer_mode(mmc, mrq); if (!meson_mmc_desc_chain_mode(mrq->data)) @@ -999,7 +1040,11 @@ static irqreturn_t meson_mmc_irq_thread(int irq, void *dev_id) if (meson_mmc_bounce_buf_read(data)) { xfer_bytes = data->blksz * data->blocks; WARN_ON(xfer_bytes > host->bounce_buf_size); - meson_mmc_copy_buffer(host, data, xfer_bytes, false); + if (host->dram_access_quirk) + meson_mmc_copy_buffer(host, data, xfer_bytes, false); + else + sg_copy_from_buffer(data->sg, data->sg_len, + host->bounce_buf, xfer_bytes); } next_cmd = meson_mmc_get_next_command(cmd); diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 6fc4cf3c9d..a4407f391f 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -561,6 +561,8 @@ static void renesas_sdhi_reset(struct tmio_mmc_host *host) /* Unknown why but without polling reset status, it will hang */ read_poll_timeout(reset_control_status, ret, ret == 0, 1, 100, false, priv->rstc); + /* At least SDHI_VER_GEN2_SDR50 needs manual release of reset */ + sd_ctrl_write16(host, CTL_RESET_SD, 0x0001); priv->needs_adjust_hs400 = false; renesas_sdhi_set_clock(host, host->clk_cache); } else if (priv->scc_ctl) { diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c index 5564d7b23e..d1a1c548c5 100644 --- a/drivers/mmc/host/sdhci-of-at91.c +++ b/drivers/mmc/host/sdhci-of-at91.c @@ -11,6 +11,7 @@ #include #include #include +#include <linux/iopoll.h> #include #include #include @@ -61,7 +62,6 @@ static void sdhci_at91_set_force_card_detect(struct sdhci_host *host) static void sdhci_at91_set_clock(struct sdhci_host *host, unsigned int clock) { u16 clk; - unsigned long timeout; host->mmc->actual_clock = 0; @@ -86,16 +86,11 @@ static void sdhci_at91_set_clock(struct sdhci_host *host, unsigned int clock) sdhci_writew(host, clk, SDHCI_CLOCK_CONTROL); /* Wait max 20 ms */ - timeout = 20; - while (!((clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL)) - & SDHCI_CLOCK_INT_STABLE)) { - if (timeout == 0) { - pr_err("%s: Internal clock never stabilised.\n", - mmc_hostname(host->mmc)); - return; - } - timeout--; - mdelay(1); + if (read_poll_timeout(sdhci_readw, clk, (clk & SDHCI_CLOCK_INT_STABLE), + 1000, 20000, false, host, SDHCI_CLOCK_CONTROL)) { + pr_err("%s: Internal clock never stabilised.\n", + mmc_hostname(host->mmc)); + return; } clk |= SDHCI_CLOCK_CARD_EN; @@ -114,6 +109,7 @@ static void sdhci_at91_reset(struct sdhci_host *host, u8 mask) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_at91_priv
*priv = sdhci_pltfm_priv(pltfm_host); + unsigned int tmp; sdhci_reset(host, mask); @@ -126,6 +122,10 @@ static void sdhci_at91_reset(struct sdhci_host *host, u8 mask) sdhci_writel(host, calcr | SDMMC_CALCR_ALWYSON | SDMMC_CALCR_EN, SDMMC_CALCR); + + if (read_poll_timeout(sdhci_readl, tmp, !(tmp & SDMMC_CALCR_EN), + 10, 20000, false, host, SDMMC_CALCR)) + dev_err(mmc_dev(host->mmc), "Failed to calibrate\n"); } } diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c index ef0badea4f..04e6f7b267 100644 --- a/drivers/mtd/nand/raw/qcom_nandc.c +++ b/drivers/mtd/nand/raw/qcom_nandc.c @@ -1676,13 +1676,17 @@ qcom_nandc_read_cw_raw(struct mtd_info *mtd, struct nand_chip *chip, struct nand_ecc_ctrl *ecc = &chip->ecc; int data_size1, data_size2, oob_size1, oob_size2; int ret, reg_off = FLASH_BUF_ACC, read_loc = 0; + int raw_cw = cw; nand_read_page_op(chip, page, 0, NULL, 0); host->use_ecc = false; + if (nandc->props->qpic_v2) + raw_cw = ecc->steps - 1; + clear_bam_transaction(nandc); set_address(host, host->cw_size * cw, page); - update_rw_regs(host, 1, true, cw); + update_rw_regs(host, 1, true, raw_cw); config_nand_page_read(chip); data_size1 = mtd->writesize - host->cw_size * (ecc->steps - 1); @@ -1711,7 +1715,7 @@ qcom_nandc_read_cw_raw(struct mtd_info *mtd, struct nand_chip *chip, nandc_set_read_loc(chip, cw, 3, read_loc, oob_size2, 1); } - config_nand_cw_read(chip, false, cw); + config_nand_cw_read(chip, false, raw_cw); read_data_dma(nandc, reg_off, data_buf, data_size1, 0); reg_off += data_size1; diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 1542bfb8b5..7c2968a639 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -449,8 +449,10 @@ EXPORT_SYMBOL(ksz_switch_register); void ksz_switch_remove(struct ksz_device *dev) { /* timer started */ - if (dev->mib_read_interval) + if (dev->mib_read_interval) { + dev->mib_read_interval = 0; cancel_delayed_work_sync(&dev->mib_read); + } dev->dev_ops->exit(dev); dsa_unregister_switch(dev->ds); diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 8ab0be7938..8dadcae93c 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -12,6 +12,7 @@ #include #include +#include <linux/if_vlan.h> #include #include #include @@ -749,7 +750,11 @@ static void mv88e6xxx_mac_link_down(struct dsa_switch *ds, int port, ops = chip->info->ops; mv88e6xxx_reg_lock(chip); - if ((!mv88e6xxx_port_ppu_updates(chip, port) || + /* Internal PHYs propagate their configuration directly to the MAC. + * External PHYs depend on whether the PPU is enabled for this port. + */ + if (((!mv88e6xxx_phy_is_internal(ds, port) && + !mv88e6xxx_port_ppu_updates(chip, port)) || mode == MLO_AN_FIXED) && ops->port_sync_link) err = ops->port_sync_link(chip, port, mode, false); mv88e6xxx_reg_unlock(chip); @@ -772,7 +777,12 @@ static void mv88e6xxx_mac_link_up(struct dsa_switch *ds, int port, ops = chip->info->ops; mv88e6xxx_reg_lock(chip); - if (!mv88e6xxx_port_ppu_updates(chip, port) || mode == MLO_AN_FIXED) { + /* Internal PHYs propagate their configuration directly to the MAC. + * External PHYs depend on whether the PPU is enabled for this port.
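+ * Only when neither applies (an external PHY with the PPU disabled), or when the link is fixed, does software itself need to force the MAC's link state here.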
+ */ + if ((!mv88e6xxx_phy_is_internal(ds, port) && + !mv88e6xxx_port_ppu_updates(chip, port)) || + mode == MLO_AN_FIXED) { /* FIXME: for an automedia port, should we force the link * down here - what if the link comes up due to "other" media * while we're bringing the port up, how is the exclusivity @@ -1677,6 +1687,30 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, return 0; } +static int mv88e6xxx_port_commit_pvid(struct mv88e6xxx_chip *chip, int port) +{ + struct dsa_port *dp = dsa_to_port(chip->ds, port); + struct mv88e6xxx_port *p = &chip->ports[port]; + u16 pvid = MV88E6XXX_VID_STANDALONE; + bool drop_untagged = false; + int err; + + if (dp->bridge_dev) { + if (br_vlan_enabled(dp->bridge_dev)) { + pvid = p->bridge_pvid.vid; + drop_untagged = !p->bridge_pvid.valid; + } else { + pvid = MV88E6XXX_VID_BRIDGED; + } + } + + err = mv88e6xxx_port_set_pvid(chip, port, pvid); + if (err) + return err; + + return mv88e6xxx_port_drop_untagged(chip, port, drop_untagged); +} + static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, struct netlink_ext_ack *extack) @@ -1690,7 +1724,16 @@ static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port, return -EOPNOTSUPP; mv88e6xxx_reg_lock(chip); + err = mv88e6xxx_port_set_8021q_mode(chip, port, mode); + if (err) + goto unlock; + + err = mv88e6xxx_port_commit_pvid(chip, port); + if (err) + goto unlock; + +unlock: mv88e6xxx_reg_unlock(chip); return err; @@ -1725,11 +1768,15 @@ static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port, u16 fid; int err; - /* Null VLAN ID corresponds to the port private database */ + /* Ports have two private address databases: one for when the port is + * standalone and one for when the port is under a bridge and the + * 802.1Q mode is disabled. When the port is standalone, DSA wants its + * address database to remain 100% empty, so we never load an ATU entry + * into a standalone port's database. Therefore, translate the null + * VLAN ID into the port's database used for VLAN-unaware bridging. 
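+ * For example, an FDB entry added while the port is under a VLAN-unaware bridge arrives here with vid 0 and is loaded into MV88E6XXX_FID_BRIDGED below.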
+ */ if (vid == 0) { - err = mv88e6xxx_port_get_fid(chip, port, &fid); - if (err) - return err; + fid = MV88E6XXX_FID_BRIDGED; } else { err = mv88e6xxx_vtu_get(chip, vid, &vlan); if (err) @@ -2123,6 +2170,7 @@ static int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, struct mv88e6xxx_chip *chip = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; + struct mv88e6xxx_port *p = &chip->ports[port]; bool warn; u8 member; int err; @@ -2156,13 +2204,21 @@ static int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, } if (pvid) { - err = mv88e6xxx_port_set_pvid(chip, port, vlan->vid); - if (err) { - dev_err(ds->dev, "p%d: failed to set PVID %d\n", - port, vlan->vid); + p->bridge_pvid.vid = vlan->vid; + p->bridge_pvid.valid = true; + + err = mv88e6xxx_port_commit_pvid(chip, port); + if (err) + goto out; + } else if (vlan->vid && p->bridge_pvid.vid == vlan->vid) { + /* The old pvid was reinstalled as a non-pvid VLAN */ + p->bridge_pvid.valid = false; + + err = mv88e6xxx_port_commit_pvid(chip, port); + if (err) goto out; - } } + out: mv88e6xxx_reg_unlock(chip); @@ -2212,6 +2268,7 @@ static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct mv88e6xxx_chip *chip = ds->priv; + struct mv88e6xxx_port *p = &chip->ports[port]; int err = 0; u16 pvid; @@ -2229,7 +2286,9 @@ static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, goto unlock; if (vlan->vid == pvid) { - err = mv88e6xxx_port_set_pvid(chip, port, 0); + p->bridge_pvid.valid = false; + + err = mv88e6xxx_port_commit_pvid(chip, port); if (err) goto unlock; } @@ -2393,7 +2452,16 @@ static int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, int err; mv88e6xxx_reg_lock(chip); + err = mv88e6xxx_bridge_map(chip, br); + if (err) + goto unlock; + + err = mv88e6xxx_port_commit_pvid(chip, port); + if (err) + goto unlock; + +unlock: mv88e6xxx_reg_unlock(chip); return err; @@ -2403,11 +2471,20 @@ static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port, struct net_device *br) { struct mv88e6xxx_chip *chip = ds->priv; + int err; mv88e6xxx_reg_lock(chip); + if (mv88e6xxx_bridge_map(chip, br) || mv88e6xxx_port_vlan_map(chip, port)) dev_err(ds->dev, "failed to remap in-chip Port VLAN\n"); + + err = mv88e6xxx_port_commit_pvid(chip, port); + if (err) + dev_err(ds->dev, + "port %d failed to restore standalone pvid: %pe\n", + port, ERR_PTR(err)); + mv88e6xxx_reg_unlock(chip); } @@ -2834,8 +2911,8 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) if (err) return err; - /* Port Control 2: don't force a good FCS, set the maximum frame size to - * 10240 bytes, disable 802.1q tags checking, don't discard tagged or + /* Port Control 2: don't force a good FCS, set the MTU size to + * 10222 bytes, disable 802.1q tags checking, don't discard tagged or * untagged frames on this port, do a destination address lookup on all * received packets as usual, disable ARP mirroring and don't send a * copy of all transmitted/received frames on this port to the CPU. @@ -2853,8 +2930,22 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) if (err) return err; + /* Associate MV88E6XXX_VID_BRIDGED with MV88E6XXX_FID_BRIDGED in the + * ATU by virtue of the fact that mv88e6xxx_atu_new() will pick it as + * the first free FID after MV88E6XXX_FID_STANDALONE. This will be used + * as the private PVID on ports under a VLAN-unaware bridge. 
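+ * (mv88e6xxx_atu_new() derives its map of in-use FIDs from the VTU and the ports' FIDs, so installing this VTU entry is what actually reserves the FID.)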
+ * Shared (DSA and CPU) ports must also be members of it, to translate + * the VID from the DSA tag into MV88E6XXX_FID_BRIDGED, instead of + * relying on their port default FID. + */ + err = mv88e6xxx_port_vlan_join(chip, port, MV88E6XXX_VID_BRIDGED, + MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_UNTAGGED, + false); + if (err) + return err; + if (chip->info->ops->port_set_jumbo_size) { - err = chip->info->ops->port_set_jumbo_size(chip, port, 10240); + err = chip->info->ops->port_set_jumbo_size(chip, port, 10218); if (err) return err; } @@ -2925,7 +3016,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) * database, and allow bidirectional communication between the * CPU and DSA port(s), and the other ports. */ - err = mv88e6xxx_port_set_fid(chip, port, 0); + err = mv88e6xxx_port_set_fid(chip, port, MV88E6XXX_FID_STANDALONE); if (err) return err; @@ -2944,10 +3035,10 @@ static int mv88e6xxx_get_max_mtu(struct dsa_switch *ds, int port) struct mv88e6xxx_chip *chip = ds->priv; if (chip->info->ops->port_set_jumbo_size) - return 10240; + return 10240 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; else if (chip->info->ops->set_max_frame_size) - return 1632; - return 1522; + return 1632 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; + return 1522 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; } static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) @@ -2955,6 +3046,9 @@ static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) struct mv88e6xxx_chip *chip = ds->priv; int ret = 0; + if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) + new_mtu += EDSA_HLEN; + mv88e6xxx_reg_lock(chip); if (chip->info->ops->port_set_jumbo_size) ret = chip->info->ops->port_set_jumbo_size(chip, port, new_mtu); @@ -3112,6 +3206,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds) } } + err = mv88e6xxx_vtu_setup(chip); + if (err) + goto unlock; + /* Setup Switch Port Registers */ for (i = 0; i < mv88e6xxx_num_ports(chip); i++) { if (dsa_is_unused_port(ds, i)) @@ -3141,10 +3239,6 @@ static int mv88e6xxx_setup(struct dsa_switch *ds) if (err) goto unlock; - err = mv88e6xxx_vtu_setup(chip); - if (err) - goto unlock; - err = mv88e6xxx_pvt_setup(chip); if (err) goto unlock; @@ -3725,7 +3819,6 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, - .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, .port_pause_limit = mv88e6097_port_pause_limit, .port_disable_learn_limit = mv88e6xxx_port_disable_learn_limit, @@ -3750,6 +3843,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .avb_ops = &mv88e6165_avb_ops, .ptp_ops = &mv88e6165_ptp_ops, .phylink_validate = mv88e6185_phylink_validate, + .set_max_frame_size = mv88e6185_g1_set_max_frame_size, }; static const struct mv88e6xxx_ops mv88e6165_ops = { diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index 675b1f3e43..8271b8aa7b 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -18,8 +18,12 @@ #include #include +#define EDSA_HLEN 8 #define MV88E6XXX_N_FID 4096 +#define MV88E6XXX_FID_STANDALONE 0 +#define MV88E6XXX_FID_BRIDGED 1 + /* PVT limits for 4-bit port and 5-bit switch */ #define MV88E6XXX_MAX_PVT_SWITCHES 32 #define MV88E6XXX_MAX_PVT_PORTS 16 @@ -245,9 +249,15 @@ struct mv88e6xxx_policy { u16 vid; }; +struct 
mv88e6xxx_vlan { + u16 vid; + bool valid; +}; + struct mv88e6xxx_port { struct mv88e6xxx_chip *chip; int port; + struct mv88e6xxx_vlan bridge_pvid; u64 serdes_stats[2]; u64 atu_member_violation; u64 atu_miss_violation; diff --git a/drivers/net/dsa/mv88e6xxx/global1.c b/drivers/net/dsa/mv88e6xxx/global1.c index 815b0f681d..5848112036 100644 --- a/drivers/net/dsa/mv88e6xxx/global1.c +++ b/drivers/net/dsa/mv88e6xxx/global1.c @@ -232,6 +232,8 @@ int mv88e6185_g1_set_max_frame_size(struct mv88e6xxx_chip *chip, int mtu) u16 val; int err; + mtu += ETH_HLEN + ETH_FCS_LEN; + err = mv88e6xxx_g1_read(chip, MV88E6XXX_G1_CTL1, &val); if (err) return err; diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index f77e2ee64a..d9817b20ea 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/drivers/net/dsa/mv88e6xxx/port.c @@ -1257,6 +1257,27 @@ int mv88e6xxx_port_set_8021q_mode(struct mv88e6xxx_chip *chip, int port, return 0; } +int mv88e6xxx_port_drop_untagged(struct mv88e6xxx_chip *chip, int port, + bool drop_untagged) +{ + u16 old, new; + int err; + + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_CTL2, &old); + if (err) + return err; + + if (drop_untagged) + new = old | MV88E6XXX_PORT_CTL2_DISCARD_UNTAGGED; + else + new = old & ~MV88E6XXX_PORT_CTL2_DISCARD_UNTAGGED; + + if (new == old) + return 0; + + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL2, new); +} + int mv88e6xxx_port_set_map_da(struct mv88e6xxx_chip *chip, int port) { u16 reg; @@ -1277,6 +1298,8 @@ int mv88e6165_port_set_jumbo_size(struct mv88e6xxx_chip *chip, int port, u16 reg; int err; + size += VLAN_ETH_HLEN + ETH_FCS_LEN; + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_CTL2, &reg); if (err) return err; diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h index b10e5aebac..03382b66f8 100644 --- a/drivers/net/dsa/mv88e6xxx/port.h +++ b/drivers/net/dsa/mv88e6xxx/port.h @@ -423,6 +423,8 @@ int mv88e6393x_port_set_cmode(struct mv88e6xxx_chip *chip, int port, phy_interface_t mode); int mv88e6185_port_get_cmode(struct mv88e6xxx_chip *chip, int port, u8 *cmode); int mv88e6352_port_get_cmode(struct mv88e6xxx_chip *chip, int port, u8 *cmode); +int mv88e6xxx_port_drop_untagged(struct mv88e6xxx_chip *chip, int port, + bool drop_untagged); int mv88e6xxx_port_set_map_da(struct mv88e6xxx_chip *chip, int port); int mv88e6095_port_set_upstream_port(struct mv88e6xxx_chip *chip, int port, int upstream_port); diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index a3a9636430..341236dcbd 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -266,12 +266,12 @@ static void felix_8021q_cpu_port_deinit(struct ocelot *ocelot, int port) */ static int felix_setup_mmio_filtering(struct felix *felix) { - unsigned long user_ports = 0, cpu_ports = 0; + unsigned long user_ports = dsa_user_ports(felix->ds); struct ocelot_vcap_filter *redirect_rule; struct ocelot_vcap_filter *tagging_rule; struct ocelot *ocelot = &felix->ocelot; struct dsa_switch *ds = felix->ds; - int port, ret; + int cpu = -1, port, ret; tagging_rule = kzalloc(sizeof(struct ocelot_vcap_filter), GFP_KERNEL); if (!tagging_rule) @@ -284,12 +284,15 @@ static int felix_setup_mmio_filtering(struct felix *felix) } for (port = 0; port < ocelot->num_phys_ports; port++) { - if (dsa_is_user_port(ds, port)) - user_ports |= BIT(port); - if (dsa_is_cpu_port(ds, port)) - cpu_ports |= BIT(port); + if (dsa_is_cpu_port(ds, port)) { + cpu = port; + break; + } } + if (cpu < 0) + return
-EINVAL; + tagging_rule->key_type = OCELOT_VCAP_KEY_ETYPE; *(__be16 *)tagging_rule->key.etype.etype.value = htons(ETH_P_1588); *(__be16 *)tagging_rule->key.etype.etype.mask = htons(0xffff); @@ -325,7 +328,7 @@ static int felix_setup_mmio_filtering(struct felix *felix) * the CPU port module */ redirect_rule->action.mask_mode = OCELOT_MASK_MODE_REDIRECT; - redirect_rule->action.port_mask = cpu_ports; + redirect_rule->action.port_mask = BIT(cpu); } else { /* Trap PTP packets only to the CPU port module (which is * redirected to the NPI port) @@ -1074,6 +1077,101 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) return 0; } +static void ocelot_port_purge_txtstamp_skb(struct ocelot *ocelot, int port, + struct sk_buff *skb) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone; + struct sk_buff *skb_match = NULL, *skb_tmp; + unsigned long flags; + + if (!clone) + return; + + spin_lock_irqsave(&ocelot_port->tx_skbs.lock, flags); + + skb_queue_walk_safe(&ocelot_port->tx_skbs, skb, skb_tmp) { + if (skb != clone) + continue; + __skb_unlink(skb, &ocelot_port->tx_skbs); + skb_match = skb; + break; + } + + spin_unlock_irqrestore(&ocelot_port->tx_skbs.lock, flags); + + WARN_ONCE(!skb_match, + "Could not find skb clone in TX timestamping list\n"); +} + +#define work_to_xmit_work(w) \ + container_of((w), struct felix_deferred_xmit_work, work) + +static void felix_port_deferred_xmit(struct kthread_work *work) +{ + struct felix_deferred_xmit_work *xmit_work = work_to_xmit_work(work); + struct dsa_switch *ds = xmit_work->dp->ds; + struct sk_buff *skb = xmit_work->skb; + u32 rew_op = ocelot_ptp_rew_op(skb); + struct ocelot *ocelot = ds->priv; + int port = xmit_work->dp->index; + int retries = 10; + + do { + if (ocelot_can_inject(ocelot, 0)) + break; + + cpu_relax(); + } while (--retries); + + if (!retries) { + dev_err(ocelot->dev, "port %d failed to inject skb\n", + port); + ocelot_port_purge_txtstamp_skb(ocelot, port, skb); + kfree_skb(skb); + return; + } + + ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); + + consume_skb(skb); + kfree(xmit_work); +} + +static int felix_port_setup_tagger_data(struct dsa_switch *ds, int port) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct ocelot *ocelot = ds->priv; + struct felix *felix = ocelot_to_felix(ocelot); + struct felix_port *felix_port; + + if (!dsa_port_is_user(dp)) + return 0; + + felix_port = kzalloc(sizeof(*felix_port), GFP_KERNEL); + if (!felix_port) + return -ENOMEM; + + felix_port->xmit_worker = felix->xmit_worker; + felix_port->xmit_work_fn = felix_port_deferred_xmit; + + dp->priv = felix_port; + + return 0; +} + +static void felix_port_teardown_tagger_data(struct dsa_switch *ds, int port) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct felix_port *felix_port = dp->priv; + + if (!felix_port) + return; + + dp->priv = NULL; + kfree(felix_port); +} + /* Hardware initialization done here so that we can allocate structures with * devm without fear of dsa_register_switch returning -EPROBE_DEFER and causing * us to allocate structures twice (leak memory) and map PCI memory twice @@ -1102,6 +1200,12 @@ static int felix_setup(struct dsa_switch *ds) } } + felix->xmit_worker = kthread_create_worker(0, "felix_xmit"); + if (IS_ERR(felix->xmit_worker)) { + err = PTR_ERR(felix->xmit_worker); + goto out_deinit_timestamp; + } + for (port = 0; port < ds->num_ports; port++) { if (dsa_is_unused_port(ds, port)) continue; @@ -1112,6 +1216,14 @@ static int 
felix_setup(struct dsa_switch *ds) * bits of vlan tag. */ felix_port_qos_map_init(ocelot, port); + + err = felix_port_setup_tagger_data(ds, port); + if (err) { + dev_err(ds->dev, + "port %d failed to set up tagger data: %pe\n", + port, ERR_PTR(err)); + goto out_deinit_ports; + } } err = ocelot_devlink_sb_register(ocelot); @@ -1126,6 +1238,7 @@ static int felix_setup(struct dsa_switch *ds) * there's no real point in checking for errors. */ felix_set_tag_protocol(ds, port, felix->tag_proto); + break; } ds->mtu_enforcement_ingress = true; @@ -1138,9 +1251,13 @@ static int felix_setup(struct dsa_switch *ds) if (dsa_is_unused_port(ds, port)) continue; + felix_port_teardown_tagger_data(ds, port); ocelot_deinit_port(ocelot, port); } + kthread_destroy_worker(felix->xmit_worker); + +out_deinit_timestamp: ocelot_deinit_timestamp(ocelot); ocelot_deinit(ocelot); @@ -1162,19 +1279,23 @@ static void felix_teardown(struct dsa_switch *ds) continue; felix_del_tag_protocol(ds, port, felix->tag_proto); + break; } - ocelot_devlink_sb_unregister(ocelot); - ocelot_deinit_timestamp(ocelot); - ocelot_deinit(ocelot); - for (port = 0; port < ocelot->num_phys_ports; port++) { if (dsa_is_unused_port(ds, port)) continue; + felix_port_teardown_tagger_data(ds, port); ocelot_deinit_port(ocelot, port); } + kthread_destroy_worker(felix->xmit_worker); + + ocelot_devlink_sb_unregister(ocelot); + ocelot_deinit_timestamp(ocelot); + ocelot_deinit(ocelot); + if (felix->info->mdio_bus_free) felix->info->mdio_bus_free(ocelot); } @@ -1291,8 +1412,12 @@ static void felix_txtstamp(struct dsa_switch *ds, int port, if (!ocelot->ptp) return; - if (ocelot_port_txtstamp_request(ocelot, port, skb, &clone)) + if (ocelot_port_txtstamp_request(ocelot, port, skb, &clone)) { + dev_err_ratelimited(ds->dev, + "port %d delivering skb without TX timestamp\n", + port); return; + } if (clone) OCELOT_SKB_CB(skb)->clone = clone; diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 54024b6f94..be3e42e135 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -62,6 +62,7 @@ struct felix { resource_size_t switch_base; resource_size_t imdio_base; enum dsa_tag_protocol tag_proto; + struct kthread_worker *xmit_worker; }; struct net_device *felix_port_to_netdev(struct ocelot *ocelot, int port); diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 7c0db80eff..924c3f1299 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -3117,7 +3117,7 @@ static void sja1105_teardown(struct dsa_switch *ds) sja1105_static_config_free(&priv->static_config); } -const struct dsa_switch_ops sja1105_switch_ops = { +static const struct dsa_switch_ops sja1105_switch_ops = { .get_tag_protocol = sja1105_get_tag_protocol, .setup = sja1105_setup, .teardown = sja1105_teardown, @@ -3166,7 +3166,6 @@ const struct dsa_switch_ops sja1105_switch_ops = { .port_bridge_tx_fwd_offload = dsa_tag_8021q_bridge_tx_fwd_offload, .port_bridge_tx_fwd_unoffload = dsa_tag_8021q_bridge_tx_fwd_unoffload, }; -EXPORT_SYMBOL_GPL(sja1105_switch_ops); static const struct of_device_id sja1105_dt_ids[]; diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 691f6dd7e6..54396992a9 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -64,6 +64,7 @@ enum sja1105_ptp_clk_mode { static int sja1105_change_rxtstamping(struct sja1105_private *priv, bool on) { + struct sja1105_tagger_data 
*tagger_data = &priv->tagger_data; struct sja1105_ptp_data *ptp_data = &priv->ptp_data; struct sja1105_general_params_entry *general_params; struct sja1105_table *table; @@ -79,7 +80,7 @@ static int sja1105_change_rxtstamping(struct sja1105_private *priv, priv->tagger_data.stampable_skb = NULL; } ptp_cancel_worker_sync(ptp_data->clock); - skb_queue_purge(&ptp_data->skb_txtstamp_queue); + skb_queue_purge(&tagger_data->skb_txtstamp_queue); skb_queue_purge(&ptp_data->skb_rxtstamp_queue); return sja1105_static_config_reload(priv, SJA1105_RX_HWTSTAMPING); @@ -452,40 +453,6 @@ bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, return priv->info->rxtstamp(ds, port, skb); } -void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id, - enum sja1110_meta_tstamp dir, u64 tstamp) -{ - struct sja1105_private *priv = ds->priv; - struct sja1105_ptp_data *ptp_data = &priv->ptp_data; - struct sk_buff *skb, *skb_tmp, *skb_match = NULL; - struct skb_shared_hwtstamps shwt = {0}; - - /* We don't care about RX timestamps on the CPU port */ - if (dir == SJA1110_META_TSTAMP_RX) - return; - - spin_lock(&ptp_data->skb_txtstamp_queue.lock); - - skb_queue_walk_safe(&ptp_data->skb_txtstamp_queue, skb, skb_tmp) { - if (SJA1105_SKB_CB(skb)->ts_id != ts_id) - continue; - - __skb_unlink(skb, &ptp_data->skb_txtstamp_queue); - skb_match = skb; - - break; - } - - spin_unlock(&ptp_data->skb_txtstamp_queue.lock); - - if (WARN_ON(!skb_match)) - return; - - shwt.hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(tstamp)); - skb_complete_tx_timestamp(skb_match, &shwt); -} -EXPORT_SYMBOL_GPL(sja1110_process_meta_tstamp); - /* In addition to cloning the skb which is done by the common * sja1105_port_txtstamp, we need to generate a timestamp ID and save the * packet to the TX timestamping queue. @@ -494,7 +461,6 @@ void sja1110_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) { struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone; struct sja1105_private *priv = ds->priv; - struct sja1105_ptp_data *ptp_data = &priv->ptp_data; struct sja1105_port *sp = &priv->ports[port]; u8 ts_id; @@ -510,7 +476,7 @@ void sja1110_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) spin_unlock(&sp->data->meta_lock); - skb_queue_tail(&ptp_data->skb_txtstamp_queue, clone); + skb_queue_tail(&sp->data->skb_txtstamp_queue, clone); } /* Called from dsa_skb_tx_timestamp. 
This callback is just to clone @@ -953,7 +919,7 @@ int sja1105_ptp_clock_register(struct dsa_switch *ds) /* Only used on SJA1105 */ skb_queue_head_init(&ptp_data->skb_rxtstamp_queue); /* Only used on SJA1110 */ - skb_queue_head_init(&ptp_data->skb_txtstamp_queue); + skb_queue_head_init(&tagger_data->skb_txtstamp_queue); spin_lock_init(&tagger_data->meta_lock); ptp_data->clock = ptp_clock_register(&ptp_data->caps, ds->dev); @@ -971,6 +937,7 @@ int sja1105_ptp_clock_register(struct dsa_switch *ds) void sja1105_ptp_clock_unregister(struct dsa_switch *ds) { struct sja1105_private *priv = ds->priv; + struct sja1105_tagger_data *tagger_data = &priv->tagger_data; struct sja1105_ptp_data *ptp_data = &priv->ptp_data; if (IS_ERR_OR_NULL(ptp_data->clock)) @@ -978,7 +945,7 @@ void sja1105_ptp_clock_unregister(struct dsa_switch *ds) del_timer_sync(&ptp_data->extts_timer); ptp_cancel_worker_sync(ptp_data->clock); - skb_queue_purge(&ptp_data->skb_txtstamp_queue); + skb_queue_purge(&tagger_data->skb_txtstamp_queue); skb_queue_purge(&ptp_data->skb_rxtstamp_queue); ptp_clock_unregister(ptp_data->clock); ptp_data->clock = NULL; diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h index 3c874bb4c1..3ae6b9fdd4 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.h +++ b/drivers/net/dsa/sja1105/sja1105_ptp.h @@ -8,21 +8,6 @@ #if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) -/* Timestamps are in units of 8 ns clock ticks (equivalent to - * a fixed 125 MHz clock). - */ -#define SJA1105_TICK_NS 8 - -static inline s64 ns_to_sja1105_ticks(s64 ns) -{ - return ns / SJA1105_TICK_NS; -} - -static inline s64 sja1105_ticks_to_ns(s64 ticks) -{ - return ticks * SJA1105_TICK_NS; -} - /* Calculate the first base_time in the future that satisfies this * relationship: * @@ -77,10 +62,6 @@ struct sja1105_ptp_data { struct timer_list extts_timer; /* Used only on SJA1105 to reconstruct partial timestamps */ struct sk_buff_head skb_rxtstamp_queue; - /* Used on SJA1110 where meta frames are generated only for - * 2-step TX timestamps - */ - struct sk_buff_head skb_txtstamp_queue; struct ptp_clock_info caps; struct ptp_clock *clock; struct sja1105_ptp_cmd cmd; diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index d796684ec9..412ae3e43f 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -100,6 +100,7 @@ config JME config KORINA tristate "Korina (IDT RC32434) Ethernet support" depends on MIKROTIK_RB532 || COMPILE_TEST + select CRC32 select MII help If you have a Mikrotik RouterBoard 500 or IDT RC32434 diff --git a/drivers/net/ethernet/arc/Kconfig b/drivers/net/ethernet/arc/Kconfig index 37a41773dd..92a79c4ffa 100644 --- a/drivers/net/ethernet/arc/Kconfig +++ b/drivers/net/ethernet/arc/Kconfig @@ -21,6 +21,7 @@ config ARC_EMAC_CORE depends on ARC || ARCH_ROCKCHIP || COMPILE_TEST select MII select PHYLIB + select CRC32 config ARC_EMAC tristate "ARC EMAC support" diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c b/drivers/net/ethernet/broadcom/bgmac-platform.c index 4ab5bf64d3..df8ff839cc 100644 --- a/drivers/net/ethernet/broadcom/bgmac-platform.c +++ b/drivers/net/ethernet/broadcom/bgmac-platform.c @@ -192,6 +192,9 @@ static int bgmac_probe(struct platform_device *pdev) bgmac->dma_dev = &pdev->dev; ret = of_get_mac_address(np, bgmac->net_dev->dev_addr); + if (ret == -EPROBE_DEFER) + return ret; + if (ret) dev_warn(&pdev->dev, "MAC address not present in device tree\n"); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c 
b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 60d94e0a07..4c977dfc44 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -541,8 +541,7 @@ static void enetc_mac_config(struct enetc_hw *hw, phy_interface_t phy_mode) if (phy_interface_mode_is_rgmii(phy_mode)) { val = enetc_port_rd(hw, ENETC_PM0_IF_MODE); - val &= ~ENETC_PM0_IFM_EN_AUTO; - val &= ENETC_PM0_IFM_IFMODE_MASK; + val &= ~(ENETC_PM0_IFM_EN_AUTO | ENETC_PM0_IFM_IFMODE_MASK); val |= ENETC_PM0_IFM_IFMODE_GMII | ENETC_PM0_IFM_RG; enetc_port_wr(hw, ENETC_PM0_IF_MODE, val); } diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 1d3188e8e3..92dc18a4bc 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -780,7 +780,7 @@ struct gve_queue_page_list *gve_assign_rx_qpl(struct gve_priv *priv) gve_num_tx_qpls(priv)); /* we are out of rx qpls */ - if (id == priv->qpl_cfg.qpl_map_size) + if (id == gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv)) return NULL; set_bit(id, priv->qpl_cfg.qpl_id_map); diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 099a2bc5ae..bf8a4a7c43 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -41,6 +41,7 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) { struct gve_priv *priv = netdev_priv(dev); unsigned int start; + u64 packets, bytes; int ring; if (priv->rx) { @@ -48,10 +49,12 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) do { start = u64_stats_fetch_begin(&priv->rx[ring].statss); - s->rx_packets += priv->rx[ring].rpackets; - s->rx_bytes += priv->rx[ring].rbytes; + packets = priv->rx[ring].rpackets; + bytes = priv->rx[ring].rbytes; } while (u64_stats_fetch_retry(&priv->rx[ring].statss, start)); + s->rx_packets += packets; + s->rx_bytes += bytes; } } if (priv->tx) { @@ -59,10 +62,12 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) do { start = u64_stats_fetch_begin(&priv->tx[ring].statss); - s->tx_packets += priv->tx[ring].pkt_done; - s->tx_bytes += priv->tx[ring].bytes_done; + packets = priv->tx[ring].pkt_done; + bytes = priv->tx[ring].bytes_done; } while (u64_stats_fetch_retry(&priv->tx[ring].statss, start)); + s->tx_packets += packets; + s->tx_bytes += bytes; } } } @@ -82,6 +87,9 @@ static int gve_alloc_counter_array(struct gve_priv *priv) static void gve_free_counter_array(struct gve_priv *priv) { + if (!priv->counter_array) + return; + dma_free_coherent(&priv->pdev->dev, priv->num_event_counters * sizeof(*priv->counter_array), @@ -142,6 +150,9 @@ static int gve_alloc_stats_report(struct gve_priv *priv) static void gve_free_stats_report(struct gve_priv *priv) { + if (!priv->stats_report) + return; + del_timer_sync(&priv->stats_report_timer); dma_free_coherent(&priv->pdev->dev, priv->stats_report_len, priv->stats_report, priv->stats_report_bus); @@ -370,18 +381,19 @@ static void gve_free_notify_blocks(struct gve_priv *priv) { int i; - if (priv->msix_vectors) { - /* Free the irqs */ - for (i = 0; i < priv->num_ntfy_blks; i++) { - struct gve_notify_block *block = &priv->ntfy_blocks[i]; - int msix_idx = i; + if (!priv->msix_vectors) + return; - irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector, - NULL); - free_irq(priv->msix_vectors[msix_idx].vector, block); - } - free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv); + /* Free the 
irqs */ + for (i = 0; i < priv->num_ntfy_blks; i++) { + struct gve_notify_block *block = &priv->ntfy_blocks[i]; + int msix_idx = i; + + irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector, + NULL); + free_irq(priv->msix_vectors[msix_idx].vector, block); } + free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv); dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks * sizeof(*priv->ntfy_blocks), priv->ntfy_blocks, priv->ntfy_block_bus); @@ -1185,9 +1197,10 @@ static void gve_handle_reset(struct gve_priv *priv) void gve_handle_report_stats(struct gve_priv *priv) { - int idx, stats_idx = 0, tx_bytes; - unsigned int start = 0; struct stats *stats = priv->stats_report->stats; + int idx, stats_idx = 0; + unsigned int start = 0; + u64 tx_bytes; if (!gve_get_report_stats(priv)) return; diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index bb82613682..94941d4e47 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -104,8 +104,14 @@ static int gve_prefill_rx_pages(struct gve_rx_ring *rx) if (!rx->data.page_info) return -ENOMEM; - if (!rx->data.raw_addressing) + if (!rx->data.raw_addressing) { rx->data.qpl = gve_assign_rx_qpl(priv); + if (!rx->data.qpl) { + kvfree(rx->data.page_info); + rx->data.page_info = NULL; + return -ENOMEM; + } + } for (i = 0; i < slots; i++) { if (!rx->data.raw_addressing) { struct page *page = rx->data.qpl->pages[i]; diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 546a605303..8ba21d6dc2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -752,7 +752,6 @@ struct hnae3_tc_info { u8 prio_tc[HNAE3_MAX_USER_PRIO]; /* TC indexed by prio */ u16 tqp_count[HNAE3_MAX_TC]; u16 tqp_offset[HNAE3_MAX_TC]; - unsigned long tc_en; /* bitmap of TC enabled */ u8 num_tc; /* Total number of enabled TCs */ bool mqprio_active; }; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index adc54a7266..468b8f07bf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -623,13 +623,9 @@ static int hns3_nic_set_real_num_queue(struct net_device *netdev) return ret; } - for (i = 0; i < HNAE3_MAX_TC; i++) { - if (!test_bit(i, &tc_info->tc_en)) - continue; - + for (i = 0; i < tc_info->num_tc; i++) netdev_set_tc_queue(netdev, i, tc_info->tqp_count[i], tc_info->tqp_offset[i]); - } } ret = netif_set_real_num_tx_queues(netdev, queue_size); @@ -779,6 +775,11 @@ static int hns3_nic_net_open(struct net_device *netdev) if (hns3_nic_resetting(netdev)) return -EBUSY; + if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { + netdev_warn(netdev, "net open repeatedly!\n"); + return 0; + } + netif_carrier_off(netdev); ret = hns3_nic_set_real_num_queue(netdev); @@ -4865,12 +4866,9 @@ static void hns3_init_tx_ring_tc(struct hns3_nic_priv *priv) struct hnae3_tc_info *tc_info = &kinfo->tc_info; int i; - for (i = 0; i < HNAE3_MAX_TC; i++) { + for (i = 0; i < tc_info->num_tc; i++) { int j; - if (!test_bit(i, &tc_info->tc_en)) - continue; - for (j = 0; j < tc_info->tqp_count[i]; j++) { struct hnae3_queue *q; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 7ea511d59e..5ebd96f683 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -334,7 
+334,8 @@ static void hns3_selftest_prepare(struct net_device *ndev, #if IS_ENABLED(CONFIG_VLAN_8021Q) /* Disable the vlan filter for selftest does not support it */ - if (h->ae_algo->ops->enable_vlan_filter) + if (h->ae_algo->ops->enable_vlan_filter && + ndev->features & NETIF_F_HW_VLAN_CTAG_FILTER) h->ae_algo->ops->enable_vlan_filter(h, false); #endif @@ -359,7 +360,8 @@ static void hns3_selftest_restore(struct net_device *ndev, bool if_running) h->ae_algo->ops->halt_autoneg(h, false); #if IS_ENABLED(CONFIG_VLAN_8021Q) - if (h->ae_algo->ops->enable_vlan_filter) + if (h->ae_algo->ops->enable_vlan_filter && + ndev->features & NETIF_F_HW_VLAN_CTAG_FILTER) h->ae_algo->ops->enable_vlan_filter(h, true); #endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index ac9b695133..9c2eeaa822 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -467,7 +467,7 @@ int hclge_cmd_queue_init(struct hclge_dev *hdev) return ret; } -static int hclge_firmware_compat_config(struct hclge_dev *hdev) +static int hclge_firmware_compat_config(struct hclge_dev *hdev, bool en) { struct hclge_firmware_compat_cmd *req; struct hclge_desc desc; @@ -475,13 +475,16 @@ static int hclge_firmware_compat_config(struct hclge_dev *hdev) hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_IMP_COMPAT_CFG, false); - req = (struct hclge_firmware_compat_cmd *)desc.data; + if (en) { + req = (struct hclge_firmware_compat_cmd *)desc.data; - hnae3_set_bit(compat, HCLGE_LINK_EVENT_REPORT_EN_B, 1); - hnae3_set_bit(compat, HCLGE_NCSI_ERROR_REPORT_EN_B, 1); - if (hnae3_dev_phy_imp_supported(hdev)) - hnae3_set_bit(compat, HCLGE_PHY_IMP_EN_B, 1); - req->compat = cpu_to_le32(compat); + hnae3_set_bit(compat, HCLGE_LINK_EVENT_REPORT_EN_B, 1); + hnae3_set_bit(compat, HCLGE_NCSI_ERROR_REPORT_EN_B, 1); + if (hnae3_dev_phy_imp_supported(hdev)) + hnae3_set_bit(compat, HCLGE_PHY_IMP_EN_B, 1); + + req->compat = cpu_to_le32(compat); + } return hclge_cmd_send(&hdev->hw, &desc, 1); } @@ -538,7 +541,7 @@ int hclge_cmd_init(struct hclge_dev *hdev) /* ask the firmware to enable some features, driver can work without * it. */ - ret = hclge_firmware_compat_config(hdev); + ret = hclge_firmware_compat_config(hdev, true); if (ret) dev_warn(&hdev->pdev->dev, "Firmware compatible features not enabled(%d).\n", @@ -568,6 +571,8 @@ static void hclge_cmd_uninit_regs(struct hclge_hw *hw) void hclge_cmd_uninit(struct hclge_dev *hdev) { + hclge_firmware_compat_config(hdev, false); + set_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state); /* wait to ensure that the firmware completes the possible left * over commands. 
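[Note on the gve_get_stats() hunk earlier in this series: it is the standard u64_stats snapshot pattern. The counters are copied into locals inside the fetch/retry loop and folded into the totals only once the sequence count is stable, so a retry cannot double-add. A minimal sketch of the reader side, with illustrative type and field names (demo_ring is not from the driver):

#include <linux/u64_stats_sync.h>

struct demo_ring {
	struct u64_stats_sync syncp;	/* writer brackets updates with u64_stats_update_begin/end */
	u64 packets;
	u64 bytes;
};

static void demo_fold_ring_stats(const struct demo_ring *ring,
				 u64 *tot_packets, u64 *tot_bytes)
{
	unsigned int start;
	u64 packets, bytes;

	do {
		start = u64_stats_fetch_begin(&ring->syncp);
		/* snapshot only; do not touch the totals in here */
		packets = ring->packets;
		bytes = ring->bytes;
	} while (u64_stats_fetch_retry(&ring->syncp, start));

	*tot_packets += packets;
	*tot_bytes += bytes;
}

Accumulating directly inside the loop, as the old gve code did, adds the same ring's counters twice whenever the writer forces a retry.]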
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c index 4a619e5d3f..307c9e8305 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -247,6 +247,10 @@ static int hclge_ieee_setets(struct hnae3_handle *h, struct ieee_ets *ets) } hclge_tm_schd_info_update(hdev, num_tc); + if (num_tc > 1) + hdev->flag |= HCLGE_FLAG_DCB_ENABLE; + else + hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE; ret = hclge_ieee_ets_to_tm_info(hdev, ets); if (ret) @@ -306,8 +310,7 @@ static int hclge_ieee_setpfc(struct hnae3_handle *h, struct ieee_pfc *pfc) u8 i, j, pfc_map, *prio_tc; int ret; - if (!(hdev->dcbx_cap & DCB_CAP_DCBX_VER_IEEE) || - hdev->flag & HCLGE_FLAG_MQPRIO_ENABLE) + if (!(hdev->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) return -EINVAL; if (pfc->pfc_en == hdev->tm_info.pfc_en) @@ -441,8 +444,6 @@ static int hclge_mqprio_qopt_check(struct hclge_dev *hdev, static void hclge_sync_mqprio_qopt(struct hnae3_tc_info *tc_info, struct tc_mqprio_qopt_offload *mqprio_qopt) { - int i; - memset(tc_info, 0, sizeof(*tc_info)); tc_info->num_tc = mqprio_qopt->qopt.num_tc; memcpy(tc_info->prio_tc, mqprio_qopt->qopt.prio_tc_map, @@ -451,9 +452,6 @@ static void hclge_sync_mqprio_qopt(struct hnae3_tc_info *tc_info, sizeof_field(struct hnae3_tc_info, tqp_count)); memcpy(tc_info->tqp_offset, mqprio_qopt->qopt.offset, sizeof_field(struct hnae3_tc_info, tqp_offset)); - - for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) - set_bit(tc_info->prio_tc[i], &tc_info->tc_en); } static int hclge_config_tc(struct hclge_dev *hdev, @@ -519,12 +517,17 @@ static int hclge_setup_tc(struct hnae3_handle *h, return hclge_notify_init_up(hdev); err_out: - /* roll-back */ - memcpy(&kinfo->tc_info, &old_tc_info, sizeof(old_tc_info)); - if (hclge_config_tc(hdev, &kinfo->tc_info)) - dev_err(&hdev->pdev->dev, - "failed to roll back tc configuration\n"); - + if (!tc) { + dev_warn(&hdev->pdev->dev, + "failed to destroy mqprio, will be active after reset, ret = %d\n", + ret); + } else { + /* roll-back */ + memcpy(&kinfo->tc_info, &old_tc_info, sizeof(old_tc_info)); + if (hclge_config_tc(hdev, &kinfo->tc_info)) + dev_err(&hdev->pdev->dev, + "failed to roll back tc configuration\n"); + } hclge_notify_init_up(hdev); return ret; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 87d96f82c3..32f62cd2dd 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -719,9 +719,9 @@ static void hclge_dbg_fill_shaper_content(struct hclge_tm_shaper_para *para, sprintf(result[(*index)++], "%6u", para->rate); } -static int hclge_dbg_dump_tm_pg(struct hclge_dev *hdev, char *buf, int len) +static int __hclge_dbg_dump_tm_pg(struct hclge_dev *hdev, char *data_str, + char *buf, int len) { - char data_str[ARRAY_SIZE(tm_pg_items)][HCLGE_DBG_DATA_STR_LEN]; struct hclge_tm_shaper_para c_shaper_para, p_shaper_para; char *result[ARRAY_SIZE(tm_pg_items)], *sch_mode_str; u8 pg_id, sch_mode, weight, pri_bit_map, i, j; @@ -729,8 +729,10 @@ static int hclge_dbg_dump_tm_pg(struct hclge_dev *hdev, char *buf, int len) int pos = 0; int ret; - for (i = 0; i < ARRAY_SIZE(tm_pg_items); i++) - result[i] = &data_str[i][0]; + for (i = 0; i < ARRAY_SIZE(tm_pg_items); i++) { + result[i] = data_str; + data_str += HCLGE_DBG_DATA_STR_LEN; + } hclge_dbg_fill_content(content, sizeof(content), tm_pg_items, NULL, 
ARRAY_SIZE(tm_pg_items)); @@ -781,6 +783,24 @@ static int hclge_dbg_dump_tm_pg(struct hclge_dev *hdev, char *buf, int len) return 0; } +static int hclge_dbg_dump_tm_pg(struct hclge_dev *hdev, char *buf, int len) +{ + char *data_str; + int ret; + + data_str = kcalloc(ARRAY_SIZE(tm_pg_items), + HCLGE_DBG_DATA_STR_LEN, GFP_KERNEL); + + if (!data_str) + return -ENOMEM; + + ret = __hclge_dbg_dump_tm_pg(hdev, data_str, buf, len); + + kfree(data_str); + + return ret; +} + static int hclge_dbg_dump_tm_port(struct hclge_dev *hdev, char *buf, int len) { struct hclge_tm_shaper_para shaper_para; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 47fea89858..f5b8d1fee0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8708,15 +8708,8 @@ int hclge_add_uc_addr_common(struct hclge_vport *vport, } /* check if we just hit the duplicate */ - if (!ret) { - dev_warn(&hdev->pdev->dev, "VF %u mac(%pM) exists\n", - vport->vport_id, addr); - return 0; - } - - dev_err(&hdev->pdev->dev, - "PF failed to add unicast entry(%pM) in the MAC table\n", - addr); + if (!ret) + return -EEXIST; return ret; } @@ -8868,7 +8861,13 @@ static void hclge_sync_vport_mac_list(struct hclge_vport *vport, } else { set_bit(HCLGE_VPORT_STATE_MAC_TBL_CHANGE, &vport->state); - break; + + /* If one unicast mac address already exists in hardware, + * we need to try whether other unicast mac addresses + * are new addresses that can be added. + */ + if (ret != -EEXIST) + break; } } } @@ -12797,8 +12796,12 @@ static void hclge_sync_promisc_mode(struct hclge_dev *hdev) continue; if (vport->vf_info.trusted) { - uc_en = vport->vf_info.request_uc_en > 0; - mc_en = vport->vf_info.request_mc_en > 0; + uc_en = vport->vf_info.request_uc_en > 0 || + vport->overflow_promisc_flags & + HNAE3_OVERFLOW_UPE; + mc_en = vport->vf_info.request_mc_en > 0 || + vport->overflow_promisc_flags & + HNAE3_OVERFLOW_MPE; } bc_en = vport->vf_info.request_bc_en > 0; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 44618cc4cc..f314dbd3ce 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -687,12 +687,10 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport) for (i = 0; i < HNAE3_MAX_TC; i++) { if (hdev->hw_tc_map & BIT(i) && i < kinfo->tc_info.num_tc) { - set_bit(i, &kinfo->tc_info.tc_en); kinfo->tc_info.tqp_offset[i] = i * kinfo->rss_size; kinfo->tc_info.tqp_count[i] = kinfo->rss_size; } else { /* Set to default queue if TC is disable */ - clear_bit(i, &kinfo->tc_info.tc_en); kinfo->tc_info.tqp_offset[i] = 0; kinfo->tc_info.tqp_count[i] = 1; } @@ -729,14 +727,6 @@ static void hclge_tm_tc_info_init(struct hclge_dev *hdev) for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) hdev->tm_info.prio_tc[i] = (i >= hdev->tm_info.num_tc) ? 0 : i; - - /* DCB is enabled if we have more than 1 TC or pfc_en is - * non-zero. 
- */ - if (hdev->tm_info.num_tc > 1 || hdev->tm_info.pfc_en) - hdev->flag |= HCLGE_FLAG_DCB_ENABLE; - else - hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE; } static void hclge_tm_pg_info_init(struct hclge_dev *hdev) @@ -767,10 +757,10 @@ static void hclge_tm_pg_info_init(struct hclge_dev *hdev) static void hclge_update_fc_mode_by_dcb_flag(struct hclge_dev *hdev) { - if (!(hdev->flag & HCLGE_FLAG_DCB_ENABLE)) { + if (hdev->tm_info.num_tc == 1 && !hdev->tm_info.pfc_en) { if (hdev->fc_mode_last_time == HCLGE_FC_PFC) dev_warn(&hdev->pdev->dev, - "DCB is disable, but last mode is FC_PFC\n"); + "Only 1 tc used, but last mode is FC_PFC\n"); hdev->tm_info.fc_mode = hdev->fc_mode_last_time; } else if (hdev->tm_info.fc_mode != HCLGE_FC_PFC) { @@ -796,7 +786,7 @@ static void hclge_update_fc_mode(struct hclge_dev *hdev) } } -static void hclge_pfc_info_init(struct hclge_dev *hdev) +void hclge_tm_pfc_info_update(struct hclge_dev *hdev) { if (hdev->ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) hclge_update_fc_mode(hdev); @@ -812,7 +802,7 @@ static void hclge_tm_schd_info_init(struct hclge_dev *hdev) hclge_tm_vport_info_update(hdev); - hclge_pfc_info_init(hdev); + hclge_tm_pfc_info_update(hdev); } static int hclge_tm_pg_to_pri_map(struct hclge_dev *hdev) @@ -1558,19 +1548,6 @@ void hclge_tm_schd_info_update(struct hclge_dev *hdev, u8 num_tc) hclge_tm_schd_info_init(hdev); } -void hclge_tm_pfc_info_update(struct hclge_dev *hdev) -{ - /* DCB is enabled if we have more than 1 TC or pfc_en is - * non-zero. - */ - if (hdev->tm_info.num_tc > 1 || hdev->tm_info.pfc_en) - hdev->flag |= HCLGE_FLAG_DCB_ENABLE; - else - hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE; - - hclge_pfc_info_init(hdev); -} - int hclge_tm_init_hw(struct hclge_dev *hdev, bool init) { int ret; @@ -1616,7 +1593,7 @@ int hclge_tm_vport_map_update(struct hclge_dev *hdev) if (ret) return ret; - if (!(hdev->flag & HCLGE_FLAG_DCB_ENABLE)) + if (hdev->tm_info.num_tc == 1 && !hdev->tm_info.pfc_en) return 0; return hclge_tm_bp_setup(hdev); diff --git a/drivers/net/ethernet/hisilicon/hns_mdio.c b/drivers/net/ethernet/hisilicon/hns_mdio.c index 3e54017a2a..07fdab5800 100644 --- a/drivers/net/ethernet/hisilicon/hns_mdio.c +++ b/drivers/net/ethernet/hisilicon/hns_mdio.c @@ -354,7 +354,7 @@ static int hns_mdio_reset(struct mii_bus *bus) if (dev_of_node(bus->parent)) { if (!mdio_dev->subctrl_vbase) { - dev_err(&bus->dev, "mdio sys ctl reg has not maped\n"); + dev_err(&bus->dev, "mdio sys ctl reg has not mapped\n"); return -ENODEV; } diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index a4579b3401..6aa6ff89a7 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4708,14 +4708,6 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq, return 0; } - if (adapter->failover_pending) { - adapter->init_done_rc = -EAGAIN; - netdev_dbg(netdev, "Failover pending, ignoring login response\n"); - complete(&adapter->init_done); - /* login response buffer will be released on reset */ - return 0; - } - netdev->mtu = adapter->req_mtu - ETH_HLEN; netdev_dbg(adapter->netdev, "Login Response Buffer:\n"); diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c index 373eb027b9..09ae1939e6 100644 --- a/drivers/net/ethernet/intel/e100.c +++ b/drivers/net/ethernet/intel/e100.c @@ -2437,11 +2437,15 @@ static void e100_get_drvinfo(struct net_device *netdev, sizeof(info->bus_info)); } -#define E100_PHY_REGS 0x1C +#define E100_PHY_REGS 0x1D static int e100_get_regs_len(struct net_device 
*netdev) { struct nic *nic = netdev_priv(netdev); - return 1 + E100_PHY_REGS + sizeof(nic->mem->dump_buf); + + /* We know the number of registers, and the size of the dump buffer. + * Calculate the total size in bytes. + */ + return (1 + E100_PHY_REGS) * sizeof(u32) + sizeof(nic->mem->dump_buf); } static void e100_get_regs(struct net_device *netdev, @@ -2455,14 +2459,18 @@ static void e100_get_regs(struct net_device *netdev, buff[0] = ioread8(&nic->csr->scb.cmd_hi) << 24 | ioread8(&nic->csr->scb.cmd_lo) << 16 | ioread16(&nic->csr->scb.status); - for (i = E100_PHY_REGS; i >= 0; i--) - buff[1 + E100_PHY_REGS - i] = - mdio_read(netdev, nic->mii.phy_id, i); + for (i = 0; i < E100_PHY_REGS; i++) + /* Note that we read the registers in reverse order. This + * ordering is the ABI apparently used by ethtool and other + * applications. + */ + buff[1 + i] = mdio_read(netdev, nic->mii.phy_id, + E100_PHY_REGS - 1 - i); memset(nic->mem->dump_buf, 0, sizeof(nic->mem->dump_buf)); e100_exec_cb(nic, NULL, e100_dump); msleep(10); - memcpy(&buff[2 + E100_PHY_REGS], nic->mem->dump_buf, - sizeof(nic->mem->dump_buf)); + memcpy(&buff[1 + E100_PHY_REGS], nic->mem->dump_buf, + sizeof(nic->mem->dump_buf)); } static void e100_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2f20980dd9..e04b540ced 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -4871,7 +4871,8 @@ static void i40e_clear_interrupt_scheme(struct i40e_pf *pf) { int i; - i40e_free_misc_vector(pf); + if (test_bit(__I40E_MISC_IRQ_REQUESTED, pf->state)) + i40e_free_misc_vector(pf); i40e_put_lump(pf->irq_pile, pf->iwarp_base_vector, I40E_IWARP_IRQ_PILE_ID); @@ -10113,7 +10114,7 @@ static int i40e_get_capabilities(struct i40e_pf *pf, if (pf->hw.aq.asq_last_status == I40E_AQ_RC_ENOMEM) { /* retry with a larger buffer */ buf_len = data_size; - } else if (pf->hw.aq.asq_last_status != I40E_AQ_RC_OK) { + } else if (pf->hw.aq.asq_last_status != I40E_AQ_RC_OK || err) { dev_info(&pf->pdev->dev, "capability discovery failed, err %s aq_err %s\n", i40e_stat_str(&pf->hw, err), diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 23762a7ef7..cada4e0e40 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1965,7 +1965,6 @@ static void iavf_watchdog_task(struct work_struct *work) } adapter->aq_required = 0; adapter->current_op = VIRTCHNL_OP_UNKNOWN; - mutex_unlock(&adapter->crit_lock); queue_delayed_work(iavf_wq, &adapter->watchdog_task, msecs_to_jiffies(10)); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 05cc5870e4..80380aed88 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -1313,22 +1313,21 @@ ice_ptp_flush_tx_tracker(struct ice_pf *pf, struct ice_ptp_tx *tx) { u8 idx; - spin_lock(&tx->lock); - for (idx = 0; idx < tx->len; idx++) { u8 phy_idx = idx + tx->quad_offset; - /* Clear any potential residual timestamp in the PHY block */ - if (!pf->hw.reset_ongoing) - ice_clear_phy_tstamp(&pf->hw, tx->quad, phy_idx); - + spin_lock(&tx->lock); if (tx->tstamps[idx].skb) { dev_kfree_skb_any(tx->tstamps[idx].skb); tx->tstamps[idx].skb = NULL; } - } + clear_bit(idx, tx->in_use); + spin_unlock(&tx->lock); - spin_unlock(&tx->lock); + /* Clear any potential residual timestamp in the 
PHY block */ + if (!pf->hw.reset_ongoing) + ice_clear_phy_tstamp(&pf->hw, tx->quad, phy_idx); + } } /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index fc26e4ddeb..beda8e0ef7 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -3208,7 +3208,7 @@ static unsigned int ixgbe_max_channels(struct ixgbe_adapter *adapter) max_combined = ixgbe_max_rss_indices(adapter); } - return max_combined; + return min_t(int, max_combined, num_online_cpus()); } static void ixgbe_get_channels(struct net_device *dev, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 24e06ba6f5..13c4782b92 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10112,6 +10112,7 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) struct ixgbe_adapter *adapter = netdev_priv(dev); struct bpf_prog *old_prog; bool need_reset; + int num_queues; if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) return -EINVAL; @@ -10161,11 +10162,14 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) /* Kick start the NAPI context if there is an AF_XDP socket open * on that queue id. This so that receiving will start. */ - if (need_reset && prog) - for (i = 0; i < adapter->num_rx_queues; i++) + if (need_reset && prog) { + num_queues = min_t(int, adapter->num_rx_queues, + adapter->num_xdp_queues); + for (i = 0; i < num_queues; i++) if (adapter->xdp_ring[i]->xsk_pool) (void)ixgbe_xsk_wakeup(adapter->netdev, i, XDP_WAKEUP_RX); + } return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index cf97985628..02e77ffe5c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -155,6 +155,8 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {}; int err; + mlx5_debug_cq_remove(dev, cq); + mlx5_eq_del_cq(mlx5_get_async_eq(dev), cq); mlx5_eq_del_cq(&cq->eq->core, cq); @@ -162,16 +164,13 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) MLX5_SET(destroy_cq_in, in, cqn, cq->cqn); MLX5_SET(destroy_cq_in, in, uid, cq->uid); err = mlx5_cmd_exec_in(dev, destroy_cq, in); - if (err) - return err; synchronize_irq(cq->irqn); - mlx5_debug_cq_remove(dev, cq); mlx5_cq_put(cq); wait_for_completion(&cq->free); - return 0; + return err; } EXPORT_SYMBOL(mlx5_core_destroy_cq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 7b8c818754..03a7a4ce5c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -252,6 +252,7 @@ struct mlx5e_params { struct { u16 mode; u8 num_tc; + struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; } mqprio; bool rx_cqe_compress_def; bool tunneled_offload_en; @@ -845,6 +846,7 @@ struct mlx5e_priv { struct mlx5e_channel_stats channel_stats[MLX5E_MAX_NUM_CHANNELS]; struct mlx5e_channel_stats trap_stats; struct mlx5e_ptp_stats ptp_stats; + u16 stats_nch; u16 max_nch; u8 max_opened_tc; bool tx_ptp_opened; @@ -1100,12 +1102,6 @@ int mlx5e_ethtool_set_pauseparam(struct mlx5e_priv *priv, struct ethtool_pauseparam *pauseparam); /* mlx5e generic netdev management API */ -static inline unsigned int -mlx5e_calc_max_nch(struct mlx5e_priv *priv, const 
struct mlx5e_profile *profile) -{ - return priv->netdev->num_rx_queues / max_t(u8, profile->rq_groups, 1); -} - static inline bool mlx5e_tx_mpwqe_supported(struct mlx5_core_dev *mdev) { @@ -1114,11 +1110,13 @@ mlx5e_tx_mpwqe_supported(struct mlx5_core_dev *mdev) } int mlx5e_priv_init(struct mlx5e_priv *priv, + const struct mlx5e_profile *profile, struct net_device *netdev, struct mlx5_core_dev *mdev); void mlx5e_priv_cleanup(struct mlx5e_priv *priv); struct net_device * -mlx5e_create_netdev(struct mlx5_core_dev *mdev, unsigned int txqs, unsigned int rxqs); +mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile, + unsigned int txqs, unsigned int rxqs); int mlx5e_attach_netdev(struct mlx5e_priv *priv); void mlx5e_detach_netdev(struct mlx5e_priv *priv); void mlx5e_destroy_netdev(struct mlx5e_priv *priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c index ac44bbe95c..d290d7276b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c @@ -35,7 +35,7 @@ static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, void *data, { int ch, i = 0; - for (ch = 0; ch < priv->max_nch; ch++) { + for (ch = 0; ch < priv->stats_nch; ch++) { void *buf = data + i; if (WARN_ON_ONCE(buf + @@ -51,7 +51,7 @@ static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, void *data, static int mlx5e_hv_vhca_stats_buf_size(struct mlx5e_priv *priv) { return (sizeof(struct mlx5e_hv_vhca_per_ring_stats) * - priv->max_nch); + priv->stats_nch); } static void mlx5e_hv_vhca_stats_work(struct work_struct *work) @@ -100,7 +100,7 @@ static void mlx5e_hv_vhca_stats_control(struct mlx5_hv_vhca_agent *agent, sagent = &priv->stats_agent; block->version = MLX5_HV_VHCA_STATS_VERSION; - block->rings = priv->max_nch; + block->rings = priv->stats_nch; if (!block->command) { cancel_delayed_work_sync(&priv->stats_agent.work); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index ee688dec67..3a86f66d12 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -13,8 +13,6 @@ struct mlx5e_ptp_fs { bool valid; }; -#define MLX5E_PTP_CHANNEL_IX 0 - struct mlx5e_ptp_params { struct mlx5e_params params; struct mlx5e_sq_param txq_sq_param; @@ -509,6 +507,7 @@ static int mlx5e_init_ptp_rq(struct mlx5e_ptp *c, struct mlx5e_params *params, rq->mdev = mdev; rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); rq->stats = &c->priv->ptp_stats.rq; + rq->ix = MLX5E_PTP_CHANNEL_IX; rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev); err = mlx5e_rq_set_handlers(rq, params, false); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h index c96668bd70..a71a32e00e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h @@ -8,6 +8,8 @@ #include "en_stats.h" #include <linux/ptp_classify.h> +#define MLX5E_PTP_CHANNEL_IX 0 + struct mlx5e_ptpsq { struct mlx5e_txqsq txqsq; struct mlx5e_cq ts_cq; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c index b5ddaa8275..c6d2f8c78d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c @@ -475,9 +475,6 @@ void mlx5e_rep_bridge_init(struct mlx5e_priv 
*priv) esw_warn(mdev, "Failed to allocate bridge offloads workqueue\n"); goto err_alloc_wq; } - INIT_DELAYED_WORK(&br_offloads->update_work, mlx5_esw_bridge_update_work); - queue_delayed_work(br_offloads->wq, &br_offloads->update_work, - msecs_to_jiffies(MLX5_ESW_BRIDGE_UPDATE_INTERVAL)); br_offloads->nb.notifier_call = mlx5_esw_bridge_switchdev_event; err = register_switchdev_notifier(&br_offloads->nb); @@ -500,6 +497,9 @@ void mlx5e_rep_bridge_init(struct mlx5e_priv *priv) err); goto err_register_netdev; } + INIT_DELAYED_WORK(&br_offloads->update_work, mlx5_esw_bridge_update_work); + queue_delayed_work(br_offloads->wq, &br_offloads->update_work, + msecs_to_jiffies(MLX5_ESW_BRIDGE_UPDATE_INTERVAL)); return; err_register_netdev: @@ -523,10 +523,10 @@ void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv) if (!br_offloads) return; + cancel_delayed_work_sync(&br_offloads->update_work); unregister_netdevice_notifier(&br_offloads->netdev_nb); unregister_switchdev_blocking_notifier(&br_offloads->nb_blk); unregister_switchdev_notifier(&br_offloads->nb); - cancel_delayed_work(&br_offloads->update_work); destroy_workqueue(br_offloads->wq); rtnl_lock(); mlx5_esw_bridge_cleanup(esw); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 306fb5d6a3..9d451b8ee4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -2036,6 +2036,17 @@ static int set_pflag_tx_port_ts(struct net_device *netdev, bool enable) } new_params = priv->channels.params; + /* Don't allow enabling TX-port-TS if MQPRIO mode channel offload is + * active, since it defines explicitly which TC accepts the packet. + * This conflicts with TX-port-TS hijacking the PTP traffic to a specific + * HW TX-queue. + */ + if (enable && new_params.mqprio.mode == TC_MQPRIO_MODE_CHANNEL) { + netdev_err(priv->netdev, + "%s: MQPRIO mode channel offload is active, cannot set the TX-port-TS\n", + __func__); + return -EINVAL; + } MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_TX_PORT_TS, enable); /* No need to verify SQ stop room as * ptpsq.txqsq.stop_room <= generic_sq->stop_room, and both diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3fd515e7bf..09c8b71b18 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2264,7 +2264,7 @@ void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv) } static int mlx5e_netdev_set_tcs(struct net_device *netdev, u16 nch, u8 ntc, - struct tc_mqprio_qopt_offload *mqprio) + struct netdev_tc_txq *tc_to_txq) { int tc, err; @@ -2282,11 +2282,8 @@ static int mlx5e_netdev_set_tcs(struct net_device *netdev, u16 nch, u8 ntc, for (tc = 0; tc < ntc; tc++) { u16 count, offset; - /* For DCB mode, map netdev TCs to offset 0 - * We have our own UP to TXQ mapping for QoS - */ - count = mqprio ? mqprio->qopt.count[tc] : nch; - offset = mqprio ? 
mqprio->qopt.offset[tc] : 0; + count = tc_to_txq[tc].count; + offset = tc_to_txq[tc].offset; netdev_set_tc_queue(netdev, tc, count, offset); } @@ -2315,19 +2312,24 @@ int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv) static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv) { + struct netdev_tc_txq old_tc_to_txq[TC_MAX_QUEUE], *tc_to_txq; struct net_device *netdev = priv->netdev; int old_num_txqs, old_ntc; int num_rxqs, nch, ntc; int err; + int i; old_num_txqs = netdev->real_num_tx_queues; old_ntc = netdev->num_tc ? : 1; + for (i = 0; i < ARRAY_SIZE(old_tc_to_txq); i++) + old_tc_to_txq[i] = netdev->tc_to_txq[i]; nch = priv->channels.params.num_channels; - ntc = mlx5e_get_dcb_num_tc(&priv->channels.params); + ntc = priv->channels.params.mqprio.num_tc; num_rxqs = nch * priv->profile->rq_groups; + tc_to_txq = priv->channels.params.mqprio.tc_to_txq; - err = mlx5e_netdev_set_tcs(netdev, nch, ntc, NULL); + err = mlx5e_netdev_set_tcs(netdev, nch, ntc, tc_to_txq); if (err) goto err_out; err = mlx5e_update_tx_netdev_queues(priv); @@ -2350,11 +2352,14 @@ static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv) WARN_ON_ONCE(netif_set_real_num_tx_queues(netdev, old_num_txqs)); err_tcs: - mlx5e_netdev_set_tcs(netdev, old_num_txqs / old_ntc, old_ntc, NULL); + WARN_ON_ONCE(mlx5e_netdev_set_tcs(netdev, old_num_txqs / old_ntc, old_ntc, + old_tc_to_txq)); err_out: return err; } +static MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(mlx5e_update_netdev_queues); + static void mlx5e_set_default_xps_cpumasks(struct mlx5e_priv *priv, struct mlx5e_params *params) { @@ -2861,6 +2866,58 @@ static int mlx5e_modify_channels_vsd(struct mlx5e_channels *chs, bool vsd) return 0; } +static void mlx5e_mqprio_build_default_tc_to_txq(struct netdev_tc_txq *tc_to_txq, + int ntc, int nch) +{ + int tc; + + memset(tc_to_txq, 0, sizeof(*tc_to_txq) * TC_MAX_QUEUE); + + /* Map netdev TCs to offset 0. + * We have our own UP to TXQ mapping for DCB mode of QoS + */ + for (tc = 0; tc < ntc; tc++) { + tc_to_txq[tc] = (struct netdev_tc_txq) { + .count = nch, + .offset = 0, + }; + } +} + +static void mlx5e_mqprio_build_tc_to_txq(struct netdev_tc_txq *tc_to_txq, + struct tc_mqprio_qopt *qopt) +{ + int tc; + + for (tc = 0; tc < TC_MAX_QUEUE; tc++) { + tc_to_txq[tc] = (struct netdev_tc_txq) { + .count = qopt->count[tc], + .offset = qopt->offset[tc], + }; + } +} + +static void mlx5e_params_mqprio_dcb_set(struct mlx5e_params *params, u8 num_tc) +{ + params->mqprio.mode = TC_MQPRIO_MODE_DCB; + params->mqprio.num_tc = num_tc; + mlx5e_mqprio_build_default_tc_to_txq(params->mqprio.tc_to_txq, num_tc, + params->num_channels); +} + +static void mlx5e_params_mqprio_channel_set(struct mlx5e_params *params, + struct tc_mqprio_qopt *qopt) +{ + params->mqprio.mode = TC_MQPRIO_MODE_CHANNEL; + params->mqprio.num_tc = qopt->num_tc; + mlx5e_mqprio_build_tc_to_txq(params->mqprio.tc_to_txq, qopt); +} + +static void mlx5e_params_mqprio_reset(struct mlx5e_params *params) +{ + mlx5e_params_mqprio_dcb_set(params, 1); +} + static int mlx5e_setup_tc_mqprio_dcb(struct mlx5e_priv *priv, struct tc_mqprio_qopt *mqprio) { @@ -2874,8 +2931,7 @@ static int mlx5e_setup_tc_mqprio_dcb(struct mlx5e_priv *priv, return -EINVAL; new_params = priv->channels.params; - new_params.mqprio.mode = TC_MQPRIO_MODE_DCB; - new_params.mqprio.num_tc = tc ? tc : 1; + mlx5e_params_mqprio_dcb_set(&new_params, tc ? 
tc : 1); err = mlx5e_safe_switch_params(priv, &new_params, mlx5e_num_channels_changed_ctx, NULL, true); @@ -2889,9 +2945,17 @@ static int mlx5e_mqprio_channel_validate(struct mlx5e_priv *priv, struct tc_mqprio_qopt_offload *mqprio) { struct net_device *netdev = priv->netdev; + struct mlx5e_ptp *ptp_channel; int agg_count = 0; int i; + ptp_channel = priv->channels.ptp; + if (ptp_channel && test_bit(MLX5E_PTP_STATE_TX, ptp_channel->state)) { + netdev_err(netdev, + "Cannot activate MQPRIO mode channel since it conflicts with TX port TS\n"); + return -EINVAL; + } + if (mqprio->qopt.offset[0] != 0 || mqprio->qopt.num_tc < 1 || mqprio->qopt.num_tc > MLX5E_MAX_NUM_MQPRIO_CH_TC) return -EINVAL; @@ -2917,8 +2981,8 @@ static int mlx5e_mqprio_channel_validate(struct mlx5e_priv *priv, agg_count += mqprio->qopt.count[i]; } - if (priv->channels.params.num_channels < agg_count) { - netdev_err(netdev, "Num of queues (%d) exceeds available (%d)\n", + if (priv->channels.params.num_channels != agg_count) { + netdev_err(netdev, "Num of queues (%d) does not match available (%d)\n", agg_count, priv->channels.params.num_channels); return -EINVAL; } @@ -2926,25 +2990,12 @@ static int mlx5e_mqprio_channel_validate(struct mlx5e_priv *priv, return 0; } -static int mlx5e_mqprio_channel_set_tcs_ctx(struct mlx5e_priv *priv, void *ctx) -{ - struct tc_mqprio_qopt_offload *mqprio = (struct tc_mqprio_qopt_offload *)ctx; - struct net_device *netdev = priv->netdev; - u8 num_tc; - - if (priv->channels.params.mqprio.mode != TC_MQPRIO_MODE_CHANNEL) - return -EINVAL; - - num_tc = priv->channels.params.mqprio.num_tc; - mlx5e_netdev_set_tcs(netdev, 0, num_tc, mqprio); - - return 0; -} - static int mlx5e_setup_tc_mqprio_channel(struct mlx5e_priv *priv, struct tc_mqprio_qopt_offload *mqprio) { + mlx5e_fp_preactivate preactivate; struct mlx5e_params new_params; + bool nch_changed; int err; err = mlx5e_mqprio_channel_validate(priv, mqprio); @@ -2952,12 +3003,12 @@ static int mlx5e_setup_tc_mqprio_channel(struct mlx5e_priv *priv, return err; new_params = priv->channels.params; - new_params.mqprio.mode = TC_MQPRIO_MODE_CHANNEL; - new_params.mqprio.num_tc = mqprio->qopt.num_tc; - err = mlx5e_safe_switch_params(priv, &new_params, - mlx5e_mqprio_channel_set_tcs_ctx, mqprio, true); + mlx5e_params_mqprio_channel_set(&new_params, &mqprio->qopt); - return err; + nch_changed = mlx5e_get_dcb_num_tc(&priv->channels.params) > 1; + preactivate = nch_changed ? 
mlx5e_num_channels_changed_ctx : + mlx5e_update_netdev_queues_ctx; + return mlx5e_safe_switch_params(priv, &new_params, preactivate, NULL, true); } static int mlx5e_setup_tc_mqprio(struct mlx5e_priv *priv, @@ -3065,7 +3116,7 @@ void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s) { int i; - for (i = 0; i < priv->max_nch; i++) { + for (i = 0; i < priv->stats_nch; i++) { struct mlx5e_channel_stats *channel_stats = &priv->channel_stats[i]; struct mlx5e_rq_stats *xskrq_stats = &channel_stats->xskrq; struct mlx5e_rq_stats *rq_stats = &channel_stats->rq; @@ -3274,20 +3325,67 @@ static int set_feature_rx_all(struct net_device *netdev, bool enable) return mlx5_set_port_fcs(mdev, !enable); } +static int mlx5e_set_rx_port_ts(struct mlx5_core_dev *mdev, bool enable) +{ + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {}; + bool supported, curr_state; + int err; + + if (!MLX5_CAP_GEN(mdev, ports_check)) + return 0; + + err = mlx5_query_ports_check(mdev, in, sizeof(in)); + if (err) + return err; + + supported = MLX5_GET(pcmr_reg, in, rx_ts_over_crc_cap); + curr_state = MLX5_GET(pcmr_reg, in, rx_ts_over_crc); + + if (!supported || enable == curr_state) + return 0; + + MLX5_SET(pcmr_reg, in, local_port, 1); + MLX5_SET(pcmr_reg, in, rx_ts_over_crc, enable); + + return mlx5_set_ports_check(mdev, in, sizeof(in)); +} + static int set_feature_rx_fcs(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_channels *chs = &priv->channels; + struct mlx5_core_dev *mdev = priv->mdev; int err; mutex_lock(&priv->state_lock); - priv->channels.params.scatter_fcs_en = enable; - err = mlx5e_modify_channels_scatter_fcs(&priv->channels, enable); - if (err) - priv->channels.params.scatter_fcs_en = !enable; + if (enable) { + err = mlx5e_set_rx_port_ts(mdev, false); + if (err) + goto out; + chs->params.scatter_fcs_en = true; + err = mlx5e_modify_channels_scatter_fcs(chs, true); + if (err) { + chs->params.scatter_fcs_en = false; + mlx5e_set_rx_port_ts(mdev, true); + } + } else { + chs->params.scatter_fcs_en = false; + err = mlx5e_modify_channels_scatter_fcs(chs, false); + if (err) { + chs->params.scatter_fcs_en = true; + goto out; + } + err = mlx5e_set_rx_port_ts(mdev, true); + if (err) { + mlx5_core_warn(mdev, "Failed to set RX port timestamp %d\n", err); + err = 0; + } + } + +out: mutex_unlock(&priv->state_lock); - return err; } @@ -4186,13 +4284,11 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 struct mlx5_core_dev *mdev = priv->mdev; u8 rx_cq_period_mode; - priv->max_nch = mlx5e_calc_max_nch(priv, priv->profile); - params->sw_mtu = mtu; params->hard_mtu = MLX5E_ETH_HARD_MTU; params->num_channels = min_t(unsigned int, MLX5E_MAX_NUM_CHANNELS / 2, priv->max_nch); - params->mqprio.num_tc = 1; + mlx5e_params_mqprio_reset(params); /* Set an initial non-zero value, so that mlx5e_select_queue won't * divide by zero if called before first activating channels. 
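[Note on the en_main.c hunks above: they make the TC-to-TXQ mapping explicit. Both the DCB and the channel mqprio modes now fill a netdev_tc_txq table, and a single helper pushes that table to the stack. A sketch of how such a table drives the stock netdev helpers (demo_apply_tc_to_txq is illustrative, not the driver's function):

#include <linux/netdevice.h>

static int demo_apply_tc_to_txq(struct net_device *netdev,
				const struct netdev_tc_txq *tc_to_txq, u8 ntc)
{
	int tc, err;

	if (ntc < 2) {
		netdev_reset_tc(netdev);	/* single TC: no mapping needed */
		return 0;
	}

	err = netdev_set_num_tc(netdev, ntc);
	if (err)
		return err;

	/* each TC owns tc_to_txq[tc].count queues starting at .offset */
	for (tc = 0; tc < ntc; tc++)
		netdev_set_tc_queue(netdev, tc, tc_to_txq[tc].count,
				    tc_to_txq[tc].offset);
	return 0;
}

In DCB mode every entry is {nch, 0}, since all TCs share the channels and the driver does its own UP-to-queue spreading; in channel mode the entries come straight from qopt->count[] and qopt->offset[].]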
@@ -4682,8 +4778,35 @@ static const struct mlx5e_profile mlx5e_nic_profile = { .rx_ptp_support = true, }; +static unsigned int +mlx5e_calc_max_nch(struct mlx5_core_dev *mdev, struct net_device *netdev, + const struct mlx5e_profile *profile) + +{ + unsigned int max_nch, tmp; + + /* core resources */ + max_nch = mlx5e_get_max_num_channels(mdev); + + /* netdev rx queues */ + tmp = netdev->num_rx_queues / max_t(u8, profile->rq_groups, 1); + max_nch = min_t(unsigned int, max_nch, tmp); + + /* netdev tx queues */ + tmp = netdev->num_tx_queues; + if (mlx5_qos_is_supported(mdev)) + tmp -= mlx5e_qos_max_leaf_nodes(mdev); + if (MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) + tmp -= profile->max_tc; + tmp = tmp / profile->max_tc; + max_nch = min_t(unsigned int, max_nch, tmp); + + return max_nch; +} + /* mlx5e generic netdev management API (move to en_common.c) */ int mlx5e_priv_init(struct mlx5e_priv *priv, + const struct mlx5e_profile *profile, struct net_device *netdev, struct mlx5_core_dev *mdev) { @@ -4691,6 +4814,8 @@ int mlx5e_priv_init(struct mlx5e_priv *priv, priv->mdev = mdev; priv->netdev = netdev; priv->msglevel = MLX5E_MSG_LEVEL; + priv->max_nch = mlx5e_calc_max_nch(mdev, netdev, profile); + priv->stats_nch = priv->max_nch; priv->max_opened_tc = 1; if (!alloc_cpumask_var(&priv->scratchpad.cpumask, GFP_KERNEL)) @@ -4734,7 +4859,8 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv) } struct net_device * -mlx5e_create_netdev(struct mlx5_core_dev *mdev, unsigned int txqs, unsigned int rxqs) +mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile, + unsigned int txqs, unsigned int rxqs) { struct net_device *netdev; int err; @@ -4745,7 +4871,7 @@ mlx5e_create_netdev(struct mlx5_core_dev *mdev, unsigned int txqs, unsigned int return NULL; } - err = mlx5e_priv_init(netdev_priv(netdev), netdev, mdev); + err = mlx5e_priv_init(netdev_priv(netdev), profile, netdev, mdev); if (err) { mlx5_core_err(mdev, "mlx5e_priv_init failed, err=%d\n", err); goto err_free_netdev; @@ -4787,7 +4913,7 @@ int mlx5e_attach_netdev(struct mlx5e_priv *priv) clear_bit(MLX5E_STATE_DESTROYING, &priv->state); /* max number of channels may have changed */ - max_nch = mlx5e_get_max_num_channels(priv->mdev); + max_nch = mlx5e_calc_max_nch(priv->mdev, priv->netdev, profile); if (priv->channels.params.num_channels > max_nch) { mlx5_core_warn(priv->mdev, "MLX5E: Reducing number of channels to %d\n", max_nch); /* Reducing the number of channels - RXFH has to be reset, and @@ -4795,7 +4921,18 @@ int mlx5e_attach_netdev(struct mlx5e_priv *priv) */ priv->netdev->priv_flags &= ~IFF_RXFH_CONFIGURED; priv->channels.params.num_channels = max_nch; + if (priv->channels.params.mqprio.mode == TC_MQPRIO_MODE_CHANNEL) { + mlx5_core_warn(priv->mdev, "MLX5E: Disabling MQPRIO channel mode\n"); + mlx5e_params_mqprio_reset(&priv->channels.params); + } } + if (max_nch != priv->max_nch) { + mlx5_core_warn(priv->mdev, + "MLX5E: Updating max number of channels from %u to %u\n", + priv->max_nch, max_nch); + priv->max_nch = max_nch; + } + /* 1. Set the real number of queues in the kernel the first time. * 2. Set our default XPS cpumask. * 3. Build the RQT. 
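[Note on mlx5e_calc_max_nch() above: it clamps the channel count to the tightest of three budgets, namely the core limit, the rx queues divided across the rq groups, and the tx queues left after the QoS and PTP reservations divided by max_tc. A self-contained sketch of the same arithmetic (the names and sample numbers are illustrative):

static unsigned int demo_max_nch(unsigned int core_max,
				 unsigned int num_rx_queues, unsigned int rq_groups,
				 unsigned int num_tx_queues, unsigned int reserved_txqs,
				 unsigned int max_tc)
{
	unsigned int nch = core_max;
	unsigned int budget;

	/* rx budget: each channel needs one rx queue per rq group */
	budget = num_rx_queues / (rq_groups ? rq_groups : 1);
	if (budget < nch)
		nch = budget;

	/* tx budget: reserved queues come off the top, then each
	 * channel needs max_tc tx queues
	 */
	budget = (num_tx_queues - reserved_txqs) / max_tc;
	if (budget < nch)
		nch = budget;

	return nch;
}

For example, demo_max_nch(32, 64, 2, 96, 16, 8) is min(32, 64/2, (96-16)/8) = min(32, 32, 10) = 10 channels.]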
@@ -4860,7 +4997,7 @@ mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mde struct mlx5e_priv *priv = netdev_priv(netdev); int err; - err = mlx5e_priv_init(priv, netdev, mdev); + err = mlx5e_priv_init(priv, new_profile, netdev, mdev); if (err) { mlx5_core_err(mdev, "mlx5e_priv_init failed, err=%d\n", err); return err; @@ -4886,20 +5023,12 @@ mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mde int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, const struct mlx5e_profile *new_profile, void *new_ppriv) { - unsigned int new_max_nch = mlx5e_calc_max_nch(priv, new_profile); const struct mlx5e_profile *orig_profile = priv->profile; struct net_device *netdev = priv->netdev; struct mlx5_core_dev *mdev = priv->mdev; void *orig_ppriv = priv->ppriv; int err, rollback_err; - /* sanity */ - if (new_max_nch != priv->max_nch) { - netdev_warn(netdev, "%s: Replacing profile with different max channels\n", - __func__); - return -EINVAL; - } - /* cleanup old profile */ mlx5e_detach_netdev(priv); priv->profile->cleanup(priv); @@ -4995,7 +5124,7 @@ static int mlx5e_probe(struct auxiliary_device *adev, nch = mlx5e_get_max_num_channels(mdev); txqs = nch * profile->max_tc + ptp_txqs + qos_sqs; rxqs = nch * profile->rq_groups; - netdev = mlx5e_create_netdev(mdev, txqs, rxqs); + netdev = mlx5e_create_netdev(mdev, profile, txqs, rxqs); if (!netdev) { mlx5_core_err(mdev, "mlx5e_create_netdev failed\n"); return -ENOMEM; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index ae71a17fdb..0684ac6699 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -596,7 +596,6 @@ static void mlx5e_build_rep_params(struct net_device *netdev) MLX5_CQ_PERIOD_MODE_START_FROM_CQE : MLX5_CQ_PERIOD_MODE_START_FROM_EQE; - priv->max_nch = mlx5e_calc_max_nch(priv, priv->profile); params = &priv->channels.params; params->num_channels = MLX5E_REP_PARAMS_DEF_NUM_CHANNELS; @@ -619,6 +618,11 @@ static void mlx5e_build_rep_params(struct net_device *netdev) params->mqprio.num_tc = 1; params->tunneled_offload_en = false; + /* Set an initial non-zero value, so that mlx5e_select_queue won't + * divide by zero if called before first activating channels. 
+ */ + priv->num_tc_x_num_ch = params->num_channels * params->mqprio.num_tc; + mlx5_query_min_inline(mdev, &params->tx_min_inline_mode); } @@ -644,7 +648,6 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev, netdev->hw_features |= NETIF_F_RXCSUM; netdev->features |= netdev->hw_features; - netdev->features |= NETIF_F_VLAN_CHALLENGED; netdev->features |= NETIF_F_NETNS_LOCAL; } @@ -1169,7 +1172,7 @@ mlx5e_vport_vf_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) nch = mlx5e_get_max_num_channels(dev); txqs = nch * profile->max_tc; rxqs = nch * profile->rq_groups; - netdev = mlx5e_create_netdev(dev, txqs, rxqs); + netdev = mlx5e_create_netdev(dev, profile, txqs, rxqs); if (!netdev) { mlx5_core_warn(dev, "Failed to create representor netdev for vport %d\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 3c65fd0bcf..29a6586ef2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1001,14 +1001,9 @@ static inline void mlx5e_handle_csum(struct net_device *netdev, goto csum_unnecessary; if (likely(is_last_ethertype_ip(skb, &network_depth, &proto))) { - u8 ipproto = get_ip_proto(skb, network_depth, proto); - - if (unlikely(ipproto == IPPROTO_SCTP)) + if (unlikely(get_ip_proto(skb, network_depth, proto) == IPPROTO_SCTP)) goto csum_unnecessary; - if (unlikely(mlx5_ipsec_is_rx_flow(cqe))) - goto csum_none; - stats->csum_complete++; skb->ip_summed = CHECKSUM_COMPLETE; skb->csum = csum_unfold((__force __sum16)cqe->check_sum); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index e4f5b63951..e1dd170190 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -34,6 +34,7 @@ #include "en.h" #include "en_accel/tls.h" #include "en_accel/en_accel.h" +#include "en/ptp.h" static unsigned int stats_grps_num(struct mlx5e_priv *priv) { @@ -450,7 +451,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw) memset(s, 0, sizeof(*s)); - for (i = 0; i < priv->max_nch; i++) { + for (i = 0; i < priv->stats_nch; i++) { struct mlx5e_channel_stats *channel_stats = &priv->channel_stats[i]; int j; @@ -2076,7 +2077,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ptp) if (priv->rx_ptp_opened) { for (i = 0; i < NUM_PTP_RQ_STATS; i++) sprintf(data + (idx++) * ETH_GSTRING_LEN, - ptp_rq_stats_desc[i].format); + ptp_rq_stats_desc[i].format, MLX5E_PTP_CHANNEL_IX); } return idx; } @@ -2119,7 +2120,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(ptp) { return; } static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(channels) { - int max_nch = priv->max_nch; + int max_nch = priv->stats_nch; return (NUM_RQ_STATS * max_nch) + (NUM_CH_STATS * max_nch) + @@ -2133,7 +2134,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(channels) static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(channels) { bool is_xsk = priv->xsk.ever_used; - int max_nch = priv->max_nch; + int max_nch = priv->stats_nch; int i, j, tc; for (i = 0; i < max_nch; i++) @@ -2175,7 +2176,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(channels) static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(channels) { bool is_xsk = priv->xsk.ever_used; - int max_nch = priv->max_nch; + int max_nch = priv->stats_nch; int i, j, tc; for (i = 0; i < max_nch; i++) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c index 
0399a396d1..60a7399001 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c @@ -79,12 +79,16 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, int dest_num = 0; int err = 0; - if (MLX5_CAP_ESW_EGRESS_ACL(esw->dev, flow_counter)) { + if (vport->egress.legacy.drop_counter) { + drop_counter = vport->egress.legacy.drop_counter; + } else if (MLX5_CAP_ESW_EGRESS_ACL(esw->dev, flow_counter)) { drop_counter = mlx5_fc_create(esw->dev, false); - if (IS_ERR(drop_counter)) + if (IS_ERR(drop_counter)) { esw_warn(esw->dev, "vport[%d] configure egress drop rule counter err(%ld)\n", vport->vport, PTR_ERR(drop_counter)); + drop_counter = NULL; + } vport->egress.legacy.drop_counter = drop_counter; } @@ -123,7 +127,7 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; /* Attach egress drop flow counter */ - if (!IS_ERR_OR_NULL(drop_counter)) { + if (drop_counter) { flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; drop_ctr_dst.counter_id = mlx5_fc_id(drop_counter); @@ -162,7 +166,7 @@ void esw_acl_egress_lgcy_cleanup(struct mlx5_eswitch *esw, esw_acl_egress_table_destroy(vport); clean_drop_counter: - if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_counter)) { + if (vport->egress.legacy.drop_counter) { mlx5_fc_destroy(esw->dev, vport->egress.legacy.drop_counter); vport->egress.legacy.drop_counter = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c index f75b86abaf..b1a5199260 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -160,7 +160,9 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, esw_acl_ingress_lgcy_rules_destroy(vport); - if (MLX5_CAP_ESW_INGRESS_ACL(esw->dev, flow_counter)) { + if (vport->ingress.legacy.drop_counter) { + counter = vport->ingress.legacy.drop_counter; + } else if (MLX5_CAP_ESW_INGRESS_ACL(esw->dev, flow_counter)) { counter = mlx5_fc_create(esw->dev, false); if (IS_ERR(counter)) { esw_warn(esw->dev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 67571e5040..269ebb53ed 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -113,7 +113,7 @@ static void mlx5i_grp_sw_update_stats(struct mlx5e_priv *priv) struct mlx5e_sw_stats s = { 0 }; int i, j; - for (i = 0; i < priv->max_nch; i++) { + for (i = 0; i < priv->stats_nch; i++) { struct mlx5e_channel_stats *channel_stats; struct mlx5e_rq_stats *rq_stats; @@ -711,7 +711,7 @@ static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u32 port_num, goto destroy_ht; } - err = mlx5e_priv_init(epriv, netdev, mdev); + err = mlx5e_priv_init(epriv, prof, netdev, mdev); if (err) goto destroy_mdev_resources; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index ffac8a0e7a..91e806c1aa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -448,22 +448,20 @@ static u64 find_target_cycles(struct mlx5_core_dev *mdev, s64 target_ns) return cycles_now + cycles_delta; } -static u64 perout_conf_internal_timer(struct mlx5_core_dev *mdev, - s64 sec, u32 nsec) 
+static u64 perout_conf_internal_timer(struct mlx5_core_dev *mdev, s64 sec) { - struct timespec64 ts; + struct timespec64 ts = {}; s64 target_ns; ts.tv_sec = sec; - ts.tv_nsec = nsec; target_ns = timespec64_to_ns(&ts); return find_target_cycles(mdev, target_ns); } -static u64 perout_conf_real_time(s64 sec, u32 nsec) +static u64 perout_conf_real_time(s64 sec) { - return (u64)nsec | (u64)sec << 32; + return (u64)sec << 32; } static int mlx5_perout_configure(struct ptp_clock_info *ptp, @@ -474,6 +472,7 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); + bool rt_mode = mlx5_real_time_mode(mdev); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; struct timespec64 ts; u32 field_select = 0; @@ -501,8 +500,10 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, if (on) { bool rt_mode = mlx5_real_time_mode(mdev); - u32 nsec; - s64 sec; + s64 sec = rq->perout.start.sec; + + if (rq->perout.start.nsec) + return -EINVAL; pin_mode = MLX5_PIN_MODE_OUT; pattern = MLX5_OUT_PATTERN_PERIODIC; @@ -513,14 +514,11 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, if ((ns >> 1) != 500000000LL) return -EINVAL; - nsec = rq->perout.start.nsec; - sec = rq->perout.start.sec; - if (rt_mode && sec > U32_MAX) return -EINVAL; - time_stamp = rt_mode ? perout_conf_real_time(sec, nsec) : - perout_conf_internal_timer(mdev, sec, nsec); + time_stamp = rt_mode ? perout_conf_real_time(sec) : + perout_conf_internal_timer(mdev, sec); field_select |= MLX5_MTPPS_FS_PIN_MODE | MLX5_MTPPS_FS_PATTERN | @@ -538,6 +536,9 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, if (err) return err; + if (rt_mode) + return 0; + return mlx5_set_mtppse(mdev, pin, 0, MLX5_EVENT_MODE_REPETETIVE & on); } @@ -705,20 +706,14 @@ static void ts_next_sec(struct timespec64 *ts) static u64 perout_conf_next_event_timer(struct mlx5_core_dev *mdev, struct mlx5_clock *clock) { - bool rt_mode = mlx5_real_time_mode(mdev); struct timespec64 ts; s64 target_ns; - if (rt_mode) - ts = mlx5_ptp_gettimex_real_time(mdev, NULL); - else - mlx5_ptp_gettimex(&clock->ptp_info, &ts, NULL); - + mlx5_ptp_gettimex(&clock->ptp_info, &ts, NULL); ts_next_sec(&ts); target_ns = timespec64_to_ns(&ts); - return rt_mode ? perout_conf_real_time(ts.tv_sec, ts.tv_nsec) : - find_target_cycles(mdev, target_ns); + return find_target_cycles(mdev, target_ns); } static int mlx5_pps_event(struct notifier_block *nb, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index c79a10b345..763c83a023 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -13,8 +13,8 @@ #endif #define MLX5_MAX_IRQ_NAME (32) -/* max irq_index is 255. 
three chars */ -#define MLX5_MAX_IRQ_IDX_CHARS (3) +/* max irq_index is 2047, so four chars */ +#define MLX5_MAX_IRQ_IDX_CHARS (4) #define MLX5_SFS_PER_CTRL_IRQ 64 #define MLX5_IRQ_CTRL_SF_MAX 8 @@ -633,8 +633,9 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev) int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table) { if (table->sf_comp_pool) - return table->sf_comp_pool->xa_num_irqs.max - - table->sf_comp_pool->xa_num_irqs.min + 1; + return min_t(int, num_online_cpus(), + table->sf_comp_pool->xa_num_irqs.max - + table->sf_comp_pool->xa_num_irqs.min + 1); else return mlx5_irq_table_get_num_comp(table); } diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c index 0998dcc9ca..b29824448a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c @@ -24,16 +24,8 @@ #define MLXSW_THERMAL_ZONE_MAX_NAME 16 #define MLXSW_THERMAL_TEMP_SCORE_MAX GENMASK(31, 0) #define MLXSW_THERMAL_MAX_STATE 10 +#define MLXSW_THERMAL_MIN_STATE 2 #define MLXSW_THERMAL_MAX_DUTY 255 -/* Minimum and maximum fan allowed speed in percent: from 20% to 100%. Values - * MLXSW_THERMAL_MAX_STATE + x, where x is between 2 and 10 are used for - * setting fan speed dynamic minimum. For example, if value is set to 14 (40%) - * cooling levels vector will be set to 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10 to - * introduce PWM speed in percent: 40, 40, 40, 40, 40, 50, 60. 70, 80, 90, 100. - */ -#define MLXSW_THERMAL_SPEED_MIN (MLXSW_THERMAL_MAX_STATE + 2) -#define MLXSW_THERMAL_SPEED_MAX (MLXSW_THERMAL_MAX_STATE * 2) -#define MLXSW_THERMAL_SPEED_MIN_LEVEL 2 /* 20% */ /* External cooling devices, allowed for binding to mlxsw thermal zones. */ static char * const mlxsw_thermal_external_allowed_cdev[] = { @@ -646,49 +638,16 @@ static int mlxsw_thermal_set_cur_state(struct thermal_cooling_device *cdev, struct mlxsw_thermal *thermal = cdev->devdata; struct device *dev = thermal->bus_info->dev; char mfsc_pl[MLXSW_REG_MFSC_LEN]; - unsigned long cur_state, i; int idx; - u8 duty; int err; + if (state > MLXSW_THERMAL_MAX_STATE) + return -EINVAL; + idx = mlxsw_get_cooling_device_idx(thermal, cdev); if (idx < 0) return idx; - /* Verify if this request is for changing allowed fan dynamical - * minimum. If it is - update cooling levels accordingly and update - * state, if current state is below the newly requested minimum state. - * For example, if current state is 5, and minimal state is to be - * changed from 4 to 6, thermal->cooling_levels[0 to 5] will be changed - * all from 4 to 6. And state 5 (thermal->cooling_levels[4]) should be - * overwritten. - */ - if (state >= MLXSW_THERMAL_SPEED_MIN && - state <= MLXSW_THERMAL_SPEED_MAX) { - state -= MLXSW_THERMAL_MAX_STATE; - for (i = 0; i <= MLXSW_THERMAL_MAX_STATE; i++) - thermal->cooling_levels[i] = max(state, i); - - mlxsw_reg_mfsc_pack(mfsc_pl, idx, 0); - err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfsc), mfsc_pl); - if (err) - return err; - - duty = mlxsw_reg_mfsc_pwm_duty_cycle_get(mfsc_pl); - cur_state = mlxsw_duty_to_state(duty); - - /* If current fan state is lower than requested dynamical - * minimum, increase fan speed up to dynamical minimum. - */ - if (state < cur_state) - return 0; - - state = cur_state; - } - - if (state > MLXSW_THERMAL_MAX_STATE) - return -EINVAL; - /* Normalize the state to the valid speed range. 
*/ state = thermal->cooling_levels[state]; mlxsw_reg_mfsc_pack(mfsc_pl, idx, mlxsw_state_to_duty(state)); @@ -998,8 +957,7 @@ int mlxsw_thermal_init(struct mlxsw_core *core, /* Initialize cooling levels per PWM state. */ for (i = 0; i < MLXSW_THERMAL_MAX_STATE; i++) - thermal->cooling_levels[i] = max(MLXSW_THERMAL_SPEED_MIN_LEVEL, - i); + thermal->cooling_levels[i] = max(MLXSW_THERMAL_MIN_STATE, i); thermal->polling_delay = bus_info->low_frequency ? MLXSW_THERMAL_SLOW_POLL_INT : diff --git a/drivers/net/ethernet/micrel/Makefile b/drivers/net/ethernet/micrel/Makefile index 5cc00d22c7..6ecc4eb30e 100644 --- a/drivers/net/ethernet/micrel/Makefile +++ b/drivers/net/ethernet/micrel/Makefile @@ -4,8 +4,6 @@ # obj-$(CONFIG_KS8842) += ks8842.o -obj-$(CONFIG_KS8851) += ks8851.o -ks8851-objs = ks8851_common.o ks8851_spi.o -obj-$(CONFIG_KS8851_MLL) += ks8851_mll.o -ks8851_mll-objs = ks8851_common.o ks8851_par.o +obj-$(CONFIG_KS8851) += ks8851_common.o ks8851_spi.o +obj-$(CONFIG_KS8851_MLL) += ks8851_common.o ks8851_par.o obj-$(CONFIG_KSZ884X_PCI) += ksz884x.o diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c index 3f69bb59ba..a6db1a8156 100644 --- a/drivers/net/ethernet/micrel/ks8851_common.c +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -1057,6 +1057,7 @@ int ks8851_suspend(struct device *dev) return 0; } +EXPORT_SYMBOL_GPL(ks8851_suspend); int ks8851_resume(struct device *dev) { @@ -1070,6 +1071,7 @@ int ks8851_resume(struct device *dev) return 0; } +EXPORT_SYMBOL_GPL(ks8851_resume); #endif static int ks8851_register_mdiobus(struct ks8851_net *ks, struct device *dev) @@ -1243,6 +1245,7 @@ int ks8851_probe_common(struct net_device *netdev, struct device *dev, err_reg_io: return ret; } +EXPORT_SYMBOL_GPL(ks8851_probe_common); int ks8851_remove_common(struct device *dev) { @@ -1261,3 +1264,8 @@ int ks8851_remove_common(struct device *dev) return 0; } +EXPORT_SYMBOL_GPL(ks8851_remove_common); + +MODULE_DESCRIPTION("KS8851 Network driver"); +MODULE_AUTHOR("Ben Dooks <ben.dooks@codethink.co.uk>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/microchip/encx24j600-regmap.c b/drivers/net/ethernet/microchip/encx24j600-regmap.c index 796e46a539..81a8ccca7e 100644 --- a/drivers/net/ethernet/microchip/encx24j600-regmap.c +++ b/drivers/net/ethernet/microchip/encx24j600-regmap.c @@ -497,13 +497,19 @@ static struct regmap_bus phymap_encx24j600 = { .reg_read = regmap_encx24j600_phy_reg_read, }; -void devm_regmap_init_encx24j600(struct device *dev, - struct encx24j600_context *ctx) +int devm_regmap_init_encx24j600(struct device *dev, + struct encx24j600_context *ctx) { mutex_init(&ctx->mutex); regcfg.lock_arg = ctx; ctx->regmap = devm_regmap_init(dev, &regmap_encx24j600, ctx, &regcfg); + if (IS_ERR(ctx->regmap)) + return PTR_ERR(ctx->regmap); ctx->phymap = devm_regmap_init(dev, &phymap_encx24j600, ctx, &phycfg); + if (IS_ERR(ctx->phymap)) + return PTR_ERR(ctx->phymap); + + return 0; } EXPORT_SYMBOL_GPL(devm_regmap_init_encx24j600); diff --git a/drivers/net/ethernet/microchip/encx24j600.c b/drivers/net/ethernet/microchip/encx24j600.c index ee921a99e4..0bc6b3176f 100644 --- a/drivers/net/ethernet/microchip/encx24j600.c +++ b/drivers/net/ethernet/microchip/encx24j600.c @@ -1023,10 +1023,13 @@ static int encx24j600_spi_probe(struct spi_device *spi) priv->speed = SPEED_100; priv->ctx.spi = spi; - devm_regmap_init_encx24j600(&spi->dev, &priv->ctx); ndev->irq = spi->irq; ndev->netdev_ops = &encx24j600_netdev_ops; + ret = devm_regmap_init_encx24j600(&spi->dev, &priv->ctx); + if (ret)
+ goto out_free; + mutex_init(&priv->lock); /* Reset device and check if it is connected */ diff --git a/drivers/net/ethernet/microchip/encx24j600_hw.h b/drivers/net/ethernet/microchip/encx24j600_hw.h index fac61a8fbd..34c5a28989 100644 --- a/drivers/net/ethernet/microchip/encx24j600_hw.h +++ b/drivers/net/ethernet/microchip/encx24j600_hw.h @@ -15,8 +15,8 @@ struct encx24j600_context { int bank; }; -void devm_regmap_init_encx24j600(struct device *dev, - struct encx24j600_context *ctx); +int devm_regmap_init_encx24j600(struct device *dev, + struct encx24j600_context *ctx); /* Single-byte instructions */ #define BANK_SELECT(bank) (0xC0 | ((bank & (BANK_MASK >> BANK_SHIFT)) << 1)) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 1b21030308..030ae89f3a 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1477,8 +1477,10 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, if (err) goto out; - if (cq->gdma_id >= gc->max_num_cqs) + if (WARN_ON(cq->gdma_id >= gc->max_num_cqs)) { + err = -EINVAL; goto out; + } gc->cq_table[cq->gdma_id] = cq->gdma_cq; diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 559177e6de..a08e4f530c 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -472,9 +472,9 @@ void ocelot_phylink_mac_link_down(struct ocelot *ocelot, int port, !(quirks & OCELOT_QUIRK_QSGMII_PORTS_MUST_BE_UP)) ocelot_port_rmwl(ocelot_port, DEV_CLOCK_CFG_MAC_TX_RST | - DEV_CLOCK_CFG_MAC_TX_RST, + DEV_CLOCK_CFG_MAC_RX_RST, DEV_CLOCK_CFG_MAC_TX_RST | - DEV_CLOCK_CFG_MAC_TX_RST, + DEV_CLOCK_CFG_MAC_RX_RST, DEV_CLOCK_CFG); } EXPORT_SYMBOL_GPL(ocelot_phylink_mac_link_down); @@ -569,49 +569,44 @@ void ocelot_phylink_mac_link_up(struct ocelot *ocelot, int port, } EXPORT_SYMBOL_GPL(ocelot_phylink_mac_link_up); -static void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, - struct sk_buff *clone) +static int ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, + struct sk_buff *clone) { struct ocelot_port *ocelot_port = ocelot->ports[port]; + unsigned long flags; - spin_lock(&ocelot_port->ts_id_lock); + spin_lock_irqsave(&ocelot->ts_id_lock, flags); + + if (ocelot_port->ptp_skbs_in_flight == OCELOT_MAX_PTP_ID || + ocelot->ptp_skbs_in_flight == OCELOT_PTP_FIFO_SIZE) { + spin_unlock_irqrestore(&ocelot->ts_id_lock, flags); + return -EBUSY; + } skb_shinfo(clone)->tx_flags |= SKBTX_IN_PROGRESS; /* Store timestamp ID in OCELOT_SKB_CB(clone)->ts_id */ OCELOT_SKB_CB(clone)->ts_id = ocelot_port->ts_id; - ocelot_port->ts_id = (ocelot_port->ts_id + 1) % 4; + + ocelot_port->ts_id++; + if (ocelot_port->ts_id == OCELOT_MAX_PTP_ID) + ocelot_port->ts_id = 0; + + ocelot_port->ptp_skbs_in_flight++; + ocelot->ptp_skbs_in_flight++; + skb_queue_tail(&ocelot_port->tx_skbs, clone); - spin_unlock(&ocelot_port->ts_id_lock); + spin_unlock_irqrestore(&ocelot->ts_id_lock, flags); + + return 0; } -u32 ocelot_ptp_rew_op(struct sk_buff *skb) -{ - struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone; - u8 ptp_cmd = OCELOT_SKB_CB(skb)->ptp_cmd; - u32 rew_op = 0; - - if (ptp_cmd == IFH_REW_OP_TWO_STEP_PTP && clone) { - rew_op = ptp_cmd; - rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; - } else if (ptp_cmd == IFH_REW_OP_ORIGIN_PTP) { - rew_op = ptp_cmd; - } - - return rew_op; -} -EXPORT_SYMBOL(ocelot_ptp_rew_op); - -static bool ocelot_ptp_is_onestep_sync(struct sk_buff *skb) +static bool 
ocelot_ptp_is_onestep_sync(struct sk_buff *skb, + unsigned int ptp_class) { struct ptp_header *hdr; - unsigned int ptp_class; u8 msgtype, twostep; - ptp_class = ptp_classify_raw(skb); - if (ptp_class == PTP_CLASS_NONE) - return false; - hdr = ptp_parse_header(skb, ptp_class); if (!hdr) return false; @@ -631,10 +626,20 @@ int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port, { struct ocelot_port *ocelot_port = ocelot->ports[port]; u8 ptp_cmd = ocelot_port->ptp_cmd; + unsigned int ptp_class; + int err; + + /* Don't do anything if PTP timestamping not enabled */ + if (!ptp_cmd) + return 0; + + ptp_class = ptp_classify_raw(skb); + if (ptp_class == PTP_CLASS_NONE) + return -EINVAL; /* Store ptp_cmd in OCELOT_SKB_CB(skb)->ptp_cmd */ if (ptp_cmd == IFH_REW_OP_ORIGIN_PTP) { - if (ocelot_ptp_is_onestep_sync(skb)) { + if (ocelot_ptp_is_onestep_sync(skb, ptp_class)) { OCELOT_SKB_CB(skb)->ptp_cmd = ptp_cmd; return 0; } @@ -648,8 +653,12 @@ int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port, if (!(*clone)) return -ENOMEM; - ocelot_port_add_txtstamp_skb(ocelot, port, *clone); + err = ocelot_port_add_txtstamp_skb(ocelot, port, *clone); + if (err) + return err; + OCELOT_SKB_CB(skb)->ptp_cmd = ptp_cmd; + OCELOT_SKB_CB(*clone)->ptp_class = ptp_class; } return 0; @@ -683,6 +692,17 @@ static void ocelot_get_hwtimestamp(struct ocelot *ocelot, spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); } +static bool ocelot_validate_ptp_skb(struct sk_buff *clone, u16 seqid) +{ + struct ptp_header *hdr; + + hdr = ptp_parse_header(clone, OCELOT_SKB_CB(clone)->ptp_class); + if (WARN_ON(!hdr)) + return false; + + return seqid == ntohs(hdr->sequence_id); +} + void ocelot_get_txtstamp(struct ocelot *ocelot) { int budget = OCELOT_PTP_QUEUE_SZ; @@ -690,10 +710,10 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) while (budget--) { struct sk_buff *skb, *skb_tmp, *skb_match = NULL; struct skb_shared_hwtstamps shhwtstamps; + u32 val, id, seqid, txport; struct ocelot_port *port; struct timespec64 ts; unsigned long flags; - u32 val, id, txport; val = ocelot_read(ocelot, SYS_PTP_STATUS); @@ -706,10 +726,17 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) /* Retrieve the ts ID and Tx port */ id = SYS_PTP_STATUS_PTP_MESS_ID_X(val); txport = SYS_PTP_STATUS_PTP_MESS_TXPORT_X(val); + seqid = SYS_PTP_STATUS_PTP_MESS_SEQ_ID(val); - /* Retrieve its associated skb */ port = ocelot->ports[txport]; + spin_lock(&ocelot->ts_id_lock); + port->ptp_skbs_in_flight--; + ocelot->ptp_skbs_in_flight--; + spin_unlock(&ocelot->ts_id_lock); + + /* Retrieve its associated skb */ +try_again: spin_lock_irqsave(&port->tx_skbs.lock, flags); skb_queue_walk_safe(&port->tx_skbs, skb, skb_tmp) { @@ -722,12 +749,20 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) spin_unlock_irqrestore(&port->tx_skbs.lock, flags); + if (WARN_ON(!skb_match)) + continue; + + if (!ocelot_validate_ptp_skb(skb_match, seqid)) { + dev_err_ratelimited(ocelot->dev, + "port %d received stale TX timestamp for seqid %d, discarding\n", + txport, seqid); + dev_kfree_skb_any(skb); + goto try_again; + } + /* Get the h/w timestamp */ ocelot_get_hwtimestamp(ocelot, &ts); - if (unlikely(!skb_match)) - continue; - /* Set the timestamp into the skb */ memset(&shhwtstamps, 0, sizeof(shhwtstamps)); shhwtstamps.hwtstamp = ktime_set(ts.tv_sec, ts.tv_nsec); @@ -1948,7 +1983,6 @@ void ocelot_init_port(struct ocelot *ocelot, int port) struct ocelot_port *ocelot_port = ocelot->ports[port]; skb_queue_head_init(&ocelot_port->tx_skbs); - spin_lock_init(&ocelot_port->ts_id_lock); 
/* Basic L2 initialization */ @@ -2081,6 +2115,7 @@ int ocelot_init(struct ocelot *ocelot) mutex_init(&ocelot->stats_lock); mutex_init(&ocelot->ptp_lock); spin_lock_init(&ocelot->ptp_clock_lock); + spin_lock_init(&ocelot->ts_id_lock); snprintf(queue_name, sizeof(queue_name), "%s-stats", dev_name(ocelot->dev)); ocelot->stats_queue = create_singlethread_workqueue(queue_name); diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index e54b9fb2a9..2545727fd5 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -8,6 +8,7 @@ * Copyright 2020-2021 NXP */ +#include <linux/dsa/ocelot.h> #include #include #include @@ -1625,7 +1626,7 @@ static int ocelot_port_phylink_create(struct ocelot *ocelot, int port, if (phy_mode == PHY_INTERFACE_MODE_QSGMII) ocelot_port_rmwl(ocelot_port, 0, DEV_CLOCK_CFG_MAC_TX_RST | - DEV_CLOCK_CFG_MAC_TX_RST, + DEV_CLOCK_CFG_MAC_RX_RST, DEV_CLOCK_CFG); ocelot_port->phy_mode = phy_mode; diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.c b/drivers/net/ethernet/mscc/ocelot_vcap.c index 7945393a06..99d7376a70 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.c +++ b/drivers/net/ethernet/mscc/ocelot_vcap.c @@ -998,8 +998,8 @@ ocelot_vcap_block_find_filter_by_index(struct ocelot_vcap_block *block, } struct ocelot_vcap_filter * -ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, int cookie, - bool tc_offload) +ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, + unsigned long cookie, bool tc_offload) { struct ocelot_vcap_filter *filter; diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index 09c0e839cc..3b6b2e6113 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -8566,7 +8566,7 @@ static void s2io_io_resume(struct pci_dev *pdev) return; } - if (s2io_set_mac_addr(netdev, netdev->dev_addr) == FAILURE) { + if (do_s2io_prog_unicast(netdev, netdev->dev_addr) == FAILURE) { s2io_card_down(sp); pr_err("Can't restore mac addr after reset.\n"); return; diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index c029950a81..ac1dcfa1d1 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -830,10 +830,6 @@ static int nfp_flower_init(struct nfp_app *app) if (err) goto err_cleanup; - err = flow_indr_dev_register(nfp_flower_indr_setup_tc_cb, app); - if (err) - goto err_cleanup; - if (app_priv->flower_ext_feats & NFP_FL_FEATS_VF_RLIM) nfp_flower_qos_init(app); @@ -942,7 +938,20 @@ static int nfp_flower_start(struct nfp_app *app) return err; } - return nfp_tunnel_config_start(app); + err = flow_indr_dev_register(nfp_flower_indr_setup_tc_cb, app); + if (err) + return err; + + err = nfp_tunnel_config_start(app); + if (err) + goto err_tunnel_config; + + return 0; + +err_tunnel_config: + flow_indr_dev_unregister(nfp_flower_indr_setup_tc_cb, app, + nfp_flower_setup_indr_tc_release); + return err; } static void nfp_flower_stop(struct nfp_app *app) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 381966e8f5..7f3322ce04 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1292,8 +1292,10 @@ int ionic_lif_addr_add(struct ionic_lif *lif, const u8 *addr) if (err && err != -EEXIST) { /* set the state back to NEW so we can try again later */ f =
ionic_rx_filter_by_addr(lif, addr); - if (f && f->state == IONIC_FILTER_STATE_SYNCED) + if (f && f->state == IONIC_FILTER_STATE_SYNCED) { f->state = IONIC_FILTER_STATE_NEW; + set_bit(IONIC_LIF_F_FILTER_SYNC_NEEDED, lif->state); + } spin_unlock_bh(&lif->rx_filters.lock); @@ -1377,6 +1379,10 @@ static int ionic_addr_add(struct net_device *netdev, const u8 *addr) static int ionic_addr_del(struct net_device *netdev, const u8 *addr) { + /* Don't delete our own address from the uc list */ + if (ether_addr_equal(addr, netdev->dev_addr)) + return 0; + return ionic_lif_list_addr(netdev_priv(netdev), addr, DEL_ADDR); } diff --git a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c index 25ecfcfa12..69728f9013 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c @@ -349,9 +349,6 @@ void ionic_rx_filter_sync(struct ionic_lif *lif) list_for_each_entry_safe(sync_item, spos, &sync_add_list, list) { (void)ionic_lif_addr_add(lif, sync_item->f.cmd.mac.addr); - if (sync_item->f.state != IONIC_FILTER_STATE_SYNCED) - set_bit(IONIC_LIF_F_FILTER_SYNC_NEEDED, lif->state); - list_del(&sync_item->list); devm_kfree(dev, sync_item); } diff --git a/drivers/net/ethernet/pensando/ionic/ionic_stats.c b/drivers/net/ethernet/pensando/ionic/ionic_stats.c index 58a854666c..c14de5fced 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_stats.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_stats.c @@ -380,15 +380,6 @@ static void ionic_sw_stats_get_txq_values(struct ionic_lif *lif, u64 **buf, &ionic_dbg_intr_stats_desc[i]); (*buf)++; } - for (i = 0; i < IONIC_NUM_DBG_NAPI_STATS; i++) { - **buf = IONIC_READ_STAT64(&txqcq->napi_stats, - &ionic_dbg_napi_stats_desc[i]); - (*buf)++; - } - for (i = 0; i < IONIC_MAX_NUM_NAPI_CNTR; i++) { - **buf = txqcq->napi_stats.work_done_cntr[i]; - (*buf)++; - } for (i = 0; i < IONIC_MAX_NUM_SG_CNTR; i++) { **buf = txstats->sg_cntr[i]; (*buf)++; diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 15ef59aa34..d10e1cd6d2 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1299,6 +1299,7 @@ static int qed_slowpath_start(struct qed_dev *cdev, } else { DP_NOTICE(cdev, "Failed to acquire PTT for aRFS\n"); + rc = -EINVAL; goto err; } } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c index fbfda55b4c..5e731a72cc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c @@ -71,6 +71,7 @@ static int dwmac_generic_probe(struct platform_device *pdev) static const struct of_device_id dwmac_generic_match[] = { { .compatible = "st,spear600-gmac"}, + { .compatible = "snps,dwmac-3.40a"}, { .compatible = "snps,dwmac-3.50a"}, { .compatible = "snps,dwmac-3.610"}, { .compatible = "snps,dwmac-3.70a"}, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index ed817011a9..6924a6aacb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -21,6 +21,7 @@ #include #include #include +#include <linux/pm_runtime.h> #include "stmmac_platform.h" @@ -1528,6 +1529,8 @@ static int rk_gmac_powerup(struct rk_priv_data *bsp_priv) return ret; } + pm_runtime_get_sync(dev); + if (bsp_priv->integrated_phy) rk_gmac_integrated_phy_powerup(bsp_priv); @@ -1539,6
+1542,8 @@ static void rk_gmac_powerdown(struct rk_priv_data *gmac) if (gmac->integrated_phy) rk_gmac_integrated_phy_powerdown(gmac); + pm_runtime_put_sync(&gmac->pdev->dev); + phy_power_on(gmac, false); gmac_clk_enable(gmac, false); } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c index 90383abafa..f5581db0ba 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c @@ -218,11 +218,18 @@ static void dwmac1000_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space) readl(ioaddr + DMA_BUS_MODE + i * 4); } -static void dwmac1000_get_hw_feature(void __iomem *ioaddr, - struct dma_features *dma_cap) +static int dwmac1000_get_hw_feature(void __iomem *ioaddr, + struct dma_features *dma_cap) { u32 hw_cap = readl(ioaddr + DMA_HW_FEATURE); + if (!hw_cap) { + /* 0x00000000 is the value read on old hardware that does not + * implement this register + */ + return -EOPNOTSUPP; + } + dma_cap->mbps_10_100 = (hw_cap & DMA_HW_FEAT_MIISEL); dma_cap->mbps_1000 = (hw_cap & DMA_HW_FEAT_GMIISEL) >> 1; dma_cap->half_duplex = (hw_cap & DMA_HW_FEAT_HDSEL) >> 2; @@ -252,6 +259,8 @@ static void dwmac1000_get_hw_feature(void __iomem *ioaddr, dma_cap->number_tx_channel = (hw_cap & DMA_HW_FEAT_TXCHCNT) >> 22; /* Alternate (enhanced) DESC mode */ dma_cap->enh_desc = (hw_cap & DMA_HW_FEAT_ENHDESSEL) >> 24; + + return 0; } static void dwmac1000_rx_watchdog(void __iomem *ioaddr, u32 riwt, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index 5be8e6a631..d99fa028c6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -347,8 +347,8 @@ static void dwmac4_dma_tx_chan_op_mode(void __iomem *ioaddr, int mode, writel(mtl_tx_op, ioaddr + MTL_CHAN_TX_OP_MODE(channel)); } -static void dwmac4_get_hw_feature(void __iomem *ioaddr, - struct dma_features *dma_cap) +static int dwmac4_get_hw_feature(void __iomem *ioaddr, + struct dma_features *dma_cap) { u32 hw_cap = readl(ioaddr + GMAC_HW_FEATURE0); @@ -437,6 +437,8 @@ static void dwmac4_get_hw_feature(void __iomem *ioaddr, dma_cap->frpbs = (hw_cap & GMAC_HW_FEAT_FRPBS) >> 11; dma_cap->frpsel = (hw_cap & GMAC_HW_FEAT_FRPSEL) >> 10; dma_cap->dvlan = (hw_cap & GMAC_HW_FEAT_DVLAN) >> 5; + + return 0; } /* Enable/disable TSO feature and set MSS */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c index 906e985441..5e98355f42 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c @@ -371,8 +371,8 @@ static int dwxgmac2_dma_interrupt(void __iomem *ioaddr, return ret; } -static void dwxgmac2_get_hw_feature(void __iomem *ioaddr, - struct dma_features *dma_cap) +static int dwxgmac2_get_hw_feature(void __iomem *ioaddr, + struct dma_features *dma_cap) { u32 hw_cap; @@ -445,6 +445,8 @@ static void dwxgmac2_get_hw_feature(void __iomem *ioaddr, dma_cap->frpes = (hw_cap & XGMAC_HWFEAT_FRPES) >> 11; dma_cap->frpbs = (hw_cap & XGMAC_HWFEAT_FRPPB) >> 9; dma_cap->frpsel = (hw_cap & XGMAC_HWFEAT_FRPSEL) >> 3; + + return 0; } static void dwxgmac2_rx_watchdog(void __iomem *ioaddr, u32 riwt, u32 queue) diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 6dc1c98ebe..fe2660d569 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ 
b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -203,8 +203,8 @@ struct stmmac_dma_ops { int (*dma_interrupt) (void __iomem *ioaddr, struct stmmac_extra_stats *x, u32 chan, u32 dir); /* If supported then get the optional core features */ - void (*get_hw_feature)(void __iomem *ioaddr, - struct dma_features *dma_cap); + int (*get_hw_feature)(void __iomem *ioaddr, + struct dma_features *dma_cap); /* Program the HW RX Watchdog */ void (*rx_watchdog)(void __iomem *ioaddr, u32 riwt, u32 queue); void (*set_tx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan); @@ -255,7 +255,7 @@ struct stmmac_dma_ops { #define stmmac_dma_interrupt_status(__priv, __args...) \ stmmac_do_callback(__priv, dma, dma_interrupt, __args) #define stmmac_get_hw_feature(__priv, __args...) \ - stmmac_do_void_callback(__priv, dma, get_hw_feature, __args) + stmmac_do_callback(__priv, dma, get_hw_feature, __args) #define stmmac_rx_watchdog(__priv, __args...) \ stmmac_do_void_callback(__priv, dma, rx_watchdog, __args) #define stmmac_set_tx_ring_len(__priv, __args...) \ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 553c440325..eb3b7bf771 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -477,6 +477,10 @@ bool stmmac_eee_init(struct stmmac_priv *priv) stmmac_lpi_entry_timer_config(priv, 0); del_timer_sync(&priv->eee_ctrl_timer); stmmac_set_eee_timer(priv, priv->hw, 0, eee_tw_timer); + if (priv->hw->xpcs) + xpcs_config_eee(priv->hw->xpcs, + priv->plat->mult_fact_100ns, + false); } mutex_unlock(&priv->lock); return false; @@ -486,6 +490,10 @@ bool stmmac_eee_init(struct stmmac_priv *priv) timer_setup(&priv->eee_ctrl_timer, stmmac_eee_ctrl_timer, 0); stmmac_set_eee_timer(priv, priv->hw, STMMAC_DEFAULT_LIT_LS, eee_tw_timer); + if (priv->hw->xpcs) + xpcs_config_eee(priv->hw->xpcs, + priv->plat->mult_fact_100ns, + true); } if (priv->plat->has_gmac4 && priv->tx_lpi_timer <= STMMAC_ET_MAX) { @@ -1034,7 +1042,7 @@ static void stmmac_mac_link_down(struct phylink_config *config, stmmac_mac_set(priv, priv->ioaddr, false); priv->eee_active = false; priv->tx_lpi_enabled = false; - stmmac_eee_init(priv); + priv->eee_enabled = stmmac_eee_init(priv); stmmac_set_eee_pls(priv, priv->hw, false); if (priv->dma_cap.fpesel) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 62cec9bfcd..232ac98943 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -508,6 +508,14 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) plat->pmt = 1; } + if (of_device_is_compatible(np, "snps,dwmac-3.40a")) { + plat->has_gmac = 1; + plat->enh_desc = 1; + plat->tx_coe = 1; + plat->bugged_jumbo = 1; + plat->pmt = 1; + } + if (of_device_is_compatible(np, "snps,dwmac-4.00") || of_device_is_compatible(np, "snps,dwmac-4.10a") || of_device_is_compatible(np, "snps,dwmac-4.20a") || diff --git a/drivers/net/ethernet/sun/Kconfig b/drivers/net/ethernet/sun/Kconfig index 309de38a75..b0d3f9a295 100644 --- a/drivers/net/ethernet/sun/Kconfig +++ b/drivers/net/ethernet/sun/Kconfig @@ -73,6 +73,7 @@ config CASSINI config SUNVNET_COMMON tristate "Common routines to support Sun Virtual Networking" depends on SUN_LDOMS + depends on INET default m config SUNVNET diff --git a/drivers/net/hamradio/Kconfig b/drivers/net/hamradio/Kconfig index f4843f9672..441da03c23 100644 --- 
a/drivers/net/hamradio/Kconfig +++ b/drivers/net/hamradio/Kconfig @@ -48,6 +48,7 @@ config BPQETHER config DMASCC tristate "High-speed (DMA) SCC driver for AX.25" depends on ISA && AX25 && BROKEN_ON_SMP && ISA_DMA_API + depends on VIRT_TO_BUS help This is a driver for high-speed SCC boards, i.e. those supporting DMA on one port. You usually use those boards to connect your diff --git a/drivers/net/ipa/Kconfig b/drivers/net/ipa/Kconfig index 8f99cfa146..d037682fb7 100644 --- a/drivers/net/ipa/Kconfig +++ b/drivers/net/ipa/Kconfig @@ -4,6 +4,7 @@ config QCOM_IPA depends on ARCH_QCOM || COMPILE_TEST depends on QCOM_RPROC_COMMON || (QCOM_RPROC_COMMON=n && COMPILE_TEST) select QCOM_MDT_LOADER if ARCH_QCOM + select QCOM_SCM select QCOM_QMI_HELPERS help Choose Y or M here to include support for the Qualcomm diff --git a/drivers/net/mdio/mdio-ipq4019.c b/drivers/net/mdio/mdio-ipq4019.c index 0d7d3e15d2..5f4cd24a02 100644 --- a/drivers/net/mdio/mdio-ipq4019.c +++ b/drivers/net/mdio/mdio-ipq4019.c @@ -207,6 +207,7 @@ static int ipq4019_mdio_probe(struct platform_device *pdev) { struct ipq4019_mdio_data *priv; struct mii_bus *bus; + struct resource *res; int ret; bus = devm_mdiobus_alloc_size(&pdev->dev, sizeof(*priv)); @@ -224,7 +225,10 @@ static int ipq4019_mdio_probe(struct platform_device *pdev) return PTR_ERR(priv->mdio_clk); /* The platform resource is provided on the chipset IPQ5018 */ - priv->eth_ldo_rdy = devm_platform_ioremap_resource(pdev, 1); + /* This resource is optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 1); + if (res) + priv->eth_ldo_rdy = devm_ioremap_resource(&pdev->dev, res); bus->name = "ipq4019_mdio"; bus->read = ipq4019_mdio_read; diff --git a/drivers/net/mdio/mdio-mscc-miim.c b/drivers/net/mdio/mdio-mscc-miim.c index 1ee592d3ea..17f98f609e 100644 --- a/drivers/net/mdio/mdio-mscc-miim.c +++ b/drivers/net/mdio/mdio-mscc-miim.c @@ -134,8 +134,9 @@ static int mscc_miim_reset(struct mii_bus *bus) static int mscc_miim_probe(struct platform_device *pdev) { - struct mii_bus *bus; struct mscc_miim_dev *dev; + struct resource *res; + struct mii_bus *bus; int ret; bus = devm_mdiobus_alloc_size(&pdev->dev, sizeof(*dev)); @@ -156,10 +157,14 @@ static int mscc_miim_probe(struct platform_device *pdev) return PTR_ERR(dev->regs); } - dev->phy_regs = devm_platform_ioremap_resource(pdev, 1); - if (IS_ERR(dev->phy_regs)) { - dev_err(&pdev->dev, "Unable to map internal phy registers\n"); - return PTR_ERR(dev->phy_regs); + /* This resource is optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 1); + if (res) { + dev->phy_regs = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(dev->phy_regs)) { + dev_err(&pdev->dev, "Unable to map internal phy registers\n"); + return PTR_ERR(dev->phy_regs); + } } ret = of_mdiobus_register(bus, pdev->dev.of_node); diff --git a/drivers/net/mhi_net.c b/drivers/net/mhi_net.c index d127eb6e92..aaa628f859 100644 --- a/drivers/net/mhi_net.c +++ b/drivers/net/mhi_net.c @@ -321,7 +321,7 @@ static int mhi_net_newlink(struct mhi_device *mhi_dev, struct net_device *ndev) /* Start MHI channels */ err = mhi_prepare_for_transfer(mhi_dev); if (err) - goto out_err; + return err; /* Number of transfer descriptors determines size of the queue */ mhi_netdev->rx_queue_sz = mhi_get_free_desc_count(mhi_dev, DMA_FROM_DEVICE); @@ -331,10 +331,6 @@ static int mhi_net_newlink(struct mhi_device *mhi_dev, struct net_device *ndev) return err; return 0; - -out_err: - free_netdev(ndev); - return err; } static void mhi_net_dellink(struct mhi_device *mhi_dev, 
struct net_device *ndev) diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index fb0a83dc09..7de631f535 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -666,6 +666,10 @@ int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable) { int ret; + ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_EEE_MCTRL0); + if (ret < 0) + return ret; + if (enable) { /* Enable EEE */ ret = DW_VR_MII_EEE_LTX_EN | DW_VR_MII_EEE_LRX_EN | @@ -673,9 +677,6 @@ int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable) DW_VR_MII_EEE_TX_EN_CTRL | DW_VR_MII_EEE_RX_EN_CTRL | mult_fact_100ns << DW_VR_MII_EEE_MULT_FACT_100NS_SHIFT; } else { - ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_EEE_MCTRL0); - if (ret < 0) - return ret; ret &= ~(DW_VR_MII_EEE_LTX_EN | DW_VR_MII_EEE_LRX_EN | DW_VR_MII_EEE_TX_QUIET_EN | DW_VR_MII_EEE_RX_QUIET_EN | DW_VR_MII_EEE_TX_EN_CTRL | DW_VR_MII_EEE_RX_EN_CTRL | @@ -690,21 +691,28 @@ int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable) if (ret < 0) return ret; - ret |= DW_VR_MII_EEE_TRN_LPI; + if (enable) + ret |= DW_VR_MII_EEE_TRN_LPI; + else + ret &= ~DW_VR_MII_EEE_TRN_LPI; + return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_EEE_MCTRL1, ret); } EXPORT_SYMBOL_GPL(xpcs_config_eee); static int xpcs_config_aneg_c37_sgmii(struct dw_xpcs *xpcs, unsigned int mode) { - int ret; + int ret, mdio_ctrl; /* For AN for C37 SGMII mode, the settings are :- - * 1) VR_MII_AN_CTRL Bit(2:1)[PCS_MODE] = 10b (SGMII AN) - * 2) VR_MII_AN_CTRL Bit(3) [TX_CONFIG] = 0b (MAC side SGMII) + * 1) VR_MII_MMD_CTRL Bit(12) [AN_ENABLE] = 0b (Disable SGMII AN in case + it is already enabled) + * 2) VR_MII_AN_CTRL Bit(2:1)[PCS_MODE] = 10b (SGMII AN) + * 3) VR_MII_AN_CTRL Bit(3) [TX_CONFIG] = 0b (MAC side SGMII) * DW xPCS used with DW EQoS MAC is always MAC side SGMII. - * 3) VR_MII_DIG_CTRL1 Bit(9) [MAC_AUTO_SW] = 1b (Automatic + * 4) VR_MII_DIG_CTRL1 Bit(9) [MAC_AUTO_SW] = 1b (Automatic * speed/duplex mode change by HW after SGMII AN complete) + * 5) VR_MII_MMD_CTRL Bit(12) [AN_ENABLE] = 1b (Enable SGMII AN) * * Note: Since it is MAC side SGMII, there is no need to set * SR_MII_AN_ADV. MAC side SGMII receives AN Tx Config from @@ -712,6 +720,17 @@ static int xpcs_config_aneg_c37_sgmii(struct dw_xpcs *xpcs, unsigned int mode) * between PHY and Link Partner. There is also no need to * trigger AN restart for MAC-side SGMII. 
*/ + mdio_ctrl = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL); + if (mdio_ctrl < 0) + return mdio_ctrl; + + if (mdio_ctrl & AN_CL37_EN) { + ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL, + mdio_ctrl & ~AN_CL37_EN); + if (ret < 0) + return ret; + } + ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_CTRL); if (ret < 0) return ret; @@ -736,7 +755,15 @@ static int xpcs_config_aneg_c37_sgmii(struct dw_xpcs *xpcs, unsigned int mode) else ret &= ~DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW; - return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1, ret); + ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1, ret); + if (ret < 0) + return ret; + + if (phylink_autoneg_inband(mode)) + ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL, + mdio_ctrl | AN_CL37_EN); + + return ret; } static int xpcs_config_2500basex(struct dw_xpcs *xpcs) diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index e79297a4ba..27b6a3f507 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -27,7 +27,12 @@ #define MII_BCM7XXX_SHD_2_ADDR_CTRL 0xe #define MII_BCM7XXX_SHD_2_CTRL_STAT 0xf #define MII_BCM7XXX_SHD_2_BIAS_TRIM 0x1a +#define MII_BCM7XXX_SHD_3_PCS_CTRL 0x0 +#define MII_BCM7XXX_SHD_3_PCS_STATUS 0x1 +#define MII_BCM7XXX_SHD_3_EEE_CAP 0x2 #define MII_BCM7XXX_SHD_3_AN_EEE_ADV 0x3 +#define MII_BCM7XXX_SHD_3_EEE_LP 0x4 +#define MII_BCM7XXX_SHD_3_EEE_WK_ERR 0x5 #define MII_BCM7XXX_SHD_3_PCS_CTRL_2 0x6 #define MII_BCM7XXX_PCS_CTRL_2_DEF 0x4400 #define MII_BCM7XXX_SHD_3_AN_STAT 0xb @@ -216,25 +221,37 @@ static int bcm7xxx_28nm_resume(struct phy_device *phydev) return genphy_config_aneg(phydev); } -static int phy_set_clr_bits(struct phy_device *dev, int location, - int set_mask, int clr_mask) +static int __phy_set_clr_bits(struct phy_device *dev, int location, + int set_mask, int clr_mask) { int v, ret; - v = phy_read(dev, location); + v = __phy_read(dev, location); if (v < 0) return v; v &= ~clr_mask; v |= set_mask; - ret = phy_write(dev, location, v); + ret = __phy_write(dev, location, v); if (ret < 0) return ret; return v; } +static int phy_set_clr_bits(struct phy_device *dev, int location, + int set_mask, int clr_mask) +{ + int ret; + + mutex_lock(&dev->mdio.bus->mdio_lock); + ret = __phy_set_clr_bits(dev, location, set_mask, clr_mask); + mutex_unlock(&dev->mdio.bus->mdio_lock); + + return ret; +} + static int bcm7xxx_28nm_ephy_01_afe_config_init(struct phy_device *phydev) { int ret; @@ -398,6 +415,93 @@ static int bcm7xxx_28nm_ephy_config_init(struct phy_device *phydev) return bcm7xxx_28nm_ephy_apd_enable(phydev); } +#define MII_BCM7XXX_REG_INVALID 0xff + +static u8 bcm7xxx_28nm_ephy_regnum_to_shd(u16 regnum) +{ + switch (regnum) { + case MDIO_CTRL1: + return MII_BCM7XXX_SHD_3_PCS_CTRL; + case MDIO_STAT1: + return MII_BCM7XXX_SHD_3_PCS_STATUS; + case MDIO_PCS_EEE_ABLE: + return MII_BCM7XXX_SHD_3_EEE_CAP; + case MDIO_AN_EEE_ADV: + return MII_BCM7XXX_SHD_3_AN_EEE_ADV; + case MDIO_AN_EEE_LPABLE: + return MII_BCM7XXX_SHD_3_EEE_LP; + case MDIO_PCS_EEE_WK_ERR: + return MII_BCM7XXX_SHD_3_EEE_WK_ERR; + default: + return MII_BCM7XXX_REG_INVALID; + } +} + +static bool bcm7xxx_28nm_ephy_dev_valid(int devnum) +{ + return devnum == MDIO_MMD_AN || devnum == MDIO_MMD_PCS; +} + +static int bcm7xxx_28nm_ephy_read_mmd(struct phy_device *phydev, + int devnum, u16 regnum) +{ + u8 shd = bcm7xxx_28nm_ephy_regnum_to_shd(regnum); + int ret; + + if (!bcm7xxx_28nm_ephy_dev_valid(devnum) || + shd == MII_BCM7XXX_REG_INVALID) + return -EOPNOTSUPP; + + /* set shadow mode 2 */ + ret = 
__phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, + MII_BCM7XXX_SHD_MODE_2, 0); + if (ret < 0) + return ret; + + /* Access the desired shadow register address */ + ret = __phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, shd); + if (ret < 0) + goto reset_shadow_mode; + + ret = __phy_read(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT); + +reset_shadow_mode: + /* reset shadow mode 2 */ + __phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, 0, + MII_BCM7XXX_SHD_MODE_2); + return ret; +} + +static int bcm7xxx_28nm_ephy_write_mmd(struct phy_device *phydev, + int devnum, u16 regnum, u16 val) +{ + u8 shd = bcm7xxx_28nm_ephy_regnum_to_shd(regnum); + int ret; + + if (!bcm7xxx_28nm_ephy_dev_valid(devnum) || + shd == MII_BCM7XXX_REG_INVALID) + return -EOPNOTSUPP; + + /* set shadow mode 2 */ + ret = __phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, + MII_BCM7XXX_SHD_MODE_2, 0); + if (ret < 0) + return ret; + + /* Access the desired shadow register address */ + ret = __phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, shd); + if (ret < 0) + goto reset_shadow_mode; + + /* Write the desired value in the shadow register */ + __phy_write(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT, val); + +reset_shadow_mode: + /* reset shadow mode 2 */ + return __phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, 0, + MII_BCM7XXX_SHD_MODE_2); +} + static int bcm7xxx_28nm_ephy_resume(struct phy_device *phydev) { int ret; @@ -595,6 +699,8 @@ static void bcm7xxx_28nm_remove(struct phy_device *phydev) .get_stats = bcm7xxx_28nm_get_phy_stats, \ .probe = bcm7xxx_28nm_probe, \ .remove = bcm7xxx_28nm_remove, \ + .read_mmd = bcm7xxx_28nm_ephy_read_mmd, \ + .write_mmd = bcm7xxx_28nm_ephy_write_mmd, \ } #define BCM7XXX_40NM_EPHY(_oui, _name) \ diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 53f034fc2e..6865d93191 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -525,6 +525,10 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) NULL == bus->read || NULL == bus->write) return -EINVAL; + if (bus->parent && bus->parent->of_node) + bus->parent->of_node->fwnode.flags |= + FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD; + BUG_ON(bus->state != MDIOBUS_ALLOCATED && bus->state != MDIOBUS_UNREGISTERED); @@ -534,6 +538,13 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) bus->dev.groups = NULL; dev_set_name(&bus->dev, "%s", bus->id); + /* We need to set state to MDIOBUS_UNREGISTERED to correctly release + * the device in mdiobus_free() + * + * State will be updated later in this function in case of success + */ + bus->state = MDIOBUS_UNREGISTERED; + err = device_register(&bus->dev); if (err) { pr_err("mii_bus %s failed to register\n", bus->id); diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c index 2d5d5081c3..5ce1bf03bb 100644 --- a/drivers/net/phy/mxl-gpy.c +++ b/drivers/net/phy/mxl-gpy.c @@ -493,6 +493,25 @@ static int gpy_loopback(struct phy_device *phydev, bool enable) return ret; } +static int gpy115_loopback(struct phy_device *phydev, bool enable) +{ + int ret; + int fw_minor; + + if (enable) + return gpy_loopback(phydev, enable); + + ret = phy_read(phydev, PHY_FWV); + if (ret < 0) + return ret; + + fw_minor = FIELD_GET(PHY_FWV_MINOR_MASK, ret); + if (fw_minor > 0x0076) + return gpy_loopback(phydev, 0); + + return genphy_soft_reset(phydev); +} + static struct phy_driver gpy_drivers[] = { { PHY_ID_MATCH_MODEL(PHY_ID_GPY2xx), @@ -527,7 +546,7 @@ static struct phy_driver gpy_drivers[] = { .handle_interrupt = gpy_handle_interrupt, .set_wol = gpy_set_wol, .get_wol = gpy_get_wol, - .set_loopback 
= gpy_loopback, + .set_loopback = gpy115_loopback, }, { PHY_ID_MATCH_MODEL(PHY_ID_GPY115C), @@ -544,7 +563,7 @@ static struct phy_driver gpy_drivers[] = { .handle_interrupt = gpy_handle_interrupt, .set_wol = gpy_set_wol, .get_wol = gpy_get_wol, - .set_loopback = gpy_loopback, + .set_loopback = gpy115_loopback, }, { .phy_id = PHY_ID_GPY211B, diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index ba5ad86ec8..4f9990b47a 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3125,6 +3125,9 @@ static void phy_shutdown(struct device *dev) { struct phy_device *phydev = to_phy_device(dev); + if (phydev->state == PHY_READY || !phydev->attached_dev) + return; + phy_disable_interrupts(phydev); } diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index 4c5d69732a..f87f175033 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -99,6 +99,10 @@ config USB_RTL8150 config USB_RTL8152 tristate "Realtek RTL8152/RTL8153 Based USB Ethernet Adapters" select MII + select CRC32 + select CRYPTO + select CRYPTO_HASH + select CRYPTO_SHA256 help This option adds support for Realtek RTL8152 based USB 2.0 10/100 Ethernet adapters and RTL8153 based USB 3.0 10/100/1000 diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 60ba9b7340..f329e39100 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -767,6 +767,7 @@ enum rtl8152_flags { PHY_RESET, SCHEDULE_TASKLET, GREEN_ETHERNET, + RX_EPROTO, }; #define DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2 0x3082 @@ -1770,6 +1771,14 @@ static void read_bulk_callback(struct urb *urb) rtl_set_unplug(tp); netif_device_detach(tp->netdev); return; + case -EPROTO: + urb->actual_length = 0; + spin_lock_irqsave(&tp->rx_lock, flags); + list_add_tail(&agg->list, &tp->rx_done); + spin_unlock_irqrestore(&tp->rx_lock, flags); + set_bit(RX_EPROTO, &tp->flags); + schedule_delayed_work(&tp->schedule, 1); + return; case -ENOENT: return; /* the urb is in unlink state */ case -ETIME: @@ -2425,6 +2434,7 @@ static int rx_bottom(struct r8152 *tp, int budget) if (list_empty(&tp->rx_done)) goto out1; + clear_bit(RX_EPROTO, &tp->flags); INIT_LIST_HEAD(&rx_queue); spin_lock_irqsave(&tp->rx_lock, flags); list_splice_init(&tp->rx_done, &rx_queue); @@ -2441,7 +2451,7 @@ static int rx_bottom(struct r8152 *tp, int budget) agg = list_entry(cursor, struct rx_agg, list); urb = agg->urb; - if (urb->actual_length < ETH_ZLEN) + if (urb->status != 0 || urb->actual_length < ETH_ZLEN) goto submit; agg_free = rtl_get_free_rx(tp, GFP_ATOMIC); @@ -6643,6 +6653,10 @@ static void rtl_work_func_t(struct work_struct *work) netif_carrier_ok(tp->netdev)) tasklet_schedule(&tp->tx_tl); + if (test_and_clear_bit(RX_EPROTO, &tp->flags) && + !list_empty(&tp->rx_done)) + napi_schedule(&tp->napi); + mutex_unlock(&tp->control); out1: diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index f4620fb260..8a035345d6 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -1242,7 +1242,10 @@ static void smsc95xx_unbind(struct usbnet *dev, struct usb_interface *intf) static void smsc95xx_handle_link_change(struct net_device *net) { + struct usbnet *dev = netdev_priv(net); + phy_print_status(net->phydev); + usbnet_defer_kevent(dev, EVENT_LINK_CHANGE); } static int smsc95xx_start_phy(struct usbnet *dev) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 79bd2585ec..4ad25a8b08 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -406,7 +406,7 @@ static struct 
sk_buff *page_to_skb(struct virtnet_info *vi, * add_recvbuf_mergeable() + get_mergeable_buf_len() */ truesize = headroom ? PAGE_SIZE : truesize; - tailroom = truesize - len - headroom; + tailroom = truesize - len - headroom - (hdr_padded_len - hdr_len); buf = p - headroom; len -= hdr_len; diff --git a/drivers/net/wireless/ath/ath10k/Kconfig b/drivers/net/wireless/ath/ath10k/Kconfig index 741289e385..ca007b800f 100644 --- a/drivers/net/wireless/ath/ath10k/Kconfig +++ b/drivers/net/wireless/ath/ath10k/Kconfig @@ -44,7 +44,7 @@ config ATH10K_SNOC tristate "Qualcomm ath10k SNOC support" depends on ATH10K depends on ARCH_QCOM || COMPILE_TEST - depends on QCOM_SCM || !QCOM_SCM #if QCOM_SCM=m this can't be =y + select QCOM_SCM select QCOM_QMI_HELPERS help This module adds support for integrated WCN3990 chip connected diff --git a/drivers/net/wireless/ath/ath5k/Kconfig b/drivers/net/wireless/ath/ath5k/Kconfig index f35cd8de22..6914b37bb0 100644 --- a/drivers/net/wireless/ath/ath5k/Kconfig +++ b/drivers/net/wireless/ath/ath5k/Kconfig @@ -3,9 +3,7 @@ config ATH5K tristate "Atheros 5xxx wireless cards support" depends on (PCI || ATH25) && MAC80211 select ATH_COMMON - select MAC80211_LEDS - select LEDS_CLASS - select NEW_LEDS + select MAC80211_LEDS if LEDS_CLASS=y || LEDS_CLASS=MAC80211 select ATH5K_AHB if ATH25 select ATH5K_PCI if !ATH25 help diff --git a/drivers/net/wireless/ath/ath5k/led.c b/drivers/net/wireless/ath/ath5k/led.c index 6a2a168567..33e9928af3 100644 --- a/drivers/net/wireless/ath/ath5k/led.c +++ b/drivers/net/wireless/ath/ath5k/led.c @@ -89,7 +89,8 @@ static const struct pci_device_id ath5k_led_devices[] = { void ath5k_led_enable(struct ath5k_hw *ah) { - if (test_bit(ATH_STAT_LEDSOFT, ah->status)) { + if (IS_ENABLED(CONFIG_MAC80211_LEDS) && + test_bit(ATH_STAT_LEDSOFT, ah->status)) { ath5k_hw_set_gpio_output(ah, ah->led_pin); ath5k_led_off(ah); } @@ -104,7 +105,8 @@ static void ath5k_led_on(struct ath5k_hw *ah) void ath5k_led_off(struct ath5k_hw *ah) { - if (!test_bit(ATH_STAT_LEDSOFT, ah->status)) + if (!IS_ENABLED(CONFIG_MAC80211_LEDS) || + !test_bit(ATH_STAT_LEDSOFT, ah->status)) return; ath5k_hw_set_gpio(ah, ah->led_pin, !ah->led_on); } @@ -146,7 +148,7 @@ ath5k_register_led(struct ath5k_hw *ah, struct ath5k_led *led, static void ath5k_unregister_led(struct ath5k_led *led) { - if (!led->ah) + if (!IS_ENABLED(CONFIG_MAC80211_LEDS) || !led->ah) return; led_classdev_unregister(&led->led_dev); ath5k_led_off(led->ah); @@ -169,7 +171,7 @@ int ath5k_init_leds(struct ath5k_hw *ah) char name[ATH5K_LED_MAX_NAME_LEN + 1]; const struct pci_device_id *match; - if (!ah->pdev) + if (!IS_ENABLED(CONFIG_MAC80211_LEDS) || !ah->pdev) return 0; #ifdef CONFIG_ATH5K_AHB diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 94ab63dfc1..077852e6b0 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -7465,23 +7465,18 @@ static s32 brcmf_translate_country_code(struct brcmf_pub *drvr, char alpha2[2], s32 found_index; int i; + country_codes = drvr->settings->country_codes; + if (!country_codes) { + brcmf_dbg(TRACE, "No country codes configured for device\n"); + return -EINVAL; + } + if ((alpha2[0] == ccreq->country_abbrev[0]) && (alpha2[1] == ccreq->country_abbrev[1])) { brcmf_dbg(TRACE, "Country code already set\n"); return -EAGAIN; } - country_codes = drvr->settings->country_codes; - if (!country_codes) { - brcmf_dbg(TRACE, "No 
country codes configured for device, using ISO3166 code and 0 rev\n"); - memset(ccreq, 0, sizeof(*ccreq)); - ccreq->country_abbrev[0] = alpha2[0]; - ccreq->country_abbrev[1] = alpha2[1]; - ccreq->ccode[0] = alpha2[0]; - ccreq->ccode[1] = alpha2[1]; - return 0; - } - found_index = -1; for (i = 0; i < country_codes->table_size; i++) { cc = &country_codes->table[i]; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index 0e97d5e6c6..9f706fffb5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -160,6 +160,7 @@ static void iwl_mvm_wowlan_program_keys(struct ieee80211_hw *hw, mvm->ptk_icvlen = key->icv_len; mvm->gtk_ivlen = key->iv_len; mvm->gtk_icvlen = key->icv_len; + mutex_unlock(&mvm->mutex); /* don't upload key again */ return; @@ -360,11 +361,11 @@ static void iwl_mvm_wowlan_get_rsc_v5_data(struct ieee80211_hw *hw, if (sta) { rsc = data->rsc->ucast_rsc; } else { - if (WARN_ON(data->gtks > ARRAY_SIZE(data->gtk_ids))) + if (WARN_ON(data->gtks >= ARRAY_SIZE(data->gtk_ids))) return; data->gtk_ids[data->gtks] = key->keyidx; rsc = data->rsc->mcast_rsc[data->gtks % 2]; - if (WARN_ON(key->keyidx > + if (WARN_ON(key->keyidx >= ARRAY_SIZE(data->rsc->mcast_key_id_map))) return; data->rsc->mcast_key_id_map[key->keyidx] = data->gtks % 2; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c index 25af88a3ed..e91f8e889d 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c @@ -662,12 +662,13 @@ static bool __iwl_mvm_remove_time_event(struct iwl_mvm *mvm, u32 *uid) { u32 id; - struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(te_data->vif); + struct iwl_mvm_vif *mvmvif; enum nl80211_iftype iftype; if (!te_data->vif) return false; + mvmvif = iwl_mvm_vif_from_mac80211(te_data->vif); iftype = te_data->vif->type; /* diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 61b2797a34..e3996ff99b 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -547,6 +547,8 @@ static const struct iwl_dev_info iwl_dev_info_table[] = { IWL_DEV_INFO(0x43F0, 0x0074, iwl_ax201_cfg_qu_hr, NULL), IWL_DEV_INFO(0x43F0, 0x0078, iwl_ax201_cfg_qu_hr, NULL), IWL_DEV_INFO(0x43F0, 0x007C, iwl_ax201_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x43F0, 0x1651, killer1650s_2ax_cfg_qu_b0_hr_b0, iwl_ax201_killer_1650s_name), + IWL_DEV_INFO(0x43F0, 0x1652, killer1650i_2ax_cfg_qu_b0_hr_b0, iwl_ax201_killer_1650i_name), IWL_DEV_INFO(0x43F0, 0x2074, iwl_ax201_cfg_qu_hr, NULL), IWL_DEV_INFO(0x43F0, 0x4070, iwl_ax201_cfg_qu_hr, NULL), IWL_DEV_INFO(0xA0F0, 0x0070, iwl_ax201_cfg_qu_hr, NULL), diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index ffa894f731..0adae76eb8 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1867,8 +1867,8 @@ mac80211_hwsim_beacon(struct hrtimer *timer) bcn_int -= data->bcn_delta; data->bcn_delta = 0; } - hrtimer_forward(&data->beacon_timer, hrtimer_get_expires(timer), - ns_to_ktime(bcn_int * NSEC_PER_USEC)); + hrtimer_forward_now(&data->beacon_timer, + ns_to_ktime(bcn_int * NSEC_PER_USEC)); return HRTIMER_RESTART; } diff --git a/drivers/net/wireless/marvell/mwifiex/sta_tx.c b/drivers/net/wireless/marvell/mwifiex/sta_tx.c index 241305377e..a9b5eb9922 100644 --- 
a/drivers/net/wireless/marvell/mwifiex/sta_tx.c +++ b/drivers/net/wireless/marvell/mwifiex/sta_tx.c @@ -62,8 +62,8 @@ void *mwifiex_process_sta_txpd(struct mwifiex_private *priv, pkt_type = mwifiex_is_skb_mgmt_frame(skb) ? PKT_TYPE_MGMT : 0; - pad = ((void *)skb->data - (sizeof(*local_tx_pd) + hroom)- - NULL) & (MWIFIEX_DMA_ALIGN_SZ - 1); + pad = ((uintptr_t)skb->data - (sizeof(*local_tx_pd) + hroom)) & + (MWIFIEX_DMA_ALIGN_SZ - 1); skb_push(skb, sizeof(*local_tx_pd) + pad); local_tx_pd = (struct txpd *) skb->data; diff --git a/drivers/net/wireless/marvell/mwifiex/uap_txrx.c b/drivers/net/wireless/marvell/mwifiex/uap_txrx.c index 9bbdb8dfce..245ff644f8 100644 --- a/drivers/net/wireless/marvell/mwifiex/uap_txrx.c +++ b/drivers/net/wireless/marvell/mwifiex/uap_txrx.c @@ -475,8 +475,8 @@ void *mwifiex_process_uap_txpd(struct mwifiex_private *priv, pkt_type = mwifiex_is_skb_mgmt_frame(skb) ? PKT_TYPE_MGMT : 0; - pad = ((void *)skb->data - (sizeof(*txpd) + hroom) - NULL) & - (MWIFIEX_DMA_ALIGN_SZ - 1); + pad = ((uintptr_t)skb->data - (sizeof(*txpd) + hroom)) & + (MWIFIEX_DMA_ALIGN_SZ - 1); skb_push(skb, sizeof(*txpd) + pad); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 72de88ff0d..ef4950f808 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -380,7 +380,6 @@ static int pmem_attach_disk(struct device *dev, struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; struct request_queue *q; - struct device *gendev; struct gendisk *disk; void *addr; int rc; @@ -489,10 +488,8 @@ static int pmem_attach_disk(struct device *dev, } dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); pmem->dax_dev = dax_dev; - gendev = disk_to_dev(disk); - gendev->groups = pmem_attribute_groups; - device_add_disk(dev, disk, NULL); + device_add_disk(dev, disk, pmem_attribute_groups); if (devm_add_action_or_reset(dev, pmem_release_disk, pmem)) return -ENOMEM; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e486845d2c..f8dd664b2e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -978,6 +978,7 @@ EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) { struct nvme_command *cmd = nvme_req(req)->cmd; + struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; blk_status_t ret = BLK_STS_OK; if (!(req->rq_flags & RQF_DONTPREP)) { @@ -1026,7 +1027,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) return BLK_STS_IOERR; } - nvme_req(req)->genctr++; + if (!(ctrl->quirks & NVME_QUIRK_SKIP_CID_GEN)) + nvme_req(req)->genctr++; cmd->common.command_id = nvme_cid(req); trace_nvme_setup_cmd(req, cmd); return ret; @@ -3548,10 +3550,15 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, return 0; } +static void nvme_cdev_rel(struct device *dev) +{ + ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(dev->devt)); +} + void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device) { cdev_device_del(cdev, cdev_device); - ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(cdev_device->devt)); + put_device(cdev_device); } int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, @@ -3564,14 +3571,14 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, return minor; cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); cdev_device->class = nvme_ns_chr_class; + cdev_device->release = nvme_cdev_rel; device_initialize(cdev_device); cdev_init(cdev, fops); cdev->owner = owner; ret = cdev_device_add(cdev, cdev_device); - if (ret) { + if (ret) put_device(cdev_device); - 
ida_simple_remove(&nvme_ns_chr_minor_ida, minor); - } + return ret; } @@ -3603,11 +3610,9 @@ static int nvme_add_ns_cdev(struct nvme_ns *ns) ns->ctrl->instance, ns->head->instance); if (ret) return ret; - ret = nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops, - ns->ctrl->ops->module); - if (ret) - kfree_const(ns->cdev_device.kobj.name); - return ret; + + return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops, + ns->ctrl->ops->module); } static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index e8ccdd398f..fba06618c6 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -431,8 +431,6 @@ static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) return ret; ret = nvme_cdev_add(&head->cdev, &head->cdev_device, &nvme_ns_head_chr_fops, THIS_MODULE); - if (ret) - kfree_const(head->cdev_device.kobj.name); return ret; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9871c0c937..ed79a6c7e8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -138,6 +138,12 @@ enum nvme_quirks { * 48 bits. */ NVME_QUIRK_DMA_ADDRESS_BITS_48 = (1 << 16), + + /* + * The controller requires the command_id value be limited, so skip + * encoding the generation sequence number. + */ + NVME_QUIRK_SKIP_CID_GEN = (1 << 17), }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b82492cd75..149ecf73df 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1330,7 +1330,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) iod->aborted = 1; cmd.abort.opcode = nvme_admin_abort_cmd; - cmd.abort.cid = req->tag; + cmd.abort.cid = nvme_cid(req); cmd.abort.sqid = cpu_to_le16(nvmeq->qid); dev_warn(nvmeq->dev->ctrl.device, @@ -3369,7 +3369,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), .driver_data = NVME_QUIRK_SINGLE_VECTOR | NVME_QUIRK_128_BYTES_SQES | - NVME_QUIRK_SHARED_TAGS }, + NVME_QUIRK_SHARED_TAGS | + NVME_QUIRK_SKIP_CID_GEN }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { 0, } diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 3d87fadaa1..8976da38b3 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -1383,7 +1383,8 @@ static void nvmem_shift_read_buffer_in_place(struct nvmem_cell *cell, void *buf) *p-- = 0; /* clear msb bits if any leftover in the last byte */ - *p &= GENMASK((cell->nbits%BITS_PER_BYTE) - 1, 0); + if (cell->nbits % BITS_PER_BYTE) + *p &= GENMASK((cell->nbits % BITS_PER_BYTE) - 1, 0); } static int __nvmem_cell_read(struct nvmem_device *nvmem, diff --git a/drivers/of/base.c b/drivers/of/base.c index f720c0d246..0ac1725625 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -36,6 +36,7 @@ LIST_HEAD(aliases_lookup); struct device_node *of_root; EXPORT_SYMBOL(of_root); struct device_node *of_chosen; +EXPORT_SYMBOL(of_chosen); struct device_node *of_aliases; struct device_node *of_stdout; static const char *of_stdout_options; diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index eaec915ffe..67c46e52c0 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3301,9 +3301,17 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) return 0; if (!keep_devs) { - /* Delete any children which might still exist.
*/ + struct list_head removed; + + /* Move all present children to the list on stack */ + INIT_LIST_HEAD(&removed); spin_lock_irqsave(&hbus->device_list_lock, flags); - list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry) { + list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry) + list_move_tail(&hpdev->list_entry, &removed); + spin_unlock_irqrestore(&hbus->device_list_lock, flags); + + /* Remove all children in the list */ + list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) { list_del(&hpdev->list_entry); if (hpdev->pci_slot) pci_destroy_slot(hpdev->pci_slot); @@ -3311,7 +3319,6 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) put_pcichild(hpdev); put_pcichild(hpdev); } - spin_unlock_irqrestore(&hbus->device_list_lock, flags); } ret = hv_send_resources_released(hdev); diff --git a/drivers/pci/hotplug/s390_pci_hpc.c b/drivers/pci/hotplug/s390_pci_hpc.c index 014868752c..dcefdb42ac 100644 --- a/drivers/pci/hotplug/s390_pci_hpc.c +++ b/drivers/pci/hotplug/s390_pci_hpc.c @@ -62,14 +62,7 @@ static int get_power_status(struct hotplug_slot *hotplug_slot, u8 *value) struct zpci_dev *zdev = container_of(hotplug_slot, struct zpci_dev, hotplug_slot); - switch (zdev->state) { - case ZPCI_FN_STATE_STANDBY: - *value = 0; - break; - default: - *value = 1; - break; - } + *value = zpci_is_device_configured(zdev) ? 1 : 0; return 0; } diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 0099a00af3..4b4792940e 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -535,6 +535,7 @@ static int msi_verify_entries(struct pci_dev *dev) static int msi_capability_init(struct pci_dev *dev, int nvec, struct irq_affinity *affd) { + const struct attribute_group **groups; struct msi_desc *entry; int ret; @@ -558,12 +559,14 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, if (ret) goto err; - dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); - if (IS_ERR(dev->msi_irq_groups)) { - ret = PTR_ERR(dev->msi_irq_groups); + groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(groups)) { + ret = PTR_ERR(groups); goto err; } + dev->msi_irq_groups = groups; + /* Set MSI enabled bits */ pci_intx_for_msi(dev, 0); pci_msi_set_enable(dev, 1); @@ -691,6 +694,7 @@ static void msix_mask_all(void __iomem *base, int tsize) static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, int nvec, struct irq_affinity *affd) { + const struct attribute_group **groups; void __iomem *base; int ret, tsize; u16 control; @@ -730,12 +734,14 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, msix_update_entries(dev, entries); - dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); - if (IS_ERR(dev->msi_irq_groups)) { - ret = PTR_ERR(dev->msi_irq_groups); + groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(groups)) { + ret = PTR_ERR(groups); goto out_free; } + dev->msi_irq_groups = groups; + /* Set MSI-X enabled bits and unmask the function */ pci_intx_for_msi(dev, 0); dev->msix_enabled = 1; diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 0f40943a9a..260a06fb78 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1249,6 +1249,9 @@ static struct acpi_device *acpi_pci_find_companion(struct device *dev) bool check_children; u64 addr; + if (!dev->parent) + return NULL; + down_read(&pci_acpi_companion_lookup_sem); adev = pci_acpi_find_companion_hook ? 
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 3cbc3baf08..295cc7952d 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -952,6 +952,8 @@ int armpmu_register(struct arm_pmu *pmu) pmu->name, pmu->num_events, has_nmi ? ", using NMIs" : ""); + kvm_host_pmu_init(pmu); + return 0; out_destroy: diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index a4ac87c8b4..5082102d7d 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -2306,7 +2306,7 @@ EXPORT_SYMBOL_GPL(devm_pinctrl_register_and_init); /** * devm_pinctrl_unregister() - Resource managed version of pinctrl_unregister(). - * @dev: device for which which resource was allocated + * @dev: device for which resource was allocated * @pctldev: the pinctrl device to unregister. */ void devm_pinctrl_unregister(struct device *dev, struct pinctrl_dev *pctldev) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index c001f2ed20..8d0f88e9ca 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -445,6 +445,7 @@ static int amd_gpio_irq_set_wake(struct irq_data *d, unsigned int on) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); u32 wake_mask = BIT(WAKE_CNTRL_OFF_S0I3) | BIT(WAKE_CNTRL_OFF_S3); + int err; raw_spin_lock_irqsave(&gpio_dev->lock, flags); pin_reg = readl(gpio_dev->base + (d->hwirq)*4); @@ -457,6 +458,15 @@ static int amd_gpio_irq_set_wake(struct irq_data *d, unsigned int on) writel(pin_reg, gpio_dev->base + (d->hwirq)*4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); + if (on) + err = enable_irq_wake(gpio_dev->irq); + else + err = disable_irq_wake(gpio_dev->irq); + + if (err) + dev_err(&gpio_dev->pdev->dev, "failed to %s wake-up interrupt\n", + on ? 
"enable" : "disable"); + return 0; } @@ -902,7 +912,6 @@ static struct pinctrl_desc amd_pinctrl_desc = { static int amd_gpio_probe(struct platform_device *pdev) { int ret = 0; - int irq_base; struct resource *res; struct amd_gpio *gpio_dev; struct gpio_irq_chip *girq; @@ -925,9 +934,9 @@ static int amd_gpio_probe(struct platform_device *pdev) if (!gpio_dev->base) return -ENOMEM; - irq_base = platform_get_irq(pdev, 0); - if (irq_base < 0) - return irq_base; + gpio_dev->irq = platform_get_irq(pdev, 0); + if (gpio_dev->irq < 0) + return gpio_dev->irq; #ifdef CONFIG_PM_SLEEP gpio_dev->saved_regs = devm_kcalloc(&pdev->dev, amd_pinctrl_desc.npins, @@ -987,7 +996,7 @@ static int amd_gpio_probe(struct platform_device *pdev) goto out2; } - ret = devm_request_irq(&pdev->dev, irq_base, amd_gpio_irq_handler, + ret = devm_request_irq(&pdev->dev, gpio_dev->irq, amd_gpio_irq_handler, IRQF_SHARED, KBUILD_MODNAME, gpio_dev); if (ret) goto out2; diff --git a/drivers/pinctrl/pinctrl-amd.h b/drivers/pinctrl/pinctrl-amd.h index 95e7634240..1d43170736 100644 --- a/drivers/pinctrl/pinctrl-amd.h +++ b/drivers/pinctrl/pinctrl-amd.h @@ -98,6 +98,7 @@ struct amd_gpio { struct resource *res; struct platform_device *pdev; u32 *saved_regs; + int irq; }; /* KERNCZ configuration*/ diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index ae33e37669..5ce260f152 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -2092,6 +2092,23 @@ static bool rockchip_pinconf_pull_valid(struct rockchip_pin_ctrl *ctrl, return false; } +static int rockchip_pinconf_defer_output(struct rockchip_pin_bank *bank, + unsigned int pin, u32 arg) +{ + struct rockchip_pin_output_deferred *cfg; + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return -ENOMEM; + + cfg->pin = pin; + cfg->arg = arg; + + list_add_tail(&cfg->head, &bank->deferred_output); + + return 0; +} + /* set the pin config settings for a specified pin */ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, unsigned long *configs, unsigned num_configs) @@ -2136,6 +2153,22 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, if (rc != RK_FUNC_GPIO) return -EINVAL; + /* + * Check for gpio driver not being probed yet. + * The lock makes sure that either gpio-probe has completed + * or the gpio driver hasn't probed yet. 
+ */ + mutex_lock(&bank->deferred_lock); + if (!gpio || !gpio->direction_output) { + rc = rockchip_pinconf_defer_output(bank, pin - bank->pin_base, arg); + mutex_unlock(&bank->deferred_lock); + if (rc) + return rc; + + break; + } + mutex_unlock(&bank->deferred_lock); + rc = gpio->direction_output(gpio, pin - bank->pin_base, arg); if (rc) @@ -2204,6 +2237,11 @@ static int rockchip_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, if (rc != RK_FUNC_GPIO) return -EINVAL; + if (!gpio || !gpio->get) { + arg = 0; + break; + } + rc = gpio->get(gpio, pin - bank->pin_base); if (rc < 0) return rc; @@ -2450,6 +2488,9 @@ static int rockchip_pinctrl_register(struct platform_device *pdev, pin_bank->name, pin); pdesc++; } + + INIT_LIST_HEAD(&pin_bank->deferred_output); + mutex_init(&pin_bank->deferred_lock); } ret = rockchip_pinctrl_parse_dt(pdev, info); @@ -2716,6 +2757,31 @@ static int rockchip_pinctrl_probe(struct platform_device *pdev) return 0; } +static int rockchip_pinctrl_remove(struct platform_device *pdev) +{ + struct rockchip_pinctrl *info = platform_get_drvdata(pdev); + struct rockchip_pin_bank *bank; + struct rockchip_pin_output_deferred *cfg; + int i; + + of_platform_depopulate(&pdev->dev); + + for (i = 0; i < info->ctrl->nr_banks; i++) { + bank = &info->ctrl->pin_banks[i]; + + mutex_lock(&bank->deferred_lock); + while (!list_empty(&bank->deferred_output)) { + cfg = list_first_entry(&bank->deferred_output, + struct rockchip_pin_output_deferred, head); + list_del(&cfg->head); + kfree(cfg); + } + mutex_unlock(&bank->deferred_lock); + } + + return 0; +} + static struct rockchip_pin_bank px30_pin_banks[] = { PIN_BANK_IOMUX_FLAGS(0, 32, "gpio0", IOMUX_SOURCE_PMU, IOMUX_SOURCE_PMU, @@ -3175,6 +3241,7 @@ static const struct of_device_id rockchip_pinctrl_dt_match[] = { static struct platform_driver rockchip_pinctrl_driver = { .probe = rockchip_pinctrl_probe, + .remove = rockchip_pinctrl_remove, .driver = { .name = "rockchip-pinctrl", .pm = &rockchip_pinctrl_dev_pm_ops, diff --git a/drivers/pinctrl/pinctrl-rockchip.h b/drivers/pinctrl/pinctrl-rockchip.h index 589d4d2a98..91f10279d0 100644 --- a/drivers/pinctrl/pinctrl-rockchip.h +++ b/drivers/pinctrl/pinctrl-rockchip.h @@ -141,6 +141,8 @@ struct rockchip_drv { * @toggle_edge_mode: bit mask to toggle (falling/rising) edge mode * @recalced_mask: bit mask to indicate a need to recalculate the mask * @route_mask: bits describing the routing pins of per bank + * @deferred_output: gpio output settings to be done after gpio bank probed + * @deferred_lock: mutex for the deferred_output shared between gpio and pinctrl */ struct rockchip_pin_bank { struct device *dev; @@ -169,6 +171,8 @@ struct rockchip_pin_bank { u32 toggle_edge_mode; u32 recalced_mask; u32 route_mask; + struct list_head deferred_output; + struct mutex deferred_lock; }; /** @@ -243,6 +247,12 @@ struct rockchip_pin_config { unsigned int nconfigs; }; +struct rockchip_pin_output_deferred { + struct list_head head; + unsigned int pin; + u32 arg; +}; + /** * struct rockchip_pin_group: represent group of pins of a pinmux function. * @name: name of the pin group, used to lookup the group.
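The rockchip hunks above implement a defer-and-drain pattern: output settings requested before the GPIO half has probed are queued on a mutex-protected per-bank list, and the new remove() callback drains whatever was never consumed so nothing leaks. A minimal userspace C sketch of the same idea; the names and the plain singly linked list are stand-ins for the kernel's list_head, mutex and GPIO hooks, not the driver's actual API:

#include <stdio.h>
#include <stdlib.h>

struct deferred_output {                /* cf. rockchip_pin_output_deferred */
        unsigned int pin;
        unsigned int arg;
        struct deferred_output *next;
};

static struct deferred_output *deferred_list;   /* cf. bank->deferred_output */
static int gpio_probed;                 /* stands in for gpio->direction_output != NULL */

/* Queue the request instead of applying it (cf. rockchip_pinconf_defer_output). */
static int defer_output(unsigned int pin, unsigned int arg)
{
        struct deferred_output *cfg = malloc(sizeof(*cfg));

        if (!cfg)
                return -1;
        cfg->pin = pin;
        cfg->arg = arg;
        cfg->next = deferred_list;
        deferred_list = cfg;
        return 0;
}

/* Drain once the consumer exists, or unconditionally on teardown (cf. remove()). */
static void drain_deferred(void)
{
        while (deferred_list) {
                struct deferred_output *cfg = deferred_list;

                deferred_list = cfg->next;
                if (gpio_probed)
                        printf("direction_output(pin=%u, arg=%u)\n", cfg->pin, cfg->arg);
                free(cfg);      /* freed either way, so teardown never leaks */
        }
}

int main(void)
{
        defer_output(3, 1);     /* arrives before the gpio driver */
        defer_output(7, 0);
        gpio_probed = 1;        /* gpio probe completes */
        drain_deferred();
        return 0;
}

The key design point mirrored here is that producers and the drain take the same lock, so a setting is either applied directly or sits on the list; it can never be lost in between.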
diff --git a/drivers/pinctrl/qcom/Kconfig b/drivers/pinctrl/qcom/Kconfig index 32ea2a8ec0..5ff4207df6 100644 --- a/drivers/pinctrl/qcom/Kconfig +++ b/drivers/pinctrl/qcom/Kconfig @@ -3,7 +3,8 @@ if (ARCH_QCOM || COMPILE_TEST) config PINCTRL_MSM tristate "Qualcomm core pin controller driver" - depends on GPIOLIB && (QCOM_SCM || !QCOM_SCM) #if QCOM_SCM=m this can't be =y + depends on GPIOLIB + select QCOM_SCM select PINMUX select PINCONF select GENERIC_PINCONF diff --git a/drivers/pinctrl/qcom/pinctrl-sc7280.c b/drivers/pinctrl/qcom/pinctrl-sc7280.c index afddf6d60d..9017ede409 100644 --- a/drivers/pinctrl/qcom/pinctrl-sc7280.c +++ b/drivers/pinctrl/qcom/pinctrl-sc7280.c @@ -1496,6 +1496,7 @@ static const struct of_device_id sc7280_pinctrl_of_match[] = { static struct platform_driver sc7280_pinctrl_driver = { .driver = { .name = "sc7280-pinctrl", + .pm = &msm_pinctrl_dev_pm_ops, .of_match_table = sc7280_pinctrl_of_match, }, .probe = sc7280_pinctrl_probe, diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index 98bf0e2a2a..b2562e8931 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2012-2014, The Linux Foundation. All rights reserved. + * Copyright (c) 2012-2014, 2016-2021 The Linux Foundation. All rights reserved. */ #include @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -171,6 +172,8 @@ struct pmic_gpio_state { struct pinctrl_dev *ctrl; struct gpio_chip chip; struct irq_chip irq; + u8 usid; + u8 pid_base; }; static const struct pinconf_generic_params pmic_gpio_bindings[] = { @@ -949,12 +952,36 @@ static int pmic_gpio_child_to_parent_hwirq(struct gpio_chip *chip, unsigned int *parent_hwirq, unsigned int *parent_type) { - *parent_hwirq = child_hwirq + 0xc0; + struct pmic_gpio_state *state = gpiochip_get_data(chip); + + *parent_hwirq = child_hwirq + state->pid_base; *parent_type = child_type; return 0; } +static void *pmic_gpio_populate_parent_fwspec(struct gpio_chip *chip, + unsigned int parent_hwirq, + unsigned int parent_type) +{ + struct pmic_gpio_state *state = gpiochip_get_data(chip); + struct irq_fwspec *fwspec; + + fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL); + if (!fwspec) + return NULL; + + fwspec->fwnode = chip->irq.parent_domain->fwnode; + + fwspec->param_count = 4; + fwspec->param[0] = state->usid; + fwspec->param[1] = parent_hwirq; + /* param[2] must be left as 0 */ + fwspec->param[3] = parent_type; + + return fwspec; +} + static int pmic_gpio_probe(struct platform_device *pdev) { struct irq_domain *parent_domain; @@ -965,6 +992,7 @@ static int pmic_gpio_probe(struct platform_device *pdev) struct pmic_gpio_pad *pad, *pads; struct pmic_gpio_state *state; struct gpio_irq_chip *girq; + const struct spmi_device *parent_spmi_dev; int ret, npins, i; u32 reg; @@ -984,6 +1012,9 @@ static int pmic_gpio_probe(struct platform_device *pdev) state->dev = &pdev->dev; state->map = dev_get_regmap(dev->parent, NULL); + parent_spmi_dev = to_spmi_device(dev->parent); + state->usid = parent_spmi_dev->usid; + state->pid_base = reg >> 8; pindesc = devm_kcalloc(dev, npins, sizeof(*pindesc), GFP_KERNEL); if (!pindesc) @@ -1059,7 +1090,7 @@ static int pmic_gpio_probe(struct platform_device *pdev) girq->fwnode = of_node_to_fwnode(state->dev->of_node); girq->parent_domain = parent_domain; girq->child_to_parent_hwirq = pmic_gpio_child_to_parent_hwirq; - girq->populate_parent_alloc_arg = 
gpiochip_populate_parent_fwspec_fourcell; + girq->populate_parent_alloc_arg = pmic_gpio_populate_parent_fwspec; girq->child_offset_to_irq = pmic_gpio_child_offset_to_irq; girq->child_irq_domain_ops.translate = pmic_gpio_domain_translate; diff --git a/drivers/platform/mellanox/mlxreg-io.c b/drivers/platform/mellanox/mlxreg-io.c index 7646708d57..a916cd89cb 100644 --- a/drivers/platform/mellanox/mlxreg-io.c +++ b/drivers/platform/mellanox/mlxreg-io.c @@ -98,7 +98,7 @@ mlxreg_io_get_reg(void *regmap, struct mlxreg_core_data *data, u32 in_val, if (ret) goto access_error; - *regval |= rol32(val, regsize * i); + *regval |= rol32(val, regsize * i * 8); } } @@ -141,7 +141,7 @@ mlxreg_io_attr_store(struct device *dev, struct device_attribute *attr, return -EINVAL; /* Convert buffer to input value. */ - ret = kstrtou32(buf, len, &input_val); + ret = kstrtou32(buf, 0, &input_val); if (ret) return ret; diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c index d6a7c896ac..fc95620101 100644 --- a/drivers/platform/x86/amd-pmc.c +++ b/drivers/platform/x86/amd-pmc.c @@ -476,6 +476,7 @@ static const struct acpi_device_id amd_pmc_acpi_ids[] = { {"AMDI0006", 0}, {"AMDI0007", 0}, {"AMD0004", 0}, + {"AMD0005", 0}, { } }; MODULE_DEVICE_TABLE(acpi, amd_pmc_acpi_ids); diff --git a/drivers/platform/x86/dell/Kconfig b/drivers/platform/x86/dell/Kconfig index 42513eab1d..2fffa57e59 100644 --- a/drivers/platform/x86/dell/Kconfig +++ b/drivers/platform/x86/dell/Kconfig @@ -167,6 +167,7 @@ config DELL_WMI config DELL_WMI_PRIVACY bool "Dell WMI Hardware Privacy Support" depends on LEDS_TRIGGER_AUDIO = y || DELL_WMI = LEDS_TRIGGER_AUDIO + depends on DELL_WMI help This option adds integration with the "Dell Hardware Privacy" feature of Dell laptops to the dell-wmi driver. 
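The mlxreg-io hunk above is a units fix: rol32() rotates by bit positions, so placing the i-th register of regsize bytes into the accumulated value needs a rotation of regsize * i * 8, not regsize * i. A self-contained C illustration; rol32 is reimplemented here only for the demo (the driver gets it from the kernel's bitops), and the register values are made up:

#include <stdio.h>
#include <stdint.h>

/* same semantics as the kernel's rol32(): rotate left by a bit count */
static uint32_t rol32(uint32_t word, unsigned int shift)
{
        return (word << (shift & 31)) | (word >> ((32 - shift) & 31));
}

int main(void)
{
        const unsigned int regsize = 2;             /* two-byte registers */
        const uint32_t val[2] = { 0x1234, 0xabcd }; /* values read back */
        uint32_t buggy = 0, fixed = 0;

        for (unsigned int i = 0; i < 2; i++) {
                buggy |= rol32(val[i], regsize * i);        /* rotates by 2 bits */
                fixed |= rol32(val[i], regsize * i * 8);    /* rotates by 16 bits */
        }
        printf("buggy=0x%08x fixed=0x%08x\n", buggy, fixed);
        /* fixed packs 0xabcd1234; buggy smears both values into the low bits */
        return 0;
}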
diff --git a/drivers/platform/x86/gigabyte-wmi.c b/drivers/platform/x86/gigabyte-wmi.c index d53634c8a6..658bab4b79 100644 --- a/drivers/platform/x86/gigabyte-wmi.c +++ b/drivers/platform/x86/gigabyte-wmi.c @@ -141,6 +141,7 @@ static u8 gigabyte_wmi_detect_sensor_usability(struct wmi_device *wdev) static const struct dmi_system_id gigabyte_wmi_known_working_platforms[] = { DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B450M S2H V2"), + DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE AX V2"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE V2"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 GAMING X V2"), diff --git a/drivers/platform/x86/intel/int1092/intel_sar.c b/drivers/platform/x86/intel/int1092/intel_sar.c index 379560fe5d..e03943e638 100644 --- a/drivers/platform/x86/intel/int1092/intel_sar.c +++ b/drivers/platform/x86/intel/int1092/intel_sar.c @@ -42,12 +42,20 @@ static void update_sar_data(struct wwan_sar_context *context) if (config->device_mode_info && context->sar_data.device_mode < config->total_dev_mode) { - struct wwan_device_mode_info *dev_mode = - &config->device_mode_info[context->sar_data.device_mode]; + int itr = 0; - context->sar_data.antennatable_index = dev_mode->antennatable_index; - context->sar_data.bandtable_index = dev_mode->bandtable_index; - context->sar_data.sartable_index = dev_mode->sartable_index; + for (itr = 0; itr < config->total_dev_mode; itr++) { + if (context->sar_data.device_mode == + config->device_mode_info[itr].device_mode) { + struct wwan_device_mode_info *dev_mode = + &config->device_mode_info[itr]; + + context->sar_data.antennatable_index = dev_mode->antennatable_index; + context->sar_data.bandtable_index = dev_mode->bandtable_index; + context->sar_data.sartable_index = dev_mode->sartable_index; + break; + } + } } } @@ -305,7 +313,6 @@ static struct platform_driver sar_driver = { .remove = sar_remove, .driver = { .name = DRVNAME, - .owner = THIS_MODULE, .acpi_match_table = ACPI_PTR(sar_device_ids) } }; @@ -313,4 +320,4 @@ module_platform_driver(sar_driver); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("Platform device driver for INTEL MODEM BIOS SAR"); -MODULE_AUTHOR("Shravan S "); +MODULE_AUTHOR("Shravan Sudhakar "); diff --git a/drivers/platform/x86/intel/int3472/intel_skl_int3472_discrete.c b/drivers/platform/x86/intel/int3472/intel_skl_int3472_discrete.c index 9fe0a2527e..e59d79c7e8 100644 --- a/drivers/platform/x86/intel/int3472/intel_skl_int3472_discrete.c +++ b/drivers/platform/x86/intel/int3472/intel_skl_int3472_discrete.c @@ -401,7 +401,7 @@ int skl_int3472_discrete_remove(struct platform_device *pdev) gpiod_remove_lookup_table(&int3472->gpios); - if (int3472->clock.ena_gpio) + if (int3472->clock.cl) skl_int3472_unregister_clock(int3472); gpiod_put(int3472->clock.ena_gpio); diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index bfa0cc2075..7cc9089d1e 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -75,7 +75,7 @@ struct intel_scu_ipc_dev { #define IPC_READ_BUFFER 0x90 /* Timeout in jiffies */ -#define IPC_TIMEOUT (5 * HZ) +#define IPC_TIMEOUT (10 * HZ) static struct intel_scu_ipc_dev *ipcdev; /* Only one for now */ static DEFINE_MUTEX(ipclock); /* lock used to prevent multiple call to SCU */ @@ -232,7 +232,7 @@ static inline u32 ipc_data_readl(struct intel_scu_ipc_dev *scu, u32 offset) /* Wait till scu status is busy */ static inline int busy_loop(struct intel_scu_ipc_dev *scu) { - unsigned long 
end = jiffies + msecs_to_jiffies(IPC_TIMEOUT); + unsigned long end = jiffies + IPC_TIMEOUT; do { u32 status; @@ -247,7 +247,7 @@ static inline int busy_loop(struct intel_scu_ipc_dev *scu) return -ETIMEDOUT; } -/* Wait till ipc ioc interrupt is received or timeout in 3 HZ */ +/* Wait till ipc ioc interrupt is received or timeout in 10 HZ */ static inline int ipc_wait_for_interrupt(struct intel_scu_ipc_dev *scu) { int status; diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c index 3dd519dfc4..d0096cd709 100644 --- a/drivers/ptp/ptp_kvm_x86.c +++ b/drivers/ptp/ptp_kvm_x86.c @@ -15,8 +15,6 @@ #include #include -struct pvclock_vsyscall_time_info *hv_clock; - static phys_addr_t clock_pair_gpa; static struct kvm_clock_pairing clock_pair; @@ -28,8 +26,7 @@ int kvm_arch_ptp_init(void) return -ENODEV; clock_pair_gpa = slow_virt_to_phys(&clock_pair); - hv_clock = pvclock_get_pvti_cpu0_va(); - if (!hv_clock) + if (!pvclock_get_pvti_cpu0_va()) return -ENODEV; ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, @@ -64,10 +61,8 @@ int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec, struct pvclock_vcpu_time_info *src; unsigned int version; long ret; - int cpu; - cpu = smp_processor_id(); - src = &hv_clock[cpu].pvti; + src = this_cpu_pvti(); do { /* diff --git a/drivers/ptp/ptp_pch.c b/drivers/ptp/ptp_pch.c index a17e8cc642..8070f3fd98 100644 --- a/drivers/ptp/ptp_pch.c +++ b/drivers/ptp/ptp_pch.c @@ -644,6 +644,7 @@ static const struct pci_device_id pch_ieee1588_pcidev_id[] = { }, {0} }; +MODULE_DEVICE_TABLE(pci, pch_ieee1588_pcidev_id); static SIMPLE_DEV_PM_OPS(pch_pm_ops, pch_suspend, pch_resume); diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c index 14da4ab301..ea75b71a1f 100644 --- a/drivers/rtc/rtc-pcf85063.c +++ b/drivers/rtc/rtc-pcf85063.c @@ -34,6 +34,7 @@ #define PCF85063_REG_CTRL1 0x00 /* status */ #define PCF85063_REG_CTRL1_CAP_SEL BIT(0) #define PCF85063_REG_CTRL1_STOP BIT(5) +#define PCF85063_REG_CTRL1_EXT_TEST BIT(7) #define PCF85063_REG_CTRL2 0x01 #define PCF85063_CTRL2_AF BIT(6) @@ -117,6 +118,7 @@ static int pcf85063_rtc_set_time(struct device *dev, struct rtc_time *tm) * reset state until all time/date registers are written */ rc = regmap_update_bits(pcf85063->regmap, PCF85063_REG_CTRL1, + PCF85063_REG_CTRL1_EXT_TEST | PCF85063_REG_CTRL1_STOP, PCF85063_REG_CTRL1_STOP); if (rc) diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c index f3c656975e..93695d5353 100644 --- a/drivers/s390/cio/blacklist.c +++ b/drivers/s390/cio/blacklist.c @@ -262,10 +262,12 @@ static int blacklist_parse_proc_parameters(char *buf) if (strcmp("free", parm) == 0) { rc = blacklist_parse_parameters(buf, free, 0); - /* There could be subchannels without proper devices connected. - * evaluate all the entries + /* + * Evaluate the subchannels without an online device. This way, + * no path-verification will be triggered on those subchannels + * and it avoids unnecessary delays. 
*/ - css_schedule_eval_all(); + css_schedule_eval_cond(CSS_EVAL_NOT_ONLINE, 0); } else if (strcmp("add", parm) == 0) rc = blacklist_parse_parameters(buf, add, 0); else if (strcmp("purge", parm) == 0) diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 3377097e65..44461928aa 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -788,27 +788,49 @@ static int __unset_registered(struct device *dev, void *data) return 0; } -void css_schedule_eval_all_unreg(unsigned long delay) +static int __unset_online(struct device *dev, void *data) +{ + struct idset *set = data; + struct subchannel *sch = to_subchannel(dev); + struct ccw_device *cdev = sch_get_cdev(sch); + + if (cdev && cdev->online) + idset_sch_del(set, sch->schid); + + return 0; +} + +void css_schedule_eval_cond(enum css_eval_cond cond, unsigned long delay) { unsigned long flags; - struct idset *unreg_set; + struct idset *set; /* Find unregistered subchannels. */ - unreg_set = idset_sch_new(); - if (!unreg_set) { + set = idset_sch_new(); + if (!set) { /* Fallback. */ css_schedule_eval_all(); return; } - idset_fill(unreg_set); - bus_for_each_dev(&css_bus_type, NULL, unreg_set, __unset_registered); + idset_fill(set); + switch (cond) { + case CSS_EVAL_UNREG: + bus_for_each_dev(&css_bus_type, NULL, set, __unset_registered); + break; + case CSS_EVAL_NOT_ONLINE: + bus_for_each_dev(&css_bus_type, NULL, set, __unset_online); + break; + default: + break; + } + /* Apply to slow_subchannel_set. */ spin_lock_irqsave(&slow_subchannel_lock, flags); - idset_add_set(slow_subchannel_set, unreg_set); + idset_add_set(slow_subchannel_set, set); atomic_set(&css_eval_scheduled, 1); queue_delayed_work(cio_work_q, &slow_path_work, delay); spin_unlock_irqrestore(&slow_subchannel_lock, flags); - idset_free(unreg_set); + idset_free(set); } void css_wait_for_slow_path(void) @@ -820,7 +842,7 @@ void css_wait_for_slow_path(void) void css_schedule_reprobe(void) { /* Schedule with a delay to allow merging of subsequent calls. */ - css_schedule_eval_all_unreg(1 * HZ); + css_schedule_eval_cond(CSS_EVAL_UNREG, 1 * HZ); } EXPORT_SYMBOL_GPL(css_schedule_reprobe); diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h index c98522cbe2..ede0b905bc 100644 --- a/drivers/s390/cio/css.h +++ b/drivers/s390/cio/css.h @@ -34,6 +34,14 @@ #define SNID_STATE3_MULTI_PATH 1 #define SNID_STATE3_SINGLE_PATH 0 +/* + * Conditions used to specify which subchannels need evaluation + */ +enum css_eval_cond { + CSS_EVAL_UNREG, /* unregistered subchannels */ + CSS_EVAL_NOT_ONLINE /* sch without an online-device */ +}; + struct path_state { __u8 state1 : 2; /* path state value 1 */ __u8 state2 : 2; /* path state value 2 */ @@ -136,7 +144,7 @@ static inline struct channel_subsystem *css_by_id(u8 cssid) /* Helper functions to build lists for the slow path. 
*/ void css_schedule_eval(struct subchannel_id schid); void css_schedule_eval_all(void); -void css_schedule_eval_all_unreg(unsigned long delay); +void css_schedule_eval_cond(enum css_eval_cond, unsigned long delay); int css_complete_work(void); int sch_is_pseudo_sch(struct subchannel *); diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 118939a772..623d5269a5 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -361,6 +361,7 @@ static int vfio_ap_mdev_probe(struct mdev_device *mdev) mutex_lock(&matrix_dev->lock); list_del(&matrix_mdev->node); mutex_unlock(&matrix_dev->lock); + vfio_uninit_group_dev(&matrix_mdev->vdev); kfree(matrix_mdev); err_dec_available: atomic_inc(&matrix_dev->available_instances); @@ -376,9 +377,10 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) mutex_lock(&matrix_dev->lock); vfio_ap_mdev_reset_queues(matrix_mdev); list_del(&matrix_mdev->node); + mutex_unlock(&matrix_dev->lock); + vfio_uninit_group_dev(&matrix_mdev->vdev); kfree(matrix_mdev); atomic_inc(&matrix_dev->available_instances); - mutex_unlock(&matrix_dev->lock); } static ssize_t name_show(struct mdev_type *mtype, diff --git a/drivers/scsi/arm/acornscsi.c b/drivers/scsi/arm/acornscsi.c index b4cb5fb199..0cc62c1b08 100644 --- a/drivers/scsi/arm/acornscsi.c +++ b/drivers/scsi/arm/acornscsi.c @@ -1776,7 +1776,7 @@ int acornscsi_reconnect_finish(AS_Host *host) host->scsi.disconnectable = 0; if (host->SCpnt->device->id == host->scsi.reconnected.target && host->SCpnt->device->lun == host->scsi.reconnected.lun && - scsi_cmd_to_tag(host->SCpnt) == host->scsi.reconnected.tag) { + scsi_cmd_to_rq(host->SCpnt)->tag == host->scsi.reconnected.tag) { #if (DEBUG & (DEBUG_QUEUES|DEBUG_DISCON)) DBG(host->SCpnt, printk("scsi%d.%c: reconnected", host->host->host_no, acornscsi_target(host))); diff --git a/drivers/scsi/csiostor/csio_init.c b/drivers/scsi/csiostor/csio_init.c index 390b07bf92..ccbded3353 100644 --- a/drivers/scsi/csiostor/csio_init.c +++ b/drivers/scsi/csiostor/csio_init.c @@ -1254,3 +1254,4 @@ MODULE_DEVICE_TABLE(pci, csio_pci_tbl); MODULE_VERSION(CSIO_DRV_VERSION); MODULE_FIRMWARE(FW_FNAME_T5); MODULE_FIRMWARE(FW_FNAME_T6); +MODULE_SOFTDEP("pre: cxgb4"); diff --git a/drivers/scsi/elx/efct/efct_scsi.c b/drivers/scsi/elx/efct/efct_scsi.c index 40fb3a724c..cf2e41dd35 100644 --- a/drivers/scsi/elx/efct/efct_scsi.c +++ b/drivers/scsi/elx/efct/efct_scsi.c @@ -32,7 +32,7 @@ efct_scsi_io_alloc(struct efct_node *node) struct efct *efct; struct efct_xport *xport; struct efct_io *io; - unsigned long flags = 0; + unsigned long flags; efct = node->efct; @@ -44,7 +44,6 @@ efct_scsi_io_alloc(struct efct_node *node) if (!io) { efc_log_err(efct, "IO alloc Failed\n"); atomic_add_return(1, &xport->io_alloc_failed_count); - spin_unlock_irqrestore(&node->active_ios_lock, flags); return NULL; } diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 4683c183e9..5bc91d34df 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2281,11 +2281,6 @@ int iscsi_eh_abort(struct scsi_cmnd *sc) return FAILED; } - conn = session->leadconn; - iscsi_get_conn(conn->cls_conn); - conn->eh_abort_cnt++; - age = session->age; - spin_lock(&session->back_lock); task = (struct iscsi_task *)sc->SCp.ptr; if (!task || !task->sc) { @@ -2293,8 +2288,16 @@ int iscsi_eh_abort(struct scsi_cmnd *sc) ISCSI_DBG_EH(session, "sc completed while abort in progress\n"); spin_unlock(&session->back_lock); - goto success; + 
spin_unlock_bh(&session->frwd_lock); + mutex_unlock(&session->eh_mutex); + return SUCCESS; } + + conn = session->leadconn; + iscsi_get_conn(conn->cls_conn); + conn->eh_abort_cnt++; + age = session->age; + ISCSI_DBG_EH(session, "aborting [sc %p itt 0x%x]\n", sc, task->itt); __iscsi_get_task(task); spin_unlock(&session->back_lock); diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 78ce38d725..026a1196a5 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -12292,12 +12292,12 @@ void lpfc_ignore_els_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, struct lpfc_iocbq *rspiocb) { - struct lpfc_nodelist *ndlp = (struct lpfc_nodelist *) cmdiocb->context1; + struct lpfc_nodelist *ndlp = NULL; IOCB_t *irsp = &rspiocb->iocb; /* ELS cmd tag completes */ lpfc_printf_log(phba, KERN_INFO, LOG_ELS, - "0139 Ignoring ELS cmd tag x%x completion Data: " + "0139 Ignoring ELS cmd code x%x completion Data: " "x%x x%x x%x\n", irsp->ulpIoTag, irsp->ulpStatus, irsp->un.ulpWord[4], irsp->ulpTimeout); @@ -12305,10 +12305,13 @@ lpfc_ignore_els_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, * Deref the ndlp after free_iocb. sli_release_iocb will access the ndlp * if exchange is busy. */ - if (cmdiocb->iocb.ulpCommand == CMD_GEN_REQUEST64_CR) + if (cmdiocb->iocb.ulpCommand == CMD_GEN_REQUEST64_CR) { + ndlp = cmdiocb->context_un.ndlp; lpfc_ct_free_iocb(phba, cmdiocb); - else + } else { + ndlp = (struct lpfc_nodelist *) cmdiocb->context1; lpfc_els_free_iocb(phba, cmdiocb); + } lpfc_nlp_put(ndlp); } diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index ece60267b9..b26f2699ad 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -2634,7 +2634,7 @@ static void qla24xx_nvme_iocb_entry(scsi_qla_host_t *vha, struct req_que *req, } if (unlikely(logit)) - ql_log(ql_log_warn, fcport->vha, 0x5060, + ql_log(ql_dbg_io, fcport->vha, 0x5060, "NVME-%s ERR Handling - hdl=%x status(%x) tr_len:%x resid=%x ox_id=%x\n", sp->name, sp->handle, comp_status, fd->transferred_length, le32_to_cpu(sts->residual_len), @@ -3491,7 +3491,7 @@ qla2x00_status_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, void *pkt) out: if (logit) - ql_log(ql_log_warn, fcport->vha, 0x3022, + ql_log(ql_dbg_io, fcport->vha, 0x3022, "FCP command status: 0x%x-0x%x (0x%x) nexus=%ld:%d:%llu portid=%02x%02x%02x oxid=0x%x cdb=%10phN len=0x%x rsp_info=0x%x resid=0x%x fw_resid=0x%x sp=%p cp=%p.\n", comp_status, scsi_status, res, vha->host_no, cp->device->id, cp->device->lun, fcport->d_id.b.domain, diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 029c9631ec..95be7ecdfe 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -318,8 +318,7 @@ static void ufshcd_add_query_upiu_trace(struct ufs_hba *hba, static void ufshcd_add_tm_upiu_trace(struct ufs_hba *hba, unsigned int tag, enum ufs_trace_str_t str_t) { - int off = (int)tag - hba->nutrs; - struct utp_task_req_desc *descp = &hba->utmrdl_base_addr[off]; + struct utp_task_req_desc *descp = &hba->utmrdl_base_addr[tag]; if (!trace_ufshcd_upiu_enabled()) return; @@ -6378,27 +6377,6 @@ static irqreturn_t ufshcd_check_errors(struct ufs_hba *hba, u32 intr_status) return retval; } -struct ctm_info { - struct ufs_hba *hba; - unsigned long pending; - unsigned int ncpl; -}; - -static bool ufshcd_compl_tm(struct request *req, void *priv, bool reserved) -{ - struct ctm_info *const ci = priv; - struct completion *c; - - WARN_ON_ONCE(reserved); - if (test_bit(req->tag, 
&ci->pending)) - return true; - ci->ncpl++; - c = req->end_io_data; - if (c) - complete(c); - return true; -} - /** * ufshcd_tmc_handler - handle task management function completion * @hba: per adapter instance @@ -6409,18 +6387,24 @@ static bool ufshcd_compl_tm(struct request *req, void *priv, bool reserved) */ static irqreturn_t ufshcd_tmc_handler(struct ufs_hba *hba) { - unsigned long flags; - struct request_queue *q = hba->tmf_queue; - struct ctm_info ci = { - .hba = hba, - }; + unsigned long flags, pending, issued; + irqreturn_t ret = IRQ_NONE; + int tag; + + pending = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL); spin_lock_irqsave(hba->host->host_lock, flags); - ci.pending = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL); - blk_mq_tagset_busy_iter(q->tag_set, ufshcd_compl_tm, &ci); + issued = hba->outstanding_tasks & ~pending; + for_each_set_bit(tag, &issued, hba->nutmrs) { + struct request *req = hba->tmf_rqs[tag]; + struct completion *c = req->end_io_data; + + complete(c); + ret = IRQ_HANDLED; + } spin_unlock_irqrestore(hba->host->host_lock, flags); - return ci.ncpl ? IRQ_HANDLED : IRQ_NONE; + return ret; } /** @@ -6543,9 +6527,9 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba, ufshcd_hold(hba, false); spin_lock_irqsave(host->host_lock, flags); - blk_mq_start_request(req); task_tag = req->tag; + hba->tmf_rqs[req->tag] = req; treq->upiu_req.req_header.dword_0 |= cpu_to_be32(task_tag); memcpy(hba->utmrdl_base_addr + task_tag, treq, sizeof(*treq)); @@ -6586,6 +6570,7 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba, } spin_lock_irqsave(hba->host->host_lock, flags); + hba->tmf_rqs[req->tag] = NULL; __clear_bit(task_tag, &hba->outstanding_tasks); spin_unlock_irqrestore(hba->host->host_lock, flags); @@ -9636,6 +9621,12 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) err = PTR_ERR(hba->tmf_queue); goto free_tmf_tag_set; } + hba->tmf_rqs = devm_kcalloc(hba->dev, hba->nutmrs, + sizeof(*hba->tmf_rqs), GFP_KERNEL); + if (!hba->tmf_rqs) { + err = -ENOMEM; + goto free_tmf_queue; + } /* Reset the attached device */ ufshcd_device_reset(hba); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index f0da5d3db1..41f6e06f91 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -828,6 +828,7 @@ struct ufs_hba { struct blk_mq_tag_set tmf_tag_set; struct request_queue *tmf_queue; + struct request **tmf_rqs; struct uic_command *active_uic_cmd; struct mutex uic_cmd_mutex; diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index c25ce8f0e0..07d0250f17 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -300,7 +300,7 @@ static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, } break; default: - pr_info("Unsupport virtio scsi event reason %x\n", event->reason); + pr_info("Unsupported virtio scsi event reason %x\n", event->reason); } } @@ -392,7 +392,7 @@ static void virtscsi_handle_event(struct work_struct *work) virtscsi_handle_param_change(vscsi, event); break; default: - pr_err("Unsupport virtio scsi event %x\n", event->event); + pr_err("Unsupported virtio scsi event %x\n", event->event); } virtscsi_kick_event(vscsi, event_node); } diff --git a/drivers/soc/canaan/Kconfig b/drivers/soc/canaan/Kconfig index 8179b69518..853096b7e8 100644 --- a/drivers/soc/canaan/Kconfig +++ b/drivers/soc/canaan/Kconfig @@ -5,7 +5,6 @@ config SOC_K210_SYSCTL depends on RISCV && SOC_CANAAN && OF default SOC_CANAAN select PM - select SIMPLE_PM_BUS select SYSCON select MFD_SYSCON help 
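The ufshcd rework above drops the blk_mq_tagset_busy_iter() walk in favour of a driver-owned table, hba->tmf_rqs, indexed by task-management tag: the interrupt handler reads the doorbell register, treats outstanding & ~pending as the set of finished tags, and completes each tracked waiter. A hedged userspace sketch of that bookkeeping; the names, sizes and the clearing of the outstanding bit in the handler are illustrative, not the driver's exact sequencing:

#include <stdio.h>

#define NUTMRS 8                        /* number of TM slots, cf. hba->nutmrs */

struct tmf_req {
        int done;                       /* stands in for struct completion */
};

static struct tmf_req *tmf_rqs[NUTMRS]; /* cf. hba->tmf_rqs */
static unsigned long outstanding;       /* cf. hba->outstanding_tasks */

static void issue_tmf(struct tmf_req *req, int tag)
{
        tmf_rqs[tag] = req;             /* remember who waits on this tag */
        outstanding |= 1UL << tag;
        /* ...ring the TM doorbell here... */
}

/* doorbell: a bit stays set for each command the controller still owns */
static int tmc_handler(unsigned long doorbell)
{
        unsigned long completed = outstanding & ~doorbell;
        int handled = 0;

        for (int tag = 0; tag < NUTMRS; tag++) {
                if (!(completed & (1UL << tag)))
                        continue;
                tmf_rqs[tag]->done = 1; /* complete(c) in the driver */
                outstanding &= ~(1UL << tag);
                handled = 1;
        }
        return handled;                 /* IRQ_HANDLED vs IRQ_NONE */
}

int main(void)
{
        struct tmf_req r = { 0 };

        issue_tmf(&r, 2);
        /* controller cleared bit 2 in the doorbell */
        printf("handled=%d done=%d\n", tmc_handler(0x0), r.done);
        return 0;
}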
diff --git a/drivers/soc/qcom/mdt_loader.c b/drivers/soc/qcom/mdt_loader.c index bda170d7b4..72fc2b5392 100644 --- a/drivers/soc/qcom/mdt_loader.c +++ b/drivers/soc/qcom/mdt_loader.c @@ -98,7 +98,7 @@ void *qcom_mdt_read_metadata(const struct firmware *fw, size_t *data_len) if (ehdr->e_phnum < 2) return ERR_PTR(-EINVAL); - if (phdrs[0].p_type == PT_LOAD || phdrs[1].p_type == PT_LOAD) + if (phdrs[0].p_type == PT_LOAD) return ERR_PTR(-EINVAL); if ((phdrs[1].p_flags & QCOM_MDT_TYPE_MASK) != QCOM_MDT_TYPE_HASH) diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c index 9faf48302f..52e5811671 100644 --- a/drivers/soc/qcom/socinfo.c +++ b/drivers/soc/qcom/socinfo.c @@ -628,7 +628,7 @@ static int qcom_socinfo_probe(struct platform_device *pdev) /* Feed the soc specific unique data into entropy pool */ add_device_randomness(info, item_size); - platform_set_drvdata(pdev, qs->soc_dev); + platform_set_drvdata(pdev, qs); return 0; } diff --git a/drivers/soc/ti/omap_prm.c b/drivers/soc/ti/omap_prm.c index ea64e18785..f32e1cbbe8 100644 --- a/drivers/soc/ti/omap_prm.c +++ b/drivers/soc/ti/omap_prm.c @@ -825,26 +825,29 @@ static int omap_reset_deassert(struct reset_controller_dev *rcdev, writel_relaxed(v, reset->prm->base + reset->prm->data->rstctrl); spin_unlock_irqrestore(&reset->lock, flags); - if (!has_rstst) - goto exit; - - /* wait for the status to be set */ + /* wait for the reset bit to clear */ ret = readl_relaxed_poll_timeout_atomic(reset->prm->base + - reset->prm->data->rstst, - v, v & BIT(st_bit), 1, - OMAP_RESET_MAX_WAIT); + reset->prm->data->rstctrl, + v, !(v & BIT(id)), 1, + OMAP_RESET_MAX_WAIT); if (ret) pr_err("%s: timedout waiting for %s:%lu\n", __func__, reset->prm->data->name, id); -exit: - if (reset->clkdm) { - /* At least dra7 iva needs a delay before clkdm idle */ - if (has_rstst) - udelay(1); - pdata->clkdm_allow_idle(reset->clkdm); + /* wait for the status to be set */ + if (has_rstst) { + ret = readl_relaxed_poll_timeout_atomic(reset->prm->base + + reset->prm->data->rstst, + v, v & BIT(st_bit), 1, + OMAP_RESET_MAX_WAIT); + if (ret) + pr_err("%s: timedout waiting for %s:%lu\n", __func__, + reset->prm->data->name, id); } + if (reset->clkdm) + pdata->clkdm_allow_idle(reset->clkdm); + return ret; } diff --git a/drivers/spi/spi-atmel.c b/drivers/spi/spi-atmel.c index 788dcdf25f..f872cf196c 100644 --- a/drivers/spi/spi-atmel.c +++ b/drivers/spi/spi-atmel.c @@ -1301,7 +1301,7 @@ static int atmel_spi_one_transfer(struct spi_master *master, * DMA map early, for performance (empties dcache ASAP) and * better fault reporting. 
*/ - if ((!master->cur_msg_mapped) + if ((!master->cur_msg->is_dma_mapped) && as->use_pdc) { if (atmel_spi_dma_map_xfer(as, xfer) < 0) return -ENOMEM; @@ -1381,7 +1381,7 @@ static int atmel_spi_one_transfer(struct spi_master *master, } } - if (!master->cur_msg_mapped + if (!master->cur_msg->is_dma_mapped && as->use_pdc) atmel_spi_dma_unmap_xfer(master, xfer); diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c index a78e56f566..3043677ba2 100644 --- a/drivers/spi/spi-bcm-qspi.c +++ b/drivers/spi/spi-bcm-qspi.c @@ -1250,10 +1250,14 @@ static void bcm_qspi_hw_init(struct bcm_qspi *qspi) static void bcm_qspi_hw_uninit(struct bcm_qspi *qspi) { + u32 status = bcm_qspi_read(qspi, MSPI, MSPI_MSPI_STATUS); + bcm_qspi_write(qspi, MSPI, MSPI_SPCR2, 0); if (has_bspi(qspi)) bcm_qspi_write(qspi, MSPI, MSPI_WRITE_LOCK, 0); + /* clear interrupt */ + bcm_qspi_write(qspi, MSPI, MSPI_MSPI_STATUS, status & ~1); } static const struct spi_controller_mem_ops bcm_qspi_mem_ops = { @@ -1397,6 +1401,47 @@ int bcm_qspi_probe(struct platform_device *pdev, if (!qspi->dev_ids) return -ENOMEM; + /* + * Some SoCs integrate spi controller (e.g., its interrupt bits) + * in specific ways + */ + if (soc_intc) { + qspi->soc_intc = soc_intc; + soc_intc->bcm_qspi_int_set(soc_intc, MSPI_DONE, true); + } else { + qspi->soc_intc = NULL; + } + + if (qspi->clk) { + ret = clk_prepare_enable(qspi->clk); + if (ret) { + dev_err(dev, "failed to prepare clock\n"); + goto qspi_probe_err; + } + qspi->base_clk = clk_get_rate(qspi->clk); + } else { + qspi->base_clk = MSPI_BASE_FREQ; + } + + if (data->has_mspi_rev) { + rev = bcm_qspi_read(qspi, MSPI, MSPI_REV); + /* some older revs do not have a MSPI_REV register */ + if ((rev & 0xff) == 0xff) + rev = 0; + } + + qspi->mspi_maj_rev = (rev >> 4) & 0xf; + qspi->mspi_min_rev = rev & 0xf; + qspi->mspi_spcr3_sysclk = data->has_spcr3_sysclk; + + qspi->max_speed_hz = qspi->base_clk / (bcm_qspi_spbr_min(qspi) * 2); + + /* + * On SW resets it is possible to have the mask still enabled + * Need to disable the mask and clear the status while we init + */ + bcm_qspi_hw_uninit(qspi); + for (val = 0; val < num_irqs; val++) { irq = -1; name = qspi_irq_tab[val].irq_name; @@ -1433,38 +1478,6 @@ int bcm_qspi_probe(struct platform_device *pdev, goto qspi_probe_err; } - /* - * Some SoCs integrate spi controller (e.g., its interrupt bits) - * in specific ways - */ - if (soc_intc) { - qspi->soc_intc = soc_intc; - soc_intc->bcm_qspi_int_set(soc_intc, MSPI_DONE, true); - } else { - qspi->soc_intc = NULL; - } - - ret = clk_prepare_enable(qspi->clk); - if (ret) { - dev_err(dev, "failed to prepare clock\n"); - goto qspi_probe_err; - } - - qspi->base_clk = clk_get_rate(qspi->clk); - - if (data->has_mspi_rev) { - rev = bcm_qspi_read(qspi, MSPI, MSPI_REV); - /* some older revs do not have a MSPI_REV register */ - if ((rev & 0xff) == 0xff) - rev = 0; - } - - qspi->mspi_maj_rev = (rev >> 4) & 0xf; - qspi->mspi_min_rev = rev & 0xf; - qspi->mspi_spcr3_sysclk = data->has_spcr3_sysclk; - - qspi->max_speed_hz = qspi->base_clk / (bcm_qspi_spbr_min(qspi) * 2); - bcm_qspi_hw_init(qspi); init_completion(&qspi->mspi_done); init_completion(&qspi->bspi_done); diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c index 386e8c84be..a15de10ee2 100644 --- a/drivers/spi/spi-mt65xx.c +++ b/drivers/spi/spi-mt65xx.c @@ -233,36 +233,44 @@ static int mtk_spi_set_hw_cs_timing(struct spi_device *spi) return delay; inactive = (delay * DIV_ROUND_UP(mdata->spi_clk_hz, 1000000)) / 1000; - setup = setup ? 
setup : 1; - hold = hold ? hold : 1; - inactive = inactive ? inactive : 1; - - reg_val = readl(mdata->base + SPI_CFG0_REG); - if (mdata->dev_comp->enhance_timing) { - hold = min_t(u32, hold, 0x10000); - setup = min_t(u32, setup, 0x10000); - reg_val &= ~(0xffff << SPI_ADJUST_CFG0_CS_HOLD_OFFSET); - reg_val |= (((hold - 1) & 0xffff) - << SPI_ADJUST_CFG0_CS_HOLD_OFFSET); - reg_val &= ~(0xffff << SPI_ADJUST_CFG0_CS_SETUP_OFFSET); - reg_val |= (((setup - 1) & 0xffff) - << SPI_ADJUST_CFG0_CS_SETUP_OFFSET); - } else { - hold = min_t(u32, hold, 0x100); - setup = min_t(u32, setup, 0x100); - reg_val &= ~(0xff << SPI_CFG0_CS_HOLD_OFFSET); - reg_val |= (((hold - 1) & 0xff) << SPI_CFG0_CS_HOLD_OFFSET); - reg_val &= ~(0xff << SPI_CFG0_CS_SETUP_OFFSET); - reg_val |= (((setup - 1) & 0xff) - << SPI_CFG0_CS_SETUP_OFFSET); + if (hold || setup) { + reg_val = readl(mdata->base + SPI_CFG0_REG); + if (mdata->dev_comp->enhance_timing) { + if (hold) { + hold = min_t(u32, hold, 0x10000); + reg_val &= ~(0xffff << SPI_ADJUST_CFG0_CS_HOLD_OFFSET); + reg_val |= (((hold - 1) & 0xffff) + << SPI_ADJUST_CFG0_CS_HOLD_OFFSET); + } + if (setup) { + setup = min_t(u32, setup, 0x10000); + reg_val &= ~(0xffff << SPI_ADJUST_CFG0_CS_SETUP_OFFSET); + reg_val |= (((setup - 1) & 0xffff) + << SPI_ADJUST_CFG0_CS_SETUP_OFFSET); + } + } else { + if (hold) { + hold = min_t(u32, hold, 0x100); + reg_val &= ~(0xff << SPI_CFG0_CS_HOLD_OFFSET); + reg_val |= (((hold - 1) & 0xff) << SPI_CFG0_CS_HOLD_OFFSET); + } + if (setup) { + setup = min_t(u32, setup, 0x100); + reg_val &= ~(0xff << SPI_CFG0_CS_SETUP_OFFSET); + reg_val |= (((setup - 1) & 0xff) + << SPI_CFG0_CS_SETUP_OFFSET); + } + } + writel(reg_val, mdata->base + SPI_CFG0_REG); } - writel(reg_val, mdata->base + SPI_CFG0_REG); - inactive = min_t(u32, inactive, 0x100); - reg_val = readl(mdata->base + SPI_CFG1_REG); - reg_val &= ~SPI_CFG1_CS_IDLE_MASK; - reg_val |= (((inactive - 1) & 0xff) << SPI_CFG1_CS_IDLE_OFFSET); - writel(reg_val, mdata->base + SPI_CFG1_REG); + if (inactive) { + inactive = min_t(u32, inactive, 0x100); + reg_val = readl(mdata->base + SPI_CFG1_REG); + reg_val &= ~SPI_CFG1_CS_IDLE_MASK; + reg_val |= (((inactive - 1) & 0xff) << SPI_CFG1_CS_IDLE_OFFSET); + writel(reg_val, mdata->base + SPI_CFG1_REG); + } return 0; } diff --git a/drivers/spi/spi-mux.c b/drivers/spi/spi-mux.c index 9708b7827f..f5d32ec463 100644 --- a/drivers/spi/spi-mux.c +++ b/drivers/spi/spi-mux.c @@ -137,6 +137,13 @@ static int spi_mux_probe(struct spi_device *spi) priv = spi_controller_get_devdata(ctlr); priv->spi = spi; + /* + * Increase lockdep class as these locks are taken while the parent bus + * already holds their instance's lock.
+ */ + lockdep_set_subclass(&ctlr->io_mutex, 1); + lockdep_set_subclass(&ctlr->add_lock, 1); + priv->mux = devm_mux_control_get(&spi->dev, NULL); if (IS_ERR(priv->mux)) { ret = dev_err_probe(&spi->dev, PTR_ERR(priv->mux), diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c index a66fa97046..2b0301fc97 100644 --- a/drivers/spi/spi-nxp-fspi.c +++ b/drivers/spi/spi-nxp-fspi.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -315,6 +316,7 @@ #define NXP_FSPI_MIN_IOMAP SZ_4M #define DCFG_RCWSR1 0x100 +#define SYS_PLL_RAT GENMASK(6, 2) /* Access flash memory using IP bus only */ #define FSPI_QUIRK_USE_IP_ONLY BIT(0) @@ -926,9 +928,8 @@ static void erratum_err050568(struct nxp_fspi *f) { .family = "QorIQ LS1028A" }, { /* sentinel */ } }; - struct device_node *np; struct regmap *map; - u32 val = 0, sysclk = 0; + u32 val, sys_pll_ratio; int ret; /* Check for LS1028A family */ @@ -937,7 +938,6 @@ static void erratum_err050568(struct nxp_fspi *f) return; } - /* Compute system clock frequency multiplier ratio */ map = syscon_regmap_lookup_by_compatible("fsl,ls1028a-dcfg"); if (IS_ERR(map)) { dev_err(f->dev, "No syscon regmap\n"); @@ -948,23 +948,11 @@ static void erratum_err050568(struct nxp_fspi *f) if (ret < 0) goto err; - /* Strap bits 6:2 define SYS_PLL_RAT i.e frequency multiplier ratio */ - val = (val >> 2) & 0x1F; - WARN(val == 0, "Strapping is zero: Cannot determine ratio"); + sys_pll_ratio = FIELD_GET(SYS_PLL_RAT, val); + dev_dbg(f->dev, "val: 0x%08x, sys_pll_ratio: %d\n", val, sys_pll_ratio); - /* Compute system clock frequency */ - np = of_find_node_by_name(NULL, "clock-sysclk"); - if (!np) - goto err; - - if (of_property_read_u32(np, "clock-frequency", &sysclk)) - goto err; - - sysclk = (sysclk * val) / 1000000; /* Convert sysclk to Mhz */ - dev_dbg(f->dev, "val: 0x%08x, sysclk: %dMhz\n", val, sysclk); - - /* Use IP bus only if PLL is 300MHz */ - if (sysclk == 300) + /* Use IP bus only if platform clock is 300MHz */ + if (sys_pll_ratio == 3) f->devtype_data->quirks |= FSPI_QUIRK_USE_IP_ONLY; return; diff --git a/drivers/spi/spi-tegra20-slink.c b/drivers/spi/spi-tegra20-slink.c index 8ce840c7ec..713292b0c7 100644 --- a/drivers/spi/spi-tegra20-slink.c +++ b/drivers/spi/spi-tegra20-slink.c @@ -1182,8 +1182,7 @@ static int tegra_slink_resume(struct device *dev) } #endif -#ifdef CONFIG_PM -static int tegra_slink_runtime_suspend(struct device *dev) +static int __maybe_unused tegra_slink_runtime_suspend(struct device *dev) { struct spi_master *master = dev_get_drvdata(dev); struct tegra_slink_data *tspi = spi_master_get_devdata(master); @@ -1208,7 +1207,6 @@ static int tegra_slink_runtime_resume(struct device *dev) } return 0; } -#endif /* CONFIG_PM */ static const struct dev_pm_ops slink_pm_ops = { SET_RUNTIME_PM_OPS(tegra_slink_runtime_suspend, diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 35822a4db3..d2a6038aef 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -478,12 +478,6 @@ static LIST_HEAD(spi_controller_list); */ static DEFINE_MUTEX(board_lock); -/* - * Prevents addition of devices with same chip select and - * addition of devices below an unregistering controller. - */ -static DEFINE_MUTEX(spi_add_lock); - /** * spi_alloc_device - Allocate a new SPI device * @ctlr: Controller to which device is connected @@ -636,9 +630,9 @@ int spi_add_device(struct spi_device *spi) * chipselect **BEFORE** we call setup(), else we'll trash * its configuration. Lock against concurrent add() calls. 
*/ - mutex_lock(&spi_add_lock); + mutex_lock(&ctlr->add_lock); status = __spi_add_device(spi); - mutex_unlock(&spi_add_lock); + mutex_unlock(&ctlr->add_lock); return status; } EXPORT_SYMBOL_GPL(spi_add_device); @@ -658,7 +652,7 @@ static int spi_add_device_locked(struct spi_device *spi) /* Set the bus ID string */ spi_dev_set_name(spi); - WARN_ON(!mutex_is_locked(&spi_add_lock)); + WARN_ON(!mutex_is_locked(&ctlr->add_lock)); return __spi_add_device(spi); } @@ -2553,6 +2547,12 @@ struct spi_controller *__spi_alloc_controller(struct device *dev, return NULL; device_initialize(&ctlr->dev); + INIT_LIST_HEAD(&ctlr->queue); + spin_lock_init(&ctlr->queue_lock); + spin_lock_init(&ctlr->bus_lock_spinlock); + mutex_init(&ctlr->bus_lock_mutex); + mutex_init(&ctlr->io_mutex); + mutex_init(&ctlr->add_lock); ctlr->bus_num = -1; ctlr->num_chipselect = 1; ctlr->slave = slave; @@ -2825,11 +2825,6 @@ int spi_register_controller(struct spi_controller *ctlr) return id; ctlr->bus_num = id; } - INIT_LIST_HEAD(&ctlr->queue); - spin_lock_init(&ctlr->queue_lock); - spin_lock_init(&ctlr->bus_lock_spinlock); - mutex_init(&ctlr->bus_lock_mutex); - mutex_init(&ctlr->io_mutex); ctlr->bus_lock_flag = 0; init_completion(&ctlr->xfer_completion); if (!ctlr->max_dma_len) @@ -2966,7 +2961,7 @@ void spi_unregister_controller(struct spi_controller *ctlr) /* Prevent addition of new devices, unregister existing ones */ if (IS_ENABLED(CONFIG_SPI_DYNAMIC)) - mutex_lock(&spi_add_lock); + mutex_lock(&ctlr->add_lock); device_for_each_child(&ctlr->dev, NULL, __unregister); @@ -2997,7 +2992,7 @@ void spi_unregister_controller(struct spi_controller *ctlr) mutex_unlock(&board_lock); if (IS_ENABLED(CONFIG_SPI_DYNAMIC)) - mutex_unlock(&spi_add_lock); + mutex_unlock(&ctlr->add_lock); } EXPORT_SYMBOL_GPL(spi_unregister_controller); diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index b247f5fc5b..9a3b8a51ea 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -668,6 +668,19 @@ static const struct file_operations spidev_fops = { static struct class *spidev_class; +static const struct spi_device_id spidev_spi_ids[] = { + { .name = "dh2228fv" }, + { .name = "ltc2488" }, + { .name = "sx1301" }, + { .name = "bk4" }, + { .name = "dhcom-board" }, + { .name = "m53cpld" }, + { .name = "spi-petra" }, + { .name = "spi-authenta" }, + {}, +}; +MODULE_DEVICE_TABLE(spi, spidev_spi_ids); + #ifdef CONFIG_OF static const struct of_device_id spidev_dt_ids[] = { { .compatible = "rohm,dh2228fv" }, @@ -813,6 +826,7 @@ static struct spi_driver spidev_spi_driver = { }, .probe = spidev_probe, .remove = spidev_remove, + .id_table = spidev_spi_ids, /* NOTE: suspend/resume methods are not necessary here. * We don't do anything except pass the requests to/from diff --git a/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/input_system.c b/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/input_system.c index 8e085dda0c..712e01c378 100644 --- a/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/input_system.c +++ b/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/input_system.c @@ -1646,6 +1646,8 @@ static input_system_err_t input_system_configure_channel_sensor( default: return INPUT_SYSTEM_ERR_PARAMETER_NOT_SUPPORTED; } + + return INPUT_SYSTEM_ERR_NO_ERROR; } // Test flags and set structure. 
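The spi.c hunks above retire the single global spi_add_lock in favour of a per-controller ctlr->add_lock, initialised in __spi_alloc_controller so it exists before registration; this is what allows the earlier spi-mux change to give the child bus its own lockdep subclass. A minimal pthread sketch of the global-to-per-object lock move; it is illustrative only and does not use the kernel API:

#include <pthread.h>
#include <stdio.h>

struct controller {
        pthread_mutex_t add_lock;       /* was: one global mutex for all buses */
        int ndevices;
};

/* init the lock with the object, as __spi_alloc_controller now does */
static void controller_init(struct controller *ctlr)
{
        pthread_mutex_init(&ctlr->add_lock, NULL);
        ctlr->ndevices = 0;
}

/* device addition on one bus no longer serializes against other buses */
static void add_device(struct controller *ctlr)
{
        pthread_mutex_lock(&ctlr->add_lock);
        ctlr->ndevices++;
        pthread_mutex_unlock(&ctlr->add_lock);
}

int main(void)
{
        struct controller parent, mux_child;

        controller_init(&parent);
        controller_init(&mux_child);
        /* adding a mux device on the parent can itself add to the child bus;
         * with per-controller locks this nests instead of self-deadlocking */
        pthread_mutex_lock(&parent.add_lock);
        add_device(&mux_child);
        pthread_mutex_unlock(&parent.add_lock);
        printf("parent=%d child=%d\n", parent.ndevices, mux_child.ndevices);
        return 0;
}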
diff --git a/drivers/staging/media/hantro/hantro_drv.c b/drivers/staging/media/hantro/hantro_drv.c index 8a2edd67f2..20e5081588 100644 --- a/drivers/staging/media/hantro/hantro_drv.c +++ b/drivers/staging/media/hantro/hantro_drv.c @@ -919,7 +919,7 @@ static int hantro_probe(struct platform_device *pdev) if (!vpu->variant->irqs[i].handler) continue; - if (vpu->variant->num_clocks > 1) { + if (vpu->variant->num_irqs > 1) { irq_name = vpu->variant->irqs[i].name; irq = platform_get_irq_byname(vpu->pdev, irq_name); } else { diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_video.c b/drivers/staging/media/sunxi/cedrus/cedrus_video.c index c589fe9dae..825af5fd35 100644 --- a/drivers/staging/media/sunxi/cedrus/cedrus_video.c +++ b/drivers/staging/media/sunxi/cedrus/cedrus_video.c @@ -135,7 +135,7 @@ void cedrus_prepare_format(struct v4l2_pix_format *pix_fmt) sizeimage = bytesperline * height; /* Chroma plane size. */ - sizeimage += bytesperline * height / 2; + sizeimage += bytesperline * ALIGN(height, 64) / 2; break; diff --git a/drivers/staging/r8188eu/hal/hal_intf.c b/drivers/staging/r8188eu/hal/hal_intf.c index a6d589e89a..f27eba72d6 100644 --- a/drivers/staging/r8188eu/hal/hal_intf.c +++ b/drivers/staging/r8188eu/hal/hal_intf.c @@ -248,7 +248,7 @@ void rtw_hal_update_ra_mask(struct adapter *adapt, u32 mac_id, u8 rssi_level) #ifdef CONFIG_88EU_AP_MODE struct sta_info *psta = NULL; struct sta_priv *pstapriv = &adapt->stapriv; - if ((mac_id - 1) > 0) + if (mac_id >= 2) psta = pstapriv->sta_aid[(mac_id - 1) - 1]; if (psta) add_RATid(adapt, psta, 0);/* todo: based on rssi_level*/ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index e14a7e0a63..25c17ad9c7 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -204,7 +204,7 @@ create_pagelist(char *buf, char __user *ubuf, offset = (uintptr_t)ubuf & (PAGE_SIZE - 1); num_pages = DIV_ROUND_UP(count + offset, PAGE_SIZE); - if (num_pages > (SIZE_MAX - sizeof(struct pagelist) - + if ((size_t)num_pages > (SIZE_MAX - sizeof(struct pagelist) - sizeof(struct vchiq_pagelist_info)) / (sizeof(u32) + sizeof(pages[0]) + sizeof(struct scatterlist))) diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c index 5ce13b099d..5363ebebfc 100644 --- a/drivers/tee/optee/core.c +++ b/drivers/tee/optee/core.c @@ -585,6 +585,9 @@ static int optee_remove(struct platform_device *pdev) { struct optee *optee = platform_get_drvdata(pdev); + /* Unregister OP-TEE specific client devices on TEE bus */ + optee_unregister_devices(); + /* * Ask OP-TEE to free all cached shared memory objects to decrease * reference counters and also avoid wild pointers in secure world diff --git a/drivers/tee/optee/device.c b/drivers/tee/optee/device.c index ec1d24693e..128a2d2a50 100644 --- a/drivers/tee/optee/device.c +++ b/drivers/tee/optee/device.c @@ -53,6 +53,13 @@ static int get_devices(struct tee_context *ctx, u32 session, return 0; } +static void optee_release_device(struct device *dev) +{ + struct tee_client_device *optee_device = to_tee_client_device(dev); + + kfree(optee_device); +} + static int optee_register_device(const uuid_t *device_uuid) { struct tee_client_device *optee_device = NULL; @@ -63,6 +70,7 @@ static int optee_register_device(const uuid_t *device_uuid) return -ENOMEM; optee_device->dev.bus = &tee_bus_type; + optee_device->dev.release = optee_release_device; if 
(dev_set_name(&optee_device->dev, "optee-ta-%pUb", device_uuid)) { kfree(optee_device); return -ENOMEM; @@ -154,3 +162,17 @@ int optee_enumerate_devices(u32 func) { return __optee_enumerate_devices(func); } + +static int __optee_unregister_device(struct device *dev, void *data) +{ + if (!strncmp(dev_name(dev), "optee-ta", strlen("optee-ta"))) + device_unregister(dev); + + return 0; +} + +void optee_unregister_devices(void) +{ + bus_for_each_dev(&tee_bus_type, NULL, NULL, + __optee_unregister_device); +} diff --git a/drivers/tee/optee/optee_private.h b/drivers/tee/optee/optee_private.h index dbdd367be1..f6bb4a763b 100644 --- a/drivers/tee/optee/optee_private.h +++ b/drivers/tee/optee/optee_private.h @@ -184,6 +184,7 @@ void optee_fill_pages_list(u64 *dst, struct page **pages, int num_pages, #define PTA_CMD_GET_DEVICES 0x0 #define PTA_CMD_GET_DEVICES_SUPP 0x1 int optee_enumerate_devices(u32 func); +void optee_unregister_devices(void); /* * Small helpers diff --git a/drivers/tee/optee/shm_pool.c b/drivers/tee/optee/shm_pool.c index c41a9a501a..d167039af5 100644 --- a/drivers/tee/optee/shm_pool.c +++ b/drivers/tee/optee/shm_pool.c @@ -35,7 +35,7 @@ static int pool_op_alloc(struct tee_shm_pool_mgr *poolm, unsigned int nr_pages = 1 << order, i; struct page **pages; - pages = kcalloc(nr_pages, sizeof(pages), GFP_KERNEL); + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); if (!pages) { rc = -ENOMEM; goto err; diff --git a/drivers/thunderbolt/Makefile b/drivers/thunderbolt/Makefile index da19d7987d..78fd365893 100644 --- a/drivers/thunderbolt/Makefile +++ b/drivers/thunderbolt/Makefile @@ -7,6 +7,7 @@ thunderbolt-objs += usb4_port.o nvm.o retimer.o quirks.o thunderbolt-${CONFIG_ACPI} += acpi.o thunderbolt-$(CONFIG_DEBUG_FS) += debugfs.o thunderbolt-${CONFIG_USB4_KUNIT_TEST} += test.o +CFLAGS_test.o += $(DISABLE_STRUCTLEAK_PLUGIN) thunderbolt_dma_test-${CONFIG_USB4_DMA_TEST} += dma_test.o obj-$(CONFIG_USB4_DMA_TEST) += thunderbolt_dma_test.o diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 8f143c09a1..f0bf01ea06 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -618,10 +618,8 @@ static int __init xenboot_console_setup(struct console *console, char *string) { static struct xencons_info xenboot; - if (xen_initial_domain()) + if (xen_initial_domain() || !xen_pv_domain()) return 0; - if (!xen_pv_domain()) - return -ENODEV; return xencons_info_pv_init(&xenboot, 0); } @@ -632,17 +630,16 @@ static void xenboot_write_console(struct console *console, const char *string, unsigned int linelen, off = 0; const char *pos; + if (dom0_write_console(0, string, len) >= 0) + return; + if (!xen_pv_domain()) { xen_hvm_early_write(0, string, len); return; } - dom0_write_console(0, string, len); - - if (xen_initial_domain()) + if (domU_write_console(0, "(early) ", 8) < 0) return; - - domU_write_console(0, "(early) ", 8); while (off < len && NULL != (pos = strchr(string+off, '\n'))) { linelen = pos-string+off; if (off + linelen > len) diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig index 71ae16de0f..39fc96dc25 100644 --- a/drivers/tty/serial/8250/Kconfig +++ b/drivers/tty/serial/8250/Kconfig @@ -361,9 +361,13 @@ config SERIAL_8250_BCM2835AUX If unsure, say N. 
config SERIAL_8250_FSL - bool + bool "Freescale 16550 UART support" if COMPILE_TEST && !(PPC || ARM || ARM64) depends on SERIAL_8250_CONSOLE - default PPC || ARM || ARM64 || COMPILE_TEST + default PPC || ARM || ARM64 + help + Selecting this option enables a workaround for a break-detection + erratum for Freescale 16550 UARTs in the 8250 driver. It also + enables support for ACPI enumeration. config SERIAL_8250_DW tristate "Support for Synopsys DesignWare 8250 quirks" diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c index 8b7bc10b6e..f1d100671e 100644 --- a/drivers/usb/chipidea/ci_hdrc_imx.c +++ b/drivers/usb/chipidea/ci_hdrc_imx.c @@ -420,11 +420,16 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev) data->phy = devm_usb_get_phy_by_phandle(dev, "fsl,usbphy", 0); if (IS_ERR(data->phy)) { ret = PTR_ERR(data->phy); - /* Return -EINVAL if no usbphy is available */ - if (ret == -ENODEV) - data->phy = NULL; - else - goto err_clk; + if (ret == -ENODEV) { + data->phy = devm_usb_get_phy_by_phandle(dev, "phys", 0); + if (IS_ERR(data->phy)) { + ret = PTR_ERR(data->phy); + if (ret == -ENODEV) + data->phy = NULL; + else + goto err_clk; + } + } } pdata.usb_phy = data->phy; diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 4e2f1552f4..7b2e2420ec 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -340,6 +340,9 @@ static void acm_process_notification(struct acm *acm, unsigned char *buf) acm->iocount.overrun++; spin_unlock_irqrestore(&acm->read_lock, flags); + if (newctrl & ACM_CTRL_BRK) + tty_flip_buffer_push(&acm->port); + if (difference) wake_up_all(&acm->wioctl); @@ -475,11 +478,16 @@ static int acm_submit_read_urbs(struct acm *acm, gfp_t mem_flags) static void acm_process_read_urb(struct acm *acm, struct urb *urb) { + unsigned long flags; + if (!urb->actual_length) return; + spin_lock_irqsave(&acm->read_lock, flags); tty_insert_flip_string(&acm->port, urb->transfer_buffer, urb->actual_length); + spin_unlock_irqrestore(&acm->read_lock, flags); + tty_flip_buffer_push(&acm->port); } diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 35d5908b54..fdf79bcf7e 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -824,7 +824,7 @@ static struct usb_class_driver wdm_class = { }; /* --- WWAN framework integration --- */ -#ifdef CONFIG_WWAN_CORE +#ifdef CONFIG_WWAN static int wdm_wwan_port_start(struct wwan_port *port) { struct wdm_device *desc = wwan_port_get_drvdata(port); @@ -963,11 +963,11 @@ static void wdm_wwan_rx(struct wdm_device *desc, int length) /* inbuf has been copied, it is safe to check for outstanding data */ schedule_work(&desc->service_outs_intr); } -#else /* CONFIG_WWAN_CORE */ +#else /* CONFIG_WWAN */ static void wdm_wwan_init(struct wdm_device *desc) {} static void wdm_wwan_deinit(struct wdm_device *desc) {} static void wdm_wwan_rx(struct wdm_device *desc, int length) {} -#endif /* CONFIG_WWAN_CORE */ +#endif /* CONFIG_WWAN */ /* --- error handling --- */ static void wdm_rxwork(struct work_struct *work) diff --git a/drivers/usb/common/Kconfig b/drivers/usb/common/Kconfig index 5e8a04e3dd..b856622431 100644 --- a/drivers/usb/common/Kconfig +++ b/drivers/usb/common/Kconfig @@ -6,8 +6,7 @@ config USB_COMMON config USB_LED_TRIG bool "USB LED Triggers" - depends on LEDS_CLASS && LEDS_TRIGGERS - select USB_COMMON + depends on LEDS_CLASS && USB_COMMON && LEDS_TRIGGERS help This option adds LED triggers for USB host and/or gadget activity. 
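The cdc-acm hunks above take acm->read_lock around the tty flip-buffer insert in the URB completion path, because the break notification now also pushes the buffer and the two paths could otherwise race. A hedged sketch of the pattern; a pthread mutex stands in for the spinlock and a byte counter stands in for the flip buffer:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t read_lock = PTHREAD_MUTEX_INITIALIZER;
static char flip_buf[256];              /* stands in for the tty flip buffer */
static size_t flip_len;

/* URB completion: the insert is now serialized by read_lock */
static void process_read_urb(const char *data, size_t len)
{
        pthread_mutex_lock(&read_lock);
        memcpy(flip_buf + flip_len, data, len); /* cf. tty_insert_flip_string() */
        flip_len += len;
        pthread_mutex_unlock(&read_lock);
        /* push after dropping the lock, as the driver does */
        printf("push: %zu bytes buffered\n", flip_len);
}

/* notification path: on a break event it now pushes pending bytes too */
static void process_notification(int break_seen)
{
        pthread_mutex_lock(&read_lock);
        if (break_seen)
                printf("break: pushing %zu pending bytes\n", flip_len);
        pthread_mutex_unlock(&read_lock);
}

int main(void)
{
        process_read_urb("hello", 5);
        process_notification(1);
        return 0;
}

The point of the fix is that the insert and any concurrent consumer of the buffer agree on one lock, so a push triggered from the notification path never observes a half-written insert.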
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 804b505481..4519d06c9c 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -4243,7 +4243,7 @@ int dwc3_gadget_init(struct dwc3 *dwc) } - usb_initialize_gadget(dwc->sysdev, dwc->gadget, dwc_gadget_release); + usb_initialize_gadget(dwc->dev, dwc->gadget, dwc_gadget_release); dev = &dwc->gadget->dev; dev->platform_data = dwc; dwc->gadget->ops = &dwc3_gadget_ops; diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c index be864560bf..ef55b8bb58 100644 --- a/drivers/usb/gadget/function/f_uac2.c +++ b/drivers/usb/gadget/function/f_uac2.c @@ -674,11 +674,17 @@ static int set_ep_max_packet_size(const struct f_uac2_opts *uac2_opts, ssize = uac2_opts->c_ssize; } - if (!is_playback && (uac2_opts->c_sync == USB_ENDPOINT_SYNC_ASYNC)) + if (!is_playback && (uac2_opts->c_sync == USB_ENDPOINT_SYNC_ASYNC)) { + // Win10 requires max packet size + 1 frame srate = srate * (1000 + uac2_opts->fb_max) / 1000; - - max_size_bw = num_channels(chmask) * ssize * - DIV_ROUND_UP(srate, factor / (1 << (ep_desc->bInterval - 1))); + // updated srate is always bigger, therefore DIV_ROUND_UP always yields +1 + max_size_bw = num_channels(chmask) * ssize * + (DIV_ROUND_UP(srate, factor / (1 << (ep_desc->bInterval - 1)))); + } else { + // adding 1 frame provision for Win10 + max_size_bw = num_channels(chmask) * ssize * + (DIV_ROUND_UP(srate, factor / (1 << (ep_desc->bInterval - 1))) + 1); + } ep_desc->wMaxPacketSize = cpu_to_le16(min_t(u16, max_size_bw, max_size_ep)); diff --git a/drivers/usb/host/dwc_otg/dwc_otg_hcd_queue.c b/drivers/usb/host/dwc_otg/dwc_otg_hcd_queue.c index 4503af692a..f51fad1e6b 100644 --- a/drivers/usb/host/dwc_otg/dwc_otg_hcd_queue.c +++ b/drivers/usb/host/dwc_otg/dwc_otg_hcd_queue.c @@ -691,7 +691,7 @@ int dwc_otg_hcd_qh_add(dwc_otg_hcd_t * hcd, dwc_otg_qh_t * qh) } else { /* If the QH wasn't in a schedule, then sched_frame is stale. */ qh->sched_frame = dwc_frame_num_inc(dwc_otg_hcd_get_frame_number(hcd), - SCHEDULE_SLOP); + max_t(uint32_t, qh->interval, SCHEDULE_SLOP)); status = schedule_periodic(hcd, qh); qh->start_split_frame = qh->sched_frame; if ( !hcd->periodic_qh_count ) { diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c index 0b37227707..ded9738392 100644 --- a/drivers/usb/host/ohci-omap.c +++ b/drivers/usb/host/ohci-omap.c @@ -40,17 +40,6 @@ #include -/* OMAP-1510 OHCI has its own MMU for DMA */ -#define OMAP1510_LB_MEMSIZE 32 /* Should be same as SDRAM size */ -#define OMAP1510_LB_CLOCK_DIV 0xfffec10c -#define OMAP1510_LB_MMU_CTL 0xfffec208 -#define OMAP1510_LB_MMU_LCK 0xfffec224 -#define OMAP1510_LB_MMU_LD_TLB 0xfffec228 -#define OMAP1510_LB_MMU_CAM_H 0xfffec22c -#define OMAP1510_LB_MMU_CAM_L 0xfffec230 -#define OMAP1510_LB_MMU_RAM_H 0xfffec234 -#define OMAP1510_LB_MMU_RAM_L 0xfffec238 - #define DRIVER_DESC "OHCI OMAP driver" struct ohci_omap_priv { @@ -104,61 +93,6 @@ static int omap_ohci_transceiver_power(struct ohci_omap_priv *priv, int on) return 0; } -#ifdef CONFIG_ARCH_OMAP15XX -/* - * OMAP-1510 specific Local Bus clock on/off - */ -static int omap_1510_local_bus_power(int on) -{ - if (on) { - omap_writel((1 << 1) | (1 << 0), OMAP1510_LB_MMU_CTL); - udelay(200); - } else { - omap_writel(0, OMAP1510_LB_MMU_CTL); - } - - return 0; -} - -/* - * OMAP-1510 specific Local Bus initialization - * NOTE: This assumes 32MB memory size in OMAP1510LB_MEMSIZE. 
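The f_uac2 hunk above sizes isochronous endpoints for one extra audio frame,
which Windows 10 needs in order to enumerate the gadget. For asynchronous
capture the feedback-inflated rate already pushes DIV_ROUND_UP() one step
higher, so only the other cases add the frame explicitly. A simplified
recomputation; uac2_max_size() is an invented name, and "frames" stands for
the per-interval service rate derived from bInterval:

    #include <linux/kernel.h>

    static unsigned int uac2_max_size(unsigned int channels, unsigned int ssize,
                                      unsigned int srate, unsigned int frames,
                                      bool async_capture, unsigned int fb_max)
    {
            if (async_capture)
                    /* strictly larger rate: DIV_ROUND_UP() gains the +1 */
                    srate = srate * (1000 + fb_max) / 1000;

            return channels * ssize *
                   (DIV_ROUND_UP(srate, frames) + (async_capture ? 0 : 1));
    }

The caller still clamps the result against the endpoint's hardware maximum, as
the hunk does with min_t() before writing wMaxPacketSize.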
- * See also arch/mach-omap/memory.h for __virt_to_dma() and - * __dma_to_virt() which need to match with the physical - * Local Bus address below. - */ -static int omap_1510_local_bus_init(void) -{ - unsigned int tlb; - unsigned long lbaddr, physaddr; - - omap_writel((omap_readl(OMAP1510_LB_CLOCK_DIV) & 0xfffffff8) | 0x4, - OMAP1510_LB_CLOCK_DIV); - - /* Configure the Local Bus MMU table */ - for (tlb = 0; tlb < OMAP1510_LB_MEMSIZE; tlb++) { - lbaddr = tlb * 0x00100000 + OMAP1510_LB_OFFSET; - physaddr = tlb * 0x00100000 + PHYS_OFFSET; - omap_writel((lbaddr & 0x0fffffff) >> 22, OMAP1510_LB_MMU_CAM_H); - omap_writel(((lbaddr & 0x003ffc00) >> 6) | 0xc, - OMAP1510_LB_MMU_CAM_L); - omap_writel(physaddr >> 16, OMAP1510_LB_MMU_RAM_H); - omap_writel((physaddr & 0x0000fc00) | 0x300, OMAP1510_LB_MMU_RAM_L); - omap_writel(tlb << 4, OMAP1510_LB_MMU_LCK); - omap_writel(0x1, OMAP1510_LB_MMU_LD_TLB); - } - - /* Enable the walking table */ - omap_writel(omap_readl(OMAP1510_LB_MMU_CTL) | (1 << 3), OMAP1510_LB_MMU_CTL); - udelay(200); - - return 0; -} -#else -#define omap_1510_local_bus_power(x) {} -#define omap_1510_local_bus_init() {} -#endif - #ifdef CONFIG_USB_OTG static void start_hnp(struct ohci_hcd *ohci) @@ -229,10 +163,8 @@ static int ohci_omap_reset(struct usb_hcd *hcd) omap_ohci_clock_power(priv, 1); - if (cpu_is_omap15xx()) { - omap_1510_local_bus_power(1); - omap_1510_local_bus_init(); - } + if (config->lb_reset) + config->lb_reset(); ret = ohci_setup(hcd); if (ret < 0) diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c index 6e784f2fc2..eb46e642e8 100644 --- a/drivers/usb/host/xhci-dbgtty.c +++ b/drivers/usb/host/xhci-dbgtty.c @@ -408,40 +408,38 @@ static int xhci_dbc_tty_register_device(struct xhci_dbc *dbc) return -EBUSY; xhci_dbc_tty_init_port(dbc, port); - tty_dev = tty_port_register_device(&port->port, - dbc_tty_driver, 0, NULL); - if (IS_ERR(tty_dev)) { - ret = PTR_ERR(tty_dev); - goto register_fail; - } ret = kfifo_alloc(&port->write_fifo, DBC_WRITE_BUF_SIZE, GFP_KERNEL); if (ret) - goto buf_alloc_fail; + goto err_exit_port; ret = xhci_dbc_alloc_requests(dbc, BULK_IN, &port->read_pool, dbc_read_complete); if (ret) - goto request_fail; + goto err_free_fifo; ret = xhci_dbc_alloc_requests(dbc, BULK_OUT, &port->write_pool, dbc_write_complete); if (ret) - goto request_fail; + goto err_free_requests; + + tty_dev = tty_port_register_device(&port->port, + dbc_tty_driver, 0, NULL); + if (IS_ERR(tty_dev)) { + ret = PTR_ERR(tty_dev); + goto err_free_requests; + } port->registered = true; return 0; -request_fail: +err_free_requests: xhci_dbc_free_requests(&port->read_pool); xhci_dbc_free_requests(&port->write_pool); +err_free_fifo: kfifo_free(&port->write_fifo); - -buf_alloc_fail: - tty_unregister_device(dbc_tty_driver, 0); - -register_fail: +err_exit_port: xhci_dbc_tty_exit_port(port); dev_err(dbc->dev, "can't register tty port, err %d\n", ret); diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 2c9f25ca8e..2484a9d38c 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -30,6 +30,7 @@ #define PCI_VENDOR_ID_FRESCO_LOGIC 0x1b73 #define PCI_DEVICE_ID_FRESCO_LOGIC_PDK 0x1000 #define PCI_DEVICE_ID_FRESCO_LOGIC_FL1009 0x1009 +#define PCI_DEVICE_ID_FRESCO_LOGIC_FL1100 0x1100 #define PCI_DEVICE_ID_FRESCO_LOGIC_FL1400 0x1400 #define PCI_VENDOR_ID_ETRON 0x1b6f @@ -113,6 +114,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) /* Look for vendor-specific quirks */ if (pdev->vendor == 
PCI_VENDOR_ID_FRESCO_LOGIC && (pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_PDK || + pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_FL1100 || pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_FL1400)) { if (pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_PDK && pdev->revision == 0x0) { @@ -279,8 +281,10 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) pdev->device == 0x3432) xhci->quirks |= XHCI_BROKEN_STREAMS; - if (pdev->vendor == PCI_VENDOR_ID_VIA && pdev->device == 0x3483) + if (pdev->vendor == PCI_VENDOR_ID_VIA && pdev->device == 0x3483) { xhci->quirks |= XHCI_LPM_SUPPORT; + xhci->quirks |= XHCI_EP_CTX_BROKEN_DCS; + } if (pdev->vendor == PCI_VENDOR_ID_ASMEDIA && pdev->device == PCI_DEVICE_ID_ASMEDIA_1042_XHCI) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 133f29ee72..0b21b66b45 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -366,16 +366,22 @@ static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci, /* Must be called with xhci->lock held, releases and aquires lock back */ static int xhci_abort_cmd_ring(struct xhci_hcd *xhci, unsigned long flags) { - u64 temp_64; + u32 temp_32; int ret; xhci_dbg(xhci, "Abort command ring\n"); reinit_completion(&xhci->cmd_ring_stop_completion); - temp_64 = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); - xhci_write_64(xhci, temp_64 | CMD_RING_ABORT, - &xhci->op_regs->cmd_ring); + /* + * The control bits like command stop, abort are located in lower + * dword of the command ring control register. Limit the write + * to the lower dword to avoid corrupting the command ring pointer + * in case if the command ring is stopped by the time upper dword + * is written. + */ + temp_32 = readl(&xhci->op_regs->cmd_ring); + writel(temp_32 | CMD_RING_ABORT, &xhci->op_regs->cmd_ring); /* Section 4.6.1.2 of xHCI 1.0 spec says software should also time the * completion of the Command Abort operation. If CRR is not negated in 5 @@ -559,8 +565,11 @@ static int xhci_move_dequeue_past_td(struct xhci_hcd *xhci, struct xhci_ring *ep_ring; struct xhci_command *cmd; struct xhci_segment *new_seg; + struct xhci_segment *halted_seg = NULL; union xhci_trb *new_deq; int new_cycle; + union xhci_trb *halted_trb; + int index = 0; dma_addr_t addr; u64 hw_dequeue; bool cycle_found = false; @@ -598,7 +607,27 @@ static int xhci_move_dequeue_past_td(struct xhci_hcd *xhci, hw_dequeue = xhci_get_hw_deq(xhci, dev, ep_index, stream_id); new_seg = ep_ring->deq_seg; new_deq = ep_ring->dequeue; - new_cycle = hw_dequeue & 0x1; + + /* + * Quirk: xHC write-back of the DCS field in the hardware dequeue + * pointer is wrong - use the cycle state of the TRB pointed to by + * the dequeue pointer. 
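The xhci_abort_cmd_ring() hunk above narrows a 64-bit read-modify-write to a
32-bit access: the stop/abort control bits live entirely in the low dword of
the command ring register, while the high dword belongs to the ring pointer
that the controller may update underneath the driver. In isolation:

    u32 lo = readl(&xhci->op_regs->cmd_ring);       /* control bits only */
    writel(lo | CMD_RING_ABORT, &xhci->op_regs->cmd_ring);

A 64-bit read-modify-write of the same register could write back a stale upper
half and corrupt the dequeue pointer if the ring stops between the read and
the write.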
+ */ + if (xhci->quirks & XHCI_EP_CTX_BROKEN_DCS && + !(ep->ep_state & EP_HAS_STREAMS)) + halted_seg = trb_in_td(xhci, td->start_seg, + td->first_trb, td->last_trb, + hw_dequeue & ~0xf, false); + if (halted_seg) { + index = ((dma_addr_t)(hw_dequeue & ~0xf) - halted_seg->dma) / + sizeof(*halted_trb); + halted_trb = &halted_seg->trbs[index]; + new_cycle = halted_trb->generic.field[3] & 0x1; + xhci_dbg(xhci, "Endpoint DCS = %d TRB index = %d cycle = %d\n", + (u8)(hw_dequeue & 0x1), index, new_cycle); + } else { + new_cycle = hw_dequeue & 0x1; + } /* * We want to find the pointer, segment and cycle state of the new trb diff --git a/drivers/usb/host/xhci-tegra.c b/drivers/usb/host/xhci-tegra.c index 575fa89a78..1bf494b649 100644 --- a/drivers/usb/host/xhci-tegra.c +++ b/drivers/usb/host/xhci-tegra.c @@ -1787,7 +1787,6 @@ static int tegra_xusb_remove(struct platform_device *pdev) return 0; } -#if IS_ENABLED(CONFIG_PM) || IS_ENABLED(CONFIG_PM_SLEEP) static bool xhci_hub_ports_suspended(struct xhci_hub *hub) { struct device *dev = hub->hcd->self.controller; @@ -2102,7 +2101,7 @@ static int tegra_xusb_exit_elpg(struct tegra_xusb *tegra, bool runtime) return err; } -static int tegra_xusb_suspend(struct device *dev) +static __maybe_unused int tegra_xusb_suspend(struct device *dev) { struct tegra_xusb *tegra = dev_get_drvdata(dev); int err; @@ -2144,7 +2143,7 @@ static int tegra_xusb_suspend(struct device *dev) return err; } -static int tegra_xusb_resume(struct device *dev) +static __maybe_unused int tegra_xusb_resume(struct device *dev) { struct tegra_xusb *tegra = dev_get_drvdata(dev); int err; @@ -2174,10 +2173,8 @@ static int tegra_xusb_resume(struct device *dev) return 0; } -#endif -#ifdef CONFIG_PM -static int tegra_xusb_runtime_suspend(struct device *dev) +static __maybe_unused int tegra_xusb_runtime_suspend(struct device *dev) { struct tegra_xusb *tegra = dev_get_drvdata(dev); int ret; @@ -2190,7 +2187,7 @@ static int tegra_xusb_runtime_suspend(struct device *dev) return ret; } -static int tegra_xusb_runtime_resume(struct device *dev) +static __maybe_unused int tegra_xusb_runtime_resume(struct device *dev) { struct tegra_xusb *tegra = dev_get_drvdata(dev); int err; @@ -2201,7 +2198,6 @@ static int tegra_xusb_runtime_resume(struct device *dev) return err; } -#endif static const struct dev_pm_ops tegra_xusb_pm_ops = { SET_RUNTIME_PM_OPS(tegra_xusb_runtime_suspend, diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index e4a7e3c835..20b1819ac8 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1899,6 +1899,7 @@ struct xhci_hcd { #define XHCI_SG_TRB_CACHE_SIZE_QUIRK BIT_ULL(39) #define XHCI_NO_SOFT_RETRY BIT_ULL(40) #define XHCI_BROKEN_D3COLD BIT_ULL(41) +#define XHCI_EP_CTX_BROKEN_DCS BIT_ULL(42) unsigned int num_active_eps; unsigned int limit_active_eps; diff --git a/drivers/usb/musb/musb_dsps.c b/drivers/usb/musb/musb_dsps.c index ce9fc46c92..b5935834f9 100644 --- a/drivers/usb/musb/musb_dsps.c +++ b/drivers/usb/musb/musb_dsps.c @@ -899,11 +899,13 @@ static int dsps_probe(struct platform_device *pdev) if (usb_get_dr_mode(&pdev->dev) == USB_DR_MODE_PERIPHERAL) { ret = dsps_setup_optional_vbus_irq(pdev, glue); if (ret) - goto err; + goto unregister_pdev; } return 0; +unregister_pdev: + platform_device_unregister(glue->musb); err: pm_runtime_disable(&pdev->dev); iounmap(glue->usbss_base); diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 6cfb5d3360..a484ff5e4e 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ 
-246,11 +246,13 @@ static void option_instat_callback(struct urb *urb); /* These Quectel products use Quectel's vendor ID */ #define QUECTEL_PRODUCT_EC21 0x0121 #define QUECTEL_PRODUCT_EC25 0x0125 +#define QUECTEL_PRODUCT_EG91 0x0191 #define QUECTEL_PRODUCT_EG95 0x0195 #define QUECTEL_PRODUCT_BG96 0x0296 #define QUECTEL_PRODUCT_EP06 0x0306 #define QUECTEL_PRODUCT_EM12 0x0512 #define QUECTEL_PRODUCT_RM500Q 0x0800 +#define QUECTEL_PRODUCT_EC200S_CN 0x6002 #define QUECTEL_PRODUCT_EC200T 0x6026 #define CMOTECH_VENDOR_ID 0x16d8 @@ -1111,6 +1113,9 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC25, 0xff, 0xff, 0xff), .driver_info = NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC25, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG91, 0xff, 0xff, 0xff), + .driver_info = NUMEP2 }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG91, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG95, 0xff, 0xff, 0xff), .driver_info = NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG95, 0xff, 0, 0) }, @@ -1128,6 +1133,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x10), .driver_info = ZLP }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200S_CN, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200T, 0xff, 0, 0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, @@ -1227,6 +1233,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1203, 0xff), /* Telit LE910Cx (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1204, 0xff), /* Telit LE910Cx (MBIM) */ + .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920), diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c index 83da8236e3..c18bf8164b 100644 --- a/drivers/usb/serial/qcserial.c +++ b/drivers/usb/serial/qcserial.c @@ -165,6 +165,7 @@ static const struct usb_device_id id_table[] = { {DEVICE_SWI(0x1199, 0x907b)}, /* Sierra Wireless EM74xx */ {DEVICE_SWI(0x1199, 0x9090)}, /* Sierra Wireless EM7565 QDL */ {DEVICE_SWI(0x1199, 0x9091)}, /* Sierra Wireless EM7565 */ + {DEVICE_SWI(0x1199, 0x90d2)}, /* Sierra Wireless EM9191 QDL */ {DEVICE_SWI(0x413c, 0x81a2)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81a3)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81a4)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */ diff --git a/drivers/usb/typec/tcpm/tcpci.c b/drivers/usb/typec/tcpm/tcpci.c index 9858716698..c15eec9cc4 100644 --- a/drivers/usb/typec/tcpm/tcpci.c +++ b/drivers/usb/typec/tcpm/tcpci.c @@ -696,7 +696,7 @@ irqreturn_t tcpci_irq(struct tcpci *tcpci) tcpm_pd_receive(tcpci->port, &msg); } - if (status & TCPC_ALERT_EXTENDED_STATUS) { + if (tcpci->data->vbus_vsafe0v && (status & TCPC_ALERT_EXTENDED_STATUS)) { ret = regmap_read(tcpci->regmap, TCPC_EXTENDED_STATUS, &raw); if (!ret && (raw & 
TCPC_EXTENDED_STATUS_VSAFE0V)) tcpm_vbus_change(tcpci->port); diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index a4d37205df..7f2f3ff1b3 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -4876,6 +4876,7 @@ static void _tcpm_cc_change(struct tcpm_port *port, enum typec_cc_status cc1, tcpm_set_state(port, SRC_ATTACH_WAIT, 0); break; case SRC_ATTACHED: + case SRC_STARTUP: case SRC_SEND_CAPABILITIES: case SRC_READY: if (tcpm_port_is_disconnected(port) || diff --git a/drivers/usb/typec/tipd/core.c b/drivers/usb/typec/tipd/core.c index 21b3ae25c7..ea4cc0a6e4 100644 --- a/drivers/usb/typec/tipd/core.c +++ b/drivers/usb/typec/tipd/core.c @@ -625,10 +625,6 @@ static int tps6598x_probe(struct i2c_client *client) if (ret < 0) return ret; - fwnode = device_get_named_child_node(&client->dev, "connector"); - if (!fwnode) - return -ENODEV; - /* * This fwnode has a "compatible" property, but is never populated as a * struct device. Instead we simply parse it to read the properties. @@ -636,7 +632,9 @@ static int tps6598x_probe(struct i2c_client *client) * with existing DT files, we work around this by deleting any * fwnode_links to/from this fwnode. */ - fw_devlink_purge_absent_suppliers(fwnode); + fwnode = device_get_named_child_node(&client->dev, "connector"); + if (fwnode) + fw_devlink_purge_absent_suppliers(fwnode); tps->role_sw = fwnode_usb_role_switch_get(fwnode); if (IS_ERR(tps->role_sw)) { diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 294ba05e6f..bd56de7484 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1714,6 +1714,9 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); struct mlx5_vdpa_virtqueue *mvq; + if (!mvdev->actual_features) + return; + if (!is_index_valid(mvdev, idx)) return; @@ -2145,6 +2148,8 @@ static void clear_vqs_ready(struct mlx5_vdpa_net *ndev) for (i = 0; i < ndev->mvdev.max_vqs; i++) ndev->vqs[i].ready = false; + + ndev->mvdev.cvq.ready = false; } static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 29a38ecba1..26e3d90d1e 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -665,13 +665,11 @@ static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset, static int vduse_vdpa_reset(struct vdpa_device *vdpa) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - - if (vduse_dev_set_status(dev, 0)) - return -EIO; + int ret = vduse_dev_set_status(dev, 0); vduse_dev_reset(dev); - return 0; + return ret; } static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa) @@ -1593,8 +1591,10 @@ static int vduse_init(void) vduse_irq_wq = alloc_workqueue("vduse-irq", WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0); - if (!vduse_irq_wq) + if (!vduse_irq_wq) { + ret = -ENOMEM; goto err_wq; + } ret = vduse_domain_init(); if (ret) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 68198e0f2a..a03b5a99c2 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -565,7 +565,7 @@ static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) } struct vfio_pci_walk_info { - int (*fn)(struct pci_dev *, void *data); + int (*fn)(struct pci_dev *pdev, void *data); void *data; struct pci_dev *pdev; bool slot; diff --git a/drivers/vhost/vdpa.c 
b/drivers/vhost/vdpa.c index f41d081777..39039e0461 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -173,6 +173,10 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) if (status != 0 && (ops->get_status(vdpa) & ~status) != 0) return -EINVAL; + if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) && !(status & VIRTIO_CONFIG_S_DRIVER_OK)) + for (i = 0; i < nvqs; i++) + vhost_vdpa_unsetup_vq_irq(v, i); + if (status == 0) { ret = ops->reset(vdpa); if (ret) @@ -184,10 +188,6 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) for (i = 0; i < nvqs; i++) vhost_vdpa_setup_vq_irq(v, i); - if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) && !(status & VIRTIO_CONFIG_S_DRIVER_OK)) - for (i = 0; i < nvqs; i++) - vhost_vdpa_unsetup_vq_irq(v, i); - return 0; } @@ -322,7 +322,7 @@ static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp) struct eventfd_ctx *ctx; cb.callback = vhost_vdpa_config_cb; - cb.private = v->vdpa; + cb.private = v; if (copy_from_user(&fd, argp, sizeof(fd))) return -EFAULT; @@ -640,7 +640,7 @@ static int vhost_vdpa_va_map(struct vhost_vdpa *v, u64 offset, map_size, map_iova = iova; struct vdpa_map_file *map_file; struct vm_area_struct *vma; - int ret; + int ret = 0; mmap_read_lock(dev->mm); diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 2d2249914d..e7f1a831f9 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -2207,8 +2207,9 @@ config FB_HYPERV This framebuffer driver supports Microsoft Hyper-V Synthetic Video. config FB_SIMPLE - bool "Simple framebuffer support" - depends on (FB = y) && !DRM_SIMPLEDRM + tristate "Simple framebuffer support" + depends on FB + depends on !DRM_SIMPLEDRM select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT diff --git a/drivers/video/fbdev/gbefb.c b/drivers/video/fbdev/gbefb.c index c5b99a4861..6b4d5a7f3e 100644 --- a/drivers/video/fbdev/gbefb.c +++ b/drivers/video/fbdev/gbefb.c @@ -1267,7 +1267,7 @@ static struct platform_device *gbefb_device; static int __init gbefb_init(void) { int ret = platform_driver_register(&gbefb_driver); - if (!ret) { + if (IS_ENABLED(CONFIG_SGI_IP32) && !ret) { gbefb_device = platform_device_alloc("gbefb", 0); if (gbefb_device) { ret = platform_device_add(gbefb_device); diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 588e02fb91..236081afe9 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -239,6 +239,17 @@ static int virtio_dev_probe(struct device *_d) driver_features_legacy = driver_features; } + /* + * Some devices detect legacy solely via F_VERSION_1. Write + * F_VERSION_1 to force LE config space accesses before FEATURES_OK for + * these when needed. + */ + if (drv->validate && !virtio_legacy_is_little_endian() + && device_features & BIT_ULL(VIRTIO_F_VERSION_1)) { + dev->features = BIT_ULL(VIRTIO_F_VERSION_1); + dev->config->finalize_features(dev); + } + if (device_features & (1ULL << VIRTIO_F_VERSION_1)) dev->features = driver_features & device_features; else @@ -345,8 +356,13 @@ static int virtio_device_of_init(struct virtio_device *dev) ret = snprintf(compat, sizeof(compat), "virtio,device%x", dev->id.device); BUG_ON(ret >= sizeof(compat)); + /* + * On powerpc/pseries virtio devices are PCI devices so PCI + * vendor/device ids play the role of the "compatible" property. + * Simply don't init of_node in this case. 
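The virtio_dev_probe() hunk above applies only in a narrow case, which reads
more clearly as a single predicate (names as in the hunk):

    bool force_version_1 =
            drv->validate &&                      /* driver reads config early */
            !virtio_legacy_is_little_endian() &&  /* big-endian guest */
            (device_features & BIT_ULL(VIRTIO_F_VERSION_1));

When it holds, acking VIRTIO_F_VERSION_1 before validate() runs makes the
device serve config space little-endian, so those early reads parse correctly.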
+ */ if (!of_device_is_compatible(np, compat)) { - ret = -EINVAL; + ret = 0; goto out; } diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index b81fe4f7d4..bf59faeb3d 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1666,7 +1666,7 @@ config WDT_MTX1 config SIBYTE_WDOG tristate "Sibyte SoC hardware watchdog" - depends on CPU_SB1 || (MIPS && COMPILE_TEST) + depends on CPU_SB1 help Watchdog driver for the built in watchdog hardware in Sibyte SoC processors. There are apparently two watchdog timers diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 22f5aff0c1..1b2c3aca68 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -241,7 +241,7 @@ config XEN_PRIVCMD config XEN_ACPI_PROCESSOR tristate "Xen ACPI processor" - depends on XEN && XEN_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ + depends on XEN && XEN_PV_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ default m help This ACPI processor uploads Power Management information to the Xen @@ -259,7 +259,7 @@ config XEN_ACPI_PROCESSOR config XEN_MCE_LOG bool "Xen platform mcelog" - depends on XEN_DOM0 && X86_MCE + depends on XEN_PV_DOM0 && X86_MCE help Allow kernel fetching MCE error from Xen platform and converting it into Linux mcelog format for mcelog tools diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 43ebfe36ac..3a50f097ed 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -491,12 +491,12 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) } /* - * Stop waiting if either state is not BP_EAGAIN and ballooning action is - * needed, or if the credit has changed while state is BP_EAGAIN. + * Stop waiting if either state is BP_DONE and ballooning action is + * needed, or if the credit has changed while state is not BP_DONE. */ static bool balloon_thread_cond(enum bp_state state, long credit) { - if (state != BP_EAGAIN) + if (state == BP_DONE) credit = 0; return current_credit() != credit || kthread_should_stop(); @@ -516,10 +516,19 @@ static int balloon_thread(void *unused) set_freezable(); for (;;) { - if (state == BP_EAGAIN) - timeout = balloon_stats.schedule_delay * HZ; - else + switch (state) { + case BP_DONE: + case BP_ECANCELED: timeout = 3600 * HZ; + break; + case BP_EAGAIN: + timeout = balloon_stats.schedule_delay * HZ; + break; + case BP_WAIT: + timeout = HZ; + break; + } + credit = current_credit(); wait_event_freezable_timeout(balloon_thread_wq, diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 720a7b7abd..3369734108 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -257,7 +257,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata) LIST_HEAD(pagelist); struct mmap_gfn_state state; - /* We only support privcmd_ioctl_mmap_batch for auto translated. */ + /* We only support privcmd_ioctl_mmap_batch for non-auto-translated. 
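The balloon_thread() hunk above gives each state its own wakeup interval
instead of the old two-way split. As a table, with values straight from the
hunk:

    state                   timeout               meaning
    BP_DONE, BP_ECANCELED   3600 * HZ             idle, hourly re-check
    BP_EAGAIN               schedule_delay * HZ   configurable retry backoff
    BP_WAIT                 HZ                    transient, retry in a second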
*/ if (xen_feature(XENFEAT_auto_translated_physmap)) return -ENOSYS; @@ -420,7 +420,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) int rc; struct page **pages; - pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); + pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); if (pages == NULL) return -ENOMEM; @@ -428,7 +428,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) if (rc != 0) { pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, numpgs, rc); - kfree(pages); + kvfree(pages); return -ENOMEM; } BUG_ON(vma->vm_private_data != NULL); @@ -803,21 +803,21 @@ static long privcmd_ioctl_mmap_resource(struct file *file, unsigned int domid = (xdata.flags & XENMEM_rsrc_acq_caller_owned) ? DOMID_SELF : kdata.dom; - int num; + int num, *errs = (int *)pfns; + BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns)); num = xen_remap_domain_mfn_array(vma, kdata.addr & PAGE_MASK, - pfns, kdata.num, (int *)pfns, + pfns, kdata.num, errs, vma->vm_page_prot, - domid, - vma->vm_private_data); + domid); if (num < 0) rc = num; else if (num != kdata.num) { unsigned int i; for (i = 0; i < num; i++) { - rc = pfns[i]; + rc = errs[i]; if (rc < 0) break; } @@ -912,7 +912,7 @@ static void privcmd_close(struct vm_area_struct *vma) else pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", numpgs, rc); - kfree(pages); + kvfree(pages); } static vm_fault_t privcmd_fault(struct vm_fault *vmf) diff --git a/fs/9p/cache.c b/fs/9p/cache.c index eb2151fb60..1769a44f48 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -23,7 +23,7 @@ struct fscache_netfs v9fs_cache_netfs = { .version = 0, }; -/** +/* * v9fs_random_cachetag - Generate a random tag to be associated * with a new cache session. * @@ -233,7 +233,7 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data, unlock_page(page); } -/** +/* * __v9fs_readpage_from_fscache - read a page from cache * * Returns 0 if the pages are in cache and a BIO is submitted, @@ -268,7 +268,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) } } -/** +/* * __v9fs_readpages_from_fscache - read multiple pages from cache * * Returns 0 if the pages are in cache and a BIO is submitted, @@ -308,7 +308,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode, } } -/** +/* * __v9fs_readpage_to_fscache - write a page to the cache * */ diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 9d9de62592..b8863dd0de 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -19,18 +19,18 @@ #include "v9fs_vfs.h" #include "fid.h" +static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid) +{ + hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata); +} + + /** * v9fs_fid_add - add a fid to a dentry * @dentry: dentry that the fid is being added to * @fid: fid to add * */ - -static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid) -{ - hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata); -} - void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) { spin_lock(&dentry->d_lock); @@ -67,7 +67,7 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid) /** * v9fs_open_fid_add - add an open fid to an inode - * @dentry: inode that the fid is being added to + * @inode: inode that the fid is being added to * @fid: fid to add * */ diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index cdb99507ef..2e0fa7c932 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -155,6 +155,7 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root) /** * v9fs_parse_options - parse mount 
options into session structure * @v9ses: existing v9fs session information + * @opts: The mount option string * * Return 0 upon success, -ERRNO upon failure. */ @@ -542,12 +543,9 @@ extern int v9fs_error_init(void); static struct kobject *v9fs_kobj; #ifdef CONFIG_9P_FSCACHE -/** - * caches_show - list caches associated with a session - * - * Returns the size of buffer written. +/* + * List caches associated with a session */ - static ssize_t caches_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index cce9ace651..1c4f1b39cc 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -30,8 +30,7 @@ /** * v9fs_fid_readpage - read an entire page in from 9P - * - * @fid: fid being read + * @data: Opaque pointer to the fid being read * @page: structure to page * */ @@ -116,6 +115,8 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, /** * v9fs_release_page - release the private state associated with a page + * @page: The page to be released + * @gfp: The caller's allocation restrictions * * Returns 1 if the page can be released, false otherwise. */ @@ -129,9 +130,9 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) /** * v9fs_invalidate_page - Invalidate a page completely or partially - * - * @page: structure to page - * @offset: offset in the page + * @page: The page to be invalidated + * @offset: offset of the invalidated region + * @length: length of the invalidated region */ static void v9fs_invalidate_page(struct page *page, unsigned int offset, @@ -199,6 +200,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) /** * v9fs_launder_page - Writeback a dirty page + * @page: The page to be cleaned up + * * Returns 0 on success. */ @@ -219,6 +222,7 @@ static int v9fs_launder_page(struct page *page) /** * v9fs_direct_IO - 9P address space operation for direct I/O * @iocb: target I/O control block + * @iter: The data/buffer to use * * The presence of v9fs_direct_IO() in the address space ops vector * allowes open() O_DIRECT flags which would have failed otherwise. 
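The fs/9p hunks above and below are all of one kind: comments that are not
valid kernel-doc are demoted from /** to /*, and real kernel-doc blocks gain
the @parameter lines the tooling checks for. For reference, the format
scripts/kernel-doc expects:

    /**
     * function_name - one-line summary of what it does
     * @first:  description of the first parameter
     * @second: description of the second parameter
     *
     * Optional longer description. Return values are conventionally
     * documented in a "Return:" section.
     */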
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index aab5e65386..246235ebdb 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -359,14 +359,11 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, } /** - * v9fs_file_read - read from a file - * @filp: file pointer to read - * @udata: user data buffer to read data into - * @count: size of buffer - * @offset: offset at which to read data + * v9fs_file_read_iter - read from a file + * @iocb: The operation parameters + * @to: The buffer to read into * */ - static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -388,11 +385,9 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /** - * v9fs_file_write - write to a file - * @filp: file pointer to write - * @data: data buffer to write data from - * @count: size of buffer - * @offset: offset at which to write data + * v9fs_file_write_iter - write to a file + * @iocb: The operation parameters + * @from: The data to write * */ static ssize_t @@ -561,11 +556,9 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) } /** - * v9fs_mmap_file_read - read from a file - * @filp: file pointer to read - * @data: user data buffer to read data into - * @count: size of buffer - * @offset: offset at which to read data + * v9fs_mmap_file_read_iter - read from a file + * @iocb: The operation parameters + * @to: The buffer to read into * */ static ssize_t @@ -576,11 +569,9 @@ v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /** - * v9fs_mmap_file_write - write to a file - * @filp: file pointer to write - * @data: data buffer to write data from - * @count: size of buffer - * @offset: offset at which to write data + * v9fs_mmap_file_write_iter - write to a file + * @iocb: The operation parameters + * @from: The data to write * */ static ssize_t diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 795706520b..08f48b70a7 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -218,7 +218,7 @@ v9fs_blank_wstat(struct p9_wstat *wstat) /** * v9fs_alloc_inode - helper function to allocate an inode - * + * @sb: The superblock to allocate the inode from */ struct inode *v9fs_alloc_inode(struct super_block *sb) { @@ -238,7 +238,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) /** * v9fs_free_inode - destroy an inode - * + * @inode: The inode to be freed */ void v9fs_free_inode(struct inode *inode) @@ -343,7 +343,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, * v9fs_get_inode - helper function to setup an inode * @sb: superblock * @mode: mode to setup inode with - * + * @rdev: The device numbers to set */ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) @@ -369,7 +369,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) } /** - * v9fs_clear_inode - release an inode + * v9fs_evict_inode - Remove an inode from the inode cache * @inode: inode to release * */ @@ -665,14 +665,15 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, /** * v9fs_vfs_create - VFS hook to create a regular file + * @mnt_userns: The user namespace of the mount + * @dir: The parent directory + * @dentry: The name of file to be created + * @mode: The UNIX file mode to set + * @excl: True if the file must not yet exist * * open(.., O_CREAT) is handled in v9fs_vfs_atomic_open(). This is only called * for mknod(2). 
* - * @dir: directory inode that is being created - * @dentry: dentry that is being deleted - * @mode: create permissions - * */ static int @@ -696,6 +697,7 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, /** * v9fs_vfs_mkdir - VFS mkdir hook to create a directory + * @mnt_userns: The user namespace of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @mode: mode for new directory @@ -900,10 +902,12 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) /** * v9fs_vfs_rename - VFS hook to rename an inode + * @mnt_userns: The user namespace of the mount * @old_dir: old dir inode * @old_dentry: old dentry * @new_dir: new dir inode * @new_dentry: new dentry + * @flags: RENAME_* flags * */ @@ -1009,6 +1013,7 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, /** * v9fs_vfs_getattr - retrieve file metadata + * @mnt_userns: The user namespace of the mount * @path: Object to query * @stat: metadata structure to populate * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -1050,6 +1055,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, /** * v9fs_vfs_setattr - set file metadata + * @mnt_userns: The user namespace of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * @@ -1285,6 +1291,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, /** * v9fs_vfs_symlink - helper function to create symlinks + * @mnt_userns: The user namespace of the mount * @dir: directory inode containing symlink * @dentry: dentry for symlink * @symname: symlink data @@ -1340,6 +1347,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod - create a special file + * @mnt_userns: The user namespace of the mount * @dir: inode destination for new link * @dentry: dentry for file * @mode: mode for creation diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index e1c0240b51..01b9e1281a 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -37,7 +37,10 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** - * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * v9fs_get_fsgid_for_create - Helper function to get the gid for a new object + * @dir_inode: The directory inode + * + * Helper function to get the gid for creating a * new file system object. This checks the S_ISGID to determine the owning * group of the new file system object. */ @@ -211,12 +214,13 @@ int v9fs_open_to_dotl_flags(int flags) /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. 
+ * @mnt_userns: The user namespace of the mount * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @omode: create permissions + * @excl: True if the file must not yet exist * */ - static int v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t omode, bool excl) @@ -361,6 +365,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, /** * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @mnt_userns: The user namespace of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @omode: mode for new directory @@ -537,6 +542,7 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) /** * v9fs_vfs_setattr_dotl - set file metadata + * @mnt_userns: The user namespace of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * @@ -816,6 +822,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod_dotl - create a special file + * @mnt_userns: The user namespace of the mount * @dir: inode destination for new link * @dentry: dentry for file * @omode: mode for creation diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index dae9a57d7e..45cfd50a95 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -86,8 +86,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode return afs_do_sync_operation(op); } -/** - * afs_sillyrename - Perform a silly-rename of a dentry +/* + * Perform silly-rename of a dentry. * * AFS is stateless and the server doesn't know when the client is holding a * file open. To prevent application problems when a file is unlinked while diff --git a/fs/afs/write.c b/fs/afs/write.c index 2dfe3b3a53..f24370f5c7 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -974,8 +974,7 @@ int afs_launder_page(struct page *page) iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); trace_afs_page_dirty(vnode, tracepoint_string("launder"), page); - ret = afs_store_data(vnode, &iter, (loff_t)page->index * PAGE_SIZE, - true); + ret = afs_store_data(vnode, &iter, page_offset(page) + f, true); } trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 69d900a847..a813b70f59 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -630,7 +630,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) - elf_type |= MAP_FIXED_NOREPLACE; + elf_type |= MAP_FIXED; else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dff2c8a3e0..c0cebcf745 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3030,7 +3030,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index f1274d5c38..7721ce0c06 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -190,9 +190,20 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( } /* - * lookup a directory item based on name. 
'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Lookup for a directory item by name. + * + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir item does not exists, an error pointer if an error + * happened, or a pointer to a dir item if a dir item exists for the given name. */ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -273,27 +284,42 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, } /* - * lookup a directory item based on index. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Lookup for a directory index item by name and index number. * - * The name is used to make sure the index really points to the name you were - * looking for. + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @index: The index number. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir index item does not exists, an error pointer if an + * error happened, or a pointer to a dir item if the dir index item exists and + * matches the criteria (name and index number). 
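With the ENOENT-to-NULL mapping the function gains below, callers of both
lookup helpers handle exactly three outcomes, checked in this order:

    di = btrfs_lookup_dir_index_item(trans, root, path, dir, index,
                                     name, name_len, 0);
    if (IS_ERR(di))
            return PTR_ERR(di);     /* real failure: propagate */
    if (di)
            /* entry exists: use it */;
    else
            /* no such entry */;

The tree-log changes later in this patch are the callers being converted to
that IS_ERR()-first ladder.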
*/ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod) { + struct btrfs_dir_item *di; struct btrfs_key key; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = objectid; + key.offset = index; - return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (di == ERR_PTR(-ENOENT)) + return NULL; + + return di; } struct btrfs_dir_item * diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fc3da7585f..0ab456cb4b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4859,6 +4859,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, out_free_delayed: btrfs_free_delayed_extent_op(extent_op); out_free_buf: + btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7ff577005d..a1762363f6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -734,8 +734,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; - update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - root == fs_info->tree_root); + update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -2704,14 +2703,16 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, drop_args.bytes_found); if (ret != -ENOSPC) { /* - * When cloning we want to avoid transaction aborts when - * nothing was done and we are attempting to clone parts - * of inline extents, in such cases -EOPNOTSUPP is - * returned by __btrfs_drop_extents() without having - * changed anything in the file. + * The only time we don't want to abort is if we are + * attempting to clone a partial inline extent, in which + * case we'll get EOPNOTSUPP. However if we aren't + * clone we need to abort no matter what, because if we + * got EOPNOTSUPP via prealloc then we messed up and + * need to abort. */ - if (extent_info && !extent_info->is_new_extent && - ret && ret != -EOPNOTSUPP) + if (ret && + (ret != -EOPNOTSUPP || + (extent_info && extent_info->is_new_extent))) btrfs_abort_transaction(trans, ret); break; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f7efc26aa8..b415c5ec03 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -939,9 +939,11 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, } /* - * helper function to see if a given name and sequence number found - * in an inode back reference are already in a directory and correctly - * point to this inode + * See if a given name and sequence number found in an inode back reference are + * already in a directory and correctly point to this inode. + * + * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it + * exists. 
*/ static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, @@ -950,29 +952,34 @@ static noinline int inode_in_dir(struct btrfs_root *root, { struct btrfs_dir_item *di; struct btrfs_key location; - int match = 0; + int ret = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, index, name, name_len, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (di) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); if (location.objectid != objectid) goto out; - } else + } else { goto out; - btrfs_release_path(path); + } + btrfs_release_path(path); di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); - if (di && !IS_ERR(di)) { - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - if (location.objectid != objectid) - goto out; - } else + if (IS_ERR(di)) { + ret = PTR_ERR(di); goto out; - match = 1; + } else if (di) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid == objectid) + ret = 1; + } out: btrfs_release_path(path); - return match; + return ret; } /* @@ -1182,7 +1189,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), ref_index, name, namelen, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { ret = drop_one_dir_item(trans, root, path, dir, di); if (ret) return ret; @@ -1192,7 +1201,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { ret = drop_one_dir_item(trans, root, path, dir, di); if (ret) return ret; @@ -1517,10 +1528,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - /* if we already have a perfect match, we're done */ - if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, - name, namelen)) { + ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), + btrfs_ino(BTRFS_I(inode)), ref_index, + name, namelen); + if (ret < 0) { + goto out; + } else if (ret == 0) { /* * look for a conflicting back reference in the * metadata. if we find one we have to unlink that name @@ -1580,6 +1593,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; } + /* Else, ret == 1, we already have a perfect match, we're done. 
*/ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; kfree(name); @@ -1936,8 +1950,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_key log_key; struct inode *dir; u8 log_type; - int exists; - int ret = 0; + bool exists; + int ret; bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); bool name_added = false; @@ -1957,12 +1971,12 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len); btrfs_dir_item_key_to_cpu(eb, di, &log_key); - exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); - if (exists == 0) - exists = 1; - else - exists = 0; + ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); + if (ret < 0) + goto out; + exists = (ret == 0); + ret = 0; if (key->type == BTRFS_DIR_ITEM_KEY) { dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, @@ -1977,7 +1991,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = -EINVAL; goto out; } - if (IS_ERR_OR_NULL(dst_di)) { + + if (IS_ERR(dst_di)) { + ret = PTR_ERR(dst_di); + goto out; + } else if (!dst_di) { /* we need a sequence number to insert, so we only * do inserts for the BTRFS_DIR_INDEX_KEY types */ @@ -2281,7 +2299,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, dir_key->offset, name, name_len, 0); } - if (!log_di || log_di == ERR_PTR(-ENOENT)) { + if (!log_di) { btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); @@ -3540,8 +3558,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, if (err == -ENOSPC) { btrfs_set_log_full_commit(trans); err = 0; - } else if (err < 0 && err != -ENOENT) { - /* ENOENT can be returned if the entry hasn't been fsynced yet */ + } else if (err < 0) { btrfs_abort_transaction(trans, err); } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 8129a430d7..2f117c5716 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -528,7 +528,7 @@ void debugfs_create_file_size(const char *name, umode_t mode, { struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); - if (de) + if (!IS_ERR(de)) d_inode(de)->i_size = file_size; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ffb295aa89..74b172a4ad 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -551,7 +551,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct fname *fname; - int ret; + int ret = 0; if (!info) { info = ext4_htree_create_dir_info(file, ctx->pos); @@ -599,7 +599,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) info->curr_minor_hash, &info->next_hash); if (ret < 0) - return ret; + goto finished; if (ret == 0) { ctx->pos = ext4_get_htree_eof(file); break; @@ -630,7 +630,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) } finished: info->last_pos = ctx->pos; - return 0; + return ret < 0 ? 
ret : 0; } static int ext4_release_dir(struct inode *inode, struct file *filp) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 90ff5acaf1..3825195539 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3593,9 +3593,6 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, unsigned flags, struct page **pagep, void **fsdata); -extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c0de30f251..0e02571f2f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5916,7 +5916,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) } /* Check if *cur is a hole and if it is, skip it */ -static void skip_hole(struct inode *inode, ext4_lblk_t *cur) +static int skip_hole(struct inode *inode, ext4_lblk_t *cur) { int ret; struct ext4_map_blocks map; @@ -5925,9 +5925,12 @@ static void skip_hole(struct inode *inode, ext4_lblk_t *cur) map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur; ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; if (ret != 0) - return; + return 0; *cur = *cur + map.m_len; + return 0; } /* Count number of blocks used by this inode and update i_blocks */ @@ -5976,7 +5979,9 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) * iblocks by total number of differences found. */ cur = 0; - skip_hole(inode, &cur); + ret = skip_hole(inode, &cur); + if (ret < 0) + goto out; path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) goto out; @@ -5995,8 +6000,12 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) } cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); - skip_hole(inode, &cur); - + ret = skip_hole(inode, &cur); + if (ret < 0) { + ext4_ext_drop_refs(path); + kfree(path); + break; + } path2 = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path2)) { ext4_ext_drop_refs(path); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 8e610a3818..8ea5a81e65 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -892,6 +892,12 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) sizeof(lrange), (u8 *)&lrange, crc)) return -ENOSPC; } else { + unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? 
+ EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; + + /* Limit the number of blocks in one extent */ + map.m_len = min(max, map.m_len); + fc_ext.fc_ino = cpu_to_le32(inode->i_ino); ex = (struct ext4_extent *)&fc_ext.fc_ex; ex->ee_block = cpu_to_le32(map.m_lblk); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 82bf4ff6be..39a1ab129f 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "ext4.h" @@ -733,45 +734,83 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { - int ret, no_expand; + handle_t *handle = ext4_journal_current_handle(); + int no_expand; void *kaddr; struct ext4_iloc iloc; + int ret = 0, ret2; - if (unlikely(copied < len)) { - if (!PageUptodate(page)) { - copied = 0; + if (unlikely(copied < len) && !PageUptodate(page)) + copied = 0; + + if (likely(copied)) { + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + unlock_page(page); + put_page(page); + ext4_std_error(inode->i_sb, ret); goto out; } - } + ext4_write_lock_xattr(inode, &no_expand); + BUG_ON(!ext4_has_inline_data(inode)); - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - copied = 0; - goto out; - } + /* + * ei->i_inline_off may have changed since + * ext4_write_begin() called + * ext4_try_to_write_inline_data() + */ + (void) ext4_find_inline_data_nolock(inode); - ext4_write_lock_xattr(inode, &no_expand); - BUG_ON(!ext4_has_inline_data(inode)); + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); + kunmap_atomic(kaddr); + SetPageUptodate(page); + /* clear page dirty so that writepages wouldn't work for us. */ + ClearPageDirty(page); + + ext4_write_unlock_xattr(inode, &no_expand); + brelse(iloc.bh); + + /* + * It's important to update i_size while still holding page + * lock: page writeout could otherwise come in and zero + * beyond i_size. + */ + ext4_update_inode_size(inode, pos + copied); + } + unlock_page(page); + put_page(page); /* - * ei->i_inline_off may have changed since ext4_write_begin() - * called ext4_try_to_write_inline_data() + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. */ - (void) ext4_find_inline_data_nolock(inode); - - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, pos, len); - kunmap_atomic(kaddr); - SetPageUptodate(page); - /* clear page dirty so that writepages wouldn't work for us. */ - ClearPageDirty(page); - - ext4_write_unlock_xattr(inode, &no_expand); - brelse(iloc.bh); - mark_inode_dirty(inode); + if (likely(copied)) + mark_inode_dirty(inode); out: - return copied; + /* + * If we didn't copy as much data as expected, we need to trim back + * size of xattr containing inline data. + */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret ? 
ret : copied; } struct buffer_head * @@ -953,43 +992,6 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, return ret; } -int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page) -{ - int ret; - - ret = ext4_write_inline_data_end(inode, pos, len, copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - return ret; - } - copied = ret; - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - * - * But it's important to update i_size while still holding page lock: - * page writeout could otherwise come in and zero beyond i_size. - */ - if (pos+copied > inode->i_size) - i_size_write(inode, pos+copied); - unlock_page(page); - put_page(page); - - /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling - * filesystems. - */ - mark_inode_dirty(inode); - - return copied; -} - #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) @@ -1917,6 +1919,24 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) EXT4_I(inode)->i_disksize = i_size; if (i_size < inline_size) { + /* + * if there's inline data to truncate and this file was + * converted to extents after that inline data was written, + * the extent status cache must be cleared to avoid leaving + * behind stale delayed allocated extent entries + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { +retry: + err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + if (err == -ENOMEM) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + if (err) + goto out_error; + } + /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d18852d602..0f06305167 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1284,22 +1284,14 @@ static int ext4_write_end(struct file *file, loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - } else - copied = block_write_end(file, mapping, pos, - len, copied, page, fsdata); + + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1320,7 +1312,7 @@ static int ext4_write_end(struct file *file, * ordering of page lock and transaction start for journaling * filesystems. */ - if (i_size_changed || inline_data) + if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) @@ -1329,7 +1321,7 @@ static int ext4_write_end(struct file *file, * inode->i_size. 
So truncate them */ ext4_orphan_add(handle, inode); -errout: + ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1395,7 +1387,6 @@ static int ext4_journalled_write_end(struct file *file, int partial = 0; unsigned from, to; int size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); @@ -1404,16 +1395,10 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - } else if (unlikely(copied < len) && !PageUptodate(page)) { + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, page, from, to); } else { @@ -1436,7 +1421,7 @@ static int ext4_journalled_write_end(struct file *file, if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); - if (size_changed || inline_data) { + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -1449,7 +1434,6 @@ static int ext4_journalled_write_end(struct file *file, */ ext4_orphan_add(handle, inode); -errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1644,6 +1628,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool allocated = false; + bool reserved = false; /* * If the cluster containing lblk is shared with a delayed, @@ -1660,6 +1645,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ goto errout; + reserved = true; } else { /* bigalloc */ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { if (!ext4_es_scan_clu(inode, @@ -1672,6 +1658,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ goto errout; + reserved = true; } else { allocated = true; } @@ -1682,6 +1669,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) } ret = ext4_es_insert_delayed_block(inode, lblk, allocated); + if (ret && reserved) + ext4_da_release_space(inode, 1); errout: return ret; @@ -1722,13 +1711,16 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, } /* - * Delayed extent could be allocated by fallocate. - * So we need to check it. 
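The ext4_insert_delayed_block() hunk above fixes a reservation leak: the function now records whether this particular call path took the block reservation, and releases it again when the later extent-status insert fails, so the error path no longer strands reserved space. A minimal userspace sketch of that rollback idiom; reserve(), release() and insert() here are hypothetical stand-ins, not the real ext4 helpers:

    #include <errno.h>
    #include <stdio.h>

    static int reserved_blocks;               /* stands in for per-inode accounting */

    static int  reserve(void)      { reserved_blocks++; return 0; }
    static void release(void)      { reserved_blocks--; }
    static int  insert(int fail)   { return fail ? -ENOMEM : 0; }

    /* Mirrors the hunk: only the caller that took the reservation rolls it back. */
    static int insert_delayed(int already_allocated, int insert_fails)
    {
        int ret = 0, reserved = 0;

        if (!already_allocated) {
            ret = reserve();
            if (ret)
                return ret;
            reserved = 1;
        }

        ret = insert(insert_fails);
        if (ret && reserved)
            release();             /* undo only what this call reserved */
        return ret;
    }

    int main(void)
    {
        insert_delayed(0, 1);      /* a failing insert must not leak the reservation */
        printf("outstanding reservations: %d\n", reserved_blocks);  /* prints 0 */
        return 0;
    }

The point of the reserved flag is that only the path that actually took the reservation may undo it; an unconditional release would corrupt the counter in the bigalloc case where the cluster was already allocated.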
+ * the buffer head associated with a delayed and not unwritten + * block found in the extent status cache must contain an + * invalid block number and have its BH_New and BH_Delay bits + * set, reflecting the state assigned when the block was + * initially delayed allocated */ - if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); + if (ext4_es_is_delonly(&es)) { + BUG_ON(bh->b_blocknr != invalid_block); + BUG_ON(!buffer_new(bh)); + BUG_ON(!buffer_delay(bh)); return 0; } @@ -2932,19 +2924,6 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } -/* We always reserve for an inode update; the superblock could be there too */ -static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) -{ - if (likely(ext4_has_feature_large_file(inode->i_sb))) - return 1; - - if (pos + len <= 0x7fffffffULL) - return 1; - - /* We might need to update the superblock to set LARGE_FILE */ - return 2; -} - static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2953,7 +2932,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct page *page; pgoff_t index; struct inode *inode = mapping->host; - handle_t *handle; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; @@ -2979,41 +2957,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, return 0; } - /* - * grab_cache_page_write_begin() can take a long time if the - * system is thrashing due to memory pressure, or if the page - * is being written back. So grab it first before we start - * the transaction handle. This also allows us to allocate - * the page (if needed) without using GFP_NOFS. - */ -retry_grab: +retry: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; - unlock_page(page); - /* - * With delayed allocation, we don't log the i_disksize update - * if there is delayed block allocation. But we still need - * to journalling the i_disksize update if writes to the end - * of file which has an already mapped buffer. - */ -retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_da_write_credits(inode, pos, len)); - if (IS_ERR(handle)) { - put_page(page); - return PTR_ERR(handle); - } - - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - unlock_page(page); - put_page(page); - ext4_journal_stop(handle); - goto retry_grab; - } /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); @@ -3025,20 +2973,18 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, #endif if (ret < 0) { unlock_page(page); - ext4_journal_stop(handle); + put_page(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. + * i_size_read because we hold inode lock. 
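With the journal handle gone from the delalloc path, ext4_da_write_begin() above collapses to a single retry loop: grab the page, try to prepare the write, and on a transient -ENOSPC drop the page and start over. A rough sketch of that control flow, with simulated stand-ins for grab_cache_page_write_begin() and the block allocation; the budget counter is an assumption used only to make the retry observable:

    #include <errno.h>
    #include <stdio.h>

    static int grab_page(void)            { return 0; }
    static int prepare_write(int *budget) { return (*budget)-- > 0 ? -ENOSPC : 0; }
    static int should_retry_alloc(int r)  { return r == -ENOSPC; }

    static int write_begin(void)
    {
        int enospc_budget = 3;    /* simulate three transient allocation failures */
        int ret;

    retry:
        if (grab_page())
            return -ENOMEM;
        ret = prepare_write(&enospc_budget);
        if (ret < 0) {
            /* drop the page before retrying, as the hunk does with put_page() */
            if (should_retry_alloc(ret))
                goto retry;
            return ret;
        }
        return 0;
    }

    int main(void) { printf("write_begin() = %d\n", write_begin()); return 0; }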
*/ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_journal; - - put_page(page); + goto retry; return ret; } @@ -3075,8 +3021,6 @@ static int ext4_da_write_end(struct file *file, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - int ret = 0, ret2; - handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; @@ -3086,44 +3030,36 @@ static int ext4_da_write_end(struct file *file, len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); - start = pos & (PAGE_SIZE - 1); - end = start + copied - 1; - - /* - * generic_write_end() will run mark_inode_dirty() if i_size - * changes. So let's piggyback the i_disksize mark_inode_dirty - * into that. - */ - new_i_size = pos + copied; - if (copied && new_i_size > EXT4_I(inode)->i_disksize) { - if (ext4_has_inline_data(inode) || - ext4_da_should_update_i_disksize(page, end)) { - ext4_update_i_disksize(inode, new_i_size); - /* We need to mark inode dirty even if - * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) - */ - ret = ext4_mark_inode_dirty(handle, inode); - } - } if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) - ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, - page); - else - ret2 = generic_write_end(file, mapping, pos, len, copied, - page, fsdata); + return ext4_write_inline_data_end(inode, pos, len, copied, page); - copied = ret2; - if (ret2 < 0) - ret = ret2; - ret2 = ext4_journal_stop(handle); - if (unlikely(ret2 && !ret)) - ret = ret2; + start = pos & (PAGE_SIZE - 1); + end = start + copied - 1; - return ret ? ret : copied; + /* + * Since we are holding inode lock, we are sure i_disksize <= + * i_size. We also know that if i_disksize < i_size, there are + * delalloc writes pending in the range upto i_size. If the end of + * the current write is <= i_size, there's no need to touch + * i_disksize since writeback will push i_disksize upto i_size + * eventually. If the end of the current write is > i_size and + * inside an allocated block (ext4_da_should_update_i_disksize() + * check), we need to update i_disksize here as neither + * ext4_writepage() nor certain ext4_writepages() paths not + * allocating blocks update i_disksize. + * + * Note that we defer inode dirtying to generic_write_end() / + * ext4_da_write_inline_data_end(). + */ + new_i_size = pos + copied; + if (copied && new_i_size > inode->i_size && + ext4_da_should_update_i_disksize(page, end)) + ext4_update_i_disksize(inode, new_i_size); + + return generic_write_end(file, mapping, pos, len, copied, page, fsdata); } /* @@ -4340,6 +4276,12 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, goto has_buffer; lock_buffer(bh); + if (ext4_buffer_uptodate(bh)) { + /* Someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0775950ee8..88d5d274a8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -658,7 +658,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. 
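The __ext4_get_inode_loc() change above is the classic check/lock/re-check pattern: the uptodate test is repeated after lock_buffer() returns, because another task may have completed the read while we slept on the lock, and re-submitting the I/O would be wasted work. The same shape in portable C, with a pthread mutex standing in for the buffer lock and a flag standing in for buffer_uptodate():

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int uptodate;                  /* stands in for buffer_uptodate(bh) */

    static void slow_read(void) { uptodate = 1; }   /* pretend submit + wait */

    /* Mirrors the hunk: test, lock, then test again before the slow path. */
    static void get_block(void)
    {
        if (uptodate)
            return;                       /* fast path, no lock taken */

        pthread_mutex_lock(&lock);
        if (uptodate) {                   /* someone finished it while we waited */
            pthread_mutex_unlock(&lock);
            return;
        }
        slow_read();
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        get_block();
        printf("uptodate = %d\n", uptodate);
        return 0;
    }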
*/ - if (continue_fs) + if (continue_fs && journal) schedule_work(&EXT4_SB(sb)->s_error_work); else ext4_commit_super(sb); @@ -1350,6 +1350,12 @@ static void ext4_destroy_inode(struct inode *inode) true); dump_stack(); } + + if (EXT4_I(inode)->i_reserved_data_blocks) + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", + inode->i_ino, EXT4_I(inode), + EXT4_I(inode)->i_reserved_data_blocks); } static void init_once(void *foo) @@ -3021,17 +3027,17 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { - loff_t res = EXT4_NDIR_BLOCKS; + unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; - loff_t upper_limit; - /* This is calculated to be the largest file size for a dense, block + + /* + * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ - if (!has_huge_files) { /* * !has_huge_files or implies that the inode i_block field @@ -3074,7 +3080,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; - return res; + return (loff_t)res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, @@ -5042,12 +5048,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { + /* flush s_error_work before journal destroy. */ + flush_work(&sbi->s_error_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: + /* flush s_error_work before sbi destroy */ flush_work(&sbi->s_error_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index f346a78f4b..6a67565212 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -77,7 +77,6 @@ static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object); static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready); static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation); static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object); -static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object); static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available); static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents); @@ -907,6 +906,7 @@ static void fscache_dequeue_object(struct fscache_object *object) * @object: The object to ask about * @data: The auxiliary data for the object * @datalen: The size of the auxiliary data + * @object_size: The size of the object according to the server. * * This function consults the netfs about the coherency state of an object. 
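The ext4_max_bitmap_size() hunk above moves the intermediate sector arithmetic into unsigned long long so the 2^48-sector ceiling cannot overflow a signed loff_t before the final clamp and cast. A compact sketch of the same idea, with the indirect-block (meta_blocks) accounting deliberately omitted, so this is an illustration of the unsigned-math fix rather than the full calculation:

    #include <stdio.h>

    /* Do the sector math unsigned, clamp, and only cast to the signed
     * loff_t-like type at the very end. */
    static long long max_bitmap_size(int blkbits)
    {
        unsigned long long upper_limit = (1ULL << 48) - 1;  /* 2^48 - 1 sectors */
        unsigned long long res;

        upper_limit >>= (blkbits - 9);     /* 512-byte sectors -> fs blocks */
        res = upper_limit << blkbits;      /* blocks -> bytes, still unsigned */

        if (res > 0x7fffffffffffffffULL)   /* clamp below LLONG_MAX */
            res = 0x7fffffffffffffffULL;
        return (long long)res;
    }

    int main(void)
    {
        printf("max size at 4K blocks: %lld\n", max_bitmap_size(12));
        return 0;
    }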
* The caller must be holding a ref on cookie->n_active (held by diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 4338771077..e002cdfaf3 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -22,7 +22,10 @@ static void fscache_operation_dummy_cancel(struct fscache_operation *op) /** * fscache_operation_init - Do basic initialisation of an operation + * @cookie: The cookie to operate on * @op: The operation to initialise + * @processor: The function to perform the operation + * @cancel: A function to handle operation cancellation * @release: The release function to assign * * Do basic initialisation of an operation. The caller must still set flags, diff --git a/fs/io-wq.c b/fs/io-wq.c index c2360cdc40..5bf8aa8171 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -584,10 +584,7 @@ static int io_wqe_worker(void *data) if (!get_signal(&ksig)) continue; - if (fatal_signal_pending(current) || - signal_group_exit(current->signal)) - break; - continue; + break; } last_timeout = !ret; } diff --git a/fs/io_uring.c b/fs/io_uring.c index 82f867983b..e68d27829b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -403,7 +403,6 @@ struct io_ring_ctx { struct wait_queue_head cq_wait; unsigned cq_extra; atomic_t cq_timeouts; - struct fasync_struct *cq_fasync; unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; @@ -1614,10 +1613,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->poll_wait)) { + if (waitqueue_active(&ctx->poll_wait)) wake_up_interruptible(&ctx->poll_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) @@ -1631,10 +1628,8 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->poll_wait)) { + if (waitqueue_active(&ctx->poll_wait)) wake_up_interruptible(&ctx->poll_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } } /* Returns true if there are no backlogged entries after the flush */ @@ -2954,7 +2949,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_ring_ctx *ctx = req->ctx; req_set_fail(req); - if (issue_flags & IO_URING_F_NONBLOCK) { + if (!(issue_flags & IO_URING_F_NONBLOCK)) { mutex_lock(&ctx->uring_lock); __io_req_complete(req, issue_flags, ret, cflags); mutex_unlock(&ctx->uring_lock); @@ -9345,13 +9340,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) return mask; } -static int io_uring_fasync(int fd, struct file *file, int on) -{ - struct io_ring_ctx *ctx = file->private_data; - - return fasync_helper(fd, file, on, &ctx->cq_fasync); -} - static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { const struct cred *creds; @@ -10145,7 +10133,6 @@ static const struct file_operations io_uring_fops = { .mmap_capabilities = io_uring_nommu_mmap_capabilities, #endif .poll = io_uring_poll, - .fasync = io_uring_fasync, #ifdef CONFIG_PROC_FS .show_fdinfo = io_uring_show_fdinfo, #endif diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index ba581429bf..8e0a1378a4 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1111,13 +1111,25 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* attach dentry and inode */ - if (kn && kernfs_active(kn)) { + if (kn) { + /* Inactive nodes are invisible to the VFS so don't + * create a 
negative. + */ + if (!kernfs_active(kn)) { + up_read(&kernfs_rwsem); + return NULL; + } inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) inode = ERR_PTR(-ENOMEM); } - /* Needed only for negative dentry validation */ - if (!inode) + /* + * Needed for negative dentry validation. + * The negative dentry can be created in kernfs_iop_lookup() + * or transforms from positive dentry in dentry_unlink_inode() + * called from vfs_rmdir(). + */ + if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); up_read(&kernfs_rwsem); diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index de36f12070..71c989f156 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -68,125 +68,6 @@ void ksmbd_copy_gss_neg_header(void *buf) memcpy(buf, NEGOTIATE_GSS_HEADER, AUTH_GSS_LENGTH); } -static void -str_to_key(unsigned char *str, unsigned char *key) -{ - int i; - - key[0] = str[0] >> 1; - key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2); - key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3); - key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4); - key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5); - key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6); - key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7); - key[7] = str[6] & 0x7F; - for (i = 0; i < 8; i++) - key[i] = (key[i] << 1); -} - -static int -smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) -{ - unsigned char key2[8]; - struct des_ctx ctx; - - if (fips_enabled) { - ksmbd_debug(AUTH, "FIPS compliance enabled: DES not permitted\n"); - return -ENOENT; - } - - str_to_key(key, key2); - des_expand_key(&ctx, key2, DES_KEY_SIZE); - des_encrypt(&ctx, out, in); - memzero_explicit(&ctx, sizeof(ctx)); - return 0; -} - -static int ksmbd_enc_p24(unsigned char *p21, const unsigned char *c8, unsigned char *p24) -{ - int rc; - - rc = smbhash(p24, c8, p21); - if (rc) - return rc; - rc = smbhash(p24 + 8, c8, p21 + 7); - if (rc) - return rc; - return smbhash(p24 + 16, c8, p21 + 14); -} - -/* produce a md4 message digest from data of length n bytes */ -static int ksmbd_enc_md4(unsigned char *md4_hash, unsigned char *link_str, - int link_len) -{ - int rc; - struct ksmbd_crypto_ctx *ctx; - - ctx = ksmbd_crypto_ctx_find_md4(); - if (!ctx) { - ksmbd_debug(AUTH, "Crypto md4 allocation error\n"); - return -ENOMEM; - } - - rc = crypto_shash_init(CRYPTO_MD4(ctx)); - if (rc) { - ksmbd_debug(AUTH, "Could not init md4 shash\n"); - goto out; - } - - rc = crypto_shash_update(CRYPTO_MD4(ctx), link_str, link_len); - if (rc) { - ksmbd_debug(AUTH, "Could not update with link_str\n"); - goto out; - } - - rc = crypto_shash_final(CRYPTO_MD4(ctx), md4_hash); - if (rc) - ksmbd_debug(AUTH, "Could not generate md4 hash\n"); -out: - ksmbd_release_crypto_ctx(ctx); - return rc; -} - -static int ksmbd_enc_update_sess_key(unsigned char *md5_hash, char *nonce, - char *server_challenge, int len) -{ - int rc; - struct ksmbd_crypto_ctx *ctx; - - ctx = ksmbd_crypto_ctx_find_md5(); - if (!ctx) { - ksmbd_debug(AUTH, "Crypto md5 allocation error\n"); - return -ENOMEM; - } - - rc = crypto_shash_init(CRYPTO_MD5(ctx)); - if (rc) { - ksmbd_debug(AUTH, "Could not init md5 shash\n"); - goto out; - } - - rc = crypto_shash_update(CRYPTO_MD5(ctx), server_challenge, len); - if (rc) { - ksmbd_debug(AUTH, "Could not update with challenge\n"); - goto out; - } - - rc = crypto_shash_update(CRYPTO_MD5(ctx), nonce, len); - if (rc) { - ksmbd_debug(AUTH, "Could not update with nonce\n"); - goto out; - } - - rc = crypto_shash_final(CRYPTO_MD5(ctx), md5_hash); - if (rc) - ksmbd_debug(AUTH, "Could not generate md5 hash\n"); -out: - 
ksmbd_release_crypto_ctx(ctx); - return rc; -} - /** * ksmbd_gen_sess_key() - function to generate session key * @sess: session of connection @@ -324,43 +205,6 @@ static int calc_ntlmv2_hash(struct ksmbd_session *sess, char *ntlmv2_hash, return ret; } -/** - * ksmbd_auth_ntlm() - NTLM authentication handler - * @sess: session of connection - * @pw_buf: NTLM challenge response - * @passkey: user password - * - * Return: 0 on success, error number on error - */ -int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf) -{ - int rc; - unsigned char p21[21]; - char key[CIFS_AUTH_RESP_SIZE]; - - memset(p21, '\0', 21); - memcpy(p21, user_passkey(sess->user), CIFS_NTHASH_SIZE); - rc = ksmbd_enc_p24(p21, sess->ntlmssp.cryptkey, key); - if (rc) { - pr_err("password processing failed\n"); - return rc; - } - - ksmbd_enc_md4(sess->sess_key, user_passkey(sess->user), - CIFS_SMB1_SESSKEY_SIZE); - memcpy(sess->sess_key + CIFS_SMB1_SESSKEY_SIZE, key, - CIFS_AUTH_RESP_SIZE); - sess->sequence_number = 1; - - if (strncmp(pw_buf, key, CIFS_AUTH_RESP_SIZE) != 0) { - ksmbd_debug(AUTH, "ntlmv1 authentication failed\n"); - return -EINVAL; - } - - ksmbd_debug(AUTH, "ntlmv1 authentication pass\n"); - return 0; -} - /** * ksmbd_auth_ntlmv2() - NTLMv2 authentication handler * @sess: session of connection @@ -441,44 +285,6 @@ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, return rc; } -/** - * __ksmbd_auth_ntlmv2() - NTLM2(extended security) authentication handler - * @sess: session of connection - * @client_nonce: client nonce from LM response. - * @ntlm_resp: ntlm response data from client. - * - * Return: 0 on success, error number on error - */ -static int __ksmbd_auth_ntlmv2(struct ksmbd_session *sess, char *client_nonce, - char *ntlm_resp) -{ - char sess_key[CIFS_SMB1_SESSKEY_SIZE] = {0}; - int rc; - unsigned char p21[21]; - char key[CIFS_AUTH_RESP_SIZE]; - - rc = ksmbd_enc_update_sess_key(sess_key, - client_nonce, - (char *)sess->ntlmssp.cryptkey, 8); - if (rc) { - pr_err("password processing failed\n"); - goto out; - } - - memset(p21, '\0', 21); - memcpy(p21, user_passkey(sess->user), CIFS_NTHASH_SIZE); - rc = ksmbd_enc_p24(p21, sess_key, key); - if (rc) { - pr_err("password processing failed\n"); - goto out; - } - - if (memcmp(ntlm_resp, key, CIFS_AUTH_RESP_SIZE) != 0) - rc = -EINVAL; -out: - return rc; -} - /** * ksmbd_decode_ntlmssp_auth_blob() - helper function to construct * authenticate blob @@ -512,17 +318,6 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, nt_off = le32_to_cpu(authblob->NtChallengeResponse.BufferOffset); nt_len = le16_to_cpu(authblob->NtChallengeResponse.Length); - /* process NTLM authentication */ - if (nt_len == CIFS_AUTH_RESP_SIZE) { - if (le32_to_cpu(authblob->NegotiateFlags) & - NTLMSSP_NEGOTIATE_EXTENDED_SEC) - return __ksmbd_auth_ntlmv2(sess, (char *)authblob + - lm_off, (char *)authblob + nt_off); - else - return ksmbd_auth_ntlm(sess, (char *)authblob + - nt_off); - } - /* TODO : use domain name that imported from configuration file */ domain_name = smb_strndup_from_utf16((const char *)authblob + le32_to_cpu(authblob->DomainName.BufferOffset), diff --git a/fs/ksmbd/crypto_ctx.c b/fs/ksmbd/crypto_ctx.c index 5f4b1008d1..81488d0419 100644 --- a/fs/ksmbd/crypto_ctx.c +++ b/fs/ksmbd/crypto_ctx.c @@ -81,12 +81,6 @@ static struct shash_desc *alloc_shash_desc(int id) case CRYPTO_SHASH_SHA512: tfm = crypto_alloc_shash("sha512", 0, 0); break; - case CRYPTO_SHASH_MD4: - tfm = crypto_alloc_shash("md4", 0, 0); - break; - case 
CRYPTO_SHASH_MD5: - tfm = crypto_alloc_shash("md5", 0, 0); - break; default: return NULL; } @@ -214,16 +208,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void) return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA512); } -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md4(void) -{ - return ____crypto_shash_ctx_find(CRYPTO_SHASH_MD4); -} - -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md5(void) -{ - return ____crypto_shash_ctx_find(CRYPTO_SHASH_MD5); -} - static struct ksmbd_crypto_ctx *____crypto_aead_ctx_find(int id) { struct ksmbd_crypto_ctx *ctx; diff --git a/fs/ksmbd/crypto_ctx.h b/fs/ksmbd/crypto_ctx.h index ef11154b43..4a367c62f6 100644 --- a/fs/ksmbd/crypto_ctx.h +++ b/fs/ksmbd/crypto_ctx.h @@ -15,8 +15,6 @@ enum { CRYPTO_SHASH_CMACAES, CRYPTO_SHASH_SHA256, CRYPTO_SHASH_SHA512, - CRYPTO_SHASH_MD4, - CRYPTO_SHASH_MD5, CRYPTO_SHASH_MAX, }; @@ -43,8 +41,6 @@ struct ksmbd_crypto_ctx { #define CRYPTO_CMACAES(c) ((c)->desc[CRYPTO_SHASH_CMACAES]) #define CRYPTO_SHA256(c) ((c)->desc[CRYPTO_SHASH_SHA256]) #define CRYPTO_SHA512(c) ((c)->desc[CRYPTO_SHASH_SHA512]) -#define CRYPTO_MD4(c) ((c)->desc[CRYPTO_SHASH_MD4]) -#define CRYPTO_MD5(c) ((c)->desc[CRYPTO_SHASH_MD5]) #define CRYPTO_HMACMD5_TFM(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]->tfm) #define CRYPTO_HMACSHA256_TFM(c)\ @@ -52,8 +48,6 @@ struct ksmbd_crypto_ctx { #define CRYPTO_CMACAES_TFM(c) ((c)->desc[CRYPTO_SHASH_CMACAES]->tfm) #define CRYPTO_SHA256_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA256]->tfm) #define CRYPTO_SHA512_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA512]->tfm) -#define CRYPTO_MD4_TFM(c) ((c)->desc[CRYPTO_SHASH_MD4]->tfm) -#define CRYPTO_MD5_TFM(c) ((c)->desc[CRYPTO_SHASH_MD5]->tfm) #define CRYPTO_GCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_GCM]) #define CRYPTO_CCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_CCM]) @@ -64,8 +58,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacsha256(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void); -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md4(void); -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md5(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_gcm(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_ccm(void); void ksmbd_crypto_destroy(void); diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index 6a19f4bc69..60e7ac62c9 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -162,17 +162,14 @@ char *convert_to_nt_pathname(char *filename) { char *ab_pathname; - if (strlen(filename) == 0) { - ab_pathname = kmalloc(2, GFP_KERNEL); - ab_pathname[0] = '\\'; - ab_pathname[1] = '\0'; - } else { - ab_pathname = kstrdup(filename, GFP_KERNEL); - if (!ab_pathname) - return NULL; + if (strlen(filename) == 0) + filename = "\\"; - ksmbd_conv_path_to_windows(ab_pathname); - } + ab_pathname = kstrdup(filename, GFP_KERNEL); + if (!ab_pathname) + return NULL; + + ksmbd_conv_path_to_windows(ab_pathname); return ab_pathname; } diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index 16b6236d1b..f9dae6ef21 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1451,26 +1451,47 @@ struct lease_ctx_info *parse_lease_state(void *open_req) */ struct create_context *smb2_find_context_vals(void *open_req, const char *tag) { - char *data_offset; struct create_context *cc; unsigned int next = 0; char *name; struct smb2_create_req *req = (struct smb2_create_req *)open_req; + unsigned int remain_len, name_off, name_len, value_off, value_len, + cc_len; - data_offset = (char *)req + 4 + 
le32_to_cpu(req->CreateContextsOffset); - cc = (struct create_context *)data_offset; + /* + * CreateContextsOffset and CreateContextsLength are guaranteed to + * be valid because of ksmbd_smb2_check_message(). + */ + cc = (struct create_context *)((char *)req + 4 + + le32_to_cpu(req->CreateContextsOffset)); + remain_len = le32_to_cpu(req->CreateContextsLength); do { - int val; - cc = (struct create_context *)((char *)cc + next); - name = le16_to_cpu(cc->NameOffset) + (char *)cc; - val = le16_to_cpu(cc->NameLength); - if (val < 4) + if (remain_len < offsetof(struct create_context, Buffer)) return ERR_PTR(-EINVAL); - if (memcmp(name, tag, val) == 0) - return cc; next = le32_to_cpu(cc->Next); + name_off = le16_to_cpu(cc->NameOffset); + name_len = le16_to_cpu(cc->NameLength); + value_off = le16_to_cpu(cc->DataOffset); + value_len = le32_to_cpu(cc->DataLength); + cc_len = next ? next : remain_len; + + if ((next & 0x7) != 0 || + next > remain_len || + name_off != offsetof(struct create_context, Buffer) || + name_len < 4 || + name_off + name_len > cc_len || + (value_off & 0x7) != 0 || + (value_off && (value_off < name_off + name_len)) || + ((u64)value_off + value_len > cc_len)) + return ERR_PTR(-EINVAL); + + name = (char *)cc + name_off; + if (memcmp(name, tag, name_len) == 0) + return cc; + + remain_len -= next; } while (next != 0); return NULL; diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 9aa46bb3e1..9edd9c161b 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -80,18 +80,21 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { }; /* - * Returns the pointer to the beginning of the data area. Length of the data - * area and the offset to it (from the beginning of the smb are also returned. + * Set length of the data area and the offset to arguments. + * if they are invalid, return error. */ -static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) +static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, + struct smb2_hdr *hdr) { + int ret = 0; + *off = 0; *len = 0; /* error reqeusts do not have data area */ if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED && (((struct smb2_err_rsp *)hdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2_LE) - return NULL; + return ret; /* * Following commands have data areas so we have to get the location @@ -165,69 +168,60 @@ static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) case SMB2_IOCTL: *off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset); *len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount); - break; default: ksmbd_debug(SMB, "no length check for command\n"); break; } - /* - * Invalid length or offset probably means data area is invalid, but - * we have little choice but to ignore the data area in this case. 
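smb2_find_context_vals() above now validates every offset/length pair read from the wire before dereferencing it, and widens to u64 so that offset + length cannot wrap a 32-bit sum, matching the "(u64)value_off + value_len > cc_len" test in the hunk. That check isolated into a helper, with hypothetical argument names:

    #include <stdint.h>
    #include <stdio.h>

    /* Validate an (offset, length) pair against a buffer of buf_len bytes.
     * Widening to uint64_t before the addition makes the check immune to
     * 32-bit wraparound. */
    static int range_ok(uint32_t off, uint32_t len, uint32_t buf_len)
    {
        if (off > buf_len)
            return 0;
        if ((uint64_t)off + len > buf_len)
            return 0;
        return 1;
    }

    int main(void)
    {
        printf("%d\n", range_ok(8, 16, 64));           /* 1: fits */
        printf("%d\n", range_ok(8, 0xffffffffu, 64));  /* 0: a 32-bit sum would
                                                          wrap to 7 and pass */
        return 0;
    }

Without the widening, a crafted create context with a huge length would pass the bounds check and let the parser walk past the end of the request buffer.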
- */ if (*off > 4096) { - ksmbd_debug(SMB, "offset %d too large, data area ignored\n", - *off); - *len = 0; - *off = 0; - } else if (*off < 0) { - ksmbd_debug(SMB, - "negative offset %d to data invalid ignore data area\n", - *off); - *off = 0; - *len = 0; - } else if (*len < 0) { - ksmbd_debug(SMB, - "negative data length %d invalid, data area ignored\n", - *len); - *len = 0; - } else if (*len > 128 * 1024) { - ksmbd_debug(SMB, "data area larger than 128K: %d\n", *len); - *len = 0; + ksmbd_debug(SMB, "offset %d too large\n", *off); + ret = -EINVAL; + } else if ((u64)*off + *len > MAX_STREAM_PROT_LEN) { + ksmbd_debug(SMB, "Request is larger than maximum stream protocol length(%u): %llu\n", + MAX_STREAM_PROT_LEN, (u64)*off + *len); + ret = -EINVAL; } - /* return pointer to beginning of data area, ie offset from SMB start */ - if ((*off != 0) && (*len != 0)) - return (char *)hdr + *off; - else - return NULL; + return ret; } /* * Calculate the size of the SMB message based on the fixed header * portion, the number of word parameters and the data portion of the message. */ -static unsigned int smb2_calc_size(void *buf) +static int smb2_calc_size(void *buf, unsigned int *len) { struct smb2_pdu *pdu = (struct smb2_pdu *)buf; struct smb2_hdr *hdr = &pdu->hdr; - int offset; /* the offset from the beginning of SMB to data area */ - int data_length; /* the length of the variable length data area */ + unsigned int offset; /* the offset from the beginning of SMB to data area */ + unsigned int data_length; /* the length of the variable length data area */ + int ret; + /* Structure Size has already been checked to make sure it is 64 */ - int len = le16_to_cpu(hdr->StructureSize); + *len = le16_to_cpu(hdr->StructureSize); /* * StructureSize2, ie length of fixed parameter area has already * been checked to make sure it is the correct length. */ - len += le16_to_cpu(pdu->StructureSize2); + *len += le16_to_cpu(pdu->StructureSize2); + /* + * StructureSize2 of smb2_lock pdu is set to 48, indicating + * the size of smb2 lock request with single smb2_lock_element + * regardless of number of locks. Subtract single + * smb2_lock_element for correct buffer size check. + */ + if (hdr->Command == SMB2_LOCK) + *len -= sizeof(struct smb2_lock_element); if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false) goto calc_size_exit; - smb2_get_data_area_len(&offset, &data_length, hdr); - ksmbd_debug(SMB, "SMB2 data length %d offset %d\n", data_length, + ret = smb2_get_data_area_len(&offset, &data_length, hdr); + if (ret) + return ret; + ksmbd_debug(SMB, "SMB2 data length %u offset %u\n", data_length, offset); if (data_length > 0) { @@ -237,16 +231,19 @@ static unsigned int smb2_calc_size(void *buf) * for some commands, typically those with odd StructureSize, * so we must add one to the calculation. 
*/ - if (offset + 1 < len) + if (offset + 1 < *len) { ksmbd_debug(SMB, - "data area offset %d overlaps SMB2 header %d\n", - offset + 1, len); - else - len = offset + data_length; + "data area offset %d overlaps SMB2 header %u\n", + offset + 1, *len); + return -EINVAL; + } + + *len = offset + data_length; } + calc_size_exit: - ksmbd_debug(SMB, "SMB2 len %d\n", len); - return len; + ksmbd_debug(SMB, "SMB2 len %u\n", *len); + return 0; } static inline int smb2_query_info_req_len(struct smb2_query_info_req *h) @@ -391,9 +388,11 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) return 1; } - clc_len = smb2_calc_size(hdr); + if (smb2_calc_size(hdr, &clc_len)) + return 1; + if (len != clc_len) { - /* server can return one byte more due to implied bcc[0] */ + /* client can return one byte more due to implied bcc[0] */ if (clc_len == len + 1) return 0; @@ -418,9 +417,6 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) return 0; } - if (command == SMB2_LOCK_HE && len == 88) - return 0; - ksmbd_debug(SMB, "cli req too short, len %d not %d. cmd:%d mid:%llu\n", len, clc_len, command, diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index 197473871a..b06456eb58 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -187,11 +187,6 @@ static struct smb_version_cmds smb2_0_server_cmds[NUMBER_OF_SMB2_COMMANDS] = { [SMB2_CHANGE_NOTIFY_HE] = { .proc = smb2_notify}, }; -int init_smb2_0_server(struct ksmbd_conn *conn) -{ - return -EOPNOTSUPP; -} - /** * init_smb2_1_server() - initialize a smb server connection with smb2.1 * command dispatcher diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 761e12171d..005aa93a49 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -236,9 +236,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) if (conn->need_neg == false) return -EINVAL; - if (!(conn->dialect >= SMB20_PROT_ID && - conn->dialect <= SMB311_PROT_ID)) - return -EINVAL; rsp_hdr = work->response_buf; @@ -459,13 +456,22 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work) bool is_chained_smb2_message(struct ksmbd_work *work) { struct smb2_hdr *hdr = work->request_buf; - unsigned int len; + unsigned int len, next_cmd; if (hdr->ProtocolId != SMB2_PROTO_NUMBER) return false; hdr = ksmbd_req_buf_next(work); - if (le32_to_cpu(hdr->NextCommand) > 0) { + next_cmd = le32_to_cpu(hdr->NextCommand); + if (next_cmd > 0) { + if ((u64)work->next_smb2_rcv_hdr_off + next_cmd + + __SMB2_HEADER_STRUCTURE_SIZE > + get_rfc1002_len(work->request_buf)) { + pr_err("next command(%u) offset exceeds smb msg size\n", + next_cmd); + return false; + } + ksmbd_debug(SMB, "got SMB2 chained command\n"); init_chained_smb2_rsp(work); return true; @@ -1058,6 +1064,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) struct smb2_negotiate_req *req = work->request_buf; struct smb2_negotiate_rsp *rsp = work->response_buf; int rc = 0; + unsigned int smb2_buf_len, smb2_neg_size; __le32 status; ksmbd_debug(SMB, "Received negotiate request\n"); @@ -1075,6 +1082,44 @@ int smb2_handle_negotiate(struct ksmbd_work *work) goto err_out; } + smb2_buf_len = get_rfc1002_len(work->request_buf); + smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects) - 4; + if (smb2_neg_size > smb2_buf_len) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + rc = -EINVAL; + goto err_out; + } + + if (conn->dialect == SMB311_PROT_ID) { + unsigned int nego_ctxt_off = le32_to_cpu(req->NegotiateContextOffset); + + if (smb2_buf_len < nego_ctxt_off) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + rc = -EINVAL; + goto err_out; + } + + 
if (smb2_neg_size > nego_ctxt_off) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + rc = -EINVAL; + goto err_out; + } + + if (smb2_neg_size + le16_to_cpu(req->DialectCount) * sizeof(__le16) > + nego_ctxt_off) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + rc = -EINVAL; + goto err_out; + } + } else { + if (smb2_neg_size + le16_to_cpu(req->DialectCount) * sizeof(__le16) > + smb2_buf_len) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + rc = -EINVAL; + goto err_out; + } + } + conn->cli_cap = le32_to_cpu(req->Capabilities); switch (conn->dialect) { case SMB311_PROT_ID: @@ -1118,13 +1163,6 @@ int smb2_handle_negotiate(struct ksmbd_work *work) case SMB21_PROT_ID: init_smb2_1_server(conn); break; - case SMB20_PROT_ID: - rc = init_smb2_0_server(conn); - if (rc) { - rsp->hdr.Status = STATUS_NOT_SUPPORTED; - goto err_out; - } - break; case SMB2X_PROT_ID: case BAD_PROT_ID: default: @@ -1143,11 +1181,9 @@ int smb2_handle_negotiate(struct ksmbd_work *work) rsp->MaxReadSize = cpu_to_le32(conn->vals->max_read_size); rsp->MaxWriteSize = cpu_to_le32(conn->vals->max_write_size); - if (conn->dialect > SMB20_PROT_ID) { - memcpy(conn->ClientGUID, req->ClientGUID, - SMB2_CLIENT_GUID_SIZE); - conn->cli_sec_mode = le16_to_cpu(req->SecurityMode); - } + memcpy(conn->ClientGUID, req->ClientGUID, + SMB2_CLIENT_GUID_SIZE); + conn->cli_sec_mode = le16_to_cpu(req->SecurityMode); rsp->StructureSize = cpu_to_le16(65); rsp->DialectRevision = cpu_to_le16(conn->dialect); @@ -1489,11 +1525,9 @@ static int ntlm_authenticate(struct ksmbd_work *work) } } - if (conn->dialect > SMB20_PROT_ID) { - if (!ksmbd_conn_lookup_dialect(conn)) { - pr_err("fail to verify the dialect\n"); - return -ENOENT; - } + if (!ksmbd_conn_lookup_dialect(conn)) { + pr_err("fail to verify the dialect\n"); + return -ENOENT; } return 0; } @@ -1575,11 +1609,9 @@ static int krb5_authenticate(struct ksmbd_work *work) } } - if (conn->dialect > SMB20_PROT_ID) { - if (!ksmbd_conn_lookup_dialect(conn)) { - pr_err("fail to verify the dialect\n"); - return -ENOENT; - } + if (!ksmbd_conn_lookup_dialect(conn)) { + pr_err("fail to verify the dialect\n"); + return -ENOENT; } return 0; } @@ -2093,16 +2125,22 @@ static noinline int create_smb2_pipe(struct ksmbd_work *work) * smb2_set_ea() - handler for setting extended attributes using set * info command * @eabuf: set info command buffer + * @buf_len: set info command buffer length * @path: dentry path for get ea * * Return: 0 on success, otherwise error */ -static int smb2_set_ea(struct smb2_ea_info *eabuf, struct path *path) +static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, + struct path *path) { struct user_namespace *user_ns = mnt_user_ns(path->mnt); char *attr_name = NULL, *value; int rc = 0; - int next = 0; + unsigned int next = 0; + + if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength + + le16_to_cpu(eabuf->EaValueLength)) + return -EINVAL; attr_name = kmalloc(XATTR_NAME_MAX + 1, GFP_KERNEL); if (!attr_name) @@ -2167,7 +2205,13 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, struct path *path) next: next = le32_to_cpu(eabuf->NextEntryOffset); + if (next == 0 || buf_len < next) + break; + buf_len -= next; eabuf = (struct smb2_ea_info *)((char *)eabuf + next); + if (next < (u32)eabuf->EaNameLength + le16_to_cpu(eabuf->EaValueLength)) + break; + } while (next != 0); kfree(attr_name); @@ -2367,6 +2411,10 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work, ksmbd_debug(SMB, "Set ACLs using SMB2_CREATE_SD_BUFFER context\n"); sd_buf = (struct create_sd_buf_req 
*)context; + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_sd_buf_req)) + return -EINVAL; return set_info_sec(work->conn, work->tcon, path, &sd_buf->ntsd, le32_to_cpu(sd_buf->ccontext.DataLength), true); } @@ -2561,6 +2609,12 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } else if (context) { ea_buf = (struct create_ea_buf_req *)context; + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_ea_buf_req)) { + rc = -EINVAL; + goto err_out1; + } if (req->CreateOptions & FILE_NO_EA_KNOWLEDGE_LE) { rsp->hdr.Status = STATUS_ACCESS_DENIED; rc = -EACCES; @@ -2599,6 +2653,12 @@ int smb2_open(struct ksmbd_work *work) } else if (context) { struct create_posix *posix = (struct create_posix *)context; + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_posix)) { + rc = -EINVAL; + goto err_out1; + } ksmbd_debug(SMB, "get posix context\n"); posix_mode = le32_to_cpu(posix->Mode); @@ -2748,7 +2808,15 @@ int smb2_open(struct ksmbd_work *work) created = true; user_ns = mnt_user_ns(path.mnt); if (ea_buf) { - rc = smb2_set_ea(&ea_buf->ea, &path); + if (le32_to_cpu(ea_buf->ccontext.DataLength) < + sizeof(struct smb2_ea_info)) { + rc = -EINVAL; + goto err_out; + } + + rc = smb2_set_ea(&ea_buf->ea, + le32_to_cpu(ea_buf->ccontext.DataLength), + &path); if (rc == -EOPNOTSUPP) rc = 0; else if (rc) @@ -2981,9 +3049,16 @@ int smb2_open(struct ksmbd_work *work) rc = PTR_ERR(az_req); goto err_out; } else if (az_req) { - loff_t alloc_size = le64_to_cpu(az_req->AllocationSize); + loff_t alloc_size; int err; + if (le16_to_cpu(az_req->ccontext.DataOffset) + + le32_to_cpu(az_req->ccontext.DataLength) < + sizeof(struct create_alloc_size_req)) { + rc = -EINVAL; + goto err_out; + } + alloc_size = le64_to_cpu(az_req->AllocationSize); ksmbd_debug(SMB, "request smb2 create allocate size : %llu\n", alloc_size); @@ -4152,7 +4227,7 @@ static void get_file_access_info(struct smb2_query_info_rsp *rsp, static int get_file_basic_info(struct smb2_query_info_rsp *rsp, struct ksmbd_file *fp, void *rsp_org) { - struct smb2_file_all_info *basic_info; + struct smb2_file_basic_info *basic_info; struct kstat stat; u64 time; @@ -4162,7 +4237,7 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, return -EACCES; } - basic_info = (struct smb2_file_all_info *)rsp->Buffer; + basic_info = (struct smb2_file_basic_info *)rsp->Buffer; generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), &stat); basic_info->CreationTime = cpu_to_le64(fp->create_time); @@ -4175,9 +4250,8 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, basic_info->Attributes = fp->f_ci->m_fattr; basic_info->Pad1 = 0; rsp->OutputBufferLength = - cpu_to_le32(offsetof(struct smb2_file_all_info, AllocationSize)); - inc_rfc1001_len(rsp_org, offsetof(struct smb2_file_all_info, - AllocationSize)); + cpu_to_le32(sizeof(struct smb2_file_basic_info)); + inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_basic_info)); return 0; } @@ -5333,7 +5407,7 @@ static int smb2_rename(struct ksmbd_work *work, static int smb2_create_link(struct ksmbd_work *work, struct ksmbd_share_config *share, struct smb2_file_link_info *file_info, - struct file *filp, + unsigned int buf_len, struct file *filp, struct nls_table *local_nls) { char *link_name = NULL, *target_name = NULL, *pathname = NULL; @@ -5341,6 +5415,10 @@ static int smb2_create_link(struct ksmbd_work *work, bool file_present = true; int rc; + if (buf_len < 
(u64)sizeof(struct smb2_file_link_info) + + le32_to_cpu(file_info->FileNameLength)) + return -EINVAL; + ksmbd_debug(SMB, "setting FILE_LINK_INFORMATION\n"); pathname = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathname) @@ -5400,12 +5478,11 @@ static int smb2_create_link(struct ksmbd_work *work, return rc; } -static int set_file_basic_info(struct ksmbd_file *fp, char *buf, +static int set_file_basic_info(struct ksmbd_file *fp, + struct smb2_file_basic_info *file_info, struct ksmbd_share_config *share) { - struct smb2_file_all_info *file_info; struct iattr attrs; - struct timespec64 ctime; struct file *filp; struct inode *inode; struct user_namespace *user_ns; @@ -5414,7 +5491,6 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf, if (!(fp->daccess & FILE_WRITE_ATTRIBUTES_LE)) return -EACCES; - file_info = (struct smb2_file_all_info *)buf; attrs.ia_valid = 0; filp = fp->filp; inode = file_inode(filp); @@ -5428,13 +5504,11 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf, attrs.ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); } - if (file_info->ChangeTime) { + attrs.ia_valid |= ATTR_CTIME; + if (file_info->ChangeTime) attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); - ctime = attrs.ia_ctime; - attrs.ia_valid |= ATTR_CTIME; - } else { - ctime = inode->i_ctime; - } + else + attrs.ia_ctime = inode->i_ctime; if (file_info->LastWriteTime) { attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime); @@ -5480,18 +5554,17 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf, return -EACCES; inode_lock(inode); + inode->i_ctime = attrs.ia_ctime; + attrs.ia_valid &= ~ATTR_CTIME; rc = notify_change(user_ns, dentry, &attrs, NULL); - if (!rc) { - inode->i_ctime = ctime; - mark_inode_dirty(inode); - } inode_unlock(inode); } return rc; } static int set_file_allocation_info(struct ksmbd_work *work, - struct ksmbd_file *fp, char *buf) + struct ksmbd_file *fp, + struct smb2_file_alloc_info *file_alloc_info) { /* * TODO : It's working fine only when store dos attributes @@ -5499,7 +5572,6 @@ static int set_file_allocation_info(struct ksmbd_work *work, * properly with any smb.conf option */ - struct smb2_file_alloc_info *file_alloc_info; loff_t alloc_blks; struct inode *inode; int rc; @@ -5507,7 +5579,6 @@ static int set_file_allocation_info(struct ksmbd_work *work, if (!(fp->daccess & FILE_WRITE_DATA_LE)) return -EACCES; - file_alloc_info = (struct smb2_file_alloc_info *)buf; alloc_blks = (le64_to_cpu(file_alloc_info->AllocationSize) + 511) >> 9; inode = file_inode(fp->filp); @@ -5543,9 +5614,8 @@ static int set_file_allocation_info(struct ksmbd_work *work, } static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, - char *buf) + struct smb2_file_eof_info *file_eof_info) { - struct smb2_file_eof_info *file_eof_info; loff_t newsize; struct inode *inode; int rc; @@ -5553,7 +5623,6 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, if (!(fp->daccess & FILE_WRITE_DATA_LE)) return -EACCES; - file_eof_info = (struct smb2_file_eof_info *)buf; newsize = le64_to_cpu(file_eof_info->EndOfFile); inode = file_inode(fp->filp); @@ -5580,7 +5649,8 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, } static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, - char *buf) + struct smb2_file_rename_info *rename_info, + unsigned int buf_len) { struct user_namespace *user_ns; struct ksmbd_file *parent_fp; @@ -5593,6 +5663,10 @@ static int set_rename_info(struct ksmbd_work *work, 
struct ksmbd_file *fp, return -EACCES; } + if (buf_len < (u64)sizeof(struct smb2_file_rename_info) + + le32_to_cpu(rename_info->FileNameLength)) + return -EINVAL; + user_ns = file_mnt_user_ns(fp->filp); if (ksmbd_stream_fd(fp)) goto next; @@ -5615,14 +5689,13 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, } } next: - return smb2_rename(work, fp, user_ns, - (struct smb2_file_rename_info *)buf, + return smb2_rename(work, fp, user_ns, rename_info, work->sess->conn->local_nls); } -static int set_file_disposition_info(struct ksmbd_file *fp, char *buf) +static int set_file_disposition_info(struct ksmbd_file *fp, + struct smb2_file_disposition_info *file_info) { - struct smb2_file_disposition_info *file_info; struct inode *inode; if (!(fp->daccess & FILE_DELETE_LE)) { @@ -5631,7 +5704,6 @@ static int set_file_disposition_info(struct ksmbd_file *fp, char *buf) } inode = file_inode(fp->filp); - file_info = (struct smb2_file_disposition_info *)buf; if (file_info->DeletePending) { if (S_ISDIR(inode->i_mode) && ksmbd_vfs_empty_dir(fp) == -ENOTEMPTY) @@ -5643,15 +5715,14 @@ static int set_file_disposition_info(struct ksmbd_file *fp, char *buf) return 0; } -static int set_file_position_info(struct ksmbd_file *fp, char *buf) +static int set_file_position_info(struct ksmbd_file *fp, + struct smb2_file_pos_info *file_info) { - struct smb2_file_pos_info *file_info; loff_t current_byte_offset; unsigned long sector_size; struct inode *inode; inode = file_inode(fp->filp); - file_info = (struct smb2_file_pos_info *)buf; current_byte_offset = le64_to_cpu(file_info->CurrentByteOffset); sector_size = inode->i_sb->s_blocksize; @@ -5667,12 +5738,11 @@ static int set_file_position_info(struct ksmbd_file *fp, char *buf) return 0; } -static int set_file_mode_info(struct ksmbd_file *fp, char *buf) +static int set_file_mode_info(struct ksmbd_file *fp, + struct smb2_file_mode_info *file_info) { - struct smb2_file_mode_info *file_info; __le32 mode; - file_info = (struct smb2_file_mode_info *)buf; mode = file_info->Mode; if ((mode & ~FILE_MODE_INFO_MASK) || @@ -5702,40 +5772,74 @@ static int set_file_mode_info(struct ksmbd_file *fp, char *buf) * TODO: need to implement an error handling for STATUS_INFO_LENGTH_MISMATCH */ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, - int info_class, char *buf, + struct smb2_set_info_req *req, struct ksmbd_share_config *share) { - switch (info_class) { + unsigned int buf_len = le32_to_cpu(req->BufferLength); + + switch (req->FileInfoClass) { case FILE_BASIC_INFORMATION: - return set_file_basic_info(fp, buf, share); + { + if (buf_len < sizeof(struct smb2_file_basic_info)) + return -EINVAL; + return set_file_basic_info(fp, (struct smb2_file_basic_info *)req->Buffer, share); + } case FILE_ALLOCATION_INFORMATION: - return set_file_allocation_info(work, fp, buf); + { + if (buf_len < sizeof(struct smb2_file_alloc_info)) + return -EINVAL; + return set_file_allocation_info(work, fp, + (struct smb2_file_alloc_info *)req->Buffer); + } case FILE_END_OF_FILE_INFORMATION: - return set_end_of_file_info(work, fp, buf); + { + if (buf_len < sizeof(struct smb2_file_eof_info)) + return -EINVAL; + return set_end_of_file_info(work, fp, + (struct smb2_file_eof_info *)req->Buffer); + } case FILE_RENAME_INFORMATION: + { if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { ksmbd_debug(SMB, "User does not have write permission\n"); return -EACCES; } - return set_rename_info(work, fp, buf); + if (buf_len < sizeof(struct 
smb2_file_rename_info)) + return -EINVAL; + + return set_rename_info(work, fp, + (struct smb2_file_rename_info *)req->Buffer, + buf_len); + } case FILE_LINK_INFORMATION: - return smb2_create_link(work, work->tcon->share_conf, - (struct smb2_file_link_info *)buf, fp->filp, - work->sess->conn->local_nls); + { + if (buf_len < sizeof(struct smb2_file_link_info)) + return -EINVAL; + return smb2_create_link(work, work->tcon->share_conf, + (struct smb2_file_link_info *)req->Buffer, + buf_len, fp->filp, + work->sess->conn->local_nls); + } case FILE_DISPOSITION_INFORMATION: + { if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { ksmbd_debug(SMB, "User does not have write permission\n"); return -EACCES; } - return set_file_disposition_info(fp, buf); + if (buf_len < sizeof(struct smb2_file_disposition_info)) + return -EINVAL; + + return set_file_disposition_info(fp, + (struct smb2_file_disposition_info *)req->Buffer); + } case FILE_FULL_EA_INFORMATION: { if (!(fp->daccess & FILE_WRITE_EA_LE)) { @@ -5744,18 +5848,29 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, return -EACCES; } - return smb2_set_ea((struct smb2_ea_info *)buf, - &fp->filp->f_path); - } + if (buf_len < sizeof(struct smb2_ea_info)) + return -EINVAL; + return smb2_set_ea((struct smb2_ea_info *)req->Buffer, + buf_len, &fp->filp->f_path); + } case FILE_POSITION_INFORMATION: - return set_file_position_info(fp, buf); + { + if (buf_len < sizeof(struct smb2_file_pos_info)) + return -EINVAL; + return set_file_position_info(fp, (struct smb2_file_pos_info *)req->Buffer); + } case FILE_MODE_INFORMATION: - return set_file_mode_info(fp, buf); + { + if (buf_len < sizeof(struct smb2_file_mode_info)) + return -EINVAL; + + return set_file_mode_info(fp, (struct smb2_file_mode_info *)req->Buffer); + } } - pr_err("Unimplemented Fileinfoclass :%d\n", info_class); + pr_err("Unimplemented Fileinfoclass :%d\n", req->FileInfoClass); return -EOPNOTSUPP; } @@ -5816,8 +5931,7 @@ int smb2_set_info(struct ksmbd_work *work) switch (req->InfoType) { case SMB2_O_INFO_FILE: ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n"); - rc = smb2_set_info_file(work, fp, req->FileInfoClass, - req->Buffer, work->tcon->share_conf); + rc = smb2_set_info_file(work, fp, req, work->tcon->share_conf); break; case SMB2_O_INFO_SECURITY: ksmbd_debug(SMB, "GOT SMB2_O_INFO_SECURITY\n"); @@ -8171,7 +8285,8 @@ void smb3_preauth_hash_rsp(struct ksmbd_work *work) WORK_BUFFERS(work, req, rsp); - if (le16_to_cpu(req->Command) == SMB2_NEGOTIATE_HE) + if (le16_to_cpu(req->Command) == SMB2_NEGOTIATE_HE && + conn->preauth_info) ksmbd_gen_preauth_integrity_hash(conn, (char *)rsp, conn->preauth_info->Preauth_HashValue); @@ -8275,31 +8390,29 @@ int smb3_decrypt_req(struct ksmbd_work *work) struct smb2_hdr *hdr; unsigned int pdu_length = get_rfc1002_len(buf); struct kvec iov[2]; - unsigned int buf_data_size = pdu_length + 4 - + int buf_data_size = pdu_length + 4 - sizeof(struct smb2_transform_hdr); struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; - unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); int rc = 0; - sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId)); - if (!sess) { - pr_err("invalid session id(%llx) in transform header\n", - le64_to_cpu(tr_hdr->SessionId)); - return -ECONNABORTED; - } - - if (pdu_length + 4 < - sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_hdr)) { + if (buf_data_size < sizeof(struct smb2_hdr)) { pr_err("Transform message is too small (%u)\n", pdu_length); return 
-ECONNABORTED; } - if (pdu_length + 4 < orig_len + sizeof(struct smb2_transform_hdr)) { + if (buf_data_size < le32_to_cpu(tr_hdr->OriginalMessageSize)) { pr_err("Transform message is broken\n"); return -ECONNABORTED; } + sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId)); + if (!sess) { + pr_err("invalid session id(%llx) in transform header\n", + le64_to_cpu(tr_hdr->SessionId)); + return -ECONNABORTED; + } + iov[0].iov_base = buf; iov[0].iov_len = sizeof(struct smb2_transform_hdr); iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr); diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index bcec845b03..a6dec5ec6a 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -1464,6 +1464,15 @@ struct smb2_file_all_info { /* data block encoding of response to level 18 */ char FileName[1]; } __packed; /* level 18 Query */ +struct smb2_file_basic_info { /* data block encoding of response to level 18 */ + __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */ +} __packed; + struct smb2_file_alt_name_info { __le32 FileNameLength; char FileName[0]; @@ -1628,7 +1637,6 @@ struct smb2_posix_info { } __packed; /* functions */ -int init_smb2_0_server(struct ksmbd_conn *conn); void init_smb2_1_server(struct ksmbd_conn *conn); void init_smb3_0_server(struct ksmbd_conn *conn); void init_smb3_02_server(struct ksmbd_conn *conn); diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 40f4fafa2e..707490ab1f 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -21,7 +21,6 @@ static const char basechars[43] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_-!@#$%"; #define MAGIC_CHAR '~' #define PERIOD '.' 
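The next_dialect() hardening below swaps strlen() for strnlen() and rejects a dialect list whose last entry is not NUL-terminated inside ByteCount, so a malformed negotiate request can no longer walk the scan past the end of the buffer. A self-contained sketch of that bounded walk; the buffer contents are made up for the demo:

    #include <stdio.h>
    #include <string.h>

    /* Walk a packed list of NUL-terminated names inside a bounded buffer,
     * never scanning past 'remaining' bytes. */
    static const char *next_name(const char *p, size_t *remaining)
    {
        size_t n = strnlen(p, *remaining);

        if (n == *remaining)       /* no terminator inside the buffer: reject */
            return NULL;
        *remaining -= n + 1;
        return p + n + 1;
    }

    int main(void)
    {
        /* "SMB 2.1\0" followed by "SMB 3.0" whose terminator lies outside
         * the advertised length, simulating a truncated negotiate packet. */
        const char buf[] = "SMB 2.1\0SMB 3.0";
        size_t remaining = sizeof(buf) - 1;  /* exclude the compiler's final NUL */
        const char *p = buf;

        while (remaining && (p = next_name(p, &remaining)))
            ;
        printf(p ? "clean end\n" : "rejected truncated entry\n");
        return 0;
    }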
#define mangle(V) ((char)(basechars[(V) % MANGLE_BASE])) -#define KSMBD_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb2_hdr)) struct smb_protocol { int index; @@ -89,7 +88,7 @@ unsigned int ksmbd_server_side_copy_max_total_size(void) inline int ksmbd_min_protocol(void) { - return SMB2_PROT; + return SMB21_PROT; } inline int ksmbd_max_protocol(void) @@ -155,20 +154,7 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work) */ bool ksmbd_smb_request(struct ksmbd_conn *conn) { - int type = *(char *)conn->request_buf; - - switch (type) { - case RFC1002_SESSION_MESSAGE: - /* Regular SMB request */ - return true; - case RFC1002_SESSION_KEEP_ALIVE: - ksmbd_debug(SMB, "RFC 1002 session keep alive\n"); - break; - default: - ksmbd_debug(SMB, "RFC 1002 unknown request type 0x%x\n", type); - } - - return false; + return conn->request_buf[0] == 0; } static bool supported_protocol(int idx) @@ -182,10 +168,12 @@ static bool supported_protocol(int idx) idx <= server_conf.max_protocol); } -static char *next_dialect(char *dialect, int *next_off) +static char *next_dialect(char *dialect, int *next_off, int bcount) { dialect = dialect + *next_off; - *next_off = strlen(dialect); + *next_off = strnlen(dialect, bcount); + if (dialect[*next_off] != '\0') + return NULL; return dialect; } @@ -200,7 +188,9 @@ static int ksmbd_lookup_dialect_by_name(char *cli_dialects, __le16 byte_count) dialect = cli_dialects; bcount = le16_to_cpu(byte_count); do { - dialect = next_dialect(dialect, &next); + dialect = next_dialect(dialect, &next, bcount); + if (!dialect) + break; ksmbd_debug(SMB, "client requested dialect %s\n", dialect); if (!strcmp(dialect, smb1_protos[i].name)) { @@ -248,13 +238,22 @@ int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count) static int ksmbd_negotiate_smb_dialect(void *buf) { - __le32 proto; + int smb_buf_length = get_rfc1002_len(buf); + __le32 proto = ((struct smb2_hdr *)buf)->ProtocolId; - proto = ((struct smb2_hdr *)buf)->ProtocolId; if (proto == SMB2_PROTO_NUMBER) { struct smb2_negotiate_req *req; + int smb2_neg_size = + offsetof(struct smb2_negotiate_req, Dialects) - 4; req = (struct smb2_negotiate_req *)buf; + if (smb2_neg_size > smb_buf_length) + goto err_out; + + if (smb2_neg_size + le16_to_cpu(req->DialectCount) * sizeof(__le16) > + smb_buf_length) + goto err_out; + return ksmbd_lookup_dialect_by_id(req->Dialects, req->DialectCount); } @@ -264,10 +263,19 @@ static int ksmbd_negotiate_smb_dialect(void *buf) struct smb_negotiate_req *req; req = (struct smb_negotiate_req *)buf; + if (le16_to_cpu(req->ByteCount) < 2) + goto err_out; + + if (offsetof(struct smb_negotiate_req, DialectsArray) - 4 + + le16_to_cpu(req->ByteCount) > smb_buf_length) { + goto err_out; + } + return ksmbd_lookup_dialect_by_name(req->DialectsArray, req->ByteCount); } +err_out: return BAD_PROT_ID; } @@ -285,11 +293,6 @@ int ksmbd_init_smb_server(struct ksmbd_work *work) return 0; } -bool ksmbd_pdu_size_has_room(unsigned int pdu) -{ - return (pdu >= KSMBD_MIN_SUPPORTED_HEADER_SIZE - 4); -} - int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, struct ksmbd_file *dir, struct ksmbd_dir_info *d_info, @@ -424,7 +427,7 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname, static int __smb2_negotiate(struct ksmbd_conn *conn) { - return (conn->dialect >= SMB20_PROT_ID && + return (conn->dialect >= SMB21_PROT_ID && conn->dialect <= SMB311_PROT_ID); } @@ -454,7 +457,7 @@ int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command) } } - if (command 
== SMB2_NEGOTIATE_HE) { + if (command == SMB2_NEGOTIATE_HE && __smb2_negotiate(conn)) { ret = smb2_handle_negotiate(work); init_smb2_neg_rsp(work); return ret; diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 0a6af447cc..6e79e7577f 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -48,13 +48,7 @@ #define CIFS_DEFAULT_IOSIZE (64 * 1024) #define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */ -/* RFC 1002 session packet types */ -#define RFC1002_SESSION_MESSAGE 0x00 -#define RFC1002_SESSION_REQUEST 0x81 -#define RFC1002_POSITIVE_SESSION_RESPONSE 0x82 -#define RFC1002_NEGATIVE_SESSION_RESPONSE 0x83 -#define RFC1002_RETARGET_SESSION_RESPONSE 0x84 -#define RFC1002_SESSION_KEEP_ALIVE 0x85 +#define MAX_STREAM_PROT_LEN 0x00FFFFFF /* Responses when opening a file. */ #define F_SUPERSEDED 0 @@ -501,8 +495,6 @@ int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count); int ksmbd_init_smb_server(struct ksmbd_work *work); -bool ksmbd_pdu_size_has_room(unsigned int pdu); - struct ksmbd_kstat; int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index 0a95cdec8c..bd792db326 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -380,7 +380,7 @@ static void parse_dacl(struct user_namespace *user_ns, { int i, ret; int num_aces = 0; - int acl_size; + unsigned int acl_size; char *acl_base; struct smb_ace **ppace; struct posix_acl_entry *cf_pace, *cf_pdace; @@ -392,7 +392,7 @@ static void parse_dacl(struct user_namespace *user_ns, return; /* validate that we do not go past end of acl */ - if (end_of_acl <= (char *)pdacl || + if (end_of_acl < (char *)pdacl + sizeof(struct smb_acl) || end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { pr_err("ACL too small to parse DACL\n"); return; @@ -431,8 +431,22 @@ static void parse_dacl(struct user_namespace *user_ns, * user/group/other have no permissions */ for (i = 0; i < num_aces; ++i) { + if (end_of_acl - acl_base < acl_size) + break; + ppace[i] = (struct smb_ace *)(acl_base + acl_size); acl_base = (char *)ppace[i]; + acl_size = offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth); + + if (end_of_acl - acl_base < acl_size || + ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES || + (end_of_acl - acl_base < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) || + (le16_to_cpu(ppace[i]->size) < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth)) + break; + acl_size = le16_to_cpu(ppace[i]->size); ppace[i]->access_req = smb_map_generic_desired_access(ppace[i]->access_req); @@ -807,6 +821,9 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, if (!pntsd) return -EIO; + if (acl_len < sizeof(struct smb_ntsd)) + return -EINVAL; + owner_sid_ptr = (struct smb_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); group_sid_ptr = (struct smb_sid *)((char *)pntsd + diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index dc15a5ecd2..c14320e03b 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -215,7 +215,7 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) * ksmbd_kthread_fn() - listen to new SMB connections and callback server * @p: arguments to forker thread * - * Return: Returns a task_struct or ERR_PTR + * Return: 0 on success, error number otherwise */ static int ksmbd_kthread_fn(void *p) { @@ -387,7 +387,7 @@ static void tcp_destroy_socket(struct socket *ksmbd_socket) /** * create_socket - create socket for ksmbd/0 * - * Return: 
Returns a task_struct or ERR_PTR + * Return: 0 on success, error number otherwise */ static int create_socket(struct interface *iface) { diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 0b6cd3b873..994ec22d40 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -150,7 +150,7 @@ static void netfs_clear_unread(struct netfs_read_subrequest *subreq) { struct iov_iter iter; - iov_iter_xarray(&iter, WRITE, &subreq->rreq->mapping->i_pages, + iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); iov_iter_zero(iov_iter_count(&iter), &iter); diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index edec458315..0a9b72685f 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -42,7 +42,6 @@ EXPORT_SYMBOL_GPL(locks_start_grace); /** * locks_end_grace - * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * Call this function to state that the given lock manager is ready to diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 7629248fdd..be3c1aad50 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -542,7 +542,7 @@ nfsd_file_close_inode_sync(struct inode *inode) } /** - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file * @inode: inode of the file to attempt to remove * * Walk the whole hash bucket, looking for any files that correspond to "inode". diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 7abeccb975..cf030ebe28 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3544,15 +3544,18 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, goto fail; cd->rd_maxcount -= entry_bytes; /* - * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so - * let's always let through the first entry, at least: + * RFC 3530 14.2.24 describes rd_dircount as only a "hint", and + * notes that it could be zero. If it is zero, then the server + * should enforce only the rd_maxcount value. */ - if (!cd->rd_dircount) - goto fail; - name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; - if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) - goto fail; - cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (cd->rd_dircount) { + name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; + if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) + goto fail; + cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (!cd->rd_dircount) + cd->rd_maxcount = 0; + } cd->cookie_offset = cookie_offset; skip_entry: diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c2c3d9077d..070e5dd03e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -793,7 +793,10 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr svc_xprt_put(xprt); } out_err: - nfsd_destroy(net); + if (!list_empty(&nn->nfsd_serv->sv_permsocks)) + nn->nfsd_serv->sv_nrthreads--; + else + nfsd_destroy(net); return err; } @@ -1545,7 +1548,7 @@ static int __init init_nfsd(void) goto out_free_all; return 0; out_free_all: - unregister_pernet_subsys(&nfsd_net_ops); + unregister_filesystem(&nfsd_fs_type); out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 34c4cbf7e2..e8c00dda42 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -6,13 +6,9 @@ * TODO: Merge attr_set_size/attr_data_get_block/attr_allocate_frame? 
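The parse_dacl() hardening earlier in this patch walks a counted list of self-sized records and must distrust both the count and each record's size field. A standalone model of that loop, with illustrative types rather than the on-disk smb_ace layout:

/* Bounded walk over self-sized records. Build: cc -o walk walk.c */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct rec {                       /* hypothetical variable-length record */
	uint8_t size_lo, size_hi;  /* little-endian total size, self-described */
};

static void walk(const uint8_t *base, size_t total, int num_recs)
{
	const uint8_t *end = base + total;
	const uint8_t *p = base;
	int i;

	for (i = 0; i < num_recs; i++) {
		const struct rec *r;
		unsigned size;

		/* The fixed header must fit before we may read the size... */
		if ((size_t)(end - p) < sizeof(*r))
			break;
		r = (const struct rec *)p;
		size = r->size_lo | (r->size_hi << 8);

		/* ...and the self-described size must be sane and fit too,
		 * or a crafted record could walk the cursor out of bounds. */
		if (size < sizeof(*r) || (size_t)(end - p) < size)
			break;

		printf("record %d: %u bytes\n", i, size);
		p += size;
	}
}

int main(void)
{
	/* two honest records, then one claiming 200 bytes it doesn't have */
	const uint8_t buf[] = { 4, 0, 0xaa, 0xbb, 4, 0, 0xcc, 0xdd, 200, 0 };

	walk(buf, sizeof(buf), 5);      /* prints two lines, then stops */
	return 0;
}

The remaining space is re-checked twice per record, once for the fixed header and once for the self-described total, so neither a lying count nor a lying size can move the cursor past the end.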
*/ -#include -#include #include -#include -#include -#include #include +#include #include "debug.h" #include "ntfs.h" @@ -291,7 +287,7 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, if (!rsize) { /* Empty resident -> Non empty nonresident. */ } else if (!is_data) { - err = ntfs_sb_write_run(sbi, run, 0, data, rsize); + err = ntfs_sb_write_run(sbi, run, 0, data, rsize, 0); if (err) goto out2; } else if (!page) { @@ -451,11 +447,8 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, again_1: align = sbi->cluster_size; - if (is_ext) { + if (is_ext) align <<= attr_b->nres.c_unit; - if (is_attr_sparsed(attr_b)) - keep_prealloc = false; - } old_valid = le64_to_cpu(attr_b->nres.valid_size); old_size = le64_to_cpu(attr_b->nres.data_size); @@ -465,9 +458,6 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, new_alloc = (new_size + align - 1) & ~(u64)(align - 1); new_alen = new_alloc >> cluster_bits; - if (keep_prealloc && is_ext) - keep_prealloc = false; - if (keep_prealloc && new_size < old_size) { attr_b->nres.data_size = cpu_to_le64(new_size); mi_b->dirty = true; @@ -529,7 +519,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, } else if (pre_alloc == -1) { pre_alloc = 0; if (type == ATTR_DATA && !name_len && - sbi->options.prealloc) { + sbi->options->prealloc) { CLST new_alen2 = bytes_to_cluster( sbi, get_pre_allocated(new_size)); pre_alloc = new_alen2 - new_alen; @@ -1966,7 +1956,7 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) return 0; from = vbo; - to = (vbo + bytes) < data_size ? (vbo + bytes) : data_size; + to = min_t(u64, vbo + bytes, data_size); memset(Add2Ptr(resident_data(attr_b), from), 0, to - from); return 0; } diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index fa32399eb5..bad6d8a849 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -5,10 +5,7 @@ * */ -#include -#include #include -#include #include "debug.h" #include "ntfs.h" @@ -336,7 +333,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, if (attr && attr->non_res) { err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, - al->size); + al->size, 0); if (err) return err; al->dirty = false; @@ -423,7 +420,7 @@ bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, return true; } -int al_update(struct ntfs_inode *ni) +int al_update(struct ntfs_inode *ni, int sync) { int err; struct ATTRIB *attr; @@ -445,7 +442,7 @@ int al_update(struct ntfs_inode *ni) memcpy(resident_data(attr), al->le, al->size); } else { err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, - al->size); + al->size, sync); if (err) goto out; diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c index ce304d40b5..50d8380937 100644 --- a/fs/ntfs3/bitfunc.c +++ b/fs/ntfs3/bitfunc.c @@ -5,13 +5,8 @@ * */ -#include -#include -#include -#include +#include -#include "debug.h" -#include "ntfs.h" #include "ntfs_fs.h" #define BITS_IN_SIZE_T (sizeof(size_t) * 8) @@ -124,8 +119,7 @@ bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits) pos = nbits & 7; if (pos) { - u8 mask = fill_mask[pos]; - + mask = fill_mask[pos]; if ((*map & mask) != mask) return false; } diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 8315015550..aa18440752 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -10,12 +10,10 @@ * */ -#include #include #include -#include +#include -#include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" @@ -435,7 +433,7 @@ static void wnd_remove_free_ext(struct wnd_bitmap 
*wnd, size_t bit, size_t len) ; } else { n3 = rb_next(&e->count.node); - max_new_len = len > new_len ? len : new_len; + max_new_len = max(len, new_len); if (!n3) { wnd->extent_max = max_new_len; } else { @@ -731,7 +729,7 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { @@ -784,7 +782,7 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { @@ -834,7 +832,7 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); if (wbits != wnd->free_bits[iw]) { bool ret; @@ -926,7 +924,7 @@ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); if (wnd->free_bits[iw]) { bool ret; diff --git a/fs/ntfs3/debug.h b/fs/ntfs3/debug.h index 31120569a8..53ef7489c7 100644 --- a/fs/ntfs3/debug.h +++ b/fs/ntfs3/debug.h @@ -11,6 +11,9 @@ #ifndef _LINUX_NTFS3_DEBUG_H #define _LINUX_NTFS3_DEBUG_H +struct super_block; +struct inode; + #ifndef Add2Ptr #define Add2Ptr(P, I) ((void *)((u8 *)(P) + (I))) #define PtrOffset(B, O) ((size_t)((size_t)(O) - (size_t)(B))) diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index 93f6d48556..fb438d6040 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -7,10 +7,7 @@ * */ -#include -#include #include -#include #include #include "debug.h" @@ -18,30 +15,27 @@ #include "ntfs_fs.h" /* Convert little endian UTF-16 to NLS string. */ -int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, u8 *buf, int buf_len) { - int ret, uni_len, warn; - const __le16 *ip; + int ret, warn; u8 *op; - struct nls_table *nls = sbi->options.nls; + struct nls_table *nls = sbi->options->nls; static_assert(sizeof(wchar_t) == sizeof(__le16)); if (!nls) { /* UTF-16 -> UTF-8 */ - ret = utf16s_to_utf8s((wchar_t *)uni->name, uni->len, - UTF16_LITTLE_ENDIAN, buf, buf_len); + ret = utf16s_to_utf8s(name, len, UTF16_LITTLE_ENDIAN, buf, + buf_len); buf[ret] = '\0'; return ret; } - ip = uni->name; op = buf; - uni_len = uni->len; warn = 0; - while (uni_len--) { + while (len--) { u16 ec; int charlen; char dump[5]; @@ -52,7 +46,7 @@ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, break; } - ec = le16_to_cpu(*ip++); + ec = le16_to_cpu(*name++); charlen = nls->uni2char(ec, op, buf_len); if (charlen > 0) { @@ -186,7 +180,7 @@ int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, { int ret, slen; const u8 *end; - struct nls_table *nls = sbi->options.nls; + struct nls_table *nls = sbi->options->nls; u16 *uname = uni->name; static_assert(sizeof(wchar_t) == sizeof(u16)); @@ -301,14 +295,14 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; /* Skip meta files. Unless option to show metafiles is set. 
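ntfs_utf16_to_nls() above now takes a bare (name, length) pair instead of a struct le_str, and in the UTF-8 case the kernel delegates to utf16s_to_utf8s(). A simplified userspace version of such a conversion loop (BMP only; surrogate pairs rejected for brevity, which the kernel helper does handle):

/* UTF-16LE to UTF-8, simplified. Build: cc -o conv conv.c */
#include <stdint.h>
#include <stdio.h>

static int utf16le_to_utf8(const uint8_t *name, unsigned len,
			   char *out, int out_len)
{
	int n = 0;

	if (out_len <= 0)
		return -1;

	while (len--) {
		unsigned c = name[0] | (name[1] << 8);  /* le16_to_cpu */

		name += 2;
		if (c >= 0xd800 && c <= 0xdfff)
			return -1;      /* surrogate pair: not handled here */
		if (c < 0x80) {
			if (n + 1 >= out_len) return -1;
			out[n++] = c;
		} else if (c < 0x800) {
			if (n + 2 >= out_len) return -1;
			out[n++] = 0xc0 | (c >> 6);
			out[n++] = 0x80 | (c & 0x3f);
		} else {
			if (n + 3 >= out_len) return -1;
			out[n++] = 0xe0 | (c >> 12);
			out[n++] = 0x80 | ((c >> 6) & 0x3f);
			out[n++] = 0x80 | (c & 0x3f);
		}
	}
	out[n] = '\0';
	return n;
}

int main(void)
{
	const uint8_t name[] = { 'a', 0, 0xe9, 0 };     /* "aé" in UTF-16LE */
	char buf[16];

	if (utf16le_to_utf8(name, 2, buf, sizeof(buf)) > 0)
		printf("%s\n", buf);
	return 0;
}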
*/ - if (!sbi->options.showmeta && ntfs_is_meta_file(sbi, ino)) + if (!sbi->options->showmeta && ntfs_is_meta_file(sbi, ino)) return 0; - if (sbi->options.nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) + if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) return 0; - name_len = ntfs_utf16_to_nls(sbi, (struct le_str *)&fname->name_len, - name, PATH_MAX); + name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name, + PATH_MAX); if (name_len <= 0) { ntfs_warn(sbi->sb, "failed to convert name for inode %lx.", ino); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 424450e77a..43b1451bff 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -12,7 +12,6 @@ #include #include #include -#include #include "debug.h" #include "ntfs.h" @@ -588,8 +587,11 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) truncate_pagecache(inode, vbo_down); if (!is_sparsed(ni) && !is_compressed(ni)) { - /* Normal file. */ - err = ntfs_zero_range(inode, vbo, end); + /* + * Normal file, can't make hole. + * TODO: Try to find a way to save info about the hole. + */ + err = -EOPNOTSUPP; goto out; } @@ -737,7 +739,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode = inode->i_mode; int err; - if (sbi->options.no_acs_rules) { + if (sbi->options->noacsrules) { /* "No access rules" - Force any changes of time etc. */ attr->ia_valid |= ATTR_FORCE; /* and disable for editing some attributes. */ @@ -1185,7 +1187,7 @@ static int ntfs_file_release(struct inode *inode, struct file *file) int err = 0; /* If we are last writer on the inode, drop the block reservation. */ - if (sbi->options.prealloc && ((file->f_mode & FMODE_WRITE) && + if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) && atomic_read(&inode->i_writecount) == 1)) { ni_lock(ni); down_write(&ni->file.run_lock); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 938b12d56c..6f47a9c17f 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -5,11 +5,8 @@ * */ -#include -#include #include #include -#include #include #include "debug.h" @@ -708,18 +705,35 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) continue; mi = ni_find_mi(ni, ino_get(&le->ref)); + if (!mi) { + /* Should never happen, because it was already checked. */ + goto bad; + } attr = mi_find_attr(mi, NULL, le->type, le_name(le), le->name_len, &le->id); + if (!attr) { + /* Should never happen, because it was already checked. */ + goto bad; + } asize = le32_to_cpu(attr->size); /* Insert into primary record. */ attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le), le->name_len, asize, le16_to_cpu(attr->name_off)); - id = attr_ins->id; + if (!attr_ins) { + /* + * Internal error. + * Either there is no space in the primary record (already checked), + * or we tried to insert another + * non-indexed attribute (logic error). + */ + goto bad; + } /* Copy all except id. */ + id = attr_ins->id; memcpy(attr_ins, attr, asize); attr_ins->id = id; @@ -735,6 +749,10 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) ni->attr_list.dirty = false; return 0; +bad: + ntfs_inode_err(&ni->vfs_inode, "Internal error"); + make_bad_inode(&ni->vfs_inode); + return -EINVAL; } /* @@ -956,6 +974,13 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, continue; } + /* + * Do not try to insert this attribute + * if there is no room in the record. + */ + if (le32_to_cpu(mi->mrec->used) + asize > sbi->record_size) + continue; + /* Try to insert attribute into this subrecord.
*/ attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize, name_off, svcn, ins_le); @@ -1451,7 +1476,7 @@ int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, attr->res.flags = RESIDENT_FLAG_INDEXED; /* is_attr_indexed(attr)) == true */ - le16_add_cpu(&ni->mi.mrec->hard_links, +1); + le16_add_cpu(&ni->mi.mrec->hard_links, 1); ni->mi.dirty = true; } attr->res.res = 0; @@ -1606,7 +1631,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, *le = NULL; - if (FILE_NAME_POSIX == name_type) + if (name_type == FILE_NAME_POSIX) return NULL; /* Enumerate all names. */ @@ -1706,18 +1731,16 @@ int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa) /* * ni_parse_reparse * - * Buffer is at least 24 bytes. + * buffer - memory for reparse buffer header */ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, - void *buffer) + struct REPARSE_DATA_BUFFER *buffer) { const struct REPARSE_DATA_BUFFER *rp = NULL; u8 bits; u16 len; typeof(rp->CompressReparseBuffer) *cmpr; - static_assert(sizeof(struct REPARSE_DATA_BUFFER) <= 24); - /* Try to estimate reparse point. */ if (!attr->non_res) { rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); @@ -1803,6 +1826,9 @@ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, return REPARSE_NONE; } + if (buffer != rp) + memcpy(buffer, rp, sizeof(struct REPARSE_DATA_BUFFER)); + /* Looks like normal symlink. */ return REPARSE_LINK; } @@ -2906,9 +2932,8 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size); mi_get_ref(&ni->mi, &de->ref); - if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) { + if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) return false; - } } return true; @@ -3077,7 +3102,9 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, const struct EA_INFO *info; info = resident_data_ex(attr, sizeof(struct EA_INFO)); - dup->ea_size = info->size_pack; + /* If ATTR_EA_INFO exists 'info' can't be NULL. 
*/ + if (info) + dup->ea_size = info->size_pack; } } @@ -3205,7 +3232,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) goto out; } - err = al_update(ni); + err = al_update(ni, sync); if (err) goto out; } diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index b5853aed0e..06492f088d 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -6,12 +6,8 @@ */ #include -#include #include -#include -#include #include -#include #include #include "debug.h" @@ -2219,7 +2215,7 @@ static int last_log_lsn(struct ntfs_log *log) err = ntfs_sb_write_run(log->ni->mi.sbi, &log->ni->file.run, off, page, - log->page_size); + log->page_size, 0); if (err) goto out; @@ -3710,7 +3706,7 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, if (a_dirty) { attr = oa->attr; - err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes); + err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0); if (err) goto out; } @@ -5152,10 +5148,10 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) ntfs_fix_pre_write(&rh->rhdr, log->page_size); - err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size); + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size, 0); if (!err) err = ntfs_sb_write_run(sbi, &log->ni->file.run, log->page_size, - rh, log->page_size); + rh, log->page_size, 0); kfree(rh); if (err) diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 91e3743e14..4de9acb169 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include "debug.h" #include "ntfs.h" @@ -358,7 +358,7 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, enum ALLOCATE_OPT opt) { int err; - CLST alen = 0; + CLST alen; struct super_block *sb = sbi->sb; size_t alcn, zlen, zeroes, zlcn, zlen2, ztrim, new_zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; @@ -370,27 +370,28 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, if (!zlen) { err = ntfs_refresh_zone(sbi); if (err) - goto out; + goto up_write; + zlen = wnd_zone_len(wnd); } if (!zlen) { ntfs_err(sbi->sb, "no free space to extend mft"); - goto out; + err = -ENOSPC; + goto up_write; } lcn = wnd_zone_bit(wnd); - alen = zlen > len ? len : zlen; + alen = min_t(CLST, len, zlen); wnd_zone_set(wnd, lcn + alen, zlen - alen); err = wnd_set_used(wnd, lcn, alen); - if (err) { - up_write(&wnd->rw_lock); - return err; - } + if (err) + goto up_write; + alcn = lcn; - goto out; + goto space_found; } /* * 'Cause cluster 0 is always used this value means that we should use @@ -404,49 +405,45 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, alen = wnd_find(wnd, len, lcn, BITMAP_FIND_MARK_AS_USED, &alcn); if (alen) - goto out; + goto space_found; /* Try to use clusters from MftZone. */ zlen = wnd_zone_len(wnd); zeroes = wnd_zeroes(wnd); /* Check too big request */ - if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) - goto out; + if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) { + err = -ENOSPC; + goto up_write; + } /* How many clusters to cat from zone. */ zlcn = wnd_zone_bit(wnd); zlen2 = zlen >> 1; - ztrim = len > zlen ? zlen : (len > zlen2 ? len : zlen2); - new_zlen = zlen - ztrim; - - if (new_zlen < NTFS_MIN_MFT_ZONE) { - new_zlen = NTFS_MIN_MFT_ZONE; - if (new_zlen > zlen) - new_zlen = zlen; - } + ztrim = clamp_val(len, zlen2, zlen); + new_zlen = max_t(size_t, zlen - ztrim, NTFS_MIN_MFT_ZONE); wnd_zone_set(wnd, zlcn, new_zlen); /* Allocate continues clusters. 
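The ntfs_look_for_free_space() rework above replaces scattered unlock-and-return sites with two labels, space_found and up_write, so the lock is released in exactly one place. The same structure, modeled in userspace with a pthread rwlock and illustrative names:

/* Single-exit locking sketch. Build: cc -pthread -o alloc alloc.c */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static int allocate(unsigned want, unsigned zone_len, unsigned *got)
{
	int err;

	pthread_rwlock_wrlock(&lock);

	if (zone_len == 0) {
		err = -ENOSPC;          /* nothing to hand out */
		goto up_write;
	}
	if (want > zone_len) {
		err = -ENOSPC;          /* request too big */
		goto up_write;
	}

	*got = want;                    /* the "space_found" path */
	err = 0;
up_write:
	pthread_rwlock_unlock(&lock);   /* the only unlock site */
	return err;
}

int main(void)
{
	unsigned got = 0;

	printf("allocate(4, 16) -> %d (got %u)\n", allocate(4, 16, &got), got);
	printf("allocate(32, 16) -> %d\n", allocate(32, 16, &got));
	return 0;
}

With one unlock site, adding a new failure case can no longer leak the lock, which is exactly the class of bug the old early-return version risked.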
*/ alen = wnd_find(wnd, len, 0, BITMAP_FIND_MARK_AS_USED | BITMAP_FIND_FULL, &alcn); - -out: - if (alen) { - err = 0; - *new_len = alen; - *new_lcn = alcn; - - ntfs_unmap_meta(sb, alcn, alen); - - /* Set hint for next requests. */ - if (!(opt & ALLOCATE_MFT)) - sbi->used.next_free_lcn = alcn + alen; - } else { + if (!alen) { err = -ENOSPC; + goto up_write; } +space_found: + err = 0; + *new_len = alen; + *new_lcn = alcn; + + ntfs_unmap_meta(sb, alcn, alen); + + /* Set hint for next requests. */ + if (!(opt & ALLOCATE_MFT)) + sbi->used.next_free_lcn = alcn + alen; +up_write: up_write(&wnd->rw_lock); return err; } @@ -1080,7 +1077,7 @@ int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, } int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, - u64 vbo, const void *buf, size_t bytes) + u64 vbo, const void *buf, size_t bytes, int sync) { struct super_block *sb = sbi->sb; u8 cluster_bits = sbi->cluster_bits; @@ -1099,8 +1096,8 @@ int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, len = ((u64)clen << cluster_bits) - off; for (;;) { - u32 op = len < bytes ? len : bytes; - int err = ntfs_sb_write(sb, lbo, op, buf, 0); + u32 op = min_t(u64, len, bytes); + int err = ntfs_sb_write(sb, lbo, op, buf, sync); if (err) return err; @@ -1300,7 +1297,7 @@ int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, nb->off = off = lbo & (blocksize - 1); for (;;) { - u32 len32 = len < bytes ? len : bytes; + u32 len32 = min_t(u64, len, bytes); sector_t block = lbo >> sb->s_blocksize_bits; do { @@ -2175,7 +2172,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, /* Write main SDS bucket. */ err = ntfs_sb_write_run(sbi, &ni->file.run, sbi->security.next_off, - d_security, aligned_sec_size); + d_security, aligned_sec_size, 0); if (err) goto out; @@ -2193,7 +2190,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, /* Write copy SDS bucket. */ err = ntfs_sb_write_run(sbi, &ni->file.run, mirr_off, d_security, - aligned_sec_size); + aligned_sec_size, 0); if (err) goto out; diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 0daca9adc5..6f81e3a49a 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include "debug.h" #include "ntfs.h" @@ -671,113 +671,17 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, const struct INDEX_HDR *hdr, const void *key, size_t key_len, const void *ctx, int *diff) { - struct NTFS_DE *e; + struct NTFS_DE *e, *found = NULL; NTFS_CMP_FUNC cmp = indx->cmp; + int min_idx = 0, mid_idx, max_idx = 0; + int diff2; + int table_size = 8; u32 e_size, e_key_len; u32 end = le32_to_cpu(hdr->used); u32 off = le32_to_cpu(hdr->de_off); + u16 offs[128]; -#ifdef NTFS3_INDEX_BINARY_SEARCH - int max_idx = 0, fnd, min_idx; - int nslots = 64; - u16 *offs; - - if (end > 0x10000) - goto next; - - offs = kmalloc(sizeof(u16) * nslots, GFP_NOFS); - if (!offs) - goto next; - - /* Use binary search algorithm. */ -next1: - if (off + sizeof(struct NTFS_DE) > end) { - e = NULL; - goto out1; - } - e = Add2Ptr(hdr, off); - e_size = le16_to_cpu(e->size); - - if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) { - e = NULL; - goto out1; - } - - if (max_idx >= nslots) { - u16 *ptr; - int new_slots = ALIGN(2 * nslots, 8); - - ptr = kmalloc(sizeof(u16) * new_slots, GFP_NOFS); - if (ptr) - memcpy(ptr, offs, sizeof(u16) * max_idx); - kfree(offs); - offs = ptr; - nslots = new_slots; - if (!ptr) - goto next; - } - - /* Store entry table. 
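hdr_find_e() searches entries that are sorted but variable-length, so they cannot be indexed directly; both the removed code here and the rewrite below first collect entry offsets into a table and then binary-search that table. A self-contained model over packed, NUL-terminated keys:

/* Offset-table binary search sketch. Build: cc -o find find.c */
#include <stdio.h>
#include <string.h>

/* Packed, sorted keys: variable-length, like NTFS_DE entries. */
static const char blob[] = "alpha\0delta\0kilo\0tango";

int main(void)
{
	unsigned short offs[16];
	int n = 0;

	/* one linear pass builds the offset table */
	for (size_t off = 0; off + 1 < sizeof(blob) && n < 16;
	     off += strlen(blob + off) + 1)
		offs[n++] = (unsigned short)off;

	/* binary search for the last entry <= key */
	const char *key = "echo";
	const char *found = NULL;
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;
		int cmp = strcmp(key, blob + offs[mid]);

		if (cmp == 0) {
			found = blob + offs[mid];       /* exact match */
			break;
		}
		if (cmp > 0)
			lo = mid + 1;   /* key sorts after this entry */
		else
			hi = mid - 1;
	}
	if (!found && hi >= 0)
		found = blob + offs[hi];        /* nearest entry below key */

	printf("%s -> %s\n", key, found ? found : "(none)");
	return 0;
}

The kernel version additionally grows its on-stack table (up to 128 slots) and refills it when the wanted entry lies beyond the filled range; that refinement is omitted here.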
*/ - offs[max_idx] = off; - - if (!de_is_last(e)) { - off += e_size; - max_idx += 1; - goto next1; - } - - /* - * Table of pointers is created. - * Use binary search to find entry that is <= to the search value. - */ - fnd = -1; - min_idx = 0; - - while (min_idx <= max_idx) { - int mid_idx = min_idx + ((max_idx - min_idx) >> 1); - int diff2; - - e = Add2Ptr(hdr, offs[mid_idx]); - - e_key_len = le16_to_cpu(e->key_size); - - diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); - - if (!diff2) { - *diff = 0; - goto out1; - } - - if (diff2 < 0) { - max_idx = mid_idx - 1; - fnd = mid_idx; - if (!fnd) - break; - } else { - min_idx = mid_idx + 1; - } - } - - if (fnd == -1) { - e = NULL; - goto out1; - } - - *diff = -1; - e = Add2Ptr(hdr, offs[fnd]); - -out1: - kfree(offs); - - return e; -#endif - -next: - /* - * Entries index are sorted. - * Enumerate all entries until we find entry - * that is <= to the search value. - */ +fill_table: if (off + sizeof(struct NTFS_DE) > end) return NULL; @@ -787,22 +691,54 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) return NULL; - off += e_size; + if (!de_is_last(e)) { + offs[max_idx] = off; + off += e_size; + max_idx++; + if (max_idx < table_size) + goto fill_table; + + max_idx--; + } + +binary_search: e_key_len = le16_to_cpu(e->key_size); - *diff = (*cmp)(key, key_len, e + 1, e_key_len, ctx); - if (!*diff) - return e; + diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); + if (diff2 > 0) { + if (found) { + min_idx = mid_idx + 1; + } else { + if (de_is_last(e)) + return NULL; - if (*diff <= 0) - return e; + max_idx = 0; + table_size = min(table_size * 2, + (int)ARRAY_SIZE(offs)); + goto fill_table; + } + } else if (diff2 < 0) { + if (found) + max_idx = mid_idx - 1; + else + max_idx--; - if (de_is_last(e)) { - *diff = 1; + found = e; + } else { + *diff = 0; return e; } - goto next; + + if (min_idx > max_idx) { + *diff = -1; + return found; + } + + mid_idx = (min_idx + max_idx) >> 1; + e = Add2Ptr(hdr, offs[mid_idx]); + + goto binary_search; } /* @@ -1136,9 +1072,7 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, if (!e) return -EINVAL; - if (fnd) - fnd->root_de = e; - + fnd->root_de = e; err = 0; for (;;) { @@ -1401,7 +1335,7 @@ int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni, static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, CLST *vbn) { - int err = -ENOMEM; + int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *bitmap; struct ATTRIB *alloc; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index db2a5a4c38..859951d785 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -5,10 +5,8 @@ * */ -#include #include #include -#include #include #include #include @@ -49,8 +47,8 @@ static struct inode *ntfs_read_mft(struct inode *inode, inode->i_op = NULL; /* Setup 'uid' and 'gid' */ - inode->i_uid = sbi->options.fs_uid; - inode->i_gid = sbi->options.fs_gid; + inode->i_uid = sbi->options->fs_uid; + inode->i_gid = sbi->options->fs_gid; err = mi_init(&ni->mi, sbi, ino); if (err) @@ -224,12 +222,9 @@ static struct inode *ntfs_read_mft(struct inode *inode, if (!attr->non_res) { ni->i_valid = inode->i_size = rsize; inode_set_bytes(inode, rsize); - t32 = asize; - } else { - t32 = le16_to_cpu(attr->nres.run_off); } - mode = S_IFREG | (0777 & sbi->options.fs_fmask_inv); + mode = S_IFREG | (0777 & sbi->options->fs_fmask_inv); if (!attr->non_res) { ni->ni_flags |= NI_FLAG_RESIDENT; @@ -272,7 +267,7 @@ static struct inode 
*ntfs_read_mft(struct inode *inode, goto out; mode = sb->s_root - ? (S_IFDIR | (0777 & sbi->options.fs_dmask_inv)) + ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) : (S_IFDIR | 0777); goto next_attr; @@ -315,17 +310,14 @@ static struct inode *ntfs_read_mft(struct inode *inode, rp_fa = ni_parse_reparse(ni, attr, &rp); switch (rp_fa) { case REPARSE_LINK: - if (!attr->non_res) { - inode->i_size = rsize; - inode_set_bytes(inode, rsize); - t32 = asize; - } else { - inode->i_size = - le64_to_cpu(attr->nres.data_size); - t32 = le16_to_cpu(attr->nres.run_off); - } + /* + * Normal symlink. + * Assume one Unicode symbol == one UTF-8 byte. + */ + inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer .PrintNameLength) / + sizeof(u16); - /* Looks like normal symlink. */ ni->i_valid = inode->i_size; /* Clear directory bit. */ @@ -422,7 +414,7 @@ static struct inode *ntfs_read_mft(struct inode *inode, ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_link_inode_operations; inode->i_fop = NULL; - inode_nohighmem(inode); // ?? + inode_nohighmem(inode); } else if (S_ISREG(mode)) { ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_file_inode_operations; @@ -443,7 +435,7 @@ static struct inode *ntfs_read_mft(struct inode *inode, goto out; } - if ((sbi->options.sys_immutable && + if ((sbi->options->sys_immutable && (std5->fa & FILE_ATTRIBUTE_SYSTEM)) && !S_ISFIFO(mode) && !S_ISSOCK(mode) && !S_ISLNK(mode)) { inode->i_flags |= S_IMMUTABLE; @@ -1200,9 +1192,13 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, struct REPARSE_DATA_BUFFER *rp = NULL; bool rp_inserted = false; + ni_lock_dir(dir_ni); + dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL); - if (!dir_root) - return ERR_PTR(-EINVAL); + if (!dir_root) { + err = -EINVAL; + goto out1; + } if (S_ISDIR(mode)) { /* Use parent's directory attributes. */ @@ -1244,7 +1240,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, * } */ } else if (S_ISREG(mode)) { - if (sbi->options.sparse) { + if (sbi->options->sparse) { /* Sparsed regular file, cause option 'sparse'. */ fa = FILE_ATTRIBUTE_SPARSE_FILE | FILE_ATTRIBUTE_ARCHIVE; @@ -1486,7 +1482,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, asize = ALIGN(SIZEOF_RESIDENT + nsize, 8); t16 = PtrOffset(rec, attr); - /* 0x78 - the size of EA + EAINFO to store WSL */ + /* + * The function 'ntfs_save_wsl_perm' below requires 0x78 bytes. + * It is a good idea to keep extended attributes resident. + */ if (asize + t16 + 0x78 + 8 > sbi->record_size) { CLST alen; CLST clst = bytes_to_cluster(sbi, nsize); @@ -1521,14 +1520,14 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, } asize = SIZEOF_NONRESIDENT + ALIGN(err, 8); - inode->i_size = nsize; } else { attr->res.data_off = SIZEOF_RESIDENT_LE; attr->res.data_size = cpu_to_le32(nsize); memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize); - inode->i_size = nsize; nsize = 0; } + /* Size of symlink equals the length of the input string. */ + inode->i_size = size; attr->size = cpu_to_le32(asize); @@ -1551,6 +1550,9 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, if (err) goto out6; + /* Unlock parent directory before ntfs_init_acl.
*/ + ni_unlock(dir_ni); + inode->i_generation = le16_to_cpu(rec->seq); dir->i_mtime = dir->i_ctime = inode->i_atime; @@ -1562,6 +1564,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, inode->i_op = &ntfs_link_inode_operations; inode->i_fop = NULL; inode->i_mapping->a_ops = &ntfs_aops; + inode->i_size = size; + inode_nohighmem(inode); } else if (S_ISREG(mode)) { inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; @@ -1577,7 +1581,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) { err = ntfs_init_acl(mnt_userns, inode, dir); if (err) - goto out6; + goto out7; } else #endif { @@ -1586,7 +1590,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, /* Write non resident data. */ if (nsize) { - err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize); + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0); if (err) goto out7; } @@ -1607,8 +1611,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, out7: /* Undo 'indx_insert_entry'. */ + ni_lock_dir(dir_ni); indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1, le16_to_cpu(new_de->key_size), sbi); + /* ni_unlock(dir_ni); will be called later. */ out6: if (rp_inserted) ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref); @@ -1632,8 +1638,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, kfree(rp); out1: - if (err) + if (err) { + ni_unlock(dir_ni); return ERR_PTR(err); + } unlock_new_inode(inode); @@ -1754,15 +1762,15 @@ void ntfs_evict_inode(struct inode *inode) static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, int buflen) { - int i, err = 0; + int i, err = -EINVAL; struct ntfs_inode *ni = ntfs_i(inode); struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - u64 i_size = inode->i_size; - u16 nlen = 0; + u64 size; + u16 ulen = 0; void *to_free = NULL; struct REPARSE_DATA_BUFFER *rp; - struct le_str *uni; + const __le16 *uname; struct ATTRIB *attr; /* Reparse data present. Try to parse it. */ @@ -1771,68 +1779,64 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, *buffer = 0; - /* Read into temporal buffer. */ - if (i_size > sbi->reparse.max_size || i_size <= sizeof(u32)) { - err = -EINVAL; - goto out; - } - attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL); - if (!attr) { - err = -EINVAL; + if (!attr) goto out; - } if (!attr->non_res) { - rp = resident_data_ex(attr, i_size); - if (!rp) { - err = -EINVAL; + rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); + if (!rp) goto out; - } + size = le32_to_cpu(attr->res.data_size); } else { - rp = kmalloc(i_size, GFP_NOFS); + size = le64_to_cpu(attr->nres.data_size); + rp = NULL; + } + + if (size > sbi->reparse.max_size || size <= sizeof(u32)) + goto out; + + if (!rp) { + rp = kmalloc(size, GFP_NOFS); if (!rp) { err = -ENOMEM; goto out; } to_free = rp; - err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, i_size, NULL); + /* Read into temporal buffer. */ + err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, size, NULL); if (err) goto out; } - err = -EINVAL; - /* Microsoft Tag. */ switch (rp->ReparseTag) { case IO_REPARSE_TAG_MOUNT_POINT: /* Mount points and junctions. */ /* Can we use 'Rp->MountPointReparseBuffer.PrintNameLength'? 
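The ntfs_readlink_hlp() rework above extracts PrintName using an (offset, length) pair taken from disk, which is why every branch re-checks that pair against the actual buffer size. The core check, modeled standalone with an illustrative byte layout (little-endian, not the real REPARSE_DATA_BUFFER):

/* Untrusted (offset, length) extraction sketch. Build: cc -o name name.c */
#include <stdint.h>
#include <stdio.h>

static int get_name(const uint8_t *buf, size_t total,
		    const uint8_t **name, size_t *len)
{
	const size_t hdr = 4;           /* bytes 0-1: offset, 2-3: length */
	size_t avail, off, n;

	if (total < hdr)
		return -1;              /* header itself is short */
	avail = total - hdr;
	off = buf[0] | (buf[1] << 8);   /* untrusted offset */
	n   = buf[2] | (buf[3] << 8);   /* untrusted length */

	/* off + n could wrap if summed naively; compare subtraction-style */
	if (off > avail || n > avail - off)
		return -1;

	*name = buf + hdr + off;
	*len = n;
	return 0;
}

int main(void)
{
	const uint8_t raw[] = { 2, 0, 4, 0, 'x', 'x', 'n', 'a', 'm', 'e' };
	const uint8_t *name;
	size_t len;

	if (!get_name(raw, sizeof(raw), &name, &len))
		printf("%.*s\n", (int)len, (const char *)name);
	return 0;
}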
*/ - if (i_size <= offsetof(struct REPARSE_DATA_BUFFER, - MountPointReparseBuffer.PathBuffer)) + if (size <= offsetof(struct REPARSE_DATA_BUFFER, + MountPointReparseBuffer.PathBuffer)) goto out; - uni = Add2Ptr(rp, - offsetof(struct REPARSE_DATA_BUFFER, - MountPointReparseBuffer.PathBuffer) + - le16_to_cpu(rp->MountPointReparseBuffer - .PrintNameOffset) - - 2); - nlen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength); + uname = Add2Ptr(rp, + offsetof(struct REPARSE_DATA_BUFFER, + MountPointReparseBuffer.PathBuffer) + + le16_to_cpu(rp->MountPointReparseBuffer + .PrintNameOffset)); + ulen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength); break; case IO_REPARSE_TAG_SYMLINK: /* FolderSymbolicLink */ /* Can we use 'Rp->SymbolicLinkReparseBuffer.PrintNameLength'? */ - if (i_size <= offsetof(struct REPARSE_DATA_BUFFER, - SymbolicLinkReparseBuffer.PathBuffer)) + if (size <= offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer)) goto out; - uni = Add2Ptr(rp, - offsetof(struct REPARSE_DATA_BUFFER, - SymbolicLinkReparseBuffer.PathBuffer) + - le16_to_cpu(rp->SymbolicLinkReparseBuffer - .PrintNameOffset) - - 2); - nlen = le16_to_cpu( + uname = Add2Ptr( + rp, offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer) + + le16_to_cpu(rp->SymbolicLinkReparseBuffer + .PrintNameOffset)); + ulen = le16_to_cpu( rp->SymbolicLinkReparseBuffer.PrintNameLength); break; @@ -1864,29 +1868,28 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, goto out; } if (!IsReparseTagNameSurrogate(rp->ReparseTag) || - i_size <= sizeof(struct REPARSE_POINT)) { + size <= sizeof(struct REPARSE_POINT)) { goto out; } /* Users tag. */ - uni = Add2Ptr(rp, sizeof(struct REPARSE_POINT) - 2); - nlen = le16_to_cpu(rp->ReparseDataLength) - + uname = Add2Ptr(rp, sizeof(struct REPARSE_POINT)); + ulen = le16_to_cpu(rp->ReparseDataLength) - sizeof(struct REPARSE_POINT); } /* Convert nlen from bytes to UNICODE chars. */ - nlen >>= 1; + ulen >>= 1; /* Check that name is available. */ - if (!nlen || &uni->name[nlen] > (__le16 *)Add2Ptr(rp, i_size)) + if (!ulen || uname + ulen > (__le16 *)Add2Ptr(rp, size)) goto out; /* If name is already zero terminated then truncate it now. 
*/ - if (!uni->name[nlen - 1]) - nlen -= 1; - uni->len = nlen; + if (!uname[ulen - 1]) + ulen -= 1; - err = ntfs_utf16_to_nls(sbi, uni, buffer, buflen); + err = ntfs_utf16_to_nls(sbi, uname, ulen, buffer, buflen); if (err < 0) goto out; diff --git a/fs/ntfs3/lib/decompress_common.h b/fs/ntfs3/lib/decompress_common.h index 2d70ae42f1..dd7ced000d 100644 --- a/fs/ntfs3/lib/decompress_common.h +++ b/fs/ntfs3/lib/decompress_common.h @@ -5,6 +5,9 @@ * Copyright (C) 2015 Eric Biggers */ +#ifndef _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H +#define _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H + #include #include #include @@ -336,3 +339,5 @@ static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend return dst; } + +#endif /* _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H */ diff --git a/fs/ntfs3/lib/lib.h b/fs/ntfs3/lib/lib.h index f508fbad2e..90309a5ae5 100644 --- a/fs/ntfs3/lib/lib.h +++ b/fs/ntfs3/lib/lib.h @@ -7,6 +7,10 @@ * - linux kernel code style */ +#ifndef _LINUX_NTFS3_LIB_LIB_H +#define _LINUX_NTFS3_LIB_LIB_H + +#include /* globals from xpress_decompress.c */ struct xpress_decompressor *xpress_allocate_decompressor(void); @@ -24,3 +28,5 @@ int lzx_decompress(struct lzx_decompressor *__restrict d, const void *__restrict compressed_data, size_t compressed_size, void *__restrict uncompressed_data, size_t uncompressed_size); + +#endif /* _LINUX_NTFS3_LIB_LIB_H */ diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c index f1f691a67c..28f654561f 100644 --- a/fs/ntfs3/lznt.c +++ b/fs/ntfs3/lznt.c @@ -5,13 +5,13 @@ * */ -#include -#include -#include -#include +#include +#include +#include +#include +#include #include "debug.h" -#include "ntfs.h" #include "ntfs_fs.h" // clang-format off @@ -292,7 +292,7 @@ static inline ssize_t decompress_chunk(u8 *unc, u8 *unc_end, const u8 *cmpr, /* * get_lznt_ctx * @level: 0 - Standard compression. - * !0 - Best compression, requires a lot of cpu. + * !0 - Best compression, requires a lot of cpu. */ struct lznt *get_lznt_ctx(int level) { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index e58415d071..bc741213ad 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -5,11 +5,7 @@ * */ -#include -#include #include -#include -#include #include #include "debug.h" @@ -99,16 +95,11 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct ntfs_inode *ni = ntfs_i(dir); struct inode *inode; - ni_lock_dir(ni); - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -120,16 +111,11 @@ static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - struct ntfs_inode *ni = ntfs_i(dir); struct inode *inode; - ni_lock_dir(ni); - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -200,15 +186,10 @@ static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, { u32 size = strlen(symname); struct inode *inode; - struct ntfs_inode *ni = ntfs_i(dir); - - ni_lock_dir(ni); inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777, 0, symname, size, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? 
PTR_ERR(inode) : 0; } @@ -219,15 +200,10 @@ static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - struct ntfs_inode *ni = ntfs_i(dir); - - ni_lock_dir(ni); inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode, 0, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 6bb3e59526..9cc396b117 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -10,19 +10,27 @@ #ifndef _LINUX_NTFS3_NTFS_H #define _LINUX_NTFS3_NTFS_H -/* TODO: Check 4K MFT record and 512 bytes cluster. */ +#include +#include +#include +#include +#include +#include -/* Activate this define to use binary search in indexes. */ -#define NTFS3_INDEX_BINARY_SEARCH +#include "debug.h" + +/* TODO: Check 4K MFT record and 512 bytes cluster. */ /* Check each run for marked clusters. */ #define NTFS3_CHECK_FREE_CLST #define NTFS_NAME_LEN 255 -/* ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. */ -#define NTFS_LINK_MAX 0x400 -//#define NTFS_LINK_MAX 0xffff +/* + * ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. + * xfstest generic/041 creates 3003 hardlinks. + */ +#define NTFS_LINK_MAX 4000 /* * Activate to use 64 bit clusters instead of 32 bits in ntfs.sys. diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index dc71c59fd4..8aaec7e080 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -9,6 +9,37 @@ #ifndef _LINUX_NTFS3_NTFS_FS_H #define _LINUX_NTFS3_NTFS_FS_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "debug.h" +#include "ntfs.h" + +struct dentry; +struct fiemap_extent_info; +struct user_namespace; +struct page; +struct writeback_control; +enum utf16_endian; + + #define MINUS_ONE_T ((size_t)(-1)) /* Biggest MFT / smallest cluster */ #define MAXIMUM_BYTES_PER_MFT 4096 @@ -52,6 +83,7 @@ // clang-format on struct ntfs_mount_options { + char *nls_name; struct nls_table *nls; kuid_t fs_uid; @@ -59,19 +91,16 @@ struct ntfs_mount_options { u16 fs_fmask_inv; u16 fs_dmask_inv; - unsigned uid : 1, /* uid was set. */ - gid : 1, /* gid was set. */ - fmask : 1, /* fmask was set. */ - dmask : 1, /* dmask was set. */ - sys_immutable : 1, /* Immutable system files. */ - discard : 1, /* Issue discard requests on deletions. */ - sparse : 1, /* Create sparse files. */ - showmeta : 1, /* Show meta files. */ - nohidden : 1, /* Do not show hidden files. */ - force : 1, /* Rw mount dirty volume. */ - no_acs_rules : 1, /*Exclude acs rules. */ - prealloc : 1 /* Preallocate space when file is growing. */ - ; + unsigned fmask : 1; /* fmask was set. */ + unsigned dmask : 1; /*dmask was set. */ + unsigned sys_immutable : 1; /* Immutable system files. */ + unsigned discard : 1; /* Issue discard requests on deletions. */ + unsigned sparse : 1; /* Create sparse files. */ + unsigned showmeta : 1; /* Show meta files. */ + unsigned nohidden : 1; /* Do not show hidden files. */ + unsigned force : 1; /* RW mount dirty volume. */ + unsigned noacsrules : 1; /* Exclude acs rules. */ + unsigned prealloc : 1; /* Preallocate space when file is growing. */ }; /* Special value to unpack and deallocate. 
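The guards added to decompress_common.h and lib.h above follow the standard idiom; without one, a header pulled in twice redefines its types and breaks the build:

#ifndef _EXAMPLE_ONCE_H
#define _EXAMPLE_ONCE_H

struct example { int x; };      /* defined at most once per translation unit */

#endif /* _EXAMPLE_ONCE_H */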
*/ @@ -182,10 +211,8 @@ struct ntfs_sb_info { u32 blocks_per_cluster; // cluster_size / sb->s_blocksize u32 record_size; - u32 sector_size; u32 index_size; - u8 sector_bits; u8 cluster_bits; u8 record_bits; @@ -279,7 +306,7 @@ struct ntfs_sb_info { #endif } compress; - struct ntfs_mount_options options; + struct ntfs_mount_options *options; struct ratelimit_state msg_ratelimit; }; @@ -436,7 +463,7 @@ bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, const __le16 *name, size_t name_len, const struct MFT_REF *ref); -int al_update(struct ntfs_inode *ni); +int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { return (size + 1023) & ~(size_t)1023; @@ -448,7 +475,7 @@ bool are_bits_set(const ulong *map, size_t bit, size_t nbits); size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits); /* Globals from dir.c */ -int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, u8 *buf, int buf_len); int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, struct cpu_str *uni, u32 max_ulen, @@ -520,7 +547,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, struct ATTR_LIST_ENTRY **entry); int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa); enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, - void *buffer); + struct REPARSE_DATA_BUFFER *buffer); int ni_write_inode(struct inode *inode, int sync, const char *hint); #define _ni_write_inode(i, w) ni_write_inode(i, w, __func__) int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, @@ -577,7 +604,7 @@ int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer); int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, const void *buffer, int wait); int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, - u64 vbo, const void *buf, size_t bytes); + u64 vbo, const void *buf, size_t bytes, int sync); struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo); int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run, diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 103705c867..861e357915 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -5,10 +5,7 @@ * */ -#include -#include #include -#include #include "debug.h" #include "ntfs.h" diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index 26ed2b6434..a8fec651f9 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -7,10 +7,8 @@ */ #include -#include #include #include -#include #include "debug.h" #include "ntfs.h" diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 55bbc9200a..d41d76979e 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -23,16 +23,15 @@ * */ -#include #include #include #include #include -#include +#include +#include #include #include #include -#include #include #include @@ -205,9 +204,11 @@ void *ntfs_put_shared(void *ptr) return ret; } -static inline void clear_mount_options(struct ntfs_mount_options *options) +static inline void put_mount_options(struct ntfs_mount_options *options) { + kfree(options->nls_name); unload_nls(options->nls); + kfree(options); } enum Opt { @@ -223,218 +224,175 @@ enum Opt { Opt_nohidden, Opt_showmeta, Opt_acl, - Opt_noatime, - Opt_nls, + Opt_iocharset, Opt_prealloc, - Opt_no_acs_rules, + Opt_noacsrules, 
Opt_err, }; -static const match_table_t ntfs_tokens = { - { Opt_uid, "uid=%u" }, - { Opt_gid, "gid=%u" }, - { Opt_umask, "umask=%o" }, - { Opt_dmask, "dmask=%o" }, - { Opt_fmask, "fmask=%o" }, - { Opt_immutable, "sys_immutable" }, - { Opt_discard, "discard" }, - { Opt_force, "force" }, - { Opt_sparse, "sparse" }, - { Opt_nohidden, "nohidden" }, - { Opt_acl, "acl" }, - { Opt_noatime, "noatime" }, - { Opt_showmeta, "showmeta" }, - { Opt_nls, "nls=%s" }, - { Opt_prealloc, "prealloc" }, - { Opt_no_acs_rules, "no_acs_rules" }, - { Opt_err, NULL }, +static const struct fs_parameter_spec ntfs_fs_parameters[] = { + fsparam_u32("uid", Opt_uid), + fsparam_u32("gid", Opt_gid), + fsparam_u32oct("umask", Opt_umask), + fsparam_u32oct("dmask", Opt_dmask), + fsparam_u32oct("fmask", Opt_fmask), + fsparam_flag_no("sys_immutable", Opt_immutable), + fsparam_flag_no("discard", Opt_discard), + fsparam_flag_no("force", Opt_force), + fsparam_flag_no("sparse", Opt_sparse), + fsparam_flag_no("hidden", Opt_nohidden), + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("showmeta", Opt_showmeta), + fsparam_flag_no("prealloc", Opt_prealloc), + fsparam_flag_no("acsrules", Opt_noacsrules), + fsparam_string("iocharset", Opt_iocharset), + {} }; -static noinline int ntfs_parse_options(struct super_block *sb, char *options, - int silent, - struct ntfs_mount_options *opts) +/* + * Load nls table or if @nls is utf8 then return NULL. + */ +static struct nls_table *ntfs_load_nls(char *nls) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char nls_name[30]; - struct nls_table *nls; + struct nls_table *ret; - opts->fs_uid = current_uid(); - opts->fs_gid = current_gid(); - opts->fs_fmask_inv = opts->fs_dmask_inv = ~current_umask(); - nls_name[0] = 0; + if (!nls) + nls = CONFIG_NLS_DEFAULT; - if (!options) - goto out; + if (strcmp(nls, "utf8") == 0) + return NULL; - while ((p = strsep(&options, ","))) { - int token; + if (strcmp(nls, CONFIG_NLS_DEFAULT) == 0) + return load_nls_default(); - if (!*p) - continue; + ret = load_nls(nls); + if (ret) + return ret; - token = match_token(p, ntfs_tokens, args); - switch (token) { - case Opt_immutable: - opts->sys_immutable = 1; - break; - case Opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - opts->fs_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(opts->fs_uid)) - return -EINVAL; - opts->uid = 1; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - opts->fs_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(opts->fs_gid)) - return -EINVAL; - opts->gid = 1; - break; - case Opt_umask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_fmask_inv = opts->fs_dmask_inv = ~option; - opts->fmask = opts->dmask = 1; - break; - case Opt_dmask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_dmask_inv = ~option; - opts->dmask = 1; - break; - case Opt_fmask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_fmask_inv = ~option; - opts->fmask = 1; - break; - case Opt_discard: - opts->discard = 1; - break; - case Opt_force: - opts->force = 1; - break; - case Opt_sparse: - opts->sparse = 1; - break; - case Opt_nohidden: - opts->nohidden = 1; - break; - case Opt_acl: + return ERR_PTR(-EINVAL); +} + +static int ntfs_fs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct ntfs_mount_options *opts = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, ntfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch 
(opt) { + case Opt_uid: + opts->fs_uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(opts->fs_uid)) + return invalf(fc, "ntfs3: Invalid value for uid."); + break; + case Opt_gid: + opts->fs_gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(opts->fs_gid)) + return invalf(fc, "ntfs3: Invalid value for gid."); + break; + case Opt_umask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for umask."); + opts->fs_fmask_inv = ~result.uint_32; + opts->fs_dmask_inv = ~result.uint_32; + opts->fmask = 1; + opts->dmask = 1; + break; + case Opt_dmask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for dmask."); + opts->fs_dmask_inv = ~result.uint_32; + opts->dmask = 1; + break; + case Opt_fmask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for fmask."); + opts->fs_fmask_inv = ~result.uint_32; + opts->fmask = 1; + break; + case Opt_immutable: + opts->sys_immutable = result.negated ? 0 : 1; + break; + case Opt_discard: + opts->discard = result.negated ? 0 : 1; + break; + case Opt_force: + opts->force = result.negated ? 0 : 1; + break; + case Opt_sparse: + opts->sparse = result.negated ? 0 : 1; + break; + case Opt_nohidden: + opts->nohidden = result.negated ? 1 : 0; + break; + case Opt_acl: + if (!result.negated) #ifdef CONFIG_NTFS3_FS_POSIX_ACL - sb->s_flags |= SB_POSIXACL; - break; + fc->sb_flags |= SB_POSIXACL; #else - ntfs_err(sb, "support for ACL not compiled in!"); - return -EINVAL; + return invalf(fc, "ntfs3: Support for ACL not compiled in!"); #endif - case Opt_noatime: - sb->s_flags |= SB_NOATIME; - break; - case Opt_showmeta: - opts->showmeta = 1; - break; - case Opt_nls: - match_strlcpy(nls_name, &args[0], sizeof(nls_name)); - break; - case Opt_prealloc: - opts->prealloc = 1; - break; - case Opt_no_acs_rules: - opts->no_acs_rules = 1; - break; - default: - if (!silent) - ntfs_err( - sb, - "Unrecognized mount option \"%s\" or missing value", - p); - //return -EINVAL; - } + else + fc->sb_flags &= ~SB_POSIXACL; + break; + case Opt_showmeta: + opts->showmeta = result.negated ? 0 : 1; + break; + case Opt_iocharset: + kfree(opts->nls_name); + opts->nls_name = param->string; + param->string = NULL; + break; + case Opt_prealloc: + opts->prealloc = result.negated ? 0 : 1; + break; + case Opt_noacsrules: + opts->noacsrules = result.negated ? 1 : 0; + break; + default: + /* Should not be here unless we forget add case. */ + return -EINVAL; } - -out: - if (!strcmp(nls_name[0] ? nls_name : CONFIG_NLS_DEFAULT, "utf8")) { - /* - * For UTF-8 use utf16s_to_utf8s()/utf8s_to_utf16s() - * instead of NLS. - */ - nls = NULL; - } else if (nls_name[0]) { - nls = load_nls(nls_name); - if (!nls) { - ntfs_err(sb, "failed to load \"%s\"", nls_name); - return -EINVAL; - } - } else { - nls = load_nls_default(); - if (!nls) { - ntfs_err(sb, "failed to load default nls"); - return -EINVAL; - } - } - opts->nls = nls; - return 0; } -static int ntfs_remount(struct super_block *sb, int *flags, char *data) +static int ntfs_fs_reconfigure(struct fs_context *fc) { - int err, ro_rw; + struct super_block *sb = fc->root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - struct ntfs_mount_options old_opts; - char *orig_data = kstrdup(data, GFP_KERNEL); + struct ntfs_mount_options *new_opts = fc->fs_private; + int ro_rw; - if (data && !orig_data) - return -ENOMEM; - - /* Store original options. 
*/ - memcpy(&old_opts, &sbi->options, sizeof(old_opts)); - clear_mount_options(&sbi->options); - memset(&sbi->options, 0, sizeof(sbi->options)); - - err = ntfs_parse_options(sb, data, 0, &sbi->options); - if (err) - goto restore_opts; - - ro_rw = sb_rdonly(sb) && !(*flags & SB_RDONLY); + ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY); if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) { - ntfs_warn( - sb, - "Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); - err = -EINVAL; - goto restore_opts; + errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); + return -EINVAL; } + new_opts->nls = ntfs_load_nls(new_opts->nls_name); + if (IS_ERR(new_opts->nls)) { + new_opts->nls = NULL; + errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name); + return -EINVAL; + } + if (new_opts->nls != sbi->options->nls) + return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!"); + sync_filesystem(sb); if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) && - !sbi->options.force) { - ntfs_warn(sb, "volume is dirty and \"force\" flag is not set!"); - err = -EINVAL; - goto restore_opts; + !new_opts->force) { + errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!"); + return -EINVAL; } - clear_mount_options(&old_opts); + memcpy(sbi->options, new_opts, sizeof(*new_opts)); - *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME) | - SB_NODIRATIME | SB_NOATIME; - ntfs_info(sb, "re-mounted. Opts: %s", orig_data); - err = 0; - goto out; - -restore_opts: - clear_mount_options(&sbi->options); - memcpy(&sbi->options, &old_opts, sizeof(old_opts)); - -out: - kfree(orig_data); - return err; + return 0; } static struct kmem_cache *ntfs_inode_cachep; @@ -513,8 +471,6 @@ static noinline void put_ntfs(struct ntfs_sb_info *sbi) xpress_free_decompressor(sbi->compress.xpress); lzx_free_decompressor(sbi->compress.lzx); #endif - clear_mount_options(&sbi->options); - kfree(sbi); } @@ -525,7 +481,9 @@ static void ntfs_put_super(struct super_block *sb) /* Mark rw ntfs as clear, if possible. 
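 */

/*
 * The reconfigure path above follows a validate-then-publish shape: every
 * check runs against the prospective options, and only after the last one
 * passes are they copied over the live ones. A minimal sketch under
 * assumptions: the examplefs types and the needs_replay field are
 * illustrative, while sb_rdonly(), sync_filesystem() and invalf() are real.
 */
#include <linux/fs.h>
#include <linux/fs_context.h>

static int examplefs_reconfigure(struct fs_context *fc)
{
	struct super_block *sb = fc->root->d_sb;
	struct examplefs_options *new = fc->fs_private;
	struct examplefs_sb_info *sbi = sb->s_fs_info;
	bool ro_to_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY);

	if (ro_to_rw && sbi->needs_replay)
		return invalf(fc, "examplefs: journal not replayed, cannot remount rw");

	sync_filesystem(sb);

	*sbi->opts = *new;	/* publish only after every check has passed */
	return 0;
}

	/*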
*/ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); + put_mount_options(sbi->options); put_ntfs(sbi); + sb->s_fs_info = NULL; sync_blockdev(sb->s_bdev); } @@ -552,23 +510,21 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) { struct super_block *sb = root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - struct ntfs_mount_options *opts = &sbi->options; + struct ntfs_mount_options *opts = sbi->options; struct user_namespace *user_ns = seq_user_ns(m); - if (opts->uid) - seq_printf(m, ",uid=%u", - from_kuid_munged(user_ns, opts->fs_uid)); - if (opts->gid) - seq_printf(m, ",gid=%u", - from_kgid_munged(user_ns, opts->fs_gid)); + seq_printf(m, ",uid=%u", + from_kuid_munged(user_ns, opts->fs_uid)); + seq_printf(m, ",gid=%u", + from_kgid_munged(user_ns, opts->fs_gid)); if (opts->fmask) seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv); if (opts->dmask) seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv); if (opts->nls) - seq_printf(m, ",nls=%s", opts->nls->charset); + seq_printf(m, ",iocharset=%s", opts->nls->charset); else - seq_puts(m, ",nls=utf8"); + seq_puts(m, ",iocharset=utf8"); if (opts->sys_immutable) seq_puts(m, ",sys_immutable"); if (opts->discard) @@ -581,14 +537,12 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nohidden"); if (opts->force) seq_puts(m, ",force"); - if (opts->no_acs_rules) - seq_puts(m, ",no_acs_rules"); + if (opts->noacsrules) + seq_puts(m, ",noacsrules"); if (opts->prealloc) seq_puts(m, ",prealloc"); if (sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); - if (sb->s_flags & SB_NOATIME) - seq_puts(m, ",noatime"); return 0; } @@ -643,7 +597,6 @@ static const struct super_operations ntfs_sops = { .statfs = ntfs_statfs, .show_options = ntfs_show_options, .sync_fs = ntfs_sync_fs, - .remount_fs = ntfs_remount, .write_inode = ntfs3_write_inode, }; @@ -729,7 +682,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, struct ntfs_sb_info *sbi = sb->s_fs_info; int err; u32 mb, gb, boot_sector_size, sct_per_clst, record_size; - u64 sectors, clusters, fs_size, mlcn, mlcn2; + u64 sectors, clusters, mlcn, mlcn2; struct NTFS_BOOT *boot; struct buffer_head *bh; struct MFT_REC *rec; @@ -787,20 +740,20 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, goto out; } - sbi->sector_size = boot_sector_size; - sbi->sector_bits = blksize_bits(boot_sector_size); - fs_size = (sectors + 1) << sbi->sector_bits; + sbi->volume.size = sectors * boot_sector_size; - gb = format_size_gb(fs_size, &mb); + gb = format_size_gb(sbi->volume.size + boot_sector_size, &mb); /* * - Volume formatted and mounted with the same sector size. * - Volume formatted 4K and mounted as 512. * - Volume formatted 512 and mounted as 4K. */ - if (sbi->sector_size != sector_size) { - ntfs_warn(sb, - "Different NTFS' sector size and media sector size"); + if (boot_sector_size != sector_size) { + ntfs_warn( + sb, + "Different NTFS' sector size (%u) and media sector size (%u)", + boot_sector_size, sector_size); dev_size += sector_size - 1; } @@ -810,9 +763,20 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sbi->mft.lbo = mlcn << sbi->cluster_bits; sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits; - if (sbi->cluster_size < sbi->sector_size) + /* Compare boot's cluster and sector. */ + if (sbi->cluster_size < boot_sector_size) goto out; + /* Compare boot's cluster and media sector. */ + if (sbi->cluster_size < sector_size) { + /* No way to use ntfs_get_block in this case. 
*/ + ntfs_err( + sb, + "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)", + sbi->cluster_size, sector_size); + goto out; + } + sbi->cluster_mask = sbi->cluster_size - 1; sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask; sbi->record_size = record_size = boot->record_size < 0 @@ -836,10 +800,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, : (u32)boot->index_size << sbi->cluster_bits; sbi->volume.ser_num = le64_to_cpu(boot->serial_num); - sbi->volume.size = sectors << sbi->sector_bits; /* Warning if RAW volume. */ - if (dev_size < fs_size) { + if (dev_size < sbi->volume.size + boot_sector_size) { u32 mb0, gb0; gb0 = format_size_gb(dev_size, &mb0); @@ -883,8 +846,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, rec->total = cpu_to_le32(sbi->record_size); ((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END; - if (sbi->cluster_size < PAGE_SIZE) - sb_set_blocksize(sb, sbi->cluster_size); + sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE)); sbi->block_mask = sb->s_blocksize - 1; sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits; @@ -897,9 +859,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, if (clusters >= (1ull << (64 - sbi->cluster_bits))) sbi->maxbytes = -1; sbi->maxbytes_sparse = -1; + sb->s_maxbytes = MAX_LFS_FILESIZE; #else /* Maximum size for sparse file. */ sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1; + sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; #endif err = 0; @@ -913,14 +877,13 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, /* * ntfs_fill_super - Try to mount. */ -static int ntfs_fill_super(struct super_block *sb, void *data, int silent) +static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) { int err; - struct ntfs_sb_info *sbi; + struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; - struct inode *bd_inode = bdev->bd_inode; - struct request_queue *rq = bdev_get_queue(bdev); - struct inode *inode = NULL; + struct request_queue *rq; + struct inode *inode; struct ntfs_inode *ni; size_t i, tt; CLST vcn, lcn, len; @@ -928,18 +891,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) const struct VOLUME_INFO *info; u32 idx, done, bytes; struct ATTR_DEF_ENTRY *t; - u16 *upcase = NULL; u16 *shared; - bool is_ro; struct MFT_REF ref; ref.high = 0; - sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS); - if (!sbi) - return -ENOMEM; - - sb->s_fs_info = sbi; sbi->sb = sb; sb->s_flags |= SB_NODIRATIME; sb->s_magic = 0x7366746e; // "ntfs" @@ -948,41 +904,27 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec sb->s_xattr = ntfs_xattr_handlers; - ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - err = ntfs_parse_options(sb, data, silent, &sbi->options); - if (err) + sbi->options->nls = ntfs_load_nls(sbi->options->nls_name); + if (IS_ERR(sbi->options->nls)) { + sbi->options->nls = NULL; + errorf(fc, "Cannot load nls %s", sbi->options->nls_name); + err = -EINVAL; goto out; + } - if (!rq || !blk_queue_discard(rq) || !rq->limits.discard_granularity) { - ; - } else { + rq = bdev_get_queue(bdev); + if (blk_queue_discard(rq) && rq->limits.discard_granularity) { sbi->discard_granularity = rq->limits.discard_granularity; sbi->discard_granularity_mask_inv = ~(u64)(sbi->discard_granularity - 1); } - 
sb_set_blocksize(sb, PAGE_SIZE); - /* Parse boot. */ err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512, - bd_inode->i_size); + bdev->bd_inode->i_size); if (err) goto out; -#ifdef CONFIG_NTFS3_64BIT_CLUSTER - sb->s_maxbytes = MAX_LFS_FILESIZE; -#else - sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; -#endif - - mutex_init(&sbi->compress.mtx_lznt); -#ifdef CONFIG_NTFS3_LZX_XPRESS - mutex_init(&sbi->compress.mtx_xpress); - mutex_init(&sbi->compress.mtx_lzx); -#endif - /* * Load $Volume. This should be done before $LogFile * 'cause 'sbi->volume.ni' is used 'ntfs_set_state'. @@ -991,9 +933,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_VOL); inode = ntfs_iget5(sb, &ref, &NAME_VOLUME); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Volume."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1015,36 +956,33 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) } else { /* Should we break mounting here? */ //err = -EINVAL; - //goto out; + //goto put_inode_out; } attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL); if (!attr || is_attr_ext(attr)) { err = -EINVAL; - goto out; + goto put_inode_out; } info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO); if (!info) { err = -EINVAL; - goto out; + goto put_inode_out; } sbi->volume.major_ver = info->major_ver; sbi->volume.minor_ver = info->minor_ver; sbi->volume.flags = info->flags; - sbi->volume.ni = ni; - inode = NULL; /* Load $MFTMirr to estimate recs_mirr. */ ref.low = cpu_to_le32(MFT_REC_MIRR); ref.seq = cpu_to_le16(MFT_REC_MIRR); inode = ntfs_iget5(sb, &ref, &NAME_MIRROR); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFTMirr."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1058,9 +996,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_LOG); inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load \x24LogFile."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1068,22 +1005,19 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) err = ntfs_loadlog_and_replay(ni, sbi); if (err) - goto out; + goto put_inode_out; iput(inode); - inode = NULL; - - is_ro = sb_rdonly(sbi->sb); if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) { - if (!is_ro) { + if (!sb_rdonly(sb)) { ntfs_warn(sb, "failed to replay log file. 
Can't mount rw!"); err = -EINVAL; goto out; } } else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) { - if (!is_ro && !sbi->options.force) { + if (!sb_rdonly(sb) && !sbi->options->force) { ntfs_warn( sb, "volume is dirty and \"force\" flag is not set!"); @@ -1098,9 +1032,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) inode = ntfs_iget5(sb, &ref, &NAME_MFT); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFT."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1112,11 +1045,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) err = wnd_init(&sbi->mft.bitmap, sb, tt); if (err) - goto out; + goto put_inode_out; err = ni_load_all_mi(ni); if (err) - goto out; + goto put_inode_out; sbi->mft.ni = ni; @@ -1125,9 +1058,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_BADCLUST); inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $BadClus."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1150,18 +1082,15 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_BITMAP); inode = ntfs_iget5(sb, &ref, &NAME_BITMAP); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Bitmap."); - inode = NULL; + err = PTR_ERR(inode); goto out; } - ni = ntfs_i(inode); - #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (inode->i_size >> 32) { err = -EINVAL; - goto out; + goto put_inode_out; } #endif @@ -1169,14 +1098,14 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) tt = sbi->used.bitmap.nbits; if (inode->i_size < bitmap_size(tt)) { err = -EINVAL; - goto out; + goto put_inode_out; } /* Not necessary. */ sbi->used.bitmap.set_tail = true; - err = wnd_init(&sbi->used.bitmap, sbi->sb, tt); + err = wnd_init(&sbi->used.bitmap, sb, tt); if (err) - goto out; + goto put_inode_out; iput(inode); @@ -1188,23 +1117,22 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) /* Load $AttrDef. 
*/ ref.low = cpu_to_le32(MFT_REC_ATTR); ref.seq = cpu_to_le16(MFT_REC_ATTR); - inode = ntfs_iget5(sbi->sb, &ref, &NAME_ATTRDEF); + inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $AttrDef -> %d", err); - inode = NULL; + err = PTR_ERR(inode); goto out; } if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) { err = -EINVAL; - goto out; + goto put_inode_out; } bytes = inode->i_size; sbi->def_table = t = kmalloc(bytes, GFP_NOFS); if (!t) { err = -ENOMEM; - goto out; + goto put_inode_out; } for (done = idx = 0; done < bytes; done += PAGE_SIZE, idx++) { @@ -1213,7 +1141,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) if (IS_ERR(page)) { err = PTR_ERR(page); - goto out; + goto put_inode_out; } memcpy(Add2Ptr(t, done), page_address(page), min(PAGE_SIZE, tail)); @@ -1221,7 +1149,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) if (!idx && ATTR_STD != t->type) { err = -EINVAL; - goto out; + goto put_inode_out; } } @@ -1254,33 +1182,24 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_UPCASE); inode = ntfs_iget5(sb, &ref, &NAME_UPCASE); if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load $UpCase."); err = PTR_ERR(inode); - ntfs_err(sb, "Failed to load \x24LogFile."); - inode = NULL; goto out; } - ni = ntfs_i(inode); - if (inode->i_size != 0x10000 * sizeof(short)) { err = -EINVAL; - goto out; - } - - sbi->upcase = upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL); - if (!upcase) { - err = -ENOMEM; - goto out; + goto put_inode_out; } for (idx = 0; idx < (0x10000 * sizeof(short) >> PAGE_SHIFT); idx++) { const __le16 *src; - u16 *dst = Add2Ptr(upcase, idx << PAGE_SHIFT); + u16 *dst = Add2Ptr(sbi->upcase, idx << PAGE_SHIFT); struct page *page = ntfs_map_page(inode->i_mapping, idx); if (IS_ERR(page)) { err = PTR_ERR(page); - goto out; + goto put_inode_out; } src = page_address(page); @@ -1294,14 +1213,13 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ntfs_unmap_page(page); } - shared = ntfs_set_shared(upcase, 0x10000 * sizeof(short)); - if (shared && upcase != shared) { + shared = ntfs_set_shared(sbi->upcase, 0x10000 * sizeof(short)); + if (shared && sbi->upcase != shared) { + kvfree(sbi->upcase); sbi->upcase = shared; - kvfree(upcase); } iput(inode); - inode = NULL; if (is_ntfs3(sbi)) { /* Load $Secure. */ @@ -1331,34 +1249,31 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_ROOT); inode = ntfs_iget5(sb, &ref, &NAME_ROOT); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load root."); - inode = NULL; + err = PTR_ERR(inode); goto out; } - ni = ntfs_i(inode); - sb->s_root = d_make_root(inode); - if (!sb->s_root) { - err = -EINVAL; - goto out; + err = -ENOMEM; + goto put_inode_out; } + fc->fs_private = NULL; + return 0; -out: +put_inode_out: iput(inode); - - if (sb->s_root) { - d_drop(sb->s_root); - sb->s_root = NULL; - } - +out: + /* + * Free resources here. 
+	 * ntfs_fs_free will then be called with fc->s_fs_info == NULL.
+	 */
 	put_ntfs(sbi);
-	sb->s_fs_info = NULL;
+
 	return err;
 }
@@ -1403,7 +1318,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
 	if (sbi->flags & NTFS_FLAGS_NODISCARD)
 		return -EOPNOTSUPP;
 
-	if (!sbi->options.discard)
+	if (!sbi->options->discard)
 		return -EOPNOTSUPP;
 
 	lbo = (u64)lcn << sbi->cluster_bits;
@@ -1428,19 +1343,99 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
 	return err;
 }
 
-static struct dentry *ntfs_mount(struct file_system_type *fs_type, int flags,
-				 const char *dev_name, void *data)
+static int ntfs_fs_get_tree(struct fs_context *fc)
 {
-	return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+	return get_tree_bdev(fc, ntfs_fill_super);
+}
+
+/*
+ * ntfs_fs_free - Free fs_context.
+ *
+ * Note that this is called after fill_super and reconfigure even when
+ * they succeed, so on success those callbacks must steal (and clear)
+ * the pointers they intend to keep.
+ */
+static void ntfs_fs_free(struct fs_context *fc)
+{
+	struct ntfs_mount_options *opts = fc->fs_private;
+	struct ntfs_sb_info *sbi = fc->s_fs_info;
+
+	if (sbi)
+		put_ntfs(sbi);
+
+	if (opts)
+		put_mount_options(opts);
 }
+
+static const struct fs_context_operations ntfs_context_ops = {
+	.parse_param	= ntfs_fs_parse_param,
+	.get_tree	= ntfs_fs_get_tree,
+	.reconfigure	= ntfs_fs_reconfigure,
+	.free		= ntfs_fs_free,
+};
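
/*
 * A sketch of the ownership hand-off the .free callback above relies on.
 * Under get_tree_bdev() the VFS steals fc->s_fs_info into sb->s_fs_info
 * on success, and a fill_super that wants to keep the parsed options must
 * likewise clear fc->fs_private, so the unconditional ntfs_fs_free() only
 * releases what the context still owns. The examplefs names are
 * illustrative:
 */
#include <linux/fs_context.h>

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	struct examplefs_sb_info *sbi = sb->s_fs_info;
	int err = examplefs_setup(sbi);	/* hypothetical setup step */

	if (err)
		return err;		/* .free still owns fc->fs_private */

	fc->fs_private = NULL;		/* stolen: now freed with the sb */
	return 0;
}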
+
+/*
+ * ntfs_init_fs_context - Initialize sbi and opts.
+ *
+ * This is called on mount and remount. We initialize the options first,
+ * so that on remount they are all that needs to be reused.
+ */
+static int ntfs_init_fs_context(struct fs_context *fc)
+{
+	struct ntfs_mount_options *opts;
+	struct ntfs_sb_info *sbi;
+
+	opts = kzalloc(sizeof(struct ntfs_mount_options), GFP_NOFS);
+	if (!opts)
+		return -ENOMEM;
+
+	/* Default options. */
+	opts->fs_uid = current_uid();
+	opts->fs_gid = current_gid();
+	opts->fs_fmask_inv = ~current_umask();
+	opts->fs_dmask_inv = ~current_umask();
+
+	if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+		goto ok;
+
+	sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS);
+	if (!sbi)
+		goto free_opts;
+
+	sbi->upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL);
+	if (!sbi->upcase)
+		goto free_sbi;
+
+	ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+			     DEFAULT_RATELIMIT_BURST);
+
+	mutex_init(&sbi->compress.mtx_lznt);
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+	mutex_init(&sbi->compress.mtx_xpress);
+	mutex_init(&sbi->compress.mtx_lzx);
+#endif
+
+	sbi->options = opts;
+	fc->s_fs_info = sbi;
+ok:
+	fc->fs_private = opts;
+	fc->ops = &ntfs_context_ops;
+
+	return 0;
+free_sbi:
+	kfree(sbi);
+free_opts:
+	kfree(opts);
+	return -ENOMEM;
+}
 
 // clang-format off
 static struct file_system_type ntfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ntfs3",
-	.mount		= ntfs_mount,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+	.owner			= THIS_MODULE,
+	.name			= "ntfs3",
+	.init_fs_context	= ntfs_init_fs_context,
+	.parameters		= ntfs_fs_parameters,
+	.kill_sb		= kill_block_super,
+	.fs_flags		= FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
 };
 // clang-format on
diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c
index bbeba77823..b5e8256fd7 100644
--- a/fs/ntfs3/upcase.c
+++ b/fs/ntfs3/upcase.c
@@ -5,13 +5,9 @@
  *
  */
 
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/module.h>
-#include <linux/nls.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
 
-#include "debug.h"
-#include "ntfs.h"
 #include "ntfs_fs.h"
 
 static inline u16 upcase_unicode_char(const u16 *upcase, u16 chr)
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 7282d85c4e..afd0ddad82 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -5,10 +5,7 @@
  *
  */
 
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
 #include <linux/fs.h>
-#include <linux/nls.h>
 #include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
@@ -78,6 +75,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
 			size_t add_bytes, const struct EA_INFO **info)
 {
 	int err;
+	struct ntfs_sb_info *sbi = ni->mi.sbi;
 	struct ATTR_LIST_ENTRY *le = NULL;
 	struct ATTRIB *attr_info, *attr_ea;
 	void *ea_p;
@@ -102,10 +100,10 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
 
 	/* Check Ea limit. */
 	size = le32_to_cpu((*info)->size);
-	if (size > ni->mi.sbi->ea_max_size)
+	if (size > sbi->ea_max_size)
 		return -EFBIG;
 
-	if (attr_size(attr_ea) > ni->mi.sbi->ea_max_size)
+	if (attr_size(attr_ea) > sbi->ea_max_size)
 		return -EFBIG;
 
 	/* Allocate memory for packed Ea. */
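
/*
 * For completeness, how a file_system_type like ntfs_fs_type above is
 * hooked up at module load once .mount is replaced by .init_fs_context
 * and a .parameters table; the examplefs names are illustrative, the
 * registration calls are real:
 */
#include <linux/fs.h>
#include <linux/module.h>

static struct file_system_type examplefs_type = {
	.owner			= THIS_MODULE,
	.name			= "examplefs",
	.init_fs_context	= examplefs_init_fs_context,
	.parameters		= examplefs_parameters,
	.kill_sb		= kill_block_super,
	.fs_flags		= FS_REQUIRES_DEV,
};

static int __init examplefs_init(void)
{
	return register_filesystem(&examplefs_type);
}

static void __exit examplefs_exit(void)
{
	unregister_filesystem(&examplefs_type);
}

module_init(examplefs_init);
module_exit(examplefs_exit);
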
@@ -113,15 +111,16 @@
 	if (!ea_p)
 		return -ENOMEM;
 
-	if (attr_ea->non_res) {
+	if (!size) {
+		;
+	} else if (attr_ea->non_res) {
 		struct runs_tree run;
 
 		run_init(&run);
 
 		err = attr_load_runs(attr_ea, ni, &run, NULL);
 		if (!err)
-			err = ntfs_read_run_nb(ni->mi.sbi, &run, 0, ea_p, size,
-					       NULL);
+			err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL);
 		run_close(&run);
 
 		if (err)
@@ -260,7 +259,7 @@ static int ntfs_get_ea(struct inode *inode, const char *name, size_t name_len,
 
 static noinline int ntfs_set_ea(struct inode *inode, const char *name,
 				size_t name_len, const void *value,
-				size_t val_size, int flags, int locked)
+				size_t val_size, int flags)
 {
 	struct ntfs_inode *ni = ntfs_i(inode);
 	struct ntfs_sb_info *sbi = ni->mi.sbi;
@@ -279,8 +278,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
 	u64 new_sz;
 	void *p;
 
-	if (!locked)
-		ni_lock(ni);
+	ni_lock(ni);
 
 	run_init(&ea_run);
 
@@ -370,21 +368,22 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
 	new_ea->name[name_len] = 0;
 	memcpy(new_ea->name + name_len + 1, value, val_size);
 	new_pack = le16_to_cpu(ea_info.size_pack) + packed_ea_size(new_ea);
-
-	/* Should fit into 16 bits. */
-	if (new_pack > 0xffff) {
-		err = -EFBIG; // -EINVAL?
-		goto out;
-	}
 	ea_info.size_pack = cpu_to_le16(new_pack);
-	/* New size of ATTR_EA. */
 	size += add;
-	if (size > sbi->ea_max_size) {
+	ea_info.size = cpu_to_le32(size);
+
+	/*
+	 * 1. Check ea_info.size_pack for overflow.
+	 * 2. The new attribute size must fit the value from $AttrDef.
+	 */
+	if (new_pack > 0xffff || size > sbi->ea_max_size) {
+		ntfs_inode_warn(
+			inode,
+			"The size of extended attributes must not exceed 64KiB");
 		err = -EFBIG; // -EINVAL?
 		goto out;
 	}
-	ea_info.size = cpu_to_le32(size);
 
 update_ea:
 
@@ -444,7 +443,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
 		/* Delete xattr, ATTR_EA */
 		ni_remove_attr_le(ni, attr, mi, le);
 	} else if (attr->non_res) {
-		err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size);
+		err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0);
 		if (err)
 			goto out;
 	} else {
@@ -468,8 +467,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
 	mark_inode_dirty(&ni->vfs_inode);
 
 out:
-	if (!locked)
-		ni_unlock(ni);
+	ni_unlock(ni);
 
 	run_close(&ea_run);
 	kfree(ea_all);
@@ -478,12 +476,6 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
 }
 
 #ifdef CONFIG_NTFS3_FS_POSIX_ACL
-static inline void ntfs_posix_acl_release(struct posix_acl *acl)
-{
-	if (acl && refcount_dec_and_test(&acl->a_refcount))
-		kfree(acl);
-}
-
 static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns,
 					 struct inode *inode, int type,
 					 int locked)
@@ -521,12 +513,15 @@ static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns,
 	/* Translate extended attribute to acl. */
 	if (err >= 0) {
 		acl = posix_acl_from_xattr(mnt_userns, buf, err);
-		if (!IS_ERR(acl))
-			set_cached_acl(inode, type, acl);
+	} else if (err == -ENODATA) {
+		acl = NULL;
 	} else {
-		acl = err == -ENODATA ?
NULL : ERR_PTR(err); + acl = ERR_PTR(err); } + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + __putname(buf); return acl; @@ -546,12 +541,13 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, - int type, int locked) + int type) { const char *name; size_t size, name_len; void *value = NULL; int err = 0; + int flags; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; @@ -561,22 +557,15 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, if (acl) { umode_t mode = inode->i_mode; - err = posix_acl_equiv_mode(acl, &mode); - if (err < 0) - return err; + err = posix_acl_update_mode(mnt_userns, inode, &mode, + &acl); + if (err) + goto out; if (inode->i_mode != mode) { inode->i_mode = mode; mark_inode_dirty(inode); } - - if (!err) { - /* - * ACL can be exactly represented in the - * traditional file mode permission bits. - */ - acl = NULL; - } } name = XATTR_NAME_POSIX_ACL_ACCESS; name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; @@ -594,20 +583,24 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, } if (!acl) { + /* Remove xattr if it can be presented via mode. */ size = 0; value = NULL; + flags = XATTR_REPLACE; } else { size = posix_acl_xattr_size(acl->a_count); value = kmalloc(size, GFP_NOFS); if (!value) return -ENOMEM; - err = posix_acl_to_xattr(mnt_userns, acl, value, size); if (err < 0) goto out; + flags = 0; } - err = ntfs_set_ea(inode, name, name_len, value, size, 0, locked); + err = ntfs_set_ea(inode, name, name_len, value, size, flags); + if (err == -ENODATA && !size) + err = 0; /* Removing non existed xattr. */ if (!err) set_cached_acl(inode, type, acl); @@ -623,68 +616,7 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, inode, acl, type, 0); -} - -static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, void *buffer, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - acl = ntfs_get_acl(inode, type, false); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (!acl) - return -ENODATA; - - err = posix_acl_to_xattr(mnt_userns, acl, buffer, size); - ntfs_posix_acl_release(acl); - - return err; -} - -static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, const void *value, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EPERM; - - if (!value) { - acl = NULL; - } else { - acl = posix_acl_from_xattr(mnt_userns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - err = posix_acl_valid(mnt_userns, acl); - if (err) - goto release_and_out; - } - } - - err = ntfs_set_acl(mnt_userns, inode, acl, type); - -release_and_out: - ntfs_posix_acl_release(acl); - return err; + return ntfs_set_acl_ex(mnt_userns, inode, acl, type); } /* @@ -698,54 +630,27 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *default_acl, *acl; int err; - /* - * TODO: Refactoring lock. 
- * ni_lock(dir) ... -> posix_acl_create(dir,...) -> ntfs_get_acl -> ni_lock(dir) - */ - inode->i_default_acl = NULL; + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; - default_acl = ntfs_get_acl_ex(mnt_userns, dir, ACL_TYPE_DEFAULT, 1); - - if (!default_acl || default_acl == ERR_PTR(-EOPNOTSUPP)) { - inode->i_mode &= ~current_umask(); - err = 0; - goto out; - } - - if (IS_ERR(default_acl)) { - err = PTR_ERR(default_acl); - goto out; - } - - acl = default_acl; - err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (err < 0) - goto out1; - if (!err) { - posix_acl_release(acl); - acl = NULL; - } - - if (!S_ISDIR(inode->i_mode)) { - posix_acl_release(default_acl); - default_acl = NULL; - } - - if (default_acl) + if (default_acl) { err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, - ACL_TYPE_DEFAULT, 1); + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; + } if (!acl) inode->i_acl = NULL; - else if (!err) - err = ntfs_set_acl_ex(mnt_userns, inode, acl, ACL_TYPE_ACCESS, - 1); + else { + if (!err) + err = ntfs_set_acl_ex(mnt_userns, inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); + } - posix_acl_release(acl); -out1: - posix_acl_release(default_acl); - -out: return err; } #endif @@ -772,7 +677,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { - if (ntfs_sb(inode->i_sb)->options.no_acs_rules) { + if (ntfs_sb(inode->i_sb)->options->noacsrules) { /* "No access rules" mode - Allow all changes. */ return 0; } @@ -880,23 +785,6 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - /* TODO: init_user_ns? */ - err = ntfs_xattr_get_acl( - &init_user_ns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - buffer, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. */ err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL); @@ -1009,24 +897,8 @@ static noinline int ntfs_setxattr(const struct xattr_handler *handler, goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - err = ntfs_xattr_set_acl( - mnt_userns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - value, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. 
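 */

/*
 * The rewritten ntfs_init_acl() above leans on a generic helper:
 * posix_acl_create() (a real VFS API) derives the inherited default and
 * access ACLs from the parent directory and adjusts the new inode's mode,
 * and the caller applies what it gets back and releases both. A reduced
 * sketch; examplefs_set_acl() is hypothetical:
 */
#include <linux/posix_acl.h>

static int examplefs_init_acl(struct inode *inode, struct inode *dir)
{
	struct posix_acl *default_acl, *acl;
	int err;

	err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
	if (err)
		return err;

	if (default_acl) {
		err = examplefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
		posix_acl_release(default_acl);
	}
	if (acl) {
		if (!err)
			err = examplefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
		posix_acl_release(acl);
	}
	return err;
}

	/*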
*/ - err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); + err = ntfs_set_ea(inode, name, name_len, value, size, flags); out: return err; @@ -1042,28 +914,29 @@ int ntfs_save_wsl_perm(struct inode *inode) int err; __le32 value; + /* TODO: refactor this, so we don't lock 4 times in ntfs_set_ea */ value = cpu_to_le32(i_uid_read(inode)); err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; value = cpu_to_le32(i_gid_read(inode)); err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; value = cpu_to_le32(inode->i_mode); err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { value = cpu_to_le32(inode->i_rdev); err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 1fefb2b896..93c7c267de 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1219,9 +1219,13 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, goto out_dput; } } else { - if (!d_is_negative(newdentry) && - (!new_opaque || !ovl_is_whiteout(newdentry))) - goto out_dput; + if (!d_is_negative(newdentry)) { + if (!new_opaque || !ovl_is_whiteout(newdentry)) + goto out_dput; + } else { + if (flags & RENAME_EXCHANGE) + goto out_dput; + } } if (olddentry == trap) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index d081faa55e..c88ac57159 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -296,6 +296,12 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) return ret; + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + (!real.file->f_mapping->a_ops || + !real.file->f_mapping->a_ops->direct_IO)) + goto out_fdput; + old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, @@ -320,7 +326,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) out: revert_creds(old_cred); ovl_file_accessed(file); - +out_fdput: fdput(real); return ret; @@ -349,6 +355,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) goto out_unlock; + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + (!real.file->f_mapping->a_ops || + !real.file->f_mapping->a_ops->direct_IO)) + goto out_fdput; + if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); @@ -384,6 +396,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) } out: revert_creds(old_cred); +out_fdput: fdput(real); out_unlock: diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 4f5e59f062..37dd3fe5b1 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -21,10 +21,7 @@ #define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */ -#define VBSF_MOUNT_SIGNATURE_BYTE_0 ('\000') -#define VBSF_MOUNT_SIGNATURE_BYTE_1 ('\377') -#define VBSF_MOUNT_SIGNATURE_BYTE_2 ('\376') -#define VBSF_MOUNT_SIGNATURE_BYTE_3 ('\375') +static const unsigned char VBSF_MOUNT_SIGNATURE[4] = "\000\377\376\375"; static int follow_symlinks; module_param(follow_symlinks, int, 0444); @@ -386,12 +383,7 @@ static int vboxsf_setup(void) static int vboxsf_parse_monolithic(struct fs_context *fc, void *data) { - unsigned char *options = data; - - if (options 
&& options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 && - options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 && - options[2] == VBSF_MOUNT_SIGNATURE_BYTE_2 && - options[3] == VBSF_MOUNT_SIGNATURE_BYTE_3) { + if (data && !memcmp(data, VBSF_MOUNT_SIGNATURE, 4)) { vbg_err("vboxsf: Old binary mount data not supported, remove obsolete mount.vboxsf and/or update your VBoxService.\n"); return -EINVAL; } diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 77e159a034..60a4372aa4 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -177,7 +177,7 @@ static int build_merkle_tree(struct file *filp, * (level 0) and ascending to the root node (level 'num_levels - 1'). * Then at the end (level 'num_levels'), calculate the root hash. */ - blocks = (inode->i_size + params->block_size - 1) >> + blocks = ((u64)inode->i_size + params->block_size - 1) >> params->log_blocksize; for (level = 0; level <= params->num_levels; level++) { err = build_merkle_tree_level(filp, level, blocks, params, diff --git a/fs/verity/open.c b/fs/verity/open.c index 60ff8af721..92df87f5fa 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -89,7 +89,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, */ /* Compute number of levels and the number of blocks in each level */ - blocks = (inode->i_size + params->block_size - 1) >> log_blocksize; + blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize; pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks); while (blocks > 1) { if (params->num_levels >= FS_VERITY_MAX_LEVELS) { diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index cc7338f9e0..7ce93aaf69 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -957,7 +957,7 @@ static inline void __iomem *ioremap(phys_addr_t offset, size_t size) #ifndef iounmap #define iounmap iounmap -static inline void iounmap(void __iomem *addr) +static inline void iounmap(volatile void __iomem *addr) { } #endif diff --git a/include/kunit/test.h b/include/kunit/test.h index 24b40e5c16..018e776a34 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -613,7 +613,7 @@ void kunit_remove_resource(struct kunit *test, struct kunit_resource *res); * and is automatically cleaned up after the test case concludes. See &struct * kunit_resource for more information. */ -void *kunit_kmalloc_array(struct kunit *test, size_t n, size_t size, gfp_t flags); +void *kunit_kmalloc_array(struct kunit *test, size_t n, size_t size, gfp_t gfp); /** * kunit_kmalloc() - Like kmalloc() except the allocation is *test managed*. @@ -657,9 +657,9 @@ static inline void *kunit_kzalloc(struct kunit *test, size_t size, gfp_t gfp) * * See kcalloc() and kunit_kmalloc_array() for more information. 
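 */

/*
 * A hedged usage sketch for the test-managed allocators documented above:
 * memory obtained through them is freed automatically when the test case
 * ends, so the test body needs no kfree(). The assertion macros and
 * kunit_kcalloc() are the real <kunit/test.h> API.
 */
#include <kunit/test.h>

static void example_alloc_test(struct kunit *test)
{
	int *vals = kunit_kcalloc(test, 4, sizeof(*vals), GFP_KERNEL);

	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vals);
	KUNIT_EXPECT_EQ(test, vals[0], 0);	/* kcalloc semantics: zeroed */
}

/*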
*/ -static inline void *kunit_kcalloc(struct kunit *test, size_t n, size_t size, gfp_t flags) +static inline void *kunit_kcalloc(struct kunit *test, size_t n, size_t size, gfp_t gfp) { - return kunit_kmalloc_array(test, n, size, flags | __GFP_ZERO); + return kunit_kmalloc_array(test, n, size, gfp | __GFP_ZERO); } void kunit_cleanup(struct kunit *test); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 864b9997ef..90f21898aa 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -61,7 +61,6 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu); -int kvm_pmu_probe_pmuver(void); #else struct kvm_pmu { }; @@ -118,8 +117,6 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) return 0; } -static inline int kvm_pmu_probe_pmuver(void) { return 0xf; } - #endif #endif diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 7d1cabe152..63ccb52521 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -321,10 +321,20 @@ asmlinkage unsigned long __arm_smccc_sve_check(unsigned long x0); * from register 0 to 3 on return from the SMC instruction. An optional * quirk structure provides vendor specific behavior. */ +#ifdef CONFIG_HAVE_ARM_SMCCC asmlinkage void __arm_smccc_smc(unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4, unsigned long a5, unsigned long a6, unsigned long a7, struct arm_smccc_res *res, struct arm_smccc_quirk *quirk); +#else +static inline void __arm_smccc_smc(unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3, unsigned long a4, + unsigned long a5, unsigned long a6, unsigned long a7, + struct arm_smccc_res *res, struct arm_smccc_quirk *quirk) +{ + *res = (struct arm_smccc_res){}; +} +#endif /** * __arm_smccc_hvc() - make HVC calls diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f4c16f19f8..020a7d5bf4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -578,11 +578,12 @@ struct btf_func_model { * programs only. Should not be used with normal calls and indirect calls. */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) - /* Store IP address of the caller on the trampoline stack, * so it's available for trampoline's programs. */ #define BPF_TRAMP_F_IP_ARG BIT(3) +/* Return the return value of fentry prog. Only used by bpf_struct_ops. */ +#define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 5d4d07a9e1..1e7399fc69 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -996,14 +996,15 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) * cpumask; Typically used by bin_attribute to export cpumask bitmask * ABI. * - * Returns the length of how many bytes have been copied. + * Returns the length of how many bytes have been copied, excluding + * terminating '\0'. 
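 */

/*
 * Worked example of the "- 1" adjustment in the two wrappers just below
 * (buffer contents assumed): for a mask covering CPUs 0-3, the underlying
 * bitmap helper writes "0-3\n" plus a terminating '\0' and reports 5
 * bytes, while a sysfs bin_attribute read must report only the 4 payload
 * bytes. A sketch of such a read callback:
 */
#include <linux/cpumask.h>
#include <linux/sysfs.h>

static ssize_t example_cpulist_read(struct file *file, struct kobject *kobj,
				    struct bin_attribute *attr, char *buf,
				    loff_t off, size_t count)
{
	/* Returns the payload length only, thanks to the "- 1" below. */
	return cpumap_print_list_to_buf(buf, cpu_online_mask, off, count);
}

/*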
*/ static inline ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask), - nr_cpu_ids, off, count); + nr_cpu_ids, off, count) - 1; } /** @@ -1018,7 +1019,7 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_list_to_buf(buf, cpumask_bits(mask), - nr_cpu_ids, off, count); + nr_cpu_ids, off, count) - 1; } #if NR_CPUS <= BITS_PER_LONG diff --git a/include/linux/dsa/mv88e6xxx.h b/include/linux/dsa/mv88e6xxx.h new file mode 100644 index 0000000000..8c3d45eca4 --- /dev/null +++ b/include/linux/dsa/mv88e6xxx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright 2021 NXP + */ + +#ifndef _NET_DSA_TAG_MV88E6XXX_H +#define _NET_DSA_TAG_MV88E6XXX_H + +#include + +#define MV88E6XXX_VID_STANDALONE 0 +#define MV88E6XXX_VID_BRIDGED (VLAN_N_VID - 1) + +#endif diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index 435777a007..8ae999f587 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -5,7 +5,28 @@ #ifndef _NET_DSA_TAG_OCELOT_H #define _NET_DSA_TAG_OCELOT_H +#include #include +#include + +struct ocelot_skb_cb { + struct sk_buff *clone; + unsigned int ptp_class; /* valid only for clones */ + u8 ptp_cmd; + u8 ts_id; +}; + +#define OCELOT_SKB_CB(skb) \ + ((struct ocelot_skb_cb *)((skb)->cb)) + +#define IFH_TAG_TYPE_C 0 +#define IFH_TAG_TYPE_S 1 + +#define IFH_REW_OP_NOOP 0x0 +#define IFH_REW_OP_DSCP 0x1 +#define IFH_REW_OP_ONE_STEP_PTP 0x2 +#define IFH_REW_OP_TWO_STEP_PTP 0x3 +#define IFH_REW_OP_ORIGIN_PTP 0x5 #define OCELOT_TAG_LEN 16 #define OCELOT_SHORT_PREFIX_LEN 4 @@ -140,6 +161,17 @@ * +------+------+------+------+------+------+------+------+ */ +struct felix_deferred_xmit_work { + struct dsa_port *dp; + struct sk_buff *skb; + struct kthread_work work; +}; + +struct felix_port { + void (*xmit_work_fn)(struct kthread_work *work); + struct kthread_worker *xmit_worker; +}; + static inline void ocelot_xfh_get_rew_val(void *extraction, u64 *rew_val) { packing(extraction, rew_val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0); @@ -215,4 +247,21 @@ static inline void ocelot_ifh_set_vid(void *injection, u64 vid) packing(injection, &vid, 11, 0, OCELOT_TAG_LEN, PACK, 0); } +/* Determine the PTP REW_OP to use for injecting the given skb */ +static inline u32 ocelot_ptp_rew_op(struct sk_buff *skb) +{ + struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone; + u8 ptp_cmd = OCELOT_SKB_CB(skb)->ptp_cmd; + u32 rew_op = 0; + + if (ptp_cmd == IFH_REW_OP_TWO_STEP_PTP && clone) { + rew_op = ptp_cmd; + rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; + } else if (ptp_cmd == IFH_REW_OP_ORIGIN_PTP) { + rew_op = ptp_cmd; + } + + return rew_op; +} + #endif diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 171106202f..9e07079528 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -48,6 +48,10 @@ struct sja1105_tagger_data { spinlock_t meta_lock; unsigned long state; u8 ts_id; + /* Used on SJA1110 where meta frames are generated only for + * 2-step TX timestamps + */ + struct sk_buff_head skb_txtstamp_queue; }; struct sja1105_skb_cb { @@ -69,42 +73,24 @@ struct sja1105_port { bool hwts_tx_en; }; -enum sja1110_meta_tstamp { - SJA1110_META_TSTAMP_TX = 0, - SJA1110_META_TSTAMP_RX = 1, -}; +/* Timestamps are in units of 8 ns clock ticks (equivalent to + * a fixed 125 MHz clock). 
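+ *
+ * Worked example for the helpers below: 1 ms = 1,000,000 ns is
+ * 125,000 ticks, and sja1105_ticks_to_ns(125000000) is exactly one
+ * second, since every tick is 8 ns.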
+ */ +#define SJA1105_TICK_NS 8 -#if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) - -void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id, - enum sja1110_meta_tstamp dir, u64 tstamp); - -#else - -static inline void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, - u8 ts_id, enum sja1110_meta_tstamp dir, - u64 tstamp) +static inline s64 ns_to_sja1105_ticks(s64 ns) { + return ns / SJA1105_TICK_NS; } -#endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */ - -#if IS_ENABLED(CONFIG_NET_DSA_SJA1105) - -extern const struct dsa_switch_ops sja1105_switch_ops; +static inline s64 sja1105_ticks_to_ns(s64 ticks) +{ + return ticks * SJA1105_TICK_NS; +} static inline bool dsa_port_is_sja1105(struct dsa_port *dp) { - return dp->ds->ops == &sja1105_switch_ops; + return true; } -#else - -static inline bool dsa_port_is_sja1105(struct dsa_port *dp) -{ - return false; -} - -#endif - #endif /* _NET_DSA_SJA1105_H */ diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 928c411bd5..c58d504514 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -308,7 +308,7 @@ static inline void ether_addr_copy(u8 *dst, const u8 *src) */ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { - ether_addr_copy(dev->dev_addr, addr); + __dev_addr_set(dev, addr, ETH_ALEN); } /** diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 59828516eb..9f4ad719bf 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -22,10 +22,15 @@ struct device; * LINKS_ADDED: The fwnode has already be parsed to add fwnode links. * NOT_DEVICE: The fwnode will never be populated as a struct device. * INITIALIZED: The hardware corresponding to fwnode has been initialized. + * NEEDS_CHILD_BOUND_ON_ADD: For this fwnode/device to probe successfully, its + * driver needs its child devices to be bound with + * their respective drivers as soon as they are + * added. 
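 */

/*
 * Driver-side shape supported by the etherdevice.h change above: the MAC
 * address is set through helpers instead of writing dev->dev_addr
 * directly. The example function and the MAC source are illustrative;
 * is_valid_ether_addr(), eth_hw_addr_set() and eth_hw_addr_random() are
 * real helpers.
 */
#include <linux/etherdevice.h>

static void example_set_mac(struct net_device *ndev, const u8 *addr_from_hw)
{
	if (is_valid_ether_addr(addr_from_hw))
		eth_hw_addr_set(ndev, addr_from_hw);	/* copies ETH_ALEN bytes */
	else
		eth_hw_addr_random(ndev);	/* fall back to a random MAC */
}

/*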
*/ -#define FWNODE_FLAG_LINKS_ADDED BIT(0) -#define FWNODE_FLAG_NOT_DEVICE BIT(1) -#define FWNODE_FLAG_INITIALIZED BIT(2) +#define FWNODE_FLAG_LINKS_ADDED BIT(0) +#define FWNODE_FLAG_NOT_DEVICE BIT(1) +#define FWNODE_FLAG_INITIALIZED BIT(2) +#define FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD BIT(3) struct fwnode_handle { struct fwnode_handle *secondary; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c68d83c87f..0f5315c2b5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -149,6 +149,7 @@ struct gendisk { unsigned long state; #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 +#define GD_DEAD 2 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 041ca7f15e..0f18df7fe8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -608,7 +608,6 @@ struct kvm { unsigned long mmu_notifier_range_start; unsigned long mmu_notifier_range_end; #endif - long tlbs_dirty; struct list_head devices; u64 manual_dirty_log_protect; struct dentry *debugfs_dentry; @@ -721,11 +720,6 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) return NULL; } -static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) -{ - return vcpu->vcpu_idx; -} - #define kvm_for_each_memslot(memslot, slots) \ for (memslot = &slots->memslots[0]; \ memslot < slots->memslots + slots->used_slots; memslot++) \ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f3638d09ba..993204a6c1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9475,16 +9475,22 @@ struct mlx5_ifc_pcmr_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; u8 reserved_at_10[0x10]; + u8 entropy_force_cap[0x1]; u8 entropy_calc_cap[0x1]; u8 entropy_gre_calc_cap[0x1]; - u8 reserved_at_23[0x1b]; + u8 reserved_at_23[0xf]; + u8 rx_ts_over_crc_cap[0x1]; + u8 reserved_at_33[0xb]; u8 fcs_cap[0x1]; u8 reserved_at_3f[0x1]; + u8 entropy_force[0x1]; u8 entropy_calc[0x1]; u8 entropy_gre_calc[0x1]; - u8 reserved_at_43[0x1b]; + u8 reserved_at_43[0xf]; + u8 rx_ts_over_crc[0x1]; + u8 reserved_at_53[0xb]; u8 fcs_chk[0x1]; u8 reserved_at_5f[0x1]; }; diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 505480217c..2512e2f9cd 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -163,6 +163,12 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn); static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; } #endif +#ifdef CONFIG_KVM +void kvm_host_pmu_init(struct arm_pmu *pmu); +#else +#define kvm_host_pmu_init(x) do { } while(0) +#endif + /* Internal functions only for core arm_pmu code */ struct arm_pmu *armpmu_alloc(void); struct arm_pmu *armpmu_alloc_atomic(void); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fe156a8170..9b60bb89d8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -683,7 +683,9 @@ struct perf_event { /* * timestamp shadows the actual context timing but it can * be safely used in NMI interrupt context. It reflects the - * context time as it was when the event was last scheduled in. + * context time as it was when the event was last scheduled in, + * or when ctx_sched_in failed to schedule the event because we + * run out of PMC. * * ctx_time already accounts for ctx->timestamp. Therefore to * compute ctx_time for a sample, simply add perf_clock(). 
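
/*
 * The empty kvm_host_pmu_init() fallback above uses the standard
 * do { } while (0) idiom: a stubbed-out call still parses as a single
 * statement and still requires its semicolon, so it stays safe inside
 * if/else bodies. Minimal shape of the pattern (CONFIG_FOO and foo_notify
 * are illustrative):
 */
#ifdef CONFIG_FOO
void foo_notify(struct foo_dev *dev);
#else
#define foo_notify(dev) do { } while (0)
#endif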
diff --git a/include/linux/platform_data/usb-omap1.h b/include/linux/platform_data/usb-omap1.h index 43b5ce139c..878e572a78 100644 --- a/include/linux/platform_data/usb-omap1.h +++ b/include/linux/platform_data/usb-omap1.h @@ -48,6 +48,8 @@ struct omap_usb_config { u32 (*usb2_init)(unsigned nwires, unsigned alt_pingroup); int (*ocpi_enable)(void); + + void (*lb_reset)(void); }; #endif /* __LINUX_USB_OMAP1_H */ diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index c0475d1c98..81cad9e1e4 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -61,7 +61,6 @@ enum qcom_scm_ice_cipher { #define QCOM_SCM_PERM_RW (QCOM_SCM_PERM_READ | QCOM_SCM_PERM_WRITE) #define QCOM_SCM_PERM_RWX (QCOM_SCM_PERM_RW | QCOM_SCM_PERM_EXEC) -#if IS_ENABLED(CONFIG_QCOM_SCM) extern bool qcom_scm_is_available(void); extern int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus); @@ -115,74 +114,4 @@ extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, extern int qcom_scm_lmh_profile_change(u32 profile_id); extern bool qcom_scm_lmh_dcvsh_available(void); -#else - -#include - -static inline bool qcom_scm_is_available(void) { return false; } - -static inline int qcom_scm_set_cold_boot_addr(void *entry, - const cpumask_t *cpus) { return -ENODEV; } -static inline int qcom_scm_set_warm_boot_addr(void *entry, - const cpumask_t *cpus) { return -ENODEV; } -static inline void qcom_scm_cpu_power_down(u32 flags) {} -static inline u32 qcom_scm_set_remote_state(u32 state,u32 id) - { return -ENODEV; } - -static inline int qcom_scm_pas_init_image(u32 peripheral, const void *metadata, - size_t size) { return -ENODEV; } -static inline int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr, - phys_addr_t size) { return -ENODEV; } -static inline int qcom_scm_pas_auth_and_reset(u32 peripheral) - { return -ENODEV; } -static inline int qcom_scm_pas_shutdown(u32 peripheral) { return -ENODEV; } -static inline bool qcom_scm_pas_supported(u32 peripheral) { return false; } - -static inline int qcom_scm_io_readl(phys_addr_t addr, unsigned int *val) - { return -ENODEV; } -static inline int qcom_scm_io_writel(phys_addr_t addr, unsigned int val) - { return -ENODEV; } - -static inline bool qcom_scm_restore_sec_cfg_available(void) { return false; } -static inline int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare) - { return -ENODEV; } -static inline int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size) - { return -ENODEV; } -static inline int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare) - { return -ENODEV; } -extern inline int qcom_scm_mem_protect_video_var(u32 cp_start, u32 cp_size, - u32 cp_nonpixel_start, - u32 cp_nonpixel_size) - { return -ENODEV; } -static inline int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, - unsigned int *src, const struct qcom_scm_vmperm *newvm, - unsigned int dest_cnt) { return -ENODEV; } - -static inline bool qcom_scm_ocmem_lock_available(void) { return false; } -static inline int qcom_scm_ocmem_lock(enum qcom_scm_ocmem_client id, u32 offset, - u32 size, u32 mode) { return -ENODEV; } -static inline int qcom_scm_ocmem_unlock(enum qcom_scm_ocmem_client id, - u32 offset, u32 size) { return -ENODEV; } - -static inline bool qcom_scm_ice_available(void) { return false; } -static inline int qcom_scm_ice_invalidate_key(u32 index) { return -ENODEV; } -static inline int qcom_scm_ice_set_key(u32 index, const u8 *key, u32 key_size, - enum qcom_scm_ice_cipher cipher, - u32 data_unit_size) { return -ENODEV; } - -static 
inline bool qcom_scm_hdcp_available(void) { return false; } -static inline int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, - u32 *resp) { return -ENODEV; } - -static inline int qcom_scm_qsmmu500_wait_safe_toggle(bool en) - { return -ENODEV; } - -static inline int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, - u64 limit_node, u32 node_id, u64 version) - { return -ENODEV; } - -static inline int qcom_scm_lmh_profile_change(u32 profile_id) { return -ENODEV; } - -static inline bool qcom_scm_lmh_dcvsh_available(void) { return -ENODEV; } -#endif #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 39039ce8ac..c1a927ddec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1720,7 +1720,7 @@ extern struct pid *cad_pid; #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) -static inline bool is_percpu_thread(void) +static __always_inline bool is_percpu_thread(void) { #ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 8371bca137..6b0b686f6f 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -531,6 +531,9 @@ struct spi_controller { /* I/O mutex */ struct mutex io_mutex; + /* Used to avoid adding the same CS twice */ + struct mutex add_lock; + /* lock and mutex for SPI bus locking */ spinlock_t bus_lock_spinlock; struct mutex bus_lock_mutex; diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 2ebef6b1a3..74d3c1efd9 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -399,9 +399,8 @@ extern struct workqueue_struct *system_freezable_power_efficient_wq; * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. 
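 */

/*
 * What the __printf(1, 4) annotation added in the hunk below buys: the
 * compiler checks the format string (argument 1) against the variadic
 * arguments (starting at argument 4) at every call site. Minimal shape of
 * the same attribute on an illustrative logger:
 */
#include <linux/compiler_attributes.h>

__printf(2, 3)
void example_log(int level, const char *fmt, ...);

/* example_log(1, "%s", 42) now triggers a -Wformat warning at build time. */

/*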
*/ -struct workqueue_struct *alloc_workqueue(const char *fmt, - unsigned int flags, - int max_active, ...); +__printf(1, 4) struct workqueue_struct * +alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); /** * alloc_ordered_workqueue - allocate an ordered workqueue diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 21c5386d4a..ab5348e57d 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -597,5 +597,5 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, u8 rt_family, unsigned char *flags, bool skip_oif); int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, - int nh_weight, u8 rt_family); + int nh_weight, u8 rt_family, u32 nh_tclassid); #endif /* _NET_FIB_H */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index af0fc13cea..618d1f427c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2818,13 +2818,13 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * Mac80211 drivers should set the @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 flag * when they are able to replace in-use PTK keys according to the following * requirements: - * 1) They do not hand over frames decrypted with the old key to - mac80211 once the call to set_key() with command %DISABLE_KEY has been - completed when also setting @IEEE80211_KEY_FLAG_GENERATE_IV for any key, + * 1) They do not hand over frames decrypted with the old key to mac80211 + once the call to set_key() with command %DISABLE_KEY has been completed, 2) either drop or continue to use the old key for any outgoing frames queued at the time of the key deletion (including re-transmits), 3) never send out a frame queued prior to the set_key() %SET_KEY command - encrypted with the new key and + encrypted with the new key when also needing + @IEEE80211_KEY_FLAG_GENERATE_IV and 4) never send out a frame unencrypted when it should be encrypted. Mac80211 will not queue any new frames for a deleted key to the driver. 
 */
diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
index 0fd8a41596..ceadf8ba25 100644
--- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
@@ -17,7 +17,6 @@ struct inet_frags_ctl;
 struct nft_ct_frag6_pernet {
         struct ctl_table_header *nf_frag_frags_hdr;
         struct fqdir    *fqdir;
-        unsigned int users;
 };
 
 #endif /* _NF_DEFRAG_IPV6_H */
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 148f5d8ee5..a16171c5fd 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1202,7 +1202,7 @@ struct nft_object *nft_obj_lookup(const struct net *net,
 void nft_obj_notify(struct net *net, const struct nft_table *table,
                     struct nft_object *obj, u32 portid, u32 seq,
-                    int event, int family, int report, gfp_t gfp);
+                    int event, u16 flags, int family, int report, gfp_t gfp);
 
 /**
  * struct nft_object_type - stateful object type
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 986a2a9cfd..b593f95e99 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -27,5 +27,11 @@ struct netns_nf {
 #if IS_ENABLED(CONFIG_DECNET)
         struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS];
 #endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+        unsigned int defrag_ipv4_users;
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+        unsigned int defrag_ipv6_users;
+#endif
 };
 #endif
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 10e1777877..28085b995d 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -325,7 +325,7 @@ int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh,
                 struct fib_nh_common *nhc = &nhi->fib_nhc;
                 int weight = nhg->nh_entries[i].weight;
 
-                if (fib_add_nexthop(skb, nhc, weight, rt_family) < 0)
+                if (fib_add_nexthop(skb, nhc, weight, rt_family, 0) < 0)
                         return -EMSGSIZE;
         }
 
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 6d7b12cba0..bf79f3a890 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -11,6 +11,7 @@
 #include
 
 #define DEFAULT_TX_QUEUE_LEN    1000
+#define STAB_SIZE_LOG_MAX       30
 
 struct qdisc_walker {
         int     stop;
diff --git a/include/net/sock.h b/include/net/sock.h
index c005c3c750..ea6fbc88c8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -307,6 +307,7 @@ struct bpf_local_storage;
  *      @sk_priority: %SO_PRIORITY setting
  *      @sk_type: socket type (%SOCK_STREAM, etc)
  *      @sk_protocol: which protocol this socket belongs in this network family
+ *      @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred
  *      @sk_peer_pid: &struct pid for this socket's peer
  *      @sk_peer_cred: %SO_PEERCRED setting
  *      @sk_rcvlowat: %SO_RCVLOWAT setting
@@ -488,8 +489,10 @@ struct sock {
         u8                      sk_prefer_busy_poll;
         u16                     sk_busy_poll_budget;
 #endif
+        spinlock_t              sk_peer_lock;
         struct pid              *sk_peer_pid;
         const struct cred       *sk_peer_cred;
+
         long                    sk_rcvtimeo;
         ktime_t                 sk_stamp;
 #if BITS_PER_LONG==32
@@ -1623,7 +1626,36 @@ void release_sock(struct sock *sk);
                                 SINGLE_DEPTH_NESTING)
 #define bh_unlock_sock(__sk)    spin_unlock(&((__sk)->sk_lock.slock))
 
-bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock);
+bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock);
+
+/**
+ * lock_sock_fast - fast version of lock_sock
+ * @sk: socket
+ *
+ * This version should be used for very small sections, where the process
+ * won't block. Return false if the fast path is taken:
+ *
+ *   sk_lock.slock locked, owned = 0, BH disabled
+ *
+ * Return true if the slow path is taken:
+ *
+ *   sk_lock.slock unlocked, owned = 1, BH enabled
+ */
+static inline bool lock_sock_fast(struct sock *sk)
+{
+        /* The sk_lock has mutex_lock() semantics here. */
+        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+
+        return __lock_sock_fast(sk);
+}
+
+/* fast socket lock variant for caller already holding a [different] socket lock */
+static inline bool lock_sock_fast_nested(struct sock *sk)
+{
+        mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_);
+
+        return __lock_sock_fast(sk);
+}
 
 /**
  * unlock_sock_fast - complement of lock_sock_fast
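The sock.h hunk above splits lock_sock_fast() into an out-of-line __lock_sock_fast() plus inline wrappers, so the lockdep annotation runs at the call site (_RET_IP_ then names the real caller) and a SINGLE_DEPTH_NESTING variant can share the same slow path. A minimal user-space sketch of that wrapper split, using __builtin_return_address() in place of _RET_IP_ (all names here are illustrative, not kernel API):

    #include <stdio.h>

    static void __slow_lock(void *caller_ip)
    {
            /* Shared out-of-line slow path: one copy of the real work. */
            printf("slow path entered from %p\n", caller_ip);
    }

    static inline void fast_lock(void)
    {
            /* Inlined into the caller, so the return address recorded
             * here is the caller's call site, not this helper. */
            __slow_lock(__builtin_return_address(0));
    }

The same trick is why the kernel wrapper performs mutex_acquire() before calling __lock_sock_fast(): inlining makes the diagnostic point at the user of the lock.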
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 06706a9fd5..d7055b4198 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -89,15 +89,6 @@
 /* Source PGIDs, one per physical port */
 #define PGID_SRC 80
 
-#define IFH_TAG_TYPE_C 0
-#define IFH_TAG_TYPE_S 1
-
-#define IFH_REW_OP_NOOP 0x0
-#define IFH_REW_OP_DSCP 0x1
-#define IFH_REW_OP_ONE_STEP_PTP 0x2
-#define IFH_REW_OP_TWO_STEP_PTP 0x3
-#define IFH_REW_OP_ORIGIN_PTP 0x5
-
 #define OCELOT_NUM_TC 8
 
 #define OCELOT_SPEED_2500 0
@@ -603,10 +594,10 @@ struct ocelot_port {
         /* The VLAN ID that will be transmitted as untagged, on egress */
         struct ocelot_vlan              native_vlan;
 
+        unsigned int                    ptp_skbs_in_flight;
         u8                              ptp_cmd;
         struct sk_buff_head             tx_skbs;
         u8                              ts_id;
-        spinlock_t                      ts_id_lock;
 
         phy_interface_t                 phy_mode;
 
@@ -680,6 +671,9 @@ struct ocelot {
         struct ptp_clock                *ptp_clock;
         struct ptp_clock_info           ptp_info;
         struct hwtstamp_config          hwtstamp_config;
+        unsigned int                    ptp_skbs_in_flight;
+        /* Protects the 2-step TX timestamp ID logic */
+        spinlock_t                      ts_id_lock;
         /* Protects the PTP interface state */
         struct mutex                    ptp_lock;
         /* Protects the PTP clock */
@@ -692,15 +686,6 @@ struct ocelot_policer {
         u32 burst; /* bytes */
 };
 
-struct ocelot_skb_cb {
-        struct sk_buff *clone;
-        u8 ptp_cmd;
-        u8 ts_id;
-};
-
-#define OCELOT_SKB_CB(skb) \
-        ((struct ocelot_skb_cb *)((skb)->cb))
-
 #define ocelot_read_ix(ocelot, reg, gi, ri) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
 #define ocelot_read_gix(ocelot, reg, gi) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi))
 #define ocelot_read_rix(ocelot, reg, ri) __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri))
@@ -752,8 +737,6 @@ u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target,
 void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target,
                               u32 val, u32 reg, u32 offset);
 
-#if IS_ENABLED(CONFIG_MSCC_OCELOT_SWITCH_LIB)
-
 /* Packet I/O */
 bool ocelot_can_inject(struct ocelot *ocelot, int grp);
 void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
@@ -761,36 +744,6 @@ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
 int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb);
 void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp);
-u32 ocelot_ptp_rew_op(struct sk_buff *skb);
-#else
-
-static inline bool ocelot_can_inject(struct ocelot *ocelot, int grp)
-{
-        return false;
-}
-
-static inline void ocelot_port_inject_frame(struct ocelot *ocelot, int port,
-                                            int grp, u32 rew_op,
-                                            struct sk_buff *skb)
-{
-}
-
-static inline int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp,
-                                        struct sk_buff **skb)
-{
-        return -EIO;
-}
-
-static inline void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp)
-{
-}
-
-static inline u32 ocelot_ptp_rew_op(struct sk_buff *skb)
-{
-        return 0;
-}
-#endif
-
 /* Hardware initialization */
 int ocelot_regfields_init(struct ocelot *ocelot,
                           const struct reg_field *const regfields);
diff --git a/include/soc/mscc/ocelot_ptp.h b/include/soc/mscc/ocelot_ptp.h
index ded497d72b..f085884b1f 100644
--- a/include/soc/mscc/ocelot_ptp.h
+++ b/include/soc/mscc/ocelot_ptp.h
@@ -13,6 +13,9 @@
 #include
 #include
 
+#define OCELOT_MAX_PTP_ID       63
+#define OCELOT_PTP_FIFO_SIZE    128
+
 #define PTP_PIN_CFG_RSZ                 0x20
 #define PTP_PIN_TOD_SEC_MSB_RSZ         PTP_PIN_CFG_RSZ
 #define PTP_PIN_TOD_SEC_LSB_RSZ         PTP_PIN_CFG_RSZ
diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h
index 25fd525aaf..4869ebbd43 100644
--- a/include/soc/mscc/ocelot_vcap.h
+++ b/include/soc/mscc/ocelot_vcap.h
@@ -694,7 +694,7 @@ int ocelot_vcap_filter_add(struct ocelot *ocelot,
 int ocelot_vcap_filter_del(struct ocelot *ocelot,
                            struct ocelot_vcap_filter *rule);
 struct ocelot_vcap_filter *
-ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, int id,
-                                    bool tc_offload);
+ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block,
+                                    unsigned long cookie, bool tc_offload);
 
 #endif /* _OCELOT_VCAP_H_ */
diff --git a/include/sound/hda_codec.h b/include/sound/hda_codec.h
index 01570dbda5..0e45963bb7 100644
--- a/include/sound/hda_codec.h
+++ b/include/sound/hda_codec.h
@@ -224,6 +224,7 @@ struct hda_codec {
 #endif
 
         /* misc flags */
+        unsigned int configured:1;      /* codec was configured */
         unsigned int in_freeing:1;      /* being released */
         unsigned int registered:1;      /* codec was registered */
         unsigned int display_power_control:1;   /* needs display power */
diff --git a/include/sound/rawmidi.h b/include/sound/rawmidi.h
index 989e151733..7a08ed2acd 100644
--- a/include/sound/rawmidi.h
+++ b/include/sound/rawmidi.h
@@ -98,6 +98,7 @@ struct snd_rawmidi_file {
         struct snd_rawmidi *rmidi;
         struct snd_rawmidi_substream *input;
         struct snd_rawmidi_substream *output;
+        unsigned int user_pversion;     /* supported protocol version */
 };
 
 struct snd_rawmidi_str {
diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h
index 9a448fe935..920b6a303d 100644
--- a/include/trace/events/cachefiles.h
+++ b/include/trace/events/cachefiles.h
@@ -178,7 +178,7 @@ TRACE_EVENT(cachefiles_unlink,
                     ),
 
             TP_fast_assign(
-                    __entry->obj        = obj->fscache.debug_id;
+                    __entry->obj        = obj ? obj->fscache.debug_id : UINT_MAX;
                     __entry->de         = de;
                     __entry->why        = why;
                     ),
@@ -205,7 +205,7 @@ TRACE_EVENT(cachefiles_rename,
                     ),
 
             TP_fast_assign(
-                    __entry->obj        = obj->fscache.debug_id;
+                    __entry->obj        = obj ? obj->fscache.debug_id : UINT_MAX;
                     __entry->de         = de;
                     __entry->to         = to;
                     __entry->why        = why;
@@ -305,7 +305,7 @@ TRACE_EVENT(cachefiles_mark_buried,
                     ),
 
             TP_fast_assign(
-                    __entry->obj        = obj->fscache.debug_id;
+                    __entry->obj        = obj ? obj->fscache.debug_id : UINT_MAX;
                     __entry->de         = de;
                     __entry->why        = why;
                     ),
diff --git a/include/trace/events/kyber.h b/include/trace/events/kyber.h
index 491098a0d8..bf7533f171 100644
--- a/include/trace/events/kyber.h
+++ b/include/trace/events/kyber.h
@@ -13,11 +13,11 @@
 
 TRACE_EVENT(kyber_latency,
 
-        TP_PROTO(struct request_queue *q, const char *domain, const char *type,
+        TP_PROTO(dev_t dev, const char *domain, const char *type,
                  unsigned int percentile, unsigned int numerator,
                  unsigned int denominator, unsigned int samples),
 
-        TP_ARGS(q, domain, type, percentile, numerator, denominator, samples),
+        TP_ARGS(dev, domain, type, percentile, numerator, denominator, samples),
 
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                             )
@@ -30,7 +30,7 @@ TRACE_EVENT(kyber_latency,
         ),
 
         TP_fast_assign(
-                __entry->dev            = disk_devt(q->disk);
+                __entry->dev            = dev;
                 strlcpy(__entry->domain, domain, sizeof(__entry->domain));
                 strlcpy(__entry->type, type, sizeof(__entry->type));
                 __entry->percentile     = percentile;
@@ -47,10 +47,9 @@ TRACE_EVENT(kyber_latency,
 
 TRACE_EVENT(kyber_adjust,
 
-        TP_PROTO(struct request_queue *q, const char *domain,
-                 unsigned int depth),
+        TP_PROTO(dev_t dev, const char *domain, unsigned int depth),
 
-        TP_ARGS(q, domain, depth),
+        TP_ARGS(dev, domain, depth),
 
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                     )
@@ -59,7 +58,7 @@ TRACE_EVENT(kyber_adjust,
         ),
 
         TP_fast_assign(
-                __entry->dev            = disk_devt(q->disk);
+                __entry->dev            = dev;
                 strlcpy(__entry->domain, domain, sizeof(__entry->domain));
                 __entry->depth          = depth;
         ),
@@ -71,9 +70,9 @@ TRACE_EVENT(kyber_adjust,
 
 TRACE_EVENT(kyber_throttled,
 
-        TP_PROTO(struct request_queue *q, const char *domain),
+        TP_PROTO(dev_t dev, const char *domain),
 
-        TP_ARGS(q, domain),
+        TP_ARGS(dev, domain),
 
         TP_STRUCT__entry(
                 __field(        dev_t,  dev     )
@@ -81,7 +80,7 @@ TRACE_EVENT(kyber_throttled,
         ),
 
         TP_fast_assign(
-                __entry->dev            = disk_devt(q->disk);
+                __entry->dev            = dev;
                 strlcpy(__entry->domain, domain, sizeof(__entry->domain));
         ),
 
diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h
index 6135d92e0d..daf82a230c 100644
--- a/include/uapi/linux/hyperv.h
+++ b/include/uapi/linux/hyperv.h
@@ -26,7 +26,7 @@
 #ifndef _UAPI_HYPERV_H
 #define _UAPI_HYPERV_H
 
-#include
+#include
 
 /*
  * Framework version for util services.
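Both the cachefiles and kyber hunks above remove pointer dereferences from trace-time code: cachefiles records UINT_MAX when the object is NULL, and kyber receives a plain dev_t instead of reaching through a request_queue. A small sketch of the NULL-guard half of that pattern (types hypothetical):

    #include <limits.h>

    struct cache_obj {
            unsigned int debug_id;
    };

    /* Store a sentinel instead of dereferencing a pointer that can
     * legitimately be NULL by the time the event fires. */
    static unsigned int trace_obj_id(const struct cache_obj *obj)
    {
            return obj ? obj->debug_id : UINT_MAX;
    }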
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index b96c1ea716..eda0426ec4 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -213,13 +213,13 @@ enum {
         XFRM_MSG_GETSPDINFO,
 #define XFRM_MSG_GETSPDINFO XFRM_MSG_GETSPDINFO
 
+        XFRM_MSG_MAPPING,
+#define XFRM_MSG_MAPPING XFRM_MSG_MAPPING
+
         XFRM_MSG_SETDEFAULT,
 #define XFRM_MSG_SETDEFAULT XFRM_MSG_SETDEFAULT
         XFRM_MSG_GETDEFAULT,
 #define XFRM_MSG_GETDEFAULT XFRM_MSG_GETDEFAULT
-
-        XFRM_MSG_MAPPING,
-#define XFRM_MSG_MAPPING XFRM_MSG_MAPPING
         __XFRM_MSG_MAX
 };
 #define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1)
@@ -514,9 +514,12 @@ struct xfrm_user_offload {
 #define XFRM_OFFLOAD_INBOUND    2
 
 struct xfrm_userpolicy_default {
-#define XFRM_USERPOLICY_DIRMASK_MAX     (sizeof(__u8) * 8)
-        __u8                            dirmask;
-        __u8                            action;
+#define XFRM_USERPOLICY_UNSPEC  0
+#define XFRM_USERPOLICY_BLOCK   1
+#define XFRM_USERPOLICY_ACCEPT  2
+        __u8                            in;
+        __u8                            fwd;
+        __u8                            out;
 };
 
 #ifndef __KERNEL__
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 7cc2a0f3f2..d13bb8c1b4 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -917,7 +917,6 @@ struct hl_wait_cs_in {
 #define HL_WAIT_CS_STATUS_BUSY          1
 #define HL_WAIT_CS_STATUS_TIMEDOUT      2
 #define HL_WAIT_CS_STATUS_ABORTED       3
-#define HL_WAIT_CS_STATUS_INTERRUPTED   4
 
 #define HL_WAIT_CS_STATUS_FLAG_GONE             0x1
 #define HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD    0x2
@@ -1286,7 +1285,8 @@ struct hl_debug_args {
  * EIO       - The CS was aborted (usually because the device was reset)
  * ENODEV    - The device wants to do hard-reset (so user need to close FD)
  *
- * The driver also returns a custom define inside the IOCTL which can be:
+ * The driver also returns a custom define in case the IOCTL call returned 0.
+ * The define can be one of the following:
  *
  * HL_WAIT_CS_STATUS_COMPLETED   - The CS has been completed successfully (0)
 * HL_WAIT_CS_STATUS_BUSY        - The CS is still executing (0)
@@ -1294,8 +1294,6 @@ struct hl_debug_args {
  *                                 (ETIMEDOUT)
  * HL_WAIT_CS_STATUS_ABORTED     - The CS was aborted, usually because the
  *                                 device was reset (EIO)
- * HL_WAIT_CS_STATUS_INTERRUPTED - Waiting for the CS was interrupted (EINTR)
- *
  */
 
 #define HL_IOCTL_WAIT_CS \
diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h
index 1d84ec9db9..5859ca0a14 100644
--- a/include/uapi/sound/asound.h
+++ b/include/uapi/sound/asound.h
@@ -784,6 +784,7 @@ struct snd_rawmidi_status {
 
 #define SNDRV_RAWMIDI_IOCTL_PVERSION    _IOR('W', 0x00, int)
 #define SNDRV_RAWMIDI_IOCTL_INFO        _IOR('W', 0x01, struct snd_rawmidi_info)
+#define SNDRV_RAWMIDI_IOCTL_USER_PVERSION _IOW('W', 0x02, int)
 #define SNDRV_RAWMIDI_IOCTL_PARAMS      _IOWR('W', 0x10, struct snd_rawmidi_params)
 #define SNDRV_RAWMIDI_IOCTL_STATUS      _IOWR('W', 0x20, struct snd_rawmidi_status)
 #define SNDRV_RAWMIDI_IOCTL_DROP        _IOW('W', 0x30, int)
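The xfrm.h hunk above moves XFRM_MSG_MAPPING ahead of the SETDEFAULT/GETDEFAULT entries, which matters because netlink message types take their numeric values from enum position; reordering an already-shipped entry changes the wire protocol. An illustrative (not xfrm) enum showing the constraint:

    /* Values come from position: entries that have shipped must keep
     * their slots, and new types go where they do not shift the rest. */
    enum demo_msg_type {
            DEMO_MSG_GETINFO = 16,  /* shipped as 16 */
            DEMO_MSG_MAPPING,       /* shipped as 17: must stay 17 */
            DEMO_MSG_SETDEFAULT,    /* new: takes the next free value */
            __DEMO_MSG_MAX
    };
    #define DEMO_MSG_MAX (__DEMO_MSG_MAX - 1)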
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index db28e79b77..a3584a357f 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -52,12 +52,12 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order);
 #if defined(CONFIG_XEN_PV)
 int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
                   xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot,
-                  unsigned int domid, bool no_translate, struct page **pages);
+                  unsigned int domid, bool no_translate);
 #else
 static inline int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
                                 xen_pfn_t *pfn, int nr, int *err_ptr,
                                 pgprot_t prot, unsigned int domid,
-                                bool no_translate, struct page **pages)
+                                bool no_translate)
 {
         BUG();
         return 0;
@@ -134,7 +134,7 @@ static inline int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
          */
         BUG_ON(err_ptr == NULL);
         return xen_remap_pfn(vma, addr, gfn, nr, err_ptr, prot, domid,
-                             false, pages);
+                             false);
 }
 
 /*
@@ -146,7 +146,6 @@ static inline int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
  * @err_ptr: Returns per-MFN error status.
  * @prot:    page protection mask
  * @domid:   Domain owning the pages
- * @pages:   Array of pages if this domain has an auto-translated physmap
  *
  * @mfn and @err_ptr may point to the same buffer, the MFNs will be
  * overwritten by the error codes after they are mapped.
@@ -157,14 +156,13 @@ static inline int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
 static inline int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
                                              unsigned long addr, xen_pfn_t *mfn,
                                              int nr, int *err_ptr,
-                                             pgprot_t prot, unsigned int domid,
-                                             struct page **pages)
+                                             pgprot_t prot, unsigned int domid)
 {
         if (xen_feature(XENFEAT_auto_translated_physmap))
                 return -EOPNOTSUPP;
 
         return xen_remap_pfn(vma, addr, mfn, nr, err_ptr, prot, domid,
-                             true, pages);
+                             true);
 }
 
 /* xen_remap_domain_gfn_range() - map a range of foreign frames
@@ -188,8 +186,7 @@ static inline int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
         if (xen_feature(XENFEAT_auto_translated_physmap))
                 return -EOPNOTSUPP;
 
-        return xen_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false,
-                             pages);
+        return xen_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false);
 }
 
 int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
diff --git a/init/main.c b/init/main.c
index 81a79a77db..3c4054a955 100644
--- a/init/main.c
+++ b/init/main.c
@@ -382,6 +382,7 @@ static char * __init xbc_make_cmdline(const char *key)
         ret = xbc_snprint_cmdline(new_cmdline, len + 1, root);
         if (ret < 0 || ret > len) {
                 pr_err("Failed to print extra kernel cmdline.\n");
+                memblock_free_ptr(new_cmdline, len + 1);
                 return NULL;
         }
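The init/main.c hunk above plugs a leak on an error path: the buffer is now returned to memblock when xbc_snprint_cmdline() fails instead of being dropped on the floor. The generic shape, as a user-space sketch (fill_cmdline() is an assumed helper, not a real API):

    #include <stdlib.h>

    extern int fill_cmdline(char *buf, size_t size);   /* assumed helper */

    char *build_cmdline(size_t len)
    {
            char *buf = malloc(len + 1);

            if (!buf)
                    return NULL;

            if (fill_cmdline(buf, len + 1) < 0) {
                    free(buf);      /* was leaked before the fix */
                    return NULL;
            }
            return buf;
    }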
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index d6731c3286..9abcc33f02 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -368,6 +368,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
                 const struct btf_type *mtype, *ptype;
                 struct bpf_prog *prog;
                 u32 moff;
+                u32 flags;
 
                 moff = btf_member_bit_offset(t, member) / 8;
                 ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
@@ -431,10 +432,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
                 tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
                 tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
+                flags = st_ops->func_models[i].ret_size > 0 ?
+                        BPF_TRAMP_F_RET_FENTRY_RET : 0;
                 err = arch_prepare_bpf_trampoline(NULL, image,
                                                   st_map->image + PAGE_SIZE,
-                                                  &st_ops->func_models[i], 0,
-                                                  tprogs, NULL);
+                                                  &st_ops->func_models[i],
+                                                  flags, tprogs, NULL);
                 if (err < 0)
                         goto reset_unlock;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9f4636d021..d6b7dfdd80 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -827,7 +827,7 @@ int bpf_jit_charge_modmem(u32 pages)
 {
         if (atomic_long_add_return(pages, &bpf_jit_current) >
             (bpf_jit_limit >> PAGE_SHIFT)) {
-                if (!capable(CAP_SYS_ADMIN)) {
+                if (!bpf_capable()) {
                         atomic_long_sub(pages, &bpf_jit_current);
                         return -EPERM;
                 }
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 09a3fd97d3..6e75bbee39 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -63,7 +63,8 @@ static inline int stack_map_data_size(struct bpf_map *map)
 
 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 {
-        u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+        u64 elem_size = sizeof(struct stack_map_bucket) +
+                        (u64)smap->map.value_size;
         int err;
 
         smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 09b9cc3a54..a19b72a281 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6612,22 +6612,29 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
 
 void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 {
-        /* Don't associate the sock with unrelated interrupted task's cgroup. */
-        if (in_interrupt())
-                return;
+        struct cgroup *cgroup;
 
         rcu_read_lock();
+        /* Don't associate the sock with unrelated interrupted task's cgroup. */
+        if (in_interrupt()) {
+                cgroup = &cgrp_dfl_root.cgrp;
+                cgroup_get(cgroup);
+                goto out;
+        }
+
         while (true) {
                 struct css_set *cset;
 
                 cset = task_css_set(current);
                 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
-                        skcd->cgroup = cset->dfl_cgrp;
-                        cgroup_bpf_get(cset->dfl_cgrp);
+                        cgroup = cset->dfl_cgrp;
                         break;
                 }
                 cpu_relax();
         }
+out:
+        skcd->cgroup = cgroup;
+        cgroup_bpf_get(cgroup);
         rcu_read_unlock();
 }
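The stackmap hunk above widens elem_size to u64 before it is multiplied by max_entries inside bpf_map_area_alloc(); with two 32-bit operands the product would wrap before ever reaching a 64-bit destination. The whole fix, reduced to one line:

    #include <stdint.h>

    /* Widen one operand first; a u32 * u32 multiply wraps at 2^32
     * even when the result is assigned to a 64-bit variable. */
    uint64_t total_size(uint32_t elem_size, uint32_t max_entries)
    {
            return (uint64_t)elem_size * max_entries;
    }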
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index df1ccf4558..2a9695ccb6 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -311,17 +311,19 @@ static struct cpuset top_cpuset = {
                 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
 
 /*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * There are two global locks guarding cpuset structures - cpuset_rwsem and
  * callback_lock. We also require taking task_lock() when dereferencing a
  * task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment.
+ * comment. The cpuset code uses only cpuset_rwsem write lock. Other
+ * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
+ * prevent change to cpuset structures.
  *
  * A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
  * is the only task able to also acquire callback_lock and be able to
  * modify cpusets. It can perform various checks on the cpuset structure
  * first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_mutex. While it is performing these checks, various
+ * just holding cpuset_rwsem. While it is performing these checks, various
  * callback routines can briefly acquire callback_lock to query cpusets.
  * Once it is ready to make the changes, it takes callback_lock, blocking
  * everyone else.
@@ -393,7 +395,7 @@ static inline bool is_in_v2_mode(void)
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_mask.
  *
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
  */
 static void guarantee_online_cpus(struct task_struct *tsk,
                                   struct cpumask *pmask)
@@ -435,7 +437,7 @@ static void guarantee_online_cpus(struct task_struct *tsk,
  * One way or another, we guarantee to return some non-empty subset
  * of node_states[N_MEMORY].
  *
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
  */
 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 {
@@ -447,7 +449,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 /*
  * update task's spread flag if cpuset's page/slab spread flag is set
  *
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
  */
 static void cpuset_update_task_spread_flag(struct cpuset *cs,
                                            struct task_struct *tsk)
@@ -468,7 +470,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
  *
  * One cpuset is a subset of another if all its allowed CPUs and
  * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_mutex.
+ * are only set if the other's are set. Call holding cpuset_rwsem.
  */
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -577,7 +579,7 @@ static inline void free_cpuset(struct cpuset *cs)
  * If we replaced the flag and mask values of the current cpuset
  * (cur) with those values in the trial cpuset (trial), would
  * our various subset and exclusive rules still be valid?  Presumes
- * cpuset_mutex held.
+ * cpuset_rwsem held.
  *
  * 'cur' is the address of an actual, in-use cpuset.  Operations
  * such as list traversal that depend on the actual address of the
@@ -700,7 +702,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
         rcu_read_unlock();
 }
 
-/* Must be called with cpuset_mutex held. */
+/* Must be called with cpuset_rwsem held. */
 static inline int nr_cpusets(void)
 {
         /* jump label reference count + the top-level cpuset */
@@ -726,7 +728,7 @@ static inline int nr_cpusets(void)
  * domains when operating in the severe memory shortage situations
  * that could cause allocation failures below.
  *
- * Must be called with cpuset_mutex held.
+ * Must be called with cpuset_rwsem held.
  *
  * The three key local variables below are:
  *    cp - cpuset pointer, used (together with pos_css) to perform a
@@ -1005,7 +1007,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
  * 'cpus' is removed, then call this routine to rebuild the
  * scheduler's dynamic sched domains.
  *
- * Call with cpuset_mutex held.  Takes cpus_read_lock().
+ * Call with cpuset_rwsem held.  Takes cpus_read_lock().
  */
 static void rebuild_sched_domains_locked(void)
 {
@@ -1078,7 +1080,7 @@ void rebuild_sched_domains(void)
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
  *
  * Iterate through each task of @cs updating its cpus_allowed to the
- * effective cpuset's.  As this function is called with cpuset_mutex held,
+ * effective cpuset's.  As this function is called with cpuset_rwsem held,
  * cpuset membership stays stable.
  */
 static void update_tasks_cpumask(struct cpuset *cs)
@@ -1347,7 +1349,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
  *
  * On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
  *
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
 {
@@ -1704,12 +1706,12 @@ static void *cpuset_being_rebound;
  * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
  *
  * Iterate through each task of @cs updating its mems_allowed to the
- * effective cpuset's.  As this function is called with cpuset_mutex held,
+ * effective cpuset's.  As this function is called with cpuset_rwsem held,
  * cpuset membership stays stable.
  */
 static void update_tasks_nodemask(struct cpuset *cs)
 {
-        static nodemask_t newmems;      /* protected by cpuset_mutex */
+        static nodemask_t newmems;      /* protected by cpuset_rwsem */
         struct css_task_iter it;
         struct task_struct *task;
 
@@ -1722,7 +1724,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
          * take while holding tasklist_lock.  Forks can happen - the
          * mpol_dup() cpuset_being_rebound check will catch such forks,
          * and rebind their vma mempolicies too.  Because we still hold
-         * the global cpuset_mutex, we know that no other rebind effort
+         * the global cpuset_rwsem, we know that no other rebind effort
          * will be contending for the global variable cpuset_being_rebound.
          * It's ok if we rebind the same mm twice; mpol_rebind_mm()
          * is idempotent.  Also migrate pages in each mm to new nodes.
@@ -1768,7 +1770,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
  *
  * On legacy hierarchy, effective_mems will be the same with mems_allowed.
  *
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
  */
 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 {
@@ -1821,7 +1823,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
  * mempolicies and if the cpuset is marked 'memory_migrate',
  * migrate the tasks pages to the new memory.
  *
- * Call with cpuset_mutex held.  May take callback_lock during call.
+ * Call with cpuset_rwsem held.  May take callback_lock during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
  * lock each such tasks mm->mmap_lock, scan its vma's and rebind
  * their mempolicies to the cpusets new mems_allowed.
@@ -1911,7 +1913,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
  * @cs: the cpuset in which each task's spread flags needs to be changed
  *
  * Iterate through each task of @cs updating its spread flags.  As this
- * function is called with cpuset_mutex held, cpuset membership stays
+ * function is called with cpuset_rwsem held, cpuset membership stays
  * stable.
  */
 static void update_tasks_flags(struct cpuset *cs)
@@ -1931,7 +1933,7 @@ static void update_tasks_flags(struct cpuset *cs)
  * cs:          the cpuset to update
  * turning_on:  whether the flag is being set or cleared
  *
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
  */
 
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1980,7 +1982,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
  * cs:          the cpuset to update
  * new_prs:     new partition root state
  *
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
  */
 static int update_prstate(struct cpuset *cs, int new_prs)
 {
@@ -2167,7 +2169,7 @@ static int fmeter_getrate(struct fmeter *fmp)
 
 static struct cpuset *cpuset_attach_old_cs;
 
-/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
 static int cpuset_can_attach(struct cgroup_taskset *tset)
 {
         struct cgroup_subsys_state *css;
@@ -2219,7 +2221,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 }
 
 /*
- * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem.  cpus_attach is used only by cpuset_attach()
  * but we can't allocate it dynamically there.  Define it global and
  * allocate from cpuset_init().
  */
@@ -2227,7 +2229,7 @@ static cpumask_var_t cpus_attach;
 
 static void cpuset_attach(struct cgroup_taskset *tset)
 {
-        /* static buf protected by cpuset_mutex */
+        /* static buf protected by cpuset_rwsem */
         static nodemask_t cpuset_attach_nodemask_to;
         struct task_struct *task;
         struct task_struct *leader;
@@ -2417,7 +2419,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
          * operation like this one can lead to a deadlock through kernfs
          * active_ref protection.  Let's break the protection.  Losing the
          * protection is okay as we check whether @cs is online after
-         * grabbing cpuset_mutex anyway.  This only happens on the legacy
+         * grabbing cpuset_rwsem anyway.  This only happens on the legacy
          * hierarchies.
          */
         css_get(&cs->css);
@@ -3672,7 +3674,7 @@ void __cpuset_memory_pressure_bump(void)
  *  - Used for /proc/<pid>/cpuset.
  *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
  *    doesn't really matter if tsk->cpuset changes after we read it,
- *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
+ *    and we take cpuset_rwsem, keeping cpuset_attach() from changing it
  *    anyway.
 */
 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0c000cb01e..f23ca26030 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3707,6 +3707,29 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
         return 0;
 }
 
+static inline bool event_update_userpage(struct perf_event *event)
+{
+        if (likely(!atomic_read(&event->mmap_count)))
+                return false;
+
+        perf_event_update_time(event);
+        perf_set_shadow_time(event, event->ctx);
+        perf_event_update_userpage(event);
+
+        return true;
+}
+
+static inline void group_update_userpage(struct perf_event *group_event)
+{
+        struct perf_event *event;
+
+        if (!event_update_userpage(group_event))
+                return;
+
+        for_each_sibling_event(event, group_event)
+                event_update_userpage(event);
+}
+
 static int merge_sched_in(struct perf_event *event, void *data)
 {
         struct perf_event_context *ctx = event->ctx;
@@ -3725,14 +3748,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
         }
 
         if (event->state == PERF_EVENT_STATE_INACTIVE) {
+                *can_add_hw = 0;
                 if (event->attr.pinned) {
                         perf_cgroup_event_disable(event, ctx);
                         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+                } else {
+                        ctx->rotate_necessary = 1;
+                        perf_mux_hrtimer_restart(cpuctx);
+                        group_update_userpage(event);
                 }
-
-                *can_add_hw = 0;
-                ctx->rotate_necessary = 1;
-                perf_mux_hrtimer_restart(cpuctx);
         }
 
         return 0;
@@ -6324,6 +6348,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
                 ring_buffer_attach(event, rb);
 
+                perf_event_update_time(event);
+                perf_set_shadow_time(event, event->ctx);
                 perf_event_init_userpage(event);
                 perf_event_update_userpage(event);
         } else {
diff --git a/kernel/module.c b/kernel/module.c
index 40ec9a030e..5c26a76e80 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4489,8 +4489,10 @@ static void cfi_init(struct module *mod)
         /* Fix init/exit functions to point to the CFI jump table */
         if (init)
                 mod->init = *init;
+#ifdef CONFIG_MODULE_UNLOAD
         if (exit)
                 mod->exit = *exit;
+#endif
 
         cfi_module_add(mod, module_addr_min);
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 49716228ef..17a653b670 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -173,16 +173,22 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
                                    size_t cnt, loff_t *ppos)
 {
         char buf[16];
+        unsigned int scaling;
 
         if (cnt > 15)
                 cnt = 15;
 
         if (copy_from_user(&buf, ubuf, cnt))
                 return -EFAULT;
+        buf[cnt] = '\0';
 
-        if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+        if (kstrtouint(buf, 10, &scaling))
                 return -EINVAL;
 
+        if (scaling >= SCHED_TUNABLESCALING_END)
+                return -EINVAL;
+
+        sysctl_sched_tunable_scaling = scaling;
         if (sched_update_scaling())
                 return -EINVAL;
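sched_scaling_write() above now NUL-terminates the copied user buffer, parses into a local, and range-checks it before the global is updated, so no reader can ever observe a half-validated value. A user-space sketch of the parse-validate-commit order (the bound is hypothetical):

    #include <errno.h>
    #include <stdlib.h>

    #define SCALING_END 3                   /* assumed upper bound */

    static unsigned int scaling_setting;    /* the shared "global" */

    int set_scaling(const char *buf)
    {
            char *end;
            unsigned long v = strtoul(buf, &end, 10);

            if (end == buf || v >= SCALING_END)
                    return -EINVAL;         /* reject before writing */

            scaling_setting = v;            /* commit validated value */
            return 0;
    }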
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f245b9..f6a05d9b54 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4936,8 +4936,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
         /* update hierarchical throttle state */
         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
 
-        if (!cfs_rq->load.weight)
+        /* Nothing to run but something to decay (on_list)? Complete the branch */
+        if (!cfs_rq->load.weight) {
+                if (cfs_rq->on_list)
+                        goto unthrottle_throttle;
                 return;
+        }
 
         task_delta = cfs_rq->h_nr_running;
         idle_task_delta = cfs_rq->idle_h_nr_running;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7896d30d90..bc677cd642 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1744,16 +1744,15 @@ void latency_fsnotify(struct trace_array *tr)
         irq_work_queue(&tr->fsnotify_irqwork);
 }
 
-/*
- * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- *  defined(CONFIG_FSNOTIFY)
- */
-#else
+#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+        || defined(CONFIG_OSNOISE_TRACER)
 
 #define trace_create_maxlat_file(tr, d_tracer)                          \
         trace_create_file("tracing_max_latency", 0644, d_tracer,        \
                           &tr->max_latency, &tracing_max_lat_fops)
 
+#else
+#define trace_create_maxlat_file(tr, d_tracer)  do { } while (0)
 #endif
 
 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -9473,9 +9472,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 
         create_trace_options_dir(tr);
 
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
         trace_create_maxlat_file(tr, d_tracer);
-#endif
 
         if (ftrace_create_function_files(tr, d_tracer))
                 MEM_FAIL(1, "Could not allocate function filter files");
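The trace.c hunk above gives trace_create_maxlat_file() a do-nothing fallback definition, which is what lets init_tracer_tracefs() drop its #ifdef around the call. A sketch of the idiom (DEMO_FEATURE is illustrative):

    #ifdef DEMO_FEATURE
    #define demo_create_file(dir)   demo_do_create(dir)
    #else
    /* do { } while (0) swallows the trailing semicolon safely, so the
     * call site compiles identically in both configurations. */
    #define demo_create_file(dir)   do { } while (0)
    #endif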
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 3044b762cb..c4a15aef36 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -119,10 +119,58 @@ static bool eprobe_dyn_event_match(const char *system, const char *event,
                                    int argc, const char **argv,
                                    struct dyn_event *ev)
 {
         struct trace_eprobe *ep = to_trace_eprobe(ev);
+        const char *slash;
 
-        return strcmp(trace_probe_name(&ep->tp), event) == 0 &&
-            (!system || strcmp(trace_probe_group_name(&ep->tp), system) == 0) &&
-            trace_probe_match_command_args(&ep->tp, argc, argv);
+        /*
+         * We match the following:
+         *  event only                    - match all eprobes with event name
+         *  system and event only         - match all system/event probes
+         *
+         * The below has the above satisfied with more arguments:
+         *
+         *  attached system/event         - If the arg has the system and event
+         *                                  the probe is attached to, match
+         *                                  probes with the attachment.
+         *
+         *  If any more args are given, then it requires a full match.
+         */
+
+        /*
+         * If system exists, but this probe is not part of that system
+         * do not match.
+         */
+        if (system && strcmp(trace_probe_group_name(&ep->tp), system) != 0)
+                return false;
+
+        /* Must match the event name */
+        if (strcmp(trace_probe_name(&ep->tp), event) != 0)
+                return false;
+
+        /* No arguments match all */
+        if (argc < 1)
+                return true;
+
+        /* First argument is the system/event the probe is attached to */
+
+        slash = strchr(argv[0], '/');
+        if (!slash)
+                slash = strchr(argv[0], '.');
+        if (!slash)
+                return false;
+
+        if (strncmp(ep->event_system, argv[0], slash - argv[0]))
+                return false;
+        if (strcmp(ep->event_name, slash + 1))
+                return false;
+
+        argc--;
+        argv++;
+
+        /* If there are no other args, then match */
+        if (argc < 1)
+                return true;
+
+        return trace_probe_match_command_args(&ep->tp, argc, argv);
 }
 
 static struct dyn_event_operations eprobe_dyn_event_ops = {
@@ -632,6 +680,13 @@ static int disable_eprobe(struct trace_eprobe *ep,
 
         trace_event_trigger_enable_disable(file, 0);
         update_cond_flag(file);
+
+        /* Make sure nothing is using the edata or trigger */
+        tracepoint_synchronize_unregister();
+
+        kfree(edata);
+        kfree(trigger);
+
         return 0;
 }
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index a6061a69aa..f01e442716 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2506,7 +2506,7 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
  * events. However, for convenience, users are allowed to directly
  * specify an event field in an action, which will be automatically
  * converted into a variable on their behalf.
-
+ *
  * If a user specifies a field on an event that isn't the event the
  * histogram currently being defined (the target event histogram), the
  * only way that can be accomplished is if a new hist trigger is
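eprobe_dyn_event_match() above starts accepting the attachment point as a first argument in "system/event" (or "system.event") form, comparing the two halves in place rather than copying them out. A standalone sketch of that split:

    #include <stdbool.h>
    #include <string.h>

    bool match_attachment(const char *arg, const char *sys, const char *evt)
    {
            const char *sep = strchr(arg, '/');

            if (!sep)
                    sep = strchr(arg, '.');         /* alternate form */
            if (!sep)
                    return false;

            /* Compare the system prefix without a temporary copy. */
            if (strncmp(sys, arg, sep - arg) || sys[sep - arg] != '\0')
                    return false;

            return strcmp(evt, sep + 1) == 0;
    }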
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 33a6b4a244..1b3eb1e953 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4830,8 +4830,16 @@ void show_workqueue_state(void)
 
                 for_each_pwq(pwq, wq) {
                         raw_spin_lock_irqsave(&pwq->pool->lock, flags);
-                        if (pwq->nr_active || !list_empty(&pwq->inactive_works))
+                        if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+                                /*
+                                 * Defer printing to avoid deadlocks in console
+                                 * drivers that queue work while holding locks
+                                 * also taken in their write paths.
+                                 */
+                                printk_deferred_enter();
                                 show_pwq(pwq);
+                                printk_deferred_exit();
+                        }
                         raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
                         /*
                          * We could be printing a lot from atomic context, e.g.
@@ -4849,7 +4857,12 @@ void show_workqueue_state(void)
                 raw_spin_lock_irqsave(&pool->lock, flags);
                 if (pool->nr_workers == pool->nr_idle)
                         goto next_pool;
-
+                /*
+                 * Defer printing to avoid deadlocks in console drivers that
+                 * queue work while holding locks also taken in their write
+                 * paths.
+                 */
+                printk_deferred_enter();
                 pr_info("pool %d:", pool->id);
                 pr_cont_pool_info(pool);
                 pr_cont(" hung=%us workers=%d",
@@ -4864,6 +4877,7 @@ void show_workqueue_state(void)
                         first = false;
                 }
                 pr_cont("\n");
+                printk_deferred_exit();
         next_pool:
                 raw_spin_unlock_irqrestore(&pool->lock, flags);
                 /*
diff --git a/lib/Makefile b/lib/Makefile
index 5efd1b435a..a841be5244 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -351,7 +351,7 @@ obj-$(CONFIG_OBJAGG) += objagg.o
 obj-$(CONFIG_PLDMFW) += pldmfw/
 
 # KUnit tests
-CFLAGS_bitfield_kunit.o := $(call cc-option,-Wframe-larger-than=10240)
+CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN)
 obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o
 obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o
 obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o
diff --git a/lib/kunit/executor_test.c b/lib/kunit/executor_test.c
index cdbe54b165..e14a18af57 100644
--- a/lib/kunit/executor_test.c
+++ b/lib/kunit/executor_test.c
@@ -116,8 +116,8 @@ static void kfree_at_end(struct kunit *test, const void *to_free)
         /* kfree() handles NULL already, but avoid allocating a no-op cleanup. */
         if (IS_ERR_OR_NULL(to_free))
                 return;
-        kunit_alloc_and_get_resource(test, NULL, kfree_res_free, GFP_KERNEL,
-                                     (void *)to_free);
+        kunit_alloc_resource(test, NULL, kfree_res_free, GFP_KERNEL,
+                             (void *)to_free);
 }
 
 static struct kunit_suite *alloc_fake_suite(struct kunit *test,
diff --git a/mm/memblock.c b/mm/memblock.c
index 184dcd2e5d..5c3503c98b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -936,7 +936,12 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
  */
 int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
 {
-        return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP);
+        int ret = memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP);
+
+        if (!ret)
+                kmemleak_free_part_phys(base, size);
+
+        return ret;
 }
 
 /**
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 2eb0e55ef5..b5f4ef3535 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -552,6 +552,12 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb)
         __skb->gso_segs = skb_shinfo(skb)->gso_segs;
 }
 
+static struct proto bpf_dummy_proto = {
+        .name     = "bpf_dummy",
+        .owner    = THIS_MODULE,
+        .obj_size = sizeof(struct sock),
+};
+
 int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
                           union bpf_attr __user *uattr)
 {
@@ -596,20 +602,19 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
                 break;
         }
 
-        sk = kzalloc(sizeof(struct sock), GFP_USER);
+        sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1);
         if (!sk) {
                 kfree(data);
                 kfree(ctx);
                 return -ENOMEM;
         }
-        sock_net_set(sk, net);
         sock_init_data(NULL, sk);
 
         skb = build_skb(data, 0);
         if (!skb) {
                 kfree(data);
                 kfree(ctx);
-                kfree(sk);
+                sk_free(sk);
                 return -ENOMEM;
         }
         skb->sk = sk;
@@ -682,8 +687,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
         if (dev && dev != net->loopback_dev)
                 dev_put(dev);
         kfree_skb(skb);
-        bpf_sk_storage_free(sk);
-        kfree(sk);
+        sk_free(sk);
         kfree(ctx);
         return ret;
 }
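The workqueue hunks above wrap the state dump in printk_deferred_enter()/printk_deferred_exit() because printing synchronously while holding pool->lock can deadlock against a console driver that queues work under the same lock. A user-space schematic of the defer-then-flush shape (not the real locking):

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
    static char pending[256];

    void dump_state(void)
    {
            pthread_mutex_lock(&pool_lock);
            /* Emitting output here could re-enter a path that takes
             * pool_lock; stash the message instead. */
            strcpy(pending, "pool 0: hung=0s workers=1\n");
            pthread_mutex_unlock(&pool_lock);

            fputs(pending, stdout);     /* safe: lock already dropped */
    }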
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 3523c8c706..f3d7511053 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1677,8 +1677,6 @@ static void br_multicast_update_querier(struct net_bridge_mcast *brmctx,
                                         int ifindex,
                                         struct br_ip *saddr)
 {
-        lockdep_assert_held_once(&brmctx->br->multicast_lock);
-
         write_seqcount_begin(&querier->seq);
         querier->port_ifidx = ifindex;
         memcpy(&querier->addr, saddr, sizeof(*saddr));
@@ -3867,13 +3865,13 @@ void br_multicast_ctx_init(struct net_bridge *br,
         brmctx->ip4_other_query.delay_time = 0;
         brmctx->ip4_querier.port_ifidx = 0;
-        seqcount_init(&brmctx->ip4_querier.seq);
+        seqcount_spinlock_init(&brmctx->ip4_querier.seq, &br->multicast_lock);
         brmctx->multicast_igmp_version = 2;
 #if IS_ENABLED(CONFIG_IPV6)
         brmctx->multicast_mld_version = 1;
         brmctx->ip6_other_query.delay_time = 0;
         brmctx->ip6_querier.port_ifidx = 0;
-        seqcount_init(&brmctx->ip6_querier.seq);
+        seqcount_spinlock_init(&brmctx->ip6_querier.seq, &br->multicast_lock);
 #endif
 
         timer_setup(&brmctx->ip4_mc_router_timer,
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 6c58fc14d2..5c6c4305ed 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1666,7 +1666,8 @@ static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
         }
 
         return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) +
-               nla_total_size(sizeof(struct br_mcast_stats)) +
+               nla_total_size_64bit(sizeof(struct br_mcast_stats)) +
+               (p ? nla_total_size_64bit(sizeof(p->stp_xstats)) : 0) +
                nla_total_size(0);
 }
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b4cef3a97f..e8136db444 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -82,7 +82,7 @@ struct bridge_mcast_other_query {
 struct bridge_mcast_querier {
         struct br_ip addr;
         int port_ifidx;
-        seqcount_t seq;
+        seqcount_spinlock_t seq;
 };
 
 /* IGMP/MLD statistics */
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 8c39283c26..f0cb383441 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -50,6 +50,11 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
         if (addr_len > MAX_ADDR_LEN)
                 return -EINVAL;
 
+        ha = list_first_entry(&list->list, struct netdev_hw_addr, list);
+        if (ha && !memcmp(addr, ha->addr, addr_len) &&
+            (!addr_type || addr_type == ha->type))
+                goto found_it;
+
         while (*ins_point) {
                 int diff;
 
@@ -64,6 +69,7 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
                 } else if (diff > 0) {
                         ins_point = &parent->rb_right;
                 } else {
+found_it:
                         if (exclusive)
                                 return -EEXIST;
                         if (global) {
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index eab5fc88a0..d8b9dbabd4 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -77,8 +77,8 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
         struct rtnl_link_stats64 temp;
         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 
-        seq_printf(seq, "%9s: %16llu %12llu %4llu %6llu %4llu %5llu %10llu %9llu "
-                   "%16llu %12llu %4llu %6llu %4llu %5llu %7llu %10llu\n",
+        seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
+                   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
                    dev->name, stats->rx_bytes, stats->rx_packets,
                    stats->rx_errors,
                    stats->rx_dropped + stats->rx_missed_errors,
@@ -103,11 +103,11 @@ static int dev_seq_show(struct seq_file *seq, void *v)
 {
         if (v == SEQ_START_TOKEN)
-                seq_puts(seq, "Interface|                            Receive "
-                              "                  |                 Transmit\n"
-                              "         |            bytes      packets errs "
-                              "  drop fifo frame compressed multicast|     "
-                              "       bytes      packets errs   drop fifo "
-                              "colls carrier compressed\n");
+                seq_puts(seq, "Inter-|   Receive                            "
+                              "                    |  Transmit\n"
+                              " face |bytes    packets errs drop fifo frame "
+                              "compressed multicast|bytes    packets errs "
+                              "drop fifo colls carrier compressed\n");
         else
                 dev_seq_printf_stats(seq, v);
         return 0;
@@ -259,14 +259,14 @@ static int ptype_seq_show(struct seq_file *seq, void *v)
         struct packet_type *pt = v;
 
         if (v == SEQ_START_TOKEN)
-                seq_puts(seq, "Type      Device      Function\n");
+                seq_puts(seq, "Type Device      Function\n");
         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
                 if (pt->type == htons(ETH_P_ALL))
                         seq_puts(seq, "ALL ");
                 else
                         seq_printf(seq, "%04x", ntohs(pt->type));
 
-                seq_printf(seq, " %-9s   %ps\n",
+                seq_printf(seq, " %-8s %ps\n",
                            pt->dev ? pt->dev->name : "", pt->func);
         }
 
@@ -327,14 +327,12 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)
         struct netdev_hw_addr *ha;
         struct net_device *dev = v;
 
-        if (v == SEQ_START_TOKEN) {
-                seq_puts(seq, "Ifindex Interface Refcount Global_use Address\n");
+        if (v == SEQ_START_TOKEN)
                 return 0;
-        }
 
         netif_addr_lock_bh(dev);
         netdev_for_each_mc_addr(ha, dev) {
-                seq_printf(seq, "%-7d %-9s %-8d %-10d %*phN\n",
+                seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n",
                            dev->ifindex, dev->name,
                            ha->refcount, ha->global_use,
                            (int)dev->addr_len, ha->addr);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 972c8cb303..8ccce85562 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -5262,7 +5262,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 static size_t if_nlmsg_stats_size(const struct net_device *dev,
                                   u32 filter_mask)
 {
-        size_t size = 0;
+        size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg));
 
         if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0))
                 size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
diff --git a/net/core/sock.c b/net/core/sock.c
index 512e629f97..c1601f75ec 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1376,6 +1376,16 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 }
 EXPORT_SYMBOL(sock_setsockopt);
 
+static const struct cred *sk_get_peer_cred(struct sock *sk)
+{
+        const struct cred *cred;
+
+        spin_lock(&sk->sk_peer_lock);
+        cred = get_cred(sk->sk_peer_cred);
+        spin_unlock(&sk->sk_peer_lock);
+
+        return cred;
+}
 
 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
                           struct ucred *ucred)
@@ -1552,7 +1562,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
                 struct ucred peercred;
                 if (len > sizeof(peercred))
                         len = sizeof(peercred);
+
+                spin_lock(&sk->sk_peer_lock);
                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
+                spin_unlock(&sk->sk_peer_lock);
+
                 if (copy_to_user(optval, &peercred, len))
                         return -EFAULT;
                 goto lenout;
@@ -1560,20 +1574,23 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 
         case SO_PEERGROUPS:
         {
+                const struct cred *cred;
                 int ret, n;
 
-                if (!sk->sk_peer_cred)
+                cred = sk_get_peer_cred(sk);
+                if (!cred)
                         return -ENODATA;
 
-                n = sk->sk_peer_cred->group_info->ngroups;
+                n = cred->group_info->ngroups;
                 if (len < n * sizeof(gid_t)) {
                         len = n * sizeof(gid_t);
+                        put_cred(cred);
                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
                 }
                 len = n * sizeof(gid_t);
 
-                ret = groups_to_user((gid_t __user *)optval,
-                                     sk->sk_peer_cred->group_info);
+                ret = groups_to_user((gid_t __user *)optval, cred->group_info);
+                put_cred(cred);
                 if (ret)
                         return ret;
                 goto lenout;
@@ -1935,9 +1952,10 @@ static void __sk_destruct(struct rcu_head *head)
                 sk->sk_frag.page = NULL;
         }
 
-        if (sk->sk_peer_cred)
-                put_cred(sk->sk_peer_cred);
+        /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
+        put_cred(sk->sk_peer_cred);
         put_pid(sk->sk_peer_pid);
+
         if (likely(sk->sk_net_refcnt))
                 put_net(sock_net(sk));
         sk_prot_free(sk->sk_prot_creator, sk);
@@ -3145,6 +3163,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
         sk->sk_peer_pid         =       NULL;
         sk->sk_peer_cred        =       NULL;
+        spin_lock_init(&sk->sk_peer_lock);
+
         sk->sk_write_pending    =       0;
         sk->sk_rcvlowat         =       1;
         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
@@ -3210,24 +3230,8 @@ void release_sock(struct sock *sk)
 }
 EXPORT_SYMBOL(release_sock);
 
-/**
- * lock_sock_fast - fast version of lock_sock
- * @sk: socket
- *
- * This version should be used for very small section, where process wont block
- * return false if fast path is taken:
- *
- *   sk_lock.slock locked, owned = 0, BH disabled
- *
- * return true if slow path is taken:
- *
- *   sk_lock.slock unlocked, owned = 1, BH enabled
- */
-bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
+bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
 {
-        /* The sk_lock has mutex_lock() semantics here. */
-        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
-
         might_sleep();
         spin_lock_bh(&sk->sk_lock.slock);
 
@@ -3256,7 +3260,7 @@ bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
         spin_unlock_bh(&sk->sk_lock.slock);
         return true;
 }
-EXPORT_SYMBOL(lock_sock_fast);
+EXPORT_SYMBOL(__lock_sock_fast);
 
 int sock_gettstamp(struct socket *sock, void __user *userstamp,
                    bool timeval, bool time32)
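The sock.c hunks above close a use-after-free race on sk_peer_cred: readers now take sk_peer_lock and pin the credential with get_cred() before the lock drops, so a concurrent update of the peer cannot free it underneath them. A schematic of the take-a-reference-under-the-lock accessor (plain integer refcount, user-space):

    #include <pthread.h>
    #include <stddef.h>

    struct cred {
            int refcnt;                     /* schematic refcount */
    };

    static pthread_mutex_t peer_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct cred *peer_cred;          /* may be swapped concurrently */

    struct cred *get_peer_cred(void)
    {
            struct cred *c;

            pthread_mutex_lock(&peer_lock);
            c = peer_cred;
            if (c)
                    c->refcnt++;            /* pin before the lock drops */
            pthread_mutex_unlock(&peer_lock);
            return c;
    }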
*/ - if (!dsa_bridge_num_find(bridge_dev)) + if (dsa_bridge_num_find(bridge_dev) < 0) clear_bit(bridge_num, &dsa_fwd_offloading_bridges); } @@ -811,7 +811,9 @@ static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds) if (!dsa_is_cpu_port(ds, port)) continue; + rtnl_lock(); err = ds->ops->change_tag_protocol(ds, port, tag_ops->proto); + rtnl_unlock(); if (err) { dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n", tag_ops->name, ERR_PTR(err)); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 1c797ec8e2..6466d0539a 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -168,7 +168,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, if (extack._msg) dev_err(ds->dev, "port %d: %s\n", info->port, extack._msg); - if (err && err != EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) return err; } diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 77d0ce89ab..b3da4b2ea1 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -45,6 +45,7 @@ * 6 6 2 2 4 2 N */ +#include #include #include #include @@ -129,12 +130,9 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, u8 tag_dev, tag_port; enum dsa_cmd cmd; u8 *dsa_header; - u16 pvid = 0; - int err; if (skb->offload_fwd_mark) { struct dsa_switch_tree *dst = dp->ds->dst; - struct net_device *br = dp->bridge_dev; cmd = DSA_CMD_FORWARD; @@ -144,19 +142,6 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, */ tag_dev = dst->last_switch + 1 + dp->bridge_num; tag_port = 0; - - /* If we are offloading forwarding for a VLAN-unaware bridge, - * inject packets to hardware using the bridge's pvid, since - * that's where the packets ingressed from. - */ - if (!br_vlan_enabled(br)) { - /* Safe because __dev_queue_xmit() runs under - * rcu_read_lock_bh() - */ - err = br_vlan_get_pvid_rcu(br, &pvid); - if (err) - return NULL; - } } else { cmd = DSA_CMD_FROM_CPU; tag_dev = dp->ds->index; @@ -180,16 +165,21 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, dsa_header[2] &= ~0x10; } } else { + struct net_device *br = dp->bridge_dev; + u16 vid; + + vid = br ? MV88E6XXX_VID_BRIDGED : MV88E6XXX_VID_STANDALONE; + skb_push(skb, DSA_HLEN + extra); dsa_alloc_etype_header(skb, DSA_HLEN + extra); - /* Construct untagged DSA tag. */ + /* Construct DSA header from untagged frame. 
*/ dsa_header = dsa_etype_header_pos_tx(skb) + extra; dsa_header[0] = (cmd << 6) | tag_dev; dsa_header[1] = tag_port << 3; - dsa_header[2] = pvid >> 8; - dsa_header[3] = pvid & 0xff; + dsa_header[2] = vid >> 8; + dsa_header[3] = vid & 0xff; } return skb; @@ -210,7 +200,7 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, cmd = dsa_header[0] >> 6; switch (cmd) { case DSA_CMD_FORWARD: - trunk = !!(dsa_header[1] & 7); + trunk = !!(dsa_header[1] & 4); break; case DSA_CMD_TO_CPU: diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 8025ed778d..605b51ca69 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -2,7 +2,6 @@ /* Copyright 2019 NXP */ #include -#include #include "dsa_priv.h" static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 59072930cb..3412051981 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -9,10 +9,32 @@ * that on egress */ #include -#include -#include +#include #include "dsa_priv.h" +static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp, + struct sk_buff *skb) +{ + struct felix_deferred_xmit_work *xmit_work; + struct felix_port *felix_port = dp->priv; + + xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC); + if (!xmit_work) + return NULL; + + /* Calls felix_port_deferred_xmit in felix.c */ + kthread_init_work(&xmit_work->work, felix_port->xmit_work_fn); + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. + */ + xmit_work->dp = dp; + xmit_work->skb = skb_get(skb); + + kthread_queue_work(felix_port->xmit_worker, &xmit_work->work); + + return NULL; +} + static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -20,18 +42,10 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - struct ocelot *ocelot = dp->ds->priv; - int port = dp->index; - u32 rew_op = 0; + struct ethhdr *hdr = eth_hdr(skb); - rew_op = ocelot_ptp_rew_op(skb); - if (rew_op) { - if (!ocelot_can_inject(ocelot, 0)) - return NULL; - - ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); - return NULL; - } + if (ocelot_ptp_rew_op(skb) || is_link_local_ether_addr(hdr->h_dest)) + return ocelot_defer_xmit(dp, skb); return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index c054f48541..2edede9dda 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include "dsa_priv.h" @@ -53,6 +54,11 @@ #define SJA1110_TX_TRAILER_LEN 4 #define SJA1110_MAX_PADDING_LEN 15 +enum sja1110_meta_tstamp { + SJA1110_META_TSTAMP_TX = 0, + SJA1110_META_TSTAMP_RX = 1, +}; + /* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */ static inline bool sja1105_is_link_local(const struct sk_buff *skb) { @@ -520,6 +526,43 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, is_meta); } +static void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, + u8 ts_id, enum sja1110_meta_tstamp dir, + u64 tstamp) +{ + struct sk_buff *skb, *skb_tmp, *skb_match = NULL; + struct dsa_port *dp = dsa_to_port(ds, port); + struct skb_shared_hwtstamps shwt = {0}; + struct sja1105_port *sp = dp->priv; + + if (!dsa_port_is_sja1105(dp)) + return; + + /* We don't care about 
RX timestamps on the CPU port */ + if (dir == SJA1110_META_TSTAMP_RX) + return; + + spin_lock(&sp->data->skb_txtstamp_queue.lock); + + skb_queue_walk_safe(&sp->data->skb_txtstamp_queue, skb, skb_tmp) { + if (SJA1105_SKB_CB(skb)->ts_id != ts_id) + continue; + + __skb_unlink(skb, &sp->data->skb_txtstamp_queue); + skb_match = skb; + + break; + } + + spin_unlock(&sp->data->skb_txtstamp_queue.lock); + + if (WARN_ON(!skb_match)) + return; + + shwt.hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(tstamp)); + skb_complete_tx_timestamp(skb_match, &shwt); +} + static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header) { u8 *buf = dsa_etype_header_pos_rx(skb) + SJA1110_HEADER_LEN; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b42c429ceb..3364cb9c67 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1661,7 +1661,7 @@ EXPORT_SYMBOL_GPL(fib_nexthop_info); #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, - int nh_weight, u8 rt_family) + int nh_weight, u8 rt_family, u32 nh_tclassid) { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; @@ -1679,6 +1679,9 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, rtnh->rtnh_flags = flags; + if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid)) + goto nla_put_failure; + /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; @@ -1706,14 +1709,13 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) } for_nexthops(fi) { - if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, - AF_INET) < 0) - goto nla_put_failure; + u32 nh_tclassid = 0; #ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid && - nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) - goto nla_put_failure; + nh_tclassid = nh->nh_tclassid; #endif + if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, + AF_INET, nh_tclassid) < 0) + goto nla_put_failure; } endfor_nexthops(fi); mp_end: diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 8b30cadff7..b7e277d8a8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1054,14 +1054,19 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio); if (!ext_hdr || !iio) goto send_mal_query; - if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr)) + if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr) || + ntohs(iio->extobj_hdr.length) > sizeof(_iio)) goto send_mal_query; ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr); + iio = skb_header_pointer(skb, sizeof(_ext_hdr), + sizeof(iio->extobj_hdr) + ident_len, &_iio); + if (!iio) + goto send_mal_query; + status = 0; dev = NULL; switch (iio->extobj_hdr.class_type) { case ICMP_EXT_ECHO_CTYPE_NAME: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); if (ident_len >= IFNAMSIZ) goto send_mal_query; memset(buff, 0, sizeof(buff)); @@ -1069,30 +1074,24 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) dev = dev_get_by_name(net, buff); break; case ICMP_EXT_ECHO_CTYPE_INDEX: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + - sizeof(iio->ident.ifindex), &_iio); if (ident_len != sizeof(iio->ident.ifindex)) goto send_mal_query; dev = dev_get_by_index(net, ntohl(iio->ident.ifindex)); break; case ICMP_EXT_ECHO_CTYPE_ADDR: - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + + 
if (ident_len < sizeof(iio->ident.addr.ctype3_hdr) || + ident_len != sizeof(iio->ident.addr.ctype3_hdr) + iio->ident.addr.ctype3_hdr.addrlen) goto send_mal_query; switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) { case ICMP_AFI_IP: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + - sizeof(struct in_addr), &_iio); - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + - sizeof(struct in_addr)) + if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in_addr)) goto send_mal_query; dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr); break; #if IS_ENABLED(CONFIG_IPV6) case ICMP_AFI_IP6: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + - sizeof(struct in6_addr)) + if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr)) goto send_mal_query; dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev); dev_hold(dev); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 80aeaf9e6e..bfb522e513 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -242,8 +242,10 @@ static inline int compute_score(struct sock *sk, struct net *net, if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; + score = sk->sk_bound_dev_if ? 2 : 1; - score = sk->sk_family == PF_INET ? 2 : 1; + if (sk->sk_family == PF_INET) + score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index b88e0f36cd..8265c67657 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -42,7 +42,7 @@ iptable_raw_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init iptable_raw_table_init(struct net *net) +static int iptable_raw_table_init(struct net *net) { struct ipt_replace *repl; const struct xt_table *table = &packet_raw; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 613432a36f..e61ea428ea 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -20,13 +20,8 @@ #endif #include -static unsigned int defrag4_pernet_id __read_mostly; static DEFINE_MUTEX(defrag4_mutex); -struct defrag4_pernet { - unsigned int users; -}; - static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { @@ -111,19 +106,15 @@ static const struct nf_hook_ops ipv4_defrag_ops[] = { static void __net_exit defrag4_net_exit(struct net *net) { - struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); - - if (nf_defrag->users) { + if (net->nf.defrag_ipv4_users) { nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); - nf_defrag->users = 0; + net->nf.defrag_ipv4_users = 0; } } static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, - .id = &defrag4_pernet_id, - .size = sizeof(struct defrag4_pernet), }; static int __init nf_defrag_init(void) @@ -138,24 +129,23 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv4_enable(struct net *net) { - struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); int err = 0; mutex_lock(&defrag4_mutex); - if (nf_defrag->users == UINT_MAX) { + if (net->nf.defrag_ipv4_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } - if (nf_defrag->users) { - nf_defrag->users++; + if (net->nf.defrag_ipv4_users) { + net->nf.defrag_ipv4_users++; goto out_unlock; } err 
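/*
 * Illustrative sketch, not part of the patch: the icmp_build_probe()
 * hardening above reads just the extension object header first, bounds the
 * claimed length by both the header size and the on-stack scratch object,
 * and only then re-reads header plus identifier in one go before touching
 * the identifier bytes. The struct layout below is a stand-in, not the
 * kernel's.
 */
#include <arpa/inet.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct ext_obj_hdr {
        uint16_t length;             /* big endian, includes this header */
        uint8_t  class_num;
        uint8_t  class_type;
};

struct ext_obj {
        struct ext_obj_hdr hdr;
        uint8_t ident[64];           /* scratch space, like _iio on-stack */
};

/* Returns false for malformed objects (the "send_mal_query" path). */
static bool parse_probe_obj(const uint8_t *pkt, size_t pkt_len,
                            struct ext_obj *out, size_t *ident_len)
{
        size_t claimed;

        if (pkt_len < sizeof(out->hdr))
                return false;
        memcpy(&out->hdr, pkt, sizeof(out->hdr));

        claimed = ntohs(out->hdr.length);
        /* Reject lengths that underflow ident_len or overflow the buffer. */
        if (claimed <= sizeof(out->hdr) || claimed > sizeof(*out))
                return false;

        *ident_len = claimed - sizeof(out->hdr);
        if (pkt_len < claimed)       /* the skb_header_pointer() re-read */
                return false;
        memcpy(out->ident, pkt + sizeof(out->hdr), *ident_len);
        return true;
}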
= nf_register_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); if (err == 0) - nf_defrag->users = 1; + net->nf.defrag_ipv4_users = 1; out_unlock: mutex_unlock(&defrag4_mutex); @@ -165,12 +155,10 @@ EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); void nf_defrag_ipv4_disable(struct net *net) { - struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); - mutex_lock(&defrag4_mutex); - if (nf_defrag->users) { - nf_defrag->users--; - if (nf_defrag->users == 0) + if (net->nf.defrag_ipv4_users) { + net->nf.defrag_ipv4_users--; + if (net->nf.defrag_ipv4_users == 0) nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8851c9463b..8536b2a721 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -390,7 +390,8 @@ static int compute_score(struct sock *sk, struct net *net, dif, sdif); if (!dev_match) return -1; - score += 4; + if (sk->sk_bound_dev_if) + score += 4; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; @@ -1053,7 +1054,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) __be16 dport; u8 tos; int err, is_udplite = IS_UDPLITE(sk); - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; @@ -1361,7 +1362,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, } up->len += size; - if (!(up->corkflag || (flags&MSG_MORE))) + if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE))) ret = udp_push_pending_frames(sk); if (!ret) ret = size; @@ -2662,9 +2663,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: if (val != 0) { - up->corkflag = 1; + WRITE_ONCE(up->corkflag, 1); } else { - up->corkflag = 0; + WRITE_ONCE(up->corkflag, 0); lock_sock(sk); push_pending_frames(sk); release_sock(sk); @@ -2787,7 +2788,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: - val = up->corkflag; + val = READ_ONCE(up->corkflag); break; case UDP_ENCAP: diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 55c290d556..67c9114835 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -106,7 +106,7 @@ static inline int compute_score(struct sock *sk, struct net *net, if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; - score = 1; + score = sk->sk_bound_dev_if ? 
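/*
 * Illustrative sketch, not part of the patch: the defrag enable/disable
 * pair above keeps a plain counter directly in struct net and serializes
 * it with a mutex, including the UINT_MAX overflow guard; the IPv6 variant
 * later in this patch follows the same shape. register_hooks() and
 * unregister_hooks() are stand-ins for nf_(un)register_net_hooks().
 */
#include <limits.h>
#include <pthread.h>

static pthread_mutex_t defrag_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned int defrag_users;

static int register_hooks(void)    { return 0; }   /* stub */
static void unregister_hooks(void) { }             /* stub */

static int defrag_enable(void)
{
        int err = 0;

        pthread_mutex_lock(&defrag_mutex);
        if (defrag_users == UINT_MAX) {
                err = -1;                    /* -EOVERFLOW */
        } else if (defrag_users) {
                defrag_users++;              /* hooks already registered */
        } else {
                err = register_hooks();      /* first user registers */
                if (err == 0)
                        defrag_users = 1;
        }
        pthread_mutex_unlock(&defrag_mutex);
        return err;
}

static void defrag_disable(void)
{
        pthread_mutex_lock(&defrag_mutex);
        if (defrag_users && --defrag_users == 0)
                unregister_hooks();          /* last user unregisters */
        pthread_mutex_unlock(&defrag_mutex);
}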
2 : 1; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index 5e89610048..d128172bb5 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -770,6 +770,66 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, data += sizeof(__be32); } + /* bit12 undefined: filled with empty value */ + if (trace->type.bit12) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit13 undefined: filled with empty value */ + if (trace->type.bit13) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit14 undefined: filled with empty value */ + if (trace->type.bit14) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit15 undefined: filled with empty value */ + if (trace->type.bit15) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit16 undefined: filled with empty value */ + if (trace->type.bit16) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit17 undefined: filled with empty value */ + if (trace->type.bit17) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit18 undefined: filled with empty value */ + if (trace->type.bit18) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit19 undefined: filled with empty value */ + if (trace->type.bit19) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit20 undefined: filled with empty value */ + if (trace->type.bit20) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit21 undefined: filled with empty value */ + if (trace->type.bit21) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + /* opaque state snapshot */ if (trace->type.bit22) { if (!sc) { @@ -791,16 +851,10 @@ void ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_schema *sc; u8 sclen = 0; - /* Skip if Overflow flag is set OR - * if an unknown type (bit 12-21) is set + /* Skip if Overflow flag is set */ - if (trace->overflow || - trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | - trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | - trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | - trace->type.bit21) { + if (trace->overflow) return; - } /* NodeLen does not include Opaque State Snapshot length. 
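/*
 * Illustrative sketch, not part of the patch: the ioam6.c hunk above stops
 * dropping the whole trace when an undefined trace-type bit (12-21) is set
 * and instead emits the 32-bit "unavailable" value for each such bit; the
 * rejection moves to configuration time in the ioam6_iptunnel.c hunk that
 * follows. A compact model of the fill loop, assuming the 24-bit trace
 * type is left-aligned in a host u32 with bit 0 as the most significant
 * bit; U32_UNAVAILABLE stands in for IOAM6_U32_UNAVAILABLE.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

#define U32_UNAVAILABLE 0xffffffffU

/* Append one big-endian "empty" word per set bit in [12, 21]. */
static uint8_t *fill_undefined_bits(uint32_t type_host, uint8_t *data)
{
        uint32_t be = htonl(U32_UNAVAILABLE);

        for (int bit = 12; bit <= 21; bit++) {
                if (type_host & (1U << (31 - bit))) {
                        memcpy(data, &be, sizeof(be));
                        data += sizeof(be);
                }
        }
        return data;
}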
We need to * take it into account if the corresponding bit is set (bit 22) and diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index f9ee04541c..9b7b726f8f 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -75,7 +75,11 @@ static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) u32 fields; if (!trace->type_be32 || !trace->remlen || - trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4) + trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 || + trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | + trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | + trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | + trace->type.bit21) return false; trace->nodelen = 0; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index de2cf3943b..a579ea14a6 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -273,6 +273,7 @@ ip6t_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ + acpar.fragoff = 0; acpar.hotdrop = false; acpar.state = state; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index a010841527..5c47be29b9 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -33,7 +33,7 @@ static const char nf_frags_cache_name[] = "nf-frags"; -unsigned int nf_frag_pernet_id __read_mostly; +static unsigned int nf_frag_pernet_id __read_mostly; static struct inet_frags nf_frags; static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net) diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index e8a59d8bf2..cb4eb1d2c6 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -25,8 +25,6 @@ #include #include -extern unsigned int nf_frag_pernet_id; - static DEFINE_MUTEX(defrag6_mutex); static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, @@ -91,12 +89,10 @@ static const struct nf_hook_ops ipv6_defrag_ops[] = { static void __net_exit defrag6_net_exit(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); - - if (nf_frag->users) { + if (net->nf.defrag_ipv6_users) { nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); - nf_frag->users = 0; + net->nf.defrag_ipv6_users = 0; } } @@ -134,24 +130,23 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv6_enable(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); int err = 0; mutex_lock(&defrag6_mutex); - if (nf_frag->users == UINT_MAX) { + if (net->nf.defrag_ipv6_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } - if (nf_frag->users) { - nf_frag->users++; + if (net->nf.defrag_ipv6_users) { + net->nf.defrag_ipv6_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); if (err == 0) - nf_frag->users = 1; + net->nf.defrag_ipv6_users = 1; out_unlock: mutex_unlock(&defrag6_mutex); @@ -161,12 +156,10 @@ EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); void nf_defrag_ipv6_disable(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); - mutex_lock(&defrag6_mutex); - if (nf_frag->users) { - nf_frag->users--; - if (nf_frag->users == 0) + if (net->nf.defrag_ipv6_users) { + net->nf.defrag_ipv6_users--; + if (net->nf.defrag_ipv6_users == 0) 
nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dbc2240239..9b9ef09382 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5681,14 +5681,15 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, - rt->fib6_nh->fib_nh_weight, AF_INET6) < 0) + rt->fib6_nh->fib_nh_weight, AF_INET6, + 0) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, - AF_INET6) < 0) + AF_INET6, 0) < 0) goto nla_put_failure; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ea53847b5b..8d785232b4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -133,7 +133,8 @@ static int compute_score(struct sock *sk, struct net *net, dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); if (!dev_match) return -1; - score++; + if (sk->sk_bound_dev_if) + score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; @@ -1303,7 +1304,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int addr_len = msg->msg_namelen; bool connected = false; int ulen = len; - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index efbefcbac3..7cab1cf09b 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -60,7 +60,10 @@ static struct mesh_table *mesh_table_alloc(void) atomic_set(&newtbl->entries, 0); spin_lock_init(&newtbl->gates_lock); spin_lock_init(&newtbl->walk_lock); - rhashtable_init(&newtbl->rhead, &mesh_rht_params); + if (rhashtable_init(&newtbl->rhead, &mesh_rht_params)) { + kfree(newtbl); + return NULL; + } return newtbl; } diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 204830a552..3fbd0b9ff9 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -2,6 +2,7 @@ /* * Copyright 2012-2013, Marco Porsch * Copyright 2012-2013, cozybit Inc. 
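/*
 * Illustrative sketch, not part of the patch: the udp.c and udpv6.c hunks
 * above wrap up->corkflag in READ_ONCE()/WRITE_ONCE() because the flag is
 * written under the socket lock but read locklessly on the send path. The
 * closest portable analogue is a relaxed C11 atomic; struct udp_state is
 * invented for the sketch.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct udp_state {
        atomic_int corkflag;
};

/* setsockopt(UDP_CORK): writer side, serialized by the socket lock. */
static void udp_set_cork(struct udp_state *up, bool on)
{
        atomic_store_explicit(&up->corkflag, on, memory_order_relaxed);
}

/* sendmsg() fast path: lockless reader, exactly one untorn load. */
static bool udp_corkreq(struct udp_state *up, bool msg_more)
{
        return atomic_load_explicit(&up->corkflag, memory_order_relaxed) ||
               msg_more;
}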
+ * Copyright (C) 2021 Intel Corporation */ #include "mesh.h" @@ -588,7 +589,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, /* only transmit to PS STA with announced, non-zero awake window */ if (test_sta_flag(sta, WLAN_STA_PS_STA) && - (!elems->awake_window || !le16_to_cpu(*elems->awake_window))) + (!elems->awake_window || !get_unaligned_le16(elems->awake_window))) return; if (!test_sta_flag(sta, WLAN_STA_MPSP_OWNER)) diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index e5935e3d7a..8c6416129d 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -392,10 +392,6 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; - if (ieee80211_is_tx_data(txrc->skb) && - info->flags & IEEE80211_TX_CTL_NO_ACK) - return false; - if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 99ed68f7dc..c4071b015c 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4131,7 +4131,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) if (!bssid) return false; if (ether_addr_equal(sdata->vif.addr, hdr->addr2) || - ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2)) + ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2) || + !is_valid_ether_addr(hdr->addr2)) return false; if (ieee80211_is_beacon(hdr->frame_control)) return true; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2d1193ed3e..8921088a5d 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2209,7 +2209,11 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, } vht_mcs = iterator.this_arg[4] >> 4; + if (vht_mcs > 11) + vht_mcs = 0; vht_nss = iterator.this_arg[4] & 0xF; + if (!vht_nss || vht_nss > 8) + vht_nss = 1; break; /* @@ -3380,6 +3384,14 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) goto out; + /* If n == 2, the "while (*frag_tail)" loop above didn't execute + * and frag_tail should be &skb_shinfo(head)->frag_list. + * However, ieee80211_amsdu_prepare_head() can reallocate it. + * Reload frag_tail to have it pointing to the correct place. + */ + if (n == 2) + frag_tail = &skb_shinfo(head)->frag_list; + /* * Pad out the previous subframe to a multiple of 4 by adding the * padding to the next one, that's being added. 
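/*
 * Illustrative sketch, not part of the patch: the tx.c radiotap hunk above
 * clamps attacker-controlled VHT fields from an injected frame before they
 * can index rate tables. MCS above 11 and NSS of 0 or above 8 are out of
 * range for VHT, so they are forced to safe defaults.
 */
#include <stdint.h>

static void sanitize_vht(uint8_t arg4, uint8_t *mcs, uint8_t *nss)
{
        *mcs = arg4 >> 4;
        if (*mcs > 11)               /* VHT defines MCS 0..11 */
                *mcs = 0;

        *nss = arg4 & 0xF;
        if (*nss == 0 || *nss > 8)   /* VHT allows 1..8 spatial streams */
                *nss = 1;
}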
Note that head->len diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index bca47fad5a..4eed23e276 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -520,6 +520,9 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; @@ -749,6 +752,9 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index f48eb6315b..292374fb07 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -36,7 +36,7 @@ static int mptcp_diag_dump_one(struct netlink_callback *cb, struct sock *sk; net = sock_net(in_skb->sk); - msk = mptcp_token_get_sock(req->id.idiag_cookie[0]); + msk = mptcp_token_get_sock(net, req->id.idiag_cookie[0]); if (!msk) goto out_nosk; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index c4f9a5ce38..050eea2315 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1718,9 +1718,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) list_for_each_entry(entry, &pernet->local_addr_list, list) { if (addresses_equal(&entry->addr, &addr.addr, true)) { - ret = mptcp_nl_addr_backup(net, &entry->addr, bkup); - if (ret) - return ret; + mptcp_nl_addr_backup(net, &entry->addr, bkup); if (bkup) entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index dbcebf5679..d073b21113 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -528,7 +528,6 @@ static bool mptcp_check_data_fin(struct sock *sk) sk->sk_shutdown |= RCV_SHUTDOWN; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); switch (sk->sk_state) { case TCP_ESTABLISHED: @@ -742,10 +741,9 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); - if (move_skbs_to_msk(msk, ssk)) { - set_bit(MPTCP_DATA_READY, &msk->flags); + if (move_skbs_to_msk(msk, ssk)) sk->sk_data_ready(sk); - } + mptcp_data_unlock(sk); } @@ -847,7 +845,6 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) sk->sk_shutdown |= RCV_SHUTDOWN; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); } @@ -1759,21 +1756,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return copied ? 
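/*
 * Illustrative sketch, not part of the patch: the two wpa.c hunks below
 * re-derive `hdr` from rx->skb->data because the preceding decrypt step may
 * reallocate the skb and leave the old pointer dangling. The same rule in
 * plain C: a pointer into a buffer is dead after any call that may realloc
 * it, so recompute it from the (possibly moved) base.
 */
#include <stdlib.h>
#include <string.h>

struct buf {
        unsigned char *data;
        size_t len;
};

static int grow(struct buf *b, size_t extra)
{
        unsigned char *p = realloc(b->data, b->len + extra);

        if (!p)
                return -1;
        b->data = p;                 /* the base may have moved */
        b->len += extra;
        return 0;
}

static int process(struct buf *b)
{
        unsigned char *hdr = b->data;

        if (grow(b, 16) < 0)
                return -1;

        hdr = b->data;               /* reload: old hdr may now dangle */
        memset(hdr, 0, 4);
        return 0;
}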
: ret; } -static void mptcp_wait_data(struct sock *sk, long *timeo) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct mptcp_sock *msk = mptcp_sk(sk); - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - - sk_wait_event(sk, timeo, - test_bit(MPTCP_DATA_READY, &msk->flags), &wait); - - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); -} - static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, @@ -2077,19 +2059,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld", timeo); - mptcp_wait_data(sk, &timeo); - } - - if (skb_queue_empty_lockless(&sk->sk_receive_queue) && - skb_queue_empty(&msk->receive_queue)) { - /* entire backlog drained, clear DATA_READY. */ - clear_bit(MPTCP_DATA_READY, &msk->flags); - - /* .. race-breaker: ssk might have gotten new data - * after last __mptcp_move_skbs() returned false. - */ - if (unlikely(__mptcp_move_skbs(msk))) - set_bit(MPTCP_DATA_READY, &msk->flags); + sk_wait_data(sk, &timeo, NULL); } out_err: @@ -2098,9 +2068,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, tcp_recv_timestamp(msg, sk, &tss); } - pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", - msk, test_bit(MPTCP_DATA_READY, &msk->flags), - skb_queue_empty_lockless(&sk->sk_receive_queue), copied); + pr_debug("msk=%p rx queue empty=%d:%d copied=%d", + msk, skb_queue_empty_lockless(&sk->sk_receive_queue), + skb_queue_empty(&msk->receive_queue), copied); if (!(flags & MSG_PEEK)) mptcp_rcv_space_adjust(msk, copied); @@ -2368,7 +2338,6 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) inet_sk_state_store(sk, TCP_CLOSE); sk->sk_shutdown = SHUTDOWN_MASK; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); mptcp_close_wake_up(sk); @@ -2735,7 +2704,7 @@ static void mptcp_close(struct sock *sk, long timeout) inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; mptcp_for_each_subflow(mptcp_sk(sk), subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow = lock_sock_fast(ssk); + bool slow = lock_sock_fast_nested(ssk); sock_orphan(ssk); unlock_sock_fast(ssk, slow); @@ -3385,8 +3354,14 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, static __poll_t mptcp_check_readable(struct mptcp_sock *msk) { - return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : - 0; + /* Concurrent splices from sk_receive_queue into receive_queue will + * always show at least one non-empty queue when checked in this order. + */ + if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && + skb_queue_empty_lockless(&msk->receive_queue)) + return 0; + + return EPOLLIN | EPOLLRDNORM; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) @@ -3421,7 +3396,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) - return mptcp_check_readable(msk); + return test_bit(MPTCP_DATA_READY, &msk->flags) ? 
EPOLLIN | EPOLLRDNORM : 0; if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d3e6fd1615..dc984676c5 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -709,7 +709,7 @@ int mptcp_token_new_connect(struct sock *sk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); bool mptcp_token_exists(u32 token); -struct mptcp_sock *mptcp_token_get_sock(u32 token); +struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token); struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num); void mptcp_token_destroy(struct mptcp_sock *msk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 1de7ce883c..6172f380df 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -86,7 +86,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) struct mptcp_sock *msk; int local_id; - msk = mptcp_token_get_sock(subflow_req->token); + msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token); if (!msk) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); return NULL; diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c index 37127781ae..7f22526346 100644 --- a/net/mptcp/syncookies.c +++ b/net/mptcp/syncookies.c @@ -108,18 +108,12 @@ bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subfl e->valid = 0; - msk = mptcp_token_get_sock(e->token); + msk = mptcp_token_get_sock(net, e->token); if (!msk) { spin_unlock_bh(&join_entry_locks[i]); return false; } - /* If this fails, the token got re-used in the mean time by another - * mptcp socket in a different netns, i.e. entry is outdated. - */ - if (!net_eq(sock_net((struct sock *)msk), net)) - goto err_put; - subflow_req->remote_nonce = e->remote_nonce; subflow_req->local_nonce = e->local_nonce; subflow_req->backup = e->backup; @@ -128,11 +122,6 @@ bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subfl subflow_req->msk = msk; spin_unlock_bh(&join_entry_locks[i]); return true; - -err_put: - spin_unlock_bh(&join_entry_locks[i]); - sock_put((struct sock *)msk); - return false; } void __init mptcp_join_cookie_init(void) diff --git a/net/mptcp/token.c b/net/mptcp/token.c index a98e554b03..e581b341c5 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -231,6 +231,7 @@ bool mptcp_token_exists(u32 token) /** * mptcp_token_get_sock - retrieve mptcp connection sock using its token + * @net: restrict to this namespace * @token: token of the mptcp connection to retrieve * * This function returns the mptcp connection structure with the given token. @@ -238,7 +239,7 @@ bool mptcp_token_exists(u32 token) * * returns NULL if no connection with the given token value exists. 
*/ -struct mptcp_sock *mptcp_token_get_sock(u32 token) +struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token) { struct hlist_nulls_node *pos; struct token_bucket *bucket; @@ -251,11 +252,15 @@ struct mptcp_sock *mptcp_token_get_sock(u32 token) again: sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { msk = mptcp_sk(sk); - if (READ_ONCE(msk->token) != token) + if (READ_ONCE(msk->token) != token || + !net_eq(sock_net(sk), net)) continue; + if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto not_found; - if (READ_ONCE(msk->token) != token) { + + if (READ_ONCE(msk->token) != token || + !net_eq(sock_net(sk), net)) { sock_put(sk); goto again; } diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c index e1bd6f0a06..5d984bec1c 100644 --- a/net/mptcp/token_test.c +++ b/net/mptcp/token_test.c @@ -11,6 +11,7 @@ static struct mptcp_subflow_request_sock *build_req_sock(struct kunit *test) GFP_USER); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, req); mptcp_token_init_request((struct request_sock *)req); + sock_net_set((struct sock *)req, &init_net); return req; } @@ -22,7 +23,7 @@ static void mptcp_token_test_req_basic(struct kunit *test) KUNIT_ASSERT_EQ(test, 0, mptcp_token_new_request((struct request_sock *)req)); KUNIT_EXPECT_NE(test, 0, (int)req->token); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(req->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, req->token)); /* cleanup */ mptcp_token_destroy_request((struct request_sock *)req); @@ -55,6 +56,7 @@ static struct mptcp_sock *build_msk(struct kunit *test) msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); refcount_set(&((struct sock *)msk)->sk_refcnt, 1); + sock_net_set((struct sock *)msk, &init_net); return msk; } @@ -74,11 +76,11 @@ static void mptcp_token_test_msk_basic(struct kunit *test) mptcp_token_new_connect((struct sock *)icsk)); KUNIT_EXPECT_NE(test, 0, (int)ctx->token); KUNIT_EXPECT_EQ(test, ctx->token, msk->token); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, ctx->token)); KUNIT_EXPECT_EQ(test, 2, (int)refcount_read(&sk->sk_refcnt)); mptcp_token_destroy(msk); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, ctx->token)); } static void mptcp_token_test_accept(struct kunit *test) @@ -90,11 +92,11 @@ static void mptcp_token_test_accept(struct kunit *test) mptcp_token_new_request((struct request_sock *)req)); msk->token = req->token; mptcp_token_accept(req, msk); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, msk->token)); /* this is now a no-op */ mptcp_token_destroy_request((struct request_sock *)req); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, msk->token)); /* cleanup */ mptcp_token_destroy(msk); @@ -116,7 +118,7 @@ static void mptcp_token_test_destroyed(struct kunit *test) /* simulate race on removal */ refcount_set(&sk->sk_refcnt, 0); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, msk->token)); /* cleanup */ mptcp_token_destroy(msk); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 6186358eac..6e39130843 100644 --- 
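/*
 * Illustrative sketch, not part of the patch: the token.c hunk below makes
 * the lookup netns-aware and shows the classic RCU-era "check, take a
 * reference, re-check" dance: if the refcount already hit zero the object
 * is dying and must be skipped; if the keys changed while the reference was
 * being acquired, drop it and restart. Userspace model with C11 atomics;
 * struct msk_entry is invented for the sketch.
 */
#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

struct msk_entry {
        atomic_uint refcnt;
        _Atomic uint32_t token;
        const void *net;             /* owning namespace */
        struct msk_entry *next;
};

/* refcount_inc_not_zero() analogue: never resurrect a zero refcount. */
static bool ref_get_not_zero(atomic_uint *r)
{
        unsigned int old = atomic_load(r);

        while (old != 0) {
                if (atomic_compare_exchange_weak(r, &old, old + 1))
                        return true;
        }
        return false;                /* object is being freed */
}

static struct msk_entry *token_get(struct msk_entry *bucket,
                                   const void *net, uint32_t token)
{
again:
        for (struct msk_entry *e = bucket; e; e = e->next) {
                if (atomic_load(&e->token) != token || e->net != net)
                        continue;
                if (!ref_get_not_zero(&e->refcnt))
                        return NULL;
                /* re-check: the entry may have been reused concurrently */
                if (atomic_load(&e->token) != token || e->net != net) {
                        atomic_fetch_sub(&e->refcnt, 1);   /* sock_put() */
                        goto again;
                }
                return e;
        }
        return NULL;
}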
a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -130,11 +130,11 @@ htable_size(u8 hbits) { size_t hsize; - /* We must fit both into u32 in jhash and size_t */ + /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */ if (hbits > 31) return 0; hsize = jhash_size(hbits); - if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) + if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index c100c6b112..2c467c422d 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1468,6 +1468,10 @@ int __init ip_vs_conn_init(void) int idx; /* Compute size and mask */ + if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) { + pr_info("conn_tab_bits not in [8, 20]. Using default value\n"); + ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; + } ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 94e18fb969..770a63103c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -74,10 +74,14 @@ static __read_mostly struct kmem_cache *nf_conntrack_cachep; static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; +/* serialize hash resizes and nf_ct_iterate_cleanup */ +static DEFINE_MUTEX(nf_conntrack_mutex); + #define GC_SCAN_INTERVAL (120u * HZ) #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) -#define MAX_CHAINLEN 64u +#define MIN_CHAINLEN 8u +#define MAX_CHAINLEN (32u - MIN_CHAINLEN) static struct conntrack_gc_work conntrack_gc_work; @@ -188,11 +192,13 @@ seqcount_spinlock_t nf_conntrack_generation __read_mostly; static siphash_key_t nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, + unsigned int zoneid, const struct net *net) { struct { struct nf_conntrack_man src; union nf_inet_addr dst_addr; + unsigned int zone; u32 net_mix; u16 dport; u16 proto; @@ -205,6 +211,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, /* The direction must be ignored, so handle usable members manually. 
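/*
 * Illustrative sketch, not part of the patch: the ipset hunk above tightens
 * the allocation bound from SIZE_MAX to INT_MAX because kvmalloc_node()
 * rejects larger requests. The overflow check divides before multiplying,
 * so the comparison itself cannot wrap; struct htable/hbucket are
 * stand-ins for the kernel's types.
 */
#include <limits.h>
#include <stddef.h>

struct htable { int dummy; };
struct hbucket;

/* Returns 0 if 2^hbits buckets cannot be allocated safely. */
static size_t htable_size(unsigned char hbits)
{
        size_t hsize;

        if (hbits > 31)              /* must fit the u32 jhash space */
                return 0;
        hsize = (size_t)1 << hbits;
        if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *)
            < hsize)
                return 0;

        return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
}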
*/ combined.src = tuple->src; combined.dst_addr = tuple->dst.u3; + combined.zone = zoneid; combined.net_mix = net_hash_mix(net); combined.dport = (__force __u16)tuple->dst.u.all; combined.proto = tuple->dst.protonum; @@ -219,15 +226,17 @@ static u32 scale_hash(u32 hash) static u32 __hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple, + unsigned int zoneid, unsigned int size) { - return reciprocal_scale(hash_conntrack_raw(tuple, net), size); + return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); } static u32 hash_conntrack(const struct net *net, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, + unsigned int zoneid) { - return scale_hash(hash_conntrack_raw(tuple, net)); + return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); } static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, @@ -650,9 +659,11 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); clean_from_lists(ct); @@ -819,8 +830,20 @@ struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, net)); + unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); + struct nf_conntrack_tuple_hash *thash; + + thash = __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, zone_id, net)); + + if (thash) + return thash; + + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); + if (rid != zone_id) + return __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, rid, net)); + return thash; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -842,6 +865,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int max_chainlen; unsigned int chainlen = 0; unsigned int sequence; int err = -EEXIST; @@ -852,18 +876,22 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); + /* See if there's one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) + if (chainlen++ > max_chainlen) goto chaintoolong; } @@ -873,7 +901,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) + if (chainlen++ > 
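/*
 * Illustrative sketch, not part of the patch: with the zone now mixed into
 * the hash (hunks above), the insert paths also randomize the permitted
 * chain length per operation, so an attacker probing for the "chain too
 * long" drop can no longer calibrate against a fixed 64-entry threshold.
 * random() stands in for prandom_u32_max(); the constants mirror the
 * patch's arithmetic, giving a bound in [8, 31].
 */
#include <stdlib.h>

#define MIN_CHAINLEN 8u
#define MAX_CHAINLEN (32u - MIN_CHAINLEN)

static unsigned int pick_max_chainlen(void)
{
        return MIN_CHAINLEN + (unsigned int)(random() % MAX_CHAINLEN);
}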
max_chainlen) goto chaintoolong; } @@ -1103,8 +1131,8 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, int __nf_conntrack_confirm(struct sk_buff *skb) { + unsigned int chainlen = 0, sequence, max_chainlen; const struct nf_conntrack_zone *zone; - unsigned int chainlen = 0, sequence; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -1133,8 +1161,8 @@ __nf_conntrack_confirm(struct sk_buff *skb) hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = scale_hash(hash); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related @@ -1168,6 +1196,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) goto dying; } + max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ @@ -1175,7 +1204,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) + if (chainlen++ > max_chainlen) goto chaintoolong; } @@ -1184,7 +1213,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) { + if (chainlen++ > max_chainlen) { chaintoolong: nf_ct_add_to_dying_list(ct); NF_CT_STAT_INC(net, chaintoolong); @@ -1246,7 +1275,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, rcu_read_lock(); begin: nf_conntrack_get_ht(&ct_hash, &hsize); - hash = __hash_conntrack(net, tuple, hsize); + hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); @@ -1687,8 +1716,8 @@ resolve_normal_ct(struct nf_conn *tmpl, struct nf_conntrack_tuple_hash *h; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; + u32 hash, zone_id, rid; struct nf_conn *ct; - u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, @@ -1699,8 +1728,20 @@ resolve_normal_ct(struct nf_conn *tmpl, /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple, state->net); + + zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); + hash = hash_conntrack_raw(&tuple, zone_id, state->net); h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); + + if (!h) { + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); + if (zone_id != rid) { + u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); + + h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); + } + } + if (!h) { h = init_conntrack(state->net, tmpl, &tuple, skb, dataoff, hash); @@ -2225,28 +2266,31 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), spinlock_t *lockp; for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { + struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; + + if (hlist_nulls_empty(hslot)) + continue; + lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); - if (*bucket < nf_conntrack_htable_size) { - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { - if 
(NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) - continue; - /* All nf_conn objects are added to hash table twice, one - * for original direction tuple, once for the reply tuple. - * - * Exception: In the IPS_NAT_CLASH case, only the reply - * tuple is added (the original tuple already existed for - * a different object). - * - * We only need to call the iterator once for each - * conntrack, so we just use the 'reply' direction - * tuple while iterating. - */ - ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) - goto found; - } + hlist_nulls_for_each_entry(h, n, hslot, hnnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) + continue; + /* All nf_conn objects are added to hash table twice, one + * for original direction tuple, once for the reply tuple. + * + * Exception: In the IPS_NAT_CLASH case, only the reply + * tuple is added (the original tuple already existed for + * a different object). + * + * We only need to call the iterator once for each + * conntrack, so we just use the 'reply' direction + * tuple while iterating. + */ + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + goto found; } spin_unlock(lockp); local_bh_enable(); @@ -2264,26 +2308,20 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) { - unsigned int bucket = 0, sequence; + unsigned int bucket = 0; struct nf_conn *ct; might_sleep(); - for (;;) { - sequence = read_seqcount_begin(&nf_conntrack_generation); + mutex_lock(&nf_conntrack_mutex); + while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { + /* Time to push up daises... */ - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { - /* Time to push up daises... */ - - nf_ct_delete(ct, portid, report); - nf_ct_put(ct); - cond_resched(); - } - - if (!read_seqcount_retry(&nf_conntrack_generation, sequence)) - break; - bucket = 0; + nf_ct_delete(ct, portid, report); + nf_ct_put(ct); + cond_resched(); } + mutex_unlock(&nf_conntrack_mutex); } struct iter_data { @@ -2519,8 +2557,10 @@ int nf_conntrack_hash_resize(unsigned int hashsize) if (!hash) return -ENOMEM; + mutex_lock(&nf_conntrack_mutex); old_size = nf_conntrack_htable_size; if (old_size == hashsize) { + mutex_unlock(&nf_conntrack_mutex); kvfree(hash); return 0; } @@ -2537,12 +2577,16 @@ int nf_conntrack_hash_resize(unsigned int hashsize) for (i = 0; i < nf_conntrack_htable_size; i++) { while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { + unsigned int zone_id; + h = hlist_nulls_entry(nf_conntrack_hash[i].first, struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); + + zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); bucket = __hash_conntrack(nf_ct_net(ct), - &h->tuple, hashsize); + &h->tuple, zone_id, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } @@ -2556,6 +2600,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize) nf_conntrack_all_unlock(); local_bh_enable(); + mutex_unlock(&nf_conntrack_mutex); + synchronize_net(); kvfree(old_hash); return 0; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7008961f5c..2731176839 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -150,13 +150,16 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) /* We keep an extra hash for each conntrack, for fast searching. 
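/*
 * Illustrative sketch, not part of the patch: the cleanup rework above
 * drops the seqcount retry loop, which restarted the whole table walk
 * whenever a resize raced with it, and instead takes nf_conntrack_mutex,
 * which the resize path now also holds, so a single walk suffices.
 * Skeleton of the pattern with stubbed helpers.
 */
#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER;

struct entry;

static struct entry *next_victim(int (*iter)(struct entry *, void *),
                                 void *data, unsigned int *bucket)
{
        (void)iter; (void)data; (void)bucket;
        return NULL;                 /* stub: walk buckets, return a match */
}

static void delete_entry(struct entry *e)
{
        (void)e;                     /* stub: unhash and free the entry */
}

static void iterate_cleanup(int (*iter)(struct entry *, void *), void *data)
{
        unsigned int bucket = 0;
        struct entry *e;

        /* Resizes hold the same mutex, so the walk never restarts. */
        pthread_mutex_lock(&table_mutex);
        while ((e = next_victim(iter, data, &bucket)) != NULL)
                delete_entry(e);
        pthread_mutex_unlock(&table_mutex);
}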
*/ static unsigned int -hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, + const struct nf_conntrack_zone *zone, + const struct nf_conntrack_tuple *tuple) { unsigned int hash; struct { struct nf_conntrack_man src; u32 net_mix; u32 protonum; + u32 zone; } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); @@ -165,9 +168,13 @@ hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) /* Original src, to ensure we map it consistently if poss. */ combined.src = tuple->src; - combined.net_mix = net_hash_mix(n); + combined.net_mix = net_hash_mix(net); combined.protonum = tuple->dst.protonum; + /* Zone ID can be used provided its valid for both directions */ + if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) + combined.zone = zone->id; + hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); @@ -272,7 +279,7 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { - unsigned int h = hash_by_src(net, tuple); + unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { @@ -619,7 +626,7 @@ nf_nat_setup_info(struct nf_conn *ct, unsigned int srchash; spinlock_t *lock; - srchash = hash_by_src(net, + srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); @@ -788,7 +795,7 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; - h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d463..acd73f717a 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -9,8 +9,19 @@ #include +struct masq_dev_work { + struct work_struct work; + struct net *net; + union nf_inet_addr addr; + int ifindex; + int (*iter)(struct nf_conn *i, void *data); +}; + +#define MAX_MASQ_WORKER_COUNT 16 + static DEFINE_MUTEX(masq_mutex); static unsigned int masq_refcnt __read_mostly; +static atomic_t masq_worker_count __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, @@ -63,13 +74,71 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); -static int device_cmp(struct nf_conn *i, void *ifindex) +static void iterate_cleanup_work(struct work_struct *work) +{ + struct masq_dev_work *w; + + w = container_of(work, struct masq_dev_work, work); + + nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0); + + put_net(w->net); + kfree(w); + atomic_dec(&masq_worker_count); + module_put(THIS_MODULE); +} + +/* Iterate conntrack table in the background and remove conntrack entries + * that use the device/address being removed. + * + * In case too many work items have been queued already or memory allocation + * fails iteration is skipped, conntrack entries will time out eventually. 
+ */ +static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, + int ifindex, + int (*iter)(struct nf_conn *i, void *data), + gfp_t gfp_flags) +{ + struct masq_dev_work *w; + + if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT) + return; + + net = maybe_get_net(net); + if (!net) + return; + + if (!try_module_get(THIS_MODULE)) + goto err_module; + + w = kzalloc(sizeof(*w), gfp_flags); + if (w) { + /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ + atomic_inc(&masq_worker_count); + + INIT_WORK(&w->work, iterate_cleanup_work); + w->ifindex = ifindex; + w->net = net; + w->iter = iter; + if (addr) + w->addr = *addr; + schedule_work(&w->work); + return; + } + + module_put(THIS_MODULE); + err_module: + put_net(net); +} + +static int device_cmp(struct nf_conn *i, void *arg) { const struct nf_conn_nat *nat = nfct_nat(i); + const struct masq_dev_work *w = arg; if (!nat) return 0; - return nat->masq_index == (int)(long)ifindex; + return nat->masq_index == w->ifindex; } static int masq_device_event(struct notifier_block *this, @@ -85,8 +154,8 @@ static int masq_device_event(struct notifier_block *this, * and forget them. */ - nf_ct_iterate_cleanup_net(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); + nf_nat_masq_schedule(net, NULL, dev->ifindex, + device_cmp, GFP_KERNEL); } return NOTIFY_DONE; @@ -94,35 +163,45 @@ static int masq_device_event(struct notifier_block *this, static int inet_cmp(struct nf_conn *ct, void *ptr) { - struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; - struct net_device *dev = ifa->ifa_dev->dev; struct nf_conntrack_tuple *tuple; + struct masq_dev_work *w = ptr; - if (!device_cmp(ct, (void *)(long)dev->ifindex)) + if (!device_cmp(ct, ptr)) return 0; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - return ifa->ifa_address == tuple->dst.u3.ip; + return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; - struct net *net = dev_net(idev->dev); + const struct in_ifaddr *ifa = ptr; + const struct in_device *idev; + const struct net_device *dev; + union nf_inet_addr addr; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have * no work to do. Otherwise this is an individual address removal * and we have to perform the flush. 
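/*
 * Illustrative sketch, not part of the patch: the masquerade rework here
 * funnels both the IPv4 and IPv6 notifiers through one helper that defers
 * the conntrack walk to a work item, capped by an atomic counter. If too
 * many cleanups are already queued, or the allocation fails, the walk is
 * simply skipped and stale entries age out on their own. Userspace
 * skeleton using a detached thread as the "work queue".
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

#define MAX_WORKERS 16

static atomic_int worker_count;

struct cleanup_work {
        int ifindex;
};

static void *cleanup_fn(void *arg)
{
        struct cleanup_work *w = arg;

        /* ... iterate the table, dropping entries for w->ifindex ... */
        free(w);
        atomic_fetch_sub(&worker_count, 1);
        return NULL;
}

/* Best effort: briefly overshooting MAX_WORKERS is harmless. */
static void schedule_cleanup(int ifindex)
{
        struct cleanup_work *w;
        pthread_t tid;

        if (atomic_load(&worker_count) > MAX_WORKERS)
                return;

        w = calloc(1, sizeof(*w));
        if (!w)
                return;              /* entries will time out eventually */

        atomic_fetch_add(&worker_count, 1);
        w->ifindex = ifindex;
        if (pthread_create(&tid, NULL, cleanup_fn, w) == 0) {
                pthread_detach(tid);
                return;
        }
        atomic_fetch_sub(&worker_count, 1);
        free(w);
}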
*/ + idev = ifa->ifa_dev; if (idev->dead) return NOTIFY_DONE; - if (event == NETDEV_DOWN) - nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); + memset(&addr, 0, sizeof(addr)); + + addr.ip = ifa->ifa_address; + + dev = idev->dev; + nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex, + inet_cmp, GFP_KERNEL); return NOTIFY_DONE; } @@ -136,8 +215,6 @@ static struct notifier_block masq_inet_notifier = { }; #if IS_ENABLED(CONFIG_IPV6) -static atomic_t v6_worker_count __read_mostly; - static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, @@ -187,40 +264,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); -struct masq_dev_work { - struct work_struct work; - struct net *net; - struct in6_addr addr; - int ifindex; -}; - -static int inet6_cmp(struct nf_conn *ct, void *work) -{ - struct masq_dev_work *w = (struct masq_dev_work *)work; - struct nf_conntrack_tuple *tuple; - - if (!device_cmp(ct, (void *)(long)w->ifindex)) - return 0; - - tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - - return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6); -} - -static void iterate_cleanup_work(struct work_struct *work) -{ - struct masq_dev_work *w; - - w = container_of(work, struct masq_dev_work, work); - - nf_ct_iterate_cleanup_net(w->net, inet6_cmp, (void *)w, 0, 0); - - put_net(w->net); - kfree(w); - atomic_dec(&v6_worker_count); - module_put(THIS_MODULE); -} - /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). * * Defer it to the system workqueue. @@ -233,36 +276,19 @@ static int masq_inet6_event(struct notifier_block *this, { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; - struct masq_dev_work *w; - struct net *net; + union nf_inet_addr addr; - if (event != NETDEV_DOWN || atomic_read(&v6_worker_count) >= 16) + if (event != NETDEV_DOWN) return NOTIFY_DONE; dev = ifa->idev->dev; - net = maybe_get_net(dev_net(dev)); - if (!net) - return NOTIFY_DONE; - if (!try_module_get(THIS_MODULE)) - goto err_module; + memset(&addr, 0, sizeof(addr)); - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (w) { - atomic_inc(&v6_worker_count); + addr.in6 = ifa->addr; - INIT_WORK(&w->work, iterate_cleanup_work); - w->ifindex = dev->ifindex; - w->net = net; - w->addr = ifa->addr; - schedule_work(&w->work); - - return NOTIFY_DONE; - } - - module_put(THIS_MODULE); - err_module: - put_net(net); + nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp, + GFP_ATOMIC); return NOTIFY_DONE; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 081437dd75..c0851fec11 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -780,6 +780,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct nftables_pernet *nft_net; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -790,8 +791,11 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table); + event, flags, ctx->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; @@ -1563,6 +1567,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { struct nftables_pernet *nft_net; struct sk_buff *skb; + u16 flags = 0; int 
err; if (!ctx->report && @@ -1573,8 +1578,11 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table, + event, flags, ctx->family, ctx->table, ctx->chain); if (err < 0) { kfree_skb(skb); @@ -2866,8 +2874,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule, - const struct nft_rule *prule) + const struct nft_rule *rule, u64 handle) { struct nlmsghdr *nlh; const struct nft_expr *expr, *next; @@ -2887,9 +2894,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, NFTA_RULE_PAD)) goto nla_put_failure; - if (event != NFT_MSG_DELRULE && prule) { - if (nla_put_be64(skb, NFTA_RULE_POSITION, - cpu_to_be64(prule->handle), + if (event != NFT_MSG_DELRULE && handle) { + if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle), NFTA_RULE_PAD)) goto nla_put_failure; } @@ -2925,7 +2931,10 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, int event) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); + const struct nft_rule *prule; struct sk_buff *skb; + u64 handle = 0; + u16 flags = 0; int err; if (!ctx->report && @@ -2936,9 +2945,20 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; + if (event == NFT_MSG_NEWRULE && + !list_is_first(&rule->list, &ctx->chain->rules) && + !list_is_last(&rule->list, &ctx->chain->rules)) { + prule = list_prev_entry(rule, list); + handle = prule->handle; + } + if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE)) + flags |= NLM_F_APPEND; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table, - ctx->chain, rule, NULL); + event, flags, ctx->family, ctx->table, + ctx->chain, rule, handle); if (err < 0) { kfree_skb(skb); goto err; @@ -2964,6 +2984,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; unsigned int s_idx = cb->args[0]; + u64 handle; prule = NULL; list_for_each_entry_rcu(rule, &chain->rules, list) { @@ -2975,12 +2996,17 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); } + if (prule) + handle = prule->handle; + else + handle = 0; + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, prule) < 0) + table, chain, rule, handle) < 0) return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -3143,7 +3169,7 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule, NULL); + family, table, chain, rule, 0); if (err < 0) goto err_fill_rule_info; @@ -3403,17 +3429,15 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { + err = nft_delrule(&ctx, old_rule); + if (err < 0) + goto err_destroy_flow_rule; + trans = 
nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; goto err_destroy_flow_rule; } - err = nft_delrule(&ctx, old_rule); - if (err < 0) { - nft_trans_destroy(trans); - goto err_destroy_flow_rule; - } - list_add_tail_rcu(&rule->list, &old_rule->list); } else { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); @@ -3943,8 +3967,9 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, gfp_t gfp_flags) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); - struct sk_buff *skb; u32 portid = ctx->portid; + struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -3955,7 +3980,10 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; - err = nf_tables_fill_set(skb, ctx, set, event, 0); + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + + err = nf_tables_fill_set(skb, ctx, set, event, flags); if (err < 0) { kfree_skb(skb); goto err; @@ -4336,7 +4364,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (ops->privsize != NULL) size = ops->privsize(nla, &desc); alloc_size = sizeof(*set) + size + udlen; - if (alloc_size < size) + if (alloc_size < size || alloc_size > INT_MAX) return -ENOMEM; set = kvzalloc(alloc_size, GFP_KERNEL); if (!set) @@ -5231,12 +5259,13 @@ static int nf_tables_getsetelem(struct sk_buff *skb, static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_set_elem *elem, - int event, u16 flags) + int event) { struct nftables_pernet *nft_net; struct net *net = ctx->net; u32 portid = ctx->portid; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) @@ -5246,6 +5275,9 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, set, elem); if (err < 0) { @@ -6921,7 +6953,7 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, - int family, int report, gfp_t gfp) + u16 flags, int family, int report, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *skb; @@ -6946,8 +6978,9 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, if (skb == NULL) goto err; - err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family, - table, obj, false); + err = nf_tables_fill_obj_info(skb, net, portid, seq, event, + flags & (NLM_F_CREATE | NLM_F_EXCL), + family, table, obj, false); if (err < 0) { kfree_skb(skb); goto err; @@ -6964,7 +6997,7 @@ static void nf_tables_obj_notify(const struct nft_ctx *ctx, struct nft_object *obj, int event) { nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, - ctx->family, ctx->report, GFP_KERNEL); + ctx->flags, ctx->family, ctx->report, GFP_KERNEL); } /* @@ -7745,6 +7778,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -7755,8 +7789,11 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | 
NLM_F_EXCL); + err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, - ctx->seq, event, 0, + ctx->seq, event, flags, ctx->family, flowtable, hook_list); if (err < 0) { kfree_skb(skb); @@ -8634,7 +8671,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_setelem_activate(net, te->set, &te->elem); nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_NEWSETELEM, 0); + NFT_MSG_NEWSETELEM); nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: @@ -8642,7 +8679,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_DELSETELEM, 0); + NFT_MSG_DELSETELEM); nft_setelem_remove(net, te->set, &te->elem); if (!nft_setelem_is_catchall(te->set, &te->elem)) { atomic_dec(&te->set->nelems); @@ -9599,7 +9636,6 @@ static void __nft_release_table(struct net *net, struct nft_table *table) table->use--; nf_tables_chain_destroy(&ctx); } - list_del(&table->list); nf_tables_table_destroy(&ctx); } @@ -9612,6 +9648,8 @@ static void __nft_release_tables(struct net *net) if (nft_table_has_owner(table)) continue; + list_del(&table->list); + __nft_release_table(net, table); } } @@ -9619,31 +9657,38 @@ static void __nft_release_tables(struct net *net) static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { + struct nft_table *table, *to_delete[8]; struct nftables_pernet *nft_net; struct netlink_notify *n = ptr; - struct nft_table *table, *nt; struct net *net = n->net; - bool release = false; + unsigned int deleted; + bool restart = false; if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; nft_net = nft_pernet(net); + deleted = 0; mutex_lock(&nft_net->commit_mutex); +again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) { __nft_release_hook(net, table); - release = true; + list_del_rcu(&table->list); + to_delete[deleted++] = table; + if (deleted >= ARRAY_SIZE(to_delete)) + break; } } - if (release) { + if (deleted) { + restart = deleted >= ARRAY_SIZE(to_delete); synchronize_rcu(); - list_for_each_entry_safe(table, nt, &nft_net->tables, list) { - if (nft_table_has_owner(table) && - n->portid == table->nlpid) - __nft_release_table(net, table); - } + while (deleted) + __nft_release_table(net, to_delete[--deleted]); + + if (restart) + goto again; } mutex_unlock(&nft_net->commit_mutex); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 272bcdb139..f69cc73c58 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -19,6 +19,7 @@ #include #include #include +#include /* Used for matches where *info is larger than X byte */ #define NFT_MATCH_LARGE_THRESH 192 @@ -257,8 +258,22 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_compat_wait_for_destructors(); ret = xt_check_target(&par, size, proto, inv); - if (ret < 0) + if (ret < 0) { + if (ret == -ENOENT) { + const char *modname = NULL; + + if (strcmp(target->name, "LOG") == 0) + modname = "nf_log_syslog"; + else if (strcmp(target->name, "NFLOG") == 0) + modname = "nfnetlink_log"; + + if (modname && + nft_request_module(ctx->net, "%s", modname) == -EAGAIN) + return -EAGAIN; + } + return ret; + } /* The standard target cannot be used */ if (!target->target) diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 0363f533a4..c4d1389f71 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -60,7 +60,7 @@ 
static void nft_quota_obj_eval(struct nft_object *obj, if (overquota && !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0, - NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC); + NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC); } static int nft_quota_do_init(const struct nlattr * const tb[], diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index 2ff75f7637..f39244f9c0 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -44,6 +44,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) static int log_tg_check(const struct xt_tgchk_param *par) { const struct xt_log_info *loginfo = par->targinfo; + int ret; if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) return -EINVAL; @@ -58,7 +59,14 @@ static int log_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } - return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + if (ret != 0 && !par->nft_compat) { + request_module("%s", "nf_log_syslog"); + + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + } + + return ret; } static void log_tg_destroy(const struct xt_tgdtor_param *par) diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index fb57932080..e660c3710a 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -42,13 +42,21 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) static int nflog_tg_check(const struct xt_tgchk_param *par) { const struct xt_nflog_info *info = par->targinfo; + int ret; if (info->flags & ~XT_NFLOG_MASK) return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') return -EINVAL; - return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + if (ret != 0 && !par->nft_compat) { + request_module("%s", "nfnetlink_log"); + + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + } + + return ret; } static void nflog_tg_destroy(const struct xt_tgdtor_param *par) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 24b7cf447b..ada47e5964 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -594,7 +594,10 @@ static int netlink_insert(struct sock *sk, u32 portid) /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); - nlk_sk(sk)->bound = portid; + /* Paired with lockless reads from netlink_bind(), + * netlink_connect() and netlink_sendmsg(). + */ + WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); @@ -1012,7 +1015,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; - bound = nlk->bound; + /* Paired with WRITE_ONCE() in netlink_insert() */ + bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); @@ -1098,8 +1102,9 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, /* No need for barriers here as we return to user-space without * using any of the bound attributes. + * Paired with WRITE_ONCE() in netlink_insert(). 
*/ - if (!nlk->bound) + if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { @@ -1888,7 +1893,8 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dst_group = nlk->dst_group; } - if (!nlk->bound) { + /* Paired with WRITE_ONCE() in netlink_insert() */ + if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 6024fad905..dda323e0a4 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -60,6 +60,9 @@ int nfc_proto_register(const struct nfc_protocol *nfc_proto) proto_tab[nfc_proto->id] = nfc_proto; write_unlock(&proto_tab_lock); + if (rc) + proto_unregister(nfc_proto->proto); + return rc; } EXPORT_SYMBOL(nfc_proto_register); diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index fefc03674f..d63d2e5dc6 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -277,6 +277,7 @@ int digital_tg_configure_hw(struct nfc_digital_dev *ddev, int type, int param) static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) { struct digital_tg_mdaa_params *params; + int rc; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) @@ -291,8 +292,12 @@ static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) get_random_bytes(params->nfcid2 + 2, NFC_NFCID2_MAXSIZE - 2); params->sc = DIGITAL_SENSF_FELICA_SC; - return digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, - 500, digital_tg_recv_atr_req, NULL); + rc = digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, + 500, digital_tg_recv_atr_req, NULL); + if (rc) + kfree(params); + + return rc; } static int digital_tg_listen_md(struct nfc_digital_dev *ddev, u8 rf_tech) diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c index 84d2345c75..3adf458985 100644 --- a/net/nfc/digital_technology.c +++ b/net/nfc/digital_technology.c @@ -465,8 +465,12 @@ static int digital_in_send_sdd_req(struct nfc_digital_dev *ddev, skb_put_u8(skb, sel_cmd); skb_put_u8(skb, DIGITAL_SDD_REQ_SEL_PAR); - return digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, - target); + rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, + target); + if (rc) + kfree_skb(skb); + + return rc; } static void digital_in_recv_sens_res(struct nfc_digital_dev *ddev, void *arg, diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index a2e72c0038..b911ab78be 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -334,6 +334,8 @@ static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev, ndev->cur_conn_id); if (conn_info) { list_del(&conn_info->list); + if (conn_info == ndev->rf_conn_info) + ndev->rf_conn_info = NULL; devm_kfree(&ndev->nfc_dev->dev, conn_info); } } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 23b21253b3..eb6345a027 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2188,18 +2188,24 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg, arg->count = arg->skip; + rcu_read_lock(); idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) { /* don't return filters that are being deleted */ if (!refcount_inc_not_zero(&f->refcnt)) continue; + rcu_read_unlock(); + if (arg->fn(tp, f, arg) < 0) { __fl_put(f); arg->stop = 1; + rcu_read_lock(); break; } __fl_put(f); arg->count++; + rcu_read_lock(); } + rcu_read_unlock(); arg->cookie = id; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 5e90e9b160..12f39a2dff 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ 
-513,6 +513,12 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, return stab; } + if (s->size_log > STAB_SIZE_LOG_MAX || + s->cell_log > STAB_SIZE_LOG_MAX) { + NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table"); + return ERR_PTR(-EINVAL); + } + stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL); if (!stab) return ERR_PTR(-ENOMEM); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index a579a4131d..e1040421b7 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -233,6 +233,9 @@ int fifo_set_limit(struct Qdisc *q, unsigned int limit) if (strncmp(q->ops->id + 1, "fifo", 4) != 0) return 0; + if (!q->ops->change) + return 0; + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); if (nla) { nla->nla_type = RTM_NEWQDISC; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 8766ab5b87..5eb3b1b7ae 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -529,22 +529,28 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); struct Qdisc *qdisc = rtnl_dereference(q->qdisc); - struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; - struct gnet_stats_queue __percpu *cpu_qstats = NULL; spin_lock_bh(qdisc_lock(qdisc)); - if (qdisc_is_percpu_stats(qdisc)) { - cpu_bstats = qdisc->cpu_bstats; - cpu_qstats = qdisc->cpu_qstats; - } - qlen = qdisc_qlen_sum(qdisc); - __gnet_stats_copy_basic(NULL, &sch->bstats, - cpu_bstats, &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - cpu_qstats, - &qdisc->qstats, - qlen); + if (qdisc_is_percpu_stats(qdisc)) { + qlen = qdisc_qlen_sum(qdisc); + + __gnet_stats_copy_basic(NULL, &bstats, + qdisc->cpu_bstats, + &qdisc->bstats); + __gnet_stats_copy_queue(&qstats, + qdisc->cpu_qstats, + &qdisc->qstats, + qlen); + } else { + qlen += qdisc->q.qlen; + bstats.bytes += qdisc->bstats.bytes; + bstats.packets += qdisc->bstats.packets; + qstats.backlog += qdisc->qstats.backlog; + qstats.drops += qdisc->qstats.drops; + qstats.requeues += qdisc->qstats.requeues; + qstats.overlimits += qdisc->qstats.overlimits; + } spin_unlock_bh(qdisc_lock(qdisc)); } diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 1ab2fc933a..b9fd18d986 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1641,6 +1641,10 @@ static void taprio_destroy(struct Qdisc *sch) list_del(&q->taprio_list); spin_unlock(&taprio_list_lock); + /* Note that taprio_reset() might not be called if an error + * happens in qdisc_create(), after taprio_init() has been called. + */ + hrtimer_cancel(&q->advance_timer); taprio_disable_offload(dev, q, NULL); diff --git a/net/sctp/input.c b/net/sctp/input.c index 5ef86fdb11..1f1786021d 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -702,7 +702,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb) ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); /* Break out if chunk length is less then minimal. 
*/ - if (ntohs(ch->length) < sizeof(_ch)) + if (!ch || ntohs(ch->length) < sizeof(_ch)) break; ch_end = offset + SCTP_PAD4(ntohs(ch->length)); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index b8fa8f1a72..c7503fd649 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3697,7 +3697,7 @@ struct sctp_chunk *sctp_make_strreset_req( outlen = (sizeof(outreq) + stream_len) * out; inlen = (sizeof(inreq) + stream_len) * in; - retval = sctp_make_reconf(asoc, outlen + inlen); + retval = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen)); if (!retval) return NULL; diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index f23f558054..99acd337ba 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -150,9 +150,11 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) again: link = conn->lnk; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); if (rc) - return rc; + goto put_out; spin_lock_bh(&conn->send_lock); if (link != conn->lnk) { @@ -160,6 +162,7 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) spin_unlock_bh(&conn->send_lock); smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); + smc_wr_tx_link_put(link); if (again) return -ENOLINK; again = true; @@ -167,6 +170,8 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) } rc = smc_cdc_msg_send(conn, wr_buf, pend); spin_unlock_bh(&conn->send_lock); +put_out: + smc_wr_tx_link_put(link); return rc; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8280c938be..d2206743dc 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -949,7 +949,7 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, to_lnk = &lgr->lnk[i]; break; } - if (!to_lnk) { + if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) { smc_lgr_terminate_sched(lgr); return NULL; } @@ -981,24 +981,26 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, read_unlock_bh(&lgr->conns_lock); /* pre-fetch buffer outside of send_lock, might sleep */ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); - if (rc) { - smcr_link_down_cond_sched(to_lnk); - return NULL; - } + if (rc) + goto err_out; /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); - if (rc) { - smcr_link_down_cond_sched(to_lnk); - return NULL; - } + if (rc) + goto err_out; goto again; } read_unlock_bh(&lgr->conns_lock); + smc_wr_tx_link_put(to_lnk); return to_lnk; + +err_out: + smcr_link_down_cond_sched(to_lnk); + smc_wr_tx_link_put(to_lnk); + return NULL; } static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 2e7560eba9..72f4b72eb1 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -383,9 +383,11 @@ int smc_llc_send_confirm_link(struct smc_link *link, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; confllc = (struct smc_llc_msg_confirm_link *)wr_buf; memset(confllc, 0, sizeof(*confllc)); confllc->hd.common.type = SMC_LLC_CONFIRM_LINK; @@ -402,6 +404,8 @@ int smc_llc_send_confirm_link(struct smc_link *link, confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return 
rc; } @@ -415,9 +419,11 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, struct smc_link *link; int i, rc, rtok_ix; + if (!smc_wr_tx_link_hold(send_link)) + return -ENOLINK; rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; @@ -444,6 +450,8 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(send_link, pend); +put_out: + smc_wr_tx_link_put(send_link); return rc; } @@ -456,9 +464,11 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; @@ -467,6 +477,8 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -480,9 +492,11 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; addllc = (struct smc_llc_msg_add_link *)wr_buf; memset(addllc, 0, sizeof(*addllc)); @@ -504,6 +518,8 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], } /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -517,9 +533,11 @@ int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; delllc = (struct smc_llc_msg_del_link *)wr_buf; memset(delllc, 0, sizeof(*delllc)); @@ -536,6 +554,8 @@ int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, delllc->reason = htonl(reason); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -547,9 +567,11 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; testllc = (struct smc_llc_msg_test_link *)wr_buf; memset(testllc, 0, sizeof(*testllc)); testllc->hd.common.type = SMC_LLC_TEST_LINK; @@ -557,6 +579,8 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) memcpy(testllc->user_data, user_data, sizeof(testllc->user_data)); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -567,13 +591,16 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) struct smc_wr_buf *wr_buf; int rc; - if (!smc_link_usable(link)) + if (!smc_wr_tx_link_hold(link)) return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); - return smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(link, 
pend); +put_out: + smc_wr_tx_link_put(link); + return rc; } /* schedule an llc send on link, may wait for buffers, @@ -586,13 +613,16 @@ static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf) struct smc_wr_buf *wr_buf; int rc; - if (!smc_link_usable(link)) + if (!smc_wr_tx_link_hold(link)) return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); - return smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); + rc = smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); +put_out: + smc_wr_tx_link_put(link); + return rc; } /********************************* receive ***********************************/ @@ -672,9 +702,11 @@ static int smc_llc_add_link_cont(struct smc_link *link, struct smc_buf_desc *rmb; u8 n; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf; memset(addc_llc, 0, sizeof(*addc_llc)); @@ -706,7 +738,10 @@ static int smc_llc_add_link_cont(struct smc_link *link, addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); if (lgr->role == SMC_CLNT) addc_llc->hd.flags |= SMC_LLC_FLAG_RESP; - return smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; } static int smc_llc_cli_rkey_exchange(struct smc_link *link, diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index c79361dfcd..738a4a99c8 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -496,7 +496,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ -static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) +static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; struct smc_link *link = conn->lnk; @@ -505,8 +505,11 @@ static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) struct smc_wr_buf *wr_buf; int rc; + if (!link || !smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { + smc_wr_tx_link_put(link); if (rc == -EBUSY) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -547,22 +550,7 @@ static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) out_unlock: spin_unlock_bh(&conn->send_lock); - return rc; -} - -static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) -{ - struct smc_link *link = conn->lnk; - int rc = -ENOLINK; - - if (!link) - return rc; - - atomic_inc(&link->wr_tx_refcnt); - if (smc_link_usable(link)) - rc = _smcr_tx_sndbuf_nonempty(conn); - if (atomic_dec_and_test(&link->wr_tx_refcnt)) - wake_up_all(&link->wr_tx_wait); + smc_wr_tx_link_put(link); return rc; } diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 423b8709f1..2bc626f230 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -60,6 +60,20 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) atomic_long_set(wr_tx_id, val); } +static inline bool smc_wr_tx_link_hold(struct smc_link *link) +{ + if (!smc_link_usable(link)) + return false; + atomic_inc(&link->wr_tx_refcnt); + return true; +} + +static inline void smc_wr_tx_link_put(struct smc_link *link) +{ + if (atomic_dec_and_test(&link->wr_tx_refcnt)) + wake_up_all(&link->wr_tx_wait); +} + static inline 
void smc_wr_wakeup_tx_wait(struct smc_link *lnk) { wake_up_all(&lnk->wr_tx_wait); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 92345c9bb6..89f9e85ae9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -608,20 +608,42 @@ static void unix_release_sock(struct sock *sk, int embrion) static void init_peercred(struct sock *sk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + spin_lock(&sk->sk_peer_lock); + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); + spin_unlock(&sk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + if (sk < peersk) { + spin_lock(&sk->sk_peer_lock); + spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&peersk->sk_peer_lock); + spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); + + spin_unlock(&sk->sk_peer_lock); + spin_unlock(&peersk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static int unix_listen(struct socket *sock, int backlog) @@ -806,7 +828,7 @@ static void unix_unhash(struct sock *sk) } struct proto unix_dgram_proto = { - .name = "UNIX-DGRAM", + .name = "UNIX", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, @@ -828,20 +850,25 @@ struct proto unix_stream_proto = { static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { - struct sock *sk = NULL; struct unix_sock *u; + struct sock *sk; + int err; atomic_long_inc(&unix_nr_socks); - if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) - goto out; + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { + err = -ENFILE; + goto err; + } if (type == SOCK_STREAM) sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); else /*dgram and seqpacket */ sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); - if (!sk) - goto out; + if (!sk) { + err = -ENOMEM; + goto err; + } sock_init_data(sock, sk); @@ -861,20 +888,23 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_socket(unix_sockets_unbound(sk), sk); -out: - if (sk == NULL) - atomic_long_dec(&unix_nr_socks); - else { - local_bh_disable(); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - local_bh_enable(); - } + + local_bh_disable(); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + local_bh_enable(); + return sk; + +err: + atomic_long_dec(&unix_nr_socks); + return ERR_PTR(err); } static int unix_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct sock *sk; + if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; @@ -901,7 +931,11 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock, kern, sock->type) ? 
0 : -ENOMEM; + sk = unix_create1(net, sock, kern, sock->type); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + return 0; } static int unix_release(struct socket *sock) @@ -1314,12 +1348,15 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, we will have to recheck all again in any case. */ - err = -ENOMEM; - /* create new sock for complete connection */ newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); - if (newsk == NULL) + if (IS_ERR(newsk)) { + err = PTR_ERR(newsk); + newsk = NULL; goto out; + } + + err = -ENOMEM; /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); @@ -2845,6 +2882,9 @@ static int unix_shutdown(struct socket *sock, int mode) unix_state_lock(sk); sk->sk_shutdown |= mode; + if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && + mode == SHUTDOWN_MASK) + sk->sk_state = TCP_CLOSE; other = unix_peer(sk); if (other) sock_hold(other); @@ -2867,12 +2907,10 @@ static int unix_shutdown(struct socket *sock, int mode) other->sk_shutdown |= peer_mode; unix_state_unlock(other); other->sk_state_change(other); - if (peer_mode == SHUTDOWN_MASK) { + if (peer_mode == SHUTDOWN_MASK) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); - other->sk_state = TCP_CLOSE; - } else if (peer_mode & RCV_SHUTDOWN) { + else if (peer_mode & RCV_SHUTDOWN) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); - } } if (other) sock_put(other); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 03b66d154b..3a3cb09eec 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1961,24 +1961,65 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, return skb; } +static int xfrm_notify_userpolicy(struct net *net) +{ + struct xfrm_userpolicy_default *up; + int len = NLMSG_ALIGN(sizeof(*up)); + struct nlmsghdr *nlh; + struct sk_buff *skb; + int err; + + skb = nlmsg_new(len, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_GETDEFAULT, sizeof(*up), 0); + if (nlh == NULL) { + kfree_skb(skb); + return -EMSGSIZE; + } + + up = nlmsg_data(nlh); + up->in = net->xfrm.policy_default & XFRM_POL_DEFAULT_IN ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + up->fwd = net->xfrm.policy_default & XFRM_POL_DEFAULT_FWD ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + up->out = net->xfrm.policy_default & XFRM_POL_DEFAULT_OUT ? 
+ XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + + nlmsg_end(skb, nlh); + + rcu_read_lock(); + err = xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY); + rcu_read_unlock(); + + return err; +} + static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { struct net *net = sock_net(skb->sk); struct xfrm_userpolicy_default *up = nlmsg_data(nlh); - u8 dirmask; - u8 old_default = net->xfrm.policy_default; - if (up->dirmask >= XFRM_USERPOLICY_DIRMASK_MAX) - return -EINVAL; + if (up->in == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_IN; + else if (up->in == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_IN; - dirmask = (1 << up->dirmask) & XFRM_POL_DEFAULT_MASK; + if (up->fwd == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_FWD; + else if (up->fwd == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_FWD; - net->xfrm.policy_default = (old_default & (0xff ^ dirmask)) - | (up->action << up->dirmask); + if (up->out == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_OUT; + else if (up->out == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_OUT; rt_genid_bump_all(net); + xfrm_notify_userpolicy(net); return 0; } @@ -1988,13 +2029,11 @@ static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, struct sk_buff *r_skb; struct nlmsghdr *r_nlh; struct net *net = sock_net(skb->sk); - struct xfrm_userpolicy_default *r_up, *up; + struct xfrm_userpolicy_default *r_up; int len = NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_default)); u32 portid = NETLINK_CB(skb).portid; u32 seq = nlh->nlmsg_seq; - up = nlmsg_data(nlh); - r_skb = nlmsg_new(len, GFP_ATOMIC); if (!r_skb) return -ENOMEM; @@ -2007,8 +2046,12 @@ static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, r_up = nlmsg_data(r_nlh); - r_up->action = ((net->xfrm.policy_default & (1 << up->dirmask)) >> up->dirmask); - r_up->dirmask = up->dirmask; + r_up->in = net->xfrm.policy_default & XFRM_POL_DEFAULT_IN ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + r_up->fwd = net->xfrm.policy_default & XFRM_POL_DEFAULT_FWD ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + r_up->out = net->xfrm.policy_default & XFRM_POL_DEFAULT_OUT ? 
+ XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; nlmsg_end(r_skb, r_nlh); return nlmsg_unicast(net->xfrm.nlsk, r_skb, portid); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4dc20be5fb..5fd48a8d4f 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -322,17 +322,11 @@ $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h -include $(BPF_SAMPLES_PATH)/Makefile.target -VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ - $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ - ../../../../vmlinux \ - /sys/kernel/btf/vmlinux \ - /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ + $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \ + $(abspath ./vmlinux) VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) -ifeq ($(VMLINUX_BTF),) -$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") -endif - $(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) ifeq ($(VMLINUX_H),) $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ @@ -340,6 +334,11 @@ else $(Q)cp "$(VMLINUX_H)" $@ endif +ifeq ($(VMLINUX_BTF),) + $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)",\ + build the kernel or set VMLINUX_BTF variable) +endif + clean-files += vmlinux.h # Get Clang's default includes on this system, as opposed to those seen by diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h index aee0453448..29c3bb6ad1 100644 --- a/samples/bpf/bpf_insn.h +++ b/samples/bpf/bpf_insn.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* eBPF instruction mini library */ #ifndef __BPF_INSN_H #define __BPF_INSN_H diff --git a/samples/bpf/xdp_redirect_map_multi.bpf.c b/samples/bpf/xdp_redirect_map_multi.bpf.c index 8f59d430cb..bb0a5a3bfc 100644 --- a/samples/bpf/xdp_redirect_map_multi.bpf.c +++ b/samples/bpf/xdp_redirect_map_multi.bpf.c @@ -5,11 +5,6 @@ #include "xdp_sample.bpf.h" #include "xdp_sample_shared.h" -enum { - BPF_F_BROADCAST = (1ULL << 3), - BPF_F_EXCLUDE_INGRESS = (1ULL << 4), -}; - struct { __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); __uint(key_size, sizeof(int)); diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index 952e468763..4aad284800 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -19,6 +19,10 @@ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF) \ += -fplugin-arg-structleak_plugin-byref gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL) \ += -fplugin-arg-structleak_plugin-byref-all +ifdef CONFIG_GCC_PLUGIN_STRUCTLEAK + DISABLE_STRUCTLEAK_PLUGIN += -fplugin-arg-structleak_plugin-disable +endif +export DISABLE_STRUCTLEAK_PLUGIN gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STRUCTLEAK) \ += -DSTRUCTLEAK_PLUGIN diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh index fd9777f63f..9dbab13329 100644 --- a/scripts/checksyscalls.sh +++ b/scripts/checksyscalls.sh @@ -82,10 +82,8 @@ cat << EOF #define __IGNORE_truncate64 #define __IGNORE_stat64 #define __IGNORE_lstat64 -#define __IGNORE_fstat64 #define __IGNORE_fcntl64 #define __IGNORE_fadvise64_64 -#define __IGNORE_fstatat64 #define __IGNORE_fstatfs64 #define __IGNORE_statfs64 #define __IGNORE_llseek @@ -253,6 +251,10 @@ cat << EOF #define __IGNORE_getpmsg #define __IGNORE_putpmsg #define __IGNORE_vserver + +/* 64-bit ports never needed these, and new 32-bit ports can use statx */ +#define __IGNORE_fstat64 +#define __IGNORE_fstatat64 EOF } diff --git a/scripts/recordmcount.pl 
b/scripts/recordmcount.pl index 8f6b13ae46..7d631aaa0a 100644 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -189,7 +189,7 @@ if ($arch =~ /(x86(_64)?)|(i386)/) { $local_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; $weak_regex = "^[0-9a-fA-F]+\\s+([wW])\\s+(\\S+)"; $section_regex = "Disassembly of section\\s+(\\S+):"; -$function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; +$function_regex = "^([0-9a-fA-F]+)\\s+<([^^]*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s(mcount|__fentry__)\$"; $section_type = '@progbits'; $mcount_adjust = 0; diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index d59276f48d..94ea2a8b2b 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -126,6 +126,8 @@ static const struct nlmsg_perm nlmsg_xfrm_perms[] = { XFRM_MSG_NEWSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_GETSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_READ }, { XFRM_MSG_MAPPING, NETLINK_XFRM_SOCKET__NLMSG_READ }, + { XFRM_MSG_SETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, + { XFRM_MSG_GETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_audit_perms[] = @@ -189,7 +191,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! */ - BUILD_BUG_ON(XFRM_MSG_MAX != XFRM_MSG_MAPPING); + BUILD_BUG_ON(XFRM_MSG_MAX != XFRM_MSG_GETDEFAULT); err = nlmsg_perm(nlmsg_type, perm, nlmsg_xfrm_perms, sizeof(nlmsg_xfrm_perms)); break; diff --git a/sound/core/pcm_compat.c b/sound/core/pcm_compat.c index a59de24695..dfe5a64e19 100644 --- a/sound/core/pcm_compat.c +++ b/sound/core/pcm_compat.c @@ -468,6 +468,76 @@ static int snd_pcm_ioctl_sync_ptr_x32(struct snd_pcm_substream *substream, } #endif /* CONFIG_X86_X32 */ +#ifdef __BIG_ENDIAN +typedef char __pad_before_u32[4]; +typedef char __pad_after_u32[0]; +#else +typedef char __pad_before_u32[0]; +typedef char __pad_after_u32[4]; +#endif + +/* PCM 2.0.15 API definition had a bug in mmap control; it puts the avail_min + * at the wrong offset due to a typo in padding type. + * The bug hits only 32bit. + * A workaround for incorrect read/write is needed only in 32bit compat mode. + */ +struct __snd_pcm_mmap_control64_buggy { + __pad_before_u32 __pad1; + __u32 appl_ptr; + __pad_before_u32 __pad2; /* SiC! 
here is the bug */ + __pad_before_u32 __pad3; + __u32 avail_min; + __pad_after_uframe __pad4; +}; + +static int snd_pcm_ioctl_sync_ptr_buggy(struct snd_pcm_substream *substream, + struct snd_pcm_sync_ptr __user *_sync_ptr) +{ + struct snd_pcm_runtime *runtime = substream->runtime; + struct snd_pcm_sync_ptr sync_ptr; + struct __snd_pcm_mmap_control64_buggy *sync_cp; + volatile struct snd_pcm_mmap_status *status; + volatile struct snd_pcm_mmap_control *control; + int err; + + memset(&sync_ptr, 0, sizeof(sync_ptr)); + sync_cp = (struct __snd_pcm_mmap_control64_buggy *)&sync_ptr.c.control; + if (get_user(sync_ptr.flags, (unsigned __user *)&(_sync_ptr->flags))) + return -EFAULT; + if (copy_from_user(sync_cp, &(_sync_ptr->c.control), sizeof(*sync_cp))) + return -EFAULT; + status = runtime->status; + control = runtime->control; + if (sync_ptr.flags & SNDRV_PCM_SYNC_PTR_HWSYNC) { + err = snd_pcm_hwsync(substream); + if (err < 0) + return err; + } + snd_pcm_stream_lock_irq(substream); + if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_APPL)) { + err = pcm_lib_apply_appl_ptr(substream, sync_cp->appl_ptr); + if (err < 0) { + snd_pcm_stream_unlock_irq(substream); + return err; + } + } else { + sync_cp->appl_ptr = control->appl_ptr; + } + if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_AVAIL_MIN)) + control->avail_min = sync_cp->avail_min; + else + sync_cp->avail_min = control->avail_min; + sync_ptr.s.status.state = status->state; + sync_ptr.s.status.hw_ptr = status->hw_ptr; + sync_ptr.s.status.tstamp = status->tstamp; + sync_ptr.s.status.suspended_state = status->suspended_state; + sync_ptr.s.status.audio_tstamp = status->audio_tstamp; + snd_pcm_stream_unlock_irq(substream); + if (copy_to_user(_sync_ptr, &sync_ptr, sizeof(sync_ptr))) + return -EFAULT; + return 0; +} + /* */ enum { @@ -537,7 +607,7 @@ static long snd_pcm_ioctl_compat(struct file *file, unsigned int cmd, unsigned l if (in_x32_syscall()) return snd_pcm_ioctl_sync_ptr_x32(substream, argp); #endif /* CONFIG_X86_X32 */ - return snd_pcm_common_ioctl(file, substream, cmd, argp); + return snd_pcm_ioctl_sync_ptr_buggy(substream, argp); case SNDRV_PCM_IOCTL_HW_REFINE32: return snd_pcm_ioctl_hw_params_compat(substream, 1, argp); case SNDRV_PCM_IOCTL_HW_PARAMS32: diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index 6c0a4a67ad..6f30231bdb 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -873,12 +873,21 @@ static long snd_rawmidi_ioctl(struct file *file, unsigned int cmd, unsigned long return -EINVAL; } } + case SNDRV_RAWMIDI_IOCTL_USER_PVERSION: + if (get_user(rfile->user_pversion, (unsigned int __user *)arg)) + return -EFAULT; + return 0; + case SNDRV_RAWMIDI_IOCTL_PARAMS: { struct snd_rawmidi_params params; if (copy_from_user(¶ms, argp, sizeof(struct snd_rawmidi_params))) return -EFAULT; + if (rfile->user_pversion < SNDRV_PROTOCOL_VERSION(2, 0, 2)) { + params.mode = 0; + memset(params.reserved, 0, sizeof(params.reserved)); + } switch (params.stream) { case SNDRV_RAWMIDI_STREAM_OUTPUT: if (rfile->output == NULL) diff --git a/sound/core/seq_device.c b/sound/core/seq_device.c index 382275c5b1..7f3fd8eb01 100644 --- a/sound/core/seq_device.c +++ b/sound/core/seq_device.c @@ -156,6 +156,8 @@ static int snd_seq_device_dev_free(struct snd_device *device) struct snd_seq_device *dev = device->device_data; cancel_autoload_drivers(); + if (dev->private_free) + dev->private_free(dev); put_device(&dev->dev); return 0; } @@ -183,11 +185,7 @@ static int snd_seq_device_dev_disconnect(struct snd_device *device) static void snd_seq_dev_release(struct 
device *dev) { - struct snd_seq_device *sdev = to_seq_dev(dev); - - if (sdev->private_free) - sdev->private_free(sdev); - kfree(sdev); + kfree(to_seq_dev(dev)); } /* diff --git a/sound/drivers/pcsp/pcsp_lib.c b/sound/drivers/pcsp/pcsp_lib.c index ed40d0f743..773db4bf08 100644 --- a/sound/drivers/pcsp/pcsp_lib.c +++ b/sound/drivers/pcsp/pcsp_lib.c @@ -143,7 +143,7 @@ enum hrtimer_restart pcsp_do_timer(struct hrtimer *handle) if (pointer_update) pcsp_pointer_update(chip); - hrtimer_forward(handle, hrtimer_get_expires(handle), ns_to_ktime(ns)); + hrtimer_forward_now(handle, ns_to_ktime(ns)); return HRTIMER_RESTART; } diff --git a/sound/firewire/motu/amdtp-motu.c b/sound/firewire/motu/amdtp-motu.c index 5388b85fb6..a18c2c033e 100644 --- a/sound/firewire/motu/amdtp-motu.c +++ b/sound/firewire/motu/amdtp-motu.c @@ -276,10 +276,11 @@ static void __maybe_unused copy_message(u64 *frames, __be32 *buffer, /* This is just for v2/v3 protocol. */ for (i = 0; i < data_blocks; ++i) { - *frames = (be32_to_cpu(buffer[1]) << 16) | - (be32_to_cpu(buffer[2]) >> 16); + *frames = be32_to_cpu(buffer[1]); + *frames <<= 16; + *frames |= be32_to_cpu(buffer[2]) >> 16; + ++frames; buffer += data_block_quadlets; - frames++; } } diff --git a/sound/firewire/oxfw/oxfw.c b/sound/firewire/oxfw/oxfw.c index cb5b5e3a48..daf7313646 100644 --- a/sound/firewire/oxfw/oxfw.c +++ b/sound/firewire/oxfw/oxfw.c @@ -184,13 +184,16 @@ static int detect_quirks(struct snd_oxfw *oxfw, const struct ieee1394_device_id model = val; } - /* - * Mackie Onyx Satellite with base station has a quirk to report a wrong - * value in 'dbs' field of CIP header against its format information. - */ - if (vendor == VENDOR_LOUD && model == MODEL_SATELLITE) + if (vendor == VENDOR_LOUD) { + // Mackie Onyx Satellite with base station has a quirk to report a wrong + // value in 'dbs' field of CIP header against its format information. oxfw->quirks |= SND_OXFW_QUIRK_WRONG_DBS; + // OXFW971-based models may transfer events by blocking method. 
+ if (!(oxfw->quirks & SND_OXFW_QUIRK_JUMBO_PAYLOAD)) + oxfw->quirks |= SND_OXFW_QUIRK_BLOCKING_TRANSMISSION; + } + return 0; } diff --git a/sound/hda/hdac_controller.c b/sound/hda/hdac_controller.c index 062da7a7a5..f7bd6e2db0 100644 --- a/sound/hda/hdac_controller.c +++ b/sound/hda/hdac_controller.c @@ -421,8 +421,9 @@ int snd_hdac_bus_reset_link(struct hdac_bus *bus, bool full_reset) if (!full_reset) goto skip_reset; - /* clear STATESTS */ - snd_hdac_chip_writew(bus, STATESTS, STATESTS_INT_MASK); + /* clear STATESTS if not in reset */ + if (snd_hdac_chip_readb(bus, GCTL) & AZX_GCTL_RESET) + snd_hdac_chip_writew(bus, STATESTS, STATESTS_INT_MASK); /* reset controller */ snd_hdac_bus_enter_link_reset(bus); diff --git a/sound/pci/hda/hda_bind.c b/sound/pci/hda/hda_bind.c index 2523b23389..1c8bffc3ee 100644 --- a/sound/pci/hda/hda_bind.c +++ b/sound/pci/hda/hda_bind.c @@ -298,29 +298,31 @@ int snd_hda_codec_configure(struct hda_codec *codec) { int err; + if (codec->configured) + return 0; + if (is_generic_config(codec)) codec->probe_id = HDA_CODEC_ID_GENERIC; else codec->probe_id = 0; - err = snd_hdac_device_register(&codec->core); - if (err < 0) - return err; + if (!device_is_registered(&codec->core.dev)) { + err = snd_hdac_device_register(&codec->core); + if (err < 0) + return err; + } if (!codec->preset) codec_bind_module(codec); if (!codec->preset) { err = codec_bind_generic(codec); if (err < 0) { - codec_err(codec, "Unable to bind the codec\n"); - goto error; + codec_dbg(codec, "Unable to bind the codec\n"); + return err; } } + codec->configured = 1; return 0; - - error: - snd_hdac_device_unregister(&codec->core); - return err; } EXPORT_SYMBOL_GPL(snd_hda_codec_configure); diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index a9ebefd60c..0c4a337c9f 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -791,6 +791,7 @@ void snd_hda_codec_cleanup_for_unbind(struct hda_codec *codec) snd_array_free(&codec->nids); remove_conn_list(codec); snd_hdac_regmap_exit(&codec->core); + codec->configured = 0; } EXPORT_SYMBOL_GPL(snd_hda_codec_cleanup_for_unbind); diff --git a/sound/pci/hda/hda_controller.c b/sound/pci/hda/hda_controller.c index 7cd452831f..930ae4002a 100644 --- a/sound/pci/hda/hda_controller.c +++ b/sound/pci/hda/hda_controller.c @@ -25,6 +25,7 @@ #include #include #include "hda_controller.h" +#include "hda_local.h" #define CREATE_TRACE_POINTS #include "hda_controller_trace.h" @@ -1248,17 +1249,24 @@ EXPORT_SYMBOL_GPL(azx_probe_codecs); int azx_codec_configure(struct azx *chip) { struct hda_codec *codec, *next; + int success = 0; - /* use _safe version here since snd_hda_codec_configure() deregisters - * the device upon error and deletes itself from the bus list. - */ - list_for_each_codec_safe(codec, next, &chip->bus) { - snd_hda_codec_configure(codec); + list_for_each_codec(codec, &chip->bus) { + if (!snd_hda_codec_configure(codec)) + success++; } - if (!azx_bus(chip)->num_codecs) - return -ENODEV; - return 0; + if (success) { + /* unregister failed codecs if any codec has been probed */ + list_for_each_codec_safe(codec, next, &chip->bus) { + if (!codec->configured) { + codec_err(codec, "Unable to configure, disabling\n"); + snd_hdac_device_unregister(&codec->core); + } + } + } + + return success ? 
0 : -ENODEV; } EXPORT_SYMBOL_GPL(azx_codec_configure); diff --git a/sound/pci/hda/hda_controller.h b/sound/pci/hda/hda_controller.h index 3062f87380..f5bf295eb8 100644 --- a/sound/pci/hda/hda_controller.h +++ b/sound/pci/hda/hda_controller.h @@ -41,7 +41,7 @@ /* 24 unused */ #define AZX_DCAPS_COUNT_LPIB_DELAY (1 << 25) /* Take LPIB as delay */ #define AZX_DCAPS_PM_RUNTIME (1 << 26) /* runtime PM support */ -/* 27 unused */ +#define AZX_DCAPS_RETRY_PROBE (1 << 27) /* retry probe if no codec is configured */ #define AZX_DCAPS_CORBRP_SELF_CLEAR (1 << 28) /* CORBRP clears itself after reset */ #define AZX_DCAPS_NO_MSI64 (1 << 29) /* Stick to 32-bit MSIs */ #define AZX_DCAPS_SEPARATE_STREAM_TAG (1 << 30) /* capture and playback use separate stream tag */ diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 3aa432d814..4d22e7adee 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -307,7 +307,8 @@ enum { /* quirks for AMD SB */ #define AZX_DCAPS_PRESET_AMD_SB \ (AZX_DCAPS_NO_TCSEL | AZX_DCAPS_AMD_WORKAROUND |\ - AZX_DCAPS_SNOOP_TYPE(ATI) | AZX_DCAPS_PM_RUNTIME) + AZX_DCAPS_SNOOP_TYPE(ATI) | AZX_DCAPS_PM_RUNTIME |\ + AZX_DCAPS_RETRY_PROBE) /* quirks for Nvidia */ #define AZX_DCAPS_PRESET_NVIDIA \ @@ -883,10 +884,11 @@ static unsigned int azx_get_pos_skl(struct azx *chip, struct azx_dev *azx_dev) return azx_get_pos_posbuf(chip, azx_dev); } -static void azx_shutdown_chip(struct azx *chip) +static void __azx_shutdown_chip(struct azx *chip, bool skip_link_reset) { azx_stop_chip(chip); - azx_enter_link_reset(chip); + if (!skip_link_reset) + azx_enter_link_reset(chip); azx_clear_irq_pending(chip); display_power(chip, false); } @@ -895,6 +897,11 @@ static void azx_shutdown_chip(struct azx *chip) static DEFINE_MUTEX(card_list_lock); static LIST_HEAD(card_list); +static void azx_shutdown_chip(struct azx *chip) +{ + __azx_shutdown_chip(chip, false); +} + static void azx_add_card_list(struct azx *chip) { struct hda_intel *hda = container_of(chip, struct hda_intel, chip); @@ -1717,7 +1724,7 @@ static void azx_check_snoop_available(struct azx *chip) static void azx_probe_work(struct work_struct *work) { - struct hda_intel *hda = container_of(work, struct hda_intel, probe_work); + struct hda_intel *hda = container_of(work, struct hda_intel, probe_work.work); azx_probe_continue(&hda->chip); } @@ -1822,7 +1829,7 @@ static int azx_create(struct snd_card *card, struct pci_dev *pci, } /* continue probing in work context as may trigger request module */ - INIT_WORK(&hda->probe_work, azx_probe_work); + INIT_DELAYED_WORK(&hda->probe_work, azx_probe_work); *rchip = chip; @@ -2136,7 +2143,7 @@ static int azx_probe(struct pci_dev *pci, #endif if (schedule_probe) - schedule_work(&hda->probe_work); + schedule_delayed_work(&hda->probe_work, 0); dev++; if (chip->disabled) @@ -2222,6 +2229,11 @@ static int azx_probe_continue(struct azx *chip) int dev = chip->dev_index; int err; + if (chip->disabled || hda->init_failed) + return -EIO; + if (hda->probe_retry) + goto probe_retry; + to_hda_bus(bus)->bus_probing = 1; hda->probe_continued = 1; @@ -2283,10 +2295,20 @@ static int azx_probe_continue(struct azx *chip) #endif } #endif + + probe_retry: if (bus->codec_mask && !(probe_only[dev] & 1)) { err = azx_codec_configure(chip); - if (err < 0) + if (err) { + if ((chip->driver_caps & AZX_DCAPS_RETRY_PROBE) && + ++hda->probe_retry < 60) { + schedule_delayed_work(&hda->probe_work, + msecs_to_jiffies(1000)); + return 0; /* keep things up */ + } + dev_err(chip->card->dev, "Cannot probe codecs, 
giving up\n"); goto out_free; + } } err = snd_card_register(chip->card); @@ -2316,6 +2338,7 @@ static int azx_probe_continue(struct azx *chip) display_power(chip, false); complete_all(&hda->probe_wait); to_hda_bus(bus)->bus_probing = 0; + hda->probe_retry = 0; return 0; } @@ -2341,7 +2364,7 @@ static void azx_remove(struct pci_dev *pci) * device during cancel_work_sync() call. */ device_unlock(&pci->dev); - cancel_work_sync(&hda->probe_work); + cancel_delayed_work_sync(&hda->probe_work); device_lock(&pci->dev); snd_card_free(card); @@ -2357,7 +2380,7 @@ static void azx_shutdown(struct pci_dev *pci) return; chip = card->private_data; if (chip && chip->running) - azx_shutdown_chip(chip); + __azx_shutdown_chip(chip, true); } /* PCI IDs */ diff --git a/sound/pci/hda/hda_intel.h b/sound/pci/hda/hda_intel.h index 3fb119f090..0f39418f93 100644 --- a/sound/pci/hda/hda_intel.h +++ b/sound/pci/hda/hda_intel.h @@ -14,7 +14,7 @@ struct hda_intel { /* sync probing */ struct completion probe_wait; - struct work_struct probe_work; + struct delayed_work probe_work; /* card list (for power_save trigger) */ struct list_head list; @@ -30,6 +30,8 @@ struct hda_intel { unsigned int freed:1; /* resources already released */ bool need_i915_power:1; /* the hda controller needs i915 power */ + + int probe_retry; /* being probe-retry */ }; #endif diff --git a/sound/pci/hda/patch_cs8409.c b/sound/pci/hda/patch_cs8409.c index 3c7ef55d01..31ff11ab86 100644 --- a/sound/pci/hda/patch_cs8409.c +++ b/sound/pci/hda/patch_cs8409.c @@ -1207,6 +1207,9 @@ void dolphin_fixups(struct hda_codec *codec, const struct hda_fixup *fix, int ac snd_hda_jack_add_kctl(codec, DOLPHIN_LO_PIN_NID, "Line Out", true, SND_JACK_HEADPHONE, NULL); + snd_hda_jack_add_kctl(codec, DOLPHIN_AMIC_PIN_NID, "Microphone", true, + SND_JACK_MICROPHONE, NULL); + cs8409_fix_caps(codec, DOLPHIN_HP_PIN_NID); cs8409_fix_caps(codec, DOLPHIN_LO_PIN_NID); cs8409_fix_caps(codec, DOLPHIN_AMIC_PIN_NID); diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8b7a389b6a..22d27b12c4 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -526,6 +526,8 @@ static void alc_shutup_pins(struct hda_codec *codec) struct alc_spec *spec = codec->spec; switch (codec->core.vendor_id) { + case 0x10ec0236: + case 0x10ec0256: case 0x10ec0283: case 0x10ec0286: case 0x10ec0288: @@ -2537,7 +2539,8 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x67e1, "Clevo PB71[DE][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e5, "Clevo PC70D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x70d1, "Clevo PC70[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), - SND_PCI_QUIRK(0x1558, 0x7714, "Clevo X170", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x7714, "Clevo X170SM", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x7715, "Clevo X170KM-G", ALC1220_FIXUP_CLEVO_PB51ED), SND_PCI_QUIRK(0x1558, 0x9501, "Clevo P950HR", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x9506, "Clevo P955HQ", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x950a, "Clevo P955H[PR]", ALC1220_FIXUP_CLEVO_P950), @@ -3528,7 +3531,8 @@ static void alc256_shutup(struct hda_codec *codec) /* If disable 3k pulldown control for alc257, the Mic detection will not work correctly * when booting with headset plugged. 
So skip setting it for the codec alc257 */ - if (codec->core.vendor_id != 0x10ec0257) + if (spec->codec_variant != ALC269_TYPE_ALC257 && + spec->codec_variant != ALC269_TYPE_ALC256) alc_update_coef_idx(codec, 0x46, 0, 3 << 12); if (!spec->no_shutup_pins) @@ -6429,12 +6433,44 @@ static void alc_fixup_thinkpad_acpi(struct hda_codec *codec, hda_fixup_thinkpad_acpi(codec, fix, action); } +/* Fixup for Lenovo Legion 15IMHg05 speaker output on headset removal. */ +static void alc287_fixup_legion_15imhg05_speakers(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + struct alc_spec *spec = codec->spec; + + switch (action) { + case HDA_FIXUP_ACT_PRE_PROBE: + spec->gen.suppress_auto_mute = 1; + break; + } +} + /* for alc295_fixup_hp_top_speakers */ #include "hp_x360_helper.c" /* for alc285_fixup_ideapad_s740_coef() */ #include "ideapad_s740_helper.c" +static void alc256_fixup_tongfang_reset_persistent_settings(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + /* + * A certain other OS sets these coeffs to different values. On at least one TongFang + * barebone these settings might survive even a cold reboot. So to restore a clean slate the + * values are explicitly reset to default here. Without this, the external microphone is + * always in a plugged-in state, while the internal microphone is always in an unplugged + * state, breaking the ability to use the internal microphone. + */ + alc_write_coef_idx(codec, 0x24, 0x0000); + alc_write_coef_idx(codec, 0x26, 0x0000); + alc_write_coef_idx(codec, 0x29, 0x3000); + alc_write_coef_idx(codec, 0x37, 0xfe05); + alc_write_coef_idx(codec, 0x45, 0x5089); +} + enum { ALC269_FIXUP_GPIO2, ALC269_FIXUP_SONY_VAIO, @@ -6646,6 +6682,11 @@ enum { ALC623_FIXUP_LENOVO_THINKSTATION_P340, ALC255_FIXUP_ACER_HEADPHONE_AND_MIC, ALC236_FIXUP_HP_LIMIT_INT_MIC_BOOST, + ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS, + ALC287_FIXUP_LEGION_15IMHG05_AUTOMUTE, + ALC287_FIXUP_YOGA7_14ITL_SPEAKERS, + ALC287_FIXUP_13S_GEN2_SPEAKERS, + ALC256_FIXUP_TONGFANG_RESET_PERSISTENT_SETTINGS, }; static const struct hda_fixup alc269_fixups[] = { @@ -8236,6 +8277,117 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF, }, + [ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS] = { + .type = HDA_FIXUP_VERBS, + //.v.verbs = legion_15imhg05_coefs, + .v.verbs = (const struct hda_verb[]) { + // set left speaker Legion 7i. + { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x41 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xc }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x1a }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + // set right speaker Legion 7i. 
+ { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x42 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xc }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2a }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + {} + }, + .chained = true, + .chain_id = ALC287_FIXUP_LEGION_15IMHG05_AUTOMUTE, + }, + [ALC287_FIXUP_LEGION_15IMHG05_AUTOMUTE] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc287_fixup_legion_15imhg05_speakers, + .chained = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE, + }, + [ALC287_FIXUP_YOGA7_14ITL_SPEAKERS] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + // set left speaker Yoga 7i. + { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x41 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xc }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x1a }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + // set right speaker Yoga 7i. + { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x46 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xc }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2a }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + {} + }, + .chained = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE, + }, + [ALC287_FIXUP_13S_GEN2_SPEAKERS] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x41 }, + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x42 }, + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + {} + }, + .chained = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE, + }, + [ALC256_FIXUP_TONGFANG_RESET_PERSISTENT_SETTINGS] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc256_fixup_tongfang_reset_persistent_settings, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -8327,6 +8479,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0a30, "Dell", ALC236_FIXUP_DELL_AIO_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0a58, "Dell", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0a61, "Dell XPS 15 9510", ALC289_FIXUP_DUAL_SPK), + SND_PCI_QUIRK(0x1028, 0x0a62, "Dell Precision 5560", ALC289_FIXUP_DUAL_SPK), + SND_PCI_QUIRK(0x1028, 0x0a9d, "Dell Latitude 5430", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1028, 0x0a9e, "Dell Latitude 5430", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE), 
SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), @@ -8630,6 +8785,10 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940", ALC298_FIXUP_LENOVO_SPK_VOLUME), SND_PCI_QUIRK(0x17aa, 0x3827, "Ideapad S740", ALC285_FIXUP_IDEAPAD_S740_COEF), SND_PCI_QUIRK(0x17aa, 0x3843, "Yoga 9i", ALC287_FIXUP_IDEAPAD_BASS_SPK_AMP), + SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), + SND_PCI_QUIRK(0x17aa, 0x3852, "Lenovo Yoga 7 14ITL5", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), + SND_PCI_QUIRK(0x17aa, 0x3853, "Lenovo Yoga 7 15ITL5", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), + SND_PCI_QUIRK(0x17aa, 0x3819, "Lenovo 13s Gen2 ITL", ALC287_FIXUP_13S_GEN2_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI), SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC), SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo B50-70", ALC269_FIXUP_DMIC_THINKPAD_ACPI), @@ -8660,6 +8819,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */ SND_PCI_QUIRK(0x1c06, 0x2013, "Lemote A1802", ALC269_FIXUP_LEMOTE_A1802), SND_PCI_QUIRK(0x1c06, 0x2015, "Lemote A190X", ALC269_FIXUP_LEMOTE_A190X), + SND_PCI_QUIRK(0x1d05, 0x1132, "TongFang PHxTxX1", ALC256_FIXUP_TONGFANG_RESET_PERSISTENT_SETTINGS), SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC), @@ -10037,6 +10197,9 @@ enum { ALC671_FIXUP_HP_HEADSET_MIC2, ALC662_FIXUP_ACER_X2660G_HEADSET_MODE, ALC662_FIXUP_ACER_NITRO_HEADSET_MODE, + ALC668_FIXUP_ASUS_NO_HEADSET_MIC, + ALC668_FIXUP_HEADSET_MIC, + ALC668_FIXUP_MIC_DET_COEF, }; static const struct hda_fixup alc662_fixups[] = { @@ -10420,6 +10583,29 @@ static const struct hda_fixup alc662_fixups[] = { .chained = true, .chain_id = ALC662_FIXUP_USI_FUNC }, + [ALC668_FIXUP_ASUS_NO_HEADSET_MIC] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x1b, 0x04a1112c }, + { } + }, + .chained = true, + .chain_id = ALC668_FIXUP_HEADSET_MIC + }, + [ALC668_FIXUP_HEADSET_MIC] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc269_fixup_headset_mic, + .chained = true, + .chain_id = ALC668_FIXUP_MIC_DET_COEF + }, + [ALC668_FIXUP_MIC_DET_COEF] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + { 0x20, AC_VERB_SET_COEF_INDEX, 0x15 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0d60 }, + {} + }, + }, }; static const struct snd_pci_quirk alc662_fixup_tbl[] = { @@ -10455,6 +10641,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x15a7, "ASUS UX51VZH", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x177d, "ASUS N551", ALC668_FIXUP_ASUS_Nx51), SND_PCI_QUIRK(0x1043, 0x17bd, "ASUS N751", ALC668_FIXUP_ASUS_Nx51), + SND_PCI_QUIRK(0x1043, 0x185d, "ASUS G551JW", ALC668_FIXUP_ASUS_NO_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1963, "ASUS X71SL", ALC662_FIXUP_ASUS_MODE8), SND_PCI_QUIRK(0x1043, 0x1b73, "ASUS N55SF", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x1bf3, "ASUS N76VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), diff --git a/sound/soc/bcm/Kconfig b/sound/soc/bcm/Kconfig index d65df373c3..c85714895f 100644 --- 
a/sound/soc/bcm/Kconfig +++ b/sound/soc/bcm/Kconfig @@ -79,7 +79,8 @@ config SND_BCM2708_SOC_HIFIBERRY_DACPLUSADCPRO tristate "Support for HifiBerry DAC+ADC PRO" depends on SND_BCM2708_SOC_I2S || SND_BCM2835_SOC_I2S select SND_SOC_PCM512x_I2C - select SND_SOC_PCM186X_I2C + select SND_SOC_PCM186X_I2C + select SND_SOC_TPA6130A2 select COMMON_CLK_HIFIBERRY_DACPRO help Say Y or M if you want to add support for HifiBerry DAC+ADC PRO. diff --git a/sound/soc/bcm/hifiberry_dacplusadcpro.c b/sound/soc/bcm/hifiberry_dacplusadcpro.c index 79eccdb4dc..517a70fba7 100644 --- a/sound/soc/bcm/hifiberry_dacplusadcpro.c +++ b/sound/soc/bcm/hifiberry_dacplusadcpro.c @@ -4,8 +4,8 @@ * Author: Daniel Matuschek, Stuart MacLean * Copyright 2014-2015 * based on code by Florian Meier - * ADC added by Joerg Schambacher - * Copyright 2018-19 + * ADC, HP added by Joerg Schambacher + * Copyright 2018-21 * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -468,6 +469,15 @@ static struct snd_soc_dai_link snd_rpi_hifiberry_dacplusadcpro_dai[] = { }, }; +/* aux device for optional headphone amp */ +static struct snd_soc_aux_dev hifiberry_dacplusadcpro_aux_devs[] = { + { + .dlc = { + .name = "tpa6130a2.1-0060", + }, + }, +}; + /* audio machine driver */ static struct snd_soc_card snd_rpi_hifiberry_dacplusadcpro = { .name = "snd_rpi_hifiberry_dacplusadcpro", @@ -477,10 +487,68 @@ static struct snd_soc_card snd_rpi_hifiberry_dacplusadcpro = { .num_links = ARRAY_SIZE(snd_rpi_hifiberry_dacplusadcpro_dai), }; +static int hb_hp_detect(void) +{ + struct i2c_adapter *adap = i2c_get_adapter(1); + int ret; + struct i2c_client tpa_i2c_client = { + .addr = 0x60, + .adapter = adap, + }; + + if (!adap) + return -EPROBE_DEFER; /* I2C module not yet available */ + + ret = i2c_smbus_read_byte(&tpa_i2c_client) >= 0; + i2c_put_adapter(adap); + return ret; +}; + +static struct property tpa_enable_prop = { + .name = "status", + .length = 4 + 1, /* length 'okay' + 1 */ + .value = "okay", + }; + static int snd_rpi_hifiberry_dacplusadcpro_probe(struct platform_device *pdev) { int ret = 0, i = 0; struct snd_soc_card *card = &snd_rpi_hifiberry_dacplusadcpro; + struct device_node *tpa_node; + struct property *tpa_prop; + struct of_changeset ocs; + int len; + + /* probe for head phone amp */ + ret = hb_hp_detect(); + if (ret < 0) + return ret; + if (ret) { + card->aux_dev = hifiberry_dacplusadcpro_aux_devs; + card->num_aux_devs = + ARRAY_SIZE(hifiberry_dacplusadcpro_aux_devs); + tpa_node = of_find_compatible_node(NULL, NULL, "ti,tpa6130a2"); + tpa_prop = of_find_property(tpa_node, "status", &len); + + if (strcmp((char *)tpa_prop->value, "okay")) { + /* and activate headphone using change_sets */ + dev_info(&pdev->dev, "activating headphone amplifier"); + of_changeset_init(&ocs); + ret = of_changeset_update_property(&ocs, tpa_node, + &tpa_enable_prop); + if (ret) { + dev_err(&pdev->dev, + "cannot activate headphone amplifier\n"); + return -ENODEV; + } + ret = of_changeset_apply(&ocs); + if (ret) { + dev_err(&pdev->dev, + "cannot activate headphone amplifier\n"); + return -ENODEV; + } + } + } snd_rpi_hifiberry_dacplusadcpro.dev = &pdev->dev; if (pdev->dev.of_node) { @@ -531,7 +599,7 @@ static struct platform_driver snd_rpi_hifiberry_dacplusadcpro_driver = { module_platform_driver(snd_rpi_hifiberry_dacplusadcpro_driver); -MODULE_AUTHOR("Joerg Schambacher "); +MODULE_AUTHOR("Joerg Schambacher "); 
MODULE_AUTHOR("Daniel Matuschek "); MODULE_DESCRIPTION("ASoC Driver for HiFiBerry DAC+ADC"); MODULE_LICENSE("GPL v2"); diff --git a/sound/soc/fsl/fsl_esai.c b/sound/soc/fsl/fsl_esai.c index a961f837cd..bda66b30e0 100644 --- a/sound/soc/fsl/fsl_esai.c +++ b/sound/soc/fsl/fsl_esai.c @@ -1073,6 +1073,16 @@ static int fsl_esai_probe(struct platform_device *pdev) if (ret < 0) goto err_pm_get_sync; + /* + * Register platform component before registering cpu dai for there + * is not defer probe for platform component in snd_soc_add_pcm_runtime(). + */ + ret = imx_pcm_dma_init(pdev, IMX_ESAI_DMABUF_SIZE); + if (ret) { + dev_err(&pdev->dev, "failed to init imx pcm dma: %d\n", ret); + goto err_pm_get_sync; + } + ret = devm_snd_soc_register_component(&pdev->dev, &fsl_esai_component, &fsl_esai_dai, 1); if (ret) { @@ -1082,12 +1092,6 @@ static int fsl_esai_probe(struct platform_device *pdev) INIT_WORK(&esai_priv->work, fsl_esai_hw_reset); - ret = imx_pcm_dma_init(pdev, IMX_ESAI_DMABUF_SIZE); - if (ret) { - dev_err(&pdev->dev, "failed to init imx pcm dma: %d\n", ret); - goto err_pm_get_sync; - } - return ret; err_pm_get_sync: diff --git a/sound/soc/fsl/fsl_micfil.c b/sound/soc/fsl/fsl_micfil.c index 8c0c75ce94..9f90989ac5 100644 --- a/sound/soc/fsl/fsl_micfil.c +++ b/sound/soc/fsl/fsl_micfil.c @@ -737,18 +737,23 @@ static int fsl_micfil_probe(struct platform_device *pdev) pm_runtime_enable(&pdev->dev); regcache_cache_only(micfil->regmap, true); + /* + * Register platform component before registering cpu dai for there + * is not defer probe for platform component in snd_soc_add_pcm_runtime(). + */ + ret = devm_snd_dmaengine_pcm_register(&pdev->dev, NULL, 0); + if (ret) { + dev_err(&pdev->dev, "failed to pcm register\n"); + return ret; + } + ret = devm_snd_soc_register_component(&pdev->dev, &fsl_micfil_component, &fsl_micfil_dai, 1); if (ret) { dev_err(&pdev->dev, "failed to register component %s\n", fsl_micfil_component.name); - return ret; } - ret = devm_snd_dmaengine_pcm_register(&pdev->dev, NULL, 0); - if (ret) - dev_err(&pdev->dev, "failed to pcm register\n"); - return ret; } diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index 223fcd15bf..38f6362099 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -1152,11 +1152,10 @@ static int fsl_sai_probe(struct platform_device *pdev) if (ret < 0) goto err_pm_get_sync; - ret = devm_snd_soc_register_component(&pdev->dev, &fsl_component, - &sai->cpu_dai_drv, 1); - if (ret) - goto err_pm_get_sync; - + /* + * Register platform component before registering cpu dai for there + * is not defer probe for platform component in snd_soc_add_pcm_runtime(). + */ if (sai->soc_data->use_imx_pcm) { ret = imx_pcm_dma_init(pdev, IMX_SAI_DMABUF_SIZE); if (ret) @@ -1167,6 +1166,11 @@ static int fsl_sai_probe(struct platform_device *pdev) goto err_pm_get_sync; } + ret = devm_snd_soc_register_component(&pdev->dev, &fsl_component, + &sai->cpu_dai_drv, 1); + if (ret) + goto err_pm_get_sync; + return ret; err_pm_get_sync: diff --git a/sound/soc/fsl/fsl_spdif.c b/sound/soc/fsl/fsl_spdif.c index 8ffb1a6048..1c53719bb6 100644 --- a/sound/soc/fsl/fsl_spdif.c +++ b/sound/soc/fsl/fsl_spdif.c @@ -1434,6 +1434,16 @@ static int fsl_spdif_probe(struct platform_device *pdev) pm_runtime_enable(&pdev->dev); regcache_cache_only(spdif_priv->regmap, true); + /* + * Register platform component before registering cpu dai for there + * is not defer probe for platform component in snd_soc_add_pcm_runtime(). 
+ */ + ret = imx_pcm_dma_init(pdev, IMX_SPDIF_DMABUF_SIZE); + if (ret) { + dev_err_probe(&pdev->dev, ret, "imx_pcm_dma_init failed\n"); + goto err_pm_disable; + } + ret = devm_snd_soc_register_component(&pdev->dev, &fsl_spdif_component, &spdif_priv->cpu_dai_drv, 1); if (ret) { @@ -1441,12 +1451,6 @@ static int fsl_spdif_probe(struct platform_device *pdev) goto err_pm_disable; } - ret = imx_pcm_dma_init(pdev, IMX_SPDIF_DMABUF_SIZE); - if (ret) { - dev_err_probe(&pdev->dev, ret, "imx_pcm_dma_init failed\n"); - goto err_pm_disable; - } - return ret; err_pm_disable: diff --git a/sound/soc/fsl/fsl_xcvr.c b/sound/soc/fsl/fsl_xcvr.c index 31c5ee641f..7ba2fd1513 100644 --- a/sound/soc/fsl/fsl_xcvr.c +++ b/sound/soc/fsl/fsl_xcvr.c @@ -1215,18 +1215,23 @@ static int fsl_xcvr_probe(struct platform_device *pdev) pm_runtime_enable(dev); regcache_cache_only(xcvr->regmap, true); + /* + * Register platform component before registering cpu dai for there + * is not defer probe for platform component in snd_soc_add_pcm_runtime(). + */ + ret = devm_snd_dmaengine_pcm_register(dev, NULL, 0); + if (ret) { + dev_err(dev, "failed to pcm register\n"); + return ret; + } + ret = devm_snd_soc_register_component(dev, &fsl_xcvr_comp, &fsl_xcvr_dai, 1); if (ret) { dev_err(dev, "failed to register component %s\n", fsl_xcvr_comp.name); - return ret; } - ret = devm_snd_dmaengine_pcm_register(dev, NULL, 0); - if (ret) - dev_err(dev, "failed to pcm register\n"); - return ret; } diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 6602eda89e..6b06248a93 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -929,6 +929,11 @@ static int create_sdw_dailink(struct snd_soc_card *card, cpus + *cpu_id, cpu_dai_num, codecs, codec_num, NULL, &sdw_ops); + /* + * SoundWire DAILINKs use 'stream' functions and Bank Switch operations + * based on wait_for_completion(), tag them as 'nonatomic'. + */ + dai_links[*be_index].nonatomic = true; ret = set_codec_init_func(card, link, dai_links + (*be_index)++, playback, group_id); diff --git a/sound/soc/mediatek/Kconfig b/sound/soc/mediatek/Kconfig index 5a2f4667d5..81ad2dcee9 100644 --- a/sound/soc/mediatek/Kconfig +++ b/sound/soc/mediatek/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config SND_SOC_MEDIATEK tristate + select REGMAP_MMIO config SND_SOC_MT2701 tristate "ASoC support for Mediatek MT2701 chip" @@ -188,7 +189,9 @@ config SND_SOC_MT8192_MT6359_RT1015_RT5682 config SND_SOC_MT8195 tristate "ASoC support for Mediatek MT8195 chip" depends on ARCH_MEDIATEK || COMPILE_TEST + depends on COMMON_CLK select SND_SOC_MEDIATEK + select MFD_SYSCON if SND_SOC_MT6359 help This adds ASoC platform driver support for Mediatek MT8195 chip that can be used with other codecs. 
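The sof_sdw hunk above tags SoundWire back-end links as nonatomic because their trigger path can sleep in wait_for_completion() during a bank switch; with the flag clear, ASoC would run those PCM ops in atomic context and the sleep would splat. A minimal sketch of what the flag means on a DPCM back-end link (the link name and surrounding fields are illustrative, not the actual sof_sdw tables):

	#include <sound/soc.h>

	/*
	 * Hypothetical DPCM back-end link; .nonatomic is the point here. A
	 * nonatomic link has its PCM ops called from sleepable context, which
	 * a bank switch built on wait_for_completion() requires.
	 */
	static struct snd_soc_dai_link example_sdw_be = {
		.name      = "SDW0-Playback",	/* illustrative name */
		.no_pcm    = 1,			/* back-end of a DPCM card */
		.nonatomic = true,		/* trigger path may sleep */
	};
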
diff --git a/sound/soc/mediatek/common/mtk-afe-fe-dai.c b/sound/soc/mediatek/common/mtk-afe-fe-dai.c index baaa5881b1..e95c7c018e 100644 --- a/sound/soc/mediatek/common/mtk-afe-fe-dai.c +++ b/sound/soc/mediatek/common/mtk-afe-fe-dai.c @@ -334,9 +334,11 @@ int mtk_afe_suspend(struct snd_soc_component *component) devm_kcalloc(dev, afe->reg_back_up_list_num, sizeof(unsigned int), GFP_KERNEL); - for (i = 0; i < afe->reg_back_up_list_num; i++) - regmap_read(regmap, afe->reg_back_up_list[i], - &afe->reg_back_up[i]); + if (afe->reg_back_up) { + for (i = 0; i < afe->reg_back_up_list_num; i++) + regmap_read(regmap, afe->reg_back_up_list[i], + &afe->reg_back_up[i]); + } afe->suspended = true; afe->runtime_suspend(dev); @@ -356,12 +358,13 @@ int mtk_afe_resume(struct snd_soc_component *component) afe->runtime_resume(dev); - if (!afe->reg_back_up) + if (!afe->reg_back_up) { dev_dbg(dev, "%s no reg_backup\n", __func__); - - for (i = 0; i < afe->reg_back_up_list_num; i++) - mtk_regmap_write(regmap, afe->reg_back_up_list[i], - afe->reg_back_up[i]); + } else { + for (i = 0; i < afe->reg_back_up_list_num; i++) + mtk_regmap_write(regmap, afe->reg_back_up_list[i], + afe->reg_back_up[i]); + } afe->suspended = false; return 0; diff --git a/sound/soc/mediatek/mt8195/mt8195-mt6359-rt1019-rt5682.c b/sound/soc/mediatek/mt8195/mt8195-mt6359-rt1019-rt5682.c index c97ace7387..de09f67c04 100644 --- a/sound/soc/mediatek/mt8195/mt8195-mt6359-rt1019-rt5682.c +++ b/sound/soc/mediatek/mt8195/mt8195-mt6359-rt1019-rt5682.c @@ -424,8 +424,8 @@ static int mt8195_hdmi_codec_init(struct snd_soc_pcm_runtime *rtd) return snd_soc_component_set_jack(cmpnt_codec, &priv->hdmi_jack, NULL); } -static int mt8195_hdmitx_dptx_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, - struct snd_pcm_hw_params *params) +static int mt8195_dptx_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, + struct snd_pcm_hw_params *params) { /* fix BE i2s format to 32bit, clean param mask first */ @@ -902,7 +902,7 @@ static struct snd_soc_dai_link mt8195_mt6359_rt1019_rt5682_dai_links[] = { .no_pcm = 1, .dpcm_playback = 1, .ops = &mt8195_dptx_ops, - .be_hw_params_fixup = mt8195_hdmitx_dptx_hw_params_fixup, + .be_hw_params_fixup = mt8195_dptx_hw_params_fixup, SND_SOC_DAILINK_REG(DPTX_BE), }, [DAI_LINK_ETDM1_IN_BE] = { @@ -953,7 +953,6 @@ static struct snd_soc_dai_link mt8195_mt6359_rt1019_rt5682_dai_links[] = { SND_SOC_DAIFMT_NB_NF | SND_SOC_DAIFMT_CBS_CFS, .dpcm_playback = 1, - .be_hw_params_fixup = mt8195_hdmitx_dptx_hw_params_fixup, SND_SOC_DAILINK_REG(ETDM3_OUT_BE), }, [DAI_LINK_PCM1_BE] = { diff --git a/sound/soc/sof/core.c b/sound/soc/sof/core.c index 3e4dd4a863..59d0d7b2b5 100644 --- a/sound/soc/sof/core.c +++ b/sound/soc/sof/core.c @@ -371,7 +371,6 @@ int snd_sof_device_remove(struct device *dev) dev_warn(dev, "error: %d failed to prepare DSP for device removal", ret); - snd_sof_fw_unload(sdev); snd_sof_ipc_free(sdev); snd_sof_free_debug(sdev); snd_sof_free_trace(sdev); @@ -394,8 +393,7 @@ int snd_sof_device_remove(struct device *dev) snd_sof_remove(sdev); /* release firmware */ - release_firmware(pdata->fw); - pdata->fw = NULL; + snd_sof_fw_unload(sdev); return 0; } diff --git a/sound/soc/sof/imx/imx8.c b/sound/soc/sof/imx/imx8.c index 12fedf0984..7e9723a10d 100644 --- a/sound/soc/sof/imx/imx8.c +++ b/sound/soc/sof/imx/imx8.c @@ -365,7 +365,14 @@ static int imx8_remove(struct snd_sof_dev *sdev) /* on i.MX8 there is 1 to 1 match between type and BAR idx */ static int imx8_get_bar_index(struct snd_sof_dev *sdev, u32 type) { - return type; + /* Only IRAM and 
SRAM bars are valid */ + switch (type) { + case SOF_FW_BLK_TYPE_IRAM: + case SOF_FW_BLK_TYPE_SRAM: + return type; + default: + return -EINVAL; + } } static void imx8_ipc_msg_data(struct snd_sof_dev *sdev, diff --git a/sound/soc/sof/imx/imx8m.c b/sound/soc/sof/imx/imx8m.c index cb822d9537..892e1482f9 100644 --- a/sound/soc/sof/imx/imx8m.c +++ b/sound/soc/sof/imx/imx8m.c @@ -228,7 +228,14 @@ static int imx8m_remove(struct snd_sof_dev *sdev) /* on i.MX8 there is 1 to 1 match between type and BAR idx */ static int imx8m_get_bar_index(struct snd_sof_dev *sdev, u32 type) { - return type; + /* Only IRAM and SRAM bars are valid */ + switch (type) { + case SOF_FW_BLK_TYPE_IRAM: + case SOF_FW_BLK_TYPE_SRAM: + return type; + default: + return -EINVAL; + } } static void imx8m_ipc_msg_data(struct snd_sof_dev *sdev, diff --git a/sound/soc/sof/loader.c b/sound/soc/sof/loader.c index 2b38a77cd5..bb79c77775 100644 --- a/sound/soc/sof/loader.c +++ b/sound/soc/sof/loader.c @@ -729,10 +729,10 @@ int snd_sof_load_firmware_raw(struct snd_sof_dev *sdev) ret = request_firmware(&plat_data->fw, fw_filename, sdev->dev); if (ret < 0) { - dev_err(sdev->dev, "error: request firmware %s failed err: %d\n", - fw_filename, ret); dev_err(sdev->dev, - "you may need to download the firmware from https://github.com/thesofproject/sof-bin/\n"); + "error: sof firmware file is missing, you might need to\n"); + dev_err(sdev->dev, + " download it from https://github.com/thesofproject/sof-bin/\n"); goto err; } else { dev_dbg(sdev->dev, "request_firmware %s successful\n", @@ -880,5 +880,7 @@ EXPORT_SYMBOL(snd_sof_run_firmware); void snd_sof_fw_unload(struct snd_sof_dev *sdev) { /* TODO: support module unloading at runtime */ + release_firmware(sdev->pdata->fw); + sdev->pdata->fw = NULL; } EXPORT_SYMBOL(snd_sof_fw_unload); diff --git a/sound/soc/sof/trace.c b/sound/soc/sof/trace.c index f72a6e83e6..58f6ca5cf4 100644 --- a/sound/soc/sof/trace.c +++ b/sound/soc/sof/trace.c @@ -530,7 +530,6 @@ void snd_sof_trace_notify_for_error(struct snd_sof_dev *sdev) return; if (sdev->dtrace_is_enabled) { - dev_err(sdev->dev, "error: waking up any trace sleepers\n"); sdev->dtrace_error = true; wake_up(&sdev->trace_sleep); } diff --git a/sound/usb/card.c b/sound/usb/card.c index 236604f1cd..fbbe330a52 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -1060,7 +1060,7 @@ static int usb_audio_suspend(struct usb_interface *intf, pm_message_t message) return 0; } -static int __usb_audio_resume(struct usb_interface *intf, bool reset_resume) +static int usb_audio_resume(struct usb_interface *intf) { struct snd_usb_audio *chip = usb_get_intfdata(intf); struct snd_usb_stream *as; @@ -1086,7 +1086,7 @@ static int __usb_audio_resume(struct usb_interface *intf, bool reset_resume) * we just notify and restart the mixers */ list_for_each_entry(mixer, &chip->mixer_list, list) { - err = snd_usb_mixer_resume(mixer, reset_resume); + err = snd_usb_mixer_resume(mixer); if (err < 0) goto err_out; } @@ -1106,20 +1106,10 @@ static int __usb_audio_resume(struct usb_interface *intf, bool reset_resume) atomic_dec(&chip->active); /* allow autopm after this point */ return err; } - -static int usb_audio_resume(struct usb_interface *intf) -{ - return __usb_audio_resume(intf, false); -} - -static int usb_audio_reset_resume(struct usb_interface *intf) -{ - return __usb_audio_resume(intf, true); -} #else #define usb_audio_suspend NULL #define usb_audio_resume NULL -#define usb_audio_reset_resume NULL +#define usb_audio_resume NULL #endif /* CONFIG_PM */ static const struct 
usb_device_id usb_audio_ids [] = { @@ -1141,7 +1131,7 @@ static struct usb_driver usb_audio_driver = { .disconnect = usb_audio_disconnect, .suspend = usb_audio_suspend, .resume = usb_audio_resume, - .reset_resume = usb_audio_reset_resume, + .reset_resume = usb_audio_resume, .id_table = usb_audio_ids, .supports_autosuspend = 1, }; diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 43bc59575a..a2ce535df1 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -3653,33 +3653,16 @@ static int restore_mixer_value(struct usb_mixer_elem_list *list) return 0; } -static int default_mixer_reset_resume(struct usb_mixer_elem_list *list) -{ - int err; - - if (list->resume) { - err = list->resume(list); - if (err < 0) - return err; - } - return restore_mixer_value(list); -} - -int snd_usb_mixer_resume(struct usb_mixer_interface *mixer, bool reset_resume) +int snd_usb_mixer_resume(struct usb_mixer_interface *mixer) { struct usb_mixer_elem_list *list; - usb_mixer_elem_resume_func_t f; int id, err; /* restore cached mixer values */ for (id = 0; id < MAX_ID_ELEMS; id++) { for_each_mixer_elem(list, mixer, id) { - if (reset_resume) - f = list->reset_resume; - else - f = list->resume; - if (f) { - err = f(list); + if (list->resume) { + err = list->resume(list); if (err < 0) return err; } @@ -3700,7 +3683,6 @@ void snd_usb_mixer_elem_init_std(struct usb_mixer_elem_list *list, list->id = unitid; list->dump = snd_usb_mixer_dump_cval; #ifdef CONFIG_PM - list->resume = NULL; - list->reset_resume = default_mixer_reset_resume; + list->resume = restore_mixer_value; #endif } diff --git a/sound/usb/mixer.h b/sound/usb/mixer.h index 876bbc9a71..98ea24d91d 100644 --- a/sound/usb/mixer.h +++ b/sound/usb/mixer.h @@ -70,7 +70,6 @@ struct usb_mixer_elem_list { bool is_std_info; usb_mixer_elem_dump_func_t dump; usb_mixer_elem_resume_func_t resume; - usb_mixer_elem_resume_func_t reset_resume; }; /* iterate over mixer element list of the given unit id */ @@ -121,7 +120,7 @@ int snd_usb_mixer_vol_tlv(struct snd_kcontrol *kcontrol, int op_flag, #ifdef CONFIG_PM int snd_usb_mixer_suspend(struct usb_mixer_interface *mixer); -int snd_usb_mixer_resume(struct usb_mixer_interface *mixer, bool reset_resume); +int snd_usb_mixer_resume(struct usb_mixer_interface *mixer); #endif int snd_usb_set_cur_mix_value(struct usb_mixer_elem_info *cval, int channel, diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c index a66ce0375f..46082dc57b 100644 --- a/sound/usb/mixer_quirks.c +++ b/sound/usb/mixer_quirks.c @@ -151,7 +151,7 @@ static int add_single_ctl_with_resume(struct usb_mixer_interface *mixer, *listp = list; list->mixer = mixer; list->id = id; - list->reset_resume = resume; + list->resume = resume; kctl = snd_ctl_new1(knew, list); if (!kctl) { kfree(list); diff --git a/sound/usb/mixer_scarlett_gen2.c b/sound/usb/mixer_scarlett_gen2.c index 3d5848d548..53ebabf424 100644 --- a/sound/usb/mixer_scarlett_gen2.c +++ b/sound/usb/mixer_scarlett_gen2.c @@ -2450,6 +2450,8 @@ static int scarlett2_update_monitor_other(struct usb_mixer_interface *mixer) err = scarlett2_usb_get_config(mixer, SCARLETT2_CONFIG_TALKBACK_MAP, 1, &bitmap); + if (err < 0) + return err; for (i = 0; i < num_mixes; i++, bitmap >>= 1) private->talkback_map[i] = bitmap & 1; } diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index e03043f7da..de18fff692 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -77,6 +77,48 @@ /* E-Mu 0204 USB */ { USB_DEVICE_VENDOR_SPEC(0x041e, 0x3f19) }, +/* + * Creative Technology, Ltd Live! 
Cam Sync HD [VF0770] + * The device advertises 8 formats, but only a rate of 48kHz is honored by the + * hardware and 24 bits give chopped audio, so only report the one working + * combination. + */ +{ + USB_DEVICE(0x041e, 0x4095), + .driver_info = (unsigned long) &(const struct snd_usb_audio_quirk) { + .ifnum = QUIRK_ANY_INTERFACE, + .type = QUIRK_COMPOSITE, + .data = &(const struct snd_usb_audio_quirk[]) { + { + .ifnum = 2, + .type = QUIRK_AUDIO_STANDARD_MIXER, + }, + { + .ifnum = 3, + .type = QUIRK_AUDIO_FIXED_ENDPOINT, + .data = &(const struct audioformat) { + .formats = SNDRV_PCM_FMTBIT_S16_LE, + .channels = 2, + .fmt_bits = 16, + .iface = 3, + .altsetting = 4, + .altset_idx = 4, + .endpoint = 0x82, + .ep_attr = 0x05, + .rates = SNDRV_PCM_RATE_48000, + .rate_min = 48000, + .rate_max = 48000, + .nr_rates = 1, + .rate_table = (unsigned int[]) { 48000 }, + }, + }, + { + .ifnum = -1 + }, + }, + }, +}, + /* * HP Wireless Audio * When not ignored, causes instability issues for some users, forcing them to diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 2095f091cc..fbd2bde7c0 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1902,6 +1902,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_CTL_MSG_DELAY | QUIRK_FLAG_IFACE_DELAY), VENDOR_FLG(0x07fd, /* MOTU */ QUIRK_FLAG_VALIDATE_RATES), + VENDOR_FLG(0x1235, /* Focusrite Novation */ + QUIRK_FLAG_VALIDATE_RATES), VENDOR_FLG(0x152a, /* Thesycon devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x1de7, /* Phoenix Audio */ diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index 1d84ec9db9..5859ca0a14 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -784,6 +784,7 @@ struct snd_rawmidi_status { #define SNDRV_RAWMIDI_IOCTL_PVERSION _IOR('W', 0x00, int) #define SNDRV_RAWMIDI_IOCTL_INFO _IOR('W', 0x01, struct snd_rawmidi_info) +#define SNDRV_RAWMIDI_IOCTL_USER_PVERSION _IOW('W', 0x02, int) #define SNDRV_RAWMIDI_IOCTL_PARAMS _IOWR('W', 0x10, struct snd_rawmidi_params) #define SNDRV_RAWMIDI_IOCTL_STATUS _IOWR('W', 0x20, struct snd_rawmidi_status) #define SNDRV_RAWMIDI_IOCTL_DROP _IOW('W', 0x30, int) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 88d8825fc6..e4f83c304e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6894,7 +6894,8 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) if (obj->gen_loader) { /* reset FDs */ - btf__set_fd(obj->btf, -1); + if (obj->btf) + btf__set_fd(obj->btf, -1); for (i = 0; i < obj->nr_maps; i++) obj->maps[i].fd = -1; if (!err) diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 10911a8cad..2df880cefd 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -1649,11 +1649,17 @@ static bool btf_is_non_static(const struct btf_type *t) static int find_glob_sym_btf(struct src_obj *obj, Elf64_Sym *sym, const char *sym_name, int *out_btf_sec_id, int *out_btf_id) { - int i, j, n = btf__get_nr_types(obj->btf), m, btf_id = 0; + int i, j, n, m, btf_id = 0; const struct btf_type *t; const struct btf_var_secinfo *vi; const char *name; + if (!obj->btf) { + pr_warn("failed to find BTF info for object '%s'\n", obj->filename); + return -EINVAL; + } + + n = btf__get_nr_types(obj->btf); for (i = 1; i <= n; i++) { t = btf__type_by_id(obj->btf, i); diff --git a/tools/lib/bpf/strset.c b/tools/lib/bpf/strset.c index 1fb8b49de1..ea65531815 100644 --- a/tools/lib/bpf/strset.c +++ b/tools/lib/bpf/strset.c @@ -88,6 +88,7 @@ void strset__free(struct 
strset *set) hashmap__free(set->strs_hash); free(set->strs_data); + free(set); } size_t strset__data_size(const struct strset *set) diff --git a/tools/lib/perf/tests/test-evlist.c b/tools/lib/perf/tests/test-evlist.c index c67c833991..ce91a582f0 100644 --- a/tools/lib/perf/tests/test-evlist.c +++ b/tools/lib/perf/tests/test-evlist.c @@ -40,7 +40,7 @@ static int test_stat_cpu(void) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK, }; - int err, cpu, tmp; + int err, idx; cpus = perf_cpu_map__new(NULL); __T("failed to create cpus", cpus); @@ -70,10 +70,10 @@ static int test_stat_cpu(void) perf_evlist__for_each_evsel(evlist, evsel) { cpus = perf_evsel__cpus(evsel); - perf_cpu_map__for_each_cpu(cpu, tmp, cpus) { + for (idx = 0; idx < perf_cpu_map__nr(cpus); idx++) { struct perf_counts_values counts = { .val = 0 }; - perf_evsel__read(evsel, cpu, 0, &counts); + perf_evsel__read(evsel, idx, 0, &counts); __T("failed to read value for evsel", counts.val != 0); } } diff --git a/tools/lib/perf/tests/test-evsel.c b/tools/lib/perf/tests/test-evsel.c index a184e48616..33ae933486 100644 --- a/tools/lib/perf/tests/test-evsel.c +++ b/tools/lib/perf/tests/test-evsel.c @@ -22,7 +22,7 @@ static int test_stat_cpu(void) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_CLOCK, }; - int err, cpu, tmp; + int err, idx; cpus = perf_cpu_map__new(NULL); __T("failed to create cpus", cpus); @@ -33,10 +33,10 @@ static int test_stat_cpu(void) err = perf_evsel__open(evsel, cpus, NULL); __T("failed to open evsel", err == 0); - perf_cpu_map__for_each_cpu(cpu, tmp, cpus) { + for (idx = 0; idx < perf_cpu_map__nr(cpus); idx++) { struct perf_counts_values counts = { .val = 0 }; - perf_evsel__read(evsel, cpu, 0, &counts); + perf_evsel__read(evsel, idx, 0, &counts); __T("failed to read value for evsel", counts.val != 0); } @@ -148,6 +148,7 @@ static int test_stat_user_read(int event) __T("failed to mmap evsel", err == 0); pc = perf_evsel__mmap_base(evsel, 0, 0); + __T("failed to get mmapped address", pc); #if defined(__i386__) || defined(__x86_64__) __T("userspace counter access not supported", pc->cap_user_rdpmc); diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index bc821056ab..0893436cc0 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -684,7 +684,7 @@ static int elf_add_alternative(struct elf *elf, sec = find_section_by_name(elf, ".altinstructions"); if (!sec) { sec = elf_create_section(elf, ".altinstructions", - SHF_ALLOC, size, 0); + SHF_ALLOC, 0, 0); if (!sec) { WARN_ELF("elf_create_section"); diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e5947fbb9e..06b5c164ae 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -292,7 +292,7 @@ static int decode_instructions(struct objtool_file *file) !strcmp(sec->name, ".entry.text")) sec->noinstr = true; - for (offset = 0; offset < sec->len; offset += insn->len) { + for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) { insn = malloc(sizeof(*insn)); if (!insn) { WARN("malloc failed"); @@ -307,7 +307,7 @@ static int decode_instructions(struct objtool_file *file) insn->offset = offset; ret = arch_decode_instruction(file->elf, sec, offset, - sec->len - offset, + sec->sh.sh_size - offset, &insn->len, &insn->type, &insn->immediate, &insn->stack_ops); @@ -349,9 +349,9 @@ static struct instruction *find_last_insn(struct objtool_file *file, { struct instruction *insn = NULL; unsigned int offset; - unsigned int end = (sec->len > 10) ? 
sec->len - 10 : 0; + unsigned int end = (sec->sh.sh_size > 10) ? sec->sh.sh_size - 10 : 0; - for (offset = sec->len - 1; offset >= end && !insn; offset--) + for (offset = sec->sh.sh_size - 1; offset >= end && !insn; offset--) insn = find_insn(file, sec, offset); return insn; @@ -389,7 +389,7 @@ static int add_dead_ends(struct objtool_file *file) insn = find_insn(file, reloc->sym->sec, reloc->addend); if (insn) insn = list_prev_entry(insn, list); - else if (reloc->addend == reloc->sym->sec->len) { + else if (reloc->addend == reloc->sym->sec->sh.sh_size) { insn = find_last_insn(file, reloc->sym->sec); if (!insn) { WARN("can't find unreachable insn at %s+0x%x", @@ -424,7 +424,7 @@ static int add_dead_ends(struct objtool_file *file) insn = find_insn(file, reloc->sym->sec, reloc->addend); if (insn) insn = list_prev_entry(insn, list); - else if (reloc->addend == reloc->sym->sec->len) { + else if (reloc->addend == reloc->sym->sec->sh.sh_size) { insn = find_last_insn(file, reloc->sym->sec); if (!insn) { WARN("can't find reachable insn at %s+0x%x", @@ -1561,14 +1561,14 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - if (sec->len % sizeof(struct unwind_hint)) { + if (sec->sh.sh_size % sizeof(struct unwind_hint)) { WARN("struct unwind_hint size mismatch"); return -1; } file->hints = true; - for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) { + for (i = 0; i < sec->sh.sh_size / sizeof(struct unwind_hint); i++) { hint = (struct unwind_hint *)sec->data->d_buf + i; reloc = find_reloc_by_dest(file->elf, sec, i * sizeof(*hint)); diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 8676c75987..fee03b744a 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -286,10 +286,9 @@ static int read_sections(struct elf *elf) return -1; } } - sec->len = sec->sh.sh_size; if (sec->sh.sh_flags & SHF_EXECINSTR) - elf->text_size += sec->len; + elf->text_size += sec->sh.sh_size; list_add_tail(&sec->list, &elf->sections); elf_hash_add(section, &sec->hash, sec->idx); @@ -509,6 +508,7 @@ int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, list_add_tail(&reloc->list, &sec->reloc->reloc_list); elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc)); + sec->reloc->sh.sh_size += sec->reloc->sh.sh_entsize; sec->reloc->changed = true; return 0; @@ -734,8 +734,8 @@ static int elf_add_string(struct elf *elf, struct section *strtab, char *str) data->d_size = strlen(str) + 1; data->d_align = 1; - len = strtab->len; - strtab->len += data->d_size; + len = strtab->sh.sh_size; + strtab->sh.sh_size += data->d_size; strtab->changed = true; return len; @@ -790,9 +790,9 @@ struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name) data->d_align = 1; data->d_type = ELF_T_SYM; - sym->idx = symtab->len / sizeof(sym->sym); + sym->idx = symtab->sh.sh_size / sizeof(sym->sym); - symtab->len += data->d_size; + symtab->sh.sh_size += data->d_size; symtab->changed = true; symtab_shndx = find_section_by_name(elf, ".symtab_shndx"); @@ -814,7 +814,7 @@ struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name) data->d_align = 4; data->d_type = ELF_T_WORD; - symtab_shndx->len += 4; + symtab_shndx->sh.sh_size += 4; symtab_shndx->changed = true; } @@ -855,7 +855,6 @@ struct section *elf_create_section(struct elf *elf, const char *name, } sec->idx = elf_ndxscn(s); - sec->len = size; sec->changed = true; sec->data = elf_newdata(s); @@ -979,63 +978,63 @@ static struct section *elf_create_reloc_section(struct elf *elf, } } -static int 
elf_rebuild_rel_reloc_section(struct section *sec, int nr) +static int elf_rebuild_rel_reloc_section(struct section *sec) { struct reloc *reloc; - int idx = 0, size; + int idx = 0; void *buf; /* Allocate a buffer for relocations */ - size = nr * sizeof(GElf_Rel); - buf = malloc(size); + buf = malloc(sec->sh.sh_size); if (!buf) { perror("malloc"); return -1; } sec->data->d_buf = buf; - sec->data->d_size = size; + sec->data->d_size = sec->sh.sh_size; sec->data->d_type = ELF_T_REL; - sec->sh.sh_size = size; - idx = 0; list_for_each_entry(reloc, &sec->reloc_list, list) { reloc->rel.r_offset = reloc->offset; reloc->rel.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); - gelf_update_rel(sec->data, idx, &reloc->rel); + if (!gelf_update_rel(sec->data, idx, &reloc->rel)) { + WARN_ELF("gelf_update_rel"); + return -1; + } idx++; } return 0; } -static int elf_rebuild_rela_reloc_section(struct section *sec, int nr) +static int elf_rebuild_rela_reloc_section(struct section *sec) { struct reloc *reloc; - int idx = 0, size; + int idx = 0; void *buf; /* Allocate a buffer for relocations with addends */ - size = nr * sizeof(GElf_Rela); - buf = malloc(size); + buf = malloc(sec->sh.sh_size); if (!buf) { perror("malloc"); return -1; } sec->data->d_buf = buf; - sec->data->d_size = size; + sec->data->d_size = sec->sh.sh_size; sec->data->d_type = ELF_T_RELA; - sec->sh.sh_size = size; - idx = 0; list_for_each_entry(reloc, &sec->reloc_list, list) { reloc->rela.r_offset = reloc->offset; reloc->rela.r_addend = reloc->addend; reloc->rela.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); - gelf_update_rela(sec->data, idx, &reloc->rela); + if (!gelf_update_rela(sec->data, idx, &reloc->rela)) { + WARN_ELF("gelf_update_rela"); + return -1; + } idx++; } @@ -1044,16 +1043,9 @@ static int elf_rebuild_rela_reloc_section(struct section *sec, int nr) static int elf_rebuild_reloc_section(struct elf *elf, struct section *sec) { - struct reloc *reloc; - int nr; - - nr = 0; - list_for_each_entry(reloc, &sec->reloc_list, list) - nr++; - switch (sec->sh.sh_type) { - case SHT_REL: return elf_rebuild_rel_reloc_section(sec, nr); - case SHT_RELA: return elf_rebuild_rela_reloc_section(sec, nr); + case SHT_REL: return elf_rebuild_rel_reloc_section(sec); + case SHT_RELA: return elf_rebuild_rela_reloc_section(sec); default: return -1; } } @@ -1113,12 +1105,6 @@ int elf_write(struct elf *elf) /* Update changed relocation sections and section headers: */ list_for_each_entry(sec, &elf->sections, list) { if (sec->changed) { - if (sec->base && - elf_rebuild_reloc_section(elf, sec)) { - WARN("elf_rebuild_reloc_section"); - return -1; - } - s = elf_getscn(elf->elf, sec->idx); if (!s) { WARN_ELF("elf_getscn"); @@ -1129,6 +1115,12 @@ int elf_write(struct elf *elf) return -1; } + if (sec->base && + elf_rebuild_reloc_section(elf, sec)) { + WARN("elf_rebuild_reloc_section"); + return -1; + } + sec->changed = false; elf->changed = true; } diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index e343950475..075d8291b8 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -38,7 +38,6 @@ struct section { Elf_Data *data; char *name; int idx; - unsigned int len; bool changed, text, rodata, noinstr; }; diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index dc9b7dd314..b5865e2450 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -204,7 +204,7 @@ int orc_create(struct objtool_file *file) /* Add a section terminator */ if (!empty) { - 
orc_list_add(&orc_list, &null, sec, sec->len); + orc_list_add(&orc_list, &null, sec, sec->sh.sh_size); nr++; } } diff --git a/tools/objtool/special.c b/tools/objtool/special.c index bc925cf19e..06c3eacab3 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -58,6 +58,13 @@ void __weak arch_handle_alternative(unsigned short feature, struct special_alt * { } +static void reloc_to_sec_off(struct reloc *reloc, struct section **sec, + unsigned long *off) +{ + *sec = reloc->sym->sec; + *off = reloc->sym->offset + reloc->addend; +} + static int get_alt_entry(struct elf *elf, struct special_entry *entry, struct section *sec, int idx, struct special_alt *alt) @@ -91,14 +98,8 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, WARN_FUNC("can't find orig reloc", sec, offset + entry->orig); return -1; } - if (orig_reloc->sym->type != STT_SECTION) { - WARN_FUNC("don't know how to handle non-section reloc symbol %s", - sec, offset + entry->orig, orig_reloc->sym->name); - return -1; - } - alt->orig_sec = orig_reloc->sym->sec; - alt->orig_off = orig_reloc->addend; + reloc_to_sec_off(orig_reloc, &alt->orig_sec, &alt->orig_off); if (!entry->group || alt->new_len) { new_reloc = find_reloc_by_dest(elf, sec, offset + entry->new); @@ -116,8 +117,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, if (arch_is_retpoline(new_reloc->sym)) return 1; - alt->new_sec = new_reloc->sym->sec; - alt->new_off = (unsigned int)new_reloc->addend; + reloc_to_sec_off(new_reloc, &alt->new_sec, &alt->new_off); /* _ASM_EXTABLE_EX hack */ if (alt->new_off >= 0x7ffffff0) @@ -159,13 +159,13 @@ int special_get_alts(struct elf *elf, struct list_head *alts) if (!sec) continue; - if (sec->len % entry->size != 0) { + if (sec->sh.sh_size % entry->size != 0) { WARN("%s size not a multiple of %d", sec->name, entry->size); return -1; } - nr_entries = sec->len / entry->size; + nr_entries = sec->sh.sh_size / entry->size; for (idx = 0; idx < nr_entries; idx++) { alt = malloc(sizeof(*alt)); diff --git a/tools/perf/Documentation/jitdump-specification.txt b/tools/perf/Documentation/jitdump-specification.txt index 52152d156a..79936355d8 100644 --- a/tools/perf/Documentation/jitdump-specification.txt +++ b/tools/perf/Documentation/jitdump-specification.txt @@ -164,7 +164,7 @@ const char unwinding_data[n]: an array of unwinding data, consisting of the EH F The EH Frame header follows the Linux Standard Base (LSB) specification as described in the document at https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/ehframehdr.html -The EH Frame follows the LSB specicfication as described in the document at https://refspecs.linuxbase.org/LSB_3.0.0/LSB-PDA/LSB-PDA/ehframechpt.html +The EH Frame follows the LSB specification as described in the document at https://refspecs.linuxbase.org/LSB_3.0.0/LSB-PDA/LSB-PDA/ehframechpt.html NOTE: The mapped_size is generally either the same as unwind_data_size (if the unwinding data was mapped in memory by the running process) or zero (if the unwinding data is not mapped by the process). If the unwinding data was not mapped, then only the EH Frame Header will be read, which can be used to specify FP based unwinding for a function which does not have unwinding information. 
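The objtool special.c hunk above drops the "non-section reloc symbol" error in favour of a small reloc_to_sec_off() helper. A rough standalone sketch of the idea, with objtool's types pared down to the fields that matter (simplified stand-ins, not the real struct definitions):

	/* Simplified stand-ins for objtool's section/symbol/reloc types. */
	struct sec;
	struct sym { struct sec *sec; unsigned long offset; };
	struct rel { struct sym *sym; long addend; };

	/*
	 * Resolve a relocation to (section, offset). For an STT_SECTION
	 * symbol, sym->offset is 0 and the addend carries the offset into the
	 * section; for a named symbol, the addend is relative to the symbol
	 * itself. The sum is correct in both cases, which is why the old
	 * section-symbol-only restriction can go away.
	 */
	static void rel_to_sec_off(const struct rel *r, struct sec **sec,
				   unsigned long *off)
	{
		*sec = r->sym->sec;
		*off = r->sym->offset + r->addend;
	}
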
diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt index de6beedb72..3b6a2c84ea 100644 --- a/tools/perf/Documentation/perf-c2c.txt +++ b/tools/perf/Documentation/perf-c2c.txt @@ -261,7 +261,7 @@ COALESCE User can specify how to sort offsets for cacheline. Following fields are available and governs the final -output fields set for caheline offsets output: +output fields set for cacheline offsets output: tid - coalesced by process TIDs pid - coalesced by process PIDs diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt index 184ba62420..db465fa7ee 100644 --- a/tools/perf/Documentation/perf-intel-pt.txt +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -883,7 +883,7 @@ and "r" can be combined to get calls and returns. "Transactions" events correspond to the start or end of transactions. The 'flags' field can be used in perf script to determine whether the event is a -tranasaction start, commit or abort. +transaction start, commit or abort. Note that "instructions", "branches" and "transactions" events depend on code flow packets which can be disabled by using the config term "branch=0". Refer diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index 74d7745921..1b4d452923 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -44,7 +44,7 @@ COMMON OPTIONS -f:: --force:: - Don't complan, do it. + Don't complain, do it. REPORT OPTIONS -------------- diff --git a/tools/perf/Documentation/perf-script-perl.txt b/tools/perf/Documentation/perf-script-perl.txt index 5a1f68122f..fa4f39d305 100644 --- a/tools/perf/Documentation/perf-script-perl.txt +++ b/tools/perf/Documentation/perf-script-perl.txt @@ -54,7 +54,7 @@ all sched_wakeup events in the system: Traces meant to be processed using a script should be recorded with the above option: -a to enable system-wide collection. -The format file for the sched_wakep event defines the following fields +The format file for the sched_wakeup event defines the following fields (see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format): ---- diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt index 0250dc61cf..cf4b7f4b62 100644 --- a/tools/perf/Documentation/perf-script-python.txt +++ b/tools/perf/Documentation/perf-script-python.txt @@ -448,7 +448,7 @@ all sched_wakeup events in the system: Traces meant to be processed using a script should be recorded with the above option: -a to enable system-wide collection. -The format file for the sched_wakep event defines the following fields +The format file for the sched_wakeup event defines the following fields (see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format): ---- diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 4c9310be6a..7e6fb7cbc0 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -385,7 +385,7 @@ Aggregate counts per physical processor for system-wide mode measurements. Print metrics or metricgroups specified in a comma separated list. For a group all metrics from the group are added. The events from the metrics are automatically measured. -See perf list output for the possble metrics and metricgroups. +See perf list output for the possible metrics and metricgroups. 
-A:: --no-aggr:: diff --git a/tools/perf/Documentation/topdown.txt b/tools/perf/Documentation/topdown.txt index c6302df4cf..a15b93fdcf 100644 --- a/tools/perf/Documentation/topdown.txt +++ b/tools/perf/Documentation/topdown.txt @@ -2,7 +2,7 @@ Using TopDown metrics in user space ----------------------------------- Intel CPUs (since Sandy Bridge and Silvermont) support a TopDown -methology to break down CPU pipeline execution into 4 bottlenecks: +methodology to break down CPU pipeline execution into 4 bottlenecks: frontend bound, backend bound, bad speculation, retiring. For more details on Topdown see [1][5] diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 446180401e..14e3e8d702 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -143,7 +143,7 @@ FEATURE_CHECK_LDFLAGS-libcrypto = -lcrypto ifdef CSINCLUDES LIBOPENCSD_CFLAGS := -I$(CSINCLUDES) endif -OPENCSDLIBS := -lopencsd_c_api -lopencsd +OPENCSDLIBS := -lopencsd_c_api -lopencsd -lstdc++ ifdef CSLIBS LIBOPENCSD_LDFLAGS := -L$(CSLIBS) endif diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index e04313c4d8..5cd702062a 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -802,7 +802,7 @@ endif $(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h) -LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ) 'EXTRA_CFLAGS=$(EXTRA_CFLAGS)' 'LDFLAGS=$(LDFLAGS)' +LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ) 'EXTRA_CFLAGS=$(EXTRA_CFLAGS)' 'LDFLAGS=$(filter-out -static,$(LDFLAGS))' $(LIBTRACEEVENT): FORCE $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent.a diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c index c7c7ec0812..5fc6a2a3db 100644 --- a/tools/perf/arch/arm/util/auxtrace.c +++ b/tools/perf/arch/arm/util/auxtrace.c @@ -8,10 +8,10 @@ #include #include -#include "../../util/auxtrace.h" -#include "../../util/debug.h" -#include "../../util/evlist.h" -#include "../../util/pmu.h" +#include "../../../util/auxtrace.h" +#include "../../../util/debug.h" +#include "../../../util/evlist.h" +#include "../../../util/pmu.h" #include "cs-etm.h" #include "arm-spe.h" diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 515aae470e..293a23bf8b 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -16,19 +16,19 @@ #include #include "cs-etm.h" -#include "../../util/debug.h" -#include "../../util/record.h" -#include "../../util/auxtrace.h" -#include "../../util/cpumap.h" -#include "../../util/event.h" -#include "../../util/evlist.h" -#include "../../util/evsel.h" -#include "../../util/perf_api_probe.h" -#include "../../util/evsel_config.h" -#include "../../util/pmu.h" -#include "../../util/cs-etm.h" +#include "../../../util/debug.h" +#include "../../../util/record.h" +#include "../../../util/auxtrace.h" +#include "../../../util/cpumap.h" +#include "../../../util/event.h" +#include "../../../util/evlist.h" +#include "../../../util/evsel.h" +#include "../../../util/perf_api_probe.h" +#include "../../../util/evsel_config.h" +#include "../../../util/pmu.h" +#include "../../../util/cs-etm.h" #include // page_size -#include "../../util/session.h" +#include "../../../util/session.h" #include #include diff --git a/tools/perf/arch/arm/util/perf_regs.c b/tools/perf/arch/arm/util/perf_regs.c index 2864e2e377..2833e101a7 100644 --- a/tools/perf/arch/arm/util/perf_regs.c +++ b/tools/perf/arch/arm/util/perf_regs.c @@ -1,5 +1,5 @@ // 
SPDX-License-Identifier: GPL-2.0 -#include "../../util/perf_regs.h" +#include "../../../util/perf_regs.h" const struct sample_reg sample_reg_masks[] = { SMPL_REG_END diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c index bbc297a7e2..b8b23b9dc5 100644 --- a/tools/perf/arch/arm/util/pmu.c +++ b/tools/perf/arch/arm/util/pmu.c @@ -10,7 +10,7 @@ #include #include "arm-spe.h" -#include "../../util/pmu.h" +#include "../../../util/pmu.h" struct perf_event_attr *perf_pmu__get_default_config(struct perf_pmu *pmu __maybe_unused) diff --git a/tools/perf/arch/arm/util/unwind-libdw.c b/tools/perf/arch/arm/util/unwind-libdw.c index 36ba4c69c3..b7692cb0c7 100644 --- a/tools/perf/arch/arm/util/unwind-libdw.c +++ b/tools/perf/arch/arm/util/unwind-libdw.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include "../../util/unwind-libdw.h" -#include "../../util/perf_regs.h" -#include "../../util/event.h" +#include "../../../util/unwind-libdw.h" +#include "../../../util/perf_regs.h" +#include "../../../util/event.h" bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) { diff --git a/tools/perf/arch/arm/util/unwind-libunwind.c b/tools/perf/arch/arm/util/unwind-libunwind.c index 3a550225df..438906bf00 100644 --- a/tools/perf/arch/arm/util/unwind-libunwind.c +++ b/tools/perf/arch/arm/util/unwind-libunwind.c @@ -3,8 +3,8 @@ #include #include #include "perf_regs.h" -#include "../../util/unwind.h" -#include "../../util/debug.h" +#include "../../../util/unwind.h" +#include "../../../util/debug.h" int libunwind__arch_reg_id(int regnum) { diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c index eeafe97b81..792cd75ade 100644 --- a/tools/perf/arch/x86/util/iostat.c +++ b/tools/perf/arch/x86/util/iostat.c @@ -432,7 +432,7 @@ void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel, u8 die = ((struct iio_root_port *)evsel->priv)->die; struct perf_counts_values *count = perf_counts(evsel->counts, die, 0); - if (count->run && count->ena) { + if (count && count->run && count->ena) { if (evsel->prev_raw_counts && !out->force_header) { struct perf_counts_values *prev_count = perf_counts(evsel->prev_raw_counts, die, 0); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f6e87b7be5..f0ecfda34e 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2408,6 +2408,8 @@ int cmd_stat(int argc, const char **argv) goto out; } else if (verbose) iostat_list(evsel_list, &stat_config); + if (iostat_mode == IOSTAT_RUN && !target__has_cpu(&target)) + target.system_wide = true; } if (add_default_attributes()) diff --git a/tools/perf/pmu-events/arch/powerpc/power8/other.json b/tools/perf/pmu-events/arch/powerpc/power8/other.json index 84a0cedf1f..f1f2965f67 100644 --- a/tools/perf/pmu-events/arch/powerpc/power8/other.json +++ b/tools/perf/pmu-events/arch/powerpc/power8/other.json @@ -1046,7 +1046,7 @@ { "EventCode": "0x4e010", "EventName": "PM_GCT_NOSLOT_IC_L3MISS", - "BriefDescription": "Gct empty for this thread due to icach l3 miss", + "BriefDescription": "Gct empty for this thread due to icache l3 miss", "PublicDescription": "" }, { diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index 6731b3cf0c..7c887d37b8 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -1285,6 +1285,7 @@ int main(int argc, char *argv[]) } free_arch_std_events(); + free_sys_event_tables(); free(mapfile); return 0; @@ -1306,6 +1307,7 @@ int main(int 
argc, char *argv[]) create_empty_mapping(output_file); err_out: free_arch_std_events(); + free_sys_event_tables(); free(mapfile); return ret; } diff --git a/tools/perf/tests/attr/test-stat-default b/tools/perf/tests/attr/test-stat-default index d9e99b3f77..d8ea6a8816 100644 --- a/tools/perf/tests/attr/test-stat-default +++ b/tools/perf/tests/attr/test-stat-default @@ -68,3 +68,100 @@ fd=10 type=0 config=5 optional=1 + +# PERF_TYPE_RAW / slots (0x400) +[event11:base-stat] +fd=11 +group_fd=-1 +type=4 +config=1024 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-retiring (0x8000) +[event12:base-stat] +fd=12 +group_fd=11 +type=4 +config=32768 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-bad-spec (0x8100) +[event13:base-stat] +fd=13 +group_fd=11 +type=4 +config=33024 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fe-bound (0x8200) +[event14:base-stat] +fd=14 +group_fd=11 +type=4 +config=33280 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-be-bound (0x8300) +[event15:base-stat] +fd=15 +group_fd=11 +type=4 +config=33536 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) +[event16:base-stat] +fd=16 +group_fd=11 +type=4 +config=33792 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) +[event17:base-stat] +fd=17 +group_fd=11 +type=4 +config=34048 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) +[event18:base-stat] +fd=18 +group_fd=11 +type=4 +config=34304 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-mem-bound (0x8700) +[event19:base-stat] +fd=19 +group_fd=11 +type=4 +config=34560 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 diff --git a/tools/perf/tests/attr/test-stat-detailed-1 b/tools/perf/tests/attr/test-stat-detailed-1 index 8b04a055d1..b656ab93c5 100644 --- a/tools/perf/tests/attr/test-stat-detailed-1 +++ b/tools/perf/tests/attr/test-stat-detailed-1 @@ -70,12 +70,109 @@ type=0 config=5 optional=1 +# PERF_TYPE_RAW / slots (0x400) +[event11:base-stat] +fd=11 +group_fd=-1 +type=4 +config=1024 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-retiring (0x8000) +[event12:base-stat] +fd=12 +group_fd=11 +type=4 +config=32768 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-bad-spec (0x8100) +[event13:base-stat] +fd=13 +group_fd=11 +type=4 +config=33024 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fe-bound (0x8200) +[event14:base-stat] +fd=14 +group_fd=11 +type=4 +config=33280 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-be-bound (0x8300) +[event15:base-stat] +fd=15 +group_fd=11 +type=4 +config=33536 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) +[event16:base-stat] +fd=16 +group_fd=11 +type=4 +config=33792 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) +[event17:base-stat] +fd=17 +group_fd=11 +type=4 +config=34048 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) +[event18:base-stat] +fd=18 +group_fd=11 +type=4 +config=34304 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-mem-bound (0x8700) +[event19:base-stat] 
+fd=19 +group_fd=11 +type=4 +config=34560 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event11:base-stat] -fd=11 +[event20:base-stat] +fd=20 type=3 config=0 optional=1 @@ -84,8 +181,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event12:base-stat] -fd=12 +[event21:base-stat] +fd=21 type=3 config=65536 optional=1 @@ -94,8 +191,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event13:base-stat] -fd=13 +[event22:base-stat] +fd=22 type=3 config=2 optional=1 @@ -104,8 +201,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event14:base-stat] -fd=14 +[event23:base-stat] +fd=23 type=3 config=65538 optional=1 diff --git a/tools/perf/tests/attr/test-stat-detailed-2 b/tools/perf/tests/attr/test-stat-detailed-2 index 4fca9f1bfb..97625090a1 100644 --- a/tools/perf/tests/attr/test-stat-detailed-2 +++ b/tools/perf/tests/attr/test-stat-detailed-2 @@ -70,12 +70,109 @@ type=0 config=5 optional=1 +# PERF_TYPE_RAW / slots (0x400) +[event11:base-stat] +fd=11 +group_fd=-1 +type=4 +config=1024 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-retiring (0x8000) +[event12:base-stat] +fd=12 +group_fd=11 +type=4 +config=32768 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-bad-spec (0x8100) +[event13:base-stat] +fd=13 +group_fd=11 +type=4 +config=33024 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fe-bound (0x8200) +[event14:base-stat] +fd=14 +group_fd=11 +type=4 +config=33280 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-be-bound (0x8300) +[event15:base-stat] +fd=15 +group_fd=11 +type=4 +config=33536 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) +[event16:base-stat] +fd=16 +group_fd=11 +type=4 +config=33792 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) +[event17:base-stat] +fd=17 +group_fd=11 +type=4 +config=34048 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) +[event18:base-stat] +fd=18 +group_fd=11 +type=4 +config=34304 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-mem-bound (0x8700) +[event19:base-stat] +fd=19 +group_fd=11 +type=4 +config=34560 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event11:base-stat] -fd=11 +[event20:base-stat] +fd=20 type=3 config=0 optional=1 @@ -84,8 +181,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event12:base-stat] -fd=12 +[event21:base-stat] +fd=21 type=3 config=65536 optional=1 @@ -94,8 +191,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event13:base-stat] -fd=13 +[event22:base-stat] +fd=22 type=3 config=2 optional=1 @@ -104,8 +201,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) 
| # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event14:base-stat] -fd=14 +[event23:base-stat] +fd=23 type=3 config=65538 optional=1 @@ -114,8 +211,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1I << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event15:base-stat] -fd=15 +[event24:base-stat] +fd=24 type=3 config=1 optional=1 @@ -124,8 +221,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1I << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event16:base-stat] -fd=16 +[event25:base-stat] +fd=25 type=3 config=65537 optional=1 @@ -134,8 +231,8 @@ optional=1 # PERF_COUNT_HW_CACHE_DTLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event17:base-stat] -fd=17 +[event26:base-stat] +fd=26 type=3 config=3 optional=1 @@ -144,8 +241,8 @@ optional=1 # PERF_COUNT_HW_CACHE_DTLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event18:base-stat] -fd=18 +[event27:base-stat] +fd=27 type=3 config=65539 optional=1 @@ -154,8 +251,8 @@ optional=1 # PERF_COUNT_HW_CACHE_ITLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event19:base-stat] -fd=19 +[event28:base-stat] +fd=28 type=3 config=4 optional=1 @@ -164,8 +261,8 @@ optional=1 # PERF_COUNT_HW_CACHE_ITLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event20:base-stat] -fd=20 +[event29:base-stat] +fd=29 type=3 config=65540 optional=1 diff --git a/tools/perf/tests/attr/test-stat-detailed-3 b/tools/perf/tests/attr/test-stat-detailed-3 index 4bb58e1c82..d555042e3f 100644 --- a/tools/perf/tests/attr/test-stat-detailed-3 +++ b/tools/perf/tests/attr/test-stat-detailed-3 @@ -70,12 +70,109 @@ type=0 config=5 optional=1 +# PERF_TYPE_RAW / slots (0x400) +[event11:base-stat] +fd=11 +group_fd=-1 +type=4 +config=1024 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-retiring (0x8000) +[event12:base-stat] +fd=12 +group_fd=11 +type=4 +config=32768 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-bad-spec (0x8100) +[event13:base-stat] +fd=13 +group_fd=11 +type=4 +config=33024 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fe-bound (0x8200) +[event14:base-stat] +fd=14 +group_fd=11 +type=4 +config=33280 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-be-bound (0x8300) +[event15:base-stat] +fd=15 +group_fd=11 +type=4 +config=33536 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) +[event16:base-stat] +fd=16 +group_fd=11 +type=4 +config=33792 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) +[event17:base-stat] +fd=17 +group_fd=11 +type=4 +config=34048 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) +[event18:base-stat] +fd=18 +group_fd=11 +type=4 +config=34304 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-mem-bound (0x8700) +[event19:base-stat] +fd=19 +group_fd=11 +type=4 +config=34560 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event11:base-stat] -fd=11 +[event20:base-stat] +fd=20 type=3 config=0 optional=1 @@ -84,8 +181,8 @@ optional=1 # 
PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event12:base-stat] -fd=12 +[event21:base-stat] +fd=21 type=3 config=65536 optional=1 @@ -94,8 +191,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event13:base-stat] -fd=13 +[event22:base-stat] +fd=22 type=3 config=2 optional=1 @@ -104,8 +201,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event14:base-stat] -fd=14 +[event23:base-stat] +fd=23 type=3 config=65538 optional=1 @@ -114,8 +211,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1I << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event15:base-stat] -fd=15 +[event24:base-stat] +fd=24 type=3 config=1 optional=1 @@ -124,8 +221,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1I << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event16:base-stat] -fd=16 +[event25:base-stat] +fd=25 type=3 config=65537 optional=1 @@ -134,8 +231,8 @@ optional=1 # PERF_COUNT_HW_CACHE_DTLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event17:base-stat] -fd=17 +[event26:base-stat] +fd=26 type=3 config=3 optional=1 @@ -144,8 +241,8 @@ optional=1 # PERF_COUNT_HW_CACHE_DTLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event18:base-stat] -fd=18 +[event27:base-stat] +fd=27 type=3 config=65539 optional=1 @@ -154,8 +251,8 @@ optional=1 # PERF_COUNT_HW_CACHE_ITLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event19:base-stat] -fd=19 +[event28:base-stat] +fd=28 type=3 config=4 optional=1 @@ -164,8 +261,8 @@ optional=1 # PERF_COUNT_HW_CACHE_ITLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event20:base-stat] -fd=20 +[event29:base-stat] +fd=29 type=3 config=65540 optional=1 @@ -174,8 +271,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event21:base-stat] -fd=21 +[event30:base-stat] +fd=30 type=3 config=512 optional=1 @@ -184,8 +281,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event22:base-stat] -fd=22 +[event31:base-stat] +fd=31 type=3 config=66048 optional=1 diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 9866cddebf..9b4a765e4b 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -229,8 +229,8 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, struct thread *thread, struct state *state) { struct addr_location al; - unsigned char buf1[BUFSZ]; - unsigned char buf2[BUFSZ]; + unsigned char buf1[BUFSZ] = {0}; + unsigned char buf2[BUFSZ] = {0}; size_t ret_len; u64 objdump_addr; const char *objdump_name; diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index a288035eb3..c756284b3b 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -20,6 +20,23 @@ /* For bsearch. We try to unwind functions in shared object. */ #include +/* + * The test will assert frames are on the stack but tail call optimizations lose + * the frame of the caller. 
Clang can disable this optimization on a called + * function but GCC currently (11/2020) lacks this attribute. The barrier is + * used to inhibit tail calls in these cases. + */ +#ifdef __has_attribute +#if __has_attribute(disable_tail_calls) +#define NO_TAIL_CALL_ATTRIBUTE __attribute__((disable_tail_calls)) +#define NO_TAIL_CALL_BARRIER +#endif +#endif +#ifndef NO_TAIL_CALL_ATTRIBUTE +#define NO_TAIL_CALL_ATTRIBUTE +#define NO_TAIL_CALL_BARRIER __asm__ __volatile__("" : : : "memory"); +#endif + static int mmap_handler(struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, @@ -91,7 +108,7 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) return strcmp((const char *) symbol, funcs[idx]); } -noinline int test_dwarf_unwind__thread(struct thread *thread) +NO_TAIL_CALL_ATTRIBUTE noinline int test_dwarf_unwind__thread(struct thread *thread) { struct perf_sample sample; unsigned long cnt = 0; @@ -122,7 +139,7 @@ noinline int test_dwarf_unwind__thread(struct thread *thread) static int global_unwind_retval = -INT_MAX; -noinline int test_dwarf_unwind__compare(void *p1, void *p2) +NO_TAIL_CALL_ATTRIBUTE noinline int test_dwarf_unwind__compare(void *p1, void *p2) { /* Any possible value should be 'thread' */ struct thread *thread = *(struct thread **)p1; @@ -141,7 +158,7 @@ noinline int test_dwarf_unwind__compare(void *p1, void *p2) return p1 - p2; } -noinline int test_dwarf_unwind__krava_3(struct thread *thread) +NO_TAIL_CALL_ATTRIBUTE noinline int test_dwarf_unwind__krava_3(struct thread *thread) { struct thread *array[2] = {thread, thread}; void *fp = &bsearch; @@ -160,14 +177,22 @@ noinline int test_dwarf_unwind__krava_3(struct thread *thread) return global_unwind_retval; } -noinline int test_dwarf_unwind__krava_2(struct thread *thread) +NO_TAIL_CALL_ATTRIBUTE noinline int test_dwarf_unwind__krava_2(struct thread *thread) { - return test_dwarf_unwind__krava_3(thread); + int ret; + + ret = test_dwarf_unwind__krava_3(thread); + NO_TAIL_CALL_BARRIER; + return ret; } -noinline int test_dwarf_unwind__krava_1(struct thread *thread) +NO_TAIL_CALL_ATTRIBUTE noinline int test_dwarf_unwind__krava_1(struct thread *thread) { - return test_dwarf_unwind__krava_2(thread); + int ret; + + ret = test_dwarf_unwind__krava_2(thread); + NO_TAIL_CALL_BARRIER; + return ret; } int test__dwarf_unwind(struct test *test __maybe_unused, int subtest __maybe_unused) diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 4fb5e90d7a..60ce5908c6 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -801,7 +801,7 @@ int perf_config_set(struct perf_config_set *set, section->name, item->name); ret = fn(key, value, data); if (ret < 0) { - pr_err("Error: wrong config key-value pair %s=%s\n", + pr_err("Error in the given config file: wrong config key-value pair %s=%s\n", key, value); /* * Can't be just a 'break', as perf_config_set__for_each_entry() diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 069c2cfdd3..352f16076e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2116,7 +2116,7 @@ fetch_decomp_event(u64 head, size_t mmap_size, char *buf, bool needs_swap) static int __perf_session__process_decomp_events(struct perf_session *session) { s64 skip; - u64 size, file_pos = 0; + u64 size; struct decomp *decomp = session->decomp_last; if (!decomp) @@ -2132,7 +2132,7 @@ static int __perf_session__process_decomp_events(struct perf_session *session) size = event->header.size; if (size < sizeof(struct 
perf_event_header) || - (skip = perf_session__process_event(session, event, file_pos)) < 0) { + (skip = perf_session__process_event(session, event, decomp->file_pos)) < 0) { pr_err("%#" PRIx64 " [%#x]: failed to process type: %d\n", decomp->file_pos + decomp->head, event->header.size, event->header.type); return -EINVAL; diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index 5a931456e7..ac35c61f65 100644 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -16,7 +16,7 @@ assert sys.version_info >= (3, 7), "Python version is too old" from collections import namedtuple from enum import Enum, auto -from typing import Iterable +from typing import Iterable, Sequence import kunit_config import kunit_json @@ -186,6 +186,26 @@ def run_tests(linux: kunit_kernel.LinuxSourceTree, exec_result.elapsed_time)) return parse_result +# Problem: +# $ kunit.py run --json +# works as one would expect and prints the parsed test results as JSON. +# $ kunit.py run --json suite_name +# would *not* pass suite_name as the filter_glob and print as json. +# argparse will consider it to be another way of writing +# $ kunit.py run --json=suite_name +# i.e. it would run all tests, and dump the json to a `suite_name` file. +# So we hackily automatically rewrite --json => --json=stdout +pseudo_bool_flag_defaults = { + '--json': 'stdout', + '--raw_output': 'kunit', +} +def massage_argv(argv: Sequence[str]) -> Sequence[str]: + def massage_arg(arg: str) -> str: + if arg not in pseudo_bool_flag_defaults: + return arg + return f'{arg}={pseudo_bool_flag_defaults[arg]}' + return list(map(massage_arg, argv)) + def add_common_opts(parser) -> None: parser.add_argument('--build_dir', help='As in the make command, it specifies the build ' @@ -303,7 +323,7 @@ def main(argv, linux=None): help='Specifies the file to read results from.', type=str, nargs='?', metavar='input_file') - cli_args = parser.parse_args(argv) + cli_args = parser.parse_args(massage_argv(argv)) if get_kernel_root_path(): os.chdir(get_kernel_root_path()) diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 619c4554cb..1edcc8373b 100644 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -408,6 +408,14 @@ class KUnitMainTest(unittest.TestCase): self.assertNotEqual(call, mock.call(StrContains('Testing complete.'))) self.assertNotEqual(call, mock.call(StrContains(' 0 tests run'))) + def test_run_raw_output_does_not_take_positional_args(self): + # --raw_output is a string flag, but we don't want it to consume + # any positional arguments, only ones after an '=' + self.linux_source_mock.run_kernel = mock.Mock(return_value=[]) + kunit.main(['run', '--raw_output', 'filter_glob'], self.linux_source_mock) + self.linux_source_mock.run_kernel.assert_called_once_with( + args=None, build_dir='.kunit', filter_glob='filter_glob', timeout=300) + def test_exec_timeout(self): timeout = 3453 kunit.main(['exec', '--timeout', str(timeout)], self.linux_source_mock) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 866531c08e..799b88152e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -375,7 +375,8 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_PROGS_DIR)/%.c \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ $$(INCLUDE_DIR)/vmlinux.h \ - $(wildcard $(BPFDIR)/bpf_*.h) | $(TRUNNER_OUTPUT) + $(wildcard $(BPFDIR)/bpf_*.h) \ + | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call 
$(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS)) diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh index 59ea56945e..b497bb85b6 100644 --- a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh +++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh @@ -112,6 +112,14 @@ setup() ip netns add "${NS2}" ip netns add "${NS3}" + # rp_filter gets confused by what these tests are doing, so disable it + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip link add veth1 type veth peer name veth2 ip link add veth3 type veth peer name veth4 ip link add veth5 type veth peer name veth6 @@ -236,11 +244,6 @@ setup() ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6} ${VRF} ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8} ${VRF} - # rp_filter gets confused by what these tests are doing, so disable it - ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 - TMPFILE=$(mktemp /tmp/test_lwt_ip_encap.XXXXXX) sleep 1 # reduce flakiness diff --git a/tools/testing/selftests/drivers/dma-buf/udmabuf.c b/tools/testing/selftests/drivers/dma-buf/udmabuf.c index 4de902ea14..de1c4e6de0 100644 --- a/tools/testing/selftests/drivers/dma-buf/udmabuf.c +++ b/tools/testing/selftests/drivers/dma-buf/udmabuf.c @@ -1,10 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + #include #include #include #include #include -#include +#include #include #include diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc index 5f5b2ba3e5..60c02b482b 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc @@ -11,8 +11,8 @@ SYSTEM="syscalls" EVENT="sys_enter_openat" FIELD="filename" EPROBE="eprobe_open" - -echo "e:$EPROBE $SYSTEM/$EVENT file=+0(\$filename):ustring" >> dynamic_events +OPTIONS="file=+0(\$filename):ustring" +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events grep -q "$EPROBE" dynamic_events test -d events/eprobes/$EPROBE @@ -37,4 +37,54 @@ echo "-:$EPROBE" >> dynamic_events ! grep -q "$EPROBE" dynamic_events ! test -d events/eprobes/$EPROBE +# test various ways to remove the probe (already tested with just event name) + +# With group name +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# With group name and system/event +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE $SYSTEM/$EVENT" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! 
test -d events/eprobes/$EPROBE + +# With just event name and system/event +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:$EPROBE $SYSTEM/$EVENT" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# With just event name and system/event and options +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# With group name and system/event and options +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# Finally make sure what is in the dynamic_events file clears it too +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +LINE=`sed -e '/$EPROBE/s/^e/-/' < dynamic_events` +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + clear_trace diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 618bf9bc7f..b8dbabe24a 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -24,6 +24,7 @@ /x86_64/smm_test /x86_64/state_test /x86_64/svm_vmcall_test +/x86_64/svm_int_ctl_test /x86_64/sync_regs_test /x86_64/tsc_msrs_test /x86_64/userspace_msr_exit_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 9ac325cfc9..d1774f4613 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -56,6 +56,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 71e277c7c3..5d95113c7b 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -371,9 +371,7 @@ static void help(char *name) printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); - printf(" -s: specify the type of memory that should be used to\n" - " back the guest data region.\n\n"); - backing_src_help(); + backing_src_help("-s"); puts(""); exit(0); } @@ -381,7 +379,7 @@ static void help(char *name) int main(int argc, char *argv[]) { struct test_params params = { - .backing_src = VM_MEM_SRC_ANONYMOUS, + .backing_src = DEFAULT_VM_MEM_SRC, .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE, .vcpus = 1, }; diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index e79c1b6497..1510b21e63 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ 
-179,7 +179,7 @@ static void *uffd_handler_thread_fn(void *arg) return NULL; } - if (!pollfd[0].revents & POLLIN) + if (!(pollfd[0].revents & POLLIN)) continue; r = read(uffd, &msg, sizeof(msg)); @@ -416,7 +416,7 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" - " [-b memory] [-t type] [-v vcpus] [-o]\n", name); + " [-b memory] [-s type] [-v vcpus] [-o]\n", name); guest_modes_help(); printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); @@ -426,8 +426,7 @@ static void help(char *name) printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); - printf(" -t: The type of backing memory to use. Default: anonymous\n"); - backing_src_help(); + backing_src_help("-s"); printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); @@ -439,14 +438,14 @@ int main(int argc, char *argv[]) { int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); struct test_params p = { - .src_type = VM_MEM_SRC_ANONYMOUS, + .src_type = DEFAULT_VM_MEM_SRC, .partition_vcpu_memory_access = true, }; int opt; guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) { + while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:o")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -465,7 +464,7 @@ int main(int argc, char *argv[]) case 'b': guest_percpu_mem_size = parse_size(optarg); break; - case 't': + case 's': p.src_type = parse_backing_src_type(optarg); break; case 'v': @@ -485,7 +484,7 @@ int main(int argc, char *argv[]) if (p.uffd_mode == UFFDIO_REGISTER_MODE_MINOR && !backing_src_is_shared(p.src_type)) { - TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -t"); + TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -s"); } for_each_guest_mode(run_test, &p); diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 479868570d..7ffab5bd5c 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -118,42 +118,64 @@ static inline void disable_dirty_logging(struct kvm_vm *vm, int slots) toggle_dirty_logging(vm, slots, false); } -static void get_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, - uint64_t nr_pages) +static void get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots) { - uint64_t slot_pages = nr_pages / slots; int i; for (i = 0; i < slots; i++) { int slot = PERF_TEST_MEM_SLOT_INDEX + i; - unsigned long *slot_bitmap = bitmap + i * slot_pages; - kvm_vm_get_dirty_log(vm, slot, slot_bitmap); + kvm_vm_get_dirty_log(vm, slot, bitmaps[i]); } } -static void clear_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, - uint64_t nr_pages) +static void clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], + int slots, uint64_t pages_per_slot) { - uint64_t slot_pages = nr_pages / slots; int i; for (i = 0; i < slots; i++) { int slot = PERF_TEST_MEM_SLOT_INDEX + i; - unsigned long *slot_bitmap = bitmap + i * slot_pages; - kvm_vm_clear_dirty_log(vm, slot, slot_bitmap, 0, slot_pages); + kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot); } } +static unsigned long **alloc_bitmaps(int slots, uint64_t pages_per_slot) +{ + unsigned 
long **bitmaps; + int i; + + bitmaps = malloc(slots * sizeof(bitmaps[0])); + TEST_ASSERT(bitmaps, "Failed to allocate bitmaps array."); + + for (i = 0; i < slots; i++) { + bitmaps[i] = bitmap_zalloc(pages_per_slot); + TEST_ASSERT(bitmaps[i], "Failed to allocate slot bitmap."); + } + + return bitmaps; +} + +static void free_bitmaps(unsigned long *bitmaps[], int slots) +{ + int i; + + for (i = 0; i < slots; i++) + free(bitmaps[i]); + + free(bitmaps); +} + static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; pthread_t *vcpu_threads; struct kvm_vm *vm; - unsigned long *bmap; + unsigned long **bitmaps; uint64_t guest_num_pages; uint64_t host_num_pages; + uint64_t pages_per_slot; int vcpu_id; struct timespec start; struct timespec ts_diff; @@ -171,7 +193,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm); guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); host_num_pages = vm_num_host_pages(mode, guest_num_pages); - bmap = bitmap_zalloc(host_num_pages); + pages_per_slot = host_num_pages / p->slots; + + bitmaps = alloc_bitmaps(p->slots, pages_per_slot); if (dirty_log_manual_caps) { cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; @@ -239,7 +263,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) iteration, ts_diff.tv_sec, ts_diff.tv_nsec); clock_gettime(CLOCK_MONOTONIC, &start); - get_dirty_log(vm, p->slots, bmap, host_num_pages); + get_dirty_log(vm, bitmaps, p->slots); ts_diff = timespec_elapsed(start); get_dirty_log_total = timespec_add(get_dirty_log_total, ts_diff); @@ -248,7 +272,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (dirty_log_manual_caps) { clock_gettime(CLOCK_MONOTONIC, &start); - clear_dirty_log(vm, p->slots, bmap, host_num_pages); + clear_dirty_log(vm, bitmaps, p->slots, pages_per_slot); ts_diff = timespec_elapsed(start); clear_dirty_log_total = timespec_add(clear_dirty_log_total, ts_diff); @@ -281,7 +305,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); } - free(bmap); + free_bitmaps(bitmaps, p->slots); free(vcpu_threads); perf_test_destroy_vm(vm); } @@ -308,11 +332,9 @@ static void help(char *name) printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); - printf(" -s: specify the type of memory that should be used to\n" - " back the guest data region.\n\n"); + backing_src_help("-s"); printf(" -x: Split the memory region into this number of memslots.\n" - " (default: 1)"); - backing_src_help(); + " (default: 1)\n"); puts(""); exit(0); } @@ -324,7 +346,7 @@ int main(int argc, char *argv[]) .iterations = TEST_HOST_LOOP_N, .wr_fract = 1, .partition_vcpu_memory_access = true, - .backing_src = VM_MEM_SRC_ANONYMOUS, + .backing_src = DEFAULT_VM_MEM_SRC, .slots = 1, }; int opt; diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 451fed5ce8..f8fddc84c0 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -90,6 +90,8 @@ enum vm_mem_backing_src_type { NUM_SRC_TYPES, }; +#define DEFAULT_VM_MEM_SRC VM_MEM_SRC_ANONYMOUS + struct vm_mem_backing_src_alias { const char *name; uint32_t flag; @@ -102,7 +104,7 @@ size_t get_trans_hugepagesz(void); size_t get_def_hugetlb_pagesz(void); const struct 
vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i); size_t get_backing_src_pagesz(uint32_t i); -void backing_src_help(void); +void backing_src_help(const char *flag); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); long get_run_delay(void); diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 242ae8e09a..05e65ca1c3 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -312,37 +312,37 @@ static inline void set_xmm(int n, unsigned long val) } } -typedef unsigned long v1di __attribute__ ((vector_size (8))); +#define GET_XMM(__xmm) \ +({ \ + unsigned long __val; \ + asm volatile("movq %%"#__xmm", %0" : "=r"(__val)); \ + __val; \ +}) + static inline unsigned long get_xmm(int n) { assert(n >= 0 && n <= 7); - register v1di xmm0 __asm__("%xmm0"); - register v1di xmm1 __asm__("%xmm1"); - register v1di xmm2 __asm__("%xmm2"); - register v1di xmm3 __asm__("%xmm3"); - register v1di xmm4 __asm__("%xmm4"); - register v1di xmm5 __asm__("%xmm5"); - register v1di xmm6 __asm__("%xmm6"); - register v1di xmm7 __asm__("%xmm7"); switch (n) { case 0: - return (unsigned long)xmm0; + return GET_XMM(xmm0); case 1: - return (unsigned long)xmm1; + return GET_XMM(xmm1); case 2: - return (unsigned long)xmm2; + return GET_XMM(xmm2); case 3: - return (unsigned long)xmm3; + return GET_XMM(xmm3); case 4: - return (unsigned long)xmm4; + return GET_XMM(xmm4); case 5: - return (unsigned long)xmm5; + return GET_XMM(xmm5); case 6: - return (unsigned long)xmm6; + return GET_XMM(xmm6); case 7: - return (unsigned long)xmm7; + return GET_XMM(xmm7); } + + /* never reached */ return 0; } diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 0d04a7db7f..36407cb0ec 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -456,10 +456,7 @@ static void help(char *name) " (default: 1G)\n"); printf(" -v: specify the number of vCPUs to run\n" " (default: 1)\n"); - printf(" -s: specify the type of memory that should be used to\n" - " back the guest data region.\n" - " (default: anonymous)\n\n"); - backing_src_help(); + backing_src_help("-s"); puts(""); } @@ -468,7 +465,7 @@ int main(int argc, char *argv[]) int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); struct test_params p = { .test_mem_size = DEFAULT_TEST_MEM_SIZE, - .src_type = VM_MEM_SRC_ANONYMOUS, + .src_type = DEFAULT_VM_MEM_SRC, }; int opt; diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index a9107bfae4..b724291089 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -283,13 +283,22 @@ size_t get_backing_src_pagesz(uint32_t i) } } -void backing_src_help(void) +static void print_available_backing_src_types(const char *prefix) { int i; - printf("Available backing src types:\n"); + printf("%sAvailable backing src types:\n", prefix); + for (i = 0; i < NUM_SRC_TYPES; i++) - printf("\t%s\n", vm_mem_backing_src_alias(i)->name); + printf("%s %s\n", prefix, vm_mem_backing_src_alias(i)->name); +} + +void backing_src_help(const char *flag) +{ + printf(" %s: specify the type of memory that should be used to\n" + " back the guest data region. 
(default: %s)\n", + flag, vm_mem_backing_src_alias(DEFAULT_VM_MEM_SRC)->name); + print_available_backing_src_types(" "); } enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) @@ -300,7 +309,7 @@ enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) if (!strcmp(type_name, vm_mem_backing_src_alias(i)->name)) return i; - backing_src_help(); + print_available_backing_src_types(""); TEST_FAIL("Unknown backing src type: %s", type_name); return -1; } diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 060538bd40..4158da0da2 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -10,6 +10,7 @@ #include #include #include +#include <sys/sysinfo.h> #include #include #include @@ -39,6 +40,7 @@ static __thread volatile struct rseq __rseq = { static pthread_t migration_thread; static cpu_set_t possible_mask; +static int min_cpu, max_cpu; static bool done; static atomic_t seq_cnt; @@ -57,20 +59,37 @@ static void sys_rseq(int flags) TEST_ASSERT(!r, "rseq failed, errno = %d (%s)", errno, strerror(errno)); } +static int next_cpu(int cpu) +{ + /* + * Advance to the next CPU, skipping those that weren't in the original + * affinity set. Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's + * data storage is considered opaque. Note, if this task is pinned + * to a small set of discontiguous CPUs, e.g. 2 and 1023, this loop will + * burn a lot of cycles and the test will take longer than normal to + * complete. + */ + do { + cpu++; + if (cpu > max_cpu) { + cpu = min_cpu; + TEST_ASSERT(CPU_ISSET(cpu, &possible_mask), + "Min CPU = %d must always be usable", cpu); + break; + } + } while (!CPU_ISSET(cpu, &possible_mask)); + + return cpu; +} + static void *migration_worker(void *ign) { cpu_set_t allowed_mask; - int r, i, nr_cpus, cpu; + int r, i, cpu; CPU_ZERO(&allowed_mask); - nr_cpus = CPU_COUNT(&possible_mask); - - for (i = 0; i < NR_TASK_MIGRATIONS; i++) { - cpu = i % nr_cpus; - if (!CPU_ISSET(cpu, &possible_mask)) - continue; - + for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) { CPU_SET(cpu, &allowed_mask); /* @@ -154,6 +173,36 @@ static void *migration_worker(void *ign) return NULL; } +static int calc_min_max_cpu(void) +{ + int i, cnt, nproc; + + if (CPU_COUNT(&possible_mask) < 2) + return -EINVAL; + + /* + * CPU_SET doesn't provide a FOR_EACH helper; get the min/max CPU that + * this task is affined to in order to reduce the time spent querying + * unusable CPUs, e.g. if this task is pinned to a small percentage of + * total CPUs. + */ + nproc = get_nprocs_conf(); + min_cpu = -1; + max_cpu = -1; + cnt = 0; + + for (i = 0; i < nproc; i++) { + if (!CPU_ISSET(i, &possible_mask)) + continue; + if (min_cpu == -1) + min_cpu = i; + max_cpu = i; + cnt++; + } + + return (cnt < 2) ? -EINVAL : 0; +} + int main(int argc, char *argv[]) { int r, i, snapshot; @@ -167,8 +216,8 @@ int main(int argc, char *argv[]) TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, strerror(errno)); - if (CPU_COUNT(&possible_mask) < 2) { - print_skip("Only one CPU, task migration not possible\n"); + if (calc_min_max_cpu()) { + print_skip("Only one usable CPU, task migration not possible"); exit(KSFT_SKIP); } @@ -180,6 +229,7 @@ int main(int argc, char *argv[]) * CPU affinity.
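 * The ucall_init() call added below sets up the ucall interface the guest
 * uses to report back to the host (on some architectures the ucall
 * transport must be initialized explicitly, hence the added call).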
*/ vm = vm_create_default(VCPU_ID, 0, guest_code); + ucall_init(vm, NULL); pthread_create(&migration_thread, NULL, migration_worker, 0); diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 2172d65b85..62f2eb9ee3 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -116,12 +116,12 @@ struct st_time { uint64_t st_time; }; -static int64_t smccc(uint32_t func, uint32_t arg) +static int64_t smccc(uint32_t func, uint64_t arg) { unsigned long ret; asm volatile( - "mov x0, %1\n" + "mov w0, %w1\n" "mov x1, %2\n" "hvc #0\n" "mov %0, x0\n" diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c new file mode 100644 index 0000000000..df04f56ce8 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_int_ctl_test + * + * Copyright (C) 2021, Red Hat, Inc. + * + * Nested SVM testing: test simultaneous use of V_IRQ from L1 and L0. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "apic.h" + +#define VCPU_ID 0 + +static struct kvm_vm *vm; + +bool vintr_irq_called; +bool intr_irq_called; + +#define VINTR_IRQ_NUMBER 0x20 +#define INTR_IRQ_NUMBER 0x30 + +static void vintr_irq_handler(struct ex_regs *regs) +{ + vintr_irq_called = true; +} + +static void intr_irq_handler(struct ex_regs *regs) +{ + x2apic_write_reg(APIC_EOI, 0x00); + intr_irq_called = true; +} + +static void l2_guest_code(struct svm_test_data *svm) +{ + /* This code raises interrupt INTR_IRQ_NUMBER in the L1's LAPIC, + * and since L1 didn't enable virtual interrupt masking, + * L2 should receive it and not L1. + * + * L2 also has virtual interrupt 'VINTR_IRQ_NUMBER' pending in V_IRQ + * so it should also receive it after the following 'sti'. + */ + x2apic_write_reg(APIC_ICR, + APIC_DEST_SELF | APIC_INT_ASSERT | INTR_IRQ_NUMBER); + + __asm__ __volatile__( + "sti\n" + "nop\n" + ); + + GUEST_ASSERT(vintr_irq_called); + GUEST_ASSERT(intr_irq_called); + + __asm__ __volatile__( + "vmcall\n" + ); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + x2apic_enable(); + + /* Prepare for L2 execution. 
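 * generic_svm_setup() points the VMCB at l2_guest_code and hands it the
 * stack allocated above, so run_guest() enters L2 directly.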
*/ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* No virtual interrupt masking */ + vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; + + /* No intercepts for real and virtual interrupts */ + vmcb->control.intercept &= ~(1ULL << INTERCEPT_INTR | INTERCEPT_VINTR); + + /* Make a virtual interrupt VINTR_IRQ_NUMBER pending */ + vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT); + vmcb->control.int_vector = VINTR_IRQ_NUMBER; + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t svm_gva; + + nested_svm_check_supported(); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + + vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); + vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); + + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vm, VCPU_ID, 1, svm_gva); + + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + break; + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh index 3caf72bb9c..a2489ec398 100644 --- a/tools/testing/selftests/net/ioam6.sh +++ b/tools/testing/selftests/net/ioam6.sh @@ -468,10 +468,26 @@ out_bits() for i in {0..22} do ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace \ - prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} dev veth0 + prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} \ + dev veth0 &>/dev/null - run_test "out_bit$i" "${desc//$i}" ioam-node-alpha ioam-node-beta \ - db01::2 db01::1 veth0 ${bit2type[$i]} 123 + local cmd_res=$? 
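+	# Bits 12 to 21 are not supported by the kernel, so for those the
+	# route change above is expected to fail; the branch below checks
+	# that instead of running the usual data-path test.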
+ local descr="${desc//$i}" + + if [[ $i -ge 12 && $i -le 21 ]] + then + if [ $cmd_res != 0 ] + then + npassed=$((npassed+1)) + log_test_passed "$descr" + else + nfailed=$((nfailed+1)) + log_test_failed "$descr" + fi + else + run_test "out_bit$i" "$descr" ioam-node-alpha ioam-node-beta \ + db01::2 db01::1 veth0 ${bit2type[$i]} 123 + fi done bit2size[22]=$tmp @@ -544,7 +560,7 @@ in_bits() local tmp=${bit2size[22]} bit2size[22]=$(( $tmp + ${#BETA[9]} + ((4 - (${#BETA[9]} % 4)) % 4) )) - for i in {0..22} + for i in {0..11} {22..22} do ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace \ prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} dev veth0 diff --git a/tools/testing/selftests/net/ioam6_parser.c b/tools/testing/selftests/net/ioam6_parser.c index d376cb2c38..8f6997d358 100644 --- a/tools/testing/selftests/net/ioam6_parser.c +++ b/tools/testing/selftests/net/ioam6_parser.c @@ -94,16 +94,6 @@ enum { TEST_OUT_BIT9, TEST_OUT_BIT10, TEST_OUT_BIT11, - TEST_OUT_BIT12, - TEST_OUT_BIT13, - TEST_OUT_BIT14, - TEST_OUT_BIT15, - TEST_OUT_BIT16, - TEST_OUT_BIT17, - TEST_OUT_BIT18, - TEST_OUT_BIT19, - TEST_OUT_BIT20, - TEST_OUT_BIT21, TEST_OUT_BIT22, TEST_OUT_FULL_SUPP_TRACE, @@ -125,16 +115,6 @@ enum { TEST_IN_BIT9, TEST_IN_BIT10, TEST_IN_BIT11, - TEST_IN_BIT12, - TEST_IN_BIT13, - TEST_IN_BIT14, - TEST_IN_BIT15, - TEST_IN_BIT16, - TEST_IN_BIT17, - TEST_IN_BIT18, - TEST_IN_BIT19, - TEST_IN_BIT20, - TEST_IN_BIT21, TEST_IN_BIT22, TEST_IN_FULL_SUPP_TRACE, @@ -199,30 +179,6 @@ static int check_ioam_header(int tid, struct ioam6_trace_hdr *ioam6h, ioam6h->nodelen != 2 || ioam6h->remlen; - case TEST_OUT_BIT12: - case TEST_IN_BIT12: - case TEST_OUT_BIT13: - case TEST_IN_BIT13: - case TEST_OUT_BIT14: - case TEST_IN_BIT14: - case TEST_OUT_BIT15: - case TEST_IN_BIT15: - case TEST_OUT_BIT16: - case TEST_IN_BIT16: - case TEST_OUT_BIT17: - case TEST_IN_BIT17: - case TEST_OUT_BIT18: - case TEST_IN_BIT18: - case TEST_OUT_BIT19: - case TEST_IN_BIT19: - case TEST_OUT_BIT20: - case TEST_IN_BIT20: - case TEST_OUT_BIT21: - case TEST_IN_BIT21: - return ioam6h->overflow || - ioam6h->nodelen || - ioam6h->remlen != 1; - case TEST_OUT_BIT22: case TEST_IN_BIT22: return ioam6h->overflow || @@ -326,6 +282,66 @@ static int check_ioam6_data(__u8 **p, struct ioam6_trace_hdr *ioam6h, *p += sizeof(__u32); } + if (ioam6h->type.bit12) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit13) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit14) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit15) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit16) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit17) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit18) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit19) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit20) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit21) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + if (ioam6h->type.bit22) { len 
= cnf.sc_data ? strlen(cnf.sc_data) : 0; aligned = cnf.sc_data ? __ALIGN_KERNEL(len, 4) : 0; @@ -455,26 +471,6 @@ static int str2id(const char *tname) return TEST_OUT_BIT10; if (!strcmp("out_bit11", tname)) return TEST_OUT_BIT11; - if (!strcmp("out_bit12", tname)) - return TEST_OUT_BIT12; - if (!strcmp("out_bit13", tname)) - return TEST_OUT_BIT13; - if (!strcmp("out_bit14", tname)) - return TEST_OUT_BIT14; - if (!strcmp("out_bit15", tname)) - return TEST_OUT_BIT15; - if (!strcmp("out_bit16", tname)) - return TEST_OUT_BIT16; - if (!strcmp("out_bit17", tname)) - return TEST_OUT_BIT17; - if (!strcmp("out_bit18", tname)) - return TEST_OUT_BIT18; - if (!strcmp("out_bit19", tname)) - return TEST_OUT_BIT19; - if (!strcmp("out_bit20", tname)) - return TEST_OUT_BIT20; - if (!strcmp("out_bit21", tname)) - return TEST_OUT_BIT21; if (!strcmp("out_bit22", tname)) return TEST_OUT_BIT22; if (!strcmp("out_full_supp_trace", tname)) @@ -509,26 +505,6 @@ static int str2id(const char *tname) return TEST_IN_BIT10; if (!strcmp("in_bit11", tname)) return TEST_IN_BIT11; - if (!strcmp("in_bit12", tname)) - return TEST_IN_BIT12; - if (!strcmp("in_bit13", tname)) - return TEST_IN_BIT13; - if (!strcmp("in_bit14", tname)) - return TEST_IN_BIT14; - if (!strcmp("in_bit15", tname)) - return TEST_IN_BIT15; - if (!strcmp("in_bit16", tname)) - return TEST_IN_BIT16; - if (!strcmp("in_bit17", tname)) - return TEST_IN_BIT17; - if (!strcmp("in_bit18", tname)) - return TEST_IN_BIT18; - if (!strcmp("in_bit19", tname)) - return TEST_IN_BIT19; - if (!strcmp("in_bit20", tname)) - return TEST_IN_BIT20; - if (!strcmp("in_bit21", tname)) - return TEST_IN_BIT21; if (!strcmp("in_bit22", tname)) return TEST_IN_BIT22; if (!strcmp("in_full_supp_trace", tname)) @@ -606,16 +582,6 @@ static int (*func[__TEST_MAX])(int, struct ioam6_trace_hdr *, __u32, __u16) = { [TEST_OUT_BIT9] = check_ioam_header_and_data, [TEST_OUT_BIT10] = check_ioam_header_and_data, [TEST_OUT_BIT11] = check_ioam_header_and_data, - [TEST_OUT_BIT12] = check_ioam_header, - [TEST_OUT_BIT13] = check_ioam_header, - [TEST_OUT_BIT14] = check_ioam_header, - [TEST_OUT_BIT15] = check_ioam_header, - [TEST_OUT_BIT16] = check_ioam_header, - [TEST_OUT_BIT17] = check_ioam_header, - [TEST_OUT_BIT18] = check_ioam_header, - [TEST_OUT_BIT19] = check_ioam_header, - [TEST_OUT_BIT20] = check_ioam_header, - [TEST_OUT_BIT21] = check_ioam_header, [TEST_OUT_BIT22] = check_ioam_header_and_data, [TEST_OUT_FULL_SUPP_TRACE] = check_ioam_header_and_data, [TEST_IN_UNDEF_NS] = check_ioam_header, @@ -633,16 +599,6 @@ static int (*func[__TEST_MAX])(int, struct ioam6_trace_hdr *, __u32, __u16) = { [TEST_IN_BIT9] = check_ioam_header_and_data, [TEST_IN_BIT10] = check_ioam_header_and_data, [TEST_IN_BIT11] = check_ioam_header_and_data, - [TEST_IN_BIT12] = check_ioam_header, - [TEST_IN_BIT13] = check_ioam_header, - [TEST_IN_BIT14] = check_ioam_header, - [TEST_IN_BIT15] = check_ioam_header, - [TEST_IN_BIT16] = check_ioam_header, - [TEST_IN_BIT17] = check_ioam_header, - [TEST_IN_BIT18] = check_ioam_header, - [TEST_IN_BIT19] = check_ioam_header, - [TEST_IN_BIT20] = check_ioam_header, - [TEST_IN_BIT21] = check_ioam_header, [TEST_IN_BIT22] = check_ioam_header_and_data, [TEST_IN_FULL_SUPP_TRACE] = check_ioam_header_and_data, [TEST_FWD_FULL_SUPP_TRACE] = check_ioam_header_and_data, diff --git a/tools/testing/selftests/netfilter/nft_nat_zones.sh b/tools/testing/selftests/netfilter/nft_nat_zones.sh new file mode 100644 index 0000000000..b9ab37380f --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_nat_zones.sh @@ 
-0,0 +1,309 @@ +#!/bin/bash +# +# Test connection tracking zone and NAT source port reallocation support. +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# Don't increase too much, 2000 clients should work +# just fine but script can then take several minutes with +# KASAN/debug builds. +maxclients=100 + +have_iperf=1 +ret=0 + +# client1---. +# veth1-. +# | +# NAT Gateway --veth0--> Server +# | | +# veth2-' | +# client2---' | +# .... | +# clientX----vethX---' + +# All clients share identical IP address. +# NAT Gateway uses policy routing and conntrack zones to isolate client +# namespaces. Each client connects to Server, each with colliding tuples: +# clientsaddr:10000 -> serveraddr:dport +# NAT Gateway is supposed to do port reallocation for each of the +# connections. + +sfx=$(mktemp -u "XXXXXXXX") +gw="ns-gw-$sfx" +cl1="ns-cl1-$sfx" +cl2="ns-cl2-$sfx" +srv="ns-srv-$sfx" + +v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null) +v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null) +v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null) +v6gc1=$(sysctl -n net.ipv6.neigh.default.gc_thresh1 2>/dev/null) +v6gc2=$(sysctl -n net.ipv6.neigh.default.gc_thresh2 2>/dev/null) +v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null) + +cleanup() +{ + ip netns del $gw + ip netns del $srv + for i in $(seq 1 $maxclients); do + ip netns del ns-cl$i-$sfx 2>/dev/null + done + + sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +conntrack -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without conntrack tool" + exit $ksft_skip +fi + +iperf3 -v >/dev/null 2>&1 +if [ $? -ne 0 ];then + have_iperf=0 +fi + +ip netns add "$gw" +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $gw" + exit $ksft_skip +fi +ip -net "$gw" link set lo up + +trap cleanup EXIT + +ip netns add "$srv" +if [ $? -ne 0 ];then + echo "SKIP: Could not create server netns $srv" + exit $ksft_skip +fi + +ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv" +ip -net "$gw" link set veth0 up +ip -net "$srv" link set lo up +ip -net "$srv" link set eth0 up + +sysctl -q net.ipv6.neigh.default.gc_thresh1=512 2>/dev/null +sysctl -q net.ipv6.neigh.default.gc_thresh2=1024 2>/dev/null +sysctl -q net.ipv6.neigh.default.gc_thresh3=4096 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh1=512 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null + +for i in $(seq 1 $maxclients);do + cl="ns-cl$i-$sfx" + + ip netns add "$cl" + if [ $? -ne 0 ];then + echo "SKIP: Could not create client netns $cl" + exit $ksft_skip + fi + ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1 + if [ $? 
-ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip + fi +done + +for i in $(seq 1 $maxclients);do + cl="ns-cl$i-$sfx" + echo netns exec "$cl" ip link set lo up + echo netns exec "$cl" ip link set eth0 up + echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2 + echo netns exec "$gw" ip link set veth$i up + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2 + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0 + + # clients have same IP addresses. + echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0 + echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 + echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0 + echo netns exec "$cl" ip route add default via dead:1::2 dev eth0 + + # NB: same addresses on client-facing interfaces. + echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i + echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i + + # gw: policy routing + echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i)) + echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i)) + echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i)) + echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i)) + echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i)) +done | ip -batch /dev/stdin + +ip -net "$gw" addr add 10.3.0.1/24 dev veth0 +ip -net "$gw" addr add dead:3::1/64 dev veth0 + +ip -net "$srv" addr add 10.3.0.99/24 dev eth0 +ip -net "$srv" addr add dead:3::99/64 dev eth0 + +ip netns exec $gw nft -f /dev/stdin< /dev/null +ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null + +# useful for debugging: allows to use 'ping' from clients to gateway. +ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null +ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null + +for i in $(seq 1 $maxclients); do + cl="ns-cl$i-$sfx" + ip netns exec $cl ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & + if [ $? -ne 0 ]; then + echo FAIL: Ping failure from $cl 1>&2 + ret=1 + break + fi +done + +wait + +for i in $(seq 1 $maxclients); do + ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }" + if [ $? -ne 0 ];then + ret=1 + echo "FAIL: counter icmp mismatch for veth$i" 1>&2 + ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 + break + fi +done + +ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" +if [ $? -ne 0 ];then + ret=1 + echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" + ip netns exec $gw nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 +fi + +if [ $ret -eq 0 ]; then + echo "PASS: ping test from all $maxclients namespaces" +fi + +if [ $have_iperf -eq 0 ];then + echo "SKIP: iperf3 not installed" + if [ $ret -ne 0 ];then + exit $ret + fi + exit $ksft_skip +fi + +ip netns exec $srv iperf3 -s > /dev/null 2>&1 & +iperfpid=$! 
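+# give the iperf3 server a moment to start listening before clients connect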
+sleep 1 + +for i in $(seq 1 $maxclients); do + if [ $ret -ne 0 ]; then + break + fi + cl="ns-cl$i-$sfx" + ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null + if [ $? -ne 0 ]; then + echo FAIL: Failure to connect for $cl 1>&2 + ip netns exec $gw conntrack -S 1>&2 + ret=1 + fi +done +if [ $ret -eq 0 ];then + echo "PASS: iperf3 connections for all $maxclients net namespaces" +fi + +kill $iperfpid +wait + +for i in $(seq 1 $maxclients); do + ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null + if [ $? -ne 0 ];then + ret=1 + echo "FAIL: can't find expected tcp entry for veth$i" 1>&2 + break + fi +done +if [ $ret -eq 0 ];then + echo "PASS: Found client connection for all $maxclients net namespaces" +fi + +ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null +if [ $? -ne 0 ];then + ret=1 + echo "FAIL: cannot find return entry on veth0" 1>&2 +fi + +exit $ret diff --git a/tools/testing/selftests/netfilter/nft_zones_many.sh b/tools/testing/selftests/netfilter/nft_zones_many.sh new file mode 100644 index 0000000000..ac646376eb --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_zones_many.sh @@ -0,0 +1,156 @@ +#!/bin/bash + +# Test insertion speed for packets with identical addresses/ports +# that are all placed in distinct conntrack zones. + +sfx=$(mktemp -u "XXXXXXXX") +ns="ns-$sfx" + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +zones=20000 +have_ct_tool=0 +ret=0 + +cleanup() +{ + ip netns del $ns +} + +ip netns add $ns +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $gw" + exit $ksft_skip +fi + +trap cleanup EXIT + +conntrack -V > /dev/null 2>&1 +if [ $? -eq 0 ];then + have_ct_tool=1 +fi + +ip -net "$ns" link set lo up + +test_zones() { + local max_zones=$1 + +ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 +ip netns exec $ns nft -f /dev/stdin</dev/null | ip netns exec "$ns" nc -w 1 -q 1 -u -p 12345 127.0.0.1 12345 > /dev/null + if [ $? -ne 0 ] ;then + ret=1 + break + fi + + stop=$(date +%s%3N) + local duration=$((stop-start)) + echo "PASS: added 10000 entries in $duration ms (now $i total, loop $j)" + done + + if [ $have_ct_tool -eq 1 ]; then + local count=$(ip netns exec "$ns" conntrack -C) + local duration=$((stop-outerstart)) + + if [ $count -eq $max_zones ]; then + echo "PASS: inserted $count entries from packet path in $duration ms total" + else + ip netns exec $ns conntrack -S 1>&2 + echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries" + ret=1 + fi + fi + + if [ $ret -ne 0 ];then + echo "FAIL: insert $max_zones entries from packet path" 1>&2 + fi +} + +test_conntrack_tool() { + local max_zones=$1 + + ip netns exec $ns conntrack -F >/dev/null 2>/dev/null + + local outerstart=$(date +%s%3N) + local start=$(date +%s%3N) + local stop=$start + local i=0 + while [ $i -lt $max_zones ]; do + i=$((i + 1)) + ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1 + if [ $? 
-ne 0 ];then + ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null + echo "FAIL: conntrack -I returned an error" + ret=1 + break + fi + + if [ $((i%10000)) -eq 0 ];then + stop=$(date +%s%3N) + + local duration=$((stop-start)) + echo "PASS: added 10000 entries in $duration ms (now $i total)" + start=$stop + fi + done + + local count=$(ip netns exec "$ns" conntrack -C) + local duration=$((stop-outerstart)) + + if [ $count -eq $max_zones ]; then + echo "PASS: inserted $count entries via ctnetlink in $duration ms" + else + ip netns exec $ns conntrack -S 1>&2 + echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)" + ret=1 + fi +} + +test_zones $zones + +if [ $have_ct_tool -eq 1 ];then + test_conntrack_tool $zones +else + echo "SKIP: Could not run ctnetlink insertion test without conntrack tool" + if [ $ret -eq 0 ];then + exit $ksft_skip + fi +fi + +exit $ret diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 439d3b4cd1..7851f3a1b5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -235,9 +235,13 @@ static void ack_flush(void *_completed) { } -static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) +static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait) { - if (unlikely(!cpus)) + const struct cpumask *cpus; + + if (likely(cpumask_available(tmp))) + cpus = tmp; + else cpus = cpu_online_mask; if (cpumask_empty(cpus)) @@ -263,14 +267,34 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, continue; kvm_make_request(req, vcpu); - cpu = vcpu->cpu; if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) continue; - if (tmp != NULL && cpu != -1 && cpu != me && - kvm_request_needs_ipi(vcpu, req)) - __cpumask_set_cpu(cpu, tmp); + /* + * tmp can be "unavailable" if cpumasks are allocated off stack + * as allocation of the mask is deliberately not fatal and is + * handled by falling back to kicking all online CPUs. + */ + if (!cpumask_available(tmp)) + continue; + + /* + * Note, the vCPU could get migrated to a different pCPU at any + * point after kvm_request_needs_ipi(), which could result in + * sending an IPI to the previous pCPU. But, that's ok because + * the purpose of the IPI is to ensure the vCPU returns to + * OUTSIDE_GUEST_MODE, which is satisfied if the vCPU migrates. + * Entering READING_SHADOW_PAGE_TABLES after this point is also + * ok, as the requirement is only that KVM wait for vCPUs that + * were reading SPTEs _before_ any changes were finalized. See + * kvm_vcpu_kick() for more details on handling requests. + */ + if (kvm_request_needs_ipi(vcpu, req)) { + cpu = READ_ONCE(vcpu->cpu); + if (cpu != -1 && cpu != me) + __cpumask_set_cpu(cpu, tmp); + } } called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); @@ -302,13 +326,8 @@ EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL void kvm_flush_remote_tlbs(struct kvm *kvm) { - /* - * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in - * kvm_make_all_cpus_request. - */ - long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); - ++kvm->stat.generic.remote_tlb_flush_requests; + /* * We want to publish modifications to the page tables before reading * mode. Pairs with a memory barrier in arch-specific code. 
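 * (With the tlbs_dirty accounting dropped above, the flush check in
 * __kvm_handle_hva_range below relies on the handler's return value alone.)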
@@ -323,7 +342,6 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) if (!kvm_arch_flush_remote_tlb(kvm) || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.generic.remote_tlb_flush; - cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); #endif @@ -528,7 +546,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, } } - if (range->flush_on_ret && (ret || kvm->tlbs_dirty)) + if (range->flush_on_ret && ret) kvm_flush_remote_tlbs(kvm); if (locked) @@ -3134,15 +3152,19 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) { - unsigned int old, val, shrink; + unsigned int old, val, shrink, grow_start; old = val = vcpu->halt_poll_ns; shrink = READ_ONCE(halt_poll_ns_shrink); + grow_start = READ_ONCE(halt_poll_ns_grow_start); if (shrink == 0) val = 0; else val /= shrink; + if (val < grow_start) + val = 0; + vcpu->halt_poll_ns = val; trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); } @@ -3290,16 +3312,24 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); */ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { - int me; - int cpu = vcpu->cpu; + int me, cpu; if (kvm_vcpu_wake_up(vcpu)) return; + /* + * Note, the vCPU could get migrated to a different pCPU at any point + * after kvm_arch_vcpu_should_kick(), which could result in sending an + * IPI to the previous pCPU. But, that's ok because the purpose of the + * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the + * vCPU also requires it to leave IN_GUEST_MODE. + */ me = get_cpu(); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (kvm_arch_vcpu_should_kick(vcpu)) + if (kvm_arch_vcpu_should_kick(vcpu)) { + cpu = READ_ONCE(vcpu->cpu); + if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) smp_send_reschedule(cpu); + } put_cpu(); } EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
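A quick reference for the perf attr test expectations earlier in this patch: type=3 is PERF_TYPE_HW_CACHE, type=4 is PERF_TYPE_RAW, and every HW-cache config value follows the encoding the test comments spell out, id << 0 | op << 8 | result << 16. A minimal sketch that reproduces the expected numbers, assuming only the UAPI constants from <linux/perf_event.h> (the file and helper names are illustrative):

/* cache_cfg.c - recompute the HW-cache config values used in the
 * test-stat-detailed files. Build: gcc -o cache_cfg cache_cfg.c
 */
#include <stdio.h>
#include <linux/perf_event.h>

/* id << 0 | op << 8 | result << 16, as in the test file comments */
static unsigned long cache_config(unsigned int id, unsigned int op,
				  unsigned int result)
{
	return id | (op << 8) | (result << 16);
}

int main(void)
{
	/* event21: L1D read miss -> 65536 (0x10000) */
	printf("%lu\n", cache_config(PERF_COUNT_HW_CACHE_L1D,
				     PERF_COUNT_HW_CACHE_OP_READ,
				     PERF_COUNT_HW_CACHE_RESULT_MISS));
	/* event23: LL read miss -> 65538 (0x10002) */
	printf("%lu\n", cache_config(PERF_COUNT_HW_CACHE_LL,
				     PERF_COUNT_HW_CACHE_OP_READ,
				     PERF_COUNT_HW_CACHE_RESULT_MISS));
	/* event31: L1D prefetch miss -> 66048 (0x10200) */
	printf("%lu\n", cache_config(PERF_COUNT_HW_CACHE_L1D,
				     PERF_COUNT_HW_CACHE_OP_PREFETCH,
				     PERF_COUNT_HW_CACHE_RESULT_MISS));
	return 0;
}

The same formula explains every config value in the detailed-stat files, e.g. config=2 is an LL read access and config=512 is an L1D prefetch access.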