rockchip: add kernel 6.1 support

This commit is contained in:
DHDAXCW 2022-12-20 05:44:01 +00:00
parent dca450d9a6
commit f247ce474f
332 changed files with 25565 additions and 10495 deletions

View File

@ -1,2 +0,0 @@
LINUX_VERSION-6.0 = .10
LINUX_KERNEL_HASH-6.0.10 = 39e57fcd84cd70bfa3e1a4185d3aa0ed7f1432f24c6548d16326b0c3c9541dd0

2
include/kernel-6.1 Normal file
View File

@ -0,0 +1,2 @@
LINUX_VERSION-6.1 =
LINUX_KERNEL_HASH-6.1 = 2ca1f17051a430f6fed1196e4952717507171acfd97d96577212502703b25deb

View File

@ -221,11 +221,6 @@ $(eval $(call nf_add,NF_NATHELPER_EXTRA,CONFIG_NF_CONNTRACK_IRC, $(P_XT)nf_connt
$(eval $(call nf_add,NF_NATHELPER_EXTRA,CONFIG_NF_NAT_IRC, $(P_XT)nf_nat_irc))
# ulog
$(eval $(call nf_add,IPT_ULOG,CONFIG_IP_NF_TARGET_ULOG, $(P_V4)ipt_ULOG))
# nflog
$(eval $(call nf_add,IPT_NFLOG,CONFIG_NETFILTER_XT_TARGET_NFLOG, $(P_XT)xt_NFLOG))
@ -311,7 +306,6 @@ $(eval $(call nf_add,EBTABLES_IP4,CONFIG_BRIDGE_EBT_SNAT, $(P_EBT)ebt_snat))
# watchers
$(eval $(call nf_add,EBTABLES_WATCHERS,CONFIG_BRIDGE_EBT_LOG, $(P_EBT)ebt_log))
$(eval $(call nf_add,EBTABLES_WATCHERS,CONFIG_BRIDGE_EBT_ULOG, $(P_EBT)ebt_ulog))
$(eval $(call nf_add,EBTABLES_WATCHERS,CONFIG_BRIDGE_EBT_NFLOG, $(P_EBT)ebt_nflog))
$(eval $(call nf_add,EBTABLES_WATCHERS,CONFIG_BRIDGE_EBT_NFQUEUE, $(P_EBT)ebt_nfqueue))
@ -374,7 +368,6 @@ IPT_BUILTIN += $(IPT_NAT6-y)
IPT_BUILTIN += $(IPT_NAT_EXTRA-y)
IPT_BUILTIN += $(NF_NATHELPER-y)
IPT_BUILTIN += $(NF_NATHELPER_EXTRA-y)
IPT_BUILTIN += $(IPT_ULOG-y)
IPT_BUILTIN += $(IPT_TPROXY-y)
IPT_BUILTIN += $(NFNETLINK-y)
IPT_BUILTIN += $(NFNETLINK_LOG-y)

View File

@ -447,32 +447,6 @@ endef
$(eval $(call KernelPackage,crypto-kpp))
define KernelPackage/crypto-lib-blake2s
TITLE:=BLAKE2s hash function library
KCONFIG:=CONFIG_CRYPTO_LIB_BLAKE2S
HIDDEN:=1
FILES:= \
$(LINUX_DIR)/lib/crypto/libblake2s.ko \
$(LINUX_DIR)/lib/crypto/libblake2s-generic.ko
$(call AddDepends/crypto,+PACKAGE_kmod-crypto-hash:kmod-crypto-hash)
endef
define KernelPackage/crypto-lib-blake2s/config
imply PACKAGE_kmod-crypto-hash
endef
define KernelPackage/crypto-lib-blake2s/x86/64
KCONFIG+=CONFIG_CRYPTO_BLAKE2S_X86
FILES+=$(LINUX_DIR)/arch/x86/crypto/blake2s-x86_64.ko
endef
define KernelPackage/crypto-lib-blake2s/arm
KCONFIG+=CONFIG_CRYPTO_BLAKE2S_ARM
FILES+=$(LINUX_DIR)/arch/arm/crypto/blake2s-arm.ko
endef
$(eval $(call KernelPackage,crypto-lib-blake2s))
define KernelPackage/crypto-lib-chacha20
TITLE:=ChaCha library interface

View File

@ -109,9 +109,9 @@ define KernelPackage/fs-cifs
+kmod-crypto-ccm \
+kmod-crypto-ecb \
+kmod-crypto-des \
+(LINUX_5_15||LINUX_6_0):kmod-asn1-decoder \
+(LINUX_5_15||LINUX_6_0):kmod-oid-registry \
+(LINUX_5_15||LINUX_6_0):kmod-dnsresolver
+(LINUX_5_15||LINUX_6_1):kmod-asn1-decoder \
+(LINUX_5_15||LINUX_6_1):kmod-oid-registry \
+(LINUX_5_15||LINUX_6_1):kmod-dnsresolver
endef
define KernelPackage/fs-cifs/description
@ -530,7 +530,7 @@ $(eval $(call KernelPackage,fs-ntfs))
define KernelPackage/fs-ntfs3
SUBMENU:=$(FS_MENU)
TITLE:=NTFS3 Read-Write file system support
DEPENDS:=@(LINUX_5_4||LINUX_5_10||LINUX_5_15||LINUX_6_0) +kmod-nls-base
DEPENDS:= +kmod-nls-base
KCONFIG:= \
CONFIG_NTFS3_FS \
CONFIG_NTFS3_64BIT_CLUSTER=y \

View File

@ -134,6 +134,7 @@ define KernelPackage/lib-zstd
$(LINUX_DIR)/crypto/zstd.ko \
$(LINUX_DIR)/lib/xxhash.ko \
$(LINUX_DIR)/lib/zstd/zstd_compress.ko \
$(LINUX_DIR)/lib/zstd/zstd_common.ko@ge6.1 \
$(LINUX_DIR)/lib/zstd/zstd_decompress.ko
AUTOLOAD:=$(call AutoProbe,xxhash zstd zstd_compress zstd_decompress)
endef
@ -151,13 +152,15 @@ define KernelPackage/lib-lz4
DEPENDS:=+kmod-crypto-acompress
KCONFIG:= \
CONFIG_CRYPTO_LZ4 \
CONFIG_CRYPTO_LZ4HC \
CONFIG_LZ4_COMPRESS \
CONFIG_LZ4_DECOMPRESS
FILES:= \
$(LINUX_DIR)/crypto/lz4.ko \
$(LINUX_DIR)/lib/lz4/lz4_compress.ko \
$(LINUX_DIR)/lib/lz4/lz4hc_compress.ko \
$(LINUX_DIR)/lib/lz4/lz4_decompress.ko
AUTOLOAD:=$(call AutoProbe,lz4 lz4_compress lz4_decompress)
AUTOLOAD:=$(call AutoProbe,lz4 lz4_compress lz4hc_compress lz4_decompress)
endef
define KernelPackage/lib-lz4/description

View File

@ -142,7 +142,7 @@ $(eval $(call KernelPackage,mii))
define KernelPackage/mdio-devres
SUBMENU:=$(NETWORK_DEVICES_MENU)
TITLE:=Supports MDIO device registration
DEPENDS:=@(LINUX_5_10||LINUX_5_15||LINUX_6_0) +kmod-libphy +(TARGET_armvirt||TARGET_bcm27xx_bcm2708||TARGET_tegra):kmod-of-mdio
DEPENDS:=@(LINUX_5_10||LINUX_5_15||LINUX_6_1) +kmod-libphy +(TARGET_armvirt||TARGET_bcm27xx_bcm2708||TARGET_tegra):kmod-of-mdio
KCONFIG:=CONFIG_MDIO_DEVRES
HIDDEN:=1
FILES:=$(LINUX_DIR)/drivers/net/phy/mdio_devres.ko
@ -597,7 +597,7 @@ $(eval $(call KernelPackage,8139cp))
define KernelPackage/r8169
SUBMENU:=$(NETWORK_DEVICES_MENU)
TITLE:=RealTek RTL-8169 PCI Gigabit Ethernet Adapter kernel support
DEPENDS:=@PCI_SUPPORT +kmod-mii +r8169-firmware +kmod-phy-realtek +(LINUX_5_10||LINUX_5_15||LINUX_6_0):kmod-mdio-devres
DEPENDS:=@PCI_SUPPORT +kmod-mii +r8169-firmware +kmod-phy-realtek +(LINUX_5_10||LINUX_5_15||LINUX_6_1):kmod-mdio-devres
KCONFIG:= \
CONFIG_R8169 \
CONFIG_R8169_NAPI=y \
@ -723,7 +723,7 @@ $(eval $(call KernelPackage,igbvf))
define KernelPackage/ixgbe
SUBMENU:=$(NETWORK_DEVICES_MENU)
TITLE:=Intel(R) 82598/82599 PCI-Express 10 Gigabit Ethernet support
DEPENDS:=@PCI_SUPPORT +kmod-mdio +kmod-ptp +kmod-hwmon-core +kmod-libphy +(LINUX_5_10||LINUX_5_15||LINUX_6_0):kmod-mdio-devres
DEPENDS:=@PCI_SUPPORT +kmod-mdio +kmod-ptp +kmod-hwmon-core +kmod-libphy +(LINUX_5_10||LINUX_5_15||LINUX_6_1):kmod-mdio-devres
KCONFIG:=CONFIG_IXGBE \
CONFIG_IXGBE_VXLAN=n \
CONFIG_IXGBE_HWMON=y \

View File

@ -604,23 +604,6 @@ endef
$(eval $(call KernelPackage,nf-nathelper-extra))
define KernelPackage/ipt-ulog
TITLE:=Module for user-space packet logging
KCONFIG:=$(KCONFIG_IPT_ULOG)
FILES:=$(foreach mod,$(IPT_ULOG-m),$(LINUX_DIR)/net/$(mod).ko)
AUTOLOAD:=$(call AutoProbe,$(notdir $(IPT_ULOG-m)))
$(call AddDepends/ipt)
endef
define KernelPackage/ipt-ulog/description
Netfilter (IPv4) module for user-space packet logging
Includes:
- ULOG
endef
$(eval $(call KernelPackage,ipt-ulog))
define KernelPackage/ipt-nflog
TITLE:=Module for user-space packet logging
KCONFIG:=$(KCONFIG_IPT_NFLOG)

View File

@ -565,6 +565,23 @@ endef
$(eval $(call KernelPackage,veth))
# Kernel module package for the VRF (Virtual Routing and Forwarding) network
# driver: enables CONFIG_NET_VRF, ships drivers/net/vrf.ko and registers the
# "vrf" module through AutoLoad with the argument 30.
# NOTE(review): the "@KERNEL_NET_L3_MASTER_DEV" entry in DEPENDS presumably
# limits this package to builds that have that kernel option enabled, and the
# 30 is presumably the load-order priority -- confirm against the build
# system's DEPENDS / AutoLoad documentation.
define KernelPackage/vrf
SUBMENU:=$(NETWORK_SUPPORT_MENU)
TITLE:=Virtual Routing and Forwarding (Lite)
DEPENDS:=@KERNEL_NET_L3_MASTER_DEV
KCONFIG:=CONFIG_NET_VRF
FILES:=$(LINUX_DIR)/drivers/net/vrf.ko
AUTOLOAD:=$(call AutoLoad,30,vrf)
endef
define KernelPackage/vrf/description
This option enables the support for mapping interfaces into VRF's. The
support enables VRF devices.
endef
$(eval $(call KernelPackage,vrf))
define KernelPackage/slhc
SUBMENU:=$(NETWORK_SUPPORT_MENU)
HIDDEN:=1
@ -1296,7 +1313,6 @@ define KernelPackage/wireguard
SUBMENU:=$(NETWORK_SUPPORT_MENU)
TITLE:=WireGuard secure network tunnel
DEPENDS:= \
+kmod-crypto-lib-blake2s \
+kmod-crypto-lib-chacha20poly1305 \
+kmod-crypto-lib-curve25519 \
+kmod-udptunnel4 \
@ -1340,11 +1356,11 @@ define KernelPackage/qrtr
SUBMENU:=$(NETWORK_SUPPORT_MENU)
TITLE:=Qualcomm IPC Router support
HIDDEN:=1
DEPENDS:=@(LINUX_5_15||LINUX_6_0)
DEPENDS:=@(LINUX_5_15||LINUX_6_1)
KCONFIG:=CONFIG_QRTR
FILES:= \
$(LINUX_DIR)/net/qrtr/qrtr.ko \
$(LINUX_DIR)/net/qrtr/ns.ko
$(LINUX_DIR)/net/qrtr/ns.ko@lt6.0
AUTOLOAD:=$(call AutoProbe,qrtr)
endef

View File

@ -916,6 +916,10 @@ define KernelPackage/zram/config
bool "lz4"
select PACKAGE_kmod-lib-lz4
config ZRAM_DEF_COMP_LZ4HC
bool "lz4-hc"
select PACKAGE_kmod-lib-lz4hc
config ZRAM_DEF_COMP_ZSTD
bool "zstd"
select PACKAGE_kmod-lib-zstd
@ -1136,8 +1140,8 @@ $(eval $(call KernelPackage,keys-trusted))
define KernelPackage/tpm
SUBMENU:=$(OTHER_MENU)
TITLE:=TPM Hardware Support
DEPENDS:= +kmod-random-core +(LINUX_5_15||LINUX_6_0):kmod-asn1-decoder \
+(LINUX_5_15||LINUX_6_0):kmod-asn1-encoder +(LINUX_5_15||LINUX_6_0):kmod-oid-registry
DEPENDS:= +kmod-random-core +(LINUX_5_15||LINUX_6_1):kmod-asn1-decoder \
+(LINUX_5_15||LINUX_6_1):kmod-asn1-encoder +(LINUX_5_15||LINUX_6_1):kmod-oid-registry
KCONFIG:= CONFIG_TCG_TPM
FILES:= $(LINUX_DIR)/drivers/char/tpm/tpm.ko
AUTOLOAD:=$(call AutoLoad,10,tpm,1)
@ -1283,7 +1287,7 @@ $(eval $(call KernelPackage,qcom-qmi-helpers))
define KernelPackage/mhi
SUBMENU:=$(OTHER_MENU)
TITLE:=Modem Host Interface (MHI) bus
DEPENDS:=@(LINUX_5_15||LINUX_6_0)
DEPENDS:=@(LINUX_5_15||LINUX_6_1)
KCONFIG:=CONFIG_MHI_BUS \
CONFIG_MHI_BUS_DEBUG=y \
CONFIG_MHI_BUS_PCI_GENERIC=n \

View File

@ -1138,7 +1138,8 @@ $(eval $(call KernelPackage,usb-net-aqc111))
define KernelPackage/usb-net-asix
TITLE:=Kernel module for USB-to-Ethernet Asix convertors
DEPENDS:=+kmod-libphy +(LINUX_5_15||LINUX_6_0):kmod-mdio-devres
DEPENDS:=+(LINUX_5_4||LINUX_5_10):kmod-libphy \
+(LINUX_5_15||LINUX_6_1):kmod-mdio-devres +LINUX_6_1:kmod-phylink
KCONFIG:=CONFIG_USB_NET_AX8817X
FILES:= \
$(LINUX_DIR)/drivers/$(USBNET_DIR)/asix.ko \
@ -1153,7 +1154,6 @@ endef
$(eval $(call KernelPackage,usb-net-asix))
define KernelPackage/usb-net-asix-ax88179
TITLE:=Kernel module for USB-to-Gigabit-Ethernet Asix convertors
DEPENDS:=+kmod-libphy

View File

@ -28,7 +28,9 @@ define KernelPackage/backlight
CONFIG_BACKLIGHT_ADP8870=n \
CONFIG_BACKLIGHT_OT200=n \
CONFIG_BACKLIGHT_PM8941_WLED=n
FILES:=$(LINUX_DIR)/drivers/video/backlight/backlight.ko
FILES:=$(LINUX_DIR)/drivers/video/backlight/backlight.ko \
$(LINUX_DIR)/drivers/acpi/video.ko@ge6.1 \
$(LINUX_DIR)/drivers/platform/x86/wmi.ko@ge6.1
AUTOLOAD:=$(call AutoProbe,video backlight)
endef
@ -243,8 +245,8 @@ define KernelPackage/drm
SUBMENU:=$(VIDEO_MENU)
TITLE:=Direct Rendering Manager (DRM) support
HIDDEN:=1
DEPENDS:=+kmod-dma-buf +kmod-i2c-core +kmod-i2c-algo-bit +PACKAGE_kmod-backlight:kmod-backlight \
+(LINUX_5_15||LINUX_6_0):kmod-fb
DEPENDS:=+kmod-dma-buf +kmod-i2c-core +kmod-i2c-algo-bit +kmod-backlight \
+(LINUX_5_15||LINUX_6_1):kmod-fb
KCONFIG:= \
CONFIG_DRM \
CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y \
@ -266,7 +268,7 @@ $(eval $(call KernelPackage,drm))
define KernelPackage/drm-buddy
SUBMENU:=$(VIDEO_MENU)
TITLE:=A page based buddy allocator
DEPENDS:=@TARGET_x86 @DISPLAY_SUPPORT +kmod-drm @(LINUX_6_0)
DEPENDS:=@TARGET_x86 @DISPLAY_SUPPORT +kmod-drm @LINUX_6_1
KCONFIG:=CONFIG_DRM_BUDDY
FILES:= $(LINUX_DIR)/drivers/gpu/drm/drm_buddy.ko
AUTOLOAD:=$(call AutoProbe,drm_buddy)
@ -311,7 +313,7 @@ $(eval $(call KernelPackage,drm-kms-helper))
define KernelPackage/drm-display-helper
SUBMENU:=$(VIDEO_MENU)
TITLE:=DRM helpers for display adapters drivers
DEPENDS:=@DISPLAY_SUPPORT +kmod-drm +TARGET_x86:kmod-drm-buddy @(LINUX_6_0)
DEPENDS:=@DISPLAY_SUPPORT +kmod-drm +TARGET_x86:kmod-drm-buddy @LINUX_6_1
KCONFIG:=CONFIG_DRM_DISPLAY_HELPER
FILES:=$(LINUX_DIR)/drivers/gpu/drm/display/drm_display_helper.ko
AUTOLOAD:=$(call AutoProbe,drm_display_helper)
@ -328,7 +330,7 @@ define KernelPackage/drm-amdgpu
TITLE:=AMDGPU DRM support
DEPENDS:=@TARGET_x86 @DISPLAY_SUPPORT +kmod-backlight +kmod-drm-ttm \
+kmod-drm-kms-helper +kmod-i2c-algo-bit +amdgpu-firmware \
+(LINUX_6_0):kmod-drm-display-helper
+LINUX_6_1:kmod-drm-display-helper
KCONFIG:=CONFIG_DRM_AMDGPU \
CONFIG_DRM_AMDGPU_SI=y \
CONFIG_DRM_AMDGPU_CIK=y \
@ -1103,7 +1105,7 @@ define KernelPackage/drm-i915
SUBMENU:=$(VIDEO_MENU)
TITLE:=Intel GPU drm support
DEPENDS:=@TARGET_x86 +kmod-drm-ttm +kmod-drm-kms-helper +i915-firmware \
+(LINUX_6_0):kmod-drm-display-helper
+LINUX_6_1:kmod-drm-display-helper
KCONFIG:= \
CONFIG_INTEL_GTT \
CONFIG_DRM_I915 \

View File

@ -7,12 +7,12 @@ include $(TOPDIR)/rules.mk
include $(INCLUDE_DIR)/kernel.mk
PKG_NAME:=r8152
PKG_VERSION:=2.16.1
PKG_RELEASE:=1
PKG_VERSION:=2.16.3.20220914
PKG_RELEASE:=3
PKG_SOURCE:=$(PKG_NAME)-$(PKG_VERSION).tar.gz
PKG_SOURCE_URL:=https://codeload.github.com/wget/realtek-r8152-linux/tar.gz/v$(PKG_VERSION)?
PKG_HASH:=2be6a02f6e29485efd107bb7e777ad3c482d9db0ff7e5e6c5ef034a1557a395b
PKG_HASH:=61ed7af34c8882c6028ddd1a27bb78fb5bfba41211f84dd7a06e4dc84dbe9a9a
PKG_BUILD_DIR:=$(KERNEL_BUILD_DIR)/realtek-$(PKG_NAME)-linux-$(PKG_VERSION)

View File

@ -0,0 +1,38 @@
--- a/compatibility.h
+++ b/compatibility.h
@@ -237,9 +237,15 @@
#define napi_disable(napi_ptr) netif_poll_disable(container_of(napi_ptr, struct r8152, napi)->netdev)
#define napi_schedule(napi_ptr) netif_rx_schedule(container_of(napi_ptr, struct r8152, napi)->netdev)
#define napi_complete(napi_ptr) netif_rx_complete(container_of(napi_ptr, struct r8152, napi)->netdev)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0)
+ #define netif_napi_add_weight(ndev, napi_ptr, function, weight_t) \
+ ndev->poll = function; \
+ ndev->weight = weight_t;
+#else
#define netif_napi_add(ndev, napi_ptr, function, weight_t) \
ndev->poll = function; \
ndev->weight = weight_t;
+#endif
typedef unsigned long uintptr_t;
#define DMA_BIT_MASK(value) \
(value < 64 ? ((1ULL << value) - 1) : 0xFFFFFFFFFFFFFFFFULL)
--- a/r8152.c
+++ b/r8152.c
@@ -20718,10 +20718,17 @@
usb_set_intfdata(intf, tp);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0)
+ if (tp->support_2500full)
+ netif_napi_add_weight(netdev, &tp->napi, r8152_poll, 256);
+ else
+ netif_napi_add_weight(netdev, &tp->napi, r8152_poll, 64);
+#else
if (tp->support_2500full)
netif_napi_add(netdev, &tp->napi, r8152_poll, 256);
else
netif_napi_add(netdev, &tp->napi, r8152_poll, 64);
+#endif
ret = register_netdev(netdev);
if (ret != 0) {

View File

@ -7,13 +7,13 @@ include $(TOPDIR)/rules.mk
include $(INCLUDE_DIR)/kernel.mk
PKG_NAME:=r8168
PKG_VERSION:=8.050.03
PKG_RELEASE:=$(AUTORELEASE)
PKG_VERSION:=8.051.02
PKG_RELEASE:=1
PKG_SOURCE_PROTO:=git
PKG_SOURCE_URL:=https://github.com/BROBIRD/openwrt-r8168.git
PKG_SOURCE_VERSION:=ddfaceacd1b7ed2857fb995642a8ffb1fc37e989
PKG_MIRROR_HASH:=5428f60dc33e9503c6cfdf690c00077149dce24cbb0591129d905b9f1aad9202
PKG_SOURCE_VERSION:=4f6cfe1ca12fb772deed57f1d2d1062af041ad07
PKG_MIRROR_HASH:=6b149f5eb3b9e1dc50867a694984d253aa58d97dd5fbab30eb405d2d7b2be587
PKG_BUILD_DIR:=$(KERNEL_BUILD_DIR)/$(PKG_NAME)-$(PKG_VERSION)

View File

@ -8,7 +8,7 @@
#include <linux/if_vlan.h>
#include <linux/crc32.h>
#include <linux/interrupt.h>
@@ -24643,6 +24644,22 @@ rtl8168_set_bios_setting(struct net_devi
@@ -24769,6 +24770,22 @@ rtl8168_set_bios_setting(struct net_devi
}
}
@ -31,7 +31,7 @@
static void
rtl8168_init_software_variable(struct net_device *dev)
{
@@ -25206,6 +25223,8 @@ rtl8168_init_software_variable(struct ne
@@ -25343,6 +25360,8 @@ rtl8168_init_software_variable(struct ne
tp->NotWrMcuPatchCode = TRUE;
}

View File

@ -1,47 +0,0 @@
--- a/src/r8168_n.c
+++ b/src/r8168_n.c
@@ -3715,7 +3715,11 @@
txd->opts2 = 0;
while (1) {
memset(tmpAddr, pattern++, len - 14);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,18,0)
pci_dma_sync_single_for_device(tp->pci_dev,
+#else
+ dma_sync_single_for_device(tp_to_dev(tp),
+#endif
le64_to_cpu(mapping),
len, DMA_TO_DEVICE);
txd->opts1 = cpu_to_le32(DescOwn | FirstFrag | LastFrag | len);
@@ -3743,7 +3747,11 @@
if (rx_len == len) {
dma_sync_single_for_cpu(tp_to_dev(tp), le64_to_cpu(rxd->addr), tp->rx_buf_sz, DMA_FROM_DEVICE);
i = memcmp(skb->data, rx_skb->data, rx_len);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,18,0)
pci_dma_sync_single_for_device(tp->pci_dev, le64_to_cpu(rxd->addr), tp->rx_buf_sz, DMA_FROM_DEVICE);
+#else
+ dma_sync_single_for_device(tp_to_dev(tp), le64_to_cpu(rxd->addr), tp->rx_buf_sz, DMA_FROM_DEVICE);
+#endif
if (i == 0) {
// dev_printk(KERN_INFO, tp_to_dev(tp), "loopback test finished\n",rx_len,len);
break;
@@ -26464,11 +26472,20 @@
if ((sizeof(dma_addr_t) > 4) &&
use_dac &&
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,18,0)
!pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) &&
!pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {
+#else
+ !dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)) &&
+ !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64))) {
+#endif
dev->features |= NETIF_F_HIGHDMA;
} else {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,18,0)
rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+#else
+ rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
+#endif
if (rc < 0) {
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0)
if (netif_msg_probe(tp))

View File

@ -0,0 +1,14 @@
--- a/src/r8168.h
+++ b/src/r8168.h
@@ -566,7 +566,11 @@
typedef struct napi_struct *napi_ptr;
typedef int napi_budget;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0)
+#define RTL_NAPI_CONFIG(ndev, priv, function, weight) netif_napi_add_weight(ndev, &priv->napi, function, weight)
+#else
#define RTL_NAPI_CONFIG(ndev, priv, function, weight) netif_napi_add(ndev, &priv->napi, function, weight)
+#endif
#define RTL_NAPI_QUOTA(budget, ndev) min(budget, budget)
#define RTL_GET_PRIV(stuct_ptr, priv_struct) container_of(stuct_ptr, priv_struct, stuct_ptr)
#define RTL_GET_NETDEV(priv_ptr) struct net_device *dev = priv_ptr->dev;

View File

@ -1,25 +0,0 @@
From ea9e2477624adaa40e8a553ef876f60ec8d3150c Mon Sep 17 00:00:00 2001
From: W_Y_CPP <383152993@qq.com>
Date: Fri, 18 Feb 2022 00:53:12 -0500
Subject: [PATCH] refresh
---
xt_FULLCONENAT.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/xt_FULLCONENAT.c b/xt_FULLCONENAT.c
index f96cfd8a3..237666039 100644
--- a/xt_FULLCONENAT.c
+++ b/xt_FULLCONENAT.c
@@ -1258,8 +1258,7 @@ static int fullconenat_tg_check(const struct xt_tgchk_param *par)
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS)
- if (!READ_ONCE(par->net->ct.nf_conntrack_event_cb)) {
- nf_conntrack_register_notifier(par->net, &ct_event_notifier);
+ if (!READ_ONCE(par->net->ct.nf_conntrack_event_cb)&&(nf_conntrack_register_notifier(par->net, &ct_event_notifier)==0)) {
#else
if (nf_conntrack_register_notifier(par->net, &ct_event_notifier) == 0) {
#endif
--
2.17.1

View File

@ -13,9 +13,9 @@ PKG_RELEASE:=2
PKG_SOURCE_PROTO:=git
PKG_SOURCE_URL=$(PROJECT_GIT)/project/firewall3.git
PKG_SOURCE_DATE:=2022-01-10
PKG_SOURCE_VERSION:=0f16ea5f055722a532d4e68c7ba34ed084b48b37
PKG_MIRROR_HASH:=219478ef95b170b5122030715eac7b3317f2ac4d67e1a936c22a78b10e056123
PKG_SOURCE_DATE:=2021-03-23
PKG_SOURCE_VERSION:=61db17edddb1f05e8107f0dbef6f7d060ce67483
PKG_MIRROR_HASH:=b2eb09816640e14e2dae21fb54ea05c33858fe0004844fe8d99e541a2e19e9c0
PKG_MAINTAINER:=Jo-Philipp Wich <jo@mein.io>
PKG_LICENSE:=ISC
@ -59,4 +59,4 @@ define Package/firewall/install
$(INSTALL_CONF) $(PKG_BUILD_DIR)/helpers.conf $(1)/usr/share/fw3
endef
$(eval $(call BuildPackage,firewall))
$(eval $(call BuildPackage,firewall))

View File

@ -3,7 +3,7 @@ config defaults
option input ACCEPT
option output ACCEPT
option forward REJECT
option fullcone 0
option fullcone 2
# Uncomment this line to disable ipv6 rules
# option disable_ipv6 1

View File

@ -0,0 +1,38 @@
From df1306a96127e91ff2d513a0a67345baaf61d113 Mon Sep 17 00:00:00 2001
From: Florian Eckert <fe@dev.tdt.de>
Date: Fri, 19 Nov 2021 09:51:02 +0100
Subject: [PATCH] firewall3: fix locking issue
By calling the command 'fw3 reload' several times at the same time, I
noticed that the locking was not working properly. It happened from time
to time that some firewall rules were present twice in the system!
By removing the 'unlink' systemcall, this error no longer occurred on my
systems.
Since fw3 does not run as a service, it makes no sense to delete this
lock file every time a filehandler is no longer open on this lock file,
because fw3 binary is not running.
If fw3 does run as a service then we can remove this lock file on
service stop. But this is not the case for fw3.
Signed-off-by: Florian Eckert <fe@dev.tdt.de>
---
utils.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/utils.c b/utils.c
index 17d5bf9..92e966c 100644
--- a/utils.c
+++ b/utils.c
@@ -397,7 +397,6 @@ fw3_unlock_path(int *fd, const char *lockpath)
warn("Cannot release exclusive lock: %s", strerror(errno));
close(*fd);
- unlink(FW3_LOCKFILE);
*fd = -1;
}
--
2.30.2

View File

@ -1,15 +1,14 @@
#
# Copyright (C) 2018 Chion Tang <tech@chionlab.moe>
# Copyright (C) 2022 Chion Tang <tech@chionlab.moe>
#
# This is free software, licensed under the GNU General Public License v2.
# See /LICENSE for more information.
#
include $(TOPDIR)/rules.mk
include $(INCLUDE_DIR)/kernel.mk
PKG_NAME:=fullconenat
PKG_RELEASE:=6
PKG_RELEASE:=9
PKG_SOURCE_DATE:=2022-02-13
PKG_SOURCE_PROTO:=git
@ -19,7 +18,9 @@ PKG_MIRROR_HASH:=00d749235271dee194dcd23c22e6e85207ea90192a62a110b2af0b4e4de1971
PKG_LICENSE:=GPL-2.0
PKG_LICENSE_FILES:=LICENSE
PKG_MAINTAINER:=Chion Tang <tech@chionlab.moe>
include $(INCLUDE_DIR)/kernel.mk
include $(INCLUDE_DIR)/package.mk
define Package/iptables-mod-fullconenat
@ -28,7 +29,6 @@ define Package/iptables-mod-fullconenat
CATEGORY:=Network
TITLE:=FULLCONENAT iptables extension
DEPENDS:=+iptables +kmod-ipt-fullconenat
MAINTAINER:=Chion Tang <tech@chionlab.moe>
endef
define Package/iptables-mod-fullconenat/install
@ -40,7 +40,6 @@ define KernelPackage/ipt-fullconenat
SUBMENU:=Netfilter Extensions
TITLE:=FULLCONENAT netfilter module
DEPENDS:=+kmod-nf-ipt +kmod-nf-nat
MAINTAINER:=Chion Tang <tech@chionlab.moe>
KCONFIG:= \
CONFIG_NF_CONNTRACK_EVENTS=y \
CONFIG_NF_CONNTRACK_CHAIN_EVENTS=y
@ -49,20 +48,15 @@ endef
include $(INCLUDE_DIR)/kernel-defaults.mk
define Build/Prepare
$(call Build/Prepare/Default)
$(CP) ./files/Makefile $(PKG_BUILD_DIR)/
endef
define Build/Compile
+$(MAKE) $(PKG_JOBS) -C "$(LINUX_DIR)" \
CROSS_COMPILE="$(TARGET_CROSS)" \
ARCH="$(LINUX_KARCH)" \
M="$(PKG_BUILD_DIR)" \
EXTRA_CFLAGS="$(BUILDFLAGS)" \
modules
CROSS_COMPILE="$(TARGET_CROSS)" \
ARCH="$(LINUX_KARCH)" \
M="$(PKG_BUILD_DIR)" \
EXTRA_CFLAGS="$(BUILDFLAGS)" \
modules
$(call Build/Compile/Default)
endef
$(eval $(call BuildPackage,iptables-mod-fullconenat))
$(eval $(call KernelPackage,ipt-fullconenat))
$(eval $(call BuildPackage,iptables-mod-fullconenat))

View File

@ -0,0 +1,26 @@
--- a/xt_FULLCONENAT.c
+++ b/xt_FULLCONENAT.c
@@ -325,7 +325,11 @@
/* for now we do the same thing for both --random and --random-fully */
/* select a random starting point */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0)
+ start = (uint16_t)(get_random_u32() % (u32)range_size);
+#else
start = (uint16_t)(prandom_u32() % (u32)range_size);
+#endif
} else {
if ((original_port >= min && original_port <= min + range_size - 1)
@@ -995,7 +999,11 @@
/* for now we do the same thing for both --random and --random-fully */
/* select a random starting point */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0)
+ start = (uint16_t)(get_random_u32() % (u32)range_size);
+#else
start = (uint16_t)(prandom_u32() % (u32)range_size);
+#endif
} else {
if ((original_port >= min && original_port <= min + range_size - 1)

View File

@ -222,19 +222,6 @@ iptables extensions for extra NAT targets.
- NETMAP
endef
define Package/iptables-mod-ulog
$(call Package/iptables/Module, +kmod-ipt-ulog)
TITLE:=user-space packet logging
endef
define Package/iptables-mod-ulog/description
iptables extensions for user-space packet logging.
Targets:
- ULOG
endef
define Package/iptables-mod-nflog
$(call Package/iptables/Module, +kmod-nfnetlink-log +kmod-ipt-nflog)
TITLE:=Netfilter NFLOG target
@ -674,7 +661,6 @@ $(eval $(call BuildPlugin,iptables-mod-nat-extra,$(IPT_NAT_EXTRA-m)))
$(eval $(call BuildPlugin,iptables-mod-iprange,$(IPT_IPRANGE-m)))
$(eval $(call BuildPlugin,iptables-mod-cluster,$(IPT_CLUSTER-m)))
$(eval $(call BuildPlugin,iptables-mod-clusterip,$(IPT_CLUSTERIP-m)))
$(eval $(call BuildPlugin,iptables-mod-ulog,$(IPT_ULOG-m)))
$(eval $(call BuildPlugin,iptables-mod-hashlimit,$(IPT_HASHLIMIT-m)))
$(eval $(call BuildPlugin,iptables-mod-rpfilter,$(IPT_RPFILTER-m)))
$(eval $(call BuildPlugin,iptables-mod-led,$(IPT_LED-m)))

View File

@ -1,75 +0,0 @@
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1386,16 +1386,6 @@ config BOOT_CONFIG_EMBED_FILE
This bootconfig will be used if there is no initrd or no other
bootconfig in the initrd.
-config INITRAMFS_PRESERVE_MTIME
- bool "Preserve cpio archive mtimes in initramfs"
- default y
- help
- Each entry in an initramfs cpio archive carries an mtime value. When
- enabled, extracted cpio items take this mtime, with directory mtime
- setting deferred until after creation of any child entries.
-
- If unsure, say Y.
-
choice
prompt "Compiler optimization level"
default CC_OPTIMIZE_FOR_PERFORMANCE
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -127,17 +127,15 @@ static void __init free_hash(void)
}
}
-#ifdef CONFIG_INITRAMFS_PRESERVE_MTIME
-static void __init do_utime(char *filename, time64_t mtime)
+static long __init do_utime(char *filename, time64_t mtime)
{
- struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } };
- init_utimes(filename, t);
-}
+ struct timespec64 t[2];
-static void __init do_utime_path(const struct path *path, time64_t mtime)
-{
- struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } };
- vfs_utimes(path, t);
+ t[0].tv_sec = mtime;
+ t[0].tv_nsec = 0;
+ t[1].tv_sec = mtime;
+ t[1].tv_nsec = 0;
+ return init_utimes(filename, t);
}
static __initdata LIST_HEAD(dir_list);
@@ -170,12 +168,6 @@ static void __init dir_utime(void)
kfree(de);
}
}
-#else
-static void __init do_utime(char *filename, time64_t mtime) {}
-static void __init do_utime_path(const struct path *path, time64_t mtime) {}
-static void __init dir_add(const char *name, time64_t mtime) {}
-static void __init dir_utime(void) {}
-#endif
static __initdata time64_t mtime;
@@ -407,10 +399,14 @@ static int __init do_name(void)
static int __init do_copy(void)
{
if (byte_count >= body_len) {
+ struct timespec64 t[2] = { };
if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len)
error("write error");
- do_utime_path(&wfile->f_path, mtime);
+ t[0].tv_sec = mtime;
+ t[1].tv_sec = mtime;
+ vfs_utimes(&wfile->f_path, t);
+
fput(wfile);
if (csum_present && io_csum != hdr_csum)
error("bad data checksum");

View File

@ -1,122 +0,0 @@
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2133,8 +2133,6 @@ struct net_device {
/* Protocol-specific pointers */
- struct in_device __rcu *ip_ptr;
- struct inet6_dev __rcu *ip6_ptr;
#if IS_ENABLED(CONFIG_VLAN_8021Q)
struct vlan_info __rcu *vlan_info;
#endif
@@ -2147,18 +2145,16 @@ struct net_device {
#if IS_ENABLED(CONFIG_ATALK)
void *atalk_ptr;
#endif
+ struct in_device __rcu *ip_ptr;
#if IS_ENABLED(CONFIG_DECNET)
struct dn_dev __rcu *dn_ptr;
#endif
+ struct inet6_dev __rcu *ip6_ptr;
#if IS_ENABLED(CONFIG_AX25)
void *ax25_ptr;
#endif
-#if IS_ENABLED(CONFIG_CFG80211)
struct wireless_dev *ieee80211_ptr;
-#endif
-#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN)
struct wpan_dev *ieee802154_ptr;
-#endif
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
struct mpls_dev __rcu *mpls_ptr;
#endif
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8379,9 +8379,7 @@ int cfg80211_register_netdevice(struct n
*/
static inline void cfg80211_unregister_netdevice(struct net_device *dev)
{
-#if IS_ENABLED(CONFIG_CFG80211)
cfg80211_unregister_wdev(dev->ieee80211_ptr);
-#endif
}
/**
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -373,7 +373,6 @@ struct wpan_dev {
#define to_phy(_dev) container_of(_dev, struct wpan_phy, dev)
-#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN)
static inline int
wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
const struct ieee802154_addr *daddr,
@@ -384,7 +383,6 @@ wpan_dev_hard_header(struct sk_buff *skb
return wpan_dev->header_ops->create(skb, dev, daddr, saddr, len);
}
-#endif
struct wpan_phy *
wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size);
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -308,11 +308,9 @@ static bool batadv_is_cfg80211_netdev(st
if (!net_device)
return false;
-#if IS_ENABLED(CONFIG_CFG80211)
/* cfg80211 drivers have to set ieee80211_ptr */
if (net_device->ieee80211_ptr)
return true;
-#endif
return false;
}
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -747,6 +747,7 @@ static const struct attribute_group nets
.attrs = netstat_attrs,
};
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
static struct attribute *wireless_attrs[] = {
NULL
};
@@ -755,19 +756,7 @@ static const struct attribute_group wire
.name = "wireless",
.attrs = wireless_attrs,
};
-
-static bool wireless_group_needed(struct net_device *ndev)
-{
-#if IS_ENABLED(CONFIG_CFG80211)
- if (ndev->ieee80211_ptr)
- return true;
#endif
-#if IS_ENABLED(CONFIG_WIRELESS_EXT)
- if (ndev->wireless_handlers)
- return true;
-#endif
- return false;
-}
#else /* CONFIG_SYSFS */
#define net_class_groups NULL
@@ -2008,8 +1997,14 @@ int netdev_register_kobject(struct net_d
*groups++ = &netstat_group;
- if (wireless_group_needed(ndev))
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
+ if (ndev->ieee80211_ptr)
+ *groups++ = &wireless_group;
+#if IS_ENABLED(CONFIG_WIRELESS_EXT)
+ else if (ndev->wireless_handlers)
*groups++ = &wireless_group;
+#endif
+#endif
#endif /* CONFIG_SYSFS */
error = device_add(dev);

View File

@ -1,143 +0,0 @@
From e3264035bdac67898d685423ffb2f3a9c3a5964a Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 4 Aug 2021 01:31:34 -0600
Subject: [PATCH 01/14] mm: x86, arm64: add arch_has_hw_pte_young()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Some architectures automatically set the accessed bit in PTEs, e.g.,
x86 and arm64 v8.2. On architectures that do not have this capability,
clearing the accessed bit in a PTE usually triggers a page fault
following the TLB miss of this PTE (to emulate the accessed bit).
Being aware of this capability can help make better decisions, e.g.,
whether to spread the work out over a period of time to reduce bursty
page faults when trying to clear the accessed bit in many PTEs.
Note that theoretically this capability can be unreliable, e.g.,
hotplugged CPUs might be different from builtin ones. Therefore it
should not be used in architecture-independent code that involves
correctness, e.g., to determine whether TLB flushes are required (in
combination with the accessed bit).
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Acked-by: Will Deacon <will@kernel.org>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
---
arch/arm64/include/asm/pgtable.h | 15 ++-------------
arch/x86/include/asm/pgtable.h | 6 +++---
include/linux/pgtable.h | 13 +++++++++++++
mm/memory.c | 14 +-------------
4 files changed, 19 insertions(+), 29 deletions(-)
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1082,24 +1082,13 @@ static inline void update_mmu_cache(stru
* page after fork() + CoW for pfn mappings. We don't always have a
* hardware-managed access flag on arm64.
*/
-static inline bool arch_faults_on_old_pte(void)
-{
- /* The register read below requires a stable CPU to make any sense */
- cant_migrate();
-
- return !cpu_has_hw_af();
-}
-#define arch_faults_on_old_pte arch_faults_on_old_pte
+#define arch_has_hw_pte_young cpu_has_hw_af
/*
* Experimentally, it's cheap to set the access flag in hardware and we
* benefit from prefaulting mappings as 'old' to start with.
*/
-static inline bool arch_wants_old_prefaulted_pte(void)
-{
- return !arch_faults_on_old_pte();
-}
-#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte
+#define arch_wants_old_prefaulted_pte cpu_has_hw_af
static inline bool pud_sect_supported(void)
{
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1431,10 +1431,10 @@ static inline bool arch_has_pfn_modify_c
return boot_cpu_has_bug(X86_BUG_L1TF);
}
-#define arch_faults_on_old_pte arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
+#define arch_has_hw_pte_young arch_has_hw_pte_young
+static inline bool arch_has_hw_pte_young(void)
{
- return false;
+ return true;
}
#ifdef CONFIG_PAGE_TABLE_CHECK
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -260,6 +260,19 @@ static inline int pmdp_clear_flush_young
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
+#ifndef arch_has_hw_pte_young
+/*
+ * Return whether the accessed bit is supported on the local CPU.
+ *
+ * This stub assumes accessing through an old PTE triggers a page fault.
+ * Architectures that automatically set the accessed bit should override it.
+ */
+static inline bool arch_has_hw_pte_young(void)
+{
+ return false;
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,18 +125,6 @@ int randomize_va_space __read_mostly =
2;
#endif
-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
- /*
- * Those arches which don't have hw access flag feature need to
- * implement their own helper. By default, "true" means pagefault
- * will be hit on old pte.
- */
- return true;
-}
-#endif
-
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
@@ -2872,7 +2860,7 @@ static inline bool __wp_page_copy_user(s
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);

View File

@ -1,132 +0,0 @@
From 0c0016e6f53b52166fe4da61c81fa6b27f4650cd Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 26 Sep 2020 21:17:18 -0600
Subject: [PATCH 02/14] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Some architectures support the accessed bit in non-leaf PMD entries,
e.g., x86 sets the accessed bit in a non-leaf PMD entry when using it
as part of linear address translation [1]. Page table walkers that
clear the accessed bit may use this capability to reduce their search
space.
Note that:
1. Although an inline function is preferable, this capability is added
as a configuration option for consistency with the existing macros.
2. Due to the little interest in other varieties, this capability was
only tested on Intel and AMD CPUs.
Thanks to the following developers for their efforts [2][3].
Randy Dunlap <rdunlap@infradead.org>
Stephen Rothwell <sfr@canb.auug.org.au>
[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
Volume 3 (June 2021), section 4.8
[2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/
[3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
---
arch/Kconfig | 8 ++++++++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/pgtable.h | 3 ++-
arch/x86/mm/pgtable.c | 5 ++++-
include/linux/pgtable.h | 4 ++--
5 files changed, 17 insertions(+), 4 deletions(-)
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1418,6 +1418,14 @@ config DYNAMIC_SIGFRAME
config HAVE_ARCH_NODE_DEV_GROUP
bool
+config ARCH_HAS_NONLEAF_PMD_YOUNG
+ bool
+ help
+ Architectures that select this option are capable of setting the
+ accessed bit in non-leaf PMD entries when using them as part of linear
+ address translations. Page table walkers that clear the accessed bit
+ may use this capability to reduce their search space.
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -85,6 +85,7 @@ config X86
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_COPY_MC if X86_64
select ARCH_HAS_SET_MEMORY
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -815,7 +815,8 @@ static inline unsigned long pmd_page_vad
static inline int pmd_bad(pmd_t pmd)
{
- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
+ (_KERNPG_TABLE & ~_PAGE_ACCESSED);
}
static inline unsigned long pages_to_mb(unsigned long npg)
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_
return ret;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_
return ret;
}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp)
{
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -213,7 +213,7 @@ static inline int ptep_test_and_clear_yo
#endif
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
@@ -234,7 +234,7 @@ static inline int pmdp_test_and_clear_yo
BUILD_BUG();
return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH

View File

@ -1,254 +0,0 @@
From d8e0edcddc441574410a047ede56f79c849a6d37 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 27 Sep 2020 20:49:08 -0600
Subject: [PATCH 03/14] mm/vmscan.c: refactor shrink_node()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This patch refactors shrink_node() to improve readability for the
upcoming changes to mm/vmscan.c.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
---
mm/vmscan.c | 198 +++++++++++++++++++++++++++-------------------------
1 file changed, 104 insertions(+), 94 deletions(-)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2728,6 +2728,109 @@ enum scan_balance {
SCAN_FILE,
};
+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+ unsigned long file;
+ struct lruvec *target_lruvec;
+
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ /*
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+ mem_cgroup_flush_stats();
+
+ /*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&target_lruvec->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&target_lruvec->lru_lock);
+
+ /*
+ * Target desirable inactive:active list ratios for the anon
+ * and file LRU lists.
+ */
+ if (!sc->force_deactivate) {
+ unsigned long refaults;
+
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[0] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ sc->may_deactivate |= DEACTIVATE_ANON;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[1] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+ sc->may_deactivate |= DEACTIVATE_FILE;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_FILE;
+ } else
+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+ /*
+ * If we have plenty of inactive file pages that aren't
+ * thrashing, try to reclaim those first before touching
+ * anonymous pages.
+ */
+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ sc->cache_trim_mode = 1;
+ else
+ sc->cache_trim_mode = 0;
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (!cgroup_reclaim(sc)) {
+ unsigned long total_high_wmark = 0;
+ unsigned long free, anon;
+ int z;
+
+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+
+ if (!managed_zone(zone))
+ continue;
+
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ /*
+ * Consider anon: if that's low too, this isn't a
+ * runaway file reclaim problem, but rather just
+ * extreme pressure. Reclaim as per usual then.
+ */
+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ sc->file_is_tiny =
+ file + free <= total_high_wmark &&
+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
+ anon >> sc->priority;
+ }
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -3197,109 +3300,16 @@ static void shrink_node(pg_data_t *pgdat
unsigned long nr_reclaimed, nr_scanned;
struct lruvec *target_lruvec;
bool reclaimable = false;
- unsigned long file;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
- /*
- * Flush the memory cgroup stats, so that we read accurate per-memcg
- * lruvec stats for heuristics.
- */
- mem_cgroup_flush_stats();
-
memset(&sc->nr, 0, sizeof(sc->nr));
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- /*
- * Determine the scan balance between anon and file LRUs.
- */
- spin_lock_irq(&target_lruvec->lru_lock);
- sc->anon_cost = target_lruvec->anon_cost;
- sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&target_lruvec->lru_lock);
-
- /*
- * Target desirable inactive:active list ratios for the anon
- * and file LRU lists.
- */
- if (!sc->force_deactivate) {
- unsigned long refaults;
-
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_ANON);
- if (refaults != target_lruvec->refaults[0] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
- sc->may_deactivate |= DEACTIVATE_ANON;
- else
- sc->may_deactivate &= ~DEACTIVATE_ANON;
-
- /*
- * When refaults are being observed, it means a new
- * workingset is being established. Deactivate to get
- * rid of any stale active pages quickly.
- */
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_FILE);
- if (refaults != target_lruvec->refaults[1] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
- sc->may_deactivate |= DEACTIVATE_FILE;
- else
- sc->may_deactivate &= ~DEACTIVATE_FILE;
- } else
- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
- /*
- * If we have plenty of inactive file pages that aren't
- * thrashing, try to reclaim those first before touching
- * anonymous pages.
- */
- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
- sc->cache_trim_mode = 1;
- else
- sc->cache_trim_mode = 0;
-
- /*
- * Prevent the reclaimer from falling into the cache trap: as
- * cache pages start out inactive, every cache fault will tip
- * the scan balance towards the file LRU. And as the file LRU
- * shrinks, so does the window for rotation from references.
- * This means we have a runaway feedback loop where a tiny
- * thrashing file LRU becomes infinitely more attractive than
- * anon pages. Try to detect this based on file LRU size.
- */
- if (!cgroup_reclaim(sc)) {
- unsigned long total_high_wmark = 0;
- unsigned long free, anon;
- int z;
-
- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
- node_page_state(pgdat, NR_INACTIVE_FILE);
-
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
-
- total_high_wmark += high_wmark_pages(zone);
- }
-
- /*
- * Consider anon: if that's low too, this isn't a
- * runaway file reclaim problem, but rather just
- * extreme pressure. Reclaim as per usual then.
- */
- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
- sc->file_is_tiny =
- file + free <= total_high_wmark &&
- !(sc->may_deactivate & DEACTIVATE_ANON) &&
- anon >> sc->priority;
- }
+ prepare_scan_count(pgdat, sc);
shrink_node_memcgs(pgdat, sc);

View File

@ -1,59 +0,0 @@
From bc14d2c7c6d0fb8c79ad0fc5eab488b977cbcccf Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 6 Mar 2022 20:22:40 -0700
Subject: [PATCH 04/14] Revert "include/linux/mm_inline.h: fold
__update_lru_size() into its sole caller"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This patch undoes the following refactor:
commit 289ccba18af4 ("include/linux/mm_inline.h: fold __update_lru_size() into its sole caller")
The upcoming changes to include/linux/mm_inline.h will reuse
__update_lru_size().
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I6155c407d50199a43b179c7f45904d4b7c052118
---
include/linux/mm_inline.h | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -34,7 +34,7 @@ static inline int page_is_file_lru(struc
return folio_is_file_lru(page_folio(page));
}
-static __always_inline void update_lru_size(struct lruvec *lruvec,
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
enum lru_list lru, enum zone_type zid,
long nr_pages)
{
@@ -43,6 +43,13 @@ static __always_inline void update_lru_s
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+ enum lru_list lru, enum zone_type zid,
+ long nr_pages)
+{
+ __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif

View File

@ -1,777 +0,0 @@
From 8c6beb4548c216da9dae5e1a7612a108396e3f9e Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 25 Jan 2021 21:12:33 -0700
Subject: [PATCH 05/14] mm: multi-gen LRU: groundwork
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Evictable pages are divided into multiple generations for each lruvec.
The youngest generation number is stored in lrugen->max_seq for both
anon and file types as they are aged on an equal footing. The oldest
generation numbers are stored in lrugen->min_seq[] separately for anon
and file types as clean file pages can be evicted regardless of swap
constraints. These three variables are monotonically increasing.
Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
in order to fit into the gen counter in folio->flags. Each truncated
generation number is an index to lrugen->lists[]. The sliding window
technique is used to track at least MIN_NR_GENS and at most
MAX_NR_GENS generations. The gen counter stores a value within [1,
MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
stores 0.
There are two conceptually independent procedures: "the aging", which
produces young generations, and "the eviction", which consumes old
generations. They form a closed-loop system, i.e., "the page reclaim".
Both procedures can be invoked from userspace for the purposes of
working set estimation and proactive reclaim. These techniques are
commonly used to optimize job scheduling (bin packing) in data
centers [1][2].
To avoid confusion, the terms "hot" and "cold" will be applied to the
multi-gen LRU, as a new convention; the terms "active" and "inactive"
will be applied to the active/inactive LRU, as usual.
The protection of hot pages and the selection of cold pages are based
on page access channels and patterns. There are two access channels:
one through page tables and the other through file descriptors. The
protection of the former channel is by design stronger because:
1. The uncertainty in determining the access patterns of the former
channel is higher due to the approximation of the accessed bit.
2. The cost of evicting the former channel is higher due to the TLB
flushes required and the likelihood of encountering the dirty bit.
3. The penalty of underprotecting the former channel is higher because
applications usually do not prepare themselves for major page
faults like they do for blocked I/O. E.g., GUI applications
commonly use dedicated I/O threads to avoid blocking rendering
threads.
There are also two access patterns: one with temporal locality and the
other without. For the reasons listed above, the former channel is
assumed to follow the former pattern unless VM_SEQ_READ or
VM_RAND_READ is present; the latter channel is assumed to follow the
latter pattern unless outlying refaults have been observed [3][4].
The next patch will address the "outlying refaults". Three macros,
i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are
added in this patch to make the entire patchset less diffy.
A page is added to the youngest generation on faulting. The aging
needs to check the accessed bit at least twice before handing this
page over to the eviction. The first check takes care of the accessed
bit set on the initial fault; the second check makes sure this page
has not been used since then. This protocol, AKA second chance,
requires a minimum of two generations, hence MIN_NR_GENS.
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
[3] https://lwn.net/Articles/495543/
[4] https://lwn.net/Articles/815342/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
---
fs/fuse/dev.c | 3 +-
include/linux/mm_inline.h | 175 ++++++++++++++++++++++++++++++
include/linux/mmzone.h | 102 +++++++++++++++++
include/linux/page-flags-layout.h | 13 ++-
include/linux/page-flags.h | 4 +-
include/linux/sched.h | 4 +
kernel/bounds.c | 5 +
mm/Kconfig | 8 ++
mm/huge_memory.c | 3 +-
mm/memcontrol.c | 2 +
mm/memory.c | 25 +++++
mm/mm_init.c | 6 +-
mm/mmzone.c | 2 +
mm/swap.c | 11 +-
mm/vmscan.c | 75 +++++++++++++
15 files changed, 424 insertions(+), 14 deletions(-)
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -776,7 +776,8 @@ static int fuse_check_page(struct page *
1 << PG_active |
1 << PG_workingset |
1 << PG_reclaim |
- 1 << PG_waiters))) {
+ 1 << PG_waiters |
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -40,6 +40,9 @@ static __always_inline void __update_lru
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ lockdep_assert_held(&lruvec->lru_lock);
+ WARN_ON_ONCE(nr_pages != (int)nr_pages);
+
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
@@ -101,11 +104,177 @@ static __always_inline enum lru_list fol
return lru;
}
+#ifdef CONFIG_LRU_GEN
+
+static inline bool lru_gen_enabled(void)
+{
+ return true;
+}
+
+static inline bool lru_gen_in_fault(void)
+{
+ return current->in_lru_fault;
+}
+
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+ return seq % MAX_NR_GENS;
+}
+
+static inline int folio_lru_gen(struct folio *folio)
+{
+ unsigned long flags = READ_ONCE(folio->flags);
+
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+ unsigned long max_seq = lruvec->lrugen.max_seq;
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+
+ /* see the comment on MIN_NR_GENS */
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
+ int old_gen, int new_gen)
+{
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ int delta = folio_nr_pages(folio);
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
+
+ if (old_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
+ lrugen->nr_pages[old_gen][type][zone] - delta);
+ if (new_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
+ lrugen->nr_pages[new_gen][type][zone] + delta);
+
+ /* addition */
+ if (old_gen < 0) {
+ if (lru_gen_is_active(lruvec, new_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, delta);
+ return;
+ }
+
+ /* deletion */
+ if (new_gen < 0) {
+ if (lru_gen_is_active(lruvec, old_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, -delta);
+ return;
+ }
+}
+
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ unsigned long seq;
+ unsigned long flags;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
+
+ if (folio_test_unevictable(folio))
+ return false;
+ /*
+ * There are three common cases for this page:
+ * 1. If it's hot, e.g., freshly faulted in or previously hot and
+ * migrated, add it to the youngest generation.
+ * 2. If it's cold but can't be evicted immediately, i.e., an anon page
+ * not in swapcache or a dirty page pending writeback, add it to the
+ * second oldest generation.
+ * 3. Everything else (clean, cold) is added to the oldest generation.
+ */
+ if (folio_test_active(folio))
+ seq = lrugen->max_seq;
+ else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
+ (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio))))
+ seq = lrugen->min_seq[type] + 1;
+ else
+ seq = lrugen->min_seq[type];
+
+ gen = lru_gen_from_seq(seq);
+ flags = (gen + 1UL) << LRU_GEN_PGOFF;
+ /* see the comment on MIN_NR_GENS about PG_active */
+ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+
+ lru_gen_update_size(lruvec, folio, -1, gen);
+ /* for folio_rotate_reclaimable() */
+ if (reclaiming)
+ list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+ else
+ list_add(&folio->lru, &lrugen->lists[gen][type][zone]);
+
+ return true;
+}
+
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ unsigned long flags;
+ int gen = folio_lru_gen(folio);
+
+ if (gen < 0)
+ return false;
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+
+ /* for folio_migrate_flags() */
+ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
+ flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
+ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+ lru_gen_update_size(lruvec, folio, gen, -1);
+ list_del(&folio->lru);
+
+ return true;
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline bool lru_gen_enabled(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_in_fault(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ return false;
+}
+
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_add_folio(lruvec, folio, false))
+ return;
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
if (lru != LRU_UNEVICTABLE)
@@ -123,6 +292,9 @@ void lruvec_add_folio_tail(struct lruvec
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_add_folio(lruvec, folio, true))
+ return;
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
/* This is not expected to be used on LRU_UNEVICTABLE */
@@ -140,6 +312,9 @@ void lruvec_del_folio(struct lruvec *lru
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_del_folio(lruvec, folio, false))
+ return;
+
if (lru != LRU_UNEVICTABLE)
list_del(&folio->lru);
update_lru_size(lruvec, lru, folio_zonenum(folio),
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -314,6 +314,102 @@ enum lruvec_flags {
*/
};
+#endif /* !__GENERATING_BOUNDS_H */
+
+/*
+ * Evictable pages are divided into multiple generations. The youngest and the
+ * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
+ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
+ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
+ * corresponding generation. The gen counter in folio->flags stores gen+1 while
+ * a page is on one of lrugen->lists[]. Otherwise it stores 0.
+ *
+ * A page is added to the youngest generation on faulting. The aging needs to
+ * check the accessed bit at least twice before handing this page over to the
+ * eviction. The first check takes care of the accessed bit set on the initial
+ * fault; the second check makes sure this page hasn't been used since then.
+ * This process, AKA second chance, requires a minimum of two generations,
+ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
+ * LRU, e.g., /proc/vmstat, these two generations are considered active; the
+ * rest of generations, if they exist, are considered inactive. See
+ * lru_gen_is_active().
+ *
+ * PG_active is always cleared while a page is on one of lrugen->lists[] so that
+ * the aging need not worry about it. And it's set again when a page
+ * considered active is isolated for non-reclaiming purposes, e.g., migration.
+ * See lru_gen_add_folio() and lru_gen_del_folio().
+ *
+ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
+ * number of categories of the active/inactive LRU when keeping track of
+ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
+ * in folio->flags.
+ */
+#define MIN_NR_GENS 2U
+#define MAX_NR_GENS 4U
+
+#ifndef __GENERATING_BOUNDS_H
+
+struct lruvec;
+
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+
+#ifdef CONFIG_LRU_GEN
+
+enum {
+ LRU_GEN_ANON,
+ LRU_GEN_FILE,
+};
+
+/*
+ * The youngest generation number is stored in max_seq for both anon and file
+ * types as they are aged on an equal footing. The oldest generation numbers are
+ * stored in min_seq[] separately for anon and file types as clean file pages
+ * can be evicted regardless of swap constraints.
+ *
+ * Normally anon and file min_seq are in sync. But if swapping is constrained,
+ * e.g., out of swap space, file min_seq is allowed to advance and leave anon
+ * min_seq behind.
+ *
+ * The number of pages in each generation is eventually consistent and therefore
+ * can be transiently negative.
+ */
+struct lru_gen_struct {
+ /* the aging increments the youngest generation number */
+ unsigned long max_seq;
+ /* the eviction increments the oldest generation numbers */
+ unsigned long min_seq[ANON_AND_FILE];
+ /* the multi-gen LRU lists, lazily sorted on eviction */
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the multi-gen LRU sizes, eventually consistent */
+ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+};
+
+void lru_gen_init_lruvec(struct lruvec *lruvec);
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
+void lru_gen_exit_memcg(struct mem_cgroup *memcg);
+#endif
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+}
+#endif
+
+#endif /* CONFIG_LRU_GEN */
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
@@ -331,6 +427,10 @@ struct lruvec {
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
+#ifdef CONFIG_LRU_GEN
+ /* evictable pages divided into generations */
+ struct lru_gen_struct lrugen;
+#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
@@ -746,6 +846,8 @@ static inline bool zone_is_empty(struct
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
/*
* Define the bit shifts to access each section. For non-existent
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -55,7 +55,8 @@
#define SECTIONS_WIDTH 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
+ <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#error "Vmemmap: No space for nodes field in page flags"
@@ -89,8 +90,8 @@
#define LAST_CPUPID_SHIFT 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
- <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
#define LAST_CPUPID_WIDTH 0
@@ -100,10 +101,12 @@
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
- > BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error "Not enough bits in page flags"
#endif
+#define LRU_REFS_WIDTH 0
+
#endif
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1058,7 +1058,7 @@ static __always_inline void __ClearPageA
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
1UL << PG_slab | 1UL << PG_active | \
- 1UL << PG_unevictable | __PG_MLOCKED)
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
/*
* Flags checked when a page is prepped for return by the page allocator.
@@ -1069,7 +1069,7 @@ static __always_inline void __ClearPageA
* alloc-free cycle to prevent from reusing the page.
*/
#define PAGE_FLAGS_CHECK_AT_PREP \
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -914,6 +914,10 @@ struct task_struct {
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
#endif
+#ifdef CONFIG_LRU_GEN
+ /* whether the LRU algorithm may apply to this access */
+ unsigned in_lru_fault:1;
+#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,11 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+#else
+ DEFINE(LRU_GEN_WIDTH, 0);
+#endif
/* End of constants */
return 0;
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1124,6 +1124,14 @@ config PTE_MARKER_UFFD_WP
purposes. It is required to enable userfaultfd write protection on
file-backed memory types like shmem and hugetlbfs.
+config LRU_GEN
+ bool "Multi-Gen LRU"
+ depends on MMU
+ # make sure folio->flags has enough spare bits
+ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
+ help
+ A high performance LRU implementation to overcommit memory.
+
source "mm/damon/Kconfig"
endmenu
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2438,7 +2438,8 @@ static void __split_huge_page_tail(struc
#ifdef CONFIG_64BIT
(1L << PG_arch_2) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
/* ->mapping in first tail page is compound_mapcount */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5170,6 +5170,7 @@ static void __mem_cgroup_free(struct mem
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
__mem_cgroup_free(memcg);
}
@@ -5228,6 +5229,7 @@ static struct mem_cgroup *mem_cgroup_all
memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ lru_gen_init_memcg(memcg);
return memcg;
fail:
mem_cgroup_id_remove(memcg);
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5110,6 +5110,27 @@ static inline void mm_account_fault(stru
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
+#ifdef CONFIG_LRU_GEN
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+ /* the LRU algorithm doesn't apply to sequential or random reads */
+ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+}
+
+static void lru_gen_exit_fault(void)
+{
+ current->in_lru_fault = false;
+}
+#else
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+}
+
+static void lru_gen_exit_fault(void)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
/*
* By the time we get here, we already hold the mm semaphore
*
@@ -5141,11 +5162,15 @@ vm_fault_t handle_mm_fault(struct vm_are
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();
+ lru_gen_enter_fault(vma);
+
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
+ lru_gen_exit_fault();
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
shift = 8 * sizeof(unsigned long);
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
KASAN_TAG_WIDTH,
+ LRU_GEN_WIDTH,
+ LRU_REFS_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -88,6 +88,8 @@ void lruvec_init(struct lruvec *lruvec)
* Poison its list head, so that any operations on it would crash.
*/
list_del(&lruvec->lists[LRU_UNEVICTABLE]);
+
+ lru_gen_init_lruvec(lruvec);
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -484,6 +484,11 @@ void folio_add_lru(struct folio *folio)
folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ /* see the comment in lru_gen_add_folio() */
+ if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
+ folio_set_active(folio);
+
folio_get(folio);
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
@@ -575,7 +580,7 @@ static void lru_deactivate_file_fn(struc
static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (folio_test_active(folio) && !folio_test_unevictable(folio)) {
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) {
long nr_pages = folio_nr_pages(folio);
lruvec_del_folio(lruvec, folio);
@@ -688,8 +693,8 @@ void deactivate_page(struct page *page)
{
struct folio *folio = page_folio(page);
- if (folio_test_lru(folio) && folio_test_active(folio) &&
- !folio_test_unevictable(folio)) {
+ if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
+ (folio_test_active(folio) || lru_gen_enabled())) {
struct folio_batch *fbatch;
folio_get(folio);
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3050,6 +3050,81 @@ static bool can_age_anon_pages(struct pg
return can_demote(pgdat->node_id, sc);
}
+#ifdef CONFIG_LRU_GEN
+
+/******************************************************************************
+ * shorthand helpers
+ ******************************************************************************/
+
+#define for_each_gen_type_zone(gen, type, zone) \
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+#ifdef CONFIG_MEMCG
+ if (memcg) {
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+ /* for hotadd_new_pgdat() */
+ if (!lruvec->pgdat)
+ lruvec->pgdat = pgdat;
+
+ return lruvec;
+ }
+#endif
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+ return pgdat ? &pgdat->__lruvec : NULL;
+}
+
+/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ lrugen->max_seq = MIN_NR_GENS + 1;
+
+ for_each_gen_type_zone(gen, type, zone)
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+
+void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
+ sizeof(lruvec->lrugen.nr_pages)));
+ }
+}
+#endif
+
+static int __init init_lru_gen(void)
+{
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+ return 0;
+};
+late_initcall(init_lru_gen);
+
+#endif /* CONFIG_LRU_GEN */
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];

View File

@ -1,476 +0,0 @@
From 93fa87bdef9e7fa9977355c4712c000f31639231 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:43:22 -0700
Subject: [PATCH 07/14] mm: multi-gen LRU: exploit locality in rmap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Searching the rmap for PTEs mapping each page on an LRU list (to test
and clear the accessed bit) can be expensive because pages from
different VMAs (PA space) are not cache friendly to the rmap (VA
space). For workloads mostly using mapped pages, searching the rmap
can incur the highest CPU cost in the reclaim path.
This patch exploits spatial locality to reduce the trips into the
rmap. When shrink_page_list() walks the rmap and finds a young PTE, a
new function lru_gen_look_around() scans at most BITS_PER_LONG-1
adjacent PTEs. On finding another young PTE, it clears the accessed
bit and updates the gen counter of the page mapped by this PTE to
(max_seq%MAX_NR_GENS)+1.
Server benchmark results:
Single workload:
fio (buffered I/O): no change
Single workload:
memcached (anon): +[3, 5]%
Ops/sec KB/sec
patch1-6: 1106168.46 43025.04
patch1-7: 1147696.57 44640.29
Configurations:
no change
Client benchmark results:
kswapd profiles:
patch1-6
39.03% lzo1x_1_do_compress (real work)
18.47% page_vma_mapped_walk (overhead)
6.74% _raw_spin_unlock_irq
3.97% do_raw_spin_lock
2.49% ptep_clear_flush
2.48% anon_vma_interval_tree_iter_first
1.92% folio_referenced_one
1.88% __zram_bvec_write
1.48% memmove
1.31% vma_interval_tree_iter_next
patch1-7
48.16% lzo1x_1_do_compress (real work)
8.20% page_vma_mapped_walk (overhead)
7.06% _raw_spin_unlock_irq
2.92% ptep_clear_flush
2.53% __zram_bvec_write
2.11% do_raw_spin_lock
2.02% memmove
1.93% lru_gen_look_around
1.56% free_unref_page_list
1.40% memset
Configurations:
no change
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I4b9ca0fd20f566ce554e703f14cee3fe0048c2fd
---
include/linux/memcontrol.h | 31 +++++++
include/linux/mm.h | 5 +
include/linux/mmzone.h | 6 ++
mm/internal.h | 1 +
mm/memcontrol.c | 1 +
mm/rmap.c | 6 ++
mm/swap.c | 4 +-
mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++
8 files changed, 236 insertions(+), 2 deletions(-)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -444,6 +444,7 @@ static inline struct obj_cgroup *__folio
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem folio a caller should hold an rcu read lock to protect memcg
* associated with a kmem folio from being released.
@@ -505,6 +506,7 @@ static inline struct mem_cgroup *folio_m
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem page a caller should hold an rcu read lock to protect memcg
* associated with a kmem page from being released.
@@ -959,6 +961,23 @@ void unlock_page_memcg(struct page *page
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
+/* try to stabilize folio_memcg() for all the pages in a memcg */
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ rcu_read_lock();
+
+ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
+ return true;
+
+ rcu_read_unlock();
+ return false;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
@@ -1433,6 +1452,18 @@ static inline void folio_memcg_unlock(st
{
}
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ /* to match folio_memcg_rcu() */
+ rcu_read_lock();
+ return true;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
static inline void mem_cgroup_handle_over_high(void)
{
}
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1465,6 +1465,11 @@ static inline unsigned long folio_pfn(st
return page_to_pfn(&folio->page);
}
+static inline struct folio *pfn_folio(unsigned long pfn)
+{
+ return page_folio(pfn_to_page(pfn));
+}
+
static inline atomic_t *folio_pincount_ptr(struct folio *folio)
{
return &folio_page(folio, 1)->compound_pincount;
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -372,6 +372,7 @@ enum lruvec_flags {
#ifndef __GENERATING_BOUNDS_H
struct lruvec;
+struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
@@ -427,6 +428,7 @@ struct lru_gen_struct {
};
void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg);
@@ -439,6 +441,10 @@ static inline void lru_gen_init_lruvec(s
{
}
+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+}
+
#ifdef CONFIG_MEMCG
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -83,6 +83,7 @@ vm_fault_t do_swap_page(struct vm_fault
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
+void folio_activate(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2789,6 +2789,7 @@ static void commit_charge(struct folio *
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*/
folio->memcg_data = (unsigned long)memcg;
}
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -833,6 +833,12 @@ static bool folio_referenced_one(struct
}
if (pvmw.pte) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
+ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ lru_gen_look_around(&pvmw);
+ referenced++;
+ }
+
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
/*
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -366,7 +366,7 @@ static void folio_activate_drain(int cpu
folio_batch_move_lru(fbatch, folio_activate_fn);
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
if (folio_test_lru(folio) && !folio_test_active(folio) &&
!folio_test_unevictable(folio)) {
@@ -385,7 +385,7 @@ static inline void folio_activate_drain(
{
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
struct lruvec *lruvec;
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1635,6 +1635,11 @@ retry:
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
+ /* folio_update_gen() tried to promote this page? */
+ if (lru_gen_enabled() && !ignore_references &&
+ folio_mapped(folio) && folio_test_referenced(folio))
+ goto keep_locked;
+
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -3219,6 +3224,29 @@ static bool positive_ctrl_err(struct ctr
* the aging
******************************************************************************/
+/* promote pages accessed through page tables */
+static int folio_update_gen(struct folio *folio, int gen)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ /* lru_gen_del_folio() has isolated this page? */
+ if (!(old_flags & LRU_GEN_MASK)) {
+ /* for shrink_page_list() */
+ new_flags = old_flags | BIT(PG_referenced);
+ continue;
+ }
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
/* protect pages accessed multiple times through file descriptors */
static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
@@ -3230,6 +3258,11 @@ static int folio_inc_gen(struct lruvec *
VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
do {
+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ /* folio_update_gen() has promoted this page? */
+ if (new_gen >= 0 && new_gen != old_gen)
+ return new_gen;
+
new_gen = (old_gen + 1) % MAX_NR_GENS;
new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
@@ -3244,6 +3277,43 @@ static int folio_inc_gen(struct lruvec *
return new_gen;
}
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pte_present(pte) || is_zero_pfn(pfn))
+ return -1;
+
+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+
+static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
+ struct pglist_data *pgdat)
+{
+ struct folio *folio;
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return NULL;
+
+ folio = pfn_folio(pfn);
+ if (folio_nid(folio) != pgdat->node_id)
+ return NULL;
+
+ if (folio_memcg_rcu(folio) != memcg)
+ return NULL;
+
+ return folio;
+}
+
static void inc_min_seq(struct lruvec *lruvec, int type)
{
struct lru_gen_struct *lrugen = &lruvec->lrugen;
@@ -3443,6 +3513,114 @@ static void lru_gen_age_node(struct pgli
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
}
+/*
+ * This function exploits spatial locality when shrink_page_list() walks the
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
+ */
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+ int i;
+ pte_t *pte;
+ unsigned long start;
+ unsigned long end;
+ unsigned long addr;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+ struct folio *folio = pfn_folio(pvmw->pfn);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ DEFINE_MAX_SEQ(lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
+
+ lockdep_assert_held(pvmw->ptl);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
+
+ if (spin_is_contended(pvmw->ptl))
+ return;
+
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
+ else {
+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
+ }
+ }
+
+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
+
+ rcu_read_lock();
+ arch_enter_lazy_mmu_mode();
+
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+
+ pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(pte[i]))
+ continue;
+
+ folio = get_pfn_folio(pfn, memcg, pgdat);
+ if (!folio)
+ continue;
+
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_lru_gen(folio);
+ if (old_gen < 0)
+ folio_set_referenced(folio);
+ else if (old_gen != new_gen)
+ __set_bit(i, bitmap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ rcu_read_unlock();
+
+ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ folio_activate(folio);
+ }
+ return;
+ }
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ return;
+
+ spin_lock_irq(&lruvec->lru_lock);
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ if (folio_memcg_rcu(folio) != memcg)
+ continue;
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen < 0 || old_gen == new_gen)
+ continue;
+
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_unlock_pages();
+}
+
/******************************************************************************
* the eviction
******************************************************************************/
@@ -3479,6 +3657,12 @@ static bool sort_folio(struct lruvec *lr
return true;
}
+ /* promoted */
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+ list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
/* protected */
if (tier > tier_idx) {
int hist = lru_hist_from_seq(lrugen->min_seq[type]);

View File

@ -1,290 +0,0 @@
From 6b9670b94ba2b49b289b997121062500e32fc3e4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 19:59:54 -0700
Subject: [PATCH 09/14] mm: multi-gen LRU: optimize multiple memcgs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When multiple memcgs are available, it is possible to use generations
as a frame of reference to make better choices and improve overall
performance under global memory pressure. This patch adds a basic
optimization to select memcgs that can drop single-use unmapped clean
pages first. Doing so reduces the chance of going into the aging path
or swapping, which can be costly.
A typical example that benefits from this optimization is a server
running mixed types of workloads, e.g., heavy anon workload in one
memcg and heavy buffered I/O workload in the other.
Though this optimization can be applied to both kswapd and direct
reclaim, it is only added to kswapd to keep the patchset manageable.
Later improvements may cover the direct reclaim path.
While ensuring certain fairness to all eligible memcgs, proportional
scans of individual memcgs also require proper backoff to avoid
overshooting their aggregate reclaim target by too much. Otherwise it
can cause high direct reclaim latency. The conditions for backoff are:
1. At low priorities, for direct reclaim, if aging fairness or direct
reclaim latency is at risk, i.e., aging one memcg multiple times or
swapping after the target is met.
2. At high priorities, for global reclaim, if per-zone free pages are
above respective watermarks.
Server benchmark results:
Mixed workloads:
fio (buffered I/O): +[19, 21]%
IOPS BW
patch1-8: 1880k 7343MiB/s
patch1-9: 2252k 8796MiB/s
memcached (anon): +[119, 123]%
Ops/sec KB/sec
patch1-8: 862768.65 33514.68
patch1-9: 1911022.12 74234.54
Mixed workloads:
fio (buffered I/O): +[75, 77]%
IOPS BW
5.19-rc1: 1279k 4996MiB/s
patch1-9: 2252k 8796MiB/s
memcached (anon): +[13, 15]%
Ops/sec KB/sec
5.19-rc1: 1673524.04 65008.87
patch1-9: 1911022.12 74234.54
Configurations:
(changes since patch 6)
cat mixed.sh
modprobe brd rd_nr=2 rd_size=56623104
swapoff -a
mkswap /dev/ram0
swapon /dev/ram0
mkfs.ext4 /dev/ram1
mount -t ext4 /dev/ram1 /mnt
memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
--ratio 1:0 --pipeline 8 -d 2000
fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
--buffered=1 --ioengine=io_uring --iodepth=128 \
--iodepth_batch_submit=32 --iodepth_batch_complete=32 \
--rw=randread --random_distribution=random --norandommap \
--time_based --ramp_time=10m --runtime=90m --group_reporting &
pid=$!
sleep 200
memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
--ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
kill -INT $pid
wait
Client benchmark results:
no change (CONFIG_MEMCG=n)
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I7e00e0c733437e534ac98031cf8154a681becc00
---
mm/vmscan.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 95 insertions(+), 9 deletions(-)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,12 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
+#ifdef CONFIG_LRU_GEN
+ /* help kswapd make better choices among multiple memcgs */
+ unsigned int memcgs_need_aging:1;
+ unsigned long last_reclaimed;
+#endif
+
/* Allocation order */
s8 order;
@@ -4429,6 +4435,19 @@ static void lru_gen_age_node(struct pgli
VM_WARN_ON_ONCE(!current_is_kswapd());
+ sc->last_reclaimed = sc->nr_reclaimed;
+
+ /*
+ * To reduce the chance of going into the aging path, which can be
+ * costly, optimistically skip it if the flag below was cleared in the
+ * eviction path. This improves the overall performance when multiple
+ * memcgs are available.
+ */
+ if (!sc->memcgs_need_aging) {
+ sc->memcgs_need_aging = true;
+ return;
+ }
+
set_mm_walk(pgdat);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -4840,7 +4859,8 @@ static int isolate_folios(struct lruvec
return scanned;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ bool *need_swapping)
{
int type;
int scanned;
@@ -4903,6 +4923,9 @@ static int evict_folios(struct lruvec *l
sc->nr_reclaimed += reclaimed;
+ if (need_swapping && type == LRU_GEN_ANON)
+ *need_swapping = true;
+
return scanned;
}
@@ -4912,9 +4935,8 @@ static int evict_folios(struct lruvec *l
* reclaim.
*/
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
+ bool can_swap, bool *need_aging)
{
- bool need_aging;
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -4924,8 +4946,8 @@ static unsigned long get_nr_to_scan(stru
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
return 0;
- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- if (!need_aging)
+ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+ if (!*need_aging)
return nr_to_scan;
/* skip the aging path at the default priority */
@@ -4942,10 +4964,67 @@ done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
}
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+ struct scan_control *sc, bool need_swapping)
+{
+ int i;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (!current_is_kswapd()) {
+ /* age each memcg at most once to ensure fairness */
+ if (max_seq - seq > 1)
+ return true;
+
+ /* over-swapping can increase allocation latency */
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+ return true;
+
+ /* give this thread a chance to exit and free its memory */
+ if (fatal_signal_pending(current)) {
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+ return true;
+ }
+
+ if (cgroup_reclaim(sc))
+ return false;
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+ return false;
+
+ /* keep scanning at low priorities to ensure fairness */
+ if (sc->priority > DEF_PRIORITY - 2)
+ return false;
+
+ /*
+ * A minimum amount of work was done under global memory pressure. For
+ * kswapd, it may be overshooting. For direct reclaim, the allocation
+ * may succeed if all suitable zones are somewhat safe. In either case,
+ * it's better to stop now, and restart later if necessary.
+ */
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ unsigned long wmark;
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+ return false;
+ }
+
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+
+ return true;
+}
+
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
+ bool need_aging = false;
+ bool need_swapping = false;
unsigned long scanned = 0;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ DEFINE_MAX_SEQ(lruvec);
lru_add_drain();
@@ -4965,21 +5044,28 @@ static void lru_gen_shrink_lruvec(struct
else
swappiness = 0;
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
if (!nr_to_scan)
- break;
+ goto done;
- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(lruvec, sc, swappiness, &need_swapping);
if (!delta)
- break;
+ goto done;
scanned += delta;
if (scanned >= nr_to_scan)
break;
+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
+ break;
+
cond_resched();
}
+ /* see the comment in lru_gen_age_node() */
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
+ sc->memcgs_need_aging = false;
+done:
clear_mm_walk();
blk_finish_plug(&plug);

View File

@ -1,475 +0,0 @@
From ef61bb3622ee0f36e055dfd5006badff08f5ce61 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 19:52:09 -0700
Subject: [PATCH 10/14] mm: multi-gen LRU: kill switch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
can be disabled include:
0x0001: the multi-gen LRU core
0x0002: walking page table, when arch_has_hw_pte_young() returns
true
0x0004: clearing the accessed bit in non-leaf PMD entries, when
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
[yYnN]: apply to all the components above
E.g.,
echo y >/sys/kernel/mm/lru_gen/enabled
cat /sys/kernel/mm/lru_gen/enabled
0x0007
echo 5 >/sys/kernel/mm/lru_gen/enabled
cat /sys/kernel/mm/lru_gen/enabled
0x0005
NB: the page table walks happen on the scale of seconds under heavy
memory pressure, in which case the mmap_lock contention is a lesser
concern, compared with the LRU lock contention and the I/O congestion.
So far the only well-known case of the mmap_lock contention happens on
Android, due to Scudo [1] which allocates several thousand VMAs for
merely a few hundred MBs. The SPF and the Maple Tree also have
provided their own assessments [2][3]. However, if walking page tables
does worsen the mmap_lock contention, the kill switch can be used to
disable it. In this case the multi-gen LRU will suffer a minor
performance degradation, as shown previously.
Clearing the accessed bit in non-leaf PMD entries can also be
disabled, since this behavior was not tested on x86 varieties other
than Intel and AMD.
[1] https://source.android.com/devices/tech/debug/scudo
[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I4c909618e8fed7fb1337f6624bbe542ec920a515
---
include/linux/cgroup.h | 15 ++-
include/linux/mm_inline.h | 15 ++-
include/linux/mmzone.h | 9 ++
kernel/cgroup/cgroup-internal.h | 1 -
mm/Kconfig | 6 +
mm/vmscan.c | 228 +++++++++++++++++++++++++++++++-
6 files changed, 265 insertions(+), 9 deletions(-)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr
css_put(&cgrp->self);
}
+extern struct mutex cgroup_mutex;
+
+/* Take the global cgroup_mutex; release it with cgroup_unlock(). */
+static inline void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+/* Release the global cgroup_mutex taken by cgroup_lock(). */
+static inline void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
@@ -708,6 +719,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -106,10 +106,21 @@ static __always_inline enum lru_list fol
#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_ENABLED
static inline bool lru_gen_enabled(void)
{
- return true;
+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+}
+#else
+static inline bool lru_gen_enabled(void)
+{
+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
}
+#endif
static inline bool lru_gen_in_fault(void)
{
@@ -222,7 +233,7 @@ static inline bool lru_gen_add_folio(str
VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
- if (folio_test_unevictable(folio))
+ if (folio_test_unevictable(folio) || !lrugen->enabled)
return false;
/*
* There are three common cases for this page:
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -384,6 +384,13 @@ enum {
LRU_GEN_FILE,
};
+/* indices into lru_gen_caps[]; BIT(index) is the mask used by the sysfs "enabled" file */
+enum {
+ LRU_GEN_CORE, /* 0x0001: the multi-gen LRU core */
+ LRU_GEN_MM_WALK, /* 0x0002: walking page tables */
+ LRU_GEN_NONLEAF_YOUNG, /* 0x0004: clearing the accessed bit in non-leaf PMD entries */
+ NR_LRU_GEN_CAPS /* number of capabilities; sizes lru_gen_caps[] */
+};
+
#define MIN_LRU_BATCH BITS_PER_LONG
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
@@ -425,6 +432,8 @@ struct lru_gen_struct {
/* can be modified without holding the LRU lock */
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multi-gen LRU is enabled */
+ bool enabled;
};
enum {
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,7 +164,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1133,6 +1133,12 @@ config LRU_GEN
help
A high performance LRU implementation to overcommit memory.
+config LRU_GEN_ENABLED
+ bool "Enable by default"
+ depends on LRU_GEN
+ help
+ This option enables the multi-gen LRU by default.
+
config LRU_GEN_STATS
bool "Full stats for debugging"
depends on LRU_GEN
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -51,6 +51,7 @@
#include <linux/psi.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
+#include <linux/ctype.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -3070,6 +3071,14 @@ static bool can_age_anon_pages(struct pg
#ifdef CONFIG_LRU_GEN
+/*
+ * One static key per LRU_GEN_* capability. With CONFIG_LRU_GEN_ENABLED the
+ * keys default to true and get_cap() tests them with static_branch_likely();
+ * otherwise they default to false and get_cap() uses static_branch_unlikely().
+ */
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
+#else
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
+#endif
+
/******************************************************************************
* shorthand helpers
******************************************************************************/
@@ -3946,7 +3955,8 @@ static void walk_pmd_range_locked(pud_t
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
}
@@ -4044,10 +4054,12 @@ restart:
walk->mm_stats[MM_NONLEAF_TOTAL]++;
#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
- if (!pmd_young(val))
- continue;
+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (!pmd_young(val))
+ continue;
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ }
#endif
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
continue;
@@ -4309,7 +4321,7 @@ static bool try_to_inc_max_seq(struct lr
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!arch_has_hw_pte_young()) {
+ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -5072,6 +5084,208 @@ done:
}
/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+/*
+ * Consistency check used under VM_WARN_ON_ONCE() in lru_gen_change_state().
+ * Returns false if the multi-gen LRU is enabled for @lruvec but one of its
+ * classic evictable lists is non-empty, or if it is disabled but one of its
+ * multi-gen lists is non-empty; returns true otherwise.
+ */
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ if (lrugen->enabled) {
+ enum lru_list lru;
+
+ /* enabled: nothing may be left on the classic lists */
+ for_each_evictable_lru(lru) {
+ if (!list_empty(&lruvec->lists[lru]))
+ return false;
+ }
+ } else {
+ int gen, type, zone;
+
+ /* disabled: nothing may be left on the multi-gen lists */
+ for_each_gen_type_zone(gen, type, zone) {
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Move folios from the classic evictable lists of @lruvec onto the multi-gen
+ * LRU lists, handling at most MAX_LRU_BATCH folios per call.
+ *
+ * Returns true once every evictable list is empty, or false when the batch
+ * budget ran out and the caller has to call again. The caller in
+ * lru_gen_change_state() holds lruvec->lru_lock around each call.
+ */
+static bool fill_evictable(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_evictable_lru(lru) {
+ int type = is_file_lru(lru);
+ bool active = is_active_lru(lru);
+ struct list_head *head = &lruvec->lists[lru];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ /* the folio's flags must agree with the list it was found on */
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
+
+ lruvec_del_folio(lruvec, folio);
+ success = lru_gen_add_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+
+ /* budget used up: report "not done" even if this was the last folio */
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Inverse of fill_evictable(): move folios from the multi-gen LRU lists of
+ * @lruvec back onto the classic lists, at most MAX_LRU_BATCH folios per call.
+ *
+ * Returns true once every multi-gen list is empty, or false when the batch
+ * budget ran out and the caller has to call again. The caller in
+ * lru_gen_change_state() holds lruvec->lru_lock around each call.
+ */
+static bool drain_evictable(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ /* the folio's flags must agree with the list it was found on */
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ success = lru_gen_del_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+ lruvec_add_folio(lruvec, folio);
+
+ /* budget used up: report "not done" even if this was the last folio */
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Switch the multi-gen LRU core on or off at runtime.
+ *
+ * Flips the LRU_GEN_CORE static key, then visits every lruvec of every memcg
+ * on every node, records the new state in lrugen->enabled and migrates the
+ * folios between the classic lists and the multi-gen lists in batches. Does
+ * nothing if the core is already in the requested state.
+ */
+static void lru_gen_change_state(bool enabled)
+{
+ /* serializes concurrent state changes */
+ static DEFINE_MUTEX(state_mutex);
+
+ struct mem_cgroup *memcg;
+
+ /*
+ * Locks are taken in this order and released in reverse at "unlock".
+ * NOTE(review): presumably cgroup_lock() and get_online_mems() keep the
+ * set of memcgs and nodes stable during the walk below -- confirm.
+ */
+ cgroup_lock();
+ cpus_read_lock();
+ get_online_mems();
+ mutex_lock(&state_mutex);
+
+ if (enabled == lru_gen_enabled())
+ goto unlock;
+
+ /* the _cpuslocked variants are used because cpus_read_lock() is held above */
+ if (enabled)
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+ else
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ if (!lruvec)
+ continue;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
+
+ lruvec->lrugen.enabled = enabled;
+
+ /* migrate one batch at a time; drop the lock and reschedule between batches */
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ mutex_unlock(&state_mutex);
+ put_online_mems();
+ cpus_read_unlock();
+ cgroup_unlock();
+}
+
+/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+/*
+ * sysfs show() for /sys/kernel/mm/lru_gen/enabled: print the enabled
+ * capabilities as a hex bitmask with one bit per LRU_GEN_* index.
+ *
+ * LRU_GEN_MM_WALK is only reported when arch_has_hw_pte_young() is true, and
+ * LRU_GEN_NONLEAF_YOUNG only with CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG, whatever
+ * the state of their static keys.
+ */
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	unsigned int caps = 0;
+
+	if (get_cap(LRU_GEN_CORE))
+		caps |= BIT(LRU_GEN_CORE);
+
+	if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+		caps |= BIT(LRU_GEN_MM_WALK);
+
+	if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+	/* sysfs_emit() is the bounded formatter meant for sysfs show() callbacks */
+	return sysfs_emit(buf, "0x%04x\n", caps);
+}
+
+/*
+ * sysfs store() for the "enabled" file. Input starting with 'n'/'N' clears all
+ * capabilities, input starting with 'y'/'Y' sets all of them, and anything
+ * else is parsed by kstrtouint() as a bitmask of BIT(LRU_GEN_*).
+ *
+ * Returns @len on success or -EINVAL if the number cannot be parsed.
+ */
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int i;
+ unsigned int caps;
+
+ /* only the first character is inspected for the y/n forms */
+ if (tolower(*buf) == 'n')
+ caps = 0;
+ else if (tolower(*buf) == 'y')
+ caps = -1; /* all bits set */
+ else if (kstrtouint(buf, 0, &caps))
+ return -EINVAL;
+
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+ bool enabled = caps & BIT(i);
+
+ /* the core goes through lru_gen_change_state(); the others only flip their keys */
+ if (i == LRU_GEN_CORE)
+ lru_gen_change_state(enabled);
+ else if (enabled)
+ static_branch_enable(&lru_gen_caps[i]);
+ else
+ static_branch_disable(&lru_gen_caps[i]);
+ }
+
+ return len;
+}
+
+/* the "enabled" file: mode 0644, backed by show_enabled()/store_enabled() */
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+ enabled, 0644, show_enabled, store_enabled
+);
+
+/* NULL-terminated list of the attributes in the group below */
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+/* attribute group named "lru_gen"; init_lru_gen() registers it on mm_kobj */
+static struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
+/******************************************************************************
* initialization
******************************************************************************/
@@ -5081,6 +5295,7 @@ void lru_gen_init_lruvec(struct lruvec *
struct lru_gen_struct *lrugen = &lruvec->lrugen;
lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled = lru_gen_enabled();
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
@@ -5120,6 +5335,9 @@ static int __init init_lru_gen(void)
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
return 0;
};
late_initcall(init_lru_gen);

View File

@ -1,202 +0,0 @@
From 9d92c76fb8ac09ff195024139575d8c4db66b672 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:08:50 -0700
Subject: [PATCH 11/14] mm: multi-gen LRU: thrashing prevention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
requested by many desktop users [1].
When set to value N, it prevents the working set of N milliseconds
from getting evicted. The OOM killer is triggered if this working set
cannot be kept in memory. Based on the average human detectable lag
(~100ms), N=1000 usually eliminates intolerable lags due to thrashing.
Larger values like N=3000 make lags less noticeable at the risk of
premature OOM kills.
Compared with the size-based approach [2], this time-based approach
has the following advantages:
1. It is easier to configure because it is agnostic to applications
and memory sizes.
2. It is more reliable because it is directly wired to the OOM killer.
[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/
[2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I007499d7e47374b59fd620e8c3962940bc9f788e
---
include/linux/mmzone.h | 2 ++
mm/vmscan.c | 74 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 73 insertions(+), 3 deletions(-)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -419,6 +419,8 @@ struct lru_gen_struct {
unsigned long max_seq;
/* the eviction increments the oldest generation numbers */
unsigned long min_seq[ANON_AND_FILE];
+ /* the birth time of each generation in jiffies */
+ unsigned long timestamps[MAX_NR_GENS];
/* the multi-gen LRU lists, lazily sorted on eviction */
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the multi-gen LRU sizes, eventually consistent */
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4293,6 +4293,7 @@ static void inc_max_seq(struct lruvec *l
for (type = 0; type < ANON_AND_FILE; type++)
reset_ctrl_pos(lruvec, type, false);
+ WRITE_ONCE(lrugen->timestamps[next], jiffies);
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
@@ -4420,7 +4421,7 @@ static bool should_run_aging(struct lruv
return false;
}
-static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
{
bool need_aging;
unsigned long nr_to_scan;
@@ -4434,16 +4435,36 @@ static void age_lruvec(struct lruvec *lr
mem_cgroup_calculate_protection(NULL, memcg);
if (mem_cgroup_below_min(memcg))
- return;
+ return false;
need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+
+ if (min_ttl) {
+ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ if (time_is_after_jiffies(birth + min_ttl))
+ return false;
+
+ /* the size is likely too small to be helpful */
+ if (!nr_to_scan && sc->priority != DEF_PRIORITY)
+ return false;
+ }
+
if (need_aging)
try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+
+ return true;
}
+/* to protect the working set of the last N jiffies */
+static unsigned long lru_gen_min_ttl __read_mostly;
+
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
+ bool success = false;
+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
VM_WARN_ON_ONCE(!current_is_kswapd());
@@ -4466,12 +4487,32 @@ static void lru_gen_age_node(struct pgli
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- age_lruvec(lruvec, sc);
+ if (age_lruvec(lruvec, sc, min_ttl))
+ success = true;
cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
clear_mm_walk();
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (success || !min_ttl || sc->order)
+ return;
+
+ /*
+ * The main goal is to OOM kill if every generation from all memcgs is
+ * younger than min_ttl. However, another possibility is all memcgs are
+ * either below min or empty.
+ */
+ if (mutex_trylock(&oom_lock)) {
+ struct oom_control oc = {
+ .gfp_mask = sc->gfp_mask,
+ };
+
+ out_of_memory(&oc);
+
+ mutex_unlock(&oom_lock);
+ }
}
/*
@@ -5228,6 +5269,28 @@ unlock:
* sysfs interface
******************************************************************************/
+/*
+ * sysfs show() for /sys/kernel/mm/lru_gen/min_ttl_ms: report lru_gen_min_ttl,
+ * which is kept in jiffies, converted back to milliseconds.
+ */
+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	/* sysfs_emit() bounds the output to the sysfs page, unlike a bare sprintf() */
+	return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+/*
+ * sysfs store() for min_ttl_ms: parse an unsigned millisecond value and keep
+ * it in lru_gen_min_ttl converted to jiffies.
+ *
+ * Returns @len on success or -EINVAL if the value cannot be parsed.
+ */
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+
+ if (kstrtouint(buf, 0, &msecs))
+ return -EINVAL;
+
+ /* paired with the READ_ONCE() in show_min_ttl() and lru_gen_age_node() */
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+/* the "min_ttl_ms" file: mode 0644, backed by show_min_ttl()/store_min_ttl() */
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
+);
+
static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
unsigned int caps = 0;
@@ -5276,6 +5339,7 @@ static struct kobj_attribute lru_gen_ena
);
static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_min_ttl_attr.attr,
&lru_gen_enabled_attr.attr,
NULL
};
@@ -5291,12 +5355,16 @@ static struct attribute_group lru_gen_at
void lru_gen_init_lruvec(struct lruvec *lruvec)
{
+ int i;
int gen, type, zone;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();
+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);

View File

@ -1,557 +0,0 @@
From d1e0e5fcdea16d4ceead496a0ea2fdbb6bc5bfe4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:12:41 -0700
Subject: [PATCH 12/14] mm: multi-gen LRU: debugfs interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add /sys/kernel/debug/lru_gen for working set estimation and proactive
reclaim. These techniques are commonly used to optimize job scheduling
(bin packing) in data centers [1][2].
Compared with the page table-based approach and the PFN-based
approach, this lruvec-based approach has the following advantages:
1. It offers better choices because it is aware of memcgs, NUMA nodes,
shared mappings and unmapped page cache.
2. It is more scalable because it is O(nr_hot_pages), whereas the
PFN-based approach is O(nr_total_pages).
Add /sys/kernel/debug/lru_gen_full for debugging.
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I7bb06f14e0a94901a076cc3767d0855d4f1ea3ab
---
include/linux/nodemask.h | 1 +
mm/vmscan.c | 411 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 402 insertions(+), 10 deletions(-)
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -493,6 +493,7 @@ static inline int num_node_state(enum no
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
+#define next_memory_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1U
#define nr_online_nodes 1U
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,7 @@
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -4197,12 +4198,40 @@ static void clear_mm_walk(void)
kfree(walk);
}
-static void inc_min_seq(struct lruvec *lruvec, int type)
+static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
+ int zone;
+ int remaining = MAX_LRU_BATCH;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (type == LRU_GEN_ANON && !can_swap)
+ goto done;
+
+ /* prevent cold/hot inversion if force_scan is true */
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct list_head *head = &lrugen->lists[old_gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+ new_gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+done:
reset_ctrl_pos(lruvec, type, true);
WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+
+ return true;
}
static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
@@ -4248,7 +4277,7 @@ next:
return success;
}
-static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
{
int prev, next;
int type, zone;
@@ -4262,9 +4291,13 @@ static void inc_max_seq(struct lruvec *l
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
- VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
+ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
- inc_min_seq(lruvec, type);
+ while (!inc_min_seq(lruvec, type, can_swap)) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
}
/*
@@ -4301,7 +4334,7 @@ static void inc_max_seq(struct lruvec *l
}
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap)
+ struct scan_control *sc, bool can_swap, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -4322,7 +4355,7 @@ static bool try_to_inc_max_seq(struct lr
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -4336,7 +4369,7 @@ static bool try_to_inc_max_seq(struct lr
walk->lruvec = lruvec;
walk->max_seq = max_seq;
walk->can_swap = can_swap;
- walk->force_scan = false;
+ walk->force_scan = force_scan;
do {
success = iterate_mm_list(lruvec, walk, &mm);
@@ -4356,7 +4389,7 @@ done:
VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
- inc_max_seq(lruvec, can_swap);
+ inc_max_seq(lruvec, can_swap, force_scan);
/* either this sees any waiters or they will see updated max_seq */
if (wq_has_sleeper(&lruvec->mm_state.wait))
wake_up_all(&lruvec->mm_state.wait);
@@ -4452,7 +4485,7 @@ static bool age_lruvec(struct lruvec *lr
}
if (need_aging)
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
return true;
}
@@ -5011,7 +5044,7 @@ static unsigned long get_nr_to_scan(stru
if (current_is_kswapd())
return 0;
- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
return nr_to_scan;
done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
@@ -5350,6 +5383,361 @@ static struct attribute_group lru_gen_at
};
/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+/*
+ * seq_file ->start(): allocate a PATH_MAX scratch buffer in m->private (used
+ * by lru_gen_seq_show() for the cgroup path) and position the iteration at the
+ * *pos'th (memcg, memory node) pair.
+ *
+ * Returns the lruvec at that position, NULL if *pos is past the last pair, or
+ * ERR_PTR(-ENOMEM) if the buffer cannot be allocated. The buffer is freed in
+ * lru_gen_seq_stop().
+ */
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ /* returns in the middle of the memcg walk; lru_gen_seq_stop() breaks it */
+ if (!nr_to_skip--)
+ return get_lruvec(memcg, nid);
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+/*
+ * seq_file ->stop(): if the iteration stopped on a valid lruvec, break out of
+ * the memcg walk it belongs to; then free the scratch buffer allocated by
+ * lru_gen_seq_start().
+ */
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ /* v is NULL at the end of the walk and an ERR_PTR if ->start() failed */
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kvfree(m->private);
+ m->private = NULL;
+}
+
+/*
+ * seq_file ->next(): advance from lruvec @v to the next memory node of the
+ * same memcg, or to the first memory node of the next memcg once the nodes
+ * are exhausted. Returns NULL after the last memcg.
+ */
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ /* no more nodes for this memcg: move on to the next one */
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return get_lruvec(memcg, nid);
+}
+
+/*
+ * Print the extra lines that lru_gen_seq_show() emits per generation in "full"
+ * mode: one line per tier with three counters for each type, followed by one
+ * line of mm walk statistics.
+ *
+ * Each counter is followed by a one-character legend taken from @s:
+ *   seq == max_seq:                           "RT " (avg_refaulted, avg_total)
+ *   seq == min_seq[type] or NR_HIST_GENS > 1: "rep" (refaulted, evicted,
+ *                                             protected of the tier below)
+ *   otherwise:                                blanks, with zero counters
+ * The mm stats line uses "LOYNFA"/"loynfa" in the same way.
+ *
+ * NOTE(review): the leading padding of the " %10d" and " " literals below may
+ * have been collapsed in this copy of the patch; they only affect column
+ * alignment -- confirm against the upstream commit.
+ */
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+				  unsigned long max_seq, unsigned long *min_seq,
+				  unsigned long seq)
+{
+	int i;
+	int type, tier;
+	int hist = lru_hist_from_seq(seq);
+	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+		seq_printf(m, " %10d", tier);
+		for (type = 0; type < ANON_AND_FILE; type++) {
+			/* indexed as s[0..2] below, so it must be three characters long */
+			const char *s = "   ";
+			unsigned long n[3] = {};
+
+			if (seq == max_seq) {
+				s = "RT ";
+				n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+				n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+			} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
+				s = "rep";
+				n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+				n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+				/* protected[] has no slot for tier 0 */
+				if (tier)
+					n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+			}
+
+			for (i = 0; i < 3; i++)
+				seq_printf(m, " %10lu%c", n[i], s[i]);
+		}
+		seq_putc(m, '\n');
+	}
+
+	seq_puts(m, " ");
+	for (i = 0; i < NR_MM_STATS; i++) {
+		/*
+		 * indexed as s[i] for every i < NR_MM_STATS, so it must be as
+		 * long as the "LOYNFA"/"loynfa" legends it stands in for
+		 */
+		const char *s = "      ";
+		unsigned long n = 0;
+
+		if (seq == max_seq && NR_HIST_GENS == 1) {
+			s = "LOYNFA";
+			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+		} else if (seq != max_seq && NR_HIST_GENS > 1) {
+			s = "loynfa";
+			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+		}
+
+		seq_printf(m, " %10lu%c", n, s[i]);
+	}
+	seq_putc(m, '\n');
+}
+
+/*
+ * seq_file ->show(): print one lruvec (one memcg on one node).
+ *
+ * A "memcg <id> <path>" header is printed when the node is the first memory
+ * node, then a "node <nid>" line, then one line per generation with its
+ * sequence number, its age in milliseconds and the anon and file sizes.
+ * In full mode each generation is followed by lru_gen_seq_show_full() output.
+ */
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ /* full output for the file whose fops have no ->write, i.e. lru_gen_ro_fops */
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (nid == first_memory_node) {
+ /* m->private is the PATH_MAX buffer allocated by lru_gen_seq_start() */
+ const char *path = memcg ? m->private : "";
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ /* first generation to print: oldest anon one, or up to MAX_NR_GENS back in full mode */
+ if (!full)
+ seq = min_seq[LRU_GEN_ANON];
+ else if (max_seq >= MAX_NR_GENS)
+ seq = max_seq - MAX_NR_GENS + 1;
+ else
+ seq = 0;
+
+ for (; seq <= max_seq; seq++) {
+ int type, zone;
+ int gen = lru_gen_from_seq(seq);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long size = 0;
+ /* 'x' marks a generation older than min_seq of this type (full mode only) */
+ char mark = full && seq < min_seq[type] ? 'x' : ' ';
+
+ /* negative per-zone counts are treated as zero */
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ seq_printf(m, " %10lu%c", size, mark);
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+/* seq_file iterator over all (memcg, memory node) lruvecs; used by lru_gen_seq_open() */
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+/*
+ * Handle the '+' debugfs command: create a new generation if @seq is the
+ * current max_seq of @lruvec.
+ *
+ * Returns 0 if @seq is already older than max_seq or after attempting the
+ * aging, -EINVAL if @seq is newer than max_seq, and -ERANGE if, without
+ * @force_scan, min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq.
+ * The result of try_to_inc_max_seq() itself is not reported.
+ */
+static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ bool can_swap, bool force_scan)
+{
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < max_seq)
+ return 0;
+
+ if (seq > max_seq)
+ return -EINVAL;
+
+ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
+ return -ERANGE;
+
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+
+ return 0;
+}
+
+/*
+ * Handle the '-' debugfs command: evict folios from @lruvec until min_seq has
+ * moved past @seq, @nr_to_reclaim pages have been reclaimed, or
+ * evict_folios() makes no progress.
+ *
+ * Returns 0 in those cases, -EINVAL if @seq + MIN_NR_GENS > max_seq, and
+ * -EINTR if a signal is pending. sc->nr_reclaimed is reset to 0 on entry.
+ */
+static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ int swappiness, unsigned long nr_to_reclaim)
+{
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq + MIN_NR_GENS > max_seq)
+ return -EINVAL;
+
+ sc->nr_reclaimed = 0;
+
+ while (!signal_pending(current)) {
+ /* re-read min_seq on every pass */
+ DEFINE_MIN_SEQ(lruvec);
+
+ /* index 1 (file) is used when swappiness is 0, index 0 (anon) otherwise */
+ if (seq < min_seq[!swappiness])
+ return 0;
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return 0;
+
+ if (!evict_folios(lruvec, sc, swappiness, NULL))
+ return 0;
+
+ cond_resched();
+ }
+
+ return -EINTR;
+}
+
+/*
+ * Execute one parsed debugfs command against the lruvec of (@memcg_id, @nid).
+ *
+ * @cmd is '+' (aging, @opt is passed as force_scan) or '-' (eviction, @opt is
+ * passed as nr_to_reclaim). A negative @swappiness selects
+ * get_swappiness(); values above 200 are rejected.
+ *
+ * Returns the result of run_aging()/run_eviction(), or -EINVAL for an invalid
+ * node, an unknown memcg, an out-of-range swappiness or an unknown @cmd.
+ */
+static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
+ struct scan_control *sc, int swappiness, unsigned long opt)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ if (!mem_cgroup_disabled()) {
+ /* look the memcg up under RCU and pin it with a css reference */
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcg_id);
+#ifdef CONFIG_MEMCG
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+#endif
+ rcu_read_unlock();
+
+ if (!memcg)
+ return -EINVAL;
+ }
+
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ lruvec = get_lruvec(memcg, nid);
+
+ if (swappiness < 0)
+ swappiness = get_swappiness(lruvec, sc);
+ else if (swappiness > 200)
+ goto done;
+
+ /* any other command character leaves err at -EINVAL */
+ switch (cmd) {
+ case '+':
+ err = run_aging(lruvec, seq, sc, swappiness, opt);
+ break;
+ case '-':
+ err = run_eviction(lruvec, seq, sc, swappiness, opt);
+ break;
+ }
+done:
+ /* drops the reference taken by css_tryget() above */
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
+/*
+ * ->write() of the "lru_gen" debugfs file. The input is a list of commands
+ * separated by ',', ';' or newline, each of the form
+ *
+ *   <cmd> <memcg_id> <nid> <seq> [<swappiness> [<opt>]]
+ *
+ * which are passed to run_cmd() in order; processing stops at the first
+ * command that fails to parse or to run.
+ *
+ * Returns @len if the last command succeeded, otherwise a negative errno:
+ * -ENOMEM/-EFAULT for buffer problems, -EINVAL for a malformed command (and
+ * also when the input contains no command at all, since err starts out as
+ * -EINVAL), or the error from run_cmd().
+ */
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ unsigned int flags;
+ struct blk_plug plug;
+ int err = -EINVAL;
+ struct scan_control sc = {
+ .may_writepage = true,
+ .may_unmap = true,
+ .may_swap = true,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ /* one extra byte for the terminating NUL added below */
+ buf = kvmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ /* per-task state set up here is torn down in reverse order at "done" */
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ flags = memalloc_noreclaim_save();
+ blk_start_plug(&plug);
+ if (!set_mm_walk(NULL)) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ /* -1 means "not given"; run_cmd() takes these as int / unsigned long */
+ unsigned int swappiness = -1;
+ unsigned long opt = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ /* the first four fields are mandatory; %n records where parsing stopped */
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &opt, &end);
+ /* reject missing fields and trailing garbage after the last parsed field */
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
+ if (err)
+ break;
+ }
+done:
+ clear_mm_walk();
+ blk_finish_plug(&plug);
+ memalloc_noreclaim_restore(flags);
+ set_task_reclaim_state(current, NULL);
+
+ kvfree(buf);
+
+ return err ? : len;
+}
+
+/* ->open() shared by both debugfs files: attach the lru_gen_seq_ops iterator */
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+/* fops of the read-write "lru_gen" debugfs file; ->write accepts commands */
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * fops of the read-only "lru_gen_full" debugfs file; the missing ->write is
+ * what lru_gen_seq_show() checks to select the full output
+ */
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/******************************************************************************
* initialization
******************************************************************************/
@@ -5406,6 +5794,9 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
return 0;
};
late_initcall(init_lru_gen);

View File

@ -1,253 +0,0 @@
From 22199c9b30ffcc332be643577709a2af960e6786 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 23 Jan 2022 16:44:43 -0700
Subject: [PATCH 13/14] mm: multi-gen LRU: admin guide
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add an admin guide.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e
---
Documentation/admin-guide/mm/index.rst | 1 +
Documentation/admin-guide/mm/multigen_lru.rst | 162 ++++++++++++++++++
mm/Kconfig | 3 +-
mm/vmscan.c | 4 +
4 files changed, 169 insertions(+), 1 deletion(-)
create mode 100644 Documentation/admin-guide/mm/multigen_lru.rst
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -32,6 +32,7 @@ the Linux memory management.
idle_page_tracking
ksm
memory-hotplug
+ multigen_lru
nommu-mmap
numa_memory_policy
numaperf
--- /dev/null
+++ b/Documentation/admin-guide/mm/multigen_lru.rst
@@ -0,0 +1,162 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Multi-Gen LRU
+=============
+The multi-gen LRU is an alternative LRU implementation that optimizes
+page reclaim and improves performance under memory pressure. Page
+reclaim decides the kernel's caching policy and ability to overcommit
+memory. It directly impacts the kswapd CPU usage and RAM efficiency.
+
+Quick start
+===========
+Build the kernel with the following configurations.
+
+* ``CONFIG_LRU_GEN=y``
+* ``CONFIG_LRU_GEN_ENABLED=y``
+
+All set!
+
+Runtime options
+===============
+``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the
+following subsections.
+
+Kill switch
+-----------
+``enabled`` accepts different values to enable or disable the
+following components. Its default value depends on
+``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled
+unless some of them have unforeseen side effects. Writing to
+``enabled`` has no effect when a component is not supported by the
+hardware, and valid values will be accepted even when the main switch
+is off.
+
+====== ===============================================================
+Values Components
+====== ===============================================================
+0x0001 The main switch for the multi-gen LRU.
+0x0002 Clearing the accessed bit in leaf page table entries in large
+ batches, when MMU sets it (e.g., on x86). This behavior can
+ theoretically worsen lock contention (mmap_lock). If it is
+ disabled, the multi-gen LRU will suffer a minor performance
+ degradation for workloads that contiguously map hot pages,
+ whose accessed bits can be otherwise cleared by fewer larger
+ batches.
+0x0004 Clearing the accessed bit in non-leaf page table entries as
+ well, when MMU sets it (e.g., on x86). This behavior was not
+ verified on x86 varieties other than Intel and AMD. If it is
+ disabled, the multi-gen LRU will suffer a negligible
+ performance degradation.
+[yYnN] Apply to all the components above.
+====== ===============================================================
+
+E.g.,
+::
+
+ echo y >/sys/kernel/mm/lru_gen/enabled
+ cat /sys/kernel/mm/lru_gen/enabled
+ 0x0007
+ echo 5 >/sys/kernel/mm/lru_gen/enabled
+ cat /sys/kernel/mm/lru_gen/enabled
+ 0x0005
+
+Thrashing prevention
+--------------------
+Personal computers are more sensitive to thrashing because it can
+cause janks (lags when rendering UI) and negatively impact user
+experience. The multi-gen LRU offers thrashing prevention to the
+majority of laptop and desktop users who do not have ``oomd``.
+
+Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of
+``N`` milliseconds from getting evicted. The OOM killer is triggered
+if this working set cannot be kept in memory. In other words, this
+option works as an adjustable pressure relief valve, and when open, it
+terminates applications that are hopefully not being used.
+
+Based on the average human detectable lag (~100ms), ``N=1000`` usually
+eliminates intolerable janks due to thrashing. Larger values like
+``N=3000`` make janks less noticeable at the risk of premature OOM
+kills.
+
+The default value ``0`` means disabled.
+
+Experimental features
+=====================
+``/sys/kernel/debug/lru_gen`` accepts commands described in the
+following subsections. Multiple command lines are supported, and so is
+concatenation with delimiters ``,`` and ``;``.
+
+``/sys/kernel/debug/lru_gen_full`` provides additional stats for
+debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from
+evicted generations in this file.
+
+Working set estimation
+----------------------
+Working set estimation measures how much memory an application needs
+in a given time interval, and it is usually done with little impact on
+the performance of the application. E.g., data centers want to
+optimize job scheduling (bin packing) to improve memory utilization.
+When a new job comes in, the job scheduler needs to find out whether
+each server it manages can allocate a certain amount of memory for
+this new job before it can pick a candidate. To do so, the job
+scheduler needs to estimate the working sets of the existing jobs.
+
+When it is read, ``lru_gen`` returns a histogram of numbers of pages
+accessed over different time intervals for each memcg and node.
+``MAX_NR_GENS`` decides the number of bins for each histogram. The
+histograms are noncumulative.
+::
+
+ memcg memcg_id memcg_path
+ node node_id
+ min_gen_nr age_in_ms nr_anon_pages nr_file_pages
+ ...
+ max_gen_nr age_in_ms nr_anon_pages nr_file_pages
+
+Each bin contains an estimated number of pages that have been accessed
+within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages
+and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of
+the former is the largest and that of the latter is the smallest.
+
+Users can write the following command to ``lru_gen`` to create a new
+generation ``max_gen_nr+1``:
+
+ ``+ memcg_id node_id max_gen_nr [can_swap [force_scan]]``
+
+``can_swap`` defaults to the swap setting and, if it is set to ``1``,
+it forces the scan of anon pages when swap is off, and vice versa.
+``force_scan`` defaults to ``1`` and, if it is set to ``0``, it
+employs heuristics to reduce the overhead, which is likely to reduce
+the coverage as well.
+
+A typical use case is that a job scheduler runs this command at a
+certain time interval to create new generations, and it ranks the
+servers it manages based on the sizes of their cold pages defined by
+this time interval.
+
+Proactive reclaim
+-----------------
+Proactive reclaim induces page reclaim when there is no memory
+pressure. It usually targets cold pages only. E.g., when a new job
+comes in, the job scheduler wants to proactively reclaim cold pages on
+the server it selected, to improve the chance of successfully landing
+this new job.
+
+Users can write the following command to ``lru_gen`` to evict
+generations less than or equal to ``min_gen_nr``:
+
+ ``- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]``
+
+``min_gen_nr`` should be less than ``max_gen_nr-1``, since
+``max_gen_nr`` and ``max_gen_nr-1`` are not fully aged (equivalent to
+the active list) and therefore cannot be evicted. ``swappiness``
+overrides the default value in ``/proc/sys/vm/swappiness``.
+``nr_to_reclaim`` limits the number of pages to evict.
+
+A typical use case is that a job scheduler runs this command before it
+tries to land a new job on a server. If it fails to materialize enough
+cold pages because of the overestimation, it retries on the next
+server according to the ranking result obtained from the working set
+estimation step. This less forceful approach limits the impacts on the
+existing jobs.
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1131,7 +1131,8 @@ config LRU_GEN
# make sure folio->flags has enough spare bits
depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
help
- A high performance LRU implementation to overcommit memory.
+ A high performance LRU implementation to overcommit memory. See
+ Documentation/admin-guide/mm/multigen_lru.rst for details.
config LRU_GEN_ENABLED
bool "Enable by default"
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5307,6 +5307,7 @@ static ssize_t show_min_ttl(struct kobje
return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
{
@@ -5340,6 +5341,7 @@ static ssize_t show_enabled(struct kobje
return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
{
@@ -5487,6 +5489,7 @@ static void lru_gen_seq_show_full(struct
seq_putc(m, '\n');
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static int lru_gen_seq_show(struct seq_file *m, void *v)
{
unsigned long seq;
@@ -5645,6 +5648,7 @@ done:
return err;
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
size_t len, loff_t *pos)
{

View File

@ -1,202 +0,0 @@
From bd82a74f6b5c0a75ef61be5e9be34319bb17328f Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 6 Mar 2022 20:35:00 -0700
Subject: [PATCH 14/14] mm: multi-gen LRU: design doc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add a design doc.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I958afcabf5abc37b3e58f72638d35a349c31b98d
---
Documentation/mm/index.rst | 1 +
Documentation/mm/multigen_lru.rst | 159 ++++++++++++++++++++++++++++++
2 files changed, 160 insertions(+)
create mode 100644 Documentation/mm/multigen_lru.rst
--- a/Documentation/mm/index.rst
+++ b/Documentation/mm/index.rst
@@ -51,6 +51,7 @@ above structured documentation, or delet
ksm
memory-model
mmu_notifier
+ multigen_lru
numa
overcommit-accounting
page_migration
--- /dev/null
+++ b/Documentation/mm/multigen_lru.rst
@@ -0,0 +1,159 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Multi-Gen LRU
+=============
+The multi-gen LRU is an alternative LRU implementation that optimizes
+page reclaim and improves performance under memory pressure. Page
+reclaim decides the kernel's caching policy and ability to overcommit
+memory. It directly impacts the kswapd CPU usage and RAM efficiency.
+
+Design overview
+===============
+Objectives
+----------
+The design objectives are:
+
+* Good representation of access recency
+* Try to profit from spatial locality
+* Fast paths to make obvious choices
+* Simple self-correcting heuristics
+
+The representation of access recency is at the core of all LRU
+implementations. In the multi-gen LRU, each generation represents a
+group of pages with similar access recency. Generations establish a
+(time-based) common frame of reference and therefore help make better
+choices, e.g., between different memcgs on a computer or different
+computers in a data center (for job scheduling).
+
+Exploiting spatial locality improves efficiency when gathering the
+accessed bit. A rmap walk targets a single page and does not try to
+profit from discovering a young PTE. A page table walk can sweep all
+the young PTEs in an address space, but the address space can be too
+sparse to make a profit. The key is to optimize both methods and use
+them in combination.
+
+Fast paths reduce code complexity and runtime overhead. Unmapped pages
+do not require TLB flushes; clean pages do not require writeback.
+These facts are only helpful when other conditions, e.g., access
+recency, are similar. With generations as a common frame of reference,
+additional factors stand out. But obvious choices might not be good
+choices; thus self-correction is necessary.
+
+The benefits of simple self-correcting heuristics are self-evident.
+Again, with generations as a common frame of reference, this becomes
+attainable. Specifically, pages in the same generation can be
+categorized based on additional factors, and a feedback loop can
+statistically compare the refault percentages across those categories
+and infer which of them are better choices.
+
+Assumptions
+-----------
+The protection of hot pages and the selection of cold pages are based
+on page access channels and patterns. There are two access channels:
+
+* Accesses through page tables
+* Accesses through file descriptors
+
+The protection of the former channel is by design stronger because:
+
+1. The uncertainty in determining the access patterns of the former
+ channel is higher due to the approximation of the accessed bit.
+2. The cost of evicting the former channel is higher due to the TLB
+ flushes required and the likelihood of encountering the dirty bit.
+3. The penalty of underprotecting the former channel is higher because
+ applications usually do not prepare themselves for major page
+ faults like they do for blocked I/O. E.g., GUI applications
+ commonly use dedicated I/O threads to avoid blocking rendering
+ threads.
+
+There are also two access patterns:
+
+* Accesses exhibiting temporal locality
+* Accesses not exhibiting temporal locality
+
+For the reasons listed above, the former channel is assumed to follow
+the former pattern unless ``VM_SEQ_READ`` or ``VM_RAND_READ`` is
+present, and the latter channel is assumed to follow the latter
+pattern unless outlying refaults have been observed.
+
+Workflow overview
+=================
+Evictable pages are divided into multiple generations for each
+``lruvec``. The youngest generation number is stored in
+``lrugen->max_seq`` for both anon and file types as they are aged on
+an equal footing. The oldest generation numbers are stored in
+``lrugen->min_seq[]`` separately for anon and file types as clean file
+pages can be evicted regardless of swap constraints. These three
+variables are monotonically increasing.
+
+Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)``
+bits in order to fit into the gen counter in ``folio->flags``. Each
+truncated generation number is an index to ``lrugen->lists[]``. The
+sliding window technique is used to track at least ``MIN_NR_GENS`` and
+at most ``MAX_NR_GENS`` generations. The gen counter stores a value
+within ``[1, MAX_NR_GENS]`` while a page is on one of
+``lrugen->lists[]``; otherwise it stores zero.
+
+Each generation is divided into multiple tiers. A page accessed ``N``
+times through file descriptors is in tier ``order_base_2(N)``. Unlike
+generations, tiers do not have dedicated ``lrugen->lists[]``. In
+contrast to moving across generations, which requires the LRU lock,
+moving across tiers only involves atomic operations on
+``folio->flags`` and therefore has a negligible cost. A feedback loop
+modeled after the PID controller monitors refaults over all the tiers
+from anon and file types and decides which tiers from which types to
+evict or protect.
+
+There are two conceptually independent procedures: the aging and the
+eviction. They form a closed-loop system, i.e., the page reclaim.
+
+Aging
+-----
+The aging produces young generations. Given an ``lruvec``, it
+increments ``max_seq`` when ``max_seq-min_seq+1`` approaches
+``MIN_NR_GENS``. The aging promotes hot pages to the youngest
+generation when it finds them accessed through page tables; the
+demotion of cold pages happens consequently when it increments
+``max_seq``. The aging uses page table walks and rmap walks to find
+young PTEs. For the former, it iterates ``lruvec_memcg()->mm_list``
+and calls ``walk_page_range()`` with each ``mm_struct`` on this list
+to scan PTEs, and after each iteration, it increments ``max_seq``. For
+the latter, when the eviction walks the rmap and finds a young PTE,
+the aging scans the adjacent PTEs. For both, on finding a young PTE,
+the aging clears the accessed bit and updates the gen counter of the
+page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``.
+
+Eviction
+--------
+The eviction consumes old generations. Given an ``lruvec``, it
+increments ``min_seq`` when ``lrugen->lists[]`` indexed by
+``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to
+evict from, it first compares ``min_seq[]`` to select the older type.
+If both types are equally old, it selects the one whose first tier has
+a lower refault percentage. The first tier contains single-use
+unmapped clean pages, which are the best bet. The eviction sorts a
+page according to its gen counter if the aging has found this page
+accessed through page tables and updated its gen counter. It also
+moves a page to the next generation, i.e., ``min_seq+1``, if this page
+was accessed multiple times through file descriptors and the feedback
+loop has detected outlying refaults from the tier this page is in. To
+this end, the feedback loop uses the first tier as the baseline, for
+the reason stated earlier.
+
+Summary
+-------
+The multi-gen LRU can be disassembled into the following parts:
+
+* Generations
+* Rmap walks
+* Page table walks
+* Bloom filters
+* PID controller
+
+The aging and the eviction form a producer-consumer model;
+specifically, the latter drives the former by the sliding window over
+generations. Within the aging, rmap walks drive page table walks by
+inserting hot densely populated page tables to the Bloom filters.
+Within the eviction, the PID controller uses refaults as the feedback
+to select types to evict and tiers to protect.

View File

@ -0,0 +1,59 @@
From e9aef3d90b4bd11fccbde3741f2396ea05a9f386 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 30 Nov 2022 23:28:26 +0100
Subject: [PATCH] net: add netdev_sw_irq_coalesce_default_on()
Add a helper for drivers wanting to set SW IRQ coalescing
by default. The related sysfs attributes can be used to
override the default values.
Follow Jakub's suggestion and put this functionality into
net core so that drivers wanting to use software interrupt
coalescing per default don't have to open-code it.
Note that this function needs to be called before the
netdevice is registered.
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/linux/netdevice.h | 1 +
net/core/dev.c | 16 ++++++++++++++++
2 files changed, 17 insertions(+)
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -78,6 +78,7 @@ struct xdp_buff;
void synchronize_net(void);
void netdev_set_default_ethtool_ops(struct net_device *dev,
const struct ethtool_ops *ops);
+void netdev_sw_irq_coalesce_default_on(struct net_device *dev);
/* Backlog congestion levels */
#define NET_RX_SUCCESS 0 /* keep 'em coming, baby */
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10535,6 +10535,22 @@ void netdev_set_default_ethtool_ops(stru
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
+/**
+ * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
+ * @dev: netdev to enable the IRQ coalescing on; must not be registered yet
+ *
+ * Sets conservative defaults for gro_flush_timeout and napi_defer_hard_irqs.
+ * Users can override them via sysfs. WARNs if @dev is already registered.
+ */
+void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
+{
+ WARN_ON(dev->reg_state == NETREG_REGISTERED);
+
+ dev->gro_flush_timeout = 20000;
+ dev->napi_defer_hard_irqs = 1;
+}
+EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
+
void netdev_freemem(struct net_device *dev)
{
char *addr = (char *)dev - dev->padded;

View File

@ -0,0 +1,56 @@
From fd4f7a449938ffd21bf2f5a1708d811cc5f3daa5 Mon Sep 17 00:00:00 2001
From: Denis Kirjanov <dkirjanov@suse.de>
Date: Thu, 27 Oct 2022 21:45:02 +0300
Subject: [PATCH 2/4] drivers: net: convert to boolean for the mac_managed_pm
flag
Signed-off-by: Dennis Kirjanov <dkirjanov@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/freescale/fec_main.c | 2 +-
drivers/net/ethernet/realtek/r8169_main.c | 2 +-
drivers/net/usb/asix_devices.c | 4 ++--
3 files changed, 4 insertions(+), 4 deletions(-)
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2226,7 +2226,7 @@ static int fec_enet_mii_probe(struct net
fep->link = 0;
fep->full_duplex = 0;
- phy_dev->mac_managed_pm = 1;
+ phy_dev->mac_managed_pm = true;
phy_attached_info(phy_dev);
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -5018,7 +5018,7 @@ static int r8169_mdio_register(struct rt
return -EUNATCH;
}
- tp->phydev->mac_managed_pm = 1;
+ tp->phydev->mac_managed_pm = true;
phy_support_asym_pause(tp->phydev);
--- a/drivers/net/usb/asix_devices.c
+++ b/drivers/net/usb/asix_devices.c
@@ -700,7 +700,7 @@ static int ax88772_init_phy(struct usbne
}
phy_suspend(priv->phydev);
- priv->phydev->mac_managed_pm = 1;
+ priv->phydev->mac_managed_pm = true;
phy_attached_info(priv->phydev);
@@ -720,7 +720,7 @@ static int ax88772_init_phy(struct usbne
return -ENODEV;
}
- priv->phydev_int->mac_managed_pm = 1;
+ priv->phydev_int->mac_managed_pm = true;
phy_suspend(priv->phydev_int);
return 0;

View File

@ -0,0 +1,38 @@
From fd149c4ab09b01136c7e80db020eed59a3385d24 Mon Sep 17 00:00:00 2001
From: Juhee Kang <claudiajkang@gmail.com>
Date: Wed, 30 Nov 2022 01:12:44 +0900
Subject: [PATCH 3/4] r8169: use tp_to_dev instead of open code
The open code is defined as a helper function(tp_to_dev) on r8169_main.c,
which the open code is &tp->pci_dev->dev. The helper function was added
in commit 1e1205b7d3e9 ("r8169: add helper tp_to_dev"). And then later,
commit f1e911d5d0df ("r8169: add basic phylib support") added
r8169_phylink_handler function but it didn't use the helper function.
Thus, tp_to_dev() replaces the open code. This patch doesn't change logic.
Signed-off-by: Juhee Kang <claudiajkang@gmail.com>
Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://lore.kernel.org/r/20221129161244.5356-1-claudiajkang@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
drivers/net/ethernet/realtek/r8169_main.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -4559,12 +4559,13 @@ static int rtl8169_poll(struct napi_stru
static void r8169_phylink_handler(struct net_device *ndev)
{
struct rtl8169_private *tp = netdev_priv(ndev);
+ struct device *d = tp_to_dev(tp);
if (netif_carrier_ok(ndev)) {
rtl_link_chg_patch(tp);
- pm_request_resume(&tp->pci_dev->dev);
+ pm_request_resume(d);
} else {
- pm_runtime_idle(&tp->pci_dev->dev);
+ pm_runtime_idle(d);
}
phy_print_status(tp->phydev);

View File

@ -0,0 +1,33 @@
From 74ec605a11b7ecf68036c3f086f684bbe7381353 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 30 Nov 2022 23:30:15 +0100
Subject: [PATCH 4/4] r8169: enable GRO software interrupt coalescing per
default
There are reports about r8169 not reaching full line speed on certain
systems (e.g. SBC's) with a 2.5Gbps link.
There was a time when hardware interrupt coalescing was enabled per
default, but this was changed due to ASPM-related issues on few systems.
So let's use software interrupt coalescing instead and enable it
using new function netdev_sw_irq_coalesce_default_on().
Even with these conservative settings interrupt load on my 1Gbps test
system reduced significantly.
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/realtek/r8169_main.c | 2 ++
1 file changed, 2 insertions(+)
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -5283,6 +5283,8 @@ static int rtl_init_one(struct pci_dev *
dev->hw_features |= NETIF_F_RXALL;
dev->hw_features |= NETIF_F_RXFCS;
+ netdev_sw_irq_coalesce_default_on(dev);
+
/* configure chip for default features */
rtl8169_set_features(dev, dev->features);

View File

@ -0,0 +1,65 @@
From 63db0cb35e1cb3b3c134906d1062f65513fdda2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 4 Oct 2022 10:37:09 +0200
Subject: [PATCH] mtd: core: simplify (a bit) code find partition-matching
dynamic OF node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
1. Don't hardcode "partition-" string twice
2. Use simpler logic & use ->name to avoid of_property_read_string()
3. Use mtd_get_of_node() helper
Cc: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221004083710.27704-1-zajec5@gmail.com
---
drivers/mtd/mtdcore.c | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -551,18 +551,16 @@ static void mtd_check_of_node(struct mtd
struct device_node *partitions, *parent_dn, *mtd_dn = NULL;
const char *pname, *prefix = "partition-";
int plen, mtd_name_len, offset, prefix_len;
- struct mtd_info *parent;
bool found = false;
/* Check if MTD already has a device node */
- if (dev_of_node(&mtd->dev))
+ if (mtd_get_of_node(mtd))
return;
/* Check if a partitions node exist */
if (!mtd_is_partition(mtd))
return;
- parent = mtd->parent;
- parent_dn = of_node_get(dev_of_node(&parent->dev));
+ parent_dn = of_node_get(mtd_get_of_node(mtd->parent));
if (!parent_dn)
return;
@@ -575,15 +573,15 @@ static void mtd_check_of_node(struct mtd
/* Search if a partition is defined with the same name */
for_each_child_of_node(partitions, mtd_dn) {
- offset = 0;
-
/* Skip partition with no/wrong prefix */
- if (!of_node_name_prefix(mtd_dn, "partition-"))
+ if (!of_node_name_prefix(mtd_dn, prefix))
continue;
/* Label have priority. Check that first */
- if (of_property_read_string(mtd_dn, "label", &pname)) {
- of_property_read_string(mtd_dn, "name", &pname);
+ if (!of_property_read_string(mtd_dn, "label", &pname)) {
+ offset = 0;
+ } else {
+ pname = mtd_dn->name;
offset = prefix_len;
}

View File

@ -0,0 +1,84 @@
From ddb8cefb7af288950447ca6eeeafb09977dab56f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 4 Oct 2022 10:37:10 +0200
Subject: [PATCH] mtd: core: try to find OF node for every MTD partition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
So far this feature was limited to the top-level "nvmem-cells" node.
There are multiple parsers creating partitions and subpartitions
dynamically. Extend that code to handle them too.
This allows finding partition-* node for every MTD (sub)partition.
Random example:
partitions {
compatible = "brcm,bcm947xx-cfe-partitions";
partition-firmware {
compatible = "brcm,trx";
partition-loader {
};
};
};
Cc: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221004083710.27704-2-zajec5@gmail.com
---
drivers/mtd/mtdcore.c | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -551,20 +551,22 @@ static void mtd_check_of_node(struct mtd
struct device_node *partitions, *parent_dn, *mtd_dn = NULL;
const char *pname, *prefix = "partition-";
int plen, mtd_name_len, offset, prefix_len;
- bool found = false;
/* Check if MTD already has a device node */
if (mtd_get_of_node(mtd))
return;
- /* Check if a partitions node exist */
if (!mtd_is_partition(mtd))
return;
+
parent_dn = of_node_get(mtd_get_of_node(mtd->parent));
if (!parent_dn)
return;
- partitions = of_get_child_by_name(parent_dn, "partitions");
+ if (mtd_is_partition(mtd->parent))
+ partitions = of_node_get(parent_dn);
+ else
+ partitions = of_get_child_by_name(parent_dn, "partitions");
if (!partitions)
goto exit_parent;
@@ -588,19 +590,11 @@ static void mtd_check_of_node(struct mtd
plen = strlen(pname) - offset;
if (plen == mtd_name_len &&
!strncmp(mtd->name, pname + offset, plen)) {
- found = true;
+ mtd_set_of_node(mtd, mtd_dn);
break;
}
}
- if (!found)
- goto exit_partitions;
-
- /* Set of_node only for nvmem */
- if (of_device_is_compatible(mtd_dn, "nvmem-cells"))
- mtd_set_of_node(mtd, mtd_dn);
-
-exit_partitions:
of_node_put(partitions);
exit_parent:
of_node_put(parent_dn);

View File

@ -0,0 +1,229 @@
From aec4d5f5ffd0f0092bd9dc21ea90e0bc237d4b74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Sat, 15 Oct 2022 11:29:50 +0200
Subject: [PATCH] mtd: parsers: add TP-Link SafeLoader partitions table parser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This parser deals with most TP-Link home routers. It reads info about
partitions and registers them in the MTD subsystem.
Example from TP-Link Archer C5 V2:
spi-nor spi0.0: s25fl128s1 (16384 Kbytes)
15 tplink-safeloader partitions found on MTD device spi0.0
Creating 15 MTD partitions on "spi0.0":
0x000000000000-0x000000040000 : "fs-uboot"
0x000000040000-0x000000440000 : "os-image"
0x000000440000-0x000000e40000 : "rootfs"
0x000000e40000-0x000000e40200 : "default-mac"
0x000000e40200-0x000000e40400 : "pin"
0x000000e40400-0x000000e40600 : "product-info"
0x000000e50000-0x000000e60000 : "partition-table"
0x000000e60000-0x000000e60200 : "soft-version"
0x000000e61000-0x000000e70000 : "support-list"
0x000000e70000-0x000000e80000 : "profile"
0x000000e80000-0x000000e90000 : "default-config"
0x000000e90000-0x000000ee0000 : "user-config"
0x000000ee0000-0x000000fe0000 : "log"
0x000000fe0000-0x000000ff0000 : "radio_bk"
0x000000ff0000-0x000001000000 : "radio"
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221015092950.27467-2-zajec5@gmail.com
---
drivers/mtd/parsers/Kconfig | 15 +++
drivers/mtd/parsers/Makefile | 1 +
drivers/mtd/parsers/tplink_safeloader.c | 150 ++++++++++++++++++++++++
3 files changed, 166 insertions(+)
create mode 100644 drivers/mtd/parsers/tplink_safeloader.c
--- a/drivers/mtd/parsers/Kconfig
+++ b/drivers/mtd/parsers/Kconfig
@@ -123,6 +123,21 @@ config MTD_AFS_PARTS
for your particular device. It won't happen automatically. The
'physmap' map driver (CONFIG_MTD_PHYSMAP) does this, for example.
+config MTD_PARSER_TPLINK_SAFELOADER
+ tristate "TP-Link Safeloader partitions parser"
+ depends on MTD && (ARCH_BCM_5301X || ATH79 || SOC_MT7620 || SOC_MT7621 || COMPILE_TEST)
+ help
+ TP-Link home routers use flash partitions to store various data. Info
+ about flash space layout is stored in a partitions table using a
+ custom ASCII-based format.
+
+ That format was first found in devices with SafeLoader bootloader and
+ was named after it. Later it was adapted to CFE and U-Boot
+ bootloaders.
+
+ This driver reads partitions table, parses it and creates MTD
+ partitions.
+
config MTD_PARSER_TRX
tristate "Parser for TRX format partitions"
depends on MTD && (BCM47XX || ARCH_BCM_5301X || ARCH_MEDIATEK || RALINK || COMPILE_TEST)
--- a/drivers/mtd/parsers/Makefile
+++ b/drivers/mtd/parsers/Makefile
@@ -10,6 +10,7 @@ ofpart-$(CONFIG_MTD_OF_PARTS_BCM4908) +=
ofpart-$(CONFIG_MTD_OF_PARTS_LINKSYS_NS)+= ofpart_linksys_ns.o
obj-$(CONFIG_MTD_PARSER_IMAGETAG) += parser_imagetag.o
obj-$(CONFIG_MTD_AFS_PARTS) += afs.o
+obj-$(CONFIG_MTD_PARSER_TPLINK_SAFELOADER) += tplink_safeloader.o
obj-$(CONFIG_MTD_PARSER_TRX) += parser_trx.o
obj-$(CONFIG_MTD_SERCOMM_PARTS) += scpart.o
obj-$(CONFIG_MTD_SHARPSL_PARTS) += sharpslpart.o
--- /dev/null
+++ b/drivers/mtd/parsers/tplink_safeloader.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2022 Rafał Miłecki <rafal@milecki.pl>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+
+/* Byte offset into the table buffer at which the parser starts scanning for entries */
+#define TPLINK_SAFELOADER_DATA_OFFSET 4
+/* Capacity of the partitions array; scanning stops once this many entries are parsed */
+#define TPLINK_SAFELOADER_MAX_PARTS 32
+
+/*
+ * Header read from flash at the partitions table offset, immediately ahead of
+ * the table data.
+ */
+struct safeloader_cmn_header {
+ __be32 size; /* big-endian byte count of the data that follows the header */
+ uint32_t unused; /* never read by this parser */
+} __packed;
+
+/*
+ * Read the SafeLoader partitions table from flash into a freshly allocated,
+ * NUL-terminated buffer.
+ *
+ * The table location comes from the "partitions-table-offset" property of
+ * the MTD's own OF node (when @mtd is a partition) or of its "partitions"
+ * child node. At that offset sits a struct safeloader_cmn_header whose
+ * big-endian size field gives the length of the data that follows it.
+ *
+ * Returns the buffer (to be released with kfree()) or NULL on any failure.
+ */
+static void *mtd_parser_tplink_safeloader_read_table(struct mtd_info *mtd)
+{
+	struct safeloader_cmn_header hdr;
+	struct device_node *np;
+	size_t bytes_read;
+	size_t size;
+	u32 offset;
+	char *buf;
+	int err;
+
+	np = mtd_get_of_node(mtd);
+	if (mtd_is_partition(mtd))
+		of_node_get(np);
+	else
+		np = of_get_child_by_name(np, "partitions");
+
+	/*
+	 * offset is a u32 on purpose: of_property_read_u32() stores exactly
+	 * 32 bits. Reading into a wider variable through a pointer cast leaves
+	 * the other half uninitialized on 64-bit builds (and puts the value in
+	 * the wrong half on big-endian ones).
+	 */
+	if (of_property_read_u32(np, "partitions-table-offset", &offset)) {
+		pr_err("Failed to get partitions table offset\n");
+		goto err_put;
+	}
+
+	err = mtd_read(mtd, offset, sizeof(hdr), &bytes_read, (uint8_t *)&hdr);
+	if (err && !mtd_is_bitflip(err)) {
+		pr_err("Failed to read from %s at 0x%x\n", mtd->name, offset);
+		goto err_put;
+	}
+
+	size = be32_to_cpu(hdr.size);
+
+	/* One extra byte so the table can be NUL-terminated and parsed as a string. */
+	buf = kmalloc(size + 1, GFP_KERNEL);
+	if (!buf)
+		goto err_put;
+
+	err = mtd_read(mtd, offset + sizeof(hdr), size, &bytes_read, buf);
+	if (err && !mtd_is_bitflip(err)) {
+		pr_err("Failed to read from %s at 0x%zx\n", mtd->name, offset + sizeof(hdr));
+		goto err_kfree;
+	}
+
+	buf[size] = '\0';
+
+	of_node_put(np);
+
+	return buf;
+
+err_kfree:
+	kfree(buf);
+err_put:
+	of_node_put(np);
+	return NULL;
+}
+
+/*
+ * Parse callback: read the SafeLoader partitions table and turn each
+ * "partition <name> base 0x<offset> size 0x<size>" entry into an
+ * mtd_partition.
+ *
+ * @mtd:    device to read the table from
+ * @pparts: on success, receives the allocated partitions array
+ * @data:   unused
+ *
+ * Returns the number of partitions parsed, -ENOMEM on allocation failure or
+ * -ENOENT when the table could not be read. On success the array and the
+ * names it points to are owned by the caller; on error everything allocated
+ * here is released before returning.
+ */
+static int mtd_parser_tplink_safeloader_parse(struct mtd_info *mtd,
+					      const struct mtd_partition **pparts,
+					      struct mtd_part_parser_data *data)
+{
+	struct mtd_partition *parts;
+	char name[65];
+	size_t offset;
+	size_t bytes;
+	char *buf;
+	int idx;
+	int err;
+
+	parts = kcalloc(TPLINK_SAFELOADER_MAX_PARTS, sizeof(*parts), GFP_KERNEL);
+	if (!parts) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	buf = mtd_parser_tplink_safeloader_read_table(mtd);
+	if (!buf) {
+		err = -ENOENT;
+		goto err_free_parts;
+	}
+
+	/*
+	 * %zn records how many characters the entry consumed; the extra +1
+	 * steps over the character that follows it (presumably the line
+	 * terminator) before the next entry is scanned.
+	 */
+	for (idx = 0, offset = TPLINK_SAFELOADER_DATA_OFFSET;
+	     idx < TPLINK_SAFELOADER_MAX_PARTS &&
+	     sscanf(buf + offset, "partition %64s base 0x%llx size 0x%llx%zn\n",
+		    name, &parts[idx].offset, &parts[idx].size, &bytes) == 3;
+	     idx++, offset += bytes + 1) {
+		parts[idx].name = kstrdup(name, GFP_KERNEL);
+		if (!parts[idx].name) {
+			err = -ENOMEM;
+			goto err_free;
+		}
+	}
+
+	if (idx == TPLINK_SAFELOADER_MAX_PARTS)
+		pr_warn("Reached maximum number of partitions!\n");
+
+	kfree(buf);
+
+	*pparts = parts;
+
+	return idx;
+
+err_free:
+	/* Entry idx failed to allocate; release the names of entries before it. */
+	for (idx -= 1; idx >= 0; idx--)
+		kfree(parts[idx].name);
+	kfree(buf);
+err_free_parts:
+	kfree(parts);
+err_out:
+	return err;
+}
+
+/*
+ * Cleanup callback: free each partition name in @pparts, then the array
+ * itself.
+ */
+static void mtd_parser_tplink_safeloader_cleanup(const struct mtd_partition *pparts,
+						 int nr_parts)
+{
+	const struct mtd_partition *part = pparts;
+	int remaining;
+
+	for (remaining = nr_parts; remaining > 0; remaining--, part++)
+		kfree(part->name);
+
+	kfree(pparts);
+}
+
+/* Device tree compatible strings this parser is matched against. */
+static const struct of_device_id mtd_parser_tplink_safeloader_of_match_table[] = {
+ { .compatible = "tplink,safeloader-partitions" },
+ {}, /* sentinel */
+};
+MODULE_DEVICE_TABLE(of, mtd_parser_tplink_safeloader_of_match_table);
+
+/*
+ * Parser descriptor tying the parse and cleanup callbacks to the
+ * "tplink-safeloader" name and the OF match table above.
+ */
+static struct mtd_part_parser mtd_parser_tplink_safeloader = {
+ .parse_fn = mtd_parser_tplink_safeloader_parse,
+ .cleanup = mtd_parser_tplink_safeloader_cleanup,
+ .name = "tplink-safeloader",
+ .of_match_table = mtd_parser_tplink_safeloader_of_match_table,
+};
+module_mtd_part_parser(mtd_parser_tplink_safeloader);
+
+MODULE_LICENSE("GPL");

View File

@ -1,11 +0,0 @@
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -315,7 +315,7 @@ config NET_IPVTI
on top.
config NET_UDP_TUNNEL
- tristate
+ tristate "IP: UDP tunneling support"
select NET_IP_TUNNEL
default n

View File

@ -1,23 +0,0 @@
From 8c817e33be829c7249c2cfd59ff48ad5fac6a31d Mon Sep 17 00:00:00 2001
From: Sungbo Eo <mans0n@gorani.run>
Date: Fri, 7 Jul 2017 17:09:21 +0200
Subject: [PATCH] kconfig: solidify SATA_PMP config
SATA_PMP option in kernel config file disappears for every kernel_oldconfig refresh.
To prevent this, SATA_HOST is now selected automatically when SATA_PMP is enabled.
This patch can be dropped if SATA_MV is ever re-added into the config.
---
drivers/ata/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@ -112,7 +112,7 @@ config SATA_ZPODD
config SATA_PMP
bool "SATA Port Multiplier support"
- depends on SATA_HOST
+ select SATA_HOST
default y
help
This option adds support for SATA Port Multipliers

View File

@ -1,22 +0,0 @@
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1954,7 +1954,7 @@ config PADATA
bool
config ASN1
- tristate
+ tristate "ASN1"
help
Build a simple ASN.1 grammar compiler that produces a bytecode output
that can be interpreted by the ASN.1 stream decoder and used to
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -627,7 +627,7 @@ config LIBFDT
bool
config OID_REGISTRY
- tristate
+ tristate "OID"
help
Enable fast lookup object identifier registry.

View File

@ -1,15 +0,0 @@
This makes it possible to select CONFIG_CRYPTO_LIB_ARC4 directly. We
need this to be able to compile this into the kernel and make use of it
from backports.
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -6,7 +6,7 @@ config CRYPTO_LIB_AES
tristate
config CRYPTO_LIB_ARC4
- tristate
+ tristate "ARC4 cipher library"
config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
bool

View File

@ -1,38 +0,0 @@
From: John Crispin <john@phrozen.org>
Subject: hack: kernel: add generic image_cmdline hack to MIPS targets
lede-commit: d59f5b3a987a48508257a0ddbaeadc7909f9f976
Signed-off-by: Gabor Juhos <juhosg@openwrt.org>
---
arch/mips/Kconfig | 4 ++++
arch/mips/kernel/head.S | 6 ++++++
2 files changed, 10 insertions(+)
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1112,6 +1112,10 @@ config MIPS_MSC
config SYNC_R4K
bool
+config IMAGE_CMDLINE_HACK
+ bool "OpenWrt specific image command line hack"
+ default n
+
config NO_IOPORT_MAP
def_bool n
--- a/arch/mips/kernel/head.S
+++ b/arch/mips/kernel/head.S
@@ -79,6 +79,12 @@ FEXPORT(__kernel_entry)
j kernel_entry
#endif /* CONFIG_BOOT_RAW */
+#ifdef CONFIG_IMAGE_CMDLINE_HACK
+ .ascii "CMDLINE:"
+EXPORT(__image_cmdline)
+ .fill 0x400
+#endif /* CONFIG_IMAGE_CMDLINE_HACK */
+
__REF
NESTED(kernel_entry, 16, sp) # kernel entry point

View File

@ -1,23 +0,0 @@
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -61,6 +61,10 @@ config MTD_NAND_ECC_MEDIATEK
help
This enables support for the hardware ECC engine from Mediatek.
+config MTD_NAND_MTK_BMT
+ bool "Support MediaTek NAND Bad-block Management Table"
+ default n
+
endmenu
endmenu
--- a/drivers/mtd/nand/Makefile
+++ b/drivers/mtd/nand/Makefile
@@ -3,6 +3,7 @@
nandcore-objs := core.o bbt.o
obj-$(CONFIG_MTD_NAND_CORE) += nandcore.o
obj-$(CONFIG_MTD_NAND_ECC_MEDIATEK) += ecc-mtk.o
+obj-$(CONFIG_MTD_NAND_MTK_BMT) += mtk_bmt.o mtk_bmt_v2.o mtk_bmt_bbt.o mtk_bmt_nmbm.o
obj-y += onenand/
obj-y += raw/

File diff suppressed because it is too large Load Diff

View File

@ -1,41 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Jul 2017 17:18:54 +0200
Subject: bridge: only accept EAP locally
When bridging, do not forward EAP frames to other ports, only deliver
them locally, regardless of the state.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
[add disable_eap_hack sysfs attribute]
Signed-off-by: Etienne Champetier <champetier.etienne@gmail.com>
---
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -133,10 +133,14 @@ int br_handle_frame_finish(struct net *n
}
}
+ BR_INPUT_SKB_CB(skb)->brdev = br->dev;
+
+ if (skb->protocol == htons(ETH_P_PAE) && !br->disable_eap_hack)
+ return br_pass_frame_up(skb);
+
if (state == BR_STATE_LEARNING)
goto drop;
- BR_INPUT_SKB_CB(skb)->brdev = br->dev;
BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);
if (IS_ENABLED(CONFIG_INET) &&
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -482,6 +482,8 @@ struct net_bridge {
u16 group_fwd_mask;
u16 group_fwd_mask_required;
+ bool disable_eap_hack;
+
/* STP */
bridge_id designated_root;
bridge_id bridge_id;

View File

@ -1,100 +0,0 @@
From 1d418f7e88035ed7a94073f6354246c66e9193e9 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Jul 2017 17:22:58 +0200
Subject: fq_codel: switch default qdisc from pfifo_fast to fq_codel and remove pfifo_fast
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
include/net/sch_generic.h | 3 ++-
net/sched/Kconfig | 3 ++-
net/sched/sch_api.c | 2 +-
net/sched/sch_fq_codel.c | 3 ++-
net/sched/sch_generic.c | 4 ++--
5 files changed, 9 insertions(+), 6 deletions(-)
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -585,12 +585,13 @@ extern struct Qdisc_ops noop_qdisc_ops;
extern struct Qdisc_ops pfifo_fast_ops;
extern struct Qdisc_ops mq_qdisc_ops;
extern struct Qdisc_ops noqueue_qdisc_ops;
+extern struct Qdisc_ops fq_codel_qdisc_ops;
extern const struct Qdisc_ops *default_qdisc_ops;
static inline const struct Qdisc_ops *
get_default_qdisc_ops(const struct net_device *dev, int ntx)
{
return ntx < dev->real_num_tx_queues ?
- default_qdisc_ops : &pfifo_fast_ops;
+ default_qdisc_ops : &fq_codel_qdisc_ops;
}
struct Qdisc_class_common {
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -4,8 +4,9 @@
#
menuconfig NET_SCHED
- bool "QoS and/or fair queueing"
+ def_bool y
select NET_SCH_FIFO
+ select NET_SCH_FQ_CODEL
help
When the kernel has several packets to send out over a network
device, it has to decide which ones to send first, which ones to
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -2277,7 +2277,7 @@ static int __init pktsched_init(void)
return err;
}
- register_qdisc(&pfifo_fast_ops);
+ register_qdisc(&fq_codel_qdisc_ops);
register_qdisc(&pfifo_qdisc_ops);
register_qdisc(&bfifo_qdisc_ops);
register_qdisc(&pfifo_head_drop_qdisc_ops);
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -719,7 +719,7 @@ static const struct Qdisc_class_ops fq_c
.walk = fq_codel_walk,
};
-static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
+struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
.cl_ops = &fq_codel_class_ops,
.id = "fq_codel",
.priv_size = sizeof(struct fq_codel_sched_data),
@@ -734,6 +734,7 @@ static struct Qdisc_ops fq_codel_qdisc_o
.dump_stats = fq_codel_dump_stats,
.owner = THIS_MODULE,
};
+EXPORT_SYMBOL(fq_codel_qdisc_ops);
static int __init fq_codel_module_init(void)
{
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -32,7 +32,7 @@
#include <net/xfrm.h>
/* Qdisc to use by default */
-const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
+const struct Qdisc_ops *default_qdisc_ops = &fq_codel_qdisc_ops;
EXPORT_SYMBOL(default_qdisc_ops);
static void qdisc_maybe_clear_missed(struct Qdisc *q,
@@ -1142,12 +1142,12 @@ static void attach_one_default_qdisc(str
void *_unused)
{
struct Qdisc *qdisc;
- const struct Qdisc_ops *ops = default_qdisc_ops;
+ const struct Qdisc_ops *ops = &fq_codel_qdisc_ops;
if (dev->priv_flags & IFF_NO_QUEUE)
ops = &noqueue_qdisc_ops;
else if(dev->type == ARPHRD_CAN)
- ops = &pfifo_fast_ops;
+ ops = &fq_codel_qdisc_ops;
qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
if (!qdisc)

View File

@ -1,12 +0,0 @@
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -1179,6 +1179,9 @@ int __init early_init_dt_scan_chosen(cha
p = of_get_flat_dt_prop(node, "bootargs", &l);
if (p != NULL && l > 0)
strlcpy(cmdline, p, min(l, COMMAND_LINE_SIZE));
+ p = of_get_flat_dt_prop(node, "bootargs-append", &l);
+ if (p != NULL && l > 0)
+ strlcat(cmdline, p, min_t(int, strlen(cmdline) + (int)l, COMMAND_LINE_SIZE));
/*
* CONFIG_CMDLINE is meant to be a default in case nothing else

View File

@ -14,7 +14,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -164,6 +164,7 @@ extern void cleanup_module(void);
@@ -163,6 +163,7 @@ extern void cleanup_module(void);
/* Generic info of form tag = "info" */
#define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info)
@ -22,7 +22,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
/* For userspace: you can also call me... */
#define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias)
@@ -233,12 +234,12 @@ extern void cleanup_module(void);
@@ -232,12 +233,12 @@ extern void cleanup_module(void);
* Author(s), use "Name <email>" or just "Name", for multiple
* authors use multiple MODULE_AUTHOR() statements/lines.
*/
@ -38,7 +38,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
/* Creates an alias so file2alias.c can find device table. */
#define MODULE_DEVICE_TABLE(type, name) \
extern typeof(name) __mod_##type##__##name##_device_table \
@@ -265,7 +266,9 @@ extern typeof(name) __mod_##type##__##na
@@ -264,7 +265,9 @@ extern typeof(name) __mod_##type##__##na
*/
#if defined(MODULE) || !defined(CONFIG_SYSFS)
@ -49,7 +49,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
#else
#define MODULE_VERSION(_version) \
MODULE_INFO(version, _version); \
@@ -288,7 +291,7 @@ extern typeof(name) __mod_##type##__##na
@@ -287,7 +290,7 @@ extern typeof(name) __mod_##type##__##na
/* Optional firmware file (or files) needed by the module
* format is simply firmware file name. Multiple firmware
* files require multiple MODULE_FIRMWARE() specifiers */
@ -88,9 +88,9 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -286,6 +286,13 @@ config UNUSED_KSYMS_WHITELIST
one per line. The path can be absolute, or relative to the kernel
source tree.
@@ -290,4 +290,11 @@ config MODULES_TREE_LOOKUP
def_bool y
depends on PERF_EVENTS || TRACING || CFI_CLANG
+config MODULE_STRIPPED
+ bool "Reduce module size"
@ -99,12 +99,26 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
+ Remove module parameter descriptions, author info, version, aliases,
+ device tables, etc.
+
config MODULES_TREE_LOOKUP
def_bool y
depends on PERF_EVENTS || TRACING || CFI_CLANG
endif # MODULES
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1954,9 +1954,11 @@ static int setup_load_info(struct load_i
@@ -988,6 +988,7 @@ size_t modinfo_attrs_count = ARRAY_SIZE(
static const char vermagic[] = VERMAGIC_STRING;
+#if defined(CONFIG_MODVERSIONS) || !defined(CONFIG_MODULE_STRIPPED)
int try_to_force_load(struct module *mod, const char *reason)
{
#ifdef CONFIG_MODULE_FORCE_LOAD
@@ -999,6 +1000,7 @@ int try_to_force_load(struct module *mod
return -ENOEXEC;
#endif
}
+#endif
static char *get_modinfo(const struct load_info *info, const char *tag);
static char *get_next_modinfo(const struct load_info *info, const char *tag,
@@ -1950,9 +1952,11 @@ static int setup_load_info(struct load_i
static int check_modinfo(struct module *mod, struct load_info *info, int flags)
{
@ -117,7 +131,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
if (flags & MODULE_INIT_IGNORE_VERMAGIC)
modmagic = NULL;
@@ -1977,6 +1979,7 @@ static int check_modinfo(struct module *
@@ -1973,6 +1977,7 @@ static int check_modinfo(struct module *
mod->name);
add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
}
@ -148,7 +162,29 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
buf_printf(b, "\n");
buf_printf(b, "__visible struct module __this_module\n");
buf_printf(b, "__section(\".gnu.linkonce.this_module\") = {\n");
@@ -2101,11 +2105,13 @@ static void add_depends(struct buffer *b
@@ -1995,8 +1999,10 @@ static void add_header(struct buffer *b,
buf_printf(b, "\t.arch = MODULE_ARCH_INIT,\n");
buf_printf(b, "};\n");
+#ifndef CONFIG_MODULE_STRIPPED
if (!external_module)
buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
+#endif
buf_printf(b,
"\n"
@@ -2004,8 +2010,10 @@ static void add_header(struct buffer *b,
"MODULE_INFO(retpoline, \"Y\");\n"
"#endif\n");
+#ifndef CONFIG_MODULE_STRIPPED
if (strstarts(mod->name, "drivers/staging"))
buf_printf(b, "\nMODULE_INFO(staging, \"Y\");\n");
+#endif
if (strstarts(mod->name, "tools/testing"))
buf_printf(b, "\nMODULE_INFO(test, \"Y\");\n");
@@ -2101,11 +2109,13 @@ static void add_depends(struct buffer *b
static void add_srcversion(struct buffer *b, struct module *mod)
{
@ -162,7 +198,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
}
static void write_buf(struct buffer *b, const char *fname)
@@ -2191,7 +2197,9 @@ static void write_mod_c_file(struct modu
@@ -2191,7 +2201,9 @@ static void write_mod_c_file(struct modu
add_exported_symbols(&buf, mod);
add_versions(&buf, mod);
add_depends(&buf, mod);

View File

@ -1,3 +1,12 @@
From 300d26562ce4dc427154cb247beb75db4b1f0774 Mon Sep 17 00:00:00 2001
From: OpenWrt community <openwrt-devel@lists.openwrt.org>
Date: Wed, 13 Jul 2022 13:29:57 +0200
Subject: [PATCH] scripts/Kconfig: Kconfig exit
---
scripts/kconfig/conf.c | 2 ++
1 file changed, 2 insertions(+)
--- a/scripts/kconfig/conf.c
+++ b/scripts/kconfig/conf.c
@@ -432,6 +432,8 @@ static int conf_sym(struct menu *menu)

View File

@ -81,3 +81,13 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
struct page;
struct kmem_cache;
--- a/tools/perf/pmu-events/jevents.py
+++ b/tools/perf/pmu-events/jevents.py
@@ -684,6 +684,7 @@ def main() -> None:
#include "util/header.h"
#include "util/pmu.h"
#include <string.h>
+#include <strings.h>
#include <stddef.h>
struct compact_pmu_event {

View File

@ -12,7 +12,7 @@ Signed-off-by: Gabor Juhos <juhosg@openwrt.org>
---
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -121,6 +121,7 @@ config ARM
@@ -122,6 +122,7 @@ config ARM
select HAVE_UID16
select HAVE_VIRT_CPU_ACCOUNTING_GEN
select IRQ_FORCED_THREADING

View File

@ -30,7 +30,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
/* Align . to a 8 byte boundary equals to maximum function alignment. */
#define ALIGN_FUNCTION() . = ALIGN(8)
@@ -479,14 +489,14 @@
@@ -512,14 +522,14 @@
/* Kernel symbol table: Normal symbols */ \
__ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \
__start___ksymtab = .; \
@ -47,7 +47,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
__stop___ksymtab_gpl = .; \
} \
\
@@ -506,7 +516,7 @@
@@ -539,7 +549,7 @@
\
/* Kernel symbol table: strings */ \
__ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) { \
@ -56,10 +56,10 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
} \
\
/* __*init sections */ \
@@ -1023,6 +1033,8 @@
@@ -1043,6 +1053,8 @@
#define COMMON_DISCARDS \
SANITIZER_DISCARDS \
PATCHABLE_DISCARDS \
+ SYMTAB_DISCARD \
+ SYMTAB_DISCARD_GPL \
*(.discard) \
@ -89,9 +89,33 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
"__kstrtab_" #sym ": \n" \
" .asciz \"" #sym "\" \n" \
"__kstrtabns_" #sym ": \n" \
--- a/include/asm-generic/export.h
+++ b/include/asm-generic/export.h
@@ -31,6 +31,12 @@
#endif
.endm
+#ifdef MODULE
+#define __EXPORT_SUFFIX(name)
+#else
+#define __EXPORT_SUFFIX(name) + #name
+#endif
+
/*
* note on .section use: we specify progbits since usage of the "M" (SHF_MERGE)
* section flag requires it. Use '%progbits' instead of '@progbits' since the
@@ -44,7 +50,7 @@
__ksymtab_\name:
__put \val, __kstrtab_\name
.previous
- .section __ksymtab_strings,"aMS",%progbits,1
+ .section __ksymtab_strings __EXPORT_SUFFIX(name),"aMS",%progbits,1
__kstrtab_\name:
.asciz "\name"
.previous
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -328,7 +328,7 @@ targets += $(real-dtb-y) $(lib-y) $(alwa
@@ -388,7 +388,7 @@ targets += $(real-dtb-y) $(lib-y) $(alwa
# Linker scripts preprocessor (.lds.S -> .lds)
# ---------------------------------------------------------------------------
quiet_cmd_cpp_lds_S = LDS $@

View File

@ -23,12 +23,16 @@ Signed-off-by: Imre Kaloz <kaloz@openwrt.org>
{ {0x02, 0x21}, "lz4", unlz4 },
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -421,7 +421,7 @@ quiet_cmd_bzip2_with_size = BZIP2 $@
@@ -443,10 +443,10 @@ quiet_cmd_bzip2_with_size = BZIP2 $@
# ---------------------------------------------------------------------------
quiet_cmd_lzma = LZMA $@
- cmd_lzma = cat $(real-prereqs) | $(LZMA) -9 > $@
+ cmd_lzma = { cat $(real-prereqs) | $(LZMA) e -d20 -lc1 -lp2 -pb2 -eos -si -so; $(size_append); } > $@
+ cmd_lzma = cat $(real-prereqs) | $(LZMA) e -d20 -lc1 -lp2 -pb2 -eos -si -so > $@
quiet_cmd_lzma_with_size = LZMA $@
cmd_lzma_with_size = { cat $(real-prereqs) | $(LZMA) -9; $(size_append); } > $@
- cmd_lzma_with_size = { cat $(real-prereqs) | $(LZMA) -9; $(size_append); } > $@
+ cmd_lzma_with_size = { cat $(real-prereqs) | $(LZMA) e -d20 -lc1 -lp2 -pb2 -eos -si -so; $(size_append); } > $@
quiet_cmd_lzo = LZO $@
cmd_lzo = cat $(real-prereqs) | $(KLZOP) -9 > $@

View File

@ -92,7 +92,7 @@ Signed-off-by: John Crispin <john@phrozen.org>
bool
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -456,16 +456,16 @@ config BCH_CONST_T
@@ -457,16 +457,16 @@ config BCH_CONST_T
# Textsearch support is select'ed if needed
#
config TEXTSEARCH
@ -147,7 +147,7 @@ Signed-off-by: John Crispin <john@phrozen.org>
config CFG80211
tristate "cfg80211 - wireless configuration API"
@@ -204,7 +204,7 @@ config CFG80211_WEXT_EXPORT
@@ -208,7 +208,7 @@ config CFG80211_WEXT_EXPORT
endif # CFG80211
config LIB80211
@ -156,7 +156,7 @@ Signed-off-by: John Crispin <john@phrozen.org>
default n
help
This options enables a library of common routines used
@@ -213,17 +213,17 @@ config LIB80211
@@ -217,17 +217,17 @@ config LIB80211
Drivers should select this themselves if needed.
config LIB80211_CRYPT_WEP

View File

@ -0,0 +1,32 @@
From dcd966fa7ca63f38cf7147e1184d13d66e2ca340 Mon Sep 17 00:00:00 2001
From: OpenWrt community <openwrt-devel@lists.openwrt.org>
Date: Wed, 13 Jul 2022 13:33:30 +0200
Subject: [PATCH] Kconfig: add tristate for OID and ASN1 string
---
init/Kconfig | 2 +-
lib/Kconfig | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2003,7 +2003,7 @@ config PADATA
bool
config ASN1
- tristate
+ tristate "ASN1"
help
Build a simple ASN.1 grammar compiler that produces a bytecode output
that can be interpreted by the ASN.1 stream decoder and used to
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -637,7 +637,7 @@ config LIBFDT
bool
config OID_REGISTRY
- tristate
+ tristate "OID"
help
Enable fast lookup object identifier registry.

View File

@ -125,7 +125,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
#include <linux/mutex.h>
#include <linux/err.h>
#include <linux/property.h>
@@ -3384,3 +3385,5 @@ static int __init regmap_initcall(void)
@@ -3505,3 +3506,5 @@ static int __init regmap_initcall(void)
return 0;
}
postcore_initcall(regmap_initcall);

View File

@ -0,0 +1,24 @@
From 241e5d3f7b0dd3c01f8c7fa83cbc9a3882286d53 Mon Sep 17 00:00:00 2001
From: OpenWrt community <openwrt-devel@lists.openwrt.org>
Date: Wed, 13 Jul 2022 13:35:18 +0200
Subject: [PATCH] lib/crypto: add tristate string for ARC4
This makes it possible to select CONFIG_CRYPTO_LIB_ARC4 directly. We
need this to be able to compile this into the kernel and make use of it
from backports.
---
lib/crypto/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -9,7 +9,7 @@ config CRYPTO_LIB_AES
tristate
config CRYPTO_LIB_ARC4
- tristate
+ tristate "ARC4 cipher library"
config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
bool

View File

@ -26,7 +26,7 @@ Signed-off-by: John Crispin <john@phrozen.org>
* @name: name of the struct -- the string is not copied internally
--- a/net/Makefile
+++ b/net/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_TIPC) += tipc/
@@ -51,7 +51,7 @@ obj-$(CONFIG_TIPC) += tipc/
obj-$(CONFIG_NETLABEL) += netlabel/
obj-$(CONFIG_IUCV) += iucv/
obj-$(CONFIG_SMC) += smc/

View File

@ -0,0 +1,112 @@
From 0bccc3722bdd88e8ae995e77ef9f7b77ee4cbdee Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Wed, 7 Apr 2021 22:45:54 +0100
Subject: [PATCH 2/2] mtd: blktrans: call add disks after mtd device
To: linux-mtd@lists.infradead.org
Cc: Vignesh Raghavendra <vigneshr@ti.com>,
Richard Weinberger <richard@nod.at>,
Miquel Raynal <miquel.raynal@bootlin.com>,
David Woodhouse <dwmw2@infradead.org>
Calling device_add_disk while holding mtd_table_mutex leads
to deadlock in case part_bits!=0 as block partition parsers
will try to open the newly created disks, trying to acquire
mutex once again.
Move device_add_disk to an additional function called after
all partitions of an MTD device have been added and locks
have been released.
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
---
drivers/mtd/mtd_blkdevs.c | 33 ++++++++++++++++++++++++++-------
drivers/mtd/mtdcore.c | 3 +++
include/linux/mtd/blktrans.h | 1 +
3 files changed, 30 insertions(+), 7 deletions(-)
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -386,19 +386,8 @@ int add_mtd_blktrans_dev(struct mtd_blkt
if (new->readonly)
set_disk_ro(gd, 1);
- ret = device_add_disk(&new->mtd->dev, gd, NULL);
- if (ret)
- goto out_cleanup_disk;
-
- if (new->disk_attributes) {
- ret = sysfs_create_group(&disk_to_dev(gd)->kobj,
- new->disk_attributes);
- WARN_ON(ret);
- }
return 0;
-out_cleanup_disk:
- put_disk(new->disk);
out_free_tag_set:
blk_mq_free_tag_set(new->tag_set);
out_kfree_tag_set:
@@ -408,6 +397,35 @@ out_list_del:
return ret;
}
+/*
+ * Walk every mtd_blktrans_ops on blktrans_majors and, for each of its
+ * devices whose disk is not live yet, add the disk and create the optional
+ * sysfs attribute group.
+ *
+ * The walk stops at the first device_add_disk() failure: that device's disk
+ * reference is dropped and the remaining devices are left unregistered.
+ * Nothing is reported to the caller.
+ *
+ * NOTE(review): the failed entry stays on tr->devs with dev->disk still set
+ * after put_disk() - confirm that later users of the list tolerate this.
+ */
+void register_mtd_blktrans_devs(void)
+{
+ struct mtd_blktrans_ops *tr;
+ struct mtd_blktrans_dev *dev, *next;
+ int ret;
+
+ list_for_each_entry(tr, &blktrans_majors, list) {
+ list_for_each_entry_safe(dev, next, &tr->devs, list) {
+ /* Already added by an earlier call; nothing to do. */
+ if (disk_live(dev->disk))
+ continue;
+
+ ret = device_add_disk(&dev->mtd->dev, dev->disk, NULL);
+ if (ret)
+ goto out_cleanup_disk;
+
+ if (dev->disk_attributes) {
+ ret = sysfs_create_group(&disk_to_dev(dev->disk)->kobj,
+ dev->disk_attributes);
+ /* A failure here only warns; the disk stays registered. */
+ WARN_ON(ret);
+ }
+ }
+ }
+
+ return;
+
+out_cleanup_disk:
+ put_disk(dev->disk);
+}
+
int del_mtd_blktrans_dev(struct mtd_blktrans_dev *old)
{
unsigned long flags;
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -31,6 +31,7 @@
#include <linux/mtd/mtd.h>
#include <linux/mtd/partitions.h>
+#include <linux/mtd/blktrans.h>
#include "mtdcore.h"
@@ -1057,6 +1058,8 @@ int mtd_device_parse_register(struct mtd
ret = mtd_otp_nvmem_add(mtd);
+ register_mtd_blktrans_devs();
+
out:
if (ret && device_is_registered(&mtd->dev))
del_mtd_device(mtd);
--- a/include/linux/mtd/blktrans.h
+++ b/include/linux/mtd/blktrans.h
@@ -76,6 +76,7 @@ extern int deregister_mtd_blktrans(struc
extern int add_mtd_blktrans_dev(struct mtd_blktrans_dev *dev);
extern int del_mtd_blktrans_dev(struct mtd_blktrans_dev *dev);
extern int mtd_blktrans_cease_background(struct mtd_blktrans_dev *dev);
+extern void register_mtd_blktrans_devs(void);
/**
* module_mtd_blktrans() - Helper macro for registering a mtd blktrans driver

View File

@ -1,6 +1,25 @@
From 69357074558daf6ff24c9f58714935e9e095a865 Mon Sep 17 00:00:00 2001
From: OpenWrt community <openwrt-devel@lists.openwrt.org>
Date: Wed, 13 Jul 2022 13:37:33 +0200
Subject: [PATCH] kernel: add block fit partition parser
---
block/blk.h | 2 ++
block/partitions/Kconfig | 7 +++++++
block/partitions/Makefile | 1 +
block/partitions/check.h | 3 +++
block/partitions/core.c | 17 +++++++++++++++++
block/partitions/efi.c | 8 ++++++++
block/partitions/efi.h | 3 +++
block/partitions/msdos.c | 10 ++++++++++
drivers/mtd/mtd_blkdevs.c | 2 ++
drivers/mtd/ubi/block.c | 3 +++
include/linux/msdos_partition.h | 1 +
11 files changed, 57 insertions(+)
--- a/block/blk.h
+++ b/block/blk.h
@@ -406,6 +406,8 @@ void blk_free_ext_minor(unsigned int min
@@ -414,6 +414,8 @@ void blk_free_ext_minor(unsigned int min
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2
@ -98,30 +117,6 @@
return true;
}
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -433,6 +433,9 @@ int ubiblock_create(struct ubi_volume_in
}
gd->flags |= GENHD_FL_NO_PART;
gd->private_data = dev;
+#ifdef CONFIG_FIT_PARTITION
+ gd->flags |= GENHD_FL_EXT_DEVT;
+#endif
sprintf(gd->disk_name, "ubiblock%d_%d", dev->ubi_num, dev->vol_id);
set_capacity(gd, disk_capacity);
dev->gd = gd;
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -346,6 +346,9 @@ int add_mtd_blktrans_dev(struct mtd_blkt
gd->first_minor = (new->devnum) << tr->part_bits;
gd->minors = 1 << tr->part_bits;
gd->fops = &mtd_block_ops;
+#ifdef CONFIG_FIT_PARTITION
+ gd->flags |= GENHD_FL_EXT_DEVT;
+#endif
if (tr->part_bits) {
if (new->devnum < 26)
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -716,6 +716,9 @@ int efi_partition(struct parsed_partitio

View File

@ -20,7 +20,7 @@ Signed-off-by: Gabor Juhos <juhosg@openwrt.org>
#include <linux/nvmem-provider.h>
#include <linux/mtd/mtd.h>
@@ -748,6 +749,16 @@ int add_mtd_device(struct mtd_info *mtd)
@@ -751,6 +752,16 @@ int add_mtd_device(struct mtd_info *mtd)
of this try_ nonsense, and no bitching about it
either. :) */
__module_get(THIS_MODULE);

View File

@ -0,0 +1,33 @@
From ac84397efb3b3868c71c10ad7521161773228a17 Mon Sep 17 00:00:00 2001
From: OpenWrt community <openwrt-devel@lists.openwrt.org>
Date: Wed, 13 Jul 2022 13:41:44 +0200
Subject: [PATCH] mtd/nand: add MediaTek NAND bad block management table
---
drivers/mtd/nand/Kconfig | 4 ++++
drivers/mtd/nand/Makefile | 1 +
2 files changed, 5 insertions(+)
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -46,6 +46,10 @@ config MTD_NAND_ECC_SW_BCH
ECC codes. They are used with NAND devices requiring more than 1 bit
of error correction.
+config MTD_NAND_MTK_BMT
+ bool "Support MediaTek NAND Bad-block Management Table"
+ default n
+
config MTD_NAND_ECC_MXIC
bool "Macronix external hardware ECC engine"
depends on HAS_IOMEM
--- a/drivers/mtd/nand/Makefile
+++ b/drivers/mtd/nand/Makefile
@@ -3,6 +3,7 @@
nandcore-objs := core.o bbt.o
obj-$(CONFIG_MTD_NAND_CORE) += nandcore.o
obj-$(CONFIG_MTD_NAND_ECC_MEDIATEK) += ecc-mtk.o
+obj-$(CONFIG_MTD_NAND_MTK_BMT) += mtk_bmt.o mtk_bmt_v2.o mtk_bmt_bbt.o mtk_bmt_nmbm.o
obj-y += onenand/
obj-y += raw/

View File

@ -0,0 +1,846 @@
From 11c3fae5afa6cac444d12622e2cf5af60a99c1ef Mon Sep 17 00:00:00 2001
From: OpenWrt community <openwrt-devel@lists.openwrt.org>
Date: Wed, 13 Jul 2022 13:43:15 +0200
Subject: [PATCH] net/bridge: add bridge offload
---
include/linux/if_bridge.h | 1 +
net/bridge/Makefile | 2 +-
net/bridge/br.c | 8 +
net/bridge/br_device.c | 2 +
net/bridge/br_fdb.c | 5 +
net/bridge/br_forward.c | 3 +
net/bridge/br_if.c | 6 +-
net/bridge/br_input.c | 5 +
net/bridge/br_offload.c | 438 ++++++++++++++++++++++++++++++++
net/bridge/br_private.h | 22 +-
net/bridge/br_private_offload.h | 23 ++
net/bridge/br_stp.c | 3 +
net/bridge/br_sysfs_br.c | 35 +++
net/bridge/br_sysfs_if.c | 2 +
net/bridge/br_vlan_tunnel.c | 3 +
15 files changed, 555 insertions(+), 3 deletions(-)
create mode 100644 net/bridge/br_offload.c
create mode 100644 net/bridge/br_private_offload.h
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -60,6 +60,7 @@ struct br_ip_list {
#define BR_TX_FWD_OFFLOAD BIT(20)
#define BR_PORT_LOCKED BIT(21)
#define BR_BPDU_FILTER BIT(22)
+#define BR_OFFLOAD BIT(23)
#define BR_DEFAULT_AGEING_TIME (300 * HZ)
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BRIDGE) += bridge.o
-bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
+bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o br_offload.o \
br_ioctl.o br_stp.o br_stp_bpdu.o \
br_stp_if.o br_stp_timer.o br_netlink.o \
br_netlink_tunnel.o br_arp_nd_proxy.o
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -18,6 +18,7 @@
#include <net/switchdev.h>
#include "br_private.h"
+#include "br_private_offload.h"
/*
* Handle changes in state of network devices enslaved to a bridge.
@@ -389,6 +390,10 @@ static int __init br_init(void)
if (err)
goto err_out;
+ err = br_offload_init();
+ if (err)
+ goto err_out0;
+
err = register_pernet_subsys(&br_net_ops);
if (err)
goto err_out1;
@@ -438,6 +443,8 @@ err_out3:
err_out2:
unregister_pernet_subsys(&br_net_ops);
err_out1:
+ br_offload_fini();
+err_out0:
br_fdb_fini();
err_out:
stp_proto_unregister(&br_stp_proto);
@@ -460,6 +467,7 @@ static void __exit br_deinit(void)
#if IS_ENABLED(CONFIG_ATM_LANE)
br_fdb_test_addr_hook = NULL;
#endif
+ br_offload_fini();
br_fdb_fini();
}
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -525,6 +525,8 @@ void br_dev_setup(struct net_device *dev
br->bridge_hello_time = br->hello_time = 2 * HZ;
br->bridge_forward_delay = br->forward_delay = 15 * HZ;
br->bridge_ageing_time = br->ageing_time = BR_DEFAULT_AGEING_TIME;
+ br->offload_cache_size = 128;
+ br->offload_cache_reserved = 8;
dev->max_mtu = ETH_MAX_MTU;
br_netfilter_rtable_init(br);
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -23,6 +23,7 @@
#include <net/switchdev.h>
#include <trace/events/bridge.h>
#include "br_private.h"
+#include "br_private_offload.h"
static const struct rhashtable_params br_fdb_rht_params = {
.head_offset = offsetof(struct net_bridge_fdb_entry, rhnode),
@@ -185,6 +186,8 @@ static void fdb_notify(struct net_bridge
struct sk_buff *skb;
int err = -ENOBUFS;
+ br_offload_fdb_update(fdb);
+
if (swdev_notify)
br_switchdev_fdb_notify(br, fdb, type);
@@ -393,6 +396,8 @@ static struct net_bridge_fdb_entry *fdb_
fdb->key.vlan_id = vid;
fdb->flags = flags;
fdb->updated = fdb->used = jiffies;
+ INIT_HLIST_HEAD(&fdb->offload_in);
+ INIT_HLIST_HEAD(&fdb->offload_out);
err = rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode,
br_fdb_rht_params);
if (err) {
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -16,6 +16,7 @@
#include <linux/if_vlan.h>
#include <linux/netfilter_bridge.h>
#include "br_private.h"
+#include "br_private_offload.h"
/* Don't forward packets to originating port or forwarding disabled */
static inline int should_deliver(const struct net_bridge_port *p,
@@ -32,6 +33,8 @@ static inline int should_deliver(const s
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ br_offload_output(skb);
+
skb_push(skb, ETH_HLEN);
if (!is_skb_forwardable(skb->dev, skb))
goto drop;
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -25,6 +25,7 @@
#include <net/net_namespace.h>
#include "br_private.h"
+#include "br_private_offload.h"
/*
* Determine initial path cost based on speed.
@@ -437,7 +438,7 @@ static struct net_bridge_port *new_nbp(s
p->path_cost = port_cost(dev);
p->priority = 0x8000 >> BR_PORT_BITS;
p->port_no = index;
- p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
+ p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD | BR_OFFLOAD;
br_init_port(p);
br_set_state(p, BR_STATE_DISABLED);
br_stp_port_timer_init(p);
@@ -761,6 +762,9 @@ void br_port_flags_change(struct net_bri
if (mask & BR_NEIGH_SUPPRESS)
br_recalculate_neigh_suppress_enabled(br);
+
+ if (mask & BR_OFFLOAD)
+ br_offload_port_state(p);
}
bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag)
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -22,6 +22,7 @@
#include <linux/rculist.h>
#include "br_private.h"
#include "br_private_tunnel.h"
+#include "br_private_offload.h"
static int
br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -189,6 +190,7 @@ int br_handle_frame_finish(struct net *n
dst->used = now;
br_forward(dst->dst, skb, local_rcv, false);
} else {
+ br_offload_skb_disable(skb);
if (!mcast_hit)
br_flood(br, skb, pkt_type, local_rcv, false);
else
@@ -322,6 +324,9 @@ static rx_handler_result_t br_handle_fra
memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
p = br_port_get_rcu(skb->dev);
+ if (br_offload_input(p, skb))
+ return RX_HANDLER_CONSUMED;
+
if (p->flags & BR_VLAN_TUNNEL)
br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p));
--- /dev/null
+++ b/net/bridge/br_offload.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include "br_private.h"
+#include "br_private_offload.h"
+
+/*
+ * Global lock taken around flow insertion and removal and around changes to
+ * a port's offload.enabled state.
+ */
+static DEFINE_SPINLOCK(offload_lock);
+
+/*
+ * Lookup key of an offloaded flow: destination and source MAC address in
+ * the same order as they appear in the Ethernet header (the key is filled
+ * with a single memcpy() from eth_hdr()), plus the ingress VLAN when VLAN
+ * filtering is compiled in. The hash covers sizeof() of the whole struct,
+ * so br_offload_prepare_key() zeroes it before filling it in.
+ */
+struct bridge_flow_key {
+ u8 dest[ETH_ALEN];
+ u8 src[ETH_ALEN];
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ u16 vlan_tag;
+ bool vlan_present;
+#endif
+};
+
+/* One cached forwarding decision, stored in the ingress port's hash table. */
+struct bridge_flow {
+ /* port whose offload.rht holds this flow (set to the input port) */
+ struct net_bridge_port *port;
+ struct rhash_head node;
+ struct bridge_flow_key key;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ /* VLAN tag state the frame had when it left the bridge */
+ bool vlan_out_present;
+ u16 vlan_out;
+#endif
+
+ /* jiffies of last use; set to 0 once the flow is being freed */
+ unsigned long used;
+ /* FDB entries for the source (in) and destination (out) address */
+ struct net_bridge_fdb_entry *fdb_in, *fdb_out;
+ /* links into fdb_in->offload_in and fdb_out->offload_out */
+ struct hlist_node fdb_list_in, fdb_list_out;
+
+ /* used by call_rcu() to defer freeing */
+ struct rcu_head rcu;
+};
+
+/* rhashtable layout: flows are hashed on the complete bridge_flow_key. */
+static const struct rhashtable_params flow_params = {
+ .automatic_shrinking = true,
+ .head_offset = offsetof(struct bridge_flow, node),
+ .key_len = sizeof(struct bridge_flow_key),
+ .key_offset = offsetof(struct bridge_flow, key),
+};
+
+/* Slab cache for struct bridge_flow, created in br_offload_init(). */
+static struct kmem_cache *offload_cache __read_mostly;
+
+/* RCU callback: hand the flow that embeds @head back to the slab cache. */
+static void
+flow_rcu_free(struct rcu_head *head)
+{
+	kmem_cache_free(offload_cache,
+			container_of(head, struct bridge_flow, rcu));
+}
+
+/*
+ * Unlink @flow from both FDB entries and free it after an RCU grace period.
+ * Does not touch the hash table; callers take the flow out of it themselves.
+ */
+static void
+__br_offload_flow_free(struct bridge_flow *flow)
+{
+ /* used == 0 makes the GC walk skip this flow */
+ flow->used = 0;
+ hlist_del(&flow->fdb_list_in);
+ hlist_del(&flow->fdb_list_out);
+
+ call_rcu(&flow->rcu, flow_rcu_free);
+}
+
+/*
+ * Take @flow out of its port's hash table and release it. If the table no
+ * longer contains the flow, nothing is freed here.
+ */
+static void
+br_offload_flow_free(struct bridge_flow *flow)
+{
+	struct rhashtable *ht = &flow->port->offload.rht;
+
+	if (rhashtable_remove_fast(ht, &flow->node, flow_params) == 0)
+		__br_offload_flow_free(flow);
+}
+
+/*
+ * Move @fdb's "updated" timestamp forward to the flow's last-use time.
+ * Returns true if the timestamp was advanced, false if it was already at
+ * least as recent.
+ */
+static bool
+br_offload_flow_fdb_refresh_time(struct bridge_flow *flow,
+				 struct net_bridge_fdb_entry *fdb)
+{
+	bool newer = time_after(flow->used, fdb->updated);
+
+	if (newer)
+		fdb->updated = flow->used;
+
+	return newer;
+}
+
+
+/* Propagate the flow's last-use time to both FDB entries it depends on. */
+static void
+br_offload_flow_refresh_time(struct bridge_flow *flow)
+{
+ br_offload_flow_fdb_refresh_time(flow, flow->fdb_in);
+ br_offload_flow_fdb_refresh_time(flow, flow->fdb_out);
+}
+
+/*
+ * Per-element callback for rhashtable_free_and_destroy(): @ptr is the
+ * bridge_flow being torn down, @arg is unused.
+ */
+static void
+br_offload_destroy_cb(void *ptr, void *arg)
+{
+	__br_offload_flow_free(ptr);
+}
+
+/*
+ * True once the port's flow count plus the bridge's reserved headroom
+ * reaches the configured cache size.
+ */
+static bool
+br_offload_need_gc(struct net_bridge_port *p)
+{
+	struct net_bridge *br = p->br;
+	int in_use = atomic_read(&p->offload.rht.nelems);
+
+	return in_use + br->offload_cache_reserved >= br->offload_cache_size;
+}
+
+/*
+ * Garbage-collection work item of one port. Each run looks for the flow
+ * with the oldest "used" timestamp and frees at most that one flow, then
+ * requeues itself while the port is still enabled and over its threshold.
+ */
+static void
+br_offload_gc_work(struct work_struct *work)
+{
+ struct rhashtable_iter hti;
+ struct net_bridge_port *p;
+ struct bridge_flow *gc_flow = NULL;
+ struct bridge_flow *flow;
+ unsigned long gc_used;
+
+ p = container_of(work, struct net_bridge_port, offload.gc_work);
+
+ if (!br_offload_need_gc(p))
+ return;
+
+ rhashtable_walk_enter(&p->offload.rht, &hti);
+ rhashtable_walk_start(&hti);
+ while ((flow = rhashtable_walk_next(&hti)) != NULL) {
+ unsigned long used;
+
+ if (IS_ERR(flow))
+ continue;
+
+ used = READ_ONCE(flow->used);
+ /* used == 0 marks a flow that is already being freed */
+ if (!used)
+ continue;
+
+ /* keep the current candidate unless this flow is older */
+ if (gc_flow && !time_before(used, gc_used))
+ continue;
+
+ gc_flow = flow;
+ gc_used = used;
+ }
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+ if (!gc_flow)
+ return;
+
+ /*
+ * Only evict if the candidate was not used (or freed, which zeroes
+ * "used") since the walk picked it.
+ * NOTE(review): gc_flow is dereferenced here after the walk has ended;
+ * confirm it cannot have been released in between.
+ */
+ spin_lock_bh(&offload_lock);
+ if (br_offload_need_gc(p) && gc_flow &&
+ gc_flow->used == gc_used)
+ br_offload_flow_free(gc_flow);
+ if (p->offload.enabled && br_offload_need_gc(p))
+ queue_work(system_long_wq, work);
+ spin_unlock_bh(&offload_lock);
+
+}
+
+/*
+ * Re-evaluate whether offload is active on @p. Offload is enabled only
+ * while the port is in forwarding state and has BR_OFFLOAD set. Enabling
+ * sets up the flow table; disabling destroys it together with all flows.
+ */
+void br_offload_port_state(struct net_bridge_port *p)
+{
+ struct net_bridge_port_offload *o = &p->offload;
+ bool enabled = true;
+ bool flush = false;
+
+ if (p->state != BR_STATE_FORWARDING ||
+ !(p->flags & BR_OFFLOAD))
+ enabled = false;
+
+ spin_lock_bh(&offload_lock);
+ if (o->enabled == enabled)
+ goto out;
+
+ if (enabled) {
+ /* the work item is initialised once, on first enable */
+ if (!o->gc_work.func)
+ INIT_WORK(&o->gc_work, br_offload_gc_work);
+ /* NOTE(review): the return value of rhashtable_init() is not checked */
+ rhashtable_init(&o->rht, &flow_params);
+ } else {
+ flush = true;
+ rhashtable_free_and_destroy(&o->rht, br_offload_destroy_cb, o);
+ }
+
+ o->enabled = enabled;
+
+out:
+ spin_unlock_bh(&offload_lock);
+
+ /* wait for a GC run outside the lock, since the work takes offload_lock */
+ if (flush)
+ flush_work(&o->gc_work);
+}
+
+/*
+ * Drop every cached flow that references @fdb as its source or destination
+ * entry. Called from fdb_notify().
+ */
+void br_offload_fdb_update(const struct net_bridge_fdb_entry *fdb)
+{
+ struct bridge_flow *f;
+ struct hlist_node *tmp;
+
+ spin_lock_bh(&offload_lock);
+
+ /* _safe iteration: freeing a flow unlinks it from the list being walked */
+ hlist_for_each_entry_safe(f, tmp, &fdb->offload_in, fdb_list_in)
+ br_offload_flow_free(f);
+
+ hlist_for_each_entry_safe(f, tmp, &fdb->offload_out, fdb_list_out)
+ br_offload_flow_free(f);
+
+ spin_unlock_bh(&offload_lock);
+}
+
+/*
+ * Build the lookup key for @skb as received on port @p: both MAC addresses
+ * and, when VLAN filtering is enabled on the bridge and the frame carries a
+ * tag of the bridge's VLAN protocol, the VLAN id.
+ */
+static void
+br_offload_prepare_key(struct net_bridge_port *p, struct bridge_flow_key *key,
+ struct sk_buff *skb)
+{
+ /* zero everything first: the whole struct is hashed */
+ memset(key, 0, sizeof(*key));
+ /* destination and source address are adjacent in the Ethernet header */
+ memcpy(key, eth_hdr(skb), 2 * ETH_ALEN);
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ if (!br_opt_get(p->br, BROPT_VLAN_ENABLED))
+ return;
+
+ if (!skb_vlan_tag_present(skb) || skb->vlan_proto != p->br->vlan_proto)
+ return;
+
+ key->vlan_present = true;
+ key->vlan_tag = skb_vlan_tag_get_id(skb);
+#endif
+}
+
+/*
+ * Learn a fast-path flow from a frame that went through the regular
+ * forwarding path. Called from br_dev_queue_push_xmit() with skb->dev set
+ * to the egress device. Only frames that br_offload_input() marked with
+ * cb->offload are considered. A flow is created when both the source and
+ * the destination address resolve to FDB entries with a port, and it is
+ * inserted into the hash table of the port the frame was received on.
+ */
+void br_offload_output(struct sk_buff *skb)
+{
+	struct net_bridge_port_offload *o;
+	struct br_input_skb_cb *cb = (struct br_input_skb_cb *)skb->cb;
+	struct net_bridge_port *p, *inp;
+	struct net_device *dev;
+	struct net_bridge_fdb_entry *fdb_in, *fdb_out;
+	struct net_bridge_vlan_group *vg;
+	struct bridge_flow_key key;
+	struct bridge_flow *flow;
+	u16 vlan;
+
+	if (!cb->offload)
+		return;
+
+	rcu_read_lock();
+
+	p = br_port_get_rcu(skb->dev);
+	if (!p)
+		goto out;
+
+	o = &p->offload;
+	if (!o->enabled)
+		goto out;
+
+	/* unlocked early exit; the limit is checked again under the lock */
+	if (atomic_read(&p->offload.rht.nelems) >= p->br->offload_cache_size)
+		goto out;
+
+	/* find the port the frame came in on from the saved ifindex */
+	dev = dev_get_by_index_rcu(dev_net(p->br->dev), cb->input_ifindex);
+	if (!dev)
+		goto out;
+
+	inp = br_port_get_rcu(dev);
+	if (!inp)
+		goto out;
+
+	vg = nbp_vlan_group_rcu(inp);
+	vlan = cb->input_vlan_present ? cb->input_vlan_tag : br_get_pvid(vg);
+	fdb_in = br_fdb_find_rcu(p->br, eth_hdr(skb)->h_source, vlan);
+	if (!fdb_in || !fdb_in->dst)
+		goto out;
+
+	vg = nbp_vlan_group_rcu(p);
+	vlan = skb_vlan_tag_present(skb) ? skb_vlan_tag_get_id(skb) : br_get_pvid(vg);
+	fdb_out = br_fdb_find_rcu(p->br, eth_hdr(skb)->h_dest, vlan);
+	if (!fdb_out || !fdb_out->dst)
+		goto out;
+
+	br_offload_prepare_key(p, &key, skb);
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	/* the key must describe the frame as it looked on ingress */
+	key.vlan_present = cb->input_vlan_present;
+	key.vlan_tag = cb->input_vlan_tag;
+#endif
+
+	flow = kmem_cache_alloc(offload_cache, GFP_ATOMIC);
+	/* atomic allocations can fail; just skip caching this flow */
+	if (!flow)
+		goto out;
+
+	flow->port = inp;
+	memcpy(&flow->key, &key, sizeof(key));
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	flow->vlan_out_present = skb_vlan_tag_present(skb);
+	flow->vlan_out = skb_vlan_tag_get(skb);
+#endif
+
+	flow->fdb_in = fdb_in;
+	flow->fdb_out = fdb_out;
+	flow->used = jiffies;
+
+	spin_lock_bh(&offload_lock);
+	if (!o->enabled ||
+	    atomic_read(&p->offload.rht.nelems) >= p->br->offload_cache_size ||
+	    rhashtable_insert_fast(&inp->offload.rht, &flow->node, flow_params)) {
+		kmem_cache_free(offload_cache, flow);
+		goto out_unlock;
+	}
+
+	/* link the flow to both FDB entries so FDB changes can drop it */
+	hlist_add_head(&flow->fdb_list_in, &fdb_in->offload_in);
+	hlist_add_head(&flow->fdb_list_out, &fdb_out->offload_out);
+
+	if (br_offload_need_gc(p))
+		queue_work(system_long_wq, &p->offload.gc_work);
+
+out_unlock:
+	spin_unlock_bh(&offload_lock);
+
+out:
+	rcu_read_unlock();
+}
+
+/*
+ * Fast path for a frame received on port @p. Looks the frame up in the
+ * port's flow table; on a hit the frame is retagged as recorded in the flow
+ * and transmitted directly on the destination port.
+ *
+ * Returns true when the skb has been consumed here (transmitted or freed),
+ * false when the caller must continue with normal bridge processing. On a
+ * miss the skb control block is marked so that br_offload_output() can
+ * create a flow for it later.
+ */
+bool br_offload_input(struct net_bridge_port *p, struct sk_buff *skb)
+{
+ struct net_bridge_port_offload *o = &p->offload;
+ struct br_input_skb_cb *cb = (struct br_input_skb_cb *)skb->cb;
+ struct bridge_flow_key key;
+ struct net_bridge_port *dst;
+ struct bridge_flow *flow;
+ unsigned long now = jiffies;
+ bool ret = false;
+
+ /* NOTE(review): the minimum length is taken from sizeof(key) - confirm intent */
+ if (skb->len < sizeof(key))
+ return false;
+
+ if (!o->enabled)
+ return false;
+
+ if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
+ return false;
+
+ br_offload_prepare_key(p, &key, skb);
+
+ rcu_read_lock();
+ flow = rhashtable_lookup(&o->rht, &key, flow_params);
+ if (!flow) {
+ /* miss: remember ingress details for br_offload_output() */
+ cb->offload = 1;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ cb->input_vlan_present = key.vlan_present != 0;
+ cb->input_vlan_tag = key.vlan_tag;
+#endif
+ cb->input_ifindex = p->dev->ifindex;
+ goto out;
+ }
+
+ /* the source address must still be learned on this port */
+ if (flow->fdb_in->dst != p)
+ goto out;
+
+ dst = flow->fdb_out->dst;
+ if (!dst)
+ goto out;
+
+ /* from here on the skb is either transmitted or freed */
+ ret = true;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ if (!flow->vlan_out_present && key.vlan_present) {
+ __vlan_hwaccel_clear_tag(skb);
+ } else if (flow->vlan_out_present) {
+ if (skb_vlan_tag_present(skb) &&
+ skb->vlan_proto != p->br->vlan_proto) {
+ /* Protocol-mismatch, empty out vlan_tci for new tag */
+ skb_push(skb, ETH_HLEN);
+ skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+ skb_vlan_tag_get(skb));
+ if (unlikely(!skb))
+ goto out;
+
+ skb_pull(skb, ETH_HLEN);
+ skb_reset_mac_len(skb);
+ }
+
+ __vlan_hwaccel_put_tag(skb, p->br->vlan_proto,
+ flow->vlan_out);
+ }
+#endif
+
+ skb->dev = dst->dev;
+ skb_push(skb, ETH_HLEN);
+
+ if (skb_warn_if_lro(skb) || !is_skb_forwardable(skb->dev, skb)) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ /* refresh timestamps at most once per second */
+ if (now - flow->used >= HZ) {
+ flow->used = now;
+ br_offload_flow_refresh_time(flow);
+ }
+
+ skb_forward_csum(skb);
+ dev_queue_xmit(skb);
+
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Kick garbage collection on every port of @br that is over its threshold.
+ * Used after the cache limits have been changed.
+ *
+ * Ports with offload disabled are skipped: their gc_work is only
+ * initialised, and their flow table only set up, when offload is enabled
+ * in br_offload_port_state().
+ */
+static void
+br_offload_check_gc(struct net_bridge *br)
+{
+	struct net_bridge_port *p;
+
+	spin_lock_bh(&br->lock);
+	list_for_each_entry(p, &br->port_list, list) {
+		if (!p->offload.enabled)
+			continue;
+		if (br_offload_need_gc(p))
+			queue_work(system_long_wq, &p->offload.gc_work);
+	}
+	spin_unlock_bh(&br->lock);
+}
+
+
+/*
+ * Set the per-port flow cache size of @br to @val and trigger garbage
+ * collection on ports that now exceed it. @extack is unused. Always
+ * returns 0.
+ */
+int br_offload_set_cache_size(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br->offload_cache_size = val;
+ br_offload_check_gc(br);
+
+ return 0;
+}
+
+/*
+ * Set the number of cache slots of @br kept in reserve to @val and trigger
+ * garbage collection on ports that now exceed the threshold. @extack is
+ * unused. Always returns 0.
+ */
+int br_offload_set_cache_reserved(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br->offload_cache_reserved = val;
+ br_offload_check_gc(br);
+
+ return 0;
+}
+
+/*
+ * Create the slab cache that backs struct bridge_flow.
+ * Returns 0 on success or -ENOMEM if the cache could not be created.
+ */
+int __init br_offload_init(void)
+{
+	offload_cache = kmem_cache_create("bridge_offload_cache",
+					  sizeof(struct bridge_flow),
+					  0, SLAB_HWCACHE_ALIGN, NULL);
+
+	return offload_cache ? 0 : -ENOMEM;
+}
+
+/* Destroy the flow slab cache created by br_offload_init(). */
+void br_offload_fini(void)
+{
+ kmem_cache_destroy(offload_cache);
+}
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -271,7 +271,13 @@ struct net_bridge_fdb_entry {
unsigned long updated ____cacheline_aligned_in_smp;
unsigned long used;
- struct rcu_head rcu;
+ union {
+ struct {
+ struct hlist_head offload_in;
+ struct hlist_head offload_out;
+ };
+ struct rcu_head rcu;
+ };
};
struct net_bridge_fdb_flush_desc {
@@ -353,6 +359,12 @@ struct net_bridge_mdb_entry {
struct rcu_head rcu;
};
+/* Per-port state of the bridge flow offload cache. */
+struct net_bridge_port_offload {
+ /* flows keyed by struct bridge_flow_key; only set up while enabled */
+ struct rhashtable rht;
+ /* evicts the least recently used flow when the cache fills up */
+ struct work_struct gc_work;
+ /* true while the port is forwarding and has BR_OFFLOAD set */
+ bool enabled;
+};
+
struct net_bridge_port {
struct net_bridge *br;
struct net_device *dev;
@@ -414,6 +426,7 @@ struct net_bridge_port {
u16 backup_redirected_cnt;
struct bridge_stp_xstats stp_xstats;
+ struct net_bridge_port_offload offload;
};
#define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
@@ -531,6 +544,9 @@ struct net_bridge {
struct kobject *ifobj;
u32 auto_cnt;
+ u32 offload_cache_size;
+ u32 offload_cache_reserved;
+
#ifdef CONFIG_NET_SWITCHDEV
/* Counter used to make sure that hardware domains get unique
* identifiers in case a bridge spans multiple switchdev instances.
@@ -565,6 +581,10 @@ struct br_input_skb_cb {
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
u8 br_netfilter_broute:1;
#endif
+ u8 offload:1;
+ u8 input_vlan_present:1;
+ u16 input_vlan_tag;
+ int input_ifindex;
#ifdef CONFIG_NET_SWITCHDEV
/* Set if TX data plane offloading is used towards at least one
--- /dev/null
+++ b/net/bridge/br_private_offload.h
@@ -0,0 +1,23 @@
+#ifndef __BR_OFFLOAD_H
+#define __BR_OFFLOAD_H
+
+bool br_offload_input(struct net_bridge_port *p, struct sk_buff *skb);
+void br_offload_output(struct sk_buff *skb);
+void br_offload_port_state(struct net_bridge_port *p);
+void br_offload_fdb_update(const struct net_bridge_fdb_entry *fdb);
+int br_offload_init(void);
+void br_offload_fini(void);
+int br_offload_set_cache_size(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
+int br_offload_set_cache_reserved(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
+
+/*
+ * Clear the offload mark on @skb so that br_offload_output() will not
+ * create a flow from it.
+ */
+static inline void br_offload_skb_disable(struct sk_buff *skb)
+{
+ struct br_input_skb_cb *cb = (struct br_input_skb_cb *)skb->cb;
+
+ /* only write when the bit is actually set */
+ if (cb->offload)
+ cb->offload = 0;
+}
+
+#endif
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -12,6 +12,7 @@
#include "br_private.h"
#include "br_private_stp.h"
+#include "br_private_offload.h"
/* since time values in bpdu are in jiffies and then scaled (1/256)
* before sending, make sure that is at least one STP tick.
@@ -58,6 +59,8 @@ void br_set_state(struct net_bridge_port
(unsigned int) p->port_no, p->dev->name,
br_port_state_names[p->state]);
+ br_offload_port_state(p);
+
if (p->br->stp_enabled == BR_KERNEL_STP) {
switch (p->state) {
case BR_STATE_BLOCKING:
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -18,6 +18,7 @@
#include <linux/sched/signal.h>
#include "br_private.h"
+#include "br_private_offload.h"
/* IMPORTANT: new bridge options must be added with netlink support only
* please do not add new sysfs entries
@@ -933,6 +934,38 @@ static ssize_t vlan_stats_per_port_store
static DEVICE_ATTR_RW(vlan_stats_per_port);
#endif
+/* sysfs read handler: print the bridge's offload cache size. */
+static ssize_t offload_cache_size_show(struct device *d,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	return sprintf(buf, "%u\n", to_bridge(d)->offload_cache_size);
+}
+
+/* sysfs write handler: passes the value to br_offload_set_cache_size(). */
+static ssize_t offload_cache_size_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_offload_set_cache_size);
+}
+static DEVICE_ATTR_RW(offload_cache_size);
+
+/* sysfs read handler: print the bridge's reserved offload cache slots. */
+static ssize_t offload_cache_reserved_show(struct device *d,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	return sprintf(buf, "%u\n", to_bridge(d)->offload_cache_reserved);
+}
+
+/* sysfs write handler: passes the value to br_offload_set_cache_reserved(). */
+static ssize_t offload_cache_reserved_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_offload_set_cache_reserved);
+}
+static DEVICE_ATTR_RW(offload_cache_reserved);
+
static struct attribute *bridge_attrs[] = {
&dev_attr_forward_delay.attr,
&dev_attr_hello_time.attr,
@@ -987,6 +1020,8 @@ static struct attribute *bridge_attrs[]
&dev_attr_vlan_stats_enabled.attr,
&dev_attr_vlan_stats_per_port.attr,
#endif
+ &dev_attr_offload_cache_size.attr,
+ &dev_attr_offload_cache_reserved.attr,
NULL
};
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -241,6 +241,7 @@ BRPORT_ATTR_FLAG(broadcast_flood, BR_BCA
BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS);
BRPORT_ATTR_FLAG(isolated, BR_ISOLATED);
BRPORT_ATTR_FLAG(bpdu_filter, BR_BPDU_FILTER);
+BRPORT_ATTR_FLAG(offload, BR_OFFLOAD);
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -295,6 +296,7 @@ static const struct brport_attribute *br
&brport_attr_isolated,
&brport_attr_bpdu_filter,
&brport_attr_backup_port,
+ &brport_attr_offload,
NULL
};
--- a/net/bridge/br_vlan_tunnel.c
+++ b/net/bridge/br_vlan_tunnel.c
@@ -15,6 +15,7 @@
#include "br_private.h"
#include "br_private_tunnel.h"
+#include "br_private_offload.h"
static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
@@ -180,6 +181,7 @@ void br_handle_ingress_vlan_tunnel(struc
skb_dst_drop(skb);
__vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid);
+ br_offload_skb_disable(skb);
}
int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
@@ -201,6 +203,7 @@ int br_handle_egress_vlan_tunnel(struct
if (err)
return err;
+ br_offload_skb_disable(skb);
tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst);
if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst))
skb_dst_set(skb, &tunnel_dst->dst);

View File

@ -0,0 +1,112 @@
From: Yousong Zhou <yszhou4tech@gmail.com>
Subject: [PATCH] ath79: add nvmem cell mac-address-ascii support
This is needed for devices with mac address stored in ascii format, e.g.
HiWiFi HC6361 to be ported in the following patch.
Submitted-by: Yousong Zhou <yszhou4tech@gmail.com>
---
net/ethernet/eth.c | 83 ++++++++++++------
1 files changed, 72 insertions(+), 11 deletions(-)
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -531,6 +531,63 @@ int eth_platform_get_mac_address(struct
}
EXPORT_SYMBOL(eth_platform_get_mac_address);
+/*
+ * Read a binary MAC address from @cell.
+ *
+ * Returns a kmalloc'ed buffer of ETH_ALEN bytes that the caller must
+ * kfree(), or an ERR_PTR(): the error from nvmem_cell_read(), or -EINVAL
+ * if the cell does not hold exactly ETH_ALEN bytes.
+ */
+static void *nvmem_cell_get_mac_address(struct nvmem_cell *cell)
+{
+	size_t len;
+	void *mac;
+
+	mac = nvmem_cell_read(cell, &len);
+	/* already an ERR_PTR; pass it on instead of returning a bare long */
+	if (IS_ERR(mac))
+		return mac;
+	if (len != ETH_ALEN) {
+		kfree(mac);
+		return ERR_PTR(-EINVAL);
+	}
+	return mac;
+}
+
+/*
+ * Read a MAC address stored in @cell as text of the form
+ * "xx:xx:xx:xx:xx:xx" and convert it to binary.
+ *
+ * Returns a kmalloc'ed buffer of ETH_ALEN bytes that the caller must
+ * kfree(), or an ERR_PTR(): the error from nvmem_cell_read(), -EINVAL if
+ * the cell has the wrong length or does not parse, or -ENOMEM.
+ */
+static void *nvmem_cell_get_mac_address_ascii(struct nvmem_cell *cell)
+{
+	size_t len;
+	int ret;
+	void *mac_ascii;
+	u8 *mac;
+
+	mac_ascii = nvmem_cell_read(cell, &len);
+	/* already an ERR_PTR; pass it on instead of returning a bare long */
+	if (IS_ERR(mac_ascii))
+		return mac_ascii;
+	/* six two-digit octets plus five separators */
+	if (len != ETH_ALEN*2+5) {
+		kfree(mac_ascii);
+		return ERR_PTR(-EINVAL);
+	}
+	mac = kmalloc(ETH_ALEN, GFP_KERNEL);
+	if (!mac) {
+		kfree(mac_ascii);
+		return ERR_PTR(-ENOMEM);
+	}
+	ret = sscanf(mac_ascii, "%2hhx:%2hhx:%2hhx:%2hhx:%2hhx:%2hhx",
+		     &mac[0], &mac[1], &mac[2],
+		     &mac[3], &mac[4], &mac[5]);
+	kfree(mac_ascii);
+	if (ret == ETH_ALEN)
+		return mac;
+	kfree(mac);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * nvmem cell names that may carry a MAC address, each paired with the
+ * reader that decodes that cell's format. nvmem_get_mac_address() tries
+ * them in array order.
+ */
+static struct nvmem_cell_mac_address_property {
+ char *name;
+ void *(*read)(struct nvmem_cell *);
+} nvmem_cell_mac_address_properties[] = {
+ {
+ .name = "mac-address",
+ .read = nvmem_cell_get_mac_address,
+ }, {
+ .name = "mac-address-ascii",
+ .read = nvmem_cell_get_mac_address_ascii,
+ },
+};
+
/**
* platform_get_ethdev_address - Set netdev's MAC address from a given device
* @dev: Pointer to the device
@@ -564,19 +621,23 @@ int nvmem_get_mac_address(struct device
{
struct nvmem_cell *cell;
const void *mac;
- size_t len;
+ struct nvmem_cell_mac_address_property *property;
+ int i;
- cell = nvmem_cell_get(dev, "mac-address");
- if (IS_ERR(cell))
- return PTR_ERR(cell);
-
- mac = nvmem_cell_read(cell, &len);
- nvmem_cell_put(cell);
-
- if (IS_ERR(mac))
- return PTR_ERR(mac);
+	/* use the first known cell name that exists for this device */
+	for (i = 0; i < ARRAY_SIZE(nvmem_cell_mac_address_properties); i++) {
+		property = &nvmem_cell_mac_address_properties[i];
+		cell = nvmem_cell_get(dev, property->name);
+		if (IS_ERR(cell)) {
+			/* report the lookup error only after the last candidate */
+			if (i == ARRAY_SIZE(nvmem_cell_mac_address_properties) - 1)
+				return PTR_ERR(cell);
+			continue;
+		}
+		mac = property->read(cell);
+		nvmem_cell_put(cell);
+		/*
+		 * The readers return an ERR_PTR on failure; it must not reach
+		 * is_valid_ether_addr() or kfree() below.
+		 */
+		if (IS_ERR(mac))
+			return PTR_ERR(mac);
+		break;
+	}
- if (len != ETH_ALEN || !is_valid_ether_addr(mac)) {
+ if (!is_valid_ether_addr(mac)) {
kfree(mac);
return -EINVAL;
}

View File

@ -83,7 +83,7 @@ Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
--- a/include/uapi/linux/netfilter/xt_connmark.h
+++ b/include/uapi/linux/netfilter/xt_connmark.h
@@ -20,6 +20,11 @@ enum {
@@ -15,6 +15,11 @@ enum {
};
enum {
@ -95,7 +95,7 @@ Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
D_SHIFT_LEFT = 0,
D_SHIFT_RIGHT,
};
@@ -34,6 +39,11 @@ struct xt_connmark_tginfo2 {
@@ -29,6 +34,11 @@ struct xt_connmark_tginfo2 {
__u8 shift_dir, shift_bits, mode;
};
@ -109,7 +109,7 @@ Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
__u8 invert;
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -24,12 +24,13 @@ MODULE_ALIAS("ipt_connmark");
@@ -24,13 +24,14 @@ MODULE_ALIAS("ipt_connmark");
MODULE_ALIAS("ip6t_connmark");
static unsigned int
@ -120,15 +120,16 @@ Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
u_int32_t new_targetmark;
struct nf_conn *ct;
u_int32_t newmark;
u_int32_t oldmark;
+ u_int8_t dscp;
ct = nf_ct_get(skb, &ctinfo);
if (ct == NULL)
@@ -37,12 +38,24 @@ connmark_tg_shift(struct sk_buff *skb, c
@@ -39,12 +40,24 @@ connmark_tg_shift(struct sk_buff *skb, c
switch (info->mode) {
case XT_CONNMARK_SET:
- newmark = (ct->mark & ~info->ctmask) ^ info->ctmark;
oldmark = READ_ONCE(ct->mark);
- newmark = (oldmark & ~info->ctmask) ^ info->ctmark;
- if (info->shift_dir == D_SHIFT_RIGHT)
- newmark >>= info->shift_bits;
- else
@ -151,10 +152,10 @@ Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
+ newmark = (newmark & ~info->ctmark) |
+ (info->ctmask | (dscp << info->shift_bits));
+ }
if (ct->mark != newmark) {
ct->mark = newmark;
if (READ_ONCE(ct->mark) != newmark) {
WRITE_ONCE(ct->mark, newmark);
nf_conntrack_event_cache(IPCT_MARK, ct);
@@ -81,20 +94,36 @@ static unsigned int
@@ -83,20 +96,36 @@ static unsigned int
connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_connmark_tginfo1 *info = par->targinfo;
@ -193,7 +194,7 @@ Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
return connmark_tg_shift(skb, info);
}
@@ -165,6 +194,16 @@ static struct xt_target connmark_tg_reg[
@@ -167,6 +196,16 @@ static struct xt_target connmark_tg_reg[
.targetsize = sizeof(struct xt_connmark_tginfo2),
.destroy = connmark_tg_destroy,
.me = THIS_MODULE,

View File

@ -8,30 +8,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -712,8 +712,6 @@ config NFT_REJECT_NETDEV
endif # NF_TABLES_NETDEV
-endif # NF_TABLES
-
config NF_FLOW_TABLE_INET
tristate "Netfilter flow table mixed IPv4/IPv6 module"
depends on NF_FLOW_TABLE
@@ -722,11 +720,12 @@ config NF_FLOW_TABLE_INET
To compile it as a module, choose M here.
+endif # NF_TABLES
+
config NF_FLOW_TABLE
tristate "Netfilter flow table module"
depends on NETFILTER_INGRESS
depends on NF_CONNTRACK
- depends on NF_TABLES
help
This option adds the flow table core infrastructure.
@@ -1023,6 +1022,15 @@ config NETFILTER_XT_TARGET_NOTRACK
@@ -1023,6 +1023,15 @@ config NETFILTER_XT_TARGET_NOTRACK
depends on NETFILTER_ADVANCED
select NETFILTER_XT_TARGET_CT
@ -49,7 +26,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
depends on NETFILTER_ADVANCED
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -148,6 +148,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIF
@@ -154,6 +154,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIF
obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
@ -59,7 +36,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
--- /dev/null
+++ b/net/netfilter/xt_FLOWOFFLOAD.c
@@ -0,0 +1,694 @@
@@ -0,0 +1,697 @@
+/*
+ * Copyright (C) 2018-2021 Felix Fietkau <nbd@nbd.name>
+ *
@ -250,13 +227,16 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
+}
+
+static void
+xt_flowoffload_check_hook(struct flow_offload *flow, void *data)
+xt_flowoffload_check_hook(struct nf_flowtable *flowtable,
+ struct flow_offload *flow, void *data)
+{
+ struct xt_flowoffload_table *table = data;
+ struct xt_flowoffload_table *table;
+ struct flow_offload_tuple *tuple0 = &flow->tuplehash[0].tuple;
+ struct flow_offload_tuple *tuple1 = &flow->tuplehash[1].tuple;
+ struct xt_flowoffload_hook *hook;
+
+ table = container_of(flowtable, struct xt_flowoffload_table, ft);
+
+ spin_lock_bh(&hooks_lock);
+ hlist_for_each_entry(hook, &table->hooks, list) {
+ if (hook->ops.dev->ifindex != tuple0->iifidx &&
@ -283,8 +263,8 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
+ hook->used = false;
+ spin_unlock_bh(&hooks_lock);
+
+
+
+ err = nf_flow_table_iterate(&table->ft, xt_flowoffload_check_hook,
+ NULL);
+ if (err && err != -EAGAIN)
+ goto out;
+
@ -754,6 +734,34 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
+MODULE_LICENSE("GPL");
+module_init(xt_flowoffload_tg_init);
+module_exit(xt_flowoffload_tg_exit);
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -7,7 +7,6 @@
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/ip6_route.h>
-#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -381,8 +380,7 @@ flow_offload_lookup(struct nf_flowtable
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);
-static int
-nf_flow_table_iterate(struct nf_flowtable *flow_table,
+int nf_flow_table_iterate(struct nf_flowtable *flow_table,
void (*iter)(struct nf_flowtable *flowtable,
struct flow_offload *flow, void *data),
void *data)
@@ -436,6 +434,7 @@ static void nf_flow_offload_gc_step(stru
nf_flow_offload_stats(flow_table, flow);
}
}
+EXPORT_SYMBOL_GPL(nf_flow_table_iterate);
void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
{
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h
@@ -0,0 +1,17 @@
@ -774,3 +782,17 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
+};
+
+#endif /* _XT_FLOWOFFLOAD_H */
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -280,6 +280,11 @@ void nf_flow_table_free(struct nf_flowta
void flow_offload_teardown(struct flow_offload *flow);
+int nf_flow_table_iterate(struct nf_flowtable *flow_table,
+ void (*iter)(struct nf_flowtable *flowtable,
+ struct flow_offload *flow, void *data),
+ void *data);
+
void nf_flow_snat_port(const struct flow_offload *flow,
struct sk_buff *skb, unsigned int thoff,
u8 protocol, enum flow_offload_tuple_dir dir);

View File

@ -11,7 +11,7 @@ Signed-off-by: Imre Kaloz <kaloz@openwrt.org>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -149,8 +149,8 @@ static inline bool dev_xmit_complete(int
@@ -150,8 +150,8 @@ static inline bool dev_xmit_complete(int
#if defined(CONFIG_HYPERV_NET)
# define LL_MAX_HEADER 128

View File

@ -13,7 +13,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -474,7 +474,11 @@ static int fq_codel_init(struct Qdisc *s
@@ -471,7 +471,11 @@ static int fq_codel_init(struct Qdisc *s
sch->limit = 10*1024;
q->flows_cnt = 1024;

View File

@ -0,0 +1,25 @@
From 804fbb3f2ec9283f7b778e057a68bfff440a0be6 Mon Sep 17 00:00:00 2001
From: Rui Salvaterra <rsalvaterra@gmail.com>
Date: Wed, 30 Mar 2022 22:51:55 +0100
Subject: [PATCH] kernel: ct: size the hashtable more adequately
To set the default size of the connection tracking hash table, a divider of
16384 becomes inadequate for a router handling lots of connections. Divide by
2048 instead, making the default size scale better with the available RAM.
Signed-off-by: Rui Salvaterra <rsalvaterra@gmail.com>
---
net/netfilter/nf_conntrack_core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2698,7 +2698,7 @@ int nf_conntrack_init_start(void)
if (!nf_conntrack_htable_size) {
nf_conntrack_htable_size
- = (((nr_pages << PAGE_SHIFT) / 16384)
+ = (((nr_pages << PAGE_SHIFT) / 2048)
/ sizeof(struct hlist_head));
if (BITS_PER_LONG >= 64 &&
nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))

View File

@ -36,8 +36,9 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
+ Support for FC is very limited.
+
+config AR8216_PHY
+ tristate "Driver for Atheros AR8216 switches"
+ tristate "Driver for Atheros AR8216/8327 switches"
+ select SWCONFIG
+ select ETHERNET_PACKET_MANGLE
+
+config AR8216_PHY_LEDS
+ bool "Atheros AR8216 switch LED support"
@ -52,7 +53,6 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
+config PSB6970_PHY
+ tristate "Lantiq XWAY Tantos (PSB6970) Ethernet switch"
+ select SWCONFIG
+ select ETHERNET_PACKET_MANGLE
+
+config RTL8306_PHY
+ tristate "Driver for Realtek RTL8306S switches"
@ -95,13 +95,15 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
config AMD_PHY
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -24,6 +24,19 @@ libphy-$(CONFIG_LED_TRIGGER_PHY) += phy_
@@ -24,6 +24,21 @@ libphy-$(CONFIG_LED_TRIGGER_PHY) += phy_
obj-$(CONFIG_PHYLINK) += phylink.o
obj-$(CONFIG_PHYLIB) += libphy.o
+obj-$(CONFIG_SWCONFIG) += swconfig.o
+obj-$(CONFIG_ADM6996_PHY) += adm6996.o
+obj-$(CONFIG_AR8216_PHY) += ar8216.o ar8327.o
+obj-$(CONFIG_AR8216_PHY) += ar8xxx.o
+ar8xxx-y += ar8216.o
+ar8xxx-y += ar8327.o
+obj-$(CONFIG_SWCONFIG_B53) += b53/
+obj-$(CONFIG_IP17XX_PHY) += ip17xx.o
+obj-$(CONFIG_PSB6970_PHY) += psb6970.o

View File

@ -0,0 +1,21 @@
From ebd924d773223593142d417c41d4ee6fa16f1805 Mon Sep 17 00:00:00 2001
From: OpenWrt community <openwrt-devel@lists.openwrt.org>
Date: Wed, 13 Jul 2022 13:45:56 +0200
Subject: [PATCH] net/dsa/mv88e6xxx: disable ATU violation
---
drivers/net/dsa/mv88e6xxx/chip.c | 3 +++
1 file changed, 3 insertions(+)
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3461,6 +3461,9 @@ static int mv88e6xxx_setup_port(struct m
else
reg = 1 << port;
+ /* Disable ATU member violation interrupt */
+ reg |= MV88E6XXX_PORT_ASSOC_VECTOR_IGNORE_WRONG;
+
err = mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_ASSOC_VECTOR,
reg);
if (err)

View File

@ -0,0 +1,120 @@
From: Birger Koblitz <git@birger-koblitz.de>
Date: Sun, 5 Sep 2021 15:13:10 +0200
Subject: [PATCH] kernel: Add AQR113C and AQR813 support
This hack adds support for the fourth-generation Aquantia 10 Gbit/s
PHYs AQR113C and AQR813.
Signed-off-by: Birger Koblitz <git@birger-koblitz.de>
--- a/drivers/net/phy/aquantia_main.c
+++ b/drivers/net/phy/aquantia_main.c
@@ -23,6 +23,7 @@
#define PHY_ID_AQCS109 0x03a1b5c2
#define PHY_ID_AQR405 0x03a1b4b0
#define PHY_ID_AQR113C 0x31c31c12
+#define PHY_ID_AQR813 0x31c31cb2
#define MDIO_PHYXS_VEND_IF_STATUS 0xe812
#define MDIO_PHYXS_VEND_IF_STATUS_TYPE_MASK GENMASK(7, 3)
@@ -415,6 +416,49 @@ static int aqr107_read_rate(struct phy_d
return 0;
}
+static int aqr113c_read_status(struct phy_device *phydev)
+{
+ int val, ret;
+
+ ret = aqr_read_status(phydev);
+ if (ret)
+ return ret;
+
+ if (!phydev->link || phydev->autoneg == AUTONEG_DISABLE)
+ return 0;
+
+ /* On AQR113C, the speed returned by aqr_read_status() is wrong; return value unchecked here */
+ aqr107_read_rate(phydev);
+
+ val = phy_read_mmd(phydev, MDIO_MMD_PHYXS, MDIO_PHYXS_VEND_IF_STATUS);
+ if (val < 0)
+ return val;
+
+ switch (FIELD_GET(MDIO_PHYXS_VEND_IF_STATUS_TYPE_MASK, val)) {
+ case MDIO_PHYXS_VEND_IF_STATUS_TYPE_KR:
+ phydev->interface = PHY_INTERFACE_MODE_10GKR;
+ break;
+ case MDIO_PHYXS_VEND_IF_STATUS_TYPE_XFI:
+ phydev->interface = PHY_INTERFACE_MODE_10GBASER;
+ break;
+ case MDIO_PHYXS_VEND_IF_STATUS_TYPE_USXGMII:
+ phydev->interface = PHY_INTERFACE_MODE_USXGMII;
+ break;
+ case MDIO_PHYXS_VEND_IF_STATUS_TYPE_SGMII:
+ phydev->interface = PHY_INTERFACE_MODE_SGMII;
+ break;
+ case MDIO_PHYXS_VEND_IF_STATUS_TYPE_OCSGMII:
+ phydev->interface = PHY_INTERFACE_MODE_2500BASEX;
+ break;
+ default:
+ phydev->interface = PHY_INTERFACE_MODE_NA;
+ break;
+ }
+
+ /* Read downshifted rate from vendor register */
+ return aqr107_read_rate(phydev);
+}
+
static int aqr107_read_status(struct phy_device *phydev)
{
int val, ret;
@@ -554,7 +598,7 @@ static void aqr107_chip_info(struct phy_
build_id = FIELD_GET(VEND1_GLOBAL_RSVD_STAT1_FW_BUILD_ID, val);
prov_id = FIELD_GET(VEND1_GLOBAL_RSVD_STAT1_PROV_ID, val);
- phydev_dbg(phydev, "FW %u.%u, Build %u, Provisioning %u\n",
+ phydev_info(phydev, "FW %u.%u, Build %u, Provisioning %u\n",
fw_major, fw_minor, build_id, prov_id);
}
@@ -809,7 +853,7 @@ static struct phy_driver aqr_driver[] =
.config_aneg = aqr_config_aneg,
.config_intr = aqr_config_intr,
.handle_interrupt = aqr_handle_interrupt,
- .read_status = aqr107_read_status,
+ .read_status = aqr113c_read_status,
.get_tunable = aqr107_get_tunable,
.set_tunable = aqr107_set_tunable,
.suspend = aqr107_suspend,
@@ -819,6 +863,24 @@ static struct phy_driver aqr_driver[] =
.get_stats = aqr107_get_stats,
.link_change_notify = aqr107_link_change_notify,
},
+{
+ PHY_ID_MATCH_MODEL(PHY_ID_AQR813),
+ .name = "Aquantia AQR813",
+ .probe = aqr107_probe,
+ .config_init = aqr107_config_init,
+ .config_aneg = aqr_config_aneg,
+ .config_intr = aqr_config_intr,
+ .handle_interrupt = aqr_handle_interrupt,
+ .read_status = aqr113c_read_status,
+ .get_tunable = aqr107_get_tunable,
+ .set_tunable = aqr107_set_tunable,
+ .suspend = aqr107_suspend,
+ .resume = aqr107_resume,
+ .get_sset_count = aqr107_get_sset_count,
+ .get_strings = aqr107_get_strings,
+ .get_stats = aqr107_get_stats,
+ .link_change_notify = aqr107_link_change_notify,
+},
};
module_phy_driver(aqr_driver);
@@ -832,6 +894,7 @@ static struct mdio_device_id __maybe_unu
{ PHY_ID_MATCH_MODEL(PHY_ID_AQCS109) },
{ PHY_ID_MATCH_MODEL(PHY_ID_AQR405) },
{ PHY_ID_MATCH_MODEL(PHY_ID_AQR113C) },
+ { PHY_ID_MATCH_MODEL(PHY_ID_AQR813) },
{ }
};

Some files were not shown because too many files have changed in this diff Show More