Last active
April 20, 2025 08:04
-
-
Save bouroo/bc52ad58a6e75d44e5235b229e9ca988 to your computer and use it in GitHub Desktop.
Kernel tuning for dedicated linux server. /etc/sysctl.d/60-sysctl.conf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/60-sysctl.conf | |
# High‑performance settings for API/Web server | |
# Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel) | |
# Based on original by Kawin Viriyaprasopsook <[email protected]> | |
# Run `sysctl --system` after installing this file | |
######################## | |
# System Options # | |
######################## | |
# Reduce console noise from kernel messages | |
# Format: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel | |
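# 3 4 1 3: console shows only messages more severe than level 3 (i.e. emerg/alert/crit);
# messages without an explicit level get 4 (warning); console_loglevel can never be set below 1; default console level is 3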
kernel.printk = 3 4 1 3 | |
# Control swapping behavior | |
# 0: Only swap to avoid OOM | |
# 1-10: Prefer keeping anonymous memory in RAM, swap out file-backed pages more readily | |
# 10-60: Balanced approach | |
# 60-100: Aggressively swap out anonymous memory | |
# 10 is a common starting point for servers with ample RAM. Adjust based on monitoring. | |
vm.swappiness = 10 | |
# Memory overcommit policy | |
# 0: Heuristic overcommit (default) | |
# 1: Always overcommit (allows processes to request more memory than available, risking OOM kills) | |
# 2: Never overcommit (strict accounting, can fail large allocations even if memory is free) | |
# 1 is often required by applications like Redis for background saves. Use with caution and monitor memory usage. | |
vm.overcommit_memory = 1 | |
# Maximum number of open file descriptors system-wide | |
# Increase for applications handling many concurrent connections (web servers, databases, etc.) | |
# This value is very high and should be sufficient for most workloads. | |
fs.file-max = 2097152 | |
# Maximum number of PIDs (processes/threads) | |
# 65536 is a standard high value, suitable for systems with many processes or threads. | |
kernel.pid_max = 65536 | |
# Incoming TCP connection backlog (accept queue: upper bound on the listen() backlog)
# The maximum number of queued connection requests that have not yet been accepted by the application. | |
# Increase for servers handling high rates of new connections. 32768 is a generous value. | |
net.core.somaxconn = 32768 | |
######################## | |
# TCP/IP Tuning # | |
######################## | |
# Essential TCP features for modern performance | |
# Enable TCP window scaling (required for high bandwidth connections)
net.ipv4.tcp_window_scaling = 1
# Enable TCP Selective Acknowledgement (improves recovery from packet loss)
net.ipv4.tcp_sack = 1
# Enable TCP timestamps (improves RTT calculation, helps with PAWS/TW)
net.ipv4.tcp_timestamps = 1
# Enable TCP Fast Open (3 = enable client and server support, reduces latency for repeated connections)
net.ipv4.tcp_fastopen = 3
# SYN backlog & flood protection | |
# Maximum number of entries in the SYN queue (SYN_RECV state) | |
# Increased from 4096, suitable for higher SYN rates
net.ipv4.tcp_max_syn_backlog = 8192
# Enable SYN cookies (protects against SYN floods when SYN queue is full) | |
net.ipv4.tcp_syncookies = 1 | |
# Reduce SYN retransmits before giving up | |
net.ipv4.tcp_syn_retries = 2 | |
net.ipv4.tcp_synack_retries = 2 | |
# Time‑wait handling | |
# tcp_tw_recycle is DANGEROUS and breaks NAT/Load Balancers. Keep DISABLED.
# The key was removed in Linux 4.12, so assigning it on newer kernels fails with an
# unknown-key error. It defaulted to 0 on older kernels, so the assignment is left
# commented out; uncomment only on a pre-4.12 kernel if you want it stated explicitly.
# net.ipv4.tcp_tw_recycle = 0
# tcp_tw_reuse is also problematic with NAT and generally not recommended on servers. Keep DISABLED. | |
net.ipv4.tcp_tw_reuse = 0 | |
# Congestion control | |
# BBR (Bottleneck Bandwidth and RTT) is excellent for throughput and latency on modern internet paths. | |
# Requires kernel support (usually 4.9+). 'cubic' is the default and a good fallback. | |
net.ipv4.tcp_congestion_control= bbr | |
# Low watermark for TCP send buffer (bytes) | |
# Helps reduce latency for small writes by preventing excessive buffering before sending. | |
# 16384 bytes = 16KB
net.ipv4.tcp_notsent_lowat = 16384
# Local port range for outgoing connections | |
# Increase if the server initiates many short-lived connections and tcp_tw_reuse is off. | |
# 1024-65535 is the widest possible range. | |
net.ipv4.ip_local_port_range = 1024 65535 | |
# Maximum number of sockets in TIME_WAIT state | |
# Relevant when tcp_tw_reuse is off. Increase if hitting limits under heavy load. | |
# Default is often 200000. This value is high and usually sufficient. | |
net.ipv4.tcp_max_tw_buckets = 200000 | |
# TCP Keepalives (Detect dead connections faster and free resources) | |
# Default: 7200 9 75 (2 hours idle, 9 probes, 75s interval) | |
# Shorter times are recommended for servers to reclaim resources from idle clients sooner. | |
# Send first keepalive after 10 minutes idle
net.ipv4.tcp_keepalive_time = 600
# Send up to 6 probes
net.ipv4.tcp_keepalive_probes = 6
# Wait 30 seconds between probes
net.ipv4.tcp_keepalive_intvl = 30
# Other TCP optimizations | |
# Disable slow start after idle periods (improves performance for intermittent traffic) | |
net.ipv4.tcp_slow_start_after_idle = 0 | |
# Enable MTU probing (helps avoid fragmentation issues by discovering path MTU) | |
net.ipv4.tcp_mtu_probing = 1 | |
# Disable saving TCP metrics across connections (can save memory and CPU, minor impact) | |
net.ipv4.tcp_no_metrics_save = 1 | |
# Enable TCP Corking auto-detection (improves efficiency for small writes followed by large writes) | |
net.ipv4.tcp_autocorking = 1 | |
# SACK is controlled by net.ipv4.tcp_sack, which is already enabled earlier in this file.
# There is no net.ipv4.tcp_sack_enabled sysctl, so no additional assignment is made here.
######################## | |
# Network Buffers # | |
######################## | |
# Maximum backlog for packets arriving at the network device before being processed by the kernel | |
# Increase for high packet rates to avoid drops. 30000 is a high value. | |
net.core.netdev_max_backlog = 30000 | |
# Default and Maximum socket buffer sizes (bytes) | |
# These values are used by default and as limits for auto-tuning. | |
# They are generous and suitable for 10Gbps+ networks. | |
# Actual buffer sizes are often auto-tuned by the kernel within these limits. | |
# Default receive buffer size (30MB)
net.core.rmem_default = 31457280
# Default send buffer size (30MB)
net.core.wmem_default = 31457280
# Max receive buffer size (64MB)
net.core.rmem_max = 67108864
# Max send buffer size (64MB)
net.core.wmem_max = 67108864
# TCP specific buffer sizes (min, default, max) - Optional, net.core often sufficient | |
# net.ipv4.tcp_rmem = 4096 87380 67108864 # Min, Default, Max receive buffer | |
# net.ipv4.tcp_wmem = 4096 65536 67108864 # Min, Default, Max send buffer | |
######################## | |
# Queue Discipline # | |
######################## | |
# Default queueing discipline for network interfaces | |
# fq_codel (Fair Queueing with CoDel) is a modern algorithm that reduces bufferbloat and improves fairness. | |
net.core.default_qdisc = fq_codel | |
######################## | |
# IP Fragmentation # | |
######################## | |
# Memory thresholds for reassembling IP fragments (bytes) | |
# High: Start dropping fragments when memory exceeds this | |
# Low: Stop dropping fragments when memory drops below this | |
# High threshold: 256KB
net.ipv4.ipfrag_high_thresh = 262144
# Low threshold: 192KB
net.ipv4.ipfrag_low_thresh = 196608
# Time to keep fragments in memory before discarding (seconds) | |
net.ipv4.ipfrag_time = 30 | |
######################## | |
# Security & ICMP # | |
######################## | |
# Disable redirects & source routing (Security hardening - prevents malicious routing) | |
net.ipv4.conf.all.accept_redirects = 0 | |
net.ipv4.conf.default.accept_redirects = 0 | |
net.ipv4.conf.all.accept_source_route = 0 | |
net.ipv4.conf.default.accept_source_route= 0 | |
# IP spoofing & logging (Security hardening) | |
# rp_filter: Source validation (1=strict, 2=loose). Strict is recommended if routing is simple. | |
net.ipv4.conf.all.rp_filter = 1 | |
# Apply to default interface settings too
net.ipv4.conf.default.rp_filter = 1
# Log packets with impossible addresses (martians) | |
net.ipv4.conf.all.log_martians = 1 | |
# ICMP hardening | |
# Ignore broadcast ICMP requests (reduces response to floods)
net.ipv4.icmp_echo_ignore_broadcasts = 1
# Ignore ICMP timestamp requests (reduces information leakage)
net.ipv4.icmp_timestamp_ignore_all = 1
# IP forwarding (Disable unless server acts as a router/gateway) | |
net.ipv4.ip_forward = 0 | |
######################## | |
# IPv6 Tuning (Optional - Uncomment and adjust if IPv6 is used) # | |
######################## | |
# The net.ipv4.tcp_* and net.core.* settings above also govern TCP and socket buffers for IPv6 traffic.
# The kernel has no net.ipv6.conf.*.tcp_congestion_control or net.ipv6.conf.*.{rmem,wmem}_{default,max}
# keys, so none of those settings need (or can) be repeated under net.ipv6.
# To disable IPv6 entirely when it is not needed, uncomment:
# net.ipv6.conf.all.disable_ipv6 = 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/80-k8s-ipvs.conf | |
# Production Kubernetes node tuning with IPVS | |
# Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel) | |
# Based on original by Kawin Viriyaprasopsook <[email protected]> | |
# Run `sysctl --system` after installing this file | |
########################## | |
# System Options # | |
########################## | |
# Reduce console noise from kernel messages | |
# Format: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel | |
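# 3 4 1 3: console shows only messages more severe than level 3 (i.e. emerg/alert/crit);
# messages without an explicit level get 4 (warning); console_loglevel can never be set below 1; default console level is 3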
kernel.printk = 3 4 1 3 | |
# Control swapping behavior | |
# 0: Only swap to avoid OOM | |
# 1-10: Prefer keeping anonymous memory in RAM, swap out file-backed pages more readily | |
# 10-60: Balanced approach | |
# 60-100: Aggressively swap out anonymous memory | |
# 10 is a common starting point for servers with ample RAM. Adjust based on monitoring. | |
vm.swappiness = 10 | |
# Memory overcommit policy | |
# 0: Heuristic overcommit (default) | |
# 1: Always overcommit (allows processes to request more memory than available, risking OOM kills) | |
# 2: Never overcommit (strict accounting, can fail large allocations even if memory is free) | |
# 1 is often required by applications like Redis for background saves, and can be beneficial in K8s | |
# to allow pods to potentially burst memory, but requires careful monitoring to avoid node OOMs. | |
vm.overcommit_memory = 1 | |
# Maximum number of open file descriptors system-wide | |
# Increase for systems running many containers/pods, each potentially opening many files/sockets. | |
# This value is very high and should be sufficient for most workloads. | |
fs.file-max = 2097152 | |
# Maximum number of PIDs (processes/threads) | |
# Increase for systems running many containers/pods, as each container can have multiple processes. | |
# 65536 is a standard high value, often sufficient, but consider increasing on very dense nodes. | |
kernel.pid_max = 65536 | |
########################## | |
# TCP Performance # | |
########################## | |
# Essential TCP features for modern performance | |
# Enable TCP window scaling (required for high bandwidth connections)
net.ipv4.tcp_window_scaling = 1
# Enable TCP Selective Acknowledgement (improves recovery from packet loss)
net.ipv4.tcp_sack = 1
# Enable TCP timestamps (improves RTT calculation, helps with PAWS/TW)
net.ipv4.tcp_timestamps = 1
# Enable TCP Fast Open (3 = enable client and server support, reduces latency for repeated connections)
net.ipv4.tcp_fastopen = 3
# SYN backlog & flood protection | |
# Maximum number of entries in the SYN queue (SYN_RECV state) | |
# Increase for servers handling high rates of new connections. 8192 is a good value. | |
net.ipv4.tcp_max_syn_backlog = 8192 | |
# Enable SYN cookies (protects against SYN floods when SYN queue is full) | |
net.ipv4.tcp_syncookies = 1 | |
# Reduce SYN retransmits before giving up | |
net.ipv4.tcp_syn_retries = 2 | |
net.ipv4.tcp_synack_retries = 2 | |
# Local port range for outgoing connections (ephemeral ports) | |
# Increase if the node initiates many short-lived connections (e.g., health checks, API calls) | |
# and tcp_tw_reuse is off. 1024-65535 is the widest possible range. | |
net.ipv4.ip_local_port_range = 1024 65535 | |
# Congestion control | |
# BBR (Bottleneck Bandwidth and RTT) is excellent for throughput and latency on modern internet paths. | |
# Requires kernel support (usually 4.9+). 'cubic' is the default and a good fallback. | |
net.ipv4.tcp_congestion_control = bbr | |
# Low watermark for TCP send buffer (bytes) | |
# Helps reduce latency for small writes by preventing excessive buffering before sending. | |
# 16384 bytes = 16KB
net.ipv4.tcp_notsent_lowat = 16384
# Disable slow start after idle periods (improves performance for intermittent traffic) | |
net.ipv4.tcp_slow_start_after_idle = 0 | |
# Enable MTU probing (helps avoid fragmentation issues by discovering path MTU) | |
net.ipv4.tcp_mtu_probing = 1 | |
# Disable saving TCP metrics across connections (can save memory and CPU, minor impact) | |
# Useful on servers handling connections from a vast number of unique clients. | |
net.ipv4.tcp_no_metrics_save = 1 | |
# Enable TCP Corking auto-detection (improves efficiency for small writes followed by large writes) | |
net.ipv4.tcp_autocorking = 1 | |
########################## | |
# Network Buffers # | |
########################## | |
# Maximum backlog for packets arriving at the network device before being processed by the kernel | |
# Increase for high packet rates to avoid drops. 30000 is a high value. | |
net.core.netdev_max_backlog = 30000 | |
# Default and Maximum socket buffer sizes (bytes) | |
# These values are used by default and as limits for auto-tuning. | |
# They are generous and suitable for 10Gbps+ networks common in clusters. | |
# Actual buffer sizes are often auto-tuned by the kernel within these limits. | |
# Default receive buffer size (30MB)
net.core.rmem_default = 31457280
# Default send buffer size (30MB)
net.core.wmem_default = 31457280
# Max receive buffer size (64MB)
net.core.rmem_max = 67108864
# Max send buffer size (64MB)
net.core.wmem_max = 67108864
# TCP specific buffer sizes (min, default, max) - Optional, net.core often sufficient | |
# net.ipv4.tcp_rmem = 4096 87380 67108864 # Min, Default, Max receive buffer | |
# net.ipv4.tcp_wmem = 4096 65536 67108864 # Min, Default, Max send buffer | |
########################## | |
# Queue Discipline # | |
########################## | |
# Default queueing discipline for network interfaces | |
# fq_codel (Fair Queueing with CoDel) is a modern algorithm that reduces bufferbloat and improves fairness. | |
net.core.default_qdisc = fq_codel | |
########################## | |
# IP Fragmentation # | |
########################## | |
# Memory thresholds for reassembling IP fragments (bytes) | |
# High: Start dropping fragments when memory exceeds this | |
# Low: Stop dropping fragments when memory drops below this | |
# High threshold: 256KB
net.ipv4.ipfrag_high_thresh = 262144
# Low threshold: 192KB
net.ipv4.ipfrag_low_thresh = 196608
# Time to keep fragments in memory before discarding (seconds) | |
net.ipv4.ipfrag_time = 30 | |
########################## | |
# Time‑Wait & Keepalive # | |
########################## | |
# tcp_tw_recycle is DANGEROUS and breaks NAT/Load Balancers/IPVS. Keep DISABLED.
# The key was removed in Linux 4.12, so assigning it on newer kernels fails with an
# unknown-key error. It defaulted to 0 on older kernels, so the assignment is left
# commented out; uncomment only on a pre-4.12 kernel if you want it stated explicitly.
# net.ipv4.tcp_tw_recycle = 0
# tcp_tw_reuse is also problematic with NAT/IPVS and generally not recommended on servers. Keep DISABLED. | |
net.ipv4.tcp_tw_reuse = 0 | |
# Maximum number of sockets in TIME_WAIT state | |
# Relevant when tcp_tw_reuse is off. Increase if hitting limits under heavy load. | |
# 1440000 is a very high value, suitable for nodes handling a massive number of short connections. | |
net.ipv4.tcp_max_tw_buckets = 1440000 | |
# TCP Keepalives (Detect dead connections faster and free resources) | |
# Default: 7200 9 75 (2 hours idle, 9 probes, 75s interval) | |
# Shorter times are recommended for servers to reclaim resources from idle clients sooner. | |
# Send first keepalive after 10 minutes idle
net.ipv4.tcp_keepalive_time = 600
# Wait 60 seconds between probes
net.ipv4.tcp_keepalive_intvl = 60
# Send up to 5 probes
net.ipv4.tcp_keepalive_probes = 5
########################## | |
# Security & ICMP # | |
########################## | |
# Disable redirects & source routing (Security hardening - prevents malicious routing) | |
net.ipv4.conf.all.accept_redirects = 0 | |
net.ipv4.conf.default.accept_redirects = 0 | |
net.ipv4.conf.all.accept_source_route = 0 | |
net.ipv4.conf.default.accept_source_route = 0 | |
# IP spoofing & logging (Security hardening) | |
# rp_filter: Source validation (1=strict, 2=loose). Strict is recommended if routing is simple. | |
net.ipv4.conf.all.rp_filter = 1 | |
# Apply to default interface settings too
net.ipv4.conf.default.rp_filter = 1
# Log packets with impossible addresses (martians) | |
net.ipv4.conf.all.log_martians = 1 | |
# ICMP hardening | |
# Ignore broadcast ICMP requests (reduces response to floods)
net.ipv4.icmp_echo_ignore_broadcasts = 1
# Ignore ICMP timestamp requests (reduces information leakage)
net.ipv4.icmp_timestamp_ignore_all = 1
########################## | |
# Routing & Bridge # | |
########################## | |
# Enable IP forwarding (Essential for Kubernetes nodes to route traffic between pods/services) | |
net.ipv4.ip_forward = 1 | |
# Enable netfilter processing for bridged IP packets (Essential for K8s networking like Services/NetworkPolicy) | |
net.bridge.bridge-nf-call-iptables = 1 | |
net.bridge.bridge-nf-call-ip6tables = 1 | |
# net.bridge.bridge-nf-call-arptables = 1 # Less common, enable if needed for ARP filtering | |
########################## | |
# Kubernetes IPVS # | |
########################## | |
# Enable IPVS connection tracking | |
net.ipv4.vs.conntrack = 1 | |
# Expire connections for unavailable destinations (helps with rolling updates/pod evictions) | |
net.ipv4.vs.expire_nodest_conn = 1 | |
# Expire templates for quiescent services | |
net.ipv4.vs.expire_quiescent_template = 1 | |
# Relax strict TCP/UDP state checks (often needed in K8s due to health checks, probes, etc.) | |
net.ipv4.vs.sloppy_tcp = 1 | |
net.ipv4.vs.sloppy_udp = 1 | |
########################## | |
# Conntrack Tuning # | |
########################## | |
# Maximum number of connection tracking entries | |
# Default is often low (e.g., 65536). Increase significantly for high connection loads in K8s. | |
# A common value is 1048576 (1M) or higher depending on node size and workload. | |
net.netfilter.nf_conntrack_max = 1048576 | |
# Connection tracking timeouts (seconds) | |
# Adjust based on workload characteristics. Shorter timeouts free up conntrack entries faster, | |
# but too short can break legitimate long-lived connections. | |
# Default is 5 days (432000); 12 hours is often sufficient
net.netfilter.nf_conntrack_tcp_timeout_established = 43200
# Default 60
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
# Default 120, can be reduced
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
# Default 120 (matches 2*MSL)
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
# Kernel default is 600; reduced to 60 here (used for protocols without a dedicated tracker)
net.netfilter.nf_conntrack_generic_timeout = 60
########################## | |
# IPv6 Tuning (Optional - Uncomment and adjust if IPv6 is used) # | |
########################## | |
# The net.ipv4.tcp_* and net.core.* settings above also govern TCP and socket buffers for IPv6 traffic.
# The kernel has no net.ipv6.conf.*.tcp_congestion_control or net.ipv6.conf.*.{rmem,wmem}_{default,max}
# keys, so none of those settings need (or can) be repeated under net.ipv6.
# IPv6 counterparts of the IPv4 hardening settings (uncomment if IPv6 is used in your cluster):
# net.ipv6.conf.all.accept_redirects = 0
# net.ipv6.conf.default.accept_redirects = 0
# net.ipv6.conf.all.accept_source_route = 0
# net.ipv6.conf.default.accept_source_route = 0
# There is no net.ipv6.conf.*.rp_filter key; IPv6 reverse-path filtering has to be done with firewall rules.
# net.bridge.bridge-nf-call-ip6tables is already set above.
# net.netfilter.nf_conntrack_max and the net.netfilter.nf_conntrack_*_timeout_* keys above are shared
# between IPv4 and IPv6; there are no separate IPv6 conntrack timeout keys.
# To disable IPv6 entirely when it is not needed, uncomment:
# net.ipv6.conf.all.disable_ipv6 = 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/80-pve.conf | |
# Large Production Proxmox VE Cluster Host Tuning | |
# Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel) | |
# Based on original by Kawin Viriyaprasopsook <[email protected]> | |
# Run `sysctl --system` after installing this file | |
######################## | |
# 1) System Options # | |
######################## | |
# Reduce console noise from kernel messages | |
# Format: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel | |
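# 3 4 1 3: console shows only messages more severe than level 3 (i.e. emerg/alert/crit);
# messages without an explicit level get 4 (warning); console_loglevel can never be set below 1; default console level is 3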
kernel.printk = 3 4 1 3 | |
# Control swapping behavior | |
# 0: Only swap to avoid OOM | |
# 1-10: Prefer keeping anonymous memory in RAM, swap out file-backed pages more readily | |
# 10 is a common starting point for hypervisors to keep guest memory in RAM. Adjust based on monitoring. | |
vm.swappiness = 10 | |
# Memory overcommit policy | |
# 0: Heuristic overcommit (default) | |
# 1: Always overcommit (allows processes to request more memory than available, risking OOM kills) | |
# 2: Never overcommit (strict accounting, can fail large allocations even if memory is free) | |
# 1 is often required by applications like Redis for background saves. | |
# WARNING: Setting vm.overcommit_memory=1 on a hypervisor host can be risky. | |
# If guests overcommit their own memory, and the host also overcommits, | |
# you can potentially trigger an OOM killer event on the host itself. | |
# Monitor memory usage carefully. Consider 0 or 2 if not strictly required by host applications. | |
vm.overcommit_memory = 1 | |
# Maximum number of open file descriptors system-wide | |
# Increase for systems running many guests/processes, each potentially opening many files/sockets. | |
# This value is very high and should be sufficient for most workloads. | |
fs.file-max = 2097152 | |
# Maximum number of PIDs (processes/threads) | |
# Increase for systems running many guests/processes. | |
# 65536 is a standard high value, often sufficient, but consider increasing on very dense nodes. | |
kernel.pid_max = 65536 | |
# Incoming TCP connection backlog (accept queue: upper bound on the listen() backlog) for host services (PVE API, SSH, etc.)
# Increase for servers handling high rates of new connections to host services. 32768 is generous. | |
net.core.somaxconn = 32768 | |
######################## | |
# 2) Network Core # | |
######################## | |
# Maximum backlog for packets arriving at the network device before being processed by the kernel | |
# Increase for high packet rates on high-speed NICs to avoid drops. 30000 is a high value. | |
net.core.netdev_max_backlog = 30000 | |
# Default and Maximum socket buffer sizes (bytes) | |
# These values are used by default and as limits for auto-tuning. | |
# They are generous and suitable for 10Gbps+ networks common in clusters. | |
# Actual buffer sizes are often auto-tuned by the kernel within these limits. | |
# Default receive buffer size (30MB)
net.core.rmem_default = 31457280
# Default send buffer size (30MB)
net.core.wmem_default = 31457280
# Max receive buffer size (64MB)
net.core.rmem_max = 67108864
# Max send buffer size (64MB)
net.core.wmem_max = 67108864
# Default queueing discipline for network interfaces | |
# fq_codel (Fair Queueing with CoDel) is a modern algorithm that reduces bufferbloat and improves fairness. | |
net.core.default_qdisc = fq_codel | |
######################## | |
# 3) IPv4 TCP Tuning # | |
######################## | |
# Essential TCP features for modern performance | |
# Enable TCP window scaling (required for high bandwidth connections)
net.ipv4.tcp_window_scaling = 1
# Enable TCP Selective Acknowledgement (improves recovery from packet loss)
net.ipv4.tcp_sack = 1
# Enable TCP timestamps (improves RTT calculation, helps with PAWS/TW)
net.ipv4.tcp_timestamps = 1
# Enable TCP Fast Open (3 = enable client and server support, reduces latency for repeated connections)
net.ipv4.tcp_fastopen = 3
# SYN backlog & flood protection | |
# Maximum number of entries in the SYN queue (SYN_RECV state) | |
# Increase for servers handling high rates of new connections. 8192 is a good value. | |
net.ipv4.tcp_max_syn_backlog = 8192 | |
# Enable SYN cookies (protects against SYN floods when SYN queue is full) | |
net.ipv4.tcp_syncookies = 1 | |
# Reduce SYN retransmits before giving up | |
net.ipv4.tcp_syn_retries = 2 | |
net.ipv4.tcp_synack_retries = 2 | |
# Local port range for outgoing connections (ephemeral ports) | |
# Increase if the host or guests using host's IP initiate many short-lived connections | |
# and tcp_tw_reuse is off. 1024-65535 is the widest possible range. | |
net.ipv4.ip_local_port_range = 1024 65535 | |
# Congestion control | |
# BBR (Bottleneck Bandwidth and RTT) is excellent for throughput and latency on modern internet paths. | |
# Requires kernel support (usually 4.9+). 'cubic' is the default and a good fallback. | |
net.ipv4.tcp_congestion_control= bbr | |
# Low watermark for TCP send buffer (bytes) | |
# Helps reduce latency for small writes by preventing excessive buffering before sending. | |
# 16384 bytes = 16KB
net.ipv4.tcp_notsent_lowat = 16384
# Disable slow start after idle periods (improves performance for intermittent traffic) | |
net.ipv4.tcp_slow_start_after_idle = 0 | |
# Enable MTU probing (helps avoid fragmentation issues by discovering path MTU) | |
net.ipv4.tcp_mtu_probing = 1 | |
# Disable saving TCP metrics across connections (can save memory and CPU, minor impact) | |
# Useful on servers handling connections from a vast number of unique clients. | |
net.ipv4.tcp_no_metrics_save = 1 | |
# Enable TCP Corking auto-detection (improves efficiency for small writes followed by large writes) | |
net.ipv4.tcp_autocorking = 1 | |
######################## | |
# 4) IPv4 Other Tuning # | |
######################## | |
# IP Fragmentation | |
# Memory thresholds for reassembling IP fragments (bytes) | |
# High: Start dropping fragments when memory exceeds this | |
# Low: Stop dropping fragments when memory drops below this | |
# High threshold: 256KB
net.ipv4.ipfrag_high_thresh = 262144
# Low threshold: 192KB
net.ipv4.ipfrag_low_thresh = 196608
# Time to keep fragments in memory before discarding (seconds) | |
net.ipv4.ipfrag_time = 30 | |
# Time‑Wait Handling | |
# tcp_tw_recycle is DANGEROUS and breaks NAT/Load Balancers/Bridging. Keep DISABLED.
# The key was removed in Linux 4.12, so assigning it on newer kernels fails with an
# unknown-key error. It defaulted to 0 on older kernels, so the assignment is left
# commented out; uncomment only on a pre-4.12 kernel if you want it stated explicitly.
# net.ipv4.tcp_tw_recycle = 0
# tcp_tw_reuse is also problematic with NAT/Bridging and generally not recommended on servers. Keep DISABLED. | |
net.ipv4.tcp_tw_reuse = 0 | |
# Maximum number of sockets in TIME_WAIT state | |
# Relevant when tcp_tw_reuse is off. Increase if hitting limits under heavy load. | |
# 1440000 is a very high value, suitable for nodes handling a massive number of short connections. | |
net.ipv4.tcp_max_tw_buckets = 1440000 | |
# TCP Keepalives (Host Services) | |
# Detect dead connections faster and free resources for host services (PVE API, SSH, etc.) | |
# Default: 7200 9 75 (2 hours idle, 9 probes, 75s interval) | |
# Shorter times are recommended for servers to reclaim resources from idle clients sooner. | |
# Send first keepalive after 10 minutes idle
net.ipv4.tcp_keepalive_time = 600
# Wait 30 seconds between probes
net.ipv4.tcp_keepalive_intvl = 30
# Send up to 6 probes
net.ipv4.tcp_keepalive_probes = 6
######################## | |
# 5) Security & ICMP # | |
######################## | |
# Disable redirects & source routing (Security hardening - prevents malicious routing) | |
net.ipv4.conf.all.accept_redirects = 0 | |
net.ipv4.conf.default.accept_redirects = 0 | |
net.ipv4.conf.all.accept_source_route = 0 | |
net.ipv4.conf.default.accept_source_route= 0 | |
# IP spoofing & logging (Security hardening) | |
# rp_filter: Source validation (1=strict, 2=loose). Strict is recommended if routing is simple. | |
net.ipv4.conf.all.rp_filter = 1 | |
# Apply to default interface settings too
net.ipv4.conf.default.rp_filter = 1
# Log packets with impossible addresses (martians) | |
net.ipv4.conf.all.log_martians = 1 | |
# ICMP hardening | |
# Ignore broadcast ICMP requests (reduces response to floods)
net.ipv4.icmp_echo_ignore_broadcasts = 1
# Ignore ICMP timestamp requests (reduces information leakage)
net.ipv4.icmp_timestamp_ignore_all = 1
######################## | |
# 6) Routing & Bridge # | |
######################## | |
# Enable IP forwarding (ESSENTIAL for hypervisor nodes to route traffic between guests and external networks) | |
net.ipv4.ip_forward = 1 | |
# Enable netfilter processing for bridged IP packets (ESSENTIAL for PVE firewall and guest networking) | |
net.bridge.bridge-nf-call-iptables = 1 | |
net.bridge.bridge-nf-call-ip6tables = 1 | |
# net.bridge.bridge-nf-call-arptables = 1 # Less common, enable if needed for ARP filtering | |
######################## | |
# 7) Conntrack Tuning # | |
######################## | |
# Maximum number of connection tracking entries | |
# Default is often low (e.g., 65536). Increase significantly for high connection loads from many guests. | |
# 1000000 (1M) is a good starting point for busy nodes. Monitor usage (`/proc/sys/net/netfilter/nf_conntrack_count`). | |
net.netfilter.nf_conntrack_max = 1000000 | |
# Connection tracking timeouts (seconds) | |
# Adjust based on workload characteristics. Shorter timeouts free up conntrack entries faster, | |
# but too short can break legitimate long-lived connections. | |
# Default established is 5 days (432000). 24 hours (86400) is a common reduction. | |
net.netfilter.nf_conntrack_tcp_timeout_established = 86400 | |
# Default 60
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
# Default 120, can be reduced
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
# Default 120 (matches 2*MSL)
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
# Kernel default is 600; reduced to 60 here (used for protocols without a dedicated tracker)
net.netfilter.nf_conntrack_generic_timeout = 60
######################## | |
# 8) VM & Storage # | |
######################## | |
# Control how aggressively the kernel reclaims memory used for filesystem metadata caches (dentries and inodes)
# Lower values (closer to 0) mean the kernel is less aggressive about reclaiming memory | |
# used for caching filesystem metadata. Higher values (closer to 100) mean it's more aggressive. | |
# Default is 100. Reducing this can help keep filesystem metadata in RAM, potentially | |
# improving performance for file-heavy operations, but uses more memory. | |
# A value of 50 is a common compromise. Adjust based on monitoring. | |
vm.vfs_cache_pressure = 50 | |
# Percentage of total system memory that can be filled with 'dirty' pages (modified data | |
# that hasn't been written to disk) before the system starts actively writing it back. | |
# vm.dirty_ratio = 15 # Adjust based on storage performance | |
# Percentage of total system memory that can be filled with 'dirty' pages before a | |
# background process starts writing it back. | |
# vm.dirty_background_ratio = 5 # Adjust based on storage performance | |
# Note: vm.dirty_ratio and vm.dirty_background_ratio are CRITICAL for storage performance. | |
# The optimal values depend heavily on your storage type (SSD, NVMe, HDD, network storage) | |
# and workload. High values can lead to large write bursts and potential I/O stalls. | |
# Low values can cause excessive small writes. The values 15/5 are starting points; | |
# monitor I/O wait and adjust carefully. Consider using absolute bytes instead of percentages | |
# (vm.dirty_bytes, vm.dirty_background_bytes) on systems with large amounts of RAM. | |
######################## | |
# 9) IPv6 Tuning (Optional) # | |
######################## | |
# The net.ipv4.tcp_* and net.core.* settings above also govern TCP and socket buffers for IPv6 traffic.
# The kernel has no net.ipv6.conf.*.tcp_congestion_control or net.ipv6.conf.*.{rmem,wmem}_{default,max}
# keys, so none of those settings need (or can) be repeated under net.ipv6.
# IPv6 counterparts of the IPv4 hardening settings (uncomment if IPv6 is used on the host or by guests):
# net.ipv6.conf.all.accept_redirects = 0
# net.ipv6.conf.default.accept_redirects = 0
# net.ipv6.conf.all.accept_source_route = 0
# net.ipv6.conf.default.accept_source_route = 0
# There is no net.ipv6.conf.*.rp_filter key; IPv6 reverse-path filtering has to be done with firewall rules.
# net.bridge.bridge-nf-call-ip6tables is already set above.
# net.netfilter.nf_conntrack_max and the net.netfilter.nf_conntrack_*_timeout_* keys above are shared
# between IPv4 and IPv6; there are no separate IPv6 conntrack timeout keys.
# To disable IPv6 entirely when it is not needed, uncomment:
# net.ipv6.conf.all.disable_ipv6 = 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment