Skip to content

Instantly share code, notes, and snippets.

@bouroo
Last active April 20, 2025 08:04
Show Gist options
  • Save bouroo/bc52ad58a6e75d44e5235b229e9ca988 to your computer and use it in GitHub Desktop.
Save bouroo/bc52ad58a6e75d44e5235b229e9ca988 to your computer and use it in GitHub Desktop.
Kernel tuning for dedicated linux server. /etc/sysctl.d/60-sysctl.conf
# /etc/sysctl.d/60-sysctl.conf
# High-performance settings for API/Web server
# Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel)
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Run `sysctl --system` after installing this file
########################
# System Options #
########################
# Reduce console noise from kernel messages
# Format: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel
# 3 4 1 3: errors to console, warnings+ to messages, minimum 1 (emerg), default 3 (crit)
kernel.printk = 3 4 1 3
# Control swapping behavior
# 0: Only swap to avoid OOM
# 1-10: Prefer keeping anonymous memory in RAM, swap out file-backed pages more readily
# 10-60: Balanced approach
# 60-100: Aggressively swap out anonymous memory
# 10 is a common starting point for servers with ample RAM. Adjust based on monitoring.
vm.swappiness = 10
# Memory overcommit policy
# 0: Heuristic overcommit (default)
# 1: Always overcommit (allows processes to request more memory than available, risking OOM kills)
# 2: Never overcommit (strict accounting, can fail large allocations even if memory is free)
# 1 is often required by applications like Redis for background saves. Use with caution and monitor memory usage.
vm.overcommit_memory = 1
# Maximum number of open file descriptors system-wide
# Increase for applications handling many concurrent connections (web servers, databases, etc.)
# This value is very high and should be sufficient for most workloads.
# NOTE: per-process limits (ulimit -n / systemd LimitNOFILE) must be raised separately.
fs.file-max = 2097152
# Maximum number of PIDs (processes/threads)
# 65536 is a standard high value, suitable for systems with many processes or threads.
kernel.pid_max = 65536
# Accept-queue limit for listening sockets
# Caps the listen() backlog: the number of fully-established connections
# waiting for the application to accept() them.
# NOTE(review): this is NOT the SYN_RCVD (half-open) queue — that is
# net.ipv4.tcp_max_syn_backlog. Increase for servers handling high rates
# of new connections. 32768 is a generous value.
net.core.somaxconn = 32768
########################
# TCP/IP Tuning #
########################
# Essential TCP features for modern performance
# NOTE: sysctl.conf parsers (procps, systemd-sysctl) do NOT support trailing
# comments on value lines — the comment text becomes part of the value and
# the write fails. All comments below are therefore on their own lines.
# Enable TCP window scaling (required for high bandwidth connections)
net.ipv4.tcp_window_scaling = 1
# Enable TCP Selective Acknowledgement (improves recovery from packet loss)
net.ipv4.tcp_sack = 1
# Enable TCP timestamps (improves RTT calculation, helps with PAWS/TW)
net.ipv4.tcp_timestamps = 1
# Enable TCP Fast Open (3 = client and server support, reduces latency for repeated connections)
net.ipv4.tcp_fastopen = 3
# SYN backlog & flood protection
# Maximum number of entries in the SYN queue (SYN_RECV state)
# Increased from 4096, suitable for higher SYN rates
net.ipv4.tcp_max_syn_backlog = 8192
# Enable SYN cookies (protects against SYN floods when SYN queue is full)
net.ipv4.tcp_syncookies = 1
# Reduce SYN retransmits before giving up
net.ipv4.tcp_syn_retries = 2
net.ipv4.tcp_synack_retries = 2
# Time-wait handling
# tcp_tw_recycle was DANGEROUS (broke NAT/Load Balancers) and was REMOVED in
# kernel 4.12; writing it on modern kernels makes `sysctl --system` report an
# error, so it is left commented out on purpose.
# net.ipv4.tcp_tw_recycle = 0
# tcp_tw_reuse is also problematic with NAT and generally not recommended on servers. Keep DISABLED.
net.ipv4.tcp_tw_reuse = 0
# Congestion control
# BBR (Bottleneck Bandwidth and RTT) is excellent for throughput and latency on modern internet paths.
# Requires kernel support (usually 4.9+, tcp_bbr module). 'cubic' is the default and a good fallback.
net.ipv4.tcp_congestion_control = bbr
# Low watermark for TCP send buffer, in bytes (16 KB)
# Helps reduce latency for small writes by preventing excessive buffering before sending.
net.ipv4.tcp_notsent_lowat = 16384
# Local port range for outgoing connections
# Increase if the server initiates many short-lived connections and tcp_tw_reuse is off.
# 1024-65535 is the widest possible range.
net.ipv4.ip_local_port_range = 1024 65535
# Maximum number of sockets in TIME_WAIT state
# Relevant when tcp_tw_reuse is off. Increase if hitting limits under heavy load.
# Default is often 200000. This value is high and usually sufficient.
net.ipv4.tcp_max_tw_buckets = 200000
# TCP Keepalives (Detect dead connections faster and free resources)
# Default: 7200 9 75 (2 hours idle, 9 probes, 75s interval)
# Shorter times are recommended for servers to reclaim resources from idle clients sooner.
# Send first keepalive after 10 minutes idle
net.ipv4.tcp_keepalive_time = 600
# Send up to 6 probes
net.ipv4.tcp_keepalive_probes = 6
# Wait 30 seconds between probes
net.ipv4.tcp_keepalive_intvl = 30
# Other TCP optimizations
# Disable slow start after idle periods (improves performance for intermittent traffic)
net.ipv4.tcp_slow_start_after_idle = 0
# Enable MTU probing (helps avoid fragmentation issues by discovering path MTU)
net.ipv4.tcp_mtu_probing = 1
# Disable saving TCP metrics across connections (can save memory and CPU, minor impact)
net.ipv4.tcp_no_metrics_save = 1
# Enable TCP auto-corking (improves efficiency for small writes followed by large writes)
net.ipv4.tcp_autocorking = 1
# REMOVED: "net.ipv4.tcp_sack_enabled" is not a real sysctl and caused an
# error; SACK is controlled by net.ipv4.tcp_sack, already set above.
########################
# Network Buffers #
########################
# Maximum backlog for packets arriving at the network device before being processed by the kernel
# Increase for high packet rates to avoid drops. 30000 is a high value.
net.core.netdev_max_backlog = 30000
# Default and Maximum socket buffer sizes (bytes)
# These values are used by default and as limits for auto-tuning.
# They are generous and suitable for 10Gbps+ networks.
# Actual buffer sizes are often auto-tuned by the kernel within these limits.
# (Comments are on their own lines: sysctl.conf does not support trailing comments.)
# Default receive buffer size (30 MB)
net.core.rmem_default = 31457280
# Default send buffer size (30 MB)
net.core.wmem_default = 31457280
# Max receive buffer size (64 MB)
net.core.rmem_max = 67108864
# Max send buffer size (64 MB)
net.core.wmem_max = 67108864
# TCP specific buffer sizes (min, default, max) - Optional, net.core often sufficient
# net.ipv4.tcp_rmem = 4096 87380 67108864
# net.ipv4.tcp_wmem = 4096 65536 67108864
########################
# Queue Discipline #
########################
# Default queueing discipline for network interfaces
# fq_codel (Fair Queueing with CoDel) is a modern algorithm that reduces bufferbloat and improves fairness.
net.core.default_qdisc = fq_codel
########################
# IP Fragmentation #
########################
# Memory thresholds for reassembling IP fragments (bytes)
# High: Start dropping fragments when memory exceeds this
# Low: Stop dropping fragments when memory drops below this
# 256 KB
net.ipv4.ipfrag_high_thresh = 262144
# 192 KB
net.ipv4.ipfrag_low_thresh = 196608
# Time to keep fragments in memory before discarding (seconds)
net.ipv4.ipfrag_time = 30
########################
# Security & ICMP #
########################
# Disable redirects & source routing (Security hardening - prevents malicious routing)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
# IP spoofing & logging (Security hardening)
# rp_filter: Source validation (1=strict, 2=loose). Strict is recommended if routing is simple.
net.ipv4.conf.all.rp_filter = 1
# Apply to default interface settings too
net.ipv4.conf.default.rp_filter = 1
# Log packets with impossible addresses (martians)
net.ipv4.conf.all.log_martians = 1
# ICMP hardening
# Ignore broadcast ICMP requests (reduces response to floods)
net.ipv4.icmp_echo_ignore_broadcasts = 1
# REMOVED: "net.ipv4.icmp_timestamp_ignore_all" is not a mainline Linux
# sysctl and caused a `sysctl --system` error. To block ICMP timestamp
# requests, drop them with firewall rules (iptables/nftables) instead.
# net.ipv4.icmp_timestamp_ignore_all = 1
# IP forwarding (Disable unless server acts as a router/gateway)
net.ipv4.ip_forward = 0
########################
# IPv6 Tuning (Optional - Uncomment and adjust if IPv6 is used) #
########################
# If IPv6 is used, apply similar tuning as IPv4 for consistency and performance.
# NOTE(review): the commented suggestions below reference keys that do not
# exist in mainline Linux (net.ipv6.conf.*.tcp_congestion_control and
# net.ipv6.conf.*.rmem_*/wmem_*). TCP congestion control
# (net.ipv4.tcp_congestion_control) and the net.core.* buffer limits above
# already apply to IPv6 sockets as well — verify before uncommenting
# anything here.
# net.ipv6.conf.all.tcp_congestion_control = bbr
# net.ipv6.conf.default.tcp_congestion_control = bbr
# net.ipv6.conf.all.rmem_default = 31457280
# net.ipv6.conf.all.wmem_default = 31457280
# net.ipv6.conf.all.rmem_max = 67108864
# net.ipv6.conf.all.wmem_max = 67108864
# net.ipv6.conf.default.rmem_default = 31457280
# net.ipv6.conf.default.wmem_default = 31457280
# net.ipv6.conf.default.rmem_max = 67108864
# net.ipv6.conf.default.wmem_max = 67108864
# net.ipv6.conf.all.disable_ipv6 = 1 # Uncomment and set to 1 to disable IPv6 entirely if not needed
# /etc/sysctl.d/80-k8s-ipvs.conf
# Production Kubernetes node tuning with IPVS
# Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel)
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Run `sysctl --system` after installing this file
##########################
# System Options #
##########################
# Reduce console noise from kernel messages
# Format: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel
# 3 4 1 3: errors to console, warnings+ to messages, minimum 1 (emerg), default 3 (crit)
kernel.printk = 3 4 1 3
# Control swapping behavior
# 0: Only swap to avoid OOM
# 1-10: Prefer keeping anonymous memory in RAM, swap out file-backed pages more readily
# 10-60: Balanced approach
# 60-100: Aggressively swap out anonymous memory
# 10 is a common starting point for servers with ample RAM. Adjust based on monitoring.
# NOTE(review): kubelet typically requires swap to be disabled entirely
# (or --fail-swap-on=false); with swap off this setting has no effect — confirm.
vm.swappiness = 10
# Memory overcommit policy
# 0: Heuristic overcommit (default)
# 1: Always overcommit (allows processes to request more memory than available, risking OOM kills)
# 2: Never overcommit (strict accounting, can fail large allocations even if memory is free)
# 1 is often required by applications like Redis for background saves, and can be beneficial in K8s
# to allow pods to potentially burst memory, but requires careful monitoring to avoid node OOMs.
vm.overcommit_memory = 1
# Maximum number of open file descriptors system-wide
# Increase for systems running many containers/pods, each potentially opening many files/sockets.
# This value is very high and should be sufficient for most workloads.
fs.file-max = 2097152
# Maximum number of PIDs (processes/threads)
# Increase for systems running many containers/pods, as each container can have multiple processes.
# 65536 is a standard high value, often sufficient, but consider increasing on very dense nodes
# (on 64-bit kernels pid_max can be raised as far as 4194304 — TODO confirm node density needs).
kernel.pid_max = 65536
##########################
# TCP Performance #
##########################
# Essential TCP features for modern performance
# NOTE: sysctl.conf parsers (procps, systemd-sysctl) do NOT support trailing
# comments on value lines — the comment text becomes part of the value and
# the write fails. All comments below are therefore on their own lines.
# Enable TCP window scaling (required for high bandwidth connections)
net.ipv4.tcp_window_scaling = 1
# Enable TCP Selective Acknowledgement (improves recovery from packet loss)
net.ipv4.tcp_sack = 1
# Enable TCP timestamps (improves RTT calculation, helps with PAWS/TW)
net.ipv4.tcp_timestamps = 1
# Enable TCP Fast Open (3 = client and server support, reduces latency for repeated connections)
net.ipv4.tcp_fastopen = 3
# SYN backlog & flood protection
# Maximum number of entries in the SYN queue (SYN_RECV state)
# Increase for servers handling high rates of new connections. 8192 is a good value.
net.ipv4.tcp_max_syn_backlog = 8192
# Enable SYN cookies (protects against SYN floods when SYN queue is full)
net.ipv4.tcp_syncookies = 1
# Reduce SYN retransmits before giving up
net.ipv4.tcp_syn_retries = 2
net.ipv4.tcp_synack_retries = 2
# Local port range for outgoing connections (ephemeral ports)
# Increase if the node initiates many short-lived connections (e.g., health checks, API calls)
# and tcp_tw_reuse is off. 1024-65535 is the widest possible range.
net.ipv4.ip_local_port_range = 1024 65535
# Congestion control
# BBR (Bottleneck Bandwidth and RTT) is excellent for throughput and latency on modern internet paths.
# Requires kernel support (usually 4.9+). 'cubic' is the default and a good fallback.
net.ipv4.tcp_congestion_control = bbr
# Low watermark for TCP send buffer, in bytes (16 KB)
# Helps reduce latency for small writes by preventing excessive buffering before sending.
net.ipv4.tcp_notsent_lowat = 16384
# Disable slow start after idle periods (improves performance for intermittent traffic)
net.ipv4.tcp_slow_start_after_idle = 0
# Enable MTU probing (helps avoid fragmentation issues by discovering path MTU)
net.ipv4.tcp_mtu_probing = 1
# Disable saving TCP metrics across connections (can save memory and CPU, minor impact)
# Useful on servers handling connections from a vast number of unique clients.
net.ipv4.tcp_no_metrics_save = 1
# Enable TCP auto-corking (improves efficiency for small writes followed by large writes)
net.ipv4.tcp_autocorking = 1
##########################
# Network Buffers #
##########################
# Maximum backlog for packets arriving at the network device before being processed by the kernel
# Increase for high packet rates to avoid drops. 30000 is a high value.
net.core.netdev_max_backlog = 30000
# Default and Maximum socket buffer sizes (bytes)
# These values are used by default and as limits for auto-tuning.
# They are generous and suitable for 10Gbps+ networks common in clusters.
# Actual buffer sizes are often auto-tuned by the kernel within these limits.
# (Comments are on their own lines: sysctl.conf does not support trailing comments.)
# Default receive buffer size (30 MB)
net.core.rmem_default = 31457280
# Default send buffer size (30 MB)
net.core.wmem_default = 31457280
# Max receive buffer size (64 MB)
net.core.rmem_max = 67108864
# Max send buffer size (64 MB)
net.core.wmem_max = 67108864
# TCP specific buffer sizes (min, default, max) - Optional, net.core often sufficient
# net.ipv4.tcp_rmem = 4096 87380 67108864
# net.ipv4.tcp_wmem = 4096 65536 67108864
##########################
# Queue Discipline #
##########################
# Default queueing discipline for network interfaces
# fq_codel (Fair Queueing with CoDel) is a modern algorithm that reduces bufferbloat and improves fairness.
net.core.default_qdisc = fq_codel
##########################
# IP Fragmentation #
##########################
# Memory thresholds for reassembling IP fragments (bytes)
# High: Start dropping fragments when memory exceeds this
# Low: Stop dropping fragments when memory drops below this
# 256 KB
net.ipv4.ipfrag_high_thresh = 262144
# 192 KB
net.ipv4.ipfrag_low_thresh = 196608
# Time to keep fragments in memory before discarding (seconds)
net.ipv4.ipfrag_time = 30
##########################
# Time-Wait & Keepalive #
##########################
# tcp_tw_recycle was DANGEROUS (broke NAT/Load Balancers/IPVS) and was
# REMOVED in kernel 4.12; writing it on modern kernels makes
# `sysctl --system` report an error, so it is left commented out on purpose.
# net.ipv4.tcp_tw_recycle = 0
# tcp_tw_reuse is also problematic with NAT/IPVS and generally not recommended on servers. Keep DISABLED.
net.ipv4.tcp_tw_reuse = 0
# Maximum number of sockets in TIME_WAIT state
# Relevant when tcp_tw_reuse is off. Increase if hitting limits under heavy load.
# 1440000 is a very high value, suitable for nodes handling a massive number of short connections.
net.ipv4.tcp_max_tw_buckets = 1440000
# TCP Keepalives (Detect dead connections faster and free resources)
# Default: 7200 9 75 (2 hours idle, 9 probes, 75s interval)
# Shorter times are recommended for servers to reclaim resources from idle clients sooner.
# Send first keepalive after 10 minutes idle
net.ipv4.tcp_keepalive_time = 600
# Wait 60 seconds between probes
net.ipv4.tcp_keepalive_intvl = 60
# Send up to 5 probes
net.ipv4.tcp_keepalive_probes = 5
##########################
# Security & ICMP #
##########################
# Disable redirects & source routing (Security hardening - prevents malicious routing)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
# IP spoofing & logging (Security hardening)
# rp_filter: Source validation (1=strict, 2=loose). Strict is recommended if routing is simple.
# NOTE(review): strict rp_filter can break asymmetric CNI routing — verify with your CNI.
net.ipv4.conf.all.rp_filter = 1
# Apply to default interface settings too
net.ipv4.conf.default.rp_filter = 1
# Log packets with impossible addresses (martians)
net.ipv4.conf.all.log_martians = 1
# ICMP hardening
# Ignore broadcast ICMP requests (reduces response to floods)
net.ipv4.icmp_echo_ignore_broadcasts = 1
# REMOVED: "net.ipv4.icmp_timestamp_ignore_all" is not a mainline Linux
# sysctl and caused a `sysctl --system` error. To block ICMP timestamp
# requests, drop them with firewall rules (iptables/nftables) instead.
# net.ipv4.icmp_timestamp_ignore_all = 1
##########################
# Routing & Bridge #
##########################
# Enable IP forwarding (Essential for Kubernetes nodes to route traffic between pods/services)
net.ipv4.ip_forward = 1
# Enable netfilter processing for bridged IP packets (Essential for K8s networking like Services/NetworkPolicy)
# NOTE: requires the br_netfilter module to be loaded before `sysctl --system`
# runs, otherwise these two writes fail (add br_netfilter to /etc/modules-load.d/).
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
# net.bridge.bridge-nf-call-arptables = 1 # Less common, enable if needed for ARP filtering
##########################
# Kubernetes IPVS #
##########################
# Enable IPVS connection tracking
net.ipv4.vs.conntrack = 1
# Expire connections for unavailable destinations (helps with rolling updates/pod evictions)
net.ipv4.vs.expire_nodest_conn = 1
# Expire templates for quiescent services
net.ipv4.vs.expire_quiescent_template = 1
# Relax strict TCP/UDP state checks (often needed in K8s due to health checks, probes, etc.)
net.ipv4.vs.sloppy_tcp = 1
net.ipv4.vs.sloppy_udp = 1
##########################
# Conntrack Tuning #
##########################
# NOTE: net.netfilter.* and net.ipv4.vs.* keys require the nf_conntrack and
# ip_vs modules to be loaded before `sysctl --system` runs, or the writes fail.
# Maximum number of connection tracking entries
# Default is often low (e.g., 65536). Increase significantly for high connection loads in K8s.
# A common value is 1048576 (1M) or higher depending on node size and workload.
net.netfilter.nf_conntrack_max = 1048576
# Connection tracking timeouts (seconds)
# Adjust based on workload characteristics. Shorter timeouts free up conntrack entries faster,
# but too short can break legitimate long-lived connections.
# Established: default is 5 days (432000); 12 hours (43200) is often sufficient
net.netfilter.nf_conntrack_tcp_timeout_established = 43200
# Default 60
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
# Default 120, can be reduced
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
# Default 120 (matches 2*MSL)
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
# Default 60 (for non-TCP/UDP)
net.netfilter.nf_conntrack_generic_timeout = 60
##########################
# IPv6 Tuning (Optional - Uncomment and adjust if IPv6 is used) #
##########################
# If IPv6 is used in your cluster, apply similar tuning as IPv4 for consistency and performance.
# NOTE(review): several commented suggestions below reference keys that do
# not exist in mainline Linux: net.ipv6.conf.*.tcp_congestion_control,
# net.ipv6.conf.*.rmem_*/wmem_*, net.ipv6.conf.all.rp_filter, and
# net.netfilter.nf_conntrack_ip6_timeout_established (conntrack timeouts are
# shared between IPv4 and IPv6). TCP congestion control and the net.core.*
# buffer limits above already apply to IPv6 sockets. The IPv6
# accept_redirects/accept_source_route keys ARE valid. Verify each key
# before uncommenting.
# net.ipv6.conf.all.tcp_congestion_control = bbr
# net.ipv6.conf.default.tcp_congestion_control = bbr
# net.ipv6.conf.all.rmem_default = 31457280
# net.ipv6.conf.all.wmem_default = 31457280
# net.ipv6.conf.all.rmem_max = 67108864
# net.ipv6.conf.all.wmem_max = 67108864
# net.ipv6.conf.default.rmem_default = 31457280
# net.ipv6.conf.default.wmem_default = 31457280
# net.ipv6.conf.default.rmem_max = 67108864
# net.ipv6.conf.default.wmem_max = 67108864
# net.ipv6.conf.all.accept_redirects = 0
# net.ipv6.conf.default.accept_redirects = 0
# net.ipv6.conf.all.accept_source_route = 0
# net.ipv6.conf.default.accept_source_route = 0
# net.ipv6.conf.all.rp_filter = 1
# net.bridge.bridge-nf-call-ip6tables = 1 # Already included above, but good to note here
# net.netfilter.nf_conntrack_max = 1048576 # Conntrack max is shared between IPv4 and IPv6
# net.netfilter.nf_conntrack_ip6_timeout_established = 43200 # IPv6 specific timeouts if needed
# ... (add other IPv6 conntrack timeouts if desired)
# net.ipv6.conf.all.disable_ipv6 = 1 # Uncomment and set to 1 to disable IPv6 entirely if not needed
# /etc/sysctl.d/80-pve.conf
# Large Production Proxmox VE Cluster Host Tuning
# Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel)
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Run `sysctl --system` after installing this file
########################
# 1) System Options #
########################
# Reduce console noise from kernel messages
# Format: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel
# 3 4 1 3: errors to console, warnings+ to messages, minimum 1 (emerg), default 3 (crit)
kernel.printk = 3 4 1 3
# Control swapping behavior
# 0: Only swap to avoid OOM
# 1-10: Prefer keeping anonymous memory in RAM, swap out file-backed pages more readily
# 10 is a common starting point for hypervisors to keep guest memory in RAM. Adjust based on monitoring.
vm.swappiness = 10
# Memory overcommit policy
# 0: Heuristic overcommit (default)
# 1: Always overcommit (allows processes to request more memory than available, risking OOM kills)
# 2: Never overcommit (strict accounting, can fail large allocations even if memory is free)
# 1 is often required by applications like Redis for background saves.
# WARNING: Setting vm.overcommit_memory=1 on a hypervisor host can be risky.
# If guests overcommit their own memory, and the host also overcommits,
# you can potentially trigger an OOM killer event on the host itself.
# Monitor memory usage carefully. Consider 0 or 2 if not strictly required by host applications.
vm.overcommit_memory = 1
# Maximum number of open file descriptors system-wide
# Increase for systems running many guests/processes, each potentially opening many files/sockets.
# This value is very high and should be sufficient for most workloads.
fs.file-max = 2097152
# Maximum number of PIDs (processes/threads)
# Increase for systems running many guests/processes.
# 65536 is a standard high value, often sufficient, but consider increasing on very dense nodes.
kernel.pid_max = 65536
# Accept-queue limit for host services' listening sockets (PVE API, SSH, etc.)
# Caps the listen() backlog: fully-established connections waiting for accept().
# NOTE(review): this is NOT the SYN_RCVD (half-open) queue — that is
# net.ipv4.tcp_max_syn_backlog. Increase for servers handling high rates of
# new connections to host services. 32768 is generous.
net.core.somaxconn = 32768
########################
# 2) Network Core #
########################
# Maximum backlog for packets arriving at the network device before being processed by the kernel
# Increase for high packet rates on high-speed NICs to avoid drops. 30000 is a high value.
net.core.netdev_max_backlog = 30000
# Default and Maximum socket buffer sizes (bytes)
# These values are used by default and as limits for auto-tuning.
# They are generous and suitable for 10Gbps+ networks common in clusters.
# Actual buffer sizes are often auto-tuned by the kernel within these limits.
# (Comments are on their own lines: sysctl.conf does not support trailing comments.)
# Default receive buffer size (30 MB)
net.core.rmem_default = 31457280
# Default send buffer size (30 MB)
net.core.wmem_default = 31457280
# Max receive buffer size (64 MB)
net.core.rmem_max = 67108864
# Max send buffer size (64 MB)
net.core.wmem_max = 67108864
# Default queueing discipline for network interfaces
# fq_codel (Fair Queueing with CoDel) is a modern algorithm that reduces bufferbloat and improves fairness.
net.core.default_qdisc = fq_codel
########################
# 3) IPv4 TCP Tuning #
########################
# Essential TCP features for modern performance
# NOTE: sysctl.conf parsers (procps, systemd-sysctl) do NOT support trailing
# comments on value lines — the comment text becomes part of the value and
# the write fails. All comments below are therefore on their own lines.
# Enable TCP window scaling (required for high bandwidth connections)
net.ipv4.tcp_window_scaling = 1
# Enable TCP Selective Acknowledgement (improves recovery from packet loss)
net.ipv4.tcp_sack = 1
# Enable TCP timestamps (improves RTT calculation, helps with PAWS/TW)
net.ipv4.tcp_timestamps = 1
# Enable TCP Fast Open (3 = client and server support, reduces latency for repeated connections)
net.ipv4.tcp_fastopen = 3
# SYN backlog & flood protection
# Maximum number of entries in the SYN queue (SYN_RECV state)
# Increase for servers handling high rates of new connections. 8192 is a good value.
net.ipv4.tcp_max_syn_backlog = 8192
# Enable SYN cookies (protects against SYN floods when SYN queue is full)
net.ipv4.tcp_syncookies = 1
# Reduce SYN retransmits before giving up
net.ipv4.tcp_syn_retries = 2
net.ipv4.tcp_synack_retries = 2
# Local port range for outgoing connections (ephemeral ports)
# Increase if the host or guests using host's IP initiate many short-lived connections
# and tcp_tw_reuse is off. 1024-65535 is the widest possible range.
net.ipv4.ip_local_port_range = 1024 65535
# Congestion control
# BBR (Bottleneck Bandwidth and RTT) is excellent for throughput and latency on modern internet paths.
# Requires kernel support (usually 4.9+). 'cubic' is the default and a good fallback.
net.ipv4.tcp_congestion_control = bbr
# Low watermark for TCP send buffer, in bytes (16 KB)
# Helps reduce latency for small writes by preventing excessive buffering before sending.
net.ipv4.tcp_notsent_lowat = 16384
# Disable slow start after idle periods (improves performance for intermittent traffic)
net.ipv4.tcp_slow_start_after_idle = 0
# Enable MTU probing (helps avoid fragmentation issues by discovering path MTU)
net.ipv4.tcp_mtu_probing = 1
# Disable saving TCP metrics across connections (can save memory and CPU, minor impact)
# Useful on servers handling connections from a vast number of unique clients.
net.ipv4.tcp_no_metrics_save = 1
# Enable TCP auto-corking (improves efficiency for small writes followed by large writes)
net.ipv4.tcp_autocorking = 1
########################
# 4) IPv4 Other Tuning #
########################
# IP Fragmentation
# Memory thresholds for reassembling IP fragments (bytes)
# High: Start dropping fragments when memory exceeds this
# Low: Stop dropping fragments when memory drops below this
# 256 KB
net.ipv4.ipfrag_high_thresh = 262144
# 192 KB
net.ipv4.ipfrag_low_thresh = 196608
# Time to keep fragments in memory before discarding (seconds)
net.ipv4.ipfrag_time = 30
# Time-Wait Handling
# tcp_tw_recycle was DANGEROUS (broke NAT/Load Balancers/Bridging) and was
# REMOVED in kernel 4.12; writing it on modern kernels makes
# `sysctl --system` report an error, so it is left commented out on purpose.
# net.ipv4.tcp_tw_recycle = 0
# tcp_tw_reuse is also problematic with NAT/Bridging and generally not recommended on servers. Keep DISABLED.
net.ipv4.tcp_tw_reuse = 0
# Maximum number of sockets in TIME_WAIT state
# Relevant when tcp_tw_reuse is off. Increase if hitting limits under heavy load.
# 1440000 is a very high value, suitable for nodes handling a massive number of short connections.
net.ipv4.tcp_max_tw_buckets = 1440000
# TCP Keepalives (Host Services)
# Detect dead connections faster and free resources for host services (PVE API, SSH, etc.)
# Default: 7200 9 75 (2 hours idle, 9 probes, 75s interval)
# Shorter times are recommended for servers to reclaim resources from idle clients sooner.
# Send first keepalive after 10 minutes idle
net.ipv4.tcp_keepalive_time = 600
# Wait 30 seconds between probes
net.ipv4.tcp_keepalive_intvl = 30
# Send up to 6 probes
net.ipv4.tcp_keepalive_probes = 6
########################
# 5) Security & ICMP #
########################
# Disable redirects & source routing (Security hardening - prevents malicious routing)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
# IP spoofing & logging (Security hardening)
# rp_filter: Source validation (1=strict, 2=loose). Strict is recommended if routing is simple.
net.ipv4.conf.all.rp_filter = 1
# Apply to default interface settings too
net.ipv4.conf.default.rp_filter = 1
# Log packets with impossible addresses (martians)
net.ipv4.conf.all.log_martians = 1
# ICMP hardening
# Ignore broadcast ICMP requests (reduces response to floods)
net.ipv4.icmp_echo_ignore_broadcasts = 1
# REMOVED: "net.ipv4.icmp_timestamp_ignore_all" is not a mainline Linux
# sysctl and caused a `sysctl --system` error. To block ICMP timestamp
# requests, drop them with firewall rules (iptables/nftables) instead.
# net.ipv4.icmp_timestamp_ignore_all = 1
########################
# 6) Routing & Bridge #
########################
# Enable IP forwarding (ESSENTIAL for hypervisor nodes to route traffic between guests and external networks)
net.ipv4.ip_forward = 1
# Enable netfilter processing for bridged IP packets (ESSENTIAL for PVE firewall and guest networking)
# NOTE: requires the br_netfilter module to be loaded before `sysctl --system`
# runs, otherwise these two writes fail (add br_netfilter to /etc/modules-load.d/).
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
# net.bridge.bridge-nf-call-arptables = 1 # Less common, enable if needed for ARP filtering
########################
# 7) Conntrack Tuning #
########################
# NOTE: net.netfilter.* keys require the nf_conntrack module to be loaded
# before `sysctl --system` runs, otherwise these writes fail.
# Maximum number of connection tracking entries
# Default is often low (e.g., 65536). Increase significantly for high connection loads from many guests.
# 1000000 (1M) is a good starting point for busy nodes. Monitor usage (`/proc/sys/net/netfilter/nf_conntrack_count`).
net.netfilter.nf_conntrack_max = 1000000
# Connection tracking timeouts (seconds)
# Adjust based on workload characteristics. Shorter timeouts free up conntrack entries faster,
# but too short can break legitimate long-lived connections.
# Default established is 5 days (432000). 24 hours (86400) is a common reduction.
net.netfilter.nf_conntrack_tcp_timeout_established = 86400
# Default 60
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
# Default 120, can be reduced
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
# Default 120 (matches 2*MSL)
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
# Default 60 (for non-TCP/UDP)
net.netfilter.nf_conntrack_generic_timeout = 60
########################
# 8) VM & Storage #
########################
# Control how aggressively the kernel reclaims memory used for caching
# filesystem metadata (dentries and inodes), relative to page cache reclaim.
# Lower values (closer to 0) make the kernel prefer to keep dentry/inode
# caches in RAM; values above 100 make it reclaim them more eagerly.
# Default is 100. Reducing this can help keep filesystem metadata in RAM, potentially
# improving performance for file-heavy operations, but uses more memory.
# A value of 50 is a common compromise. Adjust based on monitoring.
vm.vfs_cache_pressure = 50
# Percentage of total system memory that can be filled with 'dirty' pages (modified data
# that hasn't been written to disk) before the system starts actively writing it back.
# vm.dirty_ratio = 15 # Adjust based on storage performance
# Percentage of total system memory that can be filled with 'dirty' pages before a
# background process starts writing it back.
# vm.dirty_background_ratio = 5 # Adjust based on storage performance
# Note: vm.dirty_ratio and vm.dirty_background_ratio are CRITICAL for storage performance.
# The optimal values depend heavily on your storage type (SSD, NVMe, HDD, network storage)
# and workload. High values can lead to large write bursts and potential I/O stalls.
# Low values can cause excessive small writes. The values 15/5 are starting points;
# monitor I/O wait and adjust carefully. Consider using absolute bytes instead of percentages
# (vm.dirty_bytes, vm.dirty_background_bytes) on systems with large amounts of RAM.
########################
# 9) IPv6 Tuning (Optional) #
########################
# If IPv6 is used in your cluster (host or guests), apply similar tuning as IPv4.
# NOTE(review): several commented suggestions below reference keys that do
# not exist in mainline Linux: net.ipv6.conf.*.tcp_congestion_control,
# net.ipv6.conf.*.rmem_*/wmem_*, net.ipv6.conf.all.rp_filter, and
# net.netfilter.nf_conntrack_ip6_timeout_established (conntrack timeouts are
# shared between IPv4 and IPv6). TCP congestion control and the net.core.*
# buffer limits above already apply to IPv6 sockets. The IPv6
# accept_redirects/accept_source_route keys ARE valid. Verify each key
# before uncommenting.
# net.ipv6.conf.all.tcp_congestion_control = bbr
# net.ipv6.conf.default.tcp_congestion_control = bbr
# net.ipv6.conf.all.rmem_default = 31457280
# net.ipv6.conf.all.wmem_default = 31457280
# net.ipv6.conf.all.rmem_max = 67108864
# net.ipv6.conf.all.wmem_max = 67108864
# net.ipv6.conf.default.rmem_default = 31457280
# net.ipv6.conf.default.wmem_default = 31457280
# net.ipv6.conf.default.rmem_max = 67108864
# net.ipv6.conf.default.wmem_max = 67108864
# net.ipv6.conf.all.accept_redirects = 0
# net.ipv6.conf.default.accept_redirects = 0
# net.ipv6.conf.all.accept_source_route = 0
# net.ipv6.conf.default.accept_source_route = 0
# net.ipv6.conf.all.rp_filter = 1
# net.bridge.bridge-nf-call-ip6tables = 1 # Already included above, but good to note here
# net.netfilter.nf_conntrack_max = 1000000 # Conntrack max is shared between IPv4 and IPv6
# net.netfilter.nf_conntrack_ip6_timeout_established = 86400 # IPv6 specific timeouts if needed
# ... (add other IPv6 conntrack timeouts if desired)
# net.ipv6.conf.all.disable_ipv6 = 1 # Uncomment and set to 1 to disable IPv6 entirely if not needed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment