bouroo · April 20, 2025 08:04
diff --git a/60-sysctl.conf b/60-sysctl.conf
 # /etc/sysctl.d/60-sysctl.conf
 # High‑performance settings for API/Web server
 # Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel)
 # Based on original by Kawin Viriyaprasopsook <[email protected]>
 # Run `sysctl --system` after installing this file

 ########################
 #   System Options     #
 ########################
 # Console log verbosity.
 # Fields: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel
 # console_loglevel 3 prints only messages more severe than KERN_ERR (emerg, alert, crit);
 # messages logged without a level default to 4 (warning); the console level cannot drop below 1.
 kernel.printk = 3 4 1 3

 # Swap tendency; lower values favour keeping anonymous memory in RAM.
 #   0       swap only to avoid OOM
 #   1-10    strongly prefer reclaiming file-backed pages
 #   10-60   balanced
 #   60-100  swap anonymous memory aggressively
 # 10 is a common starting point for servers with ample RAM; adjust from monitoring data.
 vm.swappiness = 10

 # Memory overcommit policy.
 #   0  heuristic overcommit (kernel default)
 #   1  always overcommit (allocations never refused; risks OOM kills)
 #   2  never overcommit (strict accounting; large allocations can fail despite free memory)
 # 1 is often required by applications such as Redis for background saves. Monitor memory usage.
 vm.overcommit_memory = 1

 # System-wide ceiling on open file handles.
 # Sized for applications holding many concurrent connections (web servers, databases, ...).
 fs.file-max = 2097152

 # Upper bound on PID values (processes and threads).
 # NOTE(review): systemd 243+ ships kernel.pid_max = 4194304 on 64-bit hosts (50-pid-max.conf);
 # this file sorts later and would lower that value - confirm this is intended.
 kernel.pid_max = 65536

 # Accept-queue limit: upper bound applied to the backlog argument of listen(), i.e. fully
 # established connections waiting for accept(). Half-open (SYN_RECV) connections are
 # governed by net.ipv4.tcp_max_syn_backlog instead.
 net.core.somaxconn = 32768

 ########################
 #   TCP/IP Tuning      #
 ########################
 # Essential TCP features for modern performance.
 # Comments are kept on their own lines: sysctl.d(5) only recognises whole-line comments,
 # so trailing text after a value would be written to /proc/sys as part of the value.
 # Window scaling (required for high bandwidth-delay-product connections).
 net.ipv4.tcp_window_scaling = 1
 # Selective acknowledgement (faster recovery from packet loss).
 net.ipv4.tcp_sack = 1
 # Timestamps (better RTT estimation, required for PAWS).
 net.ipv4.tcp_timestamps = 1
 # TCP Fast Open: 3 = enabled for both client and server sockets.
 net.ipv4.tcp_fastopen = 3

 # SYN backlog & flood protection
 # Maximum number of half-open connections (SYN_RECV state) remembered per listener.
 net.ipv4.tcp_max_syn_backlog = 8192

 # SYN cookies: keep accepting connections when the SYN queue overflows (SYN flood protection).
 net.ipv4.tcp_syncookies = 1

 # Fewer SYN / SYN-ACK retransmissions before giving up.
 net.ipv4.tcp_syn_retries = 2
 net.ipv4.tcp_synack_retries = 2

 # TIME_WAIT handling
 # net.ipv4.tcp_tw_recycle was removed in Linux 4.12, so assigning it on current kernels only
 # produces an error. It defaulted to 0 on older kernels and must never be enabled there
 # (it breaks clients behind NAT / load balancers).
 # Reuse of TIME_WAIT sockets for new outgoing connections stays disabled.
 net.ipv4.tcp_tw_reuse = 0

 # Congestion control
 # BBR (Bottleneck Bandwidth and RTT) needs kernel support (4.9+); 'cubic' is the usual fallback.
 net.ipv4.tcp_congestion_control = bbr

 # Unsent-data low watermark for TCP send buffers: 16 KB.
 # Limits how much not-yet-sent data is buffered per socket, reducing latency for small writes.
 net.ipv4.tcp_notsent_lowat = 16384

 # Ephemeral port range for outgoing connections.
 # Widened for servers that initiate many short-lived connections while tcp_tw_reuse is off.
 net.ipv4.ip_local_port_range = 1024 65535

 # Maximum number of sockets held in TIME_WAIT; excess sockets are destroyed immediately.
 net.ipv4.tcp_max_tw_buckets = 200000

 # TCP keepalives (detect dead peers sooner and free their resources).
 # Kernel defaults: 7200 s idle, 9 probes, 75 s interval.
 # Idle time before the first probe: 10 minutes.
 net.ipv4.tcp_keepalive_time = 600
 # Unanswered probes before the connection is dropped.
 net.ipv4.tcp_keepalive_probes = 6
 # Seconds between probes.
 net.ipv4.tcp_keepalive_intvl = 30

 # Other TCP optimizations
 # Keep the congestion window after idle periods (helps intermittent traffic).
 net.ipv4.tcp_slow_start_after_idle = 0

 # Packetization-layer path MTU discovery: 1 = activate when an ICMP black hole is detected.
 net.ipv4.tcp_mtu_probing = 1

 # Do not cache per-destination TCP metrics from closed connections.
 net.ipv4.tcp_no_metrics_save = 1

 # Automatic corking: coalesce small consecutive writes into fewer packets.
 net.ipv4.tcp_autocorking = 1

 # SACK is controlled solely by net.ipv4.tcp_sack (set above). The key
 # net.ipv4.tcp_sack_enabled does not exist, so its former assignment here always failed.

 ########################
 #   Network Buffers    #
 ########################
 # Per-CPU input queue length for packets that arrive faster than the kernel processes them.
 # Raised for high packet rates to avoid drops.
 net.core.netdev_max_backlog = 30000

 # Generic socket buffer sizes (bytes). Comments are kept on their own lines because
 # sysctl.d(5) has no inline-comment syntax.
 #   *_default  initial buffer size of sockets that do not set one themselves; TCP sockets
 #              take their defaults from net.ipv4.tcp_rmem / tcp_wmem instead.
 #   *_max      ceiling for explicit SO_RCVBUF / SO_SNDBUF requests.
 # NOTE(review): a 30 MB default applies to every non-TCP socket (e.g. UDP) - confirm intended.
 # Default receive buffer: 30 MB
 net.core.rmem_default = 31457280
 # Default send buffer: 30 MB
 net.core.wmem_default = 31457280
 # Maximum receive buffer: 64 MB
 net.core.rmem_max = 67108864
 # Maximum send buffer: 64 MB
 net.core.wmem_max = 67108864

 # TCP buffer auto-tuning limits: min, default, max (bytes).
 # TCP auto-tuning is bounded by these keys, not by net.core.*; uncomment to let TCP
 # windows grow up to 64 MB.
 # net.ipv4.tcp_rmem = 4096 87380 67108864
 # net.ipv4.tcp_wmem = 4096 65536 67108864

 ########################
 #   Queue Discipline   #
 ########################
 # Queueing discipline attached to network interfaces by default.
 # fq_codel (fair queueing + CoDel) reduces bufferbloat and improves fairness between flows.
 # NOTE(review): kernels older than 4.13 need the 'fq' qdisc for BBR pacing - confirm target kernels.
 net.core.default_qdisc = fq_codel

 ########################
 #   IP Fragmentation   #
 ########################
 # Memory limits for IP fragment reassembly (bytes). Comments sit on their own lines
 # because sysctl.d(5) does not support trailing comments.
 # High threshold (256 KB): fragments are dropped once reassembly memory exceeds this.
 net.ipv4.ipfrag_high_thresh = 262144
 # Low threshold (192 KB): dropping stops once memory falls below this.
 net.ipv4.ipfrag_low_thresh = 196608

 # Seconds an incomplete fragment set is kept before being discarded.
 net.ipv4.ipfrag_time = 30

 ########################
 #   Security & ICMP    #
 ########################
 # Refuse ICMP redirects and source-routed packets (prevents malicious route manipulation).
 net.ipv4.conf.all.accept_redirects = 0
 net.ipv4.conf.default.accept_redirects = 0
 net.ipv4.conf.all.accept_source_route = 0
 net.ipv4.conf.default.accept_source_route = 0

 # Reverse-path filtering against spoofed source addresses (1 = strict, 2 = loose).
 # Strict mode is recommended when routing is simple and symmetric.
 net.ipv4.conf.all.rp_filter = 1
 # Same value for interfaces created later.
 net.ipv4.conf.default.rp_filter = 1

 # Log packets with impossible addresses (martians).
 net.ipv4.conf.all.log_martians = 1

 # ICMP hardening
 # Ignore echo requests addressed to broadcast/multicast (limits amplification floods).
 net.ipv4.icmp_echo_ignore_broadcasts = 1
 # Linux has no net.ipv4.icmp_timestamp_ignore_all key; the former assignment only produced
 # an error. Drop ICMP timestamp requests (type 13) in the firewall if that is required.

 # IP forwarding stays off unless the server acts as a router/gateway.
 net.ipv4.ip_forward = 0

 ########################
 #   IPv6 Tuning (Optional - Uncomment and adjust if IPv6 is used) #
 ########################
 # The net.ipv4.tcp_* settings above (including tcp_congestion_control) and the net.core.*
 # buffer limits also govern IPv6 sockets, so no IPv6-specific copies are needed.
 # The following keys do not exist and must not be added:
 #   net.ipv6.conf.{all,default}.tcp_congestion_control
 #   net.ipv6.conf.{all,default}.rmem_default / wmem_default / rmem_max / wmem_max
 # Uncomment the next line to disable IPv6 entirely if it is not needed:
 # net.ipv6.conf.all.disable_ipv6 = 1
diff --git a/80-k8s-ipvs.conf b/80-k8s-ipvs.conf
 # /etc/sysctl.d/80-k8s-ipvs.conf
 # Production Kubernetes node tuning with IPVS
 # Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel)
 # Based on original by Kawin Viriyaprasopsook <[email protected]>
 # Run `sysctl --system` after installing this file

 ##########################
 #   System Options       #
 ##########################
 # Console log verbosity.
 # Fields: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel
 # console_loglevel 3 prints only messages more severe than KERN_ERR (emerg, alert, crit);
 # messages logged without a level default to 4 (warning); the console level cannot drop below 1.
 kernel.printk = 3 4 1 3

 # Swap tendency; lower values favour keeping anonymous memory in RAM.
 #   0       swap only to avoid OOM
 #   1-10    strongly prefer reclaiming file-backed pages
 #   10-60   balanced
 #   60-100  swap anonymous memory aggressively
 # 10 is a common starting point for servers with ample RAM; adjust from monitoring data.
 vm.swappiness = 10

 # Memory overcommit policy.
 #   0  heuristic overcommit (kernel default)
 #   1  always overcommit (allocations never refused; risks OOM kills)
 #   2  never overcommit (strict accounting; large allocations can fail despite free memory)
 # 1 is often required by applications such as Redis for background saves and lets pods
 # burst memory, but it needs careful monitoring to avoid node-level OOMs.
 vm.overcommit_memory = 1

 # System-wide ceiling on open file handles.
 # Sized for nodes running many containers/pods, each opening many files and sockets.
 fs.file-max = 2097152

 # Upper bound on PID values (processes and threads across all containers).
 # NOTE(review): systemd 243+ ships kernel.pid_max = 4194304 on 64-bit hosts (50-pid-max.conf);
 # this file sorts later and would lower that value - confirm this is intended on dense nodes.
 kernel.pid_max = 65536

 ##########################
 #   TCP Performance      #
 ##########################
 # Essential TCP features for modern performance.
 # Comments are kept on their own lines: sysctl.d(5) only recognises whole-line comments,
 # so trailing text after a value would be written to /proc/sys as part of the value.
 # Window scaling (required for high bandwidth-delay-product connections).
 net.ipv4.tcp_window_scaling = 1
 # Selective acknowledgement (faster recovery from packet loss).
 net.ipv4.tcp_sack = 1
 # Timestamps (better RTT estimation, required for PAWS).
 net.ipv4.tcp_timestamps = 1
 # TCP Fast Open: 3 = enabled for both client and server sockets.
 net.ipv4.tcp_fastopen = 3

 # SYN backlog & flood protection
 # Maximum number of half-open connections (SYN_RECV state) remembered per listener.
 net.ipv4.tcp_max_syn_backlog = 8192

 # SYN cookies: keep accepting connections when the SYN queue overflows (SYN flood protection).
 net.ipv4.tcp_syncookies = 1

 # Fewer SYN / SYN-ACK retransmissions before giving up.
 net.ipv4.tcp_syn_retries = 2
 net.ipv4.tcp_synack_retries = 2

 # Ephemeral port range for outgoing connections.
 # Widened for nodes that initiate many short-lived connections (health checks, API calls)
 # while tcp_tw_reuse is off.
 net.ipv4.ip_local_port_range = 1024 65535

 # Congestion control
 # BBR (Bottleneck Bandwidth and RTT) needs kernel support (4.9+); 'cubic' is the usual fallback.
 net.ipv4.tcp_congestion_control = bbr

 # Unsent-data low watermark for TCP send buffers: 16 KB.
 # Limits how much not-yet-sent data is buffered per socket, reducing latency for small writes.
 net.ipv4.tcp_notsent_lowat = 16384

 # Keep the congestion window after idle periods (helps intermittent traffic).
 net.ipv4.tcp_slow_start_after_idle = 0

 # Packetization-layer path MTU discovery: 1 = activate when an ICMP black hole is detected.
 net.ipv4.tcp_mtu_probing = 1

 # Do not cache per-destination TCP metrics from closed connections.
 # Useful on servers talking to a vast number of unique clients.
 net.ipv4.tcp_no_metrics_save = 1

 # Automatic corking: coalesce small consecutive writes into fewer packets.
 net.ipv4.tcp_autocorking = 1

 ##########################
 #   Network Buffers       #
 ##########################
 # Per-CPU input queue length for packets that arrive faster than the kernel processes them.
 # Raised for high packet rates to avoid drops.
 net.core.netdev_max_backlog = 30000

 # Generic socket buffer sizes (bytes). Comments are kept on their own lines because
 # sysctl.d(5) has no inline-comment syntax.
 #   *_default  initial buffer size of sockets that do not set one themselves; TCP sockets
 #              take their defaults from net.ipv4.tcp_rmem / tcp_wmem instead.
 #   *_max      ceiling for explicit SO_RCVBUF / SO_SNDBUF requests.
 # NOTE(review): a 30 MB default applies to every non-TCP socket (e.g. UDP) - confirm intended.
 # Default receive buffer: 30 MB
 net.core.rmem_default = 31457280
 # Default send buffer: 30 MB
 net.core.wmem_default = 31457280
 # Maximum receive buffer: 64 MB
 net.core.rmem_max = 67108864
 # Maximum send buffer: 64 MB
 net.core.wmem_max = 67108864

 # TCP buffer auto-tuning limits: min, default, max (bytes).
 # TCP auto-tuning is bounded by these keys, not by net.core.*; uncomment to let TCP
 # windows grow up to 64 MB.
 # net.ipv4.tcp_rmem = 4096 87380 67108864
 # net.ipv4.tcp_wmem = 4096 65536 67108864

 ##########################
 #   Queue Discipline      #
 ##########################
 # Queueing discipline attached to network interfaces by default.
 # fq_codel (fair queueing + CoDel) reduces bufferbloat and improves fairness between flows.
 # NOTE(review): kernels older than 4.13 need the 'fq' qdisc for BBR pacing - confirm target kernels.
 net.core.default_qdisc = fq_codel

 ##########################
 #   IP Fragmentation      #
 ##########################
 # Memory limits for IP fragment reassembly (bytes). Comments sit on their own lines
 # because sysctl.d(5) does not support trailing comments.
 # High threshold (256 KB): fragments are dropped once reassembly memory exceeds this.
 net.ipv4.ipfrag_high_thresh = 262144
 # Low threshold (192 KB): dropping stops once memory falls below this.
 net.ipv4.ipfrag_low_thresh = 196608

 # Seconds an incomplete fragment set is kept before being discarded.
 net.ipv4.ipfrag_time = 30

 ##########################
 #   Time‑Wait & Keepalive #
 ##########################
 # net.ipv4.tcp_tw_recycle was removed in Linux 4.12, so assigning it on current kernels only
 # produces an error. It defaulted to 0 on older kernels and must never be enabled there
 # (it breaks NAT / load balancers / IPVS).
 # Reuse of TIME_WAIT sockets for new outgoing connections stays disabled.
 net.ipv4.tcp_tw_reuse = 0

 # Maximum number of sockets held in TIME_WAIT; excess sockets are destroyed immediately.
 # 1440000 is very high, meant for nodes handling a massive number of short connections.
 net.ipv4.tcp_max_tw_buckets = 1440000

 # TCP keepalives (detect dead peers sooner and free their resources).
 # Kernel defaults: 7200 s idle, 9 probes, 75 s interval.
 # Comments are on their own lines; sysctl.d(5) has no inline-comment syntax.
 # Idle time before the first probe: 10 minutes.
 net.ipv4.tcp_keepalive_time = 600
 # Seconds between probes.
 net.ipv4.tcp_keepalive_intvl = 60
 # Unanswered probes before the connection is dropped.
 net.ipv4.tcp_keepalive_probes = 5

 ##########################
 #   Security & ICMP       #
 ##########################
 # Refuse ICMP redirects and source-routed packets (prevents malicious route manipulation).
 net.ipv4.conf.all.accept_redirects = 0
 net.ipv4.conf.default.accept_redirects = 0
 net.ipv4.conf.all.accept_source_route = 0
 net.ipv4.conf.default.accept_source_route = 0

 # Reverse-path filtering against spoofed source addresses (1 = strict, 2 = loose).
 # Strict mode is recommended when routing is simple and symmetric.
 net.ipv4.conf.all.rp_filter = 1
 # Same value for interfaces created later.
 net.ipv4.conf.default.rp_filter = 1

 # Log packets with impossible addresses (martians).
 net.ipv4.conf.all.log_martians = 1

 # ICMP hardening
 # Ignore echo requests addressed to broadcast/multicast (limits amplification floods).
 net.ipv4.icmp_echo_ignore_broadcasts = 1
 # Linux has no net.ipv4.icmp_timestamp_ignore_all key; the former assignment only produced
 # an error. Drop ICMP timestamp requests (type 13) in the firewall if that is required.

 ##########################
 #   Routing & Bridge      #
 ##########################
 # IP forwarding: required so the node can route traffic between pods and services.
 net.ipv4.ip_forward = 1

 # Pass bridged IPv4/IPv6 traffic through iptables/ip6tables (needed for Services and NetworkPolicy).
 # NOTE(review): net.bridge.* keys appear only once the br_netfilter module is loaded - confirm
 # it is loaded before this file is applied.
 net.bridge.bridge-nf-call-iptables = 1
 net.bridge.bridge-nf-call-ip6tables = 1
 # Less common; enable only if ARP filtering on bridges is needed:
 # net.bridge.bridge-nf-call-arptables = 1

 ##########################
 #   Kubernetes IPVS       #
 ##########################
 # NOTE: net.ipv4.vs.* keys exist only while the ip_vs module is loaded.
 # Let netfilter connection tracking see connections handled by IPVS.
 net.ipv4.vs.conntrack = 1

 # Expire connections whose destination server is no longer available
 # (helps with rolling updates and pod evictions).
 net.ipv4.vs.expire_nodest_conn = 1

 # Expire persistence templates that point at quiescent real servers.
 net.ipv4.vs.expire_quiescent_template = 1

 # Create IPVS connection entries from any TCP packet, not only from a SYN.
 net.ipv4.vs.sloppy_tcp = 1
 # IPVS has no sloppy_udp key (only sloppy_tcp and sloppy_sctp exist); the former
 # net.ipv4.vs.sloppy_udp assignment always failed and has been dropped.

 ##########################
 #   Conntrack Tuning      #
 ##########################
 # NOTE: net.netfilter.* keys exist only while the nf_conntrack module is loaded.
 # Size of the connection-tracking table.
 # Raised to 1048576 (1M) for high connection loads; scale with node size and workload.
 net.netfilter.nf_conntrack_max = 1048576

 # Connection-tracking timeouts (seconds). Shorter values free entries sooner, but values
 # that are too short break legitimate long-lived connections.
 # Comments are on their own lines; sysctl.d(5) has no inline-comment syntax.
 # ESTABLISHED: 12 hours (kernel default 432000 = 5 days).
 net.netfilter.nf_conntrack_tcp_timeout_established = 43200
 # CLOSE_WAIT: kernel default 60.
 net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
 # FIN_WAIT: reduced from the kernel default of 120.
 net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
 # TIME_WAIT: kernel default 120 (2*MSL).
 net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
 # Protocols without a dedicated tracker: reduced from the kernel default of 600.
 net.netfilter.nf_conntrack_generic_timeout = 60

 ##########################
 #   IPv6 Tuning (Optional - Uncomment and adjust if IPv6 is used) #
 ##########################
 # The net.ipv4.tcp_* settings above (including tcp_congestion_control) and the net.core.*
 # buffer limits also govern IPv6 sockets, so no IPv6-specific copies are needed.
 # The following keys do not exist and must not be added:
 #   net.ipv6.conf.{all,default}.tcp_congestion_control
 #   net.ipv6.conf.{all,default}.rmem_default / wmem_default / rmem_max / wmem_max
 #   net.ipv6.conf.all.rp_filter (IPv6 has no rp_filter sysctl; use a firewall rpfilter match)
 #   net.netfilter.nf_conntrack_ip6_timeout_established (conntrack timeouts are shared by IPv4 and IPv6)
 # IPv6 counterparts of the redirect / source-route hardening, if IPv6 is used in the cluster:
 # net.ipv6.conf.all.accept_redirects = 0
 # net.ipv6.conf.default.accept_redirects = 0
 # net.ipv6.conf.all.accept_source_route = 0
 # net.ipv6.conf.default.accept_source_route = 0
 # net.bridge.bridge-nf-call-ip6tables and net.netfilter.nf_conntrack_max are already set above;
 # the conntrack table is shared between IPv4 and IPv6.
 # Uncomment the next line to disable IPv6 entirely if it is not needed:
 # net.ipv6.conf.all.disable_ipv6 = 1
diff --git a/80-pve.conf b/80-pve.conf
 # /etc/sysctl.d/80-pve.conf
 # Large Production Proxmox VE Cluster Host Tuning
 # Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel)
 # Based on original by Kawin Viriyaprasopsook <[email protected]>
 # Run `sysctl --system` after installing this file

 ########################
 # 1) System Options    #
 ########################
 # Console log verbosity.
 # Fields: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel
 # console_loglevel 3 prints only messages more severe than KERN_ERR (emerg, alert, crit);
 # messages logged without a level default to 4 (warning); the console level cannot drop below 1.
 kernel.printk = 3 4 1 3

 # Swap tendency; lower values favour keeping anonymous memory in RAM.
 #   0     swap only to avoid OOM
 #   1-10  strongly prefer reclaiming file-backed pages
 # 10 is a common starting point for hypervisors so guest memory stays in RAM; adjust from monitoring.
 vm.swappiness = 10

 # Memory overcommit policy.
 #   0  heuristic overcommit (kernel default)
 #   1  always overcommit (allocations never refused; risks OOM kills)
 #   2  never overcommit (strict accounting; large allocations can fail despite free memory)
 # 1 is often required by applications such as Redis for background saves.
 # WARNING: on a hypervisor host this is risky. If guests overcommit their own memory and the
 # host overcommits as well, the OOM killer can fire on the host itself. Monitor memory usage
 # carefully and consider 0 or 2 unless host applications strictly require 1.
 vm.overcommit_memory = 1

 # System-wide ceiling on open file handles.
 # Sized for hosts running many guests/processes, each opening many files and sockets.
 fs.file-max = 2097152

 # Upper bound on PID values (processes and threads).
 # NOTE(review): systemd 243+ ships kernel.pid_max = 4194304 on 64-bit hosts (50-pid-max.conf);
 # this file sorts later and would lower that value - confirm this is intended on dense nodes.
 kernel.pid_max = 65536

 # Accept-queue limit for host services (PVE API, SSH, ...): upper bound applied to the
 # backlog argument of listen(), i.e. fully established connections waiting for accept().
 # Half-open (SYN_RECV) connections are governed by net.ipv4.tcp_max_syn_backlog instead.
 net.core.somaxconn = 32768

 ########################
 # 2) Network Core      #
 ########################
 # Per-CPU input queue length for packets that arrive faster than the kernel processes them.
 # Raised for high packet rates on high-speed NICs to avoid drops.
 net.core.netdev_max_backlog = 30000

 # Generic socket buffer sizes (bytes). Comments are kept on their own lines because
 # sysctl.d(5) has no inline-comment syntax.
 #   *_default  initial buffer size of sockets that do not set one themselves; TCP sockets
 #              take their defaults from net.ipv4.tcp_rmem / tcp_wmem instead.
 #   *_max      ceiling for explicit SO_RCVBUF / SO_SNDBUF requests.
 # NOTE(review): a 30 MB default applies to every non-TCP socket (e.g. UDP) - confirm intended.
 # Default receive buffer: 30 MB
 net.core.rmem_default = 31457280
 # Default send buffer: 30 MB
 net.core.wmem_default = 31457280
 # Maximum receive buffer: 64 MB
 net.core.rmem_max = 67108864
 # Maximum send buffer: 64 MB
 net.core.wmem_max = 67108864

 # Queueing discipline attached to network interfaces by default.
 # fq_codel (fair queueing + CoDel) reduces bufferbloat and improves fairness between flows.
 net.core.default_qdisc = fq_codel

 ########################
 # 3) IPv4 TCP Tuning   #
 ########################
 # Essential TCP features for modern performance.
 # Comments are kept on their own lines: sysctl.d(5) only recognises whole-line comments,
 # so trailing text after a value would be written to /proc/sys as part of the value.
 # Window scaling (required for high bandwidth-delay-product connections).
 net.ipv4.tcp_window_scaling = 1
 # Selective acknowledgement (faster recovery from packet loss).
 net.ipv4.tcp_sack = 1
 # Timestamps (better RTT estimation, required for PAWS).
 net.ipv4.tcp_timestamps = 1
 # TCP Fast Open: 3 = enabled for both client and server sockets.
 net.ipv4.tcp_fastopen = 3

 # SYN backlog & flood protection
 # Maximum number of half-open connections (SYN_RECV state) remembered per listener.
 net.ipv4.tcp_max_syn_backlog = 8192

 # SYN cookies: keep accepting connections when the SYN queue overflows (SYN flood protection).
 net.ipv4.tcp_syncookies = 1

 # Fewer SYN / SYN-ACK retransmissions before giving up.
 net.ipv4.tcp_syn_retries = 2
 net.ipv4.tcp_synack_retries = 2

 # Ephemeral port range for outgoing connections.
 # Widened for hosts (or guests using the host's IP) that initiate many short-lived
 # connections while tcp_tw_reuse is off.
 net.ipv4.ip_local_port_range = 1024 65535

 # Congestion control
 # BBR (Bottleneck Bandwidth and RTT) needs kernel support (4.9+); 'cubic' is the usual fallback.
 net.ipv4.tcp_congestion_control = bbr

 # Unsent-data low watermark for TCP send buffers: 16 KB.
 # Limits how much not-yet-sent data is buffered per socket, reducing latency for small writes.
 net.ipv4.tcp_notsent_lowat = 16384

 # Keep the congestion window after idle periods (helps intermittent traffic).
 net.ipv4.tcp_slow_start_after_idle = 0

 # Packetization-layer path MTU discovery: 1 = activate when an ICMP black hole is detected.
 net.ipv4.tcp_mtu_probing = 1

 # Do not cache per-destination TCP metrics from closed connections.
 # Useful on servers talking to a vast number of unique clients.
 net.ipv4.tcp_no_metrics_save = 1

 # Automatic corking: coalesce small consecutive writes into fewer packets.
 net.ipv4.tcp_autocorking = 1

 ########################
 # 4) IPv4 Other Tuning #
 ########################
 # IP Fragmentation
 # Memory limits for fragment reassembly (bytes). Comments sit on their own lines because
 # sysctl.d(5) does not support trailing comments.
 # High threshold (256 KB): fragments are dropped once reassembly memory exceeds this.
 net.ipv4.ipfrag_high_thresh = 262144
 # Low threshold (192 KB): dropping stops once memory falls below this.
 net.ipv4.ipfrag_low_thresh = 196608

 # Seconds an incomplete fragment set is kept before being discarded.
 net.ipv4.ipfrag_time = 30

 # TIME_WAIT handling
 # net.ipv4.tcp_tw_recycle was removed in Linux 4.12, so assigning it on current kernels only
 # produces an error. It defaulted to 0 on older kernels and must never be enabled there
 # (it breaks NAT / load balancers / bridged guests).
 # Reuse of TIME_WAIT sockets for new outgoing connections stays disabled.
 net.ipv4.tcp_tw_reuse = 0

 # Maximum number of sockets held in TIME_WAIT; excess sockets are destroyed immediately.
 # 1440000 is very high, meant for nodes handling a massive number of short connections.
 net.ipv4.tcp_max_tw_buckets = 1440000

 # TCP keepalives for host services (PVE API, SSH, ...): detect dead peers sooner.
 # Kernel defaults: 7200 s idle, 9 probes, 75 s interval.
 # Idle time before the first probe: 10 minutes.
 net.ipv4.tcp_keepalive_time = 600
 # Seconds between probes.
 net.ipv4.tcp_keepalive_intvl = 30
 # Unanswered probes before the connection is dropped.
 net.ipv4.tcp_keepalive_probes = 6

 ########################
 # 5) Security & ICMP   #
 ########################
 # Refuse ICMP redirects and source-routed packets (prevents malicious route manipulation).
 net.ipv4.conf.all.accept_redirects = 0
 net.ipv4.conf.default.accept_redirects = 0
 net.ipv4.conf.all.accept_source_route = 0
 net.ipv4.conf.default.accept_source_route = 0

 # Reverse-path filtering against spoofed source addresses (1 = strict, 2 = loose).
 # Strict mode is recommended when routing is simple and symmetric.
 net.ipv4.conf.all.rp_filter = 1
 # Same value for interfaces created later.
 net.ipv4.conf.default.rp_filter = 1

 # Log packets with impossible addresses (martians).
 net.ipv4.conf.all.log_martians = 1

 # ICMP hardening
 # Ignore echo requests addressed to broadcast/multicast (limits amplification floods).
 net.ipv4.icmp_echo_ignore_broadcasts = 1
 # Linux has no net.ipv4.icmp_timestamp_ignore_all key; the former assignment only produced
 # an error. Drop ICMP timestamp requests (type 13) in the firewall if that is required.

 ########################
 # 6) Routing & Bridge  #
 ########################
 # IP forwarding: ESSENTIAL so the hypervisor can route traffic between guests and external networks.
 net.ipv4.ip_forward = 1

 # Pass bridged IPv4/IPv6 traffic through iptables/ip6tables (ESSENTIAL for the PVE firewall
 # and guest networking).
 # NOTE(review): net.bridge.* keys appear only once the br_netfilter module is loaded - confirm
 # it is loaded before this file is applied.
 net.bridge.bridge-nf-call-iptables = 1
 net.bridge.bridge-nf-call-ip6tables = 1
 # Less common; enable only if ARP filtering on bridges is needed:
 # net.bridge.bridge-nf-call-arptables = 1

 ########################
 # 7) Conntrack Tuning  #
 ########################
 # NOTE: net.netfilter.* keys exist only while the nf_conntrack module is loaded.
 # Size of the connection-tracking table.
 # 1000000 (1M) is a good starting point for busy nodes with many guests; monitor usage via
 # /proc/sys/net/netfilter/nf_conntrack_count.
 net.netfilter.nf_conntrack_max = 1000000

 # Connection-tracking timeouts (seconds). Shorter values free entries sooner, but values
 # that are too short break legitimate long-lived connections.
 # Comments are on their own lines; sysctl.d(5) has no inline-comment syntax.
 # ESTABLISHED: 24 hours (kernel default 432000 = 5 days).
 net.netfilter.nf_conntrack_tcp_timeout_established = 86400
 # CLOSE_WAIT: kernel default 60.
 net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
 # FIN_WAIT: reduced from the kernel default of 120.
 net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
 # TIME_WAIT: kernel default 120 (2*MSL).
 net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
 # Protocols without a dedicated tracker: reduced from the kernel default of 600.
 net.netfilter.nf_conntrack_generic_timeout = 60

 ########################
 # 8) VM & Storage      #
 ########################
 # Tendency of the kernel to reclaim memory used for caching filesystem metadata
 # (dentries and inodes). Kernel default is 100; lower values keep more metadata cached,
 # which can speed up file-heavy workloads at the cost of memory. Higher values reclaim it
 # more aggressively. 50 is a common compromise; adjust based on monitoring.
 vm.vfs_cache_pressure = 50

 # Dirty-page writeback thresholds (percent of memory). Left commented out on purpose:
 # these values are CRITICAL for storage performance and depend heavily on the storage type
 # (SSD, NVMe, HDD, network storage) and workload. High values can cause large write bursts
 # and I/O stalls; low values can cause excessive small writes. 15 / 5 are starting points
 # only - monitor I/O wait and adjust carefully. On hosts with large amounts of RAM consider
 # absolute limits (vm.dirty_bytes, vm.dirty_background_bytes) instead of percentages.
 #
 # Share of memory that may hold dirty (modified, not yet written) pages before writers are
 # forced to write data back themselves:
 # vm.dirty_ratio = 15
 #
 # Share of memory that may hold dirty pages before background writeback starts:
 # vm.dirty_background_ratio = 5

 ########################
 # 9) IPv6 Tuning (Optional) #
 ########################
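 # If IPv6 is used in your cluster (host or guests), apply similar tuning as IPv4.
 # NOTE: several keys listed below do not exist in Linux and must not be uncommented:
 #  - net.ipv6.conf.*.tcp_congestion_control and net.ipv6.conf.*.{rmem,wmem}_*:
 #    congestion control and socket buffers are set once through
 #    net.ipv4.tcp_congestion_control and net.core.*, and cover IPv6 as well.
 #  - net.ipv6.conf.all.rp_filter: IPv6 has no rp_filter sysctl; use a firewall
 #    reverse-path rule instead.
 #  - net.netfilter.nf_conntrack_ip6_timeout_established: the
 #    net.netfilter.nf_conntrack_tcp_timeout_* settings apply to IPv6 too.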
 # net.ipv6.conf.all.tcp_congestion_control = bbr
 # net.ipv6.conf.default.tcp_congestion_control = bbr
 # net.ipv6.conf.all.rmem_default = 31457280
 # net.ipv6.conf.all.wmem_default = 31457280
 # net.ipv6.conf.all.rmem_max = 67108864
 # net.ipv6.conf.all.wmem_max = 67108864
 # net.ipv6.conf.default.rmem_default = 31457280
 # net.ipv6.conf.default.wmem_default = 31457280
 # net.ipv6.conf.default.rmem_max = 67108864
 # net.ipv6.conf.default.wmem_max = 67108864
 # net.ipv6.conf.all.accept_redirects = 0
 # net.ipv6.conf.default.accept_redirects = 0
 # net.ipv6.conf.all.accept_source_route = 0
 # net.ipv6.conf.default.accept_source_route = 0
 # net.ipv6.conf.all.rp_filter = 1
 # net.bridge.bridge-nf-call-ip6tables = 1 # Already included above, but good to note here
 # net.netfilter.nf_conntrack_max = 1000000 # Conntrack max is shared between IPv4 and IPv6
 # net.netfilter.nf_conntrack_ip6_timeout_established = 86400 # IPv6 specific timeouts if needed
 # ... (add other IPv6 conntrack timeouts if desired)
 # net.ipv6.conf.all.disable_ipv6 = 1 # Uncomment and set to 1 to disable IPv6 entirely if not needed
	# /etc/sysctl.d/60-sysctl.conf
	# High‑performance settings for API/Web server
	# Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel)
	# Based on original by Kawin Viriyaprasopsook <[email protected]>
	# Run `sysctl --system` after installing this file

	########################
	# System Options #
	########################
	# Reduce console noise from kernel messages
	# Format: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel
	# 3 4 1 3: errors to console, warnings+ to messages, minimum 1 (emerg), default 3 (crit)
	kernel.printk = 3 4 1 3

	# Control swapping behavior
	# 0: Only swap to avoid OOM
	# 1-10: Prefer keeping anonymous memory in RAM, swap out file-backed pages more readily
	# 10-60: Balanced approach
	# 60-100: Aggressively swap out anonymous memory
	# 10 is a common starting point for servers with ample RAM. Adjust based on monitoring.
	vm.swappiness = 10

	# Memory overcommit policy
	# 0: Heuristic overcommit (default)
	# 1: Always overcommit (allows processes to request more memory than available, risking OOM kills)
	# 2: Never overcommit (strict accounting, can fail large allocations even if memory is free)
	# 1 is often required by applications like Redis for background saves. Use with caution and monitor memory usage.
	vm.overcommit_memory = 1

	# Maximum number of open file descriptors system-wide
	# Increase for applications handling many concurrent connections (web servers, databases, etc.)
	# This value is very high and should be sufficient for most workloads.
	fs.file-max = 2097152

	# Maximum number of PIDs (processes/threads)
	# 65536 is a standard high value, suitable for systems with many processes or threads.
	kernel.pid_max = 65536

	# Upper limit for the listen() backlog (accept queue of completed connections)
	# The maximum number of queued connection requests that have not yet been accepted by the application.
	# Increase for servers handling high rates of new connections. 32768 is a generous value.
	net.core.somaxconn = 32768

	########################
	# TCP/IP Tuning #
	########################
	# Essential TCP features for modern performance
	# Notes sit on their own lines: sysctl.d only recognises full-line comments, so a
	# trailing "# ..." would be written to the kernel as part of the value.
	# Enable TCP window scaling (required for high bandwidth connections)
	net.ipv4.tcp_window_scaling = 1
	# Enable TCP Selective Acknowledgement (improves recovery from packet loss)
	net.ipv4.tcp_sack = 1
	# Enable TCP timestamps (improves RTT calculation, helps with PAWS/TW)
	net.ipv4.tcp_timestamps = 1
	# Enable TCP Fast Open (3 = enable client and server support, reduces latency for repeated connections)
	net.ipv4.tcp_fastopen = 3

	# SYN backlog & flood protection
	# Maximum number of entries in the SYN queue (SYN_RECV state)
	# Increased from 4096, suitable for higher SYN rates
	net.ipv4.tcp_max_syn_backlog = 8192

	# Enable SYN cookies (protects against SYN floods when SYN queue is full)
	net.ipv4.tcp_syncookies = 1

	# Reduce SYN retransmits before giving up
	net.ipv4.tcp_syn_retries = 2
	net.ipv4.tcp_synack_retries = 2

	# Time‑wait handling
	# tcp_tw_recycle is DANGEROUS (breaks NAT/Load Balancers) and was removed in Linux 4.12.
	# On newer kernels the key no longer exists, so assigning it makes `sysctl --system`
	# report an error. It is kept commented out; uncomment only on pre-4.12 kernels.
	# net.ipv4.tcp_tw_recycle = 0
	# tcp_tw_reuse is also problematic with NAT and generally not recommended on servers. Keep DISABLED.
	net.ipv4.tcp_tw_reuse = 0

	# Congestion control
	# BBR (Bottleneck Bandwidth and RTT) is excellent for throughput and latency on modern internet paths.
	# Requires kernel support (usually 4.9+). 'cubic' is the default and a good fallback.
	net.ipv4.tcp_congestion_control= bbr

	# Low watermark for TCP send buffer (bytes)
	# Helps reduce latency for small writes by preventing excessive buffering before sending.
	# 16384 = 16KB (note kept on its own line; sysctl.d has no trailing-comment syntax)
	net.ipv4.tcp_notsent_lowat = 16384

	# Local port range for outgoing connections
	# Increase if the server initiates many short-lived connections and tcp_tw_reuse is off.
	# 1024-65535 is the widest possible range.
	net.ipv4.ip_local_port_range = 1024 65535

	# Maximum number of sockets in TIME_WAIT state
	# Relevant when tcp_tw_reuse is off. Increase if hitting limits under heavy load.
	# Default is often 200000. This value is high and usually sufficient.
	net.ipv4.tcp_max_tw_buckets = 200000

	# TCP Keepalives (Detect dead connections faster and free resources)
	# Default: 7200 9 75 (2 hours idle, 9 probes, 75s interval)
	# Shorter times are recommended for servers to reclaim resources from idle clients sooner.
	# Notes sit on their own lines because sysctl.d only recognises full-line comments.
	# Send first keepalive after 10 minutes idle
	net.ipv4.tcp_keepalive_time = 600
	# Send up to 6 probes
	net.ipv4.tcp_keepalive_probes = 6
	# Wait 30 seconds between probes
	net.ipv4.tcp_keepalive_intvl = 30

	# Other TCP optimizations
	# Disable slow start after idle periods (improves performance for intermittent traffic)
	net.ipv4.tcp_slow_start_after_idle = 0

	# Enable MTU probing (helps avoid fragmentation issues by discovering path MTU)
	net.ipv4.tcp_mtu_probing = 1

	# Disable saving TCP metrics across connections (can save memory and CPU, minor impact)
	net.ipv4.tcp_no_metrics_save = 1

	# Enable TCP Corking auto-detection (improves efficiency for small writes followed by large writes)
	net.ipv4.tcp_autocorking = 1

	# SACK is controlled by net.ipv4.tcp_sack, which this file already sets to 1.
	# There is no net.ipv4.tcp_sack_enabled key in Linux, so the assignment below would
	# fail with "No such file or directory"; it is disabled.
	# net.ipv4.tcp_sack_enabled = 1

	########################
	# Network Buffers #
	########################
	# Maximum backlog for packets arriving at the network device before being processed by the kernel
	# Increase for high packet rates to avoid drops. 30000 is a high value.
	net.core.netdev_max_backlog = 30000

	# Default and Maximum socket buffer sizes (bytes)
	# These values are used by default and as limits for auto-tuning.
	# They are generous and suitable for 10Gbps+ networks.
	# Actual buffer sizes are often auto-tuned by the kernel within these limits.
	# Notes sit on their own lines because sysctl.d only recognises full-line comments.
	# Default receive buffer size (30MB)
	net.core.rmem_default = 31457280
	# Default send buffer size (30MB)
	net.core.wmem_default = 31457280
	# Max receive buffer size (64MB)
	net.core.rmem_max = 67108864
	# Max send buffer size (64MB)
	net.core.wmem_max = 67108864

	# TCP specific buffer sizes (min, default, max) - Optional, net.core often sufficient
	# net.ipv4.tcp_rmem = 4096 87380 67108864 # Min, Default, Max receive buffer
	# net.ipv4.tcp_wmem = 4096 65536 67108864 # Min, Default, Max send buffer

	########################
	# Queue Discipline #
	########################
	# Default queueing discipline for network interfaces
	# fq_codel (Fair Queueing with CoDel) is a modern algorithm that reduces bufferbloat and improves fairness.
	net.core.default_qdisc = fq_codel

	########################
	# IP Fragmentation #
	########################
	# Memory thresholds for reassembling IP fragments (bytes)
	# High: Start dropping fragments when memory exceeds this (262144 = 256KB)
	# Low: Stop dropping fragments when memory drops below this (196608 = 192KB)
	# Sizes are documented here because sysctl.d has no trailing-comment syntax.
	net.ipv4.ipfrag_high_thresh = 262144
	net.ipv4.ipfrag_low_thresh = 196608

	# Time to keep fragments in memory before discarding (seconds)
	net.ipv4.ipfrag_time = 30

	########################
	# Security & ICMP #
	########################
	# Disable redirects & source routing (Security hardening - prevents malicious routing)
	net.ipv4.conf.all.accept_redirects = 0
	net.ipv4.conf.default.accept_redirects = 0
	net.ipv4.conf.all.accept_source_route = 0
	net.ipv4.conf.default.accept_source_route= 0

	# IP spoofing & logging (Security hardening)
	# rp_filter: Source validation (1=strict, 2=loose). Strict is recommended if routing is simple.
	net.ipv4.conf.all.rp_filter = 1
	# Apply to default interface settings too (own line: sysctl.d has no trailing comments)
	net.ipv4.conf.default.rp_filter = 1

	# Log packets with impossible addresses (martians)
	net.ipv4.conf.all.log_martians = 1

	# ICMP hardening
	# Ignore broadcast ICMP requests (reduces response to floods)
	net.ipv4.icmp_echo_ignore_broadcasts = 1
	# Linux has no net.ipv4.icmp_timestamp_ignore_all key, so the assignment below would fail.
	# To suppress ICMP timestamp replies (information leakage), drop ICMP type 13
	# (timestamp-request) in the firewall instead.
	# net.ipv4.icmp_timestamp_ignore_all = 1

	# IP forwarding (Disable unless server acts as a router/gateway)
	net.ipv4.ip_forward = 0

	########################
	# IPv6 Tuning (Optional - Uncomment and adjust if IPv6 is used) #
	########################
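	# If IPv6 is used, apply similar tuning as IPv4 for consistency and performance.
	# NOTE: the net.ipv6.conf.*.tcp_congestion_control and net.ipv6.conf.*.{rmem,wmem}_*
	# keys listed below do not exist in Linux and must not be uncommented. Congestion
	# control and socket buffers are set once through net.ipv4.tcp_congestion_control
	# and net.core.*, and cover IPv6 as well.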
	# net.ipv6.conf.all.tcp_congestion_control = bbr
	# net.ipv6.conf.default.tcp_congestion_control = bbr
	# net.ipv6.conf.all.rmem_default = 31457280
	# net.ipv6.conf.all.wmem_default = 31457280
	# net.ipv6.conf.all.rmem_max = 67108864
	# net.ipv6.conf.all.wmem_max = 67108864
	# net.ipv6.conf.default.rmem_default = 31457280
	# net.ipv6.conf.default.wmem_default = 31457280
	# net.ipv6.conf.default.rmem_max = 67108864
	# net.ipv6.conf.default.wmem_max = 67108864
	# net.ipv6.conf.all.disable_ipv6 = 1 # Uncomment and set to 1 to disable IPv6 entirely if not needed
	# /etc/sysctl.d/80-k8s-ipvs.conf
	# Production Kubernetes node tuning with IPVS
	# Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel)
	# Based on original by Kawin Viriyaprasopsook <[email protected]>
	# Run `sysctl --system` after installing this file

	##########################
	# System Options #
	##########################
	# Reduce console noise from kernel messages
	# Format: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel
	# 3 4 1 3: errors to console, warnings+ to messages, minimum 1 (emerg), default 3 (crit)
	kernel.printk = 3 4 1 3

	# Control swapping behavior
	# 0: Only swap to avoid OOM
	# 1-10: Prefer keeping anonymous memory in RAM, swap out file-backed pages more readily
	# 10-60: Balanced approach
	# 60-100: Aggressively swap out anonymous memory
	# 10 is a common starting point for servers with ample RAM. Adjust based on monitoring.
	vm.swappiness = 10

	# Memory overcommit policy
	# 0: Heuristic overcommit (default)
	# 1: Always overcommit (allows processes to request more memory than available, risking OOM kills)
	# 2: Never overcommit (strict accounting, can fail large allocations even if memory is free)
	# 1 is often required by applications like Redis for background saves, and can be beneficial in K8s
	# to allow pods to potentially burst memory, but requires careful monitoring to avoid node OOMs.
	vm.overcommit_memory = 1

	# Maximum number of open file descriptors system-wide
	# Increase for systems running many containers/pods, each potentially opening many files/sockets.
	# This value is very high and should be sufficient for most workloads.
	fs.file-max = 2097152

	# Maximum number of PIDs (processes/threads)
	# Increase for systems running many containers/pods, as each container can have multiple processes.
	# 65536 is a standard high value, often sufficient, but consider increasing on very dense nodes.
	kernel.pid_max = 65536

	##########################
	# TCP Performance #
	##########################
	# Essential TCP features for modern performance
	# Notes sit on their own lines: sysctl.d only recognises full-line comments, so a
	# trailing "# ..." would be written to the kernel as part of the value.
	# Enable TCP window scaling (required for high bandwidth connections)
	net.ipv4.tcp_window_scaling = 1
	# Enable TCP Selective Acknowledgement (improves recovery from packet loss)
	net.ipv4.tcp_sack = 1
	# Enable TCP timestamps (improves RTT calculation, helps with PAWS/TW)
	net.ipv4.tcp_timestamps = 1
	# Enable TCP Fast Open (3 = enable client and server support, reduces latency for repeated connections)
	net.ipv4.tcp_fastopen = 3

	# SYN backlog & flood protection
	# Maximum number of entries in the SYN queue (SYN_RECV state)
	# Increase for servers handling high rates of new connections. 8192 is a good value.
	net.ipv4.tcp_max_syn_backlog = 8192

	# Enable SYN cookies (protects against SYN floods when SYN queue is full)
	net.ipv4.tcp_syncookies = 1

	# Reduce SYN retransmits before giving up
	net.ipv4.tcp_syn_retries = 2
	net.ipv4.tcp_synack_retries = 2

	# Local port range for outgoing connections (ephemeral ports)
	# Increase if the node initiates many short-lived connections (e.g., health checks, API calls)
	# and tcp_tw_reuse is off. 1024-65535 is the widest possible range.
	net.ipv4.ip_local_port_range = 1024 65535

	# Congestion control
	# BBR (Bottleneck Bandwidth and RTT) is excellent for throughput and latency on modern internet paths.
	# Requires kernel support (usually 4.9+). 'cubic' is the default and a good fallback.
	net.ipv4.tcp_congestion_control = bbr

	# Low watermark for TCP send buffer (bytes)
	# Helps reduce latency for small writes by preventing excessive buffering before sending.
	# 16384 = 16KB (note kept on its own line; sysctl.d has no trailing-comment syntax)
	net.ipv4.tcp_notsent_lowat = 16384

	# Disable slow start after idle periods (improves performance for intermittent traffic)
	net.ipv4.tcp_slow_start_after_idle = 0

	# Enable MTU probing (helps avoid fragmentation issues by discovering path MTU)
	net.ipv4.tcp_mtu_probing = 1

	# Disable saving TCP metrics across connections (can save memory and CPU, minor impact)
	# Useful on servers handling connections from a vast number of unique clients.
	net.ipv4.tcp_no_metrics_save = 1

	# Enable TCP Corking auto-detection (improves efficiency for small writes followed by large writes)
	net.ipv4.tcp_autocorking = 1

	##########################
	# Network Buffers #
	##########################
	# Maximum backlog for packets arriving at the network device before being processed by the kernel
	# Increase for high packet rates to avoid drops. 30000 is a high value.
	net.core.netdev_max_backlog = 30000

	# Default and Maximum socket buffer sizes (bytes)
	# These values are used by default and as limits for auto-tuning.
	# They are generous and suitable for 10Gbps+ networks common in clusters.
	# Actual buffer sizes are often auto-tuned by the kernel within these limits.
	# Notes sit on their own lines because sysctl.d only recognises full-line comments.
	# Default receive buffer size (30MB)
	net.core.rmem_default = 31457280
	# Default send buffer size (30MB)
	net.core.wmem_default = 31457280
	# Max receive buffer size (64MB)
	net.core.rmem_max = 67108864
	# Max send buffer size (64MB)
	net.core.wmem_max = 67108864

	# TCP specific buffer sizes (min, default, max) - Optional, net.core often sufficient
	# net.ipv4.tcp_rmem = 4096 87380 67108864 # Min, Default, Max receive buffer
	# net.ipv4.tcp_wmem = 4096 65536 67108864 # Min, Default, Max send buffer

	##########################
	# Queue Discipline #
	##########################
	# Default queueing discipline for network interfaces
	# fq_codel (Fair Queueing with CoDel) is a modern algorithm that reduces bufferbloat and improves fairness.
	net.core.default_qdisc = fq_codel

	##########################
	# IP Fragmentation #
	##########################
	# Memory thresholds for reassembling IP fragments (bytes)
	# High: Start dropping fragments when memory exceeds this (262144 = 256KB)
	# Low: Stop dropping fragments when memory drops below this (196608 = 192KB)
	# Sizes are documented here because sysctl.d has no trailing-comment syntax.
	net.ipv4.ipfrag_high_thresh = 262144
	net.ipv4.ipfrag_low_thresh = 196608

	# Time to keep fragments in memory before discarding (seconds)
	net.ipv4.ipfrag_time = 30

	##########################
	# Time‑Wait & Keepalive #
	##########################
	# tcp_tw_recycle is DANGEROUS (breaks NAT/Load Balancers/IPVS) and was removed in Linux 4.12.
	# On newer kernels the key no longer exists, so assigning it makes `sysctl --system`
	# report an error. It is kept commented out; uncomment only on pre-4.12 kernels.
	# net.ipv4.tcp_tw_recycle = 0
	# tcp_tw_reuse is also problematic with NAT/IPVS and generally not recommended on servers. Keep DISABLED.
	net.ipv4.tcp_tw_reuse = 0

	# Maximum number of sockets in TIME_WAIT state
	# Relevant when tcp_tw_reuse is off. Increase if hitting limits under heavy load.
	# 1440000 is a very high value, suitable for nodes handling a massive number of short connections.
	net.ipv4.tcp_max_tw_buckets = 1440000

	# TCP Keepalives (Detect dead connections faster and free resources)
	# Default: 7200 9 75 (2 hours idle, 9 probes, 75s interval)
	# Shorter times are recommended for servers to reclaim resources from idle clients sooner.
	# Notes sit on their own lines because sysctl.d only recognises full-line comments.
	# Send first keepalive after 10 minutes idle
	net.ipv4.tcp_keepalive_time = 600
	# Wait 60 seconds between probes
	net.ipv4.tcp_keepalive_intvl = 60
	# Send up to 5 probes
	net.ipv4.tcp_keepalive_probes = 5

	##########################
	# Security & ICMP #
	##########################
	# Disable redirects & source routing (Security hardening - prevents malicious routing)
	net.ipv4.conf.all.accept_redirects = 0
	net.ipv4.conf.default.accept_redirects = 0
	net.ipv4.conf.all.accept_source_route = 0
	net.ipv4.conf.default.accept_source_route = 0

	# IP spoofing & logging (Security hardening)
	# rp_filter: Source validation (1=strict, 2=loose). Strict is recommended if routing is simple.
	net.ipv4.conf.all.rp_filter = 1
	# Apply to default interface settings too (own line: sysctl.d has no trailing comments)
	net.ipv4.conf.default.rp_filter = 1

	# Log packets with impossible addresses (martians)
	net.ipv4.conf.all.log_martians = 1

	# ICMP hardening
	# Ignore broadcast ICMP requests (reduces response to floods)
	net.ipv4.icmp_echo_ignore_broadcasts = 1
	# Linux has no net.ipv4.icmp_timestamp_ignore_all key, so the assignment below would fail.
	# To suppress ICMP timestamp replies (information leakage), drop ICMP type 13
	# (timestamp-request) in the firewall instead.
	# net.ipv4.icmp_timestamp_ignore_all = 1

	##########################
	# Routing & Bridge #
	##########################
	# Enable IP forwarding (Essential for Kubernetes nodes to route traffic between pods/services)
	net.ipv4.ip_forward = 1

	# Enable netfilter processing for bridged IP packets (Essential for K8s networking like Services/NetworkPolicy)
	net.bridge.bridge-nf-call-iptables = 1
	net.bridge.bridge-nf-call-ip6tables = 1
	# net.bridge.bridge-nf-call-arptables = 1 # Less common, enable if needed for ARP filtering

	##########################
	# Kubernetes IPVS #
	##########################
	# Enable IPVS connection tracking
	net.ipv4.vs.conntrack = 1

	# Expire connections for unavailable destinations (helps with rolling updates/pod evictions)
	net.ipv4.vs.expire_nodest_conn = 1

	# Expire templates for quiescent services
	net.ipv4.vs.expire_quiescent_template = 1

	# Relax strict TCP state checks (often needed in K8s due to health checks, probes, etc.)
	net.ipv4.vs.sloppy_tcp = 1
	# IPVS only provides sloppy_tcp and sloppy_sctp; there is no sloppy_udp key, so the
	# assignment below would fail with "No such file or directory". It is disabled.
	# net.ipv4.vs.sloppy_udp = 1

	##########################
	# Conntrack Tuning #
	##########################
	# Maximum number of connection tracking entries
	# Default is often low (e.g., 65536). Increase significantly for high connection loads in K8s.
	# A common value is 1048576 (1M) or higher depending on node size and workload.
	net.netfilter.nf_conntrack_max = 1048576

	# Connection tracking timeouts (seconds)
	# Adjust based on workload characteristics. Shorter timeouts free up conntrack entries faster,
	# but too short can break legitimate long-lived connections.
	# Notes sit on their own lines: sysctl.d only recognises full-line comments, so a
	# trailing "# ..." would be written to the kernel as part of the value.
	# Default 5 days (432000), 12 hours is often sufficient
	net.netfilter.nf_conntrack_tcp_timeout_established = 43200
	# Default 60
	net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
	# Default 120, can be reduced
	net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
	# Default 120 (matches 2*MSL)
	net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
	# Fallback for protocols without a dedicated tracker. Kernel default is 600; 60 is a reduction.
	net.netfilter.nf_conntrack_generic_timeout = 60

	##########################
	# IPv6 Tuning (Optional - Uncomment and adjust if IPv6 is used) #
	##########################
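	# If IPv6 is used in your cluster, apply similar tuning as IPv4 for consistency and performance.
	# NOTE: several keys listed below do not exist in Linux and must not be uncommented:
	#  - net.ipv6.conf.*.tcp_congestion_control and net.ipv6.conf.*.{rmem,wmem}_*:
	#    congestion control and socket buffers are set once through
	#    net.ipv4.tcp_congestion_control and net.core.*, and cover IPv6 as well.
	#  - net.ipv6.conf.all.rp_filter: IPv6 has no rp_filter sysctl; use a firewall
	#    reverse-path rule instead.
	#  - net.netfilter.nf_conntrack_ip6_timeout_established: the
	#    net.netfilter.nf_conntrack_tcp_timeout_* settings apply to IPv6 too.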
	# net.ipv6.conf.all.tcp_congestion_control = bbr
	# net.ipv6.conf.default.tcp_congestion_control = bbr
	# net.ipv6.conf.all.rmem_default = 31457280
	# net.ipv6.conf.all.wmem_default = 31457280
	# net.ipv6.conf.all.rmem_max = 67108864
	# net.ipv6.conf.all.wmem_max = 67108864
	# net.ipv6.conf.default.rmem_default = 31457280
	# net.ipv6.conf.default.wmem_default = 31457280
	# net.ipv6.conf.default.rmem_max = 67108864
	# net.ipv6.conf.default.wmem_max = 67108864
	# net.ipv6.conf.all.accept_redirects = 0
	# net.ipv6.conf.default.accept_redirects = 0
	# net.ipv6.conf.all.accept_source_route = 0
	# net.ipv6.conf.default.accept_source_route = 0
	# net.ipv6.conf.all.rp_filter = 1
	# net.bridge.bridge-nf-call-ip6tables = 1 # Already included above, but good to note here
	# net.netfilter.nf_conntrack_max = 1048576 # Conntrack max is shared between IPv4 and IPv6
	# net.netfilter.nf_conntrack_ip6_timeout_established = 43200 # IPv6 specific timeouts if needed
	# ... (add other IPv6 conntrack timeouts if desired)
	# net.ipv6.conf.all.disable_ipv6 = 1 # Uncomment and set to 1 to disable IPv6 entirely if not needed
	# /etc/sysctl.d/80-pve.conf
	# Large Production Proxmox VE Cluster Host Tuning
	# Optimized for modern Linux kernels (4.9+ recommended for BBR/fq_codel)
	# Based on original by Kawin Viriyaprasopsook <[email protected]>
	# Run `sysctl --system` after installing this file

	########################
	# 1) System Options #
	########################
	# Reduce console noise from kernel messages
	# Format: console_loglevel default_message_loglevel minimum_console_loglevel default_console_loglevel
	# 3 4 1 3: errors to console, warnings+ to messages, minimum 1 (emerg), default 3 (crit)
	kernel.printk = 3 4 1 3

	# Control swapping behavior
	# 0: Only swap to avoid OOM
	# 1-10: Prefer keeping anonymous memory in RAM, swap out file-backed pages more readily
	# 10 is a common starting point for hypervisors to keep guest memory in RAM. Adjust based on monitoring.
	vm.swappiness = 10

	# Memory overcommit policy
	# 0: Heuristic overcommit (default)
	# 1: Always overcommit (allows processes to request more memory than available, risking OOM kills)
	# 2: Never overcommit (strict accounting, can fail large allocations even if memory is free)
	# 1 is often required by applications like Redis for background saves.
	# WARNING: Setting vm.overcommit_memory=1 on a hypervisor host can be risky.
	# If guests overcommit their own memory, and the host also overcommits,
	# you can potentially trigger an OOM killer event on the host itself.
	# Monitor memory usage carefully. Consider 0 or 2 if not strictly required by host applications.
	vm.overcommit_memory = 1

	# Maximum number of open file descriptors system-wide
	# Increase for systems running many guests/processes, each potentially opening many files/sockets.
	# This value is very high and should be sufficient for most workloads.
	fs.file-max = 2097152

	# Maximum number of PIDs (processes/threads)
	# Increase for systems running many guests/processes.
	# 65536 is a standard high value, often sufficient, but consider increasing on very dense nodes.
	kernel.pid_max = 65536

	# Upper limit for the listen() backlog (accept queue of completed connections) for host services (PVE API, SSH, etc.)
	# Increase for servers handling high rates of new connections to host services. 32768 is generous.
	net.core.somaxconn = 32768

	########################
	# 2) Network Core #
	########################
	# Maximum backlog for packets arriving at the network device before being processed by the kernel
	# Increase for high packet rates on high-speed NICs to avoid drops. 30000 is a high value.
	net.core.netdev_max_backlog = 30000

	# Default and Maximum socket buffer sizes (bytes)
	# These values are used by default and as limits for auto-tuning.
	# They are generous and suitable for 10Gbps+ networks common in clusters.
	# Actual buffer sizes are often auto-tuned by the kernel within these limits.
	# Notes sit on their own lines because sysctl.d only recognises full-line comments.
	# Default receive buffer size (30MB)
	net.core.rmem_default = 31457280
	# Default send buffer size (30MB)
	net.core.wmem_default = 31457280
	# Max receive buffer size (64MB)
	net.core.rmem_max = 67108864
	# Max send buffer size (64MB)
	net.core.wmem_max = 67108864

	# Default queueing discipline for network interfaces
	# fq_codel (Fair Queueing with CoDel) is a modern algorithm that reduces bufferbloat and improves fairness.
	net.core.default_qdisc = fq_codel

	########################
	# 3) IPv4 TCP Tuning #
	########################
	# Essential TCP features for modern performance
	# Notes sit on their own lines: sysctl.d only recognises full-line comments, so a
	# trailing "# ..." would be written to the kernel as part of the value.
	# Enable TCP window scaling (required for high bandwidth connections)
	net.ipv4.tcp_window_scaling = 1
	# Enable TCP Selective Acknowledgement (improves recovery from packet loss)
	net.ipv4.tcp_sack = 1
	# Enable TCP timestamps (improves RTT calculation, helps with PAWS/TW)
	net.ipv4.tcp_timestamps = 1
	# Enable TCP Fast Open (3 = enable client and server support, reduces latency for repeated connections)
	net.ipv4.tcp_fastopen = 3

	# SYN backlog & flood protection
	# Maximum number of entries in the SYN queue (SYN_RECV state)
	# Increase for servers handling high rates of new connections. 8192 is a good value.
	net.ipv4.tcp_max_syn_backlog = 8192

	# Enable SYN cookies (protects against SYN floods when SYN queue is full)
	net.ipv4.tcp_syncookies = 1

	# Reduce SYN retransmits before giving up
	net.ipv4.tcp_syn_retries = 2
	net.ipv4.tcp_synack_retries = 2

	# Local port range for outgoing connections (ephemeral ports)
	# Increase if the host or guests using host's IP initiate many short-lived connections
	# and tcp_tw_reuse is off. 1024-65535 is the widest possible range.
	net.ipv4.ip_local_port_range = 1024 65535

	# Congestion control
	# BBR (Bottleneck Bandwidth and RTT) is excellent for throughput and latency on modern internet paths.
	# Requires kernel support (usually 4.9+). 'cubic' is the default and a good fallback.
	net.ipv4.tcp_congestion_control= bbr

	# Low watermark for TCP send buffer (bytes)
	# Helps reduce latency for small writes by preventing excessive buffering before sending.
	# 16384 = 16KB (note kept on its own line; sysctl.d has no trailing-comment syntax)
	net.ipv4.tcp_notsent_lowat = 16384

	# Disable slow start after idle periods (improves performance for intermittent traffic)
	net.ipv4.tcp_slow_start_after_idle = 0

	# Enable MTU probing (helps avoid fragmentation issues by discovering path MTU)
	net.ipv4.tcp_mtu_probing = 1

	# Disable saving TCP metrics across connections (can save memory and CPU, minor impact)
	# Useful on servers handling connections from a vast number of unique clients.
	net.ipv4.tcp_no_metrics_save = 1

	# Enable TCP Corking auto-detection (improves efficiency for small writes followed by large writes)
	net.ipv4.tcp_autocorking = 1

	########################
	# 4) IPv4 Other Tuning #
	########################
	# IP Fragmentation
	# Memory thresholds for reassembling IP fragments (bytes)
	# High: Start dropping fragments when memory exceeds this (262144 = 256KB)
	# Low: Stop dropping fragments when memory drops below this (196608 = 192KB)
	# Sizes are documented here because sysctl.d has no trailing-comment syntax.
	net.ipv4.ipfrag_high_thresh = 262144
	net.ipv4.ipfrag_low_thresh = 196608

	# Time to keep fragments in memory before discarding (seconds)
	net.ipv4.ipfrag_time = 30

	# Time‑Wait Handling
	# tcp_tw_recycle is DANGEROUS (breaks NAT/Load Balancers/Bridging) and was removed in Linux 4.12.
	# On newer kernels the key no longer exists, so assigning it makes `sysctl --system`
	# report an error. It is kept commented out; uncomment only on pre-4.12 kernels.
	# net.ipv4.tcp_tw_recycle = 0
	# tcp_tw_reuse is also problematic with NAT/Bridging and generally not recommended on servers. Keep DISABLED.
	net.ipv4.tcp_tw_reuse = 0

	# Maximum number of sockets in TIME_WAIT state
	# Relevant when tcp_tw_reuse is off. Increase if hitting limits under heavy load.
	# 1440000 is a very high value, suitable for nodes handling a massive number of short connections.
	net.ipv4.tcp_max_tw_buckets = 1440000

	# TCP Keepalives (Host Services)
	# Detect dead connections faster and free resources for host services (PVE API, SSH, etc.)
	# Default: 7200 9 75 (2 hours idle, 9 probes, 75s interval)
	# Shorter times are recommended for servers to reclaim resources from idle clients sooner.
	# Notes sit on their own lines because sysctl.d only recognises full-line comments.
	# Send first keepalive after 10 minutes idle
	net.ipv4.tcp_keepalive_time = 600
	# Wait 30 seconds between probes
	net.ipv4.tcp_keepalive_intvl = 30
	# Send up to 6 probes
	net.ipv4.tcp_keepalive_probes = 6

	########################
	# 5) Security & ICMP #
	########################
	# Disable redirects & source routing (Security hardening - prevents malicious routing)
	net.ipv4.conf.all.accept_redirects = 0
	net.ipv4.conf.default.accept_redirects = 0
	net.ipv4.conf.all.accept_source_route = 0
	net.ipv4.conf.default.accept_source_route= 0

	# IP spoofing & logging (Security hardening)
	# rp_filter: Source validation (1=strict, 2=loose). Strict is recommended if routing is simple.
	net.ipv4.conf.all.rp_filter = 1
	# Apply to default interface settings too (own line: sysctl.d has no trailing comments)
	net.ipv4.conf.default.rp_filter = 1

	# Log packets with impossible addresses (martians)
	net.ipv4.conf.all.log_martians = 1

	# ICMP hardening
	# Ignore broadcast ICMP requests (reduces response to floods)
	net.ipv4.icmp_echo_ignore_broadcasts = 1
	# Linux has no net.ipv4.icmp_timestamp_ignore_all key, so the assignment below would fail.
	# To suppress ICMP timestamp replies (information leakage), drop ICMP type 13
	# (timestamp-request) in the firewall instead.
	# net.ipv4.icmp_timestamp_ignore_all = 1

	########################
	# 6) Routing & Bridge #
	########################
	# Enable IP forwarding (ESSENTIAL for hypervisor nodes to route traffic between guests and external networks)
	net.ipv4.ip_forward = 1

	# Enable netfilter processing for bridged IP packets (ESSENTIAL for PVE firewall and guest networking)
	net.bridge.bridge-nf-call-iptables = 1
	net.bridge.bridge-nf-call-ip6tables = 1
	# net.bridge.bridge-nf-call-arptables = 1 # Less common, enable if needed for ARP filtering

	########################
	# 7) Conntrack Tuning #
	########################
	# Maximum number of connection tracking entries
	# Default is often low (e.g., 65536). Increase significantly for high connection loads from many guests.
	# 1000000 (1M) is a good starting point for busy nodes. Monitor usage (`/proc/sys/net/netfilter/nf_conntrack_count`).
	net.netfilter.nf_conntrack_max = 1000000

	# Connection tracking timeouts (seconds)
	# Adjust based on workload characteristics. Shorter timeouts free up conntrack entries faster,
	# but too short can break legitimate long-lived connections.
	# Notes sit on their own lines: sysctl.d only recognises full-line comments, so a
	# trailing "# ..." would be written to the kernel as part of the value.
	# Default established is 5 days (432000). 24 hours (86400) is a common reduction.
	net.netfilter.nf_conntrack_tcp_timeout_established = 86400
	# Default 60
	net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
	# Default 120, can be reduced
	net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
	# Default 120 (matches 2*MSL)
	net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
	# Fallback for protocols without a dedicated tracker. Kernel default is 600; 60 is a reduction.
	net.netfilter.nf_conntrack_generic_timeout = 60

	########################
	# 8) VM & Storage #
	########################
	# Control how aggressively the kernel reclaims memory used for the filesystem metadata caches (dentries and inodes)
	# Lower values (closer to 0) mean the kernel is less aggressive about reclaiming memory
	# used for caching filesystem metadata. Higher values (closer to 100) mean it's more aggressive.
	# Default is 100. Reducing this can help keep filesystem metadata in RAM, potentially
	# improving performance for file-heavy operations, but uses more memory.
	# A value of 50 is a common compromise. Adjust based on monitoring.
	vm.vfs_cache_pressure = 50

	# Percentage of available memory (free plus reclaimable pages) that may hold 'dirty' pages
	# (modified data that hasn't been written to disk) before a process generating writes is itself
	# forced to write them back, i.e. the point at which writers start to block.
	# vm.dirty_ratio = 15 # Adjust based on storage performance

	# Percentage of available memory (free plus reclaimable pages) that may hold 'dirty' pages
	# before the kernel's background flusher threads start writing them back.
	# vm.dirty_background_ratio = 5 # Adjust based on storage performance
	# Note: vm.dirty_ratio and vm.dirty_background_ratio are CRITICAL for storage performance.
	# The optimal values depend heavily on your storage type (SSD, NVMe, HDD, network storage)
	# and workload. High values can lead to large write bursts and potential I/O stalls.
	# Low values can cause excessive small writes. The values 15/5 are starting points;
	# monitor I/O wait and adjust carefully. Consider using absolute bytes instead of percentages
	# (vm.dirty_bytes, vm.dirty_background_bytes) on systems with large amounts of RAM.

	########################
	# 9) IPv6 Tuning (Optional) #
	########################
	# If IPv6 is used in your cluster (host or guests), note that most tuning is shared with IPv4
	# and has no separate net.ipv6.* key:
	# - TCP congestion control and TCP buffer sizing (net.ipv4.tcp_congestion_control, net.ipv4.tcp_rmem,
	#   net.ipv4.tcp_wmem) apply to TCP over IPv6 as well; there is no net.ipv6.conf.*.tcp_congestion_control.
	# - Socket buffer defaults and limits are net.core.rmem_default, net.core.wmem_default,
	#   net.core.rmem_max and net.core.wmem_max (e.g. 31457280 / 67108864); there are no
	#   net.ipv6.conf.*.rmem_* or wmem_* keys.
	# - IPv6 has no rp_filter sysctl; use a firewall rule (ip6tables/nftables reverse-path match) instead.
	# - net.netfilter.nf_conntrack_max and the net.netfilter.nf_conntrack_tcp_timeout_* keys are shared
	#   between IPv4 and IPv6; there is no net.netfilter.nf_conntrack_ip6_timeout_established key.
	# - net.bridge.bridge-nf-call-ip6tables = 1 is already included above.
	#
	# IPv6-specific hardening (uncomment if desired):
	# net.ipv6.conf.all.accept_redirects = 0
	# net.ipv6.conf.default.accept_redirects = 0
	# net.ipv6.conf.all.accept_source_route = 0
	# net.ipv6.conf.default.accept_source_route = 0
	#
	# Uncomment to disable IPv6 entirely if it is not needed:
	# net.ipv6.conf.all.disable_ipv6 = 1