2023 年我们一个 Nginx 入口集群,日常流量 20w QPS,大促前压测发现峰值 50w QPS 时 5xx 飙升,worker 进程频繁拒连,响应时间从 5ms 涨到 300ms。投了一周做 Nginx 调优:连接数、缓冲区、TCP 参数、TLS 加速、HTTP/2,把单机 QPS 从 10w 提到 60w,5xx 从 8% 降到 0.02%。本文复盘 Nginx 高并发调优的完整方案。
问题背景
架构:Nginx Plus 1.25 → 后端 Spring Boot 集群
机器:32C 64G,10Gbps 网卡
日常 QPS:单机 10w,集群 4 台 = 40w
压测目标:单机 60w QPS
压测发现:
- worker_processes auto = 32(对的)
- 单 worker QPS 顶到 3w 不动
- 5xx 错误率 8%
- TIME_WAIT 占满端口
- TLS handshake 慢(占 CPU)
需要从根本上优化
优化 1:进程 + 连接配置
# /etc/nginx/nginx.conf
# 进程数 = CPU 核数
worker_processes auto;
# 每个 worker 最大连接数(关键!)
events {
worker_connections 65535; # 内置默认 512(发行版自带配置常见 768/1024),太小;反向代理时每个请求同时占用客户端和上游两个连接
use epoll; # Linux 必须 epoll
multi_accept on; # 一次接受多个连接
accept_mutex off; # 1.11.3+ 默认 off;内核支持 EPOLLEXCLUSIVE 或启用 reuseport 时无需互斥锁
}
# 每个 worker 进程的 fd 上限(RLIMIT_NOFILE),必须大于 worker_connections
worker_rlimit_nofile 100000;
# /etc/security/limits.conf
nginx soft nofile 100000
nginx hard nofile 100000
# /etc/systemd/system/nginx.service.d/override.conf
[Service]
LimitNOFILE=100000
# 验证
$ cat /proc/$(pgrep -f 'nginx: worker' | head -1)/limits | grep -i 'open files'
Max open files 100000 100000
优化 2:HTTP 参数
http {
# 文件缓存(静态文件)
open_file_cache max=10000 inactive=30s;
open_file_cache_valid 60s;
open_file_cache_min_uses 2;
open_file_cache_errors on;
# 缓冲区
client_body_buffer_size 16k;
client_header_buffer_size 4k;
large_client_header_buffers 8 16k;
client_max_body_size 50m;
# 超时
client_body_timeout 12s;
client_header_timeout 12s;
send_timeout 10s;
keepalive_timeout 65s;
keepalive_requests 10000; # 单连接最多 10000 请求
keepalive_disable msie6;
# sendfile / tcp_nopush / tcp_nodelay
sendfile on;
tcp_nopush on;
tcp_nodelay on;
# gzip(注意:小于 gzip_min_length 的小响应不压缩,收益低;压缩级别越高 CPU 开销越大)
gzip on;
gzip_vary on;
gzip_min_length 1024;
gzip_comp_level 4; # 4 是 CPU/效果平衡点
gzip_types text/plain text/css application/json application/javascript text/xml application/xml;
gzip_proxied any;
# 隐藏版本
server_tokens off;
}
优化 3:upstream 配置
upstream backend {
# 负载均衡(默认 round-robin)
least_conn; # 最少连接(推荐高并发)
# 后端节点
server 10.0.1.10:8080 max_fails=3 fail_timeout=30s weight=5;
server 10.0.1.11:8080 max_fails=3 fail_timeout=30s weight=5;
server 10.0.1.12:8080 max_fails=3 fail_timeout=30s weight=5;
# keepalive 长连接(关键!)
keepalive 100; # 每 worker 保留 100 个长连接
keepalive_requests 10000;
keepalive_timeout 60s;
}
server {
listen 443 ssl http2 reuseport; # reuseport 必开!
location / {
proxy_pass http://backend;
proxy_http_version 1.1;
proxy_set_header Connection ""; # 关键:让 keepalive 生效
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_connect_timeout 3s;
proxy_send_timeout 30s;
proxy_read_timeout 30s;
proxy_buffering on;
proxy_buffer_size 16k;
proxy_buffers 8 16k;
proxy_busy_buffers_size 32k;
# 失败重试
proxy_next_upstream error timeout http_500 http_502 http_503;
proxy_next_upstream_tries 2;
proxy_next_upstream_timeout 5s;
}
}
优化 4:TLS 加速
server {
listen 443 ssl http2 reuseport;
server_name api.example.com;
# 证书
ssl_certificate /etc/nginx/ssl/cert.pem;
ssl_certificate_key /etc/nginx/ssl/key.pem;
# 协议(只开 TLS 1.2 + 1.3)
ssl_protocols TLSv1.2 TLSv1.3;
ssl_prefer_server_ciphers off;
# 加密套件(TLS 1.3 + 1.2 现代套件)
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305;
# Session 缓存(重要!避免每次握手)
ssl_session_cache shared:SSL:50m;
ssl_session_timeout 1d;
ssl_session_tickets on;
# OCSP stapling
ssl_stapling on;
ssl_stapling_verify on;
resolver 8.8.8.8 1.1.1.1 valid=300s;
resolver_timeout 5s;
# ECC 证书优先(比 RSA 快 5x)
# 双证书:RSA + ECC
ssl_certificate /etc/nginx/ssl/rsa.crt;
ssl_certificate_key /etc/nginx/ssl/rsa.key;
ssl_certificate /etc/nginx/ssl/ecc.crt;
ssl_certificate_key /etc/nginx/ssl/ecc.key;
# HSTS
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
}
# 效果:
# - Session cache 命中,握手从 50ms → 1ms
# - ECC 证书,握手 CPU 占用降 60%
# - TLS 1.3 老客户端再次连接可走会话恢复,省掉完整握手;真正的 0-RTT 还需额外配置 ssl_early_data on(上面的配置未开启),且存在重放风险,只适合幂等请求
优化 5:HTTP/2 + HTTP/3
# HTTP/2(必开;注意 1.25.1 起 listen 的 http2 参数已废弃,新版本应改用独立指令 http2 on;)
listen 443 ssl http2 reuseport;
# HTTP/2 优化
http2_max_concurrent_streams 128;
http2_recv_buffer_size 256k;
# HTTP/3 + QUIC(nginx 1.25+)
listen 443 quic reuseport;
listen 443 ssl http2 reuseport;
ssl_protocols TLSv1.3; # HTTP/3(QUIC)必须 TLS 1.3;注意这样写会让同一 server 上 TCP 的 HTTPS/HTTP/2 也只剩 TLS 1.3,要兼容老客户端应写成 TLSv1.2 TLSv1.3
add_header Alt-Svc 'h3=":443"; ma=86400'; # 告诉客户端有 HTTP/3
# 客户端首次走 HTTP/2,得到 Alt-Svc 后下次走 HTTP/3
# Chrome/Edge/Safari 支持
优化 6:Linux 内核参数
# /etc/sysctl.conf
# TCP backlog(高并发关键)
net.core.somaxconn = 65535
net.ipv4.tcp_max_syn_backlog = 65535
net.core.netdev_max_backlog = 50000
# TIME_WAIT 优化
# 复用 TIME_WAIT(只对本机主动发起的出向连接生效,例如 Nginx → upstream)
net.ipv4.tcp_tw_reuse = 1
# net.ipv4.tcp_tw_recycle 已在 Linux 4.12 中移除,不要再配置
# FIN_WAIT_2 状态超时(默认 60s);TIME_WAIT 时长由内核固定为 60s,不受此参数影响
net.ipv4.tcp_fin_timeout = 15
net.ipv4.ip_local_port_range = 1024 65535
net.ipv4.tcp_max_tw_buckets = 1000000
# 连接队列
net.ipv4.tcp_synack_retries = 2
net.ipv4.tcp_syn_retries = 2
# TCP buffer
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216
# 文件句柄
fs.file-max = 2097152
fs.nr_open = 2097152
# 应用
$ sysctl -p
# 验证
$ ss -tan | awk '{print $1}' | sort | uniq -c
LISTEN
ESTAB
TIME-WAIT 5000 # 大量 TW 但端口够用
优化 7:限流 + 防刷
http {
# 限流定义(注意:X-Forwarded-For 由客户端传入、可伪造,且 key 为空时该请求不参与限流;只有在前置可信代理/CDN 会覆盖该头时才适合用作限流 key)
limit_req_zone $binary_remote_addr zone=ip_limit:10m rate=100r/s;
limit_req_zone $http_x_forwarded_for zone=user_limit:10m rate=10r/s;
limit_conn_zone $binary_remote_addr zone=conn_limit:10m;
server {
location /api/login {
limit_req zone=user_limit burst=20 nodelay; # 登录严格限
limit_req_status 429;
proxy_pass http://backend;
}
location /api/ {
limit_req zone=ip_limit burst=200 nodelay; # 一般 API
limit_conn conn_limit 50;
proxy_pass http://backend;
}
}
}
# 防爬虫
map $http_user_agent $bad_bot {
default 0;
~*(scrapy|bot|spider) 1;
}
server {
if ($bad_bot) {
return 403;
}
}
优化 8:静态资源 + CDN
server {
# 静态资源缓存
location ~* \.(jpg|jpeg|png|gif|ico|css|js|woff2)$ {
expires 7d;
add_header Cache-Control "public, immutable";
access_log off;
}
# HTML 不缓存
location ~ \.html$ {
expires -1;
add_header Cache-Control "no-store, must-revalidate";
}
}
# 配 CDN:Nginx 只做回源,大部分流量打 CDN
# CDN 命中率:90%+
# Nginx 实际压力降 90%
压测结果
压测工具:wrk2 -t 16 -c 5000 -d 60s -R 600000 https://api.example.com/api/ping
单机 32C 64G
阶段 QPS P50 P99 5xx
==========================================================================
基线(默认配置) 100k 50ms 2s 8%
调 worker + 连接 200k 20ms 500ms 5%
调 keepalive 300k 15ms 200ms 2%
TLS Session + ECC 400k 10ms 100ms 0.5%
+ HTTP/2 + reuseport 500k 8ms 50ms 0.1%
+ 内核参数 600k 5ms 30ms 0.02%
集群 4 台:240w QPS,5xx 0.02%
监控指标
# 状态接口
location /nginx_status {
stub_status on;
access_log off;
allow 127.0.0.1;
deny all;
}
$ curl localhost/nginx_status
Active connections: 23456
server accepts handled requests
1234567 1234567 9876543
Reading: 0 Writing: 1234 Waiting: 22222
# 关键指标:
# Active connections:总连接数
# Writing:正在响应(可能瓶颈)
# Waiting:keepalive 空闲连接(多说明长连接复用好)
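# Prometheus + nginx_exporter(注意:基于 stub_status 的官方 exporter 只有连接数和请求总数,不带 status 标签,也没有 upstream 指标;下面按状态码/上游的指标和告警需要 NGINX Plus API、VTS 模块或日志类 exporter 提供)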
nginx_connections_active
nginx_connections_reading
nginx_connections_writing
nginx_http_requests_total
nginx_upstream_response_time_seconds
# 告警
- alert: NginxHigh5xx
expr: rate(nginx_http_requests_total{status=~"5.."}[1m]) / rate(nginx_http_requests_total[1m]) > 0.01
for: 2m
- alert: NginxUpstreamDown
expr: nginx_upstream_state == 0
for: 1m
避坑清单
- worker_connections 必须改 65535,内置默认 512(发行版配置常见 1024)不够用
- worker_rlimit_nofile + systemd LimitNOFILE 必须 100000+
- upstream 必须 keepalive,proxy_http_version 1.1 + Connection ""
- listen 443 必须加 reuseport,内核负载均衡
- TLS Session cache 必开,握手从 50ms → 1ms
- 双证书(RSA + ECC),客户端自动选 ECC
- HTTP/2 必开,HTTP/3 + QUIC 渐进推广
- limit_req + limit_conn 防刷,网关层就拦
- net.core.somaxconn 必须改 65535(内核 5.4 之前默认 128,之后默认 4096);同时 listen 要显式加 backlog=65535,否则 Nginx 在 Linux 上的监听队列默认只有 511,只改内核参数不生效
- nginx -t 测试配置语法;reload 时 master 进程不重启,而是拉起新 worker、旧 worker 处理完存量连接后平滑退出,不中断服务
总结
Nginx 是个看起来简单实则深的工具,默认配置只能跑 10w QPS,调优后能跑 60w QPS。这次入口集群调优把单机性能提升 6x,大促完全顶得住。最大的认知改变:Nginx 高并发的关键不是某一个参数,而是"连接 + 缓冲 + keepalive + TLS + 内核"五层协同。任何一层有瓶颈整体就上不去。HTTP/2 + HTTP/3 + reuseport 这种新技术该用就用,降延迟显著。最后,nginx -V 看编译的模块,nginx -t 测语法,access.log + error.log + nginx_status 三件套是排查必备。日常生产 Nginx 别瞎重启,reload + 灰度才稳。
—— 别看了 · 2026