Nginx 高并发调优实录:单机 10w→60w QPS 全过程

入口 Nginx 集群压测大促瓶颈,单机 10w QPS 撞墙、5xx 飙到 8%。一周调优全实录:worker 进程/连接、upstream keepalive、TLS Session + ECC + HTTP/2、HTTP/3 QUIC、内核 TCP 参数、limit_req 限流。单机 QPS 提到 60w,5xx 降到 0.02%。

2023 年我们一个 Nginx 入口集群,日常流量 20w QPS,大促前压测发现峰值 50w QPS 时 5xx 飙升,worker 进程频繁拒连,响应时间从 5ms 涨到 300ms。投了一周做 Nginx 调优:连接数、缓冲区、TCP 参数、TLS 加速、HTTP/2,把单机 QPS 从 10w 提到 60w,5xx 从 8% 降到 0.02%。本文复盘 Nginx 高并发调优的完整方案。

问题背景

架构:Nginx Plus 1.25 → 后端 Spring Boot 集群
机器:32C 64G,10Gbps 网卡
日常 QPS:单机 10w,集群 4 台 = 40w
压测目标:单机 60w QPS

压测发现:
- worker_processes auto = 32(对的)
- 单 worker QPS 顶到 3k 左右不动(32 个 worker 合计约 10w)
- 5xx 错误率 8%
- TIME_WAIT 占满端口
- TLS handshake 慢(占 CPU)

需要从根本上优化

优化 1:进程 + 连接配置

# /etc/nginx/nginx.conf

# 进程数 = CPU 核数
worker_processes auto;

# 每个 worker 最大连接数(关键!)
events {
    worker_connections 65535;     # 内置默认值 512(发行版自带配置通常写 1024),都太小
    use epoll;                     # Linux 必须 epoll
    multi_accept on;               # 一次接受多个连接
    accept_mutex off;              # 1.11.3+ 默认 off:不加互斥锁,由各 worker 自行竞争 accept
}

# 每个 worker 进程可打开的最大 fd 数(RLIMIT_NOFILE,不是系统级限制),须大于 worker_connections
worker_rlimit_nofile 100000;

# /etc/security/limits.conf
nginx soft nofile 100000
nginx hard nofile 100000

# /etc/systemd/system/nginx.service.d/override.conf
[Service]
LimitNOFILE=100000

# 验证
$ cat /proc/$(pgrep -f 'nginx: worker' | head -1)/limits | grep -i 'open files'
Max open files            100000               100000

优化 2:HTTP 参数

http {
    # 文件缓存(静态文件)
    open_file_cache max=10000 inactive=30s;
    open_file_cache_valid 60s;
    open_file_cache_min_uses 2;
    open_file_cache_errors on;

    # 缓冲区
    client_body_buffer_size 16k;
    client_header_buffer_size 4k;
    large_client_header_buffers 8 16k;
    client_max_body_size 50m;

    # 超时
    client_body_timeout 12s;
    client_header_timeout 12s;
    send_timeout 10s;
    keepalive_timeout 65s;
    keepalive_requests 10000;          # 单连接最多 10000 请求
    keepalive_disable msie6;

    # sendfile / tcp_nopush / tcp_nodelay
    sendfile on;
    tcp_nopush on;
    tcp_nodelay on;

    # gzip(注意:大文件不压缩,CPU 开销大)
    gzip on;
    gzip_vary on;
    gzip_min_length 1024;
    gzip_comp_level 4;                 # 4 是 CPU/效果平衡点
    gzip_types text/plain text/css application/json application/javascript text/xml application/xml;
    gzip_proxied any;

    # 隐藏版本
    server_tokens off;
}

优化 3:upstream 配置

upstream backend {
    # 负载均衡(默认 round-robin)
    least_conn;                        # 最少连接(推荐高并发)

    # 后端节点
    server 10.0.1.10:8080 max_fails=3 fail_timeout=30s weight=5;
    server 10.0.1.11:8080 max_fails=3 fail_timeout=30s weight=5;
    server 10.0.1.12:8080 max_fails=3 fail_timeout=30s weight=5;

    # keepalive 长连接(关键!)
    keepalive 100;                     # 每 worker 保留 100 个长连接
    keepalive_requests 10000;
    keepalive_timeout 60s;
}

server {
    listen 443 ssl http2 reuseport;    # reuseport 必开!(注:1.25.1+ 中 listen 的 http2 参数已弃用,应改用独立指令 http2 on;)

    location / {
        proxy_pass http://backend;
        proxy_http_version 1.1;
        proxy_set_header Connection "";     # 关键:让 keepalive 生效
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        proxy_connect_timeout 3s;
        proxy_send_timeout 30s;
        proxy_read_timeout 30s;

        proxy_buffering on;
        proxy_buffer_size 16k;
        proxy_buffers 8 16k;
        proxy_busy_buffers_size 32k;

        # 失败重试
        proxy_next_upstream error timeout http_500 http_502 http_503;
        proxy_next_upstream_tries 2;
        proxy_next_upstream_timeout 5s;
    }
}

优化 4:TLS 加速

server {
    listen 443 ssl http2 reuseport;
    server_name api.example.com;

    # 证书
    ssl_certificate /etc/nginx/ssl/cert.pem;
    ssl_certificate_key /etc/nginx/ssl/key.pem;

    # 协议(只开 TLS 1.2 + 1.3)
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_prefer_server_ciphers off;

    # 加密套件(TLS 1.3 + 1.2 现代套件)
    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305;

    # Session 缓存(重要!避免每次握手)
    ssl_session_cache shared:SSL:50m;
    ssl_session_timeout 1d;
    ssl_session_tickets on;

    # OCSP stapling
    ssl_stapling on;
    ssl_stapling_verify on;
    resolver 8.8.8.8 1.1.1.1 valid=300s;
    resolver_timeout 5s;

    # ECC 证书优先(比 RSA 快 5x)
    # 双证书:RSA + ECC
    ssl_certificate /etc/nginx/ssl/rsa.crt;
    ssl_certificate_key /etc/nginx/ssl/rsa.key;
    ssl_certificate /etc/nginx/ssl/ecc.crt;
    ssl_certificate_key /etc/nginx/ssl/ecc.key;

    # HSTS
    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
}

# 效果:
# - Session cache 命中,握手从 50ms → 1ms
# - ECC 证书,握手 CPU 占用降 60%
# - TLS 1.3 会话恢复:老客户端再次连接 1-RTT 即完成握手(0-RTT 需额外配置 ssl_early_data on,且存在重放风险,上面的配置并未开启)

优化 5:HTTP/2 + HTTP/3

# HTTP/2(必开)
listen 443 ssl http2 reuseport;

# HTTP/2 优化
http2_max_concurrent_streams 128;
http2_recv_buffer_size 256k;

# HTTP/3 + QUIC(nginx 1.25+)
listen 443 quic reuseport;
listen 443 ssl http2 reuseport;

ssl_protocols TLSv1.2 TLSv1.3;     # HTTP/3(QUIC)只使用 TLS 1.3;保留 TLSv1.2 供 HTTP/2、HTTP/1.1 的老客户端使用

add_header Alt-Svc 'h3=":443"; ma=86400';     # 告诉客户端有 HTTP/3

# 客户端首次走 HTTP/2,得到 Alt-Svc 后下次走 HTTP/3
# Chrome/Edge/Safari 支持

优化 6:Linux 内核参数

# /etc/sysctl.conf
# TCP backlog(高并发关键)
net.core.somaxconn = 65535                # 同时要在 nginx 的 listen 上加 backlog=65535,否则 nginx 在 Linux 上默认 backlog 只有 511
net.ipv4.tcp_max_syn_backlog = 65535
net.core.netdev_max_backlog = 50000

# TIME_WAIT 优化
net.ipv4.tcp_tw_reuse = 1                 # 复用 TIME_WAIT
# net.ipv4.tcp_tw_recycle = 1             # 内核 4.12+ 已删除(NAT 环境下会丢连接,旧内核也不要开)
net.ipv4.tcp_fin_timeout = 15             # FIN_WAIT_2 超时(默认 60s);注意它不改变 TIME_WAIT 时长,Linux 上 TIME_WAIT 固定为 60s
net.ipv4.ip_local_port_range = 1024 65535
net.ipv4.tcp_max_tw_buckets = 1000000

# 连接队列
net.ipv4.tcp_synack_retries = 2
net.ipv4.tcp_syn_retries = 2

# TCP buffer
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216

# 文件句柄
fs.file-max = 2097152
fs.nr_open = 2097152

# 应用
$ sysctl -p

# 验证
$ ss -tan | awk '{print $1}' | sort | uniq -c
  LISTEN
  ESTAB
  TIME-WAIT 5000        # 大量 TW 但端口够用

优化 7:限流 + 防刷

http {
    # 限流定义
    limit_req_zone $binary_remote_addr zone=ip_limit:10m rate=100r/s;
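    # 不要用 $http_x_forwarded_for 作 key:该请求头可被客户端任意伪造,且 key 为空时请求不计入限流,登录限流会被直接绕过。
    # 若前面有 CDN/LB,先用 realip 模块(set_real_ip_from + real_ip_header)还原真实客户端地址。
    limit_req_zone $binary_remote_addr zone=user_limit:10m rate=10r/s;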
    limit_conn_zone $binary_remote_addr zone=conn_limit:10m;

    server {
        location /api/login {
            limit_req zone=user_limit burst=20 nodelay;     # 登录严格限
            limit_req_status 429;
            proxy_pass http://backend;
        }

        location /api/ {
            limit_req zone=ip_limit burst=200 nodelay;       # 一般 API
            limit_conn conn_limit 50;
            proxy_pass http://backend;
        }
    }
}

# 防爬虫
map $http_user_agent $bad_bot {
    default 0;
    ~*(scrapy|bot|spider) 1;      # 注意:bot/spider 也会命中 Googlebot、bingbot 等正常搜索引擎爬虫,需要收录的站点应先加白名单
}

server {
    if ($bad_bot) {
        return 403;
    }
}

优化 8:静态资源 + CDN

server {
    # 静态资源缓存
    location ~* \.(jpg|jpeg|png|gif|ico|css|js|woff2)$ {
        expires 7d;
        add_header Cache-Control "public, immutable";
        access_log off;
    }

    # HTML 不缓存
    location ~ \.html$ {
        expires -1;
        add_header Cache-Control "no-store, must-revalidate";
    }
}

# 配 CDN:Nginx 只做回源,大部分流量打 CDN
# CDN 命中率:90%+
# Nginx 实际压力降 90%

压测结果

压测工具:wrk2 -t 16 -c 5000 -d 60s -R 600000 https://api.example.com/api/ping
单机 32C 64G

阶段                      QPS         P50        P99        5xx
==========================================================================
基线(默认配置)         100k         50ms       2s         8%
调 worker + 连接          200k         20ms       500ms      5%
调 keepalive             300k          15ms      200ms      2%
TLS Session + ECC        400k         10ms       100ms      0.5%
+ HTTP/2 + reuseport     500k         8ms        50ms       0.1%
+ 内核参数               600k         5ms        30ms       0.02%

集群 4 台:240w QPS,5xx 0.02%

监控指标

# 状态接口
location /nginx_status {
    stub_status on;
    access_log off;
    allow 127.0.0.1;
    deny all;
}

$ curl localhost/nginx_status
Active connections: 23456
server accepts handled requests
 1234567 1234567 9876543
Reading: 0 Writing: 1234 Waiting: 22222

# 关键指标:
# Active connections:总连接数
# Writing:正在响应(可能瓶颈)
# Waiting:keepalive 空闲连接(多说明长连接复用好)

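# Prometheus + nginx_exporter(注:基于 stub_status 的 exporter 通常只有连接数和请求总数,没有 status 标签和 upstream 指标;下面按状态码、upstream 的指标与告警需要 NGINX Plus API、VTS 模块或日志类 exporter,指标名以实际 exporter 为准)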
nginx_connections_active
nginx_connections_reading
nginx_connections_writing
nginx_http_requests_total
nginx_upstream_response_time_seconds

# 告警
- alert: NginxHigh5xx
  expr: rate(nginx_http_requests_total{status=~"5.."}[1m]) / rate(nginx_http_requests_total[1m]) > 0.01
  for: 2m

- alert: NginxUpstreamDown
  expr: nginx_upstream_state == 0
  for: 1m

避坑清单

  1. worker_connections 必须调大(如 65535),内置默认 512、发行版配置常见 1024,都不够用
  2. worker_rlimit_nofile + systemd LimitNOFILE 必须 100000+
  3. upstream 必须 keepalive,proxy_http_version 1.1 + Connection ""
  4. listen 443 必须加 reuseport,内核负载均衡
  5. TLS Session cache 必开,握手从 50ms → 1ms
  6. 双证书(RSA + ECC),客户端自动选 ECC
  7. HTTP/2 必开,HTTP/3 + QUIC 渐进推广
  8. limit_req + limit_conn 防刷,网关层就拦
  9. net.core.somaxconn 必须改 65535(旧内核默认 128,Linux 5.4+ 默认 4096),并在 listen 上同步设置 backlog=,否则 nginx 仍只用 511
  10. nginx -t 测试配置语法,reload 是平滑重载:master 不重启,新 worker 接管新连接,旧 worker 处理完存量连接后退出

总结

Nginx 是个看起来简单实则深的工具,默认配置只能跑 10w QPS,调优后能跑 60w QPS。这次入口集群调优把单机性能提升 6x,大促完全顶得住。最大的认知改变:Nginx 高并发的关键不是某一个参数,而是"连接 + 缓冲 + keepalive + TLS + 内核"五层协同。任何一层有瓶颈整体就上不去。HTTP/2 + HTTP/3 + reuseport 这种新技术该用就用,降延迟显著。最后,nginx -V 看编译的模块,nginx -t 测语法,access.log + error.log + nginx_status 三件套是排查必备。日常生产 Nginx 别瞎重启,reload + 灰度才稳。

—— 别看了 · 2026
声明:本站所有文章,如无特殊说明或标注,均为本站原创发布。任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。如若本站内容侵犯了原著者的合法权益,可联系我们进行处理 邮箱1846861578@qq.com。
技术教程

Kafka 5000w Lag 8 小时事故复盘:消费端优化全实录

2026-5-19 12:49:03

技术教程

Elasticsearch 80 亿日志治理:P99 8s→200ms 磁盘 -60%

2026-5-19 12:53:19

0 条回复 A文章作者 M管理员
    暂无讨论,说说你的看法吧
个人中心
购物车
优惠劵
今日签到
有新私信 私信列表
搜索