在 Nginx 中,您可以通过 upstream 模块实现服务器故障转移机制。下面介绍几种可以满足您需求的方法:
1. 基础故障转移配置
# Basic failover: one primary server plus three servers flagged `backup`.
upstream backend_servers {
# Primary server (max_fails=3 / fail_timeout=30s control when it is considered unavailable)
server 192.168.1.10:8080 max_fails=3 fail_timeout=30s;
# Backup server 1 (receives requests when the primary server is unavailable)
server 192.168.1.11:8080 backup max_fails=3 fail_timeout=30s;
# Backup server 2
# NOTE(review): nginx documents `backup` servers as a group that is used when the
# primary servers are unavailable; a strict 11 -> 12 -> 13 order among the backups
# is not guaranteed by this configuration — confirm against the upstream module docs.
server 192.168.1.12:8080 backup max_fails=3 fail_timeout=30s;
# Backup server 3
server 192.168.1.13:8080 backup max_fails=3 fail_timeout=30s;
}
server {
listen 80;
server_name example.com;
location / {
proxy_pass http://backend_servers;
# Forward the original Host header and client address to the backend
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
# Conditions under which the request is passed on to the next server of the group
proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
# Short timeouts so that a dead server is detected quickly
proxy_connect_timeout 2s;
proxy_read_timeout 5s;
proxy_send_timeout 5s;
}
}
2. 完全按顺序的故障转移
要实现严格按顺序的故障转移(第一个失败 → 第二个 → 第三个),需要使用更复杂的配置:
# Strictly ordered failover: 192.168.1.10 -> .11 -> .12 -> .13.
# Each stage proxies to exactly one server and hands the request to the next
# stage through error_page when that server answers with (or nginx generates)
# a 500/502/503/504.

# Primary server
upstream primary_server {
    server 192.168.1.10:8080 max_fails=2 fail_timeout=30s;
}

server {
    listen 80;
    server_name example.com;

    # The stages below chain several error_page redirects
    # (/ -> @try_backup -> @try_backup2 -> @try_backup3). By default nginx
    # performs only a single error_page redirect per request, so the chain
    # needs recursive_error_pages.
    recursive_error_pages on;

    location / {
        # Stage 0: try the primary server first
        proxy_pass http://primary_server;
        # Let error_page also handle error responses produced by the backend
        proxy_intercept_errors on;
        proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
        # On failure, continue with the first backup server
        error_page 500 502 503 504 = @try_backup;
    }

    location @try_backup {
        # Stage 1: backup server 1
        proxy_pass http://192.168.1.11:8080;
        proxy_intercept_errors on;
        error_page 500 502 503 504 = @try_backup2;
    }

    location @try_backup2 {
        # Stage 2: backup server 2
        proxy_pass http://192.168.1.12:8080;
        proxy_intercept_errors on;
        error_page 500 502 503 504 = @try_backup3;
    }

    location @try_backup3 {
        # Stage 3: backup server 3 (last resort; its response is returned as-is)
        proxy_pass http://192.168.1.13:8080;
    }
}
3. 使用 Nginx Plus 的商业版功能
如果您使用 Nginx Plus,有更强大的故障转移功能:
# NGINX Plus: primary/backup group with active health checks.
upstream backend {
    # Shared memory zone; required so that the health-check state of the
    # group is shared between all worker processes.
    zone backend 64k;

    # Primary server
    server 192.168.1.10:8080;

    # Backup servers, used when the primary server is unavailable
    server 192.168.1.11:8080 backup;
    server 192.168.1.12:8080 backup;
    server 192.168.1.13:8080 backup;
}

server {
    listen 80;

    location / {
        proxy_pass http://backend;
        proxy_next_upstream error timeout http_500 http_502 http_503 http_504;

        # Active health check. The directive belongs to the location that
        # proxies to the group (it is not valid inside the upstream block):
        # probe every 5s, mark unhealthy after 3 failures, healthy again
        # after 2 consecutive passes.
        health_check interval=5s fails=3 passes=2;
    }
}
4. 使用 Lua 脚本实现智能故障转移
需要安装 ngx_http_lua_module(例如使用 OpenResty)以及 lua-resty-healthcheck 库:
http {
    lua_package_path "/etc/nginx/lua/?.lua;;";

    # Shared memory zones: one for the health-check state (referenced by
    # shm_name below) and one for lua-resty-worker-events.
    lua_shared_dict healthcheck 1m;
    lua_shared_dict worker_events 1m;

    init_worker_by_lua_block {
        -- lua-resty-healthcheck publishes status changes to all workers through
        -- lua-resty-worker-events, which has to be configured before a checker
        -- is created.
        local worker_events = require "resty.worker.events"
        local ok, err = worker_events.configure({
            shm = "worker_events",
            interval = 0.1,
        })
        if not ok then
            ngx.log(ngx.ERR, "failed to configure worker events: ", err)
            return
        end

        local healthcheck = require "resty.healthcheck"
        local checker = healthcheck.new({
            name = "backend",
            shm_name = "healthcheck",
            checks = {
                active = {
                    type = "http",
                    timeout = 2,
                    http_path = "/health",
                    healthy = {
                        interval = 5,
                        successes = 2,
                    },
                    unhealthy = {
                        interval = 1,
                        http_failures = 3,
                    },
                },
            },
        })

        -- Backend servers in priority order: the first healthy entry wins.
        local servers = {
            { host = "192.168.1.10", port = 8080 },
            { host = "192.168.1.11", port = 8080 },
            { host = "192.168.1.12", port = 8080 },
            { host = "192.168.1.13", port = 8080 },
        }
        for _, server in ipairs(servers) do
            local added, add_err = checker:add_target(server.host, server.port)
            if not added then
                ngx.log(ngx.ERR, "failed to add health-check target ",
                        server.host, ":", server.port, ": ", add_err)
            end
        end

        -- Locals of this block are not visible to the request phases, so the
        -- checker and the ordered server list are published through
        -- package.loaded (per-worker Lua VM state).
        package.loaded["backend_failover"] = {
            checker = checker,
            servers = servers,
        }
    }

    server {
        listen 80;

        location / {
            # The variable must be declared before Lua can assign ngx.var.backend.
            set $backend "";

            access_by_lua_block {
                local failover = package.loaded["backend_failover"]
                if not failover then
                    -- init_worker did not complete; no health information available
                    return ngx.exit(ngx.HTTP_SERVICE_UNAVAILABLE)
                end

                local checker = failover.checker
                local servers = failover.servers
                for i = 1, #servers do
                    local server = servers[i]
                    -- get_target_status returns true/false, or nil plus an error
                    -- for unknown targets; anything but true is treated as down.
                    if checker:get_target_status(server.host, server.port) then
                        ngx.var.backend = "http://" .. server.host .. ":" .. server.port
                        return
                    end
                end

                -- No healthy backend left
                return ngx.exit(ngx.HTTP_SERVICE_UNAVAILABLE)
            }

            proxy_pass $backend;
        }

        # Health endpoint of this proxy itself
        location /health {
            access_by_lua_block {
                ngx.status = 200
                ngx.say("OK")
                ngx.exit(200)
            }
        }
    }
}
5. 使用第三方模块:nginx-upstream-fair(示例中的 check 系列健康检查指令来自 nginx_upstream_check_module,需要另行编译安装)
# Upstream group using the third-party `fair` balancer plus active health checks.
upstream backend {
# Load-balancing method provided by the third-party nginx-upstream-fair module
fair;
# Primary server
server 192.168.1.10:8080;
# Backup servers
server 192.168.1.11:8080 backup;
server 192.168.1.12:8080 backup;
server 192.168.1.13:8080 backup;
# Active health check
# NOTE(review): the check / check_http_send / check_http_expect_alive directives
# appear to come from nginx_upstream_check_module (also bundled with Tengine),
# not from nginx-upstream-fair — confirm that module is compiled in.
# interval and timeout are given in milliseconds per that module's docs — TODO confirm.
check interval=3000 rise=2 fall=5 timeout=1000 type=http;
# Request sent as the probe
check_http_send "HEAD / HTTP/1.0\r\n\r\n";
# Response classes that count as "alive"
check_http_expect_alive http_2xx http_3xx;
}
6. 完整生产环境配置示例
# Main configuration file: nginx.conf
# Worker processes run as the unprivileged `nginx` user
user nginx;
# One worker per CPU core (auto-detected)
worker_processes auto;
# Log messages of level `warn` and above
error_log /var/log/nginx/error.log warn;
pid /var/run/nginx.pid;
events {
# Maximum number of simultaneous connections per worker process
worker_connections 1024;
# Connection processing method; epoll is Linux-specific
use epoll;
# Accept all pending connections at once instead of one at a time
multi_accept on;
}
http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;

    # Access log format; the trailing fields record which upstream server
    # handled the request, its status and the total request time, which makes
    # failovers visible in the log.
    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
                    '$status $body_bytes_sent "$http_referer" '
                    '"$http_user_agent" "$http_x_forwarded_for" '
                    'upstream: $upstream_addr $upstream_status $request_time';
    access_log /var/log/nginx/access.log main;

    sendfile on;
    tcp_nopush on;
    tcp_nodelay on;
    keepalive_timeout 65;
    types_hash_max_size 2048;

    # Failover group: one primary server and three backup servers
    upstream backend_group {
        # Load-balancing method: least connections
        least_conn;

        # Primary server
        server 192.168.1.10:8080 max_fails=3 fail_timeout=30s;
        # First backup
        server 192.168.1.11:8080 backup max_fails=3 fail_timeout=30s;
        # Second backup
        server 192.168.1.12:8080 backup max_fails=3 fail_timeout=30s;
        # Third backup
        server 192.168.1.13:8080 backup max_fails=3 fail_timeout=30s;

        # Session persistence via `sticky cookie` is an NGINX Plus feature and
        # makes open-source nginx fail to start, so it is not configured here.
    }

    # Group used by the /health endpoint (all servers, no primary/backup split)
    upstream backend_check {
        server 192.168.1.10:8080;
        server 192.168.1.11:8080;
        server 192.168.1.12:8080;
        server 192.168.1.13:8080;
    }

    server {
        listen 80;
        server_name example.com;

        # Nginx status page, reachable from the internal network only
        location /nginx_status {
            stub_status;
            access_log off;
            allow 192.168.1.0/24;
            deny all;
        }

        # Health endpoint, proxied to the backends
        location /health {
            proxy_pass http://backend_check;
            proxy_set_header Host $host;
            proxy_next_upstream error timeout http_500 http_502 http_503 http_504;
            access_log off;
        }

        location / {
            # Proxy to the failover group
            proxy_pass http://backend_group;

            # Timeouts
            proxy_connect_timeout 3s;
            proxy_read_timeout 10s;
            proxy_send_timeout 10s;

            # Retry policy. The group has four servers, so four tries are
            # needed for a request to be able to reach the last backup.
            # Retries stop once proxy_next_upstream_timeout has elapsed.
            proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
            proxy_next_upstream_tries 4;
            proxy_next_upstream_timeout 10s;

            # Request headers passed to the backend
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_set_header X-Forwarded-Host $host;
            proxy_set_header X-Forwarded-Port $server_port;

            # Buffering
            proxy_buffering on;
            proxy_buffer_size 4k;
            proxy_buffers 8 4k;
            proxy_busy_buffers_size 8k;

            # Miscellaneous: HTTP/1.1 with an empty Connection header
            proxy_redirect off;
            proxy_http_version 1.1;
            proxy_set_header Connection "";
        }

        # Error page
        error_page 500 502 503 504 /50x.html;
        location = /50x.html {
            root /usr/share/nginx/html;
        }
    }

    # Monitoring server, reachable from localhost only
    server {
        listen 8080;
        server_name localhost;

        location /stub_status {
            stub_status;
            access_log off;
            allow 127.0.0.1;
            deny all;
        }

        # There is no `upstream_status` directive in nginx ($upstream_status is
        # only a log variable). Per-server upstream state needs NGINX Plus or a
        # third-party status module; with stock nginx, use the upstream fields
        # of the access log instead.
    }
}
7. 使用 Keepalived 实现高可用
如果需要在多个 Nginx 节点间实现高可用:
# keepalived.conf
# Moves a virtual IP between Nginx nodes via VRRP, tracking whether nginx is running.

# Tracking script: pgrep exits non-zero when no nginx process exists
vrrp_script chk_nginx {
script "/usr/bin/pgrep nginx"
# Run the script every 2 seconds
interval 2
# Priority adjustment applied according to the script result
weight 2
# Consecutive failures / successes needed to change the script state
fall 2
rise 2
}
vrrp_instance VI_1 {
# Initial role of this node
state MASTER
# Network interface VRRP runs on
interface eth0
# NOTE(review): virtual_router_id presumably has to match on all nodes of this
# VRRP group — confirm against the peer configuration.
virtual_router_id 51
priority 100
# VRRP advertisement interval in seconds
advert_int 1
authentication {
auth_type PASS
# NOTE(review): example password — replace before production use
auth_pass 1111
}
# Virtual IP address held by the current master
virtual_ipaddress {
192.168.1.100/24
}
# Tie this instance to the nginx tracking script defined above
track_script {
chk_nginx
}
}
8. 监控和日志配置
#!/bin/bash
# /etc/nginx/check_backend.sh
#
# Monitoring script: probes the /health endpoint of every backend and prints
# one timestamped UP/DOWN line per backend.
# Exit status: 0 when all backends are up, 1 when at least one is down, so
# that cron jobs or monitoring agents can alert on the result.

BACKENDS=("192.168.1.10:8080" "192.168.1.11:8080" "192.168.1.12:8080" "192.168.1.13:8080")

status=0
for backend in "${BACKENDS[@]}"; do
    # -f: treat HTTP errors as failure; --max-time 2: give up after 2 seconds
    if curl -f --max-time 2 "http://$backend/health" >/dev/null 2>&1; then
        echo "$(date): $backend is UP"
    else
        echo "$(date): $backend is DOWN"
        status=1
    fi
done

exit "$status"
9. 测试配置
# Validate the configuration and reload only if it is valid
nginx -t && nginx -s reload

# See which upstream server handled recent requests: the `main` log format
# records $upstream_addr and $upstream_status for every request
tail -n 20 /var/log/nginx/access.log

# Nginx status: the monitoring server listens on port 8080 and only allows 127.0.0.1
curl http://127.0.0.1:8080/stub_status

# Failover test: request the proxied site once per second and print the HTTP
# status code. "/" is used because it goes through the failover group, whereas
# /health is proxied to a separate group and is not written to the access log.
while true; do
    curl -s -o /dev/null -w '%{http_code}\n' http://example.com/
    sleep 1
done
10. 关键参数说明
| 参数 | 说明 | 建议值 |
|---|---|---|
| max_fails | 最大失败次数 | 3 |
| fail_timeout | 失败后暂停时间 | 30s |
| backup | 标记为备份服务器 | – |
| weight | 服务器权重 | 1-10 |
| down | 手动标记服务器下线 | – |
| proxy_next_upstream | 触发切换到下一服务器的条件 | error timeout http_500… |
| proxy_next_upstream_tries | 最大重试次数 | 3 |
| proxy_connect_timeout | 连接超时 | 2-5s |
| proxy_read_timeout | 读取超时 | 10-30s |
11. 优化建议
- 健康检查:使用第三方模块或 Nginx Plus 进行主动健康检查
- 会话保持:如果应用需要会话,使用 `sticky` 模块
- 监控告警:设置监控,当服务器故障时发送告警
- 自动恢复:配置脚本自动重启失败的服务
- 日志分析:分析访问日志,了解故障转移频率
12. 注意事项
- 备份服务器只在所有主服务器都不可用时才会被使用
- `backup` 参数不能与 `hash`、`ip_hash`、`random` 负载均衡方法同时使用;使用 `least_conn` 等其他算法时,备份服务器的行为也可能有所不同
- 生产环境建议结合监控系统(如 Prometheus + Grafana)
- 定期进行故障转移测试
- 确保备份服务器的数据和配置与主服务器同步
这种配置确保了当第一个服务器失效时,Nginx 会自动尝试第二个服务器,依次类推,直到找到可用的服务器。