Clustering and High Availability

Deploying Mailborder in a clustered configuration for high availability and load distribution.

Architecture Overview

Cluster Components

Load Balancer Tier:
- HAProxy or NGINX load balancer
- Failover with keepalived (VRRP)
- Health check endpoints

Application Tier:
- Multiple Mailborder nodes
- Shared session storage (Redis)
- Synchronized configuration

Database Tier:
- MariaDB Galera cluster or primary-replica
- Automated failover
- Connection pooling

Storage Tier:
- Shared quarantine storage (NFS or GlusterFS)
- Centralized logging
- Backup storage
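
A simplified view of how these tiers fit together, using the example addresses from this guide:

              Mail / HTTPS clients
                       |
         VIP 192.168.1.100 (keepalived)
                       |
          HAProxy primary  /  backup
                       |
      +----------------+----------------+
      |                |                |
  mb1 (.101)       mb2 (.102)       mb3 (.103)
      |                |                |
      +----------------+----------------+
                       |
     MariaDB Galera (.111-.113)
     Redis + Sentinel  (.121-.123)
     Shared quarantine storage (.130-.132)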

Load Balancer Configuration

HAProxy Setup

Install HAProxy:

sudo apt install haproxy

Configure HAProxy:

sudo tee /etc/haproxy/haproxy.cfg << 'EOF'
global
    log /dev/log local0
    chroot /var/lib/haproxy
    stats socket /run/haproxy/admin.sock mode 660 level admin
    stats timeout 30s
    user haproxy
    group haproxy
    daemon

defaults
    log     global
    mode    tcp
    option  tcplog
    option  dontlognull
    timeout connect 5000
    timeout client  50000
    timeout server  50000

# SMTP Load Balancing
frontend smtp_frontend
    bind *:25
    mode tcp
    default_backend smtp_backend

backend smtp_backend
    mode tcp
    balance roundrobin
    option tcp-check
    tcp-check connect port 10025
    server mb1 192.168.1.101:25 check
    server mb2 192.168.1.102:25 check
    server mb3 192.168.1.103:25 check

# HTTPS Load Balancing
frontend https_frontend
    bind *:443 ssl crt /etc/ssl/certs/mailborder.pem
    mode http
    default_backend https_backend

backend https_backend
    mode http
    balance roundrobin
    option httpchk GET /health
    http-check expect status 200
    cookie SERVERID insert indirect nocache
    server mb1 192.168.1.101:443 check ssl verify none cookie mb1
    server mb2 192.168.1.102:443 check ssl verify none cookie mb2
    server mb3 192.168.1.103:443 check ssl verify none cookie mb3

# Statistics
listen stats
    bind *:8404
    mode http
    stats enable
    stats uri /stats
    stats refresh 5s
    stats auth admin:secure_password
EOF

sudo systemctl restart haproxy
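
It is worth validating the configuration before each restart, and the admin socket can confirm backend state afterwards (socat is also used for maintenance later in this guide):

# Syntax-check the configuration file
sudo haproxy -c -f /etc/haproxy/haproxy.cfg

# Report the state of all backend servers via the admin socket
echo "show servers state" | sudo socat stdio /run/haproxy/admin.sock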

Keepalived for HA

Install keepalived:

sudo apt install keepalived

Configure on primary load balancer:

sudo tee /etc/keepalived/keepalived.conf << 'EOF'
vrrp_script check_haproxy {
    script "/usr/bin/killall -0 haproxy"
    interval 2
    weight 2
}

vrrp_instance VI_1 {
    state MASTER
    interface eth0
    virtual_router_id 51
    priority 101
    advert_int 1

    authentication {
        auth_type PASS
        auth_pass secure_password
    }

    virtual_ipaddress {
        192.168.1.100/24
    }

    track_script {
        check_haproxy
    }
}
EOF

sudo systemctl restart keepalived

Configure on backup load balancer:

# Same as above, but:
state BACKUP
priority 100
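
To confirm failover works, check which node currently holds the VIP (eth0 matches the interface in the config above; adjust for your environment):

# On the primary, the VIP should be present
ip addr show eth0 | grep 192.168.1.100

# Stop HAProxy on the primary; the check script fails and the VIP
# should move to the backup within a few seconds
sudo systemctl stop haproxy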

Database Clustering

MariaDB Galera Cluster

Install Galera on all database nodes:

sudo apt install mariadb-server mariadb-client galera-4

Configure first node (mb-db1):

sudo tee /etc/mysql/mariadb.conf.d/60-galera.cnf << 'EOF'
[galera]
wsrep_on=ON
wsrep_provider=/usr/lib/galera/libgalera_smm.so
wsrep_cluster_name="mailborder_cluster"
wsrep_cluster_address="gcomm://192.168.1.111,192.168.1.112,192.168.1.113"
wsrep_node_address="192.168.1.111"
wsrep_node_name="mb-db1"
wsrep_sst_method=rsync

binlog_format=ROW
default_storage_engine=InnoDB
innodb_autoinc_lock_mode=2
innodb_doublewrite=1
innodb_flush_log_at_trx_commit=0

# Allow replication from all cluster nodes
bind-address=0.0.0.0
EOF

Bootstrap first node:

# galera_new_cluster bootstraps the cluster and starts MariaDB on this node;
# no separate "systemctl start mysql" is needed here
sudo galera_new_cluster

Configure additional nodes (mb-db2, mb-db3):

# Same configuration, change:
wsrep_node_address="192.168.1.112"  # or .113
wsrep_node_name="mb-db2"            # or mb-db3

sudo systemctl start mysql

Verify cluster:

sudo mysql -e "SHOW STATUS LIKE 'wsrep_cluster_size'"

Output:

+--------------------+-------+
| Variable_name      | Value |
+--------------------+-------+
| wsrep_cluster_size | 3     |
+--------------------+-------+
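
Two other wsrep status variables make useful sanity checks: the node's sync state and the list of current cluster members.

sudo mysql -e "SHOW STATUS LIKE 'wsrep_local_state_comment'"   # should report "Synced"
sudo mysql -e "SHOW STATUS LIKE 'wsrep_incoming_addresses'"    # lists all member addresses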

Database Connection Configuration

Configure Mailborder nodes to use cluster:

sudo nano /etc/mailborder/mailborder.conf

[database]
# Option 1: point at a load balancer or connection pooler
host = 192.168.1.100
port = 3306

# Option 2: list the cluster nodes directly for client-side failover
#hosts = 192.168.1.111:3306,192.168.1.112:3306,192.168.1.113:3306

# Connection pooling
pool_size = 10
pool_max = 50
pool_timeout = 30

Shared Redis Configuration

Redis Sentinel for HA

Install Redis Sentinel on all nodes:

sudo apt install redis-sentinel

Configure Redis primary (mb-redis1):

sudo tee -a /etc/redis/redis.conf << 'EOF'
bind 0.0.0.0
protected-mode yes
requirepass secure_redis_password
masterauth secure_redis_password
EOF

sudo systemctl restart redis-server

Configure Redis replicas (mb-redis2, mb-redis3):

sudo tee -a /etc/redis/redis.conf << 'EOF'
bind 0.0.0.0
replicaof 192.168.1.121 6379
masterauth secure_redis_password
requirepass secure_redis_password
EOF

sudo systemctl restart redis-server

Configure Sentinel on all nodes:

sudo tee /etc/redis/sentinel.conf << 'EOF'
port 26379
sentinel monitor mailborder-master 192.168.1.121 6379 2
sentinel auth-pass mailborder-master secure_redis_password
sentinel down-after-milliseconds mailborder-master 5000
sentinel parallel-syncs mailborder-master 1
sentinel failover-timeout mailborder-master 10000
EOF

sudo systemctl restart redis-sentinel
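
To verify that Sentinel is monitoring the primary, query any node on the Sentinel port; get-master-addr-by-name returns the current primary's address:

redis-cli -p 26379 SENTINEL get-master-addr-by-name mailborder-master

Expected output while mb-redis1 holds the primary role:

1) "192.168.1.121"
2) "6379"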

Configure Mailborder to use Sentinel:

sudo nano /etc/mailborder/mailborder.conf

[redis]
sentinel_enabled = true
sentinel_hosts = 192.168.1.121:26379,192.168.1.122:26379,192.168.1.123:26379
sentinel_master = mailborder-master
password = secure_redis_password

Shared Storage

NFS for Quarantine Storage

Setup NFS server:

sudo apt install nfs-kernel-server

# Create shared directory
sudo mkdir -p /export/mailborder/quarantine
sudo chown mailborder:mailborder /export/mailborder/quarantine

# Export directory
sudo tee -a /etc/exports << 'EOF'
/export/mailborder/quarantine 192.168.1.0/24(rw,sync,no_subtree_check,no_root_squash)
EOF

sudo exportfs -a
sudo systemctl restart nfs-kernel-server

Mount on Mailborder nodes:

sudo apt install nfs-common

# Add to fstab
echo "192.168.1.130:/export/mailborder/quarantine /var/spool/mailborder/quarantine nfs defaults 0 0" | \
  sudo tee -a /etc/fstab

# Mount
sudo mount -a

Verify:

df -h | grep quarantine
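
A quick cross-node test confirms every application node sees the same share:

# On mb1: write a test file as the mailborder user
sudo -u mailborder touch /var/spool/mailborder/quarantine/.nfs-test

# On mb2 and mb3: the file should appear immediately
ls -l /var/spool/mailborder/quarantine/.nfs-test

# Clean up
sudo rm /var/spool/mailborder/quarantine/.nfs-test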

GlusterFS Alternative

Install GlusterFS on storage nodes:

sudo apt install glusterfs-server
sudo systemctl start glusterd
sudo systemctl enable glusterd

Create trusted pool:

# Run on first node
sudo gluster peer probe 192.168.1.131
sudo gluster peer probe 192.168.1.132
sudo gluster peer status

Create replicated volume:

sudo gluster volume create mailborder-quarantine replica 3 \
  192.168.1.130:/data/gluster/quarantine \
  192.168.1.131:/data/gluster/quarantine \
  192.168.1.132:/data/gluster/quarantine

sudo gluster volume start mailborder-quarantine

Mount on Mailborder nodes:

sudo apt install glusterfs-client

echo "192.168.1.130:/mailborder-quarantine /var/spool/mailborder/quarantine glusterfs defaults,_netdev 0 0" | \
  sudo tee -a /etc/fstab

sudo mount -a
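
Verify the volume and the mount:

sudo gluster volume info mailborder-quarantine
sudo gluster volume status mailborder-quarantine
df -h | grep quarantine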

Configuration Synchronization

Automated Config Sync

Setup rsync-based synchronization:

sudo tee /usr/local/bin/mb-sync-config.sh << 'EOF'
#!/bin/bash

PRIMARY_NODE="192.168.1.101"
SECONDARY_NODES="192.168.1.102 192.168.1.103"

if [ "$(hostname -I | grep -o '192.168.1.101')" ]; then
    # This is the primary node, push to secondaries
    for node in $SECONDARY_NODES; do
        rsync -avz --delete \
          /etc/mailborder/ \
          root@$node:/etc/mailborder/

        ssh root@$node "systemctl reload mb-rpcd"
    done
    echo "Configuration synced to secondary nodes"
else
    echo "Not primary node, skipping sync"
fi
EOF

sudo chmod +x /usr/local/bin/mb-sync-config.sh
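
The script pushes over SSH as root, so it assumes passwordless key-based SSH from the primary to each secondary; a minimal setup sketch:

# On the primary: generate a key if root does not already have one
sudo ssh-keygen -t ed25519 -N "" -f /root/.ssh/id_ed25519

# Install the public key on each secondary
sudo ssh-copy-id root@192.168.1.102
sudo ssh-copy-id root@192.168.1.103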

Automate with cron:

echo "*/5 * * * * /usr/local/bin/mb-sync-config.sh" | sudo crontab -

Git-Based Configuration Management

Initialize git repository on primary:

cd /etc/mailborder
sudo git init
sudo git add .
sudo git commit -m "Initial configuration"

Clone on secondary nodes:

# Move aside any existing configuration, then clone from the primary
sudo mv /etc/mailborder /etc/mailborder.bak
sudo git clone root@192.168.1.101:/etc/mailborder /etc/mailborder

Auto-pull on secondaries:

sudo tee /usr/local/bin/mb-config-pull.sh << 'EOF'
#!/bin/bash
cd /etc/mailborder
git pull origin master
systemctl reload mb-rpcd
EOF

sudo chmod +x /usr/local/bin/mb-config-pull.sh
( sudo crontab -l 2>/dev/null; echo "*/5 * * * * /usr/local/bin/mb-config-pull.sh" ) | sudo crontab -

Session Management

Redis-Based Sessions

Configure shared sessions:

sudo nano /etc/mailborder/mailborder.conf

[sessions]
handler = redis
redis_host = 192.168.1.121  # Redis primary (or a VIP that forwards port 6379)
redis_port = 6379
redis_password = secure_redis_password
redis_db = 1
session_lifetime = 3600

Verify session sharing:

# Login on one node
curl -c cookies.txt -d "email=admin@example.com&password=test" https://mb1.example.com/login

# Use session on another node
curl -b cookies.txt https://mb2.example.com/dashboard

Health Checks

Application Health Endpoint

Create health check endpoint:

sudo tee /srv/mailborder/master/health.php << 'EOF'
<?php
header('Content-Type: application/json');

$health = [
    'status' => 'healthy',
    'checks' => []
];

// Check database
try {
    $pdo = new PDO('mysql:host=localhost;dbname=mailborder', 'mailborder', 'password');
    $health['checks']['database'] = 'ok';
} catch (Exception $e) {
    $health['checks']['database'] = 'failed';
    $health['status'] = 'unhealthy';
}

// Check Redis (authenticate, since requirepass is set in the cluster config)
try {
    $redis = new Redis();
    $redis->connect('localhost', 6379);
    $redis->auth('secure_redis_password');
    $redis->ping();
    $health['checks']['redis'] = 'ok';
} catch (Exception $e) {
    $health['checks']['redis'] = 'failed';
    $health['status'] = 'unhealthy';
}

// Check services
$services = ['mb-rpcd', 'mb-filter', 'mb-milter'];
foreach ($services as $service) {
    $status = trim((string) shell_exec("systemctl is-active $service"));
    $health['checks'][$service] = ($status === 'active') ? 'ok' : 'failed';
    if ($health['checks'][$service] === 'failed') {
        $health['status'] = 'unhealthy';
    }
}

http_response_code($health['status'] === 'healthy' ? 200 : 503);
echo json_encode($health);
?>
EOF

Configure Nginx location:

sudo nano /etc/nginx/sites-available/mailborder

location /health {
    access_log off;
    fastcgi_pass unix:/run/php/php8.2-fpm.sock;
    include fastcgi_params;
    fastcgi_param SCRIPT_FILENAME /srv/mailborder/master/health.php;
}

Test health check:

curl -s https://localhost/health | jq
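
A healthy node returns HTTP 200 with a body along these lines (the field names follow the health.php script above):

{
  "status": "healthy",
  "checks": {
    "database": "ok",
    "redis": "ok",
    "mb-rpcd": "ok",
    "mb-filter": "ok",
    "mb-milter": "ok"
  }
}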

Monitoring and Alerting

Cluster Status Monitoring

Create cluster status script:

sudo tee /usr/local/bin/mb-cluster-status.sh << 'EOF'
#!/bin/bash

echo "=== Mailborder Cluster Status ==="
echo

echo "Load Balancer:"
systemctl is-active haproxy keepalived

echo
echo "Application Nodes:"
for node in 192.168.1.101 192.168.1.102 192.168.1.103; do
    echo -n "$node: "
    # -k: certificates are issued for the service name, not the node IPs
    curl -sk -o /dev/null -w "%{http_code}" "https://$node/health"
    echo
done

echo
echo "Database Cluster:"
mysql -h 192.168.1.100 -e "SHOW STATUS LIKE 'wsrep_cluster_size'"

echo
echo "Redis Cluster:"
# Query a Redis node directly; the example HAProxy config does not forward 6379
redis-cli -h 192.168.1.121 -a secure_redis_password INFO replication | grep role
# SENTINEL commands must target the Sentinel port, not the Redis port
redis-cli -h 192.168.1.121 -p 26379 SENTINEL masters
EOF

sudo chmod +x /usr/local/bin/mb-cluster-status.sh

Run status check:

sudo /usr/local/bin/mb-cluster-status.sh

Failover Procedures

Planned Node Maintenance

Step 1: Drain traffic from node

# Mark node as down in HAProxy
echo "set server smtp_backend/mb2 state maint" | \
  sudo socat stdio /run/haproxy/admin.sock

Step 2: Wait for connections to drain

watch 'netstat -an | grep ESTABLISHED | grep ":25 " | wc -l'

Step 3: Stop services

sudo mb-services stop

Step 4: Perform maintenance

sudo apt update && sudo apt upgrade

Step 5: Start services

sudo mb-services start
sudo mb-status

Step 6: Return node to service

echo "set server smtp_backend/mb2 state ready" | \
  sudo socat stdio /run/haproxy/admin.sock
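
Confirm the node is back in rotation via the admin socket (show servers state accepts a backend name):

echo "show servers state smtp_backend" | sudo socat stdio /run/haproxy/admin.sock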

Unplanned Node Failure

Automatic failover:
- HAProxy detects the failure via health checks
- Traffic is automatically routed to the healthy nodes
- The database cluster continues with the remaining nodes
- Redis Sentinel promotes a new primary if needed

Recovery procedure:

# Investigate failure
sudo journalctl -xe
sudo mb-doctor

# Fix issue
sudo mb-services restart

# Verify health
curl -s https://localhost/health | jq

# Node automatically rejoins cluster when healthy

Performance Tuning

Load Distribution

Optimize HAProxy algorithm:

backend smtp_backend
    balance leastconn  # Use least connections instead of roundrobin
    option tcp-check
    server mb1 192.168.1.101:25 check weight 10
    server mb2 192.168.1.102:25 check weight 10
    server mb3 192.168.1.103:25 check weight 5  # Lower weight for less powerful node

Session affinity:

backend https_backend
    balance roundrobin
    cookie SERVERID insert indirect nocache
    # Ensures user stays on same node for session

Connection Pooling

Optimize database connections:

[database]
pool_size = 20          # Per-node pool size
pool_max = 100          # Maximum connections
pool_recycle = 3600     # Recycle connections hourly
pool_pre_ping = true    # Test connection before use

Testing the Cluster

Load Testing

Install testing tools:

sudo apt install siege apache2-utils

HTTP load test:

ab -n 10000 -c 100 https://mailborder.example.com/

SMTP load test:

for i in {1..1000}; do
  echo "Test message $i" | mail -s "Load Test $i" test@example.com
done
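
If swaks is available (it is packaged for Debian/Ubuntu), it drives complete SMTP transactions against the VIP and gives more realistic results than local mail submission:

sudo apt install swaks

for i in {1..100}; do
  swaks --to test@example.com --from load@example.com \
    --server 192.168.1.100 --body "Load test $i" --suppress-data
done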

Monitor during test:

# On each node
watch 'ps aux | grep mb-'
watch 'netstat -an | grep ESTABLISHED | wc -l'
