Appendix

This is where you will find all the code used in the article "RabbitMQ for VMware vCloud Suite".

====================================================================================
Section: Installation
File System Location: /etc/yum.repos.d/rabbitmq_erlang.repo
====================================================================================

[rabbitmq_erlang]
name=rabbitmq_erlang
baseurl=https://packagecloud.io/rabbitmq/erlang/el/8/$basearch
repo_gpgcheck=1
gpgcheck=1
enabled=1
# PackageCloud's repository key and RabbitMQ package signing key
gpgkey=https://packagecloud.io/rabbitmq/erlang/gpgkey https://dl.bintray.com/rabbitmq/Keys/rabbitmq-release-signing-key.asc
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
metadata_expire=300

[rabbitmq_erlang-source]
name=rabbitmq_erlang-source
baseurl=https://packagecloud.io/rabbitmq/erlang/el/8/SRPMS
repo_gpgcheck=1
gpgcheck=0
enabled=1
# PackageCloud's repository key and RabbitMQ package signing key
gpgkey=https://packagecloud.io/rabbitmq/erlang/gpgkey https://dl.bintray.com/rabbitmq/Keys/rabbitmq-release-signing-key.asc
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
metadata_expire=300

====================================================================================
Section: Installation
File System Location: /etc/rabbitmq/rabbitmq.conf
Step: 18
====================================================================================

# The RabbitMQ server source repository contains an example rabbitmq.conf file named rabbitmq.conf.example.
# It contains examples of most of the configuration items you might want to set (with some very obscure
# ones omitted), along with documentation for those settings.
#
# https://github.com/rabbitmq/rabbitmq-server/blob/v3.7.x/docs/rabbitmq.conf.example

##
## Clustering
## =====================
##

# This guide covers one specific aspect of clustering: network failures between nodes, their effects and recovery options.
# https://www.rabbitmq.com/partitions.html
# Options and behaviors described here --> https://www.rabbitmq.com/partitions.html#automatic-handling
cluster_partition_handling = pause_minority

## Make clustering happen *automatically* at startup. Only applied
## to nodes that have just been reset or started for the first time.
##
## Relevant doc guide: https://rabbitmq.com/cluster-formation.html
##

# Peer discovery mechanism
cluster_formation.peer_discovery_backend = classic_config

# Hardcode the list of nodes
cluster_formation.classic_config.nodes.1 = rabbit@rabbit1.domain.local
cluster_formation.classic_config.nodes.2 = rabbit@rabbit2.domain.local
cluster_formation.classic_config.nodes.3 = rabbit@rabbit3.domain.local

## This node's type can be configured. If you are not sure
## what node type to use, always use 'disc'.
cluster_formation.node_type = disc

## Mirror sync batch size, in messages. Increasing this will speed
## up syncing but total batch size in bytes must not exceed 2 GiB.
## Available in RabbitMQ 3.6.0 or later.
##
mirroring_sync_batch_size = 100

vm_memory_high_watermark.relative = 0.8
disk_free_limit.absolute = 5GB
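
Optional sanity check (run from any node; not part of the file above). Once all three nodes have started with this configuration, the classic_config peer list should have produced a single cluster:

rabbitmqctl cluster_status
# Expect all three rabbit@rabbitN.domain.local nodes to be listed as disc nodes and as running nodes.
# If a node is missing, check DNS resolution of the .domain.local names and the shared Erlang cookie.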

====================================================================================
Section: Installation
File System Location: /etc/rabbitmq/rabbitmq.conf
Step: 55
====================================================================================

# The RabbitMQ server source repository contains an example rabbitmq.conf file named rabbitmq.conf.example.
# It contains examples of most of the configuration items you might want to set (with some very obscure
# ones omitted), along with documentation for those settings.
#
# https://github.com/rabbitmq/rabbitmq-server/blob/v3.7.x/docs/rabbitmq.conf.example

##
## Clustering
## =====================
##

# This guide covers one specific aspect of clustering: network failures between nodes, their effects and recovery options.
# https://www.rabbitmq.com/partitions.html
# Options and behaviors described here --> https://www.rabbitmq.com/partitions.html#automatic-handling
cluster_partition_handling = pause_minority

## Make clustering happen *automatically* at startup. Only applied
## to nodes that have just been reset or started for the first time.
##
## Relevant doc guide: https://rabbitmq.com/cluster-formation.html
##

# Peer discovery mechanism
cluster_formation.peer_discovery_backend = classic_config

# Hardcode the list of nodes
cluster_formation.classic_config.nodes.1 = rabbit@rabbit1.domain.local
cluster_formation.classic_config.nodes.2 = rabbit@rabbit2.domain.local
cluster_formation.classic_config.nodes.3 = rabbit@rabbit3.domain.local

## This node's type can be configured. If you are not sure
## what node type to use, always use 'disc'.
cluster_formation.node_type = disc

## Mirror sync batch size, in messages. Increasing this will speed
## up syncing but total batch size in bytes must not exceed 2 GiB.
## Available in RabbitMQ 3.6.0 or later.
##
mirroring_sync_batch_size = 100

vm_memory_high_watermark.relative = 0.8
disk_free_limit.absolute = 5GB

##
## Encrypted Communications
## ========================
##

listeners.tcp = none
listeners.ssl.default = 5671

num_acceptors.ssl = 10

## TLS configuration.
##
## Related doc guide: https://rabbitmq.com/ssl.html
##
ssl_options.verify = verify_peer
ssl_options.fail_if_no_peer_cert = true
ssl_options.cacertfile = /etc/pki/tls/certs/domain.local.CA.cer.pem
ssl_options.certfile = /etc/pki/tls/certs/rabbitmqcluster.pem
ssl_options.keyfile = /etc/pki/tls/certs/rabbitmq.key
ssl_options.depth = 5
ssl_options.client_renegotiation = false
ssl_options.secure_renegotiate = true
ssl_options.honor_cipher_order = true
ssl_options.honor_ecc_order = true
ssl_options.versions.1 = tlsv1.3
ssl_options.versions.2 = tlsv1.2
ssl_options.ciphers.1 = ECDHE-ECDSA-AES256-GCM-SHA384
ssl_options.ciphers.2 = ECDHE-RSA-AES256-GCM-SHA384
ssl_options.ciphers.3 = ECDHE-ECDSA-AES256-SHA384
ssl_options.ciphers.4 = ECDHE-RSA-AES256-SHA384
ssl_options.ciphers.5 = ECDH-ECDSA-AES256-GCM-SHA384
ssl_options.ciphers.6 = ECDH-RSA-AES256-GCM-SHA384
ssl_options.ciphers.7 = ECDH-ECDSA-AES256-SHA384
ssl_options.ciphers.8 = ECDH-RSA-AES256-SHA384
ssl_options.ciphers.9 = DHE-RSA-AES256-GCM-SHA384
ssl_options.ciphers.10 = DHE-DSS-AES256-GCM-SHA384
ssl_options.ciphers.11 = DHE-RSA-AES256-SHA256
ssl_options.ciphers.12 = DHE-DSS-AES256-SHA256
ssl_options.ciphers.13 = ECDHE-ECDSA-AES128-GCM-SHA256
ssl_options.ciphers.14 = ECDHE-RSA-AES128-GCM-SHA256
ssl_options.ciphers.15 = ECDHE-ECDSA-AES128-SHA256
ssl_options.ciphers.16 = ECDHE-RSA-AES128-SHA256
ssl_options.ciphers.17 = ECDH-ECDSA-AES128-GCM-SHA256
ssl_options.ciphers.18 = ECDH-RSA-AES128-GCM-SHA256
ssl_options.ciphers.19 = ECDH-ECDSA-AES128-SHA256
ssl_options.ciphers.20 = ECDH-RSA-AES128-SHA256
ssl_options.ciphers.21 = DHE-RSA-AES128-GCM-SHA256
ssl_options.ciphers.22 = DHE-DSS-AES128-GCM-SHA256
ssl_options.ciphers.23 = DHE-RSA-AES128-SHA256
ssl_options.ciphers.24 = DHE-DSS-AES128-SHA256
ssl_options.ciphers.25 = ECDHE-ECDSA-AES256-SHA
ssl_options.ciphers.26 = ECDHE-RSA-AES256-SHA
ssl_options.ciphers.27 = DHE-RSA-AES256-SHA
ssl_options.ciphers.28 = DHE-DSS-AES256-SHA
ssl_options.ciphers.29 = ECDH-ECDSA-AES256-SHA
ssl_options.ciphers.30 = ECDH-RSA-AES256-SHA
ssl_options.ciphers.31 = ECDHE-ECDSA-AES128-SHA
ssl_options.ciphers.32 = ECDHE-RSA-AES128-SHA
ssl_options.ciphers.33 = DHE-RSA-AES128-SHA
ssl_options.ciphers.34 = DHE-DSS-AES128-SHA
ssl_options.ciphers.35 = ECDH-ECDSA-AES128-SHA
ssl_options.ciphers.36 = ECDH-RSA-AES128-SHA

##
## Management Web UI Encryption
##
management.ssl.port = 15671
management.ssl.cacertfile = /etc/pki/tls/certs/domain.local.CA.cer.pem
management.ssl.certfile = /etc/pki/tls/certs/rabbitmqcluster.pem
management.ssl.keyfile = /etc/pki/tls/certs/rabbitmq.key
management.ssl.honor_cipher_order = true
management.ssl.honor_ecc_order = true
management.ssl.client_renegotiation = false
management.ssl.secure_renegotiate = true
management.ssl.versions.1 = tlsv1.3
management.ssl.versions.2 = tlsv1.2
management.ssl.ciphers.1 = ECDHE-ECDSA-AES256-GCM-SHA384
management.ssl.ciphers.2 = ECDHE-RSA-AES256-GCM-SHA384
management.ssl.ciphers.3 = ECDHE-ECDSA-AES256-SHA384
management.ssl.ciphers.4 = ECDHE-RSA-AES256-SHA384
management.ssl.ciphers.5 = ECDH-ECDSA-AES256-GCM-SHA384
management.ssl.ciphers.6 = ECDH-RSA-AES256-GCM-SHA384
management.ssl.ciphers.7 = ECDH-ECDSA-AES256-SHA384
management.ssl.ciphers.8 = ECDH-RSA-AES256-SHA384
management.ssl.ciphers.9 = DHE-RSA-AES256-GCM-SHA384
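
Optional sanity check for the TLS listener above (not part of the file). Because ssl_options.verify = verify_peer and fail_if_no_peer_cert = true, a test client has to present a certificate too; reusing the server's certificate and key is usually enough to prove the handshake works:

openssl s_client -connect rabbit1.domain.local:5671 \
    -CAfile /etc/pki/tls/certs/domain.local.CA.cer.pem \
    -cert /etc/pki/tls/certs/rabbitmqcluster.pem \
    -key /etc/pki/tls/certs/rabbitmq.key
# A successful run ends with "Verify return code: 0 (ok)" and an open session.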

====================================================================================
Section: Installation
File System Location: /etc/rabbitmq/rabbitmq-env.conf
Step: 19
====================================================================================

# https://blog.sleeplessbeastie.eu/2020/02/03/how-to-specify-rabbitmq-node-name/
NODENAME=rabbit
USE_LONGNAME=true

====================================================================================
Section: Installation
File System Location: /etc/rabbitmq/rabbitmq-env.conf
Step: 56
====================================================================================

# NOTE: the following path is system dependent and will change between Erlang versions
ERL_SSL_PATH=/usr/lib64/erlang/lib/ssl-10.0/ebin

# Flag                   Description
# ======                 =============
# -pa $ERL_SSL_PATH      prepends the directory ERL_SSL_PATH points at to the code path
# -proto_dist inet_tls   tells the runtime to encrypt inter-node communication
# -ssl_dist_optfile      tells the runtime where to find its inter-node TLS configuration file
SERVER_ADDITIONAL_ERL_ARGS="-pa $ERL_SSL_PATH -proto_dist inet_tls -ssl_dist_optfile /etc/rabbitmq/inter_node_tls.config"
RABBITMQ_CTL_ERL_ARGS="-pa $ERL_SSL_PATH -proto_dist inet_tls -ssl_dist_optfile /etc/rabbitmq/inter_node_tls.config"

# https://blog.sleeplessbeastie.eu/2020/02/03/how-to-specify-rabbitmq-node-name/
NODENAME=rabbit
USE_LONGNAME=true

====================================================================================
Section: Installation
File System Location: /etc/rabbitmq/inter_node_tls.config
====================================================================================

[
  {server, [
    {cacertfile, "/etc/pki/tls/certs/domain.local.CA.cer.pem"},
    {certfile, "/etc/pki/tls/certs/rabbitmqcluster.pem"},
    {keyfile, "/etc/pki/tls/certs/rabbitmq.key"},
    {depth, 5},
    {secure_renegotiate, true},
    {verify, verify_peer},
    {fail_if_no_peer_cert, false}
  ]},
  {client, [
    {secure_renegotiate, true}
  ]}
].
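
Optional sanity check (not part of the file). With -proto_dist inet_tls in place (see rabbitmq-env.conf, Step 56 above), the Erlang distribution listener on TCP 25672 should now speak TLS, so a plain TLS client should at least be able to complete a handshake against it, and the CLI tools should still work through the encrypted distribution:

openssl s_client -connect rabbit1.domain.local:25672 -CAfile /etc/pki/tls/certs/domain.local.CA.cer.pem
rabbitmqctl status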

====================================================================================
Section: Installation
File System Location: /etc/keepalived/keepalived.conf
Node: 1
====================================================================================

# Define scripts to check RabbitMQ health
vrrp_script health_check_port_5671 {
    # How this particular health check works:
    #
    # Run a script to verify port TCP 5671 on localhost is open. If yes, give exit code 0 (success).
    # If not open, give exit code 1 (failure). If exit code 0 is seen by this health check, then Keepalived
    # will add priority weight (10 points) to this node. If an exit code other than zero is returned, then
    # Keepalived will not add priority weight.
    #
    #
    # How health checks (in general) work:
    #
    # When all keepalived nodes start, they are assigned a base priority. This priority is hardcoded in this
    # configuration file to 1.
    #
    # The keepalived nodes compare each other's priorities and the one with the highest priority becomes
    # the MASTER. Because this configuration file is the same across all our keepalived nodes, each node
    # will have the same priority value (1).
    #
    # When more than one node is tied for 1st place, they will hold an election to decide who will be the MASTER.
    #
    # But before the election, each node will run a series of health check scripts. If the script has an exit code of zero,
    # that node has passed the health check and its priority is raised by the health check weight (10).
    #
    # If the script has a non-zero exit code, that node has failed the health check. Its priority is reduced
    # by the same amount as the weight of this health check (10).
    #
    # Example: Default_Priority + Health_Check_Weight (Pass) = Final_Priority
    #          1 + 10 = 11
    #
    # Later, when it fails this health check, the points get taken away:
    # Example: Current_Priority - Health_Check_Weight (Fail) = Final_Priority
    #          11 - 10 = 1
    #
    # Health check weights can only be removed after they have been added. If a node has a current priority of 1,
    # and fails a health check worth 10, the node will retain the priority of 1. The node will never have a
    # negative priority (1 - 10 = -9). I've experimented and negative values don't work.
    #
    #
    # So what happens when a keepalived node is booted, or the service is restarted, when there is no network?
    # Answer: That node will not hear the priority broadcasts of the other keepalived nodes and will therefore
    # think it has the highest priority. Having the highest priority, this node will assume the role of MASTER
    # and begin to advertise (broadcast) the keepalived virtual IP. Of course, these broadcasts go
    # nowhere because there is no network connectivity, but the cluster virtual IP might show up in scripts
    # or on a hypervisor web portal for that VM because the IP address data is extracted through a back-channel.
    #
    # Rephrased answer: If the keepalived process cannot see any other VRRP speaker for a certain virtual_router_id
    # (in our case 99), then the keepalived process on this server will believe itself to be the keepalived
    # cluster member with the highest priority, and thus the rightful MASTER.
    #
    # But what about the failing health check? I thought failing a health check was supposed to stop a node from becoming MASTER?
    # Nope. All it does is reduce the node's priority to make it more difficult to become MASTER.

    # This health check script must be located in a directory that is allowed by SELinux.
    # To know more, visit https://bits.enigmabridge.com/articles/2016-11/keepalived-high-available-setup.html#scripts-not-working---fault-state
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-5671.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_port_5672 {
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-5672.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_port_15672 {
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-15672.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_port_25672 {
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-25672.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_cluster {
    # NOTE: This check can take a long time.
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-cluster.sh"
    weight 10
    interval 30   # Check every 30 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_queue_master {
    # NOTE: This check can take a long time.
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-queue-master.sh"
    weight 10
    interval 60   # Check every 60 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

# Create a VRRP instance
vrrp_instance RabbitMQ_Virtual_IP {

    # The initial state of each keepalived node. This option isn't
    # really all that valuable, since an election will occur
    # and the host with the highest priority will become
    # the master.
    state BACKUP

    # The interface keepalived will manage
    interface ens192

    # This particular VLAN/Distributed Switch does not allow multicast traffic.
    # Must tell keepalived to use unicast instead of multicast.
    unicast_src_ip A.A.A.11    # Unicast specific option, this is the IP of the interface keepalived listens on
    unicast_peer {             # Unicast specific option, these are the IPs of the peer instances
        #A.A.A.11              # Disable yourself
        A.A.A.12
        A.A.A.13
    }

    # The virtual router id number to assign the routers to
    virtual_router_id 99

    # The priority to assign to this device. This controls
    # who will become the MASTER and BACKUP for a given
    # VRRP instance.
    priority 1

    # How many seconds to wait until a gratuitous arp is sent
    #garp_master_delay 2

    # Allow a lower priority machine to retain the master role
    # when a higher priority machine comes online.
    #nopreempt

    # How often to send out VRRP advertisements
    advert_int 1

    # Execute a notification script when a host transitions to
    # MASTER or BACKUP, or when a fault occurs. The arguments
    # passed to the script are:
    #   $1 = "GROUP"|"INSTANCE"
    #   $2 = name of group or instance
    #   $3 = target state of transition
    # Sample: VRRP-notification.sh VRRP_ROUTER1 BACKUP 100
    #notify "/usr/local/bin/VRRP-notification.sh"

    # Send an SMTP alert during a state transition
    #smtp_alert

    # Authenticate the remote endpoints via a simple
    # username/password combination
    authentication {
        auth_type AH
        auth_pass 12345678
    }

    # The virtual IP addresses to float between nodes. The
    # label statement can be used to bring an interface
    # online to represent the virtual IP.
    virtual_ipaddress {
        A.A.A.10/24
    }

    # Health Check Script(s)
    track_script {
        health_check_port_5671
    }
    track_script {
        health_check_port_5672
    }
    track_script {
        health_check_port_15672
    }
    track_script {
        health_check_port_25672
    }
    track_script {
        health_check_cluster
    }
    track_script {
        health_check_queue_master
    }
}

====================================================================================
Section: Installation
File System Location: /etc/keepalived/keepalived.conf
Node: 2
====================================================================================

# Define scripts to check RabbitMQ health
vrrp_script health_check_port_5671 {
    # How this particular health check works:
    #
    # Run a script to verify port TCP 5671 on localhost is open. If yes, give exit code 0 (success).
    # If not open, give exit code 1 (failure). If exit code 0 is seen by this health check, then Keepalived
    # will add priority weight (10 points) to this node. If an exit code other than zero is returned, then
    # Keepalived will not add priority weight.
    #
    #
    # How health checks (in general) work:
    #
    # When all keepalived nodes start, they are assigned a base priority. This priority is hardcoded in this
    # configuration file to 1.
    #
    # The keepalived nodes compare each other's priorities and the one with the highest priority becomes
    # the MASTER. Because this configuration file is the same across all our keepalived nodes, each node
    # will have the same priority value (1).
    #
    # When more than one node is tied for 1st place, they will hold an election to decide who will be the MASTER.
    #
    # But before the election, each node will run a series of health check scripts. If the script has an exit code of zero,
    # that node has passed the health check and its priority is raised by the health check weight (10).
    #
    # If the script has a non-zero exit code, that node has failed the health check. Its priority is reduced
    # by the same amount as the weight of this health check (10).
    #
    # Example: Default_Priority + Health_Check_Weight (Pass) = Final_Priority
    #          1 + 10 = 11
    #
    # Later, when it fails this health check, the points get taken away:
    # Example: Current_Priority - Health_Check_Weight (Fail) = Final_Priority
    #          11 - 10 = 1
    #
    # Health check weights can only be removed after they have been added. If a node has a current priority of 1,
    # and fails a health check worth 10, the node will retain the priority of 1. The node will never have a
    # negative priority (1 - 10 = -9). I've experimented and negative values don't work.
    #
    #
    # So what happens when a keepalived node is booted, or the service is restarted, when there is no network?
    # Answer: That node will not hear the priority broadcasts of the other keepalived nodes and will therefore
    # think it has the highest priority. Having the highest priority, this node will assume the role of MASTER
    # and begin to advertise (broadcast) the keepalived virtual IP. Of course, these broadcasts go
    # nowhere because there is no network connectivity, but the cluster virtual IP might show up in scripts
    # or on a hypervisor web portal for that VM because the IP address data is extracted through a back-channel.
    #
    # Rephrased answer: If the keepalived process cannot see any other VRRP speaker for a certain virtual_router_id
    # (in our case 99), then the keepalived process on this server will believe itself to be the keepalived
    # cluster member with the highest priority, and thus the rightful MASTER.
    #
    # But what about the failing health check? I thought failing a health check was supposed to stop a node from becoming MASTER?
    # Nope. All it does is reduce the node's priority to make it more difficult to become MASTER.

    # This health check script must be located in a directory that is allowed by SELinux.
    # To know more, visit https://bits.enigmabridge.com/articles/2016-11/keepalived-high-available-setup.html#scripts-not-working---fault-state
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-5671.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_port_5672 {
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-5672.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_port_15672 {
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-15672.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_port_25672 {
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-25672.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_cluster {
    # NOTE: This check can take a long time.
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-cluster.sh"
    weight 10
    interval 30   # Check every 30 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_queue_master {
    # NOTE: This check can take a long time.
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-queue-master.sh"
    weight 10
    interval 60   # Check every 60 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

# Create a VRRP instance
vrrp_instance RabbitMQ_Virtual_IP {

    # The initial state of each keepalived node. This option isn't
    # really all that valuable, since an election will occur
    # and the host with the highest priority will become
    # the master.
    state BACKUP

    # The interface keepalived will manage
    interface ens192

    # This particular VLAN/Distributed Switch does not allow multicast traffic.
    # Must tell keepalived to use unicast instead of multicast.
    unicast_src_ip A.A.A.12    # Unicast specific option, this is the IP of the interface keepalived listens on
    unicast_peer {             # Unicast specific option, these are the IPs of the peer instances
        A.A.A.11
        #A.A.A.12              # Disable yourself
        A.A.A.13
    }

    # The virtual router id number to assign the routers to
    virtual_router_id 99

    # The priority to assign to this device. This controls
    # who will become the MASTER and BACKUP for a given
    # VRRP instance.
    priority 1

    # How many seconds to wait until a gratuitous arp is sent
    #garp_master_delay 2

    # Allow a lower priority machine to retain the master role
    # when a higher priority machine comes online.
    #nopreempt

    # How often to send out VRRP advertisements
    advert_int 1

    # Execute a notification script when a host transitions to
    # MASTER or BACKUP, or when a fault occurs. The arguments
    # passed to the script are:
    #   $1 = "GROUP"|"INSTANCE"
    #   $2 = name of group or instance
    #   $3 = target state of transition
    # Sample: VRRP-notification.sh VRRP_ROUTER1 BACKUP 100
    #notify "/usr/local/bin/VRRP-notification.sh"

    # Send an SMTP alert during a state transition
    #smtp_alert

    # Authenticate the remote endpoints via a simple
    # username/password combination
    authentication {
        auth_type AH
        auth_pass 12345678
    }

    # The virtual IP addresses to float between nodes. The
    # label statement can be used to bring an interface
    # online to represent the virtual IP.
    virtual_ipaddress {
        A.A.A.10/24
    }

    # Health Check Script(s)
    track_script {
        health_check_port_5671
    }
    track_script {
        health_check_port_5672
    }
    track_script {
        health_check_port_15672
    }
    track_script {
        health_check_port_25672
    }
    track_script {
        health_check_cluster
    }
    track_script {
        health_check_queue_master
    }
}

====================================================================================
Section: Installation
File System Location: /etc/keepalived/keepalived.conf
Node: 3
====================================================================================

# Define scripts to check RabbitMQ health
vrrp_script health_check_port_5671 {
    # How this particular health check works:
    #
    # Run a script to verify port TCP 5671 on localhost is open. If yes, give exit code 0 (success).
    # If not open, give exit code 1 (failure). If exit code 0 is seen by this health check, then Keepalived
    # will add priority weight (10 points) to this node. If an exit code other than zero is returned, then
    # Keepalived will not add priority weight.
    #
    #
    # How health checks (in general) work:
    #
    # When all keepalived nodes start, they are assigned a base priority. This priority is hardcoded in this
    # configuration file to 1.
    #
    # The keepalived nodes compare each other's priorities and the one with the highest priority becomes
    # the MASTER. Because this configuration file is the same across all our keepalived nodes, each node
    # will have the same priority value (1).
    #
    # When more than one node is tied for 1st place, they will hold an election to decide who will be the MASTER.
    #
    # But before the election, each node will run a series of health check scripts. If the script has an exit code of zero,
    # that node has passed the health check and its priority is raised by the health check weight (10).
    #
    # If the script has a non-zero exit code, that node has failed the health check. Its priority is reduced
    # by the same amount as the weight of this health check (10).
    #
    # Example: Default_Priority + Health_Check_Weight (Pass) = Final_Priority
    #          1 + 10 = 11
    #
    # Later, when it fails this health check, the points get taken away:
    # Example: Current_Priority - Health_Check_Weight (Fail) = Final_Priority
    #          11 - 10 = 1
    #
    # Health check weights can only be removed after they have been added. If a node has a current priority of 1,
    # and fails a health check worth 10, the node will retain the priority of 1. The node will never have a
    # negative priority (1 - 10 = -9). I've experimented and negative values don't work.
    #
    #
    # So what happens when a keepalived node is booted, or the service is restarted, when there is no network?
    # Answer: That node will not hear the priority broadcasts of the other keepalived nodes and will therefore
    # think it has the highest priority. Having the highest priority, this node will assume the role of MASTER
    # and begin to advertise (broadcast) the keepalived virtual IP. Of course, these broadcasts go
    # nowhere because there is no network connectivity, but the cluster virtual IP might show up in scripts
    # or on a hypervisor web portal for that VM because the IP address data is extracted through a back-channel.
    #
    # Rephrased answer: If the keepalived process cannot see any other VRRP speaker for a certain virtual_router_id
    # (in our case 99), then the keepalived process on this server will believe itself to be the keepalived
    # cluster member with the highest priority, and thus the rightful MASTER.
    #
    # But what about the failing health check? I thought failing a health check was supposed to stop a node from becoming MASTER?
    # Nope. All it does is reduce the node's priority to make it more difficult to become MASTER.

    # This health check script must be located in a directory that is allowed by SELinux.
    # To know more, visit https://bits.enigmabridge.com/articles/2016-11/keepalived-high-available-setup.html#scripts-not-working---fault-state
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-5671.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_port_5672 {
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-5672.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_port_15672 {
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-15672.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_port_25672 {
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-25672.sh"
    weight 10
    interval 2    # Check every 2 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_cluster {
    # NOTE: This check can take a long time.
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-cluster.sh"
    weight 10
    interval 30   # Check every 30 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

vrrp_script health_check_queue_master {
    # NOTE: This check can take a long time.
    script "/usr/libexec/keepalived/keepalived-rabbitmq-health-check-queue-master.sh"
    weight 10
    interval 60   # Check every 60 seconds
    fall 1        # Require 1 failure for KO
    rise 1        # Require 1 success for OK
}

# Create a VRRP instance
vrrp_instance RabbitMQ_Virtual_IP {

    # The initial state of each keepalived node. This option isn't
    # really all that valuable, since an election will occur
    # and the host with the highest priority will become
    # the master.
    state BACKUP

    # The interface keepalived will manage
    interface ens192

    # This particular VLAN/Distributed Switch does not allow multicast traffic.
    # Must tell keepalived to use unicast instead of multicast.
    unicast_src_ip A.A.A.13    # Unicast specific option, this is the IP of the interface keepalived listens on
    unicast_peer {             # Unicast specific option, these are the IPs of the peer instances
        A.A.A.11
        A.A.A.12
        #A.A.A.13              # Disable yourself
    }

    # The virtual router id number to assign the routers to
    virtual_router_id 99

    # The priority to assign to this device. This controls
    # who will become the MASTER and BACKUP for a given
    # VRRP instance.
    priority 1

    # How many seconds to wait until a gratuitous arp is sent
    #garp_master_delay 2

    # Allow a lower priority machine to retain the master role
    # when a higher priority machine comes online.
    #nopreempt

    # How often to send out VRRP advertisements
    advert_int 1

    # Execute a notification script when a host transitions to
    # MASTER or BACKUP, or when a fault occurs. The arguments
    # passed to the script are:
    #   $1 = "GROUP"|"INSTANCE"
    #   $2 = name of group or instance
    #   $3 = target state of transition
    # Sample: VRRP-notification.sh VRRP_ROUTER1 BACKUP 100
    #notify "/usr/local/bin/VRRP-notification.sh"

    # Send an SMTP alert during a state transition
    #smtp_alert

    # Authenticate the remote endpoints via a simple
    # username/password combination
    authentication {
        auth_type AH
        auth_pass 12345678
    }

    # The virtual IP addresses to float between nodes. The
    # label statement can be used to bring an interface
    # online to represent the virtual IP.
    virtual_ipaddress {
        A.A.A.10/24
    }

    # Health Check Script(s)
    track_script {
        health_check_port_5671
    }
    track_script {
        health_check_port_5672
    }
    track_script {
        health_check_port_15672
    }
    track_script {
        health_check_port_25672
    }
    track_script {
        health_check_cluster
    }
    track_script {
        health_check_queue_master
    }
}

====================================================================================
Section: Installation
File System Location: /usr/libexec/keepalived/keepalived-rabbitmq-health-check-cluster.sh
====================================================================================

#!/bin/bash

# RabbitMQ health check
rabbitmqctl --timeout 10 cluster_status &>/dev/null

# Exit codes are stored in "$?". Exit code values:
#   Zero     = RabbitMQ is healthy. Worthy of earning points to host the VIP.
#   Non-zero = RabbitMQ is not healthy. Not a good candidate for hosting the VIP. Zero points earned.
#
# Effect of Timeout:
#   When the node and RMQ are up, but the network is down, and "--timeout" is not specified, eventually the command will return "69" or "64".
#   When the node and RMQ are up, but the network is down, and "--timeout" is specified, the command will return "75", which is "timeout reached".

# Return the exit code of the RabbitMQ health check
if [ "$?" -gt 0 ]
then
    exit 1
else
    exit 0
fi

====================================================================================
Section: Installation
File System Location: /usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-5671.sh
====================================================================================

#!/bin/bash

# RabbitMQ health check
# Exit codes are stored in "$?". Exit code values:
#   Zero     = RabbitMQ is healthy. Worthy of earning points to host the VIP.
#   Non-zero = RabbitMQ is not healthy. Not a good candidate for hosting the VIP. Zero points earned.

# Test port TCP 5671, which is used for AMQP client connections over TLS (AMQPS).
# The exit codes are always 1 or 0.
lsof -i -P -n | grep LISTEN | grep :5671 &>/dev/null
exit $?

====================================================================================
Section: Installation
File System Location: /usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-5672.sh
====================================================================================

#!/bin/bash

# RabbitMQ health check
# Exit codes are stored in "$?". Exit code values:
#   Zero     = RabbitMQ is healthy. Worthy of earning points to host the VIP.
#   Non-zero = RabbitMQ is not healthy. Not a good candidate for hosting the VIP. Zero points earned.

# Test port TCP 5672, which is used for AMQP client connections without TLS.
# The exit codes are always 1 or 0.
lsof -i -P -n | grep LISTEN | grep :5672 &>/dev/null
exit $?

====================================================================================
Section: Installation
File System Location: /usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-15672.sh
====================================================================================

#!/bin/bash

# RabbitMQ health check
# Exit codes are stored in "$?". Exit code values:
#   Zero     = RabbitMQ is healthy. Worthy of earning points to host the VIP.
#   Non-zero = RabbitMQ is not healthy. Not a good candidate for hosting the VIP. Zero points earned.

# Test port TCP 15672, which is used for the management web UI and HTTP API.
# The exit codes are always 1 or 0.
lsof -i -P -n | grep LISTEN | grep :15672 &>/dev/null
exit $?

====================================================================================
Section: Installation
File System Location: /usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-25672.sh
====================================================================================

#!/bin/bash

# RabbitMQ health check
# Exit codes are stored in "$?". Exit code values:
#   Zero     = RabbitMQ is healthy. Worthy of earning points to host the VIP.
#   Non-zero = RabbitMQ is not healthy. Not a good candidate for hosting the VIP. Zero points earned.

# Test port TCP 25672, which is used for inter-node and CLI tools communications.
# The exit codes are always 1 or 0.
lsof -i -P -n | grep LISTEN | grep :25672 &>/dev/null
exit $?
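
The port checks above can be exercised by hand before keepalived starts running them (commands below are a sketch, not part of the scripts). Each check should print nothing and exit 0 while RabbitMQ is listening:

bash /usr/libexec/keepalived/keepalived-rabbitmq-health-check-port-5671.sh; echo $?
ss -tln | grep -E '5671|5672|15672|25672'
# Keepalived executes the scripts by path, so remember to make them executable, e.g.:
# chmod 755 /usr/libexec/keepalived/keepalived-rabbitmq-health-check-*.sh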

====================================================================================
Section: Installation
File System Location: /usr/libexec/keepalived/keepalived-rabbitmq-health-check-queue-master.sh
====================================================================================

#!/bin/bash

# List all queues that are online
# Ignore any lines that contain "{"
OnlineQueueCount=$(rabbitmqctl list_queues --timeout 1 --quiet --online --no-table-headers | grep -v { | wc -l)

# Bias this health check towards failure
OnlineQueueCount=$((OnlineQueueCount + 1))

# Calculate the half-way point. The local RabbitMQ server must be over this threshold to earn these keepalived health check points.
HalfOnlineQueueCount=$(echo "scale=2 ; $OnlineQueueCount / 2" | bc)

# List queues that are online and local to this server
LocalQueueCount=$(rabbitmqctl list_queues --timeout 1 --quiet --online --local --no-table-headers | grep -v { | wc -l)

echo "$HalfOnlineQueueCount vs $LocalQueueCount"

#if (( $LocalQueueCount > $HalfOnlineQueueCount ))
if (( $(echo "$HalfOnlineQueueCount < $LocalQueueCount" | bc -l) ))
then
    exit 0
else
    exit 1
fi

# Exit codes are stored in "$?". Exit code values:
#   Zero     = RabbitMQ is healthy. Worthy of earning points to host the VIP.
#   Non-zero = RabbitMQ is not healthy. Not a good candidate for hosting the VIP. Zero points earned.

====================================================================================
Section: Installation
File System Location: /etc/firewalld/services/vrrp.xml
====================================================================================

<?xml version="1.0" encoding="utf-8"?>
<service>
  <short>VRRP</short>
  <description>Virtual Router Redundancy Protocol</description>
  <protocol value="vrrp"/>
</service>
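
The service definition above only describes VRRP to firewalld; it still has to be enabled on each node (commands below are a sketch, not part of the file):

firewall-cmd --permanent --add-service=vrrp
firewall-cmd --reload
firewall-cmd --list-services   # 'vrrp' should now appear in the list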

====================================================================================
Section: Maintenance
Subsection: Keepalived
File System Location: Crontab Contents
====================================================================================

* * * * * /root/RestartKeepalivedIf100PercentCPU.sh >/dev/null 2>/root/crontab.errorlog

====================================================================================
Section: Maintenance
Subsection: Keepalived
Title: Restart Keepalived Script
File System Location: /root/RestartKeepalivedIf100PercentCPU.sh
====================================================================================

#!/bin/bash -P

export TERM=xterm
export COLUMNS=512

DoPingCheck=true
DoCPUCheck=true

echo $(date -u) "Starting" | tee /root/RestartKeepalivedIf100PercentCPU_RecentRun.log

if [ "$DoPingCheck" = true ]
then
    IpResults=$(/sbin/ip addr show | grep "A\.A\.A\.10")
    #echo $IpResults | tee -a /root/RestartKeepalivedIf100PercentCPU_RecentRun.log

    if [ ! -z "$IpResults" ]
    then
        echo $(date -u) "This machine hosts the VIP." | tee -a /root/RestartKeepalivedIf100PercentCPU_RecentRun.log

        PingResult=$(ping -c 5 A.A.A.10 &> /dev/null; echo $?)
        # 0 = Host Responded
        # 2 = Host Unreachable
        if [ $PingResult -ne 0 ]
        then
            # The VIP did not respond to pings.
            # Restart KeepaliveD.
            echo $(date -u) "The VIP did NOT respond to pings. Restarting." | tee -a /root/RestartKeepalivedIf100PercentCPU_RecentRun.log /root/RestartKeepalivedIf100PercentCPU_Historical.log
            /bin/systemctl restart keepalived

            # Skip CPU check
            DoCPUCheck=false
        else
            echo $(date -u) "The VIP did respond to pings." | tee -a /root/RestartKeepalivedIf100PercentCPU_RecentRun.log
        fi
    else
        echo $(date -u) "This machine does NOT host the VIP." | tee -a /root/RestartKeepalivedIf100PercentCPU_RecentRun.log
    fi
fi

if [ "$DoCPUCheck" = true ]
then
    # Monitor the CPU utilization of the KeepaliveD service for roughly 10 seconds
    # Set KeepalivedCpuUsage to the average utilization over that window
    KeepalivedCpuUsage=$(top -n 5 -i -b | grep "keepalived" | awk '{s+=$9}END{print s/NR}')

    # Put into the log file what top is returning
    # top -n 5 -b -i | grep "keepalived" >> /root/RestartKeepalivedIf100PercentCPU_RecentRun.log

    echo $(date -u) "KeepaliveD CPU Utilization:" $KeepalivedCpuUsage "%" | tee -a /root/RestartKeepalivedIf100PercentCPU_RecentRun.log

    # If the average CPU utilization over that window is greater than 90%, restart keepalived
    if (( $(echo "$KeepalivedCpuUsage > 90" | bc -l) ))
    then
        echo $(date -u) "KeepaliveD at/near 100% CPU. Restarting." | tee -a /root/RestartKeepalivedIf100PercentCPU_RecentRun.log /root/RestartKeepalivedIf100PercentCPU_Historical.log
        /bin/systemctl restart keepalived
    else
        echo $(date -u) "KeepaliveD CPU utilization is OK." | tee -a /root/RestartKeepalivedIf100PercentCPU_RecentRun.log
    fi
fi

echo $(date -u) "Finishing" | tee -a /root/RestartKeepalivedIf100PercentCPU_RecentRun.log
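
A quick way to exercise the watchdog script above without waiting for cron (a sketch; file names as used above):

chmod 755 /root/RestartKeepalivedIf100PercentCPU.sh
/root/RestartKeepalivedIf100PercentCPU.sh
cat /root/RestartKeepalivedIf100PercentCPU_RecentRun.log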