proxysql_galera_checker.sh 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. #!/bin/bash
  2. ## inspired by Percona clustercheck.sh
  3. # https://github.com/sysown/proxysql/blob/v2.x/tools/proxysql_galera_checker.sh
  4. # CHANGE THOSE
  5. PROXYSQL_USERNAME="admin"
  6. PROXYSQL_PASSWORD="admin"
  7. PROXYSQL_HOSTNAME="localhost"
  8. PROXYSQL_PORT="6032"
  9. #
  10. function usage()
  11. {
  12. cat << EOF
  13. Usage: $0 <hostgroup_id write> [hostgroup_id read] [number writers] [writers are readers 0|1] [log_file]
  14. - HOSTGROUP WRITERS (required) (0..) The hostgroup_id that contains nodes that will server 'writes'
  15. - HOSTGROUP READERS (optional) (0..) The hostgroup_id that contains nodes that will server 'reads'
  16. - NUMBER WRITERS (optional) (0..) Maximum number of write hostgroup_id node that can be marked ONLINE
  17. When 0 (default), all nodes can be marked ONLINE
  18. - WRITERS ARE READERS (optional) (0|1) When 1 (default), ONLINE nodes in write hostgroup_id will prefer not
  19. to be ONLINE in read hostgroup_id
  20. - LOG_FILE (optional) file logfile where node state checks & changes are written to (verbose)
  21. Notes about the mysql_servers in ProxySQL:
  22. - WEIGHT Hosts with a higher weight will be prefered to be put ONLINE
  23. - NODE STATUS * Nodes that are in status OFFLINE_HARD will not be checked nor will their status be changed
  24. * SHUNNED nodes are not to be used with Galera based systems, they will be checked and status
  25. will be changed to either ONLINE or OFFLINE_SOFT.
  26. When no nodes were found to be in wsrep_local_state=4 (SYNCED) for either
  27. read or write nodes, then the script will try 5 times for each node to try
  28. to find nodes wsrep_local_state=4 (SYNCED) or wsrep_local_state=2 (DONOR/DESYNC)
  29. This is to avoid $0 to mark all nodes as OFFLINE_SOFT
  30. EOF
  31. }
  32. # DEFAULTS
  33. HOSTGROUP_WRITER_ID="${1}"
  34. HOSTGROUP_READER_ID="${2:--1}"
  35. NUMBER_WRITERS="${3:-0}"
  36. WRITER_IS_READER="${4:-1}"
  37. ERR_FILE="${5:-/dev/null}"
  38. RELOAD_CHECK_FILE="/var/lib/proxysql/reload"
  39. echo "0" > ${RELOAD_CHECK_FILE}
  40. if [ "$1" = '-h' -o "$1" = '--help' -o -z "$1" ]
  41. then
  42. usage
  43. exit 0
  44. fi
  45. test $HOSTGROUP_WRITER_ID -ge 0 &> /dev/null
  46. if [ $? -ne 0 ]; then
  47. echo "ERROR: writer hostgroup_id is not an integer"
  48. usage
  49. exit 1
  50. fi
  51. test $HOSTGROUP_READER_ID -ge -1 &> /dev/null
  52. if [ $? -ne 0 ]; then
  53. echo "ERROR: reader hostgroup_id is not an integer"
  54. usage
  55. exit 1
  56. fi
  57. if [ $# -lt 1 -o $# -gt 5 ]; then
  58. echo "ERROR: Invalid number of arguments"
  59. usage
  60. exit 1
  61. fi
  62. if [ $NUMBER_WRITERS -lt 0 ]; then
  63. echo "ERROR: The number of writers should either be 0 to enable all possible nodes ONLINE"
  64. echo " or be larger than 0 to limit the number of writers"
  65. usage
  66. exit 1
  67. fi
  68. if [ $WRITER_IS_READER -ne 0 -a $WRITER_IS_READER -ne 1 ]; then
  69. echo "ERROR: Writers are readers requires a boolean argument (0|1)"
  70. usage
  71. exit 1
  72. fi
  73. # print information prior to a run if ${ERR_FILE} is defined
  74. echo "`date` ###### proxysql_galera_checker.sh SUMMARY ######" >> ${ERR_FILE}
  75. echo "`date` Hostgroup writers $HOSTGROUP_WRITER_ID" >> ${ERR_FILE}
  76. echo "`date` Hostgroup readers $HOSTGROUP_READER_ID" >> ${ERR_FILE}
  77. echo "`date` Number of writers $NUMBER_WRITERS" >> ${ERR_FILE}
  78. echo "`date` Writers are readers $WRITER_IS_READER" >> ${ERR_FILE}
  79. echo "`date` log file $ERR_FILE" >> ${ERR_FILE}
  80. #Timeout exists for instances where mysqld may be hung
  81. TIMEOUT=10
  82. PROXYSQL_CMDLINE="env MYSQL_PWD=$PROXYSQL_PASSWORD mysql -u$PROXYSQL_USERNAME -h $PROXYSQL_HOSTNAME -P $PROXYSQL_PORT --protocol=tcp -Nse"
  83. MYSQL_CREDENTIALS=$($PROXYSQL_CMDLINE "SELECT variable_value FROM global_variables WHERE variable_name IN ('mysql-monitor_username','mysql-monitor_password') ORDER BY variable_name DESC")
  84. MYSQL_USERNAME=$(echo $MYSQL_CREDENTIALS | awk '{print $1}')
  85. MYSQL_PASSWORD=$(echo $MYSQL_CREDENTIALS | awk '{print $2}')
  86. MYSQL_CMDLINE="env MYSQL_PWD=$MYSQL_PASSWORD timeout $TIMEOUT mysql -nNE -u$MYSQL_USERNAME"
  87. function change_server_status() {
  88. echo "`date` Changing server $1:$2:$3 to status $4. Reason: $5" >> ${ERR_FILE}
  89. $PROXYSQL_CMDLINE "UPDATE mysql_servers set status = '$4' WHERE hostgroup_id = $1 AND hostname = '$2' AND port = $3;" 2>> ${ERR_FILE}
  90. }
  91. echo "`date` ###### HANDLE WRITER NODES ######" >> ${ERR_FILE}
  92. NUMBER_WRITERS_ONLINE=0
  93. $PROXYSQL_CMDLINE "SELECT hostgroup_id, hostname, port, status FROM mysql_servers WHERE hostgroup_id IN ($HOSTGROUP_WRITER_ID) AND status <> 'OFFLINE_HARD' ORDER BY hostgroup_id, weight DESC, hostname, port" | while read hostgroup server port stat
  94. do
  95. WSREP_STATUS=$($MYSQL_CMDLINE -h $server -P $port -e "SHOW STATUS LIKE 'wsrep_local_state'" 2>>${ERR_FILE}| tail -1 2>>${ERR_FILE})
  96. echo "`date` --> Checking WRITE server $hostgroup:$server:$port, current status $stat, wsrep_local_state $WSREP_STATUS" >> ${ERR_FILE}
  97. # we have to limit amount of writers, WSREP status OK, AND node is not marked ONLINE
  98. if [ $NUMBER_WRITERS -gt 0 -a "${WSREP_STATUS}" = "4" -a "$stat" == "ONLINE" ] ; then
  99. if [ $NUMBER_WRITERS_ONLINE -lt $NUMBER_WRITERS ]; then
  100. NUMBER_WRITERS_ONLINE=$(( $NUMBER_WRITERS_ONLINE + 1 ))
  101. echo "`date` server $hostgroup:$server:$port is already ONLINE: ${NUMBER_WRITERS_ONLINE} of ${NUMBER_WRITERS} write nodes" >> ${ERR_FILE}
  102. else
  103. NUMBER_WRITERS_ONLINE=$(( $NUMBER_WRITERS_ONLINE + 1 ))
  104. change_server_status $HOSTGROUP_WRITER_ID "$server" $port "OFFLINE_SOFT" "max write nodes reached (${NUMBER_WRITERS})"
  105. echo "1" > ${RELOAD_CHECK_FILE}
  106. fi
  107. fi
  108. # WSREP status OK, but node is not marked ONLINE
  109. if [ "${WSREP_STATUS}" = "4" -a "$stat" != "ONLINE" ] ; then
  110. # we have to limit amount of writers
  111. if [ $NUMBER_WRITERS -gt 0 ] ; then
  112. if [ $NUMBER_WRITERS_ONLINE -lt $NUMBER_WRITERS ]; then
  113. NUMBER_WRITERS_ONLINE=$(( $NUMBER_WRITERS_ONLINE + 1 ))
  114. change_server_status $HOSTGROUP_WRITER_ID "$server" $port "ONLINE" "{NUMBER_WRITERS_ONLINE} of ${NUMBER_WRITERS} write nodes"
  115. echo "1" > ${RELOAD_CHECK_FILE}
  116. else
  117. NUMBER_WRITERS_ONLINE=$(( $NUMBER_WRITERS_ONLINE + 1 ))
  118. if [ "$stat" != "OFFLINE_SOFT" ]; then
  119. change_server_status $HOSTGROUP_WRITER_ID "$server" $port "OFFLINE_SOFT" "max write nodes reached (${NUMBER_WRITERS})"
  120. echo "1" > ${RELOAD_CHECK_FILE}
  121. else
  122. echo "`date` server $hostgroup:$server:$port is already OFFLINE_SOFT, max write nodes reached (${NUMBER_WRITERS})" >> ${ERR_FILE}
  123. fi
  124. fi
  125. # we do not have to limit
  126. elif [ $NUMBER_WRITERS -eq 0 ] ; then
  127. change_server_status $HOSTGROUP_WRITER_ID "$server" $port "ONLINE" "Changed state, marking write node ONLINE"
  128. echo "1" > ${RELOAD_CHECK_FILE}
  129. fi
  130. fi
  131. # WSREP status is not ok, but the node is marked online, we should put it offline
  132. if [ "${WSREP_STATUS}" != "4" -a "$stat" = "ONLINE" ]; then
  133. change_server_status $HOSTGROUP_WRITER_ID "$server" $port "OFFLINE_SOFT" "WSREP status is ${WSREP_STATUS} which is not ok"
  134. echo "1" > ${RELOAD_CHECK_FILE}
  135. elif [ "${WSREP_STATUS}" != "4" -a "$stat" = "OFFLINE_SOFT" ]; then
  136. echo "`date` server $hostgroup:$server:$port is already OFFLINE_SOFT, WSREP status is ${WSREP_STATUS} which is not ok" >> ${ERR_FILE}
  137. fi
  138. done
  139. # NUMBER_WRITERS_ONLINE is lost after loop
  140. NUMBER_WRITERS_ONLINE=$($PROXYSQL_CMDLINE "SELECT count(*) FROM mysql_servers WHERE hostgroup_id IN ($HOSTGROUP_WRITER_ID) AND status = 'ONLINE'" 2>>${ERR_FILE}| tail -1 2>>${ERR_FILE})
  141. NUMBER_READERS_ONLINE=0
  142. if [ ${HOSTGROUP_READER_ID} -ne -1 ]; then
  143. echo "`date` ###### HANDLE READER NODES ######" >> ${ERR_FILE}
  144. if [ $WRITER_IS_READER -eq 1 ]; then
  145. READER_PROXYSQL_QUERY="SELECT hostgroup_id, hostname, port, status, 'NULL' FROM mysql_servers WHERE hostgroup_id IN ($HOSTGROUP_READER_ID) AND status <> 'OFFLINE_HARD' ORDER BY weight DESC, hostname, port"
  146. elif [ $WRITER_IS_READER -eq 0 ]; then
  147. # We will not try to change reader state of nodes that are writer ONLINE, so what we do is we ORDER BY writer.status ASC because by accident ONLINE is last in the line
  148. READER_PROXYSQL_QUERY="SELECT reader.hostgroup_id,
  149. reader.hostname,
  150. reader.port,
  151. reader.status,
  152. writer.status
  153. FROM mysql_servers as reader
  154. LEFT JOIN mysql_servers as writer
  155. ON writer.hostgroup_id = $HOSTGROUP_WRITER_ID
  156. AND writer.hostname = reader.hostname
  157. AND writer.port = reader.port
  158. WHERE reader.hostgroup_id = $HOSTGROUP_READER_ID
  159. AND reader.status <> 'OFFLINE_HARD'
  160. ORDER BY writer.status ASC,
  161. reader.weight DESC,
  162. reader.hostname,
  163. reader.port"
  164. fi
  165. OFFLINE_READERS_FOUND=0
  166. $PROXYSQL_CMDLINE "$READER_PROXYSQL_QUERY" | while read hostgroup server port stat writer_stat
  167. do
  168. WSREP_STATUS=$($MYSQL_CMDLINE -h $server -P $port -e "SHOW STATUS LIKE 'wsrep_local_state'" 2>>${ERR_FILE}| tail -1 2>>${ERR_FILE})
  169. echo "`date` --> Checking READ server $hostgroup:$server:$port, current status $stat, wsrep_local_state $WSREP_STATUS" >> ${ERR_FILE}
  170. if [ $WRITER_IS_READER -eq 0 -a "$writer_stat" == "ONLINE" ] ; then
  171. if [ $OFFLINE_READERS_FOUND -eq 0 ] ; then
  172. if [ "${WSREP_STATUS}" = "4" -a "$stat" == "ONLINE" ] ; then
  173. echo "`date` server $hostgroup:$server:$port is already ONLINE, is also write node in ONLINE state, not enough non-ONLINE readers found" >> ${ERR_FILE}
  174. fi
  175. if [ "${WSREP_STATUS}" = "4" -a "$stat" != "ONLINE" ] ; then
  176. change_server_status $HOSTGROUP_READER_ID "$server" $port "ONLINE" "marking ONLINE write node as read ONLINE state, not enough non-ONLINE readers found"
  177. echo "1" > ${RELOAD_CHECK_FILE}
  178. fi
  179. else
  180. if [ "${WSREP_STATUS}" = "4" -a "$stat" == "ONLINE" ] ; then
  181. change_server_status $HOSTGROUP_READER_ID "$server" $port "OFFLINE_SOFT" "making ONLINE writer node as read OFFLINE_SOFT as well because writers should not be readers"
  182. echo "1" > ${RELOAD_CHECK_FILE}
  183. fi
  184. if [ "${WSREP_STATUS}" = "4" -a "$stat" != "ONLINE" ] ; then
  185. echo "`date` server $hostgroup:$server:$port is $stat, keeping node in $stat is a writer ONLINE and it's preferred not to have writers as readers" >> ${ERR_FILE}
  186. fi
  187. fi
  188. else
  189. if [ "${WSREP_STATUS}" = "4" -a "$stat" == "ONLINE" ] ; then
  190. echo "`date` server $hostgroup:$server:$port is already ONLINE" >> ${ERR_FILE}
  191. OFFLINE_READERS_FOUND=$(( $OFFLINE_READERS_FOUND + 1 ))
  192. fi
  193. # WSREP status OK, but node is not marked ONLINE
  194. if [ "${WSREP_STATUS}" = "4" -a "$stat" != "ONLINE" ] ; then
  195. change_server_status $HOSTGROUP_READER_ID "$server" $port "ONLINE" "changed state, making read node ONLINE"
  196. echo "1" > ${RELOAD_CHECK_FILE}
  197. OFFLINE_READERS_FOUND=$(( $OFFLINE_READERS_FOUND + 1 ))
  198. fi
  199. fi
  200. # WSREP status is not ok, but the node is marked online, we should put it offline
  201. if [ "${WSREP_STATUS}" != "4" -a "$stat" = "ONLINE" ]; then
  202. change_server_status $HOSTGROUP_READER_ID "$server" $port "OFFLINE_SOFT" "WSREP status is ${WSREP_STATUS} which is not ok"
  203. echo "1" > ${RELOAD_CHECK_FILE}
  204. elif [ "${WSREP_STATUS}" != "4" -a "$stat" = "OFFLINE_SOFT" ]; then
  205. echo "`date` server $hostgroup:$server:$port is already OFFLINE_SOFT, WSREP status is ${WSREP_STATUS} which is not ok" >> ${ERR_FILE}
  206. fi
  207. done
  208. NUMBER_READERS_ONLINE=$($PROXYSQL_CMDLINE "SELECT count(*) FROM mysql_servers WHERE hostgroup_id IN ($HOSTGROUP_READER_ID) AND status = 'ONLINE'" 2>>${ERR_FILE}| tail -1 2>>${ERR_FILE})
  209. fi
  210. echo "`date` ###### SUMMARY ######" >> ${ERR_FILE}
  211. echo "`date` --> Number of writers that are 'ONLINE': ${NUMBER_WRITERS_ONLINE} : hostgroup: ${HOSTGROUP_WRITER_ID}" >> ${ERR_FILE}
  212. [ ${HOSTGROUP_READER_ID} -ne -1 ] && echo "`date` --> Number of readers that are 'ONLINE': ${NUMBER_READERS_ONLINE} : hostgroup: ${HOSTGROUP_READER_ID}" >> ${ERR_FILE}
  213. cnt=0
  214. # We don't have any writers... alert, try to bring some online!
  215. # This includes bringing a DONOR online
  216. if [ ${NUMBER_WRITERS_ONLINE} -eq 0 ]; then
  217. echo "`date` ###### TRYING TO FIX MISSING WRITERS ######"
  218. echo "`date` No writers found, Trying to enable last available node of the cluster (in Donor/Desync state)" >> ${ERR_FILE}
  219. $PROXYSQL_CMDLINE "SELECT hostgroup_id, hostname, port, status FROM mysql_servers WHERE hostgroup_id IN ($HOSTGROUP_WRITER_ID) AND status <> 'OFFLINE_HARD'" | while read hostgroup server port stat
  220. do
  221. safety_cnt=0
  222. while [ ${cnt} -le $NUMBER_WRITERS -a ${safety_cnt} -lt 5 ]
  223. do
  224. WSREP_STATUS=$($MYSQL_CMDLINE -h $server -P $port -e "SHOW STATUS LIKE 'wsrep_local_state'" 2>>${ERR_FILE} | tail -1 2>>${ERR_FILE})
  225. echo "`date` Check server $hostgroup:$server:$port for only available node in DONOR state, status $stat , wsrep_local_state $WSREP_STATUS" >> ${ERR_FILE}
  226. if [ "${WSREP_STATUS}" = "2" -a "$stat" != "ONLINE" ]
  227. then
  228. change_server_status $HOSTGROUP_WRITER_ID "$server" $port "ONLINE" "WSREP status is DESYNC/DONOR, as this is the only node we will put this one online"
  229. echo "1" > ${RELOAD_CHECK_FILE}
  230. cnt=$(( $cnt + 1 ))
  231. fi
  232. safety_cnt=$(( $safety_cnt + 1 ))
  233. done
  234. done
  235. fi
  236. cnt=0
  237. # We don't have any readers... alert, try to bring some online!
  238. if [ ${HOSTGROUP_READER_ID} -ne -1 -a ${NUMBER_READERS_ONLINE} -eq 0 ]; then
  239. echo "`date` ###### TRYING TO FIX MISSING READERS ######"
  240. echo "`date` --> No readers found, Trying to enable last available node of the cluster (in Donor/Desync state) or pick the master" >> ${ERR_FILE}
  241. $PROXYSQL_CMDLINE "SELECT hostgroup_id, hostname, port, status FROM mysql_servers WHERE hostgroup_id IN ($HOSTGROUP_READER_ID) AND status <> 'OFFLINE_HARD'" | while read hostgroup server port stat
  242. do
  243. safety_cnt=0
  244. while [ ${cnt} -eq 0 -a ${safety_cnt} -lt 5 ]
  245. do
  246. WSREP_STATUS=$($MYSQL_CMDLINE -h $server -P $port -e "SHOW STATUS LIKE 'wsrep_local_state'" 2>>${ERR_FILE} | tail -1 2>>${ERR_FILE})
  247. echo "`date` Check server $hostgroup:$server:$port for only available node in DONOR state, status $stat , wsrep_local_state $WSREP_STATUS" >> ${ERR_FILE}
  248. if [ "${WSREP_STATUS}" = "2" -a "$stat" != "ONLINE" ]
  249. then
  250. change_server_status $HOSTGROUP_READER_ID "$server" $port "ONLINE" "WSREP status is DESYNC/DONOR, as this is the only node we will put this one online"
  251. echo "1" > ${RELOAD_CHECK_FILE}
  252. cnt=$(( $cnt + 1 ))
  253. fi
  254. safety_cnt=$(( $safety_cnt + 1 ))
  255. done
  256. done
  257. fi
  258. if [ $(cat ${RELOAD_CHECK_FILE}) -ne 0 ] ; then
  259. echo "`date` ###### Loading mysql_servers config into runtime ######" >> ${ERR_FILE}
  260. $PROXYSQL_CMDLINE "LOAD MYSQL SERVERS TO RUNTIME;" 2>> ${ERR_FILE}
  261. else
  262. echo "`date` ###### Not loading mysql_servers, no change needed ######" >> ${ERR_FILE}
  263. fi
  264. exit 0