Skip to content

Commit

Permalink
MDEV-31177: SHOW SLAVE STATUS Last_SQL_Errno Race Condition on Errore…
Browse files Browse the repository at this point in the history
…d Slave Restart

The SQL thread and a user connection executing SHOW SLAVE STATUS
have a race condition on Last_SQL_Errno, such that a slave which
previously errored and stopped, on its next start, SHOW SLAVE STATUS
can show that the SQL Thread is running while the previous error is
also showing.

The fix is to move when the last error is cleared when the SQL
thread starts to occur before setting the status of
Slave_SQL_Running.

Thanks to Kristian Nielson for his work diagnosing the problem!

Reviewed By:
============
Andrei Elkin <[email protected]>
Kristian Nielson <[email protected]>
  • Loading branch information
bnestere committed Sep 13, 2023
1 parent 1831f8e commit 1407f99
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 11 deletions.
48 changes: 48 additions & 0 deletions mysql-test/suite/rpl/r/rpl_sql_thd_start_errno_cleared.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
include/master-slave.inc
[connection master]
connection master;
create table t1 (a int primary key, b int) engine=innodb;
insert t1 values (1,1);
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
set @save_slave_trans_retries= @@global.slave_transaction_retries;
set @@global.innodb_lock_wait_timeout= 1;
set @@global.slave_transaction_retries= 0;
connection master;
update t1 set b=b+10 where a=1;
include/save_master_gtid.inc
connection slave1;
BEGIN;
SELECT * FROM t1 WHERE a=1 FOR UPDATE;
a b
1 1
connection slave;
include/start_slave.inc
include/wait_for_slave_sql_error.inc [errno=1205]
connection slave1;
ROLLBACK;
connection slave;
set @save_dbug = @@global.debug_dbug;
set @@global.debug_dbug= "+d,delay_sql_thread_after_release_run_lock";
include/start_slave.inc
set debug_sync= "now wait_for sql_thread_run_lock_released";
# Validating that the SQL thread is running..
# ..success
# Validating that Last_SQL_Errno is cleared..
# ..success
set debug_sync= "now signal sql_thread_continue";
set @@global.debug_dbug= @saved_dbug;
set debug_sync= "RESET";
# Cleanup
connection master;
drop table t1;
connection slave;
include/stop_slave.inc
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
set @@global.slave_transaction_retries= @save_slave_trans_retries;
include/start_slave.inc
include/rpl_end.inc
# End of rpl_sql_thd_start_errno_cleared.test
93 changes: 93 additions & 0 deletions mysql-test/suite/rpl/t/rpl_sql_thd_start_errno_cleared.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#
# Ensure that when the slave restarts, the last error code displayed by
# SHOW SLAVE STATUS is cleared before Slave_SQL_Running is set.
#
# To ensure that, this test uses the debug_sync mechanism to pause an errored
# and restarting slave's SQL thread after it has set its running state to YES,
# and then ensures that Last_SQL_Errno is 0. The slave error is a forced innodb
# row lock timeout.
#
#
# References
# MDEV-31177: SHOW SLAVE STATUS Last_SQL_Errno Race Condition on Errored
# Slave Restart
#
source include/have_binlog_format_row.inc;
source include/have_innodb.inc;
source include/have_debug.inc;
source include/have_debug_sync.inc;
source include/master-slave.inc;

--connection master
create table t1 (a int primary key, b int) engine=innodb;
insert t1 values (1,1);
--source include/save_master_gtid.inc

--connection slave
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
set @save_slave_trans_retries= @@global.slave_transaction_retries;
set @@global.innodb_lock_wait_timeout= 1;
set @@global.slave_transaction_retries= 0;

--connection master
update t1 set b=b+10 where a=1;
--source include/save_master_gtid.inc

--connection slave1
BEGIN;
--eval SELECT * FROM t1 WHERE a=1 FOR UPDATE

--connection slave
--source include/start_slave.inc

--let $slave_sql_errno= 1205
--source include/wait_for_slave_sql_error.inc

--connection slave1
ROLLBACK;

--connection slave
set @save_dbug = @@global.debug_dbug;
set @@global.debug_dbug= "+d,delay_sql_thread_after_release_run_lock";
--source include/start_slave.inc
set debug_sync= "now wait_for sql_thread_run_lock_released";

--let $sql_running = query_get_value("SHOW SLAVE STATUS", Slave_SQL_Running, 1)
--echo # Validating that the SQL thread is running..
if (`SELECT strcmp("$sql_running", "YES") != 0`)
{
--echo # ..failed
--echo # Slave_SQL_Running: $sql_running
--die Slave SQL thread is not running
}
--echo # ..success

--let $last_error = query_get_value("SHOW SLAVE STATUS", Last_SQL_Errno, 1)
--echo # Validating that Last_SQL_Errno is cleared..
if ($last_error)
{
--echo # ..failed
--echo # Last_SQL_Errno: $last_error
--die SHOW SLAVE STATUS shows the error from the last session on startup
}
--echo # ..success

set debug_sync= "now signal sql_thread_continue";

set @@global.debug_dbug= @saved_dbug;
set debug_sync= "RESET";

--echo # Cleanup
--connection master
drop table t1;

--connection slave
--source include/stop_slave.inc
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
set @@global.slave_transaction_retries= @save_slave_trans_retries;
--source include/start_slave.inc

--source include/rpl_end.inc
--echo # End of rpl_sql_thd_start_errno_cleared.test
34 changes: 23 additions & 11 deletions sql/slave.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5236,6 +5236,19 @@ pthread_handler_t handle_slave_sql(void *arg)

DBUG_ASSERT(rli->inited);
DBUG_ASSERT(rli->mi == mi);

/*
Reset errors for a clean start (otherwise, if the master is idle, the SQL
thread may execute no Query_log_event, so the error will remain even
though there's no problem anymore). Do not reset the master timestamp
(imagine the slave has caught everything, the STOP SLAVE and START SLAVE:
as we are not sure that we are going to receive a query, we want to
remember the last master timestamp (to say how many seconds behind we are
now.
But the master timestamp is reset by RESET SLAVE & CHANGE MASTER.
*/
rli->clear_error();

mysql_mutex_lock(&rli->run_lock);
DBUG_ASSERT(!rli->slave_running);
errmsg= 0;
Expand Down Expand Up @@ -5310,17 +5323,16 @@ pthread_handler_t handle_slave_sql(void *arg)
mysql_mutex_unlock(&rli->run_lock);
mysql_cond_broadcast(&rli->start_cond);

/*
Reset errors for a clean start (otherwise, if the master is idle, the SQL
thread may execute no Query_log_event, so the error will remain even
though there's no problem anymore). Do not reset the master timestamp
(imagine the slave has caught everything, the STOP SLAVE and START SLAVE:
as we are not sure that we are going to receive a query, we want to
remember the last master timestamp (to say how many seconds behind we are
now.
But the master timestamp is reset by RESET SLAVE & CHANGE MASTER.
*/
rli->clear_error();
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("delay_sql_thread_after_release_run_lock", {
const char act[]= "now "
"signal sql_thread_run_lock_released "
"wait_for sql_thread_continue";
DBUG_ASSERT(debug_sync_service);
DBUG_ASSERT(!debug_sync_set_action(current_thd, STRING_WITH_LEN(act)));
};);
#endif

rli->parallel.reset();

//tell the I/O thread to take relay_log_space_limit into account from now on
Expand Down

0 comments on commit 1407f99

Please sign in to comment.