Fix for HSQLDB deadlock during CHECKPOINT.

Symptoms are:

* db/blockchain.log is pretty much exactly 50MB - the checkpoint-triggering size.

* Loads of threads are stuck waiting for HSQLDB's CountUpDownLatch$Sync.await()

* Synchronizer, or some other thread, is possibly orphaning blocks.

The cause seems to be some method A that holds a repository session and
calls EventBus.INSTANCE.notify(), after which one of the event listeners
obtains its own repository session to do repository 'work'.

In the meantime, the HSQLDB log has reached 50MB, triggering auto-checkpoint.

HSQLDB attempts to CHECKPOINT, but waits for existing transactions
to complete, and also blocks starting new transactions.

Thus, one of the event listeners is blocked when it tries to obtain
a new repository session, but HSQLDB never performs CHECKPOINT
because the event notifier (method A) still has an unfinished
transaction - hence deadlock.
This commit is contained in:
catbref 2020-09-28 14:22:18 +01:00
parent 81a5b154c2
commit d85a3d17c8
5 changed files with 26 additions and 1 deletions

View File

@ -567,6 +567,7 @@ public class BlockChain {
--height; --height;
orphanBlockData = repository.getBlockRepository().fromHeight(height); orphanBlockData = repository.getBlockRepository().fromHeight(height);
repository.discardChanges(); // clear transaction status to prevent deadlocks
Controller.getInstance().onNewBlock(orphanBlockData); Controller.getInstance().onNewBlock(orphanBlockData);
} }

View File

@ -306,8 +306,10 @@ public class BlockMinter extends Thread {
} }
if (newBlockMinted) { if (newBlockMinted) {
BlockData newBlockData = newBlock.getBlockData();
// Notify Controller and broadcast our new chain to network // Notify Controller and broadcast our new chain to network
BlockData newBlockData = newBlock.getBlockData();
repository.discardChanges(); // clear transaction status to prevent deadlocks
Controller.getInstance().onNewBlock(newBlockData); Controller.getInstance().onNewBlock(newBlockData);
Network network = Network.getInstance(); Network network = Network.getInstance();

View File

@ -835,6 +835,12 @@ public class Controller extends Thread {
} }
} }
/**
* Callback for when we've received a new block.
* <p>
* See <b>WARNING</b> for {@link EventBus#notify(Event)}
* to prevent deadlocks.
*/
public void onNewBlock(BlockData latestBlockData) { public void onNewBlock(BlockData latestBlockData) {
// Protective copy // Protective copy
BlockData blockDataCopy = new BlockData(latestBlockData); BlockData blockDataCopy = new BlockData(latestBlockData);

View File

@ -410,6 +410,7 @@ public class Synchronizer {
--ourHeight; --ourHeight;
orphanBlockData = repository.getBlockRepository().fromHeight(ourHeight); orphanBlockData = repository.getBlockRepository().fromHeight(ourHeight);
repository.discardChanges(); // clear transaction status to prevent deadlocks
Controller.getInstance().onNewBlock(orphanBlockData); Controller.getInstance().onNewBlock(orphanBlockData);
} }

View File

@ -20,6 +20,21 @@ public enum EventBus {
} }
} }
/**
* <b>WARNING:</b> before calling this method,
* make sure repository holds no locks, e.g. by calling
* <tt>repository.discardChanges()</tt>.
* <p>
* This is because event listeners might open a new
* repository session which will deadlock HSQLDB
* if it tries to CHECKPOINT.
* <p>
* The HSQLDB deadlock occurs because the caller's
* repository session blocks the CHECKPOINT until
* their transaction is closed, yet event listeners'
* new sessions are blocked until CHECKPOINT is
* completed, hence deadlock.
*/
public void notify(Event event) { public void notify(Event event) {
List<Listener> clonedListeners; List<Listener> clonedListeners;