Rework HSQLDB CHECKPOINTing to defer until there are no ongoing SQL transactions, in order to prevent DB deadlocks.

Symptoms of a CHECKPOINT-related DB deadlock:

On Controller thread:
"Controller" #20 prio=5 os_prio=31 cpu=1577665.56ms elapsed=17666.97s allocated=475G defined_classes=412 tid=0x00007fe99f97b000 nid=0x1644b waiting on condition  [0x0000700009a21000]
   java.lang.Thread.State: WAITING (parking)
	at jdk.internal.misc.Unsafe.park(java.base@14.0.2/Native Method)
	- parking to wait for  <0x0000000602f2a6f8> (a org.hsqldb.lib.CountUpDownLatch$Sync)
[...some more lines...]
[this next line is the best indicator: ]
	at org.qortal.repository.hsqldb.HSQLDBRepository.checkpoint(HSQLDBRepository.java:385)
	at org.qortal.repository.RepositoryManager.checkpoint(RepositoryManager.java:51)
	at org.qortal.controller.Controller.run(Controller.java:544)

Other threads stuck at:
	- parking to wait for  <0x00000007ff09f0b0> (a org.hsqldb.lib.CountUpDownLatch$Sync)
	at java.util.concurrent.locks.LockSupport.park(java.base@14.0.2/LockSupport.java:211)
	at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(java.base@14.0.2/AbstractQueuedSynchronizer.java:714)
	at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireSharedInterruptibly(java.base@14.0.2/AbstractQueuedSynchronizer.java:1046)
	at org.hsqldb.lib.CountUpDownLatch.await(Unknown Source)
	at org.hsqldb.Session.executeCompiledStatement(Unknown Source)
This commit is contained in:
catbref 2021-02-13 17:32:09 +00:00
parent eb23940996
commit 3453f0efaf
5 changed files with 76 additions and 42 deletions

View File

@ -628,25 +628,9 @@ public class AdminResource {
public String checkpointRepository() {
Security.checkApiCallAllowed(request);
try (final Repository repository = RepositoryManager.getRepository()) {
ReentrantLock blockchainLock = Controller.getInstance().getBlockchainLock();
blockchainLock.lockInterruptibly();
try {
repository.checkpoint(true);
repository.saveChanges();
RepositoryManager.setRequestedCheckpoint(Boolean.TRUE);
return "true";
} finally {
blockchainLock.unlock();
}
} catch (InterruptedException e) {
// We couldn't lock blockchain to perform checkpoint
return "false";
} catch (DataException e) {
throw ApiExceptionFactory.INSTANCE.createException(request, ApiError.REPOSITORY_ISSUE, e);
}
}
@POST

View File

@ -536,12 +536,7 @@ public class Controller extends Thread {
if (now >= repositoryCheckpointTimestamp + repositoryCheckpointInterval) {
repositoryCheckpointTimestamp = now + repositoryCheckpointInterval;
if (Settings.getInstance().getShowCheckpointNotification())
SysTray.getInstance().showMessage(Translator.INSTANCE.translate("SysTray", "DB_CHECKPOINT"),
Translator.INSTANCE.translate("SysTray", "PERFORMING_DB_CHECKPOINT"),
MessageType.INFO);
RepositoryManager.checkpoint(true);
RepositoryManager.setRequestedCheckpoint(Boolean.TRUE);
}
// Give repository a chance to backup (if enabled)

View File

@ -47,8 +47,6 @@ public interface Repository extends AutoCloseable {
public void backup(boolean quick) throws DataException;
public void checkpoint(boolean quick) throws DataException;
public void performPeriodicMaintenance() throws DataException;
public void exportNodeLocalData() throws DataException;

View File

@ -4,6 +4,9 @@ public abstract class RepositoryManager {
private static RepositoryFactory repositoryFactory = null;
/** null if no checkpoint requested, TRUE for quick checkpoint, false for slow/full checkpoint. */
private static Boolean quickCheckpointRequested = null;
public static RepositoryFactory getRepositoryFactory() {
return repositoryFactory;
}
@ -46,12 +49,12 @@ public abstract class RepositoryManager {
}
}
public static void checkpoint(boolean quick) {
try (final Repository repository = getRepository()) {
repository.checkpoint(quick);
} catch (DataException e) {
// Checkpoint is best-effort so don't complain
public static void setRequestedCheckpoint(Boolean quick) {
quickCheckpointRequested = quick;
}
public static Boolean getRequestedCheckpoint() {
return quickCheckpointRequested;
}
public static void rebuild() throws DataException {

View File

@ -1,5 +1,6 @@
package org.qortal.repository.hsqldb;
import java.awt.TrayIcon.MessageType;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
@ -31,6 +32,8 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.qortal.account.PrivateKeyAccount;
import org.qortal.crypto.Crypto;
import org.qortal.globalization.Translator;
import org.qortal.gui.SysTray;
import org.qortal.repository.ATRepository;
import org.qortal.repository.AccountRepository;
import org.qortal.repository.ArbitraryRepository;
@ -54,6 +57,8 @@ public class HSQLDBRepository implements Repository {
private static final Logger LOGGER = LogManager.getLogger(HSQLDBRepository.class);
private static final Object CHECKPOINT_LOCK = new Object();
protected Connection connection;
protected final Deque<Savepoint> savepoints = new ArrayDeque<>(3);
protected boolean debugState = false;
@ -103,8 +108,11 @@ public class HSQLDBRepository implements Repository {
throw new DataException("Unable to fetch session ID from repository", e);
}
// synchronize to block new connections if checkpointing in progress
synchronized (CHECKPOINT_LOCK) {
assertEmptyTransaction("connection creation");
}
}
// Getters / setters
@ -284,6 +292,9 @@ public class HSQLDBRepository implements Repository {
this.sqlStatements = null;
this.savepoints.clear();
// If a checkpoint has been requested, we could perform that now
this.maybeCheckpoint();
// Give connection back to the pool
this.connection.close();
this.connection = null;
@ -292,6 +303,58 @@ public class HSQLDBRepository implements Repository {
}
}
private void maybeCheckpoint() throws DataException {
// To serialize checkpointing and to block new sessions when checkpointing in progress
synchronized (CHECKPOINT_LOCK) {
Boolean quickCheckpointRequest = RepositoryManager.getRequestedCheckpoint();
if (quickCheckpointRequest == null)
return;
// We can only perform a CHECKPOINT if no other HSQLDB session is mid-transaction,
// otherwise the CHECKPOINT blocks for COMMITs and other threads can't open HSQLDB sessions
// due to HSQLDB blocking until CHECKPOINT finishes - i.e. deadlock
String sql = "SELECT COUNT(*) "
+ "FROM Information_schema.system_sessions "
+ "WHERE transaction = TRUE";
try {
PreparedStatement pstmt = this.cachePreparedStatement(sql);
if (!pstmt.execute())
throw new DataException("Unable to check repository session status");
try (ResultSet resultSet = pstmt.getResultSet()) {
if (resultSet == null || !resultSet.next())
// Failed to even find HSQLDB session info!
throw new DataException("No results when checking repository session status");
int transactionCount = resultSet.getInt(1);
if (transactionCount > 0)
// We can't safely perform CHECKPOINT due to ongoing SQL transactions
return;
}
LOGGER.info("Performing repository CHECKPOINT...");
if (Settings.getInstance().getShowCheckpointNotification())
SysTray.getInstance().showMessage(Translator.INSTANCE.translate("SysTray", "DB_CHECKPOINT"),
Translator.INSTANCE.translate("SysTray", "PERFORMING_DB_CHECKPOINT"),
MessageType.INFO);
try (Statement stmt = this.connection.createStatement()) {
stmt.execute(Boolean.TRUE.equals(quickCheckpointRequest) ? "CHECKPOINT" : "CHECKPOINT DEFRAG");
}
// Completed!
LOGGER.info("Repository CHECKPOINT completed!");
RepositoryManager.setRequestedCheckpoint(null);
} catch (SQLException e) {
throw new DataException("Unable to check repository session status", e);
}
}
}
@Override
public void rebuild() throws DataException {
LOGGER.info("Rebuilding repository from scratch");
@ -379,15 +442,6 @@ public class HSQLDBRepository implements Repository {
}
}
@Override
public void checkpoint(boolean quick) throws DataException {
try (Statement stmt = this.connection.createStatement()) {
stmt.execute(quick ? "CHECKPOINT" : "CHECKPOINT DEFRAG");
} catch (SQLException e) {
throw new DataException("Unable to perform repository checkpoint");
}
}
@Override
public void performPeriodicMaintenance() throws DataException {
// Defrag DB - takes a while!