From 3453f0efafc6a70499ab1ae0df91fb98a8922f50 Mon Sep 17 00:00:00 2001 From: catbref Date: Sat, 13 Feb 2021 17:32:09 +0000 Subject: [PATCH] Rework HSQLDB CHECKPOINTing to defer until there are no ongoing SQL transactions, in order to prevent DB deadlocks. Symptoms of a CHECKPOINT-related DB deadlock: On Controller thread: "Controller" #20 prio=5 os_prio=31 cpu=1577665.56ms elapsed=17666.97s allocated=475G defined_classes=412 tid=0x00007fe99f97b000 nid=0x1644b waiting on condition [0x0000700009a21000] java.lang.Thread.State: WAITING (parking) at jdk.internal.misc.Unsafe.park(java.base@14.0.2/Native Method) - parking to wait for <0x0000000602f2a6f8> (a org.hsqldb.lib.CountUpDownLatch$Sync) [...some more lines...] [this next line is the best indicator: ] at org.qortal.repository.hsqldb.HSQLDBRepository.checkpoint(HSQLDBRepository.java:385) at org.qortal.repository.RepositoryManager.checkpoint(RepositoryManager.java:51) at org.qortal.controller.Controller.run(Controller.java:544) Other threads stuck at: - parking to wait for <0x00000007ff09f0b0> (a org.hsqldb.lib.CountUpDownLatch$Sync) at java.util.concurrent.locks.LockSupport.park(java.base@14.0.2/LockSupport.java:211) at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(java.base@14.0.2/AbstractQueuedSynchronizer.java:714) at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireSharedInterruptibly(java.base@14.0.2/AbstractQueuedSynchronizer.java:1046) at org.hsqldb.lib.CountUpDownLatch.await(Unknown Source) at org.hsqldb.Session.executeCompiledStatement(Unknown Source) --- .../qortal/api/resource/AdminResource.java | 20 +---- .../org/qortal/controller/Controller.java | 7 +- .../org/qortal/repository/Repository.java | 2 - .../qortal/repository/RepositoryManager.java | 15 ++-- .../repository/hsqldb/HSQLDBRepository.java | 74 ++++++++++++++++--- 5 files changed, 76 insertions(+), 42 deletions(-) diff --git a/src/main/java/org/qortal/api/resource/AdminResource.java b/src/main/java/org/qortal/api/resource/AdminResource.java index 81eb0bf8..8069a0d5 100644 --- a/src/main/java/org/qortal/api/resource/AdminResource.java +++ b/src/main/java/org/qortal/api/resource/AdminResource.java @@ -628,25 +628,9 @@ public class AdminResource { public String checkpointRepository() { Security.checkApiCallAllowed(request); - try (final Repository repository = RepositoryManager.getRepository()) { - ReentrantLock blockchainLock = Controller.getInstance().getBlockchainLock(); + RepositoryManager.setRequestedCheckpoint(Boolean.TRUE); - blockchainLock.lockInterruptibly(); - - try { - repository.checkpoint(true); - repository.saveChanges(); - - return "true"; - } finally { - blockchainLock.unlock(); - } - } catch (InterruptedException e) { - // We couldn't lock blockchain to perform checkpoint - return "false"; - } catch (DataException e) { - throw ApiExceptionFactory.INSTANCE.createException(request, ApiError.REPOSITORY_ISSUE, e); - } + return "true"; } @POST diff --git a/src/main/java/org/qortal/controller/Controller.java b/src/main/java/org/qortal/controller/Controller.java index 08a1e6a8..c64e9e88 100644 --- a/src/main/java/org/qortal/controller/Controller.java +++ b/src/main/java/org/qortal/controller/Controller.java @@ -536,12 +536,7 @@ public class Controller extends Thread { if (now >= repositoryCheckpointTimestamp + repositoryCheckpointInterval) { repositoryCheckpointTimestamp = now + repositoryCheckpointInterval; - if (Settings.getInstance().getShowCheckpointNotification()) - SysTray.getInstance().showMessage(Translator.INSTANCE.translate("SysTray", "DB_CHECKPOINT"), - Translator.INSTANCE.translate("SysTray", "PERFORMING_DB_CHECKPOINT"), - MessageType.INFO); - - RepositoryManager.checkpoint(true); + RepositoryManager.setRequestedCheckpoint(Boolean.TRUE); } // Give repository a chance to backup (if enabled) diff --git a/src/main/java/org/qortal/repository/Repository.java b/src/main/java/org/qortal/repository/Repository.java index d4ef35ce..656e6e1e 100644 --- a/src/main/java/org/qortal/repository/Repository.java +++ b/src/main/java/org/qortal/repository/Repository.java @@ -47,8 +47,6 @@ public interface Repository extends AutoCloseable { public void backup(boolean quick) throws DataException; - public void checkpoint(boolean quick) throws DataException; - public void performPeriodicMaintenance() throws DataException; public void exportNodeLocalData() throws DataException; diff --git a/src/main/java/org/qortal/repository/RepositoryManager.java b/src/main/java/org/qortal/repository/RepositoryManager.java index 2b6e637b..55e4f304 100644 --- a/src/main/java/org/qortal/repository/RepositoryManager.java +++ b/src/main/java/org/qortal/repository/RepositoryManager.java @@ -4,6 +4,9 @@ public abstract class RepositoryManager { private static RepositoryFactory repositoryFactory = null; + /** null if no checkpoint requested, TRUE for quick checkpoint, false for slow/full checkpoint. */ + private static Boolean quickCheckpointRequested = null; + public static RepositoryFactory getRepositoryFactory() { return repositoryFactory; } @@ -46,12 +49,12 @@ public abstract class RepositoryManager { } } - public static void checkpoint(boolean quick) { - try (final Repository repository = getRepository()) { - repository.checkpoint(quick); - } catch (DataException e) { - // Checkpoint is best-effort so don't complain - } + public static void setRequestedCheckpoint(Boolean quick) { + quickCheckpointRequested = quick; + } + + public static Boolean getRequestedCheckpoint() { + return quickCheckpointRequested; } public static void rebuild() throws DataException { diff --git a/src/main/java/org/qortal/repository/hsqldb/HSQLDBRepository.java b/src/main/java/org/qortal/repository/hsqldb/HSQLDBRepository.java index 563038d7..6d01516e 100644 --- a/src/main/java/org/qortal/repository/hsqldb/HSQLDBRepository.java +++ b/src/main/java/org/qortal/repository/hsqldb/HSQLDBRepository.java @@ -1,5 +1,6 @@ package org.qortal.repository.hsqldb; +import java.awt.TrayIcon.MessageType; import java.io.File; import java.io.IOException; import java.math.BigDecimal; @@ -31,6 +32,8 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.qortal.account.PrivateKeyAccount; import org.qortal.crypto.Crypto; +import org.qortal.globalization.Translator; +import org.qortal.gui.SysTray; import org.qortal.repository.ATRepository; import org.qortal.repository.AccountRepository; import org.qortal.repository.ArbitraryRepository; @@ -54,6 +57,8 @@ public class HSQLDBRepository implements Repository { private static final Logger LOGGER = LogManager.getLogger(HSQLDBRepository.class); + private static final Object CHECKPOINT_LOCK = new Object(); + protected Connection connection; protected final Deque savepoints = new ArrayDeque<>(3); protected boolean debugState = false; @@ -103,7 +108,10 @@ public class HSQLDBRepository implements Repository { throw new DataException("Unable to fetch session ID from repository", e); } - assertEmptyTransaction("connection creation"); + // synchronize to block new connections if checkpointing in progress + synchronized (CHECKPOINT_LOCK) { + assertEmptyTransaction("connection creation"); + } } // Getters / setters @@ -284,6 +292,9 @@ public class HSQLDBRepository implements Repository { this.sqlStatements = null; this.savepoints.clear(); + // If a checkpoint has been requested, we could perform that now + this.maybeCheckpoint(); + // Give connection back to the pool this.connection.close(); this.connection = null; @@ -292,6 +303,58 @@ public class HSQLDBRepository implements Repository { } } + private void maybeCheckpoint() throws DataException { + // To serialize checkpointing and to block new sessions when checkpointing in progress + synchronized (CHECKPOINT_LOCK) { + Boolean quickCheckpointRequest = RepositoryManager.getRequestedCheckpoint(); + if (quickCheckpointRequest == null) + return; + + // We can only perform a CHECKPOINT if no other HSQLDB session is mid-transaction, + // otherwise the CHECKPOINT blocks for COMMITs and other threads can't open HSQLDB sessions + // due to HSQLDB blocking until CHECKPOINT finishes - i.e. deadlock + String sql = "SELECT COUNT(*) " + + "FROM Information_schema.system_sessions " + + "WHERE transaction = TRUE"; + + try { + PreparedStatement pstmt = this.cachePreparedStatement(sql); + + if (!pstmt.execute()) + throw new DataException("Unable to check repository session status"); + + try (ResultSet resultSet = pstmt.getResultSet()) { + if (resultSet == null || !resultSet.next()) + // Failed to even find HSQLDB session info! + throw new DataException("No results when checking repository session status"); + + int transactionCount = resultSet.getInt(1); + + if (transactionCount > 0) + // We can't safely perform CHECKPOINT due to ongoing SQL transactions + return; + } + + LOGGER.info("Performing repository CHECKPOINT..."); + + if (Settings.getInstance().getShowCheckpointNotification()) + SysTray.getInstance().showMessage(Translator.INSTANCE.translate("SysTray", "DB_CHECKPOINT"), + Translator.INSTANCE.translate("SysTray", "PERFORMING_DB_CHECKPOINT"), + MessageType.INFO); + + try (Statement stmt = this.connection.createStatement()) { + stmt.execute(Boolean.TRUE.equals(quickCheckpointRequest) ? "CHECKPOINT" : "CHECKPOINT DEFRAG"); + } + + // Completed! + LOGGER.info("Repository CHECKPOINT completed!"); + RepositoryManager.setRequestedCheckpoint(null); + } catch (SQLException e) { + throw new DataException("Unable to check repository session status", e); + } + } + } + @Override public void rebuild() throws DataException { LOGGER.info("Rebuilding repository from scratch"); @@ -379,15 +442,6 @@ public class HSQLDBRepository implements Repository { } } - @Override - public void checkpoint(boolean quick) throws DataException { - try (Statement stmt = this.connection.createStatement()) { - stmt.execute(quick ? "CHECKPOINT" : "CHECKPOINT DEFRAG"); - } catch (SQLException e) { - throw new DataException("Unable to perform repository checkpoint"); - } - } - @Override public void performPeriodicMaintenance() throws DataException { // Defrag DB - takes a while!