From adeb654248bf504e755b7f30161096ca1279c056 Mon Sep 17 00:00:00 2001 From: CalDescent Date: Sat, 2 Oct 2021 13:23:26 +0100 Subject: [PATCH] Rework of repository maintenance and backups It will now attempt to wait until there are no other active transactions before starting, to avoid deadlocks. A timeout for this process is specified - generally 60 seconds - so that callers can give up or retry if something is holding a transaction open for too long. Right now we will give up in all places except for bootstrap creation, where it will keep retrying until successful. --- .../org/qortal/RepositoryMaintenance.java | 5 +- .../qortal/api/resource/AdminResource.java | 13 +- .../org/qortal/controller/AutoUpdate.java | 14 +- .../org/qortal/controller/Controller.java | 10 +- .../java/org/qortal/repository/Bootstrap.java | 16 +- .../org/qortal/repository/Repository.java | 5 +- .../qortal/repository/RepositoryManager.java | 5 +- .../hsqldb/HSQLDBDatabasePruning.java | 11 +- .../repository/hsqldb/HSQLDBRepository.java | 178 +++++++++++++----- .../java/org/qortal/test/RepositoryTests.java | 32 ++-- 10 files changed, 200 insertions(+), 89 deletions(-) diff --git a/src/main/java/org/qortal/RepositoryMaintenance.java b/src/main/java/org/qortal/RepositoryMaintenance.java index c3ae0616..b085822b 100644 --- a/src/main/java/org/qortal/RepositoryMaintenance.java +++ b/src/main/java/org/qortal/RepositoryMaintenance.java @@ -1,6 +1,7 @@ package org.qortal; import java.security.Security; +import java.util.concurrent.TimeoutException; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -57,10 +58,10 @@ public class RepositoryMaintenance { LOGGER.info("Starting repository periodic maintenance. This can take a while..."); try (final Repository repository = RepositoryManager.getRepository()) { - repository.performPeriodicMaintenance(); + repository.performPeriodicMaintenance(null); LOGGER.info("Repository periodic maintenance completed"); - } catch (DataException e) { + } catch (DataException | TimeoutException e) { LOGGER.error("Repository periodic maintenance failed", e); } diff --git a/src/main/java/org/qortal/api/resource/AdminResource.java b/src/main/java/org/qortal/api/resource/AdminResource.java index 39deabee..c7e8eabd 100644 --- a/src/main/java/org/qortal/api/resource/AdminResource.java +++ b/src/main/java/org/qortal/api/resource/AdminResource.java @@ -22,6 +22,7 @@ import java.time.OffsetDateTime; import java.time.ZoneOffset; import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; @@ -668,14 +669,16 @@ public class AdminResource { blockchainLock.lockInterruptibly(); try { - repository.backup(true, "backup"); + // Timeout if the database isn't ready for backing up after 60 seconds + long timeout = 60 * 1000L; + repository.backup(true, "backup", timeout); repository.saveChanges(); return "true"; } finally { blockchainLock.unlock(); } - } catch (InterruptedException e) { + } catch (InterruptedException | TimeoutException e) { // We couldn't lock blockchain to perform backup return "false"; } catch (DataException e) { @@ -700,13 +703,15 @@ public class AdminResource { blockchainLock.lockInterruptibly(); try { - repository.performPeriodicMaintenance(); + // Timeout if the database isn't ready to start after 60 seconds + long timeout = 60 * 1000L; + repository.performPeriodicMaintenance(timeout); } finally { blockchainLock.unlock(); } } catch (InterruptedException e) { // No big deal - } catch (DataException e) { + } catch (DataException | TimeoutException e) { throw ApiExceptionFactory.INSTANCE.createException(request, ApiError.REPOSITORY_ISSUE, e); } } diff --git a/src/main/java/org/qortal/controller/AutoUpdate.java b/src/main/java/org/qortal/controller/AutoUpdate.java index 6d74e0e8..f07e82d1 100644 --- a/src/main/java/org/qortal/controller/AutoUpdate.java +++ b/src/main/java/org/qortal/controller/AutoUpdate.java @@ -14,6 +14,7 @@ import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.concurrent.TimeoutException; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -215,8 +216,17 @@ public class AutoUpdate extends Thread { } // Give repository a chance to backup in case things go badly wrong (if enabled) - if (Settings.getInstance().getRepositoryBackupInterval() > 0) - RepositoryManager.backup(true, "backup"); + if (Settings.getInstance().getRepositoryBackupInterval() > 0) { + try { + // Timeout if the database isn't ready for backing up after 60 seconds + long timeout = 60 * 1000L; + RepositoryManager.backup(true, "backup", timeout); + + } catch (TimeoutException e) { + LOGGER.info("Attempt to backup repository failed due to timeout: {}", e.getMessage()); + // Continue with the auto update anyway... + } + } // Call ApplyUpdate to end this process (unlocking current JAR so it can be replaced) String javaHome = System.getProperty("java.home"); diff --git a/src/main/java/org/qortal/controller/Controller.java b/src/main/java/org/qortal/controller/Controller.java index feb4b309..253dee03 100644 --- a/src/main/java/org/qortal/controller/Controller.java +++ b/src/main/java/org/qortal/controller/Controller.java @@ -27,6 +27,7 @@ import java.util.Properties; import java.util.Random; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.ReentrantLock; import java.util.function.Function; @@ -579,7 +580,14 @@ public class Controller extends Thread { Translator.INSTANCE.translate("SysTray", "CREATING_BACKUP_OF_DB_FILES"), MessageType.INFO); - RepositoryManager.backup(true, "backup"); + try { + // Timeout if the database isn't ready for backing up after 60 seconds + long timeout = 60 * 1000L; + RepositoryManager.backup(true, "backup", timeout); + + } catch (TimeoutException e) { + LOGGER.info("Attempt to backup repository failed due to timeout: {}", e.getMessage()); + } } // Prune stuck/slow/old peers diff --git a/src/main/java/org/qortal/repository/Bootstrap.java b/src/main/java/org/qortal/repository/Bootstrap.java index e25a4d39..bd76918e 100644 --- a/src/main/java/org/qortal/repository/Bootstrap.java +++ b/src/main/java/org/qortal/repository/Bootstrap.java @@ -19,6 +19,7 @@ import java.io.InputStream; import java.net.URL; import java.nio.file.*; import java.util.List; +import java.util.concurrent.TimeoutException; import java.util.concurrent.locks.ReentrantLock; import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; @@ -227,11 +228,18 @@ public class Bootstrap { repository.saveChanges(); - LOGGER.info("Performing repository maintenance..."); - repository.performPeriodicMaintenance(); - LOGGER.info("Creating bootstrap..."); - repository.backup(true, "bootstrap"); + while (!Controller.isStopping()) { + try { + // Timeout if the database isn't ready for backing up after 10 seconds + long timeout = 10 * 1000L; + repository.backup(false, "bootstrap", timeout); + break; + } + catch (TimeoutException e) { + LOGGER.info("Unable to create bootstrap due to timeout. Retrying..."); + } + } LOGGER.info("Moving files to output directory..."); inputPath = Paths.get(Settings.getInstance().getRepositoryPath(), "bootstrap"); diff --git a/src/main/java/org/qortal/repository/Repository.java b/src/main/java/org/qortal/repository/Repository.java index f6728968..c0bdb0d9 100644 --- a/src/main/java/org/qortal/repository/Repository.java +++ b/src/main/java/org/qortal/repository/Repository.java @@ -1,6 +1,7 @@ package org.qortal.repository; import java.io.IOException; +import java.util.concurrent.TimeoutException; public interface Repository extends AutoCloseable { @@ -49,9 +50,9 @@ public interface Repository extends AutoCloseable { public void setDebug(boolean debugState); - public void backup(boolean quick, String name) throws DataException; + public void backup(boolean quick, String name, Long timeout) throws DataException, TimeoutException; - public void performPeriodicMaintenance() throws DataException; + public void performPeriodicMaintenance(Long timeout) throws DataException, TimeoutException; public void exportNodeLocalData() throws DataException; diff --git a/src/main/java/org/qortal/repository/RepositoryManager.java b/src/main/java/org/qortal/repository/RepositoryManager.java index 1b398057..2f6e4688 100644 --- a/src/main/java/org/qortal/repository/RepositoryManager.java +++ b/src/main/java/org/qortal/repository/RepositoryManager.java @@ -8,6 +8,7 @@ import org.qortal.repository.hsqldb.HSQLDBRepository; import org.qortal.settings.Settings; import java.sql.SQLException; +import java.util.concurrent.TimeoutException; public abstract class RepositoryManager { private static final Logger LOGGER = LogManager.getLogger(RepositoryManager.class); @@ -51,9 +52,9 @@ public abstract class RepositoryManager { repositoryFactory = null; } - public static void backup(boolean quick, String name) { + public static void backup(boolean quick, String name, Long timeout) throws TimeoutException { try (final Repository repository = getRepository()) { - repository.backup(quick, name); + repository.backup(quick, name, timeout); } catch (DataException e) { // Backup is best-effort so don't complain } diff --git a/src/main/java/org/qortal/repository/hsqldb/HSQLDBDatabasePruning.java b/src/main/java/org/qortal/repository/hsqldb/HSQLDBDatabasePruning.java index 4eea59be..232f7058 100644 --- a/src/main/java/org/qortal/repository/hsqldb/HSQLDBDatabasePruning.java +++ b/src/main/java/org/qortal/repository/hsqldb/HSQLDBDatabasePruning.java @@ -12,6 +12,7 @@ import org.qortal.settings.Settings; import java.sql.ResultSet; import java.sql.SQLException; +import java.util.concurrent.TimeoutException; /** * @@ -315,7 +316,15 @@ public class HSQLDBDatabasePruning { } public static void performMaintenance(Repository repository) throws SQLException, DataException { - repository.performPeriodicMaintenance(); + try { + // Timeout if the database isn't ready for backing up after 5 minutes + // Nothing else should be using the db at this point, so a timeout shouldn't happen + long timeout = 5 * 60 * 1000L; + repository.performPeriodicMaintenance(timeout); + + } catch (TimeoutException e) { + LOGGER.info("Attempt to perform maintenance failed due to timeout: {}", e.getMessage()); + } } } diff --git a/src/main/java/org/qortal/repository/hsqldb/HSQLDBRepository.java b/src/main/java/org/qortal/repository/hsqldb/HSQLDBRepository.java index 6be8b9ea..a9f1a7f8 100644 --- a/src/main/java/org/qortal/repository/hsqldb/HSQLDBRepository.java +++ b/src/main/java/org/qortal/repository/hsqldb/HSQLDBRepository.java @@ -16,6 +16,7 @@ import java.sql.SQLException; import java.sql.Savepoint; import java.sql.Statement; import java.util.*; +import java.util.concurrent.TimeoutException; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Stream; @@ -379,66 +380,92 @@ public class HSQLDBRepository implements Repository { } @Override - public void backup(boolean quick, String name) throws DataException { - if (!quick) - // First perform a CHECKPOINT + public void backup(boolean quick, String name, Long timeout) throws DataException, TimeoutException { + synchronized (CHECKPOINT_LOCK) { + + // We can only perform a CHECKPOINT if no other HSQLDB session is mid-transaction, + // otherwise the CHECKPOINT blocks for COMMITs and other threads can't open HSQLDB sessions + // due to HSQLDB blocking until CHECKPOINT finishes - i.e. deadlock. + // Since we don't want to give up too easily, it's best to wait until the other transaction + // count reaches zero, and then continue. + this.blockUntilNoOtherTransactions(timeout); + + if (!quick) + // First perform a CHECKPOINT + try (Statement stmt = this.connection.createStatement()) { + LOGGER.info("Performing maintenance - this will take a while..."); + stmt.execute("CHECKPOINT"); + stmt.execute("CHECKPOINT DEFRAG"); + LOGGER.info("Maintenance completed"); + } catch (SQLException e) { + throw new DataException("Unable to prepare repository for backup"); + } + + // Clean out any previous backup + try { + String connectionUrl = this.connection.getMetaData().getURL(); + String dbPathname = getDbPathname(connectionUrl); + if (dbPathname == null) + throw new DataException("Unable to locate repository for backup?"); + + // Doesn't really make sense to backup an in-memory database... + if (dbPathname.equals("mem")) { + LOGGER.debug("Ignoring request to backup in-memory repository!"); + return; + } + + String backupUrl = buildBackupUrl(dbPathname, name); + String backupPathname = getDbPathname(backupUrl); + if (backupPathname == null) + throw new DataException("Unable to determine location for repository backup?"); + + Path backupDirPath = Paths.get(backupPathname).getParent(); + String backupDirPathname = backupDirPath.toString(); + + try (Stream paths = Files.walk(backupDirPath)) { + paths.sorted(Comparator.reverseOrder()) + .map(Path::toFile) + .filter(file -> file.getPath().startsWith(backupDirPathname)) + .forEach(File::delete); + } + } catch (NoSuchFileException e) { + // Nothing to remove + } catch (SQLException | IOException e) { + throw new DataException("Unable to remove previous repository backup"); + } + + // Actually create backup try (Statement stmt = this.connection.createStatement()) { - stmt.execute("CHECKPOINT DEFRAG"); + LOGGER.info("Backing up repository..."); + stmt.execute(String.format("BACKUP DATABASE TO '%s/' BLOCKING AS FILES", name)); + LOGGER.info("Backup completed"); } catch (SQLException e) { - throw new DataException("Unable to prepare repository for backup"); + throw new DataException("Unable to backup repository"); } - // Clean out any previous backup - try { - String connectionUrl = this.connection.getMetaData().getURL(); - String dbPathname = getDbPathname(connectionUrl); - if (dbPathname == null) - throw new DataException("Unable to locate repository for backup?"); - - // Doesn't really make sense to backup an in-memory database... - if (dbPathname.equals("mem")) { - LOGGER.debug("Ignoring request to backup in-memory repository!"); - return; - } - - String backupUrl = buildBackupUrl(dbPathname, name); - String backupPathname = getDbPathname(backupUrl); - if (backupPathname == null) - throw new DataException("Unable to determine location for repository backup?"); - - Path backupDirPath = Paths.get(backupPathname).getParent(); - String backupDirPathname = backupDirPath.toString(); - - try (Stream paths = Files.walk(backupDirPath)) { - paths.sorted(Comparator.reverseOrder()) - .map(Path::toFile) - .filter(file -> file.getPath().startsWith(backupDirPathname)) - .forEach(File::delete); - } - } catch (NoSuchFileException e) { - // Nothing to remove - } catch (SQLException | IOException e) { - throw new DataException("Unable to remove previous repository backup"); - } - - // Actually create backup - try (Statement stmt = this.connection.createStatement()) { - stmt.execute(String.format("BACKUP DATABASE TO '%s/' BLOCKING AS FILES", name)); - } catch (SQLException e) { - throw new DataException("Unable to backup repository"); } } @Override - public void performPeriodicMaintenance() throws DataException { - // Defrag DB - takes a while! - try (Statement stmt = this.connection.createStatement()) { - LOGGER.info("performing maintenance - this will take a while"); - stmt.execute("CHECKPOINT"); - stmt.execute("CHECKPOINT DEFRAG"); - LOGGER.info("maintenance completed"); - } catch (SQLException e) { - throw new DataException("Unable to defrag repository"); + public void performPeriodicMaintenance(Long timeout) throws DataException, TimeoutException { + synchronized (CHECKPOINT_LOCK) { + + // We can only perform a CHECKPOINT if no other HSQLDB session is mid-transaction, + // otherwise the CHECKPOINT blocks for COMMITs and other threads can't open HSQLDB sessions + // due to HSQLDB blocking until CHECKPOINT finishes - i.e. deadlock. + // Since we don't want to give up too easily, it's best to wait until the other transaction + // count reaches zero, and then continue. + this.blockUntilNoOtherTransactions(timeout); + + // Defrag DB - takes a while! + try (Statement stmt = this.connection.createStatement()) { + LOGGER.info("performing maintenance - this will take a while"); + stmt.execute("CHECKPOINT"); + stmt.execute("CHECKPOINT DEFRAG"); + LOGGER.info("maintenance completed"); + } catch (SQLException e) { + throw new DataException("Unable to defrag repository"); + } } } @@ -990,4 +1017,51 @@ public class HSQLDBRepository implements Repository { return DEADLOCK_ERROR_CODE.equals(e.getErrorCode()); } + private int otherTransactionsCount() throws DataException { + // We can only perform a CHECKPOINT if no other HSQLDB session is mid-transaction, + // otherwise the CHECKPOINT blocks for COMMITs and other threads can't open HSQLDB sessions + // due to HSQLDB blocking until CHECKPOINT finishes - i.e. deadlock + String sql = "SELECT COUNT(*) " + + "FROM Information_schema.system_sessions " + + "WHERE transaction = TRUE AND session_id != ?"; + try { + PreparedStatement pstmt = this.cachePreparedStatement(sql); + pstmt.setLong(1, this.sessionId); + + if (!pstmt.execute()) + throw new DataException("Unable to check repository session status"); + + try (ResultSet resultSet = pstmt.getResultSet()) { + if (resultSet == null || !resultSet.next()) + // Failed to even find HSQLDB session info! + throw new DataException("No results when checking repository session status"); + + int transactionCount = resultSet.getInt(1); + + return transactionCount; + } + } catch (SQLException e) { + throw new DataException("Unable to check repository session status", e); + } + } + + private void blockUntilNoOtherTransactions(Long timeout) throws DataException, TimeoutException { + try { + long startTime = System.currentTimeMillis(); + while (this.otherTransactionsCount() > 0) { + // Wait and try again + LOGGER.info("Waiting for repository..."); + Thread.sleep(1000L); + + if (timeout != null) { + if (System.currentTimeMillis() - startTime >= timeout) { + throw new TimeoutException("Timed out waiting for repository to become available"); + } + } + } + } catch (InterruptedException e) { + throw new DataException("Interrupted before repository became available"); + } + } + } diff --git a/src/test/java/org/qortal/test/RepositoryTests.java b/src/test/java/org/qortal/test/RepositoryTests.java index 33a4de7f..bb6510d5 100644 --- a/src/test/java/org/qortal/test/RepositoryTests.java +++ b/src/test/java/org/qortal/test/RepositoryTests.java @@ -26,13 +26,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Random; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; +import java.util.concurrent.*; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -445,44 +439,44 @@ public class RepositoryTests extends Common { } @Test - public void testDefrag() throws DataException { + public void testDefrag() throws DataException, TimeoutException { try (final HSQLDBRepository hsqldb = (HSQLDBRepository) RepositoryManager.getRepository()) { this.populateWithRandomData(hsqldb); - hsqldb.performPeriodicMaintenance(); + hsqldb.performPeriodicMaintenance(10 * 1000L); } } @Test - public void testDefragOnDisk() throws DataException { + public void testDefragOnDisk() throws DataException, TimeoutException { Common.useSettingsAndDb(testSettingsFilename, false); try (final HSQLDBRepository hsqldb = (HSQLDBRepository) RepositoryManager.getRepository()) { this.populateWithRandomData(hsqldb); - hsqldb.performPeriodicMaintenance(); + hsqldb.performPeriodicMaintenance(10 * 1000L); } } @Test - public void testMultipleDefrags() throws DataException { + public void testMultipleDefrags() throws DataException, TimeoutException { // Mint some more blocks to populate the database try (final HSQLDBRepository hsqldb = (HSQLDBRepository) RepositoryManager.getRepository()) { this.populateWithRandomData(hsqldb); for (int i = 0; i < 10; i++) { - hsqldb.performPeriodicMaintenance(); + hsqldb.performPeriodicMaintenance(10 * 1000L); } } } @Test - public void testMultipleDefragsOnDisk() throws DataException { + public void testMultipleDefragsOnDisk() throws DataException, TimeoutException { Common.useSettingsAndDb(testSettingsFilename, false); // Mint some more blocks to populate the database @@ -491,31 +485,31 @@ public class RepositoryTests extends Common { this.populateWithRandomData(hsqldb); for (int i = 0; i < 10; i++) { - hsqldb.performPeriodicMaintenance(); + hsqldb.performPeriodicMaintenance(10 * 1000L); } } } @Test - public void testMultipleDefragsWithDifferentData() throws DataException { + public void testMultipleDefragsWithDifferentData() throws DataException, TimeoutException { for (int i=0; i<10; i++) { try (final HSQLDBRepository hsqldb = (HSQLDBRepository) RepositoryManager.getRepository()) { this.populateWithRandomData(hsqldb); - hsqldb.performPeriodicMaintenance(); + hsqldb.performPeriodicMaintenance(10 * 1000L); } } } @Test - public void testMultipleDefragsOnDiskWithDifferentData() throws DataException { + public void testMultipleDefragsOnDiskWithDifferentData() throws DataException, TimeoutException { Common.useSettingsAndDb(testSettingsFilename, false); for (int i=0; i<10; i++) { try (final HSQLDBRepository hsqldb = (HSQLDBRepository) RepositoryManager.getRepository()) { this.populateWithRandomData(hsqldb); - hsqldb.performPeriodicMaintenance(); + hsqldb.performPeriodicMaintenance(10 * 1000L); } } }