Rework of repository maintenance and backups

It will now attempt to wait until there are no other active transactions before starting, to avoid deadlocks. A timeout for this process is specified - generally 60 seconds - so that callers can give up or retry if something is holding a transaction open for too long. Right now we will give up in all places except for bootstrap creation, where it will keep retrying until successful.
This commit is contained in:
CalDescent 2021-10-02 13:23:26 +01:00
parent c4d7335fdd
commit adeb654248
10 changed files with 200 additions and 89 deletions

View File

@ -1,6 +1,7 @@
package org.qortal;
import java.security.Security;
import java.util.concurrent.TimeoutException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -57,10 +58,10 @@ public class RepositoryMaintenance {
LOGGER.info("Starting repository periodic maintenance. This can take a while...");
try (final Repository repository = RepositoryManager.getRepository()) {
repository.performPeriodicMaintenance();
repository.performPeriodicMaintenance(null);
LOGGER.info("Repository periodic maintenance completed");
} catch (DataException e) {
} catch (DataException | TimeoutException e) {
LOGGER.error("Repository periodic maintenance failed", e);
}

View File

@ -22,6 +22,7 @@ import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;
@ -668,14 +669,16 @@ public class AdminResource {
blockchainLock.lockInterruptibly();
try {
repository.backup(true, "backup");
// Timeout if the database isn't ready for backing up after 60 seconds
long timeout = 60 * 1000L;
repository.backup(true, "backup", timeout);
repository.saveChanges();
return "true";
} finally {
blockchainLock.unlock();
}
} catch (InterruptedException e) {
} catch (InterruptedException | TimeoutException e) {
// We couldn't lock blockchain to perform backup
return "false";
} catch (DataException e) {
@ -700,13 +703,15 @@ public class AdminResource {
blockchainLock.lockInterruptibly();
try {
repository.performPeriodicMaintenance();
// Timeout if the database isn't ready to start after 60 seconds
long timeout = 60 * 1000L;
repository.performPeriodicMaintenance(timeout);
} finally {
blockchainLock.unlock();
}
} catch (InterruptedException e) {
// No big deal
} catch (DataException e) {
} catch (DataException | TimeoutException e) {
throw ApiExceptionFactory.INSTANCE.createException(request, ApiError.REPOSITORY_ISSUE, e);
}
}

View File

@ -14,6 +14,7 @@ import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeoutException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -215,8 +216,17 @@ public class AutoUpdate extends Thread {
}
// Give repository a chance to backup in case things go badly wrong (if enabled)
if (Settings.getInstance().getRepositoryBackupInterval() > 0)
RepositoryManager.backup(true, "backup");
if (Settings.getInstance().getRepositoryBackupInterval() > 0) {
try {
// Timeout if the database isn't ready for backing up after 60 seconds
long timeout = 60 * 1000L;
RepositoryManager.backup(true, "backup", timeout);
} catch (TimeoutException e) {
LOGGER.info("Attempt to backup repository failed due to timeout: {}", e.getMessage());
// Continue with the auto update anyway...
}
}
// Call ApplyUpdate to end this process (unlocking current JAR so it can be replaced)
String javaHome = System.getProperty("java.home");

View File

@ -27,6 +27,7 @@ import java.util.Properties;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Function;
@ -579,7 +580,14 @@ public class Controller extends Thread {
Translator.INSTANCE.translate("SysTray", "CREATING_BACKUP_OF_DB_FILES"),
MessageType.INFO);
RepositoryManager.backup(true, "backup");
try {
// Timeout if the database isn't ready for backing up after 60 seconds
long timeout = 60 * 1000L;
RepositoryManager.backup(true, "backup", timeout);
} catch (TimeoutException e) {
LOGGER.info("Attempt to backup repository failed due to timeout: {}", e.getMessage());
}
}
// Prune stuck/slow/old peers

View File

@ -19,6 +19,7 @@ import java.io.InputStream;
import java.net.URL;
import java.nio.file.*;
import java.util.List;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.locks.ReentrantLock;
import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
@ -227,11 +228,18 @@ public class Bootstrap {
repository.saveChanges();
LOGGER.info("Performing repository maintenance...");
repository.performPeriodicMaintenance();
LOGGER.info("Creating bootstrap...");
repository.backup(true, "bootstrap");
while (!Controller.isStopping()) {
try {
// Timeout if the database isn't ready for backing up after 10 seconds
long timeout = 10 * 1000L;
repository.backup(false, "bootstrap", timeout);
break;
}
catch (TimeoutException e) {
LOGGER.info("Unable to create bootstrap due to timeout. Retrying...");
}
}
LOGGER.info("Moving files to output directory...");
inputPath = Paths.get(Settings.getInstance().getRepositoryPath(), "bootstrap");

View File

@ -1,6 +1,7 @@
package org.qortal.repository;
import java.io.IOException;
import java.util.concurrent.TimeoutException;
public interface Repository extends AutoCloseable {
@ -49,9 +50,9 @@ public interface Repository extends AutoCloseable {
public void setDebug(boolean debugState);
public void backup(boolean quick, String name) throws DataException;
public void backup(boolean quick, String name, Long timeout) throws DataException, TimeoutException;
public void performPeriodicMaintenance() throws DataException;
public void performPeriodicMaintenance(Long timeout) throws DataException, TimeoutException;
public void exportNodeLocalData() throws DataException;

View File

@ -8,6 +8,7 @@ import org.qortal.repository.hsqldb.HSQLDBRepository;
import org.qortal.settings.Settings;
import java.sql.SQLException;
import java.util.concurrent.TimeoutException;
public abstract class RepositoryManager {
private static final Logger LOGGER = LogManager.getLogger(RepositoryManager.class);
@ -51,9 +52,9 @@ public abstract class RepositoryManager {
repositoryFactory = null;
}
public static void backup(boolean quick, String name) {
public static void backup(boolean quick, String name, Long timeout) throws TimeoutException {
try (final Repository repository = getRepository()) {
repository.backup(quick, name);
repository.backup(quick, name, timeout);
} catch (DataException e) {
// Backup is best-effort so don't complain
}

View File

@ -12,6 +12,7 @@ import org.qortal.settings.Settings;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.concurrent.TimeoutException;
/**
*
@ -315,7 +316,15 @@ public class HSQLDBDatabasePruning {
}
public static void performMaintenance(Repository repository) throws SQLException, DataException {
repository.performPeriodicMaintenance();
try {
// Timeout if the database isn't ready for backing up after 5 minutes
// Nothing else should be using the db at this point, so a timeout shouldn't happen
long timeout = 5 * 60 * 1000L;
repository.performPeriodicMaintenance(timeout);
} catch (TimeoutException e) {
LOGGER.info("Attempt to perform maintenance failed due to timeout: {}", e.getMessage());
}
}
}

View File

@ -16,6 +16,7 @@ import java.sql.SQLException;
import java.sql.Savepoint;
import java.sql.Statement;
import java.util.*;
import java.util.concurrent.TimeoutException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
@ -379,66 +380,92 @@ public class HSQLDBRepository implements Repository {
}
@Override
public void backup(boolean quick, String name) throws DataException {
if (!quick)
// First perform a CHECKPOINT
public void backup(boolean quick, String name, Long timeout) throws DataException, TimeoutException {
synchronized (CHECKPOINT_LOCK) {
// We can only perform a CHECKPOINT if no other HSQLDB session is mid-transaction,
// otherwise the CHECKPOINT blocks for COMMITs and other threads can't open HSQLDB sessions
// due to HSQLDB blocking until CHECKPOINT finishes - i.e. deadlock.
// Since we don't want to give up too easily, it's best to wait until the other transaction
// count reaches zero, and then continue.
this.blockUntilNoOtherTransactions(timeout);
if (!quick)
// First perform a CHECKPOINT
try (Statement stmt = this.connection.createStatement()) {
LOGGER.info("Performing maintenance - this will take a while...");
stmt.execute("CHECKPOINT");
stmt.execute("CHECKPOINT DEFRAG");
LOGGER.info("Maintenance completed");
} catch (SQLException e) {
throw new DataException("Unable to prepare repository for backup");
}
// Clean out any previous backup
try {
String connectionUrl = this.connection.getMetaData().getURL();
String dbPathname = getDbPathname(connectionUrl);
if (dbPathname == null)
throw new DataException("Unable to locate repository for backup?");
// Doesn't really make sense to backup an in-memory database...
if (dbPathname.equals("mem")) {
LOGGER.debug("Ignoring request to backup in-memory repository!");
return;
}
String backupUrl = buildBackupUrl(dbPathname, name);
String backupPathname = getDbPathname(backupUrl);
if (backupPathname == null)
throw new DataException("Unable to determine location for repository backup?");
Path backupDirPath = Paths.get(backupPathname).getParent();
String backupDirPathname = backupDirPath.toString();
try (Stream<Path> paths = Files.walk(backupDirPath)) {
paths.sorted(Comparator.reverseOrder())
.map(Path::toFile)
.filter(file -> file.getPath().startsWith(backupDirPathname))
.forEach(File::delete);
}
} catch (NoSuchFileException e) {
// Nothing to remove
} catch (SQLException | IOException e) {
throw new DataException("Unable to remove previous repository backup");
}
// Actually create backup
try (Statement stmt = this.connection.createStatement()) {
stmt.execute("CHECKPOINT DEFRAG");
LOGGER.info("Backing up repository...");
stmt.execute(String.format("BACKUP DATABASE TO '%s/' BLOCKING AS FILES", name));
LOGGER.info("Backup completed");
} catch (SQLException e) {
throw new DataException("Unable to prepare repository for backup");
throw new DataException("Unable to backup repository");
}
// Clean out any previous backup
try {
String connectionUrl = this.connection.getMetaData().getURL();
String dbPathname = getDbPathname(connectionUrl);
if (dbPathname == null)
throw new DataException("Unable to locate repository for backup?");
// Doesn't really make sense to backup an in-memory database...
if (dbPathname.equals("mem")) {
LOGGER.debug("Ignoring request to backup in-memory repository!");
return;
}
String backupUrl = buildBackupUrl(dbPathname, name);
String backupPathname = getDbPathname(backupUrl);
if (backupPathname == null)
throw new DataException("Unable to determine location for repository backup?");
Path backupDirPath = Paths.get(backupPathname).getParent();
String backupDirPathname = backupDirPath.toString();
try (Stream<Path> paths = Files.walk(backupDirPath)) {
paths.sorted(Comparator.reverseOrder())
.map(Path::toFile)
.filter(file -> file.getPath().startsWith(backupDirPathname))
.forEach(File::delete);
}
} catch (NoSuchFileException e) {
// Nothing to remove
} catch (SQLException | IOException e) {
throw new DataException("Unable to remove previous repository backup");
}
// Actually create backup
try (Statement stmt = this.connection.createStatement()) {
stmt.execute(String.format("BACKUP DATABASE TO '%s/' BLOCKING AS FILES", name));
} catch (SQLException e) {
throw new DataException("Unable to backup repository");
}
}
@Override
public void performPeriodicMaintenance() throws DataException {
// Defrag DB - takes a while!
try (Statement stmt = this.connection.createStatement()) {
LOGGER.info("performing maintenance - this will take a while");
stmt.execute("CHECKPOINT");
stmt.execute("CHECKPOINT DEFRAG");
LOGGER.info("maintenance completed");
} catch (SQLException e) {
throw new DataException("Unable to defrag repository");
public void performPeriodicMaintenance(Long timeout) throws DataException, TimeoutException {
synchronized (CHECKPOINT_LOCK) {
// We can only perform a CHECKPOINT if no other HSQLDB session is mid-transaction,
// otherwise the CHECKPOINT blocks for COMMITs and other threads can't open HSQLDB sessions
// due to HSQLDB blocking until CHECKPOINT finishes - i.e. deadlock.
// Since we don't want to give up too easily, it's best to wait until the other transaction
// count reaches zero, and then continue.
this.blockUntilNoOtherTransactions(timeout);
// Defrag DB - takes a while!
try (Statement stmt = this.connection.createStatement()) {
LOGGER.info("performing maintenance - this will take a while");
stmt.execute("CHECKPOINT");
stmt.execute("CHECKPOINT DEFRAG");
LOGGER.info("maintenance completed");
} catch (SQLException e) {
throw new DataException("Unable to defrag repository");
}
}
}
@ -990,4 +1017,51 @@ public class HSQLDBRepository implements Repository {
return DEADLOCK_ERROR_CODE.equals(e.getErrorCode());
}
private int otherTransactionsCount() throws DataException {
// We can only perform a CHECKPOINT if no other HSQLDB session is mid-transaction,
// otherwise the CHECKPOINT blocks for COMMITs and other threads can't open HSQLDB sessions
// due to HSQLDB blocking until CHECKPOINT finishes - i.e. deadlock
String sql = "SELECT COUNT(*) "
+ "FROM Information_schema.system_sessions "
+ "WHERE transaction = TRUE AND session_id != ?";
try {
PreparedStatement pstmt = this.cachePreparedStatement(sql);
pstmt.setLong(1, this.sessionId);
if (!pstmt.execute())
throw new DataException("Unable to check repository session status");
try (ResultSet resultSet = pstmt.getResultSet()) {
if (resultSet == null || !resultSet.next())
// Failed to even find HSQLDB session info!
throw new DataException("No results when checking repository session status");
int transactionCount = resultSet.getInt(1);
return transactionCount;
}
} catch (SQLException e) {
throw new DataException("Unable to check repository session status", e);
}
}
private void blockUntilNoOtherTransactions(Long timeout) throws DataException, TimeoutException {
try {
long startTime = System.currentTimeMillis();
while (this.otherTransactionsCount() > 0) {
// Wait and try again
LOGGER.info("Waiting for repository...");
Thread.sleep(1000L);
if (timeout != null) {
if (System.currentTimeMillis() - startTime >= timeout) {
throw new TimeoutException("Timed out waiting for repository to become available");
}
}
}
} catch (InterruptedException e) {
throw new DataException("Interrupted before repository became available");
}
}
}

View File

@ -26,13 +26,7 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.*;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -445,44 +439,44 @@ public class RepositoryTests extends Common {
}
@Test
public void testDefrag() throws DataException {
public void testDefrag() throws DataException, TimeoutException {
try (final HSQLDBRepository hsqldb = (HSQLDBRepository) RepositoryManager.getRepository()) {
this.populateWithRandomData(hsqldb);
hsqldb.performPeriodicMaintenance();
hsqldb.performPeriodicMaintenance(10 * 1000L);
}
}
@Test
public void testDefragOnDisk() throws DataException {
public void testDefragOnDisk() throws DataException, TimeoutException {
Common.useSettingsAndDb(testSettingsFilename, false);
try (final HSQLDBRepository hsqldb = (HSQLDBRepository) RepositoryManager.getRepository()) {
this.populateWithRandomData(hsqldb);
hsqldb.performPeriodicMaintenance();
hsqldb.performPeriodicMaintenance(10 * 1000L);
}
}
@Test
public void testMultipleDefrags() throws DataException {
public void testMultipleDefrags() throws DataException, TimeoutException {
// Mint some more blocks to populate the database
try (final HSQLDBRepository hsqldb = (HSQLDBRepository) RepositoryManager.getRepository()) {
this.populateWithRandomData(hsqldb);
for (int i = 0; i < 10; i++) {
hsqldb.performPeriodicMaintenance();
hsqldb.performPeriodicMaintenance(10 * 1000L);
}
}
}
@Test
public void testMultipleDefragsOnDisk() throws DataException {
public void testMultipleDefragsOnDisk() throws DataException, TimeoutException {
Common.useSettingsAndDb(testSettingsFilename, false);
// Mint some more blocks to populate the database
@ -491,31 +485,31 @@ public class RepositoryTests extends Common {
this.populateWithRandomData(hsqldb);
for (int i = 0; i < 10; i++) {
hsqldb.performPeriodicMaintenance();
hsqldb.performPeriodicMaintenance(10 * 1000L);
}
}
}
@Test
public void testMultipleDefragsWithDifferentData() throws DataException {
public void testMultipleDefragsWithDifferentData() throws DataException, TimeoutException {
for (int i=0; i<10; i++) {
try (final HSQLDBRepository hsqldb = (HSQLDBRepository) RepositoryManager.getRepository()) {
this.populateWithRandomData(hsqldb);
hsqldb.performPeriodicMaintenance();
hsqldb.performPeriodicMaintenance(10 * 1000L);
}
}
}
@Test
public void testMultipleDefragsOnDiskWithDifferentData() throws DataException {
public void testMultipleDefragsOnDiskWithDifferentData() throws DataException, TimeoutException {
Common.useSettingsAndDb(testSettingsFilename, false);
for (int i=0; i<10; i++) {
try (final HSQLDBRepository hsqldb = (HSQLDBRepository) RepositoryManager.getRepository()) {
this.populateWithRandomData(hsqldb);
hsqldb.performPeriodicMaintenance();
hsqldb.performPeriodicMaintenance(10 * 1000L);
}
}
}