8303861: Error handling step timeouts should never be blocked by OnError and others

Reviewed-by: dholmes, rkennke
This commit is contained in:
Thomas Stuefe 2023-03-14 15:51:20 +00:00
parent da044dd569
commit a00f5d24d3
3 changed files with 92 additions and 36 deletions

View File

@ -270,8 +270,8 @@ void WatcherThread::run() {
os::die();
}
// Wait a second, then recheck for timeout.
os::naked_short_sleep(999);
// Wait a bit, then recheck for timeout.
os::naked_short_sleep(250);
}
}

View File

@ -1,6 +1,7 @@
/*
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2017, 2020 SAP SE. All rights reserved.
* Copyright (c) 2023, Red Hat, Inc. and/or its affiliates.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -1744,43 +1745,68 @@ void VMError::show_message_box(char *buf, int buflen) {
} while (yes);
}
// Timeout handling: check if a timeout happened (either a single step did
// timeout or the whole of error reporting hit ErrorLogTimeout). Interrupt
// the reporting thread if that is the case.
// Fatal error handling is subject to several timeouts:
// - a global timeout (controlled via ErrorLogTimeout)
// - local error reporting step timeouts.
//
// The latter aims to "give the JVM a kick" if it gets stuck in one particular place during
// error reporting. This prevents one error reporting step from hogging all the time allotted
// to error reporting under ErrorLogTimeout.
//
// VMError::check_timeout() is called from the watcher thread and checks for either global
// or step timeout. If a timeout happened, we interrupt the reporting thread and set either
// _reporting_did_timeout or _step_did_timeout to signal which timeout fired. Function returns
// true if the *global* timeout fired, which will cause WatcherThread to shut down the JVM
// immediately.
bool VMError::check_timeout() {
// This function is supposed to be called from watcher thread during fatal error handling only.
assert(VMError::is_error_reported(), "Only call during error handling");
assert(Thread::current()->is_Watcher_thread(), "Only call from watcher thread");
if (ErrorLogTimeout == 0) {
return false;
}
// Do not check for timeouts if we still have a message box to show to the
// user or if there are OnError handlers to be run.
if (ShowMessageBoxOnError
|| (OnError != nullptr && OnError[0] != '\0')
|| Arguments::abort_hook() != nullptr) {
return false;
}
// There are three situations where we suppress the *global* error timeout:
// - if the JVM is embedded and the launcher has its abort hook installed.
// That must be allowed to run.
// - if the user specified one or more OnError commands to run, and these
// did not yet run. These must have finished.
// - if the user (typically developer) specified ShowMessageBoxOnError,
// and the error box has not yet been shown
const bool ignore_global_timeout =
(ShowMessageBoxOnError
|| (OnError != nullptr && OnError[0] != '\0')
|| Arguments::abort_hook() != nullptr);
const jlong reporting_start_time_l = get_reporting_start_time();
const jlong now = get_current_timestamp();
// Timestamp is stored in nanos.
if (reporting_start_time_l > 0) {
const jlong end = reporting_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR;
if (end <= now && !_reporting_did_timeout) {
// We hit ErrorLogTimeout and we haven't interrupted the reporting
// thread yet.
_reporting_did_timeout = true;
interrupt_reporting_thread();
return true; // global timeout
// Global timeout hit?
if (!ignore_global_timeout) {
const jlong reporting_start_time = get_reporting_start_time();
// Timestamp is stored in nanos.
if (reporting_start_time > 0) {
const jlong end = reporting_start_time + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR;
if (end <= now && !_reporting_did_timeout) {
// We hit ErrorLogTimeout and we haven't interrupted the reporting
// thread yet.
_reporting_did_timeout = true;
interrupt_reporting_thread();
return true; // global timeout
}
}
}
const jlong step_start_time_l = get_step_start_time();
if (step_start_time_l > 0) {
// Reporting step timeout?
const jlong step_start_time = get_step_start_time();
if (step_start_time > 0) {
// A step times out after a quarter of the total timeout. Steps are mostly fast unless they
// hang for some reason, so this simple rule allows for three hanging step and still
// hopefully leaves time enough for the rest of the steps to finish.
const jlong end = step_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR / 4;
const int max_step_timeout_secs = 5;
const jlong timeout_duration = MAX2((jlong)max_step_timeout_secs, (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR / 4);
const jlong end = step_start_time + timeout_duration;
if (end <= now && !_step_did_timeout) {
// The step timed out and we haven't interrupted the reporting
// thread yet.

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2017, 2022 SAP SE. All rights reserved.
* Copyright (c) 2023, Red Hat, Inc. and/or its affiliates.
* Copyright (c) 2017, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -26,6 +27,9 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
import jdk.test.lib.process.OutputAnalyzer;
@ -33,7 +37,7 @@ import jdk.test.lib.Platform;
import jdk.test.lib.process.ProcessTools;
/*
* @test
* @test id=default
* @bug 8166944
* @summary Hanging Error Reporting steps may lead to torn error logs
* @modules java.base/jdk.internal.misc
@ -43,6 +47,16 @@ import jdk.test.lib.process.ProcessTools;
* @author Thomas Stuefe (SAP)
*/
/*
* @test id=with-on-error
* @bug 8303861
* @summary Error handling step timeouts should never be blocked by OnError etc.
* @modules java.base/jdk.internal.misc
* @library /test/lib
* @requires (vm.debug == true) & (os.family != "windows")
* @run driver TimeoutInErrorHandlingTest with-on-error
*/
public class TimeoutInErrorHandlingTest {
public static final boolean verbose = System.getProperty("verbose") != null;
@ -71,14 +85,28 @@ public class TimeoutInErrorHandlingTest {
* little timeout messages to see that repeated timeout handling is basically working.
*/
ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(
"-XX:+UnlockDiagnosticVMOptions",
"-Xmx100M",
"-XX:ErrorHandlerTest=14",
"-XX:+TestUnresponsiveErrorHandler",
"-XX:ErrorLogTimeout=" + ERROR_LOG_TIMEOUT,
"-XX:-CreateCoredumpOnCrash",
"-version");
boolean withOnError = false;
if (args.length > 0) {
switch (args[0]) {
case "with-on-error": withOnError = true; break;
default: throw new RuntimeException("Invalid argument " + args[1]);
}
}
List<String> arguments = new ArrayList<>();
Collections.addAll(arguments,
"-XX:+UnlockDiagnosticVMOptions",
"-Xmx100M",
"-XX:ErrorHandlerTest=14",
"-XX:+TestUnresponsiveErrorHandler",
"-XX:ErrorLogTimeout=" + ERROR_LOG_TIMEOUT,
"-XX:-CreateCoredumpOnCrash");
if (withOnError) {
arguments.add("-XX:OnError=echo hi");
}
arguments.add("-version");
ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(arguments);
OutputAnalyzer output_detail = new OutputAnalyzer(pb.start());
@ -92,8 +120,10 @@ public class TimeoutInErrorHandlingTest {
output_detail.shouldMatch("# A fatal error has been detected by the Java Runtime Environment:.*");
output_detail.shouldMatch("# +(?:SIGSEGV|SIGBUS|EXCEPTION_ACCESS_VIOLATION).*");
// VM should have been aborted by WatcherThread
output_detail.shouldMatch(".*timer expired, abort.*");
// Unless we specified OnError, VM should have been aborted by WatcherThread
if (!withOnError) {
output_detail.shouldMatch(".*timer expired, abort.*");
}
// extract hs-err file
File hs_err_file;