Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Try to detect the heap dump file of the OLC server after RPC exception between the OLC server and the console. #2871

Merged
merged 1 commit into from
Dec 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ java_library(
":annotations",
":builtin_olc_server_flags",
":server_environment_preparer",
":server_heap_dump_file_detector",
"//src/devtools/mobileharness/infra/client/longrunningservice/proto:control_service_java_proto",
"//src/devtools/mobileharness/infra/client/longrunningservice/proto:version_service_java_proto",
"//src/java/com/google/devtools/common/metrics/stability/rpc/grpc",
Expand Down Expand Up @@ -133,3 +134,17 @@ java_library(
"@maven//:javax_inject_jsr330_api",
],
)

java_library(
name = "server_heap_dump_file_detector",
srcs = ["ServerHeapDumpFileDetector.java"],
deps = [
"//src/java/com/google/devtools/common/metrics/stability/rpc/grpc",
"//src/java/com/google/devtools/mobileharness/shared/constant:log_record_importance",
"//src/java/com/google/devtools/mobileharness/shared/util/file/local",
"//src/java/com/google/devtools/mobileharness/shared/util/logging:google_logger",
"@io_grpc_grpc_java//core",
"@maven//:com_google_code_findbugs_jsr305",
"@maven//:javax_inject_jsr330_api",
],
)
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,11 @@ abstract class ServerEnvironment {

public abstract Path javaBinary();

public static ServerEnvironment of(Path serverBinary, Path javaBinary) {
return new AutoValue_ServerEnvironmentPreparer_ServerEnvironment(serverBinary, javaBinary);
public abstract Path serverWorkingDir();

public static ServerEnvironment of(Path serverBinary, Path javaBinary, Path serverWorkingDir) {
return new AutoValue_ServerEnvironmentPreparer_ServerEnvironment(
serverBinary, javaBinary, serverWorkingDir);
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.devtools.mobileharness.infra.ats.common.olcserver;

import static com.google.devtools.mobileharness.shared.constant.LogRecordImportance.IMPORTANCE;
import static com.google.devtools.mobileharness.shared.constant.LogRecordImportance.Importance.DEBUG;

import com.google.common.flogger.FluentLogger;
import com.google.devtools.common.metrics.stability.rpc.grpc.GrpcExceptionWithErrorId;
import com.google.devtools.mobileharness.shared.util.file.local.LocalFileUtil;
import io.grpc.Status.Code;
import java.nio.file.Path;
import javax.annotation.concurrent.GuardedBy;
import javax.inject.Inject;
import javax.inject.Singleton;

/** A detector to detect if the OLC server is OOM. */
@Singleton
public class ServerHeapDumpFileDetector {

private static final FluentLogger logger = FluentLogger.forEnclosingClass();

private final LocalFileUtil localFileUtil;

private final Object olcInfoLock = new Object();

@GuardedBy("olcInfoLock")
private Long olcProcessId = null;

@GuardedBy("olcInfoLock")
private String olcServerWorkingDir = null;

@Inject
ServerHeapDumpFileDetector(LocalFileUtil localFileUtil) {
this.localFileUtil = localFileUtil;
}

public void detectHeapDumpExistenceWithGrpcError(GrpcExceptionWithErrorId e) {
if (!e.getUnderlyingRpcException().getStatus().getCode().equals(Code.INTERNAL)) {
detectHeapDumpExistence();
}
}

/** Detects if there is a heap dump file generated by the current OLC server. */
private void detectHeapDumpExistence() {
synchronized (olcInfoLock) {
if (olcServerWorkingDir == null || olcProcessId == null) {
logger
.atInfo()
.with(IMPORTANCE, DEBUG)
.log("Skip checking heap dump existence because no existing OLC server is recorded.");
return;
}

if (!localFileUtil.isFileOrDirExist(olcServerWorkingDir)) {
logger
.atInfo()
.with(IMPORTANCE, DEBUG)
.log(
"Skip checking heap dump existence because the working directory of OLC server is"
+ " not found.");
return;
}

Path heapDumpPath = Path.of(olcServerWorkingDir, "java_pid" + olcProcessId + ".hprof");
if (localFileUtil.isFileOrDirExist(heapDumpPath)) {
logger.atSevere().log(
"Detected OOM heap dump file from OLC server with pid [%s]. Please refer to the"
+ " heap dump file [%s] and attach it when reporting the issue.",
olcProcessId, heapDumpPath);

// Clears the recorded OLC server information.
olcServerWorkingDir = null;
olcProcessId = null;
}
}
}

void setOlcServerInfo(long olcProcessId, String olcServerWorkingDir) {
synchronized (olcInfoLock) {
this.olcProcessId = olcProcessId;
this.olcServerWorkingDir = olcServerWorkingDir;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ public class ServerPreparer {
private final ServerEnvironmentPreparer serverEnvironmentPreparer;
private final FlagsString deviceInfraServiceFlags;
private final ListeningScheduledExecutorService scheduledThreadPool;
private final ServerHeapDumpFileDetector serverHeapDumpFileDetector;

private final Object prepareServerLock = new Object();

Expand All @@ -132,7 +133,8 @@ public class ServerPreparer {
@ServerStub(ServerStub.Type.VERSION_SERVICE) Provider<VersionStub> versionStub,
ServerEnvironmentPreparer serverEnvironmentPreparer,
@DeviceInfraServiceFlags FlagsString deviceInfraServiceFlags,
ListeningScheduledExecutorService scheduledThreadPool) {
ListeningScheduledExecutorService scheduledThreadPool,
ServerHeapDumpFileDetector serverHeapDumpFileDetector) {
this.clientComponentName = clientComponentName;
this.clientId = clientId;
this.commandExecutor = commandExecutor;
Expand All @@ -144,6 +146,7 @@ public class ServerPreparer {
this.serverEnvironmentPreparer = serverEnvironmentPreparer;
this.deviceInfraServiceFlags = deviceInfraServiceFlags;
this.scheduledThreadPool = scheduledThreadPool;
this.serverHeapDumpFileDetector = serverHeapDumpFileDetector;
}

public void startSendingHeartbeats() {
Expand All @@ -159,6 +162,7 @@ public void startSendingHeartbeats() {
.atMostEvery(5, MINUTES)
.with(IMPORTANCE, DEBUG)
.log("Error when sending heartbeat to OLC server");
serverHeapDumpFileDetector.detectHeapDumpExistenceWithGrpcError(e);
}
},
Duration.ZERO,
Expand Down Expand Up @@ -195,21 +199,28 @@ public void prepareOlcServer() throws MobileHarnessException, InterruptedExcepti
// Acquires file lock.
Optional<NonThrowingAutoCloseable> fileUnlocker = lockFile();
try {

// Tries to get server version.
GetVersionResponse version = tryConnectToOlcServer().orElse(null);
if (version != null) {
GetVersionResponse existingServerVersion = tryConnectToOlcServer().orElse(null);
if (existingServerVersion != null) {
if (firstPreparation) {
logger.atInfo().log(
"Connected to existing OLC server, version=[%s]", shortDebugString(version));
"Connected to existing OLC server, version=[%s]",
shortDebugString(existingServerVersion));
}

if (needKillExistingServer(firstPreparation, version)) {
if (needKillExistingServer(firstPreparation, existingServerVersion)) {
killExistingServer(/* forcibly= */ false);
} else {
if (firstPreparation) {
logger.atInfo().log("Using existing OLC server");
checkAndPrintServerVersionWarning(version);
checkAndPrintServerVersionWarning(existingServerVersion);
Optional<String> processWorkingDir =
systemUtil.getProcessWorkingDirectory(existingServerVersion.getProcessId());
if (processWorkingDir.isPresent()) {
// Records the server information.
serverHeapDumpFileDetector.setOlcServerInfo(
existingServerVersion.getProcessId(), processWorkingDir.get());
}
}
return;
}
Expand Down Expand Up @@ -263,6 +274,7 @@ public void prepareOlcServer() throws MobileHarnessException, InterruptedExcepti
SH_COMMAND,
"-c",
Joiner.on(" ").join(startOlcServerCommandBuilder.build())))
.workDir(serverEnvironment.serverWorkingDir())
.timeout(ChronoUnit.YEARS.getDuration())
.redirectStderr(false)
.onStdout(serverOutputLineCallback)
Expand Down Expand Up @@ -290,6 +302,9 @@ public void prepareOlcServer() throws MobileHarnessException, InterruptedExcepti
logger.atInfo().log(
"OLC server started, port=%s, pid=%s",
Flags.instance().olcServerPort.getNonNull(), serverVersion.getProcessId());
// Records the server information.
serverHeapDumpFileDetector.setOlcServerInfo(
serverVersion.getProcessId(), serverEnvironment.serverWorkingDir().toString());
} catch (MobileHarnessException | InterruptedException | RuntimeException | Error e) {
// Kills the wrapper process.
if (serverProcess.isAlive()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import com.google.devtools.mobileharness.api.model.error.MobileHarnessExceptionFactory;
import com.google.devtools.mobileharness.infra.ats.common.olcserver.Annotations.ClientId;
import com.google.devtools.mobileharness.infra.ats.common.olcserver.Annotations.ServerStub;
import com.google.devtools.mobileharness.infra.ats.common.olcserver.ServerHeapDumpFileDetector;
import com.google.devtools.mobileharness.infra.ats.console.controller.proto.SessionPluginProto.AtsSessionPluginConfig;
import com.google.devtools.mobileharness.infra.ats.console.controller.proto.SessionPluginProto.AtsSessionPluginNotification;
import com.google.devtools.mobileharness.infra.ats.console.controller.proto.SessionPluginProto.AtsSessionPluginOutput;
Expand Down Expand Up @@ -115,17 +116,20 @@ public class AtsSessionStub {
private final String clientId;
private final ListeningExecutorService threadPool;
private final Sleeper sleeper;
private final ServerHeapDumpFileDetector serverHeapDumpFileDetector;

@Inject
AtsSessionStub(
@ServerStub(ServerStub.Type.SESSION_SERVICE) Provider<SessionStub> sessionStubProvider,
@ClientId String clientId,
ListeningExecutorService threadPool,
Sleeper sleeper) {
Sleeper sleeper,
ServerHeapDumpFileDetector serverHeapDumpFileDetector) {
this.sessionStubProvider = sessionStubProvider;
this.clientId = clientId;
this.threadPool = threadPool;
this.sleeper = sleeper;
this.serverHeapDumpFileDetector = serverHeapDumpFileDetector;
}

/** Runs a session in OmniLab long-running client. */
Expand All @@ -147,6 +151,7 @@ public ListenableFuture<AtsSessionPluginOutput> runSession(
createSessionResponse =
requireNonNull(sessionStubProvider.get()).createSession(createSessionRequest);
} catch (GrpcExceptionWithErrorId e) {
serverHeapDumpFileDetector.detectHeapDumpExistenceWithGrpcError(e);
return immediateFailedFuture(
new MobileHarnessException(
InfraErrorId.ATSC_SESSION_STUB_CREATE_SESSION_ERROR,
Expand Down Expand Up @@ -186,6 +191,7 @@ public AtsSessionPluginOutput runShortSession(String sessionName, AtsSessionPlug
try {
runSessionResponse = requireNonNull(sessionStubProvider.get()).runSession(runSessionRequest);
} catch (GrpcExceptionWithErrorId e) {
serverHeapDumpFileDetector.detectHeapDumpExistenceWithGrpcError(e);
throw new MobileHarnessException(
InfraErrorId.ATSC_SESSION_STUB_RUN_SESSION_ERROR,
String.format(
Expand Down Expand Up @@ -251,6 +257,7 @@ private GetAllSessionsResponse getAllSessionsByRequest(
try {
return requireNonNull(sessionStubProvider.get()).getAllSessions(getAllSessionsRequest);
} catch (GrpcExceptionWithErrorId e) {
serverHeapDumpFileDetector.detectHeapDumpExistenceWithGrpcError(e);
throw new MobileHarnessException(
InfraErrorId.ATSC_SESSION_STUB_GET_ALL_SESSIONS_ERROR,
String.format(
Expand Down Expand Up @@ -278,6 +285,7 @@ private AbortSessionsResponse abortSessions(AbortSessionsRequest request)
.log("Successfully aborted sessions, response=[%s]", shortDebugString(response));
return response;
} catch (GrpcExceptionWithErrorId e) {
serverHeapDumpFileDetector.detectHeapDumpExistenceWithGrpcError(e);
throw new MobileHarnessException(
InfraErrorId.ATSC_SESSION_STUB_ABORT_SESSION_ERROR,
String.format("Failed to abort sessions, request=[%s]", shortDebugString(request)),
Expand Down Expand Up @@ -329,6 +337,7 @@ private NotifyAllSessionsResponse cancelSessionsByNotification(NotifyAllSessions
shortDebugString(response));
return response;
} catch (GrpcExceptionWithErrorId e) {
serverHeapDumpFileDetector.detectHeapDumpExistenceWithGrpcError(e);
throw new MobileHarnessException(
InfraErrorId.ATSC_SESSION_STUB_CANCEL_UNFINISHED_SESSIONS_ERROR,
String.format(
Expand Down Expand Up @@ -370,6 +379,7 @@ public AtsSessionPluginOutput call() throws MobileHarnessException, InterruptedE
}
} while (!sessionStatus.equals(SessionStatus.SESSION_FINISHED));
} catch (GrpcExceptionWithErrorId e) {
serverHeapDumpFileDetector.detectHeapDumpExistenceWithGrpcError(e);
throw new MobileHarnessException(
InfraErrorId.ATSC_SESSION_STUB_GET_SESSION_STATUS_ERROR,
String.format(
Expand All @@ -385,6 +395,7 @@ public AtsSessionPluginOutput call() throws MobileHarnessException, InterruptedE
.getSession(GetSessionRequest.newBuilder().setSessionId(sessionId).build())
.getSessionDetail();
} catch (GrpcExceptionWithErrorId e) {
serverHeapDumpFileDetector.detectHeapDumpExistenceWithGrpcError(e);
throw new MobileHarnessException(
InfraErrorId.ATSC_SESSION_STUB_GET_SESSION_RESULT_ERROR,
String.format(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ java_library(
"//src/java/com/google/devtools/mobileharness/api/model/error",
"//src/java/com/google/devtools/mobileharness/api/model/error:exception_factory",
"//src/java/com/google/devtools/mobileharness/infra/ats/common/olcserver:annotations",
"//src/java/com/google/devtools/mobileharness/infra/ats/common/olcserver:server_heap_dump_file_detector",
"//src/java/com/google/devtools/mobileharness/infra/ats/console/controller/sessionplugin:ats_session_plugin_config_output",
"//src/java/com/google/devtools/mobileharness/infra/client/longrunningservice/constant:session_properties",
"//src/java/com/google/devtools/mobileharness/infra/client/longrunningservice/rpc/stub:session_stub",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ public ServerEnvironment prepareServerEnvironment()
Path xtsRootDir = consoleInfo.getXtsRootDirectoryNonEmpty();
Path initialServerBinary = Path.of(Flags.instance().atsConsoleOlcServerPath.getNonNull());

// Prepare olc server work dir.
Path serverWorkDir = Path.of(Flags.instance().xtsResDirRoot.getNonNull(), "olc_server_work");
localFileUtil.prepareDir(serverWorkDir);

if (Flags.instance().atsConsoleOlcServerCopyServerResource.getNonNull()) {
// Prepares server resource dir.
Path serverResDir = Path.of(Flags.instance().xtsResDirRoot.getNonNull(), "olc_server_res");
Expand Down Expand Up @@ -111,7 +115,7 @@ public ServerEnvironment prepareServerEnvironment()
// Checks files.
localFileUtil.checkFile(serverBinary);
localFileUtil.checkFile(javaBinary);
return ServerEnvironment.of(serverBinary, javaBinary);
return ServerEnvironment.of(serverBinary, javaBinary, serverWorkDir);
}

/** Grants full access to a file/dir and ignores errors if any. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,10 @@ protected void configure() {
@Singleton
ServerEnvironmentPreparer provideServerEnvironmentPreparer(SystemUtil systemUtil) {
return new NoOpServerEnvironmentPreparer(
ServerEnvironment.of(olcServerBinary, Path.of(systemUtil.getJavaBin())));
ServerEnvironment.of(
olcServerBinary,
Path.of(systemUtil.getJavaBin()),
Path.of(System.getProperty("user.dir"))));
}

@Provides
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.ListMultimap;
import com.google.common.flogger.FluentLogger;
import com.google.common.graph.Traverser;
Expand Down Expand Up @@ -1301,4 +1302,24 @@ public Optional<String> getUbuntuVersion() throws InterruptedException {
return Optional.empty();
}
}

/** Returns the working directory of the process with the given process id. */
public Optional<String> getProcessWorkingDirectory(long processId) throws InterruptedException {
if (!isOnLinux()) {
return Optional.empty();
}

String output = null;
try {
output = executor.run(Command.of("pwdx", String.valueOf(processId)));
// Parse the working directory from the output.
// Ideally, the output has the format of "<pid>: <working directory>" like "12345: /tmp/olc".
return Optional.of(Iterables.get(Splitter.on(' ').split(output), 1));
} catch (CommandException | IndexOutOfBoundsException e) {
logger.atWarning().log(
"Failed to get process working directory for process id %d with output [%s]",
processId, String.valueOf(output));
return Optional.empty();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,9 @@ public void setUp() {
new ConsoleCommandTestModule(
consoleInfo,
ServerEnvironment.of(
Path.of("/fake_server_binary"), Path.of("/fake_java_binary")))))
Path.of("/fake_server_binary"),
Path.of("/fake_java_binary"),
Path.of("/fake_working_dir")))))
.create(RunCommand.class);
commandLine =
new CommandLine(runCommand)
Expand Down
Loading
Loading