blob: 415ef0e6d4406e77f20ef4cb4e069f15f5ea3c2c [file] [log] [blame]
// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "vm_tools/concierge/virtual_machine.h"
#include <arpa/inet.h>
#include <linux/capability.h>
#include <signal.h>
#include <sys/wait.h>
#include <unistd.h>
#include <utility>
#include <base/bind.h>
#include <base/files/file.h>
#include <base/files/file_util.h>
#include <base/guid.h>
#include <base/logging.h>
#include <base/memory/ptr_util.h>
#include <base/strings/stringprintf.h>
#include <base/sys_info.h>
#include <base/time/time.h>
#include <google/protobuf/repeated_field.h>
#include <grpc++/grpc++.h>
#include "vm_tools/common/constants.h"
#include "vm_tools/concierge/tap_device_builder.h"
using std::string;
namespace vm_tools {
namespace concierge {
namespace {
using ProcessExitBehavior = VirtualMachine::ProcessExitBehavior;
using ProcessStatus = VirtualMachine::ProcessStatus;
using Subnet = SubnetPool::Subnet;
// Path to the crosvm binary.
constexpr char kCrosvmBin[] = "/usr/bin/crosvm";
// Name of the control socket used for controlling crosvm.
constexpr char kCrosvmSocket[] = "crosvm.sock";
// Path to the logger(1) binary.
constexpr char kLoggerBin[] = "/usr/bin/logger";
// Path to the wayland socket.
constexpr char kWaylandSocket[] = "/run/chrome/wayland-0";
// How long to wait before timing out on shutdown RPCs.
constexpr int64_t kShutdownTimeoutSeconds = 30;
// How long to wait before timing out on StartTermina RPCs.
constexpr int64_t kStartTerminaTimeoutSeconds = 150;
// How long to wait before timing out on regular RPCs.
constexpr int64_t kDefaultTimeoutSeconds = 10;
// How long to wait before timing out on child process exits.
constexpr base::TimeDelta kChildExitTimeout = base::TimeDelta::FromSeconds(10);
// Offset in a subnet of the gateway/host.
constexpr size_t kHostAddressOffset = 0;
// Offset in a subnet of the client/guest.
constexpr size_t kGuestAddressOffset = 1;
// Calculates the amount of memory to give the virtual machine. Currently
// configured to provide 75% of system memory. This is deliberately over
// provisioned with the expectation that we will use the balloon driver to
// reduce the actual memory footprint.
string GetVmMemoryMiB() {
int64_t vm_memory_mb = base::SysInfo::AmountOfPhysicalMemoryMB();
vm_memory_mb /= 4;
vm_memory_mb *= 3;
return std::to_string(vm_memory_mb);
}
// Sets the pgid of the current process to its pid. This is needed because
// crosvm assumes that only it and its children are in the same process group
// and indiscriminately sends a SIGKILL if it needs to shut them down.
bool SetPgid() {
if (setpgid(0, 0) != 0) {
PLOG(ERROR) << "Failed to change process group id";
return false;
}
return true;
}
// Waits for the |pid| to exit. Returns true if |pid| successfully exited and
// false if it did not exit in time.
bool WaitForChild(pid_t child, base::TimeDelta timeout) {
sigset_t set;
sigemptyset(&set);
sigaddset(&set, SIGCHLD);
const base::Time deadline = base::Time::Now() + timeout;
while (true) {
pid_t ret = waitpid(child, nullptr, WNOHANG);
if (ret == child || (ret < 0 && errno == ECHILD)) {
// Either the child exited or it doesn't exist anymore.
return true;
}
// ret == 0 means that the child is still alive
if (ret < 0) {
PLOG(ERROR) << "Failed to wait for child process";
return false;
}
base::Time now = base::Time::Now();
if (deadline <= now) {
// Timed out.
return false;
}
const struct timespec ts = (deadline - now).ToTimeSpec();
if (sigtimedwait(&set, nullptr, &ts) < 0 && errno == EAGAIN) {
// Timed out.
return false;
}
}
}
} // namespace
VirtualMachine::VirtualMachine(MacAddress mac_addr,
std::unique_ptr<Subnet> subnet,
uint32_t vsock_cid,
base::FilePath runtime_dir)
: mac_addr_(std::move(mac_addr)),
subnet_(std::move(subnet)),
vsock_cid_(vsock_cid) {
CHECK(subnet_);
CHECK(base::DirectoryExists(runtime_dir));
// Take ownership of the runtime directory.
CHECK(runtime_dir_.Set(runtime_dir));
}
VirtualMachine::~VirtualMachine() {
Shutdown();
}
std::unique_ptr<VirtualMachine> VirtualMachine::Create(
base::FilePath kernel,
base::FilePath rootfs,
std::vector<VirtualMachine::Disk> disks,
MacAddress mac_addr,
std::unique_ptr<Subnet> subnet,
uint32_t vsock_cid,
base::FilePath runtime_dir) {
auto vm = base::WrapUnique(new VirtualMachine(std::move(mac_addr),
std::move(subnet), vsock_cid,
std::move(runtime_dir)));
if (!vm->Start(std::move(kernel), std::move(rootfs), std::move(disks))) {
vm.reset();
}
return vm;
}
bool VirtualMachine::Start(base::FilePath kernel,
base::FilePath rootfs,
std::vector<VirtualMachine::Disk> disks) {
// Set up the tap device.
base::ScopedFD tap_fd =
BuildTapDevice(mac_addr_, GatewayAddress(), Netmask());
if (!tap_fd.is_valid()) {
LOG(ERROR) << "Unable to build and configure TAP device";
return false;
}
// Build up the process arguments.
// clang-format off
std::vector<string> args = {
kCrosvmBin, "run",
"--cpus", std::to_string(base::SysInfo::NumberOfProcessors()),
"--mem", GetVmMemoryMiB(),
"--root", rootfs.value(),
"--tap-fd", std::to_string(tap_fd.get()),
"--cid", std::to_string(vsock_cid_),
"--socket", runtime_dir_.GetPath().Append(kCrosvmSocket).value(),
"--wayland-sock", kWaylandSocket,
"--wayland-dmabuf",
};
// clang-format on
// Add any extra disks.
for (const auto& disk : disks) {
if (disk.writable) {
if (disk.image_type == VirtualMachine::DiskImageType::RAW)
args.emplace_back("--rwdisk");
else
args.emplace_back("--rwqcow");
} else {
if (disk.image_type == VirtualMachine::DiskImageType::RAW)
args.emplace_back("--disk");
else
args.emplace_back("--qcow");
}
args.emplace_back(disk.path.value());
}
// Finally list the path to the kernel.
args.emplace_back(kernel.value());
// Put everything into the brillo::ProcessImpl.
for (string& arg : args) {
process_.AddArg(std::move(arg));
}
// Change the process group before exec so that crosvm sending SIGKILL to the
// whole process group doesn't kill us as well.
process_.SetPreExecCallback(base::Bind(&SetPgid));
// Redirect STDOUT to a pipe.
process_.RedirectUsingPipe(STDOUT_FILENO, false /* is_input */);
if (!process_.Start()) {
LOG(ERROR) << "Failed to start VM process";
return false;
}
// Setup kernel logger process.
// Setup logger arguments.
std::vector<string> logger_args = {
kLoggerBin,
// Host syslog deamon requires priority to be set.
"-p", "auth.info", "--skip-empty",
// Tag each to specify the VM number.
"--tag", base::StringPrintf("VM(%u)", vsock_cid_),
};
for (string& arg : logger_args) {
logger_process_.AddArg(std::move(arg));
}
// Bind crosvm's output pipe to the logger's input pipe.
logger_process_.BindFd(process_.GetPipe(STDOUT_FILENO), STDIN_FILENO);
// If the Logger file fails to start, just leave a warning.
if (!logger_process_.Start()) {
LOG(ERROR) << "Failed to start the logger process for VM " << vsock_cid_;
}
// Create a stub for talking to the maitre'd instance inside the VM.
stub_ = std::make_unique<vm_tools::Maitred::Stub>(grpc::CreateChannel(
base::StringPrintf("vsock:%u:%u", vsock_cid_, vm_tools::kMaitredPort),
grpc::InsecureChannelCredentials()));
return true;
}
bool VirtualMachine::Shutdown() {
// Do a sanity check here to make sure the process is still around. It may
// have crashed and we don't want to be waiting around for an RPC response
// that's never going to come. kill with a signal value of 0 is explicitly
// documented as a way to check for the existence of a process.
if (process_.pid() == 0 || (kill(process_.pid(), 0) < 0 && errno == ESRCH)) {
// The process is already gone.
process_.Release();
return true;
}
grpc::ClientContext ctx;
ctx.set_deadline(gpr_time_add(
gpr_now(GPR_CLOCK_MONOTONIC),
gpr_time_from_seconds(kShutdownTimeoutSeconds, GPR_TIMESPAN)));
vm_tools::EmptyMessage empty;
grpc::Status status = stub_->Shutdown(&ctx, empty, &empty);
// brillo::ProcessImpl doesn't provide a timed wait function and while the
// Shutdown RPC may have been successful we can't really trust crosvm to
// actually exit. This may result in an untimed wait() blocking indefinitely.
// Instead, do a timed wait here and only return success if the process
// _actually_ exited as reported by the kernel, which is really the only
// thing we can trust here.
if (status.ok() && WaitForChild(process_.pid(), kChildExitTimeout)) {
process_.Release();
return true;
}
LOG(WARNING) << "Shutdown RPC failed for VM " << vsock_cid_ << " with error "
<< "code " << status.error_code() << ": "
<< status.error_message();
// Try to shut it down via the crosvm socket.
brillo::ProcessImpl crosvm;
crosvm.AddArg(kCrosvmBin);
crosvm.AddArg("stop");
crosvm.AddArg(runtime_dir_.GetPath().Append(kCrosvmSocket).value());
crosvm.Run();
// We can't actually trust the exit codes that crosvm gives us so just see if
// it exited.
if (WaitForChild(process_.pid(), kChildExitTimeout)) {
process_.Release();
return true;
}
LOG(WARNING) << "Failed to stop VM " << vsock_cid_ << " via crosvm socket";
// Kill the process with SIGTERM.
if (process_.Kill(SIGTERM, kChildExitTimeout.InSeconds())) {
return true;
}
LOG(WARNING) << "Failed to kill VM " << vsock_cid_ << " with SIGTERM";
// Kill it with fire.
if (process_.Kill(SIGKILL, kChildExitTimeout.InSeconds())) {
return true;
}
LOG(ERROR) << "Failed to kill VM " << vsock_cid_ << " with SIGKILL";
return false;
}
bool VirtualMachine::LaunchProcess(std::vector<string> args,
std::map<string, string> env,
bool respawn,
bool wait_for_exit,
int64_t timeout_seconds) {
CHECK(!args.empty());
DCHECK(!(respawn && wait_for_exit));
LOG(INFO) << "Launching " << args[0] << " inside VM " << vsock_cid_;
vm_tools::LaunchProcessRequest request;
vm_tools::LaunchProcessResponse response;
google::protobuf::RepeatedPtrField<string> argv(args.begin(), args.end());
request.mutable_argv()->Swap(&argv);
google::protobuf::Map<string, string> environ(env.begin(), env.end());
request.mutable_env()->swap(environ);
request.set_respawn(respawn);
request.set_wait_for_exit(wait_for_exit);
grpc::ClientContext ctx;
ctx.set_deadline(
gpr_time_add(gpr_now(GPR_CLOCK_MONOTONIC),
gpr_time_from_seconds(timeout_seconds, GPR_TIMESPAN)));
grpc::Status status = stub_->LaunchProcess(&ctx, request, &response);
if (!status.ok()) {
LOG(ERROR) << "Failed to launch " << args[0] << ": "
<< status.error_message();
return false;
}
// If waiting for the process to exit, a success means the process had to
// return 0. Otherwise, just check that the process launched successfully.
if (response.status() == vm_tools::EXITED && wait_for_exit) {
return response.code() == 0;
} else if (response.status() == vm_tools::LAUNCHED && !wait_for_exit) {
return true;
}
return false;
}
bool VirtualMachine::StartProcess(std::vector<string> args,
std::map<string, string> env,
ProcessExitBehavior exit_behavior) {
return LaunchProcess(std::move(args), std::move(env),
exit_behavior == ProcessExitBehavior::RESPAWN_ON_EXIT,
false, kDefaultTimeoutSeconds);
}
bool VirtualMachine::RunProcess(std::vector<string> args,
std::map<string, string> env) {
return LaunchProcess(std::move(args), std::move(env), false, true,
kDefaultTimeoutSeconds);
}
bool VirtualMachine::RunProcessWithTimeout(std::vector<string> args,
std::map<string, string> env,
base::TimeDelta timeout) {
return LaunchProcess(std::move(args), std::move(env), false, true,
timeout.InSeconds());
}
bool VirtualMachine::ConfigureNetwork() {
LOG(INFO) << "Configuring network for VM " << vsock_cid_;
vm_tools::NetworkConfigRequest request;
vm_tools::EmptyMessage response;
vm_tools::IPv4Config* config = request.mutable_ipv4_config();
config->set_address(IPv4Address());
config->set_gateway(GatewayAddress());
config->set_netmask(Netmask());
grpc::ClientContext ctx;
ctx.set_deadline(gpr_time_add(
gpr_now(GPR_CLOCK_MONOTONIC),
gpr_time_from_seconds(kDefaultTimeoutSeconds, GPR_TIMESPAN)));
grpc::Status status = stub_->ConfigureNetwork(&ctx, request, &response);
if (!status.ok()) {
LOG(ERROR) << "Failed to configure network for VM " << vsock_cid_ << ": "
<< status.error_message();
return false;
}
return true;
}
bool VirtualMachine::Mount(string source,
string target,
string fstype,
uint64_t mountflags,
string options) {
LOG(INFO) << "Mounting " << source << " on " << target << " inside VM "
<< vsock_cid_;
vm_tools::MountRequest request;
vm_tools::MountResponse response;
request.mutable_source()->swap(source);
request.mutable_target()->swap(target);
request.mutable_fstype()->swap(fstype);
request.set_mountflags(mountflags);
request.mutable_options()->swap(options);
grpc::ClientContext ctx;
ctx.set_deadline(gpr_time_add(
gpr_now(GPR_CLOCK_MONOTONIC),
gpr_time_from_seconds(kDefaultTimeoutSeconds, GPR_TIMESPAN)));
grpc::Status status = stub_->Mount(&ctx, request, &response);
if (!status.ok() || response.error() != 0) {
LOG(ERROR) << "Failed to mount " << request.source() << " on "
<< request.target() << " inside VM " << vsock_cid_ << ": "
<< (status.ok() ? strerror(response.error())
: status.error_message());
return false;
}
return true;
}
bool VirtualMachine::StartTermina(std::string lxd_subnet,
std::string* out_error) {
vm_tools::StartTerminaRequest request;
vm_tools::StartTerminaResponse response;
request.set_tremplin_ipv4_address(GatewayAddress());
request.mutable_lxd_ipv4_subnet()->swap(lxd_subnet);
grpc::ClientContext ctx;
ctx.set_deadline(gpr_time_add(
gpr_now(GPR_CLOCK_MONOTONIC),
gpr_time_from_seconds(kStartTerminaTimeoutSeconds, GPR_TIMESPAN)));
grpc::Status status = stub_->StartTermina(&ctx, request, &response);
if (!status.ok()) {
LOG(ERROR) << "Failed to start Termina: " << status.error_message();
out_error->assign(status.error_message());
return false;
}
return true;
}
void VirtualMachine::SetContainerSubnet(
std::unique_ptr<SubnetPool::Subnet> subnet) {
container_subnet_ = std::move(subnet);
}
uint32_t VirtualMachine::GatewayAddress() const {
return subnet_->AddressAtOffset(kHostAddressOffset);
}
uint32_t VirtualMachine::IPv4Address() const {
return subnet_->AddressAtOffset(kGuestAddressOffset);
}
uint32_t VirtualMachine::Netmask() const {
return subnet_->Netmask();
}
uint32_t VirtualMachine::ContainerNetmask() const {
if (container_subnet_)
return container_subnet_->Netmask();
return INADDR_ANY;
}
size_t VirtualMachine::ContainerPrefix() const {
if (container_subnet_)
return container_subnet_->Prefix();
return 0;
}
uint32_t VirtualMachine::ContainerSubnet() const {
if (container_subnet_)
return container_subnet_->AddressAtOffset(0);
return INADDR_ANY;
}
void VirtualMachine::set_stub_for_testing(
std::unique_ptr<vm_tools::Maitred::Stub> stub) {
stub_ = std::move(stub);
}
std::unique_ptr<VirtualMachine> VirtualMachine::CreateForTesting(
MacAddress mac_addr,
std::unique_ptr<SubnetPool::Subnet> subnet,
uint32_t vsock_cid,
base::FilePath runtime_dir,
std::unique_ptr<vm_tools::Maitred::Stub> stub) {
auto vm = base::WrapUnique(new VirtualMachine(std::move(mac_addr),
std::move(subnet), vsock_cid,
std::move(runtime_dir)));
vm->set_stub_for_testing(std::move(stub));
return vm;
}
} // namespace concierge
} // namespace vm_tools