From 4f44af44493a5d0b5739949827c1ea8e47bbf0c2 Mon Sep 17 00:00:00 2001 From: Valentin Haudiquet Date: Tue, 16 Jun 2026 19:16:34 +0200 Subject: [PATCH] feat: add --kernel flag for QEMU system emulation mode Add --kernel option to boot extracted rootfs in a QEMU virtual machine instead of namespace/chroot mode. The rootfs is converted to an ext4 disk image using mke2fs and booted with the provided kernel. --- Cargo.lock | 31 +++++ Cargo.toml | 1 + README.md | 27 ++++ SPEC.md | 66 ++++++++++ src/cli.rs | 8 ++ src/main.rs | 57 ++++++-- src/qemu_vm.rs | 345 +++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 526 insertions(+), 9 deletions(-) create mode 100644 src/qemu_vm.rs diff --git a/Cargo.lock b/Cargo.lock index 4040ca5..ec0da83 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -310,10 +310,17 @@ dependencies = [ "tempfile", "tokio", "users", + "which", "xz2", "zstd", ] +[[package]] +name = "either" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + [[package]] name = "encode_unicode" version = "1.0.0" @@ -329,6 +336,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "env_home" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f84e12ccf0a7ddc17a6c41c93326024c42920d7ee630d04950e6926645c0fe" + [[package]] name = "equivalent" version = "1.0.2" @@ -1997,6 +2010,18 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "which" +version = "7.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d643ce3fd3e5b54854602a080f34fb10ab75e0b813ee32d00ca2b44fa74762" +dependencies = [ + "either", + "env_home", + "rustix", + "winsafe", +] + [[package]] name = "winapi-util" version = "0.1.11" @@ -2197,6 +2222,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winsafe" +version = "0.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904" + [[package]] name = "wit-bindgen" version = "0.51.0" diff --git a/Cargo.toml b/Cargo.toml index aae7d0f..47b798c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ anyhow = "1" # Utilities dirs = "6" +which = "7" tokio = { version = "1", features = ["rt-multi-thread", "macros", "io-util"] } futures-util = "0.3" indicatif = "0.18" diff --git a/README.md b/README.md index bce5dd4..1946627 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,8 @@ ecr [OPTIONS] [-- COMMAND...] | `--no-cache` | Force a fresh download, bypassing the cache | | `-v, --verbose` | Print diagnostic output (URLs, layer info, extraction steps) | | `-a, --arch ` | Target architecture (`amd64`, `arm64`, `armhf`, `riscv64`, …) | +| `--kernel ` | Boot with QEMU system emulation using specified kernel | +| `-m, --memory ` | Memory for QEMU VM (default: 2G, only with `--kernel`) | ## Examples @@ -56,8 +58,33 @@ ecr --arch arm64 alpine -- uname -m # Always pull a fresh image ecr --no-cache fedora + +# Boot with QEMU system emulation (requires qemu-system- and e2fsprogs) +ecr --kernel /boot/vmlinuz ubuntu + +# Boot with custom memory +ecr --kernel /boot/vmlinuz --memory 4G alpine ``` +## QEMU System Mode + +When `--kernel` is specified, ecr boots the rootfs in a full QEMU virtual machine instead of using namespaces: + +```sh +ecr --kernel /boot/vmlinuz alpine +``` + +This mode: +- Creates an ext4 disk image from the rootfs +- Boots QEMU with your kernel +- Provides full VM isolation +- Works for any architecture (no binfmt_misc needed) + +Requirements: +- `qemu-system-` installed +- `e2fsprogs` for disk image creation +- Kernel with virtio support + ## Supported distributions | Name | Source | Version examples | diff --git a/SPEC.md 
b/SPEC.md index 085fbd2..da3b1ba 100644 --- a/SPEC.md +++ b/SPEC.md @@ -22,6 +22,9 @@ ecr [OPTIONS] -- [COMMAND]... | `--bind-rw ` | none | Read-write bind mount at `/mnt/` (can be specified multiple times, overrides `--bind` for same path) | | `--no-cache` | false | Download fresh tarball, ignore cache | | `--no-bind` | false | Skip mounting any directory | +| `--kernel <KERNEL_PATH>` | none | Boot with QEMU system emulation using specified kernel (triggers disk image creation) | +| `-m, --memory <SIZE>` | 2G | Memory size for QEMU VM (only used with `--kernel`) | +| `-v, --verbose` | false | Print diagnostic messages | | `-h, --help` | - | Show help | | `-V, --version` | - | Show version | @@ -182,6 +185,69 @@ Install QEMU user emulation: No action required. Modern qemu-user-static packages register binfmt_misc with the `F` (fix binary) flag, loading the interpreter into kernel memory. The kernel handles foreign binary execution transparently. +## QEMU System Emulation Mode + +When `--kernel` is specified, ecr switches from namespace/chroot mode to QEMU system emulation. The extracted rootfs is converted to a disk image and booted with the provided kernel. + +### Usage + +```sh +ecr --kernel /boot/vmlinuz ubuntu:noble +ecr --kernel /boot/vmlinuz --memory 4G alpine +ecr --kernel /boot/vmlinuz debian -- /bin/sh -c "echo hello" +``` + +### Execution Flow + +1. Download/cache rootfs tarball (same as namespace mode) +2. Extract tarball to temporary directory +3. Create ext4 disk image from rootfs using `mke2fs -d` (requires `e2fsprogs`) +4. Launch QEMU with: + - `-kernel <KERNEL_PATH>` - provided kernel + - `-append "root=/dev/vda rw console=ttyS0"` - kernel command line + - `-m <SIZE>` - memory size (default 2G) + - `-display none -serial mon:stdio` - no graphical display; serial console (with the QEMU monitor multiplexed) on stdio + - `-drive file=rootfs.img,format=raw,if=virtio` - rootfs disk + - `-netdev user,id=net0 -device virtio-net-pci,netdev=net0` - network +5. Wait for QEMU to exit +6.
Cleanup temporary files + +### Disk Image Creation + +The rootfs directory is converted to an ext4 disk image using `mke2fs -t ext4 -d <rootfs_dir> <image>`. This requires the `e2fsprogs` package: + +- Ubuntu/Debian: `sudo apt install e2fsprogs` +- Arch: `sudo pacman -S e2fsprogs` +- Alpine: `sudo apk add e2fsprogs` + +### Architecture Support + +| ecr Arch | QEMU System Binary | +|----------|-------------------| +| amd64/x86_64 | qemu-system-x86_64 | +| arm64/aarch64 | qemu-system-aarch64 | +| armhf/armv7 | qemu-system-arm | +| riscv64 | qemu-system-riscv64 | +| ppc64el | qemu-system-ppc64 | +| s390x | qemu-system-s390x | + +### Requirements + +- QEMU system emulator installed (`qemu-system-<arch>`) +- `e2fsprogs` for disk image creation +- Kernel with virtio support (for disk and network drivers) + +### Differences from Namespace Mode + +| Feature | Namespace Mode | QEMU Mode | +|---------|---------------|-----------| +| Isolation | User namespace | Full VM | +| Performance | Near-native | Emulated (slower) | +| Root access | No | No | +| Foreign arch | binfmt_misc required | Built-in emulation | +| Bind mounts | Overlay/bind | Not supported | +| Network | Host network | User-mode network | + ## File Handling ### Overlay Mount (Default) diff --git a/src/cli.rs b/src/cli.rs index 8e7d353..11cf837 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -33,6 +33,14 @@ pub struct Args { #[arg(short = 'v', long)] pub verbose: bool, + /// Boot with QEMU system emulation using specified kernel (extracts rootfs as disk image) + #[arg(long, value_name = "KERNEL_PATH")] + pub kernel: Option, + + /// Memory size for QEMU VM (only used with --kernel, e.g., 512M, 2G) + #[arg(short = 'm', long, default_value = "2G", value_name = "SIZE")] + pub memory: String, + /// Command to run inside the chroot (default: interactive shell) #[arg( trailing_var_arg = true, diff --git a/src/main.rs b/src/main.rs index c50c9e7..5c70e50 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ mod extract; mod mount; mod
namespace; mod qemu; +mod qemu_vm; mod verbose; /// Print to stderr only when --verbose / -v is active. @@ -113,11 +114,56 @@ fn main() -> Result<()> { veprintln!("Using cached tarball: {}", cache_path.display()); } - // Check QEMU if foreign architecture - if arch != host_arch { + // Check QEMU if foreign architecture (for namespace mode) + // For VM mode, we don't need binfmt_misc since we're using system emulation + if args.kernel.is_none() && arch != host_arch { qemu::check_binfmt(&arch)?; } + // Create temp directory for extraction + let temp_dir = tempfile::tempdir()?; + let rootfs = temp_dir.path().to_path_buf(); + + veprintln!("Extracting to: {}", rootfs.display()); + extract_tarball(&cache_path, &rootfs)?; + + // Branch based on --kernel flag + if let Some(kernel_path) = &args.kernel { + // QEMU system mode + veprintln!("QEMU mode: booting with kernel {}", kernel_path.display()); + + let command = if args.command.is_empty() { + None + } else { + Some(args.command.clone()) + }; + + let result = qemu_vm::launch_qemu(qemu_vm::QemuConfig { + kernel_path: kernel_path.clone(), + rootfs_path: rootfs, + memory: args.memory.clone(), + arch: arch.clone(), + command, + }); + + // Cleanup happens automatically via tempfile + if result.is_ok() { + veprintln!("Cleanup complete."); + } + + result + } else { + // Namespace/chroot mode + namespace_mode(args, rootfs, config) + } +} + +/// Run in namespace/chroot mode +fn namespace_mode( + args: Args, + rootfs: std::path::PathBuf, + config: Config, +) -> Result<()> { // Check user namespace availability namespace::check_user_namespace()?; @@ -140,13 +186,6 @@ fn main() -> Result<()> { } let bind_rw_paths: Vec = args.bind_rw.clone(); - // Create temp directory for extraction - let temp_dir = tempfile::tempdir()?; - let rootfs = temp_dir.path().to_path_buf(); - - veprintln!("Extracting to: {}", rootfs.display()); - extract_tarball(&cache_path, &rootfs)?; - // Prepare data for the closure let bind_paths_clone = 
bind_paths.clone(); let bind_rw_paths_clone = bind_rw_paths.clone();
diff --git a/src/qemu_vm.rs b/src/qemu_vm.rs
new file mode 100644
index 0000000..d6f34e4
--- /dev/null
+++ b/src/qemu_vm.rs
@@ -0,0 +1,473 @@
+use crate::veprintln;
+use anyhow::{anyhow, bail, Context, Result};
+use std::fs::OpenOptions;
+use std::io::Write;
+use std::path::{Path, PathBuf};
+use std::process::{Command, Stdio};
+
+/// Lower bound for the image size, so tiny rootfs trees still leave room for ext4 metadata.
+const MIN_IMAGE_SIZE: u64 = 64 * 1024 * 1024;
+
+/// QEMU system emulation configuration
+pub struct QemuConfig {
+    /// Path to the kernel image (vmlinuz)
+    pub kernel_path: PathBuf,
+    /// Path to the extracted rootfs directory
+    pub rootfs_path: PathBuf,
+    /// Memory size for the VM, passed to QEMU's `-m` (e.g., "2G", "512M")
+    pub memory: String,
+    /// Target architecture
+    pub arch: String,
+    /// Optional command (argv) to run instead of the default interactive shell
+    pub command: Option<Vec<String>>,
+}
+
+/// Deletes the disk image when dropped, so every exit path of `launch_qemu`
+/// (including early `?` returns) cleans up after itself.
+struct DiskImageGuard(PathBuf);
+
+impl Drop for DiskImageGuard {
+    fn drop(&mut self) {
+        if let Err(e) = std::fs::remove_file(&self.0) {
+            veprintln!("Warning: failed to cleanup disk image: {}", e);
+        }
+    }
+}
+
+/// Boot the rootfs described by `config` in a QEMU virtual machine.
+///
+/// Validates the inputs, locates the `qemu-system-*` binary, builds an ext4
+/// image from the rootfs, runs QEMU with the serial console on stdio and
+/// removes the image once QEMU has exited.
+///
+/// # Errors
+///
+/// Fails if the kernel or rootfs path does not exist, the QEMU binary is not
+/// on `PATH`, the command cannot be encoded on the kernel command line, the
+/// disk image cannot be created, or QEMU cannot be started or exits
+/// unsuccessfully.
+pub fn launch_qemu(config: QemuConfig) -> Result<()> {
+    if !config.kernel_path.exists() {
+        bail!("Kernel not found: {}", config.kernel_path.display());
+    }
+    if !config.rootfs_path.exists() {
+        bail!("Rootfs not found: {}", config.rootfs_path.display());
+    }
+
+    // Resolve the emulator and the kernel command line before building the
+    // image, so a cheap-to-detect failure neither costs an image build nor
+    // leaves an image behind.
+    let qemu_bin = qemu_binary_for_arch(&config.arch);
+    which::which(&qemu_bin).with_context(|| {
+        let pkg = get_arch_package_suffix(&config.arch);
+        format!(
+            "QEMU system emulator '{}' not found. Install it with:\n\
+             Ubuntu/Debian: sudo apt install qemu-system-{}\n\
+             Arch: sudo pacman -S qemu-system-{}\n\
+             Alpine: sudo apk add qemu-system-{}",
+            qemu_bin, pkg, pkg, pkg
+        )
+    })?;
+    let kernel_append = build_kernel_cmdline(&config.arch, config.command.as_deref())?;
+
+    let disk_image = DiskImageGuard(create_disk_image(&config.rootfs_path)?);
+
+    veprintln!("Launching QEMU: {}", qemu_bin);
+    veprintln!("  Kernel: {}", config.kernel_path.display());
+    veprintln!("  Disk image: {}", disk_image.0.display());
+    veprintln!("  Memory: {}", config.memory);
+    veprintln!("  Kernel append: {}", kernel_append);
+
+    // QEMU splits -drive options on ',', so a literal comma in the path has
+    // to be doubled.
+    let drive = format!(
+        "file={},format=raw,if=virtio",
+        disk_image.0.to_string_lossy().replace(',', ",,")
+    );
+
+    // -display none suppresses VGA/BIOS output
+    // -serial mon:stdio connects serial console to terminal with QEMU monitor muxed
+    let status = Command::new(&qemu_bin)
+        .args(machine_args_for_arch(&config.arch))
+        .arg("-kernel")
+        .arg(&config.kernel_path)
+        .arg("-append")
+        .arg(&kernel_append)
+        .arg("-m")
+        .arg(&config.memory)
+        .args(["-display", "none", "-serial", "mon:stdio"])
+        .arg("-drive")
+        .arg(&drive)
+        .args(["-netdev", "user,id=net0"])
+        .args(["-device", "virtio-net-pci,netdev=net0"])
+        .status()
+        .context("Failed to execute QEMU")?;
+
+    if !status.success() {
+        // `ExitStatus` displays exit codes and terminating signals alike.
+        bail!("QEMU did not exit successfully ({})", status);
+    }
+
+    Ok(())
+}
+
+/// Get QEMU system binary name for architecture
+fn qemu_binary_for_arch(arch: &str) -> String {
+    let target = match arch {
+        "amd64" | "x86_64" => "x86_64",
+        "arm64" | "aarch64" => "aarch64",
+        "armhf" | "armv7" => "arm",
+        "ppc64el" | "ppc64le" => "ppc64",
+        // riscv64, s390x and unrecognised names are used as the QEMU target as-is
+        other => other,
+    };
+    format!("qemu-system-{}", target)
+}
+
+/// Get architecture suffix for package names
+fn get_arch_package_suffix(arch: &str) -> &str {
+    match arch {
+        "amd64" | "x86_64" => "x86",
+        "arm64" | "aarch64" => "aarch64",
+        "armhf" | "armv7" => "arm",
+        "ppc64el" | "ppc64le" => "ppc",
+        // riscv64, s390x and unrecognised names are used as-is
+        other => other,
+    }
+}
+
+/// Extra QEMU arguments that select a bootable machine model for `arch`.
+fn machine_args_for_arch(arch: &str) -> &'static [&'static str] {
+    match arch {
+        // qemu-system-aarch64 has no default machine, and the default CPU of
+        // "virt" is 32-bit, so a 64-bit CPU model has to be requested.
+        "arm64" | "aarch64" => &["-machine", "virt", "-cpu", "max"],
+        // "virt" is the generic virtio platform for 32-bit ARM and RISC-V.
+        "armhf" | "armv7" | "riscv64" => &["-machine", "virt"],
+        // NOTE(review): other targets rely on QEMU's default machine; s390x
+        // may additionally need CCW instead of PCI devices - confirm.
+        _ => &[],
+    }
+}
+
+/// Name of the serial console Linux uses on the machine chosen for `arch`.
+fn console_for_arch(arch: &str) -> &'static str {
+    match arch {
+        // The ARM "virt" machine exposes a PL011 UART, which Linux calls ttyAMA0.
+        "arm64" | "aarch64" | "armhf" | "armv7" => "ttyAMA0",
+        _ => "ttyS0",
+    }
+}
+
+/// Build the kernel command line for `arch`, optionally running `command`.
+///
+/// Container rootfs images don't have /sbin/init, so /bin/sh is booted as
+/// PID 1. Everything after `--` is handed to init by the kernel, which lets
+/// a command be passed as `sh -c "<command>"`.
+///
+/// # Errors
+///
+/// The kernel offers no way to escape a double quote inside a quoted
+/// argument, so commands containing `"` or control characters are rejected.
+fn build_kernel_cmdline(arch: &str, command: Option<&[String]>) -> Result<String> {
+    let base = format!(
+        "root=/dev/vda rw console={} init=/bin/sh",
+        console_for_arch(arch)
+    );
+    let argv = match command {
+        Some(argv) => argv,
+        // Default to interactive shell
+        None => return Ok(base),
+    };
+
+    // Quote each argument so that argv boundaries survive the trip through
+    // `sh -c` instead of being re-split on whitespace.
+    let script = argv
+        .iter()
+        .map(|arg| shell_quote(arg))
+        .collect::<Vec<_>>()
+        .join(" ");
+    if script.contains('"') || script.chars().any(char::is_control) {
+        bail!(
+            "Command cannot be passed on the kernel command line \
+             (contains a double quote or control character): {}",
+            script
+        );
+    }
+
+    Ok(format!("{} -- -c \"{}\"", base, script))
+}
+
+/// Quote `arg` for a POSIX shell so that it is read back as exactly one word.
+fn shell_quote(arg: &str) -> String {
+    let is_plain = |c: char| c.is_ascii_alphanumeric() || "_-./:=,+@%".contains(c);
+    if !arg.is_empty() && arg.chars().all(is_plain) {
+        return arg.to_string();
+    }
+    // Inside single quotes only `'` is special: close, escape it, reopen.
+    format!("'{}'", arg.replace('\'', r"'\''"))
+}
+
+/// Create an ext4 disk image holding the contents of the `rootfs` directory.
+///
+/// The image is placed next to `rootfs` and named after it, so runs that use
+/// different rootfs directories never share an image file. The caller owns
+/// the returned file and must delete it; on failure no image is left behind.
+fn create_disk_image(rootfs: &Path) -> Result<PathBuf> {
+    veprintln!("Creating disk image from rootfs...");
+
+    let disk_image = disk_image_path(rootfs)?;
+
+    let size = rootfs_size_bytes(rootfs)?;
+    // Add 50% overhead for filesystem metadata, journal, and some free space
+    // ext4 with journal can have significant overhead
+    let image_size = size.saturating_add(size / 2).max(MIN_IMAGE_SIZE);
+    veprintln!("Rootfs size: {} bytes, image size: {} bytes", size, image_size);
+
+    // `create_new` refuses to reuse an existing file or to follow a symlink
+    // planted at this path.
+    let image_file = OpenOptions::new()
+        .write(true)
+        .create_new(true)
+        .open(&disk_image)
+        .with_context(|| format!("Failed to create disk image file {}", disk_image.display()))?;
+
+    // Pre-allocate the file, then close it before mke2fs opens it.
+    let allocated = image_file
+        .set_len(image_size)
+        .context("Failed to allocate disk image");
+    drop(image_file);
+
+    let built = allocated.and_then(|()| populate_disk_image(rootfs, &disk_image));
+    if built.is_err() {
+        // Best effort: an image that could not be populated is useless.
+        let _ = std::fs::remove_file(&disk_image);
+    }
+    built.map(|()| disk_image)
+}
+
+/// Path of the image file: a sibling of `rootfs` named `<rootfs dir name>.img`.
+fn disk_image_path(rootfs: &Path) -> Result<PathBuf> {
+    let parent = rootfs
+        .parent()
+        .with_context(|| format!("Rootfs path {} has no parent directory", rootfs.display()))?;
+    let mut image_name = rootfs
+        .file_name()
+        .with_context(|| format!("Rootfs path {} has no directory name", rootfs.display()))?
+        .to_os_string();
+    image_name.push(".img");
+    Ok(parent.join(image_name))
+}
+
+/// Apparent size in bytes of everything below `rootfs`, as reported by `du -sb`.
+fn rootfs_size_bytes(rootfs: &Path) -> Result<u64> {
+    let du_output = Command::new("du")
+        .arg("-sb")
+        .arg(rootfs)
+        .output()
+        .context("Failed to calculate rootfs size")?;
+
+    // The total is the first whitespace-separated field of du's output.
+    let stdout = String::from_utf8_lossy(&du_output.stdout);
+    let size = stdout
+        .split_whitespace()
+        .next()
+        .and_then(|field| field.parse::<u64>().ok());
+    size.with_context(|| {
+        format!(
+            "Failed to parse du output ({}): {}",
+            du_output.status,
+            String::from_utf8_lossy(&du_output.stderr).trim()
+        )
+    })
+}
+
+/// Fill the pre-allocated `disk_image` with an ext4 filesystem containing `rootfs`.
+///
+/// `mke2fs -d` is the preferred method. Only if mke2fs cannot be started at
+/// all does this fall back to `mkfs.ext4` plus a file-by-file debugfs copy.
+fn populate_disk_image(rootfs: &Path, disk_image: &Path) -> Result<()> {
+    veprintln!(
+        "Running: mke2fs -t ext4 -d {} {}",
+        rootfs.display(),
+        disk_image.display()
+    );
+    let mke2fs = Command::new("mke2fs")
+        .arg("-t")
+        .arg("ext4")
+        .arg("-d")
+        .arg(rootfs)
+        .arg(disk_image)
+        .output();
+
+    match mke2fs {
+        Ok(output) if output.status.success() => {
+            veprintln!("Disk image created successfully with mke2fs");
+            log_image_root(disk_image);
+            return Ok(());
+        }
+        Ok(output) => {
+            // Don't continue if mke2fs exists but failed - it's the only reliable method
+            bail!(
+                "mke2fs -d failed to create disk image.\n\
+                 stdout: {}\n\
+                 stderr: {}",
+                String::from_utf8_lossy(&output.stdout),
+                String::from_utf8_lossy(&output.stderr)
+            );
+        }
+        Err(e) => {
+            veprintln!("mke2fs not available: {}", e);
+        }
+    }
+
+    // Fallback: create an empty ext4 filesystem and copy the files into it.
+    let mkfs = Command::new("mkfs.ext4")
+        .arg("-F")
+        .arg(disk_image)
+        .output()
+        .map_err(|e| {
+            anyhow!(
+                "Neither mke2fs nor mkfs.ext4 available. Install e2fsprogs:\n\
+                 Ubuntu/Debian: sudo apt install e2fsprogs\n\
+                 Arch: sudo pacman -S e2fsprogs\n\
+                 Alpine: sudo apk add e2fsprogs\n\
+                 Error: {}",
+                e
+            )
+        })?;
+    if !mkfs.status.success() {
+        bail!(
+            "mkfs.ext4 failed: {}",
+            String::from_utf8_lossy(&mkfs.stderr)
+        );
+    }
+
+    // debugfs writes into the image directly, so no root or mount is needed.
+    veprintln!("Copying files into disk image with debugfs...");
+    if copy_with_debugfs(rootfs, disk_image)? {
+        veprintln!("Disk image created successfully");
+        return Ok(());
+    }
+
+    Err(anyhow!(
+        "Could not create disk image. Please ensure e2fsprogs is installed with mke2fs support.\n\
+         The mke2fs -d option is required for non-root disk image creation."
+    ))
+}
+
+/// Print the image's root directory listing through `veprintln!` as a sanity check.
+fn log_image_root(disk_image: &Path) {
+    let listing = Command::new("debugfs")
+        .arg("-R")
+        .arg("ls -l /")
+        .arg(disk_image)
+        .output();
+    // debugfs is optional here: if it cannot be run there is simply no listing.
+    if let Ok(listing) = listing {
+        veprintln!(
+            "Root directory contents:\n{}",
+            String::from_utf8_lossy(&listing.stdout)
+        );
+    }
+}
+
+/// Copy the contents of `rootfs` into `disk_image` using `debugfs -w`.
+///
+/// Returns `Ok(false)` when debugfs is unavailable or reports failure, so the
+/// caller can produce its own error message.
+fn copy_with_debugfs(rootfs: &Path, disk_image: &Path) -> Result<bool> {
+    // Build the whole command script first so that a traversal error never
+    // leaves a half-fed debugfs process behind.
+    let mut script = Vec::new();
+    if let Err(e) = write_debugfs_script(rootfs, "", &mut script) {
+        veprintln!("debugfs write failed: {}", e);
+        return Ok(false);
+    }
+
+    let mut debugfs = match Command::new("debugfs")
+        .arg("-w")
+        .arg(disk_image)
+        .stdin(Stdio::piped())
+        .stdout(Stdio::null())
+        .stderr(Stdio::piped())
+        .spawn()
+    {
+        Ok(child) => child,
+        Err(_) => return Ok(false),
+    };
+
+    // Feed stdin from a separate thread while this thread drains stderr;
+    // doing both from one thread can deadlock once a pipe buffer fills up.
+    let mut stdin = debugfs
+        .stdin
+        .take()
+        .context("Failed to open debugfs stdin")?;
+    let feeder = std::thread::spawn(move || stdin.write_all(&script));
+
+    let output = debugfs
+        .wait_with_output()
+        .context("Failed to wait for debugfs")?;
+    let fed = feeder
+        .join()
+        .map_err(|_| anyhow!("debugfs feeder thread panicked"))?;
+
+    if let Err(e) = fed {
+        veprintln!("debugfs write failed: {}", e);
+        return Ok(false);
+    }
+    if !output.status.success() {
+        veprintln!(
+            "debugfs failed: {}",
+            String::from_utf8_lossy(&output.stderr)
+        );
+        return Ok(false);
+    }
+
+    Ok(true)
+}
+
+/// Append debugfs commands to `script` that recreate `dir` at `prefix` inside the image.
+///
+/// Entry types are read without following symlinks, so links are recreated
+/// as links instead of being copied (or recursed into) as their targets.
+fn write_debugfs_script(dir: &Path, prefix: &str, script: &mut Vec<u8>) -> Result<()> {
+    for entry in std::fs::read_dir(dir)? {
+        let entry = entry?;
+        let path = entry.path();
+        let name = entry.file_name().to_string_lossy().into_owned();
+        let target = format!("{}/{}", prefix, name);
+        let image_path = debugfs_arg(&target)?;
+        let file_type = entry.file_type()?;
+
+        if file_type.is_symlink() {
+            let link = std::fs::read_link(&path)?;
+            let link = debugfs_arg(&link.to_string_lossy())?;
+            writeln!(script, "symlink {} {}", image_path, link)?;
+        } else if file_type.is_dir() {
+            writeln!(script, "mkdir {}", image_path)?;
+            write_debugfs_script(&path, &target, script)?;
+        } else if file_type.is_file() {
+            let host_path = debugfs_arg(&path.to_string_lossy())?;
+            writeln!(script, "write {} {}", host_path, image_path)?;
+        } else {
+            // Device nodes, sockets and FIFOs cannot be copied with `write`.
+            veprintln!("Skipping special file: {}", path.display());
+        }
+    }
+    Ok(())
+}
+
+/// Encode `arg` as a single debugfs command argument.
+///
+/// NOTE(review): assumes debugfs accepts double-quoted arguments with `""`
+/// standing for a literal quote - confirm against the e2fsprogs in use.
+fn debugfs_arg(arg: &str) -> Result<String> {
+    if arg.contains(|c: char| c == '\n' || c == '\r') {
+        bail!("Path cannot be passed to debugfs (contains a line break): {:?}", arg);
+    }
+    if arg.is_empty() || arg.contains(|c: char| c.is_whitespace() || c == '"') {
+        return Ok(format!("\"{}\"", arg.replace('"', "\"\"")));
+    }
+    Ok(arg.to_string())
+}