// Source file: pkh/src/pull.rs
// Commit c3a116203a by Valentin Haudiquet (2026-01-22 00:05:39 +01:00):
// "pull: allow applying diff.gz from archive download"

use std::cmp::min;
use std::error::Error;
use std::path::Path;
use std::path::PathBuf;
use crate::package_info::PackageInfo;
use std::process::Command;
use log::debug;
use regex::Regex;
use crate::ProgressCallback;
/// Clone the git repository at `url` into `<cwd>/<package>` (or `./<package>`
/// when `cwd` is `None`), optionally checking out `branch`.
///
/// Transfer progress and remote sideband messages are forwarded through the
/// optional `progress` callback as `(title, message, current, total)`.
fn clone_repo(
    url: &str,
    package: &str,
    branch: Option<&str>,
    cwd: Option<&Path>,
    progress: ProgressCallback<'_>,
) -> Result<(), Box<dyn Error>> {
    let target_path = if let Some(path) = cwd {
        path.join(package)
    } else {
        Path::new(package).to_path_buf()
    };
    let mut callbacks = git2::RemoteCallbacks::new();
    if let Some(ref progress_cb) = progress {
        // Download progress: forwarded straight from libgit2 transfer stats.
        callbacks.transfer_progress(move |stats| {
            (progress_cb)(
                "",
                "Receiving objects...",
                stats.received_objects(),
                stats.total_objects(),
            );
            true
        });
        // Remote progress: messages like 'Remote: compressing objects 10% (34/340)'.
        // Parse them so they can be rendered through the same progress callback.
        // Compile the regex once, not on every sideband message.
        let re = Regex::new(r"(.*):[ ]*([0-9]*)% \(([0-9]*)/([0-9]*)\)").unwrap();
        callbacks.sideband_progress(move |data| {
            let msg = String::from_utf8_lossy(data);
            if let Some(caps) = re.captures(msg.trim()) {
                let msg = caps.get(1).map_or("", |m| m.as_str()).to_string();
                // Groups 3/4 are the (current/total) object counts.
                let objects = caps
                    .get(3)
                    .map_or("", |m| m.as_str())
                    .parse::<usize>()
                    .unwrap_or(0);
                let total = caps
                    .get(4)
                    .map_or("", |m| m.as_str())
                    .parse::<usize>()
                    .unwrap_or(0);
                (progress_cb)("", msg.as_str(), objects, total);
            }
            true
        });
    }
    let mut fetch_options = git2::FetchOptions::new();
    fetch_options.remote_callbacks(callbacks);
    let mut builder = git2::build::RepoBuilder::new();
    builder.fetch_options(fetch_options);
    if let Some(b) = branch {
        builder.branch(b);
    }
    match builder.clone(url, &target_path) {
        Ok(_repo) => Ok(()),
        Err(e) => Err(format!("Failed to clone: {}", e).into()),
    }
}
use sha2::{Digest, Sha256};
use std::fs::File;
use std::io::Write;
use flate2::read::GzDecoder;
use futures_util::StreamExt;
use tar::Archive;
use xz2::read::XzDecoder;
/// Recursively copy the contents of directory `src` into `dst`,
/// creating `dst` (and any missing parents) when it does not exist yet.
fn copy_dir_all(src: &Path, dst: &Path) -> Result<(), Box<dyn Error>> {
    if !dst.exists() {
        std::fs::create_dir_all(dst)?;
    }
    for item in std::fs::read_dir(src)? {
        let item = item?;
        let from = item.path();
        let to = dst.join(item.file_name());
        if from.is_dir() {
            // Recurse into subdirectories.
            copy_dir_all(&from, &to)?;
        } else {
            std::fs::copy(&from, &to)?;
        }
    }
    Ok(())
}
/// Extract the tar archive at `file_path` into `dest`, reporting per-entry
/// progress through `progress`.
///
/// `decoder_factory` wraps the opened file in the appropriate decompressor
/// (gzip, xz, ...). Returns the full paths of every extracted entry.
fn extract_tar_archive<D, F>(
    file_path: &Path,
    dest: &Path,
    progress: ProgressCallback<'_>,
    decoder_factory: F,
) -> Result<Vec<String>, Box<dyn Error>>
where
    D: std::io::Read,
    F: Fn(File) -> D,
{
    // First pass: count the entries so progress can be reported as n/total.
    let total = Archive::new(decoder_factory(File::open(file_path)?))
        .entries()?
        .count();
    // Second pass: the decompressed stream cannot be rewound, so reopen the
    // file and actually unpack each entry.
    let mut tar = Archive::new(decoder_factory(File::open(file_path)?));
    let mut unpacked = Vec::new();
    for (index, entry) in tar.entries()?.enumerate() {
        let mut entry = entry?;
        let target = dest.join(entry.path()?.to_path_buf());
        // Ensure the parent directory exists before unpacking.
        if let Some(parent) = target.parent() {
            std::fs::create_dir_all(parent)?;
        }
        entry.unpack(&target)?;
        unpacked.push(target.to_string_lossy().to_string());
        // Report progress (1-based entry count).
        if let Some(cb) = progress {
            cb("", "Extracting...", index + 1, total);
        }
    }
    Ok(unpacked)
}
/// Dispatch archive extraction based on the file name extension.
/// Supports gzip (.tar.gz/.tgz) and xz (.tar.xz/.txz) compressed tarballs;
/// any other extension is rejected with an error.
fn extract_archive(
    path: &Path,
    dest: &Path,
    progress: ProgressCallback<'_>,
) -> Result<Vec<String>, Box<dyn Error>> {
    let name = path.file_name().unwrap().to_string_lossy();
    let gzip = name.ends_with(".tar.gz") || name.ends_with(".tgz");
    let xz = name.ends_with(".tar.xz") || name.ends_with(".txz");
    if gzip {
        extract_tar_archive(path, dest, progress, GzDecoder::new)
    } else if xz {
        extract_tar_archive(path, dest, progress, XzDecoder::new)
    } else {
        Err(format!("Unsupported archive format: {}", name).into())
    }
}
/// Regenerate the orig tarball `filename` from the repository's pristine-tar
/// data, writing it to the parent directory of `package_dir`.
///
/// Returns an error when `pristine-tar` cannot be spawned (e.g. not
/// installed) or exits with a non-zero status, so that callers can fall
/// back to downloading the tarball instead of panicking.
fn checkout_pristine_tar(package_dir: &Path, filename: &str) -> Result<(), Box<dyn Error>> {
    // "../<filename>" places the checked-out tarball next to the package
    // directory, which is where the build tooling expects it.
    // (Previously the filename argument was not interpolated, and a spawn
    // failure panicked via expect() instead of returning Err.)
    let output = Command::new("pristine-tar")
        .current_dir(package_dir)
        .args(["checkout", format!("../{filename}").as_str()])
        .output()?;
    if !output.status.success() {
        return Err(format!(
            "pristine-tar checkout failed with status: {}",
            output.status
        )
        .into());
    }
    Ok(())
}
/// Download `url` into `target_dir`, streaming chunks to disk while folding
/// them into a SHA-256 digest, then fail if the digest does not match
/// `checksum` (lower-case hex string).
///
/// The destination file name is the last path segment of the URL.
/// Progress is reported through `progress` as bytes-received / content-length.
/// NOTE(review): the server must send a Content-Length header; responses
/// without one are rejected.
async fn download_file_checksum(
    url: &str,
    checksum: &str,
    target_dir: &Path,
    progress: ProgressCallback<'_>,
) -> Result<(), Box<dyn Error>> {
    // Download with reqwest
    let response = reqwest::get(url).await?;
    if !response.status().is_success() {
        return Err(format!("Failed to download '{}' : {}", &url, response.status()).into());
    }
    // Content-Length is required so progress can be expressed out of a total.
    let total_size = response
        .content_length()
        .ok_or(format!("Failed to get content length from '{}'", &url))?;
    let mut index = 0;
    // Target file: extract file name from URL
    let filename = Path::new(url).file_name().unwrap().to_str().unwrap();
    let path = target_dir.join(filename);
    let mut file = File::create(path)?;
    // Download chunk by chunk to disk, while updating hasher for checksum
    let mut stream = response.bytes_stream();
    let mut hasher = Sha256::new();
    while let Some(item) = stream.next().await {
        let chunk = item?;
        file.write_all(&chunk)?;
        hasher.update(&chunk);
        if let Some(cb) = progress {
            // Clamp to the advertised total in case the server sends more
            // bytes than its Content-Length claimed.
            index = min(index + chunk.len(), total_size as usize);
            cb("", "Downloading...", index, total_size as usize);
        }
    }
    // Verify checksum only after the full body has been streamed.
    let result = hasher.finalize();
    let calculated_checksum = hex::encode(result);
    if calculated_checksum != checksum {
        return Err(format!(
            "Checksum mismatch! Expected {}, got {}",
            checksum, calculated_checksum
        )
        .into());
    }
    Ok(())
}
fn setup_pristine_tar_branch(package_dir: &Path, dist: &str) -> Result<(), Box<dyn Error>> {
let repo = git2::Repository::open(package_dir)?;
// Check if local branch already exists
if repo
.find_branch("pristine-tar", git2::BranchType::Local)
.is_ok()
{
return Ok(());
}
// Find remote pristine-tar branch
let branches = repo.branches(Some(git2::BranchType::Remote))?;
for branch_result in branches {
let (branch, _) = branch_result?;
if let Some(name) = branch.name()?
&& name.ends_with(&format!("/{dist}/pristine-tar"))
{
debug!("Found remote pristine-tar branch: {}", name);
let commit = branch.get().peel_to_commit()?;
// Create local branch
let mut local_branch = repo.branch("pristine-tar", &commit, false)?;
// Set upstream
local_branch.set_upstream(Some(name))?;
debug!("Created local pristine-tar branch tracking {}", name);
return Ok(());
}
}
debug!("No remote pristine-tar branch found.");
Ok(())
}
/// Retrieve the package's orig tarball next to the package repository.
///
/// Tries `pristine-tar checkout` first (setting up the local pristine-tar
/// branch from the remote when needed) and falls back to downloading the
/// tarball from the archive with SHA-256 verification.
///
/// Returns an error if the package info lists no `.orig.tar.*` file
/// (previously this panicked via `unwrap`).
async fn fetch_orig_tarball(
    info: &PackageInfo,
    cwd: Option<&Path>,
    progress: ProgressCallback<'_>,
) -> Result<(), Box<dyn Error>> {
    let package_dir = if let Some(path) = cwd {
        path.join(&info.stanza.package)
    } else {
        Path::new(&info.stanza.package).to_path_buf()
    };
    // Find the orig tarball in the file list.
    // It usually ends with .orig.tar.gz or .orig.tar.xz.
    let orig_file = info
        .stanza
        .files
        .iter()
        .find(|f| f.name.contains(".orig.tar."))
        .ok_or("Could not find orig tarball in package info")?;
    let filename = &orig_file.name;
    // 1. Try executing pristine-tar.
    // Set up the pristine-tar branch if needed (by tracking the remote
    // branch); failure here is non-fatal — the checkout below will then
    // fail and trigger the download fallback.
    let _ = setup_pristine_tar_branch(&package_dir, info.dist.as_str());
    if let Err(e) = checkout_pristine_tar(&package_dir, filename.as_str()) {
        debug!(
            "pristine-tar failed: {}. Falling back to archive download.",
            e
        );
        // 2. Fallback to archive download.
        // We download to the parent directory of the package repo (which is standard for build tools)
        // or the current directory if cwd is None (which effectively is the parent of the package dir)
        let target_dir = cwd.unwrap_or_else(|| Path::new("."));
        download_file_checksum(
            format!("{}/{}", &info.archive_url, filename).as_str(),
            &orig_file.sha256,
            target_dir,
            progress,
        )
        .await?;
    }
    Ok(())
}
/// Download the package's `.dsc` file from the archive into `cwd`
/// (or the current directory), verifying its SHA-256 checksum.
async fn fetch_dsc_file(
    info: &PackageInfo,
    cwd: Option<&Path>,
    progress: ProgressCallback<'_>,
) -> Result<(), Box<dyn Error>> {
    let dest = cwd.unwrap_or_else(|| Path::new("."));
    // Locate the .dsc entry in the stanza's file list.
    let dsc_file = info
        .stanza
        .files
        .iter()
        .find(|f| f.name.ends_with(".dsc"))
        .ok_or("Could not find .dsc file in package info")?;
    debug!("Fetching dsc file: {}", &dsc_file.name);
    let url = format!("{}/{}", &info.archive_url, &dsc_file.name);
    download_file_checksum(url.as_str(), &dsc_file.sha256, dest, progress).await?;
    Ok(())
}
/// Fetch all source files of the package directly from the archive into
/// `cwd` (or the current directory) and assemble the source tree.
///
/// Every listed file is downloaded with SHA-256 verification. Tarballs are
/// extracted into `<cwd>/<package>`, merging the upstream and debian tarball
/// contents into one tree. For old-style packages shipping a `.diff.gz`,
/// the diff is decompressed and applied on top of the sources via
/// `patch -p1`.
async fn fetch_archive_sources(
    info: &PackageInfo,
    cwd: Option<&Path>,
    progress: ProgressCallback<'_>,
) -> Result<(), Box<dyn Error>> {
    // Default to the current directory, consistent with the other fetchers
    // (previously this allocated a PathBuf and leaned on temporary lifetime
    // extension to borrow it).
    let package_dir = cwd.unwrap_or(Path::new("."));
    std::fs::create_dir_all(package_dir)?;
    for file in &info.stanza.files {
        let url = format!("{}/{}", info.archive_url, file.name);
        download_file_checksum(&url, &file.sha256, package_dir, progress).await?;
        // Extract all tar archives, merging extracted directories
        if file.name.ends_with(".tar.gz") || file.name.ends_with(".tar.xz") {
            let path = package_dir.join(&file.name);
            let extract_dir = package_dir.join(&info.stanza.package);
            let extracted = extract_archive(&path, &extract_dir, progress)?;
            // Special case: the debian tar only contains 'debian', which is
            // already unpacked in place — there is no root dir to relocate.
            if file.name.contains("debian.tar.") {
                continue;
            }
            // List root directories extracted and use the first one as the source directory
            debug!("Root directories extracted:");
            let mut source_dir: Option<PathBuf> = None;
            for extracted_entry in &extracted {
                let path = Path::new(extracted_entry);
                // Check if this is a directory and is at the archive root level
                // (i.e., the path relative to extract_dir has no parent components)
                if let Ok(relative_path) = path.strip_prefix(&extract_dir)
                    && relative_path.components().count() == 1
                    && path.is_dir()
                {
                    debug!("- {}", relative_path.file_name().unwrap().to_string_lossy());
                    // Use the first directory found as the source
                    if source_dir.is_none() {
                        source_dir = Some(path.to_path_buf());
                    }
                }
            }
            // Use the extracted directory as the source, assuming there is only one
            if let Some(src_dir) = source_dir {
                let target_dir = package_dir.join(&info.stanza.package);
                if target_dir.exists() {
                    // Target exists (e.g. debian tar already unpacked):
                    // merge the upstream contents into it.
                    for sub_entry in std::fs::read_dir(&src_dir)? {
                        let sub_entry = sub_entry?;
                        let sub_path = sub_entry.path();
                        let target_path = target_dir.join(sub_entry.file_name());
                        if sub_path.is_dir() {
                            std::fs::create_dir_all(&target_path)?;
                            // Recursively copy directory contents
                            copy_dir_all(&sub_path, &target_path)?;
                        } else {
                            std::fs::copy(&sub_path, &target_path)?;
                        }
                    }
                    std::fs::remove_dir_all(&src_dir)?;
                } else {
                    // Fast path: just move the extracted root into place.
                    std::fs::rename(&src_dir, &target_dir)?;
                }
            }
        }
        // Extract and apply .diff.gz if present (old packages)
        if file.name.ends_with(".diff.gz") {
            let diff_gz_path = package_dir.join(&file.name);
            let source_dir = package_dir.join(&info.stanza.package);
            // Derive the .diff path by stripping the trailing .gz extension.
            let diff_path = diff_gz_path.with_extension("");
            // Decompress the .diff.gz file directly to .diff
            let input_file = File::open(&diff_gz_path)?;
            let mut decoder = GzDecoder::new(input_file);
            let mut output_file = File::create(&diff_path)?;
            std::io::copy(&mut decoder, &mut output_file)?;
            // Use relative path for the diff file (it's in the parent directory)
            let relative_diff_path =
                format!("../{}", diff_path.file_name().unwrap().to_string_lossy());
            // Apply the patch using the patch command with relative path
            let output = Command::new("patch")
                .current_dir(&source_dir)
                .arg("-p1")
                .arg("--input")
                .arg(&relative_diff_path)
                .output()?;
            if !output.status.success() {
                let stderr = String::from_utf8_lossy(&output.stderr);
                return Err(
                    format!("Failed to apply patch: {}\n{}", diff_path.display(), stderr).into(),
                );
            }
            debug!("Successfully applied patch: {}", diff_path.display());
            // Clean up the extracted .diff file
            std::fs::remove_file(&diff_path)?;
        }
    }
    Ok(())
}
/// Pull a source package locally using pre-retrieved package information
///
/// This function takes a PackageInfo struct and downloads the package using the preferred method
/// (either git or direct archive download), as well as orig tarball, inside 'package' directory.
/// The source will be extracted under 'package/package'.
///
/// `force_archive` skips the preferred VCS even when one is known and
/// downloads everything from the archive instead.
pub async fn pull(
    package_info: &PackageInfo,
    cwd: Option<&Path>,
    progress: ProgressCallback<'_>,
    force_archive: bool,
) -> Result<(), Box<dyn Error>> {
    let package = &package_info.stanza.package;
    let series = &package_info.series;
    // Top-level directory holding the source tree, orig tarball and .dsc.
    let package_dir = if let Some(path) = cwd {
        path.join(package)
    } else {
        Path::new(package).to_path_buf()
    };
    /* Fetch the package: either via git (preferred VCS) or the archive */
    if let Some(ref url) = package_info.preferred_vcs
        && !force_archive
    {
        // We have found a preferred VCS (git repository) for the package, so
        // we fetch the package from that repo.
        // Depending on target series, we pick target branch; if latest series is specified,
        // we target the development branch, i.e. the default branch
        let branch_name = if crate::distro_info::get_ordered_series(package_info.dist.as_str())
            .await?[0]
            != *series
        {
            if package_info.dist == "ubuntu" {
                Some(format!("{}/{}", package_info.dist, series))
            } else {
                // Debian does not have reliable branch naming...
                // For now, we skip that part and clone default
                // TODO: Inspect remote branches and tags for matches
                None
            }
        } else {
            None
        };
        if let Some(cb) = progress {
            cb(
                &format!(
                    "Cloning {}{}...",
                    url,
                    if let Some(b) = &branch_name {
                        format!(" (branch {})", b)
                    } else {
                        String::new()
                    }
                ),
                "",
                0,
                0,
            );
        }
        // cwd passed here is `package_dir`, so the repo lands in
        // `<package_dir>/<package>` — i.e. the documented 'package/package'.
        clone_repo(
            url.as_str(),
            package,
            branch_name.as_deref(),
            Some(&package_dir),
            progress,
        )?;
        // Native packages carry no separate upstream tarball.
        if !package_info.is_native() {
            if let Some(cb) = progress {
                cb("Fetching orig tarball...", "", 0, 0);
            }
            fetch_orig_tarball(package_info, Some(&package_dir), progress).await?;
        } else {
            debug!("Native package, skipping orig tarball fetch.");
        }
        if let Some(cb) = progress {
            cb("Fetching dsc file...", "", 0, 0);
        }
        fetch_dsc_file(package_info, Some(&package_dir), progress).await?;
    } else {
        // Fallback to archive fetching
        if let Some(cb) = progress {
            cb("Downloading from archive...", "", 0, 0);
        }
        fetch_archive_sources(package_info, Some(&package_dir), progress).await?;
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    /// Shared end-to-end harness: look up `package` for the given
    /// series/dist, pull it into a temporary working directory, then verify
    /// the resulting layout: source tree under `<package>/<package>` with a
    /// `debian/` directory, the expected git branch when a repo was cloned,
    /// and the orig tarball (non-native packages) plus .dsc file alongside.
    /// `archive = Some(true)` forces the archive-download code path.
    /// NOTE(review): these tests hit the network (package lookup, clone,
    /// archive downloads).
    async fn test_pull_package_end_to_end(
        package: &str,
        series: Option<&str>,
        dist: Option<&str>,
        archive: Option<bool>,
    ) {
        // This test verifies that 'pkh pull' clones the repo and fetches the tarball.
        // For determinism, we require for tests that either a distro or series is specified,
        // as no distribution would mean fallback to system distro
        assert!(dist.is_some() || series.is_some());
        // Use a temp directory as working directory
        let temp_dir = tempfile::tempdir().unwrap();
        let cwd = temp_dir.path();
        // Main 'pull' command: the one we want to test
        let info = crate::package_info::lookup(package, None, series, "", dist, None)
            .await
            .unwrap();
        pull(&info, Some(cwd), None, archive.unwrap_or(false))
            .await
            .unwrap();
        let package_dir = cwd.join(package);
        assert!(package_dir.exists());
        let package_source_dir = package_dir.join(package);
        assert!(
            package_source_dir.exists(),
            "Package git repo directory not created"
        );
        assert!(
            package_source_dir.join("debian").exists(),
            "debian directory not present"
        );
        // A .git dir is only present when the VCS path was taken, not the
        // archive fallback.
        if package_source_dir.join(".git").exists() {
            // Verify we are on the correct branch
            let repo = git2::Repository::open(&package_source_dir).unwrap();
            let head = repo.head().unwrap();
            let name = head.name().unwrap();
            if let Some(s) = series {
                // The local branch should be named dist/series
                // We skip debian for now as it does not have a reliable naming scheme
                if info.dist == "ubuntu" {
                    assert_eq!(name, format!("refs/heads/{0}/{s}", info.dist));
                }
            } else {
                // The local branch should be named ubuntu/devel for Ubuntu
                // Debian unfortunately does not have a reliable naming scheme
                // Given that there was no series specified, and this is a test,
                // we require to have a distribution specified
                if dist.unwrap() == "ubuntu" {
                    assert_eq!(name, "refs/heads/ubuntu/devel");
                }
            }
        }
        // Check for orig tarball in package dir (only for non-native packages)
        let mut found_tarball = false;
        let mut found_dsc = false;
        for entry in std::fs::read_dir(package_dir).unwrap() {
            let entry = entry.unwrap();
            let name = entry.file_name().to_string_lossy().to_string();
            if name.contains(".orig.tar.") {
                found_tarball = true;
            }
            if name.ends_with(".dsc") {
                found_dsc = true;
            }
        }
        // Only check for orig tarball if the package is not native
        if !info.is_native() {
            assert!(found_tarball, "Orig tarball not found in package dir");
        }
        assert!(found_dsc, "DSC file not found in package dir");
    }
    #[tokio::test]
    async fn test_pull_hello_ubuntu_end_to_end() {
        test_pull_package_end_to_end("hello", Some("noble"), None, None).await;
    }
    #[tokio::test]
    async fn test_pull_hello_debian_end_to_end() {
        test_pull_package_end_to_end("hello", Some("bookworm"), None, None).await;
    }
    /// Specific test for a package using a .diff.gz, instead of .debian and .orig
    #[tokio::test]
    async fn test_pull_linux_riscv_ubuntu_end_to_end() {
        test_pull_package_end_to_end("linux-riscv", Some("noble"), None, Some(true)).await;
    }
    // Universe-pocket package.
    #[tokio::test]
    async fn test_pull_2048_universe_ubuntu_end_to_end() {
        test_pull_package_end_to_end("2048", Some("noble"), None, None).await;
    }
    // Contrib-section package.
    #[tokio::test]
    async fn test_pull_1oom_contrib_debian_end_to_end() {
        test_pull_package_end_to_end("1oom", Some("trixie"), None, None).await;
    }
    // Per the test name, agg's preferred VCS is SVN (not git), so pull is
    // expected to fall back gracefully — TODO confirm against the archive.
    #[tokio::test]
    async fn test_pull_agg_svn_fallback_ok() {
        test_pull_package_end_to_end("agg", Some("trixie"), None, None).await;
    }
    #[tokio::test]
    async fn test_pull_hello_debian_latest_end_to_end() {
        test_pull_package_end_to_end("hello", None, Some("debian"), None).await;
    }
    #[tokio::test]
    async fn test_pull_hello_ubuntu_latest_end_to_end() {
        test_pull_package_end_to_end("hello", None, Some("ubuntu"), None).await;
    }
}