dev_scripts/sync/mirror/
mod.rs

//! All logic for downloading data from an Arch Linux package mirror.
//!
//! This includes the database files as well as the packages.

mod rsync_changes;

use std::{
    collections::HashSet,
    fs::{create_dir_all, remove_dir_all},
    path::{Path, PathBuf},
    process::Command,
};

use anyhow::{Context, Result, anyhow, bail};
use log::{debug, info, trace};
use rayon::iter::{IntoParallelIterator, ParallelIterator};

use super::{PackageRepositories, filenames_in_dir};
use crate::{cmd::ensure_success, ui::get_progress_bar};

/// The entry point for downloading any data from package mirrors.
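///
/// A minimal usage sketch. The repository variant, mirror host, and paths below are
/// illustrative assumptions rather than values taken from an actual configuration:
///
/// ```ignore
/// let downloader = MirrorDownloader {
///     dest: PathBuf::from("/var/cache/mirror-sync"),
///     mirror: "mirror.example.org".to_string(),
///     repositories: vec![PackageRepositories::Core],
///     extract_all: false,
/// };
/// downloader.sync_remote_databases()?;
/// downloader.sync_remote_packages()?;
/// ```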
#[derive(Clone, Debug)]
pub struct MirrorDownloader {
    /// The destination folder into which files should be downloaded.
    pub dest: PathBuf,
    /// The mirror URL from which files will be downloaded.
    pub mirror: String,
    /// The repositories that should be downloaded.
    pub repositories: Vec<PackageRepositories>,
    /// Whether to extract all packages (regardless of changes).
    pub extract_all: bool,
}

impl MirrorDownloader {
    /// Download all official repository file databases and unpack them.
    /// They contain the following files:
    ///
    /// - `desc`
    /// - `files`
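    ///
    /// After a successful sync, each repository's file database is unpacked below
    /// `<dest>/databases/<repo>/`. The package directory names below are illustrative:
    ///
    /// ```text
    /// databases/core/linux-6.9.arch1-1/desc
    /// databases/core/linux-6.9.arch1-1/files
    /// ```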
    pub fn sync_remote_databases(&self) -> Result<()> {
        let download_dir = self.dest.join("download/databases/");
        let target_dir = self.dest.join("databases");

        if !download_dir.exists() {
            create_dir_all(&download_dir).context("Failed to create download directory")?;
        }

        if !target_dir.exists() {
            create_dir_all(&target_dir)
                .context("Failed to create pacman cache target directory")?;
        }

        for repo in self.repositories.iter() {
            let name = repo.to_string();
            info!("Downloading database for repository {name}");

            let filename = format!("{name}.files");
            let file_source = format!("rsync://{}/{name}/os/x86_64/{filename}", self.mirror);

            let download_dest = download_dir.join(filename);

            // Download the db from the mirror.
            let mut db_sync_command = Command::new("rsync");
            db_sync_command
                .args([
                    "--recursive",
                    "--perms",
                    "--times",
                    // Report the change status of each file.
                    "--itemize-changes",
                    // Copy the actual files instead of symlinks.
                    // Symlinks may point to files above the directory we're syncing,
                    // which would leave them dangling once copied locally.
                    "--copy-links",
                ])
                .arg(file_source)
                .arg(&download_dest);

            trace!("Running command: {db_sync_command:?}");
            let output = db_sync_command
                .output()
                .context(format!("Failed to run rsync for pacman db {name}"))?;

            if !output.status.success() {
                bail!("rsync failed for pacman db {name}");
            }

            trace!(
                "Rsync reports: {}",
                String::from_utf8_lossy(&output.stdout).trim()
            );

            let repo_target_dir = target_dir.join(&name);
            if repo_target_dir.exists() {
                if !self.extract_all
                    && rsync_changes::Report::parser(&output.stdout)
                        .map_err(|e| anyhow!("{e}"))?
                        .file_content_updated()?
                        .is_none()
                {
                    debug!("Database {name} is unchanged upstream, skipping extraction");
                    continue;
                } else {
                    // There are old versions of the files, remove them.
                    remove_dir_all(&repo_target_dir).context(format!(
                        "Failed to remove old repository: {repo_target_dir:?}"
                    ))?;
                }
            }
            create_dir_all(&repo_target_dir)?;

            debug!("Extracting db to {repo_target_dir:?}");

            // Extract the db into the target folder.
            let mut tar_command = Command::new("tar");
            tar_command
                .arg("-x")
                .arg("-f")
                .arg(&download_dest)
                .arg("-C")
                .arg(&repo_target_dir);

            trace!("Running command: {tar_command:?}");
            let output = tar_command
                .output()
                .context(format!("Failed to start tar to extract pacman db {name}"))?;
            ensure_success(&output)?;
        }

        Ok(())
    }

    /// Download all official repository packages and extract all files that are interesting to us.
    /// Specifically:
    ///
    ///  - `.BUILDINFO`
    ///  - `.MTREE`
    ///  - `.PKGINFO`
    ///  - `.INSTALL` (optional)
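    ///
    /// The extracted files end up in `<dest>/packages/<repo>/<package>/`, where `<package>` is
    /// the package's file name without its `.pkg.tar.*` suffix. The paths below are illustrative:
    ///
    /// ```text
    /// packages/extra/ripgrep-14.1.0-1-x86_64/.BUILDINFO
    /// packages/extra/ripgrep-14.1.0-1-x86_64/.MTREE
    /// packages/extra/ripgrep-14.1.0-1-x86_64/.PKGINFO
    /// ```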
    pub fn sync_remote_packages(&self) -> Result<()> {
        let download_dir = self.dest.join("download/packages");
        let target_dir = self.dest.join("packages");

        if !download_dir.exists() {
            create_dir_all(&download_dir).context("Failed to create download directory")?;
        }

        if !target_dir.exists() {
            create_dir_all(&target_dir)
                .context("Failed to create pacman cache target directory")?;
        }

        for repo in self.repositories.iter() {
            let repo_name = repo.to_string();
            info!("Downloading packages for repository {repo_name}");

            let file_source = format!("rsync://{}/{repo_name}/os/x86_64/", self.mirror);
            let download_dest = download_dir.join(&repo_name);
            let changed = self.download_packages(&repo_name, file_source, &download_dest)?;

            let packages: Vec<PathBuf> = if self.extract_all {
                let files: Vec<_> =
                    std::fs::read_dir(&download_dest)?.collect::<Result<_, std::io::Error>>()?;
                files
                    .into_iter()
                    .map(|entry| entry.path())
                    .collect::<Vec<_>>()
            } else {
                changed
                    .into_iter()
                    .map(|pkg| download_dest.join(pkg))
                    .collect()
            }
            .into_iter()
            // Filter out any dotfiles.
            // Those might be temporary download artifacts from previous rsync runs.
            .filter(|entry| {
                entry
                    .file_name()
                    .and_then(|name| name.to_str())
                    .is_some_and(|name| !name.starts_with('.'))
            })
            .collect();

            info!("Extracting packages for repository {repo_name}");
            let progress_bar = get_progress_bar(packages.len() as u64);
            packages
                .into_par_iter()
                .map(|pkg| {
                    // Extract all files that we're interested in.
                    let result = extract_pkg_files(&pkg, &target_dir, &repo_name);
                    progress_bar.inc(1);
                    result
                })
                .collect::<Result<Vec<()>>>()?;
            // Finish the progress bar.
            progress_bar.finish_with_message(format!(
                "Finished extracting files for repository {repo_name}."
            ));
        }

        // Clean up package data of packages that are no longer on the mirror.
        for repo in self.repositories.iter() {
            let mirror_packages = filenames_in_dir(&download_dir.join(repo.to_string()))?
                .into_iter()
                .map(remove_tarball_suffix)
                .collect::<Result<HashSet<String>>>()?;

            let local_packages = filenames_in_dir(&target_dir.join(repo.to_string()))?;

            // Get the packages that no longer exist on the mirror.
            let removed_pkgs: Vec<&String> = local_packages.difference(&mirror_packages).collect();

            // Delete the package data.
            if !removed_pkgs.is_empty() {
                info!("Found {} packages for cleanup:", removed_pkgs.len());
                for removed in removed_pkgs {
                    debug!("Removing local package: {removed}");
                    remove_dir_all(target_dir.join(repo.to_string()).join(removed)).context(
                        format!(
                            "Failed to remove local package {:?}",
                            target_dir.join(repo.to_string()).join(removed)
                        ),
                    )?;
                }
            }
        }

        Ok(())
    }

    /// Download all packages of a given Arch package repository into the download directory.
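    ///
    /// Returns the paths of the files that rsync reported as changed, as parsed from its
    /// `--itemize-changes` output. The paths are relative to the repository's download directory.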
    fn download_packages(
        &self,
        repo_name: &str,
        file_source: String,
        download_dest: &PathBuf,
    ) -> Result<Vec<PathBuf>> {
        let mut cmd = Command::new("rsync");
        cmd.args([
            "--recursive",
            "--perms",
            "--times",
            "--delete",
            "--hard-links",
            // Copy the actual files instead of symlinks.
            // Most symlinks point to files above the directory we're syncing,
            // which would leave them dangling once copied locally.
            "--copy-links",
            // Check for deletions once everything has been transferred.
            "--delete-after",
            // Only overwrite updated files at the very end.
            // This allows for a somewhat "atomic" update process.
            "--delay-updates",
            // Print structured change information to be parsed.
            "--itemize-changes",
            // Exclude package signatures.
            "--exclude=*.sig",
        ]);

        // Don't download any files related to the repository sync databases
        // (their signatures are already excluded by the `*.sig` rule above).
        for variation in [
            ".db",
            ".db.tar.gz",
            ".db.tar.gz.old",
            ".links.tar.gz",
            ".files",
            ".files.tar.gz",
            ".files.tar.gz.old",
        ] {
            cmd.arg(format!("--exclude={repo_name}{variation}"));
        }

        trace!("Running command: {cmd:?}");
        let output = cmd
            .arg(file_source)
            .arg(download_dest)
            .output()
            .context(format!(
                "Failed to start package rsync for pacman db {repo_name}"
            ))?;

        if !output.status.success() {
            bail!("Package rsync failed for pacman db {repo_name}");
        }

        let mut changed_files = Vec::new();

        for line in output.stdout.split(|&b| b == b'\n') {
            if let Some(path) = rsync_changes::Report::parser(line)
                .map_err(|e| anyhow!("{e}"))?
                .file_content_updated()?
            {
                trace!("File at {path:?} changed, marking for extraction");
                changed_files.push(path.to_owned());
            }
        }

        Ok(changed_files)
    }
}

/// Get the list of all files inside a given compressed tarball.
///
/// This provides the data needed to determine which subset of files should be extracted from a
/// package.
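///
/// For a typical Arch package, the listing looks roughly like the following
/// (illustrative, with the actual file tree shortened):
///
/// ```text
/// .BUILDINFO
/// .MTREE
/// .PKGINFO
/// usr/
/// usr/bin/
/// usr/bin/<some-binary>
/// ```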
fn get_tar_file_list(pkg: &Path) -> Result<HashSet<String>> {
    let mut tar_command = Command::new("tar");
    tar_command.arg("-tf").arg(pkg);
    trace!("Running command: {tar_command:?}");
    let peek_output = tar_command
        .output()
        .context(format!("Failed to peek into pkg {pkg:?}"))?;
    ensure_success(&peek_output).context("Error while peeking into package")?;

    Ok(String::from_utf8_lossy(&peek_output.stdout)
        .lines()
        .map(|line| line.to_string())
        .collect())
}

/// Use `tar` to extract relevant package metadata and script files from package files.
///
/// This function attempts to extract the `.MTREE`, `.BUILDINFO`, `.PKGINFO` and `.INSTALL` files.
/// Extracted files are placed in a directory structure that reflects the package's association
/// with a package repository.
///
/// ## Note
///
/// Since some of these files are optional, we have to take a look at the files in the tarball to
/// determine which of them actually need to be extracted.
///
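/// The resulting `tar` invocation looks roughly like this (the paths are illustrative):
///
/// ```text
/// tar -C <dest>/packages/core/foo-1.0-1-x86_64 -xf foo-1.0-1-x86_64.pkg.tar.zst .MTREE .BUILDINFO .PKGINFO
/// ```
///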
/// # Panics
///
/// Panics if `pkg` points to a directory.
fn extract_pkg_files(pkg: &Path, target_dir: &Path, repo_name: &str) -> Result<()> {
    let pkg_file_name = pkg
        .file_name()
        .expect("got directory when expecting file")
        .to_string_lossy()
        .to_string();
    let pkg_name = remove_tarball_suffix(pkg_file_name)?;

    // Peek into the pkg tar to see what kind of files we need to extract.
    let files = get_tar_file_list(pkg)?;

    // Create the target directory into which the files should be extracted.
    let pkg_target_dir = target_dir.join(repo_name).join(pkg_name);
    create_dir_all(&pkg_target_dir)?;

    let mut cmd_args = vec![
        "-C".to_string(),
        pkg_target_dir.to_string_lossy().to_string(),
        "-xf".to_string(),
        pkg.to_string_lossy().to_string(),
    ];

    // Check, for each of the known file types, whether it exists in the package.
    // If it does, add it to the tar command for extraction.
    for filetype in [".MTREE", ".BUILDINFO", ".PKGINFO", ".INSTALL"] {
        if files.contains(filetype) {
            cmd_args.push(filetype.to_string());
        }
    }

    // Run the extraction command.
    let mut tar_command = Command::new("tar");
    tar_command.args(cmd_args);

    trace!("Running command: {tar_command:?}");
    let output = tar_command
        .output()
        .context(format!("Failed to extract files from pkg {pkg:?}"))?;
    ensure_success(&output).context("Error while extracting files from package")?;

    Ok(())
}

/// A small helper function that removes the `.pkg.tar.*` suffix of a tarball.
/// This is necessary to get the actual package name from a package's full file name.
pub fn remove_tarball_suffix(pkg_name: String) -> Result<String> {
    let pkg_name = if let Some(pkg_name) = pkg_name.strip_suffix(".pkg.tar.zst") {
        pkg_name
    } else if let Some(pkg_name) = pkg_name.strip_suffix(".pkg.tar.xz") {
        pkg_name
    } else {
        bail!("Found package with unknown tarball compression: {pkg_name:?}");
    };

    Ok(pkg_name.to_string())
}
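
// A small sanity check for `remove_tarball_suffix`. The package file names below are
// illustrative examples, not taken from a real mirror run.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn remove_tarball_suffix_strips_known_compressions() {
        assert_eq!(
            remove_tarball_suffix("ripgrep-14.1.0-1-x86_64.pkg.tar.zst".to_string()).unwrap(),
            "ripgrep-14.1.0-1-x86_64"
        );
        assert_eq!(
            remove_tarball_suffix("ripgrep-14.1.0-1-x86_64.pkg.tar.xz".to_string()).unwrap(),
            "ripgrep-14.1.0-1-x86_64"
        );
        // Unknown compression suffixes are rejected with an error.
        assert!(remove_tarball_suffix("ripgrep-14.1.0-1-x86_64.pkg.tar.lz4".to_string()).is_err());
    }
}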