dev_scripts/sync/
mirror.rs

1use std::{
2    collections::HashSet,
3    fs::{DirEntry, create_dir_all, remove_dir_all},
4    path::{Path, PathBuf},
5    process::Command,
6};
7
8use anyhow::{Context, Result, bail};
9use log::{debug, info};
10use rayon::iter::{IntoParallelIterator, ParallelIterator};
11
12use super::{PackageRepositories, filenames_in_dir};
13use crate::{cmd::ensure_success, ui::get_progress_bar};
14
15/// The entry point for downloading any data from package mirrors.
16pub struct MirrorDownloader {
17    /// The destination folder into which files should be downloaded.
18    pub dest: PathBuf,
19    /// The mirror url from which files will be downloaded.
20    pub mirror: String,
21    /// The repositories that should be downloaded.
22    pub repositories: Vec<PackageRepositories>,
23}
24
25impl MirrorDownloader {
26    /// Download all official repository file databases and unpack them.
27    /// They contain the following files:
28    ///
29    /// - `desc`
30    /// - `files`
31    pub fn sync_remote_databases(&self) -> Result<()> {
32        let download_dir = self.dest.join("download/databases/");
33        let target_dir = self.dest.join("databases");
34
35        if !download_dir.exists() {
36            create_dir_all(&download_dir).context("Failed to create download directory")?;
37        }
38
39        if !target_dir.exists() {
40            create_dir_all(&target_dir)
41                .context("Failed to create pacman cache target directory")?;
42        }
43
44        for repo in self.repositories.iter() {
45            let name = repo.to_string();
46            info!("Downloading database for repository {name}");
47
48            let filename = format!("{name}.files");
49            let file_source = format!("rsync://{}/{name}/os/x86_64/{filename}", self.mirror);
50
51            let download_dest = download_dir.join(filename);
52
53            // Download the db from the mirror
54            let status = Command::new("rsync")
55                .args([
56                    "--recursive",
57                    "--perms",
58                    "--times",
59                    // Copy files instead of symlinks
60                    // Symlinks may point to files up the tree of where we're looking at,
61                    // which is why normal symlinks would be invalid.
62                    "--copy-links",
63                    // Show total progress
64                    "--info=progress2",
65                ])
66                .arg(file_source)
67                .arg(&download_dest)
68                .spawn()
69                .context(format!("Failed to run rsync for pacman db {name}"))?
70                .wait()
71                .context(format!("Failed to start rsync for pacman db {name}"))?;
72
73            if !status.success() {
74                bail!("rsync failed for pacman db {name}");
75            }
76
77            // Remove any old files.
78            let repo_target_dir = target_dir.join(&name);
79            if repo_target_dir.exists() {
80                remove_dir_all(&repo_target_dir).context(format!(
81                    "Failed to remove old repository: {repo_target_dir:?}"
82                ))?;
83            }
84            create_dir_all(&repo_target_dir)?;
85
86            debug!("Extracting db to {repo_target_dir:?}");
87
88            // Extract the db into the target folder.
89            let output = Command::new("tar")
90                .arg("-x")
91                .arg("-f")
92                .arg(&download_dest)
93                .arg("-C")
94                .arg(&repo_target_dir)
95                .output()
96                .context(format!("Failed to start tar to extract pacman dbs {name}"))?;
97            ensure_success(&output)?;
98        }
99
100        Ok(())
101    }
102
103    /// Download all official repository packages and extract all files that're interesting to us.
104    /// Specifically:
105    ///
106    ///  - `.BUILDINFO`
107    ///  - `.MTREE`
108    ///  - `.PKGINFO`
109    ///  - `.INSTALL` (Optional)
110    pub fn sync_remote_packages(&self) -> Result<()> {
111        let download_dir = self.dest.join("download/packages");
112        let target_dir = self.dest.join("packages");
113
114        if !download_dir.exists() {
115            create_dir_all(&download_dir).context("Failed to create download directory")?;
116        }
117
118        if !target_dir.exists() {
119            create_dir_all(&target_dir)
120                .context("Failed to create pacman cache target directory")?;
121        }
122
123        for repo in self.repositories.iter() {
124            let repo_name = repo.to_string();
125            info!("Downloading packages for repository {repo_name}");
126
127            let file_source = format!("rsync://{}/{repo_name}/os/x86_64/", self.mirror);
128            let download_dest = download_dir.join(&repo_name);
129            self.download_packages(&repo_name, file_source, &download_dest)?;
130
131            // Go through all packages of this repo and extract the respective relevant files.
132            let packages = std::fs::read_dir(&download_dest)?
133                .collect::<Result<Vec<DirEntry>, std::io::Error>>()?;
134
135            // Filter out any dotfiles.
136            // Those might be temporary download artifacts from previous rsync runs.
137            let packages: Vec<DirEntry> = packages
138                .into_iter()
139                .filter(|entry| {
140                    if let Some(path) = entry.file_name().to_str() {
141                        !path.starts_with('.')
142                    } else {
143                        false
144                    }
145                })
146                .collect();
147
148            // TODO:
149            // The extraction work can be cut down on successive runs by using rsync's `--itemize`
150            // flag, which gives a list of changed packages.
151            // That way, we only extract packages that actually changed and don't do any duplicate
152            // work on old packages.
153            // See https://gitlab.archlinux.org/archlinux/alpm/alpm/-/issues/68
154            info!("Extracting packages for repository {repo_name}");
155            let progress_bar = get_progress_bar(packages.len() as u64);
156            packages
157                .into_par_iter()
158                .map(|pkg| {
159                    // Extract all files that we're interested in.
160                    let result = extract_pkg_files(&pkg, &target_dir, &repo_name);
161                    progress_bar.inc(1);
162                    result
163                })
164                .collect::<Result<Vec<()>>>()?;
165            // Finish the progress_bar
166            progress_bar.finish_with_message("Finished extracting files for repository {repo}.");
167        }
168
169        // Clean up package data of packages that're no longer on the mirror.
170        for repo in self.repositories.iter() {
171            let mirror_packages = filenames_in_dir(&download_dir.join(repo.to_string()))?
172                .into_iter()
173                .map(remove_tarball_suffix)
174                .collect::<Result<HashSet<String>>>()?;
175
176            let local_packages = filenames_in_dir(&target_dir.join(repo.to_string()))?;
177
178            // Get the packages that no longer exist on the mirror.
179            let removed_pkgs: Vec<&String> = local_packages.difference(&mirror_packages).collect();
180
181            // Delete the package data
182            if !removed_pkgs.is_empty() {
183                info!("Found {} packages for cleanup:", removed_pkgs.len());
184                for removed in removed_pkgs {
185                    debug!("Removing local package: {removed}");
186                    remove_dir_all(target_dir.join(repo.to_string()).join(removed)).context(
187                        format!(
188                            "Failed to remove local package {:?}",
189                            target_dir.join(repo.to_string()).join(removed)
190                        ),
191                    )?;
192                }
193            }
194        }
195
196        Ok(())
197    }
198
199    /// Download all packages of a given arch package repository into the download directory.
200    fn download_packages(
201        &self,
202        repo_name: &str,
203        file_source: String,
204        download_dest: &PathBuf,
205    ) -> Result<()> {
206        let mut cmd = Command::new("rsync");
207        cmd.args([
208            "--recursive",
209            "--perms",
210            "--times",
211            "--delete",
212            "--hard-links",
213            // Copy actual files instead of symlinks.
214            // Most symlinks point to files up the tree of where we're looking at,
215            // which is why normal symlinks would be invalid.
216            "--copy-links",
217            // Check for deletions once everything has been transferred
218            "--delete-after",
219            // Only overwrite updated files in the very end.
220            // This allows for a somewhat "atomic" update process.
221            "--delay-updates",
222            // Show total progress
223            "--info=progress2",
224            // Exclude package signatures
225            "--exclude=*.sig",
226        ]);
227
228        // Don't download any files related to repository sync databases (signatures are generally
229        // excluded by the rsync call).
230        for variation in [
231            ".db",
232            ".db.tar.gz",
233            ".db.tar.gz.old",
234            ".links.tar.gz",
235            ".files",
236            ".files.tar.gz",
237            ".files.tar.gz.old",
238        ] {
239            cmd.arg(format!("--exclude={repo_name}{variation}"));
240        }
241
242        let status = cmd
243            .arg(file_source)
244            .arg(download_dest)
245            .spawn()
246            .context(format!(
247                "Failed to start package rsync for pacman db {repo_name}"
248            ))?
249            .wait()
250            .context(format!(
251                "Failed to start package rsync for pacman db {repo_name}"
252            ))?;
253
254        if !status.success() {
255            bail!("Package rsync failed for pacman db {repo_name}");
256        }
257
258        Ok(())
259    }
260}
261
262/// Get the list of all files inside a given compressed tarball.
263///
264/// This function provides data which is necessary to determine which subset of files should be
265/// extracted.
266fn get_tar_file_list(pkg: &DirEntry) -> Result<HashSet<String>> {
267    let peek_output = Command::new("tar")
268        .arg("-tf")
269        .arg(pkg.path())
270        .output()
271        .context(format!("Failed to peek into pkg {:?}", pkg.path()))?;
272    ensure_success(&peek_output).context("Error while peeking into package")?;
273
274    Ok(String::from_utf8_lossy(&peek_output.stdout)
275        .lines()
276        .map(|line| line.to_string())
277        .collect())
278}
279
280/// Use `tar` to extract relevant package metadata and script files from packages files.
281///
282/// This function attempts to extract ".MTREE", ".BUILDINFO", ".PKGINFO" and ".INSTALL" files.
283/// Extracted files are placed in a directory structure that reflects the package's association with
284/// a package repository.
285///
286/// ## Note
287///
288/// Since some files are optional, we have to take a look at the files in that tarball to determine
289/// which of the files need to be actually extracted.
290fn extract_pkg_files(pkg: &DirEntry, target_dir: &Path, repo_name: &str) -> Result<()> {
291    let pkg_file_name = pkg.file_name().to_string_lossy().to_string();
292    let pkg_name = remove_tarball_suffix(pkg_file_name)?;
293
294    // Peek into the pkg tar to see what kind of files we need to extract.
295    let files = get_tar_file_list(pkg)?;
296
297    // Create the target directory where all the files should be extracted to.
298    let pkg_target_dir = target_dir.join(repo_name).join(pkg_name);
299    create_dir_all(&pkg_target_dir)?;
300
301    let mut cmd_args = vec![
302        "-C".to_string(),
303        pkg_target_dir.to_string_lossy().to_string(),
304        "-xf".to_string(),
305        pkg.path().to_string_lossy().to_string(),
306    ];
307
308    // Check for each of the known filetypes, whether it exists in the package.
309    // If it does, add it to the tar command for extraction.
310    for filetype in [".MTREE", ".BUILDINFO", ".PKGINFO", ".INSTALL"] {
311        if files.contains(filetype) {
312            cmd_args.push(filetype.to_string());
313        }
314    }
315
316    // Run the extraction command
317    let output = Command::new("tar")
318        .args(cmd_args)
319        .output()
320        .context(format!("Failed to extract files from pkg {:?}", pkg.path()))?;
321    ensure_success(&output).context("Error while downloading packages via rsync")?;
322
323    Ok(())
324}
325
326/// A small helper function that removes the `.pkg.tar.*` suffix of a tarball.
327/// This is necessary to get the actual package name from a packages full file name.
328pub fn remove_tarball_suffix(pkg_name: String) -> Result<String> {
329    let pkg_name = if let Some(pkg_name) = pkg_name.strip_suffix(".pkg.tar.zst") {
330        pkg_name
331    } else if let Some(pkg_name) = pkg_name.strip_suffix(".pkg.tar.xz") {
332        pkg_name
333    } else {
334        bail!("Found package with unknown tarball compression: {pkg_name:?}");
335    };
336
337    Ok(pkg_name.to_string())
338}