dev_scripts/sync/mirror/
mod.rs

1mod rsync_changes;
2
3use std::{
4    collections::HashSet,
5    fs::{create_dir_all, remove_dir_all},
6    path::{Path, PathBuf},
7    process::Command,
8};
9
10use anyhow::{Context, Result, anyhow, bail};
11use log::{debug, info, trace};
12use rayon::iter::{IntoParallelIterator, ParallelIterator};
13
14use super::{PackageRepositories, filenames_in_dir};
15use crate::{cmd::ensure_success, ui::get_progress_bar};
16
/// The entry point for downloading any data from package mirrors.
///
/// Bundles the settings shared by the sync methods: where data is stored locally, which mirror
/// is queried and which repositories are processed.
pub struct MirrorDownloader {
    /// The destination folder into which files should be downloaded.
    ///
    /// The sync methods create their `download/...` and extraction subdirectories below this
    /// path.
    pub dest: PathBuf,
    /// The mirror url from which files will be downloaded.
    ///
    /// The value is interpolated into `rsync://{mirror}/{repository}/os/x86_64/...`, so it is
    /// given without the `rsync://` scheme prefix.
    pub mirror: String,
    /// The repositories that should be downloaded.
    ///
    /// Each entry's string representation is used as the repository name in mirror URLs and in
    /// local directory names.
    pub repositories: Vec<PackageRepositories>,
    /// Whether to extract all packages (regardless of changes).
    ///
    /// When `false`, only items that `rsync` reported as updated are extracted.
    pub extract_all: bool,
}
28
29impl MirrorDownloader {
30    /// Download all official repository file databases and unpack them.
31    /// They contain the following files:
32    ///
33    /// - `desc`
34    /// - `files`
35    pub fn sync_remote_databases(&self) -> Result<()> {
36        let download_dir = self.dest.join("download/databases/");
37        let target_dir = self.dest.join("databases");
38
39        if !download_dir.exists() {
40            create_dir_all(&download_dir).context("Failed to create download directory")?;
41        }
42
43        if !target_dir.exists() {
44            create_dir_all(&target_dir)
45                .context("Failed to create pacman cache target directory")?;
46        }
47
48        for repo in self.repositories.iter() {
49            let name = repo.to_string();
50            info!("Downloading database for repository {name}");
51
52            let filename = format!("{name}.files");
53            let file_source = format!("rsync://{}/{name}/os/x86_64/{filename}", self.mirror);
54
55            let download_dest = download_dir.join(filename);
56
57            // Download the db from the mirror
58            let mut db_sync_command = Command::new("rsync");
59            db_sync_command
60                .args([
61                    "--recursive",
62                    "--perms",
63                    "--times",
64                    // Report changes status
65                    "--itemize-changes",
66                    // Copy files instead of symlinks
67                    // Symlinks may point to files up the tree of where we're looking at,
68                    // which is why normal symlinks would be invalid.
69                    "--copy-links",
70                ])
71                .arg(file_source)
72                .arg(&download_dest);
73
74            trace!("Running command: {db_sync_command:?}");
75            let output = db_sync_command
76                .output()
77                .context(format!("Failed to run rsync for pacman db {name}"))?;
78
79            if !output.status.success() {
80                bail!("rsync failed for pacman db {name}");
81            }
82
83            trace!(
84                "Rsync reports: {}",
85                String::from_utf8_lossy(&output.stdout).trim()
86            );
87
88            let repo_target_dir = target_dir.join(&name);
89            if repo_target_dir.exists() {
90                if !self.extract_all
91                    && rsync_changes::Report::parser(&output.stdout)
92                        .map_err(|e| anyhow!("{e}"))?
93                        .file_content_updated()?
94                        .is_none()
95                {
96                    debug!("Database {name} is unchanged upstream, skipping extraction");
97                    continue;
98                } else {
99                    // There are old versions of the files, remove them.
100                    remove_dir_all(&repo_target_dir).context(format!(
101                        "Failed to remove old repository: {repo_target_dir:?}"
102                    ))?;
103                }
104            }
105            create_dir_all(&repo_target_dir)?;
106
107            debug!("Extracting db to {repo_target_dir:?}");
108
109            // Extract the db into the target folder.
110            let mut tar_command = Command::new("tar");
111            tar_command
112                .arg("-x")
113                .arg("-f")
114                .arg(&download_dest)
115                .arg("-C")
116                .arg(&repo_target_dir);
117
118            trace!("Running command: {tar_command:?}");
119            let output = tar_command
120                .output()
121                .context(format!("Failed to start tar to extract pacman dbs {name}"))?;
122            ensure_success(&output)?;
123        }
124
125        Ok(())
126    }
127
128    /// Download all official repository packages and extract all files that're interesting to us.
129    /// Specifically:
130    ///
131    ///  - `.BUILDINFO`
132    ///  - `.MTREE`
133    ///  - `.PKGINFO`
134    ///  - `.INSTALL` (Optional)
135    pub fn sync_remote_packages(&self) -> Result<()> {
136        let download_dir = self.dest.join("download/packages");
137        let target_dir = self.dest.join("packages");
138
139        if !download_dir.exists() {
140            create_dir_all(&download_dir).context("Failed to create download directory")?;
141        }
142
143        if !target_dir.exists() {
144            create_dir_all(&target_dir)
145                .context("Failed to create pacman cache target directory")?;
146        }
147
148        for repo in self.repositories.iter() {
149            let repo_name = repo.to_string();
150            info!("Downloading packages for repository {repo_name}");
151
152            let file_source = format!("rsync://{}/{repo_name}/os/x86_64/", self.mirror);
153            let download_dest = download_dir.join(&repo_name);
154            let changed = self.download_packages(&repo_name, file_source, &download_dest)?;
155
156            let packages: Vec<PathBuf> = if self.extract_all {
157                let files: Vec<_> =
158                    std::fs::read_dir(&download_dest)?.collect::<Result<_, std::io::Error>>()?;
159                files
160                    .into_iter()
161                    .map(|entry| entry.path().to_owned())
162                    .collect::<Vec<_>>()
163            } else {
164                changed
165                    .into_iter()
166                    .map(|pkg| download_dest.join(pkg))
167                    .collect()
168            }
169            .into_iter()
170            // Filter out any dotfiles.
171            // Those might be temporary download artifacts from previous rsync runs.
172            .filter(|entry| {
173                if let Some(path) = entry.to_str() {
174                    !path.starts_with('.')
175                } else {
176                    false
177                }
178            })
179            .collect();
180
181            info!("Extracting packages for repository {repo_name}");
182            let progress_bar = get_progress_bar(packages.len() as u64);
183            packages
184                .into_par_iter()
185                .map(|pkg| {
186                    // Extract all files that we're interested in.
187                    let result = extract_pkg_files(&pkg, &target_dir, &repo_name);
188                    progress_bar.inc(1);
189                    result
190                })
191                .collect::<Result<Vec<()>>>()?;
192            // Finish the progress_bar
193            progress_bar.finish_with_message("Finished extracting files for repository {repo}.");
194        }
195
196        // Clean up package data of packages that're no longer on the mirror.
197        for repo in self.repositories.iter() {
198            let mirror_packages = filenames_in_dir(&download_dir.join(repo.to_string()))?
199                .into_iter()
200                .map(remove_tarball_suffix)
201                .collect::<Result<HashSet<String>>>()?;
202
203            let local_packages = filenames_in_dir(&target_dir.join(repo.to_string()))?;
204
205            // Get the packages that no longer exist on the mirror.
206            let removed_pkgs: Vec<&String> = local_packages.difference(&mirror_packages).collect();
207
208            // Delete the package data
209            if !removed_pkgs.is_empty() {
210                info!("Found {} packages for cleanup:", removed_pkgs.len());
211                for removed in removed_pkgs {
212                    debug!("Removing local package: {removed}");
213                    remove_dir_all(target_dir.join(repo.to_string()).join(removed)).context(
214                        format!(
215                            "Failed to remove local package {:?}",
216                            target_dir.join(repo.to_string()).join(removed)
217                        ),
218                    )?;
219                }
220            }
221        }
222
223        Ok(())
224    }
225
226    /// Download all packages of a given arch package repository into the download directory.
227    fn download_packages(
228        &self,
229        repo_name: &str,
230        file_source: String,
231        download_dest: &PathBuf,
232    ) -> Result<Vec<PathBuf>> {
233        let mut cmd = Command::new("rsync");
234        cmd.args([
235            "--recursive",
236            "--perms",
237            "--times",
238            "--delete",
239            "--hard-links",
240            // Copy actual files instead of symlinks.
241            // Most symlinks point to files up the tree of where we're looking at,
242            // which is why normal symlinks would be invalid.
243            "--copy-links",
244            // Check for deletions once everything has been transferred
245            "--delete-after",
246            // Only overwrite updated files in the very end.
247            // This allows for a somewhat "atomic" update process.
248            "--delay-updates",
249            // Print structured change information to be parsed
250            "--itemize-changes",
251            // Exclude package signatures
252            "--exclude=*.sig",
253        ]);
254
255        // Don't download any files related to repository sync databases (signatures are generally
256        // excluded by the rsync call).
257        for variation in [
258            ".db",
259            ".db.tar.gz",
260            ".db.tar.gz.old",
261            ".links.tar.gz",
262            ".files",
263            ".files.tar.gz",
264            ".files.tar.gz.old",
265        ] {
266            cmd.arg(format!("--exclude={repo_name}{variation}"));
267        }
268
269        trace!("Running command: {cmd:?}");
270        let output = cmd
271            .arg(file_source)
272            .arg(download_dest)
273            .output()
274            .context(format!(
275                "Failed to start package rsync for pacman db {repo_name}"
276            ))?;
277
278        if !output.status.success() {
279            bail!("Package rsync failed for pacman db {repo_name}");
280        }
281
282        let mut changed_files = Vec::new();
283
284        for line in output.stdout.split(|&b| b == b'\n') {
285            if let Some(path) = rsync_changes::Report::parser(line)
286                .map_err(|e| anyhow!("{e}"))?
287                .file_content_updated()?
288            {
289                trace!("File at {path:?} changed, marking for extraction");
290                changed_files.push(path.to_owned());
291            }
292        }
293
294        Ok(changed_files)
295    }
296}
297
298/// Get the list of all files inside a given compressed tarball.
299///
300/// This function provides data which is necessary to determine which subset of files should be
301/// extracted.
302fn get_tar_file_list(pkg: &Path) -> Result<HashSet<String>> {
303    let mut tar_command = Command::new("tar");
304    tar_command.arg("-tf").arg(pkg);
305    trace!("Running command: {tar_command:?}");
306    let peek_output = tar_command
307        .output()
308        .context(format!("Failed to peek into pkg {pkg:?}"))?;
309    ensure_success(&peek_output).context("Error while peeking into package")?;
310
311    Ok(String::from_utf8_lossy(&peek_output.stdout)
312        .lines()
313        .map(|line| line.to_string())
314        .collect())
315}
316
317/// Use `tar` to extract relevant package metadata and script files from packages files.
318///
319/// This function attempts to extract ".MTREE", ".BUILDINFO", ".PKGINFO" and ".INSTALL" files.
320/// Extracted files are placed in a directory structure that reflects the package's association with
321/// a package repository.
322///
323/// ## Note
324///
325/// Since some files are optional, we have to take a look at the files in that tarball to determine
326/// which of the files need to be actually extracted.
327///
328/// # Panics
329///
330/// Panics if `pkg` points to a directory.
331fn extract_pkg_files(pkg: &Path, target_dir: &Path, repo_name: &str) -> Result<()> {
332    let pkg_file_name = pkg
333        .file_name()
334        .expect("got directory when expecting file")
335        .to_string_lossy()
336        .to_string();
337    let pkg_name = remove_tarball_suffix(pkg_file_name)?;
338
339    // Peek into the pkg tar to see what kind of files we need to extract.
340    let files = get_tar_file_list(pkg)?;
341
342    // Create the target directory where all the files should be extracted to.
343    let pkg_target_dir = target_dir.join(repo_name).join(pkg_name);
344    create_dir_all(&pkg_target_dir)?;
345
346    let mut cmd_args = vec![
347        "-C".to_string(),
348        pkg_target_dir.to_string_lossy().to_string(),
349        "-xf".to_string(),
350        pkg.to_string_lossy().to_string(),
351    ];
352
353    // Check for each of the known filetypes, whether it exists in the package.
354    // If it does, add it to the tar command for extraction.
355    for filetype in [".MTREE", ".BUILDINFO", ".PKGINFO", ".INSTALL"] {
356        if files.contains(filetype) {
357            cmd_args.push(filetype.to_string());
358        }
359    }
360
361    // Run the extraction command
362    let mut tar_command = Command::new("tar");
363    tar_command.args(cmd_args);
364
365    trace!("Running command: {tar_command:?}");
366    let output = tar_command
367        .output()
368        .context(format!("Failed to extract files from pkg {pkg:?}"))?;
369    ensure_success(&output).context("Error while downloading packages via rsync")?;
370
371    Ok(())
372}
373
374/// A small helper function that removes the `.pkg.tar.*` suffix of a tarball.
375/// This is necessary to get the actual package name from a packages full file name.
376pub fn remove_tarball_suffix(pkg_name: String) -> Result<String> {
377    let pkg_name = if let Some(pkg_name) = pkg_name.strip_suffix(".pkg.tar.zst") {
378        pkg_name
379    } else if let Some(pkg_name) = pkg_name.strip_suffix(".pkg.tar.xz") {
380        pkg_name
381    } else {
382        bail!("Found package with unknown tarball compression: {pkg_name:?}");
383    };
384
385    Ok(pkg_name.to_string())
386}