diff --git a/Cargo.toml b/Cargo.toml index 3864f13..100d1d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,11 @@ [package] name = "lndups" -version = "0.1.2" +version = "0.1.3" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] shlex = "1.3" +smallvec = "1.13" structopt = "0.3" diff --git a/prebuilt-x86-64-linux/lndups b/prebuilt-x86-64-linux/lndups index 93da4de..9c89648 100755 Binary files a/prebuilt-x86-64-linux/lndups and b/prebuilt-x86-64-linux/lndups differ diff --git a/src/main.rs b/src/main.rs index 0c6ac57..97ca443 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,11 +1,14 @@ extern crate shlex; +extern crate smallvec; extern crate structopt; use std::borrow::Borrow; -use std::io::{Read, Write, BufReader, BufRead}; -use std::path::{Path, PathBuf}; +use std::cell::RefCell; use std::collections::HashMap; +use std::io::{Read, Write, BufReader, BufRead}; use std::os::linux::fs::MetadataExt as MetadataExtLinux; +use std::path::{Path, PathBuf}; use crate::structopt::StructOpt; +use crate::smallvec::*; @@ -13,96 +16,6 @@ macro_rules! s_arg_target_file_name { () => { "target-file" } } macro_rules! s_default_target_separator { () => { ";" } } - -fn main() -> Result<(), i32> { - let mut args = CLIArguments::from_args(); - let verbosity = args.verbose - args.quiet; - - let config = Config { - min_size: args.min_size.map(|v| if v > 1 { v } else { 1 }).unwrap_or(1), - no_brace_output: args.no_brace_output, - dry_run: args.dry_run, - verbosity - }; - - if let Some(arg_file) = args.file_containing_targets { - if !args.targets.is_empty() { - eprintln!("No targets should be provided as cli arguments if arguments are being read from file"); - return Err(1); - } - if let Err(s) = { - if arg_file == "-" { - read_lines(std::io::stdin().lock(), &mut args.targets) - } else { - read_file_lines(Path::new(&arg_file), &mut args.targets) - } - } { - eprintln!("Error reading file containing targets: {}", s); - return Err(1); - } - } - - for target in &args.targets { - if target.contains('\0') { - eprintln!("Paths can never contain null byte: {}", target); - return Err(1); - } - } - let run_targets: Vec> = split_vec(&args.targets, &args.separator.unwrap_or(s_default_target_separator!().to_string())); - - if run_targets.is_empty() { - if verbosity > 0 { - println!("No targets provided"); - } - return Ok(()); - } - - let mut bad = false; - let run_paths: Vec> = run_targets.iter().enumerate().map( - |(_,spaths)| spaths.iter().map( - |spath| Path::new(spath).canonicalize().unwrap_or_else( - |_| { - eprintln!("Failed to retrieve absolute path for {}", shlex::try_quote(spath).unwrap()); - bad = true; - Default::default() - } - ) - ).collect() - ).collect(); - if bad { - return Err(1); - } - - if args.prompt { - if !prompt_confirm(&run_targets) { - return Ok(()); - } - } - - - for paths in &run_paths { - if let Err(s) = check_all_same_device(paths) { - eprintln!("{}", s); - return Err(1); - } - } - - - for paths in run_paths { - run(paths, &config); - } - Ok(()) -} - - -struct Config { - dry_run: bool, - min_size: u64, - verbosity: i8, - no_brace_output: bool -} - - #[derive(StructOpt)] #[structopt( about="Hardlink duplicate files recursively\nSymlinks are treated as normal files", @@ -157,12 +70,295 @@ struct CLIArguments { " All targets must be on the same device\n", "All symlinks are ignored\n", "'-' is not treated as special\n", - "Mutually exclusive with ", s_arg_target_file_name!(), + "Mutually exclusive with --", s_arg_target_file_name!(), ))] targets: Vec, } + +struct Config { + dry_run: bool, + min_size: u64, + verbosity: i8, + no_brace_output: bool +} + + + +fn main() -> Result<(), i32> { + let mut args = CLIArguments::from_args(); + let verbosity = args.verbose - args.quiet; + + let config = Config { + min_size: args.min_size.map(|v| if v > 1 { v } else { 1 }).unwrap_or(1), + no_brace_output: args.no_brace_output, + dry_run: args.dry_run, + verbosity + }; + + let run_targets: Vec> = obtain_run_targets( + args.file_containing_targets.as_ref(), + &mut args.targets, + args.separator.as_ref().unwrap_or(&s_default_target_separator!().to_string()), + verbosity, + )?; + if run_targets.is_empty() { + if verbosity >= 1 { + println!("No targets provided"); + } + return Ok(()); + } + + let run_paths: Vec> = obtain_run_paths( + run_targets.iter().map(|v| v.iter()), + verbosity, + )?; + + for paths in &run_paths { + if let Err(s) = check_all_same_device(paths) { + eprintln!("{}", s); + return Err(1); + } + } + + if run_paths.len() == 0 { + return Ok(()); + } + + if args.prompt { + if !prompt_confirm(&run_targets) { + return Ok(()); + } + } + + for paths in run_paths { + run(paths, &config); + } + + Ok(()) +} + + +/// result may be empty; contents each nonempty +fn obtain_run_targets<'a>( + arg_file: Option<&String>, + arg_targets: &'a mut Vec, + separator: &String, verbosity: i8 +) -> Result>, i32> { + if let Some(arg_file) = &arg_file { + if !arg_targets.is_empty() { + if verbosity >= 0 { + eprintln!("No targets should be provided as cli arguments if arguments are being read from file"); + } + return Err(1); + } + if let Err(s) = { + if *arg_file == "-" { + read_lines(std::io::stdin().lock(), arg_targets) + } else { + read_file_lines(Path::new(&arg_file), arg_targets) + } + } { + if verbosity >= 0 { + eprintln!("Error reading file containing targets: {}", s); + } + return Err(1); + } + } else { + for target in arg_targets.iter() { + if target.contains('\0') { + if verbosity >= 0 { + eprintln!("Paths can never contain null byte: {}", target); + } + return Err(1); + } + } + } + + let mut run_targets = split_vec(arg_targets, &separator); + for i in (0..run_targets.len()).rev() { + if run_targets[i].len() == 0 { + run_targets.swap_remove(i); + } + } + Ok(run_targets) +} + + +/// result has no symlinks; may be empty; contents each nonempty +fn obtain_run_paths(run_targets: T, verbosity: i8) -> Result>, i32> +where + T: Iterator + ExactSizeIterator, + Y: Iterator + ExactSizeIterator, + U: AsRef, +{ + let mut run_paths: Vec> = Vec::with_capacity(run_targets.len()); + for spaths in run_targets { + let mut paths = Vec::with_capacity(spaths.len()); + for spath in spaths { + let path = Path::new(spath.as_ref()).canonicalize().map_err(|_| { + if verbosity >= 1 { + eprintln!("Failed to retrieve absolute path for {}", shlex::try_quote(spath.as_ref()).unwrap()); + } + 1 + })?; + let pwmd = PathWithMetadata::new(path).map_err(|s| { + if verbosity >= 1 { + eprintln!("{}", s); + } + 1 + })?; + if !pwmd.md().file_type().is_symlink() { + paths.push(pwmd); + } + } + if paths.len() > 0 { + run_paths.push(paths); + } + } + Ok(run_paths) +} + + +/// perform a full run +fn run(pwmds: Vec, cfg: &Config) { + let mut registry: HashMap> = HashMap::new(); + for pwmd in pwmds { + register(pwmd, &mut registry, cfg); + } + registry.retain(|_,files| files.len() >= 2); + + let mut stdout_buffer = (cfg.verbosity >= 0).then(|| std::io::BufWriter::new(std::io::stdout().lock())); + + if let Some(stdout_buffer) = &mut stdout_buffer { + if cfg.verbosity >= 0 { + writeln!(stdout_buffer, "Considering {} total files for duplicates", registry.iter().map(|(_,files)| files.len()).sum::()).unwrap(); + } + } + + for (fsize, pwmds) in registry { + run_one_size(fsize, &pwmds, cfg, stdout_buffer.as_mut()); + } +} + +fn run_one_size(fsize: u64, pwmds: &[PathWithMetadata], cfg: &Config, mut stdout_buffer: Option<&mut W>) { + if let Some(stdout_buffer) = stdout_buffer.as_mut() { + if cfg.verbosity >= 1 { + writeln!(stdout_buffer, "Considering {} files of size {} for duplicates", pwmds.len(), fsize).unwrap(); + } + } + // if cfg.verbosity >= 0 { + // pwmds.sort_by_key(|pwmd| pwmd.path.file_name().unwrap_or_default().to_string_lossy().to_string()); + // } + let mut by_inode: Vec> = Vec::with_capacity((pwmds.len() as f64 * 0.8) as usize); // each nonempty + let mut inodes: Vec = Vec::with_capacity(by_inode.len()); + for pwmd in pwmds { + let inode: u64 = pwmd.md().st_ino(); + match inodes.binary_search(&inode) { + Ok(i) => { + by_inode[i].push(pwmd); + }, + Err(i) => { + inodes.insert(i, inode); + by_inode.insert(i, smallvec![pwmd]); + } + } + } + drop(inodes); + by_inode.sort_by(|a,b| b.len().cmp(&a.len())); // descending size order + + // compare each with eachother + let mut i = 0; + while i < by_inode.len() { + let mut j = i+1; + while j < by_inode.len() { + let (keeps, replaces) = get2mut(&mut by_inode, i, j); + if hardlink_all(keeps, replaces, cfg, stdout_buffer.as_mut()) { + by_inode.swap_remove(j); + } else { + j += 1; + } + } + i += 1; + } +} + + +/// recursively register path or its contents if directory into registry +/// eprints errors +fn register( + pwmd: PathWithMetadata, + registry: &mut HashMap>, + cfg: &Config, +) { + if pwmd.md().file_type().is_symlink() { + return; + } + + if pwmd.path.is_file() { + let size = pwmd.md().st_size(); + if size >= cfg.min_size { + registry.entry(size).or_default().push(pwmd); + } + return; + } + + if pwmd.path.is_dir() { match std::fs::read_dir(&pwmd.path) { + Ok(entries) => for entry in entries { match entry { + Ok(entry) => match PathWithMetadata::new(entry.path()) { + Ok(child_pwmd) => register(child_pwmd, registry, cfg), + Err(s) => if cfg.verbosity >= 1 { + eprintln!("{}", s); + }, + }, + Err(error) => if cfg.verbosity >= 1 { + eprintln!("Failed to inspect {}: {}", shlex::try_quote(&pwmd.path.to_string_lossy()).unwrap(), error); + }, + } }, + Err(error) => if cfg.verbosity >= 1 { + eprintln!("Failed to read dir {}: {}", shlex::try_quote(&pwmd.path.to_string_lossy()).unwrap(), error); + }, + } } +} + + + +struct PathWithMetadata { + pub path: PathBuf, + md: RefCell, +} +impl PathWithMetadata { + pub fn new(path: PathBuf) -> Result{ + let md = RefCell::new(Self::get_md(&path)?); + Ok(PathWithMetadata{ path, md }) + } + #[inline(always)] + pub fn md(&self) -> std::cell::Ref { + self.md.borrow() + } + pub fn reset_md(&self) -> Result<(), String> { + *self.md.borrow_mut() = Self::get_md(&self.path)?; + Ok(()) + } + fn get_md(path: &PathBuf) -> Result { + std::fs::symlink_metadata(path).map_err(|_| format!("Failed to retrive metadata for {}", shlex::try_quote(&path.to_string_lossy()).unwrap())) + } + +} +impl AsRef for PathWithMetadata { + fn as_ref(&self) -> &PathBuf { + return &self.path; + } +} +impl AsRef for PathWithMetadata { + fn as_ref(&self) -> &Path { + return &self.path.as_ref(); + } +} + + + /// return whether or not user gave confirmation fn prompt_confirm<'a, T: Borrow<[Y]>, Y: AsRef>(run_targets: &[T]) -> bool { println!("Are you sure you want to link all duplicates in each of these sets of targets?"); @@ -183,13 +379,9 @@ fn prompt_confirm<'a, T: Borrow<[Y]>, Y: AsRef>(run_targets: &[T]) -> bool response.to_lowercase().starts_with("y") } - -fn read_lines(reader: R, dest: &mut Vec) -> Result<(), String> { +fn read_lines(reader: impl BufRead, dest: &mut Vec) -> Result<(), String> { for line in reader.lines() { - match line { - Ok(line) => dest.push(line), - Err(err) => return Err(format!("Error reading line: {}", err)) - } + dest.push(line.map_err(|e| format!("Error reading line: {}", e))?); } Ok(()) } @@ -198,116 +390,84 @@ fn read_file_lines(path: &Path, dest: &mut Vec) -> Result<(), String> { if !path.is_file() { return Err(format!("File does not exist or is not a normal file ({})", shlex::try_quote(&path.to_string_lossy()).unwrap())); } - if let Ok(f) = std::fs::File::open(path) { - let reader = BufReader::new(f); - read_lines(reader, dest) - } else { - Err(format!("Could not open {}", shlex::try_quote(&path.to_string_lossy()).unwrap())) - } + let reader = BufReader::new(std::fs::File::open(path).map_err( + |e| format!("Could not open {}: {}", shlex::try_quote(&path.to_string_lossy()).unwrap(), e) + )?); + read_lines(reader, dest) } -/// exit on error -fn get_st_dev(file: &PathBuf) -> Result { - if let Ok(metadata) = std::fs::metadata(file) { - Ok(metadata.st_dev()) - } else { - Err(format!("Failed to retrive device id for {}", shlex::try_quote(&file.to_string_lossy()).unwrap())) - } -} - -fn check_all_same_device(paths: &[PathBuf]) -> Result<(), String> { - if paths.len() <= 1 { +fn check_all_same_device(pwmds: &[PathWithMetadata]) -> Result<(), String> { + if pwmds.len() <= 1 { return Ok(()) } - let first_device_id = get_st_dev(&paths[0])?; - let mut wrong: Vec<&PathBuf> = Vec::new(); - for path in &paths[1..] { - if get_st_dev(path)? != first_device_id { - wrong.push(path); - } + let mut by_dev: HashMap> = Default::default(); + for pwmd in pwmds.iter() { + by_dev.entry(pwmd.md().st_dev()).or_default().push(pwmd); } - if wrong.is_empty() { - Ok(()) - } else { - let mut s = String::with_capacity(wrong.len()*128); // 75 max estimated len of path, 53 for prefix msg + nl - for path in wrong { - s.push_str("Device ids must all be the same; got different for: {}"); - s.push_str(&shlex::try_quote(&path.to_string_lossy()).unwrap()); - s.push_str("\n"); - } - s.pop(); // remove last newline - Err(s) + if by_dev.len() <= 1 { + return Ok(()); } + let mut lines = Vec::with_capacity(1+by_dev.len()); + lines.push(String::from("Device ids must all be the same; got paths on different devices:")); + lines.extend(by_dev.into_iter().map(|(dev,pwmds)| { + if pwmds.len() == 1 { + format!(" Device {}: {} path: {}", dev, pwmds.len(), &shlex::try_quote(&pwmds[0].path.to_string_lossy()).unwrap()) + } else { + format!(" Device {}: {} paths", dev, pwmds.len()) + } + })); + Err(lines.join("\n")) } -/// perform a full run -fn run(paths: Vec, cfg: &Config) { - let mut registry: HashMap> = HashMap::new(); +/// get two mutable references in an array +/// expects correct inputs +fn get2mut<'a, T>(v: &'a mut [T], i: usize, j: usize) -> (&'a mut T, &'a mut T) { + let (left, right) = v.split_at_mut(j); + (&mut left[i], &mut right[0]) +} - for path in paths { - register(path.to_path_buf(), &mut registry, cfg); + +fn hardlink(keep: &PathWithMetadata, replace: &PathWithMetadata) -> Result<(), String> { + std::fs::remove_file(&replace.path).map_err(|_| "Failed to remove for hardlinking")?; + std::fs::hard_link(&keep.path, &replace.path).map_err(|_| { + match std::fs::copy(&keep.path, &replace.path) { + Ok(_) => "Failed to hardlink (copied instead)", + Err(_) => "Failed to hardlink or copy" // awful scenario but i believe it is impossible since i don't see how you could remove a file yet not create one in its place + } + })?; + replace.reset_md()?; + Ok(()) +} + +/// returns whether linking was done +/// eprints errors +fn hardlink_all<'a, 'b, T, W: Write>(keeps: &'a mut SmallVec, replaces: &'a mut SmallVec, cfg: &Config, mut stdout_buffer: Option<&mut W>) -> bool +where T: smallvec::Array, +{ + if !cmp(&replaces.first().unwrap().path, &keeps.first().unwrap().path).unwrap_or(false) { + return false; } - registry.retain(|_,files| files.len() >= 2); - - let mut stdout_buffer = if cfg.verbosity >= 0 { - let stdout = std::io::stdout(); - let stdout_buffer = std::io::BufWriter::new(stdout.lock()); - Some(stdout_buffer) - } else { - None - }; - - if let Some(stdout_buffer) = &mut stdout_buffer { - if cfg.verbosity >= 0 { - writeln!(stdout_buffer, "Considering {} total files for duplicates", registry.iter().map(|(_,files)| files.len()).sum::()).unwrap(); - } - } - - for (fsize, mut files) in registry { - if files.len() > 8 { - files.sort_by_key(|path| path.file_name().unwrap_or_default().to_string_lossy().to_string()); - } - if let Some(stdout_buffer) = &mut stdout_buffer { - if cfg.verbosity > 1 { - writeln!(stdout_buffer, "Considering {} files of size {} for duplicates", files.len(), fsize).unwrap(); - } - } - for i in (0..files.len()).rev() { - let f1 = &files[i]; - for j in (0..i).rev() { - let f2 = &files[j]; - if !are_hardlinked(f1, f2) && cmp(f1, f2).unwrap_or(false) { - if !cfg.dry_run { - if let Err(msg) = hardlink(f1, f2) { - eprintln!("{}: {}", msg, format_pair(&f1.to_string_lossy(), &f2.to_string_lossy(), cfg)); - continue - } - } - if let Some(stdout_buffer) = &mut stdout_buffer { - if cfg.verbosity >= 0 { - writeln!(stdout_buffer, "hardlinked {}", format_pair(&f1.to_string_lossy(), &f2.to_string_lossy(), cfg)).unwrap(); - } - } + for replace in replaces.into_iter() { + let keep = keeps.first().unwrap(); + if !cfg.dry_run { + if let Err(msg) = hardlink(keep, replace) { + if cfg.verbosity >= 0 { + eprintln!("{}: {}", msg, format_pair(&keep.path.to_string_lossy(), &replace.path.to_string_lossy(), cfg)); } + continue // path no longer valid } } - } -} - - -fn hardlink(f1: &PathBuf, f2: &PathBuf) -> Result<(), &'static str> { - if let Err(_) = std::fs::remove_file(f2) { - Err("Failed to remove second file for hardlinking") - } else if let Err(_) = std::fs::hard_link(f1, f2) { // same as ln in terms of args: left args's inode becomes right arg's inode - match std::fs::copy(f1, f2) { - Ok(_) => Err("Failed to hardlink (copied instead)"), - Err(_) => Err("Failed to hardlink or copy") + if let Some(stdout_buffer) = stdout_buffer.as_mut() { + if cfg.verbosity >= 0 { + writeln!(stdout_buffer, "hardlinked {}", format_pair(&keep.path.to_string_lossy(), &replace.path.to_string_lossy(), cfg)).unwrap(); + } } - } else { - Ok(()) + drop(keep); + keeps.push(replace); } + true } @@ -356,49 +516,14 @@ fn format_pair(f1s: &str, f2s: &str, cfg: &Config) -> String { } -/// recursively register path or its contents if directory into registry -fn register(path: PathBuf, registry: &mut HashMap>, cfg: &Config) { - if let Ok(metadata) = std::fs::symlink_metadata(&path) { - if metadata.file_type().is_symlink() { - return - } - - if path.is_file() { - let size = metadata.st_size(); - if size >= cfg.min_size { - registry.entry(size).or_insert_with(|| Vec::new()).push(path); - } - } else if path.is_dir() { - if let Ok(entries) = std::fs::read_dir(path) { - for entry in entries { - if let Ok(entry) = entry { - register(entry.path(), registry, cfg); - } - } - } - } - } -} - - -fn are_hardlinked(f1: &PathBuf, f2: &PathBuf) -> bool { - if let (Ok(md1), Ok(md2)) = (std::fs::metadata(f1), std::fs::metadata(f2)) { - md1.st_ino() == md2.st_ino() - } else { - false - } -} - - /// check equality of contents of two paths to files -fn cmp(f1: &PathBuf, f2: &PathBuf) -> std::io::Result { - if let (Ok(mut f1), Ok(mut f2)) = (std::fs::File::open(f1), std::fs::File::open(f2)) { - cmp_files(&mut f1, &mut f2) - } else { Ok(false) } +/// does not check sizes +fn cmp(f1: impl AsRef, f2: impl AsRef) -> std::io::Result { + cmp_read(std::fs::File::open(f1)?, std::fs::File::open(f2)?) } /// check equality of contents of two open files -fn cmp_files(f1: &mut std::fs::File, f2: &mut std::fs::File) -> std::io::Result { +fn cmp_read(mut f1: impl Read, mut f2: impl Read) -> std::io::Result { let buff1: &mut [u8] = &mut [0; 1024]; let buff2: &mut [u8] = &mut [0; 1024]; loop { @@ -436,6 +561,7 @@ fn common_suffix<'a>(s1: &'a str, s2: &'a str) -> &'a str { } +/// double delimiters will result in empty vecs fn split_vec<'a, T: std::cmp::PartialEq>(input: &'a [T], delimiter: &T) -> Vec> { let mut result: Vec> = Vec::new(); @@ -462,9 +588,8 @@ mod tests { use super::*; #[test] fn _split_vec() { - let v: Vec<_> = vec![";", "hi", "bye", ";", "1", ";", ";", "2", "2", ";"].into_iter().map(|s| s.to_string()).collect(); + let v: Vec<_> = vec![";", ";", ";"].into_iter().map(|s| s.to_string()).collect(); let res = split_vec(&v[..], &";".to_string()); - println!("{:?}", v); - println!("{:?}", res); + assert_eq!(res.len(), 2) } }