extern crate shlex; extern crate structopt; use std::borrow::Borrow; use std::io::{Read, Write, BufReader, BufRead}; use std::path::{Path,PathBuf}; use std::collections::HashMap; use std::os::linux::fs::MetadataExt as MetadataExtLinux; use crate::structopt::StructOpt; macro_rules! s_default_target_separator { () => { ";" } } fn main() -> Result<(), i32> { match process_args() { ProcessArgsResult::Ok(run_paths, cfg) => { for paths in run_paths { run(paths, &cfg); } Ok(()) }, ProcessArgsResult::Exit => Ok(()), ProcessArgsResult::ExitError => Err(1), } } struct Config { dry_run: bool, min_size: u64, verbosity: i8, no_brace_output: bool } #[derive(StructOpt)] #[structopt( about="Hardlink duplicate files recursively\nSymlinks are treated as normal files", usage=concat!(env!("CARGO_PKG_NAME"), " [OPTION]... TARGET... ['", s_default_target_separator!(), "' TARGET...]") )] struct CLIArguments { #[structopt(short, long, parse(from_occurrences), help="Increase verbosity")] verbose: i8, #[structopt(short, long, parse(from_occurrences), help="Decrease verbosity")] quiet: i8, #[structopt(long, help="Disable brace notation for output\n Ex: /home/user/{dir,backup}/file")] no_brace_output: bool, #[structopt(long, help="Perform no operations on the filesystem")] dry_run: bool, #[structopt(short="i", help="Prompt once before operating\nDoesn't occurs if no targets are provided")] prompt: bool, #[structopt(short, long, value_name="VALUE", help="Minimum file size to be considered for hardlinking\nNever goes below 1 (the default)")] min_size: Option, #[structopt(short, long, value_name="SEPARATOR", help=concat!("Separator between sets of targets (default: ", s_default_target_separator!(), ")"))] separator: Option, #[structopt(long, value_name="FILE", help="File to source arguments from (can be '-' for stdin)")] argument_file: Option, #[structopt(value_name="TARGET", help="Target files and directories (recursive)\nEach SEPARATOR denotes a new set of targets\n Each set of targets are separate from all other sets\n All targets must be on the same device\nAll symlinks are ignored\n'-' is not treated as special")] targets: Vec, } /// return whether or not user gave confirmation fn prompt_confirm<'a, T: Borrow<[Y]>, Y: AsRef>(run_targets: &[T]) -> bool { println!("Are you sure you want to link all duplicates in each of these sets of targets?"); for spaths in run_targets { println!(" {}", shlex::join(spaths.borrow().iter().map(|s| s.as_ref()))); } print!("> "); std::io::stdout().flush().unwrap_or_else(|_| ()); let mut response = String::new(); std::io::stdin().read_line(&mut response).unwrap_or_else( |_| { eprintln!("Problem reading input"); std::process::exit(1); } ); response.to_lowercase().starts_with("y") } fn read_file_lines(path: &Path, dest: &mut Vec) -> Result<(), String> { if !path.is_file() { return Err(format!("File does not exist or is not a normal file ({})", shlex::quote(&path.to_string_lossy()))); } if let Ok(f) = std::fs::File::open(path) { let reader = BufReader::new(f); for line in reader.lines() { match line { Ok(line) => dest.push(line), Err(err) => return Err(format!("Error reading line: {}", err)) } } Ok(()) } else { Err(format!("Could not open {}", shlex::quote(&path.to_string_lossy()))) } } enum ProcessArgsResult { Ok(Vec>, Config), Exit, ExitError, } /// may exit fn process_args() -> ProcessArgsResult { let mut args = CLIArguments::from_args(); let verbosity = args.verbose - args.quiet; let config = Config { min_size: args.min_size.map(|v| if v > 1 { v } else { 1 }).unwrap_or(1), no_brace_output: args.no_brace_output, dry_run: args.dry_run, verbosity }; if let Some(arg_file) = args.argument_file { if !args.targets.is_empty() { eprintln!("No targets should be provided as cli arguments if arguments are being read from file"); return ProcessArgsResult::ExitError; } let path = Path::new(&arg_file); if let Err(s) = read_file_lines(path, &mut args.targets) { eprintln!("Error reading argument file: {}", s); return ProcessArgsResult::ExitError; } } let run_targets: Vec> = split_vec(&args.targets, &args.separator.unwrap_or(s_default_target_separator!().to_string())); if run_targets.is_empty() { if verbosity > 0 { println!("No targets provided"); } return ProcessArgsResult::Exit; } if args.prompt { if !prompt_confirm(&run_targets) { std::process::exit(0); } } let mut bad = false; let run_paths: Vec> = run_targets.iter().enumerate().map( |(_,spaths)| spaths.iter().map( |spath| Path::new(spath).canonicalize().unwrap_or_else( |_| { eprintln!("Failed to retrieve absolute path for {}", shlex::quote(spath)); bad = true; Default::default() } ) ).collect() ).collect(); if bad { return ProcessArgsResult::ExitError; } for paths in &run_paths { if let Err(s) = check_all_same_device(paths) { eprintln!("{}", s); return ProcessArgsResult::ExitError; } } ProcessArgsResult::Ok(run_paths, config) } /// exit on error fn get_st_dev(file: &PathBuf) -> Result { if let Ok(metadata) = std::fs::metadata(file) { Ok(metadata.st_dev()) } else { Err(format!("Failed to retrive device id for {}", shlex::quote(&file.to_string_lossy()))) } } fn check_all_same_device(paths: &[PathBuf]) -> Result<(), String> { if paths.len() <= 1 { return Ok(()) } let first_device_id = get_st_dev(&paths[0])?; let mut wrong: Vec<&PathBuf> = Vec::new(); for path in &paths[1..] { if get_st_dev(path)? != first_device_id { wrong.push(path); } } if wrong.is_empty() { Ok(()) } else { let mut s = String::with_capacity(wrong.len()*128); // 75 max estimated len of path, 53 for prefix msg + nl for path in wrong { s.push_str("Device ids must all be the same; got different for: {}"); s.push_str(&shlex::quote(&path.to_string_lossy())); s.push_str("\n"); } s.pop(); // remove last newline Err(s) } } /// perform a full run fn run(paths: Vec, cfg: &Config) { let mut registry: HashMap> = HashMap::new(); for path in paths { register(path.to_path_buf(), &mut registry, cfg); } registry.retain(|_,files| files.len() >= 2); let mut stdout_buffer = if cfg.verbosity >= 0 { let stdout = std::io::stdout(); let stdout_buffer = std::io::BufWriter::new(stdout.lock()); Some(stdout_buffer) } else { None }; if let Some(stdout_buffer) = &mut stdout_buffer { if cfg.verbosity >= 0 { writeln!(stdout_buffer, "Considering {} total files for duplicates", registry.iter().map(|(_,files)| files.len()).sum::()).unwrap(); } } for (fsize, mut files) in registry { if files.len() > 8 { files.sort_by_key(|path| path.file_name().unwrap_or_default().to_string_lossy().to_string()); } if let Some(stdout_buffer) = &mut stdout_buffer { if cfg.verbosity > 1 { writeln!(stdout_buffer, "Considering {} files of size {} for duplicates", files.len(), fsize).unwrap(); } } for i in (0..files.len()).rev() { let f1 = &files[i]; for j in (0..i).rev() { let f2 = &files[j]; if !are_hardlinked(f1, f2) && cmp(f1, f2).unwrap_or(false) { if !cfg.dry_run { if let Err(msg) = hardlink(f1, f2) { eprintln!("{}: {}", msg, format_pair(&f1.to_string_lossy(), &f2.to_string_lossy(), cfg)); continue } } if let Some(stdout_buffer) = &mut stdout_buffer { if cfg.verbosity >= 0 { writeln!(stdout_buffer, "hardlinked {}", format_pair(&f1.to_string_lossy(), &f2.to_string_lossy(), cfg)).unwrap(); } } } } } } } fn hardlink(f1: &PathBuf, f2: &PathBuf) -> Result<(), &'static str> { if let Err(_) = std::fs::remove_file(f2) { Err("Failed to remove second file for hardlinking") } else if let Err(_) = std::fs::hard_link(f1, f2) { // same as ln in terms of args: left args's inode becomes right arg's inode match std::fs::copy(f1, f2) { Ok(_) => Err("Failed to hardlink (copied instead)"), Err(_) => Err("Failed to hardlink or copy") } } else { Ok(()) } } fn format_pair(f1s: &str, f2s: &str, cfg: &Config) -> String { if cfg.no_brace_output { return format!( "{} {}", shlex::quote(&f1s), shlex::quote(&f2s) ) } let prefix = common_prefix(&f1s, &f2s); let suffix = common_suffix(&f1s, &f2s); let prefixlong = prefix.len() > 2; let suffixlong = suffix.len() > 2; if prefixlong && suffixlong { format!( "{}{{{},{}}}{}", shlex::quote(prefix), shlex::quote(&f1s[ prefix.len()..std::cmp::max(prefix.len(), f1s.len()-suffix.len()) ]), shlex::quote(&f2s[ prefix.len()..std::cmp::max(prefix.len(), f2s.len()-suffix.len()) ]), shlex::quote(suffix) ) } else if prefixlong { format!( "{}{{{},{}}}", shlex::quote(prefix), shlex::quote(&f1s[prefix.len()..]), shlex::quote(&f2s[prefix.len()..]) ) } else if suffixlong { format!( "{{{},{}}}{}", shlex::quote(&f1s[..f1s.len()-suffix.len()]), shlex::quote(&f2s[..f2s.len()-suffix.len()]), shlex::quote(suffix), ) } else { format!( "{} <-> {}", shlex::quote(&f1s), shlex::quote(&f2s) ) } } /// recursively register path or its contents if directory into registry fn register(path: PathBuf, registry: &mut HashMap>, cfg: &Config) { if let Ok(metadata) = std::fs::symlink_metadata(&path) { if metadata.file_type().is_symlink() { return } if path.is_file() { let size = metadata.st_size(); if size >= cfg.min_size { registry.entry(size).or_insert_with(|| Vec::new()).push(path); } } else if path.is_dir() { if let Ok(entries) = std::fs::read_dir(path) { for entry in entries { if let Ok(entry) = entry { register(entry.path(), registry, cfg); } } } } } } fn are_hardlinked(f1: &PathBuf, f2: &PathBuf) -> bool { if let (Ok(md1), Ok(md2)) = (std::fs::metadata(f1), std::fs::metadata(f2)) { md1.st_ino() == md2.st_ino() } else { false } } /// check equality of contents of two paths to files fn cmp(f1: &PathBuf, f2: &PathBuf) -> std::io::Result { if let (Ok(mut f1), Ok(mut f2)) = (std::fs::File::open(f1), std::fs::File::open(f2)) { cmp_files(&mut f1, &mut f2) } else { Ok(false) } } /// check equality of contents of two open files fn cmp_files(f1: &mut std::fs::File, f2: &mut std::fs::File) -> std::io::Result { let buff1: &mut [u8] = &mut [0; 1024]; let buff2: &mut [u8] = &mut [0; 1024]; loop { let l1 = f1.read(buff1)?; let l2 = f2.read(buff2)?; if l1 != l2 { // different sizes return Ok(false); } if l1 == 0 { // end of both files return Ok(true); } if &buff1[0..l1] != &buff2[0..l2] { // compare data return Ok(false); } } } fn common_prefix<'a>(s1: &'a str, s2: &'a str) -> &'a str { let len = s1 .chars() .zip(s2.chars()) .take_while(|(char1, char2)| char1 == char2) .count(); &s1[..len] } fn common_suffix<'a>(s1: &'a str, s2: &'a str) -> &'a str { let len = s1 .chars() .rev() .zip(s2.chars().rev()) .take_while(|(char1, char2)| char1 == char2) .count(); &s1[s1.len() - len..] } fn split_vec<'a, T: std::cmp::PartialEq>(input: &'a [T], delimiter: &T) -> Vec> { let mut result: Vec> = Vec::new(); let mut chunk_start = 0; for (i,item) in input.iter().enumerate() { if item != delimiter { continue } if i == chunk_start { // zero size chunk continue } result.push(input[chunk_start..i].iter().collect::>()); chunk_start = i+1; // next chunk starts on next index } if chunk_start < input.len() { result.push(input[chunk_start..].iter().collect::>()); } result } #[cfg(test)] mod tests { use super::*; #[test] fn _split_vec() { let v: Vec<_> = vec![";", "hi", "bye", ";", "1", ";", ";", "2", "2", ";"].into_iter().map(|s| s.to_string()).collect(); let res = split_vec(&v[..], &";".to_string()); println!("{:?}", v); println!("{:?}", res); } }