initial commit

2025-04-05 19:31:07 +02:00
commit a5a3133ef0
8 changed files with 1148 additions and 0 deletions

src/duplicates.rs (new file, 326 lines)

@@ -0,0 +1,326 @@
//! Find duplicate files
//!
//! Find duplicate files in a `FileTree` by comparing their SHA-256 hashes.
use crate::FileTree;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
// hash
use sha2::{Digest, Sha256};
use std::io::{BufReader, Read};
// parallelism
use rayon::prelude::*;
use std::sync::{Arc, Mutex};
impl FileTree {
/// Find duplicates in a directory (including sub-directories).
///
/// If the path exists in the tree, find duplicates using SHA-256: if two (or more) files have the same hash, they are considered duplicates.
///
/// Returns a `Vec` containing a `Vec<PathBuf>` for each group of duplicates.
///
/// If the path doesn't exist, or if no duplicates are found, returns `None`.
///
/// # Examples
/// Assuming the directory structure below (files in lowercase, directories in uppercase, and the same letter meaning identical content), `dups` will contain `Some(Vec<Vec<_>>)` with the group `a`, `A/a`, `B/a`.
///
/// ./{A/a,B/a,a,C/c}
///
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let dups = f.duplicates(&p);
/// ```
pub fn duplicates(&self, path: &Path) -> Option<Vec<Vec<PathBuf>>> {
match self.get_children(path) {
Some(value) => {
// every path in subdir (directories+files)
let children: HashSet<PathBuf> = value.into_iter().collect();
// every file (whole tree)
let files: HashSet<PathBuf> = self.files().into_iter().collect();
// every file in the subdir
let intersection: Vec<_> = children.intersection(&files).cloned().collect();
// Arc<Mutex<_>> used to safely update HashMap in parallel
let hashes: Arc<Mutex<HashMap<[u8; 32], Vec<PathBuf>>>> =
Arc::new(Mutex::new(HashMap::new()));
// parallelized loop
intersection.par_iter().for_each(|item| {
if let Ok(file) = fs::File::open(item) {
let mut sha256 = Sha256::new();
// sizable buffer
let mut buffer = [0; 8192];
let mut reader = BufReader::new(file);
// https://rust-lang-nursery.github.io/rust-cookbook/cryptography/hashing.html
loop {
let count = reader.read(&mut buffer).unwrap();
// `?` can't be used here (the rayon closure returns `()`)
// and the unwrap should not fail in this context
if count == 0 {
break;
}
sha256.update(&buffer[..count]);
}
let hash = sha256.finalize();
// Use a Mutex to update HashMap in parallel
let mut locked_hashes = hashes.lock().unwrap();
locked_hashes
.entry(hash.into())
.or_default()
.push(item.clone());
}
});
// extract result from Mutex
// converting Arc<Mutex<HashMap>> into HashMap
let hashes = Arc::try_unwrap(hashes).ok().unwrap().into_inner().unwrap();
let dups = hashes
.values()
.filter(|a| a.len() > 1) // if more than 1 path for a hash it's a duplicate
.cloned()
.collect::<Vec<Vec<PathBuf>>>();
if dups.is_empty() {
None
} else {
Some(dups)
}
}
_ => None,
}
}
/// Collisions shouldn't happen with SHA-256, but this method checks whether any group of duplicates found actually contains a collision.
/// # Examples
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let dups = f.duplicates(&p);
/// let check = FileTree::collision(dups);
/// ```
pub fn collision(dups: Option<Vec<Vec<PathBuf>>>) -> bool {
match dups {
Some(value) => {
for mut subgroup in value {
let path1 = subgroup.pop().unwrap();
while let Some(path2) = subgroup.pop() {
if !Self::raw_compare(&path1, &path2) {
return true; // files contents are different, it's a collision
}
}
}
false
}
None => false, // no collision since there are no duplicates
}
} // no unit test for collision, as finding one is highly unlikely and no SHA-256 collisions are publicly known yet
/// Check if two files have exactly the same content, byte by byte; returns a boolean.
fn raw_compare(path1: &Path, path2: &Path) -> bool {
if let Ok(file1) = fs::File::open(path1) {
if let Ok(file2) = fs::File::open(path2) {
// sizable buffer
let mut buffer1 = [0; 8192];
let mut buffer2 = [0; 8192];
let mut reader1 = BufReader::new(file1);
let mut reader2 = BufReader::new(file2);
loop {
let count1 = reader1.read(&mut buffer1).unwrap();
let count2 = reader2.read(&mut buffer2).unwrap();
if count1 == 0 || count2 == 0 {
if count1 != count2 {
return false;
} // files do not have same size
break;
}
if buffer1[..count1] != buffer2[..count2] {
return false;
}
}
true
} else {
panic!("{}", format!("couldn't read file {}", path2.display()))
}
} else {
panic!("{}", format!("couldn't read file {}", path1.display()))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use std::io::prelude::*;
use std::process::Command;
#[test]
fn duplicate_test() {
// create temporary test directories and files
let _ = Command::new("mkdir")
.arg("/tmp/test_du_dups1/dir1/dir11")
.arg("-p")
.spawn()
.unwrap()
.wait();
let _ = Command::new("mkdir")
.arg("/tmp/test_du_dups1/dir1/dir12")
.arg("-p")
.spawn()
.unwrap()
.wait();
// let's create the same file 3 times
let mut file = File::create("/tmp/test_du_dups1/file1").unwrap();
let _ = file.write_all(b"Hello, world!").unwrap();
let _ = Command::new("cp")
.arg("/tmp/test_du_dups1/file1")
.arg("/tmp/test_du_dups1/dir1/")
.arg("-p")
.spawn()
.unwrap()
.wait();
let _ = Command::new("cp")
.arg("/tmp/test_du_dups1/file1")
.arg("/tmp/test_du_dups1/file2")
.arg("-p")
.spawn()
.unwrap()
.wait();
let _ = Command::new("cp")
.arg("/tmp/test_du_dups1/file1")
.arg("/tmp/test_du_dups1/dir1/dir11/")
.arg("-p")
.spawn()
.unwrap()
.wait();
// let's create a different set of duplicates
let mut file1 = File::create("/tmp/test_du_dups1/file10").unwrap();
let _ = file1.write_all(b"lorem ipsum dolor sit amet").unwrap();
let _ = Command::new("cp")
.arg("/tmp/test_du_dups1/file10")
.arg("/tmp/test_du_dups1/dir1/")
.arg("-p")
.spawn()
.unwrap()
.wait();
// and a unique file here
let mut file2 = File::create("/tmp/test_du_dups1/file100").unwrap();
let _ = file2.write_all(b"consectetur adipiscing elit").unwrap();
let p = Path::new("/tmp/test_du_dups1");
let f = FileTree::new(&p, &false).unwrap();
let mut res = f.duplicates(&p).unwrap(); // unwrap asserts it's not none
res.sort_by_key(|b| b.len());
let expected_vec: Vec<Vec<PathBuf>> = vec![
vec![
Path::new("/tmp/test_du_dups1/file10").to_path_buf(),
Path::new("/tmp/test_du_dups1/dir1/file10").to_path_buf(),
],
vec![
Path::new("/tmp/test_du_dups1/file1").to_path_buf(),
Path::new("/tmp/test_du_dups1/dir1/dir11/file1").to_path_buf(),
Path::new("/tmp/test_du_dups1/dir1/file1").to_path_buf(),
Path::new("/tmp/test_du_dups1/file2").to_path_buf(),
],
];
// using a HashMap to find duplicates, items are not ordered, so we compare them using a set
// first set of duplicates
let expected = expected_vec[0].iter().cloned().collect::<HashSet<_>>();
let actual = res[0].iter().cloned().collect::<HashSet<_>>();
assert_eq!(expected, actual);
// second set of duplicates
let expected = expected_vec[1].iter().cloned().collect::<HashSet<_>>();
let actual = res[1].iter().cloned().collect::<HashSet<_>>();
assert_eq!(expected, actual);
// sets do not allow multiple elements so we have to check vec length separately
assert_eq!(expected_vec[0].len(), res[0].len());
assert_eq!(expected_vec[1].len(), res[1].len());
// we check if we have same number of duplicate groups
assert_eq!(expected_vec.len(), res.len());
//clean
let _ = Command::new("rm")
.arg("-rf")
.arg("/tmp/test_du_dups1")
.spawn()
.unwrap()
.wait();
}
#[test]
fn duplicate_test_empty() {
// create temporary test directories and files
let _ = Command::new("mkdir")
.arg("/tmp/test_du_dups2/dir1")
.arg("-p")
.spawn()
.unwrap()
.wait();
let mut file = File::create("/tmp/test_du_dups2/file10").unwrap();
let _ = file
.write_all(b"Integer faucibus sapien vitae aliquet dapibus")
.unwrap();
let mut file1 = File::create("/tmp/test_du_dups2/dir1/file10").unwrap();
let _ = file1.write_all(b"Pellentesque at pharetra enim").unwrap();
let p = Path::new("/tmp/test_du_dups2");
let f = FileTree::new(&p, &false).unwrap();
let res = f.duplicates(&p);
// should not find duplicates
assert!(res.is_none());
// clean
let _ = Command::new("rm")
.arg("-rf")
.arg("/tmp/test_du_dups2")
.spawn()
.unwrap()
.wait();
}
#[test]
fn file_compare() {
// create temporary test directory and files
let _ = Command::new("mkdir")
.arg("/tmp/test_du_compare")
.arg("-p")
.spawn()
.unwrap()
.wait();
let mut file = File::create("/tmp/test_du_compare/file1").unwrap();
let _ = file.write_all(b"Cras eleifend nisi nibh").unwrap();
let mut file = File::create("/tmp/test_du_compare/file2").unwrap();
let _ = file.write_all(b"Cras eleifend nisi nibh").unwrap();
let mut file = File::create("/tmp/test_du_compare/file3").unwrap();
let _ = file.write_all(b"a blandit elit mattis et").unwrap();
let p1 = Path::new("/tmp/test_du_compare/file1");
let p2 = Path::new("/tmp/test_du_compare/file2");
let p3 = Path::new("/tmp/test_du_compare/file3");
assert!(FileTree::raw_compare(p1, p2));
assert!(!FileTree::raw_compare(p1, p3));
// clean
let _ = Command::new("rm")
.arg("-rf")
.arg("/tmp/test_du_compare")
.spawn()
.unwrap()
.wait();
}
}

src/file_tree.rs (new file, 453 lines)

@@ -0,0 +1,453 @@
//! Tree structure representation
//!
//! Implements a tree structure to represent data on a disk (files and directories).
//!
//! Includes methods to
//! + construct this tree
//! + get a directory's children
//! + get a file's or directory's size
//! + get the files in the tree.
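//!
//! # Examples
//! A minimal usage sketch (following the `no_run` style used elsewhere in these docs; it assumes the tree is built from the current directory):
//! ```no_run
//! let p = Path::new(".");
//! // build the tree, staying quiet about unreadable paths (access_denied = false)
//! let tree = FileTree::new(&p, &false).unwrap();
//! // total size of everything under the root
//! let total = tree.get_size(tree.get_root());
//! // every file (not directory) found in the tree
//! let files = tree.files();
//! ```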
use crate::size::Size;
use std::collections::HashMap;
use std::fs;
use std::io::{Error, ErrorKind};
use std::path::{Path, PathBuf};
/// FileTree structure
pub struct FileTree {
/// tree root (base path)
root: PathBuf,
/// HashMap containing every path in the tree (every file and directory, sub-directories included) and its associated data (size and children).
map: HashMap<PathBuf, EntryNode>,
}
/// A node in the tree: either a file (size only) or a directory (direct children and total size)
enum EntryNode {
File { size: Size },
Path { children: Vec<PathBuf>, size: Size },
}
impl FileTree {
/// Create a new filetree from given path.
///
/// Returns an error if the path doesn't exist or if the user doesn't have read permission on it.
///
/// If the user is missing read permission on a sub-directory, that sub-directory is ignored and the rest of the tree is still built.
///
/// `access_denied = true` prints the ignored paths; `access_denied = false` hides that these paths were ignored.
///
/// # Examples
///
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false);
/// ```
pub fn new(root: &Path, access_denied: &bool) -> std::io::Result<Self> {
if root.exists() {
if let Ok(meta) = fs::metadata(root) {
let r = root.to_path_buf();
if meta.is_dir() {
let entries = fs::read_dir(root)?;
let mut paths: Vec<PathBuf> = entries
.filter_map(|entry| match entry {
Ok(entry) => Some(entry.path()),
Err(_) => None,
})
.collect();
let e = EntryNode::Path {
children: paths.clone(),
size: Size::new(0),
};
let mut map_entry: Vec<(PathBuf, EntryNode)> = Vec::new();
map_entry.push((r.clone(), e));
while let Some(current_path) = paths.pop() {
if !(current_path.is_symlink()) {
// need to find a better way to handle symlinks
if let Ok(meta) = fs::metadata(current_path.clone()) {
if meta.is_dir() {
if let Ok(entries) = fs::read_dir(&current_path) {
let new_paths: Vec<PathBuf> = entries
.filter_map(|entry| match entry {
Ok(entry) => Some(entry.path()),
Err(_) => None,
})
.collect();
let e = EntryNode::Path {
children: new_paths.clone(),
size: Size::new(0),
};
map_entry.push((current_path.clone(), e));
paths.extend(new_paths);
} else if *access_denied {
println!("{} children not processed, might be missing read permission",current_path.display());
}
} else {
// file
let e = EntryNode::File {
size: Size::new(meta.len()),
};
map_entry.push((current_path, e));
}
} else if *access_denied {
println!(
"{} not processed, might be missing read permission",
current_path.display()
);
}
}
}
let hashmap: HashMap<PathBuf, EntryNode> = map_entry.into_iter().collect();
let mut tree = Self {
root: r.clone(),
map: hashmap,
};
tree.set_size();
Ok(tree)
} else {
let e = EntryNode::File {
size: Size::new(meta.len()),
};
Ok(Self {
root: r.clone(),
map: HashMap::from([(r, e)]),
})
}
} else {
Err(Error::new(
ErrorKind::Other,
format!("Error processing path {}", root.display()),
))
}
} else {
Err(Error::new(ErrorKind::Other, "path not found"))
}
}
/// Set the size attribute of every EntryNode in the tree
fn set_size(&mut self) {
let mut paths: Vec<_> = self.map.keys().cloned().collect();
paths.sort_by_key(|b| std::cmp::Reverse(b.as_os_str().len())); // a child's path is necessarily longer than its parent's, so we process the longest paths first
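// e.g. (illustrative paths) the size of "/a/b/sub" is finalized before "/a/b" sums its children, which in turn happens before "/a" does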
for path in paths {
let mut s = Size::new(0);
if let Some(EntryNode::Path { children, .. }) = self.map.get_mut(path.as_path()) {
for child in children.clone() {
// children were already processed before so we can get their size
if let Some(EntryNode::Path { size, .. }) | Some(EntryNode::File { size }) =
self.map.get(child.as_path())
{
s = s + *size;
}
}
}
if let Some(EntryNode::Path { ref mut size, .. }) = self.map.get_mut(path.as_path()) {
*size = s;
}
}
}
/// return FileTree root
/// # Examples
///
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let root = f.get_root();
/// assert_eq!(root, p);
/// ```
pub fn get_root(&self) -> &Path {
self.root.as_path()
}
/// Return a Vec with every path in the subtree found from the given path (recursively).
///
/// Returns None if the given path doesn't exist in the FileTree.
/// # Examples
/// Assuming a directory structure like the one below, `children` will contain `a` (and, recursively, whatever files and directories `a` may contain).
///
/// ./{A/a,B}
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let children = f.get_children(&Path::new("./A"));
/// ```
pub fn get_children(&self, path: &Path) -> Option<Vec<PathBuf>> {
if let Some(vec_paths) = self.get_direct_children(path) {
let mut paths = vec_paths;
let mut temp_paths = paths.clone();
while let Some(current_path) = temp_paths.pop() {
if let Some(vec_paths) = self.get_direct_children(current_path.as_path()) {
paths.extend(vec_paths.clone());
temp_paths.extend(vec_paths);
}
}
Some(paths)
} else {
None
}
}
/// Return the direct children of a given path (non-recursive, depth = 1).
///
/// None if path is a file, or doesn't exist.
///
/// A `Vec<PathBuf>` if it's a directory.
/// # Examples
/// Assuming a directory structure like the one below, `children` will contain `a` and `b`, but not `a/c`.
///
/// ./{A/{a/c,b},B}
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let children = f.get_direct_children(&Path::new("./A"));
/// ```
pub fn get_direct_children(&self, path: &Path) -> Option<Vec<PathBuf>> {
match self.map.get(path) {
Some(EntryNode::Path { children, .. }) => Some(children.clone()),
Some(EntryNode::File { .. }) => None,
_ => None,
}
}
/// If the given path exists in the FileTree, return its size; None if it doesn't. The size can be 0 (an empty directory, for example).
/// # Examples
/// size of entire tree (size from root):
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let size = f.get_size(&p);
/// ```
pub fn get_size(&self, path: &Path) -> Option<Size> {
match self.map.get(path) {
Some(EntryNode::File { size }) | Some(EntryNode::Path { size, .. }) => Some(*size),
_ => None,
}
}
/// Return a Vec with all the files in the FileTree (and only the files); it can be empty.
/// # Examples
/// Assuming a directory structure like the one below (lowercase for files, uppercase for directories), `files` will contain only `a`, `b`, and `c`.
///
/// ./{A/a,B/b,c}
///
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let files = f.files();
/// ```
// an iterator instead of a Vec<PathBuf> would be lazy,
// however, we use files to find an intersection with children in a subtree to find duplicates, an iterator wouldn't be useful for this
pub fn files(&self) -> Vec<PathBuf> {
let mut files = self
.map
.iter()
.filter(|(key, _value)| !key.is_dir())
.map(|(key, _value)| key.clone())
.collect::<Vec<PathBuf>>();
// we order by name because original hashmap is not ordered
files.sort_by(|a, b| a.file_name().cmp(&b.file_name()));
files
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use std::io::prelude::*;
use std::process::Command;
fn set_temp(number: u8) {
// create temporary test directories and files
let _ = Command::new("mkdir")
.arg(format!("/tmp/test_du{}/dir1", number))
.arg("-p")
.spawn()
.unwrap()
.wait();
let mut file = File::create(format!("/tmp/test_du{}/file1", number)).unwrap();
let _ = file.write_all(b"Hello, world!").unwrap();
}
fn clear_temp(number: u8) {
let _ = Command::new("rm")
.arg("-rf")
.arg(format!("/tmp/test_du{}", number))
.spawn()
.unwrap()
.wait();
}
#[test]
fn simple_filetree() {
set_temp(1);
let p = Path::new("/tmp/test_du1");
let p_buf = p.to_path_buf();
let f = FileTree::new(&p, &false).unwrap();
let paths = [
Path::new("/tmp/test_du1").to_path_buf(),
Path::new("/tmp/test_du1/dir1").to_path_buf(),
Path::new("/tmp/test_du1/file1").to_path_buf(),
];
assert_eq!(f.root, p_buf);
for path in paths {
assert_eq!(f.map.contains_key(&path), true);
}
// for root
let expected_children = vec![
Path::new("/tmp/test_du1/dir1").to_path_buf(),
Path::new("/tmp/test_du1/file1").to_path_buf(),
];
let expected_size = "13"; // "Hello, wold!" has 13 characters, so file1 = 26B, directories are 0B
let actual_value = f.map.get(&p_buf).unwrap();
match actual_value {
EntryNode::Path {
children: actual_children,
size,
} => {
assert_eq!(*actual_children, expected_children);
assert_eq!(format!("{}", *size), expected_size)
}
_ => panic!(), // shouldn't happen this entry refers to a dir
}
// for dir1
let expected_children: Vec<PathBuf> = vec![];
let actual_value = f
.map
.get(&Path::new("/tmp/test_du1/dir1").to_path_buf())
.unwrap();
match actual_value {
EntryNode::Path {
children: actual_children,
size,
} => {
assert_eq!(*actual_children, expected_children);
assert_eq!(format!("{}", *size), "0"); // dir1 is empty
}
_ => panic!(), // shouldn't happen this entry refers to a dir
}
// for file1
let expected_size = "13";
let actual_value = f
.map
.get(&Path::new("/tmp/test_du1/file1").to_path_buf())
.unwrap();
match actual_value {
EntryNode::File { size } => assert_eq!(format!("{}", *size), expected_size),
_ => panic!(), // this entry refers to a file
}
clear_temp(1);
}
#[test]
fn simple_filetree_file() {
set_temp(2);
// for a file
let p2 = Path::new("/tmp/test_du2/file1");
let p_buf2 = p2.to_path_buf();
let f2 = FileTree::new(&p2, &false).unwrap();
assert_eq!(f2.root, p_buf2);
assert_eq!(f2.map.contains_key(&p_buf2), true);
let expected_size = "13"; // "Hello, world!" has 13 characters
let actual_value = f2.map.get(&p_buf2).unwrap();
match actual_value {
EntryNode::File { size } => assert_eq!(format!("{}", *size), expected_size),
_ => panic!(), // shouldn't happen f2 refers to a file
}
// clean
clear_temp(2);
}
#[test]
fn filetree_fail() {
let p = Path::new("/tmp/test_du/shouldntexist");
let f = FileTree::new(&p, &false);
assert!(f.is_err());
}
#[test]
fn get_root_test() {
set_temp(3);
let p = Path::new("/tmp/test_du3");
let f = FileTree::new(&p, &false).unwrap();
assert_eq!(f.get_root(), p);
clear_temp(3);
}
#[test]
fn files_test() {
set_temp(4);
let mut file = File::create("/tmp/test_du4/dir1/file2").unwrap();
let _ = file.write_all(b"Hello, world!").unwrap();
let p = Path::new("/tmp/test_du4");
let f = FileTree::new(&p, &false).unwrap();
let expected = vec![
Path::new("/tmp/test_du4/file1").to_path_buf(),
Path::new("/tmp/test_du4/dir1/file2").to_path_buf(),
];
assert_eq!(f.files(), expected);
clear_temp(4);
}
#[test]
fn children() {
set_temp(5);
let mut file = File::create("/tmp/test_du5/dir1/file2").unwrap();
let _ = file.write_all(b"Hello, world!").unwrap();
let p = Path::new("/tmp/test_du5");
let f = FileTree::new(&p, &false).unwrap();
let expected = vec![
Path::new("/tmp/test_du5/dir1").to_path_buf(),
Path::new("/tmp/test_du5/file1").to_path_buf(),
Path::new("/tmp/test_du5/dir1/file2").to_path_buf(),
];
assert_eq!(f.get_children(&p), Some(expected));
let p = Path::new("/tmp/test_du5/dir1");
let expected = vec![Path::new("/tmp/test_du5/dir1/file2").to_path_buf()];
assert_eq!(f.get_children(&p), Some(expected));
let p = Path::new("/tmp/test_du5/file1");
assert_eq!(f.get_children(&p), None);
let p = Path::new("/tmp/test_du5/shouldntexist");
assert_eq!(f.get_children(&p), None);
clear_temp(5);
}
#[test]
fn size_test() {
set_temp(6);
let mut file = File::create("/tmp/test_du6/dir1/file2").unwrap();
let _ = file.write_all(b"Hello, world!").unwrap();
let _ = Command::new("mkdir")
.arg("/tmp/test_du6/dir3")
.arg("-p")
.spawn()
.unwrap()
.wait();
let p = Path::new("/tmp/test_du6");
let f = FileTree::new(&p, &false).unwrap();
let expected_size = "26"; // file1 and file2
assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
let p = Path::new("/tmp/test_du6/file1");
let expected_size = "13";
assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
let p = Path::new("/tmp/test_du6/dir3");
let expected_size = "0";
assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
let p = Path::new("/tmp/test_du6/shouldntexist");
assert!(f.get_size(&p).is_none());
clear_temp(6);
}
}

src/main.rs (new file, 134 lines)

@@ -0,0 +1,134 @@
//! Basic disk usage analyzer
//!
//! Includes a command to show the disk usage file tree, ordered by size (default) or by name. Files can be filtered by suffix ("ends with").
//!
//! Includes a command to find duplicate files.
//! # Examples
//! show ~ disk usage
//! ```text
//! $ cargo run -- usage ~
//! ```
//! show pdf files in ~/Downloads , ordered by name
//! ```text
//! $ cargo run -- usage ~/Downloads --sort name --filter ".pdf"
//! ```
//! same command but with short options
//! ```text
//! $ cargo run -- usage ~/Downloads -s name -f ".pdf"
//! ```
//! show duplicates in /tmp
//! ```text
//! $ cargo run -- duplicates /tmp
//! ```
//! show duplicates in /tmp and check for collision
//! ```text
//! $ cargo run -- duplicates /tmp --check
//! ```
//! same command but with short option
//! ```text
//! $ cargo run -- duplicates /tmp -c
//! ```
//! for both duplicates and usage, to show directories not processed because of missing read permission
//! ```text
//! $ cargo run -- duplicates ~ --access-denied
//! $ cargo run -- usage ~ --access-denied
//! ```
//! same with short option
//! ```text
//! $ cargo run -- duplicates ~ -a
//! $ cargo run -- usage ~ -a
//! ```
mod duplicates;
mod file_tree;
mod print_tree;
mod size;
use clap::{Parser, Subcommand};
use file_tree::FileTree;
use std::path::{Path, PathBuf};
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(propagate_version = true)]
struct Cli {
#[command(subcommand)]
command: Commands,
}
#[derive(Subcommand)]
enum Commands {
/// Show the disk usage tree for the given path
Usage {
/// (default '.')
path: Option<PathBuf>,
/// (default "size")
#[arg(long = "sort", short = 's')]
sort: Option<String>,
/// (default None)
#[arg(long = "filter", short = 'f')]
filter: Option<String>,
/// (default false)
#[arg(long = "access-denied", short = 'a')]
access: bool,
},
/// Find duplicates in given path
Duplicates {
/// (default '.')
path: Option<PathBuf>,
/// (default false)
#[arg(long = "check", short = 'c')]
check: bool,
/// (default false)
#[arg(long = "access-denied", short = 'a')]
access: bool,
},
}
fn main() -> std::io::Result<()> {
let cli = Cli::parse();
match &cli.command {
Commands::Usage {
path,
sort,
filter,
access,
} => {
let path = path.as_deref().unwrap_or(Path::new("."));
match sort {
Some(ref s) if s == "size" => {
FileTree::new(path, access)?.show(0, filter.as_deref())
}
None => FileTree::new(path, access)?.show(0, filter.as_deref()),
Some(ref s) if s == "name" => {
FileTree::new(path, access)?.show(1, filter.as_deref())
}
_ => (),
}
}
Commands::Duplicates {
path,
check,
access,
} => {
let path = path.as_deref().unwrap_or(Path::new("."));
let duplicate_files = FileTree::new(path, access)?.duplicates(path);
if let Some(ref item_vec) = duplicate_files {
for (i, item) in item_vec.iter().enumerate() {
println!("DUPLICATES {}", i + 1);
for path in item {
println!("{}", path.display());
}
println!("\n")
}
} else {
println!("No duplicate found in {}\n", path.display());
}
if *check {
println!("collision found: {}", FileTree::collision(duplicate_files));
}
}
}
Ok(())
}

src/print_tree.rs (new file, 103 lines)

@@ -0,0 +1,103 @@
//! Print a human-readable file tree.
//!
//! Print the nodes of a FileTree (files and directories) with their sizes and the tree structure.
//!
//! Includes filter and sort arguments.
//!
//! Filter by suffix ("ends with"), preserving the tree structure when printing.
//! Sort by name or by size.
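//!
//! # Examples
//! With the small tree used in the file_tree tests (file1 = 13 bytes, dir1 empty), `f.show(0, None)` would print roughly the following
//! (size, a tab, then the path, indented one tab per depth level; tabs shown here as spaces):
//! ```text
//! 13      /tmp/test_du1
//!         13      /tmp/test_du1/file1
//!         0       /tmp/test_du1/dir1
//! ```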
use crate::file_tree::FileTree;
use crate::size::Size;
use std::path::Path;
impl FileTree {
/// Print a FileTree in the console in a human readable format.
///
/// sort = 0 is size order
///
/// sort = 1 is lexicographical order
///
/// Any other integer for sort won't fail but nodes won't be sorted and order might seem random.
///
/// filter_suffix is for "end with" filter, will still preserve tree structure (will show parent directories if one descendant pass the filter).
///
/// use filter_suffix = None to show everything
/// # Examples
///
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// f.show(1,None); // show everything ordered by name
/// f.show(0,Some(".pdf")); // show pdf files (preserving tree structure) ordered by size
/// ```
pub fn show(&self, sort: u8, filter_suffix: Option<&str>) {
let root = self.get_root();
self.print_entry(root, &self.get_size(root).unwrap(), 0);
self.show_recursive(root, 1, sort, filter_suffix);
}
/// Recursively visit a FileTree to print it.
///
/// Do not use alone; the wrapper is show().
fn show_recursive(&self, path: &Path, indent: usize, sort: u8, filter_suffix: Option<&str>) {
if let Some(mut children) = self.get_direct_children(path) {
match sort {
0 => {
// Sort children by size in descending order
children.sort_by_key(|b| std::cmp::Reverse(self.get_size(b)))
}
1 => children.sort_by(|a, b| a.file_name().cmp(&b.file_name())),
_ => (),
}
for child in children {
if let Some(entry) = self.get_size(&child) {
if let Some(suffix) = filter_suffix {
let mut descendant = self.get_children(&child).unwrap_or_default();
descendant.push(child.clone());
let mut found = false;
// we check if at least one descendant passes the filter
// currently we do this more often than necessary
// it might be more efficient to work with get_direct_children and memoize which parents pass the filter
// or to visit descendants in reverse order, keeping a boolean (default false) for each item
for item in descendant {
if item
.clone()
.into_os_string()
.into_string()
.unwrap()
.ends_with(suffix)
{
found = true;
break;
}
}
if found {
self.print_entry(&child, &entry, indent);
}
} else {
self.print_entry(&child, &entry, indent);
}
if let Some(_grandchildren) = self.get_direct_children(&child) {
self.show_recursive(&child, indent + 1, sort, filter_suffix);
}
}
}
}
}
/// Print a single entry.
///
/// Do not use alone; it is part of show_recursive().
fn print_entry(&self, path: &Path, size: &Size, indent: usize) {
let indentation = "\t".repeat(indent);
println!("{}{}\t{}", indentation, size, path.display());
}
}
// no unit test here for now, as show() is mostly "visual"

src/size.rs (new file, 92 lines)

@@ -0,0 +1,92 @@
//! Size in bytes
//!
//! Implements fmt::Display to show the size in a human-readable unit (B, KB, MB, GB, TB).
//!
//! For bytes, the unit isn't printed.
//!
//! Implements std::ops::Add.
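//!
//! # Examples
//! A small sketch of the expected formatting (values taken from the unit tests below; `no_run` as elsewhere in these docs):
//! ```no_run
//! let s = Size::new(1024) + Size::new(676); // 1700 bytes in total
//! // 1700 / 1024 ≈ 1.66, rounded to one decimal
//! assert_eq!(format!("{}", s), "1.7KB");
//! ```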
use std::fmt;
#[derive(PartialEq, PartialOrd, Eq, Ord, Copy, Clone)]
pub struct Size(u64);
//#[derive(Debug)]
impl Size {
pub fn new(bytes: u64) -> Self {
Self(bytes)
}
}
impl fmt::Display for Size {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let length: u32 = self.0.checked_ilog10().unwrap_or(0); //number of digits -1
// match over number of 3 digits groups, 1000 ~ 1024
// 1000^n and 1024^n have the same number of digits if n<98
// here n<=4 (TB)
//
// if size in KB or above we want at most 1 decimal
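// worked example (illustrative): 2_411_724 bytes -> ilog10 = 6, 6 / 3 = 2 -> MB branch,
// 2_411_724 / 1_048_576 ≈ 2.2999 -> one decimal -> "2.3MB" (matches the display() test below)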
match length / 3 {
0 => write!(f, "{}", self.0 as f32), // we assume no unit printed means Bytes.
1 => write!(
f,
"{}KB",
(((self.0 as f32) / 1024.0) * 10.0_f32).round().trunc() / 10.0
),
2 => write!(
f,
"{}MB",
(((self.0 as f32) / 1048576.0) * 10.0_f32).round().trunc() / 10.0
),
3 => write!(
f,
"{}GB",
(((self.0 as f32) / 1073741824.0) * 10.0_f32)
.round()
.trunc()
/ 10.0
),
4 => write!(
f,
"{}TB",
(((self.0 as f32) / 1099511627776.0) * 10.0_f32)
.round()
.trunc()
/ 10.0
),
_ => panic!(), // unlikely to have petabytes of files (and above) on consumer-grade hardware
}
}
}
impl std::ops::Add for Size {
type Output = Self;
fn add(self, other: Self) -> Self::Output {
Self(self.0 + other.0)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn create() {
assert_eq!(Size::new(1).0, 1);
}
#[test]
fn add() {
let s1 = Size::new(60);
let s2 = Size::new(40);
assert_eq!((s1 + s2).0, 100);
}
#[test]
fn display() {
assert_eq!(10u32.checked_ilog10().unwrap_or(0) + 1, 2);
assert_eq!(format!("{}", Size::new(1024)), "1KB");
// 1700/1024 = 1.66015625
assert_eq!(format!("{}", Size::new(1700)), "1.7KB");
// 2411724/(1024^2) = 2.299999237060547
assert_eq!(format!("{}", Size::new(2411724)), "2.3MB");
}
}