initial commit

This commit is contained in:
Sam Hadow 2025-04-05 19:31:07 +02:00
commit a5a3133ef0
8 changed files with 1148 additions and 0 deletions

18
.gitignore vendored Normal file
View File

@ -0,0 +1,18 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb
# Swap Files
.*.kate-swp
.swp.*

11
COPYRIGHT Normal file
View File

@ -0,0 +1,11 @@
Copyright 2025 Sam Hadow (hadow.fr)
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
4. Redistributions of any form whatsoever must retain the following acknowledgment: 'This product includes software developed by "Sam Hadow" (http://hadow.fr/).'
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

11
Cargo.toml Normal file
View File

@ -0,0 +1,11 @@
[package]
name = "du"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
clap = { version = "4.4.6", features = ["derive"] }
rayon = "1.8.0"
sha2 = "0.10.8"

326
src/duplicates.rs Normal file
View File

@ -0,0 +1,326 @@
//! find duplicate files
//!
//! Find duplicate files in a FileTree comparing their hash.
use crate::FileTree;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
// hash
use sha2::{Digest, Sha256};
use std::io::{BufReader, Read};
// parallelism
use rayon::prelude::*;
use std::sync::{Arc, Mutex};
impl FileTree {
    /// Find duplicates in a directory (including sub-directories).
    ///
    /// If `path` exists in the tree, every file below it is hashed with sha256;
    /// files sharing a hash are reported as duplicates.
    ///
    /// Returns a `Vec` containing a `Vec<PathBuf>` for each group of duplicates.
    ///
    /// If `path` doesn't exist in the tree, or if no duplicates are found, returns `None`.
    ///
    /// # Examples
    /// Assuming a directory structure like this (files in lowercase, directories in
    /// uppercase, same letter meaning identical content), `dups` will contain
    /// `Some(...)` with the group a, A/a, B/a:
    ///
    /// ./{A/a,B/a,a,C/c}
    ///
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let dups = f.duplicates(&p);
    /// ```
    pub fn duplicates(&self, path: &Path) -> Option<Vec<Vec<PathBuf>>> {
        match self.get_children(path) {
            Some(value) => {
                // every path below `path` (directories + files)
                let children: HashSet<PathBuf> = value.into_iter().collect();
                // every file of the whole tree
                let files: HashSet<PathBuf> = self.files().into_iter().collect();
                // intersection = every file below `path`
                let intersection: Vec<_> = children.intersection(&files).cloned().collect();
                // Arc<Mutex<_>> so the HashMap can be safely updated from rayon workers
                let hashes: Arc<Mutex<HashMap<[u8; 32], Vec<PathBuf>>>> =
                    Arc::new(Mutex::new(HashMap::new()));
                // hash files in parallel
                intersection.par_iter().for_each(|item| {
                    if let Ok(file) = fs::File::open(item) {
                        let mut sha256 = Sha256::new();
                        // sizable buffer
                        let mut buffer = [0; 8192];
                        let mut reader = BufReader::new(file);
                        // https://rust-lang-nursery.github.io/rust-cookbook/cryptography/hashing.html
                        loop {
                            match reader.read(&mut buffer) {
                                Ok(0) => break, // EOF
                                Ok(count) => sha256.update(&buffer[..count]),
                                // unreadable mid-file: skip this file instead of
                                // panicking the whole worker pool
                                Err(_) => return,
                            }
                        }
                        let hash = sha256.finalize();
                        // lock only for the map insertion
                        let mut locked_hashes = hashes.lock().unwrap();
                        locked_hashes
                            .entry(hash.into())
                            .or_default()
                            .push(item.clone());
                    }
                });
                // extract the HashMap back out of the Arc<Mutex<_>>
                // (all workers are done, so try_unwrap cannot fail here)
                let hashes = Arc::try_unwrap(hashes).ok().unwrap().into_inner().unwrap();
                let dups = hashes
                    .values()
                    .filter(|a| a.len() > 1) // more than 1 path for a hash means duplicates
                    .cloned()
                    .collect::<Vec<Vec<PathBuf>>>();
                if dups.is_empty() {
                    None
                } else {
                    Some(dups)
                }
            }
            _ => None,
        }
    }
    /// Collision shouldn't happen with sha256, but this verifies that every group
    /// of "duplicates" really has identical byte content.
    ///
    /// Returns `true` if a collision (same hash, different bytes) is found.
    /// # Examples
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let dups = f.duplicates(&p);
    /// let check = FileTree::collision(dups);
    /// ```
    pub fn collision(dups: Option<Vec<Vec<PathBuf>>>) -> bool {
        match dups {
            Some(value) => {
                for mut subgroup in value {
                    // compare every file in the group against one reference file
                    let path1 = subgroup.pop().unwrap();
                    while let Some(path2) = subgroup.pop() {
                        if !Self::raw_compare(&path1, &path2) {
                            return true; // file contents differ, it's a collision
                        }
                    }
                }
                false
            }
            None => false, // no duplicates, so no collision
        }
    } // no unittest for collision as finding a sha256 collision is highly unlikely, and there is no known collision yet
    /// Check whether 2 files have exactly the same byte content, return a boolean.
    ///
    /// Unreadable bytes mid-file are treated as "different".
    ///
    /// # Panics
    /// Panics if either file cannot be opened.
    fn raw_compare(path1: &Path, path2: &Path) -> bool {
        let file1 = fs::File::open(path1)
            .unwrap_or_else(|_| panic!("couldn't read file {}", path1.display()));
        let file2 = fs::File::open(path2)
            .unwrap_or_else(|_| panic!("couldn't read file {}", path2.display()));
        // compare byte by byte through BufReader: unlike comparing fixed-size
        // chunks, short reads cannot desynchronize the two streams and report
        // identical files as different
        let mut bytes1 = BufReader::new(file1).bytes();
        let mut bytes2 = BufReader::new(file2).bytes();
        loop {
            match (bytes1.next(), bytes2.next()) {
                // both streams ended together: identical
                (None, None) => return true,
                (Some(Ok(b1)), Some(Ok(b2))) => {
                    if b1 != b2 {
                        return false;
                    }
                }
                // different lengths, or a read error on either side
                _ => return false,
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::prelude::*;
    #[test]
    fn duplicate_test() {
        // build the fixture with std::fs (portable, no external mkdir/cp processes)
        fs::create_dir_all("/tmp/test_du_dups1/dir1/dir11").unwrap();
        fs::create_dir_all("/tmp/test_du_dups1/dir1/dir12").unwrap();
        // the same content in 4 places
        let mut file = File::create("/tmp/test_du_dups1/file1").unwrap();
        file.write_all(b"Hello, world!").unwrap();
        fs::copy("/tmp/test_du_dups1/file1", "/tmp/test_du_dups1/dir1/file1").unwrap();
        fs::copy("/tmp/test_du_dups1/file1", "/tmp/test_du_dups1/file2").unwrap();
        fs::copy(
            "/tmp/test_du_dups1/file1",
            "/tmp/test_du_dups1/dir1/dir11/file1",
        )
        .unwrap();
        // a second, smaller group of duplicates
        let mut file1 = File::create("/tmp/test_du_dups1/file10").unwrap();
        file1.write_all(b"lorem ipsum dolor sit amet").unwrap();
        fs::copy("/tmp/test_du_dups1/file10", "/tmp/test_du_dups1/dir1/file10").unwrap();
        // and a unique file
        let mut file2 = File::create("/tmp/test_du_dups1/file100").unwrap();
        file2.write_all(b"consectetur adipiscing elit").unwrap();
        let p = Path::new("/tmp/test_du_dups1");
        let f = FileTree::new(&p, &false).unwrap();
        let mut res = f.duplicates(&p).unwrap(); // unwrap asserts it's not none
        // order groups by size so res[0] is the pair and res[1] the quadruple
        res.sort_by_key(|b| b.len());
        let expected_vec: Vec<Vec<PathBuf>> = vec![
            vec![
                Path::new("/tmp/test_du_dups1/file10").to_path_buf(),
                Path::new("/tmp/test_du_dups1/dir1/file10").to_path_buf(),
            ],
            vec![
                Path::new("/tmp/test_du_dups1/file1").to_path_buf(),
                Path::new("/tmp/test_du_dups1/dir1/dir11/file1").to_path_buf(),
                Path::new("/tmp/test_du_dups1/dir1/file1").to_path_buf(),
                Path::new("/tmp/test_du_dups1/file2").to_path_buf(),
            ],
        ];
        // duplicates() builds its result from a HashMap, so items inside a group
        // are unordered: compare each group as a set, then compare lengths
        // separately (a set would collapse repeated elements)
        for (expected_group, actual_group) in expected_vec.iter().zip(res.iter()) {
            let expected = expected_group.iter().cloned().collect::<HashSet<_>>();
            let actual = actual_group.iter().cloned().collect::<HashSet<_>>();
            assert_eq!(expected, actual);
            assert_eq!(expected_group.len(), actual_group.len());
        }
        // we check if we have same number of duplicate groups
        assert_eq!(expected_vec.len(), res.len());
        // clean (errors ignored, the fixture may already be gone)
        let _ = fs::remove_dir_all("/tmp/test_du_dups1");
    }
    #[test]
    fn duplicate_test_empty() {
        // two files with different content: no duplicates expected
        fs::create_dir_all("/tmp/test_du_dups2/dir1").unwrap();
        let mut file = File::create("/tmp/test_du_dups2/file10").unwrap();
        file.write_all(b"Integer faucibus sapien vitae aliquet dapibus")
            .unwrap();
        let mut file1 = File::create("/tmp/test_du_dups2/dir1/file10").unwrap();
        file1.write_all(b"Pellentesque at pharetra enim").unwrap();
        let p = Path::new("/tmp/test_du_dups2");
        let f = FileTree::new(&p, &false).unwrap();
        let res = f.duplicates(&p);
        // should not find duplicates
        assert!(res.is_none());
        // clean
        let _ = fs::remove_dir_all("/tmp/test_du_dups2");
    }
    #[test]
    fn file_compare() {
        // two identical files and one different file
        fs::create_dir_all("/tmp/test_du_compare").unwrap();
        let mut file = File::create("/tmp/test_du_compare/file1").unwrap();
        file.write_all(b"Cras eleifend nisi nibh").unwrap();
        let mut file = File::create("/tmp/test_du_compare/file2").unwrap();
        file.write_all(b"Cras eleifend nisi nibh").unwrap();
        let mut file = File::create("/tmp/test_du_compare/file3").unwrap();
        file.write_all(b"a blandit elit mattis et").unwrap();
        let p1 = Path::new("/tmp/test_du_compare/file1");
        let p2 = Path::new("/tmp/test_du_compare/file2");
        let p3 = Path::new("/tmp/test_du_compare/file3");
        assert!(FileTree::raw_compare(p1, p2));
        assert!(!FileTree::raw_compare(p1, p3));
        // clean
        let _ = fs::remove_dir_all("/tmp/test_du_compare");
    }
}

453
src/file_tree.rs Normal file
View File

@ -0,0 +1,453 @@
//! tree structure representation
//!
//! Implement a tree structure to represent data on a disk (files and directories).
//!
//! Include methods to
//! + construct this tree
//! + get a directory's children
//! + get a file or directory size
//! + get files in tree.
use crate::size::Size;
use std::collections::HashMap;
use std::fs;
use std::io::{Error, ErrorKind};
use std::path::{Path, PathBuf};
/// FileTree structure
///
/// Maps every path discovered under `root` to its node data.
pub struct FileTree {
    /// tree root (base path)
    root: PathBuf,
    /// HashMap containing every path in the tree (every file and directory, sub-directories included) and their associated data (size and children).
    map: HashMap<PathBuf, EntryNode>,
}
/// A single node of the tree: either a regular file or a directory.
enum EntryNode {
    /// Regular file with its size in bytes.
    File { size: Size },
    /// Directory: direct children plus cumulative size
    /// (created with size 0, filled in later by `set_size`).
    Path { children: Vec<PathBuf>, size: Size },
}
impl FileTree {
    /// Create a new filetree from given path.
    ///
    /// Will return an error if path doesn't exist or if user doesn't have read permission.
    ///
    /// If user is missing read permission in a subdir, will ignore this subdir and keep building the tree.
    ///
    /// `access_denied = &true` prints the ignored paths, `&false` hides them.
    ///
    /// # Examples
    ///
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// ```
    pub fn new(root: &Path, access_denied: &bool) -> std::io::Result<Self> {
        if root.exists() {
            if let Ok(meta) = fs::metadata(root) {
                let r = root.to_path_buf();
                if meta.is_dir() {
                    let entries = fs::read_dir(root)?;
                    let mut paths: Vec<PathBuf> = entries
                        .filter_map(|entry| entry.ok().map(|e| e.path()))
                        .collect();
                    let e = EntryNode::Path {
                        children: paths.clone(),
                        size: Size::new(0),
                    };
                    let mut map_entry: Vec<(PathBuf, EntryNode)> = vec![(r.clone(), e)];
                    // iterative depth-first walk over the pending paths
                    while let Some(current_path) = paths.pop() {
                        // symlinks are skipped entirely for now (avoids cycles and
                        // double counting); needs a better strategy eventually
                        if !(current_path.is_symlink()) {
                            if let Ok(meta) = fs::metadata(&current_path) {
                                if meta.is_dir() {
                                    if let Ok(entries) = fs::read_dir(&current_path) {
                                        let new_paths: Vec<PathBuf> = entries
                                            .filter_map(|entry| entry.ok().map(|e| e.path()))
                                            .collect();
                                        let e = EntryNode::Path {
                                            children: new_paths.clone(),
                                            size: Size::new(0),
                                        };
                                        map_entry.push((current_path.clone(), e));
                                        paths.extend(new_paths);
                                    } else if *access_denied {
                                        println!("{} children not processed, might be missing read permission",current_path.display());
                                    }
                                } else {
                                    // regular file: record its size directly
                                    let e = EntryNode::File {
                                        size: Size::new(meta.len()),
                                    };
                                    map_entry.push((current_path, e));
                                }
                            } else if *access_denied {
                                println!(
                                    "{} not processed, might be missing read permission",
                                    current_path.display()
                                );
                            }
                        }
                    }
                    let hashmap: HashMap<PathBuf, EntryNode> = map_entry.into_iter().collect();
                    let mut tree = Self {
                        root: r,
                        map: hashmap,
                    };
                    // directory sizes are only computable once the whole tree is collected
                    tree.set_size();
                    Ok(tree)
                } else {
                    // root is a single file: the tree is just that one node
                    let e = EntryNode::File {
                        size: Size::new(meta.len()),
                    };
                    Ok(Self {
                        root: r.clone(),
                        map: HashMap::from([(r, e)]),
                    })
                }
            } else {
                Err(Error::new(
                    ErrorKind::Other,
                    format!("Error processing path {}", root.display()),
                ))
            }
        } else {
            Err(Error::new(ErrorKind::Other, "path not found"))
        }
    }
    /// Compute and store the size of every `EntryNode::Path` in the tree.
    fn set_size(&mut self) {
        let mut paths: Vec<_> = self.map.keys().cloned().collect();
        // a child's path is necessarily longer than its parent's path, so
        // processing longest paths first guarantees every child is sized
        // before its parent
        paths.sort_by_key(|b| std::cmp::Reverse(b.as_os_str().len()));
        for path in paths {
            let mut s = Size::new(0);
            if let Some(EntryNode::Path { children, .. }) = self.map.get(path.as_path()) {
                for child in children {
                    // children were already processed so their size is final
                    if let Some(EntryNode::Path { size, .. }) | Some(EntryNode::File { size }) =
                        self.map.get(child.as_path())
                    {
                        s = s + *size;
                    }
                }
            }
            if let Some(EntryNode::Path { ref mut size, .. }) = self.map.get_mut(path.as_path()) {
                *size = s;
            }
        }
    }
    /// return FileTree root
    /// # Examples
    ///
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let root = f.get_root();
    /// assert_eq!(root, p);
    /// ```
    pub fn get_root(&self) -> &Path {
        self.root.as_path()
    }
    /// return a Vec with every path in the subtree found from given path (recursively)
    ///
    /// return None if given path doesn't exist in the FileTree
    /// # Examples
    /// assuming a directories structure like this, children will contain a
    /// (plus any file or directory inside a, recursively).
    ///
    /// ./{A/a,B}
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let children = f.get_children(&Path::new("./A"));
    /// ```
    pub fn get_children(&self, path: &Path) -> Option<Vec<PathBuf>> {
        if let Some(vec_paths) = self.get_direct_children(path) {
            let mut paths = vec_paths;
            // breadth-first accumulation: temp_paths is the frontier still to expand
            let mut temp_paths = paths.clone();
            while let Some(current_path) = temp_paths.pop() {
                if let Some(vec_paths) = self.get_direct_children(current_path.as_path()) {
                    paths.extend(vec_paths.clone());
                    temp_paths.extend(vec_paths);
                }
            }
            Some(paths)
        } else {
            None
        }
    }
    /// return direct children of a given path (non recursively, depth=1)
    ///
    /// None if path is a file, or doesn't exist.
    ///
    /// A `Vec<PathBuf>` if it's a directory.
    /// # Examples
    /// assuming a directories structure like this, children will contain a and b, but not a/c.
    ///
    /// ./{A/{a/c,b},B}
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let children = f.get_direct_children(&Path::new("./A"));
    /// ```
    pub fn get_direct_children(&self, path: &Path) -> Option<Vec<PathBuf>> {
        match self.map.get(path) {
            Some(EntryNode::Path { children, .. }) => Some(children.clone()),
            Some(EntryNode::File { .. }) => None,
            _ => None,
        }
    }
    /// If given path exists in the FileTree, return its size. None if it doesn't. Size can eventually be 0, an empty directory for example.
    /// # Examples
    /// size of entire tree (size from root):
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let size = f.get_size(&p);
    /// ```
    pub fn get_size(&self, path: &Path) -> Option<Size> {
        match self.map.get(path) {
            Some(EntryNode::File { size }) | Some(EntryNode::Path { size, .. }) => Some(*size),
            _ => None,
        }
    }
    /// return a Vec with all the files in FileTree (and only the files), can be empty.
    ///
    /// Files are identified by their stored `EntryNode::File` entry rather than
    /// by re-querying the filesystem, so the result matches the tree as built
    /// even if the disk changed since.
    /// # Examples
    /// assuming a directories structure like this, lowercase for files, uppercase for directories. files will only contain a, b and c.
    ///
    /// ./{A/a,B/b,c}
    ///
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let files = f.files();
    /// ```
    // an iterator instead of a Vec<PathBuf> would be lazy,
    // however, we use files to find an intersection with children in a subtree to find duplicates, an iterator wouldn't be useful for this
    pub fn files(&self) -> Vec<PathBuf> {
        let mut files = self
            .map
            .iter()
            .filter(|(_key, value)| matches!(value, EntryNode::File { .. }))
            .map(|(key, _value)| key.clone())
            .collect::<Vec<PathBuf>>();
        // the map is unordered, so sort by file name for a deterministic result
        files.sort_by(|a, b| a.file_name().cmp(&b.file_name()));
        files
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::prelude::*;
    /// create temporary fixture /tmp/test_du{number} with dir1/ and file1 (std::fs, no external commands)
    fn set_temp(number: u8) {
        fs::create_dir_all(format!("/tmp/test_du{}/dir1", number)).unwrap();
        let mut file = File::create(format!("/tmp/test_du{}/file1", number)).unwrap();
        file.write_all(b"Hello, world!").unwrap();
    }
    /// remove the temporary fixture; errors (e.g. already removed) are ignored
    fn clear_temp(number: u8) {
        let _ = fs::remove_dir_all(format!("/tmp/test_du{}", number));
    }
    #[test]
    fn simple_filetree() {
        set_temp(1);
        let p = Path::new("/tmp/test_du1");
        let p_buf = p.to_path_buf();
        let f = FileTree::new(&p, &false).unwrap();
        let paths = [
            Path::new("/tmp/test_du1").to_path_buf(),
            Path::new("/tmp/test_du1/dir1").to_path_buf(),
            Path::new("/tmp/test_du1/file1").to_path_buf(),
        ];
        assert_eq!(f.root, p_buf);
        for path in paths {
            assert!(f.map.contains_key(&path));
        }
        // for root: read_dir order is platform dependent, so compare sorted
        let mut expected_children = vec![
            Path::new("/tmp/test_du1/dir1").to_path_buf(),
            Path::new("/tmp/test_du1/file1").to_path_buf(),
        ];
        expected_children.sort();
        // "Hello, world!" has 13 bytes, so file1 is 13B and the empty dir1 is 0B
        let expected_size = "13";
        match f.map.get(&p_buf).unwrap() {
            EntryNode::Path {
                children: actual_children,
                size,
            } => {
                let mut actual_children = actual_children.clone();
                actual_children.sort();
                assert_eq!(actual_children, expected_children);
                assert_eq!(format!("{}", *size), expected_size)
            }
            _ => panic!(), // shouldn't happen, this entry refers to a dir
        }
        // for dir1
        let expected_children: Vec<PathBuf> = vec![];
        match f
            .map
            .get(&Path::new("/tmp/test_du1/dir1").to_path_buf())
            .unwrap()
        {
            EntryNode::Path {
                children: actual_children,
                size,
            } => {
                assert_eq!(*actual_children, expected_children);
                assert_eq!(format!("{}", *size), "0"); // dir1 is empty
            }
            _ => panic!(), // shouldn't happen, this entry refers to a dir
        }
        // for file1
        let expected_size = "13";
        match f
            .map
            .get(&Path::new("/tmp/test_du1/file1").to_path_buf())
            .unwrap()
        {
            EntryNode::File { size } => assert_eq!(format!("{}", *size), expected_size),
            _ => panic!(), // this entry refers to a file
        }
        clear_temp(1);
    }
    #[test]
    fn simple_filetree_file() {
        set_temp(2);
        // a tree rooted directly at a file
        let p2 = Path::new("/tmp/test_du2/file1");
        let p_buf2 = p2.to_path_buf();
        let f2 = FileTree::new(&p2, &false).unwrap();
        assert_eq!(f2.root, p_buf2);
        assert!(f2.map.contains_key(&p_buf2));
        let expected_size = "13"; // "Hello, world!" has 13 bytes
        match f2.map.get(&p_buf2).unwrap() {
            EntryNode::File { size } => assert_eq!(format!("{}", *size), expected_size),
            _ => panic!(), // shouldn't happen, f2 refers to a file
        }
        // clean
        clear_temp(2);
    }
    #[test]
    fn filetree_fail() {
        // a non-existing path must produce an error
        let p = Path::new("/tmp/test_du/shouldntexist");
        let f = FileTree::new(&p, &false);
        assert!(f.is_err());
    }
    #[test]
    fn get_root_test() {
        set_temp(3);
        let p = Path::new("/tmp/test_du3");
        let f = FileTree::new(&p, &false).unwrap();
        assert_eq!(f.get_root(), p);
        clear_temp(3);
    }
    #[test]
    fn files_test() {
        set_temp(4);
        let mut file = File::create("/tmp/test_du4/dir1/file2").unwrap();
        file.write_all(b"Hello, world!").unwrap();
        let p = Path::new("/tmp/test_du4");
        let f = FileTree::new(&p, &false).unwrap();
        // files() sorts by file name, so this order is deterministic
        let expected = vec![
            Path::new("/tmp/test_du4/file1").to_path_buf(),
            Path::new("/tmp/test_du4/dir1/file2").to_path_buf(),
        ];
        assert_eq!(f.files(), expected);
        clear_temp(4);
    }
    #[test]
    fn children() {
        set_temp(5);
        let mut file = File::create("/tmp/test_du5/dir1/file2").unwrap();
        file.write_all(b"Hello, world!").unwrap();
        let p = Path::new("/tmp/test_du5");
        let f = FileTree::new(&p, &false).unwrap();
        // get_children order depends on read_dir, so compare sorted
        let mut expected = vec![
            Path::new("/tmp/test_du5/dir1").to_path_buf(),
            Path::new("/tmp/test_du5/file1").to_path_buf(),
            Path::new("/tmp/test_du5/dir1/file2").to_path_buf(),
        ];
        expected.sort();
        let mut actual = f.get_children(&p).unwrap();
        actual.sort();
        assert_eq!(actual, expected);
        let p = Path::new("/tmp/test_du5/dir1");
        let expected = vec![Path::new("/tmp/test_du5/dir1/file2").to_path_buf()];
        assert_eq!(f.get_children(&p), Some(expected));
        let p = Path::new("/tmp/test_du5/file1");
        assert_eq!(f.get_children(&p), None);
        let p = Path::new("/tmp/test_du5/shouldntexist");
        assert_eq!(f.get_children(&p), None);
        clear_temp(5);
    }
    #[test]
    fn size_test() {
        set_temp(6);
        let mut file = File::create("/tmp/test_du6/dir1/file2").unwrap();
        file.write_all(b"Hello, world!").unwrap();
        fs::create_dir_all("/tmp/test_du6/dir3").unwrap();
        let p = Path::new("/tmp/test_du6");
        let f = FileTree::new(&p, &false).unwrap();
        let expected_size = "26"; // file1 and file2, 13 bytes each
        assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
        let p = Path::new("/tmp/test_du6/file1");
        let expected_size = "13";
        assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
        let p = Path::new("/tmp/test_du6/dir3");
        let expected_size = "0"; // empty directory
        assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
        let p = Path::new("/tmp/test_du6/shouldntexist");
        assert!(f.get_size(&p).is_none());
        clear_temp(6);
    }
}

134
src/main.rs Normal file
View File

@ -0,0 +1,134 @@
//! Basic disk usage analyzer
//!
//! Include a command to show disk usage and file tree, ordered by size (default) or name. It is possible to filter files by "end with".
//!
//! Include a command to find duplicate files.
//! # Examples
//! show ~ disk usage
//! ```text
//! $ cargo run -- usage ~
//! ```
//! show pdf files in ~/Downloads, ordered by name
//! ```text
//! $ cargo run -- usage ~/Downloads --sort name --filter ".pdf"
//! ```
//! same command but with short options
//! ```text
//! $ cargo run -- usage ~/Downloads -s name -f ".pdf"
//! ```
//! show duplicates in /tmp
//! ```text
//! $ cargo run -- duplicates /tmp
//! ```
//! show duplicates in /tmp and check for collision
//! ```text
//! $ cargo run -- duplicates /tmp --check
//! ```
//! same command but with short option
//! ```text
//! $ cargo run -- duplicates /tmp -c
//! ```
//! for both duplicates and usage, to show directories not processed because of missing read permission
//! ```text
//! $ cargo run -- duplicates ~ --access-denied
//! $ cargo run -- usage ~ --access-denied
//! ```
//! same with short option
//! ```text
//! $ cargo run -- duplicates ~ -a
//! $ cargo run -- usage ~ -a
//! ```
mod duplicates;
mod file_tree;
mod print_tree;
mod size;
use clap::{Parser, Subcommand};
use file_tree::FileTree;
use std::path::{Path, PathBuf};
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(propagate_version = true)]
// Top-level argument parser: a single required subcommand (see `Commands`).
// NOTE: field-level `///` doc comments would become clap help text, so plain
// `//` comments are used here.
struct Cli {
    #[command(subcommand)]
    command: Commands,
}
#[derive(Subcommand)]
// Subcommands of the CLI. NOTE: the `///` doc comments below are rendered by
// clap as --help text (user-facing strings), so they are left untouched.
enum Commands {
    /// Show the disk usage tree for the given path
    Usage {
        /// (default '.')
        path: Option<PathBuf>,
        /// (default "size")
        #[arg(long = "sort", short = 's')]
        // accepted values: "size" or "name" (dispatched in main)
        sort: Option<String>,
        /// (default None)
        #[arg(long = "filter", short = 'f')]
        // "ends with" filter applied to printed paths
        filter: Option<String>,
        /// (default false)
        #[arg(long = "access-denied", short = 'a')]
        // when set, print paths skipped because of missing read permission
        access: bool,
    },
    /// Find duplicates in given path
    Duplicates {
        /// (default '.')
        path: Option<PathBuf>,
        /// (default false)
        #[arg(long = "check", short = 'c')]
        // when set, byte-compare each duplicate group to rule out hash collisions
        check: bool,
        /// (default false)
        #[arg(long = "access-denied", short = 'a')]
        access: bool,
    },
}
/// Entry point: parse CLI arguments and dispatch to the requested subcommand.
fn main() -> std::io::Result<()> {
    let cli = Cli::parse();
    match &cli.command {
        Commands::Usage {
            path,
            sort,
            filter,
            access,
        } => {
            let path = path.as_deref().unwrap_or(Path::new("."));
            // default sort is by size; an unknown value is reported on stderr
            // instead of silently printing nothing (previous behavior)
            match sort.as_deref() {
                None | Some("size") => FileTree::new(path, access)?.show(0, filter.as_deref()),
                Some("name") => FileTree::new(path, access)?.show(1, filter.as_deref()),
                Some(other) => {
                    eprintln!("unknown sort order '{}', expected \"size\" or \"name\"", other)
                }
            }
        }
        Commands::Duplicates {
            path,
            check,
            access,
        } => {
            let path = path.as_deref().unwrap_or(Path::new("."));
            let duplicate_files = FileTree::new(path, access)?.duplicates(path);
            if let Some(ref groups) = duplicate_files {
                // print each group of duplicates, one path per line
                for (i, group) in groups.iter().enumerate() {
                    println!("DUPLICATES {}", i + 1);
                    for file in group {
                        println!("{}", file.display());
                    }
                    println!("\n")
                }
            } else {
                println!("No duplicate found in {}\n", path.display());
            }
            if *check {
                // byte-compare the groups to rule out sha256 collisions
                println!("collision found: {}", FileTree::collision(duplicate_files));
            }
        }
    }
    Ok(())
}

103
src/print_tree.rs Normal file
View File

@ -0,0 +1,103 @@
//! Print human readable file tree.
//!
//! Print nodes in a filetree (files and directories) with their sizes and tree structure.
//!
//! Include filter and sort arguments.
//!
//! Filter by "end with" (preserve the tree structure when printing).
//! Sort by name or by size.
use crate::file_tree::FileTree;
use crate::size::Size;
use std::path::Path;
impl FileTree {
    /// Print a FileTree in the console in a human readable format.
    ///
    /// sort = 0 is size order (descending)
    ///
    /// sort = 1 is lexicographical order
    ///
    /// Any other integer for sort won't fail but nodes won't be sorted and order might seem random.
    ///
    /// filter_suffix is for "end with" filter, will still preserve tree structure (will show parent directories if one descendant passes the filter).
    ///
    /// use filter_suffix = None to show everything
    /// # Examples
    ///
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// f.show(1, None); // show everything ordered by name
    /// f.show(0, Some(".pdf")); // show pdf files (preserving tree structure) ordered by size
    /// ```
    pub fn show(&self, sort: u8, filter_suffix: Option<&str>) {
        let root = self.get_root();
        self.print_entry(root, &self.get_size(root).unwrap(), 0);
        self.show_recursive(root, 1, sort, filter_suffix);
    }
    /// recursively visit a FileTree to print it
    ///
    /// do not use alone, wrapper is show()
    fn show_recursive(&self, path: &Path, indent: usize, sort: u8, filter_suffix: Option<&str>) {
        if let Some(mut children) = self.get_direct_children(path) {
            match sort {
                // sort children by size in descending order
                0 => children.sort_by_key(|b| std::cmp::Reverse(self.get_size(b))),
                1 => children.sort_by(|a, b| a.file_name().cmp(&b.file_name())),
                _ => (),
            }
            for child in children {
                if let Some(entry) = self.get_size(&child) {
                    if let Some(suffix) = filter_suffix {
                        // show this node if any descendant (or itself) passes the filter.
                        // NOTE: this re-walks descendants at every level; memoizing
                        // which parents pass would be more efficient.
                        let mut descendant = self.get_children(&child).unwrap_or_default();
                        descendant.push(child.clone());
                        // to_string_lossy avoids panicking on non-UTF-8 path names
                        // (into_string().unwrap() would)
                        let found = descendant
                            .iter()
                            .any(|item| item.to_string_lossy().ends_with(suffix));
                        if found {
                            self.print_entry(&child, &entry, indent);
                        }
                    } else {
                        self.print_entry(&child, &entry, indent);
                    }
                    // recurse into directories (files have no direct children)
                    if self.get_direct_children(&child).is_some() {
                        self.show_recursive(&child, indent + 1, sort, filter_suffix);
                    }
                }
            }
        }
    }
    /// print a single entry: indentation, size, then path
    ///
    /// do not use alone, is part of show_recursive()
    fn print_entry(&self, path: &Path, size: &Size, indent: usize) {
        let indentation = "\t".repeat(indent);
        println!("{}{}\t{}", indentation, size, path.display());
    }
}
// no unittest here for now as show is mostly "visual"

92
src/size.rs Normal file
View File

@ -0,0 +1,92 @@
//! size in bytes
//!
//! implement fmt::Display to display size in a human readable unit (B, KB, MB, GB, TB).
//!
//! For Bytes, unit isn't printed.
//!
//! implement std::ops::Add
use std::fmt;
/// Size in bytes.
///
/// Displays in a human readable unit (B, KB, MB, GB, TB) with at most one
/// decimal; plain bytes are printed without a unit.
#[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Copy, Clone)]
pub struct Size(u64);
impl Size {
    /// Wrap a raw byte count.
    pub fn new(bytes: u64) -> Self {
        Self(bytes)
    }
}
impl fmt::Display for Size {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // round `x` to at most one decimal place
        fn one_decimal(x: f64) -> f64 {
            (x * 10.0).round() / 10.0
        }
        let length: u32 = self.0.checked_ilog10().unwrap_or(0); // number of digits - 1
        // f64 (not f32) keeps enough precision for the displayed decimal even in
        // the GB/TB range (f32 only has a 24-bit mantissa)
        let bytes = self.0 as f64;
        // match over the number of 3-digit groups, 1000 ~ 1024:
        // 1000^n and 1024^n have the same number of digits if n < 98,
        // and here n <= 4 (TB)
        match length / 3 {
            0 => write!(f, "{}", self.0), // no unit printed means Bytes
            1 => write!(f, "{}KB", one_decimal(bytes / 1024.0)),
            2 => write!(f, "{}MB", one_decimal(bytes / 1_048_576.0)),
            3 => write!(f, "{}GB", one_decimal(bytes / 1_073_741_824.0)),
            4 => write!(f, "{}TB", one_decimal(bytes / 1_099_511_627_776.0)),
            // PB and above are unexpected on consumer grade hardware:
            // keep the original fail-fast behavior but say why
            _ => panic!("size too large to display: {} bytes", self.0),
        }
    }
}
impl std::ops::Add for Size {
    type Output = Self;
    /// Sum of two byte counts.
    fn add(self, other: Self) -> Self::Output {
        Self(self.0 + other.0)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn create() {
        // the wrapped byte count is stored untouched
        let s = Size::new(1);
        assert_eq!(s.0, 1);
    }
    #[test]
    fn add() {
        // addition sums the underlying byte counts
        let total = Size::new(60) + Size::new(40);
        assert_eq!(total.0, 100);
    }
    #[test]
    fn display() {
        // sanity check of the digit-count computation used by Display
        assert_eq!(10u32.checked_ilog10().unwrap_or(0) + 1, 2);
        // exact kilobyte
        assert_eq!(Size::new(1024).to_string(), "1KB");
        // 1700 / 1024 = 1.66015625, shown with one decimal
        assert_eq!(Size::new(1700).to_string(), "1.7KB");
        // 2411724 / 1024^2 = 2.299999237060547, shown with one decimal
        assert_eq!(Size::new(2411724).to_string(), "2.3MB");
    }
}
}