initial commit
This commit is contained in:
326
src/duplicates.rs
Normal file
326
src/duplicates.rs
Normal file
@ -0,0 +1,326 @@
|
||||
//! find duplicate files
|
||||
//!
|
||||
//! Find duplicate files in a FileTree comparing their hash.
|
||||
|
||||
use crate::FileTree;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
// hash
|
||||
use sha2::{Digest, Sha256};
|
||||
use std::io::{BufReader, Read};
|
||||
|
||||
// parallelism
|
||||
use rayon::prelude::*;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
impl FileTree {
|
||||
/// Find duplicates in a directory (including sub-directories).
|
||||
///
|
||||
/// If path exist in tree, find duplicates using sha256. If 2 (or more) files have the same hash they're duplicates.
|
||||
///
|
||||
/// returns a `Vec` containing a `Vec<PathBuf>` for each group of duplicates.
|
||||
///
|
||||
/// If path doesn't exist, or if no duplicates are found, return None
|
||||
///
|
||||
/// # Examples
|
||||
/// Assuming a directory structure like this. files in lowercase, directories in uppercase. And assuming same letter means duplicate. dups will contain a Some(Vec<Vec<_>>) with a,A/a,B/a
|
||||
///
|
||||
/// ./{A/a,B/a,a,C/c}
|
||||
///
|
||||
/// ```no_run
|
||||
/// let p = Path::new(".");
|
||||
/// let f = FileTree::new(&p, &false);
|
||||
/// let dups = f.duplicates(&p);
|
||||
/// ```
|
||||
pub fn duplicates(&self, path: &Path) -> Option<Vec<Vec<PathBuf>>> {
|
||||
match self.get_children(path) {
|
||||
Some(value) => {
|
||||
// every path in subdir (directories+files)
|
||||
let children: HashSet<PathBuf> = value.into_iter().collect();
|
||||
// every files (whole tree)
|
||||
let files: HashSet<PathBuf> = self.files().into_iter().collect();
|
||||
|
||||
// every files in subdir
|
||||
let intersection: Vec<_> = children.intersection(&files).cloned().collect();
|
||||
|
||||
// Arc<Mutex<_>> used to safely update HashMap in parallel
|
||||
let hashes: Arc<Mutex<HashMap<[u8; 32], Vec<PathBuf>>>> =
|
||||
Arc::new(Mutex::new(HashMap::new()));
|
||||
|
||||
// parallelized loop
|
||||
intersection.par_iter().for_each(|item| {
|
||||
if let Ok(file) = fs::File::open(item) {
|
||||
let mut sha256 = Sha256::new();
|
||||
|
||||
// sizable buffer
|
||||
let mut buffer = [0; 8192];
|
||||
let mut reader = BufReader::new(file);
|
||||
|
||||
// https://rust-lang-nursery.github.io/rust-cookbook/cryptography/hashing.html
|
||||
loop {
|
||||
let count = reader.read(&mut buffer).unwrap();
|
||||
// rust complains with .ok()?
|
||||
// unwrap should not fail in this context though
|
||||
if count == 0 {
|
||||
break;
|
||||
}
|
||||
sha256.update(&buffer[..count]);
|
||||
}
|
||||
|
||||
let hash = sha256.finalize();
|
||||
|
||||
// Use a Mutex to update HashMap in parallel
|
||||
let mut locked_hashes = hashes.lock().unwrap();
|
||||
locked_hashes
|
||||
.entry(hash.into())
|
||||
.or_default()
|
||||
.push(item.clone());
|
||||
}
|
||||
});
|
||||
|
||||
// extract result from Mutex
|
||||
// converting Arc<Mutex<HashMap>> into HashMap
|
||||
let hashes = Arc::try_unwrap(hashes).ok().unwrap().into_inner().unwrap();
|
||||
|
||||
let dups = hashes
|
||||
.values()
|
||||
.filter(|a| a.len() > 1) // if more than 1 path for a hash it's a duplicate
|
||||
.cloned()
|
||||
.collect::<Vec<Vec<PathBuf>>>();
|
||||
|
||||
if dups.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(dups)
|
||||
}
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Collision shouldn't happen with sha256, but a method to check if there is a collision in duplicates found
|
||||
/// # Examples
|
||||
/// ```no_run
|
||||
/// let p = Path::new(".");
|
||||
/// let f = FileTree::new(&p, &false);
|
||||
/// let dups = f.duplicates(&p);
|
||||
/// let check = FileTree::collision(dups);
|
||||
/// ```
|
||||
pub fn collision(dups: Option<Vec<Vec<PathBuf>>>) -> bool {
|
||||
match dups {
|
||||
Some(value) => {
|
||||
for mut subgroup in value {
|
||||
let path1 = subgroup.pop().unwrap();
|
||||
while let Some(path2) = subgroup.pop() {
|
||||
if !Self::raw_compare(&path1, &path2) {
|
||||
return true; // files contents are different, it's a collision
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
None => false, // no collision since there is any duplicate
|
||||
}
|
||||
} // no unittest for collision as finding a collision is highly unlikely, and there are no known collision yet
|
||||
|
||||
/// Check whether two files have exactly the same content, byte for byte.
///
/// Both files are read in 8 KiB blocks. Each block is filled completely (or up to end of
/// file) before being compared, so a short read on one of the files cannot make identical
/// files look different.
///
/// # Panics
/// Panics if either file cannot be opened or read.
fn raw_compare(path1: &Path, path2: &Path) -> bool {
    /// Fill `buffer` as far as possible and return the number of bytes stored;
    /// the result is smaller than `buffer.len()` only when end of file was reached.
    fn fill(reader: &mut impl Read, buffer: &mut [u8], path: &Path) -> usize {
        let mut filled = 0;
        while filled < buffer.len() {
            match reader.read(&mut buffer[filled..]) {
                Ok(0) => break,
                Ok(count) => filled += count,
                Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
                Err(_) => panic!("couldn't read file {}", path.display()),
            }
        }
        filled
    }

    let open = |path: &Path| match fs::File::open(path) {
        Ok(file) => BufReader::new(file),
        Err(_) => panic!("couldn't read file {}", path.display()),
    };
    let mut reader1 = open(path1);
    let mut reader2 = open(path2);

    let mut buffer1 = [0u8; 8192];
    let mut buffer2 = [0u8; 8192];

    loop {
        let count1 = fill(&mut reader1, &mut buffer1, path1);
        let count2 = fill(&mut reader2, &mut buffer2, path2);

        // different lengths or different bytes: the files differ
        if buffer1[..count1] != buffer2[..count2] {
            return false;
        }
        // equal blocks and a partial (possibly empty) one: both files ended together
        if count1 < buffer1.len() {
            return true;
        }
    }
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Create a fresh, empty scratch directory named `name` inside the system temp directory.
    ///
    /// Anything left behind by a previously aborted run is removed first.
    fn scratch_dir(name: &str) -> PathBuf {
        let dir = std::env::temp_dir().join(name);
        let _ = fs::remove_dir_all(&dir);
        fs::create_dir_all(&dir).unwrap();
        dir
    }

    /// Sort the paths of every group, then the groups themselves, so that two results can
    /// be compared regardless of the (unspecified) order produced by the hash map.
    fn normalised(mut groups: Vec<Vec<PathBuf>>) -> Vec<Vec<PathBuf>> {
        for group in &mut groups {
            group.sort();
        }
        groups.sort();
        groups
    }

    #[test]
    fn duplicate_test() {
        let root = scratch_dir("test_du_dups1");
        fs::create_dir_all(root.join("dir1").join("dir11")).unwrap();
        fs::create_dir_all(root.join("dir1").join("dir12")).unwrap();

        // first group: the same content in four files
        let file1 = root.join("file1");
        fs::write(&file1, b"Hello, world!").unwrap();
        let copies = [
            root.join("dir1").join("file1"),
            root.join("file2"),
            root.join("dir1").join("dir11").join("file1"),
        ];
        for copy in &copies {
            fs::copy(&file1, copy).unwrap();
        }

        // second group: another content in two files
        let file10 = root.join("file10");
        fs::write(&file10, b"lorem ipsum dolor sit amet").unwrap();
        let file10_copy = root.join("dir1").join("file10");
        fs::copy(&file10, &file10_copy).unwrap();

        // and a unique file
        fs::write(root.join("file100"), b"consectetur adipiscing elit").unwrap();

        let tree = FileTree::new(&root, &false).unwrap();
        let found = tree.duplicates(&root).unwrap(); // unwrap asserts it's not None

        let mut first_group = copies.to_vec();
        first_group.push(file1);
        let expected = vec![first_group, vec![file10, file10_copy]];

        // compares the number of groups, their sizes and their members at once
        assert_eq!(normalised(found), normalised(expected));

        fs::remove_dir_all(&root).unwrap();
    }

    #[test]
    fn duplicate_test_empty() {
        let root = scratch_dir("test_du_dups2");
        fs::create_dir_all(root.join("dir1")).unwrap();
        fs::write(
            root.join("file10"),
            b"Integer faucibus sapien vitae aliquet dapibus",
        )
        .unwrap();
        fs::write(
            root.join("dir1").join("file10"),
            b"Pellentesque at pharetra enim",
        )
        .unwrap();

        let tree = FileTree::new(&root, &false).unwrap();
        // same file name but different contents: no duplicates expected
        assert!(tree.duplicates(&root).is_none());

        fs::remove_dir_all(&root).unwrap();
    }

    #[test]
    fn file_compare() {
        let root = scratch_dir("test_du_compare");
        let p1 = root.join("file1");
        let p2 = root.join("file2");
        let p3 = root.join("file3");
        fs::write(&p1, b"Cras eleifend nisi nibh").unwrap();
        fs::write(&p2, b"Cras eleifend nisi nibh").unwrap();
        fs::write(&p3, b"a blandit elit mattis et").unwrap();

        assert!(FileTree::raw_compare(&p1, &p2));
        assert!(!FileTree::raw_compare(&p1, &p3));

        fs::remove_dir_all(&root).unwrap();
    }
}
|
453
src/file_tree.rs
Normal file
453
src/file_tree.rs
Normal file
@ -0,0 +1,453 @@
|
||||
//! tree structure representation
|
||||
//!
|
||||
//! Implement a tree structure to represent data on a disk (files and directories).
|
||||
//!
|
||||
//! Include methods to
|
||||
//! + construct this tree
|
||||
//! + get a directory's children
|
||||
//! + get a file or directory size
|
||||
//! + get files in tree.
|
||||
|
||||
use crate::size::Size;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// FileTree structure
|
||||
pub struct FileTree {
|
||||
/// tree root (base path)
|
||||
root: PathBuf,
|
||||
/// HashMap containing every path in the tree (every files and directories, sub-directories included) and their associated data (size and children).
|
||||
map: HashMap<PathBuf, EntryNode>,
|
||||
}
|
||||
|
||||
enum EntryNode {
|
||||
File { size: Size },
|
||||
Path { children: Vec<PathBuf>, size: Size },
|
||||
}
|
||||
|
||||
impl FileTree {
|
||||
/// Create a new filetree from given path.
|
||||
///
|
||||
/// Will return an error if path doesn't exist or if user doesn't have read permission.
|
||||
///
|
||||
/// If user is missing read permission in a subdir, will ignore this subdir and keep building the tree.
|
||||
///
|
||||
/// access_denied=true show ignored paths, access_denied=false to hide that these paths were ignored.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```no_run
|
||||
/// let p = Path::new(".");
|
||||
/// let f = FileTree::new(&p, false);
|
||||
/// ```
|
||||
pub fn new(root: &Path, access_denied: &bool) -> std::io::Result<Self> {
|
||||
if root.exists() {
|
||||
if let Ok(meta) = fs::metadata(root) {
|
||||
let r = root.to_path_buf();
|
||||
|
||||
if meta.is_dir() {
|
||||
let entries = fs::read_dir(root)?;
|
||||
let mut paths: Vec<PathBuf> = entries
|
||||
.filter_map(|entry| match entry {
|
||||
Ok(entry) => Some(entry.path()),
|
||||
Err(_) => None,
|
||||
})
|
||||
.collect();
|
||||
let e = EntryNode::Path {
|
||||
children: paths.clone(),
|
||||
size: Size::new(0),
|
||||
};
|
||||
let mut map_entry: Vec<(PathBuf, EntryNode)> = Vec::new();
|
||||
map_entry.push((r.clone(), e));
|
||||
while let Some(current_path) = paths.pop() {
|
||||
if !(current_path.is_symlink()) {
|
||||
// need to find a better way to handle symlinks
|
||||
if let Ok(meta) = fs::metadata(current_path.clone()) {
|
||||
if meta.is_dir() {
|
||||
if let Ok(entries) = fs::read_dir(¤t_path) {
|
||||
let new_paths: Vec<PathBuf> = entries
|
||||
.filter_map(|entry| match entry {
|
||||
Ok(entry) => Some(entry.path()),
|
||||
Err(_) => None,
|
||||
})
|
||||
.collect();
|
||||
let e = EntryNode::Path {
|
||||
children: new_paths.clone(),
|
||||
size: Size::new(0),
|
||||
};
|
||||
map_entry.push((current_path.clone(), e));
|
||||
paths.extend(new_paths);
|
||||
} else if *access_denied {
|
||||
println!("{} children not processed, might be missing read permission",current_path.display());
|
||||
}
|
||||
} else {
|
||||
// file
|
||||
let e = EntryNode::File {
|
||||
size: Size::new(meta.len()),
|
||||
};
|
||||
map_entry.push((current_path, e));
|
||||
}
|
||||
} else if *access_denied {
|
||||
println!(
|
||||
"{} not processed, might be missing read permission",
|
||||
current_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let hashmap: HashMap<PathBuf, EntryNode> = map_entry.into_iter().collect();
|
||||
let mut tree = Self {
|
||||
root: r.clone(),
|
||||
map: hashmap,
|
||||
};
|
||||
tree.set_size();
|
||||
Ok(tree)
|
||||
} else {
|
||||
let e = EntryNode::File {
|
||||
size: Size::new(meta.len()),
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
root: r.clone(),
|
||||
map: HashMap::from([(r, e)]),
|
||||
})
|
||||
}
|
||||
} else {
|
||||
Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("Error processing path {}", root.display()),
|
||||
))
|
||||
}
|
||||
} else {
|
||||
Err(Error::new(ErrorKind::Other, "path not found"))
|
||||
}
|
||||
}
|
||||
|
||||
/// set attribute size of every EntryNode in the tree
|
||||
fn set_size(&mut self) {
|
||||
let mut paths: Vec<_> = self.map.keys().cloned().collect();
|
||||
paths.sort_by_key(|b| std::cmp::Reverse(b.as_os_str().len())); // path for a child is necessarilly longer than its parent's path so we process items with longest path first.
|
||||
for path in paths {
|
||||
let mut s = Size::new(0);
|
||||
if let Some(EntryNode::Path { children, .. }) = self.map.get_mut(path.as_path()) {
|
||||
for child in children.clone() {
|
||||
// children were already processed before so we can get their size
|
||||
if let Some(EntryNode::Path { size, .. }) | Some(EntryNode::File { size }) =
|
||||
self.map.get(child.as_path())
|
||||
{
|
||||
s = s + *size;
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Some(EntryNode::Path { ref mut size, .. }) = self.map.get_mut(path.as_path()) {
|
||||
*size = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// return FileTree root
|
||||
/// # Examples
|
||||
///
|
||||
/// ```no_run
|
||||
/// let p = Path::new(".");
|
||||
/// let f = FileTree::new(&p, &false);
|
||||
/// let root = f.get_root();
|
||||
/// assert_eq!(root, p);
|
||||
/// ```
|
||||
pub fn get_root(&self) -> &Path {
|
||||
self.root.as_path()
|
||||
}
|
||||
|
||||
/// return a Vec with every paths in the subtree found from given path (recursively)
|
||||
///
|
||||
/// return None if given path doesn't exist in the FileTree
|
||||
/// # Examples
|
||||
/// assuming a directories structure like this, children will contains a (a eventually contains files and directories in sub-directories).
|
||||
///
|
||||
/// ./{A/a,B}
|
||||
/// ```no_run
|
||||
/// let p = Path::new(".");
|
||||
/// let f = FileTree::new(&p, &false);
|
||||
/// let children = f.get_children(&Path::new("./A"));
|
||||
/// ```
|
||||
pub fn get_children(&self, path: &Path) -> Option<Vec<PathBuf>> {
|
||||
if let Some(vec_paths) = self.get_direct_children(path) {
|
||||
let mut paths = vec_paths;
|
||||
let mut temp_paths = paths.clone();
|
||||
while let Some(current_path) = temp_paths.pop() {
|
||||
if let Some(vec_paths) = self.get_direct_children(current_path.as_path()) {
|
||||
paths.extend(vec_paths.clone());
|
||||
temp_paths.extend(vec_paths);
|
||||
}
|
||||
}
|
||||
Some(paths)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// return direct children of a given path (non recursively, depth=1)
|
||||
///
|
||||
/// None if path is a file, or doesn't exist.
|
||||
///
|
||||
/// A `Vec<PathBuf>` if it's a directory.
|
||||
/// # Examples
|
||||
/// assuming a directories structure like this, children will contains a and b, but not a/c.
|
||||
///
|
||||
/// ./{A/{a/c,b},B}
|
||||
/// ```no_run
|
||||
/// let p = Path::new(".");
|
||||
/// let f = FileTree::new(&p, &false);
|
||||
/// let children = f.get_children(&Path::new("./A"));
|
||||
/// ```
|
||||
pub fn get_direct_children(&self, path: &Path) -> Option<Vec<PathBuf>> {
|
||||
match self.map.get(path) {
|
||||
Some(EntryNode::Path { children, .. }) => Some(children.clone()),
|
||||
Some(EntryNode::File { .. }) => None,
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// If given path exists in the FileTree, return its size. None if it doesn't. Size can eventually be 0, an empty directory for example.
|
||||
/// # Examples
|
||||
/// size of entire tree (size from root):
|
||||
/// ```no_run
|
||||
/// let p = Path::new(".");
|
||||
/// let f = FileTree::new(&p);
|
||||
/// let size = f.get_size(&p);
|
||||
/// ```
|
||||
pub fn get_size(&self, path: &Path) -> Option<Size> {
|
||||
match self.map.get(path) {
|
||||
Some(EntryNode::File { size }) | Some(EntryNode::Path { size, .. }) => Some(*size),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// return a Vec with all the files in FileTree (and only the files), can be empty.
|
||||
/// # Examples
|
||||
/// assuming a directories structure like this, lowercase for files, uppercase for directories. files will only contain a, b and c.
|
||||
///
|
||||
/// ./{A/a,B/b,c}
|
||||
///
|
||||
/// ```no_run
|
||||
/// let p = Path::new(".");
|
||||
/// let f = FileTree::new(&p, &false);
|
||||
/// let files = f.get_files();
|
||||
/// ```
|
||||
// an iterator instead of a Vec<PathBuf> would be lazy,
|
||||
// however, we use files to find an intersection with children in a subtree to find duplicates, an iterator wouldn't be useful for this
|
||||
pub fn files(&self) -> Vec<PathBuf> {
|
||||
let mut files = self
|
||||
.map
|
||||
.iter()
|
||||
.filter(|(key, _value)| !key.is_dir())
|
||||
.map(|(key, _value)| key.clone())
|
||||
.collect::<Vec<PathBuf>>();
|
||||
// we order by name because original hashmap is not ordered
|
||||
files.sort_by(|a, b| a.file_name().cmp(&b.file_name()));
|
||||
files
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Scratch directory used by the test identified by `number`.
    fn temp_root(number: u8) -> PathBuf {
        std::env::temp_dir().join(format!("test_du{}", number))
    }

    /// Create the scratch directory of test `number`, holding an empty `dir1` directory
    /// and a 13 bytes `file1` file. Leftovers of an aborted run are removed first.
    fn set_temp(number: u8) -> PathBuf {
        let root = temp_root(number);
        let _ = fs::remove_dir_all(&root);
        fs::create_dir_all(root.join("dir1")).unwrap();
        fs::write(root.join("file1"), b"Hello, world!").unwrap();
        root
    }

    /// Remove the scratch directory of test `number`.
    fn clear_temp(number: u8) {
        fs::remove_dir_all(temp_root(number)).unwrap();
    }

    /// Directory listings come in no particular order, so paths are sorted before
    /// being compared.
    fn sorted(mut paths: Vec<PathBuf>) -> Vec<PathBuf> {
        paths.sort();
        paths
    }

    #[test]
    fn simple_filetree() {
        let root = set_temp(1);
        let dir1 = root.join("dir1");
        let file1 = root.join("file1");

        let f = FileTree::new(&root, &false).unwrap();

        assert_eq!(f.root, root);
        for path in [&root, &dir1, &file1] {
            assert!(f.map.contains_key(path));
        }

        // for root: "Hello, world!" is 13 bytes and dir1 is empty, so the total is 13
        match f.map.get(&root).unwrap() {
            EntryNode::Path { children, size } => {
                assert_eq!(
                    sorted(children.clone()),
                    sorted(vec![dir1.clone(), file1.clone()])
                );
                assert_eq!(format!("{}", *size), "13");
            }
            _ => panic!("the root entry must be a directory"),
        }

        // for dir1
        match f.map.get(&dir1).unwrap() {
            EntryNode::Path { children, size } => {
                assert!(children.is_empty());
                assert_eq!(format!("{}", *size), "0"); // dir1 is empty
            }
            _ => panic!("dir1 must be a directory"),
        }

        // for file1
        match f.map.get(&file1).unwrap() {
            EntryNode::File { size } => assert_eq!(format!("{}", *size), "13"),
            _ => panic!("file1 must be a file"),
        }

        clear_temp(1);
    }

    #[test]
    fn simple_filetree_file() {
        // a tree built from a single file
        let file1 = set_temp(2).join("file1");
        let f = FileTree::new(&file1, &false).unwrap();

        assert_eq!(f.root, file1);
        assert!(f.map.contains_key(&file1));

        // "Hello, world!" is 13 bytes
        match f.map.get(&file1).unwrap() {
            EntryNode::File { size } => assert_eq!(format!("{}", *size), "13"),
            _ => panic!("the root entry must be a file"),
        }

        clear_temp(2);
    }

    #[test]
    fn filetree_fail() {
        let missing = std::env::temp_dir().join("test_du").join("shouldntexist");
        assert!(FileTree::new(&missing, &false).is_err());
    }

    #[test]
    fn get_root_test() {
        let root = set_temp(3);
        let f = FileTree::new(&root, &false).unwrap();
        assert_eq!(f.get_root(), root.as_path());
        clear_temp(3);
    }

    #[test]
    fn files_test() {
        let root = set_temp(4);
        fs::write(root.join("dir1").join("file2"), b"Hello, world!").unwrap();

        let f = FileTree::new(&root, &false).unwrap();
        // files() sorts by file name: file1 comes before file2
        let expected = vec![root.join("file1"), root.join("dir1").join("file2")];
        assert_eq!(f.files(), expected);

        clear_temp(4);
    }

    #[test]
    fn children() {
        let root = set_temp(5);
        let dir1 = root.join("dir1");
        let file1 = root.join("file1");
        let file2 = dir1.join("file2");
        fs::write(&file2, b"Hello, world!").unwrap();

        let f = FileTree::new(&root, &false).unwrap();

        assert_eq!(
            f.get_children(&root).map(sorted),
            Some(sorted(vec![dir1.clone(), file1.clone(), file2.clone()]))
        );
        assert_eq!(f.get_children(&dir1), Some(vec![file2]));
        // a file has no children
        assert_eq!(f.get_children(&file1), None);
        // neither has an unknown path
        assert_eq!(f.get_children(&root.join("shouldntexist")), None);

        clear_temp(5);
    }

    #[test]
    fn size_test() {
        let root = set_temp(6);
        fs::write(root.join("dir1").join("file2"), b"Hello, world!").unwrap();
        fs::create_dir_all(root.join("dir3")).unwrap();

        let f = FileTree::new(&root, &false).unwrap();

        // file1 and file2, 13 bytes each
        assert_eq!(format!("{}", f.get_size(&root).unwrap()), "26");
        assert_eq!(
            format!("{}", f.get_size(&root.join("file1")).unwrap()),
            "13"
        );
        // empty directory
        assert_eq!(format!("{}", f.get_size(&root.join("dir3")).unwrap()), "0");
        assert!(f.get_size(&root.join("shouldntexist")).is_none());

        clear_temp(6);
    }
}
|
134
src/main.rs
Normal file
134
src/main.rs
Normal file
@ -0,0 +1,134 @@
|
||||
//! Basic disk usage analyzer
|
||||
//!
|
||||
//! Include a command to show disk usage and file tree, ordered by size (default) or name. It is possible to filter files by "end with".
|
||||
//!
|
||||
//! Include a command to find duplicate files.
|
||||
//! # Examples
|
||||
//! show ~ disk usage
|
||||
//! ```
|
||||
//! $ cargo run -- usage ~
|
||||
//! ```
|
||||
//! show pdf files in ~/Downloads , ordered by name
|
||||
//! ```
|
||||
//! $ cargo run -- usage ~/Downloads --sort name --filter ".pdf"
|
||||
//! ```
|
||||
//! same command but with short options
|
||||
//! ```
|
||||
//! $ cargo run -- usage ~/Downloads -s name -f ".pdf"
|
||||
//! ```
|
||||
//! show duplicates in /tmp
|
||||
//! ```
|
||||
//! $ cargo run -- duplicates /tmp
|
||||
//! ```
|
||||
//! show duplicates in /tmp and check for collision
|
||||
//! ```
|
||||
//! $ cargo run -- duplicates /tmp --check
|
||||
//! ```
|
||||
//! same command but with short option
|
||||
//! ```
|
||||
//! $ cargo run -- duplicates /tmp -c
|
||||
//! ```
|
||||
//! for both duplicates and usage, to show directories not processed because of missing read permission
|
||||
//! ```
|
||||
//! $ cargo run -- duplicates ~ --access-denied
|
||||
//! $ cargo run -- usage ~ --access-denied
|
||||
//! ```
|
||||
//! same with short option
|
||||
//! ```
|
||||
//! $ cargo run -- duplicates ~ -a
|
||||
//! $ cargo run -- usage ~ -a
|
||||
//! ```
|
||||
|
||||
mod duplicates;
|
||||
mod file_tree;
|
||||
mod print_tree;
|
||||
mod size;
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use file_tree::FileTree;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
#[command(propagate_version = true)]
|
||||
struct Cli {
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
/// Show the disk usage tree for the given path
|
||||
Usage {
|
||||
/// (default '.')
|
||||
path: Option<PathBuf>,
|
||||
/// (default "size")
|
||||
#[arg(long = "sort", short = 's')]
|
||||
sort: Option<String>,
|
||||
/// (default None)
|
||||
#[arg(long = "filter", short = 'f')]
|
||||
filter: Option<String>,
|
||||
/// (default false)
|
||||
#[arg(long = "access-denied", short = 'a')]
|
||||
access: bool,
|
||||
},
|
||||
/// Find duplicates in given path
|
||||
Duplicates {
|
||||
/// (default '.')
|
||||
path: Option<PathBuf>,
|
||||
/// (default false)
|
||||
#[arg(long = "check", short = 'c')]
|
||||
check: bool,
|
||||
/// (default false)
|
||||
#[arg(long = "access-denied", short = 'a')]
|
||||
access: bool,
|
||||
},
|
||||
}
|
||||
|
||||
fn main() -> std::io::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
match &cli.command {
|
||||
Commands::Usage {
|
||||
path,
|
||||
sort,
|
||||
filter,
|
||||
access,
|
||||
} => {
|
||||
let path = path.as_deref().unwrap_or(Path::new("."));
|
||||
match sort {
|
||||
Some(ref s) if s == "size" => {
|
||||
FileTree::new(path, access)?.show(0, filter.as_deref())
|
||||
}
|
||||
None => FileTree::new(path, access)?.show(0, filter.as_deref()),
|
||||
Some(ref s) if s == "name" => {
|
||||
FileTree::new(path, access)?.show(1, filter.as_deref())
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
Commands::Duplicates {
|
||||
path,
|
||||
check,
|
||||
access,
|
||||
} => {
|
||||
let path = path.as_deref().unwrap_or(Path::new("."));
|
||||
let duplicate_files = FileTree::new(path, access)?.duplicates(path);
|
||||
if let Some(ref item_vec) = duplicate_files {
|
||||
for (i, item) in item_vec.iter().enumerate() {
|
||||
println!("DUPLICATES {}", i + 1);
|
||||
for i in item {
|
||||
println!("{}", i.display());
|
||||
}
|
||||
println!("\n")
|
||||
}
|
||||
} else {
|
||||
println!("No duplicate found in {}\n", path.display());
|
||||
}
|
||||
if *check {
|
||||
println!("collision found: {}", FileTree::collision(duplicate_files));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
103
src/print_tree.rs
Normal file
103
src/print_tree.rs
Normal file
@ -0,0 +1,103 @@
|
||||
//! Print human readable file tree.
|
||||
//!
|
||||
//! Print nodes in a filetree (files and directories) with their sizes and tree structure.
|
||||
//!
|
||||
//! Include filter and sort arguments.
|
||||
//!
|
||||
//! Filter by "end with" (preserve the tree structure when printing).
|
||||
//! Sort by name or by size.
|
||||
|
||||
use crate::file_tree::FileTree;
|
||||
use crate::size::Size;
|
||||
use std::path::Path;
|
||||
|
||||
impl FileTree {
|
||||
/// Print a FileTree in the console in a human readable format.
|
||||
///
|
||||
/// sort = 0 is size order
|
||||
///
|
||||
/// sort = 1 is lexicographical order
|
||||
///
|
||||
/// Any other integer for sort won't fail but nodes won't be sorted and order might seem random.
|
||||
///
|
||||
/// filter_suffix is for "end with" filter, will still preserve tree structure (will show parent directories if one descendant pass the filter).
|
||||
///
|
||||
/// use filter_suffix = None to show everything
|
||||
/// # Examples
|
||||
///
|
||||
/// ```no_run
|
||||
/// let p = Path::new(".");
|
||||
/// let f = FileTree::new(&p);
|
||||
/// f.show(1,None); // show everything ordered by name
|
||||
/// f.show(0,Some(".pdf")); // show pdf files (preserving tree structure) ordered by size
|
||||
/// ```
|
||||
pub fn show(&self, sort: u8, filter_suffix: Option<&str>) {
|
||||
let root = self.get_root();
|
||||
self.print_entry(root, &self.get_size(root).unwrap(), 0);
|
||||
self.show_recursive(root, 1, sort, filter_suffix);
|
||||
}
|
||||
|
||||
/// recursively visit a FileTree to print it
|
||||
///
|
||||
/// do not use alone, wrapper is show()
|
||||
fn show_recursive(&self, path: &Path, indent: usize, sort: u8, filter_suffix: Option<&str>) {
|
||||
if let Some(mut children) = self.get_direct_children(path) {
|
||||
match sort {
|
||||
0 => {
|
||||
// Sort children by size in descending order
|
||||
children.sort_by_key(|b| std::cmp::Reverse(self.get_size(b)))
|
||||
}
|
||||
1 => children.sort_by(|a, b| a.file_name().cmp(&b.file_name())),
|
||||
_ => (),
|
||||
}
|
||||
|
||||
for child in children {
|
||||
if let Some(entry) = self.get_size(&child) {
|
||||
if let Some(suffix) = filter_suffix {
|
||||
let mut descendant = self.get_children(&child).unwrap_or_default();
|
||||
descendant.push(child.clone());
|
||||
|
||||
let mut found = false;
|
||||
|
||||
// we check if one descendant pass the filter
|
||||
// currently though we do it more than necessary
|
||||
// might be more efficient to work with get_direct_children and memorize which parents pass the filter
|
||||
// might be possible to visit descendant in reverse order and have a bit with true or false (default false) for each item
|
||||
for item in descendant {
|
||||
if item
|
||||
.clone()
|
||||
.into_os_string()
|
||||
.into_string()
|
||||
.unwrap()
|
||||
.ends_with(suffix)
|
||||
{
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if found {
|
||||
self.print_entry(&child, &entry, indent);
|
||||
}
|
||||
} else {
|
||||
self.print_entry(&child, &entry, indent);
|
||||
}
|
||||
|
||||
if let Some(_grandchildren) = self.get_direct_children(&child) {
|
||||
self.show_recursive(&child, indent + 1, sort, filter_suffix);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// print a single entry
|
||||
///
|
||||
/// do not use alone, is part os show_recursive()
|
||||
fn print_entry(&self, path: &Path, size: &Size, indent: usize) {
|
||||
let indentation = "\t".repeat(indent);
|
||||
println!("{}{}\t{}", indentation, size, path.display());
|
||||
}
|
||||
}
|
||||
|
||||
// no unittest here for now as show is mostly "visual"
|
92
src/size.rs
Normal file
92
src/size.rs
Normal file
@ -0,0 +1,92 @@
|
||||
//! size in bytes
|
||||
//!
|
||||
//! implement fmt::Display to display size in a human readable unit (B, KB, MB, GB, TB).
|
||||
//!
|
||||
//! For Bytes, unit isn't printed.
|
||||
//!
|
||||
//! implement std::ops::Add
|
||||
|
||||
use std::fmt;
|
||||
|
||||
/// A size expressed in bytes.
#[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Copy, Clone)]
pub struct Size(u64);

impl Size {
    /// Build a `Size` from a number of bytes.
    pub fn new(bytes: u64) -> Self {
        Self(bytes)
    }
}

impl fmt::Display for Size {
    /// Write the size in a human readable unit with at most one decimal.
    ///
    /// Sizes below 1000 are written as a plain number (no unit means Bytes).
    /// Bigger sizes use KB, MB, GB, TB, PB or EB; the unit is picked from the
    /// number of decimal digits and the value is divided by the matching
    /// power of 1024. Every `u64` value is covered, so formatting never panics.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // One unit per group of 3 decimal digits; the first group (Bytes) has no unit.
        const UNITS: [&str; 7] = ["", "KB", "MB", "GB", "TB", "PB", "EB"];

        // number of digits - 1 (0 is treated as a single digit)
        let magnitude: u32 = self.0.checked_ilog10().unwrap_or(0);

        // 1000 ~ 1024: 1000^n and 1024^n have the same number of digits for every n used here.
        // u64::MAX has 20 digits, so group is at most 19 / 3 = 6 and always indexes UNITS.
        let group = magnitude / 3;
        if group == 0 {
            return write!(f, "{}", self.0);
        }

        // 1024^group is a power of two (at most 2^60), hence exactly representable as f64.
        let divisor = (1u64 << (10 * group)) as f64;
        // keep at most 1 decimal
        let value = (self.0 as f64 / divisor * 10.0).round() / 10.0;
        write!(f, "{}{}", value, UNITS[group as usize])
    }
}

impl std::ops::Add for Size {
    type Output = Self;

    /// Add two sizes; the result saturates at `u64::MAX` bytes instead of overflowing.
    fn add(self, other: Self) -> Self::Output {
        Self(self.0.saturating_add(other.0))
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The constructor stores the byte count unchanged.
    #[test]
    fn create() {
        let size = Size::new(1);
        assert_eq!(size.0, 1);
    }

    /// Adding two sizes adds their byte counts.
    #[test]
    fn add() {
        let total = Size::new(60) + Size::new(40);
        assert_eq!(total.0, 100);
    }

    /// Display picks the unit and keeps at most one decimal.
    #[test]
    fn display() {
        // sanity check on the digit count used to pick the unit
        assert_eq!(10u32.checked_ilog10().unwrap_or(0) + 1, 2);

        let cases = [
            (1024, "1KB"),
            // 1700/1024 = 1.66015625
            (1700, "1.7KB"),
            // 2411724/(1024^2) = 2.299999237060547
            (2411724, "2.3MB"),
        ];
        for (bytes, expected) in cases {
            assert_eq!(Size::new(bytes).to_string(), expected);
        }
    }
}
|
Reference in New Issue
Block a user