initial commit

2025-04-05 19:31:07 +02:00
commit a5a3133ef0
8 changed files with 1148 additions and 0 deletions

src/duplicates.rs (new file, 326 lines)

@@ -0,0 +1,326 @@
//! Find duplicate files
//!
//! Find duplicate files in a `FileTree` by comparing their SHA-256 hashes.
use crate::FileTree;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
// hash
use sha2::{Digest, Sha256};
use std::io::{BufReader, Read};
// parallelism
use rayon::prelude::*;
use std::sync::{Arc, Mutex};
impl FileTree {
/// Find duplicates in a directory (including sub-directories).
///
/// If the path exists in the tree, find duplicates using SHA-256: if two (or more) files have the same hash, they are considered duplicates.
///
/// Returns a `Vec` containing a `Vec<PathBuf>` for each group of duplicates.
///
/// If the path doesn't exist, or if no duplicates are found, returns `None`.
///
/// # Examples
/// Assuming the directory structure below (files in lowercase, directories in uppercase, and the same letter meaning identical content), `dups` will contain `Some(Vec<Vec<_>>)` with the group `a`, `A/a`, `B/a`.
///
/// ./{A/a,B/a,a,C/c}
///
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let dups = f.duplicates(&p);
/// ```
pub fn duplicates(&self, path: &Path) -> Option<Vec<Vec<PathBuf>>> {
match self.get_children(path) {
Some(value) => {
// every path in subdir (directories+files)
let children: HashSet<PathBuf> = value.into_iter().collect();
// every file (whole tree)
let files: HashSet<PathBuf> = self.files().into_iter().collect();
// every file in the subdir
let intersection: Vec<_> = children.intersection(&files).cloned().collect();
// Arc<Mutex<_>> used to safely update HashMap in parallel
let hashes: Arc<Mutex<HashMap<[u8; 32], Vec<PathBuf>>>> =
Arc::new(Mutex::new(HashMap::new()));
// parallelized loop
intersection.par_iter().for_each(|item| {
if let Ok(file) = fs::File::open(item) {
let mut sha256 = Sha256::new();
// sizable buffer
let mut buffer = [0; 8192];
let mut reader = BufReader::new(file);
// https://rust-lang-nursery.github.io/rust-cookbook/cryptography/hashing.html
loop {
let count = reader.read(&mut buffer).unwrap();
// `?` can't be used here (the rayon closure returns `()`)
// and the unwrap should not fail in this context
if count == 0 {
break;
}
sha256.update(&buffer[..count]);
}
let hash = sha256.finalize();
// Use a Mutex to update HashMap in parallel
let mut locked_hashes = hashes.lock().unwrap();
locked_hashes
.entry(hash.into())
.or_default()
.push(item.clone());
}
});
// extract result from Mutex
// converting Arc<Mutex<HashMap>> into HashMap
let hashes = Arc::try_unwrap(hashes).ok().unwrap().into_inner().unwrap();
let dups = hashes
.values()
.filter(|a| a.len() > 1) // if more than 1 path for a hash it's a duplicate
.cloned()
.collect::<Vec<Vec<PathBuf>>>();
if dups.is_empty() {
None
} else {
Some(dups)
}
}
_ => None,
}
}
/// Collisions shouldn't happen with SHA-256, but this method checks whether any group of duplicates found actually contains a collision.
/// # Examples
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let dups = f.duplicates(&p);
/// let check = FileTree::collision(dups);
/// ```
pub fn collision(dups: Option<Vec<Vec<PathBuf>>>) -> bool {
match dups {
Some(value) => {
for mut subgroup in value {
let path1 = subgroup.pop().unwrap();
while let Some(path2) = subgroup.pop() {
if !Self::raw_compare(&path1, &path2) {
return true; // files contents are different, it's a collision
}
}
}
false
}
None => false, // no collision since there are no duplicates
}
} // no unit test for collision, as finding one is highly unlikely and no SHA-256 collisions are publicly known yet
/// Check if two files have exactly the same content, byte by byte; returns a boolean.
fn raw_compare(path1: &Path, path2: &Path) -> bool {
if let Ok(file1) = fs::File::open(path1) {
if let Ok(file2) = fs::File::open(path2) {
// sizable buffer
let mut buffer1 = [0; 8192];
let mut buffer2 = [0; 8192];
let mut reader1 = BufReader::new(file1);
let mut reader2 = BufReader::new(file2);
loop {
let count1 = reader1.read(&mut buffer1).unwrap();
let count2 = reader2.read(&mut buffer2).unwrap();
if count1 == 0 || count2 == 0 {
if count1 != count2 {
return false;
} // files do not have same size
break;
}
if buffer1[..count1] != buffer2[..count2] {
return false;
}
}
true
} else {
panic!("{}", format!("couldn't read file {}", path2.display()))
}
} else {
panic!("{}", format!("couldn't read file {}", path1.display()))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use std::io::prelude::*;
use std::process::Command;
#[test]
fn duplicate_test() {
// create temporary test directories and files
let _ = Command::new("mkdir")
.arg("/tmp/test_du_dups1/dir1/dir11")
.arg("-p")
.spawn()
.unwrap()
.wait();
let _ = Command::new("mkdir")
.arg("/tmp/test_du_dups1/dir1/dir12")
.arg("-p")
.spawn()
.unwrap()
.wait();
// let's create the same file 3 times
let mut file = File::create("/tmp/test_du_dups1/file1").unwrap();
let _ = file.write_all(b"Hello, world!").unwrap();
let _ = Command::new("cp")
.arg("/tmp/test_du_dups1/file1")
.arg("/tmp/test_du_dups1/dir1/")
.arg("-p")
.spawn()
.unwrap()
.wait();
let _ = Command::new("cp")
.arg("/tmp/test_du_dups1/file1")
.arg("/tmp/test_du_dups1/file2")
.arg("-p")
.spawn()
.unwrap()
.wait();
let _ = Command::new("cp")
.arg("/tmp/test_du_dups1/file1")
.arg("/tmp/test_du_dups1/dir1/dir11/")
.arg("-p")
.spawn()
.unwrap()
.wait();
// let's create a different set of duplicates
let mut file1 = File::create("/tmp/test_du_dups1/file10").unwrap();
let _ = file1.write_all(b"lorem ipsum dolor sit amet").unwrap();
let _ = Command::new("cp")
.arg("/tmp/test_du_dups1/file10")
.arg("/tmp/test_du_dups1/dir1/")
.arg("-p")
.spawn()
.unwrap()
.wait();
// and a unique file here
let mut file2 = File::create("/tmp/test_du_dups1/file100").unwrap();
let _ = file2.write_all(b"consectetur adipiscing elit").unwrap();
let p = Path::new("/tmp/test_du_dups1");
let f = FileTree::new(&p, &false).unwrap();
let mut res = f.duplicates(&p).unwrap(); // unwrap asserts it's not none
res.sort_by_key(|b| b.len());
let expected_vec: Vec<Vec<PathBuf>> = vec![
vec![
Path::new("/tmp/test_du_dups1/file10").to_path_buf(),
Path::new("/tmp/test_du_dups1/dir1/file10").to_path_buf(),
],
vec![
Path::new("/tmp/test_du_dups1/file1").to_path_buf(),
Path::new("/tmp/test_du_dups1/dir1/dir11/file1").to_path_buf(),
Path::new("/tmp/test_du_dups1/dir1/file1").to_path_buf(),
Path::new("/tmp/test_du_dups1/file2").to_path_buf(),
],
];
// using a HashMap to find duplicates, items are not ordered, so we compare them using a set
// first set of duplicates
let expected = expected_vec[0].iter().cloned().collect::<HashSet<_>>();
let actual = res[0].iter().cloned().collect::<HashSet<_>>();
assert_eq!(expected, actual);
// second set of duplicates
let expected = expected_vec[1].iter().cloned().collect::<HashSet<_>>();
let actual = res[1].iter().cloned().collect::<HashSet<_>>();
assert_eq!(expected, actual);
// sets do not allow multiple elements so we have to check vec length separately
assert_eq!(expected_vec[0].len(), res[0].len());
assert_eq!(expected_vec[1].len(), res[1].len());
// we check if we have same number of duplicate groups
assert_eq!(expected_vec.len(), res.len());
//clean
let _ = Command::new("rm")
.arg("-rf")
.arg("/tmp/test_du_dups1")
.spawn()
.unwrap()
.wait();
}
#[test]
fn duplicate_test_empty() {
// create temporary test directories and files
let _ = Command::new("mkdir")
.arg("/tmp/test_du_dups2/dir1")
.arg("-p")
.spawn()
.unwrap()
.wait();
let mut file = File::create("/tmp/test_du_dups2/file10").unwrap();
let _ = file
.write_all(b"Integer faucibus sapien vitae aliquet dapibus")
.unwrap();
let mut file1 = File::create("/tmp/test_du_dups2/dir1/file10").unwrap();
let _ = file1.write_all(b"Pellentesque at pharetra enim").unwrap();
let p = Path::new("/tmp/test_du_dups2");
let f = FileTree::new(&p, &false).unwrap();
let res = f.duplicates(&p);
// should not find duplicates
assert!(res.is_none());
// clean
let _ = Command::new("rm")
.arg("-rf")
.arg("/tmp/test_du_dups2")
.spawn()
.unwrap()
.wait();
}
#[test]
fn file_compare() {
// create temporary test directory and files
let _ = Command::new("mkdir")
.arg("/tmp/test_du_compare")
.arg("-p")
.spawn()
.unwrap()
.wait();
let mut file = File::create("/tmp/test_du_compare/file1").unwrap();
let _ = file.write_all(b"Cras eleifend nisi nibh").unwrap();
let mut file = File::create("/tmp/test_du_compare/file2").unwrap();
let _ = file.write_all(b"Cras eleifend nisi nibh").unwrap();
let mut file = File::create("/tmp/test_du_compare/file3").unwrap();
let _ = file.write_all(b"a blandit elit mattis et").unwrap();
let p1 = Path::new("/tmp/test_du_compare/file1");
let p2 = Path::new("/tmp/test_du_compare/file2");
let p3 = Path::new("/tmp/test_du_compare/file3");
assert!(FileTree::raw_compare(p1, p2));
assert!(!FileTree::raw_compare(p1, p3));
// clean
let _ = Command::new("rm")
.arg("-rf")
.arg("/tmp/test_du_compare")
.spawn()
.unwrap()
.wait();
}
}

src/file_tree.rs (new file, 453 lines)

@@ -0,0 +1,453 @@
//! Tree structure representation
//!
//! Implements a tree structure to represent data on a disk (files and directories).
//!
//! Includes methods to
//! + construct this tree
//! + get a directory's children
//! + get a file's or directory's size
//! + get the files in the tree.
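//!
//! # Examples
//! A minimal usage sketch (following the `no_run` style used elsewhere in these docs; it assumes the tree is built from the current directory):
//! ```no_run
//! let p = Path::new(".");
//! // build the tree, staying quiet about unreadable paths (access_denied = false)
//! let tree = FileTree::new(&p, &false).unwrap();
//! // total size of everything under the root
//! let total = tree.get_size(tree.get_root());
//! // every file (not directory) found in the tree
//! let files = tree.files();
//! ```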
use crate::size::Size;
use std::collections::HashMap;
use std::fs;
use std::io::{Error, ErrorKind};
use std::path::{Path, PathBuf};
/// FileTree structure
pub struct FileTree {
/// tree root (base path)
root: PathBuf,
/// HashMap containing every path in the tree (every file and directory, sub-directories included) and its associated data (size and children).
map: HashMap<PathBuf, EntryNode>,
}
/// A node in the tree: either a file (size only) or a directory (direct children and total size)
enum EntryNode {
File { size: Size },
Path { children: Vec<PathBuf>, size: Size },
}
impl FileTree {
/// Create a new filetree from given path.
///
/// Returns an error if the path doesn't exist or if the user doesn't have read permission on it.
///
/// If the user is missing read permission on a sub-directory, that sub-directory is ignored and the rest of the tree is still built.
///
/// `access_denied = true` prints the ignored paths; `access_denied = false` hides that these paths were ignored.
///
/// # Examples
///
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false);
/// ```
pub fn new(root: &Path, access_denied: &bool) -> std::io::Result<Self> {
if root.exists() {
if let Ok(meta) = fs::metadata(root) {
let r = root.to_path_buf();
if meta.is_dir() {
let entries = fs::read_dir(root)?;
let mut paths: Vec<PathBuf> = entries
.filter_map(|entry| match entry {
Ok(entry) => Some(entry.path()),
Err(_) => None,
})
.collect();
let e = EntryNode::Path {
children: paths.clone(),
size: Size::new(0),
};
let mut map_entry: Vec<(PathBuf, EntryNode)> = Vec::new();
map_entry.push((r.clone(), e));
while let Some(current_path) = paths.pop() {
if !(current_path.is_symlink()) {
// need to find a better way to handle symlinks
if let Ok(meta) = fs::metadata(current_path.clone()) {
if meta.is_dir() {
if let Ok(entries) = fs::read_dir(&current_path) {
let new_paths: Vec<PathBuf> = entries
.filter_map(|entry| match entry {
Ok(entry) => Some(entry.path()),
Err(_) => None,
})
.collect();
let e = EntryNode::Path {
children: new_paths.clone(),
size: Size::new(0),
};
map_entry.push((current_path.clone(), e));
paths.extend(new_paths);
} else if *access_denied {
println!("{} children not processed, might be missing read permission",current_path.display());
}
} else {
// file
let e = EntryNode::File {
size: Size::new(meta.len()),
};
map_entry.push((current_path, e));
}
} else if *access_denied {
println!(
"{} not processed, might be missing read permission",
current_path.display()
);
}
}
}
let hashmap: HashMap<PathBuf, EntryNode> = map_entry.into_iter().collect();
let mut tree = Self {
root: r.clone(),
map: hashmap,
};
tree.set_size();
Ok(tree)
} else {
let e = EntryNode::File {
size: Size::new(meta.len()),
};
Ok(Self {
root: r.clone(),
map: HashMap::from([(r, e)]),
})
}
} else {
Err(Error::new(
ErrorKind::Other,
format!("Error processing path {}", root.display()),
))
}
} else {
Err(Error::new(ErrorKind::Other, "path not found"))
}
}
/// Set the size attribute of every EntryNode in the tree
fn set_size(&mut self) {
let mut paths: Vec<_> = self.map.keys().cloned().collect();
paths.sort_by_key(|b| std::cmp::Reverse(b.as_os_str().len())); // a child's path is necessarily longer than its parent's, so we process the longest paths first
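// e.g. (illustrative paths) the size of "/a/b/sub" is finalized before "/a/b" sums its children, which in turn happens before "/a" does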
for path in paths {
let mut s = Size::new(0);
if let Some(EntryNode::Path { children, .. }) = self.map.get_mut(path.as_path()) {
for child in children.clone() {
// children were already processed before so we can get their size
if let Some(EntryNode::Path { size, .. }) | Some(EntryNode::File { size }) =
self.map.get(child.as_path())
{
s = s + *size;
}
}
}
if let Some(EntryNode::Path { ref mut size, .. }) = self.map.get_mut(path.as_path()) {
*size = s;
}
}
}
/// return FileTree root
/// # Examples
///
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let root = f.get_root();
/// assert_eq!(root, p);
/// ```
pub fn get_root(&self) -> &Path {
self.root.as_path()
}
/// Return a Vec with every path in the subtree found from the given path (recursively).
///
/// Returns None if the given path doesn't exist in the FileTree.
/// # Examples
/// Assuming a directory structure like the one below, `children` will contain `a` (and, recursively, whatever files and directories `a` may contain).
///
/// ./{A/a,B}
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let children = f.get_children(&Path::new("./A"));
/// ```
pub fn get_children(&self, path: &Path) -> Option<Vec<PathBuf>> {
if let Some(vec_paths) = self.get_direct_children(path) {
let mut paths = vec_paths;
let mut temp_paths = paths.clone();
while let Some(current_path) = temp_paths.pop() {
if let Some(vec_paths) = self.get_direct_children(current_path.as_path()) {
paths.extend(vec_paths.clone());
temp_paths.extend(vec_paths);
}
}
Some(paths)
} else {
None
}
}
/// Return the direct children of a given path (non-recursive, depth = 1).
///
/// None if path is a file, or doesn't exist.
///
/// A `Vec<PathBuf>` if it's a directory.
/// # Examples
/// Assuming a directory structure like the one below, `children` will contain `a` and `b`, but not `a/c`.
///
/// ./{A/{a/c,b},B}
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let children = f.get_direct_children(&Path::new("./A"));
/// ```
pub fn get_direct_children(&self, path: &Path) -> Option<Vec<PathBuf>> {
match self.map.get(path) {
Some(EntryNode::Path { children, .. }) => Some(children.clone()),
Some(EntryNode::File { .. }) => None,
_ => None,
}
}
/// If the given path exists in the FileTree, return its size; None if it doesn't. The size can be 0 (an empty directory, for example).
/// # Examples
/// size of entire tree (size from root):
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let size = f.get_size(&p);
/// ```
pub fn get_size(&self, path: &Path) -> Option<Size> {
match self.map.get(path) {
Some(EntryNode::File { size }) | Some(EntryNode::Path { size, .. }) => Some(*size),
_ => None,
}
}
/// Return a Vec with all the files in the FileTree (and only the files); it can be empty.
/// # Examples
/// Assuming a directory structure like the one below (lowercase for files, uppercase for directories), `files` will contain only `a`, `b`, and `c`.
///
/// ./{A/a,B/b,c}
///
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// let files = f.files();
/// ```
// an iterator instead of a Vec<PathBuf> would be lazy,
// however, we use files to find an intersection with children in a subtree to find duplicates, an iterator wouldn't be useful for this
pub fn files(&self) -> Vec<PathBuf> {
let mut files = self
.map
.iter()
.filter(|(key, _value)| !key.is_dir())
.map(|(key, _value)| key.clone())
.collect::<Vec<PathBuf>>();
// we order by name because original hashmap is not ordered
files.sort_by(|a, b| a.file_name().cmp(&b.file_name()));
files
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use std::io::prelude::*;
use std::process::Command;
fn set_temp(number: u8) {
// create temporary test directories and files
let _ = Command::new("mkdir")
.arg(format!("/tmp/test_du{}/dir1", number))
.arg("-p")
.spawn()
.unwrap()
.wait();
let mut file = File::create(format!("/tmp/test_du{}/file1", number)).unwrap();
let _ = file.write_all(b"Hello, world!").unwrap();
}
fn clear_temp(number: u8) {
let _ = Command::new("rm")
.arg("-rf")
.arg(format!("/tmp/test_du{}", number))
.spawn()
.unwrap()
.wait();
}
#[test]
fn simple_filetree() {
set_temp(1);
let p = Path::new("/tmp/test_du1");
let p_buf = p.to_path_buf();
let f = FileTree::new(&p, &false).unwrap();
let paths = [
Path::new("/tmp/test_du1").to_path_buf(),
Path::new("/tmp/test_du1/dir1").to_path_buf(),
Path::new("/tmp/test_du1/file1").to_path_buf(),
];
assert_eq!(f.root, p_buf);
for path in paths {
assert_eq!(f.map.contains_key(&path), true);
}
// for root
let expected_children = vec![
Path::new("/tmp/test_du1/dir1").to_path_buf(),
Path::new("/tmp/test_du1/file1").to_path_buf(),
];
let expected_size = "13"; // "Hello, wold!" has 13 characters, so file1 = 26B, directories are 0B
let actual_value = f.map.get(&p_buf).unwrap();
match actual_value {
EntryNode::Path {
children: actual_children,
size,
} => {
assert_eq!(*actual_children, expected_children);
assert_eq!(format!("{}", *size), expected_size)
}
_ => panic!(), // shouldn't happen this entry refers to a dir
}
// for dir1
let expected_children: Vec<PathBuf> = vec![];
let actual_value = f
.map
.get(&Path::new("/tmp/test_du1/dir1").to_path_buf())
.unwrap();
match actual_value {
EntryNode::Path {
children: actual_children,
size,
} => {
assert_eq!(*actual_children, expected_children);
assert_eq!(format!("{}", *size), "0"); // dir1 is empty
}
_ => panic!(), // shouldn't happen this entry refers to a dir
}
// for file1
let expected_size = "13";
let actual_value = f
.map
.get(&Path::new("/tmp/test_du1/file1").to_path_buf())
.unwrap();
match actual_value {
EntryNode::File { size } => assert_eq!(format!("{}", *size), expected_size),
_ => panic!(), // this entry refers to a file
}
clear_temp(1);
}
#[test]
fn simple_filetree_file() {
set_temp(2);
// for a file
let p2 = Path::new("/tmp/test_du2/file1");
let p_buf2 = p2.to_path_buf();
let f2 = FileTree::new(&p2, &false).unwrap();
assert_eq!(f2.root, p_buf2);
assert_eq!(f2.map.contains_key(&p_buf2), true);
let expected_size = "13"; // "Hello, world!" has 13 characters
let actual_value = f2.map.get(&p_buf2).unwrap();
match actual_value {
EntryNode::File { size } => assert_eq!(format!("{}", *size), expected_size),
_ => panic!(), // shouldn't happen f2 refers to a file
}
// clean
clear_temp(2);
}
#[test]
fn filetree_fail() {
let p = Path::new("/tmp/test_du/shouldntexist");
let f = FileTree::new(&p, &false);
assert!(f.is_err());
}
#[test]
fn get_root_test() {
set_temp(3);
let p = Path::new("/tmp/test_du3");
let f = FileTree::new(&p, &false).unwrap();
assert_eq!(f.get_root(), p);
clear_temp(3);
}
#[test]
fn files_test() {
set_temp(4);
let mut file = File::create("/tmp/test_du4/dir1/file2").unwrap();
let _ = file.write_all(b"Hello, world!").unwrap();
let p = Path::new("/tmp/test_du4");
let f = FileTree::new(&p, &false).unwrap();
let expected = vec![
Path::new("/tmp/test_du4/file1").to_path_buf(),
Path::new("/tmp/test_du4/dir1/file2").to_path_buf(),
];
assert_eq!(f.files(), expected);
clear_temp(4);
}
#[test]
fn children() {
set_temp(5);
let mut file = File::create("/tmp/test_du5/dir1/file2").unwrap();
let _ = file.write_all(b"Hello, world!").unwrap();
let p = Path::new("/tmp/test_du5");
let f = FileTree::new(&p, &false).unwrap();
let expected = vec![
Path::new("/tmp/test_du5/dir1").to_path_buf(),
Path::new("/tmp/test_du5/file1").to_path_buf(),
Path::new("/tmp/test_du5/dir1/file2").to_path_buf(),
];
assert_eq!(f.get_children(&p), Some(expected));
let p = Path::new("/tmp/test_du5/dir1");
let expected = vec![Path::new("/tmp/test_du5/dir1/file2").to_path_buf()];
assert_eq!(f.get_children(&p), Some(expected));
let p = Path::new("/tmp/test_du5/file1");
assert_eq!(f.get_children(&p), None);
let p = Path::new("/tmp/test_du5/shouldntexist");
assert_eq!(f.get_children(&p), None);
clear_temp(5);
}
#[test]
fn size_test() {
set_temp(6);
let mut file = File::create("/tmp/test_du6/dir1/file2").unwrap();
let _ = file.write_all(b"Hello, world!").unwrap();
let _ = Command::new("mkdir")
.arg("/tmp/test_du6/dir3")
.arg("-p")
.spawn()
.unwrap()
.wait();
let p = Path::new("/tmp/test_du6");
let f = FileTree::new(&p, &false).unwrap();
let expected_size = "26"; // file1 and file2
assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
let p = Path::new("/tmp/test_du6/file1");
let expected_size = "13";
assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
let p = Path::new("/tmp/test_du6/dir3");
let expected_size = "0";
assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
let p = Path::new("/tmp/test_du6/shouldntexist");
assert!(f.get_size(&p).is_none());
clear_temp(6);
}
}

src/main.rs (new file, 134 lines)

@@ -0,0 +1,134 @@
//! Basic disk usage analyzer
//!
//! Includes a command to show the disk usage file tree, ordered by size (default) or by name. Files can be filtered by suffix ("ends with").
//!
//! Includes a command to find duplicate files.
//! # Examples
//! show ~ disk usage
//! ```text
//! $ cargo run -- usage ~
//! ```
//! show pdf files in ~/Downloads , ordered by name
//! ```text
//! $ cargo run -- usage ~/Downloads --sort name --filter ".pdf"
//! ```
//! same command but with short options
//! ```text
//! $ cargo run -- usage ~/Downloads -s name -f ".pdf"
//! ```
//! show duplicates in /tmp
//! ```text
//! $ cargo run -- duplicates /tmp
//! ```
//! show duplicates in /tmp and check for collision
//! ```text
//! $ cargo run -- duplicates /tmp --check
//! ```
//! same command but with short option
//! ```text
//! $ cargo run -- duplicates /tmp -c
//! ```
//! for both duplicates and usage, to show directories not processed because of missing read permission
//! ```text
//! $ cargo run -- duplicates ~ --access-denied
//! $ cargo run -- usage ~ --access-denied
//! ```
//! same with short option
//! ```text
//! $ cargo run -- duplicates ~ -a
//! $ cargo run -- usage ~ -a
//! ```
mod duplicates;
mod file_tree;
mod print_tree;
mod size;
use clap::{Parser, Subcommand};
use file_tree::FileTree;
use std::path::{Path, PathBuf};
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(propagate_version = true)]
struct Cli {
#[command(subcommand)]
command: Commands,
}
#[derive(Subcommand)]
enum Commands {
/// Show the disk usage tree for the given path
Usage {
/// (default '.')
path: Option<PathBuf>,
/// (default "size")
#[arg(long = "sort", short = 's')]
sort: Option<String>,
/// (default None)
#[arg(long = "filter", short = 'f')]
filter: Option<String>,
/// (default false)
#[arg(long = "access-denied", short = 'a')]
access: bool,
},
/// Find duplicates in given path
Duplicates {
/// (default '.')
path: Option<PathBuf>,
/// (default false)
#[arg(long = "check", short = 'c')]
check: bool,
/// (default false)
#[arg(long = "access-denied", short = 'a')]
access: bool,
},
}
fn main() -> std::io::Result<()> {
let cli = Cli::parse();
match &cli.command {
Commands::Usage {
path,
sort,
filter,
access,
} => {
let path = path.as_deref().unwrap_or(Path::new("."));
match sort {
Some(ref s) if s == "size" => {
FileTree::new(path, access)?.show(0, filter.as_deref())
}
None => FileTree::new(path, access)?.show(0, filter.as_deref()),
Some(ref s) if s == "name" => {
FileTree::new(path, access)?.show(1, filter.as_deref())
}
_ => (),
}
}
Commands::Duplicates {
path,
check,
access,
} => {
let path = path.as_deref().unwrap_or(Path::new("."));
let duplicate_files = FileTree::new(path, access)?.duplicates(path);
if let Some(ref item_vec) = duplicate_files {
for (i, item) in item_vec.iter().enumerate() {
println!("DUPLICATES {}", i + 1);
for path in item {
println!("{}", path.display());
}
println!("\n")
}
} else {
println!("No duplicate found in {}\n", path.display());
}
if *check {
println!("collision found: {}", FileTree::collision(duplicate_files));
}
}
}
Ok(())
}

src/print_tree.rs (new file, 103 lines)

@@ -0,0 +1,103 @@
//! Print a human-readable file tree.
//!
//! Print the nodes of a FileTree (files and directories) with their sizes and the tree structure.
//!
//! Includes filter and sort arguments.
//!
//! Filter by suffix ("ends with"), preserving the tree structure when printing.
//! Sort by name or by size.
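//!
//! # Examples
//! With the small tree used in the file_tree tests (file1 = 13 bytes, dir1 empty), `f.show(0, None)` would print roughly the following
//! (size, a tab, then the path, indented one tab per depth level; tabs shown here as spaces):
//! ```text
//! 13      /tmp/test_du1
//!         13      /tmp/test_du1/file1
//!         0       /tmp/test_du1/dir1
//! ```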
use crate::file_tree::FileTree;
use crate::size::Size;
use std::path::Path;
impl FileTree {
/// Print a FileTree in the console in a human readable format.
///
/// sort = 0 is size order
///
/// sort = 1 is lexicographical order
///
/// Any other integer for sort won't fail but nodes won't be sorted and order might seem random.
///
/// filter_suffix is for "end with" filter, will still preserve tree structure (will show parent directories if one descendant pass the filter).
///
/// use filter_suffix = None to show everything
/// # Examples
///
/// ```no_run
/// let p = Path::new(".");
/// let f = FileTree::new(&p, &false).unwrap();
/// f.show(1,None); // show everything ordered by name
/// f.show(0,Some(".pdf")); // show pdf files (preserving tree structure) ordered by size
/// ```
pub fn show(&self, sort: u8, filter_suffix: Option<&str>) {
let root = self.get_root();
self.print_entry(root, &self.get_size(root).unwrap(), 0);
self.show_recursive(root, 1, sort, filter_suffix);
}
/// Recursively visit a FileTree to print it.
///
/// Do not use alone; the wrapper is show().
fn show_recursive(&self, path: &Path, indent: usize, sort: u8, filter_suffix: Option<&str>) {
if let Some(mut children) = self.get_direct_children(path) {
match sort {
0 => {
// Sort children by size in descending order
children.sort_by_key(|b| std::cmp::Reverse(self.get_size(b)))
}
1 => children.sort_by(|a, b| a.file_name().cmp(&b.file_name())),
_ => (),
}
for child in children {
if let Some(entry) = self.get_size(&child) {
if let Some(suffix) = filter_suffix {
let mut descendant = self.get_children(&child).unwrap_or_default();
descendant.push(child.clone());
let mut found = false;
// we check if at least one descendant passes the filter
// currently we do this more often than necessary
// it might be more efficient to work with get_direct_children and memoize which parents pass the filter
// or to visit descendants in reverse order, keeping a boolean (default false) for each item
for item in descendant {
if item
.clone()
.into_os_string()
.into_string()
.unwrap()
.ends_with(suffix)
{
found = true;
break;
}
}
if found {
self.print_entry(&child, &entry, indent);
}
} else {
self.print_entry(&child, &entry, indent);
}
if let Some(_grandchildren) = self.get_direct_children(&child) {
self.show_recursive(&child, indent + 1, sort, filter_suffix);
}
}
}
}
}
/// Print a single entry.
///
/// Do not use alone; it is part of show_recursive().
fn print_entry(&self, path: &Path, size: &Size, indent: usize) {
let indentation = "\t".repeat(indent);
println!("{}{}\t{}", indentation, size, path.display());
}
}
// no unit test here for now, as show() is mostly "visual"

src/size.rs (new file, 92 lines)

@@ -0,0 +1,92 @@
//! Size in bytes
//!
//! Implements fmt::Display to show the size in a human-readable unit (B, KB, MB, GB, TB).
//!
//! For bytes, the unit isn't printed.
//!
//! Implements std::ops::Add.
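//!
//! # Examples
//! A small sketch of the expected formatting (values taken from the unit tests below; `no_run` as elsewhere in these docs):
//! ```no_run
//! let s = Size::new(1024) + Size::new(676); // 1700 bytes in total
//! // 1700 / 1024 ≈ 1.66, rounded to one decimal
//! assert_eq!(format!("{}", s), "1.7KB");
//! ```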
use std::fmt;
#[derive(PartialEq, PartialOrd, Eq, Ord, Copy, Clone)]
pub struct Size(u64);
//#[derive(Debug)]
impl Size {
pub fn new(bytes: u64) -> Self {
Self(bytes)
}
}
impl fmt::Display for Size {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let length: u32 = self.0.checked_ilog10().unwrap_or(0); //number of digits -1
// match over number of 3 digits groups, 1000 ~ 1024
// 1000^n and 1024^n have the same number of digits if n<98
// here n<=4 (TB)
//
// if size in KB or above we want at most 1 decimal
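// worked example (illustrative): 2_411_724 bytes -> ilog10 = 6, 6 / 3 = 2 -> MB branch,
// 2_411_724 / 1_048_576 ≈ 2.2999 -> one decimal -> "2.3MB" (matches the display() test below)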
match length / 3 {
0 => write!(f, "{}", self.0 as f32), // we assume no unit printed means Bytes.
1 => write!(
f,
"{}KB",
(((self.0 as f32) / 1024.0) * 10.0_f32).round().trunc() / 10.0
),
2 => write!(
f,
"{}MB",
(((self.0 as f32) / 1048576.0) * 10.0_f32).round().trunc() / 10.0
),
3 => write!(
f,
"{}GB",
(((self.0 as f32) / 1073741824.0) * 10.0_f32)
.round()
.trunc()
/ 10.0
),
4 => write!(
f,
"{}TB",
(((self.0 as f32) / 1099511627776.0) * 10.0_f32)
.round()
.trunc()
/ 10.0
),
_ => panic!(), // unlikely to have petabytes of files (and above) on consumer-grade hardware
}
}
}
impl std::ops::Add for Size {
type Output = Self;
fn add(self, other: Self) -> Self::Output {
Self(self.0 + other.0)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn create() {
assert_eq!(Size::new(1).0, 1);
}
#[test]
fn add() {
let s1 = Size::new(60);
let s2 = Size::new(40);
assert_eq!((s1 + s2).0, 100);
}
#[test]
fn display() {
assert_eq!(10u32.checked_ilog10().unwrap_or(0) + 1, 2);
assert_eq!(format!("{}", Size::new(1024)), "1KB");
// 1700/1024 = 1.66015625
assert_eq!(format!("{}", Size::new(1700)), "1.7KB");
// 2411724/(1024^2) = 2.299999237060547
assert_eq!(format!("{}", Size::new(2411724)), "2.3MB");
}
}