initial commit

This commit is contained in:
Sam Hadow 2025-04-05 19:31:07 +02:00
commit a5a3133ef0
8 changed files with 1148 additions and 0 deletions

18
.gitignore vendored Normal file
View File

@ -0,0 +1,18 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb
# Swap Files
.*.kate-swp
.swp.*

11
COPYRIGHT Normal file
View File

@ -0,0 +1,11 @@
Copyright 2025 Sam Hadow (hadow.fr)
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
4. Redistributions of any form whatsoever must retain the following acknowledgment: 'This product includes software developed by "Sam Hadow" (http://hadow.fr/).'
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

11
Cargo.toml Normal file
View File

@ -0,0 +1,11 @@
[package]
name = "du"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
clap = { version = "4.4.6", features = ["derive"] }
rayon = "1.8.0"
sha2 = "0.10.8"

326
src/duplicates.rs Normal file
View File

@ -0,0 +1,326 @@
//! find duplicate files
//!
//! Find duplicate files in a FileTree comparing their hash.
use crate::FileTree;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
// hash
use sha2::{Digest, Sha256};
use std::io::{BufReader, Read};
// parallelism
use rayon::prelude::*;
use std::sync::{Arc, Mutex};
impl FileTree {
    /// Find duplicates in a directory (including sub-directories).
    ///
    /// If `path` exists in the tree, every file below it is hashed with sha256;
    /// files sharing a hash are reported as duplicates.
    ///
    /// Returns a `Vec` containing a `Vec<PathBuf>` for each group of duplicates.
    ///
    /// If `path` doesn't exist in the tree, or if no duplicates are found, returns `None`.
    ///
    /// # Examples
    /// Assuming a directory structure like this (files in lowercase, directories in
    /// uppercase, same letter meaning identical content), `dups` will contain
    /// `Some(...)` with the group a, A/a, B/a:
    ///
    /// ./{A/a,B/a,a,C/c}
    ///
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let dups = f.duplicates(&p);
    /// ```
    pub fn duplicates(&self, path: &Path) -> Option<Vec<Vec<PathBuf>>> {
        match self.get_children(path) {
            Some(value) => {
                // every path below `path` (directories + files)
                let children: HashSet<PathBuf> = value.into_iter().collect();
                // every file of the whole tree
                let files: HashSet<PathBuf> = self.files().into_iter().collect();
                // intersection = every file below `path`
                let intersection: Vec<_> = children.intersection(&files).cloned().collect();
                // Arc<Mutex<_>> so the HashMap can be safely updated from rayon workers
                let hashes: Arc<Mutex<HashMap<[u8; 32], Vec<PathBuf>>>> =
                    Arc::new(Mutex::new(HashMap::new()));
                // hash files in parallel
                intersection.par_iter().for_each(|item| {
                    if let Ok(file) = fs::File::open(item) {
                        let mut sha256 = Sha256::new();
                        // sizable buffer
                        let mut buffer = [0; 8192];
                        let mut reader = BufReader::new(file);
                        // https://rust-lang-nursery.github.io/rust-cookbook/cryptography/hashing.html
                        loop {
                            match reader.read(&mut buffer) {
                                Ok(0) => break, // EOF
                                Ok(count) => sha256.update(&buffer[..count]),
                                // unreadable mid-file: skip this file instead of
                                // panicking the whole worker pool
                                Err(_) => return,
                            }
                        }
                        let hash = sha256.finalize();
                        // lock only for the map insertion
                        let mut locked_hashes = hashes.lock().unwrap();
                        locked_hashes
                            .entry(hash.into())
                            .or_default()
                            .push(item.clone());
                    }
                });
                // extract the HashMap back out of the Arc<Mutex<_>>
                // (all workers are done, so try_unwrap cannot fail here)
                let hashes = Arc::try_unwrap(hashes).ok().unwrap().into_inner().unwrap();
                let dups = hashes
                    .values()
                    .filter(|a| a.len() > 1) // more than 1 path for a hash means duplicates
                    .cloned()
                    .collect::<Vec<Vec<PathBuf>>>();
                if dups.is_empty() {
                    None
                } else {
                    Some(dups)
                }
            }
            _ => None,
        }
    }
    /// Collision shouldn't happen with sha256, but this verifies that every group
    /// of "duplicates" really has identical byte content.
    ///
    /// Returns `true` if a collision (same hash, different bytes) is found.
    /// # Examples
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let dups = f.duplicates(&p);
    /// let check = FileTree::collision(dups);
    /// ```
    pub fn collision(dups: Option<Vec<Vec<PathBuf>>>) -> bool {
        match dups {
            Some(value) => {
                for mut subgroup in value {
                    // compare every file in the group against one reference file
                    let path1 = subgroup.pop().unwrap();
                    while let Some(path2) = subgroup.pop() {
                        if !Self::raw_compare(&path1, &path2) {
                            return true; // file contents differ, it's a collision
                        }
                    }
                }
                false
            }
            None => false, // no duplicates, so no collision
        }
    } // no unittest for collision as finding a sha256 collision is highly unlikely, and there is no known collision yet
    /// Check whether 2 files have exactly the same byte content, return a boolean.
    ///
    /// Unreadable bytes mid-file are treated as "different".
    ///
    /// # Panics
    /// Panics if either file cannot be opened.
    fn raw_compare(path1: &Path, path2: &Path) -> bool {
        let file1 = fs::File::open(path1)
            .unwrap_or_else(|_| panic!("couldn't read file {}", path1.display()));
        let file2 = fs::File::open(path2)
            .unwrap_or_else(|_| panic!("couldn't read file {}", path2.display()));
        // compare byte by byte through BufReader: unlike comparing fixed-size
        // chunks, short reads cannot desynchronize the two streams and report
        // identical files as different
        let mut bytes1 = BufReader::new(file1).bytes();
        let mut bytes2 = BufReader::new(file2).bytes();
        loop {
            match (bytes1.next(), bytes2.next()) {
                // both streams ended together: identical
                (None, None) => return true,
                (Some(Ok(b1)), Some(Ok(b2))) => {
                    if b1 != b2 {
                        return false;
                    }
                }
                // different lengths, or a read error on either side
                _ => return false,
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::prelude::*;
    #[test]
    fn duplicate_test() {
        // build the fixture with std::fs (portable, no external mkdir/cp processes)
        fs::create_dir_all("/tmp/test_du_dups1/dir1/dir11").unwrap();
        fs::create_dir_all("/tmp/test_du_dups1/dir1/dir12").unwrap();
        // the same content in 4 places
        let mut file = File::create("/tmp/test_du_dups1/file1").unwrap();
        file.write_all(b"Hello, world!").unwrap();
        fs::copy("/tmp/test_du_dups1/file1", "/tmp/test_du_dups1/dir1/file1").unwrap();
        fs::copy("/tmp/test_du_dups1/file1", "/tmp/test_du_dups1/file2").unwrap();
        fs::copy(
            "/tmp/test_du_dups1/file1",
            "/tmp/test_du_dups1/dir1/dir11/file1",
        )
        .unwrap();
        // a second, smaller group of duplicates
        let mut file1 = File::create("/tmp/test_du_dups1/file10").unwrap();
        file1.write_all(b"lorem ipsum dolor sit amet").unwrap();
        fs::copy("/tmp/test_du_dups1/file10", "/tmp/test_du_dups1/dir1/file10").unwrap();
        // and a unique file
        let mut file2 = File::create("/tmp/test_du_dups1/file100").unwrap();
        file2.write_all(b"consectetur adipiscing elit").unwrap();
        let p = Path::new("/tmp/test_du_dups1");
        let f = FileTree::new(&p, &false).unwrap();
        let mut res = f.duplicates(&p).unwrap(); // unwrap asserts it's not none
        // order groups by size so res[0] is the pair and res[1] the quadruple
        res.sort_by_key(|b| b.len());
        let expected_vec: Vec<Vec<PathBuf>> = vec![
            vec![
                Path::new("/tmp/test_du_dups1/file10").to_path_buf(),
                Path::new("/tmp/test_du_dups1/dir1/file10").to_path_buf(),
            ],
            vec![
                Path::new("/tmp/test_du_dups1/file1").to_path_buf(),
                Path::new("/tmp/test_du_dups1/dir1/dir11/file1").to_path_buf(),
                Path::new("/tmp/test_du_dups1/dir1/file1").to_path_buf(),
                Path::new("/tmp/test_du_dups1/file2").to_path_buf(),
            ],
        ];
        // duplicates() builds its result from a HashMap, so items inside a group
        // are unordered: compare each group as a set, then compare lengths
        // separately (a set would collapse repeated elements)
        for (expected_group, actual_group) in expected_vec.iter().zip(res.iter()) {
            let expected = expected_group.iter().cloned().collect::<HashSet<_>>();
            let actual = actual_group.iter().cloned().collect::<HashSet<_>>();
            assert_eq!(expected, actual);
            assert_eq!(expected_group.len(), actual_group.len());
        }
        // we check if we have same number of duplicate groups
        assert_eq!(expected_vec.len(), res.len());
        // clean (errors ignored, the fixture may already be gone)
        let _ = fs::remove_dir_all("/tmp/test_du_dups1");
    }
    #[test]
    fn duplicate_test_empty() {
        // two files with different content: no duplicates expected
        fs::create_dir_all("/tmp/test_du_dups2/dir1").unwrap();
        let mut file = File::create("/tmp/test_du_dups2/file10").unwrap();
        file.write_all(b"Integer faucibus sapien vitae aliquet dapibus")
            .unwrap();
        let mut file1 = File::create("/tmp/test_du_dups2/dir1/file10").unwrap();
        file1.write_all(b"Pellentesque at pharetra enim").unwrap();
        let p = Path::new("/tmp/test_du_dups2");
        let f = FileTree::new(&p, &false).unwrap();
        let res = f.duplicates(&p);
        // should not find duplicates
        assert!(res.is_none());
        // clean
        let _ = fs::remove_dir_all("/tmp/test_du_dups2");
    }
    #[test]
    fn file_compare() {
        // two identical files and one different file
        fs::create_dir_all("/tmp/test_du_compare").unwrap();
        let mut file = File::create("/tmp/test_du_compare/file1").unwrap();
        file.write_all(b"Cras eleifend nisi nibh").unwrap();
        let mut file = File::create("/tmp/test_du_compare/file2").unwrap();
        file.write_all(b"Cras eleifend nisi nibh").unwrap();
        let mut file = File::create("/tmp/test_du_compare/file3").unwrap();
        file.write_all(b"a blandit elit mattis et").unwrap();
        let p1 = Path::new("/tmp/test_du_compare/file1");
        let p2 = Path::new("/tmp/test_du_compare/file2");
        let p3 = Path::new("/tmp/test_du_compare/file3");
        assert!(FileTree::raw_compare(p1, p2));
        assert!(!FileTree::raw_compare(p1, p3));
        // clean
        let _ = fs::remove_dir_all("/tmp/test_du_compare");
    }
}

453
src/file_tree.rs Normal file
View File

@ -0,0 +1,453 @@
//! tree structure representation
//!
//! Implement a tree structure to represent data on a disk (files and directories).
//!
//! Include methods to
//! + construct this tree
//! + get a directory's children
//! + get a file or directory size
//! + get files in tree.
use crate::size::Size;
use std::collections::HashMap;
use std::fs;
use std::io::{Error, ErrorKind};
use std::path::{Path, PathBuf};
/// FileTree structure
///
/// Maps every path discovered under `root` to its node data.
pub struct FileTree {
    /// tree root (base path)
    root: PathBuf,
    /// HashMap containing every path in the tree (every file and directory, sub-directories included) and their associated data (size and children).
    map: HashMap<PathBuf, EntryNode>,
}
/// A single node of the tree: either a regular file or a directory.
enum EntryNode {
    /// Regular file with its size in bytes.
    File { size: Size },
    /// Directory: direct children plus cumulative size
    /// (created with size 0, filled in later by `set_size`).
    Path { children: Vec<PathBuf>, size: Size },
}
impl FileTree {
    /// Create a new filetree from given path.
    ///
    /// Will return an error if path doesn't exist or if user doesn't have read permission.
    ///
    /// If user is missing read permission in a subdir, will ignore this subdir and keep building the tree.
    ///
    /// `access_denied = &true` prints the ignored paths, `&false` hides them.
    ///
    /// # Examples
    ///
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// ```
    pub fn new(root: &Path, access_denied: &bool) -> std::io::Result<Self> {
        if root.exists() {
            if let Ok(meta) = fs::metadata(root) {
                let r = root.to_path_buf();
                if meta.is_dir() {
                    let entries = fs::read_dir(root)?;
                    let mut paths: Vec<PathBuf> = entries
                        .filter_map(|entry| entry.ok().map(|e| e.path()))
                        .collect();
                    let e = EntryNode::Path {
                        children: paths.clone(),
                        size: Size::new(0),
                    };
                    let mut map_entry: Vec<(PathBuf, EntryNode)> = vec![(r.clone(), e)];
                    // iterative depth-first walk over the pending paths
                    while let Some(current_path) = paths.pop() {
                        // symlinks are skipped entirely for now (avoids cycles and
                        // double counting); needs a better strategy eventually
                        if !(current_path.is_symlink()) {
                            if let Ok(meta) = fs::metadata(&current_path) {
                                if meta.is_dir() {
                                    if let Ok(entries) = fs::read_dir(&current_path) {
                                        let new_paths: Vec<PathBuf> = entries
                                            .filter_map(|entry| entry.ok().map(|e| e.path()))
                                            .collect();
                                        let e = EntryNode::Path {
                                            children: new_paths.clone(),
                                            size: Size::new(0),
                                        };
                                        map_entry.push((current_path.clone(), e));
                                        paths.extend(new_paths);
                                    } else if *access_denied {
                                        println!("{} children not processed, might be missing read permission",current_path.display());
                                    }
                                } else {
                                    // regular file: record its size directly
                                    let e = EntryNode::File {
                                        size: Size::new(meta.len()),
                                    };
                                    map_entry.push((current_path, e));
                                }
                            } else if *access_denied {
                                println!(
                                    "{} not processed, might be missing read permission",
                                    current_path.display()
                                );
                            }
                        }
                    }
                    let hashmap: HashMap<PathBuf, EntryNode> = map_entry.into_iter().collect();
                    let mut tree = Self {
                        root: r,
                        map: hashmap,
                    };
                    // directory sizes are only computable once the whole tree is collected
                    tree.set_size();
                    Ok(tree)
                } else {
                    // root is a single file: the tree is just that one node
                    let e = EntryNode::File {
                        size: Size::new(meta.len()),
                    };
                    Ok(Self {
                        root: r.clone(),
                        map: HashMap::from([(r, e)]),
                    })
                }
            } else {
                Err(Error::new(
                    ErrorKind::Other,
                    format!("Error processing path {}", root.display()),
                ))
            }
        } else {
            Err(Error::new(ErrorKind::Other, "path not found"))
        }
    }
    /// Compute and store the size of every `EntryNode::Path` in the tree.
    fn set_size(&mut self) {
        let mut paths: Vec<_> = self.map.keys().cloned().collect();
        // a child's path is necessarily longer than its parent's path, so
        // processing longest paths first guarantees every child is sized
        // before its parent
        paths.sort_by_key(|b| std::cmp::Reverse(b.as_os_str().len()));
        for path in paths {
            let mut s = Size::new(0);
            if let Some(EntryNode::Path { children, .. }) = self.map.get(path.as_path()) {
                for child in children {
                    // children were already processed so their size is final
                    if let Some(EntryNode::Path { size, .. }) | Some(EntryNode::File { size }) =
                        self.map.get(child.as_path())
                    {
                        s = s + *size;
                    }
                }
            }
            if let Some(EntryNode::Path { ref mut size, .. }) = self.map.get_mut(path.as_path()) {
                *size = s;
            }
        }
    }
    /// return FileTree root
    /// # Examples
    ///
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let root = f.get_root();
    /// assert_eq!(root, p);
    /// ```
    pub fn get_root(&self) -> &Path {
        self.root.as_path()
    }
    /// return a Vec with every path in the subtree found from given path (recursively)
    ///
    /// return None if given path doesn't exist in the FileTree
    /// # Examples
    /// assuming a directories structure like this, children will contain a
    /// (plus any file or directory inside a, recursively).
    ///
    /// ./{A/a,B}
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let children = f.get_children(&Path::new("./A"));
    /// ```
    pub fn get_children(&self, path: &Path) -> Option<Vec<PathBuf>> {
        if let Some(vec_paths) = self.get_direct_children(path) {
            let mut paths = vec_paths;
            // breadth-first accumulation: temp_paths is the frontier still to expand
            let mut temp_paths = paths.clone();
            while let Some(current_path) = temp_paths.pop() {
                if let Some(vec_paths) = self.get_direct_children(current_path.as_path()) {
                    paths.extend(vec_paths.clone());
                    temp_paths.extend(vec_paths);
                }
            }
            Some(paths)
        } else {
            None
        }
    }
    /// return direct children of a given path (non recursively, depth=1)
    ///
    /// None if path is a file, or doesn't exist.
    ///
    /// A `Vec<PathBuf>` if it's a directory.
    /// # Examples
    /// assuming a directories structure like this, children will contain a and b, but not a/c.
    ///
    /// ./{A/{a/c,b},B}
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let children = f.get_direct_children(&Path::new("./A"));
    /// ```
    pub fn get_direct_children(&self, path: &Path) -> Option<Vec<PathBuf>> {
        match self.map.get(path) {
            Some(EntryNode::Path { children, .. }) => Some(children.clone()),
            Some(EntryNode::File { .. }) => None,
            _ => None,
        }
    }
    /// If given path exists in the FileTree, return its size. None if it doesn't. Size can eventually be 0, an empty directory for example.
    /// # Examples
    /// size of entire tree (size from root):
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let size = f.get_size(&p);
    /// ```
    pub fn get_size(&self, path: &Path) -> Option<Size> {
        match self.map.get(path) {
            Some(EntryNode::File { size }) | Some(EntryNode::Path { size, .. }) => Some(*size),
            _ => None,
        }
    }
    /// return a Vec with all the files in FileTree (and only the files), can be empty.
    ///
    /// Files are identified by their stored `EntryNode::File` entry rather than
    /// by re-querying the filesystem, so the result matches the tree as built
    /// even if the disk changed since.
    /// # Examples
    /// assuming a directories structure like this, lowercase for files, uppercase for directories. files will only contain a, b and c.
    ///
    /// ./{A/a,B/b,c}
    ///
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// let files = f.files();
    /// ```
    // an iterator instead of a Vec<PathBuf> would be lazy,
    // however, we use files to find an intersection with children in a subtree to find duplicates, an iterator wouldn't be useful for this
    pub fn files(&self) -> Vec<PathBuf> {
        let mut files = self
            .map
            .iter()
            .filter(|(_key, value)| matches!(value, EntryNode::File { .. }))
            .map(|(key, _value)| key.clone())
            .collect::<Vec<PathBuf>>();
        // the map is unordered, so sort by file name for a deterministic result
        files.sort_by(|a, b| a.file_name().cmp(&b.file_name()));
        files
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::prelude::*;
    /// create temporary fixture /tmp/test_du{number} with dir1/ and file1 (std::fs, no external commands)
    fn set_temp(number: u8) {
        fs::create_dir_all(format!("/tmp/test_du{}/dir1", number)).unwrap();
        let mut file = File::create(format!("/tmp/test_du{}/file1", number)).unwrap();
        file.write_all(b"Hello, world!").unwrap();
    }
    /// remove the temporary fixture; errors (e.g. already removed) are ignored
    fn clear_temp(number: u8) {
        let _ = fs::remove_dir_all(format!("/tmp/test_du{}", number));
    }
    #[test]
    fn simple_filetree() {
        set_temp(1);
        let p = Path::new("/tmp/test_du1");
        let p_buf = p.to_path_buf();
        let f = FileTree::new(&p, &false).unwrap();
        let paths = [
            Path::new("/tmp/test_du1").to_path_buf(),
            Path::new("/tmp/test_du1/dir1").to_path_buf(),
            Path::new("/tmp/test_du1/file1").to_path_buf(),
        ];
        assert_eq!(f.root, p_buf);
        for path in paths {
            assert!(f.map.contains_key(&path));
        }
        // for root: read_dir order is platform dependent, so compare sorted
        let mut expected_children = vec![
            Path::new("/tmp/test_du1/dir1").to_path_buf(),
            Path::new("/tmp/test_du1/file1").to_path_buf(),
        ];
        expected_children.sort();
        // "Hello, world!" has 13 bytes, so file1 is 13B and the empty dir1 is 0B
        let expected_size = "13";
        match f.map.get(&p_buf).unwrap() {
            EntryNode::Path {
                children: actual_children,
                size,
            } => {
                let mut actual_children = actual_children.clone();
                actual_children.sort();
                assert_eq!(actual_children, expected_children);
                assert_eq!(format!("{}", *size), expected_size)
            }
            _ => panic!(), // shouldn't happen, this entry refers to a dir
        }
        // for dir1
        let expected_children: Vec<PathBuf> = vec![];
        match f
            .map
            .get(&Path::new("/tmp/test_du1/dir1").to_path_buf())
            .unwrap()
        {
            EntryNode::Path {
                children: actual_children,
                size,
            } => {
                assert_eq!(*actual_children, expected_children);
                assert_eq!(format!("{}", *size), "0"); // dir1 is empty
            }
            _ => panic!(), // shouldn't happen, this entry refers to a dir
        }
        // for file1
        let expected_size = "13";
        match f
            .map
            .get(&Path::new("/tmp/test_du1/file1").to_path_buf())
            .unwrap()
        {
            EntryNode::File { size } => assert_eq!(format!("{}", *size), expected_size),
            _ => panic!(), // this entry refers to a file
        }
        clear_temp(1);
    }
    #[test]
    fn simple_filetree_file() {
        set_temp(2);
        // a tree rooted directly at a file
        let p2 = Path::new("/tmp/test_du2/file1");
        let p_buf2 = p2.to_path_buf();
        let f2 = FileTree::new(&p2, &false).unwrap();
        assert_eq!(f2.root, p_buf2);
        assert!(f2.map.contains_key(&p_buf2));
        let expected_size = "13"; // "Hello, world!" has 13 bytes
        match f2.map.get(&p_buf2).unwrap() {
            EntryNode::File { size } => assert_eq!(format!("{}", *size), expected_size),
            _ => panic!(), // shouldn't happen, f2 refers to a file
        }
        // clean
        clear_temp(2);
    }
    #[test]
    fn filetree_fail() {
        // a non-existing path must produce an error
        let p = Path::new("/tmp/test_du/shouldntexist");
        let f = FileTree::new(&p, &false);
        assert!(f.is_err());
    }
    #[test]
    fn get_root_test() {
        set_temp(3);
        let p = Path::new("/tmp/test_du3");
        let f = FileTree::new(&p, &false).unwrap();
        assert_eq!(f.get_root(), p);
        clear_temp(3);
    }
    #[test]
    fn files_test() {
        set_temp(4);
        let mut file = File::create("/tmp/test_du4/dir1/file2").unwrap();
        file.write_all(b"Hello, world!").unwrap();
        let p = Path::new("/tmp/test_du4");
        let f = FileTree::new(&p, &false).unwrap();
        // files() sorts by file name, so this order is deterministic
        let expected = vec![
            Path::new("/tmp/test_du4/file1").to_path_buf(),
            Path::new("/tmp/test_du4/dir1/file2").to_path_buf(),
        ];
        assert_eq!(f.files(), expected);
        clear_temp(4);
    }
    #[test]
    fn children() {
        set_temp(5);
        let mut file = File::create("/tmp/test_du5/dir1/file2").unwrap();
        file.write_all(b"Hello, world!").unwrap();
        let p = Path::new("/tmp/test_du5");
        let f = FileTree::new(&p, &false).unwrap();
        // get_children order depends on read_dir, so compare sorted
        let mut expected = vec![
            Path::new("/tmp/test_du5/dir1").to_path_buf(),
            Path::new("/tmp/test_du5/file1").to_path_buf(),
            Path::new("/tmp/test_du5/dir1/file2").to_path_buf(),
        ];
        expected.sort();
        let mut actual = f.get_children(&p).unwrap();
        actual.sort();
        assert_eq!(actual, expected);
        let p = Path::new("/tmp/test_du5/dir1");
        let expected = vec![Path::new("/tmp/test_du5/dir1/file2").to_path_buf()];
        assert_eq!(f.get_children(&p), Some(expected));
        let p = Path::new("/tmp/test_du5/file1");
        assert_eq!(f.get_children(&p), None);
        let p = Path::new("/tmp/test_du5/shouldntexist");
        assert_eq!(f.get_children(&p), None);
        clear_temp(5);
    }
    #[test]
    fn size_test() {
        set_temp(6);
        let mut file = File::create("/tmp/test_du6/dir1/file2").unwrap();
        file.write_all(b"Hello, world!").unwrap();
        fs::create_dir_all("/tmp/test_du6/dir3").unwrap();
        let p = Path::new("/tmp/test_du6");
        let f = FileTree::new(&p, &false).unwrap();
        let expected_size = "26"; // file1 and file2, 13 bytes each
        assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
        let p = Path::new("/tmp/test_du6/file1");
        let expected_size = "13";
        assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
        let p = Path::new("/tmp/test_du6/dir3");
        let expected_size = "0"; // empty directory
        assert_eq!(format!("{}", f.get_size(&p).unwrap()), expected_size);
        let p = Path::new("/tmp/test_du6/shouldntexist");
        assert!(f.get_size(&p).is_none());
        clear_temp(6);
    }
}

134
src/main.rs Normal file
View File

@ -0,0 +1,134 @@
//! Basic disk usage analyzer
//!
//! Include a command to show disk usage and file tree, ordered by size (default) or name. It is possible to filter files by "end with".
//!
//! Include a command to find duplicate files.
//! # Examples
//! show ~ disk usage
//! ```text
//! $ cargo run -- usage ~
//! ```
//! show pdf files in ~/Downloads, ordered by name
//! ```text
//! $ cargo run -- usage ~/Downloads --sort name --filter ".pdf"
//! ```
//! same command but with short options
//! ```text
//! $ cargo run -- usage ~/Downloads -s name -f ".pdf"
//! ```
//! show duplicates in /tmp
//! ```text
//! $ cargo run -- duplicates /tmp
//! ```
//! show duplicates in /tmp and check for collision
//! ```text
//! $ cargo run -- duplicates /tmp --check
//! ```
//! same command but with short option
//! ```text
//! $ cargo run -- duplicates /tmp -c
//! ```
//! for both duplicates and usage, to show directories not processed because of missing read permission
//! ```text
//! $ cargo run -- duplicates ~ --access-denied
//! $ cargo run -- usage ~ --access-denied
//! ```
//! same with short option
//! ```text
//! $ cargo run -- duplicates ~ -a
//! $ cargo run -- usage ~ -a
//! ```
mod duplicates;
mod file_tree;
mod print_tree;
mod size;
use clap::{Parser, Subcommand};
use file_tree::FileTree;
use std::path::{Path, PathBuf};
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(propagate_version = true)]
// Top-level argument parser: a single required subcommand (see `Commands`).
// NOTE: field-level `///` doc comments would become clap help text, so plain
// `//` comments are used here.
struct Cli {
    #[command(subcommand)]
    command: Commands,
}
#[derive(Subcommand)]
// Subcommands of the CLI. NOTE: the `///` doc comments below are rendered by
// clap as --help text (user-facing strings), so they are left untouched.
enum Commands {
    /// Show the disk usage tree for the given path
    Usage {
        /// (default '.')
        path: Option<PathBuf>,
        /// (default "size")
        #[arg(long = "sort", short = 's')]
        // accepted values: "size" or "name" (dispatched in main)
        sort: Option<String>,
        /// (default None)
        #[arg(long = "filter", short = 'f')]
        // "ends with" filter applied to printed paths
        filter: Option<String>,
        /// (default false)
        #[arg(long = "access-denied", short = 'a')]
        // when set, print paths skipped because of missing read permission
        access: bool,
    },
    /// Find duplicates in given path
    Duplicates {
        /// (default '.')
        path: Option<PathBuf>,
        /// (default false)
        #[arg(long = "check", short = 'c')]
        // when set, byte-compare each duplicate group to rule out hash collisions
        check: bool,
        /// (default false)
        #[arg(long = "access-denied", short = 'a')]
        access: bool,
    },
}
/// Entry point: parse CLI arguments and dispatch to the requested subcommand.
fn main() -> std::io::Result<()> {
    let cli = Cli::parse();
    match &cli.command {
        Commands::Usage {
            path,
            sort,
            filter,
            access,
        } => {
            let path = path.as_deref().unwrap_or(Path::new("."));
            // default sort is by size; an unknown value is reported on stderr
            // instead of silently printing nothing (previous behavior)
            match sort.as_deref() {
                None | Some("size") => FileTree::new(path, access)?.show(0, filter.as_deref()),
                Some("name") => FileTree::new(path, access)?.show(1, filter.as_deref()),
                Some(other) => {
                    eprintln!("unknown sort order '{}', expected \"size\" or \"name\"", other)
                }
            }
        }
        Commands::Duplicates {
            path,
            check,
            access,
        } => {
            let path = path.as_deref().unwrap_or(Path::new("."));
            let duplicate_files = FileTree::new(path, access)?.duplicates(path);
            if let Some(ref groups) = duplicate_files {
                // print each group of duplicates, one path per line
                for (i, group) in groups.iter().enumerate() {
                    println!("DUPLICATES {}", i + 1);
                    for file in group {
                        println!("{}", file.display());
                    }
                    println!("\n")
                }
            } else {
                println!("No duplicate found in {}\n", path.display());
            }
            if *check {
                // byte-compare the groups to rule out sha256 collisions
                println!("collision found: {}", FileTree::collision(duplicate_files));
            }
        }
    }
    Ok(())
}

103
src/print_tree.rs Normal file
View File

@ -0,0 +1,103 @@
//! Print human readable file tree.
//!
//! Print nodes in a filetree (files and directories) with their sizes and tree structure.
//!
//! Include filter and sort arguments.
//!
//! Filter by "end with" (preserve the tree structure when printing).
//! Sort by name or by size.
use crate::file_tree::FileTree;
use crate::size::Size;
use std::path::Path;
impl FileTree {
    /// Print a FileTree in the console in a human readable format.
    ///
    /// sort = 0 is size order (descending)
    ///
    /// sort = 1 is lexicographical order
    ///
    /// Any other integer for sort won't fail but nodes won't be sorted and order might seem random.
    ///
    /// filter_suffix is for "end with" filter, will still preserve tree structure (will show parent directories if one descendant passes the filter).
    ///
    /// use filter_suffix = None to show everything
    /// # Examples
    ///
    /// ```ignore
    /// let p = Path::new(".");
    /// let f = FileTree::new(&p, &false);
    /// f.show(1, None); // show everything ordered by name
    /// f.show(0, Some(".pdf")); // show pdf files (preserving tree structure) ordered by size
    /// ```
    pub fn show(&self, sort: u8, filter_suffix: Option<&str>) {
        let root = self.get_root();
        self.print_entry(root, &self.get_size(root).unwrap(), 0);
        self.show_recursive(root, 1, sort, filter_suffix);
    }
    /// recursively visit a FileTree to print it
    ///
    /// do not use alone, wrapper is show()
    fn show_recursive(&self, path: &Path, indent: usize, sort: u8, filter_suffix: Option<&str>) {
        if let Some(mut children) = self.get_direct_children(path) {
            match sort {
                // sort children by size in descending order
                0 => children.sort_by_key(|b| std::cmp::Reverse(self.get_size(b))),
                1 => children.sort_by(|a, b| a.file_name().cmp(&b.file_name())),
                _ => (),
            }
            for child in children {
                if let Some(entry) = self.get_size(&child) {
                    if let Some(suffix) = filter_suffix {
                        // show this node if any descendant (or itself) passes the filter.
                        // NOTE: this re-walks descendants at every level; memoizing
                        // which parents pass would be more efficient.
                        let mut descendant = self.get_children(&child).unwrap_or_default();
                        descendant.push(child.clone());
                        // to_string_lossy avoids panicking on non-UTF-8 path names
                        // (into_string().unwrap() would)
                        let found = descendant
                            .iter()
                            .any(|item| item.to_string_lossy().ends_with(suffix));
                        if found {
                            self.print_entry(&child, &entry, indent);
                        }
                    } else {
                        self.print_entry(&child, &entry, indent);
                    }
                    // recurse into directories (files have no direct children)
                    if self.get_direct_children(&child).is_some() {
                        self.show_recursive(&child, indent + 1, sort, filter_suffix);
                    }
                }
            }
        }
    }
    /// print a single entry: indentation, size, then path
    ///
    /// do not use alone, is part of show_recursive()
    fn print_entry(&self, path: &Path, size: &Size, indent: usize) {
        let indentation = "\t".repeat(indent);
        println!("{}{}\t{}", indentation, size, path.display());
    }
}
// no unittest here for now as show is mostly "visual"

92
src/size.rs Normal file
View File

@ -0,0 +1,92 @@
//! size in bytes
//!
//! implement fmt::Display to display size in a human readable unit (B, KB, MB, GB, TB).
//!
//! For Bytes, unit isn't printed.
//!
//! implement std::ops::Add
use std::fmt;
/// Size in bytes.
///
/// Displays in a human readable unit (B, KB, MB, GB, TB) with at most one
/// decimal; plain bytes are printed without a unit.
#[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Copy, Clone)]
pub struct Size(u64);
impl Size {
    /// Wrap a raw byte count.
    pub fn new(bytes: u64) -> Self {
        Self(bytes)
    }
}
impl fmt::Display for Size {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // round `x` to at most one decimal place
        fn one_decimal(x: f64) -> f64 {
            (x * 10.0).round() / 10.0
        }
        let length: u32 = self.0.checked_ilog10().unwrap_or(0); // number of digits - 1
        // f64 (not f32) keeps enough precision for the displayed decimal even in
        // the GB/TB range (f32 only has a 24-bit mantissa)
        let bytes = self.0 as f64;
        // match over the number of 3-digit groups, 1000 ~ 1024:
        // 1000^n and 1024^n have the same number of digits if n < 98,
        // and here n <= 4 (TB)
        match length / 3 {
            0 => write!(f, "{}", self.0), // no unit printed means Bytes
            1 => write!(f, "{}KB", one_decimal(bytes / 1024.0)),
            2 => write!(f, "{}MB", one_decimal(bytes / 1_048_576.0)),
            3 => write!(f, "{}GB", one_decimal(bytes / 1_073_741_824.0)),
            4 => write!(f, "{}TB", one_decimal(bytes / 1_099_511_627_776.0)),
            // PB and above are unexpected on consumer grade hardware:
            // keep the original fail-fast behavior but say why
            _ => panic!("size too large to display: {} bytes", self.0),
        }
    }
}
impl std::ops::Add for Size {
    type Output = Self;
    /// Sum of two byte counts.
    fn add(self, other: Self) -> Self::Output {
        Self(self.0 + other.0)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn create() {
        // the wrapped byte count is stored untouched
        let s = Size::new(1);
        assert_eq!(s.0, 1);
    }
    #[test]
    fn add() {
        // addition sums the underlying byte counts
        let total = Size::new(60) + Size::new(40);
        assert_eq!(total.0, 100);
    }
    #[test]
    fn display() {
        // sanity check of the digit-count computation used by Display
        assert_eq!(10u32.checked_ilog10().unwrap_or(0) + 1, 2);
        // exact kilobyte
        assert_eq!(Size::new(1024).to_string(), "1KB");
        // 1700 / 1024 = 1.66015625, shown with one decimal
        assert_eq!(Size::new(1700).to_string(), "1.7KB");
        // 2411724 / 1024^2 = 2.299999237060547, shown with one decimal
        assert_eq!(Size::new(2411724).to_string(), "2.3MB");
    }
}
}