diff --git a/config-daemon.ac b/config-daemon.ac index 5ddc740600..86306effe1 100644 --- a/config-daemon.ac +++ b/config-daemon.ac @@ -94,17 +94,6 @@ if test "x$guix_build_daemon" = "xyes"; then AC_CHECK_FUNCS([lutimes lchown posix_fallocate sched_setaffinity \ statvfs nanosleep strsignal statx]) - dnl Check whether the store optimiser can optimise symlinks. - AC_MSG_CHECKING([whether it is possible to create a link to a symlink]) - ln -s bla tmp_link - if ln tmp_link tmp_link2 2> /dev/null; then - AC_MSG_RESULT(yes) - AC_DEFINE(CAN_LINK_SYMLINK, 1, [Whether link() works on symlinks.]) - else - AC_MSG_RESULT(no) - fi - rm -f tmp_link tmp_link2 - dnl Check for . AC_LANG_PUSH(C++) AC_CHECK_HEADERS([locale]) diff --git a/guix/store/deduplication.scm b/guix/store/deduplication.scm index cd9660174c..370df4a74c 100644 --- a/guix/store/deduplication.scm +++ b/guix/store/deduplication.scm @@ -1,6 +1,6 @@ ;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2017 Caleb Ristvedt -;;; Copyright © 2018, 2019, 2020 Ludovic Courtès +;;; Copyright © 2018-2021 Ludovic Courtès ;;; ;;; This file is part of GNU Guix. ;;; @@ -22,12 +22,13 @@ (define-module (guix store deduplication) #:use-module (gcrypt hash) - #:use-module (guix build utils) + #:use-module ((guix build utils) #:hide (dump-port)) #:use-module (guix build syscalls) #:use-module (guix base32) #:use-module (srfi srfi-11) #:use-module (srfi srfi-34) #:use-module (srfi srfi-35) + #:use-module (rnrs bytevectors) #:use-module (rnrs io ports) #:use-module (ice-9 ftw) #:use-module (ice-9 match) @@ -37,6 +38,31 @@ dump-file/deduplicate copy-file/deduplicate)) +;; TODO: Remove once 'dump-port' in (guix build utils) has an optional 'len' +;; parameter. +(define* (dump-port in out + #:optional len + #:key (buffer-size 16384)) + "Read LEN bytes from IN (or as much as possible if LEN is #f) and write it +to OUT, using chunks of BUFFER-SIZE bytes." + (define buffer + (make-bytevector buffer-size)) + + (let loop ((total 0) + (bytes (get-bytevector-n! in buffer 0 + (if len + (min len buffer-size) + buffer-size)))) + (or (eof-object? bytes) + (and len (= total len)) + (let ((total (+ total bytes))) + (put-bytevector out buffer 0 bytes) + (loop total + (get-bytevector-n! in buffer 0 + (if len + (min (- len total) buffer-size) + buffer-size))))))) + (define (nar-sha256 file) "Gives the sha256 hash of a file and the size of the file in nar form." (let-values (((port get-hash) (open-sha256-port))) @@ -127,11 +153,27 @@ Note: TARGET, TO-REPLACE, and SWAP-DIRECTORY must be on the same file system." (unless (= EMLINK (system-error-errno args)) (apply throw args))))))) +(define %deduplication-minimum-size + ;; Size below which files are not deduplicated. This avoids adding too many + ;; entries to '.links', which would slow down 'removeUnusedLinks' while + ;; saving little space. Keep in sync with optimize-store.cc. + 8192) + (define* (deduplicate path hash #:key (store (%store-directory))) "Check if a store item with sha256 hash HASH already exists. If so, replace PATH with a hardlink to the already-existing one. If not, register PATH so that future duplicates can hardlink to it. PATH is assumed to be under STORE." + ;; Lightweight promises. + (define-syntax-rule (delay exp) + (let ((value #f)) + (lambda () + (unless value + (set! value exp)) + value))) + (define-syntax-rule (force promise) + (promise)) + (define links-directory (string-append store "/.links")) @@ -144,13 +186,18 @@ under STORE." ((file . properties) (unless (member file '("." "..")) (let* ((file (string-append path "/" file)) + (st (delay (lstat file))) (type (match (assoc-ref properties 'type) ((or 'unknown #f) - (stat:type (lstat file))) + (stat:type (force st))) (type type)))) - (loop file type - (and (not (eq? 'directory type)) - (nar-sha256 file))))))) + (when (or (eq? 'directory type) + (and (eq? 'regular type) + (>= (stat:size (force st)) + %deduplication-minimum-size))) + (loop file type + (and (not (eq? 'directory type)) + (nar-sha256 file)))))))) (scandir* path)) (let ((link-file (string-append links-directory "/" (bytevector->nix-base32-string hash)))) @@ -222,9 +269,9 @@ OUTPUT as it goes." This procedure is suitable as a #:dump-file argument to 'restore-file'. When used that way, it deduplicates files on the fly as they are restored, thereby -removing the need to a deduplication pass that would re-read all the files +removing the need for a deduplication pass that would re-read all the files down the road." - (define hash + (define (dump-and-compute-hash) (call-with-output-file file (lambda (output) (let-values (((hash-port get-hash) @@ -236,7 +283,11 @@ down the road." (close-port hash-port) (get-hash))))) - (deduplicate file hash #:store store)) + (if (>= size %deduplication-minimum-size) + (deduplicate file (dump-and-compute-hash) #:store store) + (call-with-output-file file + (lambda (output) + (dump-port input output size))))) (define* (copy-file/deduplicate source target #:key (store (%store-directory))) diff --git a/nix/libstore/gc.cc b/nix/libstore/gc.cc index e1d0765154..16519116e4 100644 --- a/nix/libstore/gc.cc +++ b/nix/libstore/gc.cc @@ -606,7 +606,9 @@ void LocalStore::removeUnusedLinks(const GCState & state) throw SysError(format("statting `%1%'") % path); #endif - if (st.st_nlink != 1) { + /* Drop links for files smaller than 'deduplicationMinSize', even if + they have more than one hard link. */ + if (st.st_nlink != 1 && st.st_size >= deduplicationMinSize) { actualSize += st.st_size; unsharedSize += (st.st_nlink - 1) * st.st_size; continue; diff --git a/nix/libstore/local-store.hh b/nix/libstore/local-store.hh index 9ba37219da..20d3c3c893 100644 --- a/nix/libstore/local-store.hh +++ b/nix/libstore/local-store.hh @@ -292,4 +292,7 @@ void canonicaliseTimestampAndPermissions(const Path & path); MakeError(PathInUse, Error); +/* Size below which a file is not considered for deduplication. */ +extern const size_t deduplicationMinSize; + } diff --git a/nix/libstore/optimise-store.cc b/nix/libstore/optimise-store.cc index eb303ab4c3..9fd6f3cb35 100644 --- a/nix/libstore/optimise-store.cc +++ b/nix/libstore/optimise-store.cc @@ -15,6 +15,9 @@ namespace nix { +/* Any file smaller than this is not considered for deduplication. + Keep in sync with (guix store deduplication). */ +const size_t deduplicationMinSize = 8192; static void makeWritable(const Path & path) { @@ -105,12 +108,12 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, InodeHa return; } - /* We can hard link regular files and maybe symlinks. */ - if (!S_ISREG(st.st_mode) -#if CAN_LINK_SYMLINK - && !S_ISLNK(st.st_mode) -#endif - ) return; + /* We can hard link regular files (and maybe symlinks), but do that only + for files larger than some threshold. This avoids adding too many + entries to '.links', which would slow down 'removeUnusedLinks' while + saving little space. */ + if (!S_ISREG(st.st_mode) || ((size_t) st.st_size) < deduplicationMinSize) + return; /* Sometimes SNAFUs can cause files in the store to be modified, in particular when running programs as root under diff --git a/tests/derivations.scm b/tests/derivations.scm index cd165d1be6..0775719ea3 100644 --- a/tests/derivations.scm +++ b/tests/derivations.scm @@ -1,5 +1,5 @@ ;;; GNU Guix --- Functional package management for GNU -;;; Copyright © 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès +;;; Copyright © 2012-2021 Ludovic Courtès ;;; ;;; This file is part of GNU Guix. ;;; @@ -170,11 +170,15 @@ #f)))) (test-assert "identical files are deduplicated" - (let* ((build1 (add-text-to-store %store "one.sh" - "echo hello, world > \"$out\"\n" + ;; Note: DATA must be longer than %DEDUPLICATION-MINIMUM-SIZE. + (let* ((data (make-string 9000 #\a)) + (build1 (add-text-to-store %store "one.sh" + (string-append "echo -n " data + " > \"$out\"\n") '())) (build2 (add-text-to-store %store "two.sh" - "# Hey!\necho hello, world > \"$out\"\n" + (string-append "# Hey!\necho -n " + data " > \"$out\"\n") '())) (drv1 (derivation %store "foo" %bash `(,build1) @@ -187,7 +191,7 @@ (file2 (derivation->output-path drv2))) (and (valid-path? %store file1) (valid-path? %store file2) (string=? (call-with-input-file file1 get-string-all) - "hello, world\n") + data) (= (stat:ino (lstat file1)) (stat:ino (lstat file2)))))))) diff --git a/tests/nar.scm b/tests/nar.scm index ba4881caaa..98752f2088 100644 --- a/tests/nar.scm +++ b/tests/nar.scm @@ -1,5 +1,5 @@ ;;; GNU Guix --- Functional package management for GNU -;;; Copyright © 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès +;;; Copyright © 2012-2021 Ludovic Courtès ;;; ;;; This file is part of GNU Guix. ;;; @@ -486,8 +486,9 @@ ;; their mtime and permissions were not reset. Ensure that this bug is ;; gone. (with-store store - (let* ((text1 (random-text)) - (text2 (random-text)) + ;; Note: TEXT1 and TEXT2 must be longer than %DEDUPLICATION-MINIMUM-SIZE. + (let* ((text1 (string-concatenate (make-list 200 (random-text)))) + (text2 (string-concatenate (make-list 200 (random-text)))) (tree `("tree" directory ("a" regular (data ,text1)) ("b" directory diff --git a/tests/store-deduplication.scm b/tests/store-deduplication.scm index b1c2d93bbd..2950fbc1a3 100644 --- a/tests/store-deduplication.scm +++ b/tests/store-deduplication.scm @@ -1,5 +1,5 @@ ;;; GNU Guix --- Functional package management for GNU -;;; Copyright © 2018, 2020 Ludovic Courtès +;;; Copyright © 2018, 2020-2021 Ludovic Courtès ;;; ;;; This file is part of GNU Guix. ;;; @@ -30,13 +30,40 @@ (test-begin "store-deduplication") +(test-equal "deduplicate, below %deduplication-minimum-size" + (list #t (make-list 5 1)) + + (call-with-temporary-directory + (lambda (store) + ;; Note: DATA must be longer than %DEDUPLICATION-MINIMUM-SIZE. + (let ((data "Hello, world!") + (identical (map (lambda (n) + (string-append store "/" (number->string n) + "/a/b/c")) + (iota 5)))) + (for-each (lambda (file) + (mkdir-p (dirname file)) + (call-with-output-file file + (lambda (port) + (put-bytevector port (string->utf8 data))))) + identical) + + (deduplicate store (nar-sha256 store) #:store store) + + ;; (system (string-append "ls -lRia " store)) + (list (= (length (delete-duplicates + (map (compose stat:ino stat) identical))) + (length identical)) + (map (compose stat:nlink stat) identical)))))) + (test-equal "deduplicate" (cons* #t #f ;inode comparisons 2 (make-list 5 6)) ;'nlink' values (call-with-temporary-directory (lambda (store) - (let ((data (string->utf8 "Hello, world!")) + ;; Note: DATA must be longer than %DEDUPLICATION-MINIMUM-SIZE. + (let ((data (string-concatenate (make-list 1000 "Hello, world!"))) (identical (map (lambda (n) (string-append store "/" (number->string n) "/a/b/c")) @@ -46,7 +73,7 @@ (mkdir-p (dirname file)) (call-with-output-file file (lambda (port) - (put-bytevector port data)))) + (put-bytevector port (string->utf8 data))))) identical) ;; Make the parent of IDENTICAL read-only. This should not prevent ;; deduplication from inserting its hard link. @@ -54,7 +81,7 @@ (call-with-output-file unique (lambda (port) - (put-bytevector port (string->utf8 "This is unique.")))) + (put-bytevector port (string->utf8 (string-reverse data))))) (deduplicate store (nar-sha256 store) #:store store) @@ -77,8 +104,10 @@ (lambda (store) (let ((true-link link) (links 0) - (data1 (string->utf8 "Hello, world!")) - (data2 (string->utf8 "Hi, world!")) + (data1 (string->utf8 + (string-concatenate (make-list 1000 "Hello, world!")))) + (data2 (string->utf8 + (string-concatenate (make-list 1000 "Hi, world!")))) (identical (map (lambda (n) (string-append store "/" (number->string n) "/a/b/c")) diff --git a/tests/store.scm b/tests/store.scm index 2150a0048c..5c9f651d6c 100644 --- a/tests/store.scm +++ b/tests/store.scm @@ -759,7 +759,9 @@ (test-assert "substitute, deduplication" (with-store s - (let* ((c (random-text)) ; contents of the output + ;; Note: C must be longer than %DEDUPLICATION-MINIMUM-SIZE. + (let* ((c (string-concatenate + (make-list 200 (random-text)))) ; contents of the output (g (package-derivation s %bootstrap-guile)) (d1 (build-expression->derivation s "substitute-me" `(begin ,c (exit 1))