Full log
Message #139 received at 33848@debbugs.gnu.org (full text , mbox , reply ):
[Message part 1 (text/plain, inline)]
Here's a preliminary draft patch to add support for UTF-32 and UTF-16
references to our grafting code. I haven't yet measured the efficiency
impact of these changes, but I suspect it's not too bad.
I'd be curious to know whether it fixes the Nyxt graft.
Mark
[0001-DRAFT-grafts-Add-support-for-UTF-16-and-UTF-32-store.patch (text/x-patch, inline)]
From 0fcfd804570fd1c07ffb1f6c176d6ec3430907df Mon Sep 17 00:00:00 2001
From: Mark H Weaver <mhw@netris.org>
Date: Fri, 2 Apr 2021 18:36:23 -0400
Subject: [PATCH] DRAFT: grafts: Add support for UTF-16 and UTF-32 store
references.
---
guix/build/graft.scm | 138 +++++++++++++++++++++++++++++--------------
tests/grafts.scm | 68 +++++++++++++++++++++
2 files changed, 162 insertions(+), 44 deletions(-)
diff --git a/guix/build/graft.scm b/guix/build/graft.scm
index c119ee71d1..6e7f3859cb 100644
--- a/guix/build/graft.scm
+++ b/guix/build/graft.scm
@@ -1,6 +1,6 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2014, 2015, 2016, 2018 Ludovic Courtès <ludo@gnu.org>
-;;; Copyright © 2016 Mark H Weaver <mhw@netris.org>
+;;; Copyright © 2016, 2021 Mark H Weaver <mhw@netris.org>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -55,6 +55,36 @@
(string->char-set "0123456789abcdfghijklmnpqrsvwxyz")
<>))
+(define (nix-base32-char-or-nul? byte)
+ (or (nix-base32-char? byte)
+ (char=? byte #\nul)))
+
+(define (has-utf16-zeroes? buffer i)
+ (let loop ((j (+ 1 (- i (* 2 hash-length)))))
+ (or (>= j i)
+ (and (zero? (bytevector-u8-ref buffer j))
+ (loop (+ j 2))))))
+
+(define (has-utf32-zeroes? buffer i)
+ (let loop ((j (+ 1 (- i (* 4 hash-length)))))
+ (or (>= j i)
+ (and (zero? (bytevector-u8-ref buffer j))
+ (zero? (bytevector-u8-ref buffer (+ j 1)))
+ (zero? (bytevector-u8-ref buffer (+ j 2)))
+ (loop (+ j 4))))))
+
+(define (expand-bytevector bv char-size)
+ (let* ((len (bytevector-length bv))
+ (bv* (make-bytevector (+ 1 (* char-size
+ (- len 1)))
+ 0)))
+ (let loop ((i 0))
+ (when (< i len)
+ (bytevector-u8-set! bv* (* i char-size)
+ (bytevector-u8-ref bv i))
+ (loop (+ i 1))))
+ bv*))
+
(define* (replace-store-references input output replacement-table
#:optional (store (%store-directory)))
"Read data from INPUT, replacing store references according to
@@ -76,15 +106,16 @@ bytevectors to the same value."
(list->vector (map pred (iota 256)))
<>))
- (define nix-base32-byte?
+ (define nix-base32-byte-or-nul?
(optimize-u8-predicate
- (compose nix-base32-char?
+ (compose nix-base32-char-or-nul?
integer->char)))
(define (dash? byte) (= byte 45))
(define request-size (expt 2 20)) ; 1 MiB
+ ;; XXX This comment is no longer accurate!
;; We scan the file for the following 33-byte pattern: 32 bytes of
;; nix-base32 characters followed by a dash. To accommodate large files,
;; we do not read the entire file, but instead work on buffers of up to
@@ -116,43 +147,61 @@ bytevectors to the same value."
;; written.
(if (< i end)
(let ((byte (bytevector-u8-ref buffer i)))
- (cond ((and (dash? byte)
- ;; We've found a dash. Note that we do not know
- ;; whether the preceeding 32 bytes are nix-base32
- ;; characters, but we do not need to know. If
- ;; they are not, the following lookup will fail.
- (lookup-replacement
- (string-tabulate (lambda (j)
- (integer->char
- (bytevector-u8-ref buffer
- (+ j (- i hash-length)))))
- hash-length)))
- => (lambda (replacement)
- ;; We've found a hash that needs to be replaced.
- ;; First, write out all bytes preceding the hash
- ;; that have not yet been written.
- (put-bytevector output buffer written
- (- i hash-length written))
- ;; Now write the replacement string.
- (put-bytevector output replacement)
- ;; Since the byte at position 'i' is a dash,
- ;; which is not a nix-base32 char, the earliest
- ;; position where the next hash might start is
- ;; i+1, and the earliest position where the
- ;; following dash might start is (+ i 1
- ;; hash-length). Also, increase the write
- ;; position to account for REPLACEMENT.
- (let ((len (bytevector-length replacement)))
- (scan-from (+ i 1 len)
- (+ i (- len hash-length))))))
- ;; If the byte at position 'i' is a nix-base32 char,
+ (cond ((dash? byte)
+ (let* ((char-size
+ (if (zero? (bytevector-u8-ref buffer (- i 1)))
+ (if (zero? (bytevector-u8-ref buffer (- i 2)))
+ (if (and (<= (* 4 hash-length)
+ (- i written))
+ (has-utf32-zeroes? buffer i))
+ 4
+ 1)
+ (if (and (<= (* 2 hash-length)
+ (- i written))
+ (has-utf16-zeroes? buffer i))
+ 2
+ 1))
+ 1))
+ (replacement*
+ (lookup-replacement
+ (string-tabulate (lambda (j)
+ (integer->char
+ (bytevector-u8-ref buffer
+ (- i (* char-size
+ (- hash-length j))))))
+ hash-length)))
+ (replacement
+ (and replacement*
+ (expand-bytevector replacement*
+ char-size))))
+ (if replacement
+ (begin
+ ;; We've found a hash that needs to be replaced.
+ ;; First, write out all bytes preceding the hash
+ ;; that have not yet been written.
+ (put-bytevector output buffer written
+ (- i (* char-size hash-length) written))
+ ;; Now write the replacement string.
+ (put-bytevector output replacement)
+ ;; Now compute the new value of 'written' and
+ ;; the new value of 'i', and iterate.
+ (let ((written (+ (- i (* char-size hash-length))
+ (bytevector-length replacement))))
+ (scan-from (+ written hash-length) written)))
+ ;; The byte at position 'i' is a dash, which is
+ ;; not a nix-base32 char, so the earliest
+ ;; position where the next hash might start is
+ ;; i+1, with the following dash at position (+ i
+ ;; 1 hash-length).
+ (scan-from (+ i 1 hash-length) written))))
+ ;; If the byte at position 'i' is a nix-base32 char or nul,
;; then the dash we're looking for might be as early as
;; the following byte, so we can only advance by 1.
- ((nix-base32-byte? byte)
+ ((nix-base32-byte-or-nul? byte)
(scan-from (+ i 1) written))
- ;; If the byte at position 'i' is NOT a nix-base32
- ;; char, then the earliest position where the next hash
- ;; might start is i+1, with the following dash at
+ ;; If the byte at position 'i' is NOT a nix-base32 char
+ ;; or nul, then the earliest position where the next
+ ;; hash might start is i+1, with the following dash at
;; position (+ i 1 hash-length).
(else
(scan-from (+ i 1 hash-length) written))))
@@ -162,18 +211,19 @@ bytevectors to the same value."
;; "unget". If 'end' is less than 'request-size' then we read
;; less than we asked for, which indicates that we are at EOF,
;; so we needn't unget anything. Otherwise, we unget up to
- ;; 'hash-length' bytes (32 bytes). However, we must be careful
- ;; not to unget bytes that have already been written, because
- ;; that would cause them to be written again from the next
- ;; buffer. In practice, this case occurs when a replacement is
- ;; made near or beyond the end of the buffer. When REPLACEMENT
- ;; went beyond END, we consume the extra bytes from INPUT.
+ ;; (* 4 hash-length) bytes. However, we must be careful not to
+ ;; unget bytes that have already been written, because that
+ ;; would cause them to be written again from the next buffer.
+ ;; In practice, this case occurs when a replacement is made
+ ;; near or beyond the end of the buffer. When REPLACEMENT went
+ ;; beyond END, we consume the extra bytes from INPUT.
(begin
(if (> written end)
(get-bytevector-n! input buffer 0 (- written end))
(let* ((unwritten (- end written))
(unget-size (if (= end request-size)
- (min hash-length unwritten)
+ (min (* 4 hash-length)
+ unwritten)
0))
(write-size (- unwritten unget-size)))
(put-bytevector output buffer written write-size)
diff --git a/tests/grafts.scm b/tests/grafts.scm
index a12c6a5911..0e1c7355b1 100644
--- a/tests/grafts.scm
+++ b/tests/grafts.scm
@@ -1,5 +1,6 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2021 Mark H Weaver <mhw@netris.org>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -468,4 +469,71 @@
replacement
"/gnu/store")))))
+(define (nul-expand str char-size)
+ (string-join (map string (string->list str))
+ (make-string (- char-size 1) #\nul)))
+
+(for-each
+ (lambda (char-size1)
+ (for-each
+ (lambda (char-size2)
+ (for-each
+ (lambda (gap)
+ (for-each
+ (lambda (offset)
+ (test-equal (format #f "replace-store-references, char-sizes ~a ~a, gap ~s, offset ~a"
+ char-size1 char-size2 gap offset)
+ (string-append (make-string offset #\=)
+ (nul-expand (string-append "/gnu/store/"
+ (make-string 32 #\6)
+ "-BlahBlaH")
+ char-size1)
+ gap
+ (nul-expand (string-append "/gnu/store/"
+ (make-string 32 #\8)
+ "-SoMeTHiNG")
+ char-size2)
+ (list->string (map integer->char (iota 77 33))))
+
+ ;; Create input data where the right-hand-size of the dash ("-something"
+ ;; here) goes beyond the end of the internal buffer of
+ ;; 'replace-store-references'.
+ (let* ((content (string-append (make-string offset #\=)
+ (nul-expand (string-append "/gnu/store/"
+ (make-string 32 #\5)
+ "-blahblah")
+ char-size1)
+ gap
+ (nul-expand (string-append "/gnu/store/"
+ (make-string 32 #\7)
+ "-something")
+ char-size2)
+ (list->string
+ (map integer->char (iota 77 33)))))
+ (replacement (alist->vhash
+ `((,(make-string 32 #\5)
+ . ,(string->utf8 (string-append
+ (make-string 32 #\6)
+ "-BlahBlaH")))
+ (,(make-string 32 #\7)
+ . ,(string->utf8 (string-append
+ (make-string 32 #\8)
+ "-SoMeTHiNG")))))))
+ (call-with-output-string
+ (lambda (output)
+ ((@@ (guix build graft) replace-store-references)
+ (open-input-string content) output
+ replacement
+ "/gnu/store"))))))
+ ;; offsets to test
+ (map (lambda (i) (- buffer-size (* 40 char-size1) i))
+ (iota 30))))
+ ;; gaps
+ '("" "-" " " "a")))
+ ;; char-size2 values to test
+ '(1 2)))
+ ;; char-size1 values to test
+ '(1 2 4))
+
+
(test-end)
--
2.31.1
Display info messages
Send a report that this bug log contains spam .
debbugs.gnu.org maintainers
<help-debbugs@gnu.org >.
Last modified:
Sat Apr 12 09:23:56 2025;
Machine Name:
wallace-server
GNU bug tracking system
Debbugs is free software and licensed under the terms of the
GNU Public License version 2. The current version can be
obtained from https://bugs.debian.org/debbugs-source/ .
Copyright © 1999 Darren O. Benham,
1997,2003 nCipher Corporation Ltd,
1994-97 Ian Jackson,
2005-2017 Don Armstrong, and many other contributors.