(define-module (uniseg) #:use-module (srfi srfi-1) #:use-module (ice-9 match) #:use-module (srfi srfi-41) #:use-module (uniseg graphemes) #:use-module (uniseg graphemes stream) #:use-module (uniseg charsets emoji) #:use-module (uniseg charsets eastasian) #:use-module (uniseg charsets graphemes) #:export (emoji? char->grapheme-property char->eastasian-property char-width string-width)) (define (emoji? char) (char-set-contains? char-set:emoji-extended-pictographic char)) (define (char->prop sets char) (let ((pair (find (λ (p) (char-set-contains? (second p) char)) sets))) (and pair (first pair)))) (define (char->grapheme-property char) "Find the Unicode grapheme cluster property, as defined by https://www.unicode.org/reports/tr44/" (or (char->prop grapheme-charsets char) 'other)) (define (char->eastasian-property char) "Find the Unicode East Asian width property, as defined by https://www.unicode.org/reports/tr11/" (define (<==> lo hi) "Create a range predicate, inclusive between two numbers" (λ (num) (and (>= num lo) (<= num hi)))) ;; The spec requires that any values not in the table be double-width, if ;; they are in certain ranges. So let's take care of that here. ;; These characters are not in the char-sets as they are undesignated. ;; See comments in unicode's EastAsianWidth.txt (define range-unified-ideographs-ext-a? (<==> #x3400 #x4DBF)) (define range-unified-ideographs? (<==> #x4E00 #x9FFF)) (define range-compatibility-ideographs? (<==> #xF900 #xFAFF)) (define range-plane-2? (<==> #x20000 #x2FFFD)) (define range-plane-3? (<==> #x30000 #x3FFFD)) (or (char->prop eastasian-charsets char) (let ((int (char->integer char))) (match int ((or (? range-unified-ideographs-ext-a?) (? range-unified-ideographs?) (? range-compatibility-ideographs?) (? range-plane-2?) (? range-plane-3?)) 'doublewidth) (else 'neutral))))) (define (char-width rune) "Convert from a character to its visible width. Returns the width as well as the grapheme property to reduce calculations" (define grapheme-prop (char->grapheme-property rune)) (values (or (match grapheme-prop ((or 'control 'carriage-return 'line-feed 'extend 'zero-width-joiner) 0) ('regional-indicator 2) ('extended-pictographic (if (char-set-contains? char-set:emoji-presentation rune) 2 1)) (else #f) ) (match (char->integer rune) (#x2E3A 3) (#x2E3B 4) (else #f)) (match (char->eastasian-property rune) ('doublewidth 2) ;; Default for monospace is to render ambiguous as 1 ('ambiguous 1) (else 1))) grapheme-prop)) (define (string-width str) "Get the width of a string by adding up the widths of each grapheme" (stream-fold (λ (val grapheme) (+ val (grapheme-delta-width grapheme))) 0 (string->grapheme-stream str)))