Vivianne Langdon
31012d5b8f
- We change the stream iterator to *always* return a grapheme (except for EOF). The grapheme then gets built up over time. - This way, trans flag for example is first white flag, then white flag + zwj, etc until it finally transforms into the trans flag. - Users of the stream library can then use the `modification?' flag to determine if the stream value is a modification of the prior grapheme instead of a new grapheme. - Abstracted iteration to an iterator object to support use cases where we don't have an input stream (reflow needs this!)
105 lines
3.1 KiB
Scheme
105 lines
3.1 KiB
Scheme
(define-module (uniseg)
|
|
#:use-module (srfi srfi-1)
|
|
#:use-module (ice-9 match)
|
|
#:use-module (srfi srfi-41)
|
|
#:use-module (uniseg graphemes)
|
|
#:use-module (uniseg graphemes stream)
|
|
#:use-module (uniseg charsets emoji)
|
|
#:use-module (uniseg charsets eastasian)
|
|
#:use-module (uniseg charsets graphemes)
|
|
#:export (emoji?
|
|
char->grapheme-property
|
|
char->eastasian-property
|
|
char-width
|
|
string-width))
|
|
|
|
(define (emoji? char)
|
|
(char-set-contains? char-set:emoji-extended-pictographic char))
|
|
|
|
(define (char->prop sets char)
|
|
(let ((pair
|
|
(find
|
|
(λ (p) (char-set-contains? (second p) char))
|
|
sets)))
|
|
(and pair
|
|
(first pair))))
|
|
|
|
(define (char->grapheme-property char)
|
|
"Find the Unicode grapheme cluster property, as defined by https://www.unicode.org/reports/tr44/"
|
|
(or (char->prop grapheme-charsets char)
|
|
'other))
|
|
|
|
(define (char->eastasian-property char)
|
|
"Find the Unicode East Asian width property, as defined by https://www.unicode.org/reports/tr11/"
|
|
(define (<==> lo hi)
|
|
"Create a range predicate, inclusive between two numbers"
|
|
(λ (num)
|
|
(and (>= num lo)
|
|
(<= num hi))))
|
|
|
|
;; The spec requires that any values not in the table be double-width, if
|
|
;; they are in certain ranges. So let's take care of that here.
|
|
;; These characters are not in the char-sets as they are undesignated.
|
|
;; See comments in unicode's EastAsianWidth.txt
|
|
(define range-unified-ideographs-ext-a?
|
|
(<==> #x3400 #x4DBF))
|
|
(define range-unified-ideographs?
|
|
(<==> #x4E00 #x9FFF))
|
|
(define range-compatibility-ideographs?
|
|
(<==> #xF900 #xFAFF))
|
|
(define range-plane-2?
|
|
(<==> #x20000 #x2FFFD))
|
|
(define range-plane-3?
|
|
(<==> #x30000 #x3FFFD))
|
|
|
|
(or (char->prop eastasian-charsets char)
|
|
(let ((int (char->integer char)))
|
|
(match int
|
|
((or (? range-unified-ideographs-ext-a?)
|
|
(? range-unified-ideographs?)
|
|
(? range-compatibility-ideographs?)
|
|
(? range-plane-2?)
|
|
(? range-plane-3?))
|
|
'doublewidth)
|
|
(else
|
|
'neutral)))))
|
|
|
|
(define (char-width rune)
|
|
"Convert from a character to its visible width. Returns the width as well as the grapheme property to reduce calculations"
|
|
(define grapheme-prop (char->grapheme-property rune))
|
|
(values
|
|
(or
|
|
(match grapheme-prop
|
|
((or 'control
|
|
'carriage-return
|
|
'line-feed
|
|
'extend
|
|
'zero-width-joiner)
|
|
0)
|
|
('regional-indicator
|
|
2)
|
|
('extended-pictographic
|
|
(if
|
|
(char-set-contains? char-set:emoji-presentation rune)
|
|
2
|
|
1))
|
|
(else #f)
|
|
)
|
|
(match (char->integer rune)
|
|
(#x2E3A 3)
|
|
(#x2E3B 4)
|
|
(else #f))
|
|
(match (char->eastasian-property rune)
|
|
('doublewidth 2)
|
|
;; Default for monospace is to render ambiguous as 1
|
|
('ambiguous 1)
|
|
(else 1)))
|
|
grapheme-prop))
|
|
|
|
(define (string-width str)
|
|
"Get the width of a string by adding up the widths of each grapheme"
|
|
(stream-fold
|
|
(λ (val grapheme)
|
|
(+ val (grapheme-delta-width grapheme)))
|
|
0
|
|
(string->grapheme-stream str)))
|