Vivianne Langdon
31012d5b8f
- We change the stream iterator to *always* return a grapheme (except for EOF). The grapheme then gets built up over time. - This way, trans flag for example is first white flag, then white flag + zwj, etc until it finally transforms into the trans flag. - Users of the stream library can then use the `modification?' flag to determine if the stream value is a modification of the prior grapheme instead of a new grapheme. - Abstracted iteration to an iterator object to support use cases where we don't have an input stream (reflow needs this!)
143 lines
5.1 KiB
Scheme
143 lines
5.1 KiB
Scheme
(define-module (uniseg graphemes iterator)
|
|
#:use-module (uniseg)
|
|
#:use-module (uniseg graphemes)
|
|
#:use-module (ice-9 match)
|
|
#:use-module (srfi srfi-71)
|
|
#:use-module (srfi srfi-41)
|
|
#:export (make-grapheme-iterator))
|
|
|
|
(define (make-grapheme-iterator)
|
|
"Create a grapheme iterator that accepts a character and provides grapheme records over time."
|
|
;; The first is what state we are in, and the next is the grapheme
|
|
;; property of the current character.
|
|
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
|
|
(define (state-machine cur-state cur-prop)
|
|
(match (list cur-state cur-prop)
|
|
;; Specifics need to go first, and then non-specifics afterwards
|
|
;; SPECIFIC RULES - no `_' in the match
|
|
|
|
;; Grapheme boundary #3s
|
|
(('carriage-return 'line-feed)
|
|
(values 'control+line-feed #f))
|
|
|
|
;; Grapheme boundary #6s
|
|
(('hangul-syllable-l 'hangul-syllable-l)
|
|
(values 'hangul-syllable-l #f))
|
|
((or ('hangul-syllable-l 'hangul-syllable-v)
|
|
('hangul-syllable-l 'hangul-syllable-lv))
|
|
(values 'hangul-syllable-lv #f))
|
|
(('hangul-syllable-l 'hangul-syllable-lvt)
|
|
(values 'hangul-syllable-lvt #f))
|
|
|
|
;; Grapheme boundary #7s
|
|
(('hangul-syllable-lv 'hangul-syllable-v)
|
|
(values 'hangul-syllable-lv #f))
|
|
(('hangul-syllable-lv 'hangul-syllable-t)
|
|
(values 'hangul-syllable-lvt #f))
|
|
|
|
;; Grapheme boundary #8s
|
|
(('hangul-syllable-lvt 'hangul-syllable-t)
|
|
(values 'hangul-syllable-lvt #f))
|
|
|
|
;; Grapheme boundary #11s (emoji!)
|
|
(('extended-pictographic 'extend)
|
|
(values 'extended-pictographic #f))
|
|
(('extended-pictographic 'zero-width-joiner)
|
|
(values 'extended-pictographic+zero-width-joiner #f))
|
|
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
|
|
(values 'extended-pictographic #f))
|
|
|
|
;; Grapheme boundaries #12s and #13s
|
|
(('regional-indicator-odd 'regional-indicator)
|
|
(values 'regioinal-indicator-even #f))
|
|
(('regional-indicator-even 'regional-indicator)
|
|
(values 'regional-indicator-odd #t))
|
|
|
|
;; NON-SPECIFIC RULES
|
|
|
|
;; Grapheme boundary #4n
|
|
((or ('carriage-return _)
|
|
('control+line-feed _))
|
|
(values 'any #t))
|
|
|
|
;; Grapheme boundary #5n
|
|
((_ 'carriage-return)
|
|
(values 'carriage-return #t))
|
|
((or (_ 'line-feed)
|
|
(_ 'control))
|
|
(values 'control+line-feed #t))
|
|
((_ 'hangul-syllable-l)
|
|
(values 'hangul-syllable-l #t))
|
|
|
|
;; Grapheme boundary #7n
|
|
((or (_ 'hangul-syllable-lv)
|
|
(_ 'hangul-syllable-v))
|
|
(values 'hangul-syllable-lv #t))
|
|
|
|
;; Grapheme boundary #8n
|
|
((or (_ 'hangul-syllable-lvt)
|
|
(_ 'hangul-syllable-t))
|
|
(values 'hangul-syllable-lvt #t))
|
|
|
|
;; Grapheme boundary #9n
|
|
((or (_ 'extend)
|
|
(_ 'zero-width-joiner))
|
|
(values 'any #f))
|
|
|
|
;; Grapheme boundary #9n-A
|
|
((_ 'spacing-mark)
|
|
(values 'any #f))
|
|
|
|
;; Grapheme boundary #9n-B
|
|
(('prepend _)
|
|
(values 'any #f))
|
|
((_ 'prepend)
|
|
(values 'prepend #t))
|
|
|
|
;; Grapheme boundary #11n (emoji!)
|
|
((_ 'extended-pictographic)
|
|
(values 'extended-pictographic #t))
|
|
|
|
;; Grapheme boundaries #12n and #13n
|
|
((_ 'regional-indicator)
|
|
(values 'regional-indicator-odd #t))
|
|
|
|
;; Everything else considered a boundary
|
|
(else (values 'any #t))))
|
|
|
|
(define %current-grapheme #f)
|
|
|
|
(define (set-grapheme! width delta modification? state glyphs-reverse)
|
|
(let ((new (make-grapheme width delta modification? state glyphs-reverse)))
|
|
(set! %current-grapheme new)
|
|
new))
|
|
|
|
(define (iterate-through-grapheme glyph)
|
|
"Grapheme iteration function. May return false, in which case it requires new characters in order to produce output."
|
|
(if (eof-object? glyph)
|
|
;; eof means nothing to do, clear grapheme and return false
|
|
(begin
|
|
(set! %current-grapheme #f)
|
|
#f)
|
|
(begin
|
|
(let* ((glyph-width prop (char-width glyph))
|
|
(cur-state (if %current-grapheme
|
|
(grapheme-state %current-grapheme)
|
|
'any))
|
|
(next-state boundary? (state-machine cur-state prop)))
|
|
|
|
;; Boundary is between this glyph and previous
|
|
(if boundary?
|
|
;; If we hit a boundary with previous, we simply restart the state
|
|
;; and output the current single glyph as a grapheme
|
|
(set-grapheme! glyph-width glyph-width #f next-state (list glyph))
|
|
;; If it's not a boundary with previous, we have to add to the grapheme
|
|
;; Only possible to get a boundary if %current-grapheme is set, so assume
|
|
(let* ((cur-width (grapheme-width %current-grapheme))
|
|
(cur-glyphs-reverse (grapheme-glyphs-reverse %current-grapheme))
|
|
(new-width (+ cur-width glyph-width))
|
|
(new-glyphs-reverse (cons glyph cur-glyphs-reverse)))
|
|
|
|
(set-grapheme! new-width glyph-width #t next-state new-glyphs-reverse)))))))
|
|
|
|
iterate-through-grapheme)
|