144 lines
5.1 KiB
Scheme
144 lines
5.1 KiB
Scheme
(define-module (uniseg graphemes iterator)
|
|
#:use-module (uniseg)
|
|
#:use-module (uniseg graphemes)
|
|
#:use-module (ice-9 match)
|
|
#:use-module (srfi srfi-71)
|
|
#:use-module (srfi srfi-41)
|
|
#:export (make-grapheme-iterator))
|
|
|
|
(define (make-grapheme-iterator)
|
|
"Create a grapheme iterator that accepts a character and provides grapheme records over time."
|
|
;; The first is what state we are in, and the next is the grapheme
|
|
;; property of the current character.
|
|
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
|
|
(define (state-machine cur-state cur-prop)
|
|
(match (list cur-state cur-prop)
|
|
;; Specifics need to go first, and then non-specifics afterwards
|
|
;; SPECIFIC RULES - no `_' in the match
|
|
|
|
;; Grapheme boundary #3s
|
|
(('carriage-return 'line-feed)
|
|
(values 'control+line-feed #f))
|
|
|
|
;; Grapheme boundary #6s
|
|
(('hangul-syllable-l 'hangul-syllable-l)
|
|
(values 'hangul-syllable-l #f))
|
|
((or ('hangul-syllable-l 'hangul-syllable-v)
|
|
('hangul-syllable-l 'hangul-syllable-lv))
|
|
(values 'hangul-syllable-lv #f))
|
|
(('hangul-syllable-l 'hangul-syllable-lvt)
|
|
(values 'hangul-syllable-lvt #f))
|
|
|
|
;; Grapheme boundary #7s
|
|
(('hangul-syllable-lv 'hangul-syllable-v)
|
|
(values 'hangul-syllable-lv #f))
|
|
(('hangul-syllable-lv 'hangul-syllable-t)
|
|
(values 'hangul-syllable-lvt #f))
|
|
|
|
;; Grapheme boundary #8s
|
|
(('hangul-syllable-lvt 'hangul-syllable-t)
|
|
(values 'hangul-syllable-lvt #f))
|
|
|
|
;; Grapheme boundary #11s (emoji!)
|
|
(('extended-pictographic 'extend)
|
|
(values 'extended-pictographic #f))
|
|
(('extended-pictographic 'zero-width-joiner)
|
|
(values 'extended-pictographic+zero-width-joiner #f))
|
|
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
|
|
(values 'extended-pictographic #f))
|
|
|
|
;; Grapheme boundaries #12s and #13s
|
|
(('regional-indicator-odd 'regional-indicator)
|
|
(values 'regioinal-indicator-even #f))
|
|
(('regional-indicator-even 'regional-indicator)
|
|
(values 'regional-indicator-odd #t))
|
|
|
|
;; NON-SPECIFIC RULES
|
|
|
|
;; Grapheme boundary #4n
|
|
((or ('carriage-return _)
|
|
('control+line-feed _))
|
|
(values 'any #t))
|
|
|
|
;; Grapheme boundary #5n
|
|
((_ 'carriage-return)
|
|
(values 'carriage-return #t))
|
|
((or (_ 'line-feed)
|
|
(_ 'control))
|
|
(values 'control+line-feed #t))
|
|
((_ 'hangul-syllable-l)
|
|
(values 'hangul-syllable-l #t))
|
|
|
|
;; Grapheme boundary #7n
|
|
((or (_ 'hangul-syllable-lv)
|
|
(_ 'hangul-syllable-v))
|
|
(values 'hangul-syllable-lv #t))
|
|
|
|
;; Grapheme boundary #8n
|
|
((or (_ 'hangul-syllable-lvt)
|
|
(_ 'hangul-syllable-t))
|
|
(values 'hangul-syllable-lvt #t))
|
|
|
|
;; Grapheme boundary #9n
|
|
((or (_ 'extend)
|
|
(_ 'zero-width-joiner))
|
|
(values 'any #f))
|
|
|
|
;; Grapheme boundary #9n-A
|
|
((_ 'spacing-mark)
|
|
(values 'any #f))
|
|
|
|
;; Grapheme boundary #9n-B
|
|
(('prepend _)
|
|
(values 'any #f))
|
|
((_ 'prepend)
|
|
(values 'prepend #t))
|
|
|
|
;; Grapheme boundary #11n (emoji!)
|
|
((_ 'extended-pictographic)
|
|
(values 'extended-pictographic #t))
|
|
|
|
;; Grapheme boundaries #12n and #13n
|
|
((_ 'regional-indicator)
|
|
(values 'regional-indicator-odd #t))
|
|
|
|
;; Everything else considered a boundary
|
|
(else (values 'any #t))))
|
|
|
|
(define %current-grapheme #f)
|
|
|
|
(define (set-grapheme! width delta modification? state glyphs-reverse)
|
|
(let ((new (make-grapheme width delta modification? state glyphs-reverse)))
|
|
(set! %current-grapheme new)
|
|
new))
|
|
|
|
(define (iterate-through-grapheme glyph)
|
|
"Grapheme iteration function. May return false, in which case it requires new characters in order to produce output."
|
|
(if (eof-object? glyph)
|
|
;; eof means nothing to do, clear grapheme and return false
|
|
(begin
|
|
(set! %current-grapheme #f)
|
|
#f)
|
|
(begin
|
|
(let* ((glyph-width prop (char-width glyph))
|
|
(cur-state (if %current-grapheme
|
|
(grapheme-state %current-grapheme)
|
|
'any))
|
|
(next-state boundary? (state-machine cur-state prop)))
|
|
|
|
;; Boundary is between this glyph and previous
|
|
(if boundary?
|
|
;; If we hit a boundary with previous, we simply restart the state
|
|
;; and output the current single glyph as a grapheme
|
|
(set-grapheme! glyph-width glyph-width #f next-state (list glyph))
|
|
;; If it's not a boundary with previous, we have to add to the grapheme
|
|
;; Only possible to get a boundary if %current-grapheme is set, so assume
|
|
(let* ((cur-width (grapheme-width %current-grapheme))
|
|
(cur-glyphs-reverse (grapheme-glyphs-reverse %current-grapheme))
|
|
(new-width (+ cur-width glyph-width))
|
|
(new-glyphs-reverse (cons glyph cur-glyphs-reverse)))
|
|
|
|
(set-grapheme! new-width glyph-width #t next-state new-glyphs-reverse)))))))
|
|
|
|
iterate-through-grapheme)
|