guile-uniseg/uniseg/graphemes/iterator.scm
Vivianne Langdon 31012d5b8f Yet another reorganization, and solve Christine's 'rude problem'
- We change the stream iterator to *always* return a grapheme
  (except for EOF). The grapheme then gets built up over time.
  - This way, trans flag for example is first white flag, then
  white flag + zwj, etc until it finally transforms into the
  trans flag.
  - Users of the stream library can then use the `modification?'
  flag to determine if the stream value is a modification of the
  prior grapheme instead of a new grapheme.
  - Abstracted iteration to an iterator object to support use cases
  where we don't have an input stream (reflow needs this!)
2024-03-05 11:46:32 -05:00

143 lines
5.1 KiB
Scheme

(define-module (uniseg graphemes iterator)
#:use-module (uniseg)
#:use-module (uniseg graphemes)
#:use-module (ice-9 match)
#:use-module (srfi srfi-71)
#:use-module (srfi srfi-41)
#:export (make-grapheme-iterator))
(define (make-grapheme-iterator)
"Create a grapheme iterator that accepts a character and provides grapheme records over time."
;; The first is what state we are in, and the next is the grapheme
;; property of the current character.
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
(define (state-machine cur-state cur-prop)
(match (list cur-state cur-prop)
;; Specifics need to go first, and then non-specifics afterwards
;; SPECIFIC RULES - no `_' in the match
;; Grapheme boundary #3s
(('carriage-return 'line-feed)
(values 'control+line-feed #f))
;; Grapheme boundary #6s
(('hangul-syllable-l 'hangul-syllable-l)
(values 'hangul-syllable-l #f))
((or ('hangul-syllable-l 'hangul-syllable-v)
('hangul-syllable-l 'hangul-syllable-lv))
(values 'hangul-syllable-lv #f))
(('hangul-syllable-l 'hangul-syllable-lvt)
(values 'hangul-syllable-lvt #f))
;; Grapheme boundary #7s
(('hangul-syllable-lv 'hangul-syllable-v)
(values 'hangul-syllable-lv #f))
(('hangul-syllable-lv 'hangul-syllable-t)
(values 'hangul-syllable-lvt #f))
;; Grapheme boundary #8s
(('hangul-syllable-lvt 'hangul-syllable-t)
(values 'hangul-syllable-lvt #f))
;; Grapheme boundary #11s (emoji!)
(('extended-pictographic 'extend)
(values 'extended-pictographic #f))
(('extended-pictographic 'zero-width-joiner)
(values 'extended-pictographic+zero-width-joiner #f))
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
(values 'extended-pictographic #f))
;; Grapheme boundaries #12s and #13s
(('regional-indicator-odd 'regional-indicator)
(values 'regioinal-indicator-even #f))
(('regional-indicator-even 'regional-indicator)
(values 'regional-indicator-odd #t))
;; NON-SPECIFIC RULES
;; Grapheme boundary #4n
((or ('carriage-return _)
('control+line-feed _))
(values 'any #t))
;; Grapheme boundary #5n
((_ 'carriage-return)
(values 'carriage-return #t))
((or (_ 'line-feed)
(_ 'control))
(values 'control+line-feed #t))
((_ 'hangul-syllable-l)
(values 'hangul-syllable-l #t))
;; Grapheme boundary #7n
((or (_ 'hangul-syllable-lv)
(_ 'hangul-syllable-v))
(values 'hangul-syllable-lv #t))
;; Grapheme boundary #8n
((or (_ 'hangul-syllable-lvt)
(_ 'hangul-syllable-t))
(values 'hangul-syllable-lvt #t))
;; Grapheme boundary #9n
((or (_ 'extend)
(_ 'zero-width-joiner))
(values 'any #f))
;; Grapheme boundary #9n-A
((_ 'spacing-mark)
(values 'any #f))
;; Grapheme boundary #9n-B
(('prepend _)
(values 'any #f))
((_ 'prepend)
(values 'prepend #t))
;; Grapheme boundary #11n (emoji!)
((_ 'extended-pictographic)
(values 'extended-pictographic #t))
;; Grapheme boundaries #12n and #13n
((_ 'regional-indicator)
(values 'regional-indicator-odd #t))
;; Everything else considered a boundary
(else (values 'any #t))))
(define %current-grapheme #f)
(define (set-grapheme! width delta modification? state glyphs-reverse)
(let ((new (make-grapheme width delta modification? state glyphs-reverse)))
(set! %current-grapheme new)
new))
(define (iterate-through-grapheme glyph)
"Grapheme iteration function. May return false, in which case it requires new characters in order to produce output."
(if (eof-object? glyph)
;; eof means nothing to do, clear grapheme and return false
(begin
(set! %current-grapheme #f)
#f)
(begin
(let* ((glyph-width prop (char-width glyph))
(cur-state (if %current-grapheme
(grapheme-state %current-grapheme)
'any))
(next-state boundary? (state-machine cur-state prop)))
;; Boundary is between this glyph and previous
(if boundary?
;; If we hit a boundary with previous, we simply restart the state
;; and output the current single glyph as a grapheme
(set-grapheme! glyph-width glyph-width #f next-state (list glyph))
;; If it's not a boundary with previous, we have to add to the grapheme
;; Only possible to get a boundary if %current-grapheme is set, so assume
(let* ((cur-width (grapheme-width %current-grapheme))
(cur-glyphs-reverse (grapheme-glyphs-reverse %current-grapheme))
(new-width (+ cur-width glyph-width))
(new-glyphs-reverse (cons glyph cur-glyphs-reverse)))
(set-grapheme! new-width glyph-width #t next-state new-glyphs-reverse)))))))
iterate-through-grapheme)