From 1692a9fc05e040ffa1333d12fd812e37bda36864 Mon Sep 17 00:00:00 2001 From: Vivianne Langdon Date: Sun, 3 Mar 2024 14:20:02 -0500 Subject: [PATCH] first pass untested state machine --- uniseg/graphemes/stream.scm | 100 ++++++++++++++++++++++++++++++++++-- 1 file changed, 97 insertions(+), 3 deletions(-) diff --git a/uniseg/graphemes/stream.scm b/uniseg/graphemes/stream.scm index 511d109..b17956c 100644 --- a/uniseg/graphemes/stream.scm +++ b/uniseg/graphemes/stream.scm @@ -1,8 +1,13 @@ (define-module (runewidth graphemes stream) + #:use-module (ice-9 match) #:use-module (srfi srfi-41) #:use-module (srfi srfi-9 gnu) #:export (make-grapheme - + grapheme? + grapheme-glyphs + grapheme-width + grapheme-sentence-end? + grapheme-word-end? input->grapheme-stream)) (define-immutable-record-type @@ -17,6 +22,95 @@ (unless port (set! port (current-input-port))) - (define gr (make-grapheme glyphs width sentence-end? word-end?)) + ;; The first is what state we are in, and the next is the grapheme + ;; property of the current character. + ;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go + (define (state-machine cur-state cur-prop) + (match (list cur-state cur-prop) + ;; Grapheme boundary #3 + (('carriage-return 'line-feed) + (values 'control+line-feed 'no-boundary)) - (stream-cons c (input->grapheme-stream port))) + ;; Grapheme boundary #4 + ((or ('carriage-return _) + ('control+line-feed _)) + (values 'any 'boundary)) + + ;; Grapheme boundary #5 + ((_ 'carriage-return) + (values 'carriage-return 'boundary)) + ((or (_ 'line-feed) + (_ 'control)) + (values 'control+line-feed 'boundary)) + + ;; Grapheme boundary #6 + (('hangul-syllable-l 'hangul-syllable-l) + (values 'hangul-syllable-l 'no-boundary)) + ((or ('hangul-syllable-l 'hangul-syllable-v) + ('hangul-syllable-l 'hangul-syllable-lv)) + (values 'hangul-syllable-lv 'no-boundary)) + (('hangul-syllable-l 'hangul-syllable-lvt) + (values 'hangul-syllable-lvt 'no-boundary)) + ((_ 'hangul-syllable-l) + (values 'hangul-syllable-l 'boundary)) + + ;; Grapheme boundary #7 + (('hangul-syllable-lv 'hangul-syllable-v) + (values 'hangul-syllable-lv 'no-boundary)) + (('hangul-syllable-lv 'hangul-syllable-t) + (values 'hangul-syllable-lvt 'no-boundary)) + ((or (_ 'hangul-syllable-lv) + (_ 'hangul-syllable-v)) + (values 'hangul-syllable-lv 'boundary)) + + ;; Grapheme boundary #8 + ((or (_ 'hangul-syllable-lvt) + (_ 'hangul-syllable-t)) + (values 'hangul-syllable-lvt 'boundary)) + (('hangul-syllable-lvt 'hangul-syllable-t) + (values 'hangul-syllable-lvt 'no-boundary)) + + ;; Grapheme boundary #9 + ((or (_ 'extend) + (_ 'zero-width-joiner)) + (values 'any 'no-boundary)) + + ;; Grapheme boundary #9a + ((_ 'spacing-mark) + (values 'any 'no-boundary)) + + ;; Grapheme boundary #9b + (('prepend _) + (values 'any 'no-boundary)) + ((_ 'prepend) + (values 'prepend 'boundary)) + + ;; Grapheme boundary #11 (emoji!) + (('extended-pictographic 'extend) + (values 'extended-pictographic 'no-boundary)) + (('extended-pictographic 'zero-width-joiner) + (values 'extended-pictographic+zero-width-joiner 'no-boundary)) + (('extended-pictographic+zero-width-joiner 'extended-pictographic) + (values 'extended-pictographic 'no-boundary)) + ((_ 'extended-pictographic) + (values 'extended-pictographic 'boundary)) + + + ;; Grapheme boundaries #12 and #13 + (('regional-indicator-odd 'regional-indicator) + (values 'regioinal-indicator-even 'no-boundary)) + (('regional-indicator-even 'regional-indicator) + (values 'regional-indicator-odd 'boundary)) + ((_ 'regional-indicator) + (values 'regional-indicator-odd 'boundary)) + + (else (values 'other 'other)))) + + (define (transition-state state char) + "Given the current state and the next char, run a state transition" + + + ) + + (define grapheme (make-grapheme glyphs width sentence-end? word-end?)) + (stream-cons grapheme (input->grapheme-stream port)))