Fix broken unit tests, significant cleanup

- Have to break the match statement into specific section and non-specific section
  - (some ordering is still unclear to me, lets see if we can copy more go-uniseg unit tests)
- Add emoji to the grapheme list
- First character must be processed first
  - then state transition done on next char with peek!
- Fix other off-by-one issues, now unit tests pass!
This commit is contained in:
Vivianne 2024-03-04 14:01:11 -05:00
parent a4938d04e1
commit dc90b8fd6c
4 changed files with 1566 additions and 1568 deletions

View file

@ -123,9 +123,9 @@
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
#:use-module (uniseg internal)
#:use-module (uniseg emoji)
#:export (,@grapheme-symbol-names
grapheme-charsets)))
grapheme-charsets)))
(pretty-print
`(define grapheme-ht
@ -149,7 +149,10 @@
(let ((f (first pair))
(s (second pair)))
`(list ',f ,s)))
grapheme-sets-and-symbols))))
grapheme-sets-and-symbols)
;; Need emoji in this set too!
(list 'extended-pictographic char-set:emoji-extended-pictographic))))
(display "\n")

View file

@ -41,8 +41,8 @@
(test-equal "double-em-dash is in right place"
"⸺" (grapheme-string double-em-dash-grapheme))
(test-equal "double-em-dash is double-width"
2 (grapheme-width double-em-dash-grapheme))
(test-equal "double-em-dash is triple-width"
3 (grapheme-width double-em-dash-grapheme))
(advance-stream!)
@ -51,4 +51,9 @@
(test-equal "advancing one goes to the a"
"a" (grapheme-string a-grapheme))
(define empty-stream (string->grapheme-stream ""))
(test-equal "a stream of nothing resolves to an empty stream"
#t (stream-null? empty-stream))
(test-end "tests-graphemes-stream")

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,6 @@
(define-module (uniseg graphemes stream)
#:use-module (uniseg)
#:use-module (uniseg internal)
#:use-module (ice-9 textual-ports)
#:use-module (ice-9 match)
#:use-module (srfi srfi-71)
@ -41,23 +42,14 @@
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
(define (state-machine cur-state cur-prop)
(match (list cur-state cur-prop)
;; Grapheme boundary #3
;; Specifics need to go first, and then non-specifics afterwards
;; SPECIFIC RULES - no `_' in the match
;; Grapheme boundary #3s
(('carriage-return 'line-feed)
(values 'control+line-feed #f))
;; Grapheme boundary #4
((or ('carriage-return _)
('control+line-feed _))
(values 'any #t))
;; Grapheme boundary #5
((_ 'carriage-return)
(values 'carriage-return #t))
((or (_ 'line-feed)
(_ 'control))
(values 'control+line-feed #t))
;; Grapheme boundary #6
;; Grapheme boundary #6s
(('hangul-syllable-l 'hangul-syllable-l)
(values 'hangul-syllable-l #f))
((or ('hangul-syllable-l 'hangul-syllable-v)
@ -65,100 +57,126 @@
(values 'hangul-syllable-lv #f))
(('hangul-syllable-l 'hangul-syllable-lvt)
(values 'hangul-syllable-lvt #f))
((_ 'hangul-syllable-l)
(values 'hangul-syllable-l #t))
;; Grapheme boundary #7
;; Grapheme boundary #7s
(('hangul-syllable-lv 'hangul-syllable-v)
(values 'hangul-syllable-lv #f))
(('hangul-syllable-lv 'hangul-syllable-t)
(values 'hangul-syllable-lvt #f))
((or (_ 'hangul-syllable-lv)
(_ 'hangul-syllable-v))
(values 'hangul-syllable-lv #t))
;; Grapheme boundary #8
((or (_ 'hangul-syllable-lvt)
(_ 'hangul-syllable-t))
(values 'hangul-syllable-lvt #t))
;; Grapheme boundary #8s
(('hangul-syllable-lvt 'hangul-syllable-t)
(values 'hangul-syllable-lvt #f))
;; Grapheme boundary #9
((or (_ 'extend)
(_ 'zero-width-joiner))
(values 'any #f))
;; Grapheme boundary #9a
((_ 'spacing-mark)
(values 'any #f))
;; Grapheme boundary #9b
(('prepend _)
(values 'any #f))
((_ 'prepend)
(values 'prepend #t))
;; Grapheme boundary #11 (emoji!)
;; Grapheme boundary #11s (emoji!)
(('extended-pictographic 'extend)
(values 'extended-pictographic #f))
(('extended-pictographic 'zero-width-joiner)
(values 'extended-pictographic+zero-width-joiner #f))
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
(values 'extended-pictographic #f))
((_ 'extended-pictographic)
(values 'extended-pictographic #t))
;; Grapheme boundaries #12 and #13
;; Grapheme boundaries #12s and #13s
(('regional-indicator-odd 'regional-indicator)
(values 'regioinal-indicator-even #f))
(('regional-indicator-even 'regional-indicator)
(values 'regional-indicator-odd #t))
;; NON-SPECIFIC RULES
;; Grapheme boundary #4n
((or ('carriage-return _)
('control+line-feed _))
(values 'any #t))
;; Grapheme boundary #5n
((_ 'carriage-return)
(values 'carriage-return #t))
((or (_ 'line-feed)
(_ 'control))
(values 'control+line-feed #t))
((_ 'hangul-syllable-l)
(values 'hangul-syllable-l #t))
;; Grapheme boundary #7n
((or (_ 'hangul-syllable-lv)
(_ 'hangul-syllable-v))
(values 'hangul-syllable-lv #t))
;; Grapheme boundary #8n
((or (_ 'hangul-syllable-lvt)
(_ 'hangul-syllable-t))
(values 'hangul-syllable-lvt #t))
;; Grapheme boundary #9n
((or (_ 'extend)
(_ 'zero-width-joiner))
(values 'any #f))
;; Grapheme boundary #9n-A
((_ 'spacing-mark)
(values 'any #f))
;; Grapheme boundary #9n-B
(('prepend _)
(values 'any #f))
((_ 'prepend)
(values 'prepend #t))
;; Grapheme boundary #11n (emoji!)
((_ 'extended-pictographic)
(values 'extended-pictographic #t))
;; Grapheme boundaries #12n and #13n
((_ 'regional-indicator)
(values 'regional-indicator-odd #t))
;; Everything else considered a boundeary
;; Everything else considered a boundary
(else (values 'any #t))))
(define %glyphs-reverse '())
;; State variables, initialized with the first glyph we get!
(define first-glyph (get-char port))
(define hit-eof? (eof-object? first-glyph))
(define %glyphs-reverse (list first-glyph))
(define %grapheme-width 0)
(define hit-eof #f)
(define (iterate-through-grapheme state)
(define glyph (peek-char port))
(if (eof-object? glyph)
(begin
(set! hit-eof #t)
(set! hit-eof? #t)
state)
(begin
(let* ((width property (char-width glyph))
(next-state boundary? (state-machine state property)))
(begin
(let* ((width property (char-width glyph))
(next-state boundary? (state-machine state property)))
(pk width next-state)
(if boundary?
state
(let ((new-width (+ %grapheme-width width))
(new-glyphs-reverse (cons glyph %glyphs-reverse)))
(if boundary?
state
(let ((new-width (+ %grapheme-width width))
(new-glyphs-reverse (cons glyph %glyphs-reverse)))
;; Officially induct this char into the cluster
(get-char port)
(set! %grapheme-width new-width)
(set! %glyphs-reverse new-glyphs-reverse)
(iterate-through-grapheme next-state)))))))
;; Officially induct this char into the cluster
(get-char port)
(set! %grapheme-width new-width)
(set! %glyphs-reverse new-glyphs-reverse)
(iterate-through-grapheme state)))))))
;; Need to explicitly get the first state
;; Skip all this if it's an empty stream at start
(if hit-eof?
stream-null
(let ((first-width first-prop (char-width first-glyph)))
(set! %grapheme-width first-width)
(iterate-through-grapheme (state-machine 'any first-prop))
(define final-state (iterate-through-grapheme 'any))
(let ((grapheme (make-grapheme
%grapheme-width
;; Delay to avoid construction of unnecessary lists and strings!
(delay (reverse %glyphs-reverse))
(delay (reverse-list->string %glyphs-reverse)))))
(if
hit-eof
stream-null
(stream-cons
(make-grapheme
%grapheme-width
;; Delay to avoid construction of unnecessary lists and strings!
(delay (reverse-list->string %glyphs-reverse))
(delay (reverse %glyphs-reverse)))
(input->grapheme-stream port))))
;; If we hit the eof here, we need one last stream entry, otherwise iterate further
(stream-cons grapheme
(if hit-eof?
stream-null
(input->grapheme-stream port)))))))