Fix broken unit tests, significant cleanup
- Have to break the match statement into specific section and non-specific section - (some ordering is still unclear to me, lets see if we can copy more go-uniseg unit tests) - Add emoji to the grapheme list - First character must be processed first - then state transition done on next char with peek! - Fix other off-by-one issues, now unit tests pass!
This commit is contained in:
parent
a4938d04e1
commit
dc90b8fd6c
4 changed files with 1566 additions and 1568 deletions
|
@ -123,9 +123,9 @@
|
|||
#:use-module (ice-9 hash-table)
|
||||
#:use-module (srfi srfi-1)
|
||||
#:use-module (uniseg internal)
|
||||
#:use-module (uniseg emoji)
|
||||
#:export (,@grapheme-symbol-names
|
||||
grapheme-charsets)))
|
||||
|
||||
grapheme-charsets)))
|
||||
|
||||
(pretty-print
|
||||
`(define grapheme-ht
|
||||
|
@ -149,7 +149,10 @@
|
|||
(let ((f (first pair))
|
||||
(s (second pair)))
|
||||
`(list ',f ,s)))
|
||||
grapheme-sets-and-symbols))))
|
||||
grapheme-sets-and-symbols)
|
||||
;; Need emoji in this set too!
|
||||
(list 'extended-pictographic char-set:emoji-extended-pictographic))))
|
||||
|
||||
|
||||
(display "\n")
|
||||
|
||||
|
|
|
@ -41,8 +41,8 @@
|
|||
(test-equal "double-em-dash is in right place"
|
||||
"⸺" (grapheme-string double-em-dash-grapheme))
|
||||
|
||||
(test-equal "double-em-dash is double-width"
|
||||
2 (grapheme-width double-em-dash-grapheme))
|
||||
(test-equal "double-em-dash is triple-width"
|
||||
3 (grapheme-width double-em-dash-grapheme))
|
||||
|
||||
(advance-stream!)
|
||||
|
||||
|
@ -51,4 +51,9 @@
|
|||
(test-equal "advancing one goes to the a"
|
||||
"a" (grapheme-string a-grapheme))
|
||||
|
||||
(define empty-stream (string->grapheme-stream ""))
|
||||
|
||||
(test-equal "a stream of nothing resolves to an empty stream"
|
||||
#t (stream-null? empty-stream))
|
||||
|
||||
(test-end "tests-graphemes-stream")
|
||||
|
|
2950
uniseg/graphemes.scm
2950
uniseg/graphemes.scm
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,6 @@
|
|||
(define-module (uniseg graphemes stream)
|
||||
#:use-module (uniseg)
|
||||
#:use-module (uniseg internal)
|
||||
#:use-module (ice-9 textual-ports)
|
||||
#:use-module (ice-9 match)
|
||||
#:use-module (srfi srfi-71)
|
||||
|
@ -41,23 +42,14 @@
|
|||
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
|
||||
(define (state-machine cur-state cur-prop)
|
||||
(match (list cur-state cur-prop)
|
||||
;; Grapheme boundary #3
|
||||
;; Specifics need to go first, and then non-specifics afterwards
|
||||
;; SPECIFIC RULES - no `_' in the match
|
||||
|
||||
;; Grapheme boundary #3s
|
||||
(('carriage-return 'line-feed)
|
||||
(values 'control+line-feed #f))
|
||||
|
||||
;; Grapheme boundary #4
|
||||
((or ('carriage-return _)
|
||||
('control+line-feed _))
|
||||
(values 'any #t))
|
||||
|
||||
;; Grapheme boundary #5
|
||||
((_ 'carriage-return)
|
||||
(values 'carriage-return #t))
|
||||
((or (_ 'line-feed)
|
||||
(_ 'control))
|
||||
(values 'control+line-feed #t))
|
||||
|
||||
;; Grapheme boundary #6
|
||||
;; Grapheme boundary #6s
|
||||
(('hangul-syllable-l 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l #f))
|
||||
((or ('hangul-syllable-l 'hangul-syllable-v)
|
||||
|
@ -65,100 +57,126 @@
|
|||
(values 'hangul-syllable-lv #f))
|
||||
(('hangul-syllable-l 'hangul-syllable-lvt)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
((_ 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l #t))
|
||||
|
||||
;; Grapheme boundary #7
|
||||
;; Grapheme boundary #7s
|
||||
(('hangul-syllable-lv 'hangul-syllable-v)
|
||||
(values 'hangul-syllable-lv #f))
|
||||
(('hangul-syllable-lv 'hangul-syllable-t)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
((or (_ 'hangul-syllable-lv)
|
||||
(_ 'hangul-syllable-v))
|
||||
(values 'hangul-syllable-lv #t))
|
||||
|
||||
;; Grapheme boundary #8
|
||||
((or (_ 'hangul-syllable-lvt)
|
||||
(_ 'hangul-syllable-t))
|
||||
(values 'hangul-syllable-lvt #t))
|
||||
;; Grapheme boundary #8s
|
||||
(('hangul-syllable-lvt 'hangul-syllable-t)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
|
||||
;; Grapheme boundary #9
|
||||
((or (_ 'extend)
|
||||
(_ 'zero-width-joiner))
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9a
|
||||
((_ 'spacing-mark)
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9b
|
||||
(('prepend _)
|
||||
(values 'any #f))
|
||||
((_ 'prepend)
|
||||
(values 'prepend #t))
|
||||
|
||||
;; Grapheme boundary #11 (emoji!)
|
||||
;; Grapheme boundary #11s (emoji!)
|
||||
(('extended-pictographic 'extend)
|
||||
(values 'extended-pictographic #f))
|
||||
(('extended-pictographic 'zero-width-joiner)
|
||||
(values 'extended-pictographic+zero-width-joiner #f))
|
||||
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
|
||||
(values 'extended-pictographic #f))
|
||||
((_ 'extended-pictographic)
|
||||
(values 'extended-pictographic #t))
|
||||
|
||||
|
||||
;; Grapheme boundaries #12 and #13
|
||||
;; Grapheme boundaries #12s and #13s
|
||||
(('regional-indicator-odd 'regional-indicator)
|
||||
(values 'regioinal-indicator-even #f))
|
||||
(('regional-indicator-even 'regional-indicator)
|
||||
(values 'regional-indicator-odd #t))
|
||||
|
||||
;; NON-SPECIFIC RULES
|
||||
|
||||
;; Grapheme boundary #4n
|
||||
((or ('carriage-return _)
|
||||
('control+line-feed _))
|
||||
(values 'any #t))
|
||||
|
||||
;; Grapheme boundary #5n
|
||||
((_ 'carriage-return)
|
||||
(values 'carriage-return #t))
|
||||
((or (_ 'line-feed)
|
||||
(_ 'control))
|
||||
(values 'control+line-feed #t))
|
||||
((_ 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l #t))
|
||||
|
||||
;; Grapheme boundary #7n
|
||||
((or (_ 'hangul-syllable-lv)
|
||||
(_ 'hangul-syllable-v))
|
||||
(values 'hangul-syllable-lv #t))
|
||||
|
||||
;; Grapheme boundary #8n
|
||||
((or (_ 'hangul-syllable-lvt)
|
||||
(_ 'hangul-syllable-t))
|
||||
(values 'hangul-syllable-lvt #t))
|
||||
|
||||
;; Grapheme boundary #9n
|
||||
((or (_ 'extend)
|
||||
(_ 'zero-width-joiner))
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9n-A
|
||||
((_ 'spacing-mark)
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9n-B
|
||||
(('prepend _)
|
||||
(values 'any #f))
|
||||
((_ 'prepend)
|
||||
(values 'prepend #t))
|
||||
|
||||
;; Grapheme boundary #11n (emoji!)
|
||||
((_ 'extended-pictographic)
|
||||
(values 'extended-pictographic #t))
|
||||
|
||||
;; Grapheme boundaries #12n and #13n
|
||||
((_ 'regional-indicator)
|
||||
(values 'regional-indicator-odd #t))
|
||||
|
||||
;; Everything else considered a boundeary
|
||||
;; Everything else considered a boundary
|
||||
(else (values 'any #t))))
|
||||
|
||||
|
||||
(define %glyphs-reverse '())
|
||||
;; State variables, initialized with the first glyph we get!
|
||||
(define first-glyph (get-char port))
|
||||
(define hit-eof? (eof-object? first-glyph))
|
||||
(define %glyphs-reverse (list first-glyph))
|
||||
(define %grapheme-width 0)
|
||||
|
||||
(define hit-eof #f)
|
||||
|
||||
(define (iterate-through-grapheme state)
|
||||
(define glyph (peek-char port))
|
||||
(if (eof-object? glyph)
|
||||
(begin
|
||||
(set! hit-eof #t)
|
||||
(set! hit-eof? #t)
|
||||
state)
|
||||
(begin
|
||||
(let* ((width property (char-width glyph))
|
||||
(next-state boundary? (state-machine state property)))
|
||||
(begin
|
||||
(let* ((width property (char-width glyph))
|
||||
(next-state boundary? (state-machine state property)))
|
||||
|
||||
(pk width next-state)
|
||||
(if boundary?
|
||||
state
|
||||
(let ((new-width (+ %grapheme-width width))
|
||||
(new-glyphs-reverse (cons glyph %glyphs-reverse)))
|
||||
|
||||
(if boundary?
|
||||
state
|
||||
(let ((new-width (+ %grapheme-width width))
|
||||
(new-glyphs-reverse (cons glyph %glyphs-reverse)))
|
||||
;; Officially induct this char into the cluster
|
||||
(get-char port)
|
||||
(set! %grapheme-width new-width)
|
||||
(set! %glyphs-reverse new-glyphs-reverse)
|
||||
(iterate-through-grapheme next-state)))))))
|
||||
|
||||
;; Officially induct this char into the cluster
|
||||
(get-char port)
|
||||
(set! %grapheme-width new-width)
|
||||
(set! %glyphs-reverse new-glyphs-reverse)
|
||||
(iterate-through-grapheme state)))))))
|
||||
;; Need to explicitly get the first state
|
||||
;; Skip all this if it's an empty stream at start
|
||||
(if hit-eof?
|
||||
stream-null
|
||||
(let ((first-width first-prop (char-width first-glyph)))
|
||||
(set! %grapheme-width first-width)
|
||||
(iterate-through-grapheme (state-machine 'any first-prop))
|
||||
|
||||
(define final-state (iterate-through-grapheme 'any))
|
||||
(let ((grapheme (make-grapheme
|
||||
%grapheme-width
|
||||
;; Delay to avoid construction of unnecessary lists and strings!
|
||||
(delay (reverse %glyphs-reverse))
|
||||
(delay (reverse-list->string %glyphs-reverse)))))
|
||||
|
||||
(if
|
||||
hit-eof
|
||||
stream-null
|
||||
(stream-cons
|
||||
(make-grapheme
|
||||
%grapheme-width
|
||||
;; Delay to avoid construction of unnecessary lists and strings!
|
||||
(delay (reverse-list->string %glyphs-reverse))
|
||||
(delay (reverse %glyphs-reverse)))
|
||||
(input->grapheme-stream port))))
|
||||
;; If we hit the eof here, we need one last stream entry, otherwise iterate further
|
||||
(stream-cons grapheme
|
||||
(if hit-eof?
|
||||
stream-null
|
||||
(input->grapheme-stream port)))))))
|
||||
|
|
Loading…
Reference in a new issue