Fixing issues with the stream unit tests #1
5 changed files with 1597 additions and 1569 deletions
|
@ -123,9 +123,9 @@
|
|||
#:use-module (ice-9 hash-table)
|
||||
#:use-module (srfi srfi-1)
|
||||
#:use-module (uniseg internal)
|
||||
#:use-module (uniseg emoji)
|
||||
#:export (,@grapheme-symbol-names
|
||||
grapheme-charsets)))
|
||||
|
||||
grapheme-charsets)))
|
||||
|
||||
(pretty-print
|
||||
`(define grapheme-ht
|
||||
|
@ -149,7 +149,10 @@
|
|||
(let ((f (first pair))
|
||||
(s (second pair)))
|
||||
`(list ',f ,s)))
|
||||
grapheme-sets-and-symbols))))
|
||||
grapheme-sets-and-symbols)
|
||||
;; Need emoji in this set too!
|
||||
(list 'extended-pictographic char-set:emoji-extended-pictographic))))
|
||||
|
||||
|
||||
(display "\n")
|
||||
|
||||
|
|
|
@ -41,8 +41,8 @@
|
|||
(test-equal "double-em-dash is in right place"
|
||||
"⸺" (grapheme-string double-em-dash-grapheme))
|
||||
|
||||
(test-equal "double-em-dash is double-width"
|
||||
2 (grapheme-width double-em-dash-grapheme))
|
||||
(test-equal "double-em-dash is triple-width"
|
||||
3 (grapheme-width double-em-dash-grapheme))
|
||||
|
||||
(advance-stream!)
|
||||
|
||||
|
@ -51,4 +51,9 @@
|
|||
(test-equal "advancing one goes to the a"
|
||||
"a" (grapheme-string a-grapheme))
|
||||
|
||||
(define empty-stream (string->grapheme-stream ""))
|
||||
|
||||
(test-equal "a stream of nothing resolves to an empty stream"
|
||||
#t (stream-null? empty-stream))
|
||||
|
||||
(test-end "tests-graphemes-stream")
|
||||
|
|
2950
uniseg/graphemes.scm
2950
uniseg/graphemes.scm
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,6 @@
|
|||
(define-module (uniseg graphemes stream)
|
||||
#:use-module (uniseg)
|
||||
#:use-module (uniseg internal)
|
||||
#:use-module (ice-9 textual-ports)
|
||||
#:use-module (ice-9 match)
|
||||
#:use-module (srfi srfi-71)
|
||||
|
@ -41,23 +42,14 @@
|
|||
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
|
||||
(define (state-machine cur-state cur-prop)
|
||||
(match (list cur-state cur-prop)
|
||||
;; Grapheme boundary #3
|
||||
;; Specifics need to go first, and then non-specifics afterwards
|
||||
;; SPECIFIC RULES - no `_' in the match
|
||||
|
||||
;; Grapheme boundary #3s
|
||||
(('carriage-return 'line-feed)
|
||||
(values 'control+line-feed #f))
|
||||
|
||||
;; Grapheme boundary #4
|
||||
((or ('carriage-return _)
|
||||
('control+line-feed _))
|
||||
(values 'any #t))
|
||||
|
||||
;; Grapheme boundary #5
|
||||
((_ 'carriage-return)
|
||||
(values 'carriage-return #t))
|
||||
((or (_ 'line-feed)
|
||||
(_ 'control))
|
||||
(values 'control+line-feed #t))
|
||||
|
||||
;; Grapheme boundary #6
|
||||
;; Grapheme boundary #6s
|
||||
(('hangul-syllable-l 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l #f))
|
||||
((or ('hangul-syllable-l 'hangul-syllable-v)
|
||||
|
@ -65,95 +57,126 @@
|
|||
(values 'hangul-syllable-lv #f))
|
||||
(('hangul-syllable-l 'hangul-syllable-lvt)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
((_ 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l #t))
|
||||
|
||||
;; Grapheme boundary #7
|
||||
;; Grapheme boundary #7s
|
||||
(('hangul-syllable-lv 'hangul-syllable-v)
|
||||
(values 'hangul-syllable-lv #f))
|
||||
(('hangul-syllable-lv 'hangul-syllable-t)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
((or (_ 'hangul-syllable-lv)
|
||||
(_ 'hangul-syllable-v))
|
||||
(values 'hangul-syllable-lv #t))
|
||||
|
||||
;; Grapheme boundary #8
|
||||
((or (_ 'hangul-syllable-lvt)
|
||||
(_ 'hangul-syllable-t))
|
||||
(values 'hangul-syllable-lvt #t))
|
||||
;; Grapheme boundary #8s
|
||||
(('hangul-syllable-lvt 'hangul-syllable-t)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
|
||||
;; Grapheme boundary #9
|
||||
((or (_ 'extend)
|
||||
(_ 'zero-width-joiner))
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9a
|
||||
((_ 'spacing-mark)
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9b
|
||||
(('prepend _)
|
||||
(values 'any #f))
|
||||
((_ 'prepend)
|
||||
(values 'prepend #t))
|
||||
|
||||
;; Grapheme boundary #11 (emoji!)
|
||||
;; Grapheme boundary #11s (emoji!)
|
||||
(('extended-pictographic 'extend)
|
||||
(values 'extended-pictographic #f))
|
||||
(('extended-pictographic 'zero-width-joiner)
|
||||
(values 'extended-pictographic+zero-width-joiner #f))
|
||||
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
|
||||
(values 'extended-pictographic #f))
|
||||
((_ 'extended-pictographic)
|
||||
(values 'extended-pictographic #t))
|
||||
|
||||
|
||||
;; Grapheme boundaries #12 and #13
|
||||
;; Grapheme boundaries #12s and #13s
|
||||
(('regional-indicator-odd 'regional-indicator)
|
||||
(values 'regioinal-indicator-even #f))
|
||||
(('regional-indicator-even 'regional-indicator)
|
||||
(values 'regional-indicator-odd #t))
|
||||
|
||||
;; NON-SPECIFIC RULES
|
||||
|
||||
;; Grapheme boundary #4n
|
||||
((or ('carriage-return _)
|
||||
('control+line-feed _))
|
||||
(values 'any #t))
|
||||
|
||||
;; Grapheme boundary #5n
|
||||
((_ 'carriage-return)
|
||||
(values 'carriage-return #t))
|
||||
((or (_ 'line-feed)
|
||||
(_ 'control))
|
||||
(values 'control+line-feed #t))
|
||||
((_ 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l #t))
|
||||
|
||||
;; Grapheme boundary #7n
|
||||
((or (_ 'hangul-syllable-lv)
|
||||
(_ 'hangul-syllable-v))
|
||||
(values 'hangul-syllable-lv #t))
|
||||
|
||||
;; Grapheme boundary #8n
|
||||
((or (_ 'hangul-syllable-lvt)
|
||||
(_ 'hangul-syllable-t))
|
||||
(values 'hangul-syllable-lvt #t))
|
||||
|
||||
;; Grapheme boundary #9n
|
||||
((or (_ 'extend)
|
||||
(_ 'zero-width-joiner))
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9n-A
|
||||
((_ 'spacing-mark)
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9n-B
|
||||
(('prepend _)
|
||||
(values 'any #f))
|
||||
((_ 'prepend)
|
||||
(values 'prepend #t))
|
||||
|
||||
;; Grapheme boundary #11n (emoji!)
|
||||
((_ 'extended-pictographic)
|
||||
(values 'extended-pictographic #t))
|
||||
|
||||
;; Grapheme boundaries #12n and #13n
|
||||
((_ 'regional-indicator)
|
||||
(values 'regional-indicator-odd #t))
|
||||
|
||||
;; Everything else considered a boundeary
|
||||
;; Everything else considered a boundary
|
||||
(else (values 'any #t))))
|
||||
|
||||
|
||||
(define glyphs-reverse '())
|
||||
(define grapheme-width 0)
|
||||
|
||||
(define hit-eof #f)
|
||||
;; State variables, initialized with the first glyph we get!
|
||||
(define first-glyph (get-char port))
|
||||
(define hit-eof? (eof-object? first-glyph))
|
||||
(define %glyphs-reverse (list first-glyph))
|
||||
(define %grapheme-width 0)
|
||||
|
||||
(define (iterate-through-grapheme state)
|
||||
(define glyph (get-char port))
|
||||
(if
|
||||
(eof-object? glyph)
|
||||
(begin
|
||||
(set! hit-eof #t)
|
||||
state)
|
||||
(begin
|
||||
(set! glyphs-reverse (cons glyph glyphs-reverse))
|
||||
(let* ((width property (char-width glyph))
|
||||
(next-state boundary? (state-machine state property)))
|
||||
(define glyph (peek-char port))
|
||||
(if (eof-object? glyph)
|
||||
(begin
|
||||
(set! hit-eof? #t)
|
||||
state)
|
||||
(begin
|
||||
(let* ((width property (char-width glyph))
|
||||
(next-state boundary? (state-machine state property)))
|
||||
|
||||
(set! grapheme-width (+ grapheme-width width))
|
||||
(if boundary?
|
||||
state
|
||||
(let ((new-width (+ %grapheme-width width))
|
||||
(new-glyphs-reverse (cons glyph %glyphs-reverse)))
|
||||
|
||||
(if boundary?
|
||||
state
|
||||
(iterate-through-grapheme state))))))
|
||||
;; Officially induct this char into the cluster
|
||||
(get-char port)
|
||||
(set! %grapheme-width new-width)
|
||||
(set! %glyphs-reverse new-glyphs-reverse)
|
||||
(iterate-through-grapheme next-state)))))))
|
||||
|
||||
(define final-state (iterate-through-grapheme 'any))
|
||||
;; Need to explicitly get the first state
|
||||
;; Skip all this if it's an empty stream at start
|
||||
(if hit-eof?
|
||||
stream-null
|
||||
(let ((first-width first-prop (char-width first-glyph)))
|
||||
(set! %grapheme-width first-width)
|
||||
(iterate-through-grapheme (state-machine 'any first-prop))
|
||||
|
||||
(if
|
||||
hit-eof
|
||||
stream-null
|
||||
(stream-cons
|
||||
(make-grapheme
|
||||
grapheme-width
|
||||
;; Delay to avoid construction of unnecessary lists and strings!
|
||||
(delay (reverse-list->string glyphs-reverse))
|
||||
(delay (reverse glyphs-reverse)))
|
||||
(input->grapheme-stream port))))
|
||||
(let ((grapheme (make-grapheme
|
||||
%grapheme-width
|
||||
;; Delay to avoid construction of unnecessary lists and strings!
|
||||
(delay (reverse %glyphs-reverse))
|
||||
(delay (reverse-list->string %glyphs-reverse)))))
|
||||
|
||||
;; If we hit the eof here, we need one last stream entry, otherwise iterate further
|
||||
(stream-cons grapheme
|
||||
(if hit-eof?
|
||||
stream-null
|
||||
(input->grapheme-stream port)))))))
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
#:use-module (ice-9 textual-ports)
|
||||
#:use-module (ice-9 exceptions)
|
||||
#:use-module (ice-9 i18n)
|
||||
#:use-module (ice-9 pretty-print)
|
||||
#:use-module (web uri)
|
||||
#:use-module (web client)
|
||||
#:use-module (web request)
|
||||
|
@ -13,6 +14,8 @@
|
|||
@codepoint-range
|
||||
@comment
|
||||
@ws
|
||||
cpk-values
|
||||
cpk
|
||||
cons-hash-list!
|
||||
hex-string->integer
|
||||
format-exception-msg
|
||||
|
@ -95,4 +98,26 @@
|
|||
(with-input-from-file path
|
||||
(λ () (get-string-all (current-input-port)))) #\newline))
|
||||
|
||||
;; kludge: emacs mis-renders this as a string, so...
|
||||
(define hash-semicolon-semicolon
|
||||
(list->string '(#\# #\; #\;)))
|
||||
|
||||
;; for debugging
|
||||
(define (cpk . vals)
|
||||
"Peek at values for print debugging, but return 'em"
|
||||
(display hash-semicolon-semicolon (current-error-port))
|
||||
(display " cpk\n" (current-error-port))
|
||||
(pretty-print vals (current-error-port))
|
||||
;; return the last value
|
||||
(last vals))
|
||||
|
||||
(define-syntax-rule (cpk-values print-these ... body)
|
||||
;; Like pk, but supporting multiple value return
|
||||
(call-with-values
|
||||
(lambda () body)
|
||||
(lambda vals
|
||||
(display hash-semicolon-semicolon (current-error-port))
|
||||
(display " cpk-values\n" (current-error-port))
|
||||
(pretty-print (list print-these ... '*values:* vals)
|
||||
(current-error-port))
|
||||
(apply values vals))))
|
||||
|
|
Loading…
Reference in a new issue