Compare commits
3 commits
main
...
fixing-str
Author | SHA1 | Date | |
---|---|---|---|
dc90b8fd6c | |||
a4938d04e1 | |||
4ac39aebd1 |
5 changed files with 1597 additions and 1569 deletions
|
@ -123,10 +123,10 @@
|
||||||
#:use-module (ice-9 hash-table)
|
#:use-module (ice-9 hash-table)
|
||||||
#:use-module (srfi srfi-1)
|
#:use-module (srfi srfi-1)
|
||||||
#:use-module (uniseg internal)
|
#:use-module (uniseg internal)
|
||||||
|
#:use-module (uniseg emoji)
|
||||||
#:export (,@grapheme-symbol-names
|
#:export (,@grapheme-symbol-names
|
||||||
grapheme-charsets)))
|
grapheme-charsets)))
|
||||||
|
|
||||||
|
|
||||||
(pretty-print
|
(pretty-print
|
||||||
`(define grapheme-ht
|
`(define grapheme-ht
|
||||||
(alist->hashq-table ',(hash-map->list cons grapheme-ht))))
|
(alist->hashq-table ',(hash-map->list cons grapheme-ht))))
|
||||||
|
@ -149,7 +149,10 @@
|
||||||
(let ((f (first pair))
|
(let ((f (first pair))
|
||||||
(s (second pair)))
|
(s (second pair)))
|
||||||
`(list ',f ,s)))
|
`(list ',f ,s)))
|
||||||
grapheme-sets-and-symbols))))
|
grapheme-sets-and-symbols)
|
||||||
|
;; Need emoji in this set too!
|
||||||
|
(list 'extended-pictographic char-set:emoji-extended-pictographic))))
|
||||||
|
|
||||||
|
|
||||||
(display "\n")
|
(display "\n")
|
||||||
|
|
||||||
|
|
|
@ -41,8 +41,8 @@
|
||||||
(test-equal "double-em-dash is in right place"
|
(test-equal "double-em-dash is in right place"
|
||||||
"⸺" (grapheme-string double-em-dash-grapheme))
|
"⸺" (grapheme-string double-em-dash-grapheme))
|
||||||
|
|
||||||
(test-equal "double-em-dash is double-width"
|
(test-equal "double-em-dash is triple-width"
|
||||||
2 (grapheme-width double-em-dash-grapheme))
|
3 (grapheme-width double-em-dash-grapheme))
|
||||||
|
|
||||||
(advance-stream!)
|
(advance-stream!)
|
||||||
|
|
||||||
|
@ -51,4 +51,9 @@
|
||||||
(test-equal "advancing one goes to the a"
|
(test-equal "advancing one goes to the a"
|
||||||
"a" (grapheme-string a-grapheme))
|
"a" (grapheme-string a-grapheme))
|
||||||
|
|
||||||
|
(define empty-stream (string->grapheme-stream ""))
|
||||||
|
|
||||||
|
(test-equal "a stream of nothing resolves to an empty stream"
|
||||||
|
#t (stream-null? empty-stream))
|
||||||
|
|
||||||
(test-end "tests-graphemes-stream")
|
(test-end "tests-graphemes-stream")
|
||||||
|
|
1256
uniseg/graphemes.scm
1256
uniseg/graphemes.scm
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,6 @@
|
||||||
(define-module (uniseg graphemes stream)
|
(define-module (uniseg graphemes stream)
|
||||||
#:use-module (uniseg)
|
#:use-module (uniseg)
|
||||||
|
#:use-module (uniseg internal)
|
||||||
#:use-module (ice-9 textual-ports)
|
#:use-module (ice-9 textual-ports)
|
||||||
#:use-module (ice-9 match)
|
#:use-module (ice-9 match)
|
||||||
#:use-module (srfi srfi-71)
|
#:use-module (srfi srfi-71)
|
||||||
|
@ -41,23 +42,14 @@
|
||||||
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
|
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
|
||||||
(define (state-machine cur-state cur-prop)
|
(define (state-machine cur-state cur-prop)
|
||||||
(match (list cur-state cur-prop)
|
(match (list cur-state cur-prop)
|
||||||
;; Grapheme boundary #3
|
;; Specifics need to go first, and then non-specifics afterwards
|
||||||
|
;; SPECIFIC RULES - no `_' in the match
|
||||||
|
|
||||||
|
;; Grapheme boundary #3s
|
||||||
(('carriage-return 'line-feed)
|
(('carriage-return 'line-feed)
|
||||||
(values 'control+line-feed #f))
|
(values 'control+line-feed #f))
|
||||||
|
|
||||||
;; Grapheme boundary #4
|
;; Grapheme boundary #6s
|
||||||
((or ('carriage-return _)
|
|
||||||
('control+line-feed _))
|
|
||||||
(values 'any #t))
|
|
||||||
|
|
||||||
;; Grapheme boundary #5
|
|
||||||
((_ 'carriage-return)
|
|
||||||
(values 'carriage-return #t))
|
|
||||||
((or (_ 'line-feed)
|
|
||||||
(_ 'control))
|
|
||||||
(values 'control+line-feed #t))
|
|
||||||
|
|
||||||
;; Grapheme boundary #6
|
|
||||||
(('hangul-syllable-l 'hangul-syllable-l)
|
(('hangul-syllable-l 'hangul-syllable-l)
|
||||||
(values 'hangul-syllable-l #f))
|
(values 'hangul-syllable-l #f))
|
||||||
((or ('hangul-syllable-l 'hangul-syllable-v)
|
((or ('hangul-syllable-l 'hangul-syllable-v)
|
||||||
|
@ -65,95 +57,126 @@
|
||||||
(values 'hangul-syllable-lv #f))
|
(values 'hangul-syllable-lv #f))
|
||||||
(('hangul-syllable-l 'hangul-syllable-lvt)
|
(('hangul-syllable-l 'hangul-syllable-lvt)
|
||||||
(values 'hangul-syllable-lvt #f))
|
(values 'hangul-syllable-lvt #f))
|
||||||
((_ 'hangul-syllable-l)
|
|
||||||
(values 'hangul-syllable-l #t))
|
|
||||||
|
|
||||||
;; Grapheme boundary #7
|
;; Grapheme boundary #7s
|
||||||
(('hangul-syllable-lv 'hangul-syllable-v)
|
(('hangul-syllable-lv 'hangul-syllable-v)
|
||||||
(values 'hangul-syllable-lv #f))
|
(values 'hangul-syllable-lv #f))
|
||||||
(('hangul-syllable-lv 'hangul-syllable-t)
|
(('hangul-syllable-lv 'hangul-syllable-t)
|
||||||
(values 'hangul-syllable-lvt #f))
|
(values 'hangul-syllable-lvt #f))
|
||||||
((or (_ 'hangul-syllable-lv)
|
|
||||||
(_ 'hangul-syllable-v))
|
|
||||||
(values 'hangul-syllable-lv #t))
|
|
||||||
|
|
||||||
;; Grapheme boundary #8
|
;; Grapheme boundary #8s
|
||||||
((or (_ 'hangul-syllable-lvt)
|
|
||||||
(_ 'hangul-syllable-t))
|
|
||||||
(values 'hangul-syllable-lvt #t))
|
|
||||||
(('hangul-syllable-lvt 'hangul-syllable-t)
|
(('hangul-syllable-lvt 'hangul-syllable-t)
|
||||||
(values 'hangul-syllable-lvt #f))
|
(values 'hangul-syllable-lvt #f))
|
||||||
|
|
||||||
;; Grapheme boundary #9
|
;; Grapheme boundary #11s (emoji!)
|
||||||
((or (_ 'extend)
|
|
||||||
(_ 'zero-width-joiner))
|
|
||||||
(values 'any #f))
|
|
||||||
|
|
||||||
;; Grapheme boundary #9a
|
|
||||||
((_ 'spacing-mark)
|
|
||||||
(values 'any #f))
|
|
||||||
|
|
||||||
;; Grapheme boundary #9b
|
|
||||||
(('prepend _)
|
|
||||||
(values 'any #f))
|
|
||||||
((_ 'prepend)
|
|
||||||
(values 'prepend #t))
|
|
||||||
|
|
||||||
;; Grapheme boundary #11 (emoji!)
|
|
||||||
(('extended-pictographic 'extend)
|
(('extended-pictographic 'extend)
|
||||||
(values 'extended-pictographic #f))
|
(values 'extended-pictographic #f))
|
||||||
(('extended-pictographic 'zero-width-joiner)
|
(('extended-pictographic 'zero-width-joiner)
|
||||||
(values 'extended-pictographic+zero-width-joiner #f))
|
(values 'extended-pictographic+zero-width-joiner #f))
|
||||||
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
|
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
|
||||||
(values 'extended-pictographic #f))
|
(values 'extended-pictographic #f))
|
||||||
((_ 'extended-pictographic)
|
|
||||||
(values 'extended-pictographic #t))
|
|
||||||
|
|
||||||
|
;; Grapheme boundaries #12s and #13s
|
||||||
;; Grapheme boundaries #12 and #13
|
|
||||||
(('regional-indicator-odd 'regional-indicator)
|
(('regional-indicator-odd 'regional-indicator)
|
||||||
(values 'regioinal-indicator-even #f))
|
(values 'regioinal-indicator-even #f))
|
||||||
(('regional-indicator-even 'regional-indicator)
|
(('regional-indicator-even 'regional-indicator)
|
||||||
(values 'regional-indicator-odd #t))
|
(values 'regional-indicator-odd #t))
|
||||||
|
|
||||||
|
;; NON-SPECIFIC RULES
|
||||||
|
|
||||||
|
;; Grapheme boundary #4n
|
||||||
|
((or ('carriage-return _)
|
||||||
|
('control+line-feed _))
|
||||||
|
(values 'any #t))
|
||||||
|
|
||||||
|
;; Grapheme boundary #5n
|
||||||
|
((_ 'carriage-return)
|
||||||
|
(values 'carriage-return #t))
|
||||||
|
((or (_ 'line-feed)
|
||||||
|
(_ 'control))
|
||||||
|
(values 'control+line-feed #t))
|
||||||
|
((_ 'hangul-syllable-l)
|
||||||
|
(values 'hangul-syllable-l #t))
|
||||||
|
|
||||||
|
;; Grapheme boundary #7n
|
||||||
|
((or (_ 'hangul-syllable-lv)
|
||||||
|
(_ 'hangul-syllable-v))
|
||||||
|
(values 'hangul-syllable-lv #t))
|
||||||
|
|
||||||
|
;; Grapheme boundary #8n
|
||||||
|
((or (_ 'hangul-syllable-lvt)
|
||||||
|
(_ 'hangul-syllable-t))
|
||||||
|
(values 'hangul-syllable-lvt #t))
|
||||||
|
|
||||||
|
;; Grapheme boundary #9n
|
||||||
|
((or (_ 'extend)
|
||||||
|
(_ 'zero-width-joiner))
|
||||||
|
(values 'any #f))
|
||||||
|
|
||||||
|
;; Grapheme boundary #9n-A
|
||||||
|
((_ 'spacing-mark)
|
||||||
|
(values 'any #f))
|
||||||
|
|
||||||
|
;; Grapheme boundary #9n-B
|
||||||
|
(('prepend _)
|
||||||
|
(values 'any #f))
|
||||||
|
((_ 'prepend)
|
||||||
|
(values 'prepend #t))
|
||||||
|
|
||||||
|
;; Grapheme boundary #11n (emoji!)
|
||||||
|
((_ 'extended-pictographic)
|
||||||
|
(values 'extended-pictographic #t))
|
||||||
|
|
||||||
|
;; Grapheme boundaries #12n and #13n
|
||||||
((_ 'regional-indicator)
|
((_ 'regional-indicator)
|
||||||
(values 'regional-indicator-odd #t))
|
(values 'regional-indicator-odd #t))
|
||||||
|
|
||||||
;; Everything else considered a boundeary
|
;; Everything else considered a boundary
|
||||||
(else (values 'any #t))))
|
(else (values 'any #t))))
|
||||||
|
|
||||||
|
;; State variables, initialized with the first glyph we get!
|
||||||
(define glyphs-reverse '())
|
(define first-glyph (get-char port))
|
||||||
(define grapheme-width 0)
|
(define hit-eof? (eof-object? first-glyph))
|
||||||
|
(define %glyphs-reverse (list first-glyph))
|
||||||
(define hit-eof #f)
|
(define %grapheme-width 0)
|
||||||
|
|
||||||
(define (iterate-through-grapheme state)
|
(define (iterate-through-grapheme state)
|
||||||
(define glyph (get-char port))
|
(define glyph (peek-char port))
|
||||||
(if
|
(if (eof-object? glyph)
|
||||||
(eof-object? glyph)
|
|
||||||
(begin
|
(begin
|
||||||
(set! hit-eof #t)
|
(set! hit-eof? #t)
|
||||||
state)
|
state)
|
||||||
(begin
|
(begin
|
||||||
(set! glyphs-reverse (cons glyph glyphs-reverse))
|
|
||||||
(let* ((width property (char-width glyph))
|
(let* ((width property (char-width glyph))
|
||||||
(next-state boundary? (state-machine state property)))
|
(next-state boundary? (state-machine state property)))
|
||||||
|
|
||||||
(set! grapheme-width (+ grapheme-width width))
|
|
||||||
|
|
||||||
(if boundary?
|
(if boundary?
|
||||||
state
|
state
|
||||||
(iterate-through-grapheme state))))))
|
(let ((new-width (+ %grapheme-width width))
|
||||||
|
(new-glyphs-reverse (cons glyph %glyphs-reverse)))
|
||||||
|
|
||||||
(define final-state (iterate-through-grapheme 'any))
|
;; Officially induct this char into the cluster
|
||||||
|
(get-char port)
|
||||||
|
(set! %grapheme-width new-width)
|
||||||
|
(set! %glyphs-reverse new-glyphs-reverse)
|
||||||
|
(iterate-through-grapheme next-state)))))))
|
||||||
|
|
||||||
(if
|
;; Need to explicitly get the first state
|
||||||
hit-eof
|
;; Skip all this if it's an empty stream at start
|
||||||
|
(if hit-eof?
|
||||||
stream-null
|
stream-null
|
||||||
(stream-cons
|
(let ((first-width first-prop (char-width first-glyph)))
|
||||||
(make-grapheme
|
(set! %grapheme-width first-width)
|
||||||
grapheme-width
|
(iterate-through-grapheme (state-machine 'any first-prop))
|
||||||
|
|
||||||
|
(let ((grapheme (make-grapheme
|
||||||
|
%grapheme-width
|
||||||
;; Delay to avoid construction of unnecessary lists and strings!
|
;; Delay to avoid construction of unnecessary lists and strings!
|
||||||
(delay (reverse-list->string glyphs-reverse))
|
(delay (reverse %glyphs-reverse))
|
||||||
(delay (reverse glyphs-reverse)))
|
(delay (reverse-list->string %glyphs-reverse)))))
|
||||||
(input->grapheme-stream port))))
|
|
||||||
|
;; If we hit the eof here, we need one last stream entry, otherwise iterate further
|
||||||
|
(stream-cons grapheme
|
||||||
|
(if hit-eof?
|
||||||
|
stream-null
|
||||||
|
(input->grapheme-stream port)))))))
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
#:use-module (ice-9 textual-ports)
|
#:use-module (ice-9 textual-ports)
|
||||||
#:use-module (ice-9 exceptions)
|
#:use-module (ice-9 exceptions)
|
||||||
#:use-module (ice-9 i18n)
|
#:use-module (ice-9 i18n)
|
||||||
|
#:use-module (ice-9 pretty-print)
|
||||||
#:use-module (web uri)
|
#:use-module (web uri)
|
||||||
#:use-module (web client)
|
#:use-module (web client)
|
||||||
#:use-module (web request)
|
#:use-module (web request)
|
||||||
|
@ -13,6 +14,8 @@
|
||||||
@codepoint-range
|
@codepoint-range
|
||||||
@comment
|
@comment
|
||||||
@ws
|
@ws
|
||||||
|
cpk-values
|
||||||
|
cpk
|
||||||
cons-hash-list!
|
cons-hash-list!
|
||||||
hex-string->integer
|
hex-string->integer
|
||||||
format-exception-msg
|
format-exception-msg
|
||||||
|
@ -95,4 +98,26 @@
|
||||||
(with-input-from-file path
|
(with-input-from-file path
|
||||||
(λ () (get-string-all (current-input-port)))) #\newline))
|
(λ () (get-string-all (current-input-port)))) #\newline))
|
||||||
|
|
||||||
|
;; kludge: emacs mis-renders this as a string, so...
|
||||||
|
(define hash-semicolon-semicolon
|
||||||
|
(list->string '(#\# #\; #\;)))
|
||||||
|
|
||||||
|
;; for debugging
|
||||||
|
(define (cpk . vals)
|
||||||
|
"Peek at values for print debugging, but return 'em"
|
||||||
|
(display hash-semicolon-semicolon (current-error-port))
|
||||||
|
(display " cpk\n" (current-error-port))
|
||||||
|
(pretty-print vals (current-error-port))
|
||||||
|
;; return the last value
|
||||||
|
(last vals))
|
||||||
|
|
||||||
|
(define-syntax-rule (cpk-values print-these ... body)
|
||||||
|
;; Like pk, but supporting multiple value return
|
||||||
|
(call-with-values
|
||||||
|
(lambda () body)
|
||||||
|
(lambda vals
|
||||||
|
(display hash-semicolon-semicolon (current-error-port))
|
||||||
|
(display " cpk-values\n" (current-error-port))
|
||||||
|
(pretty-print (list print-these ... '*values:* vals)
|
||||||
|
(current-error-port))
|
||||||
|
(apply values vals))))
|
||||||
|
|
Loading…
Reference in a new issue