Yet another reorganization, and solve Christine's 'rude problem'
- We change the stream iterator to *always* return a grapheme (except for EOF). The grapheme then gets built up over time. - This way, trans flag for example is first white flag, then white flag + zwj, etc until it finally transforms into the trans flag. - Users of the stream library can then use the `modification?' flag to determine if the stream value is a modification of the prior grapheme instead of a new grapheme. - Abstracted iteration to an iterator object to support use cases where we don't have an input stream (reflow needs this!)
This commit is contained in:
parent
1c0c4634f9
commit
31012d5b8f
29
hall.scm
29
hall.scm
|
@ -19,20 +19,27 @@
|
||||||
(native-language-support #f)
|
(native-language-support #f)
|
||||||
(licensing #f)))
|
(licensing #f)))
|
||||||
(files (libraries
|
(files (libraries
|
||||||
((scheme-file "uniseg")
|
((directory
|
||||||
(directory
|
|
||||||
"uniseg"
|
"uniseg"
|
||||||
((scheme-file "emoji")
|
((directory
|
||||||
|
"charsets"
|
||||||
|
((scheme-file "emoji")
|
||||||
|
(scheme-file "eastasian")
|
||||||
|
(scheme-file "graphemes")))
|
||||||
|
(directory
|
||||||
|
"graphemes"
|
||||||
|
((scheme-file "iterator") (scheme-file "stream")))
|
||||||
(directory "eastasian" ((scheme-file "locale")))
|
(directory "eastasian" ((scheme-file "locale")))
|
||||||
(scheme-file "eastasian")
|
|
||||||
(directory "graphemes" ((scheme-file "stream")))
|
|
||||||
(scheme-file "graphemes")
|
(scheme-file "graphemes")
|
||||||
(scheme-file "internal")))))
|
(scheme-file "internal")))
|
||||||
(tests ((directory
|
(scheme-file "uniseg")))
|
||||||
"tests"
|
(tests
|
||||||
((scheme-file "test-eastasian-locale")
|
((directory
|
||||||
(scheme-file "test-uniseg")
|
"tests"
|
||||||
(scheme-file "test-graphemes-stream")))))
|
((scheme-file "test-eastasian-locale")
|
||||||
|
(scheme-file "test-uniseg")
|
||||||
|
(scheme-file "test-graphemes-stream")
|
||||||
|
(scheme-file "test-graphemes-iterator")))))
|
||||||
(programs
|
(programs
|
||||||
((directory
|
((directory
|
||||||
"scripts"
|
"scripts"
|
||||||
|
|
|
@ -44,7 +44,7 @@
|
||||||
(λ (prop) (symbol-with-prefix "char-set:eastasian-" prop))
|
(λ (prop) (symbol-with-prefix "char-set:eastasian-" prop))
|
||||||
eastasian-properties))
|
eastasian-properties))
|
||||||
|
|
||||||
(define file "uniseg/eastasian.scm")
|
(define file "uniseg/charsets/eastasian.scm")
|
||||||
|
|
||||||
(format stdout "Writing to ~a...\n" file)
|
(format stdout "Writing to ~a...\n" file)
|
||||||
|
|
||||||
|
@ -53,7 +53,7 @@
|
||||||
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
|
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
|
||||||
|
|
||||||
(pretty-print
|
(pretty-print
|
||||||
`(define-module (uniseg eastasian)
|
`(define-module (uniseg charsets eastasian)
|
||||||
#:use-module (uniseg internal)
|
#:use-module (uniseg internal)
|
||||||
#:use-module (ice-9 hash-table)
|
#:use-module (ice-9 hash-table)
|
||||||
#:use-module (srfi srfi-1)
|
#:use-module (srfi srfi-1)
|
||||||
|
|
|
@ -40,7 +40,7 @@
|
||||||
("Emoji_Component" 'emoji-component)
|
("Emoji_Component" 'emoji-component)
|
||||||
("Extended_Pictographic" 'emoji-extended-pictographic)))
|
("Extended_Pictographic" 'emoji-extended-pictographic)))
|
||||||
|
|
||||||
(define file "uniseg/emoji.scm")
|
(define file "uniseg/charsets/emoji.scm")
|
||||||
|
|
||||||
(format stdout "Writing to ~a...\n" file)
|
(format stdout "Writing to ~a...\n" file)
|
||||||
|
|
||||||
|
@ -49,7 +49,7 @@
|
||||||
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
|
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
|
||||||
|
|
||||||
(pretty-print
|
(pretty-print
|
||||||
`(define-module (uniseg emoji)
|
`(define-module (uniseg charsets emoji)
|
||||||
#:use-module (uniseg internal)
|
#:use-module (uniseg internal)
|
||||||
#:use-module (ice-9 hash-table)
|
#:use-module (ice-9 hash-table)
|
||||||
#:use-module (srfi srfi-1)
|
#:use-module (srfi srfi-1)
|
||||||
|
|
|
@ -56,7 +56,7 @@
|
||||||
("SpacingMark" 'spacing-mark)
|
("SpacingMark" 'spacing-mark)
|
||||||
("ZWJ" 'zero-width-joiner)))
|
("ZWJ" 'zero-width-joiner)))
|
||||||
|
|
||||||
(define file "uniseg/graphemes.scm")
|
(define file "uniseg/charsets/graphemes.scm")
|
||||||
|
|
||||||
(format stdout "Writing to ~a...\n" file)
|
(format stdout "Writing to ~a...\n" file)
|
||||||
|
|
||||||
|
@ -65,11 +65,11 @@
|
||||||
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
|
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
|
||||||
|
|
||||||
(pretty-print
|
(pretty-print
|
||||||
`(define-module (uniseg graphemes)
|
`(define-module (uniseg charsets graphemes)
|
||||||
#:use-module (ice-9 hash-table)
|
#:use-module (ice-9 hash-table)
|
||||||
#:use-module (srfi srfi-1)
|
#:use-module (srfi srfi-1)
|
||||||
#:use-module (uniseg internal)
|
#:use-module (uniseg internal)
|
||||||
#:use-module (uniseg emoji)
|
#:use-module (uniseg charsets emoji)
|
||||||
#:export (,@grapheme-symbols
|
#:export (,@grapheme-symbols
|
||||||
grapheme-charsets)))
|
grapheme-charsets)))
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
(define-module (tests test-graphemes-stream)
|
(define-module (tests test-graphemes-stream)
|
||||||
|
#:use-module (uniseg graphemes)
|
||||||
#:use-module (uniseg graphemes stream)
|
#:use-module (uniseg graphemes stream)
|
||||||
|
#:use-module (uniseg internal)
|
||||||
#:use-module (srfi srfi-41)
|
#:use-module (srfi srfi-41)
|
||||||
#:use-module (srfi srfi-64))
|
#:use-module (srfi srfi-64))
|
||||||
|
|
||||||
|
@ -10,11 +12,10 @@
|
||||||
|
|
||||||
(define* (advance-stream! #:optional (times 1))
|
(define* (advance-stream! #:optional (times 1))
|
||||||
(for-each
|
(for-each
|
||||||
(λ (_)
|
(λ (_) (set! stream (stream-cdr stream)))
|
||||||
(set! stream (stream-cdr stream)))
|
|
||||||
(make-list times)))
|
(make-list times)))
|
||||||
|
|
||||||
(advance-stream! 6)
|
(advance-stream! 10)
|
||||||
|
|
||||||
(define trans-flag-grapheme (stream-car stream))
|
(define trans-flag-grapheme (stream-car stream))
|
||||||
|
|
||||||
|
@ -56,4 +57,11 @@
|
||||||
(test-equal "a stream of nothing resolves to an empty stream"
|
(test-equal "a stream of nothing resolves to an empty stream"
|
||||||
#t (stream-null? empty-stream))
|
#t (stream-null? empty-stream))
|
||||||
|
|
||||||
|
(define singleton-stream (string->grapheme-stream "a"))
|
||||||
|
|
||||||
|
(set! a-grapheme (stream-car stream))
|
||||||
|
|
||||||
|
(test-equal "a stream with a single character resolves to a grapheme"
|
||||||
|
"a" (grapheme-string a-grapheme))
|
||||||
|
|
||||||
(test-end "tests-graphemes-stream")
|
(test-end "tests-graphemes-stream")
|
||||||
|
|
|
@ -2,10 +2,11 @@
|
||||||
#:use-module (srfi srfi-1)
|
#:use-module (srfi srfi-1)
|
||||||
#:use-module (ice-9 match)
|
#:use-module (ice-9 match)
|
||||||
#:use-module (srfi srfi-41)
|
#:use-module (srfi srfi-41)
|
||||||
#:use-module (uniseg emoji)
|
|
||||||
#:use-module (uniseg graphemes)
|
#:use-module (uniseg graphemes)
|
||||||
#:use-module (uniseg graphemes stream)
|
#:use-module (uniseg graphemes stream)
|
||||||
#:use-module (uniseg eastasian)
|
#:use-module (uniseg charsets emoji)
|
||||||
|
#:use-module (uniseg charsets eastasian)
|
||||||
|
#:use-module (uniseg charsets graphemes)
|
||||||
#:export (emoji?
|
#:export (emoji?
|
||||||
char->grapheme-property
|
char->grapheme-property
|
||||||
char->eastasian-property
|
char->eastasian-property
|
||||||
|
@ -99,6 +100,6 @@
|
||||||
"Get the width of a string by adding up the widths of each grapheme"
|
"Get the width of a string by adding up the widths of each grapheme"
|
||||||
(stream-fold
|
(stream-fold
|
||||||
(λ (val grapheme)
|
(λ (val grapheme)
|
||||||
(+ val (grapheme-width grapheme)))
|
(+ val (grapheme-delta-width grapheme)))
|
||||||
0
|
0
|
||||||
(string->grapheme-stream str)))
|
(string->grapheme-stream str)))
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
;; Code generated by generate-eastasian. DO NOT EDIT
|
;; Code generated by generate-eastasian. DO NOT EDIT
|
||||||
|
|
||||||
(define-module
|
(define-module
|
||||||
(uniseg eastasian)
|
(uniseg charsets eastasian)
|
||||||
#:use-module
|
#:use-module
|
||||||
(uniseg internal)
|
(uniseg internal)
|
||||||
#:use-module
|
#:use-module
|
||||||
|
@ -18,7 +18,50 @@
|
||||||
eastasian-charsets))
|
eastasian-charsets))
|
||||||
(define hashtable
|
(define hashtable
|
||||||
(alist->hashq-table
|
(alist->hashq-table
|
||||||
'((combining
|
'((narrow
|
||||||
|
(10630 10630)
|
||||||
|
(10629 10629)
|
||||||
|
(10221 10221)
|
||||||
|
(10220 10220)
|
||||||
|
(10219 10219)
|
||||||
|
(10218 10218)
|
||||||
|
(10217 10217)
|
||||||
|
(10216 10216)
|
||||||
|
(10215 10215)
|
||||||
|
(10214 10214)
|
||||||
|
(175 175)
|
||||||
|
(172 172)
|
||||||
|
(166 166)
|
||||||
|
(165 165)
|
||||||
|
(162 163)
|
||||||
|
(126 126)
|
||||||
|
(125 125)
|
||||||
|
(124 124)
|
||||||
|
(123 123)
|
||||||
|
(97 122)
|
||||||
|
(96 96)
|
||||||
|
(95 95)
|
||||||
|
(94 94)
|
||||||
|
(93 93)
|
||||||
|
(92 92)
|
||||||
|
(91 91)
|
||||||
|
(65 90)
|
||||||
|
(63 64)
|
||||||
|
(60 62)
|
||||||
|
(58 59)
|
||||||
|
(48 57)
|
||||||
|
(46 47)
|
||||||
|
(45 45)
|
||||||
|
(44 44)
|
||||||
|
(43 43)
|
||||||
|
(42 42)
|
||||||
|
(41 41)
|
||||||
|
(40 40)
|
||||||
|
(37 39)
|
||||||
|
(36 36)
|
||||||
|
(33 35)
|
||||||
|
(32 32))
|
||||||
|
(combining
|
||||||
(125136 125142)
|
(125136 125142)
|
||||||
(122918 122922)
|
(122918 122922)
|
||||||
(122915 122916)
|
(122915 122916)
|
||||||
|
@ -71,67 +114,6 @@
|
||||||
(1160 1161)
|
(1160 1161)
|
||||||
(1155 1159)
|
(1155 1159)
|
||||||
(768 879))
|
(768 879))
|
||||||
(halfwidth
|
|
||||||
(65517 65518)
|
|
||||||
(65513 65516)
|
|
||||||
(65512 65512)
|
|
||||||
(65498 65500)
|
|
||||||
(65490 65495)
|
|
||||||
(65482 65487)
|
|
||||||
(65474 65479)
|
|
||||||
(65440 65470)
|
|
||||||
(65438 65439)
|
|
||||||
(65393 65437)
|
|
||||||
(65392 65392)
|
|
||||||
(65382 65391)
|
|
||||||
(65380 65381)
|
|
||||||
(65379 65379)
|
|
||||||
(65378 65378)
|
|
||||||
(65377 65377)
|
|
||||||
(8361 8361))
|
|
||||||
(narrow
|
|
||||||
(10630 10630)
|
|
||||||
(10629 10629)
|
|
||||||
(10221 10221)
|
|
||||||
(10220 10220)
|
|
||||||
(10219 10219)
|
|
||||||
(10218 10218)
|
|
||||||
(10217 10217)
|
|
||||||
(10216 10216)
|
|
||||||
(10215 10215)
|
|
||||||
(10214 10214)
|
|
||||||
(175 175)
|
|
||||||
(172 172)
|
|
||||||
(166 166)
|
|
||||||
(165 165)
|
|
||||||
(162 163)
|
|
||||||
(126 126)
|
|
||||||
(125 125)
|
|
||||||
(124 124)
|
|
||||||
(123 123)
|
|
||||||
(97 122)
|
|
||||||
(96 96)
|
|
||||||
(95 95)
|
|
||||||
(94 94)
|
|
||||||
(93 93)
|
|
||||||
(92 92)
|
|
||||||
(91 91)
|
|
||||||
(65 90)
|
|
||||||
(63 64)
|
|
||||||
(60 62)
|
|
||||||
(58 59)
|
|
||||||
(48 57)
|
|
||||||
(46 47)
|
|
||||||
(45 45)
|
|
||||||
(44 44)
|
|
||||||
(43 43)
|
|
||||||
(42 42)
|
|
||||||
(41 41)
|
|
||||||
(40 40)
|
|
||||||
(37 39)
|
|
||||||
(36 36)
|
|
||||||
(33 35)
|
|
||||||
(32 32))
|
|
||||||
(ambiguous
|
(ambiguous
|
||||||
(1048576 1114109)
|
(1048576 1114109)
|
||||||
(983040 1048573)
|
(983040 1048573)
|
||||||
|
@ -330,6 +312,24 @@
|
||||||
(167 167)
|
(167 167)
|
||||||
(164 164)
|
(164 164)
|
||||||
(161 161))
|
(161 161))
|
||||||
|
(halfwidth
|
||||||
|
(65517 65518)
|
||||||
|
(65513 65516)
|
||||||
|
(65512 65512)
|
||||||
|
(65498 65500)
|
||||||
|
(65490 65495)
|
||||||
|
(65482 65487)
|
||||||
|
(65474 65479)
|
||||||
|
(65440 65470)
|
||||||
|
(65438 65439)
|
||||||
|
(65393 65437)
|
||||||
|
(65392 65392)
|
||||||
|
(65382 65391)
|
||||||
|
(65380 65381)
|
||||||
|
(65379 65379)
|
||||||
|
(65378 65378)
|
||||||
|
(65377 65377)
|
||||||
|
(8361 8361))
|
||||||
(neutral
|
(neutral
|
||||||
(917536 917631)
|
(917536 917631)
|
||||||
(917505 917505)
|
(917505 917505)
|
|
@ -1,7 +1,7 @@
|
||||||
;; Code generated by generate-emoji. DO NOT EDIT
|
;; Code generated by generate-emoji. DO NOT EDIT
|
||||||
|
|
||||||
(define-module
|
(define-module
|
||||||
(uniseg emoji)
|
(uniseg charsets emoji)
|
||||||
#:use-module
|
#:use-module
|
||||||
(uniseg internal)
|
(uniseg internal)
|
||||||
#:use-module
|
#:use-module
|
||||||
|
@ -18,7 +18,269 @@
|
||||||
emoji-charsets))
|
emoji-charsets))
|
||||||
(define hashtable
|
(define hashtable
|
||||||
(alist->hashq-table
|
(alist->hashq-table
|
||||||
'((emoji-modifier-base
|
'((emoji-presentation
|
||||||
|
(129744 129750)
|
||||||
|
(129728 129730)
|
||||||
|
(129712 129718)
|
||||||
|
(129686 129704)
|
||||||
|
(129680 129685)
|
||||||
|
(129667 129670)
|
||||||
|
(129664 129666)
|
||||||
|
(129656 129658)
|
||||||
|
(129652 129652)
|
||||||
|
(129648 129651)
|
||||||
|
(129511 129535)
|
||||||
|
(129488 129510)
|
||||||
|
(129485 129487)
|
||||||
|
(129483 129483)
|
||||||
|
(129475 129482)
|
||||||
|
(129473 129474)
|
||||||
|
(129472 129472)
|
||||||
|
(129466 129471)
|
||||||
|
(129456 129465)
|
||||||
|
(129454 129455)
|
||||||
|
(129451 129453)
|
||||||
|
(129445 129450)
|
||||||
|
(129443 129444)
|
||||||
|
(129432 129442)
|
||||||
|
(129426 129431)
|
||||||
|
(129413 129425)
|
||||||
|
(129408 129412)
|
||||||
|
(129404 129407)
|
||||||
|
(129403 129403)
|
||||||
|
(129402 129402)
|
||||||
|
(129399 129400)
|
||||||
|
(129395 129398)
|
||||||
|
(129394 129394)
|
||||||
|
(129393 129393)
|
||||||
|
(129388 129392)
|
||||||
|
(129375 129387)
|
||||||
|
(129360 129374)
|
||||||
|
(129357 129359)
|
||||||
|
(129356 129356)
|
||||||
|
(129351 129355)
|
||||||
|
(129344 129349)
|
||||||
|
(129343 129343)
|
||||||
|
(129340 129342)
|
||||||
|
(129331 129338)
|
||||||
|
(129329 129330)
|
||||||
|
(129328 129328)
|
||||||
|
(129320 129327)
|
||||||
|
(129312 129319)
|
||||||
|
(129311 129311)
|
||||||
|
(129305 129310)
|
||||||
|
(129296 129304)
|
||||||
|
(129293 129295)
|
||||||
|
(129292 129292)
|
||||||
|
(128992 129003)
|
||||||
|
(128763 128764)
|
||||||
|
(128762 128762)
|
||||||
|
(128761 128761)
|
||||||
|
(128759 128760)
|
||||||
|
(128756 128758)
|
||||||
|
(128747 128748)
|
||||||
|
(128726 128727)
|
||||||
|
(128725 128725)
|
||||||
|
(128721 128722)
|
||||||
|
(128720 128720)
|
||||||
|
(128716 128716)
|
||||||
|
(128705 128709)
|
||||||
|
(128704 128704)
|
||||||
|
(128703 128703)
|
||||||
|
(128697 128702)
|
||||||
|
(128695 128696)
|
||||||
|
(128694 128694)
|
||||||
|
(128691 128693)
|
||||||
|
(128690 128690)
|
||||||
|
(128686 128689)
|
||||||
|
(128679 128685)
|
||||||
|
(128678 128678)
|
||||||
|
(128676 128677)
|
||||||
|
(128675 128675)
|
||||||
|
(128674 128674)
|
||||||
|
(128667 128673)
|
||||||
|
(128665 128666)
|
||||||
|
(128664 128664)
|
||||||
|
(128663 128663)
|
||||||
|
(128662 128662)
|
||||||
|
(128661 128661)
|
||||||
|
(128660 128660)
|
||||||
|
(128657 128659)
|
||||||
|
(128656 128656)
|
||||||
|
(128655 128655)
|
||||||
|
(128654 128654)
|
||||||
|
(128653 128653)
|
||||||
|
(128652 128652)
|
||||||
|
(128650 128651)
|
||||||
|
(128649 128649)
|
||||||
|
(128648 128648)
|
||||||
|
(128647 128647)
|
||||||
|
(128646 128646)
|
||||||
|
(128643 128645)
|
||||||
|
(128641 128642)
|
||||||
|
(128640 128640)
|
||||||
|
(128581 128591)
|
||||||
|
(128577 128580)
|
||||||
|
(128567 128576)
|
||||||
|
(128566 128566)
|
||||||
|
(128565 128565)
|
||||||
|
(128564 128564)
|
||||||
|
(128560 128563)
|
||||||
|
(128558 128559)
|
||||||
|
(128557 128557)
|
||||||
|
(128556 128556)
|
||||||
|
(128552 128555)
|
||||||
|
(128550 128551)
|
||||||
|
(128544 128549)
|
||||||
|
(128543 128543)
|
||||||
|
(128540 128542)
|
||||||
|
(128539 128539)
|
||||||
|
(128538 128538)
|
||||||
|
(128537 128537)
|
||||||
|
(128536 128536)
|
||||||
|
(128535 128535)
|
||||||
|
(128534 128534)
|
||||||
|
(128533 128533)
|
||||||
|
(128530 128532)
|
||||||
|
(128529 128529)
|
||||||
|
(128528 128528)
|
||||||
|
(128527 128527)
|
||||||
|
(128526 128526)
|
||||||
|
(128521 128525)
|
||||||
|
(128519 128520)
|
||||||
|
(128513 128518)
|
||||||
|
(128512 128512)
|
||||||
|
(128507 128511)
|
||||||
|
(128420 128420)
|
||||||
|
(128405 128406)
|
||||||
|
(128378 128378)
|
||||||
|
(128348 128359)
|
||||||
|
(128336 128347)
|
||||||
|
(128331 128334)
|
||||||
|
(128302 128317)
|
||||||
|
(128300 128301)
|
||||||
|
(128278 128299)
|
||||||
|
(128277 128277)
|
||||||
|
(128266 128276)
|
||||||
|
(128265 128265)
|
||||||
|
(128264 128264)
|
||||||
|
(128260 128263)
|
||||||
|
(128259 128259)
|
||||||
|
(128255 128258)
|
||||||
|
(128249 128252)
|
||||||
|
(128248 128248)
|
||||||
|
(128246 128247)
|
||||||
|
(128245 128245)
|
||||||
|
(128240 128244)
|
||||||
|
(128239 128239)
|
||||||
|
(128238 128238)
|
||||||
|
(128236 128237)
|
||||||
|
(128184 128235)
|
||||||
|
(128182 128183)
|
||||||
|
(128174 128181)
|
||||||
|
(128173 128173)
|
||||||
|
(128110 128172)
|
||||||
|
(128108 128109)
|
||||||
|
(128102 128107)
|
||||||
|
(128101 128101)
|
||||||
|
(128066 128100)
|
||||||
|
(128064 128064)
|
||||||
|
(128043 128062)
|
||||||
|
(128042 128042)
|
||||||
|
(128023 128041)
|
||||||
|
(128022 128022)
|
||||||
|
(128021 128021)
|
||||||
|
(128020 128020)
|
||||||
|
(128019 128019)
|
||||||
|
(128017 128018)
|
||||||
|
(128015 128016)
|
||||||
|
(128012 128014)
|
||||||
|
(128009 128011)
|
||||||
|
(128008 128008)
|
||||||
|
(127992 128007)
|
||||||
|
(127988 127988)
|
||||||
|
(127973 127984)
|
||||||
|
(127972 127972)
|
||||||
|
(127968 127971)
|
||||||
|
(127951 127955)
|
||||||
|
(127946 127946)
|
||||||
|
(127945 127945)
|
||||||
|
(127944 127944)
|
||||||
|
(127943 127943)
|
||||||
|
(127942 127942)
|
||||||
|
(127941 127941)
|
||||||
|
(127904 127940)
|
||||||
|
(127872 127891)
|
||||||
|
(127870 127871)
|
||||||
|
(127868 127868)
|
||||||
|
(127825 127867)
|
||||||
|
(127824 127824)
|
||||||
|
(127820 127823)
|
||||||
|
(127819 127819)
|
||||||
|
(127799 127818)
|
||||||
|
(127796 127797)
|
||||||
|
(127794 127795)
|
||||||
|
(127792 127793)
|
||||||
|
(127789 127791)
|
||||||
|
(127775 127776)
|
||||||
|
(127773 127774)
|
||||||
|
(127772 127772)
|
||||||
|
(127771 127771)
|
||||||
|
(127770 127770)
|
||||||
|
(127769 127769)
|
||||||
|
(127766 127768)
|
||||||
|
(127763 127765)
|
||||||
|
(127762 127762)
|
||||||
|
(127761 127761)
|
||||||
|
(127760 127760)
|
||||||
|
(127759 127759)
|
||||||
|
(127757 127758)
|
||||||
|
(127744 127756)
|
||||||
|
(127568 127569)
|
||||||
|
(127544 127546)
|
||||||
|
(127538 127542)
|
||||||
|
(127535 127535)
|
||||||
|
(127514 127514)
|
||||||
|
(127489 127489)
|
||||||
|
(127462 127487)
|
||||||
|
(127377 127386)
|
||||||
|
(127374 127374)
|
||||||
|
(127183 127183)
|
||||||
|
(126980 126980)
|
||||||
|
(11093 11093)
|
||||||
|
(11088 11088)
|
||||||
|
(11035 11036)
|
||||||
|
(10175 10175)
|
||||||
|
(10160 10160)
|
||||||
|
(10133 10135)
|
||||||
|
(10071 10071)
|
||||||
|
(10067 10069)
|
||||||
|
(10062 10062)
|
||||||
|
(10060 10060)
|
||||||
|
(10024 10024)
|
||||||
|
(9994 9995)
|
||||||
|
(9989 9989)
|
||||||
|
(9981 9981)
|
||||||
|
(9978 9978)
|
||||||
|
(9973 9973)
|
||||||
|
(9970 9971)
|
||||||
|
(9962 9962)
|
||||||
|
(9940 9940)
|
||||||
|
(9934 9934)
|
||||||
|
(9924 9925)
|
||||||
|
(9917 9918)
|
||||||
|
(9898 9899)
|
||||||
|
(9889 9889)
|
||||||
|
(9875 9875)
|
||||||
|
(9855 9855)
|
||||||
|
(9800 9811)
|
||||||
|
(9748 9749)
|
||||||
|
(9725 9726)
|
||||||
|
(9203 9203)
|
||||||
|
(9200 9200)
|
||||||
|
(9193 9196)
|
||||||
|
(8986 8987))
|
||||||
|
(emoji-modifier-base
|
||||||
(129489 129501)
|
(129489 129501)
|
||||||
(129485 129487)
|
(129485 129487)
|
||||||
(129467 129467)
|
(129467 129467)
|
||||||
|
@ -449,7 +711,6 @@
|
||||||
(48 57)
|
(48 57)
|
||||||
(42 42)
|
(42 42)
|
||||||
(35 35))
|
(35 35))
|
||||||
(emoji-modifier (127995 127999))
|
|
||||||
(emoji-extended-pictographic
|
(emoji-extended-pictographic
|
||||||
(130048 131069)
|
(130048 131069)
|
||||||
(129751 129791)
|
(129751 129791)
|
||||||
|
@ -942,268 +1203,6 @@
|
||||||
(8252 8252)
|
(8252 8252)
|
||||||
(174 174)
|
(174 174)
|
||||||
(169 169))
|
(169 169))
|
||||||
(emoji-presentation
|
|
||||||
(129744 129750)
|
|
||||||
(129728 129730)
|
|
||||||
(129712 129718)
|
|
||||||
(129686 129704)
|
|
||||||
(129680 129685)
|
|
||||||
(129667 129670)
|
|
||||||
(129664 129666)
|
|
||||||
(129656 129658)
|
|
||||||
(129652 129652)
|
|
||||||
(129648 129651)
|
|
||||||
(129511 129535)
|
|
||||||
(129488 129510)
|
|
||||||
(129485 129487)
|
|
||||||
(129483 129483)
|
|
||||||
(129475 129482)
|
|
||||||
(129473 129474)
|
|
||||||
(129472 129472)
|
|
||||||
(129466 129471)
|
|
||||||
(129456 129465)
|
|
||||||
(129454 129455)
|
|
||||||
(129451 129453)
|
|
||||||
(129445 129450)
|
|
||||||
(129443 129444)
|
|
||||||
(129432 129442)
|
|
||||||
(129426 129431)
|
|
||||||
(129413 129425)
|
|
||||||
(129408 129412)
|
|
||||||
(129404 129407)
|
|
||||||
(129403 129403)
|
|
||||||
(129402 129402)
|
|
||||||
(129399 129400)
|
|
||||||
(129395 129398)
|
|
||||||
(129394 129394)
|
|
||||||
(129393 129393)
|
|
||||||
(129388 129392)
|
|
||||||
(129375 129387)
|
|
||||||
(129360 129374)
|
|
||||||
(129357 129359)
|
|
||||||
(129356 129356)
|
|
||||||
(129351 129355)
|
|
||||||
(129344 129349)
|
|
||||||
(129343 129343)
|
|
||||||
(129340 129342)
|
|
||||||
(129331 129338)
|
|
||||||
(129329 129330)
|
|
||||||
(129328 129328)
|
|
||||||
(129320 129327)
|
|
||||||
(129312 129319)
|
|
||||||
(129311 129311)
|
|
||||||
(129305 129310)
|
|
||||||
(129296 129304)
|
|
||||||
(129293 129295)
|
|
||||||
(129292 129292)
|
|
||||||
(128992 129003)
|
|
||||||
(128763 128764)
|
|
||||||
(128762 128762)
|
|
||||||
(128761 128761)
|
|
||||||
(128759 128760)
|
|
||||||
(128756 128758)
|
|
||||||
(128747 128748)
|
|
||||||
(128726 128727)
|
|
||||||
(128725 128725)
|
|
||||||
(128721 128722)
|
|
||||||
(128720 128720)
|
|
||||||
(128716 128716)
|
|
||||||
(128705 128709)
|
|
||||||
(128704 128704)
|
|
||||||
(128703 128703)
|
|
||||||
(128697 128702)
|
|
||||||
(128695 128696)
|
|
||||||
(128694 128694)
|
|
||||||
(128691 128693)
|
|
||||||
(128690 128690)
|
|
||||||
(128686 128689)
|
|
||||||
(128679 128685)
|
|
||||||
(128678 128678)
|
|
||||||
(128676 128677)
|
|
||||||
(128675 128675)
|
|
||||||
(128674 128674)
|
|
||||||
(128667 128673)
|
|
||||||
(128665 128666)
|
|
||||||
(128664 128664)
|
|
||||||
(128663 128663)
|
|
||||||
(128662 128662)
|
|
||||||
(128661 128661)
|
|
||||||
(128660 128660)
|
|
||||||
(128657 128659)
|
|
||||||
(128656 128656)
|
|
||||||
(128655 128655)
|
|
||||||
(128654 128654)
|
|
||||||
(128653 128653)
|
|
||||||
(128652 128652)
|
|
||||||
(128650 128651)
|
|
||||||
(128649 128649)
|
|
||||||
(128648 128648)
|
|
||||||
(128647 128647)
|
|
||||||
(128646 128646)
|
|
||||||
(128643 128645)
|
|
||||||
(128641 128642)
|
|
||||||
(128640 128640)
|
|
||||||
(128581 128591)
|
|
||||||
(128577 128580)
|
|
||||||
(128567 128576)
|
|
||||||
(128566 128566)
|
|
||||||
(128565 128565)
|
|
||||||
(128564 128564)
|
|
||||||
(128560 128563)
|
|
||||||
(128558 128559)
|
|
||||||
(128557 128557)
|
|
||||||
(128556 128556)
|
|
||||||
(128552 128555)
|
|
||||||
(128550 128551)
|
|
||||||
(128544 128549)
|
|
||||||
(128543 128543)
|
|
||||||
(128540 128542)
|
|
||||||
(128539 128539)
|
|
||||||
(128538 128538)
|
|
||||||
(128537 128537)
|
|
||||||
(128536 128536)
|
|
||||||
(128535 128535)
|
|
||||||
(128534 128534)
|
|
||||||
(128533 128533)
|
|
||||||
(128530 128532)
|
|
||||||
(128529 128529)
|
|
||||||
(128528 128528)
|
|
||||||
(128527 128527)
|
|
||||||
(128526 128526)
|
|
||||||
(128521 128525)
|
|
||||||
(128519 128520)
|
|
||||||
(128513 128518)
|
|
||||||
(128512 128512)
|
|
||||||
(128507 128511)
|
|
||||||
(128420 128420)
|
|
||||||
(128405 128406)
|
|
||||||
(128378 128378)
|
|
||||||
(128348 128359)
|
|
||||||
(128336 128347)
|
|
||||||
(128331 128334)
|
|
||||||
(128302 128317)
|
|
||||||
(128300 128301)
|
|
||||||
(128278 128299)
|
|
||||||
(128277 128277)
|
|
||||||
(128266 128276)
|
|
||||||
(128265 128265)
|
|
||||||
(128264 128264)
|
|
||||||
(128260 128263)
|
|
||||||
(128259 128259)
|
|
||||||
(128255 128258)
|
|
||||||
(128249 128252)
|
|
||||||
(128248 128248)
|
|
||||||
(128246 128247)
|
|
||||||
(128245 128245)
|
|
||||||
(128240 128244)
|
|
||||||
(128239 128239)
|
|
||||||
(128238 128238)
|
|
||||||
(128236 128237)
|
|
||||||
(128184 128235)
|
|
||||||
(128182 128183)
|
|
||||||
(128174 128181)
|
|
||||||
(128173 128173)
|
|
||||||
(128110 128172)
|
|
||||||
(128108 128109)
|
|
||||||
(128102 128107)
|
|
||||||
(128101 128101)
|
|
||||||
(128066 128100)
|
|
||||||
(128064 128064)
|
|
||||||
(128043 128062)
|
|
||||||
(128042 128042)
|
|
||||||
(128023 128041)
|
|
||||||
(128022 128022)
|
|
||||||
(128021 128021)
|
|
||||||
(128020 128020)
|
|
||||||
(128019 128019)
|
|
||||||
(128017 128018)
|
|
||||||
(128015 128016)
|
|
||||||
(128012 128014)
|
|
||||||
(128009 128011)
|
|
||||||
(128008 128008)
|
|
||||||
(127992 128007)
|
|
||||||
(127988 127988)
|
|
||||||
(127973 127984)
|
|
||||||
(127972 127972)
|
|
||||||
(127968 127971)
|
|
||||||
(127951 127955)
|
|
||||||
(127946 127946)
|
|
||||||
(127945 127945)
|
|
||||||
(127944 127944)
|
|
||||||
(127943 127943)
|
|
||||||
(127942 127942)
|
|
||||||
(127941 127941)
|
|
||||||
(127904 127940)
|
|
||||||
(127872 127891)
|
|
||||||
(127870 127871)
|
|
||||||
(127868 127868)
|
|
||||||
(127825 127867)
|
|
||||||
(127824 127824)
|
|
||||||
(127820 127823)
|
|
||||||
(127819 127819)
|
|
||||||
(127799 127818)
|
|
||||||
(127796 127797)
|
|
||||||
(127794 127795)
|
|
||||||
(127792 127793)
|
|
||||||
(127789 127791)
|
|
||||||
(127775 127776)
|
|
||||||
(127773 127774)
|
|
||||||
(127772 127772)
|
|
||||||
(127771 127771)
|
|
||||||
(127770 127770)
|
|
||||||
(127769 127769)
|
|
||||||
(127766 127768)
|
|
||||||
(127763 127765)
|
|
||||||
(127762 127762)
|
|
||||||
(127761 127761)
|
|
||||||
(127760 127760)
|
|
||||||
(127759 127759)
|
|
||||||
(127757 127758)
|
|
||||||
(127744 127756)
|
|
||||||
(127568 127569)
|
|
||||||
(127544 127546)
|
|
||||||
(127538 127542)
|
|
||||||
(127535 127535)
|
|
||||||
(127514 127514)
|
|
||||||
(127489 127489)
|
|
||||||
(127462 127487)
|
|
||||||
(127377 127386)
|
|
||||||
(127374 127374)
|
|
||||||
(127183 127183)
|
|
||||||
(126980 126980)
|
|
||||||
(11093 11093)
|
|
||||||
(11088 11088)
|
|
||||||
(11035 11036)
|
|
||||||
(10175 10175)
|
|
||||||
(10160 10160)
|
|
||||||
(10133 10135)
|
|
||||||
(10071 10071)
|
|
||||||
(10067 10069)
|
|
||||||
(10062 10062)
|
|
||||||
(10060 10060)
|
|
||||||
(10024 10024)
|
|
||||||
(9994 9995)
|
|
||||||
(9989 9989)
|
|
||||||
(9981 9981)
|
|
||||||
(9978 9978)
|
|
||||||
(9973 9973)
|
|
||||||
(9970 9971)
|
|
||||||
(9962 9962)
|
|
||||||
(9940 9940)
|
|
||||||
(9934 9934)
|
|
||||||
(9924 9925)
|
|
||||||
(9917 9918)
|
|
||||||
(9898 9899)
|
|
||||||
(9889 9889)
|
|
||||||
(9875 9875)
|
|
||||||
(9855 9855)
|
|
||||||
(9800 9811)
|
|
||||||
(9748 9749)
|
|
||||||
(9725 9726)
|
|
||||||
(9203 9203)
|
|
||||||
(9200 9200)
|
|
||||||
(9193 9196)
|
|
||||||
(8986 8987))
|
|
||||||
(emoji-component
|
(emoji-component
|
||||||
(917536 917631)
|
(917536 917631)
|
||||||
(129456 129459)
|
(129456 129459)
|
||||||
|
@ -1214,7 +1213,8 @@
|
||||||
(8205 8205)
|
(8205 8205)
|
||||||
(48 57)
|
(48 57)
|
||||||
(42 42)
|
(42 42)
|
||||||
(35 35)))))
|
(35 35))
|
||||||
|
(emoji-modifier (127995 127999)))))
|
||||||
|
|
||||||
(define char-set:emoji (char-set))
|
(define char-set:emoji (char-set))
|
||||||
(define char-set:emoji-presentation (char-set))
|
(define char-set:emoji-presentation (char-set))
|
File diff suppressed because it is too large
Load Diff
1518
uniseg/graphemes.scm
1518
uniseg/graphemes.scm
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,143 @@
|
||||||
|
(define-module (uniseg graphemes iterator)
|
||||||
|
#:use-module (uniseg)
|
||||||
|
#:use-module (uniseg graphemes)
|
||||||
|
#:use-module (ice-9 match)
|
||||||
|
#:use-module (srfi srfi-71)
|
||||||
|
#:use-module (srfi srfi-41)
|
||||||
|
#:export (make-grapheme-iterator))
|
||||||
|
|
||||||
|
(define (make-grapheme-iterator)
|
||||||
|
"Create a grapheme iterator that accepts a character and provides grapheme records over time."
|
||||||
|
;; The first is what state we are in, and the next is the grapheme
|
||||||
|
;; property of the current character.
|
||||||
|
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
|
||||||
|
(define (state-machine cur-state cur-prop)
|
||||||
|
(match (list cur-state cur-prop)
|
||||||
|
;; Specifics need to go first, and then non-specifics afterwards
|
||||||
|
;; SPECIFIC RULES - no `_' in the match
|
||||||
|
|
||||||
|
;; Grapheme boundary #3s
|
||||||
|
(('carriage-return 'line-feed)
|
||||||
|
(values 'control+line-feed #f))
|
||||||
|
|
||||||
|
;; Grapheme boundary #6s
|
||||||
|
(('hangul-syllable-l 'hangul-syllable-l)
|
||||||
|
(values 'hangul-syllable-l #f))
|
||||||
|
((or ('hangul-syllable-l 'hangul-syllable-v)
|
||||||
|
('hangul-syllable-l 'hangul-syllable-lv))
|
||||||
|
(values 'hangul-syllable-lv #f))
|
||||||
|
(('hangul-syllable-l 'hangul-syllable-lvt)
|
||||||
|
(values 'hangul-syllable-lvt #f))
|
||||||
|
|
||||||
|
;; Grapheme boundary #7s
|
||||||
|
(('hangul-syllable-lv 'hangul-syllable-v)
|
||||||
|
(values 'hangul-syllable-lv #f))
|
||||||
|
(('hangul-syllable-lv 'hangul-syllable-t)
|
||||||
|
(values 'hangul-syllable-lvt #f))
|
||||||
|
|
||||||
|
;; Grapheme boundary #8s
|
||||||
|
(('hangul-syllable-lvt 'hangul-syllable-t)
|
||||||
|
(values 'hangul-syllable-lvt #f))
|
||||||
|
|
||||||
|
;; Grapheme boundary #11s (emoji!)
|
||||||
|
(('extended-pictographic 'extend)
|
||||||
|
(values 'extended-pictographic #f))
|
||||||
|
(('extended-pictographic 'zero-width-joiner)
|
||||||
|
(values 'extended-pictographic+zero-width-joiner #f))
|
||||||
|
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
|
||||||
|
(values 'extended-pictographic #f))
|
||||||
|
|
||||||
|
;; Grapheme boundaries #12s and #13s
|
||||||
|
(('regional-indicator-odd 'regional-indicator)
|
||||||
|
(values 'regioinal-indicator-even #f))
|
||||||
|
(('regional-indicator-even 'regional-indicator)
|
||||||
|
(values 'regional-indicator-odd #t))
|
||||||
|
|
||||||
|
;; NON-SPECIFIC RULES
|
||||||
|
|
||||||
|
;; Grapheme boundary #4n
|
||||||
|
((or ('carriage-return _)
|
||||||
|
('control+line-feed _))
|
||||||
|
(values 'any #t))
|
||||||
|
|
||||||
|
;; Grapheme boundary #5n
|
||||||
|
((_ 'carriage-return)
|
||||||
|
(values 'carriage-return #t))
|
||||||
|
((or (_ 'line-feed)
|
||||||
|
(_ 'control))
|
||||||
|
(values 'control+line-feed #t))
|
||||||
|
((_ 'hangul-syllable-l)
|
||||||
|
(values 'hangul-syllable-l #t))
|
||||||
|
|
||||||
|
;; Grapheme boundary #7n
|
||||||
|
((or (_ 'hangul-syllable-lv)
|
||||||
|
(_ 'hangul-syllable-v))
|
||||||
|
(values 'hangul-syllable-lv #t))
|
||||||
|
|
||||||
|
;; Grapheme boundary #8n
|
||||||
|
((or (_ 'hangul-syllable-lvt)
|
||||||
|
(_ 'hangul-syllable-t))
|
||||||
|
(values 'hangul-syllable-lvt #t))
|
||||||
|
|
||||||
|
;; Grapheme boundary #9n
|
||||||
|
((or (_ 'extend)
|
||||||
|
(_ 'zero-width-joiner))
|
||||||
|
(values 'any #f))
|
||||||
|
|
||||||
|
;; Grapheme boundary #9n-A
|
||||||
|
((_ 'spacing-mark)
|
||||||
|
(values 'any #f))
|
||||||
|
|
||||||
|
;; Grapheme boundary #9n-B
|
||||||
|
(('prepend _)
|
||||||
|
(values 'any #f))
|
||||||
|
((_ 'prepend)
|
||||||
|
(values 'prepend #t))
|
||||||
|
|
||||||
|
;; Grapheme boundary #11n (emoji!)
|
||||||
|
((_ 'extended-pictographic)
|
||||||
|
(values 'extended-pictographic #t))
|
||||||
|
|
||||||
|
;; Grapheme boundaries #12n and #13n
|
||||||
|
((_ 'regional-indicator)
|
||||||
|
(values 'regional-indicator-odd #t))
|
||||||
|
|
||||||
|
;; Everything else considered a boundary
|
||||||
|
(else (values 'any #t))))
|
||||||
|
|
||||||
|
(define %current-grapheme #f)
|
||||||
|
|
||||||
|
(define (set-grapheme! width delta modification? state glyphs-reverse)
|
||||||
|
(let ((new (make-grapheme width delta modification? state glyphs-reverse)))
|
||||||
|
(set! %current-grapheme new)
|
||||||
|
new))
|
||||||
|
|
||||||
|
(define (iterate-through-grapheme glyph)
|
||||||
|
"Grapheme iteration function. May return false, in which case it requires new characters in order to produce output."
|
||||||
|
(if (eof-object? glyph)
|
||||||
|
;; eof means nothing to do, clear grapheme and return false
|
||||||
|
(begin
|
||||||
|
(set! %current-grapheme #f)
|
||||||
|
#f)
|
||||||
|
(begin
|
||||||
|
(let* ((glyph-width prop (char-width glyph))
|
||||||
|
(cur-state (if %current-grapheme
|
||||||
|
(grapheme-state %current-grapheme)
|
||||||
|
'any))
|
||||||
|
(next-state boundary? (state-machine cur-state prop)))
|
||||||
|
|
||||||
|
;; Boundary is between this glyph and previous
|
||||||
|
(if boundary?
|
||||||
|
;; If we hit a boundary with previous, we simply restart the state
|
||||||
|
;; and output the current single glyph as a grapheme
|
||||||
|
(set-grapheme! glyph-width glyph-width #f next-state (list glyph))
|
||||||
|
;; If it's not a boundary with previous, we have to add to the grapheme
|
||||||
|
;; Only possible to get a boundary if %current-grapheme is set, so assume
|
||||||
|
(let* ((cur-width (grapheme-width %current-grapheme))
|
||||||
|
(cur-glyphs-reverse (grapheme-glyphs-reverse %current-grapheme))
|
||||||
|
(new-width (+ cur-width glyph-width))
|
||||||
|
(new-glyphs-reverse (cons glyph cur-glyphs-reverse)))
|
||||||
|
|
||||||
|
(set-grapheme! new-width glyph-width #t next-state new-glyphs-reverse)))))))
|
||||||
|
|
||||||
|
iterate-through-grapheme)
|
|
@ -1,183 +1,24 @@
|
||||||
(define-module (uniseg graphemes stream)
|
(define-module (uniseg graphemes stream)
|
||||||
#:use-module (uniseg)
|
#:use-module (uniseg graphemes iterator)
|
||||||
#:use-module (uniseg internal)
|
|
||||||
#:use-module (ice-9 textual-ports)
|
#:use-module (ice-9 textual-ports)
|
||||||
#:use-module (ice-9 match)
|
|
||||||
#:use-module (srfi srfi-71)
|
|
||||||
#:use-module (srfi srfi-41)
|
#:use-module (srfi srfi-41)
|
||||||
#:use-module (srfi srfi-9 gnu)
|
#:export (string->grapheme-stream
|
||||||
#:export (make-grapheme
|
|
||||||
grapheme?
|
|
||||||
grapheme-glyphs
|
|
||||||
grapheme-string
|
|
||||||
grapheme-width
|
|
||||||
string->grapheme-stream
|
|
||||||
input->grapheme-stream))
|
input->grapheme-stream))
|
||||||
|
|
||||||
;; TODO: the golang uniseg also does word and sentence boundaries. These state machines could be implemented if we wanted to.
|
|
||||||
(define-immutable-record-type <grapheme>
|
|
||||||
(make-grapheme width glyphs-promise string-promise)
|
|
||||||
grapheme?
|
|
||||||
(width grapheme-width)
|
|
||||||
(glyphs-promise _grapheme-glyphs-promise)
|
|
||||||
(string-promise _grapheme-string-promise))
|
|
||||||
|
|
||||||
(define (grapheme-glyphs grapheme)
|
|
||||||
"Return a lazily-constructed list of glyphs in the grapheme"
|
|
||||||
(force (_grapheme-glyphs-promise grapheme)))
|
|
||||||
|
|
||||||
(define (grapheme-string grapheme)
|
|
||||||
"Return a lazily-constructed string of the glyphs in the grapheme."
|
|
||||||
(force (_grapheme-string-promise grapheme)))
|
|
||||||
|
|
||||||
(define (string->grapheme-stream str)
|
(define (string->grapheme-stream str)
|
||||||
"Given a string, create a (lazy) stream of graphemes."
|
"Given a string, create a (lazy) stream of graphemes."
|
||||||
(with-input-from-string str
|
(call-with-input-string str input->grapheme-stream))
|
||||||
(λ () (input->grapheme-stream (current-input-port)))))
|
|
||||||
|
|
||||||
(define-stream (input->grapheme-stream port)
|
|
||||||
|
(define (input->grapheme-stream port)
|
||||||
"Given an input port, create a (lazy) stream of graphemes."
|
"Given an input port, create a (lazy) stream of graphemes."
|
||||||
;; The first is what state we are in, and the next is the grapheme
|
(define grapheme-iterator (make-grapheme-iterator))
|
||||||
;; property of the current character.
|
|
||||||
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
|
|
||||||
(define (state-machine cur-state cur-prop)
|
|
||||||
(match (list cur-state cur-prop)
|
|
||||||
;; Specifics need to go first, and then non-specifics afterwards
|
|
||||||
;; SPECIFIC RULES - no `_' in the match
|
|
||||||
|
|
||||||
;; Grapheme boundary #3s
|
(define-stream (grapheme-stream)
|
||||||
(('carriage-return 'line-feed)
|
(define grapheme (grapheme-iterator (get-char port)))
|
||||||
(values 'control+line-feed #f))
|
|
||||||
|
|
||||||
;; Grapheme boundary #6s
|
(if grapheme
|
||||||
(('hangul-syllable-l 'hangul-syllable-l)
|
(stream-cons grapheme (grapheme-stream))
|
||||||
(values 'hangul-syllable-l #f))
|
stream-null))
|
||||||
((or ('hangul-syllable-l 'hangul-syllable-v)
|
|
||||||
('hangul-syllable-l 'hangul-syllable-lv))
|
|
||||||
(values 'hangul-syllable-lv #f))
|
|
||||||
(('hangul-syllable-l 'hangul-syllable-lvt)
|
|
||||||
(values 'hangul-syllable-lvt #f))
|
|
||||||
|
|
||||||
;; Grapheme boundary #7s
|
(grapheme-stream))
|
||||||
(('hangul-syllable-lv 'hangul-syllable-v)
|
|
||||||
(values 'hangul-syllable-lv #f))
|
|
||||||
(('hangul-syllable-lv 'hangul-syllable-t)
|
|
||||||
(values 'hangul-syllable-lvt #f))
|
|
||||||
|
|
||||||
;; Grapheme boundary #8s
|
|
||||||
(('hangul-syllable-lvt 'hangul-syllable-t)
|
|
||||||
(values 'hangul-syllable-lvt #f))
|
|
||||||
|
|
||||||
;; Grapheme boundary #11s (emoji!)
|
|
||||||
(('extended-pictographic 'extend)
|
|
||||||
(values 'extended-pictographic #f))
|
|
||||||
(('extended-pictographic 'zero-width-joiner)
|
|
||||||
(values 'extended-pictographic+zero-width-joiner #f))
|
|
||||||
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
|
|
||||||
(values 'extended-pictographic #f))
|
|
||||||
|
|
||||||
;; Grapheme boundaries #12s and #13s
|
|
||||||
(('regional-indicator-odd 'regional-indicator)
|
|
||||||
(values 'regioinal-indicator-even #f))
|
|
||||||
(('regional-indicator-even 'regional-indicator)
|
|
||||||
(values 'regional-indicator-odd #t))
|
|
||||||
|
|
||||||
;; NON-SPECIFIC RULES
|
|
||||||
|
|
||||||
;; Grapheme boundary #4n
|
|
||||||
((or ('carriage-return _)
|
|
||||||
('control+line-feed _))
|
|
||||||
(values 'any #t))
|
|
||||||
|
|
||||||
;; Grapheme boundary #5n
|
|
||||||
((_ 'carriage-return)
|
|
||||||
(values 'carriage-return #t))
|
|
||||||
((or (_ 'line-feed)
|
|
||||||
(_ 'control))
|
|
||||||
(values 'control+line-feed #t))
|
|
||||||
((_ 'hangul-syllable-l)
|
|
||||||
(values 'hangul-syllable-l #t))
|
|
||||||
|
|
||||||
;; Grapheme boundary #7n
|
|
||||||
((or (_ 'hangul-syllable-lv)
|
|
||||||
(_ 'hangul-syllable-v))
|
|
||||||
(values 'hangul-syllable-lv #t))
|
|
||||||
|
|
||||||
;; Grapheme boundary #8n
|
|
||||||
((or (_ 'hangul-syllable-lvt)
|
|
||||||
(_ 'hangul-syllable-t))
|
|
||||||
(values 'hangul-syllable-lvt #t))
|
|
||||||
|
|
||||||
;; Grapheme boundary #9n
|
|
||||||
((or (_ 'extend)
|
|
||||||
(_ 'zero-width-joiner))
|
|
||||||
(values 'any #f))
|
|
||||||
|
|
||||||
;; Grapheme boundary #9n-A
|
|
||||||
((_ 'spacing-mark)
|
|
||||||
(values 'any #f))
|
|
||||||
|
|
||||||
;; Grapheme boundary #9n-B
|
|
||||||
(('prepend _)
|
|
||||||
(values 'any #f))
|
|
||||||
((_ 'prepend)
|
|
||||||
(values 'prepend #t))
|
|
||||||
|
|
||||||
;; Grapheme boundary #11n (emoji!)
|
|
||||||
((_ 'extended-pictographic)
|
|
||||||
(pk "EXTENDED")
|
|
||||||
(values 'extended-pictographic #t))
|
|
||||||
|
|
||||||
;; Grapheme boundaries #12n and #13n
|
|
||||||
((_ 'regional-indicator)
|
|
||||||
(values 'regional-indicator-odd #t))
|
|
||||||
|
|
||||||
;; Everything else considered a boundary
|
|
||||||
(else (values 'any #t))))
|
|
||||||
|
|
||||||
;; State variables, initialized with the first glyph we get!
|
|
||||||
(define first-glyph (get-char port))
|
|
||||||
(define hit-eof? (eof-object? first-glyph))
|
|
||||||
(define %glyphs-reverse (list first-glyph))
|
|
||||||
(define %grapheme-width 0)
|
|
||||||
|
|
||||||
(define (iterate-through-grapheme state)
|
|
||||||
(define glyph (peek-char port))
|
|
||||||
(if (eof-object? glyph)
|
|
||||||
(begin
|
|
||||||
(set! hit-eof? #t)
|
|
||||||
state)
|
|
||||||
(begin
|
|
||||||
(let* ((width property (char-width glyph))
|
|
||||||
(next-state boundary? (cpk-values glyph state property '= (state-machine state property))))
|
|
||||||
|
|
||||||
(if boundary?
|
|
||||||
state
|
|
||||||
(let ((new-width (+ %grapheme-width width))
|
|
||||||
(new-glyphs-reverse (cons glyph %glyphs-reverse)))
|
|
||||||
|
|
||||||
;; Officially induct this char into the cluster
|
|
||||||
(get-char port)
|
|
||||||
(set! %grapheme-width new-width)
|
|
||||||
(set! %glyphs-reverse new-glyphs-reverse)
|
|
||||||
(iterate-through-grapheme next-state)))))))
|
|
||||||
|
|
||||||
;; Need to explicitly get the first state
|
|
||||||
;; Skip all this if it's an empty stream at start
|
|
||||||
(if hit-eof?
|
|
||||||
stream-null
|
|
||||||
(let ((first-width first-prop (char-width first-glyph)))
|
|
||||||
(set! %grapheme-width first-width)
|
|
||||||
(iterate-through-grapheme (state-machine 'any first-prop))
|
|
||||||
|
|
||||||
(let ((grapheme (make-grapheme
|
|
||||||
%grapheme-width
|
|
||||||
;; Delay to avoid construction of unnecessary lists and strings!
|
|
||||||
(delay (reverse %glyphs-reverse))
|
|
||||||
(delay (reverse-list->string %glyphs-reverse)))))
|
|
||||||
|
|
||||||
;; If we hit the eof here, we need one last stream entry, otherwise iterate further
|
|
||||||
(stream-cons grapheme
|
|
||||||
(if hit-eof?
|
|
||||||
stream-null
|
|
||||||
(input->grapheme-stream port)))))))
|
|
||||||
|
|
Loading…
Reference in New Issue