Yet another reorganization, and solve Christine's 'rude problem'
- We change the stream iterator to *always* return a grapheme (except for EOF). The grapheme then gets built up over time. - This way, trans flag for example is first white flag, then white flag + zwj, etc until it finally transforms into the trans flag. - Users of the stream library can then use the `modification?' flag to determine if the stream value is a modification of the prior grapheme instead of a new grapheme. - Abstracted iteration to an iterator object to support use cases where we don't have an input stream (reflow needs this!)
This commit is contained in:
parent
1c0c4634f9
commit
31012d5b8f
12 changed files with 2046 additions and 2004 deletions
29
hall.scm
29
hall.scm
|
@ -19,20 +19,27 @@
|
|||
(native-language-support #f)
|
||||
(licensing #f)))
|
||||
(files (libraries
|
||||
((scheme-file "uniseg")
|
||||
(directory
|
||||
((directory
|
||||
"uniseg"
|
||||
((scheme-file "emoji")
|
||||
((directory
|
||||
"charsets"
|
||||
((scheme-file "emoji")
|
||||
(scheme-file "eastasian")
|
||||
(scheme-file "graphemes")))
|
||||
(directory
|
||||
"graphemes"
|
||||
((scheme-file "iterator") (scheme-file "stream")))
|
||||
(directory "eastasian" ((scheme-file "locale")))
|
||||
(scheme-file "eastasian")
|
||||
(directory "graphemes" ((scheme-file "stream")))
|
||||
(scheme-file "graphemes")
|
||||
(scheme-file "internal")))))
|
||||
(tests ((directory
|
||||
"tests"
|
||||
((scheme-file "test-eastasian-locale")
|
||||
(scheme-file "test-uniseg")
|
||||
(scheme-file "test-graphemes-stream")))))
|
||||
(scheme-file "internal")))
|
||||
(scheme-file "uniseg")))
|
||||
(tests
|
||||
((directory
|
||||
"tests"
|
||||
((scheme-file "test-eastasian-locale")
|
||||
(scheme-file "test-uniseg")
|
||||
(scheme-file "test-graphemes-stream")
|
||||
(scheme-file "test-graphemes-iterator")))))
|
||||
(programs
|
||||
((directory
|
||||
"scripts"
|
||||
|
|
|
@ -44,7 +44,7 @@
|
|||
(λ (prop) (symbol-with-prefix "char-set:eastasian-" prop))
|
||||
eastasian-properties))
|
||||
|
||||
(define file "uniseg/eastasian.scm")
|
||||
(define file "uniseg/charsets/eastasian.scm")
|
||||
|
||||
(format stdout "Writing to ~a...\n" file)
|
||||
|
||||
|
@ -53,7 +53,7 @@
|
|||
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
|
||||
|
||||
(pretty-print
|
||||
`(define-module (uniseg eastasian)
|
||||
`(define-module (uniseg charsets eastasian)
|
||||
#:use-module (uniseg internal)
|
||||
#:use-module (ice-9 hash-table)
|
||||
#:use-module (srfi srfi-1)
|
||||
|
|
|
@ -40,7 +40,7 @@
|
|||
("Emoji_Component" 'emoji-component)
|
||||
("Extended_Pictographic" 'emoji-extended-pictographic)))
|
||||
|
||||
(define file "uniseg/emoji.scm")
|
||||
(define file "uniseg/charsets/emoji.scm")
|
||||
|
||||
(format stdout "Writing to ~a...\n" file)
|
||||
|
||||
|
@ -49,7 +49,7 @@
|
|||
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
|
||||
|
||||
(pretty-print
|
||||
`(define-module (uniseg emoji)
|
||||
`(define-module (uniseg charsets emoji)
|
||||
#:use-module (uniseg internal)
|
||||
#:use-module (ice-9 hash-table)
|
||||
#:use-module (srfi srfi-1)
|
||||
|
|
|
@ -56,7 +56,7 @@
|
|||
("SpacingMark" 'spacing-mark)
|
||||
("ZWJ" 'zero-width-joiner)))
|
||||
|
||||
(define file "uniseg/graphemes.scm")
|
||||
(define file "uniseg/charsets/graphemes.scm")
|
||||
|
||||
(format stdout "Writing to ~a...\n" file)
|
||||
|
||||
|
@ -65,11 +65,11 @@
|
|||
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
|
||||
|
||||
(pretty-print
|
||||
`(define-module (uniseg graphemes)
|
||||
`(define-module (uniseg charsets graphemes)
|
||||
#:use-module (ice-9 hash-table)
|
||||
#:use-module (srfi srfi-1)
|
||||
#:use-module (uniseg internal)
|
||||
#:use-module (uniseg emoji)
|
||||
#:use-module (uniseg charsets emoji)
|
||||
#:export (,@grapheme-symbols
|
||||
grapheme-charsets)))
|
||||
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
(define-module (tests test-graphemes-stream)
|
||||
#:use-module (uniseg graphemes)
|
||||
#:use-module (uniseg graphemes stream)
|
||||
#:use-module (uniseg internal)
|
||||
#:use-module (srfi srfi-41)
|
||||
#:use-module (srfi srfi-64))
|
||||
|
||||
|
@ -10,11 +12,10 @@
|
|||
|
||||
(define* (advance-stream! #:optional (times 1))
|
||||
(for-each
|
||||
(λ (_)
|
||||
(set! stream (stream-cdr stream)))
|
||||
(λ (_) (set! stream (stream-cdr stream)))
|
||||
(make-list times)))
|
||||
|
||||
(advance-stream! 6)
|
||||
(advance-stream! 10)
|
||||
|
||||
(define trans-flag-grapheme (stream-car stream))
|
||||
|
||||
|
@ -56,4 +57,11 @@
|
|||
(test-equal "a stream of nothing resolves to an empty stream"
|
||||
#t (stream-null? empty-stream))
|
||||
|
||||
(define singleton-stream (string->grapheme-stream "a"))
|
||||
|
||||
(set! a-grapheme (stream-car stream))
|
||||
|
||||
(test-equal "a stream with a single character resolves to a grapheme"
|
||||
"a" (grapheme-string a-grapheme))
|
||||
|
||||
(test-end "tests-graphemes-stream")
|
||||
|
|
|
@ -2,10 +2,11 @@
|
|||
#:use-module (srfi srfi-1)
|
||||
#:use-module (ice-9 match)
|
||||
#:use-module (srfi srfi-41)
|
||||
#:use-module (uniseg emoji)
|
||||
#:use-module (uniseg graphemes)
|
||||
#:use-module (uniseg graphemes stream)
|
||||
#:use-module (uniseg eastasian)
|
||||
#:use-module (uniseg charsets emoji)
|
||||
#:use-module (uniseg charsets eastasian)
|
||||
#:use-module (uniseg charsets graphemes)
|
||||
#:export (emoji?
|
||||
char->grapheme-property
|
||||
char->eastasian-property
|
||||
|
@ -99,6 +100,6 @@
|
|||
"Get the width of a string by adding up the widths of each grapheme"
|
||||
(stream-fold
|
||||
(λ (val grapheme)
|
||||
(+ val (grapheme-width grapheme)))
|
||||
(+ val (grapheme-delta-width grapheme)))
|
||||
0
|
||||
(string->grapheme-stream str)))
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
;; Code generated by generate-eastasian. DO NOT EDIT
|
||||
|
||||
(define-module
|
||||
(uniseg eastasian)
|
||||
(uniseg charsets eastasian)
|
||||
#:use-module
|
||||
(uniseg internal)
|
||||
#:use-module
|
||||
|
@ -18,7 +18,50 @@
|
|||
eastasian-charsets))
|
||||
(define hashtable
|
||||
(alist->hashq-table
|
||||
'((combining
|
||||
'((narrow
|
||||
(10630 10630)
|
||||
(10629 10629)
|
||||
(10221 10221)
|
||||
(10220 10220)
|
||||
(10219 10219)
|
||||
(10218 10218)
|
||||
(10217 10217)
|
||||
(10216 10216)
|
||||
(10215 10215)
|
||||
(10214 10214)
|
||||
(175 175)
|
||||
(172 172)
|
||||
(166 166)
|
||||
(165 165)
|
||||
(162 163)
|
||||
(126 126)
|
||||
(125 125)
|
||||
(124 124)
|
||||
(123 123)
|
||||
(97 122)
|
||||
(96 96)
|
||||
(95 95)
|
||||
(94 94)
|
||||
(93 93)
|
||||
(92 92)
|
||||
(91 91)
|
||||
(65 90)
|
||||
(63 64)
|
||||
(60 62)
|
||||
(58 59)
|
||||
(48 57)
|
||||
(46 47)
|
||||
(45 45)
|
||||
(44 44)
|
||||
(43 43)
|
||||
(42 42)
|
||||
(41 41)
|
||||
(40 40)
|
||||
(37 39)
|
||||
(36 36)
|
||||
(33 35)
|
||||
(32 32))
|
||||
(combining
|
||||
(125136 125142)
|
||||
(122918 122922)
|
||||
(122915 122916)
|
||||
|
@ -71,67 +114,6 @@
|
|||
(1160 1161)
|
||||
(1155 1159)
|
||||
(768 879))
|
||||
(halfwidth
|
||||
(65517 65518)
|
||||
(65513 65516)
|
||||
(65512 65512)
|
||||
(65498 65500)
|
||||
(65490 65495)
|
||||
(65482 65487)
|
||||
(65474 65479)
|
||||
(65440 65470)
|
||||
(65438 65439)
|
||||
(65393 65437)
|
||||
(65392 65392)
|
||||
(65382 65391)
|
||||
(65380 65381)
|
||||
(65379 65379)
|
||||
(65378 65378)
|
||||
(65377 65377)
|
||||
(8361 8361))
|
||||
(narrow
|
||||
(10630 10630)
|
||||
(10629 10629)
|
||||
(10221 10221)
|
||||
(10220 10220)
|
||||
(10219 10219)
|
||||
(10218 10218)
|
||||
(10217 10217)
|
||||
(10216 10216)
|
||||
(10215 10215)
|
||||
(10214 10214)
|
||||
(175 175)
|
||||
(172 172)
|
||||
(166 166)
|
||||
(165 165)
|
||||
(162 163)
|
||||
(126 126)
|
||||
(125 125)
|
||||
(124 124)
|
||||
(123 123)
|
||||
(97 122)
|
||||
(96 96)
|
||||
(95 95)
|
||||
(94 94)
|
||||
(93 93)
|
||||
(92 92)
|
||||
(91 91)
|
||||
(65 90)
|
||||
(63 64)
|
||||
(60 62)
|
||||
(58 59)
|
||||
(48 57)
|
||||
(46 47)
|
||||
(45 45)
|
||||
(44 44)
|
||||
(43 43)
|
||||
(42 42)
|
||||
(41 41)
|
||||
(40 40)
|
||||
(37 39)
|
||||
(36 36)
|
||||
(33 35)
|
||||
(32 32))
|
||||
(ambiguous
|
||||
(1048576 1114109)
|
||||
(983040 1048573)
|
||||
|
@ -330,6 +312,24 @@
|
|||
(167 167)
|
||||
(164 164)
|
||||
(161 161))
|
||||
(halfwidth
|
||||
(65517 65518)
|
||||
(65513 65516)
|
||||
(65512 65512)
|
||||
(65498 65500)
|
||||
(65490 65495)
|
||||
(65482 65487)
|
||||
(65474 65479)
|
||||
(65440 65470)
|
||||
(65438 65439)
|
||||
(65393 65437)
|
||||
(65392 65392)
|
||||
(65382 65391)
|
||||
(65380 65381)
|
||||
(65379 65379)
|
||||
(65378 65378)
|
||||
(65377 65377)
|
||||
(8361 8361))
|
||||
(neutral
|
||||
(917536 917631)
|
||||
(917505 917505)
|
|
@ -1,7 +1,7 @@
|
|||
;; Code generated by generate-emoji. DO NOT EDIT
|
||||
|
||||
(define-module
|
||||
(uniseg emoji)
|
||||
(uniseg charsets emoji)
|
||||
#:use-module
|
||||
(uniseg internal)
|
||||
#:use-module
|
||||
|
@ -18,7 +18,269 @@
|
|||
emoji-charsets))
|
||||
(define hashtable
|
||||
(alist->hashq-table
|
||||
'((emoji-modifier-base
|
||||
'((emoji-presentation
|
||||
(129744 129750)
|
||||
(129728 129730)
|
||||
(129712 129718)
|
||||
(129686 129704)
|
||||
(129680 129685)
|
||||
(129667 129670)
|
||||
(129664 129666)
|
||||
(129656 129658)
|
||||
(129652 129652)
|
||||
(129648 129651)
|
||||
(129511 129535)
|
||||
(129488 129510)
|
||||
(129485 129487)
|
||||
(129483 129483)
|
||||
(129475 129482)
|
||||
(129473 129474)
|
||||
(129472 129472)
|
||||
(129466 129471)
|
||||
(129456 129465)
|
||||
(129454 129455)
|
||||
(129451 129453)
|
||||
(129445 129450)
|
||||
(129443 129444)
|
||||
(129432 129442)
|
||||
(129426 129431)
|
||||
(129413 129425)
|
||||
(129408 129412)
|
||||
(129404 129407)
|
||||
(129403 129403)
|
||||
(129402 129402)
|
||||
(129399 129400)
|
||||
(129395 129398)
|
||||
(129394 129394)
|
||||
(129393 129393)
|
||||
(129388 129392)
|
||||
(129375 129387)
|
||||
(129360 129374)
|
||||
(129357 129359)
|
||||
(129356 129356)
|
||||
(129351 129355)
|
||||
(129344 129349)
|
||||
(129343 129343)
|
||||
(129340 129342)
|
||||
(129331 129338)
|
||||
(129329 129330)
|
||||
(129328 129328)
|
||||
(129320 129327)
|
||||
(129312 129319)
|
||||
(129311 129311)
|
||||
(129305 129310)
|
||||
(129296 129304)
|
||||
(129293 129295)
|
||||
(129292 129292)
|
||||
(128992 129003)
|
||||
(128763 128764)
|
||||
(128762 128762)
|
||||
(128761 128761)
|
||||
(128759 128760)
|
||||
(128756 128758)
|
||||
(128747 128748)
|
||||
(128726 128727)
|
||||
(128725 128725)
|
||||
(128721 128722)
|
||||
(128720 128720)
|
||||
(128716 128716)
|
||||
(128705 128709)
|
||||
(128704 128704)
|
||||
(128703 128703)
|
||||
(128697 128702)
|
||||
(128695 128696)
|
||||
(128694 128694)
|
||||
(128691 128693)
|
||||
(128690 128690)
|
||||
(128686 128689)
|
||||
(128679 128685)
|
||||
(128678 128678)
|
||||
(128676 128677)
|
||||
(128675 128675)
|
||||
(128674 128674)
|
||||
(128667 128673)
|
||||
(128665 128666)
|
||||
(128664 128664)
|
||||
(128663 128663)
|
||||
(128662 128662)
|
||||
(128661 128661)
|
||||
(128660 128660)
|
||||
(128657 128659)
|
||||
(128656 128656)
|
||||
(128655 128655)
|
||||
(128654 128654)
|
||||
(128653 128653)
|
||||
(128652 128652)
|
||||
(128650 128651)
|
||||
(128649 128649)
|
||||
(128648 128648)
|
||||
(128647 128647)
|
||||
(128646 128646)
|
||||
(128643 128645)
|
||||
(128641 128642)
|
||||
(128640 128640)
|
||||
(128581 128591)
|
||||
(128577 128580)
|
||||
(128567 128576)
|
||||
(128566 128566)
|
||||
(128565 128565)
|
||||
(128564 128564)
|
||||
(128560 128563)
|
||||
(128558 128559)
|
||||
(128557 128557)
|
||||
(128556 128556)
|
||||
(128552 128555)
|
||||
(128550 128551)
|
||||
(128544 128549)
|
||||
(128543 128543)
|
||||
(128540 128542)
|
||||
(128539 128539)
|
||||
(128538 128538)
|
||||
(128537 128537)
|
||||
(128536 128536)
|
||||
(128535 128535)
|
||||
(128534 128534)
|
||||
(128533 128533)
|
||||
(128530 128532)
|
||||
(128529 128529)
|
||||
(128528 128528)
|
||||
(128527 128527)
|
||||
(128526 128526)
|
||||
(128521 128525)
|
||||
(128519 128520)
|
||||
(128513 128518)
|
||||
(128512 128512)
|
||||
(128507 128511)
|
||||
(128420 128420)
|
||||
(128405 128406)
|
||||
(128378 128378)
|
||||
(128348 128359)
|
||||
(128336 128347)
|
||||
(128331 128334)
|
||||
(128302 128317)
|
||||
(128300 128301)
|
||||
(128278 128299)
|
||||
(128277 128277)
|
||||
(128266 128276)
|
||||
(128265 128265)
|
||||
(128264 128264)
|
||||
(128260 128263)
|
||||
(128259 128259)
|
||||
(128255 128258)
|
||||
(128249 128252)
|
||||
(128248 128248)
|
||||
(128246 128247)
|
||||
(128245 128245)
|
||||
(128240 128244)
|
||||
(128239 128239)
|
||||
(128238 128238)
|
||||
(128236 128237)
|
||||
(128184 128235)
|
||||
(128182 128183)
|
||||
(128174 128181)
|
||||
(128173 128173)
|
||||
(128110 128172)
|
||||
(128108 128109)
|
||||
(128102 128107)
|
||||
(128101 128101)
|
||||
(128066 128100)
|
||||
(128064 128064)
|
||||
(128043 128062)
|
||||
(128042 128042)
|
||||
(128023 128041)
|
||||
(128022 128022)
|
||||
(128021 128021)
|
||||
(128020 128020)
|
||||
(128019 128019)
|
||||
(128017 128018)
|
||||
(128015 128016)
|
||||
(128012 128014)
|
||||
(128009 128011)
|
||||
(128008 128008)
|
||||
(127992 128007)
|
||||
(127988 127988)
|
||||
(127973 127984)
|
||||
(127972 127972)
|
||||
(127968 127971)
|
||||
(127951 127955)
|
||||
(127946 127946)
|
||||
(127945 127945)
|
||||
(127944 127944)
|
||||
(127943 127943)
|
||||
(127942 127942)
|
||||
(127941 127941)
|
||||
(127904 127940)
|
||||
(127872 127891)
|
||||
(127870 127871)
|
||||
(127868 127868)
|
||||
(127825 127867)
|
||||
(127824 127824)
|
||||
(127820 127823)
|
||||
(127819 127819)
|
||||
(127799 127818)
|
||||
(127796 127797)
|
||||
(127794 127795)
|
||||
(127792 127793)
|
||||
(127789 127791)
|
||||
(127775 127776)
|
||||
(127773 127774)
|
||||
(127772 127772)
|
||||
(127771 127771)
|
||||
(127770 127770)
|
||||
(127769 127769)
|
||||
(127766 127768)
|
||||
(127763 127765)
|
||||
(127762 127762)
|
||||
(127761 127761)
|
||||
(127760 127760)
|
||||
(127759 127759)
|
||||
(127757 127758)
|
||||
(127744 127756)
|
||||
(127568 127569)
|
||||
(127544 127546)
|
||||
(127538 127542)
|
||||
(127535 127535)
|
||||
(127514 127514)
|
||||
(127489 127489)
|
||||
(127462 127487)
|
||||
(127377 127386)
|
||||
(127374 127374)
|
||||
(127183 127183)
|
||||
(126980 126980)
|
||||
(11093 11093)
|
||||
(11088 11088)
|
||||
(11035 11036)
|
||||
(10175 10175)
|
||||
(10160 10160)
|
||||
(10133 10135)
|
||||
(10071 10071)
|
||||
(10067 10069)
|
||||
(10062 10062)
|
||||
(10060 10060)
|
||||
(10024 10024)
|
||||
(9994 9995)
|
||||
(9989 9989)
|
||||
(9981 9981)
|
||||
(9978 9978)
|
||||
(9973 9973)
|
||||
(9970 9971)
|
||||
(9962 9962)
|
||||
(9940 9940)
|
||||
(9934 9934)
|
||||
(9924 9925)
|
||||
(9917 9918)
|
||||
(9898 9899)
|
||||
(9889 9889)
|
||||
(9875 9875)
|
||||
(9855 9855)
|
||||
(9800 9811)
|
||||
(9748 9749)
|
||||
(9725 9726)
|
||||
(9203 9203)
|
||||
(9200 9200)
|
||||
(9193 9196)
|
||||
(8986 8987))
|
||||
(emoji-modifier-base
|
||||
(129489 129501)
|
||||
(129485 129487)
|
||||
(129467 129467)
|
||||
|
@ -449,7 +711,6 @@
|
|||
(48 57)
|
||||
(42 42)
|
||||
(35 35))
|
||||
(emoji-modifier (127995 127999))
|
||||
(emoji-extended-pictographic
|
||||
(130048 131069)
|
||||
(129751 129791)
|
||||
|
@ -942,268 +1203,6 @@
|
|||
(8252 8252)
|
||||
(174 174)
|
||||
(169 169))
|
||||
(emoji-presentation
|
||||
(129744 129750)
|
||||
(129728 129730)
|
||||
(129712 129718)
|
||||
(129686 129704)
|
||||
(129680 129685)
|
||||
(129667 129670)
|
||||
(129664 129666)
|
||||
(129656 129658)
|
||||
(129652 129652)
|
||||
(129648 129651)
|
||||
(129511 129535)
|
||||
(129488 129510)
|
||||
(129485 129487)
|
||||
(129483 129483)
|
||||
(129475 129482)
|
||||
(129473 129474)
|
||||
(129472 129472)
|
||||
(129466 129471)
|
||||
(129456 129465)
|
||||
(129454 129455)
|
||||
(129451 129453)
|
||||
(129445 129450)
|
||||
(129443 129444)
|
||||
(129432 129442)
|
||||
(129426 129431)
|
||||
(129413 129425)
|
||||
(129408 129412)
|
||||
(129404 129407)
|
||||
(129403 129403)
|
||||
(129402 129402)
|
||||
(129399 129400)
|
||||
(129395 129398)
|
||||
(129394 129394)
|
||||
(129393 129393)
|
||||
(129388 129392)
|
||||
(129375 129387)
|
||||
(129360 129374)
|
||||
(129357 129359)
|
||||
(129356 129356)
|
||||
(129351 129355)
|
||||
(129344 129349)
|
||||
(129343 129343)
|
||||
(129340 129342)
|
||||
(129331 129338)
|
||||
(129329 129330)
|
||||
(129328 129328)
|
||||
(129320 129327)
|
||||
(129312 129319)
|
||||
(129311 129311)
|
||||
(129305 129310)
|
||||
(129296 129304)
|
||||
(129293 129295)
|
||||
(129292 129292)
|
||||
(128992 129003)
|
||||
(128763 128764)
|
||||
(128762 128762)
|
||||
(128761 128761)
|
||||
(128759 128760)
|
||||
(128756 128758)
|
||||
(128747 128748)
|
||||
(128726 128727)
|
||||
(128725 128725)
|
||||
(128721 128722)
|
||||
(128720 128720)
|
||||
(128716 128716)
|
||||
(128705 128709)
|
||||
(128704 128704)
|
||||
(128703 128703)
|
||||
(128697 128702)
|
||||
(128695 128696)
|
||||
(128694 128694)
|
||||
(128691 128693)
|
||||
(128690 128690)
|
||||
(128686 128689)
|
||||
(128679 128685)
|
||||
(128678 128678)
|
||||
(128676 128677)
|
||||
(128675 128675)
|
||||
(128674 128674)
|
||||
(128667 128673)
|
||||
(128665 128666)
|
||||
(128664 128664)
|
||||
(128663 128663)
|
||||
(128662 128662)
|
||||
(128661 128661)
|
||||
(128660 128660)
|
||||
(128657 128659)
|
||||
(128656 128656)
|
||||
(128655 128655)
|
||||
(128654 128654)
|
||||
(128653 128653)
|
||||
(128652 128652)
|
||||
(128650 128651)
|
||||
(128649 128649)
|
||||
(128648 128648)
|
||||
(128647 128647)
|
||||
(128646 128646)
|
||||
(128643 128645)
|
||||
(128641 128642)
|
||||
(128640 128640)
|
||||
(128581 128591)
|
||||
(128577 128580)
|
||||
(128567 128576)
|
||||
(128566 128566)
|
||||
(128565 128565)
|
||||
(128564 128564)
|
||||
(128560 128563)
|
||||
(128558 128559)
|
||||
(128557 128557)
|
||||
(128556 128556)
|
||||
(128552 128555)
|
||||
(128550 128551)
|
||||
(128544 128549)
|
||||
(128543 128543)
|
||||
(128540 128542)
|
||||
(128539 128539)
|
||||
(128538 128538)
|
||||
(128537 128537)
|
||||
(128536 128536)
|
||||
(128535 128535)
|
||||
(128534 128534)
|
||||
(128533 128533)
|
||||
(128530 128532)
|
||||
(128529 128529)
|
||||
(128528 128528)
|
||||
(128527 128527)
|
||||
(128526 128526)
|
||||
(128521 128525)
|
||||
(128519 128520)
|
||||
(128513 128518)
|
||||
(128512 128512)
|
||||
(128507 128511)
|
||||
(128420 128420)
|
||||
(128405 128406)
|
||||
(128378 128378)
|
||||
(128348 128359)
|
||||
(128336 128347)
|
||||
(128331 128334)
|
||||
(128302 128317)
|
||||
(128300 128301)
|
||||
(128278 128299)
|
||||
(128277 128277)
|
||||
(128266 128276)
|
||||
(128265 128265)
|
||||
(128264 128264)
|
||||
(128260 128263)
|
||||
(128259 128259)
|
||||
(128255 128258)
|
||||
(128249 128252)
|
||||
(128248 128248)
|
||||
(128246 128247)
|
||||
(128245 128245)
|
||||
(128240 128244)
|
||||
(128239 128239)
|
||||
(128238 128238)
|
||||
(128236 128237)
|
||||
(128184 128235)
|
||||
(128182 128183)
|
||||
(128174 128181)
|
||||
(128173 128173)
|
||||
(128110 128172)
|
||||
(128108 128109)
|
||||
(128102 128107)
|
||||
(128101 128101)
|
||||
(128066 128100)
|
||||
(128064 128064)
|
||||
(128043 128062)
|
||||
(128042 128042)
|
||||
(128023 128041)
|
||||
(128022 128022)
|
||||
(128021 128021)
|
||||
(128020 128020)
|
||||
(128019 128019)
|
||||
(128017 128018)
|
||||
(128015 128016)
|
||||
(128012 128014)
|
||||
(128009 128011)
|
||||
(128008 128008)
|
||||
(127992 128007)
|
||||
(127988 127988)
|
||||
(127973 127984)
|
||||
(127972 127972)
|
||||
(127968 127971)
|
||||
(127951 127955)
|
||||
(127946 127946)
|
||||
(127945 127945)
|
||||
(127944 127944)
|
||||
(127943 127943)
|
||||
(127942 127942)
|
||||
(127941 127941)
|
||||
(127904 127940)
|
||||
(127872 127891)
|
||||
(127870 127871)
|
||||
(127868 127868)
|
||||
(127825 127867)
|
||||
(127824 127824)
|
||||
(127820 127823)
|
||||
(127819 127819)
|
||||
(127799 127818)
|
||||
(127796 127797)
|
||||
(127794 127795)
|
||||
(127792 127793)
|
||||
(127789 127791)
|
||||
(127775 127776)
|
||||
(127773 127774)
|
||||
(127772 127772)
|
||||
(127771 127771)
|
||||
(127770 127770)
|
||||
(127769 127769)
|
||||
(127766 127768)
|
||||
(127763 127765)
|
||||
(127762 127762)
|
||||
(127761 127761)
|
||||
(127760 127760)
|
||||
(127759 127759)
|
||||
(127757 127758)
|
||||
(127744 127756)
|
||||
(127568 127569)
|
||||
(127544 127546)
|
||||
(127538 127542)
|
||||
(127535 127535)
|
||||
(127514 127514)
|
||||
(127489 127489)
|
||||
(127462 127487)
|
||||
(127377 127386)
|
||||
(127374 127374)
|
||||
(127183 127183)
|
||||
(126980 126980)
|
||||
(11093 11093)
|
||||
(11088 11088)
|
||||
(11035 11036)
|
||||
(10175 10175)
|
||||
(10160 10160)
|
||||
(10133 10135)
|
||||
(10071 10071)
|
||||
(10067 10069)
|
||||
(10062 10062)
|
||||
(10060 10060)
|
||||
(10024 10024)
|
||||
(9994 9995)
|
||||
(9989 9989)
|
||||
(9981 9981)
|
||||
(9978 9978)
|
||||
(9973 9973)
|
||||
(9970 9971)
|
||||
(9962 9962)
|
||||
(9940 9940)
|
||||
(9934 9934)
|
||||
(9924 9925)
|
||||
(9917 9918)
|
||||
(9898 9899)
|
||||
(9889 9889)
|
||||
(9875 9875)
|
||||
(9855 9855)
|
||||
(9800 9811)
|
||||
(9748 9749)
|
||||
(9725 9726)
|
||||
(9203 9203)
|
||||
(9200 9200)
|
||||
(9193 9196)
|
||||
(8986 8987))
|
||||
(emoji-component
|
||||
(917536 917631)
|
||||
(129456 129459)
|
||||
|
@ -1214,7 +1213,8 @@
|
|||
(8205 8205)
|
||||
(48 57)
|
||||
(42 42)
|
||||
(35 35)))))
|
||||
(35 35))
|
||||
(emoji-modifier (127995 127999)))))
|
||||
|
||||
(define char-set:emoji (char-set))
|
||||
(define char-set:emoji-presentation (char-set))
|
1484
uniseg/charsets/graphemes.scm
Normal file
1484
uniseg/charsets/graphemes.scm
Normal file
File diff suppressed because it is too large
Load diff
1518
uniseg/graphemes.scm
1518
uniseg/graphemes.scm
File diff suppressed because it is too large
Load diff
143
uniseg/graphemes/iterator.scm
Normal file
143
uniseg/graphemes/iterator.scm
Normal file
|
@ -0,0 +1,143 @@
|
|||
(define-module (uniseg graphemes iterator)
|
||||
#:use-module (uniseg)
|
||||
#:use-module (uniseg graphemes)
|
||||
#:use-module (ice-9 match)
|
||||
#:use-module (srfi srfi-71)
|
||||
#:use-module (srfi srfi-41)
|
||||
#:export (make-grapheme-iterator))
|
||||
|
||||
(define (make-grapheme-iterator)
|
||||
"Create a grapheme iterator that accepts a character and provides grapheme records over time."
|
||||
;; The first is what state we are in, and the next is the grapheme
|
||||
;; property of the current character.
|
||||
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
|
||||
(define (state-machine cur-state cur-prop)
|
||||
(match (list cur-state cur-prop)
|
||||
;; Specifics need to go first, and then non-specifics afterwards
|
||||
;; SPECIFIC RULES - no `_' in the match
|
||||
|
||||
;; Grapheme boundary #3s
|
||||
(('carriage-return 'line-feed)
|
||||
(values 'control+line-feed #f))
|
||||
|
||||
;; Grapheme boundary #6s
|
||||
(('hangul-syllable-l 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l #f))
|
||||
((or ('hangul-syllable-l 'hangul-syllable-v)
|
||||
('hangul-syllable-l 'hangul-syllable-lv))
|
||||
(values 'hangul-syllable-lv #f))
|
||||
(('hangul-syllable-l 'hangul-syllable-lvt)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
|
||||
;; Grapheme boundary #7s
|
||||
(('hangul-syllable-lv 'hangul-syllable-v)
|
||||
(values 'hangul-syllable-lv #f))
|
||||
(('hangul-syllable-lv 'hangul-syllable-t)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
|
||||
;; Grapheme boundary #8s
|
||||
(('hangul-syllable-lvt 'hangul-syllable-t)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
|
||||
;; Grapheme boundary #11s (emoji!)
|
||||
(('extended-pictographic 'extend)
|
||||
(values 'extended-pictographic #f))
|
||||
(('extended-pictographic 'zero-width-joiner)
|
||||
(values 'extended-pictographic+zero-width-joiner #f))
|
||||
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
|
||||
(values 'extended-pictographic #f))
|
||||
|
||||
;; Grapheme boundaries #12s and #13s
|
||||
(('regional-indicator-odd 'regional-indicator)
|
||||
(values 'regioinal-indicator-even #f))
|
||||
(('regional-indicator-even 'regional-indicator)
|
||||
(values 'regional-indicator-odd #t))
|
||||
|
||||
;; NON-SPECIFIC RULES
|
||||
|
||||
;; Grapheme boundary #4n
|
||||
((or ('carriage-return _)
|
||||
('control+line-feed _))
|
||||
(values 'any #t))
|
||||
|
||||
;; Grapheme boundary #5n
|
||||
((_ 'carriage-return)
|
||||
(values 'carriage-return #t))
|
||||
((or (_ 'line-feed)
|
||||
(_ 'control))
|
||||
(values 'control+line-feed #t))
|
||||
((_ 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l #t))
|
||||
|
||||
;; Grapheme boundary #7n
|
||||
((or (_ 'hangul-syllable-lv)
|
||||
(_ 'hangul-syllable-v))
|
||||
(values 'hangul-syllable-lv #t))
|
||||
|
||||
;; Grapheme boundary #8n
|
||||
((or (_ 'hangul-syllable-lvt)
|
||||
(_ 'hangul-syllable-t))
|
||||
(values 'hangul-syllable-lvt #t))
|
||||
|
||||
;; Grapheme boundary #9n
|
||||
((or (_ 'extend)
|
||||
(_ 'zero-width-joiner))
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9n-A
|
||||
((_ 'spacing-mark)
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9n-B
|
||||
(('prepend _)
|
||||
(values 'any #f))
|
||||
((_ 'prepend)
|
||||
(values 'prepend #t))
|
||||
|
||||
;; Grapheme boundary #11n (emoji!)
|
||||
((_ 'extended-pictographic)
|
||||
(values 'extended-pictographic #t))
|
||||
|
||||
;; Grapheme boundaries #12n and #13n
|
||||
((_ 'regional-indicator)
|
||||
(values 'regional-indicator-odd #t))
|
||||
|
||||
;; Everything else considered a boundary
|
||||
(else (values 'any #t))))
|
||||
|
||||
(define %current-grapheme #f)
|
||||
|
||||
(define (set-grapheme! width delta modification? state glyphs-reverse)
|
||||
(let ((new (make-grapheme width delta modification? state glyphs-reverse)))
|
||||
(set! %current-grapheme new)
|
||||
new))
|
||||
|
||||
(define (iterate-through-grapheme glyph)
|
||||
"Grapheme iteration function. May return false, in which case it requires new characters in order to produce output."
|
||||
(if (eof-object? glyph)
|
||||
;; eof means nothing to do, clear grapheme and return false
|
||||
(begin
|
||||
(set! %current-grapheme #f)
|
||||
#f)
|
||||
(begin
|
||||
(let* ((glyph-width prop (char-width glyph))
|
||||
(cur-state (if %current-grapheme
|
||||
(grapheme-state %current-grapheme)
|
||||
'any))
|
||||
(next-state boundary? (state-machine cur-state prop)))
|
||||
|
||||
;; Boundary is between this glyph and previous
|
||||
(if boundary?
|
||||
;; If we hit a boundary with previous, we simply restart the state
|
||||
;; and output the current single glyph as a grapheme
|
||||
(set-grapheme! glyph-width glyph-width #f next-state (list glyph))
|
||||
;; If it's not a boundary with previous, we have to add to the grapheme
|
||||
;; Only possible to get a boundary if %current-grapheme is set, so assume
|
||||
(let* ((cur-width (grapheme-width %current-grapheme))
|
||||
(cur-glyphs-reverse (grapheme-glyphs-reverse %current-grapheme))
|
||||
(new-width (+ cur-width glyph-width))
|
||||
(new-glyphs-reverse (cons glyph cur-glyphs-reverse)))
|
||||
|
||||
(set-grapheme! new-width glyph-width #t next-state new-glyphs-reverse)))))))
|
||||
|
||||
iterate-through-grapheme)
|
|
@ -1,183 +1,24 @@
|
|||
(define-module (uniseg graphemes stream)
|
||||
#:use-module (uniseg)
|
||||
#:use-module (uniseg internal)
|
||||
#:use-module (uniseg graphemes iterator)
|
||||
#:use-module (ice-9 textual-ports)
|
||||
#:use-module (ice-9 match)
|
||||
#:use-module (srfi srfi-71)
|
||||
#:use-module (srfi srfi-41)
|
||||
#:use-module (srfi srfi-9 gnu)
|
||||
#:export (make-grapheme
|
||||
grapheme?
|
||||
grapheme-glyphs
|
||||
grapheme-string
|
||||
grapheme-width
|
||||
string->grapheme-stream
|
||||
#:export (string->grapheme-stream
|
||||
input->grapheme-stream))
|
||||
|
||||
;; TODO: the golang uniseg also does word and sentence boundaries. These state machines could be implemented if we wanted to.
|
||||
(define-immutable-record-type <grapheme>
|
||||
(make-grapheme width glyphs-promise string-promise)
|
||||
grapheme?
|
||||
(width grapheme-width)
|
||||
(glyphs-promise _grapheme-glyphs-promise)
|
||||
(string-promise _grapheme-string-promise))
|
||||
|
||||
(define (grapheme-glyphs grapheme)
|
||||
"Return a lazily-constructed list of glyphs in the grapheme"
|
||||
(force (_grapheme-glyphs-promise grapheme)))
|
||||
|
||||
(define (grapheme-string grapheme)
|
||||
"Return a lazily-constructed string of the glyphs in the grapheme."
|
||||
(force (_grapheme-string-promise grapheme)))
|
||||
|
||||
(define (string->grapheme-stream str)
|
||||
"Given a string, create a (lazy) stream of graphemes."
|
||||
(with-input-from-string str
|
||||
(λ () (input->grapheme-stream (current-input-port)))))
|
||||
(call-with-input-string str input->grapheme-stream))
|
||||
|
||||
(define-stream (input->grapheme-stream port)
|
||||
|
||||
(define (input->grapheme-stream port)
|
||||
"Given an input port, create a (lazy) stream of graphemes."
|
||||
;; The first is what state we are in, and the next is the grapheme
|
||||
;; property of the current character.
|
||||
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
|
||||
(define (state-machine cur-state cur-prop)
|
||||
(match (list cur-state cur-prop)
|
||||
;; Specifics need to go first, and then non-specifics afterwards
|
||||
;; SPECIFIC RULES - no `_' in the match
|
||||
(define grapheme-iterator (make-grapheme-iterator))
|
||||
|
||||
;; Grapheme boundary #3s
|
||||
(('carriage-return 'line-feed)
|
||||
(values 'control+line-feed #f))
|
||||
(define-stream (grapheme-stream)
|
||||
(define grapheme (grapheme-iterator (get-char port)))
|
||||
|
||||
;; Grapheme boundary #6s
|
||||
(('hangul-syllable-l 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l #f))
|
||||
((or ('hangul-syllable-l 'hangul-syllable-v)
|
||||
('hangul-syllable-l 'hangul-syllable-lv))
|
||||
(values 'hangul-syllable-lv #f))
|
||||
(('hangul-syllable-l 'hangul-syllable-lvt)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
(if grapheme
|
||||
(stream-cons grapheme (grapheme-stream))
|
||||
stream-null))
|
||||
|
||||
;; Grapheme boundary #7s
|
||||
(('hangul-syllable-lv 'hangul-syllable-v)
|
||||
(values 'hangul-syllable-lv #f))
|
||||
(('hangul-syllable-lv 'hangul-syllable-t)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
|
||||
;; Grapheme boundary #8s
|
||||
(('hangul-syllable-lvt 'hangul-syllable-t)
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
|
||||
;; Grapheme boundary #11s (emoji!)
|
||||
(('extended-pictographic 'extend)
|
||||
(values 'extended-pictographic #f))
|
||||
(('extended-pictographic 'zero-width-joiner)
|
||||
(values 'extended-pictographic+zero-width-joiner #f))
|
||||
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
|
||||
(values 'extended-pictographic #f))
|
||||
|
||||
;; Grapheme boundaries #12s and #13s
|
||||
(('regional-indicator-odd 'regional-indicator)
|
||||
(values 'regioinal-indicator-even #f))
|
||||
(('regional-indicator-even 'regional-indicator)
|
||||
(values 'regional-indicator-odd #t))
|
||||
|
||||
;; NON-SPECIFIC RULES
|
||||
|
||||
;; Grapheme boundary #4n
|
||||
((or ('carriage-return _)
|
||||
('control+line-feed _))
|
||||
(values 'any #t))
|
||||
|
||||
;; Grapheme boundary #5n
|
||||
((_ 'carriage-return)
|
||||
(values 'carriage-return #t))
|
||||
((or (_ 'line-feed)
|
||||
(_ 'control))
|
||||
(values 'control+line-feed #t))
|
||||
((_ 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l #t))
|
||||
|
||||
;; Grapheme boundary #7n
|
||||
((or (_ 'hangul-syllable-lv)
|
||||
(_ 'hangul-syllable-v))
|
||||
(values 'hangul-syllable-lv #t))
|
||||
|
||||
;; Grapheme boundary #8n
|
||||
((or (_ 'hangul-syllable-lvt)
|
||||
(_ 'hangul-syllable-t))
|
||||
(values 'hangul-syllable-lvt #t))
|
||||
|
||||
;; Grapheme boundary #9n
|
||||
((or (_ 'extend)
|
||||
(_ 'zero-width-joiner))
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9n-A
|
||||
((_ 'spacing-mark)
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9n-B
|
||||
(('prepend _)
|
||||
(values 'any #f))
|
||||
((_ 'prepend)
|
||||
(values 'prepend #t))
|
||||
|
||||
;; Grapheme boundary #11n (emoji!)
|
||||
((_ 'extended-pictographic)
|
||||
(pk "EXTENDED")
|
||||
(values 'extended-pictographic #t))
|
||||
|
||||
;; Grapheme boundaries #12n and #13n
|
||||
((_ 'regional-indicator)
|
||||
(values 'regional-indicator-odd #t))
|
||||
|
||||
;; Everything else considered a boundary
|
||||
(else (values 'any #t))))
|
||||
|
||||
;; State variables, initialized with the first glyph we get!
|
||||
(define first-glyph (get-char port))
|
||||
(define hit-eof? (eof-object? first-glyph))
|
||||
(define %glyphs-reverse (list first-glyph))
|
||||
(define %grapheme-width 0)
|
||||
|
||||
(define (iterate-through-grapheme state)
|
||||
(define glyph (peek-char port))
|
||||
(if (eof-object? glyph)
|
||||
(begin
|
||||
(set! hit-eof? #t)
|
||||
state)
|
||||
(begin
|
||||
(let* ((width property (char-width glyph))
|
||||
(next-state boundary? (cpk-values glyph state property '= (state-machine state property))))
|
||||
|
||||
(if boundary?
|
||||
state
|
||||
(let ((new-width (+ %grapheme-width width))
|
||||
(new-glyphs-reverse (cons glyph %glyphs-reverse)))
|
||||
|
||||
;; Officially induct this char into the cluster
|
||||
(get-char port)
|
||||
(set! %grapheme-width new-width)
|
||||
(set! %glyphs-reverse new-glyphs-reverse)
|
||||
(iterate-through-grapheme next-state)))))))
|
||||
|
||||
;; Need to explicitly get the first state
|
||||
;; Skip all this if it's an empty stream at start
|
||||
(if hit-eof?
|
||||
stream-null
|
||||
(let ((first-width first-prop (char-width first-glyph)))
|
||||
(set! %grapheme-width first-width)
|
||||
(iterate-through-grapheme (state-machine 'any first-prop))
|
||||
|
||||
(let ((grapheme (make-grapheme
|
||||
%grapheme-width
|
||||
;; Delay to avoid construction of unnecessary lists and strings!
|
||||
(delay (reverse %glyphs-reverse))
|
||||
(delay (reverse-list->string %glyphs-reverse)))))
|
||||
|
||||
;; If we hit the eof here, we need one last stream entry, otherwise iterate further
|
||||
(stream-cons grapheme
|
||||
(if hit-eof?
|
||||
stream-null
|
||||
(input->grapheme-stream port)))))))
|
||||
(grapheme-stream))
|
||||
|
|
Loading…
Reference in a new issue