Yet another reorganization, and solve Christine's 'rude problem'

- We change the stream iterator to *always* return a grapheme
  (except for EOF). The grapheme then gets built up over time.
  - This way, trans flag for example is first white flag, then
  white flag + zwj, etc until it finally transforms into the
  trans flag.
  - Users of the stream library can then use the `modification?'
  flag to determine if the stream value is a modification of the
  prior grapheme instead of a new grapheme.
  - Abstracted iteration to an iterator object to support use cases
  where we don't have an input stream (reflow needs this!)
This commit is contained in:
Vivianne 2024-03-05 11:46:32 -05:00
parent 1c0c4634f9
commit 31012d5b8f
12 changed files with 2046 additions and 2004 deletions

View file

@ -19,20 +19,27 @@
(native-language-support #f)
(licensing #f)))
(files (libraries
((scheme-file "uniseg")
(directory
((directory
"uniseg"
((scheme-file "emoji")
((directory
"charsets"
((scheme-file "emoji")
(scheme-file "eastasian")
(scheme-file "graphemes")))
(directory
"graphemes"
((scheme-file "iterator") (scheme-file "stream")))
(directory "eastasian" ((scheme-file "locale")))
(scheme-file "eastasian")
(directory "graphemes" ((scheme-file "stream")))
(scheme-file "graphemes")
(scheme-file "internal")))))
(tests ((directory
"tests"
((scheme-file "test-eastasian-locale")
(scheme-file "test-uniseg")
(scheme-file "test-graphemes-stream")))))
(scheme-file "internal")))
(scheme-file "uniseg")))
(tests
((directory
"tests"
((scheme-file "test-eastasian-locale")
(scheme-file "test-uniseg")
(scheme-file "test-graphemes-stream")
(scheme-file "test-graphemes-iterator")))))
(programs
((directory
"scripts"

View file

@ -44,7 +44,7 @@
(λ (prop) (symbol-with-prefix "char-set:eastasian-" prop))
eastasian-properties))
(define file "uniseg/eastasian.scm")
(define file "uniseg/charsets/eastasian.scm")
(format stdout "Writing to ~a...\n" file)
@ -53,7 +53,7 @@
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
(pretty-print
`(define-module (uniseg eastasian)
`(define-module (uniseg charsets eastasian)
#:use-module (uniseg internal)
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)

View file

@ -40,7 +40,7 @@
("Emoji_Component" 'emoji-component)
("Extended_Pictographic" 'emoji-extended-pictographic)))
(define file "uniseg/emoji.scm")
(define file "uniseg/charsets/emoji.scm")
(format stdout "Writing to ~a...\n" file)
@ -49,7 +49,7 @@
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
(pretty-print
`(define-module (uniseg emoji)
`(define-module (uniseg charsets emoji)
#:use-module (uniseg internal)
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)

View file

@ -56,7 +56,7 @@
("SpacingMark" 'spacing-mark)
("ZWJ" 'zero-width-joiner)))
(define file "uniseg/graphemes.scm")
(define file "uniseg/charsets/graphemes.scm")
(format stdout "Writing to ~a...\n" file)
@ -65,11 +65,11 @@
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
(pretty-print
`(define-module (uniseg graphemes)
`(define-module (uniseg charsets graphemes)
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
#:use-module (uniseg internal)
#:use-module (uniseg emoji)
#:use-module (uniseg charsets emoji)
#:export (,@grapheme-symbols
grapheme-charsets)))

View file

@ -1,5 +1,7 @@
(define-module (tests test-graphemes-stream)
#:use-module (uniseg graphemes)
#:use-module (uniseg graphemes stream)
#:use-module (uniseg internal)
#:use-module (srfi srfi-41)
#:use-module (srfi srfi-64))
@ -10,11 +12,10 @@
(define* (advance-stream! #:optional (times 1))
(for-each
(λ (_)
(set! stream (stream-cdr stream)))
(λ (_) (set! stream (stream-cdr stream)))
(make-list times)))
(advance-stream! 6)
(advance-stream! 10)
(define trans-flag-grapheme (stream-car stream))
@ -56,4 +57,11 @@
(test-equal "a stream of nothing resolves to an empty stream"
#t (stream-null? empty-stream))
(define singleton-stream (string->grapheme-stream "a"))
(set! a-grapheme (stream-car stream))
(test-equal "a stream with a single character resolves to a grapheme"
"a" (grapheme-string a-grapheme))
(test-end "tests-graphemes-stream")

View file

@ -2,10 +2,11 @@
#:use-module (srfi srfi-1)
#:use-module (ice-9 match)
#:use-module (srfi srfi-41)
#:use-module (uniseg emoji)
#:use-module (uniseg graphemes)
#:use-module (uniseg graphemes stream)
#:use-module (uniseg eastasian)
#:use-module (uniseg charsets emoji)
#:use-module (uniseg charsets eastasian)
#:use-module (uniseg charsets graphemes)
#:export (emoji?
char->grapheme-property
char->eastasian-property
@ -99,6 +100,6 @@
"Get the width of a string by adding up the widths of each grapheme"
(stream-fold
(λ (val grapheme)
(+ val (grapheme-width grapheme)))
(+ val (grapheme-delta-width grapheme)))
0
(string->grapheme-stream str)))

View file

@ -1,7 +1,7 @@
;; Code generated by generate-eastasian. DO NOT EDIT
(define-module
(uniseg eastasian)
(uniseg charsets eastasian)
#:use-module
(uniseg internal)
#:use-module
@ -18,7 +18,50 @@
eastasian-charsets))
(define hashtable
(alist->hashq-table
'((combining
'((narrow
(10630 10630)
(10629 10629)
(10221 10221)
(10220 10220)
(10219 10219)
(10218 10218)
(10217 10217)
(10216 10216)
(10215 10215)
(10214 10214)
(175 175)
(172 172)
(166 166)
(165 165)
(162 163)
(126 126)
(125 125)
(124 124)
(123 123)
(97 122)
(96 96)
(95 95)
(94 94)
(93 93)
(92 92)
(91 91)
(65 90)
(63 64)
(60 62)
(58 59)
(48 57)
(46 47)
(45 45)
(44 44)
(43 43)
(42 42)
(41 41)
(40 40)
(37 39)
(36 36)
(33 35)
(32 32))
(combining
(125136 125142)
(122918 122922)
(122915 122916)
@ -71,67 +114,6 @@
(1160 1161)
(1155 1159)
(768 879))
(halfwidth
(65517 65518)
(65513 65516)
(65512 65512)
(65498 65500)
(65490 65495)
(65482 65487)
(65474 65479)
(65440 65470)
(65438 65439)
(65393 65437)
(65392 65392)
(65382 65391)
(65380 65381)
(65379 65379)
(65378 65378)
(65377 65377)
(8361 8361))
(narrow
(10630 10630)
(10629 10629)
(10221 10221)
(10220 10220)
(10219 10219)
(10218 10218)
(10217 10217)
(10216 10216)
(10215 10215)
(10214 10214)
(175 175)
(172 172)
(166 166)
(165 165)
(162 163)
(126 126)
(125 125)
(124 124)
(123 123)
(97 122)
(96 96)
(95 95)
(94 94)
(93 93)
(92 92)
(91 91)
(65 90)
(63 64)
(60 62)
(58 59)
(48 57)
(46 47)
(45 45)
(44 44)
(43 43)
(42 42)
(41 41)
(40 40)
(37 39)
(36 36)
(33 35)
(32 32))
(ambiguous
(1048576 1114109)
(983040 1048573)
@ -330,6 +312,24 @@
(167 167)
(164 164)
(161 161))
(halfwidth
(65517 65518)
(65513 65516)
(65512 65512)
(65498 65500)
(65490 65495)
(65482 65487)
(65474 65479)
(65440 65470)
(65438 65439)
(65393 65437)
(65392 65392)
(65382 65391)
(65380 65381)
(65379 65379)
(65378 65378)
(65377 65377)
(8361 8361))
(neutral
(917536 917631)
(917505 917505)

View file

@ -1,7 +1,7 @@
;; Code generated by generate-emoji. DO NOT EDIT
(define-module
(uniseg emoji)
(uniseg charsets emoji)
#:use-module
(uniseg internal)
#:use-module
@ -18,7 +18,269 @@
emoji-charsets))
(define hashtable
(alist->hashq-table
'((emoji-modifier-base
'((emoji-presentation
(129744 129750)
(129728 129730)
(129712 129718)
(129686 129704)
(129680 129685)
(129667 129670)
(129664 129666)
(129656 129658)
(129652 129652)
(129648 129651)
(129511 129535)
(129488 129510)
(129485 129487)
(129483 129483)
(129475 129482)
(129473 129474)
(129472 129472)
(129466 129471)
(129456 129465)
(129454 129455)
(129451 129453)
(129445 129450)
(129443 129444)
(129432 129442)
(129426 129431)
(129413 129425)
(129408 129412)
(129404 129407)
(129403 129403)
(129402 129402)
(129399 129400)
(129395 129398)
(129394 129394)
(129393 129393)
(129388 129392)
(129375 129387)
(129360 129374)
(129357 129359)
(129356 129356)
(129351 129355)
(129344 129349)
(129343 129343)
(129340 129342)
(129331 129338)
(129329 129330)
(129328 129328)
(129320 129327)
(129312 129319)
(129311 129311)
(129305 129310)
(129296 129304)
(129293 129295)
(129292 129292)
(128992 129003)
(128763 128764)
(128762 128762)
(128761 128761)
(128759 128760)
(128756 128758)
(128747 128748)
(128726 128727)
(128725 128725)
(128721 128722)
(128720 128720)
(128716 128716)
(128705 128709)
(128704 128704)
(128703 128703)
(128697 128702)
(128695 128696)
(128694 128694)
(128691 128693)
(128690 128690)
(128686 128689)
(128679 128685)
(128678 128678)
(128676 128677)
(128675 128675)
(128674 128674)
(128667 128673)
(128665 128666)
(128664 128664)
(128663 128663)
(128662 128662)
(128661 128661)
(128660 128660)
(128657 128659)
(128656 128656)
(128655 128655)
(128654 128654)
(128653 128653)
(128652 128652)
(128650 128651)
(128649 128649)
(128648 128648)
(128647 128647)
(128646 128646)
(128643 128645)
(128641 128642)
(128640 128640)
(128581 128591)
(128577 128580)
(128567 128576)
(128566 128566)
(128565 128565)
(128564 128564)
(128560 128563)
(128558 128559)
(128557 128557)
(128556 128556)
(128552 128555)
(128550 128551)
(128544 128549)
(128543 128543)
(128540 128542)
(128539 128539)
(128538 128538)
(128537 128537)
(128536 128536)
(128535 128535)
(128534 128534)
(128533 128533)
(128530 128532)
(128529 128529)
(128528 128528)
(128527 128527)
(128526 128526)
(128521 128525)
(128519 128520)
(128513 128518)
(128512 128512)
(128507 128511)
(128420 128420)
(128405 128406)
(128378 128378)
(128348 128359)
(128336 128347)
(128331 128334)
(128302 128317)
(128300 128301)
(128278 128299)
(128277 128277)
(128266 128276)
(128265 128265)
(128264 128264)
(128260 128263)
(128259 128259)
(128255 128258)
(128249 128252)
(128248 128248)
(128246 128247)
(128245 128245)
(128240 128244)
(128239 128239)
(128238 128238)
(128236 128237)
(128184 128235)
(128182 128183)
(128174 128181)
(128173 128173)
(128110 128172)
(128108 128109)
(128102 128107)
(128101 128101)
(128066 128100)
(128064 128064)
(128043 128062)
(128042 128042)
(128023 128041)
(128022 128022)
(128021 128021)
(128020 128020)
(128019 128019)
(128017 128018)
(128015 128016)
(128012 128014)
(128009 128011)
(128008 128008)
(127992 128007)
(127988 127988)
(127973 127984)
(127972 127972)
(127968 127971)
(127951 127955)
(127946 127946)
(127945 127945)
(127944 127944)
(127943 127943)
(127942 127942)
(127941 127941)
(127904 127940)
(127872 127891)
(127870 127871)
(127868 127868)
(127825 127867)
(127824 127824)
(127820 127823)
(127819 127819)
(127799 127818)
(127796 127797)
(127794 127795)
(127792 127793)
(127789 127791)
(127775 127776)
(127773 127774)
(127772 127772)
(127771 127771)
(127770 127770)
(127769 127769)
(127766 127768)
(127763 127765)
(127762 127762)
(127761 127761)
(127760 127760)
(127759 127759)
(127757 127758)
(127744 127756)
(127568 127569)
(127544 127546)
(127538 127542)
(127535 127535)
(127514 127514)
(127489 127489)
(127462 127487)
(127377 127386)
(127374 127374)
(127183 127183)
(126980 126980)
(11093 11093)
(11088 11088)
(11035 11036)
(10175 10175)
(10160 10160)
(10133 10135)
(10071 10071)
(10067 10069)
(10062 10062)
(10060 10060)
(10024 10024)
(9994 9995)
(9989 9989)
(9981 9981)
(9978 9978)
(9973 9973)
(9970 9971)
(9962 9962)
(9940 9940)
(9934 9934)
(9924 9925)
(9917 9918)
(9898 9899)
(9889 9889)
(9875 9875)
(9855 9855)
(9800 9811)
(9748 9749)
(9725 9726)
(9203 9203)
(9200 9200)
(9193 9196)
(8986 8987))
(emoji-modifier-base
(129489 129501)
(129485 129487)
(129467 129467)
@ -449,7 +711,6 @@
(48 57)
(42 42)
(35 35))
(emoji-modifier (127995 127999))
(emoji-extended-pictographic
(130048 131069)
(129751 129791)
@ -942,268 +1203,6 @@
(8252 8252)
(174 174)
(169 169))
(emoji-presentation
(129744 129750)
(129728 129730)
(129712 129718)
(129686 129704)
(129680 129685)
(129667 129670)
(129664 129666)
(129656 129658)
(129652 129652)
(129648 129651)
(129511 129535)
(129488 129510)
(129485 129487)
(129483 129483)
(129475 129482)
(129473 129474)
(129472 129472)
(129466 129471)
(129456 129465)
(129454 129455)
(129451 129453)
(129445 129450)
(129443 129444)
(129432 129442)
(129426 129431)
(129413 129425)
(129408 129412)
(129404 129407)
(129403 129403)
(129402 129402)
(129399 129400)
(129395 129398)
(129394 129394)
(129393 129393)
(129388 129392)
(129375 129387)
(129360 129374)
(129357 129359)
(129356 129356)
(129351 129355)
(129344 129349)
(129343 129343)
(129340 129342)
(129331 129338)
(129329 129330)
(129328 129328)
(129320 129327)
(129312 129319)
(129311 129311)
(129305 129310)
(129296 129304)
(129293 129295)
(129292 129292)
(128992 129003)
(128763 128764)
(128762 128762)
(128761 128761)
(128759 128760)
(128756 128758)
(128747 128748)
(128726 128727)
(128725 128725)
(128721 128722)
(128720 128720)
(128716 128716)
(128705 128709)
(128704 128704)
(128703 128703)
(128697 128702)
(128695 128696)
(128694 128694)
(128691 128693)
(128690 128690)
(128686 128689)
(128679 128685)
(128678 128678)
(128676 128677)
(128675 128675)
(128674 128674)
(128667 128673)
(128665 128666)
(128664 128664)
(128663 128663)
(128662 128662)
(128661 128661)
(128660 128660)
(128657 128659)
(128656 128656)
(128655 128655)
(128654 128654)
(128653 128653)
(128652 128652)
(128650 128651)
(128649 128649)
(128648 128648)
(128647 128647)
(128646 128646)
(128643 128645)
(128641 128642)
(128640 128640)
(128581 128591)
(128577 128580)
(128567 128576)
(128566 128566)
(128565 128565)
(128564 128564)
(128560 128563)
(128558 128559)
(128557 128557)
(128556 128556)
(128552 128555)
(128550 128551)
(128544 128549)
(128543 128543)
(128540 128542)
(128539 128539)
(128538 128538)
(128537 128537)
(128536 128536)
(128535 128535)
(128534 128534)
(128533 128533)
(128530 128532)
(128529 128529)
(128528 128528)
(128527 128527)
(128526 128526)
(128521 128525)
(128519 128520)
(128513 128518)
(128512 128512)
(128507 128511)
(128420 128420)
(128405 128406)
(128378 128378)
(128348 128359)
(128336 128347)
(128331 128334)
(128302 128317)
(128300 128301)
(128278 128299)
(128277 128277)
(128266 128276)
(128265 128265)
(128264 128264)
(128260 128263)
(128259 128259)
(128255 128258)
(128249 128252)
(128248 128248)
(128246 128247)
(128245 128245)
(128240 128244)
(128239 128239)
(128238 128238)
(128236 128237)
(128184 128235)
(128182 128183)
(128174 128181)
(128173 128173)
(128110 128172)
(128108 128109)
(128102 128107)
(128101 128101)
(128066 128100)
(128064 128064)
(128043 128062)
(128042 128042)
(128023 128041)
(128022 128022)
(128021 128021)
(128020 128020)
(128019 128019)
(128017 128018)
(128015 128016)
(128012 128014)
(128009 128011)
(128008 128008)
(127992 128007)
(127988 127988)
(127973 127984)
(127972 127972)
(127968 127971)
(127951 127955)
(127946 127946)
(127945 127945)
(127944 127944)
(127943 127943)
(127942 127942)
(127941 127941)
(127904 127940)
(127872 127891)
(127870 127871)
(127868 127868)
(127825 127867)
(127824 127824)
(127820 127823)
(127819 127819)
(127799 127818)
(127796 127797)
(127794 127795)
(127792 127793)
(127789 127791)
(127775 127776)
(127773 127774)
(127772 127772)
(127771 127771)
(127770 127770)
(127769 127769)
(127766 127768)
(127763 127765)
(127762 127762)
(127761 127761)
(127760 127760)
(127759 127759)
(127757 127758)
(127744 127756)
(127568 127569)
(127544 127546)
(127538 127542)
(127535 127535)
(127514 127514)
(127489 127489)
(127462 127487)
(127377 127386)
(127374 127374)
(127183 127183)
(126980 126980)
(11093 11093)
(11088 11088)
(11035 11036)
(10175 10175)
(10160 10160)
(10133 10135)
(10071 10071)
(10067 10069)
(10062 10062)
(10060 10060)
(10024 10024)
(9994 9995)
(9989 9989)
(9981 9981)
(9978 9978)
(9973 9973)
(9970 9971)
(9962 9962)
(9940 9940)
(9934 9934)
(9924 9925)
(9917 9918)
(9898 9899)
(9889 9889)
(9875 9875)
(9855 9855)
(9800 9811)
(9748 9749)
(9725 9726)
(9203 9203)
(9200 9200)
(9193 9196)
(8986 8987))
(emoji-component
(917536 917631)
(129456 129459)
@ -1214,7 +1213,8 @@
(8205 8205)
(48 57)
(42 42)
(35 35)))))
(35 35))
(emoji-modifier (127995 127999)))))
(define char-set:emoji (char-set))
(define char-set:emoji-presentation (char-set))

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,143 @@
(define-module (uniseg graphemes iterator)
#:use-module (uniseg)
#:use-module (uniseg graphemes)
#:use-module (ice-9 match)
#:use-module (srfi srfi-71)
#:use-module (srfi srfi-41)
#:export (make-grapheme-iterator))
(define (make-grapheme-iterator)
"Create a grapheme iterator that accepts a character and provides grapheme records over time."
;; The first is what state we are in, and the next is the grapheme
;; property of the current character.
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
(define (state-machine cur-state cur-prop)
(match (list cur-state cur-prop)
;; Specifics need to go first, and then non-specifics afterwards
;; SPECIFIC RULES - no `_' in the match
;; Grapheme boundary #3s
(('carriage-return 'line-feed)
(values 'control+line-feed #f))
;; Grapheme boundary #6s
(('hangul-syllable-l 'hangul-syllable-l)
(values 'hangul-syllable-l #f))
((or ('hangul-syllable-l 'hangul-syllable-v)
('hangul-syllable-l 'hangul-syllable-lv))
(values 'hangul-syllable-lv #f))
(('hangul-syllable-l 'hangul-syllable-lvt)
(values 'hangul-syllable-lvt #f))
;; Grapheme boundary #7s
(('hangul-syllable-lv 'hangul-syllable-v)
(values 'hangul-syllable-lv #f))
(('hangul-syllable-lv 'hangul-syllable-t)
(values 'hangul-syllable-lvt #f))
;; Grapheme boundary #8s
(('hangul-syllable-lvt 'hangul-syllable-t)
(values 'hangul-syllable-lvt #f))
;; Grapheme boundary #11s (emoji!)
(('extended-pictographic 'extend)
(values 'extended-pictographic #f))
(('extended-pictographic 'zero-width-joiner)
(values 'extended-pictographic+zero-width-joiner #f))
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
(values 'extended-pictographic #f))
;; Grapheme boundaries #12s and #13s
(('regional-indicator-odd 'regional-indicator)
(values 'regioinal-indicator-even #f))
(('regional-indicator-even 'regional-indicator)
(values 'regional-indicator-odd #t))
;; NON-SPECIFIC RULES
;; Grapheme boundary #4n
((or ('carriage-return _)
('control+line-feed _))
(values 'any #t))
;; Grapheme boundary #5n
((_ 'carriage-return)
(values 'carriage-return #t))
((or (_ 'line-feed)
(_ 'control))
(values 'control+line-feed #t))
((_ 'hangul-syllable-l)
(values 'hangul-syllable-l #t))
;; Grapheme boundary #7n
((or (_ 'hangul-syllable-lv)
(_ 'hangul-syllable-v))
(values 'hangul-syllable-lv #t))
;; Grapheme boundary #8n
((or (_ 'hangul-syllable-lvt)
(_ 'hangul-syllable-t))
(values 'hangul-syllable-lvt #t))
;; Grapheme boundary #9n
((or (_ 'extend)
(_ 'zero-width-joiner))
(values 'any #f))
;; Grapheme boundary #9n-A
((_ 'spacing-mark)
(values 'any #f))
;; Grapheme boundary #9n-B
(('prepend _)
(values 'any #f))
((_ 'prepend)
(values 'prepend #t))
;; Grapheme boundary #11n (emoji!)
((_ 'extended-pictographic)
(values 'extended-pictographic #t))
;; Grapheme boundaries #12n and #13n
((_ 'regional-indicator)
(values 'regional-indicator-odd #t))
;; Everything else considered a boundary
(else (values 'any #t))))
(define %current-grapheme #f)
(define (set-grapheme! width delta modification? state glyphs-reverse)
(let ((new (make-grapheme width delta modification? state glyphs-reverse)))
(set! %current-grapheme new)
new))
(define (iterate-through-grapheme glyph)
"Grapheme iteration function. May return false, in which case it requires new characters in order to produce output."
(if (eof-object? glyph)
;; eof means nothing to do, clear grapheme and return false
(begin
(set! %current-grapheme #f)
#f)
(begin
(let* ((glyph-width prop (char-width glyph))
(cur-state (if %current-grapheme
(grapheme-state %current-grapheme)
'any))
(next-state boundary? (state-machine cur-state prop)))
;; Boundary is between this glyph and previous
(if boundary?
;; If we hit a boundary with previous, we simply restart the state
;; and output the current single glyph as a grapheme
(set-grapheme! glyph-width glyph-width #f next-state (list glyph))
;; If it's not a boundary with previous, we have to add to the grapheme
;; Only possible to get a boundary if %current-grapheme is set, so assume
(let* ((cur-width (grapheme-width %current-grapheme))
(cur-glyphs-reverse (grapheme-glyphs-reverse %current-grapheme))
(new-width (+ cur-width glyph-width))
(new-glyphs-reverse (cons glyph cur-glyphs-reverse)))
(set-grapheme! new-width glyph-width #t next-state new-glyphs-reverse)))))))
iterate-through-grapheme)

View file

@ -1,183 +1,24 @@
(define-module (uniseg graphemes stream)
#:use-module (uniseg)
#:use-module (uniseg internal)
#:use-module (uniseg graphemes iterator)
#:use-module (ice-9 textual-ports)
#:use-module (ice-9 match)
#:use-module (srfi srfi-71)
#:use-module (srfi srfi-41)
#:use-module (srfi srfi-9 gnu)
#:export (make-grapheme
grapheme?
grapheme-glyphs
grapheme-string
grapheme-width
string->grapheme-stream
#:export (string->grapheme-stream
input->grapheme-stream))
;; TODO: the golang uniseg also does word and sentence boundaries. These state machines could be implemented if we wanted to.
(define-immutable-record-type <grapheme>
(make-grapheme width glyphs-promise string-promise)
grapheme?
(width grapheme-width)
(glyphs-promise _grapheme-glyphs-promise)
(string-promise _grapheme-string-promise))
(define (grapheme-glyphs grapheme)
"Return a lazily-constructed list of glyphs in the grapheme"
(force (_grapheme-glyphs-promise grapheme)))
(define (grapheme-string grapheme)
"Return a lazily-constructed string of the glyphs in the grapheme."
(force (_grapheme-string-promise grapheme)))
(define (string->grapheme-stream str)
"Given a string, create a (lazy) stream of graphemes."
(with-input-from-string str
(λ () (input->grapheme-stream (current-input-port)))))
(call-with-input-string str input->grapheme-stream))
(define-stream (input->grapheme-stream port)
(define (input->grapheme-stream port)
"Given an input port, create a (lazy) stream of graphemes."
;; The first is what state we are in, and the next is the grapheme
;; property of the current character.
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
(define (state-machine cur-state cur-prop)
(match (list cur-state cur-prop)
;; Specifics need to go first, and then non-specifics afterwards
;; SPECIFIC RULES - no `_' in the match
(define grapheme-iterator (make-grapheme-iterator))
;; Grapheme boundary #3s
(('carriage-return 'line-feed)
(values 'control+line-feed #f))
(define-stream (grapheme-stream)
(define grapheme (grapheme-iterator (get-char port)))
;; Grapheme boundary #6s
(('hangul-syllable-l 'hangul-syllable-l)
(values 'hangul-syllable-l #f))
((or ('hangul-syllable-l 'hangul-syllable-v)
('hangul-syllable-l 'hangul-syllable-lv))
(values 'hangul-syllable-lv #f))
(('hangul-syllable-l 'hangul-syllable-lvt)
(values 'hangul-syllable-lvt #f))
(if grapheme
(stream-cons grapheme (grapheme-stream))
stream-null))
;; Grapheme boundary #7s
(('hangul-syllable-lv 'hangul-syllable-v)
(values 'hangul-syllable-lv #f))
(('hangul-syllable-lv 'hangul-syllable-t)
(values 'hangul-syllable-lvt #f))
;; Grapheme boundary #8s
(('hangul-syllable-lvt 'hangul-syllable-t)
(values 'hangul-syllable-lvt #f))
;; Grapheme boundary #11s (emoji!)
(('extended-pictographic 'extend)
(values 'extended-pictographic #f))
(('extended-pictographic 'zero-width-joiner)
(values 'extended-pictographic+zero-width-joiner #f))
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
(values 'extended-pictographic #f))
;; Grapheme boundaries #12s and #13s
(('regional-indicator-odd 'regional-indicator)
(values 'regioinal-indicator-even #f))
(('regional-indicator-even 'regional-indicator)
(values 'regional-indicator-odd #t))
;; NON-SPECIFIC RULES
;; Grapheme boundary #4n
((or ('carriage-return _)
('control+line-feed _))
(values 'any #t))
;; Grapheme boundary #5n
((_ 'carriage-return)
(values 'carriage-return #t))
((or (_ 'line-feed)
(_ 'control))
(values 'control+line-feed #t))
((_ 'hangul-syllable-l)
(values 'hangul-syllable-l #t))
;; Grapheme boundary #7n
((or (_ 'hangul-syllable-lv)
(_ 'hangul-syllable-v))
(values 'hangul-syllable-lv #t))
;; Grapheme boundary #8n
((or (_ 'hangul-syllable-lvt)
(_ 'hangul-syllable-t))
(values 'hangul-syllable-lvt #t))
;; Grapheme boundary #9n
((or (_ 'extend)
(_ 'zero-width-joiner))
(values 'any #f))
;; Grapheme boundary #9n-A
((_ 'spacing-mark)
(values 'any #f))
;; Grapheme boundary #9n-B
(('prepend _)
(values 'any #f))
((_ 'prepend)
(values 'prepend #t))
;; Grapheme boundary #11n (emoji!)
((_ 'extended-pictographic)
(pk "EXTENDED")
(values 'extended-pictographic #t))
;; Grapheme boundaries #12n and #13n
((_ 'regional-indicator)
(values 'regional-indicator-odd #t))
;; Everything else considered a boundary
(else (values 'any #t))))
;; State variables, initialized with the first glyph we get!
(define first-glyph (get-char port))
(define hit-eof? (eof-object? first-glyph))
(define %glyphs-reverse (list first-glyph))
(define %grapheme-width 0)
(define (iterate-through-grapheme state)
(define glyph (peek-char port))
(if (eof-object? glyph)
(begin
(set! hit-eof? #t)
state)
(begin
(let* ((width property (char-width glyph))
(next-state boundary? (cpk-values glyph state property '= (state-machine state property))))
(if boundary?
state
(let ((new-width (+ %grapheme-width width))
(new-glyphs-reverse (cons glyph %glyphs-reverse)))
;; Officially induct this char into the cluster
(get-char port)
(set! %grapheme-width new-width)
(set! %glyphs-reverse new-glyphs-reverse)
(iterate-through-grapheme next-state)))))))
;; Need to explicitly get the first state
;; Skip all this if it's an empty stream at start
(if hit-eof?
stream-null
(let ((first-width first-prop (char-width first-glyph)))
(set! %grapheme-width first-width)
(iterate-through-grapheme (state-machine 'any first-prop))
(let ((grapheme (make-grapheme
%grapheme-width
;; Delay to avoid construction of unnecessary lists and strings!
(delay (reverse %glyphs-reverse))
(delay (reverse-list->string %glyphs-reverse)))))
;; If we hit the eof here, we need one last stream entry, otherwise iterate further
(stream-cons grapheme
(if hit-eof?
stream-null
(input->grapheme-stream port)))))))
(grapheme-stream))