From 31012d5b8f0dce37e4938d431fdfbe4acdbd0321 Mon Sep 17 00:00:00 2001 From: Vivianne Langdon Date: Tue, 5 Mar 2024 11:46:32 -0500 Subject: [PATCH] Yet another reorganization, and solve Christine's 'rude problem' - We change the stream iterator to *always* return a grapheme (except for EOF). The grapheme then gets built up over time. - This way, trans flag for example is first white flag, then white flag + zwj, etc until it finally transforms into the trans flag. - Users of the stream library can then use the `modification?' flag to determine if the stream value is a modification of the prior grapheme instead of a new grapheme. - Abstracted iteration to an iterator object to support use cases where we don't have an input stream (reflow needs this!) --- hall.scm | 29 +- scripts/generate-eastasian.in | 4 +- scripts/generate-emoji.in | 4 +- scripts/generate-graphemes.in | 6 +- tests/test-graphemes-stream.scm | 14 +- uniseg.scm | 7 +- uniseg/{ => charsets}/eastasian.scm | 126 +-- uniseg/{ => charsets}/emoji.scm | 532 +++++----- uniseg/charsets/graphemes.scm | 1484 ++++++++++++++++++++++++++ uniseg/graphemes.scm | 1518 +-------------------------- uniseg/graphemes/iterator.scm | 143 +++ uniseg/graphemes/stream.scm | 183 +--- 12 files changed, 2046 insertions(+), 2004 deletions(-) rename uniseg/{ => charsets}/eastasian.scm (99%) rename uniseg/{ => charsets}/emoji.scm (99%) create mode 100644 uniseg/charsets/graphemes.scm create mode 100644 uniseg/graphemes/iterator.scm diff --git a/hall.scm b/hall.scm index 312ea40..dd3880b 100644 --- a/hall.scm +++ b/hall.scm @@ -19,20 +19,27 @@ (native-language-support #f) (licensing #f))) (files (libraries - ((scheme-file "uniseg") - (directory + ((directory "uniseg" - ((scheme-file "emoji") + ((directory + "charsets" + ((scheme-file "emoji") + (scheme-file "eastasian") + (scheme-file "graphemes"))) + (directory + "graphemes" + ((scheme-file "iterator") (scheme-file "stream"))) (directory "eastasian" ((scheme-file "locale"))) - 
(scheme-file "eastasian") - (directory "graphemes" ((scheme-file "stream"))) (scheme-file "graphemes") - (scheme-file "internal"))))) - (tests ((directory - "tests" - ((scheme-file "test-eastasian-locale") - (scheme-file "test-uniseg") - (scheme-file "test-graphemes-stream"))))) + (scheme-file "internal"))) + (scheme-file "uniseg"))) + (tests + ((directory + "tests" + ((scheme-file "test-eastasian-locale") + (scheme-file "test-uniseg") + (scheme-file "test-graphemes-stream") + (scheme-file "test-graphemes-iterator"))))) (programs ((directory "scripts" diff --git a/scripts/generate-eastasian.in b/scripts/generate-eastasian.in index 13027ef..c74614e 100644 --- a/scripts/generate-eastasian.in +++ b/scripts/generate-eastasian.in @@ -44,7 +44,7 @@ (λ (prop) (symbol-with-prefix "char-set:eastasian-" prop)) eastasian-properties)) -(define file "uniseg/eastasian.scm") +(define file "uniseg/charsets/eastasian.scm") (format stdout "Writing to ~a...\n" file) @@ -53,7 +53,7 @@ (format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename))) (pretty-print - `(define-module (uniseg eastasian) + `(define-module (uniseg charsets eastasian) #:use-module (uniseg internal) #:use-module (ice-9 hash-table) #:use-module (srfi srfi-1) diff --git a/scripts/generate-emoji.in b/scripts/generate-emoji.in index 31d90f9..0ec67e6 100644 --- a/scripts/generate-emoji.in +++ b/scripts/generate-emoji.in @@ -40,7 +40,7 @@ ("Emoji_Component" 'emoji-component) ("Extended_Pictographic" 'emoji-extended-pictographic))) -(define file "uniseg/emoji.scm") +(define file "uniseg/charsets/emoji.scm") (format stdout "Writing to ~a...\n" file) @@ -49,7 +49,7 @@ (format #t ";; Code generated by ~a. 
DO NOT EDIT\n\n" (basename (current-filename))) (pretty-print - `(define-module (uniseg emoji) + `(define-module (uniseg charsets emoji) #:use-module (uniseg internal) #:use-module (ice-9 hash-table) #:use-module (srfi srfi-1) diff --git a/scripts/generate-graphemes.in b/scripts/generate-graphemes.in index fd256ff..f14fdd5 100644 --- a/scripts/generate-graphemes.in +++ b/scripts/generate-graphemes.in @@ -56,7 +56,7 @@ ("SpacingMark" 'spacing-mark) ("ZWJ" 'zero-width-joiner))) -(define file "uniseg/graphemes.scm") +(define file "uniseg/charsets/graphemes.scm") (format stdout "Writing to ~a...\n" file) @@ -65,11 +65,11 @@ (format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename))) (pretty-print - `(define-module (uniseg graphemes) + `(define-module (uniseg charsets graphemes) #:use-module (ice-9 hash-table) #:use-module (srfi srfi-1) #:use-module (uniseg internal) - #:use-module (uniseg emoji) + #:use-module (uniseg charsets emoji) #:export (,@grapheme-symbols grapheme-charsets))) diff --git a/tests/test-graphemes-stream.scm b/tests/test-graphemes-stream.scm index 32d9ff4..2c11f92 100644 --- a/tests/test-graphemes-stream.scm +++ b/tests/test-graphemes-stream.scm @@ -1,5 +1,7 @@ (define-module (tests test-graphemes-stream) + #:use-module (uniseg graphemes) #:use-module (uniseg graphemes stream) + #:use-module (uniseg internal) #:use-module (srfi srfi-41) #:use-module (srfi srfi-64)) @@ -10,11 +12,10 @@ (define* (advance-stream! #:optional (times 1)) (for-each - (λ (_) - (set! stream (stream-cdr stream))) + (λ (_) (set! stream (stream-cdr stream))) (make-list times))) -(advance-stream! 6) +(advance-stream! 10) (define trans-flag-grapheme (stream-car stream)) @@ -56,4 +57,11 @@ (test-equal "a stream of nothing resolves to an empty stream" #t (stream-null? empty-stream)) +(define singleton-stream (string->grapheme-stream "a")) + +(define 
a-grapheme (stream-car singleton-stream)) + +(test-equal "a stream with a single character resolves to a grapheme" + "a" (grapheme-string a-grapheme)) + (test-end "tests-graphemes-stream") diff --git a/uniseg.scm b/uniseg.scm index 740b9b2..719092d 100644 --- a/uniseg.scm +++ b/uniseg.scm @@ -2,10 +2,11 @@ #:use-module (srfi srfi-1) #:use-module (ice-9 match) #:use-module (srfi srfi-41) - #:use-module (uniseg emoji) #:use-module (uniseg graphemes) #:use-module (uniseg graphemes stream) - #:use-module (uniseg eastasian) + #:use-module (uniseg charsets emoji) + #:use-module (uniseg charsets eastasian) + #:use-module (uniseg charsets graphemes) #:export (emoji? char->grapheme-property char->eastasian-property @@ -99,6 +100,6 @@ "Get the width of a string by adding up the widths of each grapheme" (stream-fold (λ (val grapheme) - (+ val (grapheme-width grapheme))) + (+ val (grapheme-delta-width grapheme))) 0 (string->grapheme-stream str))) diff --git a/uniseg/eastasian.scm b/uniseg/charsets/eastasian.scm similarity index 99% rename from uniseg/eastasian.scm rename to uniseg/charsets/eastasian.scm index 26d7241..34aef33 100644 --- a/uniseg/eastasian.scm +++ b/uniseg/charsets/eastasian.scm @@ -1,7 +1,7 @@ ;; Code generated by generate-eastasian. 
DO NOT EDIT (define-module - (uniseg eastasian) + (uniseg charsets eastasian) #:use-module (uniseg internal) #:use-module @@ -18,7 +18,50 @@ eastasian-charsets)) (define hashtable (alist->hashq-table - '((combining + '((narrow + (10630 10630) + (10629 10629) + (10221 10221) + (10220 10220) + (10219 10219) + (10218 10218) + (10217 10217) + (10216 10216) + (10215 10215) + (10214 10214) + (175 175) + (172 172) + (166 166) + (165 165) + (162 163) + (126 126) + (125 125) + (124 124) + (123 123) + (97 122) + (96 96) + (95 95) + (94 94) + (93 93) + (92 92) + (91 91) + (65 90) + (63 64) + (60 62) + (58 59) + (48 57) + (46 47) + (45 45) + (44 44) + (43 43) + (42 42) + (41 41) + (40 40) + (37 39) + (36 36) + (33 35) + (32 32)) + (combining (125136 125142) (122918 122922) (122915 122916) @@ -71,67 +114,6 @@ (1160 1161) (1155 1159) (768 879)) - (halfwidth - (65517 65518) - (65513 65516) - (65512 65512) - (65498 65500) - (65490 65495) - (65482 65487) - (65474 65479) - (65440 65470) - (65438 65439) - (65393 65437) - (65392 65392) - (65382 65391) - (65380 65381) - (65379 65379) - (65378 65378) - (65377 65377) - (8361 8361)) - (narrow - (10630 10630) - (10629 10629) - (10221 10221) - (10220 10220) - (10219 10219) - (10218 10218) - (10217 10217) - (10216 10216) - (10215 10215) - (10214 10214) - (175 175) - (172 172) - (166 166) - (165 165) - (162 163) - (126 126) - (125 125) - (124 124) - (123 123) - (97 122) - (96 96) - (95 95) - (94 94) - (93 93) - (92 92) - (91 91) - (65 90) - (63 64) - (60 62) - (58 59) - (48 57) - (46 47) - (45 45) - (44 44) - (43 43) - (42 42) - (41 41) - (40 40) - (37 39) - (36 36) - (33 35) - (32 32)) (ambiguous (1048576 1114109) (983040 1048573) @@ -330,6 +312,24 @@ (167 167) (164 164) (161 161)) + (halfwidth + (65517 65518) + (65513 65516) + (65512 65512) + (65498 65500) + (65490 65495) + (65482 65487) + (65474 65479) + (65440 65470) + (65438 65439) + (65393 65437) + (65392 65392) + (65382 65391) + (65380 65381) + (65379 65379) + (65378 65378) + (65377 
65377) + (8361 8361)) (neutral (917536 917631) (917505 917505) diff --git a/uniseg/emoji.scm b/uniseg/charsets/emoji.scm similarity index 99% rename from uniseg/emoji.scm rename to uniseg/charsets/emoji.scm index f3c0396..f28f11b 100644 --- a/uniseg/emoji.scm +++ b/uniseg/charsets/emoji.scm @@ -1,7 +1,7 @@ ;; Code generated by generate-emoji. DO NOT EDIT (define-module - (uniseg emoji) + (uniseg charsets emoji) #:use-module (uniseg internal) #:use-module @@ -18,7 +18,269 @@ emoji-charsets)) (define hashtable (alist->hashq-table - '((emoji-modifier-base + '((emoji-presentation + (129744 129750) + (129728 129730) + (129712 129718) + (129686 129704) + (129680 129685) + (129667 129670) + (129664 129666) + (129656 129658) + (129652 129652) + (129648 129651) + (129511 129535) + (129488 129510) + (129485 129487) + (129483 129483) + (129475 129482) + (129473 129474) + (129472 129472) + (129466 129471) + (129456 129465) + (129454 129455) + (129451 129453) + (129445 129450) + (129443 129444) + (129432 129442) + (129426 129431) + (129413 129425) + (129408 129412) + (129404 129407) + (129403 129403) + (129402 129402) + (129399 129400) + (129395 129398) + (129394 129394) + (129393 129393) + (129388 129392) + (129375 129387) + (129360 129374) + (129357 129359) + (129356 129356) + (129351 129355) + (129344 129349) + (129343 129343) + (129340 129342) + (129331 129338) + (129329 129330) + (129328 129328) + (129320 129327) + (129312 129319) + (129311 129311) + (129305 129310) + (129296 129304) + (129293 129295) + (129292 129292) + (128992 129003) + (128763 128764) + (128762 128762) + (128761 128761) + (128759 128760) + (128756 128758) + (128747 128748) + (128726 128727) + (128725 128725) + (128721 128722) + (128720 128720) + (128716 128716) + (128705 128709) + (128704 128704) + (128703 128703) + (128697 128702) + (128695 128696) + (128694 128694) + (128691 128693) + (128690 128690) + (128686 128689) + (128679 128685) + (128678 128678) + (128676 128677) + (128675 128675) + (128674 
128674) + (128667 128673) + (128665 128666) + (128664 128664) + (128663 128663) + (128662 128662) + (128661 128661) + (128660 128660) + (128657 128659) + (128656 128656) + (128655 128655) + (128654 128654) + (128653 128653) + (128652 128652) + (128650 128651) + (128649 128649) + (128648 128648) + (128647 128647) + (128646 128646) + (128643 128645) + (128641 128642) + (128640 128640) + (128581 128591) + (128577 128580) + (128567 128576) + (128566 128566) + (128565 128565) + (128564 128564) + (128560 128563) + (128558 128559) + (128557 128557) + (128556 128556) + (128552 128555) + (128550 128551) + (128544 128549) + (128543 128543) + (128540 128542) + (128539 128539) + (128538 128538) + (128537 128537) + (128536 128536) + (128535 128535) + (128534 128534) + (128533 128533) + (128530 128532) + (128529 128529) + (128528 128528) + (128527 128527) + (128526 128526) + (128521 128525) + (128519 128520) + (128513 128518) + (128512 128512) + (128507 128511) + (128420 128420) + (128405 128406) + (128378 128378) + (128348 128359) + (128336 128347) + (128331 128334) + (128302 128317) + (128300 128301) + (128278 128299) + (128277 128277) + (128266 128276) + (128265 128265) + (128264 128264) + (128260 128263) + (128259 128259) + (128255 128258) + (128249 128252) + (128248 128248) + (128246 128247) + (128245 128245) + (128240 128244) + (128239 128239) + (128238 128238) + (128236 128237) + (128184 128235) + (128182 128183) + (128174 128181) + (128173 128173) + (128110 128172) + (128108 128109) + (128102 128107) + (128101 128101) + (128066 128100) + (128064 128064) + (128043 128062) + (128042 128042) + (128023 128041) + (128022 128022) + (128021 128021) + (128020 128020) + (128019 128019) + (128017 128018) + (128015 128016) + (128012 128014) + (128009 128011) + (128008 128008) + (127992 128007) + (127988 127988) + (127973 127984) + (127972 127972) + (127968 127971) + (127951 127955) + (127946 127946) + (127945 127945) + (127944 127944) + (127943 127943) + (127942 127942) + (127941 
127941) + (127904 127940) + (127872 127891) + (127870 127871) + (127868 127868) + (127825 127867) + (127824 127824) + (127820 127823) + (127819 127819) + (127799 127818) + (127796 127797) + (127794 127795) + (127792 127793) + (127789 127791) + (127775 127776) + (127773 127774) + (127772 127772) + (127771 127771) + (127770 127770) + (127769 127769) + (127766 127768) + (127763 127765) + (127762 127762) + (127761 127761) + (127760 127760) + (127759 127759) + (127757 127758) + (127744 127756) + (127568 127569) + (127544 127546) + (127538 127542) + (127535 127535) + (127514 127514) + (127489 127489) + (127462 127487) + (127377 127386) + (127374 127374) + (127183 127183) + (126980 126980) + (11093 11093) + (11088 11088) + (11035 11036) + (10175 10175) + (10160 10160) + (10133 10135) + (10071 10071) + (10067 10069) + (10062 10062) + (10060 10060) + (10024 10024) + (9994 9995) + (9989 9989) + (9981 9981) + (9978 9978) + (9973 9973) + (9970 9971) + (9962 9962) + (9940 9940) + (9934 9934) + (9924 9925) + (9917 9918) + (9898 9899) + (9889 9889) + (9875 9875) + (9855 9855) + (9800 9811) + (9748 9749) + (9725 9726) + (9203 9203) + (9200 9200) + (9193 9196) + (8986 8987)) + (emoji-modifier-base (129489 129501) (129485 129487) (129467 129467) @@ -449,7 +711,6 @@ (48 57) (42 42) (35 35)) - (emoji-modifier (127995 127999)) (emoji-extended-pictographic (130048 131069) (129751 129791) @@ -942,268 +1203,6 @@ (8252 8252) (174 174) (169 169)) - (emoji-presentation - (129744 129750) - (129728 129730) - (129712 129718) - (129686 129704) - (129680 129685) - (129667 129670) - (129664 129666) - (129656 129658) - (129652 129652) - (129648 129651) - (129511 129535) - (129488 129510) - (129485 129487) - (129483 129483) - (129475 129482) - (129473 129474) - (129472 129472) - (129466 129471) - (129456 129465) - (129454 129455) - (129451 129453) - (129445 129450) - (129443 129444) - (129432 129442) - (129426 129431) - (129413 129425) - (129408 129412) - (129404 129407) - (129403 129403) - (129402 
129402) - (129399 129400) - (129395 129398) - (129394 129394) - (129393 129393) - (129388 129392) - (129375 129387) - (129360 129374) - (129357 129359) - (129356 129356) - (129351 129355) - (129344 129349) - (129343 129343) - (129340 129342) - (129331 129338) - (129329 129330) - (129328 129328) - (129320 129327) - (129312 129319) - (129311 129311) - (129305 129310) - (129296 129304) - (129293 129295) - (129292 129292) - (128992 129003) - (128763 128764) - (128762 128762) - (128761 128761) - (128759 128760) - (128756 128758) - (128747 128748) - (128726 128727) - (128725 128725) - (128721 128722) - (128720 128720) - (128716 128716) - (128705 128709) - (128704 128704) - (128703 128703) - (128697 128702) - (128695 128696) - (128694 128694) - (128691 128693) - (128690 128690) - (128686 128689) - (128679 128685) - (128678 128678) - (128676 128677) - (128675 128675) - (128674 128674) - (128667 128673) - (128665 128666) - (128664 128664) - (128663 128663) - (128662 128662) - (128661 128661) - (128660 128660) - (128657 128659) - (128656 128656) - (128655 128655) - (128654 128654) - (128653 128653) - (128652 128652) - (128650 128651) - (128649 128649) - (128648 128648) - (128647 128647) - (128646 128646) - (128643 128645) - (128641 128642) - (128640 128640) - (128581 128591) - (128577 128580) - (128567 128576) - (128566 128566) - (128565 128565) - (128564 128564) - (128560 128563) - (128558 128559) - (128557 128557) - (128556 128556) - (128552 128555) - (128550 128551) - (128544 128549) - (128543 128543) - (128540 128542) - (128539 128539) - (128538 128538) - (128537 128537) - (128536 128536) - (128535 128535) - (128534 128534) - (128533 128533) - (128530 128532) - (128529 128529) - (128528 128528) - (128527 128527) - (128526 128526) - (128521 128525) - (128519 128520) - (128513 128518) - (128512 128512) - (128507 128511) - (128420 128420) - (128405 128406) - (128378 128378) - (128348 128359) - (128336 128347) - (128331 128334) - (128302 128317) - (128300 128301) - (128278 
128299) - (128277 128277) - (128266 128276) - (128265 128265) - (128264 128264) - (128260 128263) - (128259 128259) - (128255 128258) - (128249 128252) - (128248 128248) - (128246 128247) - (128245 128245) - (128240 128244) - (128239 128239) - (128238 128238) - (128236 128237) - (128184 128235) - (128182 128183) - (128174 128181) - (128173 128173) - (128110 128172) - (128108 128109) - (128102 128107) - (128101 128101) - (128066 128100) - (128064 128064) - (128043 128062) - (128042 128042) - (128023 128041) - (128022 128022) - (128021 128021) - (128020 128020) - (128019 128019) - (128017 128018) - (128015 128016) - (128012 128014) - (128009 128011) - (128008 128008) - (127992 128007) - (127988 127988) - (127973 127984) - (127972 127972) - (127968 127971) - (127951 127955) - (127946 127946) - (127945 127945) - (127944 127944) - (127943 127943) - (127942 127942) - (127941 127941) - (127904 127940) - (127872 127891) - (127870 127871) - (127868 127868) - (127825 127867) - (127824 127824) - (127820 127823) - (127819 127819) - (127799 127818) - (127796 127797) - (127794 127795) - (127792 127793) - (127789 127791) - (127775 127776) - (127773 127774) - (127772 127772) - (127771 127771) - (127770 127770) - (127769 127769) - (127766 127768) - (127763 127765) - (127762 127762) - (127761 127761) - (127760 127760) - (127759 127759) - (127757 127758) - (127744 127756) - (127568 127569) - (127544 127546) - (127538 127542) - (127535 127535) - (127514 127514) - (127489 127489) - (127462 127487) - (127377 127386) - (127374 127374) - (127183 127183) - (126980 126980) - (11093 11093) - (11088 11088) - (11035 11036) - (10175 10175) - (10160 10160) - (10133 10135) - (10071 10071) - (10067 10069) - (10062 10062) - (10060 10060) - (10024 10024) - (9994 9995) - (9989 9989) - (9981 9981) - (9978 9978) - (9973 9973) - (9970 9971) - (9962 9962) - (9940 9940) - (9934 9934) - (9924 9925) - (9917 9918) - (9898 9899) - (9889 9889) - (9875 9875) - (9855 9855) - (9800 9811) - (9748 9749) - (9725 
9726) - (9203 9203) - (9200 9200) - (9193 9196) - (8986 8987)) (emoji-component (917536 917631) (129456 129459) @@ -1214,7 +1213,8 @@ (8205 8205) (48 57) (42 42) - (35 35))))) + (35 35)) + (emoji-modifier (127995 127999))))) (define char-set:emoji (char-set)) (define char-set:emoji-presentation (char-set)) diff --git a/uniseg/charsets/graphemes.scm b/uniseg/charsets/graphemes.scm new file mode 100644 index 0000000..2ca9cfe --- /dev/null +++ b/uniseg/charsets/graphemes.scm @@ -0,0 +1,1484 @@ +;; Code generated by generate-graphemes. DO NOT EDIT + +(define-module + (uniseg charsets graphemes) + #:use-module + (ice-9 hash-table) + #:use-module + (srfi srfi-1) + #:use-module + (uniseg internal) + #:use-module + (uniseg charsets emoji) + #:export + (char-set:grapheme-hangul-syllable-l + char-set:grapheme-hangul-syllable-v + char-set:grapheme-hangul-syllable-lv + char-set:grapheme-hangul-syllable-lvt + char-set:grapheme-prepend + char-set:grapheme-carriage-return + char-set:grapheme-line-feed + char-set:grapheme-control + char-set:grapheme-extend + char-set:grapheme-regional-indicator + char-set:grapheme-spacing-mark + char-set:grapheme-zero-width-joiner + grapheme-charsets)) +(define hashtable + (alist->hashq-table + '((hangul-syllable-lv + (55176 55176) + (55148 55148) + (55120 55120) + (55092 55092) + (55064 55064) + (55036 55036) + (55008 55008) + (54980 54980) + (54952 54952) + (54924 54924) + (54896 54896) + (54868 54868) + (54840 54840) + (54812 54812) + (54784 54784) + (54756 54756) + (54728 54728) + (54700 54700) + (54672 54672) + (54644 54644) + (54616 54616) + (54588 54588) + (54560 54560) + (54532 54532) + (54504 54504) + (54476 54476) + (54448 54448) + (54420 54420) + (54392 54392) + (54364 54364) + (54336 54336) + (54308 54308) + (54280 54280) + (54252 54252) + (54224 54224) + (54196 54196) + (54168 54168) + (54140 54140) + (54112 54112) + (54084 54084) + (54056 54056) + (54028 54028) + (54000 54000) + (53972 53972) + (53944 53944) + (53916 53916) + (53888 
53888) + (53860 53860) + (53832 53832) + (53804 53804) + (53776 53776) + (53748 53748) + (53720 53720) + (53692 53692) + (53664 53664) + (53636 53636) + (53608 53608) + (53580 53580) + (53552 53552) + (53524 53524) + (53496 53496) + (53468 53468) + (53440 53440) + (53412 53412) + (53384 53384) + (53356 53356) + (53328 53328) + (53300 53300) + (53272 53272) + (53244 53244) + (53216 53216) + (53188 53188) + (53160 53160) + (53132 53132) + (53104 53104) + (53076 53076) + (53048 53048) + (53020 53020) + (52992 52992) + (52964 52964) + (52936 52936) + (52908 52908) + (52880 52880) + (52852 52852) + (52824 52824) + (52796 52796) + (52768 52768) + (52740 52740) + (52712 52712) + (52684 52684) + (52656 52656) + (52628 52628) + (52600 52600) + (52572 52572) + (52544 52544) + (52516 52516) + (52488 52488) + (52460 52460) + (52432 52432) + (52404 52404) + (52376 52376) + (52348 52348) + (52320 52320) + (52292 52292) + (52264 52264) + (52236 52236) + (52208 52208) + (52180 52180) + (52152 52152) + (52124 52124) + (52096 52096) + (52068 52068) + (52040 52040) + (52012 52012) + (51984 51984) + (51956 51956) + (51928 51928) + (51900 51900) + (51872 51872) + (51844 51844) + (51816 51816) + (51788 51788) + (51760 51760) + (51732 51732) + (51704 51704) + (51676 51676) + (51648 51648) + (51620 51620) + (51592 51592) + (51564 51564) + (51536 51536) + (51508 51508) + (51480 51480) + (51452 51452) + (51424 51424) + (51396 51396) + (51368 51368) + (51340 51340) + (51312 51312) + (51284 51284) + (51256 51256) + (51228 51228) + (51200 51200) + (51172 51172) + (51144 51144) + (51116 51116) + (51088 51088) + (51060 51060) + (51032 51032) + (51004 51004) + (50976 50976) + (50948 50948) + (50920 50920) + (50892 50892) + (50864 50864) + (50836 50836) + (50808 50808) + (50780 50780) + (50752 50752) + (50724 50724) + (50696 50696) + (50668 50668) + (50640 50640) + (50612 50612) + (50584 50584) + (50556 50556) + (50528 50528) + (50500 50500) + (50472 50472) + (50444 50444) + (50416 50416) + (50388 
50388) + (50360 50360) + (50332 50332) + (50304 50304) + (50276 50276) + (50248 50248) + (50220 50220) + (50192 50192) + (50164 50164) + (50136 50136) + (50108 50108) + (50080 50080) + (50052 50052) + (50024 50024) + (49996 49996) + (49968 49968) + (49940 49940) + (49912 49912) + (49884 49884) + (49856 49856) + (49828 49828) + (49800 49800) + (49772 49772) + (49744 49744) + (49716 49716) + (49688 49688) + (49660 49660) + (49632 49632) + (49604 49604) + (49576 49576) + (49548 49548) + (49520 49520) + (49492 49492) + (49464 49464) + (49436 49436) + (49408 49408) + (49380 49380) + (49352 49352) + (49324 49324) + (49296 49296) + (49268 49268) + (49240 49240) + (49212 49212) + (49184 49184) + (49156 49156) + (49128 49128) + (49100 49100) + (49072 49072) + (49044 49044) + (49016 49016) + (48988 48988) + (48960 48960) + (48932 48932) + (48904 48904) + (48876 48876) + (48848 48848) + (48820 48820) + (48792 48792) + (48764 48764) + (48736 48736) + (48708 48708) + (48680 48680) + (48652 48652) + (48624 48624) + (48596 48596) + (48568 48568) + (48540 48540) + (48512 48512) + (48484 48484) + (48456 48456) + (48428 48428) + (48400 48400) + (48372 48372) + (48344 48344) + (48316 48316) + (48288 48288) + (48260 48260) + (48232 48232) + (48204 48204) + (48176 48176) + (48148 48148) + (48120 48120) + (48092 48092) + (48064 48064) + (48036 48036) + (48008 48008) + (47980 47980) + (47952 47952) + (47924 47924) + (47896 47896) + (47868 47868) + (47840 47840) + (47812 47812) + (47784 47784) + (47756 47756) + (47728 47728) + (47700 47700) + (47672 47672) + (47644 47644) + (47616 47616) + (47588 47588) + (47560 47560) + (47532 47532) + (47504 47504) + (47476 47476) + (47448 47448) + (47420 47420) + (47392 47392) + (47364 47364) + (47336 47336) + (47308 47308) + (47280 47280) + (47252 47252) + (47224 47224) + (47196 47196) + (47168 47168) + (47140 47140) + (47112 47112) + (47084 47084) + (47056 47056) + (47028 47028) + (47000 47000) + (46972 46972) + (46944 46944) + (46916 46916) + (46888 
46888) + (46860 46860) + (46832 46832) + (46804 46804) + (46776 46776) + (46748 46748) + (46720 46720) + (46692 46692) + (46664 46664) + (46636 46636) + (46608 46608) + (46580 46580) + (46552 46552) + (46524 46524) + (46496 46496) + (46468 46468) + (46440 46440) + (46412 46412) + (46384 46384) + (46356 46356) + (46328 46328) + (46300 46300) + (46272 46272) + (46244 46244) + (46216 46216) + (46188 46188) + (46160 46160) + (46132 46132) + (46104 46104) + (46076 46076) + (46048 46048) + (46020 46020) + (45992 45992) + (45964 45964) + (45936 45936) + (45908 45908) + (45880 45880) + (45852 45852) + (45824 45824) + (45796 45796) + (45768 45768) + (45740 45740) + (45712 45712) + (45684 45684) + (45656 45656) + (45628 45628) + (45600 45600) + (45572 45572) + (45544 45544) + (45516 45516) + (45488 45488) + (45460 45460) + (45432 45432) + (45404 45404) + (45376 45376) + (45348 45348) + (45320 45320) + (45292 45292) + (45264 45264) + (45236 45236) + (45208 45208) + (45180 45180) + (45152 45152) + (45124 45124) + (45096 45096) + (45068 45068) + (45040 45040) + (45012 45012) + (44984 44984) + (44956 44956) + (44928 44928) + (44900 44900) + (44872 44872) + (44844 44844) + (44816 44816) + (44788 44788) + (44760 44760) + (44732 44732) + (44704 44704) + (44676 44676) + (44648 44648) + (44620 44620) + (44592 44592) + (44564 44564) + (44536 44536) + (44508 44508) + (44480 44480) + (44452 44452) + (44424 44424) + (44396 44396) + (44368 44368) + (44340 44340) + (44312 44312) + (44284 44284) + (44256 44256) + (44228 44228) + (44200 44200) + (44172 44172) + (44144 44144) + (44116 44116) + (44088 44088) + (44060 44060) + (44032 44032)) + (hangul-syllable-lvt + (55177 55203) + (55149 55175) + (55121 55147) + (55093 55119) + (55065 55091) + (55037 55063) + (55009 55035) + (54981 55007) + (54953 54979) + (54925 54951) + (54897 54923) + (54869 54895) + (54841 54867) + (54813 54839) + (54785 54811) + (54757 54783) + (54729 54755) + (54701 54727) + (54673 54699) + (54645 54671) + (54617 54643) 
+ (54589 54615) + (54561 54587) + (54533 54559) + (54505 54531) + (54477 54503) + (54449 54475) + (54421 54447) + (54393 54419) + (54365 54391) + (54337 54363) + (54309 54335) + (54281 54307) + (54253 54279) + (54225 54251) + (54197 54223) + (54169 54195) + (54141 54167) + (54113 54139) + (54085 54111) + (54057 54083) + (54029 54055) + (54001 54027) + (53973 53999) + (53945 53971) + (53917 53943) + (53889 53915) + (53861 53887) + (53833 53859) + (53805 53831) + (53777 53803) + (53749 53775) + (53721 53747) + (53693 53719) + (53665 53691) + (53637 53663) + (53609 53635) + (53581 53607) + (53553 53579) + (53525 53551) + (53497 53523) + (53469 53495) + (53441 53467) + (53413 53439) + (53385 53411) + (53357 53383) + (53329 53355) + (53301 53327) + (53273 53299) + (53245 53271) + (53217 53243) + (53189 53215) + (53161 53187) + (53133 53159) + (53105 53131) + (53077 53103) + (53049 53075) + (53021 53047) + (52993 53019) + (52965 52991) + (52937 52963) + (52909 52935) + (52881 52907) + (52853 52879) + (52825 52851) + (52797 52823) + (52769 52795) + (52741 52767) + (52713 52739) + (52685 52711) + (52657 52683) + (52629 52655) + (52601 52627) + (52573 52599) + (52545 52571) + (52517 52543) + (52489 52515) + (52461 52487) + (52433 52459) + (52405 52431) + (52377 52403) + (52349 52375) + (52321 52347) + (52293 52319) + (52265 52291) + (52237 52263) + (52209 52235) + (52181 52207) + (52153 52179) + (52125 52151) + (52097 52123) + (52069 52095) + (52041 52067) + (52013 52039) + (51985 52011) + (51957 51983) + (51929 51955) + (51901 51927) + (51873 51899) + (51845 51871) + (51817 51843) + (51789 51815) + (51761 51787) + (51733 51759) + (51705 51731) + (51677 51703) + (51649 51675) + (51621 51647) + (51593 51619) + (51565 51591) + (51537 51563) + (51509 51535) + (51481 51507) + (51453 51479) + (51425 51451) + (51397 51423) + (51369 51395) + (51341 51367) + (51313 51339) + (51285 51311) + (51257 51283) + (51229 51255) + (51201 51227) + (51173 51199) + (51145 51171) + (51117 51143) 
+ (51089 51115) + (51061 51087) + (51033 51059) + (51005 51031) + (50977 51003) + (50949 50975) + (50921 50947) + (50893 50919) + (50865 50891) + (50837 50863) + (50809 50835) + (50781 50807) + (50753 50779) + (50725 50751) + (50697 50723) + (50669 50695) + (50641 50667) + (50613 50639) + (50585 50611) + (50557 50583) + (50529 50555) + (50501 50527) + (50473 50499) + (50445 50471) + (50417 50443) + (50389 50415) + (50361 50387) + (50333 50359) + (50305 50331) + (50277 50303) + (50249 50275) + (50221 50247) + (50193 50219) + (50165 50191) + (50137 50163) + (50109 50135) + (50081 50107) + (50053 50079) + (50025 50051) + (49997 50023) + (49969 49995) + (49941 49967) + (49913 49939) + (49885 49911) + (49857 49883) + (49829 49855) + (49801 49827) + (49773 49799) + (49745 49771) + (49717 49743) + (49689 49715) + (49661 49687) + (49633 49659) + (49605 49631) + (49577 49603) + (49549 49575) + (49521 49547) + (49493 49519) + (49465 49491) + (49437 49463) + (49409 49435) + (49381 49407) + (49353 49379) + (49325 49351) + (49297 49323) + (49269 49295) + (49241 49267) + (49213 49239) + (49185 49211) + (49157 49183) + (49129 49155) + (49101 49127) + (49073 49099) + (49045 49071) + (49017 49043) + (48989 49015) + (48961 48987) + (48933 48959) + (48905 48931) + (48877 48903) + (48849 48875) + (48821 48847) + (48793 48819) + (48765 48791) + (48737 48763) + (48709 48735) + (48681 48707) + (48653 48679) + (48625 48651) + (48597 48623) + (48569 48595) + (48541 48567) + (48513 48539) + (48485 48511) + (48457 48483) + (48429 48455) + (48401 48427) + (48373 48399) + (48345 48371) + (48317 48343) + (48289 48315) + (48261 48287) + (48233 48259) + (48205 48231) + (48177 48203) + (48149 48175) + (48121 48147) + (48093 48119) + (48065 48091) + (48037 48063) + (48009 48035) + (47981 48007) + (47953 47979) + (47925 47951) + (47897 47923) + (47869 47895) + (47841 47867) + (47813 47839) + (47785 47811) + (47757 47783) + (47729 47755) + (47701 47727) + (47673 47699) + (47645 47671) + (47617 47643) 
+ (47589 47615) + (47561 47587) + (47533 47559) + (47505 47531) + (47477 47503) + (47449 47475) + (47421 47447) + (47393 47419) + (47365 47391) + (47337 47363) + (47309 47335) + (47281 47307) + (47253 47279) + (47225 47251) + (47197 47223) + (47169 47195) + (47141 47167) + (47113 47139) + (47085 47111) + (47057 47083) + (47029 47055) + (47001 47027) + (46973 46999) + (46945 46971) + (46917 46943) + (46889 46915) + (46861 46887) + (46833 46859) + (46805 46831) + (46777 46803) + (46749 46775) + (46721 46747) + (46693 46719) + (46665 46691) + (46637 46663) + (46609 46635) + (46581 46607) + (46553 46579) + (46525 46551) + (46497 46523) + (46469 46495) + (46441 46467) + (46413 46439) + (46385 46411) + (46357 46383) + (46329 46355) + (46301 46327) + (46273 46299) + (46245 46271) + (46217 46243) + (46189 46215) + (46161 46187) + (46133 46159) + (46105 46131) + (46077 46103) + (46049 46075) + (46021 46047) + (45993 46019) + (45965 45991) + (45937 45963) + (45909 45935) + (45881 45907) + (45853 45879) + (45825 45851) + (45797 45823) + (45769 45795) + (45741 45767) + (45713 45739) + (45685 45711) + (45657 45683) + (45629 45655) + (45601 45627) + (45573 45599) + (45545 45571) + (45517 45543) + (45489 45515) + (45461 45487) + (45433 45459) + (45405 45431) + (45377 45403) + (45349 45375) + (45321 45347) + (45293 45319) + (45265 45291) + (45237 45263) + (45209 45235) + (45181 45207) + (45153 45179) + (45125 45151) + (45097 45123) + (45069 45095) + (45041 45067) + (45013 45039) + (44985 45011) + (44957 44983) + (44929 44955) + (44901 44927) + (44873 44899) + (44845 44871) + (44817 44843) + (44789 44815) + (44761 44787) + (44733 44759) + (44705 44731) + (44677 44703) + (44649 44675) + (44621 44647) + (44593 44619) + (44565 44591) + (44537 44563) + (44509 44535) + (44481 44507) + (44453 44479) + (44425 44451) + (44397 44423) + (44369 44395) + (44341 44367) + (44313 44339) + (44285 44311) + (44257 44283) + (44229 44255) + (44201 44227) + (44173 44199) + (44145 44171) + (44117 44143) 
+ (44089 44115) + (44061 44087) + (44033 44059)) + (zero-width-joiner (8205 8205)) + (line-feed (10 10)) + (hangul-syllable-l (43360 43388) (4352 4447)) + (extend + (917760 917999) + (917536 917631) + (127995 127999) + (125252 125258) + (125136 125142) + (124140 124143) + (123628 123631) + (123566 123566) + (123184 123190) + (123023 123023) + (122918 122922) + (122915 122916) + (122907 122913) + (122888 122904) + (122880 122886) + (121505 121519) + (121499 121503) + (121476 121476) + (121461 121461) + (121403 121452) + (121344 121398) + (119362 119364) + (119210 119213) + (119173 119179) + (119163 119170) + (119150 119154) + (119143 119145) + (119141 119141) + (118576 118598) + (118528 118573) + (113821 113822) + (94180 94180) + (94095 94098) + (94031 94031) + (92976 92982) + (92912 92916) + (78919 78933) + (78912 78912) + (73538 73538) + (73536 73536) + (73526 73530) + (73472 73473) + (73459 73460) + (73111 73111) + (73109 73109) + (73104 73105) + (73031 73031) + (73023 73029) + (73020 73021) + (73018 73018) + (73009 73014) + (72885 72886) + (72882 72883) + (72874 72880) + (72850 72871) + (72767 72767) + (72760 72765) + (72752 72758) + (72344 72345) + (72330 72342) + (72281 72283) + (72273 72278) + (72263 72263) + (72251 72254) + (72243 72248) + (72193 72202) + (72160 72160) + (72154 72155) + (72148 72151) + (72003 72003) + (71998 71998) + (71995 71996) + (71984 71984) + (71737 71738) + (71727 71735) + (71463 71467) + (71458 71461) + (71453 71455) + (71351 71351) + (71344 71349) + (71341 71341) + (71339 71339) + (71231 71232) + (71229 71229) + (71219 71226) + (71132 71133) + (71103 71104) + (71100 71101) + (71090 71093) + (71087 71087) + (70850 70851) + (70847 70848) + (70845 70845) + (70842 70842) + (70835 70840) + (70832 70832) + (70750 70750) + (70726 70726) + (70722 70724) + (70712 70719) + (70512 70516) + (70502 70508) + (70487 70487) + (70464 70464) + (70462 70462) + (70459 70460) + (70400 70401) + (70371 70378) + (70367 70367) + (70209 70209) + (70206 
70206) + (70198 70199) + (70196 70196) + (70191 70193) + (70095 70095) + (70089 70092) + (70070 70078) + (70016 70017) + (70003 70003) + (69933 69940) + (69927 69931) + (69888 69890) + (69826 69826) + (69817 69818) + (69811 69814) + (69759 69761) + (69747 69748) + (69744 69744) + (69688 69702) + (69633 69633) + (69506 69509) + (69446 69456) + (69373 69375) + (69291 69292) + (68900 68903) + (68325 68326) + (68159 68159) + (68152 68154) + (68108 68111) + (68101 68102) + (68097 68099) + (66422 66426) + (66272 66272) + (66045 66045) + (65438 65439) + (65056 65071) + (65024 65039) + (64286 64286) + (44013 44013) + (44008 44008) + (44005 44005) + (43766 43766) + (43756 43757) + (43713 43713) + (43710 43711) + (43703 43704) + (43698 43700) + (43696 43696) + (43644 43644) + (43596 43596) + (43587 43587) + (43573 43574) + (43569 43570) + (43561 43566) + (43493 43493) + (43452 43453) + (43446 43449) + (43443 43443) + (43392 43394) + (43335 43345) + (43302 43309) + (43263 43263) + (43232 43249) + (43204 43205) + (43052 43052) + (43045 43046) + (43019 43019) + (43014 43014) + (43010 43010) + (42736 42737) + (42654 42655) + (42612 42621) + (42608 42610) + (42607 42607) + (12441 12442) + (12334 12335) + (12330 12333) + (11744 11775) + (11647 11647) + (11503 11505) + (8421 8432) + (8418 8420) + (8417 8417) + (8413 8416) + (8400 8412) + (8204 8204) + (7616 7679) + (7416 7417) + (7412 7412) + (7405 7405) + (7394 7400) + (7380 7392) + (7376 7378) + (7222 7223) + (7212 7219) + (7151 7153) + (7149 7149) + (7144 7145) + (7142 7142) + (7083 7085) + (7080 7081) + (7074 7077) + (7040 7041) + (7019 7027) + (6978 6978) + (6972 6972) + (6966 6970) + (6965 6965) + (6964 6964) + (6912 6915) + (6847 6862) + (6846 6846) + (6832 6845) + (6783 6783) + (6771 6780) + (6757 6764) + (6754 6754) + (6752 6752) + (6744 6750) + (6742 6742) + (6683 6683) + (6679 6680) + (6457 6459) + (6450 6450) + (6439 6440) + (6432 6434) + (6313 6313) + (6277 6278) + (6159 6159) + (6155 6157) + (6109 6109) + (6089 6099) 
+ (6086 6086) + (6071 6077) + (6068 6069) + (6002 6003) + (5970 5971) + (5938 5939) + (5906 5908) + (4957 4959) + (4253 4253) + (4237 4237) + (4229 4230) + (4226 4226) + (4209 4212) + (4190 4192) + (4184 4185) + (4157 4158) + (4153 4154) + (4146 4151) + (4141 4144) + (4038 4038) + (3993 4028) + (3981 3991) + (3974 3975) + (3968 3972) + (3953 3966) + (3897 3897) + (3895 3895) + (3893 3893) + (3864 3865) + (3784 3790) + (3764 3772) + (3761 3761) + (3655 3662) + (3636 3642) + (3633 3633) + (3551 3551) + (3542 3542) + (3538 3540) + (3535 3535) + (3530 3530) + (3457 3457) + (3426 3427) + (3415 3415) + (3405 3405) + (3393 3396) + (3390 3390) + (3387 3388) + (3328 3329) + (3298 3299) + (3285 3286) + (3276 3277) + (3270 3270) + (3266 3266) + (3263 3263) + (3260 3260) + (3201 3201) + (3170 3171) + (3157 3158) + (3146 3149) + (3142 3144) + (3134 3136) + (3132 3132) + (3076 3076) + (3072 3072) + (3031 3031) + (3021 3021) + (3008 3008) + (3006 3006) + (2946 2946) + (2914 2915) + (2903 2903) + (2901 2902) + (2893 2893) + (2881 2884) + (2879 2879) + (2878 2878) + (2876 2876) + (2817 2817) + (2810 2815) + (2786 2787) + (2765 2765) + (2759 2760) + (2753 2757) + (2748 2748) + (2689 2690) + (2677 2677) + (2672 2673) + (2641 2641) + (2635 2637) + (2631 2632) + (2625 2626) + (2620 2620) + (2561 2562) + (2558 2558) + (2530 2531) + (2519 2519) + (2509 2509) + (2497 2500) + (2494 2494) + (2492 2492) + (2433 2433) + (2402 2403) + (2385 2391) + (2381 2381) + (2369 2376) + (2364 2364) + (2362 2362) + (2275 2306) + (2250 2273) + (2200 2207) + (2137 2139) + (2089 2093) + (2085 2087) + (2075 2083) + (2070 2073) + (2045 2045) + (2027 2035) + (1958 1968) + (1840 1866) + (1809 1809) + (1770 1773) + (1767 1768) + (1759 1764) + (1750 1756) + (1648 1648) + (1611 1631) + (1552 1562) + (1479 1479) + (1476 1477) + (1473 1474) + (1471 1471) + (1425 1469) + (1160 1161) + (1155 1159) + (768 879)) + (control + (918000 921599) + (917632 917759) + (917506 917535) + (917505 917505) + (917504 917504) + (119155 
119162) + (113824 113827) + (78896 78911) + (65529 65531) + (65520 65528) + (65279 65279) + (8294 8303) + (8293 8293) + (8288 8292) + (8234 8238) + (8233 8233) + (8232 8232) + (8206 8207) + (8203 8203) + (6158 6158) + (1564 1564) + (173 173) + (127 159) + (14 31) + (11 12) + (0 9)) + (regional-indicator (127462 127487)) + (carriage-return (13 13)) + (hangul-syllable-v (55216 55238) (4448 4519)) + (prepend + (73474 73474) + (73030 73030) + (72324 72329) + (72250 72250) + (72001 72001) + (71999 71999) + (70082 70083) + (69837 69837) + (69821 69821) + (3406 3406) + (2274 2274) + (2192 2193) + (1807 1807) + (1757 1757) + (1536 1541)) + (hangul-syllable-t (55243 55291) (4520 4607)) + (spacing-mark + (119149 119149) + (119142 119142) + (94192 94193) + (94033 94087) + (73537 73537) + (73534 73535) + (73524 73525) + (73475 73475) + (73461 73462) + (73110 73110) + (73107 73108) + (73098 73102) + (72884 72884) + (72881 72881) + (72873 72873) + (72766 72766) + (72751 72751) + (72343 72343) + (72279 72280) + (72249 72249) + (72164 72164) + (72156 72159) + (72145 72147) + (72002 72002) + (72000 72000) + (71997 71997) + (71991 71992) + (71985 71989) + (71736 71736) + (71724 71726) + (71462 71462) + (71350 71350) + (71342 71343) + (71340 71340) + (71230 71230) + (71227 71228) + (71216 71218) + (71102 71102) + (71096 71099) + (71088 71089) + (70849 70849) + (70846 70846) + (70843 70844) + (70841 70841) + (70833 70834) + (70725 70725) + (70720 70721) + (70709 70711) + (70498 70499) + (70475 70477) + (70471 70472) + (70465 70468) + (70463 70463) + (70402 70403) + (70368 70370) + (70197 70197) + (70194 70195) + (70188 70190) + (70094 70094) + (70079 70080) + (70067 70069) + (70018 70018) + (69957 69958) + (69932 69932) + (69815 69816) + (69808 69810) + (69762 69762) + (69634 69634) + (69632 69632) + (44012 44012) + (44009 44010) + (44006 44007) + (44003 44004) + (43765 43765) + (43758 43759) + (43755 43755) + (43597 43597) + (43571 43572) + (43567 43568) + (43454 43456) + (43450 
43451) + (43444 43445) + (43395 43395) + (43346 43347) + (43188 43203) + (43136 43137) + (43047 43047) + (43043 43044) + (7415 7415) + (7393 7393) + (7220 7221) + (7204 7211) + (7154 7155) + (7150 7150) + (7146 7148) + (7143 7143) + (7082 7082) + (7078 7079) + (7073 7073) + (7042 7042) + (6979 6980) + (6973 6977) + (6971 6971) + (6916 6916) + (6765 6770) + (6743 6743) + (6741 6741) + (6681 6682) + (6451 6456) + (6448 6449) + (6441 6443) + (6435 6438) + (6087 6088) + (6078 6085) + (6070 6070) + (5940 5940) + (5909 5909) + (4228 4228) + (4182 4183) + (4155 4156) + (4145 4145) + (3967 3967) + (3902 3903) + (3763 3763) + (3635 3635) + (3570 3571) + (3544 3550) + (3536 3537) + (3458 3459) + (3402 3404) + (3398 3400) + (3391 3392) + (3330 3331) + (3315 3315) + (3274 3275) + (3271 3272) + (3267 3268) + (3264 3265) + (3262 3262) + (3202 3203) + (3137 3140) + (3073 3075) + (3018 3020) + (3014 3016) + (3009 3010) + (3007 3007) + (2891 2892) + (2887 2888) + (2880 2880) + (2818 2819) + (2763 2764) + (2761 2761) + (2750 2752) + (2691 2691) + (2622 2624) + (2563 2563) + (2507 2508) + (2503 2504) + (2495 2496) + (2434 2435) + (2382 2383) + (2377 2380) + (2366 2368) + (2363 2363) + (2307 2307))))) + +(define char-set:grapheme-hangul-syllable-l (char-set)) +(define char-set:grapheme-hangul-syllable-v (char-set)) +(define char-set:grapheme-hangul-syllable-lv (char-set)) +(define char-set:grapheme-hangul-syllable-lvt (char-set)) +(define char-set:grapheme-prepend (char-set)) +(define char-set:grapheme-carriage-return (char-set)) +(define char-set:grapheme-line-feed (char-set)) +(define char-set:grapheme-control (char-set)) +(define char-set:grapheme-extend (char-set)) +(define char-set:grapheme-regional-indicator (char-set)) +(define char-set:grapheme-spacing-mark (char-set)) +(define char-set:grapheme-zero-width-joiner (char-set)) + +(define grapheme-charsets + (list (list 'hangul-syllable-l char-set:grapheme-hangul-syllable-l) + (list 'hangul-syllable-v 
char-set:grapheme-hangul-syllable-v) + (list 'hangul-syllable-lv char-set:grapheme-hangul-syllable-lv) + (list 'hangul-syllable-lvt char-set:grapheme-hangul-syllable-lvt) + (list 'prepend char-set:grapheme-prepend) + (list 'carriage-return char-set:grapheme-carriage-return) + (list 'line-feed char-set:grapheme-line-feed) + (list 'control char-set:grapheme-control) + (list 'extend char-set:grapheme-extend) + (list 'regional-indicator char-set:grapheme-regional-indicator) + (list 'spacing-mark char-set:grapheme-spacing-mark) + (list 'zero-width-joiner char-set:grapheme-zero-width-joiner))) + +(ranges->charset! + hashtable + 'hangul-syllable-l + char-set:grapheme-hangul-syllable-l) +(ranges->charset! + hashtable + 'hangul-syllable-v + char-set:grapheme-hangul-syllable-v) +(ranges->charset! + hashtable + 'hangul-syllable-lv + char-set:grapheme-hangul-syllable-lv) +(ranges->charset! + hashtable + 'hangul-syllable-lvt + char-set:grapheme-hangul-syllable-lvt) +(ranges->charset! hashtable 'prepend char-set:grapheme-prepend) +(ranges->charset! hashtable 'carriage-return char-set:grapheme-carriage-return) +(ranges->charset! hashtable 'line-feed char-set:grapheme-line-feed) +(ranges->charset! hashtable 'control char-set:grapheme-control) +(ranges->charset! hashtable 'extend char-set:grapheme-extend) +(ranges->charset! + hashtable + 'regional-indicator + char-set:grapheme-regional-indicator) +(ranges->charset! hashtable 'spacing-mark char-set:grapheme-spacing-mark) +(ranges->charset! + hashtable + 'zero-width-joiner + char-set:grapheme-zero-width-joiner) + +(set! grapheme-charsets + (cons (list 'extended-pictographic char-set:emoji-extended-pictographic) + grapheme-charsets)) diff --git a/uniseg/graphemes.scm b/uniseg/graphemes.scm index 4b79530..99e8623 100644 --- a/uniseg/graphemes.scm +++ b/uniseg/graphemes.scm @@ -1,1484 +1,42 @@ -;; Code generated by generate-graphemes. 
DO NOT EDIT +(define-module (uniseg graphemes) + #:use-module (ice-9 hash-table) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-9 gnu) + #:export (make-grapheme + grapheme? + grapheme-width + grapheme-delta-width + grapheme-modification? + grapheme-glyphs + grapheme-glyphs-reverse + grapheme-state + grapheme-string)) -(define-module - (uniseg graphemes) - #:use-module - (ice-9 hash-table) - #:use-module - (srfi srfi-1) - #:use-module - (uniseg internal) - #:use-module - (uniseg emoji) - #:export - (char-set:grapheme-hangul-syllable-l - char-set:grapheme-hangul-syllable-v - char-set:grapheme-hangul-syllable-lv - char-set:grapheme-hangul-syllable-lvt - char-set:grapheme-prepend - char-set:grapheme-carriage-return - char-set:grapheme-line-feed - char-set:grapheme-control - char-set:grapheme-extend - char-set:grapheme-regional-indicator - char-set:grapheme-spacing-mark - char-set:grapheme-zero-width-joiner - grapheme-charsets)) -(define hashtable - (alist->hashq-table - '((carriage-return (13 13)) - (prepend - (73474 73474) - (73030 73030) - (72324 72329) - (72250 72250) - (72001 72001) - (71999 71999) - (70082 70083) - (69837 69837) - (69821 69821) - (3406 3406) - (2274 2274) - (2192 2193) - (1807 1807) - (1757 1757) - (1536 1541)) - (control - (918000 921599) - (917632 917759) - (917506 917535) - (917505 917505) - (917504 917504) - (119155 119162) - (113824 113827) - (78896 78911) - (65529 65531) - (65520 65528) - (65279 65279) - (8294 8303) - (8293 8293) - (8288 8292) - (8234 8238) - (8233 8233) - (8232 8232) - (8206 8207) - (8203 8203) - (6158 6158) - (1564 1564) - (173 173) - (127 159) - (14 31) - (11 12) - (0 9)) - (line-feed (10 10)) - (regional-indicator (127462 127487)) - (hangul-syllable-lv - (55176 55176) - (55148 55148) - (55120 55120) - (55092 55092) - (55064 55064) - (55036 55036) - (55008 55008) - (54980 54980) - (54952 54952) - (54924 54924) - (54896 54896) - (54868 54868) - (54840 54840) - (54812 54812) - (54784 54784) - (54756 54756) - (54728 
54728) - (54700 54700) - (54672 54672) - (54644 54644) - (54616 54616) - (54588 54588) - (54560 54560) - (54532 54532) - (54504 54504) - (54476 54476) - (54448 54448) - (54420 54420) - (54392 54392) - (54364 54364) - (54336 54336) - (54308 54308) - (54280 54280) - (54252 54252) - (54224 54224) - (54196 54196) - (54168 54168) - (54140 54140) - (54112 54112) - (54084 54084) - (54056 54056) - (54028 54028) - (54000 54000) - (53972 53972) - (53944 53944) - (53916 53916) - (53888 53888) - (53860 53860) - (53832 53832) - (53804 53804) - (53776 53776) - (53748 53748) - (53720 53720) - (53692 53692) - (53664 53664) - (53636 53636) - (53608 53608) - (53580 53580) - (53552 53552) - (53524 53524) - (53496 53496) - (53468 53468) - (53440 53440) - (53412 53412) - (53384 53384) - (53356 53356) - (53328 53328) - (53300 53300) - (53272 53272) - (53244 53244) - (53216 53216) - (53188 53188) - (53160 53160) - (53132 53132) - (53104 53104) - (53076 53076) - (53048 53048) - (53020 53020) - (52992 52992) - (52964 52964) - (52936 52936) - (52908 52908) - (52880 52880) - (52852 52852) - (52824 52824) - (52796 52796) - (52768 52768) - (52740 52740) - (52712 52712) - (52684 52684) - (52656 52656) - (52628 52628) - (52600 52600) - (52572 52572) - (52544 52544) - (52516 52516) - (52488 52488) - (52460 52460) - (52432 52432) - (52404 52404) - (52376 52376) - (52348 52348) - (52320 52320) - (52292 52292) - (52264 52264) - (52236 52236) - (52208 52208) - (52180 52180) - (52152 52152) - (52124 52124) - (52096 52096) - (52068 52068) - (52040 52040) - (52012 52012) - (51984 51984) - (51956 51956) - (51928 51928) - (51900 51900) - (51872 51872) - (51844 51844) - (51816 51816) - (51788 51788) - (51760 51760) - (51732 51732) - (51704 51704) - (51676 51676) - (51648 51648) - (51620 51620) - (51592 51592) - (51564 51564) - (51536 51536) - (51508 51508) - (51480 51480) - (51452 51452) - (51424 51424) - (51396 51396) - (51368 51368) - (51340 51340) - (51312 51312) - (51284 51284) - (51256 51256) - (51228 
51228) - (51200 51200) - (51172 51172) - (51144 51144) - (51116 51116) - (51088 51088) - (51060 51060) - (51032 51032) - (51004 51004) - (50976 50976) - (50948 50948) - (50920 50920) - (50892 50892) - (50864 50864) - (50836 50836) - (50808 50808) - (50780 50780) - (50752 50752) - (50724 50724) - (50696 50696) - (50668 50668) - (50640 50640) - (50612 50612) - (50584 50584) - (50556 50556) - (50528 50528) - (50500 50500) - (50472 50472) - (50444 50444) - (50416 50416) - (50388 50388) - (50360 50360) - (50332 50332) - (50304 50304) - (50276 50276) - (50248 50248) - (50220 50220) - (50192 50192) - (50164 50164) - (50136 50136) - (50108 50108) - (50080 50080) - (50052 50052) - (50024 50024) - (49996 49996) - (49968 49968) - (49940 49940) - (49912 49912) - (49884 49884) - (49856 49856) - (49828 49828) - (49800 49800) - (49772 49772) - (49744 49744) - (49716 49716) - (49688 49688) - (49660 49660) - (49632 49632) - (49604 49604) - (49576 49576) - (49548 49548) - (49520 49520) - (49492 49492) - (49464 49464) - (49436 49436) - (49408 49408) - (49380 49380) - (49352 49352) - (49324 49324) - (49296 49296) - (49268 49268) - (49240 49240) - (49212 49212) - (49184 49184) - (49156 49156) - (49128 49128) - (49100 49100) - (49072 49072) - (49044 49044) - (49016 49016) - (48988 48988) - (48960 48960) - (48932 48932) - (48904 48904) - (48876 48876) - (48848 48848) - (48820 48820) - (48792 48792) - (48764 48764) - (48736 48736) - (48708 48708) - (48680 48680) - (48652 48652) - (48624 48624) - (48596 48596) - (48568 48568) - (48540 48540) - (48512 48512) - (48484 48484) - (48456 48456) - (48428 48428) - (48400 48400) - (48372 48372) - (48344 48344) - (48316 48316) - (48288 48288) - (48260 48260) - (48232 48232) - (48204 48204) - (48176 48176) - (48148 48148) - (48120 48120) - (48092 48092) - (48064 48064) - (48036 48036) - (48008 48008) - (47980 47980) - (47952 47952) - (47924 47924) - (47896 47896) - (47868 47868) - (47840 47840) - (47812 47812) - (47784 47784) - (47756 47756) - (47728 
47728) - (47700 47700) - (47672 47672) - (47644 47644) - (47616 47616) - (47588 47588) - (47560 47560) - (47532 47532) - (47504 47504) - (47476 47476) - (47448 47448) - (47420 47420) - (47392 47392) - (47364 47364) - (47336 47336) - (47308 47308) - (47280 47280) - (47252 47252) - (47224 47224) - (47196 47196) - (47168 47168) - (47140 47140) - (47112 47112) - (47084 47084) - (47056 47056) - (47028 47028) - (47000 47000) - (46972 46972) - (46944 46944) - (46916 46916) - (46888 46888) - (46860 46860) - (46832 46832) - (46804 46804) - (46776 46776) - (46748 46748) - (46720 46720) - (46692 46692) - (46664 46664) - (46636 46636) - (46608 46608) - (46580 46580) - (46552 46552) - (46524 46524) - (46496 46496) - (46468 46468) - (46440 46440) - (46412 46412) - (46384 46384) - (46356 46356) - (46328 46328) - (46300 46300) - (46272 46272) - (46244 46244) - (46216 46216) - (46188 46188) - (46160 46160) - (46132 46132) - (46104 46104) - (46076 46076) - (46048 46048) - (46020 46020) - (45992 45992) - (45964 45964) - (45936 45936) - (45908 45908) - (45880 45880) - (45852 45852) - (45824 45824) - (45796 45796) - (45768 45768) - (45740 45740) - (45712 45712) - (45684 45684) - (45656 45656) - (45628 45628) - (45600 45600) - (45572 45572) - (45544 45544) - (45516 45516) - (45488 45488) - (45460 45460) - (45432 45432) - (45404 45404) - (45376 45376) - (45348 45348) - (45320 45320) - (45292 45292) - (45264 45264) - (45236 45236) - (45208 45208) - (45180 45180) - (45152 45152) - (45124 45124) - (45096 45096) - (45068 45068) - (45040 45040) - (45012 45012) - (44984 44984) - (44956 44956) - (44928 44928) - (44900 44900) - (44872 44872) - (44844 44844) - (44816 44816) - (44788 44788) - (44760 44760) - (44732 44732) - (44704 44704) - (44676 44676) - (44648 44648) - (44620 44620) - (44592 44592) - (44564 44564) - (44536 44536) - (44508 44508) - (44480 44480) - (44452 44452) - (44424 44424) - (44396 44396) - (44368 44368) - (44340 44340) - (44312 44312) - (44284 44284) - (44256 44256) - (44228 
44228) - (44200 44200) - (44172 44172) - (44144 44144) - (44116 44116) - (44088 44088) - (44060 44060) - (44032 44032)) - (hangul-syllable-t (55243 55291) (4520 4607)) - (hangul-syllable-v (55216 55238) (4448 4519)) - (hangul-syllable-l (43360 43388) (4352 4447)) - (extend - (917760 917999) - (917536 917631) - (127995 127999) - (125252 125258) - (125136 125142) - (124140 124143) - (123628 123631) - (123566 123566) - (123184 123190) - (123023 123023) - (122918 122922) - (122915 122916) - (122907 122913) - (122888 122904) - (122880 122886) - (121505 121519) - (121499 121503) - (121476 121476) - (121461 121461) - (121403 121452) - (121344 121398) - (119362 119364) - (119210 119213) - (119173 119179) - (119163 119170) - (119150 119154) - (119143 119145) - (119141 119141) - (118576 118598) - (118528 118573) - (113821 113822) - (94180 94180) - (94095 94098) - (94031 94031) - (92976 92982) - (92912 92916) - (78919 78933) - (78912 78912) - (73538 73538) - (73536 73536) - (73526 73530) - (73472 73473) - (73459 73460) - (73111 73111) - (73109 73109) - (73104 73105) - (73031 73031) - (73023 73029) - (73020 73021) - (73018 73018) - (73009 73014) - (72885 72886) - (72882 72883) - (72874 72880) - (72850 72871) - (72767 72767) - (72760 72765) - (72752 72758) - (72344 72345) - (72330 72342) - (72281 72283) - (72273 72278) - (72263 72263) - (72251 72254) - (72243 72248) - (72193 72202) - (72160 72160) - (72154 72155) - (72148 72151) - (72003 72003) - (71998 71998) - (71995 71996) - (71984 71984) - (71737 71738) - (71727 71735) - (71463 71467) - (71458 71461) - (71453 71455) - (71351 71351) - (71344 71349) - (71341 71341) - (71339 71339) - (71231 71232) - (71229 71229) - (71219 71226) - (71132 71133) - (71103 71104) - (71100 71101) - (71090 71093) - (71087 71087) - (70850 70851) - (70847 70848) - (70845 70845) - (70842 70842) - (70835 70840) - (70832 70832) - (70750 70750) - (70726 70726) - (70722 70724) - (70712 70719) - (70512 70516) - (70502 70508) - (70487 70487) - (70464 70464) 
- (70462 70462) - (70459 70460) - (70400 70401) - (70371 70378) - (70367 70367) - (70209 70209) - (70206 70206) - (70198 70199) - (70196 70196) - (70191 70193) - (70095 70095) - (70089 70092) - (70070 70078) - (70016 70017) - (70003 70003) - (69933 69940) - (69927 69931) - (69888 69890) - (69826 69826) - (69817 69818) - (69811 69814) - (69759 69761) - (69747 69748) - (69744 69744) - (69688 69702) - (69633 69633) - (69506 69509) - (69446 69456) - (69373 69375) - (69291 69292) - (68900 68903) - (68325 68326) - (68159 68159) - (68152 68154) - (68108 68111) - (68101 68102) - (68097 68099) - (66422 66426) - (66272 66272) - (66045 66045) - (65438 65439) - (65056 65071) - (65024 65039) - (64286 64286) - (44013 44013) - (44008 44008) - (44005 44005) - (43766 43766) - (43756 43757) - (43713 43713) - (43710 43711) - (43703 43704) - (43698 43700) - (43696 43696) - (43644 43644) - (43596 43596) - (43587 43587) - (43573 43574) - (43569 43570) - (43561 43566) - (43493 43493) - (43452 43453) - (43446 43449) - (43443 43443) - (43392 43394) - (43335 43345) - (43302 43309) - (43263 43263) - (43232 43249) - (43204 43205) - (43052 43052) - (43045 43046) - (43019 43019) - (43014 43014) - (43010 43010) - (42736 42737) - (42654 42655) - (42612 42621) - (42608 42610) - (42607 42607) - (12441 12442) - (12334 12335) - (12330 12333) - (11744 11775) - (11647 11647) - (11503 11505) - (8421 8432) - (8418 8420) - (8417 8417) - (8413 8416) - (8400 8412) - (8204 8204) - (7616 7679) - (7416 7417) - (7412 7412) - (7405 7405) - (7394 7400) - (7380 7392) - (7376 7378) - (7222 7223) - (7212 7219) - (7151 7153) - (7149 7149) - (7144 7145) - (7142 7142) - (7083 7085) - (7080 7081) - (7074 7077) - (7040 7041) - (7019 7027) - (6978 6978) - (6972 6972) - (6966 6970) - (6965 6965) - (6964 6964) - (6912 6915) - (6847 6862) - (6846 6846) - (6832 6845) - (6783 6783) - (6771 6780) - (6757 6764) - (6754 6754) - (6752 6752) - (6744 6750) - (6742 6742) - (6683 6683) - (6679 6680) - (6457 6459) - (6450 6450) - (6439 
6440) - (6432 6434) - (6313 6313) - (6277 6278) - (6159 6159) - (6155 6157) - (6109 6109) - (6089 6099) - (6086 6086) - (6071 6077) - (6068 6069) - (6002 6003) - (5970 5971) - (5938 5939) - (5906 5908) - (4957 4959) - (4253 4253) - (4237 4237) - (4229 4230) - (4226 4226) - (4209 4212) - (4190 4192) - (4184 4185) - (4157 4158) - (4153 4154) - (4146 4151) - (4141 4144) - (4038 4038) - (3993 4028) - (3981 3991) - (3974 3975) - (3968 3972) - (3953 3966) - (3897 3897) - (3895 3895) - (3893 3893) - (3864 3865) - (3784 3790) - (3764 3772) - (3761 3761) - (3655 3662) - (3636 3642) - (3633 3633) - (3551 3551) - (3542 3542) - (3538 3540) - (3535 3535) - (3530 3530) - (3457 3457) - (3426 3427) - (3415 3415) - (3405 3405) - (3393 3396) - (3390 3390) - (3387 3388) - (3328 3329) - (3298 3299) - (3285 3286) - (3276 3277) - (3270 3270) - (3266 3266) - (3263 3263) - (3260 3260) - (3201 3201) - (3170 3171) - (3157 3158) - (3146 3149) - (3142 3144) - (3134 3136) - (3132 3132) - (3076 3076) - (3072 3072) - (3031 3031) - (3021 3021) - (3008 3008) - (3006 3006) - (2946 2946) - (2914 2915) - (2903 2903) - (2901 2902) - (2893 2893) - (2881 2884) - (2879 2879) - (2878 2878) - (2876 2876) - (2817 2817) - (2810 2815) - (2786 2787) - (2765 2765) - (2759 2760) - (2753 2757) - (2748 2748) - (2689 2690) - (2677 2677) - (2672 2673) - (2641 2641) - (2635 2637) - (2631 2632) - (2625 2626) - (2620 2620) - (2561 2562) - (2558 2558) - (2530 2531) - (2519 2519) - (2509 2509) - (2497 2500) - (2494 2494) - (2492 2492) - (2433 2433) - (2402 2403) - (2385 2391) - (2381 2381) - (2369 2376) - (2364 2364) - (2362 2362) - (2275 2306) - (2250 2273) - (2200 2207) - (2137 2139) - (2089 2093) - (2085 2087) - (2075 2083) - (2070 2073) - (2045 2045) - (2027 2035) - (1958 1968) - (1840 1866) - (1809 1809) - (1770 1773) - (1767 1768) - (1759 1764) - (1750 1756) - (1648 1648) - (1611 1631) - (1552 1562) - (1479 1479) - (1476 1477) - (1473 1474) - (1471 1471) - (1425 1469) - (1160 1161) - (1155 1159) - (768 879)) - 
(spacing-mark - (119149 119149) - (119142 119142) - (94192 94193) - (94033 94087) - (73537 73537) - (73534 73535) - (73524 73525) - (73475 73475) - (73461 73462) - (73110 73110) - (73107 73108) - (73098 73102) - (72884 72884) - (72881 72881) - (72873 72873) - (72766 72766) - (72751 72751) - (72343 72343) - (72279 72280) - (72249 72249) - (72164 72164) - (72156 72159) - (72145 72147) - (72002 72002) - (72000 72000) - (71997 71997) - (71991 71992) - (71985 71989) - (71736 71736) - (71724 71726) - (71462 71462) - (71350 71350) - (71342 71343) - (71340 71340) - (71230 71230) - (71227 71228) - (71216 71218) - (71102 71102) - (71096 71099) - (71088 71089) - (70849 70849) - (70846 70846) - (70843 70844) - (70841 70841) - (70833 70834) - (70725 70725) - (70720 70721) - (70709 70711) - (70498 70499) - (70475 70477) - (70471 70472) - (70465 70468) - (70463 70463) - (70402 70403) - (70368 70370) - (70197 70197) - (70194 70195) - (70188 70190) - (70094 70094) - (70079 70080) - (70067 70069) - (70018 70018) - (69957 69958) - (69932 69932) - (69815 69816) - (69808 69810) - (69762 69762) - (69634 69634) - (69632 69632) - (44012 44012) - (44009 44010) - (44006 44007) - (44003 44004) - (43765 43765) - (43758 43759) - (43755 43755) - (43597 43597) - (43571 43572) - (43567 43568) - (43454 43456) - (43450 43451) - (43444 43445) - (43395 43395) - (43346 43347) - (43188 43203) - (43136 43137) - (43047 43047) - (43043 43044) - (7415 7415) - (7393 7393) - (7220 7221) - (7204 7211) - (7154 7155) - (7150 7150) - (7146 7148) - (7143 7143) - (7082 7082) - (7078 7079) - (7073 7073) - (7042 7042) - (6979 6980) - (6973 6977) - (6971 6971) - (6916 6916) - (6765 6770) - (6743 6743) - (6741 6741) - (6681 6682) - (6451 6456) - (6448 6449) - (6441 6443) - (6435 6438) - (6087 6088) - (6078 6085) - (6070 6070) - (5940 5940) - (5909 5909) - (4228 4228) - (4182 4183) - (4155 4156) - (4145 4145) - (3967 3967) - (3902 3903) - (3763 3763) - (3635 3635) - (3570 3571) - (3544 3550) - (3536 3537) - (3458 3459) 
- (3402 3404) - (3398 3400) - (3391 3392) - (3330 3331) - (3315 3315) - (3274 3275) - (3271 3272) - (3267 3268) - (3264 3265) - (3262 3262) - (3202 3203) - (3137 3140) - (3073 3075) - (3018 3020) - (3014 3016) - (3009 3010) - (3007 3007) - (2891 2892) - (2887 2888) - (2880 2880) - (2818 2819) - (2763 2764) - (2761 2761) - (2750 2752) - (2691 2691) - (2622 2624) - (2563 2563) - (2507 2508) - (2503 2504) - (2495 2496) - (2434 2435) - (2382 2383) - (2377 2380) - (2366 2368) - (2363 2363) - (2307 2307)) - (hangul-syllable-lvt - (55177 55203) - (55149 55175) - (55121 55147) - (55093 55119) - (55065 55091) - (55037 55063) - (55009 55035) - (54981 55007) - (54953 54979) - (54925 54951) - (54897 54923) - (54869 54895) - (54841 54867) - (54813 54839) - (54785 54811) - (54757 54783) - (54729 54755) - (54701 54727) - (54673 54699) - (54645 54671) - (54617 54643) - (54589 54615) - (54561 54587) - (54533 54559) - (54505 54531) - (54477 54503) - (54449 54475) - (54421 54447) - (54393 54419) - (54365 54391) - (54337 54363) - (54309 54335) - (54281 54307) - (54253 54279) - (54225 54251) - (54197 54223) - (54169 54195) - (54141 54167) - (54113 54139) - (54085 54111) - (54057 54083) - (54029 54055) - (54001 54027) - (53973 53999) - (53945 53971) - (53917 53943) - (53889 53915) - (53861 53887) - (53833 53859) - (53805 53831) - (53777 53803) - (53749 53775) - (53721 53747) - (53693 53719) - (53665 53691) - (53637 53663) - (53609 53635) - (53581 53607) - (53553 53579) - (53525 53551) - (53497 53523) - (53469 53495) - (53441 53467) - (53413 53439) - (53385 53411) - (53357 53383) - (53329 53355) - (53301 53327) - (53273 53299) - (53245 53271) - (53217 53243) - (53189 53215) - (53161 53187) - (53133 53159) - (53105 53131) - (53077 53103) - (53049 53075) - (53021 53047) - (52993 53019) - (52965 52991) - (52937 52963) - (52909 52935) - (52881 52907) - (52853 52879) - (52825 52851) - (52797 52823) - (52769 52795) - (52741 52767) - (52713 52739) - (52685 52711) - (52657 52683) - (52629 52655) 
- (52601 52627) - (52573 52599) - (52545 52571) - (52517 52543) - (52489 52515) - (52461 52487) - (52433 52459) - (52405 52431) - (52377 52403) - (52349 52375) - (52321 52347) - (52293 52319) - (52265 52291) - (52237 52263) - (52209 52235) - (52181 52207) - (52153 52179) - (52125 52151) - (52097 52123) - (52069 52095) - (52041 52067) - (52013 52039) - (51985 52011) - (51957 51983) - (51929 51955) - (51901 51927) - (51873 51899) - (51845 51871) - (51817 51843) - (51789 51815) - (51761 51787) - (51733 51759) - (51705 51731) - (51677 51703) - (51649 51675) - (51621 51647) - (51593 51619) - (51565 51591) - (51537 51563) - (51509 51535) - (51481 51507) - (51453 51479) - (51425 51451) - (51397 51423) - (51369 51395) - (51341 51367) - (51313 51339) - (51285 51311) - (51257 51283) - (51229 51255) - (51201 51227) - (51173 51199) - (51145 51171) - (51117 51143) - (51089 51115) - (51061 51087) - (51033 51059) - (51005 51031) - (50977 51003) - (50949 50975) - (50921 50947) - (50893 50919) - (50865 50891) - (50837 50863) - (50809 50835) - (50781 50807) - (50753 50779) - (50725 50751) - (50697 50723) - (50669 50695) - (50641 50667) - (50613 50639) - (50585 50611) - (50557 50583) - (50529 50555) - (50501 50527) - (50473 50499) - (50445 50471) - (50417 50443) - (50389 50415) - (50361 50387) - (50333 50359) - (50305 50331) - (50277 50303) - (50249 50275) - (50221 50247) - (50193 50219) - (50165 50191) - (50137 50163) - (50109 50135) - (50081 50107) - (50053 50079) - (50025 50051) - (49997 50023) - (49969 49995) - (49941 49967) - (49913 49939) - (49885 49911) - (49857 49883) - (49829 49855) - (49801 49827) - (49773 49799) - (49745 49771) - (49717 49743) - (49689 49715) - (49661 49687) - (49633 49659) - (49605 49631) - (49577 49603) - (49549 49575) - (49521 49547) - (49493 49519) - (49465 49491) - (49437 49463) - (49409 49435) - (49381 49407) - (49353 49379) - (49325 49351) - (49297 49323) - (49269 49295) - (49241 49267) - (49213 49239) - (49185 49211) - (49157 49183) - (49129 49155) 
- (49101 49127) - (49073 49099) - (49045 49071) - (49017 49043) - (48989 49015) - (48961 48987) - (48933 48959) - (48905 48931) - (48877 48903) - (48849 48875) - (48821 48847) - (48793 48819) - (48765 48791) - (48737 48763) - (48709 48735) - (48681 48707) - (48653 48679) - (48625 48651) - (48597 48623) - (48569 48595) - (48541 48567) - (48513 48539) - (48485 48511) - (48457 48483) - (48429 48455) - (48401 48427) - (48373 48399) - (48345 48371) - (48317 48343) - (48289 48315) - (48261 48287) - (48233 48259) - (48205 48231) - (48177 48203) - (48149 48175) - (48121 48147) - (48093 48119) - (48065 48091) - (48037 48063) - (48009 48035) - (47981 48007) - (47953 47979) - (47925 47951) - (47897 47923) - (47869 47895) - (47841 47867) - (47813 47839) - (47785 47811) - (47757 47783) - (47729 47755) - (47701 47727) - (47673 47699) - (47645 47671) - (47617 47643) - (47589 47615) - (47561 47587) - (47533 47559) - (47505 47531) - (47477 47503) - (47449 47475) - (47421 47447) - (47393 47419) - (47365 47391) - (47337 47363) - (47309 47335) - (47281 47307) - (47253 47279) - (47225 47251) - (47197 47223) - (47169 47195) - (47141 47167) - (47113 47139) - (47085 47111) - (47057 47083) - (47029 47055) - (47001 47027) - (46973 46999) - (46945 46971) - (46917 46943) - (46889 46915) - (46861 46887) - (46833 46859) - (46805 46831) - (46777 46803) - (46749 46775) - (46721 46747) - (46693 46719) - (46665 46691) - (46637 46663) - (46609 46635) - (46581 46607) - (46553 46579) - (46525 46551) - (46497 46523) - (46469 46495) - (46441 46467) - (46413 46439) - (46385 46411) - (46357 46383) - (46329 46355) - (46301 46327) - (46273 46299) - (46245 46271) - (46217 46243) - (46189 46215) - (46161 46187) - (46133 46159) - (46105 46131) - (46077 46103) - (46049 46075) - (46021 46047) - (45993 46019) - (45965 45991) - (45937 45963) - (45909 45935) - (45881 45907) - (45853 45879) - (45825 45851) - (45797 45823) - (45769 45795) - (45741 45767) - (45713 45739) - (45685 45711) - (45657 45683) - (45629 45655) 
-         (45601 45627)
-         (45573 45599)
-         (45545 45571)
-         (45517 45543)
-         (45489 45515)
-         (45461 45487)
-         (45433 45459)
-         (45405 45431)
-         (45377 45403)
-         (45349 45375)
-         (45321 45347)
-         (45293 45319)
-         (45265 45291)
-         (45237 45263)
-         (45209 45235)
-         (45181 45207)
-         (45153 45179)
-         (45125 45151)
-         (45097 45123)
-         (45069 45095)
-         (45041 45067)
-         (45013 45039)
-         (44985 45011)
-         (44957 44983)
-         (44929 44955)
-         (44901 44927)
-         (44873 44899)
-         (44845 44871)
-         (44817 44843)
-         (44789 44815)
-         (44761 44787)
-         (44733 44759)
-         (44705 44731)
-         (44677 44703)
-         (44649 44675)
-         (44621 44647)
-         (44593 44619)
-         (44565 44591)
-         (44537 44563)
-         (44509 44535)
-         (44481 44507)
-         (44453 44479)
-         (44425 44451)
-         (44397 44423)
-         (44369 44395)
-         (44341 44367)
-         (44313 44339)
-         (44285 44311)
-         (44257 44283)
-         (44229 44255)
-         (44201 44227)
-         (44173 44199)
-         (44145 44171)
-         (44117 44143)
-         (44089 44115)
-         (44061 44087)
-         (44033 44059))
-       (zero-width-joiner (8205 8205)))))
+(define-immutable-record-type <grapheme>
+  (_make-grapheme width delta-width modification? state glyphs-reverse glyphs-promise string-promise)
+  grapheme?
+  (width grapheme-width)
+  (delta-width grapheme-delta-width)
+  (modification? grapheme-modification?)
+  (state grapheme-state)
+  (glyphs-reverse grapheme-glyphs-reverse)
+  (glyphs-promise _grapheme-glyphs-promise)
+  (string-promise _grapheme-string-promise))
 
-(define char-set:grapheme-hangul-syllable-l (char-set))
-(define char-set:grapheme-hangul-syllable-v (char-set))
-(define char-set:grapheme-hangul-syllable-lv (char-set))
-(define char-set:grapheme-hangul-syllable-lvt (char-set))
-(define char-set:grapheme-prepend (char-set))
-(define char-set:grapheme-carriage-return (char-set))
-(define char-set:grapheme-line-feed (char-set))
-(define char-set:grapheme-control (char-set))
-(define char-set:grapheme-extend (char-set))
-(define char-set:grapheme-regional-indicator (char-set))
-(define char-set:grapheme-spacing-mark (char-set))
-(define char-set:grapheme-zero-width-joiner (char-set))
+(define (make-grapheme width delta modification? state glyphs-reverse)
+  "Build a grapheme record from GLYPHS-REVERSE, its characters with the newest first.
+WIDTH is the total width, DELTA the width added by the newest glyph, MODIFICATION?
+whether the record extends the previous grapheme, and STATE the iterator state.
+The in-order glyph list and the string are only computed when first requested."
+  (_make-grapheme
+   width delta modification? state glyphs-reverse
+   (delay (reverse glyphs-reverse))
+   (delay (reverse-list->string glyphs-reverse))))
 
-(define grapheme-charsets
-  (list (list 'hangul-syllable-l char-set:grapheme-hangul-syllable-l)
-        (list 'hangul-syllable-v char-set:grapheme-hangul-syllable-v)
-        (list 'hangul-syllable-lv char-set:grapheme-hangul-syllable-lv)
-        (list 'hangul-syllable-lvt char-set:grapheme-hangul-syllable-lvt)
-        (list 'prepend char-set:grapheme-prepend)
-        (list 'carriage-return char-set:grapheme-carriage-return)
-        (list 'line-feed char-set:grapheme-line-feed)
-        (list 'control char-set:grapheme-control)
-        (list 'extend char-set:grapheme-extend)
-        (list 'regional-indicator char-set:grapheme-regional-indicator)
-        (list 'spacing-mark char-set:grapheme-spacing-mark)
-        (list 'zero-width-joiner char-set:grapheme-zero-width-joiner)))
+(define (grapheme-glyphs grapheme)
+  "Return a lazily-constructed list of glyphs in the grapheme"
+  (force (_grapheme-glyphs-promise grapheme)))
 
-(ranges->charset!
-  hashtable
-  'hangul-syllable-l
-  char-set:grapheme-hangul-syllable-l)
-(ranges->charset!
-  hashtable
-  'hangul-syllable-v
-  char-set:grapheme-hangul-syllable-v)
-(ranges->charset!
-  hashtable
-  'hangul-syllable-lv
-  char-set:grapheme-hangul-syllable-lv)
-(ranges->charset!
-  hashtable
-  'hangul-syllable-lvt
-  char-set:grapheme-hangul-syllable-lvt)
-(ranges->charset! hashtable 'prepend char-set:grapheme-prepend)
-(ranges->charset! hashtable 'carriage-return char-set:grapheme-carriage-return)
-(ranges->charset! hashtable 'line-feed char-set:grapheme-line-feed)
-(ranges->charset! hashtable 'control char-set:grapheme-control)
-(ranges->charset! hashtable 'extend char-set:grapheme-extend)
-(ranges->charset!
-  hashtable
-  'regional-indicator
-  char-set:grapheme-regional-indicator)
-(ranges->charset! hashtable 'spacing-mark char-set:grapheme-spacing-mark)
-(ranges->charset!
-  hashtable
-  'zero-width-joiner
-  char-set:grapheme-zero-width-joiner)
-
-(set! grapheme-charsets
-  (cons (list 'extended-pictographic char-set:emoji-extended-pictographic)
-        grapheme-charsets))
+(define (grapheme-string grapheme)
+  "Return a lazily-constructed string of the glyphs in the grapheme."
+  (force (_grapheme-string-promise grapheme)))
diff --git a/uniseg/graphemes/iterator.scm b/uniseg/graphemes/iterator.scm
new file mode 100644
index 0000000..c473134
--- /dev/null
+++ b/uniseg/graphemes/iterator.scm
@@ -0,0 +1,143 @@
+(define-module (uniseg graphemes iterator)
+  #:use-module (uniseg)
+  #:use-module (uniseg graphemes)
+  #:use-module (ice-9 match)
+  #:use-module (srfi srfi-71)
+  #:use-module (srfi srfi-41)
+  #:export (make-grapheme-iterator))
+
+(define (make-grapheme-iterator)
+  "Return a stateful procedure that takes one character (or the EOF object) per call and returns the grapheme record built so far, or #f at EOF."
+  ;; Transition function: given the current state and the grapheme property of
+  ;; the incoming character, return (values NEXT-STATE BOUNDARY-BEFORE-CHAR?).
+  ;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
+  (define (state-machine cur-state cur-prop)
+    (match (list cur-state cur-prop)
+      ;; Specifics need to go first, and then non-specifics afterwards
+      ;; SPECIFIC RULES - no `_' in the match
+
+      ;; Grapheme boundary #3s
+      (('carriage-return 'line-feed)
+       (values 'control+line-feed #f))
+
+      ;; Grapheme boundary #6s
+      (('hangul-syllable-l 'hangul-syllable-l)
+       (values 'hangul-syllable-l #f))
+      ((or ('hangul-syllable-l 'hangul-syllable-v)
+           ('hangul-syllable-l 'hangul-syllable-lv))
+       (values 'hangul-syllable-lv #f))
+      (('hangul-syllable-l 'hangul-syllable-lvt)
+       (values 'hangul-syllable-lvt #f))
+
+      ;; Grapheme boundary #7s
+      (('hangul-syllable-lv 'hangul-syllable-v)
+       (values 'hangul-syllable-lv #f))
+      (('hangul-syllable-lv 'hangul-syllable-t)
+       (values 'hangul-syllable-lvt #f))
+
+      ;; Grapheme boundary #8s
+      (('hangul-syllable-lvt 'hangul-syllable-t)
+       (values 'hangul-syllable-lvt #f))
+
+      ;; Grapheme boundary #11s (emoji!)
+      (('extended-pictographic 'extend)
+       (values 'extended-pictographic #f))
+      (('extended-pictographic 'zero-width-joiner)
+       (values 'extended-pictographic+zero-width-joiner #f))
+      (('extended-pictographic+zero-width-joiner 'extended-pictographic)
+       (values 'extended-pictographic #f))
+
+      ;; Grapheme boundaries #12s and #13s
+      (('regional-indicator-odd 'regional-indicator)
+       (values 'regional-indicator-even #f))
+      (('regional-indicator-even 'regional-indicator)
+       (values 'regional-indicator-odd #t))
+
+      ;; NON-SPECIFIC RULES
+
+      ;; Grapheme boundary #4n
+      ((or ('carriage-return _)
+           ('control+line-feed _))
+       (values 'any #t))
+
+      ;; Grapheme boundary #5n
+      ((_ 'carriage-return)
+       (values 'carriage-return #t))
+      ((or (_ 'line-feed)
+           (_ 'control))
+       (values 'control+line-feed #t))
+      ((_ 'hangul-syllable-l)
+       (values 'hangul-syllable-l #t))
+
+      ;; Grapheme boundary #7n
+      ((or (_ 'hangul-syllable-lv)
+           (_ 'hangul-syllable-v))
+       (values 'hangul-syllable-lv #t))
+
+      ;; Grapheme boundary #8n
+      ((or (_ 'hangul-syllable-lvt)
+           (_ 'hangul-syllable-t))
+       (values 'hangul-syllable-lvt #t))
+
+      ;; Grapheme boundary #9n
+      ((or (_ 'extend)
+           (_ 'zero-width-joiner))
+       (values 'any #f))
+
+      ;; Grapheme boundary #9n-A
+      ((_ 'spacing-mark)
+       (values 'any #f))
+
+      ;; Grapheme boundary #9n-B
+      (('prepend _)
+       (values 'any #f))
+      ((_ 'prepend)
+       (values 'prepend #t))
+
+      ;; Grapheme boundary #11n (emoji!)
+      ((_ 'extended-pictographic)
+       (values 'extended-pictographic #t))
+
+      ;; Grapheme boundaries #12n and #13n
+      ((_ 'regional-indicator)
+       (values 'regional-indicator-odd #t))
+
+      ;; Everything else considered a boundary
+      (_ (values 'any #t))))
+
+  (define %current-grapheme #f)
+
+  (define (set-grapheme! width delta modification? state glyphs-reverse)
+    (let ((new (make-grapheme width delta modification? state glyphs-reverse)))
+      (set! %current-grapheme new)
+      new))
+
+  (define (iterate-through-grapheme glyph)
+    "Feed GLYPH to the iterator. Returns the updated grapheme record, or #f (after clearing the current grapheme) when GLYPH is the EOF object."
+    (if (eof-object? glyph)
+        ;; eof means nothing to do, clear grapheme and return false
+        (begin
+          (set! %current-grapheme #f)
+          #f)
+        (begin
+          (let* ((glyph-width prop (char-width glyph))
+                 (cur-state (if %current-grapheme
+                                (grapheme-state %current-grapheme)
+                                'any))
+                 (next-state boundary? (state-machine cur-state prop)))
+
+            ;; Boundary is between this glyph and previous
+            (if (or boundary? (not %current-grapheme))
+                ;; A boundary, or a first glyph with nothing before it to extend
+                ;; (e.g. a leading combining mark), starts a new one-glyph grapheme
+                (set-grapheme! glyph-width glyph-width #f next-state (list glyph))
+                ;; Otherwise this glyph extends the current grapheme, which the
+                ;; test above guarantees to exist
+                (let* ((cur-width (grapheme-width %current-grapheme))
+                       (cur-glyphs-reverse (grapheme-glyphs-reverse %current-grapheme))
+                       (new-width (+ cur-width glyph-width))
+                       (new-glyphs-reverse (cons glyph cur-glyphs-reverse)))
+
+                  (set-grapheme! new-width glyph-width #t next-state new-glyphs-reverse)))))))
+
+  iterate-through-grapheme)
diff --git a/uniseg/graphemes/stream.scm b/uniseg/graphemes/stream.scm
index 6a9a4eb..288e7c7 100644
--- a/uniseg/graphemes/stream.scm
+++ b/uniseg/graphemes/stream.scm
@@ -1,183 +1,24 @@
 (define-module (uniseg graphemes stream)
-  #:use-module (uniseg)
-  #:use-module (uniseg internal)
+  #:use-module (uniseg graphemes iterator)
   #:use-module (ice-9 textual-ports)
-  #:use-module (ice-9 match)
-  #:use-module (srfi srfi-71)
   #:use-module (srfi srfi-41)
-  #:use-module (srfi srfi-9 gnu)
-  #:export (make-grapheme
-            grapheme?
-            grapheme-glyphs
-            grapheme-string
-            grapheme-width
-            string->grapheme-stream
+  #:export (string->grapheme-stream
             input->grapheme-stream))
 
-;; TODO: the golang uniseg also does word and sentence boundaries. These state machines could be implemented if we wanted to.
-(define-immutable-record-type <grapheme>
-  (make-grapheme width glyphs-promise string-promise)
-  grapheme?
-  (width grapheme-width)
-  (glyphs-promise _grapheme-glyphs-promise)
-  (string-promise _grapheme-string-promise))
-
-(define (grapheme-glyphs grapheme)
-  "Return a lazily-constructed list of glyphs in the grapheme"
-  (force (_grapheme-glyphs-promise grapheme)))
-
-(define (grapheme-string grapheme)
-  "Return a lazily-constructed string of the glyphs in the grapheme."
-  (force (_grapheme-string-promise grapheme)))
-
 (define (string->grapheme-stream str)
   "Given a string, create a (lazy) stream of graphemes."
-  (with-input-from-string str
-    (λ () (input->grapheme-stream (current-input-port)))))
+  (call-with-input-string str input->grapheme-stream))
 
-(define-stream (input->grapheme-stream port)
+
+(define (input->grapheme-stream port)
   "Given an input port, create a (lazy) stream of graphemes."
-  ;; The first is what state we are in, and the next is the grapheme
-  ;; property of the current character.
-  ;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
-  (define (state-machine cur-state cur-prop)
-    (match (list cur-state cur-prop)
-      ;; Specifics need to go first, and then non-specifics afterwards
-      ;; SPECIFIC RULES - no `_' in the match
-
-      ;; Grapheme boundary #3s
-      (('carriage-return 'line-feed)
-       (values 'control+line-feed #f))
-
-      ;; Grapheme boundary #6s
-      (('hangul-syllable-l 'hangul-syllable-l)
-       (values 'hangul-syllable-l #f))
-      ((or ('hangul-syllable-l 'hangul-syllable-v)
-           ('hangul-syllable-l 'hangul-syllable-lv))
-       (values 'hangul-syllable-lv #f))
-      (('hangul-syllable-l 'hangul-syllable-lvt)
-       (values 'hangul-syllable-lvt #f))
-
-      ;; Grapheme boundary #7s
-      (('hangul-syllable-lv 'hangul-syllable-v)
-       (values 'hangul-syllable-lv #f))
-      (('hangul-syllable-lv 'hangul-syllable-t)
-       (values 'hangul-syllable-lvt #f))
-
-      ;; Grapheme boundary #8s
-      (('hangul-syllable-lvt 'hangul-syllable-t)
-       (values 'hangul-syllable-lvt #f))
-
-      ;; Grapheme boundary #11s (emoji!)
-      (('extended-pictographic 'extend)
-       (values 'extended-pictographic #f))
-      (('extended-pictographic 'zero-width-joiner)
-       (values 'extended-pictographic+zero-width-joiner #f))
-      (('extended-pictographic+zero-width-joiner 'extended-pictographic)
-       (values 'extended-pictographic #f))
-
-      ;; Grapheme boundaries #12s and #13s
-      (('regional-indicator-odd 'regional-indicator)
-       (values 'regioinal-indicator-even #f))
-      (('regional-indicator-even 'regional-indicator)
-       (values 'regional-indicator-odd #t))
-
-      ;; NON-SPECIFIC RULES
-
-      ;; Grapheme boundary #4n
-      ((or ('carriage-return _)
-           ('control+line-feed _))
-       (values 'any #t))
-
-      ;; Grapheme boundary #5n
-      ((_ 'carriage-return)
-       (values 'carriage-return #t))
-      ((or (_ 'line-feed)
-           (_ 'control))
-       (values 'control+line-feed #t))
-      ((_ 'hangul-syllable-l)
-       (values 'hangul-syllable-l #t))
-
-      ;; Grapheme boundary #7n
-      ((or (_ 'hangul-syllable-lv)
-           (_ 'hangul-syllable-v))
-       (values 'hangul-syllable-lv #t))
-
-      ;; Grapheme boundary #8n
-      ((or (_ 'hangul-syllable-lvt)
-           (_ 'hangul-syllable-t))
-       (values 'hangul-syllable-lvt #t))
-
-      ;; Grapheme boundary #9n
-      ((or (_ 'extend)
-           (_ 'zero-width-joiner))
-       (values 'any #f))
-
-      ;; Grapheme boundary #9n-A
-      ((_ 'spacing-mark)
-       (values 'any #f))
-
-      ;; Grapheme boundary #9n-B
-      (('prepend _)
-       (values 'any #f))
-      ((_ 'prepend)
-       (values 'prepend #t))
-
-      ;; Grapheme boundary #11n (emoji!)
-      ((_ 'extended-pictographic)
-       (pk "EXTENDED")
-       (values 'extended-pictographic #t))
-
-      ;; Grapheme boundaries #12n and #13n
-      ((_ 'regional-indicator)
-       (values 'regional-indicator-odd #t))
-
-      ;; Everything else considered a boundary
-      (else (values 'any #t))))
-
-  ;; State variables, initialized with the first glyph we get!
-  (define first-glyph (get-char port))
-  (define hit-eof? (eof-object? first-glyph))
-  (define %glyphs-reverse (list first-glyph))
-  (define %grapheme-width 0)
-
-  (define (iterate-through-grapheme state)
-    (define glyph (peek-char port))
-    (if (eof-object? glyph)
-        (begin
-          (set! hit-eof? #t)
-          state)
-        (begin
-          (let* ((width property (char-width glyph))
-                 (next-state boundary? (cpk-values glyph state property '= (state-machine state property))))
-
-            (if boundary?
-                state
-                (let ((new-width (+ %grapheme-width width))
-                      (new-glyphs-reverse (cons glyph %glyphs-reverse)))
-
-                  ;; Officially induct this char into the cluster
-                  (get-char port)
-                  (set! %grapheme-width new-width)
-                  (set! %glyphs-reverse new-glyphs-reverse)
-                  (iterate-through-grapheme next-state)))))))
-
-  ;; Need to explicitly get the first state
-  ;; Skip all this if it's an empty stream at start
-  (if hit-eof?
-      stream-null
-      (let ((first-width first-prop (char-width first-glyph)))
-        (set! %grapheme-width first-width)
-        (iterate-through-grapheme (state-machine 'any first-prop))
-
-        (let ((grapheme (make-grapheme
-                         %grapheme-width
-                         ;; Delay to avoid construction of unnecessary lists and strings!
-                         (delay (reverse %glyphs-reverse))
-                         (delay (reverse-list->string %glyphs-reverse)))))
-
-          ;; If we hit the eof here, we need one last stream entry, otherwise iterate further
-          (stream-cons grapheme
-                       (if hit-eof?
-                           stream-null
-                           (input->grapheme-stream port)))))))
+  ;; A single stateful iterator backs the whole stream: every character read
+  ;; from PORT is pushed through it, in order.
+  (let ((next-grapheme (make-grapheme-iterator)))
+    (stream-let produce ()
+      ;; Runs only when the stream is forced, so PORT is consumed lazily.
+      (let ((grapheme (next-grapheme (get-char port))))
+        ;; The iterator yields #f once `get-char' reports end of file.
+        (if grapheme
+            (stream-cons grapheme (produce))
+            stream-null)))))