From e53876be52838fe28ebc3e613e6214375eb29405 Mon Sep 17 00:00:00 2001 From: Vivianne Langdon Date: Sun, 3 Mar 2024 20:25:47 -0500 Subject: [PATCH] Holy shit it works --- scripts/generate-east-asian.in | 3 +- scripts/generate-emoji.in | 80 +- scripts/generate-graphemes.in | 6 +- uniseg.scm | 60 +- uniseg/eastasian.scm | 1182 ++++++++++----------- uniseg/emoji.scm | 1748 +++++++++++++++++++++++--------- uniseg/graphemes.scm | 442 ++++---- uniseg/graphemes/stream.scm | 111 +- uniseg/internal.scm | 12 +- 9 files changed, 2261 insertions(+), 1383 deletions(-) diff --git a/scripts/generate-east-asian.in b/scripts/generate-east-asian.in index ff0f01d..913b34f 100644 --- a/scripts/generate-east-asian.in +++ b/scripts/generate-east-asian.in @@ -112,8 +112,7 @@ #:use-module (ice-9 hash-table) #:use-module (srfi srfi-1) #:use-module (uniseg internal) - #:export - (,@ea-symbol-names + #:export (,@ea-symbol-names eastasian-charsets))) (pretty-print diff --git a/scripts/generate-emoji.in b/scripts/generate-emoji.in index 6d255b7..83d7455 100644 --- a/scripts/generate-emoji.in +++ b/scripts/generate-emoji.in @@ -27,9 +27,38 @@ (define-peg-pattern @emoji-line body (and @emoji-datum (* @ws) @comment)) -(define emoji-list '()) +(define emoji-ht (make-hash-table 5)) + +(define emoji-sets + '(emoji + emoji-presentation + emoji-modifier + emoji-modifier-base + emoji-component + emoji-extended-pictographic)) + +(define emoji-symbol-names + (map + (λ (set) + (string->symbol + (string-concatenate + (list "char-set:" + (symbol->string set))))) + emoji-sets)) + +(define emoji-sets-and-symbols + (zip emoji-sets emoji-symbol-names)) (define (process-emoji-line line) + (define (string->category str) + (match str + ("Emoji" 'emoji) + ("Emoji_Presentation" 'emoji-presentation) + ("Emoji_Modifier" 'emoji-modifier) + ("Emoji_Modifier_Base" 'emoji-modifier-base) + ("Emoji_Component" 'emoji-component) + ("Extended_Pictographic" 'emoji-extended-pictographic))) + (define tree (peg:tree (match-pattern @emoji-line line))) (unless (or (not tree) @@ -39,7 +68,7 @@ (match tree (((('@codepoint-range ('@codepoint codepoints) ...) - ('@emoji-category category)) + ('@emoji-category cat-str)) ('@comment comment)) (with-exception-handler @@ -48,15 +77,14 @@ (format-exception-msg stdout err)) (λ () (let ((f (hex-string->integer (first codepoints))) - (l (hex-string->integer (last codepoints)))) + (l (hex-string->integer (last codepoints))) + (category (string->category cat-str))) (when (or (in-surrogate-range f) (in-surrogate-range l)) (error (format #f "chars in surrogate range ~x -> ~x" f l))) - (when (and (equal? "Extended_Pictographic" category) - (> l #xFF)) - (set! emoji-list (cons (list f l) emoji-list))))) + (cons-hash-list! emoji-ht category f l))) #:unwind? #t))))) (define line-func @@ -78,27 +106,45 @@ (pretty-print `(define-module (uniseg emoji) + #:use-module (uniseg internal) + #:use-module (ice-9 hash-table) #:use-module (srfi srfi-1) - #:export (char-set:extended-pictographic))) + #:export (,@emoji-symbol-names + emoji-charsets))) (pretty-print - `(define emoji-list ',emoji-list)) + `(define emoji-ht + (alist->hashq-table ',(hash-map->list cons emoji-ht)))) + + (display "\n") + + (for-each + (λ (sym) + (pretty-print + `(define ,sym (char-set)))) + emoji-symbol-names) (display "\n") (pretty-print - `(define char-set:extended-pictographic (char-set))) + `(define emoji-charsets + (list + ,@(map + (λ (pair) + (let ((f (first pair)) + (s (second pair))) + `(list ',f ,s))) + emoji-sets-and-symbols)))) (display "\n") - (pretty-print - `(for-each - (λ (pair) - (ucs-range->char-set! - (first pair) - (+ 1 (second pair)) - #t char-set:extended-pictographic)) - emoji-list)) + (for-each + (λ (set-pair) + (let ((name (first set-pair)) + (symbol (second set-pair))) + (pretty-print + `(ranges->charset! emoji-ht ',name ,symbol)))) + emoji-sets-and-symbols) (display "Code generation complete.\n" stdout))) diff --git a/scripts/generate-graphemes.in b/scripts/generate-graphemes.in index 18b75ca..ea96d71 100644 --- a/scripts/generate-graphemes.in +++ b/scripts/generate-graphemes.in @@ -42,7 +42,7 @@ extend regional-indicator spacing-mark - zerowidth-joiner)) + zero-width-joiner)) (define grapheme-symbol-names (map @@ -71,7 +71,7 @@ ("Extend" 'extend) ("Regional_Indicator" 'regional-indicator) ("SpacingMark" 'spacing-mark) - ("ZWJ" 'zerowidth-joiner))) + ("ZWJ" 'zero-width-joiner))) (define tree (peg:tree (match-pattern @grapheme-line line))) @@ -123,7 +123,7 @@ #:use-module (ice-9 hash-table) #:use-module (srfi srfi-1) #:use-module (uniseg internal) - #:export ( ,@grapheme-symbol-names + #:export (,@grapheme-symbol-names grapheme-charsets))) diff --git a/uniseg.scm b/uniseg.scm index dc5aa08..740b9b2 100644 --- a/uniseg.scm +++ b/uniseg.scm @@ -1,17 +1,21 @@ (define-module (uniseg) #:use-module (srfi srfi-1) #:use-module (ice-9 match) + #:use-module (srfi srfi-41) #:use-module (uniseg emoji) #:use-module (uniseg graphemes) + #:use-module (uniseg graphemes stream) #:use-module (uniseg eastasian) #:export (emoji? - grapheme-property - eastasian-property)) + char->grapheme-property + char->eastasian-property + char-width + string-width)) (define (emoji? char) - (char-set-contains? char-set:extended-pictographic char)) + (char-set-contains? char-set:emoji-extended-pictographic char)) -(define (get-prop sets char) +(define (char->prop sets char) (let ((pair (find (λ (p) (char-set-contains? (second p) char)) @@ -19,12 +23,12 @@ (and pair (first pair)))) -(define (grapheme-property char) +(define (char->grapheme-property char) "Find the Unicode grapheme cluster property, as defined by https://www.unicode.org/reports/tr44/" - (or (get-prop grapheme-charsets char) + (or (char->prop grapheme-charsets char) 'other)) -(define (eastasian-property char) +(define (char->eastasian-property char) "Find the Unicode East Asian width property, as defined by https://www.unicode.org/reports/tr11/" (define (<==> lo hi) "Create a range predicate, inclusive between two numbers" @@ -47,7 +51,7 @@ (define range-plane-3? (<==> #x30000 #x3FFFD)) - (or (get-prop eastasian-charsets char) + (or (char->prop eastasian-charsets char) (let ((int (char->integer char))) (match int ((or (? range-unified-ideographs-ext-a?) @@ -58,3 +62,43 @@ 'doublewidth) (else 'neutral))))) + +(define (char-width rune) + "Convert from a character to its visible width. Returns the width as well as the grapheme property to reduce calculations" + (define grapheme-prop (char->grapheme-property rune)) + (values + (or + (match grapheme-prop + ((or 'control + 'carriage-return + 'line-feed + 'extend + 'zero-width-joiner) + 0) + ('regional-indicator + 2) + ('extended-pictographic + (if + (char-set-contains? char-set:emoji-presentation rune) + 2 + 1)) + (else #f) + ) + (match (char->integer rune) + (#x2E3A 3) + (#x2E3B 4) + (else #f)) + (match (char->eastasian-property rune) + ('doublewidth 2) + ;; Default for monospace is to render ambiguous as 1 + ('ambiguous 1) + (else 1))) + grapheme-prop)) + +(define (string-width str) + "Get the width of a string by adding up the widths of each grapheme" + (stream-fold + (λ (val grapheme) + (+ val (grapheme-width grapheme))) + 0 + (string->grapheme-stream str))) diff --git a/uniseg/eastasian.scm b/uniseg/eastasian.scm index 01518a1..49425bd 100644 --- a/uniseg/eastasian.scm +++ b/uniseg/eastasian.scm @@ -18,596 +18,7 @@ eastasian-charsets)) (define eastasian-ht (alist->hashq-table - '((ambiguous - (1048576 1114109) - (983040 1048573) - (917760 917999) - (127387 127404) - (127375 127376) - (127344 127373) - (127280 127337) - (127248 127277) - (127232 127242) - (65533 65533) - (65024 65039) - (57344 63743) - (12872 12879) - (11094 11097) - (10102 10111) - (10045 10045) - (9982 9983) - (9979 9980) - (9974 9977) - (9972 9972) - (9963 9969) - (9960 9961) - (9955 9955) - (9941 9953) - (9935 9939) - (9926 9933) - (9919 9919) - (9886 9887) - (9839 9839) - (9836 9837) - (9831 9834) - (9827 9829) - (9824 9825) - (9794 9794) - (9792 9792) - (9758 9758) - (9756 9756) - (9742 9743) - (9737 9737) - (9733 9734) - (9711 9711) - (9698 9701) - (9678 9681) - (9675 9675) - (9670 9672) - (9665 9665) - (9664 9664) - (9660 9661) - (9655 9655) - (9654 9654) - (9650 9651) - (9635 9641) - (9632 9633) - (9618 9621) - (9600 9615) - (9552 9587) - (9472 9547) - (9451 9471) - (9372 9449) - (9312 9371) - (8978 8978) - (8895 8895) - (8869 8869) - (8857 8857) - (8853 8853) - (8838 8839) - (8834 8835) - (8814 8815) - (8810 8811) - (8804 8807) - (8800 8801) - (8786 8786) - (8780 8780) - (8776 8776) - (8764 8765) - (8756 8759) - (8750 8750) - (8743 8748) - (8741 8741) - (8739 8739) - (8733 8736) - (8730 8730) - (8725 8725) - (8721 8721) - (8719 8719) - (8715 8715) - (8711 8712) - (8706 8707) - (8704 8704) - (8679 8679) - (8660 8660) - (8658 8658) - (8632 8633) - (8597 8601) - (8592 8596) - (8585 8585) - (8560 8569) - (8544 8555) - (8539 8542) - (8531 8532) - (8491 8491) - (8486 8486) - (8481 8482) - (8470 8470) - (8467 8467) - (8457 8457) - (8453 8453) - (8451 8451) - (8364 8364) - (8321 8324) - (8319 8319) - (8308 8308) - (8254 8254) - (8251 8251) - (8245 8245) - (8242 8243) - (8240 8240) - (8228 8231) - (8224 8226) - (8221 8221) - (8220 8220) - (8217 8217) - (8216 8216) - (8214 8214) - (8211 8213) - (8208 8208) - (1105 1105) - (1040 1103) - (1025 1025) - (963 969) - (945 961) - (931 937) - (913 929) - (735 735) - (733 733) - (728 731) - (720 720) - (717 717) - (713 715) - (711 711) - (708 708) - (609 609) - (593 593) - (476 476) - (474 474) - (472 472) - (470 470) - (468 468) - (466 466) - (464 464) - (462 462) - (363 363) - (358 359) - (338 339) - (333 333) - (328 331) - (324 324) - (319 322) - (312 312) - (305 307) - (299 299) - (294 295) - (283 283) - (275 275) - (273 273) - (257 257) - (254 254) - (252 252) - (248 250) - (247 247) - (242 243) - (240 240) - (236 237) - (232 234) - (230 230) - (222 225) - (216 216) - (215 215) - (208 208) - (198 198) - (191 191) - (188 190) - (186 186) - (185 185) - (184 184) - (182 183) - (180 180) - (178 179) - (177 177) - (176 176) - (174 174) - (173 173) - (170 170) - (168 168) - (167 167) - (164 164) - (161 161)) - (halfwidth - (65517 65518) - (65513 65516) - (65512 65512) - (65498 65500) - (65490 65495) - (65482 65487) - (65474 65479) - (65440 65470) - (65438 65439) - (65393 65437) - (65392 65392) - (65382 65391) - (65380 65381) - (65379 65379) - (65378 65378) - (65377 65377) - (8361 8361)) - (narrow - (10630 10630) - (10629 10629) - (10221 10221) - (10220 10220) - (10219 10219) - (10218 10218) - (10217 10217) - (10216 10216) - (10215 10215) - (10214 10214) - (175 175) - (172 172) - (166 166) - (165 165) - (162 163) - (126 126) - (125 125) - (124 124) - (123 123) - (97 122) - (96 96) - (95 95) - (94 94) - (93 93) - (92 92) - (91 91) - (65 90) - (63 64) - (60 62) - (58 59) - (48 57) - (46 47) - (45 45) - (44 44) - (43 43) - (42 42) - (41 41) - (40 40) - (37 39) - (36 36) - (33 35) - (32 32)) - (doublewidth - (201547 262141) - (196608 201546) - (195104 196605) - (195102 195103) - (194560 195101) - (191457 194559) - (183984 191456) - (183970 183983) - (178208 183969) - (178206 178207) - (177984 178205) - (177973 177983) - (173824 177972) - (173790 173823) - (131072 173789) - (129744 129750) - (129728 129730) - (129712 129718) - (129680 129704) - (129664 129670) - (129656 129658) - (129648 129652) - (129485 129535) - (129402 129483) - (129351 129400) - (129340 129349) - (129292 129338) - (128992 129003) - (128756 128764) - (128747 128748) - (128725 128727) - (128720 128722) - (128716 128716) - (128640 128709) - (128512 128591) - (128507 128511) - (128420 128420) - (128405 128406) - (128378 128378) - (128336 128359) - (128331 128334) - (128255 128317) - (128066 128252) - (128064 128064) - (128000 128062) - (127995 127999) - (127992 127994) - (127988 127988) - (127968 127984) - (127951 127955) - (127904 127946) - (127870 127891) - (127799 127868) - (127789 127797) - (127744 127776) - (127584 127589) - (127568 127569) - (127552 127560) - (127504 127547) - (127488 127490) - (127377 127386) - (127374 127374) - (127183 127183) - (126980 126980) - (110960 111355) - (110948 110951) - (110928 110930) - (110848 110878) - (110592 110847) - (101632 101640) - (101120 101589) - (100352 101119) - (94208 100343) - (94192 94193) - (94180 94180) - (94179 94179) - (94178 94178) - (94176 94177) - (65509 65510) - (65508 65508) - (65507 65507) - (65506 65506) - (65504 65505) - (65376 65376) - (65375 65375) - (65374 65374) - (65373 65373) - (65372 65372) - (65371 65371) - (65345 65370) - (65344 65344) - (65343 65343) - (65342 65342) - (65341 65341) - (65340 65340) - (65339 65339) - (65313 65338) - (65311 65312) - (65308 65310) - (65306 65307) - (65296 65305) - (65294 65295) - (65293 65293) - (65292 65292) - (65291 65291) - (65290 65290) - (65289 65289) - (65288 65288) - (65285 65287) - (65284 65284) - (65281 65283) - (65130 65131) - (65129 65129) - (65128 65128) - (65124 65126) - (65123 65123) - (65122 65122) - (65119 65121) - (65118 65118) - (65117 65117) - (65116 65116) - (65115 65115) - (65114 65114) - (65113 65113) - (65112 65112) - (65108 65111) - (65104 65106) - (65101 65103) - (65097 65100) - (65096 65096) - (65095 65095) - (65093 65094) - (65092 65092) - (65091 65091) - (65090 65090) - (65089 65089) - (65088 65088) - (65087 65087) - (65086 65086) - (65085 65085) - (65084 65084) - (65083 65083) - (65082 65082) - (65081 65081) - (65080 65080) - (65079 65079) - (65078 65078) - (65077 65077) - (65075 65076) - (65073 65074) - (65072 65072) - (65049 65049) - (65048 65048) - (65047 65047) - (65040 65046) - (64218 64255) - (64112 64217) - (64110 64111) - (63744 64109) - (44032 55203) - (43360 43388) - (42128 42182) - (40982 42124) - (40981 40981) - (40960 40980) - (40957 40959) - (19968 40956) - (13312 19903) - (13056 13311) - (12992 13055) - (12977 12991) - (12938 12976) - (12928 12937) - (12896 12927) - (12881 12895) - (12880 12880) - (12842 12871) - (12832 12841) - (12800 12830) - (12784 12799) - (12736 12771) - (12704 12735) - (12694 12703) - (12690 12693) - (12688 12689) - (12593 12686) - (12549 12591) - (12543 12543) - (12540 12542) - (12539 12539) - (12449 12538) - (12448 12448) - (12447 12447) - (12445 12446) - (12443 12444) - (12353 12438) - (12350 12350) - (12349 12349) - (12348 12348) - (12347 12347) - (12344 12346) - (12342 12343) - (12337 12341) - (12336 12336) - (12334 12335) - (12330 12333) - (12321 12329) - (12320 12320) - (12318 12319) - (12317 12317) - (12316 12316) - (12315 12315) - (12314 12314) - (12313 12313) - (12312 12312) - (12311 12311) - (12310 12310) - (12309 12309) - (12308 12308) - (12306 12307) - (12305 12305) - (12304 12304) - (12303 12303) - (12302 12302) - (12301 12301) - (12300 12300) - (12299 12299) - (12298 12298) - (12297 12297) - (12296 12296) - (12295 12295) - (12294 12294) - (12293 12293) - (12292 12292) - (12289 12291) - (12288 12288) - (12272 12283) - (12032 12245) - (11931 12019) - (11904 11929) - (11093 11093) - (11088 11088) - (11035 11036) - (10175 10175) - (10160 10160) - (10133 10135) - (10071 10071) - (10067 10069) - (10062 10062) - (10060 10060) - (10024 10024) - (9994 9995) - (9989 9989) - (9981 9981) - (9978 9978) - (9973 9973) - (9970 9971) - (9962 9962) - (9940 9940) - (9934 9934) - (9924 9925) - (9917 9918) - (9898 9899) - (9889 9889) - (9875 9875) - (9855 9855) - (9800 9811) - (9748 9749) - (9725 9726) - (9203 9203) - (9200 9200) - (9193 9196) - (9002 9002) - (9001 9001) - (8986 8987) - (4352 4447)) - (combining - (125136 125142) - (122918 122922) - (122915 122916) - (122907 122913) - (122888 122904) - (122880 122886) - (119362 119364) - (119210 119213) - (119173 119179) - (119163 119170) - (119149 119154) - (119143 119145) - (119141 119142) - (92912 92916) - (70512 70516) - (70502 70508) - (70459 70460) - (70400 70401) - (69446 69456) - (69291 69292) - (66422 66426) - (66045 66045) - (65056 65071) - (43232 43249) - (42736 42737) - (42654 42655) - (42612 42621) - (42608 42610) - (42607 42607) - (12441 12442) - (11744 11775) - (11503 11505) - (8421 8432) - (8418 8420) - (8417 8417) - (8413 8416) - (8400 8412) - (7675 7679) - (7616 7673) - (7019 7027) - (6847 6848) - (6846 6846) - (6832 6845) - (6783 6783) - (4957 4959) - (3328 3329) - (3076 3076) - (3072 3072) - (2027 2035) - (1160 1161) - (1155 1159) - (768 879)) - (neutral + '((neutral (917536 917631) (917505 917505) (130032 130041) @@ -2498,7 +1909,596 @@ (160 160) (128 159) (127 127) - (0 31))))) + (0 31)) + (ambiguous + (1048576 1114109) + (983040 1048573) + (917760 917999) + (127387 127404) + (127375 127376) + (127344 127373) + (127280 127337) + (127248 127277) + (127232 127242) + (65533 65533) + (65024 65039) + (57344 63743) + (12872 12879) + (11094 11097) + (10102 10111) + (10045 10045) + (9982 9983) + (9979 9980) + (9974 9977) + (9972 9972) + (9963 9969) + (9960 9961) + (9955 9955) + (9941 9953) + (9935 9939) + (9926 9933) + (9919 9919) + (9886 9887) + (9839 9839) + (9836 9837) + (9831 9834) + (9827 9829) + (9824 9825) + (9794 9794) + (9792 9792) + (9758 9758) + (9756 9756) + (9742 9743) + (9737 9737) + (9733 9734) + (9711 9711) + (9698 9701) + (9678 9681) + (9675 9675) + (9670 9672) + (9665 9665) + (9664 9664) + (9660 9661) + (9655 9655) + (9654 9654) + (9650 9651) + (9635 9641) + (9632 9633) + (9618 9621) + (9600 9615) + (9552 9587) + (9472 9547) + (9451 9471) + (9372 9449) + (9312 9371) + (8978 8978) + (8895 8895) + (8869 8869) + (8857 8857) + (8853 8853) + (8838 8839) + (8834 8835) + (8814 8815) + (8810 8811) + (8804 8807) + (8800 8801) + (8786 8786) + (8780 8780) + (8776 8776) + (8764 8765) + (8756 8759) + (8750 8750) + (8743 8748) + (8741 8741) + (8739 8739) + (8733 8736) + (8730 8730) + (8725 8725) + (8721 8721) + (8719 8719) + (8715 8715) + (8711 8712) + (8706 8707) + (8704 8704) + (8679 8679) + (8660 8660) + (8658 8658) + (8632 8633) + (8597 8601) + (8592 8596) + (8585 8585) + (8560 8569) + (8544 8555) + (8539 8542) + (8531 8532) + (8491 8491) + (8486 8486) + (8481 8482) + (8470 8470) + (8467 8467) + (8457 8457) + (8453 8453) + (8451 8451) + (8364 8364) + (8321 8324) + (8319 8319) + (8308 8308) + (8254 8254) + (8251 8251) + (8245 8245) + (8242 8243) + (8240 8240) + (8228 8231) + (8224 8226) + (8221 8221) + (8220 8220) + (8217 8217) + (8216 8216) + (8214 8214) + (8211 8213) + (8208 8208) + (1105 1105) + (1040 1103) + (1025 1025) + (963 969) + (945 961) + (931 937) + (913 929) + (735 735) + (733 733) + (728 731) + (720 720) + (717 717) + (713 715) + (711 711) + (708 708) + (609 609) + (593 593) + (476 476) + (474 474) + (472 472) + (470 470) + (468 468) + (466 466) + (464 464) + (462 462) + (363 363) + (358 359) + (338 339) + (333 333) + (328 331) + (324 324) + (319 322) + (312 312) + (305 307) + (299 299) + (294 295) + (283 283) + (275 275) + (273 273) + (257 257) + (254 254) + (252 252) + (248 250) + (247 247) + (242 243) + (240 240) + (236 237) + (232 234) + (230 230) + (222 225) + (216 216) + (215 215) + (208 208) + (198 198) + (191 191) + (188 190) + (186 186) + (185 185) + (184 184) + (182 183) + (180 180) + (178 179) + (177 177) + (176 176) + (174 174) + (173 173) + (170 170) + (168 168) + (167 167) + (164 164) + (161 161)) + (combining + (125136 125142) + (122918 122922) + (122915 122916) + (122907 122913) + (122888 122904) + (122880 122886) + (119362 119364) + (119210 119213) + (119173 119179) + (119163 119170) + (119149 119154) + (119143 119145) + (119141 119142) + (92912 92916) + (70512 70516) + (70502 70508) + (70459 70460) + (70400 70401) + (69446 69456) + (69291 69292) + (66422 66426) + (66045 66045) + (65056 65071) + (43232 43249) + (42736 42737) + (42654 42655) + (42612 42621) + (42608 42610) + (42607 42607) + (12441 12442) + (11744 11775) + (11503 11505) + (8421 8432) + (8418 8420) + (8417 8417) + (8413 8416) + (8400 8412) + (7675 7679) + (7616 7673) + (7019 7027) + (6847 6848) + (6846 6846) + (6832 6845) + (6783 6783) + (4957 4959) + (3328 3329) + (3076 3076) + (3072 3072) + (2027 2035) + (1160 1161) + (1155 1159) + (768 879)) + (halfwidth + (65517 65518) + (65513 65516) + (65512 65512) + (65498 65500) + (65490 65495) + (65482 65487) + (65474 65479) + (65440 65470) + (65438 65439) + (65393 65437) + (65392 65392) + (65382 65391) + (65380 65381) + (65379 65379) + (65378 65378) + (65377 65377) + (8361 8361)) + (doublewidth + (201547 262141) + (196608 201546) + (195104 196605) + (195102 195103) + (194560 195101) + (191457 194559) + (183984 191456) + (183970 183983) + (178208 183969) + (178206 178207) + (177984 178205) + (177973 177983) + (173824 177972) + (173790 173823) + (131072 173789) + (129744 129750) + (129728 129730) + (129712 129718) + (129680 129704) + (129664 129670) + (129656 129658) + (129648 129652) + (129485 129535) + (129402 129483) + (129351 129400) + (129340 129349) + (129292 129338) + (128992 129003) + (128756 128764) + (128747 128748) + (128725 128727) + (128720 128722) + (128716 128716) + (128640 128709) + (128512 128591) + (128507 128511) + (128420 128420) + (128405 128406) + (128378 128378) + (128336 128359) + (128331 128334) + (128255 128317) + (128066 128252) + (128064 128064) + (128000 128062) + (127995 127999) + (127992 127994) + (127988 127988) + (127968 127984) + (127951 127955) + (127904 127946) + (127870 127891) + (127799 127868) + (127789 127797) + (127744 127776) + (127584 127589) + (127568 127569) + (127552 127560) + (127504 127547) + (127488 127490) + (127377 127386) + (127374 127374) + (127183 127183) + (126980 126980) + (110960 111355) + (110948 110951) + (110928 110930) + (110848 110878) + (110592 110847) + (101632 101640) + (101120 101589) + (100352 101119) + (94208 100343) + (94192 94193) + (94180 94180) + (94179 94179) + (94178 94178) + (94176 94177) + (65509 65510) + (65508 65508) + (65507 65507) + (65506 65506) + (65504 65505) + (65376 65376) + (65375 65375) + (65374 65374) + (65373 65373) + (65372 65372) + (65371 65371) + (65345 65370) + (65344 65344) + (65343 65343) + (65342 65342) + (65341 65341) + (65340 65340) + (65339 65339) + (65313 65338) + (65311 65312) + (65308 65310) + (65306 65307) + (65296 65305) + (65294 65295) + (65293 65293) + (65292 65292) + (65291 65291) + (65290 65290) + (65289 65289) + (65288 65288) + (65285 65287) + (65284 65284) + (65281 65283) + (65130 65131) + (65129 65129) + (65128 65128) + (65124 65126) + (65123 65123) + (65122 65122) + (65119 65121) + (65118 65118) + (65117 65117) + (65116 65116) + (65115 65115) + (65114 65114) + (65113 65113) + (65112 65112) + (65108 65111) + (65104 65106) + (65101 65103) + (65097 65100) + (65096 65096) + (65095 65095) + (65093 65094) + (65092 65092) + (65091 65091) + (65090 65090) + (65089 65089) + (65088 65088) + (65087 65087) + (65086 65086) + (65085 65085) + (65084 65084) + (65083 65083) + (65082 65082) + (65081 65081) + (65080 65080) + (65079 65079) + (65078 65078) + (65077 65077) + (65075 65076) + (65073 65074) + (65072 65072) + (65049 65049) + (65048 65048) + (65047 65047) + (65040 65046) + (64218 64255) + (64112 64217) + (64110 64111) + (63744 64109) + (44032 55203) + (43360 43388) + (42128 42182) + (40982 42124) + (40981 40981) + (40960 40980) + (40957 40959) + (19968 40956) + (13312 19903) + (13056 13311) + (12992 13055) + (12977 12991) + (12938 12976) + (12928 12937) + (12896 12927) + (12881 12895) + (12880 12880) + (12842 12871) + (12832 12841) + (12800 12830) + (12784 12799) + (12736 12771) + (12704 12735) + (12694 12703) + (12690 12693) + (12688 12689) + (12593 12686) + (12549 12591) + (12543 12543) + (12540 12542) + (12539 12539) + (12449 12538) + (12448 12448) + (12447 12447) + (12445 12446) + (12443 12444) + (12353 12438) + (12350 12350) + (12349 12349) + (12348 12348) + (12347 12347) + (12344 12346) + (12342 12343) + (12337 12341) + (12336 12336) + (12334 12335) + (12330 12333) + (12321 12329) + (12320 12320) + (12318 12319) + (12317 12317) + (12316 12316) + (12315 12315) + (12314 12314) + (12313 12313) + (12312 12312) + (12311 12311) + (12310 12310) + (12309 12309) + (12308 12308) + (12306 12307) + (12305 12305) + (12304 12304) + (12303 12303) + (12302 12302) + (12301 12301) + (12300 12300) + (12299 12299) + (12298 12298) + (12297 12297) + (12296 12296) + (12295 12295) + (12294 12294) + (12293 12293) + (12292 12292) + (12289 12291) + (12288 12288) + (12272 12283) + (12032 12245) + (11931 12019) + (11904 11929) + (11093 11093) + (11088 11088) + (11035 11036) + (10175 10175) + (10160 10160) + (10133 10135) + (10071 10071) + (10067 10069) + (10062 10062) + (10060 10060) + (10024 10024) + (9994 9995) + (9989 9989) + (9981 9981) + (9978 9978) + (9973 9973) + (9970 9971) + (9962 9962) + (9940 9940) + (9934 9934) + (9924 9925) + (9917 9918) + (9898 9899) + (9889 9889) + (9875 9875) + (9855 9855) + (9800 9811) + (9748 9749) + (9725 9726) + (9203 9203) + (9200 9200) + (9193 9196) + (9002 9002) + (9001 9001) + (8986 8987) + (4352 4447)) + (narrow + (10630 10630) + (10629 10629) + (10221 10221) + (10220 10220) + (10219 10219) + (10218 10218) + (10217 10217) + (10216 10216) + (10215 10215) + (10214 10214) + (175 175) + (172 172) + (166 166) + (165 165) + (162 163) + (126 126) + (125 125) + (124 124) + (123 123) + (97 122) + (96 96) + (95 95) + (94 94) + (93 93) + (92 92) + (91 91) + (65 90) + (63 64) + (60 62) + (58 59) + (48 57) + (46 47) + (45 45) + (44 44) + (43 43) + (42 42) + (41 41) + (40 40) + (37 39) + (36 36) + (33 35) + (32 32))))) (define char-set:eastasian-combining (char-set)) (define char-set:eastasian-doublewidth diff --git a/uniseg/emoji.scm b/uniseg/emoji.scm index 9b3f041..8a3e16b 100644 --- a/uniseg/emoji.scm +++ b/uniseg/emoji.scm @@ -3,508 +3,1256 @@ (define-module (uniseg emoji) #:use-module + (uniseg internal) + #:use-module + (ice-9 hash-table) + #:use-module (srfi srfi-1) #:export - (char-set:extended-pictographic)) -(define emoji-list - '((130048 131069) - (129751 129791) - (129744 129750) - (129731 129743) - (129728 129730) - (129719 129727) - (129712 129718) - (129705 129711) - (129686 129704) - (129680 129685) - (129671 129679) - (129667 129670) - (129664 129666) - (129659 129663) - (129656 129658) - (129653 129655) - (129652 129652) - (129648 129651) - (129536 129647) - (129511 129535) - (129488 129510) - (129485 129487) - (129484 129484) - (129483 129483) - (129475 129482) - (129473 129474) - (129472 129472) - (129466 129471) - (129456 129465) - (129454 129455) - (129451 129453) - (129445 129450) - (129443 129444) - (129432 129442) - (129426 129431) - (129413 129425) - (129408 129412) - (129404 129407) - (129403 129403) - (129402 129402) - (129401 129401) - (129399 129400) - (129395 129398) - (129394 129394) - (129393 129393) - (129388 129392) - (129375 129387) - (129360 129374) - (129357 129359) - (129356 129356) - (129351 129355) - (129344 129349) - (129343 129343) - (129340 129342) - (129331 129338) - (129329 129330) - (129328 129328) - (129320 129327) - (129312 129319) - (129311 129311) - (129305 129310) - (129296 129304) - (129293 129295) - (129292 129292) - (129198 129279) - (129160 129167) - (129114 129119) - (129096 129103) - (129036 129039) - (129004 129023) - (128992 129003) - (128981 128991) - (128884 128895) - (128765 128767) - (128763 128764) - (128762 128762) - (128761 128761) - (128759 128760) - (128756 128758) - (128755 128755) - (128753 128754) - (128752 128752) - (128749 128751) - (128747 128748) - (128746 128746) - (128745 128745) - (128742 128744) - (128736 128741) - (128728 128735) - (128726 128727) - (128725 128725) - (128723 128724) - (128721 128722) - (128720 128720) - (128717 128719) - (128716 128716) - (128715 128715) - (128710 128714) - (128705 128709) - (128704 128704) - (128703 128703) - (128697 128702) - (128695 128696) - (128694 128694) - (128691 128693) - (128690 128690) - (128686 128689) - (128679 128685) - (128678 128678) - (128676 128677) - (128675 128675) - (128674 128674) - (128667 128673) - (128665 128666) - (128664 128664) - (128663 128663) - (128662 128662) - (128661 128661) - (128660 128660) - (128657 128659) - (128656 128656) - (128655 128655) - (128654 128654) - (128653 128653) - (128652 128652) - (128650 128651) - (128649 128649) - (128648 128648) - (128647 128647) - (128646 128646) - (128643 128645) - (128641 128642) - (128640 128640) - (128581 128591) - (128577 128580) - (128567 128576) - (128566 128566) - (128565 128565) - (128564 128564) - (128560 128563) - (128558 128559) - (128557 128557) - (128556 128556) - (128552 128555) - (128550 128551) - (128544 128549) - (128543 128543) - (128540 128542) - (128539 128539) - (128538 128538) - (128537 128537) - (128536 128536) - (128535 128535) - (128534 128534) - (128533 128533) - (128530 128532) - (128529 128529) - (128528 128528) - (128527 128527) - (128526 128526) - (128521 128525) - (128519 128520) - (128513 128518) - (128512 128512) - (128507 128511) - (128506 128506) - (128500 128505) - (128499 128499) - (128496 128498) - (128495 128495) - (128489 128494) - (128488 128488) - (128484 128487) - (128483 128483) - (128482 128482) - (128481 128481) - (128479 128480) - (128476 128478) - (128468 128475) - (128465 128467) - (128453 128464) - (128450 128452) - (128445 128449) - (128444 128444) - (128435 128443) - (128433 128434) - (128425 128432) - (128424 128424) - (128422 128423) - (128421 128421) - (128420 128420) - (128407 128419) - (128405 128406) - (128401 128404) - (128400 128400) - (128398 128399) - (128394 128397) - (128392 128393) - (128391 128391) - (128379 128390) - (128378 128378) - (128371 128377) - (128369 128370) - (128367 128368) - (128360 128366) - (128348 128359) - (128336 128347) - (128335 128335) - (128331 128334) - (128329 128330) - (128326 128328) - (128302 128317) - (128300 128301) - (128278 128299) - (128277 128277) - (128266 128276) - (128265 128265) - (128264 128264) - (128260 128263) - (128259 128259) - (128255 128258) - (128254 128254) - (128253 128253) - (128249 128252) - (128248 128248) - (128246 128247) - (128245 128245) - (128240 128244) - (128239 128239) - (128238 128238) - (128236 128237) - (128184 128235) - (128182 128183) - (128174 128181) - (128173 128173) - (128110 128172) - (128108 128109) - (128102 128107) - (128101 128101) - (128066 128100) - (128065 128065) - (128064 128064) - (128063 128063) - (128043 128062) - (128042 128042) - (128023 128041) - (128022 128022) - (128021 128021) - (128020 128020) - (128019 128019) - (128017 128018) - (128015 128016) - (128012 128014) - (128009 128011) - (128008 128008) - (128000 128007) - (127992 127994) - (127991 127991) - (127990 127990) - (127989 127989) - (127988 127988) - (127987 127987) - (127985 127986) - (127973 127984) - (127972 127972) - (127968 127971) - (127956 127967) - (127951 127955) - (127947 127950) - (127946 127946) - (127945 127945) - (127944 127944) - (127943 127943) - (127942 127942) - (127941 127941) - (127904 127940) - (127902 127903) - (127900 127901) - (127897 127899) - (127896 127896) - (127894 127895) - (127892 127893) - (127872 127891) - (127870 127871) - (127869 127869) - (127868 127868) - (127825 127867) - (127824 127824) - (127820 127823) - (127819 127819) - (127799 127818) - (127798 127798) - (127796 127797) - (127794 127795) - (127792 127793) - (127789 127791) - (127780 127788) - (127778 127779) - (127777 127777) - (127775 127776) - (127773 127774) - (127772 127772) - (127771 127771) - (127770 127770) - (127769 127769) - (127766 127768) - (127763 127765) - (127762 127762) - (127761 127761) - (127760 127760) - (127759 127759) - (127757 127758) - (127744 127756) - (127570 127743) - (127568 127569) - (127561 127567) - (127548 127551) - (127538 127546) - (127535 127535) - (127514 127514) - (127491 127503) - (127489 127490) - (127405 127461) - (127377 127386) - (127374 127374) - (127358 127359) - (127344 127345) - (127340 127343) - (127279 127279) - (127245 127247) - (127184 127231) - (127183 127183) - (126981 127182) - (126980 126980) - (126976 126979) - (12953 12953) - (12951 12951) - (12349 12349) - (12336 12336) - (11093 11093) - (11088 11088) - (11035 11036) - (11013 11015) - (10548 10549) - (10175 10175) - (10160 10160) - (10145 10145) - (10133 10135) - (10085 10087) - (10084 10084) - (10083 10083) - (10071 10071) - (10067 10069) - (10062 10062) - (10060 10060) - (10055 10055) - (10052 10052) - (10035 10036) - (10024 10024) - (10017 10017) - (10013 10013) - (10006 10006) - (10004 10004) - (10002 10002) - (10000 10001) - (9999 9999) - (9998 9998) - (9997 9997) - (9992 9996) - (9989 9989) - (9987 9988) - (9986 9986) - (9982 9985) - (9981 9981) - (9979 9980) - (9978 9978) - (9975 9977) - (9974 9974) - (9973 9973) - (9972 9972) - (9970 9971) - (9968 9969) - (9963 9967) - (9962 9962) - (9961 9961) - (9941 9960) - (9940 9940) - (9939 9939) - (9938 9938) - (9937 9937) - (9936 9936) - (9935 9935) - (9934 9934) - (9929 9933) - (9928 9928) - (9926 9927) - (9924 9925) - (9919 9923) - (9917 9918) - (9906 9916) - (9904 9905) - (9900 9903) - (9898 9899) - (9896 9897) - (9895 9895) - (9890 9894) - (9888 9889) - (9885 9887) - (9883 9884) - (9882 9882) - (9881 9881) - (9880 9880) - (9878 9879) - (9877 9877) - (9876 9876) - (9875 9875) - (9874 9874) - (9872 9873) - (9856 9861) - (9855 9855) - (9854 9854) - (9852 9853) - (9851 9851) - (9833 9850) - (9832 9832) - (9831 9831) - (9829 9830) - (9828 9828) - (9827 9827) - (9825 9826) - (9824 9824) - (9823 9823) - (9812 9822) - (9800 9811) - (9795 9799) - (9794 9794) - (9793 9793) - (9792 9792) - (9787 9791) - (9786 9786) - (9784 9785) - (9776 9783) - (9775 9775) - (9774 9774) - (9771 9773) - (9770 9770) - (9767 9769) - (9766 9766) - (9764 9765) - (9762 9763) - (9761 9761) - (9760 9760) - (9758 9759) - (9757 9757) - (9753 9756) - (9752 9752) - (9750 9751) - (9748 9749) - (9746 9746) - (9745 9745) - (9743 9744) - (9742 9742) - (9735 9741) - (9733 9733) - (9732 9732) - (9730 9731) - (9728 9729) - (9723 9726) - (9664 9664) - (9654 9654) - (9642 9643) - (9410 9410) - (9208 9210) - (9203 9203) - (9201 9202) - (9200 9200) - (9199 9199) - (9197 9198) - (9193 9196) - (9167 9167) - (9096 9096) - (9000 9000) - (8986 8987) - (8617 8618) - (8596 8601) - (8505 8505) - (8482 8482) - (8265 8265) - (8252 8252))) + (char-set:emoji + char-set:emoji-presentation + char-set:emoji-modifier + char-set:emoji-modifier-base + char-set:emoji-component + char-set:emoji-extended-pictographic + emoji-charsets)) +(define emoji-ht + (alist->hashq-table + '((emoji-modifier (127995 127999)) + (emoji-component + (917536 917631) + (129456 129459) + (127995 127999) + (127462 127487) + (65039 65039) + (8419 8419) + (8205 8205) + (48 57) + (42 42) + (35 35)) + (emoji-modifier-base + (129489 129501) + (129485 129487) + (129467 129467) + (129464 129465) + (129461 129462) + (129399 129399) + (129340 129342) + (129331 129337) + (129329 129330) + (129328 129328) + (129318 129318) + (129311 129311) + (129305 129310) + (129304 129304) + (129295 129295) + (129292 129292) + (128716 128716) + (128704 128704) + (128694 128694) + (128692 128693) + (128675 128675) + (128587 128591) + (128581 128583) + (128405 128406) + (128400 128400) + (128378 128378) + (128372 128373) + (128170 128170) + (128145 128145) + (128143 128143) + (128133 128135) + (128129 128131) + (128124 128124) + (128110 128120) + (128108 128109) + (128102 128107) + (128070 128080) + (128066 128067) + (127947 127948) + (127946 127946) + (127943 127943) + (127938 127940) + (127877 127877) + (9997 9997) + (9994 9996) + (9977 9977) + (9757 9757)) + (emoji (129744 129750) + (129728 129730) + (129712 129718) + (129686 129704) + (129680 129685) + (129667 129670) + (129664 129666) + (129656 129658) + (129652 129652) + (129648 129651) + (129511 129535) + (129488 129510) + (129485 129487) + (129483 129483) + (129475 129482) + (129473 129474) + (129472 129472) + (129466 129471) + (129456 129465) + (129454 129455) + (129451 129453) + (129445 129450) + (129443 129444) + (129432 129442) + (129426 129431) + (129413 129425) + (129408 129412) + (129404 129407) + (129403 129403) + (129402 129402) + (129399 129400) + (129395 129398) + (129394 129394) + (129393 129393) + (129388 129392) + (129375 129387) + (129360 129374) + (129357 129359) + (129356 129356) + (129351 129355) + (129344 129349) + (129343 129343) + (129340 129342) + (129331 129338) + (129329 129330) + (129328 129328) + (129320 129327) + (129312 129319) + (129311 129311) + (129305 129310) + (129296 129304) + (129293 129295) + (129292 129292) + (128992 129003) + (128763 128764) + (128762 128762) + (128761 128761) + (128759 128760) + (128756 128758) + (128755 128755) + (128752 128752) + (128747 128748) + (128745 128745) + (128736 128741) + (128726 128727) + (128725 128725) + (128721 128722) + (128720 128720) + (128717 128719) + (128716 128716) + (128715 128715) + (128705 128709) + (128704 128704) + (128703 128703) + (128697 128702) + (128695 128696) + (128694 128694) + (128691 128693) + (128690 128690) + (128686 128689) + (128679 128685) + (128678 128678) + (128676 128677) + (128675 128675) + (128674 128674) + (128667 128673) + (128665 128666) + (128664 128664) + (128663 128663) + (128662 128662) + (128661 128661) + (128660 128660) + (128657 128659) + (128656 128656) + (128655 128655) + (128654 128654) + (128653 128653) + (128652 128652) + (128650 128651) + (128649 128649) + (128648 128648) + (128647 128647) + (128646 128646) + (128643 128645) + (128641 128642) + (128640 128640) + (128581 128591) + (128577 128580) + (128567 128576) + (128566 128566) + (128565 128565) + (128564 128564) + (128560 128563) + (128558 128559) + (128557 128557) + (128556 128556) + (128552 128555) + (128550 128551) + (128544 128549) + (128543 128543) + (128540 128542) + (128539 128539) + (128538 128538) + (128537 128537) + (128536 128536) + (128535 128535) + (128534 128534) + (128533 128533) + (128530 128532) + (128529 128529) + (128528 128528) + (128527 128527) + (128526 128526) + (128521 128525) + (128519 128520) + (128513 128518) + (128512 128512) + (128507 128511) + (128506 128506) + (128499 128499) + (128495 128495) + (128488 128488) + (128483 128483) + (128481 128481) + (128476 128478) + (128465 128467) + (128450 128452) + (128444 128444) + (128433 128434) + (128424 128424) + (128421 128421) + (128420 128420) + (128405 128406) + (128400 128400) + (128394 128397) + (128391 128391) + (128378 128378) + (128371 128377) + (128367 128368) + (128348 128359) + (128336 128347) + (128331 128334) + (128329 128330) + (128302 128317) + (128300 128301) + (128278 128299) + (128277 128277) + (128266 128276) + (128265 128265) + (128264 128264) + (128260 128263) + (128259 128259) + (128255 128258) + (128253 128253) + (128249 128252) + (128248 128248) + (128246 128247) + (128245 128245) + (128240 128244) + (128239 128239) + (128238 128238) + (128236 128237) + (128184 128235) + (128182 128183) + (128174 128181) + (128173 128173) + (128110 128172) + (128108 128109) + (128102 128107) + (128101 128101) + (128066 128100) + (128065 128065) + (128064 128064) + (128063 128063) + (128043 128062) + (128042 128042) + (128023 128041) + (128022 128022) + (128021 128021) + (128020 128020) + (128019 128019) + (128017 128018) + (128015 128016) + (128012 128014) + (128009 128011) + (128008 128008) + (127992 128007) + (127991 127991) + (127989 127989) + (127988 127988) + (127987 127987) + (127973 127984) + (127972 127972) + (127968 127971) + (127956 127967) + (127951 127955) + (127947 127950) + (127946 127946) + (127945 127945) + (127944 127944) + (127943 127943) + (127942 127942) + (127941 127941) + (127904 127940) + (127902 127903) + (127897 127899) + (127894 127895) + (127872 127891) + (127870 127871) + (127869 127869) + (127868 127868) + (127825 127867) + (127824 127824) + (127820 127823) + (127819 127819) + (127799 127818) + (127798 127798) + (127796 127797) + (127794 127795) + (127792 127793) + (127789 127791) + (127780 127788) + (127777 127777) + (127775 127776) + (127773 127774) + (127772 127772) + (127771 127771) + (127770 127770) + (127769 127769) + (127766 127768) + (127763 127765) + (127762 127762) + (127761 127761) + (127760 127760) + (127759 127759) + (127757 127758) + (127744 127756) + (127568 127569) + (127538 127546) + (127535 127535) + (127514 127514) + (127489 127490) + (127462 127487) + (127377 127386) + (127374 127374) + (127358 127359) + (127344 127345) + (127183 127183) + (126980 126980) + (12953 12953) + (12951 12951) + (12349 12349) + (12336 12336) + (11093 11093) + (11088 11088) + (11035 11036) + (11013 11015) + (10548 10549) + (10175 10175) + (10160 10160) + (10145 10145) + (10133 10135) + (10084 10084) + (10083 10083) + (10071 10071) + (10067 10069) + (10062 10062) + (10060 10060) + (10055 10055) + (10052 10052) + (10035 10036) + (10024 10024) + (10017 10017) + (10013 10013) + (10006 10006) + (10004 10004) + (10002 10002) + (9999 9999) + (9997 9997) + (9992 9996) + (9989 9989) + (9986 9986) + (9981 9981) + (9978 9978) + (9975 9977) + (9973 9973) + (9972 9972) + (9970 9971) + (9968 9969) + (9962 9962) + (9961 9961) + (9940 9940) + (9939 9939) + (9937 9937) + (9935 9935) + (9934 9934) + (9928 9928) + (9924 9925) + (9917 9918) + (9904 9905) + (9898 9899) + (9895 9895) + (9888 9889) + (9883 9884) + (9881 9881) + (9878 9879) + (9877 9877) + (9876 9876) + (9875 9875) + (9874 9874) + (9855 9855) + (9854 9854) + (9851 9851) + (9832 9832) + (9829 9830) + (9827 9827) + (9824 9824) + (9823 9823) + (9800 9811) + (9794 9794) + (9792 9792) + (9786 9786) + (9784 9785) + (9775 9775) + (9774 9774) + (9770 9770) + (9766 9766) + (9762 9763) + (9760 9760) + (9757 9757) + (9752 9752) + (9748 9749) + (9745 9745) + (9742 9742) + (9732 9732) + (9730 9731) + (9728 9729) + (9723 9726) + (9664 9664) + (9654 9654) + (9642 9643) + (9410 9410) + (9208 9210) + (9203 9203) + (9201 9202) + (9200 9200) + (9199 9199) + (9197 9198) + (9193 9196) + (9167 9167) + (9000 9000) + (8986 8987) + (8617 8618) + (8596 8601) + (8505 8505) + (8482 8482) + (8265 8265) + (8252 8252) + (174 174) + (169 169) + (48 57) + (42 42) + (35 35)) + (emoji-presentation + (129744 129750) + (129728 129730) + (129712 129718) + (129686 129704) + (129680 129685) + (129667 129670) + (129664 129666) + (129656 129658) + (129652 129652) + (129648 129651) + (129511 129535) + (129488 129510) + (129485 129487) + (129483 129483) + (129475 129482) + (129473 129474) + (129472 129472) + (129466 129471) + (129456 129465) + (129454 129455) + (129451 129453) + (129445 129450) + (129443 129444) + (129432 129442) + (129426 129431) + (129413 129425) + (129408 129412) + (129404 129407) + (129403 129403) + (129402 129402) + (129399 129400) + (129395 129398) + (129394 129394) + (129393 129393) + (129388 129392) + (129375 129387) + (129360 129374) + (129357 129359) + (129356 129356) + (129351 129355) + (129344 129349) + (129343 129343) + (129340 129342) + (129331 129338) + (129329 129330) + (129328 129328) + (129320 129327) + (129312 129319) + (129311 129311) + (129305 129310) + (129296 129304) + (129293 129295) + (129292 129292) + (128992 129003) + (128763 128764) + (128762 128762) + (128761 128761) + (128759 128760) + (128756 128758) + (128747 128748) + (128726 128727) + (128725 128725) + (128721 128722) + (128720 128720) + (128716 128716) + (128705 128709) + (128704 128704) + (128703 128703) + (128697 128702) + (128695 128696) + (128694 128694) + (128691 128693) + (128690 128690) + (128686 128689) + (128679 128685) + (128678 128678) + (128676 128677) + (128675 128675) + (128674 128674) + (128667 128673) + (128665 128666) + (128664 128664) + (128663 128663) + (128662 128662) + (128661 128661) + (128660 128660) + (128657 128659) + (128656 128656) + (128655 128655) + (128654 128654) + (128653 128653) + (128652 128652) + (128650 128651) + (128649 128649) + (128648 128648) + (128647 128647) + (128646 128646) + (128643 128645) + (128641 128642) + (128640 128640) + (128581 128591) + (128577 128580) + (128567 128576) + (128566 128566) + (128565 128565) + (128564 128564) + (128560 128563) + (128558 128559) + (128557 128557) + (128556 128556) + (128552 128555) + (128550 128551) + (128544 128549) + (128543 128543) + (128540 128542) + (128539 128539) + (128538 128538) + (128537 128537) + (128536 128536) + (128535 128535) + (128534 128534) + (128533 128533) + (128530 128532) + (128529 128529) + (128528 128528) + (128527 128527) + (128526 128526) + (128521 128525) + (128519 128520) + (128513 128518) + (128512 128512) + (128507 128511) + (128420 128420) + (128405 128406) + (128378 128378) + (128348 128359) + (128336 128347) + (128331 128334) + (128302 128317) + (128300 128301) + (128278 128299) + (128277 128277) + (128266 128276) + (128265 128265) + (128264 128264) + (128260 128263) + (128259 128259) + (128255 128258) + (128249 128252) + (128248 128248) + (128246 128247) + (128245 128245) + (128240 128244) + (128239 128239) + (128238 128238) + (128236 128237) + (128184 128235) + (128182 128183) + (128174 128181) + (128173 128173) + (128110 128172) + (128108 128109) + (128102 128107) + (128101 128101) + (128066 128100) + (128064 128064) + (128043 128062) + (128042 128042) + (128023 128041) + (128022 128022) + (128021 128021) + (128020 128020) + (128019 128019) + (128017 128018) + (128015 128016) + (128012 128014) + (128009 128011) + (128008 128008) + (127992 128007) + (127988 127988) + (127973 127984) + (127972 127972) + (127968 127971) + (127951 127955) + (127946 127946) + (127945 127945) + (127944 127944) + (127943 127943) + (127942 127942) + (127941 127941) + (127904 127940) + (127872 127891) + (127870 127871) + (127868 127868) + (127825 127867) + (127824 127824) + (127820 127823) + (127819 127819) + (127799 127818) + (127796 127797) + (127794 127795) + (127792 127793) + (127789 127791) + (127775 127776) + (127773 127774) + (127772 127772) + (127771 127771) + (127770 127770) + (127769 127769) + (127766 127768) + (127763 127765) + (127762 127762) + (127761 127761) + (127760 127760) + (127759 127759) + (127757 127758) + (127744 127756) + (127568 127569) + (127544 127546) + (127538 127542) + (127535 127535) + (127514 127514) + (127489 127489) + (127462 127487) + (127377 127386) + (127374 127374) + (127183 127183) + (126980 126980) + (11093 11093) + (11088 11088) + (11035 11036) + (10175 10175) + (10160 10160) + (10133 10135) + (10071 10071) + (10067 10069) + (10062 10062) + (10060 10060) + (10024 10024) + (9994 9995) + (9989 9989) + (9981 9981) + (9978 9978) + (9973 9973) + (9970 9971) + (9962 9962) + (9940 9940) + (9934 9934) + (9924 9925) + (9917 9918) + (9898 9899) + (9889 9889) + (9875 9875) + (9855 9855) + (9800 9811) + (9748 9749) + (9725 9726) + (9203 9203) + (9200 9200) + (9193 9196) + (8986 8987)) + (emoji-extended-pictographic + (130048 131069) + (129751 129791) + (129744 129750) + (129731 129743) + (129728 129730) + (129719 129727) + (129712 129718) + (129705 129711) + (129686 129704) + (129680 129685) + (129671 129679) + (129667 129670) + (129664 129666) + (129659 129663) + (129656 129658) + (129653 129655) + (129652 129652) + (129648 129651) + (129536 129647) + (129511 129535) + (129488 129510) + (129485 129487) + (129484 129484) + (129483 129483) + (129475 129482) + (129473 129474) + (129472 129472) + (129466 129471) + (129456 129465) + (129454 129455) + (129451 129453) + (129445 129450) + (129443 129444) + (129432 129442) + (129426 129431) + (129413 129425) + (129408 129412) + (129404 129407) + (129403 129403) + (129402 129402) + (129401 129401) + (129399 129400) + (129395 129398) + (129394 129394) + (129393 129393) + (129388 129392) + (129375 129387) + (129360 129374) + (129357 129359) + (129356 129356) + (129351 129355) + (129344 129349) + (129343 129343) + (129340 129342) + (129331 129338) + (129329 129330) + (129328 129328) + (129320 129327) + (129312 129319) + (129311 129311) + (129305 129310) + (129296 129304) + (129293 129295) + (129292 129292) + (129198 129279) + (129160 129167) + (129114 129119) + (129096 129103) + (129036 129039) + (129004 129023) + (128992 129003) + (128981 128991) + (128884 128895) + (128765 128767) + (128763 128764) + (128762 128762) + (128761 128761) + (128759 128760) + (128756 128758) + (128755 128755) + (128753 128754) + (128752 128752) + (128749 128751) + (128747 128748) + (128746 128746) + (128745 128745) + (128742 128744) + (128736 128741) + (128728 128735) + (128726 128727) + (128725 128725) + (128723 128724) + (128721 128722) + (128720 128720) + (128717 128719) + (128716 128716) + (128715 128715) + (128710 128714) + (128705 128709) + (128704 128704) + (128703 128703) + (128697 128702) + (128695 128696) + (128694 128694) + (128691 128693) + (128690 128690) + (128686 128689) + (128679 128685) + (128678 128678) + (128676 128677) + (128675 128675) + (128674 128674) + (128667 128673) + (128665 128666) + (128664 128664) + (128663 128663) + (128662 128662) + (128661 128661) + (128660 128660) + (128657 128659) + (128656 128656) + (128655 128655) + (128654 128654) + (128653 128653) + (128652 128652) + (128650 128651) + (128649 128649) + (128648 128648) + (128647 128647) + (128646 128646) + (128643 128645) + (128641 128642) + (128640 128640) + (128581 128591) + (128577 128580) + (128567 128576) + (128566 128566) + (128565 128565) + (128564 128564) + (128560 128563) + (128558 128559) + (128557 128557) + (128556 128556) + (128552 128555) + (128550 128551) + (128544 128549) + (128543 128543) + (128540 128542) + (128539 128539) + (128538 128538) + (128537 128537) + (128536 128536) + (128535 128535) + (128534 128534) + (128533 128533) + (128530 128532) + (128529 128529) + (128528 128528) + (128527 128527) + (128526 128526) + (128521 128525) + (128519 128520) + (128513 128518) + (128512 128512) + (128507 128511) + (128506 128506) + (128500 128505) + (128499 128499) + (128496 128498) + (128495 128495) + (128489 128494) + (128488 128488) + (128484 128487) + (128483 128483) + (128482 128482) + (128481 128481) + (128479 128480) + (128476 128478) + (128468 128475) + (128465 128467) + (128453 128464) + (128450 128452) + (128445 128449) + (128444 128444) + (128435 128443) + (128433 128434) + (128425 128432) + (128424 128424) + (128422 128423) + (128421 128421) + (128420 128420) + (128407 128419) + (128405 128406) + (128401 128404) + (128400 128400) + (128398 128399) + (128394 128397) + (128392 128393) + (128391 128391) + (128379 128390) + (128378 128378) + (128371 128377) + (128369 128370) + (128367 128368) + (128360 128366) + (128348 128359) + (128336 128347) + (128335 128335) + (128331 128334) + (128329 128330) + (128326 128328) + (128302 128317) + (128300 128301) + (128278 128299) + (128277 128277) + (128266 128276) + (128265 128265) + (128264 128264) + (128260 128263) + (128259 128259) + (128255 128258) + (128254 128254) + (128253 128253) + (128249 128252) + (128248 128248) + (128246 128247) + (128245 128245) + (128240 128244) + (128239 128239) + (128238 128238) + (128236 128237) + (128184 128235) + (128182 128183) + (128174 128181) + (128173 128173) + (128110 128172) + (128108 128109) + (128102 128107) + (128101 128101) + (128066 128100) + (128065 128065) + (128064 128064) + (128063 128063) + (128043 128062) + (128042 128042) + (128023 128041) + (128022 128022) + (128021 128021) + (128020 128020) + (128019 128019) + (128017 128018) + (128015 128016) + (128012 128014) + (128009 128011) + (128008 128008) + (128000 128007) + (127992 127994) + (127991 127991) + (127990 127990) + (127989 127989) + (127988 127988) + (127987 127987) + (127985 127986) + (127973 127984) + (127972 127972) + (127968 127971) + (127956 127967) + (127951 127955) + (127947 127950) + (127946 127946) + (127945 127945) + (127944 127944) + (127943 127943) + (127942 127942) + (127941 127941) + (127904 127940) + (127902 127903) + (127900 127901) + (127897 127899) + (127896 127896) + (127894 127895) + (127892 127893) + (127872 127891) + (127870 127871) + (127869 127869) + (127868 127868) + (127825 127867) + (127824 127824) + (127820 127823) + (127819 127819) + (127799 127818) + (127798 127798) + (127796 127797) + (127794 127795) + (127792 127793) + (127789 127791) + (127780 127788) + (127778 127779) + (127777 127777) + (127775 127776) + (127773 127774) + (127772 127772) + (127771 127771) + (127770 127770) + (127769 127769) + (127766 127768) + (127763 127765) + (127762 127762) + (127761 127761) + (127760 127760) + (127759 127759) + (127757 127758) + (127744 127756) + (127570 127743) + (127568 127569) + (127561 127567) + (127548 127551) + (127538 127546) + (127535 127535) + (127514 127514) + (127491 127503) + (127489 127490) + (127405 127461) + (127377 127386) + (127374 127374) + (127358 127359) + (127344 127345) + (127340 127343) + (127279 127279) + (127245 127247) + (127184 127231) + (127183 127183) + (126981 127182) + (126980 126980) + (126976 126979) + (12953 12953) + (12951 12951) + (12349 12349) + (12336 12336) + (11093 11093) + (11088 11088) + (11035 11036) + (11013 11015) + (10548 10549) + (10175 10175) + (10160 10160) + (10145 10145) + (10133 10135) + (10085 10087) + (10084 10084) + (10083 10083) + (10071 10071) + (10067 10069) + (10062 10062) + (10060 10060) + (10055 10055) + (10052 10052) + (10035 10036) + (10024 10024) + (10017 10017) + (10013 10013) + (10006 10006) + (10004 10004) + (10002 10002) + (10000 10001) + (9999 9999) + (9998 9998) + (9997 9997) + (9992 9996) + (9989 9989) + (9987 9988) + (9986 9986) + (9982 9985) + (9981 9981) + (9979 9980) + (9978 9978) + (9975 9977) + (9974 9974) + (9973 9973) + (9972 9972) + (9970 9971) + (9968 9969) + (9963 9967) + (9962 9962) + (9961 9961) + (9941 9960) + (9940 9940) + (9939 9939) + (9938 9938) + (9937 9937) + (9936 9936) + (9935 9935) + (9934 9934) + (9929 9933) + (9928 9928) + (9926 9927) + (9924 9925) + (9919 9923) + (9917 9918) + (9906 9916) + (9904 9905) + (9900 9903) + (9898 9899) + (9896 9897) + (9895 9895) + (9890 9894) + (9888 9889) + (9885 9887) + (9883 9884) + (9882 9882) + (9881 9881) + (9880 9880) + (9878 9879) + (9877 9877) + (9876 9876) + (9875 9875) + (9874 9874) + (9872 9873) + (9856 9861) + (9855 9855) + (9854 9854) + (9852 9853) + (9851 9851) + (9833 9850) + (9832 9832) + (9831 9831) + (9829 9830) + (9828 9828) + (9827 9827) + (9825 9826) + (9824 9824) + (9823 9823) + (9812 9822) + (9800 9811) + (9795 9799) + (9794 9794) + (9793 9793) + (9792 9792) + (9787 9791) + (9786 9786) + (9784 9785) + (9776 9783) + (9775 9775) + (9774 9774) + (9771 9773) + (9770 9770) + (9767 9769) + (9766 9766) + (9764 9765) + (9762 9763) + (9761 9761) + (9760 9760) + (9758 9759) + (9757 9757) + (9753 9756) + (9752 9752) + (9750 9751) + (9748 9749) + (9746 9746) + (9745 9745) + (9743 9744) + (9742 9742) + (9735 9741) + (9733 9733) + (9732 9732) + (9730 9731) + (9728 9729) + (9723 9726) + (9664 9664) + (9654 9654) + (9642 9643) + (9410 9410) + (9208 9210) + (9203 9203) + (9201 9202) + (9200 9200) + (9199 9199) + (9197 9198) + (9193 9196) + (9167 9167) + (9096 9096) + (9000 9000) + (8986 8987) + (8617 8618) + (8596 8601) + (8505 8505) + (8482 8482) + (8265 8265) + (8252 8252) + (174 174) + (169 169))))) -(define char-set:extended-pictographic +(define char-set:emoji (char-set)) +(define char-set:emoji-presentation (char-set)) +(define char-set:emoji-modifier (char-set)) +(define char-set:emoji-modifier-base (char-set)) +(define char-set:emoji-component (char-set)) +(define char-set:emoji-extended-pictographic (char-set)) -(for-each - (λ (pair) - (ucs-range->char-set! - (first pair) - (+ 1 (second pair)) - #t - char-set:extended-pictographic)) - emoji-list) +(define emoji-charsets + (list (list 'emoji char-set:emoji) + (list 'emoji-presentation + char-set:emoji-presentation) + (list 'emoji-modifier char-set:emoji-modifier) + (list 'emoji-modifier-base + char-set:emoji-modifier-base) + (list 'emoji-component char-set:emoji-component) + (list 'emoji-extended-pictographic + char-set:emoji-extended-pictographic))) + +(ranges->charset! emoji-ht 'emoji char-set:emoji) +(ranges->charset! + emoji-ht + 'emoji-presentation + char-set:emoji-presentation) +(ranges->charset! + emoji-ht + 'emoji-modifier + char-set:emoji-modifier) +(ranges->charset! + emoji-ht + 'emoji-modifier-base + char-set:emoji-modifier-base) +(ranges->charset! + emoji-ht + 'emoji-component + char-set:emoji-component) +(ranges->charset! + emoji-ht + 'emoji-extended-pictographic + char-set:emoji-extended-pictographic) diff --git a/uniseg/graphemes.scm b/uniseg/graphemes.scm index 9af0d48..b3594a7 100644 --- a/uniseg/graphemes.scm +++ b/uniseg/graphemes.scm @@ -20,28 +20,176 @@ char-set:grapheme-extend char-set:grapheme-regional-indicator char-set:grapheme-spacing-mark - char-set:grapheme-zerowidth-joiner + char-set:grapheme-zero-width-joiner grapheme-charsets)) (define grapheme-ht (alist->hashq-table - '((prepend - (73474 73474) - (73030 73030) - (72324 72329) - (72250 72250) - (72001 72001) - (71999 71999) - (70082 70083) - (69837 69837) - (69821 69821) - (3406 3406) - (2274 2274) - (2192 2193) - (1807 1807) - (1757 1757) - (1536 1541)) - (regional-indicator (127462 127487)) - (carriage-return (13 13)) + '((spacing-mark + (119149 119149) + (119142 119142) + (94192 94193) + (94033 94087) + (73537 73537) + (73534 73535) + (73524 73525) + (73475 73475) + (73461 73462) + (73110 73110) + (73107 73108) + (73098 73102) + (72884 72884) + (72881 72881) + (72873 72873) + (72766 72766) + (72751 72751) + (72343 72343) + (72279 72280) + (72249 72249) + (72164 72164) + (72156 72159) + (72145 72147) + (72002 72002) + (72000 72000) + (71997 71997) + (71991 71992) + (71985 71989) + (71736 71736) + (71724 71726) + (71462 71462) + (71350 71350) + (71342 71343) + (71340 71340) + (71230 71230) + (71227 71228) + (71216 71218) + (71102 71102) + (71096 71099) + (71088 71089) + (70849 70849) + (70846 70846) + (70843 70844) + (70841 70841) + (70833 70834) + (70725 70725) + (70720 70721) + (70709 70711) + (70498 70499) + (70475 70477) + (70471 70472) + (70465 70468) + (70463 70463) + (70402 70403) + (70368 70370) + (70197 70197) + (70194 70195) + (70188 70190) + (70094 70094) + (70079 70080) + (70067 70069) + (70018 70018) + (69957 69958) + (69932 69932) + (69815 69816) + (69808 69810) + (69762 69762) + (69634 69634) + (69632 69632) + (44012 44012) + (44009 44010) + (44006 44007) + (44003 44004) + (43765 43765) + (43758 43759) + (43755 43755) + (43597 43597) + (43571 43572) + (43567 43568) + (43454 43456) + (43450 43451) + (43444 43445) + (43395 43395) + (43346 43347) + (43188 43203) + (43136 43137) + (43047 43047) + (43043 43044) + (7415 7415) + (7393 7393) + (7220 7221) + (7204 7211) + (7154 7155) + (7150 7150) + (7146 7148) + (7143 7143) + (7082 7082) + (7078 7079) + (7073 7073) + (7042 7042) + (6979 6980) + (6973 6977) + (6971 6971) + (6916 6916) + (6765 6770) + (6743 6743) + (6741 6741) + (6681 6682) + (6451 6456) + (6448 6449) + (6441 6443) + (6435 6438) + (6087 6088) + (6078 6085) + (6070 6070) + (5940 5940) + (5909 5909) + (4228 4228) + (4182 4183) + (4155 4156) + (4145 4145) + (3967 3967) + (3902 3903) + (3763 3763) + (3635 3635) + (3570 3571) + (3544 3550) + (3536 3537) + (3458 3459) + (3402 3404) + (3398 3400) + (3391 3392) + (3330 3331) + (3315 3315) + (3274 3275) + (3271 3272) + (3267 3268) + (3264 3265) + (3262 3262) + (3202 3203) + (3137 3140) + (3073 3075) + (3018 3020) + (3014 3016) + (3009 3010) + (3007 3007) + (2891 2892) + (2887 2888) + (2880 2880) + (2818 2819) + (2763 2764) + (2761 2761) + (2750 2752) + (2691 2691) + (2622 2624) + (2563 2563) + (2507 2508) + (2503 2504) + (2495 2496) + (2434 2435) + (2382 2383) + (2377 2380) + (2366 2368) + (2363 2363) + (2307 2307)) (extend (917760 917999) (917536 917631) @@ -420,34 +568,6 @@ (1160 1161) (1155 1159) (768 879)) - (hangul-syllable-t (55243 55291) (4520 4607)) - (control - (918000 921599) - (917632 917759) - (917506 917535) - (917505 917505) - (917504 917504) - (119155 119162) - (113824 113827) - (78896 78911) - (65529 65531) - (65520 65528) - (65279 65279) - (8294 8303) - (8293 8293) - (8288 8292) - (8234 8238) - (8233 8233) - (8232 8232) - (8206 8207) - (8203 8203) - (6158 6158) - (1564 1564) - (173 173) - (127 159) - (14 31) - (11 12) - (0 9)) (hangul-syllable-lv (55176 55176) (55148 55148) @@ -848,174 +968,6 @@ (44088 44088) (44060 44060) (44032 44032)) - (line-feed (10 10)) - (hangul-syllable-v (55216 55238) (4448 4519)) - (spacing-mark - (119149 119149) - (119142 119142) - (94192 94193) - (94033 94087) - (73537 73537) - (73534 73535) - (73524 73525) - (73475 73475) - (73461 73462) - (73110 73110) - (73107 73108) - (73098 73102) - (72884 72884) - (72881 72881) - (72873 72873) - (72766 72766) - (72751 72751) - (72343 72343) - (72279 72280) - (72249 72249) - (72164 72164) - (72156 72159) - (72145 72147) - (72002 72002) - (72000 72000) - (71997 71997) - (71991 71992) - (71985 71989) - (71736 71736) - (71724 71726) - (71462 71462) - (71350 71350) - (71342 71343) - (71340 71340) - (71230 71230) - (71227 71228) - (71216 71218) - (71102 71102) - (71096 71099) - (71088 71089) - (70849 70849) - (70846 70846) - (70843 70844) - (70841 70841) - (70833 70834) - (70725 70725) - (70720 70721) - (70709 70711) - (70498 70499) - (70475 70477) - (70471 70472) - (70465 70468) - (70463 70463) - (70402 70403) - (70368 70370) - (70197 70197) - (70194 70195) - (70188 70190) - (70094 70094) - (70079 70080) - (70067 70069) - (70018 70018) - (69957 69958) - (69932 69932) - (69815 69816) - (69808 69810) - (69762 69762) - (69634 69634) - (69632 69632) - (44012 44012) - (44009 44010) - (44006 44007) - (44003 44004) - (43765 43765) - (43758 43759) - (43755 43755) - (43597 43597) - (43571 43572) - (43567 43568) - (43454 43456) - (43450 43451) - (43444 43445) - (43395 43395) - (43346 43347) - (43188 43203) - (43136 43137) - (43047 43047) - (43043 43044) - (7415 7415) - (7393 7393) - (7220 7221) - (7204 7211) - (7154 7155) - (7150 7150) - (7146 7148) - (7143 7143) - (7082 7082) - (7078 7079) - (7073 7073) - (7042 7042) - (6979 6980) - (6973 6977) - (6971 6971) - (6916 6916) - (6765 6770) - (6743 6743) - (6741 6741) - (6681 6682) - (6451 6456) - (6448 6449) - (6441 6443) - (6435 6438) - (6087 6088) - (6078 6085) - (6070 6070) - (5940 5940) - (5909 5909) - (4228 4228) - (4182 4183) - (4155 4156) - (4145 4145) - (3967 3967) - (3902 3903) - (3763 3763) - (3635 3635) - (3570 3571) - (3544 3550) - (3536 3537) - (3458 3459) - (3402 3404) - (3398 3400) - (3391 3392) - (3330 3331) - (3315 3315) - (3274 3275) - (3271 3272) - (3267 3268) - (3264 3265) - (3262 3262) - (3202 3203) - (3137 3140) - (3073 3075) - (3018 3020) - (3014 3016) - (3009 3010) - (3007 3007) - (2891 2892) - (2887 2888) - (2880 2880) - (2818 2819) - (2763 2764) - (2761 2761) - (2750 2752) - (2691 2691) - (2622 2624) - (2563 2563) - (2507 2508) - (2503 2504) - (2495 2496) - (2434 2435) - (2382 2383) - (2377 2380) - (2366 2368) - (2363 2363) - (2307 2307)) (hangul-syllable-lvt (55177 55203) (55149 55175) @@ -1417,7 +1369,55 @@ (44061 44087) (44033 44059)) (hangul-syllable-l (43360 43388) (4352 4447)) - (zerowidth-joiner (8205 8205))))) + (hangul-syllable-v (55216 55238) (4448 4519)) + (prepend + (73474 73474) + (73030 73030) + (72324 72329) + (72250 72250) + (72001 72001) + (71999 71999) + (70082 70083) + (69837 69837) + (69821 69821) + (3406 3406) + (2274 2274) + (2192 2193) + (1807 1807) + (1757 1757) + (1536 1541)) + (regional-indicator (127462 127487)) + (line-feed (10 10)) + (control + (918000 921599) + (917632 917759) + (917506 917535) + (917505 917505) + (917504 917504) + (119155 119162) + (113824 113827) + (78896 78911) + (65529 65531) + (65520 65528) + (65279 65279) + (8294 8303) + (8293 8293) + (8288 8292) + (8234 8238) + (8233 8233) + (8232 8232) + (8206 8207) + (8203 8203) + (6158 6158) + (1564 1564) + (173 173) + (127 159) + (14 31) + (11 12) + (0 9)) + (zero-width-joiner (8205 8205)) + (carriage-return (13 13)) + (hangul-syllable-t (55243 55291) (4520 4607))))) (define char-set:grapheme-hangul-syllable-l (char-set)) @@ -1437,7 +1437,7 @@ (char-set)) (define char-set:grapheme-spacing-mark (char-set)) -(define char-set:grapheme-zerowidth-joiner +(define char-set:grapheme-zero-width-joiner (char-set)) (define grapheme-charsets @@ -1459,8 +1459,8 @@ char-set:grapheme-regional-indicator) (list 'spacing-mark char-set:grapheme-spacing-mark) - (list 'zerowidth-joiner - char-set:grapheme-zerowidth-joiner))) + (list 'zero-width-joiner + char-set:grapheme-zero-width-joiner))) (ranges->charset! grapheme-ht @@ -1508,5 +1508,5 @@ char-set:grapheme-spacing-mark) (ranges->charset! grapheme-ht - 'zerowidth-joiner - char-set:grapheme-zerowidth-joiner) + 'zero-width-joiner + char-set:grapheme-zero-width-joiner) diff --git a/uniseg/graphemes/stream.scm b/uniseg/graphemes/stream.scm index b17956c..31ea2ed 100644 --- a/uniseg/graphemes/stream.scm +++ b/uniseg/graphemes/stream.scm @@ -1,5 +1,8 @@ -(define-module (runewidth graphemes stream) +(define-module (uniseg graphemes stream) + #:use-module (uniseg) + #:use-module (ice-9 textual-ports) #:use-module (ice-9 match) + #:use-module (srfi srfi-71) #:use-module (srfi srfi-41) #:use-module (srfi srfi-9 gnu) #:export (make-grapheme @@ -8,20 +11,23 @@ grapheme-width grapheme-sentence-end? grapheme-word-end? + string->grapheme-stream input->grapheme-stream)) +;; TODO: uniseg also does word and sentence boundaries. These state machines could be implemented if we wanted to. (define-immutable-record-type - (make-grapheme glyphs width sentence-end? word-end?) + (make-grapheme glyphs width) grapheme? (glyphs grapheme-glyphs) - (width grapheme-width) - (sentence-end? grapheme-sentence-end?) - (word-end? grapheme-word-end?)) + (width grapheme-width)) + +(define (string->grapheme-stream str) + "Given a string, create a (lazy) stream of graphemes." + (with-input-from-string str + (λ () (input->grapheme-stream (current-input-port))))) (define-stream (input->grapheme-stream port) - (unless port - (set! port (current-input-port))) - + "Given an input port, create a (lazy) stream of graphemes." ;; The first is what state we are in, and the next is the grapheme ;; property of the current character. ;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go @@ -29,88 +35,115 @@ (match (list cur-state cur-prop) ;; Grapheme boundary #3 (('carriage-return 'line-feed) - (values 'control+line-feed 'no-boundary)) + (values 'control+line-feed #f)) ;; Grapheme boundary #4 ((or ('carriage-return _) ('control+line-feed _)) - (values 'any 'boundary)) + (values 'any #t)) ;; Grapheme boundary #5 ((_ 'carriage-return) - (values 'carriage-return 'boundary)) + (values 'carriage-return #t)) ((or (_ 'line-feed) (_ 'control)) - (values 'control+line-feed 'boundary)) + (values 'control+line-feed #t)) ;; Grapheme boundary #6 (('hangul-syllable-l 'hangul-syllable-l) - (values 'hangul-syllable-l 'no-boundary)) + (values 'hangul-syllable-l #f)) ((or ('hangul-syllable-l 'hangul-syllable-v) ('hangul-syllable-l 'hangul-syllable-lv)) - (values 'hangul-syllable-lv 'no-boundary)) + (values 'hangul-syllable-lv #f)) (('hangul-syllable-l 'hangul-syllable-lvt) - (values 'hangul-syllable-lvt 'no-boundary)) + (values 'hangul-syllable-lvt #f)) ((_ 'hangul-syllable-l) - (values 'hangul-syllable-l 'boundary)) + (values 'hangul-syllable-l #t)) ;; Grapheme boundary #7 (('hangul-syllable-lv 'hangul-syllable-v) - (values 'hangul-syllable-lv 'no-boundary)) + (values 'hangul-syllable-lv #f)) (('hangul-syllable-lv 'hangul-syllable-t) - (values 'hangul-syllable-lvt 'no-boundary)) + (values 'hangul-syllable-lvt #f)) ((or (_ 'hangul-syllable-lv) (_ 'hangul-syllable-v)) - (values 'hangul-syllable-lv 'boundary)) + (values 'hangul-syllable-lv #t)) ;; Grapheme boundary #8 ((or (_ 'hangul-syllable-lvt) (_ 'hangul-syllable-t)) - (values 'hangul-syllable-lvt 'boundary)) + (values 'hangul-syllable-lvt #t)) (('hangul-syllable-lvt 'hangul-syllable-t) - (values 'hangul-syllable-lvt 'no-boundary)) + (values 'hangul-syllable-lvt #f)) ;; Grapheme boundary #9 ((or (_ 'extend) (_ 'zero-width-joiner)) - (values 'any 'no-boundary)) + (values 'any #f)) ;; Grapheme boundary #9a ((_ 'spacing-mark) - (values 'any 'no-boundary)) + (values 'any #f)) ;; Grapheme boundary #9b (('prepend _) - (values 'any 'no-boundary)) + (values 'any #f)) ((_ 'prepend) - (values 'prepend 'boundary)) + (values 'prepend #t)) ;; Grapheme boundary #11 (emoji!) (('extended-pictographic 'extend) - (values 'extended-pictographic 'no-boundary)) + (values 'extended-pictographic #f)) (('extended-pictographic 'zero-width-joiner) - (values 'extended-pictographic+zero-width-joiner 'no-boundary)) + (values 'extended-pictographic+zero-width-joiner #f)) (('extended-pictographic+zero-width-joiner 'extended-pictographic) - (values 'extended-pictographic 'no-boundary)) + (values 'extended-pictographic #f)) ((_ 'extended-pictographic) - (values 'extended-pictographic 'boundary)) + (values 'extended-pictographic #t)) ;; Grapheme boundaries #12 and #13 (('regional-indicator-odd 'regional-indicator) - (values 'regioinal-indicator-even 'no-boundary)) + (values 'regioinal-indicator-even #f)) (('regional-indicator-even 'regional-indicator) - (values 'regional-indicator-odd 'boundary)) + (values 'regional-indicator-odd #t)) ((_ 'regional-indicator) - (values 'regional-indicator-odd 'boundary)) + (values 'regional-indicator-odd #t)) - (else (values 'other 'other)))) - - (define (transition-state state char) - "Given the current state and the next char, run a state transition" + ;; Everything else considered a boundeary + (else (values 'any #t)))) - ) + (define grapheme-width 0) + (define glyphs-reverse '()) - (define grapheme (make-grapheme glyphs width sentence-end? word-end?)) - (stream-cons grapheme (input->grapheme-stream port))) + (define hit-eof #f) + + (define (iterate-through-grapheme state) + (define glyph (get-char port)) + (if + (eof-object? glyph) + (begin + (set! hit-eof #t) + state) + (begin + (set! glyphs-reverse (cons glyph glyphs-reverse)) + (let* ((width property (char-width glyph)) + (next-state boundary? (state-machine state property))) + + (set! grapheme-width (+ grapheme-width width)) + + (if boundary? + state + (iterate-through-grapheme state)))))) + + (define final-state (iterate-through-grapheme 'any)) + + (if + hit-eof + stream-null + (stream-cons + (make-grapheme + (reverse glyphs-reverse) + grapheme-width) + (input->grapheme-stream port)))) diff --git a/uniseg/internal.scm b/uniseg/internal.scm index 7da6e1d..cc55e40 100644 --- a/uniseg/internal.scm +++ b/uniseg/internal.scm @@ -21,6 +21,9 @@ file-to-lines ranges->charset!)) +;; +;; Common PEG patterns +;; (define-peg-pattern @hex body (peg "[a-fA-F0-9]")) (define-peg-pattern @codepoint all @@ -31,13 +34,14 @@ (and @codepoint (ignore "..") @codepoint) @codepoint)) - (define-peg-pattern @comment all (and (ignore "#") (* peg-any))) (define-peg-pattern @ws none (or " " "\t")) +;; Helper macro to add a list of character ranges +;; to a hash-set. (define-syntax-rule (cons-hash-list! ht key low high) (let* ((old (hashq-ref ht key)) (value (list low high)) @@ -47,6 +51,8 @@ (list value)))) (hashq-set! ht key new-lst))) +;; Helper macro to take above character range hash table +;; and insert it into Guile's chasracter set system. (define-syntax-rule (ranges->charset! ht name symbol) (let* ((pairs (hashq-ref ht name))) (for-each @@ -64,15 +70,17 @@ ;; So they can't be written out as such. (locale-string->integer str 16)) +;; This should be in the standard library... (define (format-exception-msg port err) (apply format port (exception-message err) (exception-irritants err)) (display "\n" port)) +;; Little checker to see if we are in the danger zone +;; (surrogate range is invalid unicode and guile errors) (define (in-surrogate-range num) (and (>= num #xd800) (<= num #xdfff))) - (define* (wget-to-lines url #:optional (port #f)) (format port "Downloading from ~a..." url)