Holy shit it works

This commit is contained in:
Vivianne 2024-03-03 20:25:47 -05:00
parent 1692a9fc05
commit e53876be52
9 changed files with 2261 additions and 1383 deletions

View File

@ -112,8 +112,7 @@
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
#:use-module (uniseg internal)
#:export
(,@ea-symbol-names
#:export (,@ea-symbol-names
eastasian-charsets)))
(pretty-print

View File

@ -27,9 +27,38 @@
(define-peg-pattern @emoji-line body
(and @emoji-datum (* @ws) @comment))
(define emoji-list '())
(define emoji-ht (make-hash-table 5))
(define emoji-sets
'(emoji
emoji-presentation
emoji-modifier
emoji-modifier-base
emoji-component
emoji-extended-pictographic))
(define emoji-symbol-names
(map
(λ (set)
(string->symbol
(string-concatenate
(list "char-set:"
(symbol->string set)))))
emoji-sets))
(define emoji-sets-and-symbols
(zip emoji-sets emoji-symbol-names))
(define (process-emoji-line line)
(define (string->category str)
(match str
("Emoji" 'emoji)
("Emoji_Presentation" 'emoji-presentation)
("Emoji_Modifier" 'emoji-modifier)
("Emoji_Modifier_Base" 'emoji-modifier-base)
("Emoji_Component" 'emoji-component)
("Extended_Pictographic" 'emoji-extended-pictographic)))
(define tree (peg:tree (match-pattern @emoji-line line)))
(unless (or (not tree)
@ -39,7 +68,7 @@
(match tree
(((('@codepoint-range
('@codepoint codepoints) ...)
('@emoji-category category))
('@emoji-category cat-str))
('@comment comment))
(with-exception-handler
@ -48,15 +77,14 @@
(format-exception-msg stdout err))
(λ ()
(let ((f (hex-string->integer (first codepoints)))
(l (hex-string->integer (last codepoints))))
(l (hex-string->integer (last codepoints)))
(category (string->category cat-str)))
(when (or (in-surrogate-range f)
(in-surrogate-range l))
(error (format #f "chars in surrogate range ~x -> ~x" f l)))
(when (and (equal? "Extended_Pictographic" category)
(> l #xFF))
(set! emoji-list (cons (list f l) emoji-list)))))
(cons-hash-list! emoji-ht category f l)))
#:unwind? #t)))))
(define line-func
@ -78,27 +106,45 @@
(pretty-print
`(define-module (uniseg emoji)
#:use-module (uniseg internal)
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
#:export (char-set:extended-pictographic)))
#:export (,@emoji-symbol-names
emoji-charsets)))
(pretty-print
`(define emoji-list ',emoji-list))
`(define emoji-ht
(alist->hashq-table ',(hash-map->list cons emoji-ht))))
(display "\n")
(for-each
(λ (sym)
(pretty-print
`(define ,sym (char-set))))
emoji-symbol-names)
(display "\n")
(pretty-print
`(define char-set:extended-pictographic (char-set)))
(display "\n")
(pretty-print
`(for-each
`(define emoji-charsets
(list
,@(map
(λ (pair)
(ucs-range->char-set!
(first pair)
(+ 1 (second pair))
#t char-set:extended-pictographic))
emoji-list))
(let ((f (first pair))
(s (second pair)))
`(list ',f ,s)))
emoji-sets-and-symbols))))
(display "\n")
(for-each
(λ (set-pair)
(let ((name (first set-pair))
(symbol (second set-pair)))
(pretty-print
`(ranges->charset! emoji-ht ',name ,symbol))))
emoji-sets-and-symbols)
(display "Code generation complete.\n" stdout)))

View File

@ -42,7 +42,7 @@
extend
regional-indicator
spacing-mark
zerowidth-joiner))
zero-width-joiner))
(define grapheme-symbol-names
(map
@ -71,7 +71,7 @@
("Extend" 'extend)
("Regional_Indicator" 'regional-indicator)
("SpacingMark" 'spacing-mark)
("ZWJ" 'zerowidth-joiner)))
("ZWJ" 'zero-width-joiner)))
(define tree (peg:tree (match-pattern @grapheme-line line)))
@ -123,7 +123,7 @@
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
#:use-module (uniseg internal)
#:export ( ,@grapheme-symbol-names
#:export (,@grapheme-symbol-names
grapheme-charsets)))

View File

@ -1,17 +1,21 @@
(define-module (uniseg)
#:use-module (srfi srfi-1)
#:use-module (ice-9 match)
#:use-module (srfi srfi-41)
#:use-module (uniseg emoji)
#:use-module (uniseg graphemes)
#:use-module (uniseg graphemes stream)
#:use-module (uniseg eastasian)
#:export (emoji?
grapheme-property
eastasian-property))
char->grapheme-property
char->eastasian-property
char-width
string-width))
(define (emoji? char)
(char-set-contains? char-set:extended-pictographic char))
(char-set-contains? char-set:emoji-extended-pictographic char))
(define (get-prop sets char)
(define (char->prop sets char)
(let ((pair
(find
(λ (p) (char-set-contains? (second p) char))
@ -19,12 +23,12 @@
(and pair
(first pair))))
(define (grapheme-property char)
(define (char->grapheme-property char)
"Find the Unicode grapheme cluster property, as defined by https://www.unicode.org/reports/tr44/"
(or (get-prop grapheme-charsets char)
(or (char->prop grapheme-charsets char)
'other))
(define (eastasian-property char)
(define (char->eastasian-property char)
"Find the Unicode East Asian width property, as defined by https://www.unicode.org/reports/tr11/"
(define (<==> lo hi)
"Create a range predicate, inclusive between two numbers"
@ -47,7 +51,7 @@
(define range-plane-3?
(<==> #x30000 #x3FFFD))
(or (get-prop eastasian-charsets char)
(or (char->prop eastasian-charsets char)
(let ((int (char->integer char)))
(match int
((or (? range-unified-ideographs-ext-a?)
@ -58,3 +62,43 @@
'doublewidth)
(else
'neutral)))))
(define (char-width rune)
"Convert from a character to its visible width. Returns the width as well as the grapheme property to reduce calculations"
(define grapheme-prop (char->grapheme-property rune))
(values
(or
(match grapheme-prop
((or 'control
'carriage-return
'line-feed
'extend
'zero-width-joiner)
0)
('regional-indicator
2)
('extended-pictographic
(if
(char-set-contains? char-set:emoji-presentation rune)
2
1))
(else #f)
)
(match (char->integer rune)
(#x2E3A 3)
(#x2E3B 4)
(else #f))
(match (char->eastasian-property rune)
('doublewidth 2)
;; Default for monospace is to render ambiguous as 1
('ambiguous 1)
(else 1)))
grapheme-prop))
(define (string-width str)
"Get the width of a string by adding up the widths of each grapheme"
(stream-fold
(λ (val grapheme)
(+ val (grapheme-width grapheme)))
0
(string->grapheme-stream str)))

File diff suppressed because it is too large Load Diff

View File

@ -3,11 +3,728 @@
(define-module
(uniseg emoji)
#:use-module
(uniseg internal)
#:use-module
(ice-9 hash-table)
#:use-module
(srfi srfi-1)
#:export
(char-set:extended-pictographic))
(define emoji-list
'((130048 131069)
(char-set:emoji
char-set:emoji-presentation
char-set:emoji-modifier
char-set:emoji-modifier-base
char-set:emoji-component
char-set:emoji-extended-pictographic
emoji-charsets))
(define emoji-ht
(alist->hashq-table
'((emoji-modifier (127995 127999))
(emoji-component
(917536 917631)
(129456 129459)
(127995 127999)
(127462 127487)
(65039 65039)
(8419 8419)
(8205 8205)
(48 57)
(42 42)
(35 35))
(emoji-modifier-base
(129489 129501)
(129485 129487)
(129467 129467)
(129464 129465)
(129461 129462)
(129399 129399)
(129340 129342)
(129331 129337)
(129329 129330)
(129328 129328)
(129318 129318)
(129311 129311)
(129305 129310)
(129304 129304)
(129295 129295)
(129292 129292)
(128716 128716)
(128704 128704)
(128694 128694)
(128692 128693)
(128675 128675)
(128587 128591)
(128581 128583)
(128405 128406)
(128400 128400)
(128378 128378)
(128372 128373)
(128170 128170)
(128145 128145)
(128143 128143)
(128133 128135)
(128129 128131)
(128124 128124)
(128110 128120)
(128108 128109)
(128102 128107)
(128070 128080)
(128066 128067)
(127947 127948)
(127946 127946)
(127943 127943)
(127938 127940)
(127877 127877)
(9997 9997)
(9994 9996)
(9977 9977)
(9757 9757))
(emoji (129744 129750)
(129728 129730)
(129712 129718)
(129686 129704)
(129680 129685)
(129667 129670)
(129664 129666)
(129656 129658)
(129652 129652)
(129648 129651)
(129511 129535)
(129488 129510)
(129485 129487)
(129483 129483)
(129475 129482)
(129473 129474)
(129472 129472)
(129466 129471)
(129456 129465)
(129454 129455)
(129451 129453)
(129445 129450)
(129443 129444)
(129432 129442)
(129426 129431)
(129413 129425)
(129408 129412)
(129404 129407)
(129403 129403)
(129402 129402)
(129399 129400)
(129395 129398)
(129394 129394)
(129393 129393)
(129388 129392)
(129375 129387)
(129360 129374)
(129357 129359)
(129356 129356)
(129351 129355)
(129344 129349)
(129343 129343)
(129340 129342)
(129331 129338)
(129329 129330)
(129328 129328)
(129320 129327)
(129312 129319)
(129311 129311)
(129305 129310)
(129296 129304)
(129293 129295)
(129292 129292)
(128992 129003)
(128763 128764)
(128762 128762)
(128761 128761)
(128759 128760)
(128756 128758)
(128755 128755)
(128752 128752)
(128747 128748)
(128745 128745)
(128736 128741)
(128726 128727)
(128725 128725)
(128721 128722)
(128720 128720)
(128717 128719)
(128716 128716)
(128715 128715)
(128705 128709)
(128704 128704)
(128703 128703)
(128697 128702)
(128695 128696)
(128694 128694)
(128691 128693)
(128690 128690)
(128686 128689)
(128679 128685)
(128678 128678)
(128676 128677)
(128675 128675)
(128674 128674)
(128667 128673)
(128665 128666)
(128664 128664)
(128663 128663)
(128662 128662)
(128661 128661)
(128660 128660)
(128657 128659)
(128656 128656)
(128655 128655)
(128654 128654)
(128653 128653)
(128652 128652)
(128650 128651)
(128649 128649)
(128648 128648)
(128647 128647)
(128646 128646)
(128643 128645)
(128641 128642)
(128640 128640)
(128581 128591)
(128577 128580)
(128567 128576)
(128566 128566)
(128565 128565)
(128564 128564)
(128560 128563)
(128558 128559)
(128557 128557)
(128556 128556)
(128552 128555)
(128550 128551)
(128544 128549)
(128543 128543)
(128540 128542)
(128539 128539)
(128538 128538)
(128537 128537)
(128536 128536)
(128535 128535)
(128534 128534)
(128533 128533)
(128530 128532)
(128529 128529)
(128528 128528)
(128527 128527)
(128526 128526)
(128521 128525)
(128519 128520)
(128513 128518)
(128512 128512)
(128507 128511)
(128506 128506)
(128499 128499)
(128495 128495)
(128488 128488)
(128483 128483)
(128481 128481)
(128476 128478)
(128465 128467)
(128450 128452)
(128444 128444)
(128433 128434)
(128424 128424)
(128421 128421)
(128420 128420)
(128405 128406)
(128400 128400)
(128394 128397)
(128391 128391)
(128378 128378)
(128371 128377)
(128367 128368)
(128348 128359)
(128336 128347)
(128331 128334)
(128329 128330)
(128302 128317)
(128300 128301)
(128278 128299)
(128277 128277)
(128266 128276)
(128265 128265)
(128264 128264)
(128260 128263)
(128259 128259)
(128255 128258)
(128253 128253)
(128249 128252)
(128248 128248)
(128246 128247)
(128245 128245)
(128240 128244)
(128239 128239)
(128238 128238)
(128236 128237)
(128184 128235)
(128182 128183)
(128174 128181)
(128173 128173)
(128110 128172)
(128108 128109)
(128102 128107)
(128101 128101)
(128066 128100)
(128065 128065)
(128064 128064)
(128063 128063)
(128043 128062)
(128042 128042)
(128023 128041)
(128022 128022)
(128021 128021)
(128020 128020)
(128019 128019)
(128017 128018)
(128015 128016)
(128012 128014)
(128009 128011)
(128008 128008)
(127992 128007)
(127991 127991)
(127989 127989)
(127988 127988)
(127987 127987)
(127973 127984)
(127972 127972)
(127968 127971)
(127956 127967)
(127951 127955)
(127947 127950)
(127946 127946)
(127945 127945)
(127944 127944)
(127943 127943)
(127942 127942)
(127941 127941)
(127904 127940)
(127902 127903)
(127897 127899)
(127894 127895)
(127872 127891)
(127870 127871)
(127869 127869)
(127868 127868)
(127825 127867)
(127824 127824)
(127820 127823)
(127819 127819)
(127799 127818)
(127798 127798)
(127796 127797)
(127794 127795)
(127792 127793)
(127789 127791)
(127780 127788)
(127777 127777)
(127775 127776)
(127773 127774)
(127772 127772)
(127771 127771)
(127770 127770)
(127769 127769)
(127766 127768)
(127763 127765)
(127762 127762)
(127761 127761)
(127760 127760)
(127759 127759)
(127757 127758)
(127744 127756)
(127568 127569)
(127538 127546)
(127535 127535)
(127514 127514)
(127489 127490)
(127462 127487)
(127377 127386)
(127374 127374)
(127358 127359)
(127344 127345)
(127183 127183)
(126980 126980)
(12953 12953)
(12951 12951)
(12349 12349)
(12336 12336)
(11093 11093)
(11088 11088)
(11035 11036)
(11013 11015)
(10548 10549)
(10175 10175)
(10160 10160)
(10145 10145)
(10133 10135)
(10084 10084)
(10083 10083)
(10071 10071)
(10067 10069)
(10062 10062)
(10060 10060)
(10055 10055)
(10052 10052)
(10035 10036)
(10024 10024)
(10017 10017)
(10013 10013)
(10006 10006)
(10004 10004)
(10002 10002)
(9999 9999)
(9997 9997)
(9992 9996)
(9989 9989)
(9986 9986)
(9981 9981)
(9978 9978)
(9975 9977)
(9973 9973)
(9972 9972)
(9970 9971)
(9968 9969)
(9962 9962)
(9961 9961)
(9940 9940)
(9939 9939)
(9937 9937)
(9935 9935)
(9934 9934)
(9928 9928)
(9924 9925)
(9917 9918)
(9904 9905)
(9898 9899)
(9895 9895)
(9888 9889)
(9883 9884)
(9881 9881)
(9878 9879)
(9877 9877)
(9876 9876)
(9875 9875)
(9874 9874)
(9855 9855)
(9854 9854)
(9851 9851)
(9832 9832)
(9829 9830)
(9827 9827)
(9824 9824)
(9823 9823)
(9800 9811)
(9794 9794)
(9792 9792)
(9786 9786)
(9784 9785)
(9775 9775)
(9774 9774)
(9770 9770)
(9766 9766)
(9762 9763)
(9760 9760)
(9757 9757)
(9752 9752)
(9748 9749)
(9745 9745)
(9742 9742)
(9732 9732)
(9730 9731)
(9728 9729)
(9723 9726)
(9664 9664)
(9654 9654)
(9642 9643)
(9410 9410)
(9208 9210)
(9203 9203)
(9201 9202)
(9200 9200)
(9199 9199)
(9197 9198)
(9193 9196)
(9167 9167)
(9000 9000)
(8986 8987)
(8617 8618)
(8596 8601)
(8505 8505)
(8482 8482)
(8265 8265)
(8252 8252)
(174 174)
(169 169)
(48 57)
(42 42)
(35 35))
(emoji-presentation
(129744 129750)
(129728 129730)
(129712 129718)
(129686 129704)
(129680 129685)
(129667 129670)
(129664 129666)
(129656 129658)
(129652 129652)
(129648 129651)
(129511 129535)
(129488 129510)
(129485 129487)
(129483 129483)
(129475 129482)
(129473 129474)
(129472 129472)
(129466 129471)
(129456 129465)
(129454 129455)
(129451 129453)
(129445 129450)
(129443 129444)
(129432 129442)
(129426 129431)
(129413 129425)
(129408 129412)
(129404 129407)
(129403 129403)
(129402 129402)
(129399 129400)
(129395 129398)
(129394 129394)
(129393 129393)
(129388 129392)
(129375 129387)
(129360 129374)
(129357 129359)
(129356 129356)
(129351 129355)
(129344 129349)
(129343 129343)
(129340 129342)
(129331 129338)
(129329 129330)
(129328 129328)
(129320 129327)
(129312 129319)
(129311 129311)
(129305 129310)
(129296 129304)
(129293 129295)
(129292 129292)
(128992 129003)
(128763 128764)
(128762 128762)
(128761 128761)
(128759 128760)
(128756 128758)
(128747 128748)
(128726 128727)
(128725 128725)
(128721 128722)
(128720 128720)
(128716 128716)
(128705 128709)
(128704 128704)
(128703 128703)
(128697 128702)
(128695 128696)
(128694 128694)
(128691 128693)
(128690 128690)
(128686 128689)
(128679 128685)
(128678 128678)
(128676 128677)
(128675 128675)
(128674 128674)
(128667 128673)
(128665 128666)
(128664 128664)
(128663 128663)
(128662 128662)
(128661 128661)
(128660 128660)
(128657 128659)
(128656 128656)
(128655 128655)
(128654 128654)
(128653 128653)
(128652 128652)
(128650 128651)
(128649 128649)
(128648 128648)
(128647 128647)
(128646 128646)
(128643 128645)
(128641 128642)
(128640 128640)
(128581 128591)
(128577 128580)
(128567 128576)
(128566 128566)
(128565 128565)
(128564 128564)
(128560 128563)
(128558 128559)
(128557 128557)
(128556 128556)
(128552 128555)
(128550 128551)
(128544 128549)
(128543 128543)
(128540 128542)
(128539 128539)
(128538 128538)
(128537 128537)
(128536 128536)
(128535 128535)
(128534 128534)
(128533 128533)
(128530 128532)
(128529 128529)
(128528 128528)
(128527 128527)
(128526 128526)
(128521 128525)
(128519 128520)
(128513 128518)
(128512 128512)
(128507 128511)
(128420 128420)
(128405 128406)
(128378 128378)
(128348 128359)
(128336 128347)
(128331 128334)
(128302 128317)
(128300 128301)
(128278 128299)
(128277 128277)
(128266 128276)
(128265 128265)
(128264 128264)
(128260 128263)
(128259 128259)
(128255 128258)
(128249 128252)
(128248 128248)
(128246 128247)
(128245 128245)
(128240 128244)
(128239 128239)
(128238 128238)
(128236 128237)
(128184 128235)
(128182 128183)
(128174 128181)
(128173 128173)
(128110 128172)
(128108 128109)
(128102 128107)
(128101 128101)
(128066 128100)
(128064 128064)
(128043 128062)
(128042 128042)
(128023 128041)
(128022 128022)
(128021 128021)
(128020 128020)
(128019 128019)
(128017 128018)
(128015 128016)
(128012 128014)
(128009 128011)
(128008 128008)
(127992 128007)
(127988 127988)
(127973 127984)
(127972 127972)
(127968 127971)
(127951 127955)
(127946 127946)
(127945 127945)
(127944 127944)
(127943 127943)
(127942 127942)
(127941 127941)
(127904 127940)
(127872 127891)
(127870 127871)
(127868 127868)
(127825 127867)
(127824 127824)
(127820 127823)
(127819 127819)
(127799 127818)
(127796 127797)
(127794 127795)
(127792 127793)
(127789 127791)
(127775 127776)
(127773 127774)
(127772 127772)
(127771 127771)
(127770 127770)
(127769 127769)
(127766 127768)
(127763 127765)
(127762 127762)
(127761 127761)
(127760 127760)
(127759 127759)
(127757 127758)
(127744 127756)
(127568 127569)
(127544 127546)
(127538 127542)
(127535 127535)
(127514 127514)
(127489 127489)
(127462 127487)
(127377 127386)
(127374 127374)
(127183 127183)
(126980 126980)
(11093 11093)
(11088 11088)
(11035 11036)
(10175 10175)
(10160 10160)
(10133 10135)
(10071 10071)
(10067 10069)
(10062 10062)
(10060 10060)
(10024 10024)
(9994 9995)
(9989 9989)
(9981 9981)
(9978 9978)
(9973 9973)
(9970 9971)
(9962 9962)
(9940 9940)
(9934 9934)
(9924 9925)
(9917 9918)
(9898 9899)
(9889 9889)
(9875 9875)
(9855 9855)
(9800 9811)
(9748 9749)
(9725 9726)
(9203 9203)
(9200 9200)
(9193 9196)
(8986 8987))
(emoji-extended-pictographic
(130048 131069)
(129751 129791)
(129744 129750)
(129731 129743)
@ -495,16 +1212,47 @@
(8505 8505)
(8482 8482)
(8265 8265)
(8252 8252)))
(8252 8252)
(174 174)
(169 169)))))
(define char-set:extended-pictographic
(define char-set:emoji (char-set))
(define char-set:emoji-presentation (char-set))
(define char-set:emoji-modifier (char-set))
(define char-set:emoji-modifier-base (char-set))
(define char-set:emoji-component (char-set))
(define char-set:emoji-extended-pictographic
(char-set))
(for-each
(λ (pair)
(ucs-range->char-set!
(first pair)
(+ 1 (second pair))
#t
char-set:extended-pictographic))
emoji-list)
(define emoji-charsets
(list (list 'emoji char-set:emoji)
(list 'emoji-presentation
char-set:emoji-presentation)
(list 'emoji-modifier char-set:emoji-modifier)
(list 'emoji-modifier-base
char-set:emoji-modifier-base)
(list 'emoji-component char-set:emoji-component)
(list 'emoji-extended-pictographic
char-set:emoji-extended-pictographic)))
(ranges->charset! emoji-ht 'emoji char-set:emoji)
(ranges->charset!
emoji-ht
'emoji-presentation
char-set:emoji-presentation)
(ranges->charset!
emoji-ht
'emoji-modifier
char-set:emoji-modifier)
(ranges->charset!
emoji-ht
'emoji-modifier-base
char-set:emoji-modifier-base)
(ranges->charset!
emoji-ht
'emoji-component
char-set:emoji-component)
(ranges->charset!
emoji-ht
'emoji-extended-pictographic
char-set:emoji-extended-pictographic)

View File

@ -20,28 +20,176 @@
char-set:grapheme-extend
char-set:grapheme-regional-indicator
char-set:grapheme-spacing-mark
char-set:grapheme-zerowidth-joiner
char-set:grapheme-zero-width-joiner
grapheme-charsets))
(define grapheme-ht
(alist->hashq-table
'((prepend
(73474 73474)
(73030 73030)
(72324 72329)
(72250 72250)
(72001 72001)
(71999 71999)
(70082 70083)
(69837 69837)
(69821 69821)
(3406 3406)
(2274 2274)
(2192 2193)
(1807 1807)
(1757 1757)
(1536 1541))
(regional-indicator (127462 127487))
(carriage-return (13 13))
'((spacing-mark
(119149 119149)
(119142 119142)
(94192 94193)
(94033 94087)
(73537 73537)
(73534 73535)
(73524 73525)
(73475 73475)
(73461 73462)
(73110 73110)
(73107 73108)
(73098 73102)
(72884 72884)
(72881 72881)
(72873 72873)
(72766 72766)
(72751 72751)
(72343 72343)
(72279 72280)
(72249 72249)
(72164 72164)
(72156 72159)
(72145 72147)
(72002 72002)
(72000 72000)
(71997 71997)
(71991 71992)
(71985 71989)
(71736 71736)
(71724 71726)
(71462 71462)
(71350 71350)
(71342 71343)
(71340 71340)
(71230 71230)
(71227 71228)
(71216 71218)
(71102 71102)
(71096 71099)
(71088 71089)
(70849 70849)
(70846 70846)
(70843 70844)
(70841 70841)
(70833 70834)
(70725 70725)
(70720 70721)
(70709 70711)
(70498 70499)
(70475 70477)
(70471 70472)
(70465 70468)
(70463 70463)
(70402 70403)
(70368 70370)
(70197 70197)
(70194 70195)
(70188 70190)
(70094 70094)
(70079 70080)
(70067 70069)
(70018 70018)
(69957 69958)
(69932 69932)
(69815 69816)
(69808 69810)
(69762 69762)
(69634 69634)
(69632 69632)
(44012 44012)
(44009 44010)
(44006 44007)
(44003 44004)
(43765 43765)
(43758 43759)
(43755 43755)
(43597 43597)
(43571 43572)
(43567 43568)
(43454 43456)
(43450 43451)
(43444 43445)
(43395 43395)
(43346 43347)
(43188 43203)
(43136 43137)
(43047 43047)
(43043 43044)
(7415 7415)
(7393 7393)
(7220 7221)
(7204 7211)
(7154 7155)
(7150 7150)
(7146 7148)
(7143 7143)
(7082 7082)
(7078 7079)
(7073 7073)
(7042 7042)
(6979 6980)
(6973 6977)
(6971 6971)
(6916 6916)
(6765 6770)
(6743 6743)
(6741 6741)
(6681 6682)
(6451 6456)
(6448 6449)
(6441 6443)
(6435 6438)
(6087 6088)
(6078 6085)
(6070 6070)
(5940 5940)
(5909 5909)
(4228 4228)
(4182 4183)
(4155 4156)
(4145 4145)
(3967 3967)
(3902 3903)
(3763 3763)
(3635 3635)
(3570 3571)
(3544 3550)
(3536 3537)
(3458 3459)
(3402 3404)
(3398 3400)
(3391 3392)
(3330 3331)
(3315 3315)
(3274 3275)
(3271 3272)
(3267 3268)
(3264 3265)
(3262 3262)
(3202 3203)
(3137 3140)
(3073 3075)
(3018 3020)
(3014 3016)
(3009 3010)
(3007 3007)
(2891 2892)
(2887 2888)
(2880 2880)
(2818 2819)
(2763 2764)
(2761 2761)
(2750 2752)
(2691 2691)
(2622 2624)
(2563 2563)
(2507 2508)
(2503 2504)
(2495 2496)
(2434 2435)
(2382 2383)
(2377 2380)
(2366 2368)
(2363 2363)
(2307 2307))
(extend
(917760 917999)
(917536 917631)
@ -420,34 +568,6 @@
(1160 1161)
(1155 1159)
(768 879))
(hangul-syllable-t (55243 55291) (4520 4607))
(control
(918000 921599)
(917632 917759)
(917506 917535)
(917505 917505)
(917504 917504)
(119155 119162)
(113824 113827)
(78896 78911)
(65529 65531)
(65520 65528)
(65279 65279)
(8294 8303)
(8293 8293)
(8288 8292)
(8234 8238)
(8233 8233)
(8232 8232)
(8206 8207)
(8203 8203)
(6158 6158)
(1564 1564)
(173 173)
(127 159)
(14 31)
(11 12)
(0 9))
(hangul-syllable-lv
(55176 55176)
(55148 55148)
@ -848,174 +968,6 @@
(44088 44088)
(44060 44060)
(44032 44032))
(line-feed (10 10))
(hangul-syllable-v (55216 55238) (4448 4519))
(spacing-mark
(119149 119149)
(119142 119142)
(94192 94193)
(94033 94087)
(73537 73537)
(73534 73535)
(73524 73525)
(73475 73475)
(73461 73462)
(73110 73110)
(73107 73108)
(73098 73102)
(72884 72884)
(72881 72881)
(72873 72873)
(72766 72766)
(72751 72751)
(72343 72343)
(72279 72280)
(72249 72249)
(72164 72164)
(72156 72159)
(72145 72147)
(72002 72002)
(72000 72000)
(71997 71997)
(71991 71992)
(71985 71989)
(71736 71736)
(71724 71726)
(71462 71462)
(71350 71350)
(71342 71343)
(71340 71340)
(71230 71230)
(71227 71228)
(71216 71218)
(71102 71102)
(71096 71099)
(71088 71089)
(70849 70849)
(70846 70846)
(70843 70844)
(70841 70841)
(70833 70834)
(70725 70725)
(70720 70721)
(70709 70711)
(70498 70499)
(70475 70477)
(70471 70472)
(70465 70468)
(70463 70463)
(70402 70403)
(70368 70370)
(70197 70197)
(70194 70195)
(70188 70190)
(70094 70094)
(70079 70080)
(70067 70069)
(70018 70018)
(69957 69958)
(69932 69932)
(69815 69816)
(69808 69810)
(69762 69762)
(69634 69634)
(69632 69632)
(44012 44012)
(44009 44010)
(44006 44007)
(44003 44004)
(43765 43765)
(43758 43759)
(43755 43755)
(43597 43597)
(43571 43572)
(43567 43568)
(43454 43456)
(43450 43451)
(43444 43445)
(43395 43395)
(43346 43347)
(43188 43203)
(43136 43137)
(43047 43047)
(43043 43044)
(7415 7415)
(7393 7393)
(7220 7221)
(7204 7211)
(7154 7155)
(7150 7150)
(7146 7148)
(7143 7143)
(7082 7082)
(7078 7079)
(7073 7073)
(7042 7042)
(6979 6980)
(6973 6977)
(6971 6971)
(6916 6916)
(6765 6770)
(6743 6743)
(6741 6741)
(6681 6682)
(6451 6456)
(6448 6449)
(6441 6443)
(6435 6438)
(6087 6088)
(6078 6085)
(6070 6070)
(5940 5940)
(5909 5909)
(4228 4228)
(4182 4183)
(4155 4156)
(4145 4145)
(3967 3967)
(3902 3903)
(3763 3763)
(3635 3635)
(3570 3571)
(3544 3550)
(3536 3537)
(3458 3459)
(3402 3404)
(3398 3400)
(3391 3392)
(3330 3331)
(3315 3315)
(3274 3275)
(3271 3272)
(3267 3268)
(3264 3265)
(3262 3262)
(3202 3203)
(3137 3140)
(3073 3075)
(3018 3020)
(3014 3016)
(3009 3010)
(3007 3007)
(2891 2892)
(2887 2888)
(2880 2880)
(2818 2819)
(2763 2764)
(2761 2761)
(2750 2752)
(2691 2691)
(2622 2624)
(2563 2563)
(2507 2508)
(2503 2504)
(2495 2496)
(2434 2435)
(2382 2383)
(2377 2380)
(2366 2368)
(2363 2363)
(2307 2307))
(hangul-syllable-lvt
(55177 55203)
(55149 55175)
@ -1417,7 +1369,55 @@
(44061 44087)
(44033 44059))
(hangul-syllable-l (43360 43388) (4352 4447))
(zerowidth-joiner (8205 8205)))))
(hangul-syllable-v (55216 55238) (4448 4519))
(prepend
(73474 73474)
(73030 73030)
(72324 72329)
(72250 72250)
(72001 72001)
(71999 71999)
(70082 70083)
(69837 69837)
(69821 69821)
(3406 3406)
(2274 2274)
(2192 2193)
(1807 1807)
(1757 1757)
(1536 1541))
(regional-indicator (127462 127487))
(line-feed (10 10))
(control
(918000 921599)
(917632 917759)
(917506 917535)
(917505 917505)
(917504 917504)
(119155 119162)
(113824 113827)
(78896 78911)
(65529 65531)
(65520 65528)
(65279 65279)
(8294 8303)
(8293 8293)
(8288 8292)
(8234 8238)
(8233 8233)
(8232 8232)
(8206 8207)
(8203 8203)
(6158 6158)
(1564 1564)
(173 173)
(127 159)
(14 31)
(11 12)
(0 9))
(zero-width-joiner (8205 8205))
(carriage-return (13 13))
(hangul-syllable-t (55243 55291) (4520 4607)))))
(define char-set:grapheme-hangul-syllable-l
(char-set))
@ -1437,7 +1437,7 @@
(char-set))
(define char-set:grapheme-spacing-mark
(char-set))
(define char-set:grapheme-zerowidth-joiner
(define char-set:grapheme-zero-width-joiner
(char-set))
(define grapheme-charsets
@ -1459,8 +1459,8 @@
char-set:grapheme-regional-indicator)
(list 'spacing-mark
char-set:grapheme-spacing-mark)
(list 'zerowidth-joiner
char-set:grapheme-zerowidth-joiner)))
(list 'zero-width-joiner
char-set:grapheme-zero-width-joiner)))
(ranges->charset!
grapheme-ht
@ -1508,5 +1508,5 @@
char-set:grapheme-spacing-mark)
(ranges->charset!
grapheme-ht
'zerowidth-joiner
char-set:grapheme-zerowidth-joiner)
'zero-width-joiner
char-set:grapheme-zero-width-joiner)

View File

@ -1,5 +1,8 @@
(define-module (runewidth graphemes stream)
(define-module (uniseg graphemes stream)
#:use-module (uniseg)
#:use-module (ice-9 textual-ports)
#:use-module (ice-9 match)
#:use-module (srfi srfi-71)
#:use-module (srfi srfi-41)
#:use-module (srfi srfi-9 gnu)
#:export (make-grapheme
@ -8,20 +11,23 @@
grapheme-width
grapheme-sentence-end?
grapheme-word-end?
string->grapheme-stream
input->grapheme-stream))
;; TODO: uniseg also does word and sentence boundaries. These state machines could be implemented if we wanted to.
(define-immutable-record-type <grapheme>
(make-grapheme glyphs width sentence-end? word-end?)
(make-grapheme glyphs width)
grapheme?
(glyphs grapheme-glyphs)
(width grapheme-width)
(sentence-end? grapheme-sentence-end?)
(word-end? grapheme-word-end?))
(width grapheme-width))
(define (string->grapheme-stream str)
"Given a string, create a (lazy) stream of graphemes."
(with-input-from-string str
(λ () (input->grapheme-stream (current-input-port)))))
(define-stream (input->grapheme-stream port)
(unless port
(set! port (current-input-port)))
"Given an input port, create a (lazy) stream of graphemes."
;; The first is what state we are in, and the next is the grapheme
;; property of the current character.
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
@ -29,88 +35,115 @@
(match (list cur-state cur-prop)
;; Grapheme boundary #3
(('carriage-return 'line-feed)
(values 'control+line-feed 'no-boundary))
(values 'control+line-feed #f))
;; Grapheme boundary #4
((or ('carriage-return _)
('control+line-feed _))
(values 'any 'boundary))
(values 'any #t))
;; Grapheme boundary #5
((_ 'carriage-return)
(values 'carriage-return 'boundary))
(values 'carriage-return #t))
((or (_ 'line-feed)
(_ 'control))
(values 'control+line-feed 'boundary))
(values 'control+line-feed #t))
;; Grapheme boundary #6
(('hangul-syllable-l 'hangul-syllable-l)
(values 'hangul-syllable-l 'no-boundary))
(values 'hangul-syllable-l #f))
((or ('hangul-syllable-l 'hangul-syllable-v)
('hangul-syllable-l 'hangul-syllable-lv))
(values 'hangul-syllable-lv 'no-boundary))
(values 'hangul-syllable-lv #f))
(('hangul-syllable-l 'hangul-syllable-lvt)
(values 'hangul-syllable-lvt 'no-boundary))
(values 'hangul-syllable-lvt #f))
((_ 'hangul-syllable-l)
(values 'hangul-syllable-l 'boundary))
(values 'hangul-syllable-l #t))
;; Grapheme boundary #7
(('hangul-syllable-lv 'hangul-syllable-v)
(values 'hangul-syllable-lv 'no-boundary))
(values 'hangul-syllable-lv #f))
(('hangul-syllable-lv 'hangul-syllable-t)
(values 'hangul-syllable-lvt 'no-boundary))
(values 'hangul-syllable-lvt #f))
((or (_ 'hangul-syllable-lv)
(_ 'hangul-syllable-v))
(values 'hangul-syllable-lv 'boundary))
(values 'hangul-syllable-lv #t))
;; Grapheme boundary #8
((or (_ 'hangul-syllable-lvt)
(_ 'hangul-syllable-t))
(values 'hangul-syllable-lvt 'boundary))
(values 'hangul-syllable-lvt #t))
(('hangul-syllable-lvt 'hangul-syllable-t)
(values 'hangul-syllable-lvt 'no-boundary))
(values 'hangul-syllable-lvt #f))
;; Grapheme boundary #9
((or (_ 'extend)
(_ 'zero-width-joiner))
(values 'any 'no-boundary))
(values 'any #f))
;; Grapheme boundary #9a
((_ 'spacing-mark)
(values 'any 'no-boundary))
(values 'any #f))
;; Grapheme boundary #9b
(('prepend _)
(values 'any 'no-boundary))
(values 'any #f))
((_ 'prepend)
(values 'prepend 'boundary))
(values 'prepend #t))
;; Grapheme boundary #11 (emoji!)
(('extended-pictographic 'extend)
(values 'extended-pictographic 'no-boundary))
(values 'extended-pictographic #f))
(('extended-pictographic 'zero-width-joiner)
(values 'extended-pictographic+zero-width-joiner 'no-boundary))
(values 'extended-pictographic+zero-width-joiner #f))
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
(values 'extended-pictographic 'no-boundary))
(values 'extended-pictographic #f))
((_ 'extended-pictographic)
(values 'extended-pictographic 'boundary))
(values 'extended-pictographic #t))
;; Grapheme boundaries #12 and #13
(('regional-indicator-odd 'regional-indicator)
(values 'regioinal-indicator-even 'no-boundary))
(values 'regioinal-indicator-even #f))
(('regional-indicator-even 'regional-indicator)
(values 'regional-indicator-odd 'boundary))
(values 'regional-indicator-odd #t))
((_ 'regional-indicator)
(values 'regional-indicator-odd 'boundary))
(values 'regional-indicator-odd #t))
(else (values 'other 'other))))
(define (transition-state state char)
"Given the current state and the next char, run a state transition"
;; Everything else considered a boundeary
(else (values 'any #t))))
)
(define grapheme-width 0)
(define glyphs-reverse '())
(define grapheme (make-grapheme glyphs width sentence-end? word-end?))
(stream-cons grapheme (input->grapheme-stream port)))
(define hit-eof #f)
(define (iterate-through-grapheme state)
(define glyph (get-char port))
(if
(eof-object? glyph)
(begin
(set! hit-eof #t)
state)
(begin
(set! glyphs-reverse (cons glyph glyphs-reverse))
(let* ((width property (char-width glyph))
(next-state boundary? (state-machine state property)))
(set! grapheme-width (+ grapheme-width width))
(if boundary?
state
(iterate-through-grapheme state))))))
(define final-state (iterate-through-grapheme 'any))
(if
hit-eof
stream-null
(stream-cons
(make-grapheme
(reverse glyphs-reverse)
grapheme-width)
(input->grapheme-stream port))))

View File

@ -21,6 +21,9 @@
file-to-lines
ranges->charset!))
;;
;; Common PEG patterns
;;
(define-peg-pattern @hex body (peg "[a-fA-F0-9]"))
(define-peg-pattern @codepoint all
@ -31,13 +34,14 @@
(and @codepoint (ignore "..") @codepoint)
@codepoint))
(define-peg-pattern @comment all
(and (ignore "#") (* peg-any)))
(define-peg-pattern @ws none
(or " " "\t"))
;; Helper macro to add a list of character ranges
;; to a hash-set.
(define-syntax-rule (cons-hash-list! ht key low high)
(let* ((old (hashq-ref ht key))
(value (list low high))
@ -47,6 +51,8 @@
(list value))))
(hashq-set! ht key new-lst)))
;; Helper macro to take above character range hash table
;; and insert it into Guile's chasracter set system.
(define-syntax-rule (ranges->charset! ht name symbol)
(let* ((pairs (hashq-ref ht name)))
(for-each
@ -64,15 +70,17 @@
;; So they can't be written out as such.
(locale-string->integer str 16))
;; This should be in the standard library...
(define (format-exception-msg port err)
(apply format port (exception-message err) (exception-irritants err))
(display "\n" port))
;; Little checker to see if we are in the danger zone
;; (surrogate range is invalid unicode and guile errors)
(define (in-surrogate-range num)
(and (>= num #xd800)
(<= num #xdfff)))
(define* (wget-to-lines url #:optional (port #f))
(format port "Downloading from ~a..." url)