Holy shit it works
This commit is contained in:
parent
1692a9fc05
commit
e53876be52
9 changed files with 2261 additions and 1383 deletions
|
@ -112,8 +112,7 @@
|
|||
#:use-module (ice-9 hash-table)
|
||||
#:use-module (srfi srfi-1)
|
||||
#:use-module (uniseg internal)
|
||||
#:export
|
||||
(,@ea-symbol-names
|
||||
#:export (,@ea-symbol-names
|
||||
eastasian-charsets)))
|
||||
|
||||
(pretty-print
|
||||
|
|
|
@ -27,9 +27,38 @@
|
|||
(define-peg-pattern @emoji-line body
|
||||
(and @emoji-datum (* @ws) @comment))
|
||||
|
||||
(define emoji-list '())
|
||||
(define emoji-ht (make-hash-table 5))
|
||||
|
||||
(define emoji-sets
|
||||
'(emoji
|
||||
emoji-presentation
|
||||
emoji-modifier
|
||||
emoji-modifier-base
|
||||
emoji-component
|
||||
emoji-extended-pictographic))
|
||||
|
||||
(define emoji-symbol-names
|
||||
(map
|
||||
(λ (set)
|
||||
(string->symbol
|
||||
(string-concatenate
|
||||
(list "char-set:"
|
||||
(symbol->string set)))))
|
||||
emoji-sets))
|
||||
|
||||
(define emoji-sets-and-symbols
|
||||
(zip emoji-sets emoji-symbol-names))
|
||||
|
||||
(define (process-emoji-line line)
|
||||
(define (string->category str)
|
||||
(match str
|
||||
("Emoji" 'emoji)
|
||||
("Emoji_Presentation" 'emoji-presentation)
|
||||
("Emoji_Modifier" 'emoji-modifier)
|
||||
("Emoji_Modifier_Base" 'emoji-modifier-base)
|
||||
("Emoji_Component" 'emoji-component)
|
||||
("Extended_Pictographic" 'emoji-extended-pictographic)))
|
||||
|
||||
(define tree (peg:tree (match-pattern @emoji-line line)))
|
||||
|
||||
(unless (or (not tree)
|
||||
|
@ -39,7 +68,7 @@
|
|||
(match tree
|
||||
(((('@codepoint-range
|
||||
('@codepoint codepoints) ...)
|
||||
('@emoji-category category))
|
||||
('@emoji-category cat-str))
|
||||
('@comment comment))
|
||||
|
||||
(with-exception-handler
|
||||
|
@ -48,15 +77,14 @@
|
|||
(format-exception-msg stdout err))
|
||||
(λ ()
|
||||
(let ((f (hex-string->integer (first codepoints)))
|
||||
(l (hex-string->integer (last codepoints))))
|
||||
(l (hex-string->integer (last codepoints)))
|
||||
(category (string->category cat-str)))
|
||||
|
||||
(when (or (in-surrogate-range f)
|
||||
(in-surrogate-range l))
|
||||
(error (format #f "chars in surrogate range ~x -> ~x" f l)))
|
||||
|
||||
(when (and (equal? "Extended_Pictographic" category)
|
||||
(> l #xFF))
|
||||
(set! emoji-list (cons (list f l) emoji-list)))))
|
||||
(cons-hash-list! emoji-ht category f l)))
|
||||
#:unwind? #t)))))
|
||||
|
||||
(define line-func
|
||||
|
@ -78,27 +106,45 @@
|
|||
|
||||
(pretty-print
|
||||
`(define-module (uniseg emoji)
|
||||
#:use-module (uniseg internal)
|
||||
#:use-module (ice-9 hash-table)
|
||||
#:use-module (srfi srfi-1)
|
||||
#:export (char-set:extended-pictographic)))
|
||||
#:export (,@emoji-symbol-names
|
||||
emoji-charsets)))
|
||||
|
||||
(pretty-print
|
||||
`(define emoji-list ',emoji-list))
|
||||
`(define emoji-ht
|
||||
(alist->hashq-table ',(hash-map->list cons emoji-ht))))
|
||||
|
||||
(display "\n")
|
||||
|
||||
(for-each
|
||||
(λ (sym)
|
||||
(pretty-print
|
||||
`(define ,sym (char-set))))
|
||||
emoji-symbol-names)
|
||||
|
||||
(display "\n")
|
||||
|
||||
(pretty-print
|
||||
`(define char-set:extended-pictographic (char-set)))
|
||||
`(define emoji-charsets
|
||||
(list
|
||||
,@(map
|
||||
(λ (pair)
|
||||
(let ((f (first pair))
|
||||
(s (second pair)))
|
||||
`(list ',f ,s)))
|
||||
emoji-sets-and-symbols))))
|
||||
|
||||
(display "\n")
|
||||
|
||||
(pretty-print
|
||||
`(for-each
|
||||
(λ (pair)
|
||||
(ucs-range->char-set!
|
||||
(first pair)
|
||||
(+ 1 (second pair))
|
||||
#t char-set:extended-pictographic))
|
||||
emoji-list))
|
||||
(for-each
|
||||
(λ (set-pair)
|
||||
(let ((name (first set-pair))
|
||||
(symbol (second set-pair)))
|
||||
(pretty-print
|
||||
`(ranges->charset! emoji-ht ',name ,symbol))))
|
||||
emoji-sets-and-symbols)
|
||||
|
||||
(display "Code generation complete.\n" stdout)))
|
||||
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
extend
|
||||
regional-indicator
|
||||
spacing-mark
|
||||
zerowidth-joiner))
|
||||
zero-width-joiner))
|
||||
|
||||
(define grapheme-symbol-names
|
||||
(map
|
||||
|
@ -71,7 +71,7 @@
|
|||
("Extend" 'extend)
|
||||
("Regional_Indicator" 'regional-indicator)
|
||||
("SpacingMark" 'spacing-mark)
|
||||
("ZWJ" 'zerowidth-joiner)))
|
||||
("ZWJ" 'zero-width-joiner)))
|
||||
|
||||
(define tree (peg:tree (match-pattern @grapheme-line line)))
|
||||
|
||||
|
@ -123,7 +123,7 @@
|
|||
#:use-module (ice-9 hash-table)
|
||||
#:use-module (srfi srfi-1)
|
||||
#:use-module (uniseg internal)
|
||||
#:export ( ,@grapheme-symbol-names
|
||||
#:export (,@grapheme-symbol-names
|
||||
grapheme-charsets)))
|
||||
|
||||
|
||||
|
|
60
uniseg.scm
60
uniseg.scm
|
@ -1,17 +1,21 @@
|
|||
(define-module (uniseg)
|
||||
#:use-module (srfi srfi-1)
|
||||
#:use-module (ice-9 match)
|
||||
#:use-module (srfi srfi-41)
|
||||
#:use-module (uniseg emoji)
|
||||
#:use-module (uniseg graphemes)
|
||||
#:use-module (uniseg graphemes stream)
|
||||
#:use-module (uniseg eastasian)
|
||||
#:export (emoji?
|
||||
grapheme-property
|
||||
eastasian-property))
|
||||
char->grapheme-property
|
||||
char->eastasian-property
|
||||
char-width
|
||||
string-width))
|
||||
|
||||
(define (emoji? char)
|
||||
(char-set-contains? char-set:extended-pictographic char))
|
||||
(char-set-contains? char-set:emoji-extended-pictographic char))
|
||||
|
||||
(define (get-prop sets char)
|
||||
(define (char->prop sets char)
|
||||
(let ((pair
|
||||
(find
|
||||
(λ (p) (char-set-contains? (second p) char))
|
||||
|
@ -19,12 +23,12 @@
|
|||
(and pair
|
||||
(first pair))))
|
||||
|
||||
(define (grapheme-property char)
|
||||
(define (char->grapheme-property char)
|
||||
"Find the Unicode grapheme cluster property, as defined by https://www.unicode.org/reports/tr44/"
|
||||
(or (get-prop grapheme-charsets char)
|
||||
(or (char->prop grapheme-charsets char)
|
||||
'other))
|
||||
|
||||
(define (eastasian-property char)
|
||||
(define (char->eastasian-property char)
|
||||
"Find the Unicode East Asian width property, as defined by https://www.unicode.org/reports/tr11/"
|
||||
(define (<==> lo hi)
|
||||
"Create a range predicate, inclusive between two numbers"
|
||||
|
@ -47,7 +51,7 @@
|
|||
(define range-plane-3?
|
||||
(<==> #x30000 #x3FFFD))
|
||||
|
||||
(or (get-prop eastasian-charsets char)
|
||||
(or (char->prop eastasian-charsets char)
|
||||
(let ((int (char->integer char)))
|
||||
(match int
|
||||
((or (? range-unified-ideographs-ext-a?)
|
||||
|
@ -58,3 +62,43 @@
|
|||
'doublewidth)
|
||||
(else
|
||||
'neutral)))))
|
||||
|
||||
(define (char-width rune)
|
||||
"Convert from a character to its visible width. Returns the width as well as the grapheme property to reduce calculations"
|
||||
(define grapheme-prop (char->grapheme-property rune))
|
||||
(values
|
||||
(or
|
||||
(match grapheme-prop
|
||||
((or 'control
|
||||
'carriage-return
|
||||
'line-feed
|
||||
'extend
|
||||
'zero-width-joiner)
|
||||
0)
|
||||
('regional-indicator
|
||||
2)
|
||||
('extended-pictographic
|
||||
(if
|
||||
(char-set-contains? char-set:emoji-presentation rune)
|
||||
2
|
||||
1))
|
||||
(else #f)
|
||||
)
|
||||
(match (char->integer rune)
|
||||
(#x2E3A 3)
|
||||
(#x2E3B 4)
|
||||
(else #f))
|
||||
(match (char->eastasian-property rune)
|
||||
('doublewidth 2)
|
||||
;; Default for monospace is to render ambiguous as 1
|
||||
('ambiguous 1)
|
||||
(else 1)))
|
||||
grapheme-prop))
|
||||
|
||||
(define (string-width str)
|
||||
"Get the width of a string by adding up the widths of each grapheme"
|
||||
(stream-fold
|
||||
(λ (val grapheme)
|
||||
(+ val (grapheme-width grapheme)))
|
||||
0
|
||||
(string->grapheme-stream str)))
|
||||
|
|
1182
uniseg/eastasian.scm
1182
uniseg/eastasian.scm
File diff suppressed because it is too large
Load diff
1748
uniseg/emoji.scm
1748
uniseg/emoji.scm
File diff suppressed because it is too large
Load diff
|
@ -20,28 +20,176 @@
|
|||
char-set:grapheme-extend
|
||||
char-set:grapheme-regional-indicator
|
||||
char-set:grapheme-spacing-mark
|
||||
char-set:grapheme-zerowidth-joiner
|
||||
char-set:grapheme-zero-width-joiner
|
||||
grapheme-charsets))
|
||||
(define grapheme-ht
|
||||
(alist->hashq-table
|
||||
'((prepend
|
||||
(73474 73474)
|
||||
(73030 73030)
|
||||
(72324 72329)
|
||||
(72250 72250)
|
||||
(72001 72001)
|
||||
(71999 71999)
|
||||
(70082 70083)
|
||||
(69837 69837)
|
||||
(69821 69821)
|
||||
(3406 3406)
|
||||
(2274 2274)
|
||||
(2192 2193)
|
||||
(1807 1807)
|
||||
(1757 1757)
|
||||
(1536 1541))
|
||||
(regional-indicator (127462 127487))
|
||||
(carriage-return (13 13))
|
||||
'((spacing-mark
|
||||
(119149 119149)
|
||||
(119142 119142)
|
||||
(94192 94193)
|
||||
(94033 94087)
|
||||
(73537 73537)
|
||||
(73534 73535)
|
||||
(73524 73525)
|
||||
(73475 73475)
|
||||
(73461 73462)
|
||||
(73110 73110)
|
||||
(73107 73108)
|
||||
(73098 73102)
|
||||
(72884 72884)
|
||||
(72881 72881)
|
||||
(72873 72873)
|
||||
(72766 72766)
|
||||
(72751 72751)
|
||||
(72343 72343)
|
||||
(72279 72280)
|
||||
(72249 72249)
|
||||
(72164 72164)
|
||||
(72156 72159)
|
||||
(72145 72147)
|
||||
(72002 72002)
|
||||
(72000 72000)
|
||||
(71997 71997)
|
||||
(71991 71992)
|
||||
(71985 71989)
|
||||
(71736 71736)
|
||||
(71724 71726)
|
||||
(71462 71462)
|
||||
(71350 71350)
|
||||
(71342 71343)
|
||||
(71340 71340)
|
||||
(71230 71230)
|
||||
(71227 71228)
|
||||
(71216 71218)
|
||||
(71102 71102)
|
||||
(71096 71099)
|
||||
(71088 71089)
|
||||
(70849 70849)
|
||||
(70846 70846)
|
||||
(70843 70844)
|
||||
(70841 70841)
|
||||
(70833 70834)
|
||||
(70725 70725)
|
||||
(70720 70721)
|
||||
(70709 70711)
|
||||
(70498 70499)
|
||||
(70475 70477)
|
||||
(70471 70472)
|
||||
(70465 70468)
|
||||
(70463 70463)
|
||||
(70402 70403)
|
||||
(70368 70370)
|
||||
(70197 70197)
|
||||
(70194 70195)
|
||||
(70188 70190)
|
||||
(70094 70094)
|
||||
(70079 70080)
|
||||
(70067 70069)
|
||||
(70018 70018)
|
||||
(69957 69958)
|
||||
(69932 69932)
|
||||
(69815 69816)
|
||||
(69808 69810)
|
||||
(69762 69762)
|
||||
(69634 69634)
|
||||
(69632 69632)
|
||||
(44012 44012)
|
||||
(44009 44010)
|
||||
(44006 44007)
|
||||
(44003 44004)
|
||||
(43765 43765)
|
||||
(43758 43759)
|
||||
(43755 43755)
|
||||
(43597 43597)
|
||||
(43571 43572)
|
||||
(43567 43568)
|
||||
(43454 43456)
|
||||
(43450 43451)
|
||||
(43444 43445)
|
||||
(43395 43395)
|
||||
(43346 43347)
|
||||
(43188 43203)
|
||||
(43136 43137)
|
||||
(43047 43047)
|
||||
(43043 43044)
|
||||
(7415 7415)
|
||||
(7393 7393)
|
||||
(7220 7221)
|
||||
(7204 7211)
|
||||
(7154 7155)
|
||||
(7150 7150)
|
||||
(7146 7148)
|
||||
(7143 7143)
|
||||
(7082 7082)
|
||||
(7078 7079)
|
||||
(7073 7073)
|
||||
(7042 7042)
|
||||
(6979 6980)
|
||||
(6973 6977)
|
||||
(6971 6971)
|
||||
(6916 6916)
|
||||
(6765 6770)
|
||||
(6743 6743)
|
||||
(6741 6741)
|
||||
(6681 6682)
|
||||
(6451 6456)
|
||||
(6448 6449)
|
||||
(6441 6443)
|
||||
(6435 6438)
|
||||
(6087 6088)
|
||||
(6078 6085)
|
||||
(6070 6070)
|
||||
(5940 5940)
|
||||
(5909 5909)
|
||||
(4228 4228)
|
||||
(4182 4183)
|
||||
(4155 4156)
|
||||
(4145 4145)
|
||||
(3967 3967)
|
||||
(3902 3903)
|
||||
(3763 3763)
|
||||
(3635 3635)
|
||||
(3570 3571)
|
||||
(3544 3550)
|
||||
(3536 3537)
|
||||
(3458 3459)
|
||||
(3402 3404)
|
||||
(3398 3400)
|
||||
(3391 3392)
|
||||
(3330 3331)
|
||||
(3315 3315)
|
||||
(3274 3275)
|
||||
(3271 3272)
|
||||
(3267 3268)
|
||||
(3264 3265)
|
||||
(3262 3262)
|
||||
(3202 3203)
|
||||
(3137 3140)
|
||||
(3073 3075)
|
||||
(3018 3020)
|
||||
(3014 3016)
|
||||
(3009 3010)
|
||||
(3007 3007)
|
||||
(2891 2892)
|
||||
(2887 2888)
|
||||
(2880 2880)
|
||||
(2818 2819)
|
||||
(2763 2764)
|
||||
(2761 2761)
|
||||
(2750 2752)
|
||||
(2691 2691)
|
||||
(2622 2624)
|
||||
(2563 2563)
|
||||
(2507 2508)
|
||||
(2503 2504)
|
||||
(2495 2496)
|
||||
(2434 2435)
|
||||
(2382 2383)
|
||||
(2377 2380)
|
||||
(2366 2368)
|
||||
(2363 2363)
|
||||
(2307 2307))
|
||||
(extend
|
||||
(917760 917999)
|
||||
(917536 917631)
|
||||
|
@ -420,34 +568,6 @@
|
|||
(1160 1161)
|
||||
(1155 1159)
|
||||
(768 879))
|
||||
(hangul-syllable-t (55243 55291) (4520 4607))
|
||||
(control
|
||||
(918000 921599)
|
||||
(917632 917759)
|
||||
(917506 917535)
|
||||
(917505 917505)
|
||||
(917504 917504)
|
||||
(119155 119162)
|
||||
(113824 113827)
|
||||
(78896 78911)
|
||||
(65529 65531)
|
||||
(65520 65528)
|
||||
(65279 65279)
|
||||
(8294 8303)
|
||||
(8293 8293)
|
||||
(8288 8292)
|
||||
(8234 8238)
|
||||
(8233 8233)
|
||||
(8232 8232)
|
||||
(8206 8207)
|
||||
(8203 8203)
|
||||
(6158 6158)
|
||||
(1564 1564)
|
||||
(173 173)
|
||||
(127 159)
|
||||
(14 31)
|
||||
(11 12)
|
||||
(0 9))
|
||||
(hangul-syllable-lv
|
||||
(55176 55176)
|
||||
(55148 55148)
|
||||
|
@ -848,174 +968,6 @@
|
|||
(44088 44088)
|
||||
(44060 44060)
|
||||
(44032 44032))
|
||||
(line-feed (10 10))
|
||||
(hangul-syllable-v (55216 55238) (4448 4519))
|
||||
(spacing-mark
|
||||
(119149 119149)
|
||||
(119142 119142)
|
||||
(94192 94193)
|
||||
(94033 94087)
|
||||
(73537 73537)
|
||||
(73534 73535)
|
||||
(73524 73525)
|
||||
(73475 73475)
|
||||
(73461 73462)
|
||||
(73110 73110)
|
||||
(73107 73108)
|
||||
(73098 73102)
|
||||
(72884 72884)
|
||||
(72881 72881)
|
||||
(72873 72873)
|
||||
(72766 72766)
|
||||
(72751 72751)
|
||||
(72343 72343)
|
||||
(72279 72280)
|
||||
(72249 72249)
|
||||
(72164 72164)
|
||||
(72156 72159)
|
||||
(72145 72147)
|
||||
(72002 72002)
|
||||
(72000 72000)
|
||||
(71997 71997)
|
||||
(71991 71992)
|
||||
(71985 71989)
|
||||
(71736 71736)
|
||||
(71724 71726)
|
||||
(71462 71462)
|
||||
(71350 71350)
|
||||
(71342 71343)
|
||||
(71340 71340)
|
||||
(71230 71230)
|
||||
(71227 71228)
|
||||
(71216 71218)
|
||||
(71102 71102)
|
||||
(71096 71099)
|
||||
(71088 71089)
|
||||
(70849 70849)
|
||||
(70846 70846)
|
||||
(70843 70844)
|
||||
(70841 70841)
|
||||
(70833 70834)
|
||||
(70725 70725)
|
||||
(70720 70721)
|
||||
(70709 70711)
|
||||
(70498 70499)
|
||||
(70475 70477)
|
||||
(70471 70472)
|
||||
(70465 70468)
|
||||
(70463 70463)
|
||||
(70402 70403)
|
||||
(70368 70370)
|
||||
(70197 70197)
|
||||
(70194 70195)
|
||||
(70188 70190)
|
||||
(70094 70094)
|
||||
(70079 70080)
|
||||
(70067 70069)
|
||||
(70018 70018)
|
||||
(69957 69958)
|
||||
(69932 69932)
|
||||
(69815 69816)
|
||||
(69808 69810)
|
||||
(69762 69762)
|
||||
(69634 69634)
|
||||
(69632 69632)
|
||||
(44012 44012)
|
||||
(44009 44010)
|
||||
(44006 44007)
|
||||
(44003 44004)
|
||||
(43765 43765)
|
||||
(43758 43759)
|
||||
(43755 43755)
|
||||
(43597 43597)
|
||||
(43571 43572)
|
||||
(43567 43568)
|
||||
(43454 43456)
|
||||
(43450 43451)
|
||||
(43444 43445)
|
||||
(43395 43395)
|
||||
(43346 43347)
|
||||
(43188 43203)
|
||||
(43136 43137)
|
||||
(43047 43047)
|
||||
(43043 43044)
|
||||
(7415 7415)
|
||||
(7393 7393)
|
||||
(7220 7221)
|
||||
(7204 7211)
|
||||
(7154 7155)
|
||||
(7150 7150)
|
||||
(7146 7148)
|
||||
(7143 7143)
|
||||
(7082 7082)
|
||||
(7078 7079)
|
||||
(7073 7073)
|
||||
(7042 7042)
|
||||
(6979 6980)
|
||||
(6973 6977)
|
||||
(6971 6971)
|
||||
(6916 6916)
|
||||
(6765 6770)
|
||||
(6743 6743)
|
||||
(6741 6741)
|
||||
(6681 6682)
|
||||
(6451 6456)
|
||||
(6448 6449)
|
||||
(6441 6443)
|
||||
(6435 6438)
|
||||
(6087 6088)
|
||||
(6078 6085)
|
||||
(6070 6070)
|
||||
(5940 5940)
|
||||
(5909 5909)
|
||||
(4228 4228)
|
||||
(4182 4183)
|
||||
(4155 4156)
|
||||
(4145 4145)
|
||||
(3967 3967)
|
||||
(3902 3903)
|
||||
(3763 3763)
|
||||
(3635 3635)
|
||||
(3570 3571)
|
||||
(3544 3550)
|
||||
(3536 3537)
|
||||
(3458 3459)
|
||||
(3402 3404)
|
||||
(3398 3400)
|
||||
(3391 3392)
|
||||
(3330 3331)
|
||||
(3315 3315)
|
||||
(3274 3275)
|
||||
(3271 3272)
|
||||
(3267 3268)
|
||||
(3264 3265)
|
||||
(3262 3262)
|
||||
(3202 3203)
|
||||
(3137 3140)
|
||||
(3073 3075)
|
||||
(3018 3020)
|
||||
(3014 3016)
|
||||
(3009 3010)
|
||||
(3007 3007)
|
||||
(2891 2892)
|
||||
(2887 2888)
|
||||
(2880 2880)
|
||||
(2818 2819)
|
||||
(2763 2764)
|
||||
(2761 2761)
|
||||
(2750 2752)
|
||||
(2691 2691)
|
||||
(2622 2624)
|
||||
(2563 2563)
|
||||
(2507 2508)
|
||||
(2503 2504)
|
||||
(2495 2496)
|
||||
(2434 2435)
|
||||
(2382 2383)
|
||||
(2377 2380)
|
||||
(2366 2368)
|
||||
(2363 2363)
|
||||
(2307 2307))
|
||||
(hangul-syllable-lvt
|
||||
(55177 55203)
|
||||
(55149 55175)
|
||||
|
@ -1417,7 +1369,55 @@
|
|||
(44061 44087)
|
||||
(44033 44059))
|
||||
(hangul-syllable-l (43360 43388) (4352 4447))
|
||||
(zerowidth-joiner (8205 8205)))))
|
||||
(hangul-syllable-v (55216 55238) (4448 4519))
|
||||
(prepend
|
||||
(73474 73474)
|
||||
(73030 73030)
|
||||
(72324 72329)
|
||||
(72250 72250)
|
||||
(72001 72001)
|
||||
(71999 71999)
|
||||
(70082 70083)
|
||||
(69837 69837)
|
||||
(69821 69821)
|
||||
(3406 3406)
|
||||
(2274 2274)
|
||||
(2192 2193)
|
||||
(1807 1807)
|
||||
(1757 1757)
|
||||
(1536 1541))
|
||||
(regional-indicator (127462 127487))
|
||||
(line-feed (10 10))
|
||||
(control
|
||||
(918000 921599)
|
||||
(917632 917759)
|
||||
(917506 917535)
|
||||
(917505 917505)
|
||||
(917504 917504)
|
||||
(119155 119162)
|
||||
(113824 113827)
|
||||
(78896 78911)
|
||||
(65529 65531)
|
||||
(65520 65528)
|
||||
(65279 65279)
|
||||
(8294 8303)
|
||||
(8293 8293)
|
||||
(8288 8292)
|
||||
(8234 8238)
|
||||
(8233 8233)
|
||||
(8232 8232)
|
||||
(8206 8207)
|
||||
(8203 8203)
|
||||
(6158 6158)
|
||||
(1564 1564)
|
||||
(173 173)
|
||||
(127 159)
|
||||
(14 31)
|
||||
(11 12)
|
||||
(0 9))
|
||||
(zero-width-joiner (8205 8205))
|
||||
(carriage-return (13 13))
|
||||
(hangul-syllable-t (55243 55291) (4520 4607)))))
|
||||
|
||||
(define char-set:grapheme-hangul-syllable-l
|
||||
(char-set))
|
||||
|
@ -1437,7 +1437,7 @@
|
|||
(char-set))
|
||||
(define char-set:grapheme-spacing-mark
|
||||
(char-set))
|
||||
(define char-set:grapheme-zerowidth-joiner
|
||||
(define char-set:grapheme-zero-width-joiner
|
||||
(char-set))
|
||||
|
||||
(define grapheme-charsets
|
||||
|
@ -1459,8 +1459,8 @@
|
|||
char-set:grapheme-regional-indicator)
|
||||
(list 'spacing-mark
|
||||
char-set:grapheme-spacing-mark)
|
||||
(list 'zerowidth-joiner
|
||||
char-set:grapheme-zerowidth-joiner)))
|
||||
(list 'zero-width-joiner
|
||||
char-set:grapheme-zero-width-joiner)))
|
||||
|
||||
(ranges->charset!
|
||||
grapheme-ht
|
||||
|
@ -1508,5 +1508,5 @@
|
|||
char-set:grapheme-spacing-mark)
|
||||
(ranges->charset!
|
||||
grapheme-ht
|
||||
'zerowidth-joiner
|
||||
char-set:grapheme-zerowidth-joiner)
|
||||
'zero-width-joiner
|
||||
char-set:grapheme-zero-width-joiner)
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
(define-module (runewidth graphemes stream)
|
||||
(define-module (uniseg graphemes stream)
|
||||
#:use-module (uniseg)
|
||||
#:use-module (ice-9 textual-ports)
|
||||
#:use-module (ice-9 match)
|
||||
#:use-module (srfi srfi-71)
|
||||
#:use-module (srfi srfi-41)
|
||||
#:use-module (srfi srfi-9 gnu)
|
||||
#:export (make-grapheme
|
||||
|
@ -8,20 +11,23 @@
|
|||
grapheme-width
|
||||
grapheme-sentence-end?
|
||||
grapheme-word-end?
|
||||
string->grapheme-stream
|
||||
input->grapheme-stream))
|
||||
|
||||
;; TODO: uniseg also does word and sentence boundaries. These state machines could be implemented if we wanted to.
|
||||
(define-immutable-record-type <grapheme>
|
||||
(make-grapheme glyphs width sentence-end? word-end?)
|
||||
(make-grapheme glyphs width)
|
||||
grapheme?
|
||||
(glyphs grapheme-glyphs)
|
||||
(width grapheme-width)
|
||||
(sentence-end? grapheme-sentence-end?)
|
||||
(word-end? grapheme-word-end?))
|
||||
(width grapheme-width))
|
||||
|
||||
(define (string->grapheme-stream str)
|
||||
"Given a string, create a (lazy) stream of graphemes."
|
||||
(with-input-from-string str
|
||||
(λ () (input->grapheme-stream (current-input-port)))))
|
||||
|
||||
(define-stream (input->grapheme-stream port)
|
||||
(unless port
|
||||
(set! port (current-input-port)))
|
||||
|
||||
"Given an input port, create a (lazy) stream of graphemes."
|
||||
;; The first is what state we are in, and the next is the grapheme
|
||||
;; property of the current character.
|
||||
;; Port of https://github.com/rivo/uniseg/blob/master/graphemerules.go
|
||||
|
@ -29,88 +35,115 @@
|
|||
(match (list cur-state cur-prop)
|
||||
;; Grapheme boundary #3
|
||||
(('carriage-return 'line-feed)
|
||||
(values 'control+line-feed 'no-boundary))
|
||||
(values 'control+line-feed #f))
|
||||
|
||||
;; Grapheme boundary #4
|
||||
((or ('carriage-return _)
|
||||
('control+line-feed _))
|
||||
(values 'any 'boundary))
|
||||
(values 'any #t))
|
||||
|
||||
;; Grapheme boundary #5
|
||||
((_ 'carriage-return)
|
||||
(values 'carriage-return 'boundary))
|
||||
(values 'carriage-return #t))
|
||||
((or (_ 'line-feed)
|
||||
(_ 'control))
|
||||
(values 'control+line-feed 'boundary))
|
||||
(values 'control+line-feed #t))
|
||||
|
||||
;; Grapheme boundary #6
|
||||
(('hangul-syllable-l 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l 'no-boundary))
|
||||
(values 'hangul-syllable-l #f))
|
||||
((or ('hangul-syllable-l 'hangul-syllable-v)
|
||||
('hangul-syllable-l 'hangul-syllable-lv))
|
||||
(values 'hangul-syllable-lv 'no-boundary))
|
||||
(values 'hangul-syllable-lv #f))
|
||||
(('hangul-syllable-l 'hangul-syllable-lvt)
|
||||
(values 'hangul-syllable-lvt 'no-boundary))
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
((_ 'hangul-syllable-l)
|
||||
(values 'hangul-syllable-l 'boundary))
|
||||
(values 'hangul-syllable-l #t))
|
||||
|
||||
;; Grapheme boundary #7
|
||||
(('hangul-syllable-lv 'hangul-syllable-v)
|
||||
(values 'hangul-syllable-lv 'no-boundary))
|
||||
(values 'hangul-syllable-lv #f))
|
||||
(('hangul-syllable-lv 'hangul-syllable-t)
|
||||
(values 'hangul-syllable-lvt 'no-boundary))
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
((or (_ 'hangul-syllable-lv)
|
||||
(_ 'hangul-syllable-v))
|
||||
(values 'hangul-syllable-lv 'boundary))
|
||||
(values 'hangul-syllable-lv #t))
|
||||
|
||||
;; Grapheme boundary #8
|
||||
((or (_ 'hangul-syllable-lvt)
|
||||
(_ 'hangul-syllable-t))
|
||||
(values 'hangul-syllable-lvt 'boundary))
|
||||
(values 'hangul-syllable-lvt #t))
|
||||
(('hangul-syllable-lvt 'hangul-syllable-t)
|
||||
(values 'hangul-syllable-lvt 'no-boundary))
|
||||
(values 'hangul-syllable-lvt #f))
|
||||
|
||||
;; Grapheme boundary #9
|
||||
((or (_ 'extend)
|
||||
(_ 'zero-width-joiner))
|
||||
(values 'any 'no-boundary))
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9a
|
||||
((_ 'spacing-mark)
|
||||
(values 'any 'no-boundary))
|
||||
(values 'any #f))
|
||||
|
||||
;; Grapheme boundary #9b
|
||||
(('prepend _)
|
||||
(values 'any 'no-boundary))
|
||||
(values 'any #f))
|
||||
((_ 'prepend)
|
||||
(values 'prepend 'boundary))
|
||||
(values 'prepend #t))
|
||||
|
||||
;; Grapheme boundary #11 (emoji!)
|
||||
(('extended-pictographic 'extend)
|
||||
(values 'extended-pictographic 'no-boundary))
|
||||
(values 'extended-pictographic #f))
|
||||
(('extended-pictographic 'zero-width-joiner)
|
||||
(values 'extended-pictographic+zero-width-joiner 'no-boundary))
|
||||
(values 'extended-pictographic+zero-width-joiner #f))
|
||||
(('extended-pictographic+zero-width-joiner 'extended-pictographic)
|
||||
(values 'extended-pictographic 'no-boundary))
|
||||
(values 'extended-pictographic #f))
|
||||
((_ 'extended-pictographic)
|
||||
(values 'extended-pictographic 'boundary))
|
||||
(values 'extended-pictographic #t))
|
||||
|
||||
|
||||
;; Grapheme boundaries #12 and #13
|
||||
(('regional-indicator-odd 'regional-indicator)
|
||||
(values 'regioinal-indicator-even 'no-boundary))
|
||||
(values 'regioinal-indicator-even #f))
|
||||
(('regional-indicator-even 'regional-indicator)
|
||||
(values 'regional-indicator-odd 'boundary))
|
||||
(values 'regional-indicator-odd #t))
|
||||
((_ 'regional-indicator)
|
||||
(values 'regional-indicator-odd 'boundary))
|
||||
(values 'regional-indicator-odd #t))
|
||||
|
||||
(else (values 'other 'other))))
|
||||
|
||||
(define (transition-state state char)
|
||||
"Given the current state and the next char, run a state transition"
|
||||
;; Everything else considered a boundeary
|
||||
(else (values 'any #t))))
|
||||
|
||||
|
||||
)
|
||||
(define grapheme-width 0)
|
||||
(define glyphs-reverse '())
|
||||
|
||||
(define grapheme (make-grapheme glyphs width sentence-end? word-end?))
|
||||
(stream-cons grapheme (input->grapheme-stream port)))
|
||||
(define hit-eof #f)
|
||||
|
||||
(define (iterate-through-grapheme state)
|
||||
(define glyph (get-char port))
|
||||
(if
|
||||
(eof-object? glyph)
|
||||
(begin
|
||||
(set! hit-eof #t)
|
||||
state)
|
||||
(begin
|
||||
(set! glyphs-reverse (cons glyph glyphs-reverse))
|
||||
(let* ((width property (char-width glyph))
|
||||
(next-state boundary? (state-machine state property)))
|
||||
|
||||
(set! grapheme-width (+ grapheme-width width))
|
||||
|
||||
(if boundary?
|
||||
state
|
||||
(iterate-through-grapheme state))))))
|
||||
|
||||
(define final-state (iterate-through-grapheme 'any))
|
||||
|
||||
(if
|
||||
hit-eof
|
||||
stream-null
|
||||
(stream-cons
|
||||
(make-grapheme
|
||||
(reverse glyphs-reverse)
|
||||
grapheme-width)
|
||||
(input->grapheme-stream port))))
|
||||
|
|
|
@ -21,6 +21,9 @@
|
|||
file-to-lines
|
||||
ranges->charset!))
|
||||
|
||||
;;
|
||||
;; Common PEG patterns
|
||||
;;
|
||||
(define-peg-pattern @hex body (peg "[a-fA-F0-9]"))
|
||||
|
||||
(define-peg-pattern @codepoint all
|
||||
|
@ -31,13 +34,14 @@
|
|||
(and @codepoint (ignore "..") @codepoint)
|
||||
@codepoint))
|
||||
|
||||
|
||||
(define-peg-pattern @comment all
|
||||
(and (ignore "#") (* peg-any)))
|
||||
|
||||
(define-peg-pattern @ws none
|
||||
(or " " "\t"))
|
||||
|
||||
;; Helper macro to add a list of character ranges
|
||||
;; to a hash-set.
|
||||
(define-syntax-rule (cons-hash-list! ht key low high)
|
||||
(let* ((old (hashq-ref ht key))
|
||||
(value (list low high))
|
||||
|
@ -47,6 +51,8 @@
|
|||
(list value))))
|
||||
(hashq-set! ht key new-lst)))
|
||||
|
||||
;; Helper macro to take above character range hash table
|
||||
;; and insert it into Guile's chasracter set system.
|
||||
(define-syntax-rule (ranges->charset! ht name symbol)
|
||||
(let* ((pairs (hashq-ref ht name)))
|
||||
(for-each
|
||||
|
@ -64,15 +70,17 @@
|
|||
;; So they can't be written out as such.
|
||||
(locale-string->integer str 16))
|
||||
|
||||
;; This should be in the standard library...
|
||||
(define (format-exception-msg port err)
|
||||
(apply format port (exception-message err) (exception-irritants err))
|
||||
(display "\n" port))
|
||||
|
||||
;; Little checker to see if we are in the danger zone
|
||||
;; (surrogate range is invalid unicode and guile errors)
|
||||
(define (in-surrogate-range num)
|
||||
(and (>= num #xd800)
|
||||
(<= num #xdfff)))
|
||||
|
||||
|
||||
(define* (wget-to-lines url #:optional (port #f))
|
||||
(format port "Downloading from ~a..." url)
|
||||
|
||||
|
|
Loading…
Reference in a new issue