Cut out a LOT of duplicated boilerplate, very simple definitions now

This commit is contained in:
Vivianne 2024-03-04 17:12:09 -05:00
parent a03c6d2348
commit 94dd8e00e8
8 changed files with 1658 additions and 1761 deletions

View File

@ -19,57 +19,19 @@
;; Source data for this generator: the Unicode 13.0.0 East Asian Width table.
(define url
"https://unicode.org/Public/13.0.0/ucd/EastAsianWidth.txt")
;; PEG rule for the width-property field. "Na" is listed before "N" because
;; PEG choice is ordered: trying "N" first would consume only the first
;; character of "Na". Declared `all`, so the match stays in the parse tree
;; tagged with the rule name (process-line matches on '@ea-width-prop).
(define-peg-pattern @ea-width-prop all
(or "A" "F" "H" "Na" "N" "W"))
;; One datum: a codepoint range, a ";" separator (matched but dropped from the
;; tree by `ignore`), and the width property, with optional whitespace on
;; either side of the separator. Declared `body`, so no node is added for this
;; rule itself.
;; NOTE(review): @codepoint-range and @ws are not defined in this view --
;; presumably they come from (uniseg internal); confirm.
(define-peg-pattern @ea-datum body
(and @codepoint-range (* @ws) (ignore ";") (* @ws) @ea-width-prop))
;; A whole data line: a datum, optional whitespace, then a @comment. The
;; comment is not optional in this rule. @comment is defined elsewhere.
(define-peg-pattern @ea-line body
(and @ea-datum (* @ws) @comment))
;; Accumulator filled by process-line through cons-hash-list!, keyed by the
;; width-property symbol. 6 is the initial size hint and equals the number of
;; distinct symbols string->property can return.
(define eastasian-ht (make-hash-table 6))
(define (process-line line)
(define (string->property str comment)
(if (string-contains comment "COMBINING")
'combining
(match str
((or "W" "F") 'doublewidth)
("H" 'halfwidth)
("Na" 'narrow)
("N" 'neutral)
("A" 'ambiguous))))
(define (string->property str comment)
(if (string-contains comment "COMBINING")
'combining
(match str
((or "W" "F") 'doublewidth)
("H" 'halfwidth)
("Na" 'narrow)
("N" 'neutral)
("A" 'ambiguous))))
(define tree (peg:tree (match-pattern @ea-line line)))
(unless (or (not tree)
(null? tree)
(eq? '@comment (car tree)))
(match tree
(((('@codepoint-range
('@codepoint codepoints) ...)
('@ea-width-prop prop-str))
('@comment comment))
(with-exception-handler
(λ (err)
(format stdout "Skipping line due to error :: ")
(format-exception-msg stdout err))
(λ ()
(let ((f (hex-string->integer (first codepoints)))
(l (hex-string->integer (last codepoints)))
(width-prop (string->property prop-str comment)))
(when (or (in-surrogate-range f)
(in-surrogate-range l))
(error (format #f "chars in surrogate range ~x -> ~x" f l)))
(cons-hash-list! eastasian-ht width-prop f l)))
#:unwind? #t)))))
(define ea-sets
(define eastasian-properties
'(combining
doublewidth
halfwidth
@ -77,18 +39,14 @@
neutral
ambiguous))
(define ea-symbol-names
(define eastasian-symbols
(map
(λ (set)
(string->symbol
(string-concatenate
(list "char-set:eastasian-"
(symbol->string set)))))
ea-sets))
(define ea-sets-and-symbols
(zip ea-sets ea-symbol-names))
eastasian-properties))
(define file "uniseg/eastasian.scm")
@ -98,49 +56,25 @@
(λ ()
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
(for-each process-line (cmdline-wget-or-file url stdout))
(pretty-print
`(define-module (uniseg eastasian)
#:use-module (uniseg internal)
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
#:use-module (uniseg internal)
#:export (,@ea-symbol-names
eastasian-charsets)))
#:export (,@eastasian-symbols
eastasian-charsets)))
(pretty-print
`(define eastasian-ht
(alist->hashq-table ',(hash-map->list cons eastasian-ht))))
(define-values (process-line output-boilerplate)
(make-line-processor
eastasian-ht
string->property
eastasian-properties
eastasian-symbols
'eastasian-charsets
stdout))
(display "\n")
(for-each
(λ (sym)
(pretty-print
`(define ,sym (char-set))))
ea-symbol-names)
(display "\n")
(pretty-print
`(define eastasian-charsets
(list
,@(map
(λ (pair)
(let ((f (first pair))
(s (second pair)))
`(list ',f ,s)))
ea-sets-and-symbols))))
(display "\n")
(for-each
(λ (set-pair)
(let ((name (first set-pair))
(symbol (second set-pair)))
(pretty-print
`(ranges->charset! eastasian-ht ',name ,symbol))))
ea-sets-and-symbols)
(for-each process-line (cmdline-wget-or-file url stdout))
(output-boilerplate)
(display "Code generation complete.\n" stdout)))

View File

@ -8,8 +8,6 @@
(uniseg internal)
(ice-9 pretty-print)
(ice-9 peg)
(ice-9 format)
(ice-9 exceptions)
(ice-9 match)
(srfi srfi-1))
@ -18,18 +16,9 @@
;; Source data for this generator: the Unicode 13.0.0 emoji property table.
(define url
"https://unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt")
;; PEG rule for the property-name field: a run of ASCII letters and
;; underscores. `*` means zero or more, so an empty name also matches.
;; Declared `all`, so the match stays in the parse tree tagged with the rule
;; name (process-line matches on '@emoji-category).
(define-peg-pattern @emoji-category all
(* (peg "[a-zA-Z_]")))
;; One datum: a codepoint range, a ";" separator (matched but dropped from the
;; tree by `ignore`), and the property name, with optional whitespace on
;; either side of the separator.
;; NOTE(review): @codepoint-range and @ws are not defined in this view --
;; presumably they come from (uniseg internal); confirm.
(define-peg-pattern @emoji-datum body
(and @codepoint-range (* @ws) (ignore ";") (* @ws) @emoji-category))
;; A whole data line: a datum, optional whitespace, then a @comment. The
;; comment is not optional in this rule. @comment is defined elsewhere.
(define-peg-pattern @emoji-line body
(and @emoji-datum (* @ws) @comment))
;; Accumulator filled by process-line through cons-hash-list!, keyed by the
;; property symbol. 5 is only an initial size hint; the name-to-symbol mapping
;; in this file produces six distinct keys.
(define emoji-ht (make-hash-table 5))
(define emoji-sets
(define emoji-properties
'(emoji
emoji-presentation
emoji-modifier
@ -37,55 +26,23 @@
emoji-component
emoji-extended-pictographic))
(define emoji-symbol-names
(define emoji-symbols
(map
(λ (set)
(string->symbol
(string-concatenate
(list "char-set:"
(symbol->string set)))))
emoji-sets))
emoji-properties))
(define emoji-sets-and-symbols
(zip emoji-sets emoji-symbol-names))
;; Parse one line of emoji-data.txt and, if it is a data line, record its
;; codepoint range in emoji-ht under the symbol for its property.
;; Called once per input line via for-each; the return value is not used.
(define (process-line line)
;; Map the property name as spelled in the data file to the symbol used as
;; the emoji-ht key. There is no catch-all clause, so an unrecognised name
;; makes `match` raise.
(define (string->category str)
(match str
("Emoji" 'emoji)
("Emoji_Presentation" 'emoji-presentation)
("Emoji_Modifier" 'emoji-modifier)
("Emoji_Modifier_Base" 'emoji-modifier-base)
("Emoji_Component" 'emoji-component)
("Extended_Pictographic" 'emoji-extended-pictographic)))
(define tree (peg:tree (match-pattern @emoji-line line)))
;; Do nothing when there is no parse tree, the tree is empty, or the tree
;; begins with the symbol @comment.
(unless (or (not tree)
(null? tree)
(eq? '@comment (car tree)))
;; Expected shape: ((range-node category-node) comment-node).
;; `codepoints` collects the string from every @codepoint child of the range.
;; This outer match sits outside the exception handler below, so a tree of
;; any other shape raises to the caller instead of being reported and skipped.
(match tree
(((('@codepoint-range
('@codepoint codepoints) ...)
('@emoji-category cat-str))
('@comment comment))
;; Errors raised while converting or storing the range are reported on
;; `stdout` (a port bound elsewhere) and the line is dropped. #:unwind? #t
;; unwinds the stack before the handler runs, so after reporting,
;; process-line simply returns.
(with-exception-handler
(λ (err)
(format stdout "Skipping line due to error :: ")
(format-exception-msg stdout err))
(λ ()
;; f and l are the first and last codepoints of the range; when the line
;; lists a single codepoint they are the same value.
(let ((f (hex-string->integer (first codepoints)))
(l (hex-string->integer (last codepoints)))
(category (string->category cat-str)))
;; Only the two endpoints are tested against the surrogate range.
(when (or (in-surrogate-range f)
(in-surrogate-range l))
(error (format #f "chars in surrogate range ~x -> ~x" f l)))
(cons-hash-list! emoji-ht category f l)))
#:unwind? #t)))))
;; Map a property name as spelled in emoji-data.txt to the symbol used for it
;; in this generator. `comment` is accepted but not used here; the East Asian
;; generator's procedure of the same name does inspect it, so the two share a
;; signature.
;; NOTE(review): presumably that shared signature is what make-line-processor
;; expects -- confirm against its definition.
;; There is no catch-all clause: a name not listed below makes `match` raise.
(define (string->property str comment)
(match str
("Emoji" 'emoji)
("Emoji_Presentation" 'emoji-presentation)
("Emoji_Modifier" 'emoji-modifier)
("Emoji_Modifier_Base" 'emoji-modifier-base)
("Emoji_Component" 'emoji-component)
("Extended_Pictographic" 'emoji-extended-pictographic)))
(define file "uniseg/emoji.scm")
@ -95,49 +52,25 @@
(λ ()
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
(for-each process-line (cmdline-wget-or-file url stdout))
(pretty-print
`(define-module (uniseg emoji)
#:use-module (uniseg internal)
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
#:export (,@emoji-symbol-names
#:export (,@emoji-symbols
emoji-charsets)))
(pretty-print
`(define emoji-ht
(alist->hashq-table ',(hash-map->list cons emoji-ht))))
(define-values (process-line output-boilerplate)
(make-line-processor
emoji-ht
string->property
emoji-properties
emoji-symbols
'emoji-charsets
stdout))
(display "\n")
(for-each
(λ (sym)
(pretty-print
`(define ,sym (char-set))))
emoji-symbol-names)
(display "\n")
(pretty-print
`(define emoji-charsets
(list
,@(map
(λ (pair)
(let ((f (first pair))
(s (second pair)))
`(list ',f ,s)))
emoji-sets-and-symbols))))
(display "\n")
(for-each
(λ (set-pair)
(let ((name (first set-pair))
(symbol (second set-pair)))
(pretty-print
`(ranges->charset! emoji-ht ',name ,symbol))))
emoji-sets-and-symbols)
(for-each process-line (cmdline-wget-or-file url stdout))
(output-boilerplate)
(display "Code generation complete.\n" stdout)))

View File

@ -19,18 +19,9 @@
;; Source data for this generator: the Unicode 15.0.0 grapheme break property
;; table.
;; NOTE(review): the East Asian width and emoji generators fetch 13.0.0 data,
;; while this one fetches 15.0.0 -- confirm the version mismatch is intended.
(define url
"https://www.unicode.org/Public/15.0.0/ucd/auxiliary/GraphemeBreakProperty.txt")
;; PEG rule for the property-name field: a run of ASCII letters and
;; underscores. `*` means zero or more, so an empty name also matches.
;; Declared `all`, so the match stays in the parse tree tagged with the rule
;; name (process-line matches on '@grapheme-category).
(define-peg-pattern @grapheme-category all
(* (peg "[a-zA-Z_]")))
;; One datum: a codepoint range, a ";" separator (matched but dropped from the
;; tree by `ignore`), and the property name, with optional whitespace on
;; either side of the separator.
;; NOTE(review): @codepoint-range and @ws are not defined in this view --
;; presumably they come from (uniseg internal); confirm.
(define-peg-pattern @grapheme-datum body
(and @codepoint-range (* @ws) (ignore ";") (* @ws) @grapheme-category))
;; A whole data line: a datum, optional whitespace, then a @comment. The
;; comment is not optional in this rule. @comment is defined elsewhere.
(define-peg-pattern @grapheme-line body
(and @grapheme-datum (* @ws) @comment))
;; Accumulator filled by process-line through cons-hash-list!, keyed by the
;; property symbol. 13 is the initial size hint and equals the number of
;; property names the name-to-symbol mapping in this file recognises.
(define grapheme-ht (make-hash-table 13))
(define grapheme-sets
(define grapheme-properties
'(hangul-syllable-l
hangul-syllable-v
hangul-syllable-lv
@ -44,63 +35,30 @@
spacing-mark
zero-width-joiner))
(define grapheme-symbol-names
(define grapheme-symbols
(map
(λ (set)
(string->symbol
(string-concatenate
(list "char-set:grapheme-"
(symbol->string set)))))
grapheme-sets))
(define grapheme-sets-and-symbols
(zip grapheme-sets grapheme-symbol-names))
;; Parse one line of GraphemeBreakProperty.txt and, if it is a data line,
;; record its codepoint range in grapheme-ht under the symbol for its property.
;; Called once per input line via for-each; the return value is not used.
(define (process-line line)
;; Map the property name as spelled in the data file to the symbol used as
;; the grapheme-ht key. There is no catch-all clause, so an unrecognised name
;; makes `match` raise.
(define (string->category str)
(match str
("L" 'hangul-syllable-l)
("V" 'hangul-syllable-v)
("T" 'hangul-syllable-t)
("LV" 'hangul-syllable-lv)
("LVT" 'hangul-syllable-lvt)
("Prepend" 'prepend)
("CR" 'carriage-return)
("LF" 'line-feed)
("Control" 'control)
("Extend" 'extend)
("Regional_Indicator" 'regional-indicator)
("SpacingMark" 'spacing-mark)
("ZWJ" 'zero-width-joiner)))
(define tree (peg:tree (match-pattern @grapheme-line line)))
;; Do nothing when there is no parse tree, the tree is empty, or the tree
;; begins with the symbol @comment.
(unless (or (not tree)
(null? tree)
(eq? '@comment (car tree)))
;; Expected shape: ((range-node category-node) comment-node).
;; `codepoints` collects the string from every @codepoint child of the range.
;; This outer match sits outside the exception handler below, so a tree of
;; any other shape raises to the caller instead of being reported and skipped.
(match tree
(((('@codepoint-range
('@codepoint codepoints) ...)
('@grapheme-category cat-str))
('@comment comment))
;; Errors raised while converting or storing the range are reported on
;; `stdout` (a port bound elsewhere) and the line is dropped. #:unwind? #t
;; unwinds the stack before the handler runs, so after reporting,
;; process-line simply returns.
(with-exception-handler
(λ (err)
(format stdout "Skipping line due to error :: ")
(format-exception-msg stdout err))
(λ ()
;; f and l are the first and last codepoints of the range; when the line
;; lists a single codepoint they are the same value.
(let ((f (hex-string->integer (first codepoints)))
(l (hex-string->integer (last codepoints)))
(category (string->category cat-str)))
;; Only the two endpoints are tested against the surrogate range.
(when (or (in-surrogate-range f)
(in-surrogate-range l))
(error (format #f "chars in surrogate range ~x -> ~x" f l)))
(cons-hash-list! grapheme-ht category f l)))
#:unwind? #t)))))
grapheme-properties))
;; Map a property name as spelled in GraphemeBreakProperty.txt to the symbol
;; used for it in this generator. `comment` is accepted but not used here; the
;; East Asian generator's procedure of the same name does inspect it, so the
;; two share a signature.
;; NOTE(review): presumably that shared signature is what make-line-processor
;; expects -- confirm against its definition.
;; There is no catch-all clause: a name not listed below makes `match` raise.
(define (string->property str comment)
(match str
("L" 'hangul-syllable-l)
("V" 'hangul-syllable-v)
("T" 'hangul-syllable-t)
("LV" 'hangul-syllable-lv)
("LVT" 'hangul-syllable-lvt)
("Prepend" 'prepend)
("CR" 'carriage-return)
("LF" 'line-feed)
("Control" 'control)
("Extend" 'extend)
("Regional_Indicator" 'regional-indicator)
("SpacingMark" 'spacing-mark)
("ZWJ" 'zero-width-joiner)))
(define file "uniseg/graphemes.scm")
@ -110,53 +68,32 @@
(λ ()
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
(for-each process-line (cmdline-wget-or-file url stdout))
(pretty-print
`(define-module (uniseg graphemes)
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
#:use-module (uniseg internal)
#:use-module (uniseg emoji)
#:export (,@grapheme-symbol-names
#:export (,@grapheme-symbols
grapheme-charsets)))
(define-values (process-line output-boilerplate)
(make-line-processor
grapheme-ht
string->property
grapheme-properties
grapheme-symbols
'grapheme-charsets
stdout))
(for-each process-line (cmdline-wget-or-file url stdout))
(output-boilerplate)
;; Need emoji in the set as well.
(pretty-print
`(define grapheme-ht
(alist->hashq-table ',(hash-map->list cons grapheme-ht))))
(display "\n")
(for-each
(λ (sym)
(pretty-print
`(define ,sym (char-set))))
grapheme-symbol-names)
(display "\n")
(pretty-print
`(define grapheme-charsets
(list
,@(map
(λ (pair)
(let ((f (first pair))
(s (second pair)))
`(list ',f ,s)))
grapheme-sets-and-symbols)
;; Need emoji in this set too!
(list 'extended-pictographic char-set:emoji-extended-pictographic))))
(display "\n")
(for-each
(λ (set-pair)
(let ((name (first set-pair))
(symbol (second set-pair)))
(pretty-print
`(ranges->charset! grapheme-ht ',name ,symbol))))
grapheme-sets-and-symbols)
`(set! grapheme-charsets
(cons (list 'extended-pictographic char-set:emoji-extended-pictographic)
grapheme-charsets)))
(display "Code generation complete.\n" stdout)))

View File

@ -3,11 +3,11 @@
(define-module
(uniseg eastasian)
#:use-module
(uniseg internal)
#:use-module
(ice-9 hash-table)
#:use-module
(srfi srfi-1)
#:use-module
(uniseg internal)
#:export
(char-set:eastasian-combining
char-set:eastasian-doublewidth
@ -16,285 +16,26 @@
char-set:eastasian-neutral
char-set:eastasian-ambiguous
eastasian-charsets))
(define eastasian-ht
(define hashtable
(alist->hashq-table
'((doublewidth
(201547 262141)
(196608 201546)
(195104 196605)
(195102 195103)
(194560 195101)
(191457 194559)
(183984 191456)
(183970 183983)
(178208 183969)
(178206 178207)
(177984 178205)
(177973 177983)
(173824 177972)
(173790 173823)
(131072 173789)
(129744 129750)
(129728 129730)
(129712 129718)
(129680 129704)
(129664 129670)
(129656 129658)
(129648 129652)
(129485 129535)
(129402 129483)
(129351 129400)
(129340 129349)
(129292 129338)
(128992 129003)
(128756 128764)
(128747 128748)
(128725 128727)
(128720 128722)
(128716 128716)
(128640 128709)
(128512 128591)
(128507 128511)
(128420 128420)
(128405 128406)
(128378 128378)
(128336 128359)
(128331 128334)
(128255 128317)
(128066 128252)
(128064 128064)
(128000 128062)
(127995 127999)
(127992 127994)
(127988 127988)
(127968 127984)
(127951 127955)
(127904 127946)
(127870 127891)
(127799 127868)
(127789 127797)
(127744 127776)
(127584 127589)
(127568 127569)
(127552 127560)
(127504 127547)
(127488 127490)
(127377 127386)
(127374 127374)
(127183 127183)
(126980 126980)
(110960 111355)
(110948 110951)
(110928 110930)
(110848 110878)
(110592 110847)
(101632 101640)
(101120 101589)
(100352 101119)
(94208 100343)
(94192 94193)
(94180 94180)
(94179 94179)
(94178 94178)
(94176 94177)
(65509 65510)
(65508 65508)
(65507 65507)
(65506 65506)
(65504 65505)
(65376 65376)
(65375 65375)
(65374 65374)
(65373 65373)
(65372 65372)
(65371 65371)
(65345 65370)
(65344 65344)
(65343 65343)
(65342 65342)
(65341 65341)
(65340 65340)
(65339 65339)
(65313 65338)
(65311 65312)
(65308 65310)
(65306 65307)
(65296 65305)
(65294 65295)
(65293 65293)
(65292 65292)
(65291 65291)
(65290 65290)
(65289 65289)
(65288 65288)
(65285 65287)
(65284 65284)
(65281 65283)
(65130 65131)
(65129 65129)
(65128 65128)
(65124 65126)
(65123 65123)
(65122 65122)
(65119 65121)
(65118 65118)
(65117 65117)
(65116 65116)
(65115 65115)
(65114 65114)
(65113 65113)
(65112 65112)
(65108 65111)
(65104 65106)
(65101 65103)
(65097 65100)
(65096 65096)
(65095 65095)
(65093 65094)
(65092 65092)
(65091 65091)
(65090 65090)
(65089 65089)
(65088 65088)
(65087 65087)
(65086 65086)
(65085 65085)
(65084 65084)
(65083 65083)
(65082 65082)
(65081 65081)
(65080 65080)
(65079 65079)
(65078 65078)
(65077 65077)
(65075 65076)
(65073 65074)
(65072 65072)
(65049 65049)
(65048 65048)
(65047 65047)
(65040 65046)
(64218 64255)
(64112 64217)
(64110 64111)
(63744 64109)
(44032 55203)
(43360 43388)
(42128 42182)
(40982 42124)
(40981 40981)
(40960 40980)
(40957 40959)
(19968 40956)
(13312 19903)
(13056 13311)
(12992 13055)
(12977 12991)
(12938 12976)
(12928 12937)
(12896 12927)
(12881 12895)
(12880 12880)
(12842 12871)
(12832 12841)
(12800 12830)
(12784 12799)
(12736 12771)
(12704 12735)
(12694 12703)
(12690 12693)
(12688 12689)
(12593 12686)
(12549 12591)
(12543 12543)
(12540 12542)
(12539 12539)
(12449 12538)
(12448 12448)
(12447 12447)
(12445 12446)
(12443 12444)
(12353 12438)
(12350 12350)
(12349 12349)
(12348 12348)
(12347 12347)
(12344 12346)
(12342 12343)
(12337 12341)
(12336 12336)
(12334 12335)
(12330 12333)
(12321 12329)
(12320 12320)
(12318 12319)
(12317 12317)
(12316 12316)
(12315 12315)
(12314 12314)
(12313 12313)
(12312 12312)
(12311 12311)
(12310 12310)
(12309 12309)
(12308 12308)
(12306 12307)
(12305 12305)
(12304 12304)
(12303 12303)
(12302 12302)
(12301 12301)
(12300 12300)
(12299 12299)
(12298 12298)
(12297 12297)
(12296 12296)
(12295 12295)
(12294 12294)
(12293 12293)
(12292 12292)
(12289 12291)
(12288 12288)
(12272 12283)
(12032 12245)
(11931 12019)
(11904 11929)
(11093 11093)
(11088 11088)
(11035 11036)
(10175 10175)
(10160 10160)
(10133 10135)
(10071 10071)
(10067 10069)
(10062 10062)
(10060 10060)
(10024 10024)
(9994 9995)
(9989 9989)
(9981 9981)
(9978 9978)
(9973 9973)
(9970 9971)
(9962 9962)
(9940 9940)
(9934 9934)
(9924 9925)
(9917 9918)
(9898 9899)
(9889 9889)
(9875 9875)
(9855 9855)
(9800 9811)
(9748 9749)
(9725 9726)
(9203 9203)
(9200 9200)
(9193 9196)
(9002 9002)
(9001 9001)
(8986 8987)
(4352 4447))
'((halfwidth
(65517 65518)
(65513 65516)
(65512 65512)
(65498 65500)
(65490 65495)
(65482 65487)
(65474 65479)
(65440 65470)
(65438 65439)
(65393 65437)
(65392 65392)
(65382 65391)
(65380 65381)
(65379 65379)
(65378 65378)
(65377 65377)
(8361 8361))
(ambiguous
(1048576 1114109)
(983040 1048573)
@ -493,120 +234,6 @@
(167 167)
(164 164)
(161 161))
(combining
(125136 125142)
(122918 122922)
(122915 122916)
(122907 122913)
(122888 122904)
(122880 122886)
(119362 119364)
(119210 119213)
(119173 119179)
(119163 119170)
(119149 119154)
(119143 119145)
(119141 119142)
(92912 92916)
(70512 70516)
(70502 70508)
(70459 70460)
(70400 70401)
(69446 69456)
(69291 69292)
(66422 66426)
(66045 66045)
(65056 65071)
(43232 43249)
(42736 42737)
(42654 42655)
(42612 42621)
(42608 42610)
(42607 42607)
(12441 12442)
(11744 11775)
(11503 11505)
(8421 8432)
(8418 8420)
(8417 8417)
(8413 8416)
(8400 8412)
(7675 7679)
(7616 7673)
(7019 7027)
(6847 6848)
(6846 6846)
(6832 6845)
(6783 6783)
(4957 4959)
(3328 3329)
(3076 3076)
(3072 3072)
(2027 2035)
(1160 1161)
(1155 1159)
(768 879))
(halfwidth
(65517 65518)
(65513 65516)
(65512 65512)
(65498 65500)
(65490 65495)
(65482 65487)
(65474 65479)
(65440 65470)
(65438 65439)
(65393 65437)
(65392 65392)
(65382 65391)
(65380 65381)
(65379 65379)
(65378 65378)
(65377 65377)
(8361 8361))
(narrow
(10630 10630)
(10629 10629)
(10221 10221)
(10220 10220)
(10219 10219)
(10218 10218)
(10217 10217)
(10216 10216)
(10215 10215)
(10214 10214)
(175 175)
(172 172)
(166 166)
(165 165)
(162 163)
(126 126)
(125 125)
(124 124)
(123 123)
(97 122)
(96 96)
(95 95)
(94 94)
(93 93)
(92 92)
(91 91)
(65 90)
(63 64)
(60 62)
(58 59)
(48 57)
(46 47)
(45 45)
(44 44)
(43 43)
(42 42)
(41 41)
(40 40)
(37 39)
(36 36)
(33 35)
(32 32))
(neutral
(917536 917631)
(917505 917505)
@ -2498,7 +2125,380 @@
(160 160)
(128 159)
(127 127)
(0 31)))))
(0 31))
(doublewidth
(201547 262141)
(196608 201546)
(195104 196605)
(195102 195103)
(194560 195101)
(191457 194559)
(183984 191456)
(183970 183983)
(178208 183969)
(178206 178207)
(177984 178205)
(177973 177983)
(173824 177972)
(173790 173823)
(131072 173789)
(129744 129750)
(129728 129730)
(129712 129718)
(129680 129704)
(129664 129670)
(129656 129658)
(129648 129652)
(129485 129535)
(129402 129483)
(129351 129400)
(129340 129349)
(129292 129338)
(128992 129003)
(128756 128764)
(128747 128748)
(128725 128727)
(128720 128722)
(128716 128716)
(128640 128709)
(128512 128591)
(128507 128511)
(128420 128420)
(128405 128406)
(128378 128378)
(128336 128359)
(128331 128334)
(128255 128317)
(128066 128252)
(128064 128064)
(128000 128062)
(127995 127999)
(127992 127994)
(127988 127988)
(127968 127984)
(127951 127955)
(127904 127946)
(127870 127891)
(127799 127868)
(127789 127797)
(127744 127776)
(127584 127589)
(127568 127569)
(127552 127560)
(127504 127547)
(127488 127490)
(127377 127386)
(127374 127374)
(127183 127183)
(126980 126980)
(110960 111355)
(110948 110951)
(110928 110930)
(110848 110878)
(110592 110847)
(101632 101640)
(101120 101589)
(100352 101119)
(94208 100343)
(94192 94193)
(94180 94180)
(94179 94179)
(94178 94178)
(94176 94177)
(65509 65510)
(65508 65508)
(65507 65507)
(65506 65506)
(65504 65505)
(65376 65376)
(65375 65375)
(65374 65374)
(65373 65373)
(65372 65372)
(65371 65371)
(65345 65370)
(65344 65344)
(65343 65343)
(65342 65342)
(65341 65341)
(65340 65340)
(65339 65339)
(65313 65338)
(65311 65312)
(65308 65310)
(65306 65307)
(65296 65305)
(65294 65295)
(65293 65293)
(65292 65292)
(65291 65291)
(65290 65290)
(65289 65289)
(65288 65288)
(65285 65287)
(65284 65284)
(65281 65283)
(65130 65131)
(65129 65129)
(65128 65128)
(65124 65126)
(65123 65123)
(65122 65122)
(65119 65121)
(65118 65118)
(65117 65117)
(65116 65116)
(65115 65115)
(65114 65114)
(65113 65113)
(65112 65112)
(65108 65111)
(65104 65106)
(65101 65103)
(65097 65100)
(65096 65096)
(65095 65095)
(65093 65094)
(65092 65092)
(65091 65091)
(65090 65090)
(65089 65089)
(65088 65088)
(65087 65087)
(65086 65086)
(65085 65085)
(65084 65084)
(65083 65083)
(65082 65082)
(65081 65081)
(65080 65080)
(65079 65079)
(65078 65078)
(65077 65077)
(65075 65076)
(65073 65074)
(65072 65072)
(65049 65049)
(65048 65048)
(65047 65047)
(65040 65046)
(64218 64255)
(64112 64217)
(64110 64111)
(63744 64109)
(44032 55203)
(43360 43388)
(42128 42182)
(40982 42124)
(40981 40981)
(40960 40980)
(40957 40959)
(19968 40956)
(13312 19903)
(13056 13311)
(12992 13055)
(12977 12991)
(12938 12976)
(12928 12937)
(12896 12927)
(12881 12895)
(12880 12880)
(12842 12871)
(12832 12841)
(12800 12830)
(12784 12799)
(12736 12771)
(12704 12735)
(12694 12703)
(12690 12693)
(12688 12689)
(12593 12686)
(12549 12591)
(12543 12543)
(12540 12542)
(12539 12539)
(12449 12538)
(12448 12448)
(12447 12447)
(12445 12446)
(12443 12444)
(12353 12438)
(12350 12350)
(12349 12349)
(12348 12348)
(12347 12347)
(12344 12346)
(12342 12343)
(12337 12341)
(12336 12336)
(12334 12335)
(12330 12333)
(12321 12329)
(12320 12320)
(12318 12319)
(12317 12317)
(12316 12316)
(12315 12315)
(12314 12314)
(12313 12313)
(12312 12312)
(12311 12311)
(12310 12310)
(12309 12309)
(12308 12308)
(12306 12307)
(12305 12305)
(12304 12304)
(12303 12303)
(12302 12302)
(12301 12301)
(12300 12300)
(12299 12299)
(12298 12298)
(12297 12297)
(12296 12296)
(12295 12295)
(12294 12294)
(12293 12293)
(12292 12292)
(12289 12291)
(12288 12288)
(12272 12283)
(12032 12245)
(11931 12019)
(11904 11929)
(11093 11093)
(11088 11088)
(11035 11036)
(10175 10175)
(10160 10160)
(10133 10135)
(10071 10071)
(10067 10069)
(10062 10062)
(10060 10060)
(10024 10024)
(9994 9995)
(9989 9989)
(9981 9981)
(9978 9978)
(9973 9973)
(9970 9971)
(9962 9962)
(9940 9940)
(9934 9934)
(9924 9925)
(9917 9918)
(9898 9899)
(9889 9889)
(9875 9875)
(9855 9855)
(9800 9811)
(9748 9749)
(9725 9726)
(9203 9203)
(9200 9200)
(9193 9196)
(9002 9002)
(9001 9001)
(8986 8987)
(4352 4447))
(narrow
(10630 10630)
(10629 10629)
(10221 10221)
(10220 10220)
(10219 10219)
(10218 10218)
(10217 10217)
(10216 10216)
(10215 10215)
(10214 10214)
(175 175)
(172 172)
(166 166)
(165 165)
(162 163)
(126 126)
(125 125)
(124 124)
(123 123)
(97 122)
(96 96)
(95 95)
(94 94)
(93 93)
(92 92)
(91 91)
(65 90)
(63 64)
(60 62)
(58 59)
(48 57)
(46 47)
(45 45)
(44 44)
(43 43)
(42 42)
(41 41)
(40 40)
(37 39)
(36 36)
(33 35)
(32 32))
(combining
(125136 125142)
(122918 122922)
(122915 122916)
(122907 122913)
(122888 122904)
(122880 122886)
(119362 119364)
(119210 119213)
(119173 119179)
(119163 119170)
(119149 119154)
(119143 119145)
(119141 119142)
(92912 92916)
(70512 70516)
(70502 70508)
(70459 70460)
(70400 70401)
(69446 69456)
(69291 69292)
(66422 66426)
(66045 66045)
(65056 65071)
(43232 43249)
(42736 42737)
(42654 42655)
(42612 42621)
(42608 42610)
(42607 42607)
(12441 12442)
(11744 11775)
(11503 11505)
(8421 8432)
(8418 8420)
(8417 8417)
(8413 8416)
(8400 8412)
(7675 7679)
(7616 7673)
(7019 7027)
(6847 6848)
(6846 6846)
(6832 6845)
(6783 6783)
(4957 4959)
(3328 3329)
(3076 3076)
(3072 3072)
(2027 2035)
(1160 1161)
(1155 1159)
(768 879)))))
(define char-set:eastasian-combining (char-set))
(define char-set:eastasian-doublewidth (char-set))
@ -2515,9 +2515,10 @@
(list 'neutral char-set:eastasian-neutral)
(list 'ambiguous char-set:eastasian-ambiguous)))
(ranges->charset! eastasian-ht 'combining char-set:eastasian-combining)
(ranges->charset! eastasian-ht 'doublewidth char-set:eastasian-doublewidth)
(ranges->charset! eastasian-ht 'halfwidth char-set:eastasian-halfwidth)
(ranges->charset! eastasian-ht 'narrow char-set:eastasian-narrow)
(ranges->charset! eastasian-ht 'neutral char-set:eastasian-neutral)
(ranges->charset! eastasian-ht 'ambiguous char-set:eastasian-ambiguous)
(ranges->charset! hashtable 'combining char-set:eastasian-combining)
(ranges->charset! hashtable 'doublewidth char-set:eastasian-doublewidth)
(ranges->charset! hashtable 'halfwidth char-set:eastasian-halfwidth)
(ranges->charset! hashtable 'narrow char-set:eastasian-narrow)
(ranges->charset! hashtable 'neutral char-set:eastasian-neutral)
(ranges->charset! hashtable 'ambiguous char-set:eastasian-ambiguous)

View File

@ -16,7 +16,7 @@
char-set:emoji-component
char-set:emoji-extended-pictographic
emoji-charsets))
(define emoji-ht
(define hashtable
(alist->hashq-table
'((emoji-presentation
(129744 129750)
@ -280,6 +280,437 @@
(9200 9200)
(9193 9196)
(8986 8987))
(emoji-modifier-base
(129489 129501)
(129485 129487)
(129467 129467)
(129464 129465)
(129461 129462)
(129399 129399)
(129340 129342)
(129331 129337)
(129329 129330)
(129328 129328)
(129318 129318)
(129311 129311)
(129305 129310)
(129304 129304)
(129295 129295)
(129292 129292)
(128716 128716)
(128704 128704)
(128694 128694)
(128692 128693)
(128675 128675)
(128587 128591)
(128581 128583)
(128405 128406)
(128400 128400)
(128378 128378)
(128372 128373)
(128170 128170)
(128145 128145)
(128143 128143)
(128133 128135)
(128129 128131)
(128124 128124)
(128110 128120)
(128108 128109)
(128102 128107)
(128070 128080)
(128066 128067)
(127947 127948)
(127946 127946)
(127943 127943)
(127938 127940)
(127877 127877)
(9997 9997)
(9994 9996)
(9977 9977)
(9757 9757))
(emoji (129744 129750)
(129728 129730)
(129712 129718)
(129686 129704)
(129680 129685)
(129667 129670)
(129664 129666)
(129656 129658)
(129652 129652)
(129648 129651)
(129511 129535)
(129488 129510)
(129485 129487)
(129483 129483)
(129475 129482)
(129473 129474)
(129472 129472)
(129466 129471)
(129456 129465)
(129454 129455)
(129451 129453)
(129445 129450)
(129443 129444)
(129432 129442)
(129426 129431)
(129413 129425)
(129408 129412)
(129404 129407)
(129403 129403)
(129402 129402)
(129399 129400)
(129395 129398)
(129394 129394)
(129393 129393)
(129388 129392)
(129375 129387)
(129360 129374)
(129357 129359)
(129356 129356)
(129351 129355)
(129344 129349)
(129343 129343)
(129340 129342)
(129331 129338)
(129329 129330)
(129328 129328)
(129320 129327)
(129312 129319)
(129311 129311)
(129305 129310)
(129296 129304)
(129293 129295)
(129292 129292)
(128992 129003)
(128763 128764)
(128762 128762)
(128761 128761)
(128759 128760)
(128756 128758)
(128755 128755)
(128752 128752)
(128747 128748)
(128745 128745)
(128736 128741)
(128726 128727)
(128725 128725)
(128721 128722)
(128720 128720)
(128717 128719)
(128716 128716)
(128715 128715)
(128705 128709)
(128704 128704)
(128703 128703)
(128697 128702)
(128695 128696)
(128694 128694)
(128691 128693)
(128690 128690)
(128686 128689)
(128679 128685)
(128678 128678)
(128676 128677)
(128675 128675)
(128674 128674)
(128667 128673)
(128665 128666)
(128664 128664)
(128663 128663)
(128662 128662)
(128661 128661)
(128660 128660)
(128657 128659)
(128656 128656)
(128655 128655)
(128654 128654)
(128653 128653)
(128652 128652)
(128650 128651)
(128649 128649)
(128648 128648)
(128647 128647)
(128646 128646)
(128643 128645)
(128641 128642)
(128640 128640)
(128581 128591)
(128577 128580)
(128567 128576)
(128566 128566)
(128565 128565)
(128564 128564)
(128560 128563)
(128558 128559)
(128557 128557)
(128556 128556)
(128552 128555)
(128550 128551)
(128544 128549)
(128543 128543)
(128540 128542)
(128539 128539)
(128538 128538)
(128537 128537)
(128536 128536)
(128535 128535)
(128534 128534)
(128533 128533)
(128530 128532)
(128529 128529)
(128528 128528)
(128527 128527)
(128526 128526)
(128521 128525)
(128519 128520)
(128513 128518)
(128512 128512)
(128507 128511)
(128506 128506)
(128499 128499)
(128495 128495)
(128488 128488)
(128483 128483)
(128481 128481)
(128476 128478)
(128465 128467)
(128450 128452)
(128444 128444)
(128433 128434)
(128424 128424)
(128421 128421)
(128420 128420)
(128405 128406)
(128400 128400)
(128394 128397)
(128391 128391)
(128378 128378)
(128371 128377)
(128367 128368)
(128348 128359)
(128336 128347)
(128331 128334)
(128329 128330)
(128302 128317)
(128300 128301)
(128278 128299)
(128277 128277)
(128266 128276)
(128265 128265)
(128264 128264)
(128260 128263)
(128259 128259)
(128255 128258)
(128253 128253)
(128249 128252)
(128248 128248)
(128246 128247)
(128245 128245)
(128240 128244)
(128239 128239)
(128238 128238)
(128236 128237)
(128184 128235)
(128182 128183)
(128174 128181)
(128173 128173)
(128110 128172)
(128108 128109)
(128102 128107)
(128101 128101)
(128066 128100)
(128065 128065)
(128064 128064)
(128063 128063)
(128043 128062)
(128042 128042)
(128023 128041)
(128022 128022)
(128021 128021)
(128020 128020)
(128019 128019)
(128017 128018)
(128015 128016)
(128012 128014)
(128009 128011)
(128008 128008)
(127992 128007)
(127991 127991)
(127989 127989)
(127988 127988)
(127987 127987)
(127973 127984)
(127972 127972)
(127968 127971)
(127956 127967)
(127951 127955)
(127947 127950)
(127946 127946)
(127945 127945)
(127944 127944)
(127943 127943)
(127942 127942)
(127941 127941)
(127904 127940)
(127902 127903)
(127897 127899)
(127894 127895)
(127872 127891)
(127870 127871)
(127869 127869)
(127868 127868)
(127825 127867)
(127824 127824)
(127820 127823)
(127819 127819)
(127799 127818)
(127798 127798)
(127796 127797)
(127794 127795)
(127792 127793)
(127789 127791)
(127780 127788)
(127777 127777)
(127775 127776)
(127773 127774)
(127772 127772)
(127771 127771)
(127770 127770)
(127769 127769)
(127766 127768)
(127763 127765)
(127762 127762)
(127761 127761)
(127760 127760)
(127759 127759)
(127757 127758)
(127744 127756)
(127568 127569)
(127538 127546)
(127535 127535)
(127514 127514)
(127489 127490)
(127462 127487)
(127377 127386)
(127374 127374)
(127358 127359)
(127344 127345)
(127183 127183)
(126980 126980)
(12953 12953)
(12951 12951)
(12349 12349)
(12336 12336)
(11093 11093)
(11088 11088)
(11035 11036)
(11013 11015)
(10548 10549)
(10175 10175)
(10160 10160)
(10145 10145)
(10133 10135)
(10084 10084)
(10083 10083)
(10071 10071)
(10067 10069)
(10062 10062)
(10060 10060)
(10055 10055)
(10052 10052)
(10035 10036)
(10024 10024)
(10017 10017)
(10013 10013)
(10006 10006)
(10004 10004)
(10002 10002)
(9999 9999)
(9997 9997)
(9992 9996)
(9989 9989)
(9986 9986)
(9981 9981)
(9978 9978)
(9975 9977)
(9973 9973)
(9972 9972)
(9970 9971)
(9968 9969)
(9962 9962)
(9961 9961)
(9940 9940)
(9939 9939)
(9937 9937)
(9935 9935)
(9934 9934)
(9928 9928)
(9924 9925)
(9917 9918)
(9904 9905)
(9898 9899)
(9895 9895)
(9888 9889)
(9883 9884)
(9881 9881)
(9878 9879)
(9877 9877)
(9876 9876)
(9875 9875)
(9874 9874)
(9855 9855)
(9854 9854)
(9851 9851)
(9832 9832)
(9829 9830)
(9827 9827)
(9824 9824)
(9823 9823)
(9800 9811)
(9794 9794)
(9792 9792)
(9786 9786)
(9784 9785)
(9775 9775)
(9774 9774)
(9770 9770)
(9766 9766)
(9762 9763)
(9760 9760)
(9757 9757)
(9752 9752)
(9748 9749)
(9745 9745)
(9742 9742)
(9732 9732)
(9730 9731)
(9728 9729)
(9723 9726)
(9664 9664)
(9654 9654)
(9642 9643)
(9410 9410)
(9208 9210)
(9203 9203)
(9201 9202)
(9200 9200)
(9199 9199)
(9197 9198)
(9193 9196)
(9167 9167)
(9000 9000)
(8986 8987)
(8617 8618)
(8596 8601)
(8505 8505)
(8482 8482)
(8265 8265)
(8252 8252)
(174 174)
(169 169)
(48 57)
(42 42)
(35 35))
(emoji-extended-pictographic
(130048 131069)
(129751 129791)
@ -772,438 +1203,6 @@
(8252 8252)
(174 174)
(169 169))
(emoji-modifier-base
(129489 129501)
(129485 129487)
(129467 129467)
(129464 129465)
(129461 129462)
(129399 129399)
(129340 129342)
(129331 129337)
(129329 129330)
(129328 129328)
(129318 129318)
(129311 129311)
(129305 129310)
(129304 129304)
(129295 129295)
(129292 129292)
(128716 128716)
(128704 128704)
(128694 128694)
(128692 128693)
(128675 128675)
(128587 128591)
(128581 128583)
(128405 128406)
(128400 128400)
(128378 128378)
(128372 128373)
(128170 128170)
(128145 128145)
(128143 128143)
(128133 128135)
(128129 128131)
(128124 128124)
(128110 128120)
(128108 128109)
(128102 128107)
(128070 128080)
(128066 128067)
(127947 127948)
(127946 127946)
(127943 127943)
(127938 127940)
(127877 127877)
(9997 9997)
(9994 9996)
(9977 9977)
(9757 9757))
(emoji-modifier (127995 127999))
(emoji (129744 129750)
(129728 129730)
(129712 129718)
(129686 129704)
(129680 129685)
(129667 129670)
(129664 129666)
(129656 129658)
(129652 129652)
(129648 129651)
(129511 129535)
(129488 129510)
(129485 129487)
(129483 129483)
(129475 129482)
(129473 129474)
(129472 129472)
(129466 129471)
(129456 129465)
(129454 129455)
(129451 129453)
(129445 129450)
(129443 129444)
(129432 129442)
(129426 129431)
(129413 129425)
(129408 129412)
(129404 129407)
(129403 129403)
(129402 129402)
(129399 129400)
(129395 129398)
(129394 129394)
(129393 129393)
(129388 129392)
(129375 129387)
(129360 129374)
(129357 129359)
(129356 129356)
(129351 129355)
(129344 129349)
(129343 129343)
(129340 129342)
(129331 129338)
(129329 129330)
(129328 129328)
(129320 129327)
(129312 129319)
(129311 129311)
(129305 129310)
(129296 129304)
(129293 129295)
(129292 129292)
(128992 129003)
(128763 128764)
(128762 128762)
(128761 128761)
(128759 128760)
(128756 128758)
(128755 128755)
(128752 128752)
(128747 128748)
(128745 128745)
(128736 128741)
(128726 128727)
(128725 128725)
(128721 128722)
(128720 128720)
(128717 128719)
(128716 128716)
(128715 128715)
(128705 128709)
(128704 128704)
(128703 128703)
(128697 128702)
(128695 128696)
(128694 128694)
(128691 128693)
(128690 128690)
(128686 128689)
(128679 128685)
(128678 128678)
(128676 128677)
(128675 128675)
(128674 128674)
(128667 128673)
(128665 128666)
(128664 128664)
(128663 128663)
(128662 128662)
(128661 128661)
(128660 128660)
(128657 128659)
(128656 128656)
(128655 128655)
(128654 128654)
(128653 128653)
(128652 128652)
(128650 128651)
(128649 128649)
(128648 128648)
(128647 128647)
(128646 128646)
(128643 128645)
(128641 128642)
(128640 128640)
(128581 128591)
(128577 128580)
(128567 128576)
(128566 128566)
(128565 128565)
(128564 128564)
(128560 128563)
(128558 128559)
(128557 128557)
(128556 128556)
(128552 128555)
(128550 128551)
(128544 128549)
(128543 128543)
(128540 128542)
(128539 128539)
(128538 128538)
(128537 128537)
(128536 128536)
(128535 128535)
(128534 128534)
(128533 128533)
(128530 128532)
(128529 128529)
(128528 128528)
(128527 128527)
(128526 128526)
(128521 128525)
(128519 128520)
(128513 128518)
(128512 128512)
(128507 128511)
(128506 128506)
(128499 128499)
(128495 128495)
(128488 128488)
(128483 128483)
(128481 128481)
(128476 128478)
(128465 128467)
(128450 128452)
(128444 128444)
(128433 128434)
(128424 128424)
(128421 128421)
(128420 128420)
(128405 128406)
(128400 128400)
(128394 128397)
(128391 128391)
(128378 128378)
(128371 128377)
(128367 128368)
(128348 128359)
(128336 128347)
(128331 128334)
(128329 128330)
(128302 128317)
(128300 128301)
(128278 128299)
(128277 128277)
(128266 128276)
(128265 128265)
(128264 128264)
(128260 128263)
(128259 128259)
(128255 128258)
(128253 128253)
(128249 128252)
(128248 128248)
(128246 128247)
(128245 128245)
(128240 128244)
(128239 128239)
(128238 128238)
(128236 128237)
(128184 128235)
(128182 128183)
(128174 128181)
(128173 128173)
(128110 128172)
(128108 128109)
(128102 128107)
(128101 128101)
(128066 128100)
(128065 128065)
(128064 128064)
(128063 128063)
(128043 128062)
(128042 128042)
(128023 128041)
(128022 128022)
(128021 128021)
(128020 128020)
(128019 128019)
(128017 128018)
(128015 128016)
(128012 128014)
(128009 128011)
(128008 128008)
(127992 128007)
(127991 127991)
(127989 127989)
(127988 127988)
(127987 127987)
(127973 127984)
(127972 127972)
(127968 127971)
(127956 127967)
(127951 127955)
(127947 127950)
(127946 127946)
(127945 127945)
(127944 127944)
(127943 127943)
(127942 127942)
(127941 127941)
(127904 127940)
(127902 127903)
(127897 127899)
(127894 127895)
(127872 127891)
(127870 127871)
(127869 127869)
(127868 127868)
(127825 127867)
(127824 127824)
(127820 127823)
(127819 127819)
(127799 127818)
(127798 127798)
(127796 127797)
(127794 127795)
(127792 127793)
(127789 127791)
(127780 127788)
(127777 127777)
(127775 127776)
(127773 127774)
(127772 127772)
(127771 127771)
(127770 127770)
(127769 127769)
(127766 127768)
(127763 127765)
(127762 127762)
(127761 127761)
(127760 127760)
(127759 127759)
(127757 127758)
(127744 127756)
(127568 127569)
(127538 127546)
(127535 127535)
(127514 127514)
(127489 127490)
(127462 127487)
(127377 127386)
(127374 127374)
(127358 127359)
(127344 127345)
(127183 127183)
(126980 126980)
(12953 12953)
(12951 12951)
(12349 12349)
(12336 12336)
(11093 11093)
(11088 11088)
(11035 11036)
(11013 11015)
(10548 10549)
(10175 10175)
(10160 10160)
(10145 10145)
(10133 10135)
(10084 10084)
(10083 10083)
(10071 10071)
(10067 10069)
(10062 10062)
(10060 10060)
(10055 10055)
(10052 10052)
(10035 10036)
(10024 10024)
(10017 10017)
(10013 10013)
(10006 10006)
(10004 10004)
(10002 10002)
(9999 9999)
(9997 9997)
(9992 9996)
(9989 9989)
(9986 9986)
(9981 9981)
(9978 9978)
(9975 9977)
(9973 9973)
(9972 9972)
(9970 9971)
(9968 9969)
(9962 9962)
(9961 9961)
(9940 9940)
(9939 9939)
(9937 9937)
(9935 9935)
(9934 9934)
(9928 9928)
(9924 9925)
(9917 9918)
(9904 9905)
(9898 9899)
(9895 9895)
(9888 9889)
(9883 9884)
(9881 9881)
(9878 9879)
(9877 9877)
(9876 9876)
(9875 9875)
(9874 9874)
(9855 9855)
(9854 9854)
(9851 9851)
(9832 9832)
(9829 9830)
(9827 9827)
(9824 9824)
(9823 9823)
(9800 9811)
(9794 9794)
(9792 9792)
(9786 9786)
(9784 9785)
(9775 9775)
(9774 9774)
(9770 9770)
(9766 9766)
(9762 9763)
(9760 9760)
(9757 9757)
(9752 9752)
(9748 9749)
(9745 9745)
(9742 9742)
(9732 9732)
(9730 9731)
(9728 9729)
(9723 9726)
(9664 9664)
(9654 9654)
(9642 9643)
(9410 9410)
(9208 9210)
(9203 9203)
(9201 9202)
(9200 9200)
(9199 9199)
(9197 9198)
(9193 9196)
(9167 9167)
(9000 9000)
(8986 8987)
(8617 8618)
(8596 8601)
(8505 8505)
(8482 8482)
(8265 8265)
(8252 8252)
(174 174)
(169 169)
(48 57)
(42 42)
(35 35))
(emoji-component
(917536 917631)
(129456 129459)
@ -1214,7 +1213,8 @@
(8205 8205)
(48 57)
(42 42)
(35 35)))))
(35 35))
(emoji-modifier (127995 127999)))))
;; Each emoji char-set starts out empty.
;; NOTE(review): presumably filled in by the ranges->charset! calls
;; further down in this file -- confirm.
(define char-set:emoji (char-set))
(define char-set:emoji-presentation (char-set))
@ -1232,12 +1232,13 @@
(list 'emoji-extended-pictographic
char-set:emoji-extended-pictographic)))
(ranges->charset! emoji-ht 'emoji char-set:emoji)
(ranges->charset! emoji-ht 'emoji-presentation char-set:emoji-presentation)
(ranges->charset! emoji-ht 'emoji-modifier char-set:emoji-modifier)
(ranges->charset! emoji-ht 'emoji-modifier-base char-set:emoji-modifier-base)
(ranges->charset! emoji-ht 'emoji-component char-set:emoji-component)
(ranges->charset! hashtable 'emoji char-set:emoji)
(ranges->charset! hashtable 'emoji-presentation char-set:emoji-presentation)
(ranges->charset! hashtable 'emoji-modifier char-set:emoji-modifier)
(ranges->charset! hashtable 'emoji-modifier-base char-set:emoji-modifier-base)
(ranges->charset! hashtable 'emoji-component char-set:emoji-component)
(ranges->charset!
emoji-ht
hashtable
'emoji-extended-pictographic
char-set:emoji-extended-pictographic)

File diff suppressed because it is too large Load Diff

View File

@ -125,6 +125,7 @@
;; Grapheme boundary #11n (emoji!)
((_ 'extended-pictographic)
(pk "EXTENDED")
(values 'extended-pictographic #t))
;; Grapheme boundaries #12n and #13n
@ -148,7 +149,7 @@
state)
(begin
(let* ((width property (char-width glyph))
(next-state boundary? (state-machine state property)))
(next-state boundary? (cpk-values glyph state property '= (state-machine state property))))
(if boundary?
state

View File

@ -1,19 +1,18 @@
(define-module (uniseg internal)
#:use-module (ice-9 peg)
#:use-module (ice-9 match)
#:use-module (ice-9 textual-ports)
#:use-module (ice-9 exceptions)
#:use-module (ice-9 hash-table)
#:use-module (ice-9 i18n)
#:use-module (ice-9 format)
#:use-module (ice-9 pretty-print)
#:use-module (web uri)
#:use-module (web client)
#:use-module (web request)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-71)
#:export (@hex
@codepoint
@codepoint-range
@comment
@ws
#:export (@line
cpk-values
cpk
cons-hash-list!
@ -21,7 +20,8 @@
format-exception-msg
in-surrogate-range
cmdline-wget-or-file
ranges->charset!))
ranges->charset!
make-line-processor))
;;
;; Common PEG patterns
@ -42,9 +42,99 @@
;; A single blank: one space or one tab.  Capture type `none` means the
;; matched text is left out of the parse tree.
(define-peg-pattern @ws none
(or " " "\t"))
;; A property name: a run of ASCII letters and underscores.  Capture
;; type `all` keeps both the pattern name and the matched text, so it
;; appears in the tree as (@property "...").
;; NOTE(review): `*` also accepts an empty name; `+` may have been
;; intended -- confirm against the data files before changing.
(define-peg-pattern @property all
(* (peg "[a-zA-Z_]")))
;; One data entry: a codepoint range, a ";" separator (dropped via
;; `ignore`) with optional blanks on either side, then the property.
;; Capture type `body` keeps the sub-matches without adding a @datum tag.
(define-peg-pattern @datum body
(and @codepoint-range (* @ws) (ignore ";") (* @ws) @property))
;; A whole data line: a datum, optional blanks, then a trailing comment.
(define-peg-pattern @line body
(and @datum (* @ws) @comment))
;; Build the two procedures shared by the Unicode code generators.
;;
;; Arguments:
;;   hashtable        - hash table that accumulates the parsed ranges;
;;                      updated through cons-hash-list!
;;   string->property - procedure (prop-str comment) -> property key,
;;                      mapping the raw property text and the line's
;;                      trailing comment to the key used in HASHTABLE
;;   properties       - list of property keys
;;   symbols          - list of char-set variable names, parallel to
;;                      PROPERTIES (the two lists are zipped together)
;;   charsets-symbol  - name of the generated variable that lists every
;;                      (property char-set) pair
;;   stdout           - port that receives "Skipping line" diagnostics
;;
;; Returns two values:
;;   process-line       - (process-line line) parses one data line and
;;                        records its range in HASHTABLE
;;   output-boilerplate - thunk that pretty-prints the generated
;;                        definitions to the current output port
(define (make-line-processor
         hashtable
         string->property
         properties
         symbols
         charsets-symbol
         stdout)

  ;; ((property symbol) ...) -- one entry per generated char-set.
  (define properties-and-symbols
    (zip properties symbols))

  ;; Parse LINE with the @line grammar.  Lines that do not parse, parse
  ;; to an empty tree, or parse to a bare comment are ignored.
  (define (process-line line)
    (define tree (peg:tree (match-pattern @line line)))
    (unless (or (not tree)
                (null? tree)
                ;; PEG nonterminals in this module are @-prefixed, so a
                ;; comment node is tagged '@comment (same tag as in the
                ;; match clause below), not 'comment.
                (eq? '@comment (car tree)))
      (match tree
        (((('@codepoint-range
            ('@codepoint codepoints) ___)
           ('@property prop-str))
          ('@comment comment))
         ;; Any error raised while handling this line is reported on
         ;; STDOUT and the line is skipped; #:unwind? #t makes the
         ;; handler's return unwind out of the failing thunk.
         (with-exception-handler
             (λ (err)
               (format stdout "Skipping line due to error :: ")
               (format-exception-msg stdout err))
           (λ ()
             ;; A range is "first..last"; a single codepoint yields a
             ;; one-element list, so first and last coincide.
             (let ((f (hex-string->integer (first codepoints)))
                   (l (hex-string->integer (last codepoints)))
                   (property (string->property prop-str comment)))
               (when (or (in-surrogate-range f)
                         (in-surrogate-range l))
                 (error (format #f "chars in surrogate range ~x -> ~x" f l)))
               (cons-hash-list! hashtable property f l)))
           #:unwind? #t)))))

  ;; Emit, in order: the hash table literal, one empty char-set
  ;; definition per symbol, the list pairing properties with their
  ;; char-sets, and one ranges->charset! call per property.
  (define (output-boilerplate)
    (pretty-print
     `(define hashtable
        (alist->hashq-table ',(hash-map->list cons hashtable))))
    (display "\n")
    (for-each
     (λ (sym)
       (pretty-print
        `(define ,sym (char-set))))
     symbols)
    (display "\n")
    (pretty-print
     `(define ,charsets-symbol
        (list
         ,@(map
            (λ (pair)
              (let ((f (first pair))
                    (s (second pair)))
                `(list ',f ,s)))
            properties-and-symbols))))
    (display "\n")
    (for-each
     (λ (set-pair)
       (let ((name (first set-pair))
             (symbol (second set-pair)))
         (pretty-print
          `(ranges->charset! hashtable ',name ,symbol))))
     properties-and-symbols)
    (display "\n"))

  (values process-line output-boilerplate))
;; Helper to add a character range (LOW HIGH) to the list of ranges
;; stored under KEY in a hash table.
(define-syntax-rule (cons-hash-list! ht key low high)
(define (cons-hash-list! ht key low high)
(let* ((old (hashq-ref ht key))
(value (list low high))
(new-lst