Significant refactors and cleanups, and add grapheme parse

This commit is contained in:
Vivianne 2024-03-02 18:54:38 -05:00
parent 8ea25ea23d
commit 4a93c70b79
9 changed files with 2067 additions and 457 deletions

View file

@ -80,7 +80,9 @@
"GUILE_LOAD_COMPILED_PATH"
(compiled-dir out version)
(compiled-dir "" version))))
,''("generate-east-asian" "generate-emoji"))
,''("generate-east-asian"
"generate-emoji"
"generate-graphemes"))
#t))))))))
(native-inputs
(list autoconf automake pkg-config texinfo))

View file

@ -25,6 +25,7 @@
((scheme-file "emoji")
(directory "eastasian" ((scheme-file "locale")))
(scheme-file "eastasian")
(scheme-file "graphemes")
(scheme-file "internal")))))
(tests ((directory
"tests"
@ -33,7 +34,8 @@
((directory
"scripts"
((in-file "generate-east-asian")
(in-file "generate-emoji")))))
(in-file "generate-emoji")
(in-file "generate-graphemes")))))
(documentation
((org-file "README")
(symlink "README" "README.org")

View file

@ -1,4 +1,4 @@
;; Code generated by script/generate. DO NOT EDIT
;; Code generated by scripts/generate-east-asian. DO NOT EDIT
(define-module
(runewidth eastasian)
@ -6,6 +6,8 @@
(ice-9 hash-table)
#:use-module
(srfi srfi-1)
#:use-module
(runewidth internal)
#:export
(char-set:eastasian-combining
char-set:eastasian-doublewidth
@ -13,285 +15,61 @@
char-set:eastasian-narrow
char-set:eastasian-neutral
char-set:eastasian-ambiguous))
(define chars-ht
(define eastasian-ht
(alist->hashq-table
'((doublewidth
(201547 262141)
(196608 201546)
(195104 196605)
(195102 195103)
(194560 195101)
(191457 194559)
(183984 191456)
(183970 183983)
(178208 183969)
(178206 178207)
(177984 178205)
(177973 177983)
(173824 177972)
(173790 173823)
(131072 173789)
(129744 129750)
(129728 129730)
(129712 129718)
(129680 129704)
(129664 129670)
(129656 129658)
(129648 129652)
(129485 129535)
(129402 129483)
(129351 129400)
(129340 129349)
(129292 129338)
(128992 129003)
(128756 128764)
(128747 128748)
(128725 128727)
(128720 128722)
(128716 128716)
(128640 128709)
(128512 128591)
(128507 128511)
(128420 128420)
(128405 128406)
(128378 128378)
(128336 128359)
(128331 128334)
(128255 128317)
(128066 128252)
(128064 128064)
(128000 128062)
(127995 127999)
(127992 127994)
(127988 127988)
(127968 127984)
(127951 127955)
(127904 127946)
(127870 127891)
(127799 127868)
(127789 127797)
(127744 127776)
(127584 127589)
(127568 127569)
(127552 127560)
(127504 127547)
(127488 127490)
(127377 127386)
(127374 127374)
(127183 127183)
(126980 126980)
(110960 111355)
(110948 110951)
(110928 110930)
(110848 110878)
(110592 110847)
(101632 101640)
(101120 101589)
(100352 101119)
(94208 100343)
(94192 94193)
(94180 94180)
(94179 94179)
(94178 94178)
(94176 94177)
(65509 65510)
(65508 65508)
(65507 65507)
(65506 65506)
(65504 65505)
(65376 65376)
(65375 65375)
(65374 65374)
(65373 65373)
(65372 65372)
(65371 65371)
(65345 65370)
(65344 65344)
(65343 65343)
(65342 65342)
(65341 65341)
(65340 65340)
(65339 65339)
(65313 65338)
(65311 65312)
(65308 65310)
(65306 65307)
(65296 65305)
(65294 65295)
(65293 65293)
(65292 65292)
(65291 65291)
(65290 65290)
(65289 65289)
(65288 65288)
(65285 65287)
(65284 65284)
(65281 65283)
(65130 65131)
(65129 65129)
(65128 65128)
(65124 65126)
(65123 65123)
(65122 65122)
(65119 65121)
(65118 65118)
(65117 65117)
(65116 65116)
(65115 65115)
(65114 65114)
(65113 65113)
(65112 65112)
(65108 65111)
(65104 65106)
(65101 65103)
(65097 65100)
(65096 65096)
(65095 65095)
(65093 65094)
(65092 65092)
(65091 65091)
(65090 65090)
(65089 65089)
(65088 65088)
(65087 65087)
(65086 65086)
(65085 65085)
(65084 65084)
(65083 65083)
(65082 65082)
(65081 65081)
(65080 65080)
(65079 65079)
(65078 65078)
(65077 65077)
(65075 65076)
(65073 65074)
(65072 65072)
(65049 65049)
(65048 65048)
(65047 65047)
(65040 65046)
(64218 64255)
(64112 64217)
(64110 64111)
(63744 64109)
(44032 55203)
(43360 43388)
(42128 42182)
(40982 42124)
(40981 40981)
(40960 40980)
(40957 40959)
(19968 40956)
(13312 19903)
(13056 13311)
(12992 13055)
(12977 12991)
(12938 12976)
(12928 12937)
(12896 12927)
(12881 12895)
(12880 12880)
(12842 12871)
(12832 12841)
(12800 12830)
(12784 12799)
(12736 12771)
(12704 12735)
(12694 12703)
(12690 12693)
(12688 12689)
(12593 12686)
(12549 12591)
(12543 12543)
(12540 12542)
(12539 12539)
(12449 12538)
(12448 12448)
(12447 12447)
(12445 12446)
(12443 12444)
(12353 12438)
(12350 12350)
(12349 12349)
(12348 12348)
(12347 12347)
(12344 12346)
(12342 12343)
(12337 12341)
(12336 12336)
(12334 12335)
(12330 12333)
(12321 12329)
(12320 12320)
(12318 12319)
(12317 12317)
(12316 12316)
(12315 12315)
(12314 12314)
(12313 12313)
(12312 12312)
(12311 12311)
(12310 12310)
(12309 12309)
(12308 12308)
(12306 12307)
(12305 12305)
(12304 12304)
(12303 12303)
(12302 12302)
(12301 12301)
(12300 12300)
(12299 12299)
(12298 12298)
(12297 12297)
(12296 12296)
(12295 12295)
(12294 12294)
(12293 12293)
(12292 12292)
(12289 12291)
(12288 12288)
(12272 12283)
(12032 12245)
(11931 12019)
(11904 11929)
(11093 11093)
(11088 11088)
(11035 11036)
(10175 10175)
(10160 10160)
(10133 10135)
(10071 10071)
(10067 10069)
(10062 10062)
(10060 10060)
(10024 10024)
(9994 9995)
(9989 9989)
(9981 9981)
(9978 9978)
(9973 9973)
(9970 9971)
(9962 9962)
(9940 9940)
(9934 9934)
(9924 9925)
(9917 9918)
(9898 9899)
(9889 9889)
(9875 9875)
(9855 9855)
(9800 9811)
(9748 9749)
(9725 9726)
(9203 9203)
(9200 9200)
(9193 9196)
(9002 9002)
(9001 9001)
(8986 8987)
(4352 4447))
'((combining
(125136 125142)
(122918 122922)
(122915 122916)
(122907 122913)
(122888 122904)
(122880 122886)
(119362 119364)
(119210 119213)
(119173 119179)
(119163 119170)
(119149 119154)
(119143 119145)
(119141 119142)
(92912 92916)
(70512 70516)
(70502 70508)
(70459 70460)
(70400 70401)
(69446 69456)
(69291 69292)
(66422 66426)
(66045 66045)
(65056 65071)
(43232 43249)
(42736 42737)
(42654 42655)
(42612 42621)
(42608 42610)
(42607 42607)
(12441 12442)
(11744 11775)
(11503 11505)
(8421 8432)
(8418 8420)
(8417 8417)
(8413 8416)
(8400 8412)
(7675 7679)
(7616 7673)
(7019 7027)
(6847 6848)
(6846 6846)
(6832 6845)
(6783 6783)
(4957 4959)
(3328 3329)
(3076 3076)
(3072 3072)
(2027 2035)
(1160 1161)
(1155 1159)
(768 879))
(ambiguous
(1048576 1114109)
(983040 1048573)
@ -490,59 +268,67 @@
(167 167)
(164 164)
(161 161))
(combining
(125136 125142)
(122918 122922)
(122915 122916)
(122907 122913)
(122888 122904)
(122880 122886)
(119362 119364)
(119210 119213)
(119173 119179)
(119163 119170)
(119149 119154)
(119143 119145)
(119141 119142)
(92912 92916)
(70512 70516)
(70502 70508)
(70459 70460)
(70400 70401)
(69446 69456)
(69291 69292)
(66422 66426)
(66045 66045)
(65056 65071)
(43232 43249)
(42736 42737)
(42654 42655)
(42612 42621)
(42608 42610)
(42607 42607)
(12441 12442)
(11744 11775)
(11503 11505)
(8421 8432)
(8418 8420)
(8417 8417)
(8413 8416)
(8400 8412)
(7675 7679)
(7616 7673)
(7019 7027)
(6847 6848)
(6846 6846)
(6832 6845)
(6783 6783)
(4957 4959)
(3328 3329)
(3076 3076)
(3072 3072)
(2027 2035)
(1160 1161)
(1155 1159)
(768 879))
(halfwidth
(65517 65518)
(65513 65516)
(65512 65512)
(65498 65500)
(65490 65495)
(65482 65487)
(65474 65479)
(65440 65470)
(65438 65439)
(65393 65437)
(65392 65392)
(65382 65391)
(65380 65381)
(65379 65379)
(65378 65378)
(65377 65377)
(8361 8361))
(narrow
(10630 10630)
(10629 10629)
(10221 10221)
(10220 10220)
(10219 10219)
(10218 10218)
(10217 10217)
(10216 10216)
(10215 10215)
(10214 10214)
(175 175)
(172 172)
(166 166)
(165 165)
(162 163)
(126 126)
(125 125)
(124 124)
(123 123)
(97 122)
(96 96)
(95 95)
(94 94)
(93 93)
(92 92)
(91 91)
(65 90)
(63 64)
(60 62)
(58 59)
(48 57)
(46 47)
(45 45)
(44 44)
(43 43)
(42 42)
(41 41)
(40 40)
(37 39)
(36 36)
(33 35)
(32 32))
(neutral
(917536 917631)
(917505 917505)
@ -2435,79 +2221,283 @@
(128 159)
(127 127)
(0 31))
(narrow
(10630 10630)
(10629 10629)
(10221 10221)
(10220 10220)
(10219 10219)
(10218 10218)
(10217 10217)
(10216 10216)
(10215 10215)
(10214 10214)
(175 175)
(172 172)
(166 166)
(165 165)
(162 163)
(126 126)
(125 125)
(124 124)
(123 123)
(97 122)
(96 96)
(95 95)
(94 94)
(93 93)
(92 92)
(91 91)
(65 90)
(63 64)
(60 62)
(58 59)
(48 57)
(46 47)
(45 45)
(44 44)
(43 43)
(42 42)
(41 41)
(40 40)
(37 39)
(36 36)
(33 35)
(32 32))
(halfwidth
(65517 65518)
(65513 65516)
(65512 65512)
(65498 65500)
(65490 65495)
(65482 65487)
(65474 65479)
(65440 65470)
(65438 65439)
(65393 65437)
(65392 65392)
(65382 65391)
(65380 65381)
(65379 65379)
(65378 65378)
(65377 65377)
(8361 8361)))))
(define-syntax-rule
(ranges->charset! name symbol)
(let* ((pairs (hashq-ref chars-ht name)))
(for-each
(λ (pair)
(ucs-range->char-set!
(first pair)
(+ (second pair) 1)
#t
symbol))
pairs)))
(doublewidth
(201547 262141)
(196608 201546)
(195104 196605)
(195102 195103)
(194560 195101)
(191457 194559)
(183984 191456)
(183970 183983)
(178208 183969)
(178206 178207)
(177984 178205)
(177973 177983)
(173824 177972)
(173790 173823)
(131072 173789)
(129744 129750)
(129728 129730)
(129712 129718)
(129680 129704)
(129664 129670)
(129656 129658)
(129648 129652)
(129485 129535)
(129402 129483)
(129351 129400)
(129340 129349)
(129292 129338)
(128992 129003)
(128756 128764)
(128747 128748)
(128725 128727)
(128720 128722)
(128716 128716)
(128640 128709)
(128512 128591)
(128507 128511)
(128420 128420)
(128405 128406)
(128378 128378)
(128336 128359)
(128331 128334)
(128255 128317)
(128066 128252)
(128064 128064)
(128000 128062)
(127995 127999)
(127992 127994)
(127988 127988)
(127968 127984)
(127951 127955)
(127904 127946)
(127870 127891)
(127799 127868)
(127789 127797)
(127744 127776)
(127584 127589)
(127568 127569)
(127552 127560)
(127504 127547)
(127488 127490)
(127377 127386)
(127374 127374)
(127183 127183)
(126980 126980)
(110960 111355)
(110948 110951)
(110928 110930)
(110848 110878)
(110592 110847)
(101632 101640)
(101120 101589)
(100352 101119)
(94208 100343)
(94192 94193)
(94180 94180)
(94179 94179)
(94178 94178)
(94176 94177)
(65509 65510)
(65508 65508)
(65507 65507)
(65506 65506)
(65504 65505)
(65376 65376)
(65375 65375)
(65374 65374)
(65373 65373)
(65372 65372)
(65371 65371)
(65345 65370)
(65344 65344)
(65343 65343)
(65342 65342)
(65341 65341)
(65340 65340)
(65339 65339)
(65313 65338)
(65311 65312)
(65308 65310)
(65306 65307)
(65296 65305)
(65294 65295)
(65293 65293)
(65292 65292)
(65291 65291)
(65290 65290)
(65289 65289)
(65288 65288)
(65285 65287)
(65284 65284)
(65281 65283)
(65130 65131)
(65129 65129)
(65128 65128)
(65124 65126)
(65123 65123)
(65122 65122)
(65119 65121)
(65118 65118)
(65117 65117)
(65116 65116)
(65115 65115)
(65114 65114)
(65113 65113)
(65112 65112)
(65108 65111)
(65104 65106)
(65101 65103)
(65097 65100)
(65096 65096)
(65095 65095)
(65093 65094)
(65092 65092)
(65091 65091)
(65090 65090)
(65089 65089)
(65088 65088)
(65087 65087)
(65086 65086)
(65085 65085)
(65084 65084)
(65083 65083)
(65082 65082)
(65081 65081)
(65080 65080)
(65079 65079)
(65078 65078)
(65077 65077)
(65075 65076)
(65073 65074)
(65072 65072)
(65049 65049)
(65048 65048)
(65047 65047)
(65040 65046)
(64218 64255)
(64112 64217)
(64110 64111)
(63744 64109)
(44032 55203)
(43360 43388)
(42128 42182)
(40982 42124)
(40981 40981)
(40960 40980)
(40957 40959)
(19968 40956)
(13312 19903)
(13056 13311)
(12992 13055)
(12977 12991)
(12938 12976)
(12928 12937)
(12896 12927)
(12881 12895)
(12880 12880)
(12842 12871)
(12832 12841)
(12800 12830)
(12784 12799)
(12736 12771)
(12704 12735)
(12694 12703)
(12690 12693)
(12688 12689)
(12593 12686)
(12549 12591)
(12543 12543)
(12540 12542)
(12539 12539)
(12449 12538)
(12448 12448)
(12447 12447)
(12445 12446)
(12443 12444)
(12353 12438)
(12350 12350)
(12349 12349)
(12348 12348)
(12347 12347)
(12344 12346)
(12342 12343)
(12337 12341)
(12336 12336)
(12334 12335)
(12330 12333)
(12321 12329)
(12320 12320)
(12318 12319)
(12317 12317)
(12316 12316)
(12315 12315)
(12314 12314)
(12313 12313)
(12312 12312)
(12311 12311)
(12310 12310)
(12309 12309)
(12308 12308)
(12306 12307)
(12305 12305)
(12304 12304)
(12303 12303)
(12302 12302)
(12301 12301)
(12300 12300)
(12299 12299)
(12298 12298)
(12297 12297)
(12296 12296)
(12295 12295)
(12294 12294)
(12293 12293)
(12292 12292)
(12289 12291)
(12288 12288)
(12272 12283)
(12032 12245)
(11931 12019)
(11904 11929)
(11093 11093)
(11088 11088)
(11035 11036)
(10175 10175)
(10160 10160)
(10133 10135)
(10071 10071)
(10067 10069)
(10062 10062)
(10060 10060)
(10024 10024)
(9994 9995)
(9989 9989)
(9981 9981)
(9978 9978)
(9973 9973)
(9970 9971)
(9962 9962)
(9940 9940)
(9934 9934)
(9924 9925)
(9917 9918)
(9898 9899)
(9889 9889)
(9875 9875)
(9855 9855)
(9800 9811)
(9748 9749)
(9725 9726)
(9203 9203)
(9200 9200)
(9193 9196)
(9002 9002)
(9001 9001)
(8986 8987)
(4352 4447)))))
(define char-set:eastasian-combining (char-set))
(define char-set:eastasian-doublewidth
@ -2518,20 +2508,26 @@
(define char-set:eastasian-ambiguous (char-set))
(ranges->charset!
eastasian-ht
'combining
char-set:eastasian-combining)
(ranges->charset!
eastasian-ht
'doublewidth
char-set:eastasian-doublewidth)
(ranges->charset!
eastasian-ht
'halfwidth
char-set:eastasian-halfwidth)
(ranges->charset!
eastasian-ht
'narrow
char-set:eastasian-narrow)
(ranges->charset!
eastasian-ht
'neutral
char-set:eastasian-neutral)
(ranges->charset!
eastasian-ht
'ambiguous
char-set:eastasian-ambiguous)

View file

@ -1,4 +1,4 @@
;; Code generated by script/generate. DO NOT EDIT
;; Code generated by scripts/generate-emoji. DO NOT EDIT
(define-module
(runewidth emoji)

1468
runewidth/graphemes.scm Normal file

File diff suppressed because it is too large Load diff

View file

@ -6,17 +6,20 @@
#:use-module (web uri)
#:use-module (web client)
#:use-module (web request)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-71)
#:export (@hex
@codepoint
@codepoint-range
@comment
@ws
cons-hash-list!
hex-string->integer
format-exception-msg
in-surrogate-range
wget-to-lines
file-to-lines))
file-to-lines
ranges->charset!))
(define-peg-pattern @hex body (peg "[a-fA-F0-9]"))
@ -35,6 +38,26 @@
(define-peg-pattern @ws none
(or " " "\t"))
(define-syntax-rule (cons-hash-list! ht key low high)
(let* ((old (hashq-ref ht key))
(value (list low high))
(new-lst
(if old
(cons value old)
(list value))))
(hashq-set! ht key new-lst)))
(define-syntax-rule (ranges->charset! ht name symbol)
(let* ((pairs (hashq-ref ht name)))
(for-each
(λ (pair)
(ucs-range->char-set!
(first pair)
;; Exclusive upper range, so add one
(+ (second pair) 1)
#t symbol))
pairs)))
(define (hex-string->integer str)
;; XXX: We would ideally do integer->char here and save it to file as such
;; However read-expr* does not actually work for all the characters!

View file

@ -28,17 +28,18 @@
(define-peg-pattern @ea-line body
(and @ea-datum (* @ws) @comment))
(define ea-chars-ht (make-hash-table 6))
(define eastasian-ht (make-hash-table 6))
(define (process-east-asian-line line)
(define (cons-ht! key low high)
(let* ((old (hashq-ref ea-chars-ht key))
(value (list low high))
(new-lst
(if old
(cons value old)
(list value))))
(hashq-set! ea-chars-ht key new-lst)))
(define (string->property str comment)
(if (string-contains comment "COMBINING")
'combining
(match str
((or "W" "F") 'doublewidth)
("H" 'halfwidth)
("Na" 'narrow)
("N" 'neutral)
("A" 'ambiguous))))
(define tree (peg:tree (match-pattern @ea-line line)))
@ -49,7 +50,7 @@
(match tree
(((('@codepoint-range
('@codepoint codepoints) ...)
('@ea-width-prop width-prop))
('@ea-width-prop prop-str))
('@comment comment))
(with-exception-handler
@ -58,25 +59,14 @@
(format-exception-msg stdout err))
(λ ()
(let ((f (hex-string->integer (first codepoints)))
(l (hex-string->integer (last codepoints))))
(l (hex-string->integer (last codepoints)))
(width-prop (string->property prop-str comment)))
(when (or (in-surrogate-range f)
(in-surrogate-range l))
(error (format #f "chars in surrogate range ~x -> ~x" f l)))
(if (string-contains comment "COMBINING")
(cons-ht! 'combining f l)
(match width-prop
((or "W" "F")
(cons-ht! 'doublewidth f l))
("H"
(cons-ht! 'halfwidth f l))
("Na"
(cons-ht! 'narrow f l))
("N"
(cons-ht! 'neutral f l))
("A"
(cons-ht! 'ambiguous f l))))))
(cons-hash-list! eastasian-ht width-prop f l)))
#:unwind? #t)))))
(define ea-sets
@ -113,7 +103,7 @@
(with-output-to-file file
(λ ()
(display ";; Code generated by script/generate. DO NOT EDIT\n\n")
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (first (command-line)))
(for-each process-east-asian-line (line-func))
@ -121,26 +111,13 @@
`(define-module (runewidth eastasian)
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
#:use-module (runewidth internal)
#:export
,ea-symbol-names))
(pretty-print
`(define chars-ht
(alist->hashq-table ',(hash-map->list cons ea-chars-ht))))
(display "\n")
(pretty-print
`(define-syntax-rule (ranges->charset! name symbol)
(let* ((pairs (hashq-ref chars-ht name)))
(for-each
(λ (pair)
(ucs-range->char-set!
(first pair)
;; Exclusive upper range, so add one
(+ (second pair) 1)
#t symbol))
pairs))))
`(define eastasian-ht
(alist->hashq-table ',(hash-map->list cons eastasian-ht))))
(display "\n")
@ -156,10 +133,11 @@
(λ (set-pair)
(let ((name (first set-pair))
(symbol (second set-pair)))
(pretty-print
`(ranges->charset! ',name ,symbol))))
(pretty-print
`(ranges->charset! eastasian-ht ',name ,symbol))))
ea-sets-and-symbols)
(display "Code generation complete.\n" stdout)))
(format stdout "Written to ~a.\n" file)

View file

@ -32,10 +32,6 @@
(define (process-emoji-line line)
(define tree (peg:tree (match-pattern @emoji-line line)))
(define (in-surrogate-range num)
(and (>= num #xd800)
(<= num #xdfff)))
(unless (or (not tree)
(null? tree)
(eq? '@comment (car tree)))
@ -76,7 +72,7 @@
(with-output-to-file file
(λ ()
(display ";; Code generated by script/generate. DO NOT EDIT\n\n")
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (first (command-line)))
(for-each process-emoji-line (line-func))

View file

@ -0,0 +1,145 @@
#!@GUILE@ --no-auto-compile
-*- scheme -*-
!#
;; Can be called with a trailing argument pointing to the file on disk.
(use-modules
(runewidth internal)
(ice-9 pretty-print)
(ice-9 peg)
(ice-9 format)
(ice-9 exceptions)
(ice-9 match)
(ice-9 hash-table)
(srfi srfi-1))
(define stdout (current-output-port))
(define grapheme-url
"https://www.unicode.org/Public/15.0.0/ucd/auxiliary/GraphemeBreakProperty.txt")
(define-peg-pattern @grapheme-category all
(* (peg "[a-zA-Z_]")))
(define-peg-pattern @grapheme-datum body
(and @codepoint-range (* @ws) (ignore ";") (* @ws) @grapheme-category))
(define-peg-pattern @grapheme-line body
(and @grapheme-datum (* @ws) @comment))
(define grapheme-ht (make-hash-table 13))
(define grapheme-sets
'(hangul-syllable-l
hangul-syllable-v
hangul-syllable-lv
hangul-syllable-lvt
prepend
carriage-return
line-feed
control
extend
regional-indicator
spacing-mark
zerowidth-joiner))
(define grapheme-symbol-names
(map
(λ (set)
(string->symbol
(string-concatenate
(list "char-set:grapheme-"
(symbol->string set)))))
grapheme-sets))
(define grapheme-sets-and-symbols
(zip grapheme-sets grapheme-symbol-names))
(define (process-grapheme-line line)
(define (string->category str)
(match str
("L" 'hangul-syllable-l)
("V" 'hangul-syllable-v)
("T" 'hangul-syllable-t)
("LV" 'hangul-syllable-lv)
("LVT" 'hangul-syllable-lvt)
("Prepend" 'prepend)
("CR" 'carriage-return)
("LF" 'line-feed)
("Control" 'control)
("Extend" 'extend)
("Regional_Indicator" 'regional-indicator)
("SpacingMark" 'spacing-mark)
("ZWJ" 'zerowidth-joiner)))
(define tree (peg:tree (match-pattern @grapheme-line line)))
(unless (or (not tree)
(null? tree)
(eq? '@comment (car tree)))
(match tree
(((('@codepoint-range
('@codepoint codepoints) ...)
('@grapheme-category cat-str))
('@comment comment))
(with-exception-handler
(λ (err)
(format stdout "Skipping line due to error :: ")
(format-exception-msg stdout err))
(λ ()
(let ((f (hex-string->integer (first codepoints)))
(l (hex-string->integer (last codepoints)))
(category (string->category cat-str)))
(when (or (in-surrogate-range f)
(in-surrogate-range l))
(error (format #f "chars in surrogate range ~x -> ~x" f l)))
(cons-hash-list! grapheme-ht category f l)))
#:unwind? #t)))))
(define line-func
(if (= 2 (length (command-line)))
(λ ()
(file-to-lines (last (command-line)) stdout))
(λ ()
(wget-to-lines grapheme-url stdout))))
(define file "runewidth/graphemes.scm")
(format stdout "Writing to ~a...\n" file)
(with-output-to-file file
(λ ()
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (first (command-line)))
(for-each process-grapheme-line (line-func))
(pretty-print
`(define-module (runewidth graphemes)
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
#:use-module (runewidth internal)
#:export ,grapheme-symbol-names))
(pretty-print
`(define grapheme-ht
(alist->hashq-table ',(hash-map->list cons grapheme-ht))))
(display "\n")
(for-each
(λ (set-pair)
(let ((name (first set-pair))
(symbol (second set-pair)))
(pretty-print
`(ranges->charset! grapheme-ht ',name ,symbol))))
grapheme-sets-and-symbols)
(display "Code generation complete.\n" stdout)))
(format stdout "Written to ~a.\n" file)