guile-uniseg/scripts/generate-graphemes.in

97 lines
2.3 KiB
Text
Raw Normal View History

#!@GUILE@ --no-auto-compile
-*- scheme -*-
!#
;; Can be called with a trailing argument pointing to the file on disk.
(use-modules
2024-03-03 18:13:22 +00:00
(uniseg internal)
(ice-9 pretty-print)
(ice-9 peg)
(ice-9 format)
(ice-9 exceptions)
(ice-9 match)
(ice-9 hash-table)
(srfi srfi-1))
(define stdout (current-output-port))
2024-03-04 19:27:12 +00:00
(define url
"https://www.unicode.org/Public/15.0.0/ucd/auxiliary/GraphemeBreakProperty.txt")
(define grapheme-ht (make-hash-table 13))
(define grapheme-properties
'(hangul-syllable-l
hangul-syllable-v
hangul-syllable-lv
hangul-syllable-lvt
prepend
carriage-return
line-feed
control
extend
regional-indicator
spacing-mark
2024-03-04 01:25:47 +00:00
zero-width-joiner))
(define grapheme-symbols
(map
2024-03-04 22:18:25 +00:00
(λ (prop) (symbol-with-prefix "char-set:grapheme-" prop))
grapheme-properties))
(define (string->property str comment)
(match str
("L" 'hangul-syllable-l)
("V" 'hangul-syllable-v)
("T" 'hangul-syllable-t)
("LV" 'hangul-syllable-lv)
("LVT" 'hangul-syllable-lvt)
("Prepend" 'prepend)
("CR" 'carriage-return)
("LF" 'line-feed)
("Control" 'control)
("Extend" 'extend)
("Regional_Indicator" 'regional-indicator)
("SpacingMark" 'spacing-mark)
("ZWJ" 'zero-width-joiner)))
(define file "uniseg/charsets/graphemes.scm")
(format stdout "Writing to ~a...\n" file)
(with-output-to-file file
(λ ()
2024-03-04 19:27:12 +00:00
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (basename (current-filename)))
(pretty-print
`(define-module (uniseg charsets graphemes)
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
2024-03-03 18:13:22 +00:00
#:use-module (uniseg internal)
#:use-module (uniseg charsets emoji)
#:export (,@grapheme-symbols
grapheme-charsets)))
(define-values (process-line print-to-file)
(make-line-processor
grapheme-ht
string->property
grapheme-properties
grapheme-symbols
'grapheme-charsets
stdout))
2024-03-03 18:13:22 +00:00
(for-each process-line (cmdline-wget-or-file url stdout))
(print-to-file)
2024-03-03 18:13:22 +00:00
;; Need emoji in the set as well.
2024-03-03 18:13:22 +00:00
(pretty-print
`(set! grapheme-charsets
(cons (list 'extended-pictographic char-set:emoji-extended-pictographic)
grapheme-charsets)))
(display "Code generation complete.\n" stdout)))
(format stdout "Written to ~a.\n" file)