guile-uniseg/scripts/generate-east-asian.in

157 lines
3.7 KiB
Scheme

#!@GUILE@ --no-auto-compile
-*- scheme -*-
!#
;; Can be called with a trailing argument pointing to the file on disk.
(use-modules
(uniseg internal)
(ice-9 pretty-print)
(ice-9 peg)
(ice-9 format)
(ice-9 exceptions)
(ice-9 match)
(ice-9 hash-table)
(srfi srfi-1))
(define stdout (current-output-port))
(define east-asian-url
"https://unicode.org/Public/13.0.0/ucd/EastAsianWidth.txt")
(define-peg-pattern @ea-width-prop all
(or "A" "F" "H" "Na" "N" "W"))
(define-peg-pattern @ea-datum body
(and @codepoint-range (* @ws) (ignore ";") (* @ws) @ea-width-prop))
(define-peg-pattern @ea-line body
(and @ea-datum (* @ws) @comment))
(define eastasian-ht (make-hash-table 6))
(define (process-east-asian-line line)
(define (string->property str comment)
(if (string-contains comment "COMBINING")
'combining
(match str
((or "W" "F") 'doublewidth)
("H" 'halfwidth)
("Na" 'narrow)
("N" 'neutral)
("A" 'ambiguous))))
(define tree (peg:tree (match-pattern @ea-line line)))
(unless (or (not tree)
(null? tree)
(eq? '@comment (car tree)))
(match tree
(((('@codepoint-range
('@codepoint codepoints) ...)
('@ea-width-prop prop-str))
('@comment comment))
(with-exception-handler
(λ (err)
(format stdout "Skipping line due to error :: ")
(format-exception-msg stdout err))
(λ ()
(let ((f (hex-string->integer (first codepoints)))
(l (hex-string->integer (last codepoints)))
(width-prop (string->property prop-str comment)))
(when (or (in-surrogate-range f)
(in-surrogate-range l))
(error (format #f "chars in surrogate range ~x -> ~x" f l)))
(cons-hash-list! eastasian-ht width-prop f l)))
#:unwind? #t)))))
(define ea-sets
'(combining
doublewidth
halfwidth
narrow
neutral
ambiguous))
(define ea-symbol-names
(map
(λ (set)
(string->symbol
(string-concatenate
(list "char-set:eastasian-"
(symbol->string set)))))
ea-sets))
(define ea-sets-and-symbols
(zip ea-sets ea-symbol-names))
(define line-func
(if (= 2 (length (command-line)))
(λ ()
(file-to-lines (last (command-line)) stdout))
(λ ()
(wget-to-lines east-asian-url stdout))))
(define file "uniseg/eastasian.scm")
(format stdout "Writing to ~a...\n" file)
(with-output-to-file file
(λ ()
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (first (command-line)))
(for-each process-east-asian-line (line-func))
(pretty-print
`(define-module (uniseg eastasian)
#:use-module (ice-9 hash-table)
#:use-module (srfi srfi-1)
#:use-module (uniseg internal)
#:export
(,@ea-symbol-names
eastasian-charsets)))
(pretty-print
`(define eastasian-ht
(alist->hashq-table ',(hash-map->list cons eastasian-ht))))
(display "\n")
(for-each
(λ (sym)
(pretty-print
`(define ,sym (char-set))))
ea-symbol-names)
(display "\n")
(pretty-print
`(define eastasian-charsets
(list
,@(map
(λ (pair)
(let ((f (first pair))
(s (second pair)))
`(list ',f ,s)))
ea-sets-and-symbols))))
(display "\n")
(for-each
(λ (set-pair)
(let ((name (first set-pair))
(symbol (second set-pair)))
(pretty-print
`(ranges->charset! eastasian-ht ',name ,symbol))))
ea-sets-and-symbols)
(display "Code generation complete.\n" stdout)))
(format stdout "Written to ~a.\n" file)