#!@GUILE@ --no-auto-compile -*- scheme -*- !# ;; Can be called with a trailing argument pointing to the file on disk. (use-modules (uniseg internal) (ice-9 pretty-print) (ice-9 peg) (ice-9 format) (ice-9 exceptions) (ice-9 match) (ice-9 hash-table) (srfi srfi-1)) (define stdout (current-output-port)) (define grapheme-url "https://www.unicode.org/Public/15.0.0/ucd/auxiliary/GraphemeBreakProperty.txt") (define-peg-pattern @grapheme-category all (* (peg "[a-zA-Z_]"))) (define-peg-pattern @grapheme-datum body (and @codepoint-range (* @ws) (ignore ";") (* @ws) @grapheme-category)) (define-peg-pattern @grapheme-line body (and @grapheme-datum (* @ws) @comment)) (define grapheme-ht (make-hash-table 13)) (define grapheme-sets '(hangul-syllable-l hangul-syllable-v hangul-syllable-lv hangul-syllable-lvt prepend carriage-return line-feed control extend regional-indicator spacing-mark zero-width-joiner)) (define grapheme-symbol-names (map (λ (set) (string->symbol (string-concatenate (list "char-set:grapheme-" (symbol->string set))))) grapheme-sets)) (define grapheme-sets-and-symbols (zip grapheme-sets grapheme-symbol-names)) (define (process-grapheme-line line) (define (string->category str) (match str ("L" 'hangul-syllable-l) ("V" 'hangul-syllable-v) ("T" 'hangul-syllable-t) ("LV" 'hangul-syllable-lv) ("LVT" 'hangul-syllable-lvt) ("Prepend" 'prepend) ("CR" 'carriage-return) ("LF" 'line-feed) ("Control" 'control) ("Extend" 'extend) ("Regional_Indicator" 'regional-indicator) ("SpacingMark" 'spacing-mark) ("ZWJ" 'zero-width-joiner))) (define tree (peg:tree (match-pattern @grapheme-line line))) (unless (or (not tree) (null? tree) (eq? '@comment (car tree))) (match tree (((('@codepoint-range ('@codepoint codepoints) ...) ('@grapheme-category cat-str)) ('@comment comment)) (with-exception-handler (λ (err) (format stdout "Skipping line due to error :: ") (format-exception-msg stdout err)) (λ () (let ((f (hex-string->integer (first codepoints))) (l (hex-string->integer (last codepoints))) (category (string->category cat-str))) (when (or (in-surrogate-range f) (in-surrogate-range l)) (error (format #f "chars in surrogate range ~x -> ~x" f l))) (cons-hash-list! grapheme-ht category f l))) #:unwind? #t))))) (define line-func (if (= 2 (length (command-line))) (λ () (file-to-lines (last (command-line)) stdout)) (λ () (wget-to-lines grapheme-url stdout)))) (define file "uniseg/graphemes.scm") (format stdout "Writing to ~a...\n" file) (with-output-to-file file (λ () (format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (first (command-line))) (for-each process-grapheme-line (line-func)) (pretty-print `(define-module (uniseg graphemes) #:use-module (ice-9 hash-table) #:use-module (srfi srfi-1) #:use-module (uniseg internal) #:use-module (uniseg emoji) #:export (,@grapheme-symbol-names grapheme-charsets))) (pretty-print `(define grapheme-ht (alist->hashq-table ',(hash-map->list cons grapheme-ht)))) (display "\n") (for-each (λ (sym) (pretty-print `(define ,sym (char-set)))) grapheme-symbol-names) (display "\n") (pretty-print `(define grapheme-charsets (list ,@(map (λ (pair) (let ((f (first pair)) (s (second pair))) `(list ',f ,s))) grapheme-sets-and-symbols) ;; Need emoji in this set too! (list 'extended-pictographic char-set:emoji-extended-pictographic)))) (display "\n") (for-each (λ (set-pair) (let ((name (first set-pair)) (symbol (second set-pair))) (pretty-print `(ranges->charset! grapheme-ht ',name ,symbol)))) grapheme-sets-and-symbols) (display "Code generation complete.\n" stdout))) (format stdout "Written to ~a.\n" file)